changeset 8630:d30647171e49

Merge
author aph
date Thu, 02 Jul 2015 11:12:59 +0100
parents 9fcbb6768a78 f7b19ca15ef8
children 9ad1e00a9f13
files src/cpu/x86/vm/sharedRuntime_x86_64.cpp src/cpu/x86/vm/stubGenerator_x86_64.cpp src/cpu/x86/vm/vm_version_x86.cpp src/share/vm/classfile/vmSymbols.hpp src/share/vm/opto/c2_globals.hpp src/share/vm/opto/escape.cpp src/share/vm/opto/library_call.cpp src/share/vm/opto/runtime.cpp src/share/vm/opto/runtime.hpp src/share/vm/runtime/stubRoutines.cpp src/share/vm/runtime/stubRoutines.hpp test/compiler/intrinsics/sha/cli/testcases/GenericTestCaseForSupportedSparcCPU.java test/compiler/intrinsics/sha/cli/testcases/UseSHAIntrinsicsSpecificTestCaseForUnsupportedSparcCPU.java test/compiler/intrinsics/sha/cli/testcases/UseSHASpecificTestCaseForSupportedSparcCPU.java test/compiler/intrinsics/sha/cli/testcases/UseSHASpecificTestCaseForUnsupportedSparcCPU.java
diffstat 349 files changed, 14993 insertions(+), 4022 deletions(-) [+]
line wrap: on
line diff
--- a/.hgtags	Tue Jun 16 17:31:53 2015 +0100
+++ b/.hgtags	Thu Jul 02 11:12:59 2015 +0100
@@ -469,3 +469,6 @@
 bf92b8db249cdfa5651ef954b6c0743a7e0ea4cd jdk9-b64
 e7ae94c4f35e940ea423fc1dd260435df34a77c0 jdk9-b65
 197e94e0dacddd16816f101d24fc0442ab518326 jdk9-b66
+d47dfabd16d48eb96a451edd1b61194a39ee0eb5 jdk9-b67
+11af3990d56c97b40318bc1f20608e86f051a3f7 jdk9-b68
+ff0929a59ced0e144201aa05819ae2e47d6f2c61 jdk9-b69
--- a/agent/make/Makefile	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/make/Makefile	Thu Jul 02 11:12:59 2015 +0100
@@ -58,6 +58,7 @@
 sun.jvm.hotspot.debugger.dummy \
 sun.jvm.hotspot.debugger.linux \
 sun.jvm.hotspot.debugger.linux.amd64 \
+sun.jvm.hotspot.debugger.linux.aarch64 \
 sun.jvm.hotspot.debugger.linux.ppc64 \
 sun.jvm.hotspot.debugger.linux.x86 \
 sun.jvm.hotspot.debugger.posix \
@@ -65,6 +66,7 @@
 sun.jvm.hotspot.debugger.ppc64 \
 sun.jvm.hotspot.debugger.proc \
 sun.jvm.hotspot.debugger.proc.amd64 \
+sun.jvm.hotspot.debugger.proc.aarch64 \
 sun.jvm.hotspot.debugger.proc.ppc64 \
 sun.jvm.hotspot.debugger.proc.sparc \
 sun.jvm.hotspot.debugger.proc.x86 \
@@ -91,11 +93,13 @@
 sun.jvm.hotspot.prims \
 sun.jvm.hotspot.runtime \
 sun.jvm.hotspot.runtime.amd64 \
+sun.jvm.hotspot.runtime.aarch64 \
 sun.jvm.hotspot.runtime.bsd \
 sun.jvm.hotspot.runtime.bsd_amd64 \
 sun.jvm.hotspot.runtime.bsd_x86 \
 sun.jvm.hotspot.runtime.linux \
 sun.jvm.hotspot.runtime.linux_amd64 \
+sun.jvm.hotspot.runtime.linux_aarch64 \
 sun.jvm.hotspot.runtime.linux_ppc64 \
 sun.jvm.hotspot.runtime.linux_sparc \
 sun.jvm.hotspot.runtime.linux_x86 \
@@ -149,16 +153,19 @@
 sun/jvm/hotspot/debugger/linux/*.java \
 sun/jvm/hotspot/debugger/linux/ppc64/*.java \
 sun/jvm/hotspot/debugger/linux/x86/*.java \
+sun/jvm/hotspot/debugger/linux/aarch64/*.java \
 sun/jvm/hotspot/debugger/posix/*.java \
 sun/jvm/hotspot/debugger/posix/elf/*.java \
 sun/jvm/hotspot/debugger/ppc64/*.java \
 sun/jvm/hotspot/debugger/proc/*.java \
 sun/jvm/hotspot/debugger/proc/amd64/*.java \
+sun/jvm/hotspot/debugger/proc/aarch64/*.java \
 sun/jvm/hotspot/debugger/proc/ppc64/*.java \
 sun/jvm/hotspot/debugger/proc/sparc/*.java \
 sun/jvm/hotspot/debugger/proc/x86/*.java \
 sun/jvm/hotspot/debugger/remote/*.java \
 sun/jvm/hotspot/debugger/remote/amd64/*.java \
+sun/jvm/hotspot/debugger/remote/aarch64/*.java \
 sun/jvm/hotspot/debugger/remote/ppc64/*.java \
 sun/jvm/hotspot/debugger/remote/sparc/*.java \
 sun/jvm/hotspot/debugger/remote/x86/*.java \
@@ -178,11 +185,13 @@
 sun/jvm/hotspot/prims/*.java \
 sun/jvm/hotspot/runtime/*.java \
 sun/jvm/hotspot/runtime/amd64/*.java \
+sun/jvm/hotspot/runtime/aarch64/*.java \
 sun/jvm/hotspot/runtime/bsd/*.java \
 sun/jvm/hotspot/runtime/bsd_amd64/*.java \
 sun/jvm/hotspot/runtime/bsd_x86/*.java \
 sun/jvm/hotspot/runtime/linux/*.java \
 sun/jvm/hotspot/runtime/linux_amd64/*.java \
+sun/jvm/hotspot/runtime/linux_aarch64/*.java \
 sun/jvm/hotspot/runtime/linux_ppc64/*.java \
 sun/jvm/hotspot/runtime/linux_sparc/*.java \
 sun/jvm/hotspot/runtime/linux_x86/*.java \
--- a/agent/src/os/linux/LinuxDebuggerLocal.c	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/os/linux/LinuxDebuggerLocal.c	Thu Jul 02 11:12:59 2015 +0100
@@ -53,6 +53,10 @@
 #include "sun_jvm_hotspot_debugger_ppc64_PPC64ThreadContext.h"
 #endif
 
+#ifdef aarch64
+#include "sun_jvm_hotspot_debugger_aarch64_AARCH64ThreadContext.h"
+#endif
+
 static jfieldID p_ps_prochandle_ID = 0;
 static jfieldID threadList_ID = 0;
 static jfieldID loadObjectList_ID = 0;
@@ -368,7 +372,7 @@
 #define NPRGREG sun_jvm_hotspot_debugger_amd64_AMD64ThreadContext_NPRGREG
 #endif
 #ifdef aarch64
-#define NPRGREG 32
+#define NPRGREG sun_jvm_hotspot_debugger_aarch64_AARCH64ThreadContext_NPRGREG
 #endif
 #if defined(sparc) || defined(sparcv9)
 #define NPRGREG sun_jvm_hotspot_debugger_sparc_SPARCThreadContext_NPRGREG
@@ -473,6 +477,13 @@
 
 #define REG_INDEX(reg) sun_jvm_hotspot_debugger_aarch64_AARCH64ThreadContext_##reg
 
+  {
+    int i;
+    for (i = 0; i < 31; i++)
+      regs[i] = gregs.regs[i];
+    regs[REG_INDEX(SP)] = gregs.sp;
+    regs[REG_INDEX(PC)] = gregs.pc;
+  }
 #endif /* aarch64 */
 
 #ifdef ppc64
--- a/agent/src/os/linux/Makefile	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/os/linux/Makefile	Thu Jul 02 11:12:59 2015 +0100
@@ -53,14 +53,15 @@
         $(JAVAH) -jni -classpath ../../../build/classes -d $(ARCH) \
 		sun.jvm.hotspot.debugger.x86.X86ThreadContext \
 		sun.jvm.hotspot.debugger.sparc.SPARCThreadContext \
-		sun.jvm.hotspot.debugger.amd64.AMD64ThreadContext 
+		sun.jvm.hotspot.debugger.amd64.AMD64ThreadContext \
+		sun.jvm.hotspot.debugger.aarch64.AARCH64ThreadContext 
         $(GCC) $(CFLAGS) $< -o $@
 
 $(ARCH)/sadis.o:  ../../share/native/sadis.c
         $(JAVAH) -jni -classpath ../../../build/classes -d $(ARCH) \
                 sun.jvm.hotspot.asm.Disassembler
         $(GCC) $(CFLAGS) $< -o $@
- 
+
 $(ARCH)/%.o: %.c
         $(GCC) $(CFLAGS) $< -o $@
 
--- a/agent/src/os/linux/libproc.h	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/os/linux/libproc.h	Thu Jul 02 11:12:59 2015 +0100
@@ -72,6 +72,7 @@
 #define user_regs_struct  pt_regs
 #endif
 #if defined(aarch64)
+#include <asm/ptrace.h>
 #define user_regs_struct user_pt_regs
 #endif
 
--- a/agent/src/os/linux/proc_service.h	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/os/linux/proc_service.h	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -30,7 +30,7 @@
 
 // Linux does not have the proc service library, though it does provide the
 // thread_db library which can be used to manipulate threads without having
-// to know the details of LinuxThreads or NPTL
+// to know the details of NPTL
 
 // copied from Solaris "proc_service.h"
 typedef enum {
--- a/agent/src/share/classes/sun/jvm/hotspot/HSDB.java	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/HSDB.java	Thu Jul 02 11:12:59 2015 +0100
@@ -983,19 +983,15 @@
                                                      curFrame.getFP(),
                                                      anno));
             } else {
-              if (VM.getVM().getCPU().equals("x86") || VM.getVM().getCPU().equals("amd64")) {
-                // For C2, which has null frame pointers on x86/amd64
-                CodeBlob cb = VM.getVM().getCodeCache().findBlob(curFrame.getPC());
-                Address sp = curFrame.getSP();
-                if (Assert.ASSERTS_ENABLED) {
-                  Assert.that(cb.getFrameSize() > 0, "CodeBlob must have non-zero frame size");
-                }
-                annoPanel.addAnnotation(new Annotation(sp,
-                                                       sp.addOffsetTo(cb.getFrameSize()),
-                                                       anno));
-              } else {
-                Assert.that(VM.getVM().getCPU().equals("ia64"), "only ia64 should reach here");
+              // For C2, which has null frame pointers on x86/amd64/aarch64
+              CodeBlob cb = VM.getVM().getCodeCache().findBlob(curFrame.getPC());
+              Address sp = curFrame.getSP();
+              if (Assert.ASSERTS_ENABLED) {
+                Assert.that(cb.getFrameSize() > 0, "CodeBlob must have non-zero frame size");
               }
+              annoPanel.addAnnotation(new Annotation(sp,
+                                                     sp.addOffsetTo(cb.getFrameSize()),
+                                                     anno));
             }
 
             // Add interpreter frame annotations
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/aarch64/AARCH64ThreadContext.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.aarch64;
+
+import java.lang.annotation.Native;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.cdbg.*;
+
+/** Specifies the thread context on aarch64 platforms; only a sub-portion
+ * of the context is guaranteed to be present on all operating
+ * systems. */
+
+public abstract class AARCH64ThreadContext implements ThreadContext {
+    // Taken from /usr/include/asm/sigcontext.h on Linux/AARCH64.
+
+    // NOTE: the indices for the various registers must be maintained as
+    // listed across various operating systems. However, only a small
+    // subset of the registers' values are guaranteed to be present (and
+    // must be present for the SA's stack walking to work)
+
+    // One instance of the Native annotation is enough to trigger header generation
+    // for this file.
+    @Native
+    public static final int R0 = 0;
+    public static final int R1 = 1;
+    public static final int R2 = 2;
+    public static final int R3 = 3;
+    public static final int R4 = 4;
+    public static final int R5 = 5;
+    public static final int R6 = 6;
+    public static final int R7 = 7;
+    public static final int R8 = 8;
+    public static final int R9 = 9;
+    public static final int R10 = 10;
+    public static final int R11 = 11;
+    public static final int R12 = 12;
+    public static final int R13 = 13;
+    public static final int R14 = 14;
+    public static final int R15 = 15;
+    public static final int R16 = 16;
+    public static final int R17 = 17;
+    public static final int R18 = 18;
+    public static final int R19 = 19;
+    public static final int R20 = 20;
+    public static final int R21 = 21;
+    public static final int R22 = 22;
+    public static final int R23 = 23;
+    public static final int R24 = 24;
+    public static final int R25 = 25;
+    public static final int R26 = 26;
+    public static final int R27 = 27;
+    public static final int R28 = 28;
+    public static final int FP = 29;
+    public static final int LR = 30;
+    public static final int SP = 31;
+    public static final int PC = 32;
+
+    public static final int NPRGREG = 33;
+
+    private long[] data;
+
+    public AARCH64ThreadContext() {
+        data = new long[NPRGREG];
+    }
+
+    public int getNumRegisters() {
+        return NPRGREG;
+    }
+
+    public String getRegisterName(int index) {
+        switch (index) {
+        case LR: return "lr";
+        case SP: return "sp";
+        case PC: return "pc";
+        default:
+            return "r" + index;
+        }
+    }
+
+    public void setRegister(int index, long value) {
+        data[index] = value;
+    }
+
+    public long getRegister(int index) {
+        return data[index];
+    }
+
+    public CFrame getTopFrame(Debugger dbg) {
+        return null;
+    }
+
+    /** This can't be implemented in this class since we would have to
+     * tie the implementation to, for example, the debugging system */
+    public abstract void setRegisterAsAddress(int index, Address value);
+
+    /** This can't be implemented in this class since we would have to
+     * tie the implementation to, for example, the debugging system */
+    public abstract Address getRegisterAsAddress(int index);
+}
--- a/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/LinuxCDebugger.java	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/LinuxCDebugger.java	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -31,12 +32,14 @@
 import sun.jvm.hotspot.debugger.cdbg.*;
 import sun.jvm.hotspot.debugger.x86.*;
 import sun.jvm.hotspot.debugger.amd64.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
 import sun.jvm.hotspot.debugger.sparc.*;
 import sun.jvm.hotspot.debugger.ppc64.*;
 import sun.jvm.hotspot.debugger.linux.x86.*;
 import sun.jvm.hotspot.debugger.linux.amd64.*;
 import sun.jvm.hotspot.debugger.linux.sparc.*;
 import sun.jvm.hotspot.debugger.linux.ppc64.*;
+import sun.jvm.hotspot.debugger.linux.aarch64.*;
 import sun.jvm.hotspot.utilities.*;
 
 class LinuxCDebugger implements CDebugger {
@@ -106,6 +109,13 @@
         Address pc  = context.getRegisterAsAddress(PPC64ThreadContext.PC);
         if (pc == null) return null;
         return new LinuxPPC64CFrame(dbg, sp, pc, LinuxDebuggerLocal.getAddressSize());
+    } else if (cpu.equals("aarch64")) {
+       AARCH64ThreadContext context = (AARCH64ThreadContext) thread.getContext();
+       Address fp = context.getRegisterAsAddress(AARCH64ThreadContext.FP);
+       if (fp == null) return null;
+       Address pc  = context.getRegisterAsAddress(AARCH64ThreadContext.PC);
+       if (pc == null) return null;
+       return new LinuxAARCH64CFrame(dbg, fp, pc);
      } else {
        // Runtime exception thrown by LinuxThreadContextFactory if unknown cpu
        ThreadContext context = (ThreadContext) thread.getContext();
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/aarch64/LinuxAARCH64CFrame.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.linux.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.linux.*;
+import sun.jvm.hotspot.debugger.cdbg.*;
+import sun.jvm.hotspot.debugger.cdbg.basic.*;
+
+final public class LinuxAARCH64CFrame extends BasicCFrame {
+   public LinuxAARCH64CFrame(LinuxDebugger dbg, Address fp, Address pc) {
+      super(dbg.getCDebugger());
+      this.fp = fp;
+      this.pc = pc;
+      this.dbg = dbg;
+   }
+
+   // override base class impl to avoid ELF parsing
+   public ClosestSymbol closestSymbolToPC() {
+      // try native lookup in debugger.
+      return dbg.lookup(dbg.getAddressValue(pc()));
+   }
+
+   public Address pc() {
+      return pc;
+   }
+
+   public Address localVariableBase() {
+      return fp;
+   }
+
+   public CFrame sender(ThreadProxy thread) {
+      AARCH64ThreadContext context = (AARCH64ThreadContext) thread.getContext();
+      Address rsp = context.getRegisterAsAddress(AARCH64ThreadContext.SP);
+
+      if ((fp == null) || fp.lessThan(rsp)) {
+        return null;
+      }
+
+      // Check alignment of fp
+      if (dbg.getAddressValue(fp) % (2 * ADDRESS_SIZE) != 0) {
+        return null;
+      }
+
+      Address nextFP = fp.getAddressAt(0 * ADDRESS_SIZE);
+      if (nextFP == null || nextFP.lessThanOrEqual(fp)) {
+        return null;
+      }
+      Address nextPC  = fp.getAddressAt(1 * ADDRESS_SIZE);
+      if (nextPC == null) {
+        return null;
+      }
+      return new LinuxAARCH64CFrame(dbg, nextFP, nextPC);
+   }
+
+   // package/class internals only
+   private static final int ADDRESS_SIZE = 8;
+   private Address pc;
+   private Address sp;
+   private Address fp;
+   private LinuxDebugger dbg;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/aarch64/LinuxAARCH64ThreadContext.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.linux.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.linux.*;
+
+public class LinuxAARCH64ThreadContext extends AARCH64ThreadContext {
+  private LinuxDebugger debugger;
+
+  public LinuxAARCH64ThreadContext(LinuxDebugger debugger) {
+    super();
+    this.debugger = debugger;
+  }
+
+  public void setRegisterAsAddress(int index, Address value) {
+    setRegister(index, debugger.getAddressValue(value));
+  }
+
+  public Address getRegisterAsAddress(int index) {
+    return debugger.newAddress(getRegister(index));
+  }
+}
--- a/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/ProcDebuggerLocal.java	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/ProcDebuggerLocal.java	Thu Jul 02 11:12:59 2015 +0100
@@ -31,11 +31,13 @@
 import sun.jvm.hotspot.debugger.*;
 import sun.jvm.hotspot.debugger.cdbg.*;
 import sun.jvm.hotspot.debugger.proc.amd64.*;
+import sun.jvm.hotspot.debugger.proc.aarch64.*;
 import sun.jvm.hotspot.debugger.proc.sparc.*;
 import sun.jvm.hotspot.debugger.proc.ppc64.*;
 import sun.jvm.hotspot.debugger.proc.x86.*;
 import sun.jvm.hotspot.debugger.ppc64.*;
 import sun.jvm.hotspot.debugger.amd64.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
 import sun.jvm.hotspot.debugger.sparc.*;
 import sun.jvm.hotspot.debugger.x86.*;
 import sun.jvm.hotspot.utilities.*;
@@ -88,6 +90,10 @@
             threadFactory = new ProcAMD64ThreadFactory(this);
             pcRegIndex = AMD64ThreadContext.RIP;
             fpRegIndex = AMD64ThreadContext.RBP;
+        } else if (cpu.equals("aarch64")) {
+            threadFactory = new ProcAARCH64ThreadFactory(this);
+            pcRegIndex = AARCH64ThreadContext.PC;
+            fpRegIndex = AARCH64ThreadContext.FP;
         } else if (cpu.equals("ppc64")) {
             threadFactory = new ProcPPC64ThreadFactory(this);
             pcRegIndex = PPC64ThreadContext.PC;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/aarch64/ProcAARCH64Thread.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.proc.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.proc.*;
+import sun.jvm.hotspot.utilities.*;
+
+public class ProcAARCH64Thread implements ThreadProxy {
+    private ProcDebugger debugger;
+    private int         id;
+
+    public ProcAARCH64Thread(ProcDebugger debugger, Address addr) {
+        this.debugger = debugger;
+
+        // FIXME: the size here should be configurable. However, making it
+        // so would produce a dependency on the "types" package from the
+        // debugger package, which is not desired.
+        this.id       = (int) addr.getCIntegerAt(0, 4, true);
+    }
+
+    public ProcAARCH64Thread(ProcDebugger debugger, long id) {
+        this.debugger = debugger;
+        this.id = (int) id;
+    }
+
+    public ThreadContext getContext() throws IllegalThreadStateException {
+        ProcAARCH64ThreadContext context = new ProcAARCH64ThreadContext(debugger);
+        long[] regs = debugger.getThreadIntegerRegisterSet(id);
+        if (Assert.ASSERTS_ENABLED) {
+            Assert.that(regs.length == AARCH64ThreadContext.NPRGREG, "size mismatch");
+        }
+        for (int i = 0; i < regs.length; i++) {
+            context.setRegister(i, regs[i]);
+        }
+        return context;
+    }
+
+    public boolean canSetContext() throws DebuggerException {
+        return false;
+    }
+
+    public void setContext(ThreadContext context)
+    throws IllegalThreadStateException, DebuggerException {
+        throw new DebuggerException("Unimplemented");
+    }
+
+    public String toString() {
+        return "t@" + id;
+    }
+
+    public boolean equals(Object obj) {
+        if ((obj == null) || !(obj instanceof ProcAARCH64Thread)) {
+            return false;
+        }
+
+        return (((ProcAARCH64Thread) obj).id == id);
+    }
+
+    public int hashCode() {
+        return id;
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/aarch64/ProcAARCH64ThreadContext.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.proc.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.proc.*;
+
+public class ProcAARCH64ThreadContext extends AARCH64ThreadContext {
+    private ProcDebugger debugger;
+
+    public ProcAARCH64ThreadContext(ProcDebugger debugger) {
+        super();
+        this.debugger = debugger;
+    }
+
+    public void setRegisterAsAddress(int index, Address value) {
+        setRegister(index, debugger.getAddressValue(value));
+    }
+
+    public Address getRegisterAsAddress(int index) {
+        return debugger.newAddress(getRegister(index));
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/aarch64/ProcAARCH64ThreadFactory.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.proc.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.proc.*;
+
+public class ProcAARCH64ThreadFactory implements ProcThreadFactory {
+    private ProcDebugger debugger;
+
+    public ProcAARCH64ThreadFactory(ProcDebugger debugger) {
+        this.debugger = debugger;
+    }
+
+    public ThreadProxy createThreadWrapper(Address threadIdentifierAddr) {
+        return new ProcAARCH64Thread(debugger, threadIdentifierAddr);
+    }
+
+    public ThreadProxy createThreadWrapper(long id) {
+        return new ProcAARCH64Thread(debugger, id);
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/aarch64/RemoteAARCH64Thread.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.remote.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.remote.*;
+import sun.jvm.hotspot.utilities.*;
+
+public class RemoteAARCH64Thread extends RemoteThread  {
+  public RemoteAARCH64Thread(RemoteDebuggerClient debugger, Address addr) {
+     super(debugger, addr);
+  }
+
+  public RemoteAARCH64Thread(RemoteDebuggerClient debugger, long id) {
+     super(debugger, id);
+  }
+
+  public ThreadContext getContext() throws IllegalThreadStateException {
+    RemoteAARCH64ThreadContext context = new RemoteAARCH64ThreadContext(debugger);
+    long[] regs = (addr != null)? debugger.getThreadIntegerRegisterSet(addr) :
+                                  debugger.getThreadIntegerRegisterSet(id);
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(regs.length == AARCH64ThreadContext.NPRGREG, "size of register set must match");
+    }
+    for (int i = 0; i < regs.length; i++) {
+      context.setRegister(i, regs[i]);
+    }
+    return context;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/aarch64/RemoteAARCH64ThreadContext.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.remote.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.remote.*;
+
+public class RemoteAARCH64ThreadContext extends AARCH64ThreadContext {
+  private RemoteDebuggerClient debugger;
+
+  public RemoteAARCH64ThreadContext(RemoteDebuggerClient debugger) {
+    super();
+    this.debugger = debugger;
+  }
+
+  public void setRegisterAsAddress(int index, Address value) {
+    setRegister(index, debugger.getAddressValue(value));
+  }
+
+  public Address getRegisterAsAddress(int index) {
+    return debugger.newAddress(getRegister(index));
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/aarch64/RemoteAARCH64ThreadFactory.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.remote.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.remote.*;
+
+public class RemoteAARCH64ThreadFactory implements RemoteThreadFactory {
+  private RemoteDebuggerClient debugger;
+
+  public RemoteAARCH64ThreadFactory(RemoteDebuggerClient debugger) {
+    this.debugger = debugger;
+  }
+
+  public ThreadProxy createThreadWrapper(Address threadIdentifierAddr) {
+    return new RemoteAARCH64Thread(debugger, threadIdentifierAddr);
+  }
+
+  public ThreadProxy createThreadWrapper(long id) {
+    return new RemoteAARCH64Thread(debugger, id);
+  }
+}
--- a/agent/src/share/classes/sun/jvm/hotspot/gc/shared/Generation.java	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/gc/shared/Generation.java	Thu Jul 02 11:12:59 2015 +0100
@@ -49,7 +49,6 @@
 public abstract class Generation extends VMObject {
   private static long          reservedFieldOffset;
   private static long          virtualSpaceFieldOffset;
-  private static CIntegerField levelField;
   protected static final int  K = 1024;
   // Fields for class StatRecord
   private static Field         statRecordField;
@@ -75,7 +74,6 @@
 
     reservedFieldOffset     = type.getField("_reserved").getOffset();
     virtualSpaceFieldOffset = type.getField("_virtual_space").getOffset();
-    levelField              = type.getCIntegerField("_level");
     // StatRecord
     statRecordField         = type.getField("_stat_record");
     type                    = db.lookupType("Generation::StatRecord");
@@ -130,14 +128,6 @@
      }
   }
 
-  public GenerationSpec spec() {
-    return ((GenCollectedHeap) VM.getVM().getUniverse().heap()).spec(level());
-  }
-
-  public int level() {
-    return (int) levelField.getValue(addr);
-  }
-
   public int invocations() {
     return getStatRecord().getInvocations();
   }
--- a/agent/src/share/classes/sun/jvm/hotspot/runtime/Frame.java	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/Frame.java	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -357,12 +357,6 @@
   // FIXME: avoiding implementing this for now if possible
   //  public void interpreter_frame_set_monitor_end(BasicObjectLock* value);
   //  public void interpreter_frame_verify_monitor(BasicObjectLock* value) const;
-  //
-  // Tells whether the current interpreter_frame frame pointer
-  // corresponds to the old compiled/deoptimized fp
-  // The receiver used to be a top level frame
-  // public boolean interpreter_frame_equals_unpacked_fp(intptr_t* fp);
-
   //--------------------------------------------------------------------------------
   // Method and constant pool cache:
   //
--- a/agent/src/share/classes/sun/jvm/hotspot/runtime/Threads.java	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/Threads.java	Thu Jul 02 11:12:59 2015 +0100
@@ -35,6 +35,7 @@
 import sun.jvm.hotspot.runtime.win32_x86.Win32X86JavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.linux_x86.LinuxX86JavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.linux_amd64.LinuxAMD64JavaThreadPDAccess;
+import sun.jvm.hotspot.runtime.linux_aarch64.LinuxAARCH64JavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.linux_ppc64.LinuxPPC64JavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.linux_sparc.LinuxSPARCJavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.bsd_x86.BsdX86JavaThreadPDAccess;
@@ -91,6 +92,8 @@
                 access = new LinuxSPARCJavaThreadPDAccess();
             } else if (cpu.equals("ppc64")) {
                 access = new LinuxPPC64JavaThreadPDAccess();
+            } else if (cpu.equals("aarch64")) {
+                access = new LinuxAARCH64JavaThreadPDAccess();
             } else {
               try {
                 access = (JavaThreadPDAccess)
--- a/agent/src/share/classes/sun/jvm/hotspot/runtime/VM.java	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/VM.java	Thu Jul 02 11:12:59 2015 +0100
@@ -121,6 +121,8 @@
   private Flag[] commandLineFlags;
   private Map flagsMap;
 
+  private static Type intType;
+  private static Type uintType;
   private static Type intxType;
   private static Type uintxType;
   private static Type sizetType;
@@ -170,6 +172,28 @@
         return addr.getCIntegerAt(0, boolType.getSize(), boolType.isUnsigned()) != 0;
      }
 
+     public boolean isInt() {
+        return type.equals("int");
+     }
+
+     public long getInt() {
+        if (Assert.ASSERTS_ENABLED) {
+           Assert.that(isInt(), "not an int flag!");
+        }
+        return addr.getCIntegerAt(0, intType.getSize(), false);
+     }
+
+     public boolean isUInt() {
+        return type.equals("uint");
+     }
+
+     public long getUInt() {
+        if (Assert.ASSERTS_ENABLED) {
+           Assert.that(isUInt(), "not a uint flag!");
+        }
+        return addr.getCIntegerAt(0, uintType.getSize(), false);
+     }
+
      public boolean isIntx() {
         return type.equals("intx");
      }
@@ -206,6 +230,10 @@
      public String getValue() {
         if (isBool()) {
            return new Boolean(getBool()).toString();
+        } else if (isInt()) {
+           return new Long(getInt()).toString();
+        } else if (isUInt()) {
+           return new Long(getUInt()).toString();
         } else if (isIntx()) {
            return new Long(getIntx()).toString();
         } else if (isUIntx()) {
@@ -334,6 +362,8 @@
     heapWordSize = db.lookupIntConstant("HeapWordSize").intValue();
     oopSize  = db.lookupIntConstant("oopSize").intValue();
 
+    intType = db.lookupType("int");
+    uintType = db.lookupType("uint");
     intxType = db.lookupType("intx");
     uintxType = db.lookupType("uintx");
     sizetType = db.lookupType("size_t");
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/aarch64/AARCH64CurrentFrameGuess.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2003, 2006, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.code.*;
+import sun.jvm.hotspot.interpreter.*;
+import sun.jvm.hotspot.runtime.*;
+import sun.jvm.hotspot.runtime.aarch64.*;
+
+/** <P> Should be able to be used on all aarch64 platforms we support
+    (Linux/aarch64) to implement JavaThread's "currentFrameGuess()"
+    functionality. Input is an AARCH64ThreadContext; output is SP, FP,
+    and PC for an AARCH64Frame. Instantiation of the AARCH64Frame is
+    left to the caller, since we may need to subclass AARCH64Frame to
+    support signal handler frames on Unix platforms. </P>
+
+    <P> Algorithm is to walk up the stack within a given range (say,
+    512K at most) looking for a plausible PC and SP for a Java frame,
+    also considering those coming in from the context. If we find a PC
+    that belongs to the VM (i.e., in generated code like the
+    interpreter or CodeCache) then we try to find an associated FP.
+    We repeat this until we either find a complete frame or run out of
+    stack to look at. </P> */
+
+public class AARCH64CurrentFrameGuess {
+  private AARCH64ThreadContext context;
+  private JavaThread       thread;
+  private Address          spFound;
+  private Address          fpFound;
+  private Address          pcFound;
+
+  private static final boolean DEBUG = System.getProperty("sun.jvm.hotspot.runtime.aarch64.AARCH64Frame.DEBUG")
+                                       != null;
+
+  public AARCH64CurrentFrameGuess(AARCH64ThreadContext context,
+                              JavaThread thread) {
+    this.context = context;
+    this.thread  = thread;
+  }
+
+  /** Returns false if not able to find a frame within a reasonable range. */
+  public boolean run(long regionInBytesToSearch) {
+    Address sp  = context.getRegisterAsAddress(AARCH64ThreadContext.SP);
+    Address pc  = context.getRegisterAsAddress(AARCH64ThreadContext.PC);
+    Address fp  = context.getRegisterAsAddress(AARCH64ThreadContext.FP);
+    if (sp == null) {
+      // Bail out if no last java frame either
+      if (thread.getLastJavaSP() != null) {
+        setValues(thread.getLastJavaSP(), thread.getLastJavaFP(), null);
+        return true;
+      }
+      return false;
+    }
+    Address end = sp.addOffsetTo(regionInBytesToSearch);
+    VM vm       = VM.getVM();
+
+    setValues(null, null, null); // Assume we're not going to find anything
+
+    if (vm.isJavaPCDbg(pc)) {
+      if (vm.isClientCompiler()) {
+        // If the topmost frame is a Java frame, we are (pretty much)
+        // guaranteed to have a viable FP. We should be more robust
+        // than this (we have the potential for losing entire threads'
+        // stack traces) but need to see how much work we really have
+        // to do here. Searching the stack for an (SP, FP) pair is
+        // hard since it's easy to misinterpret inter-frame stack
+        // pointers as base-of-frame pointers; we also don't know the
+        // sizes of C1 frames (not registered in the nmethod) so can't
+        // derive them from SP.
+
+        setValues(sp, fp, pc);
+        return true;
+      } else {
+        if (vm.getInterpreter().contains(pc)) {
+          if (DEBUG) {
+            System.out.println("CurrentFrameGuess: choosing interpreter frame: sp = " +
+                               sp + ", fp = " + fp + ", pc = " + pc);
+          }
+          setValues(sp, fp, pc);
+          return true;
+        }
+
+        // For the server compiler, FP is not guaranteed to be valid
+        // for compiled code. In addition, an earlier attempt at a
+        // non-searching algorithm (see below) failed because the
+        // stack pointer from the thread context was pointing
+        // (considerably) beyond the ostensible end of the stack, into
+        // garbage; walking from the topmost frame back caused a crash.
+        //
+        // This algorithm takes the current PC as a given and tries to
+        // find the correct corresponding SP by walking up the stack
+        // and repeatedly performing stackwalks (very inefficient).
+        //
+        // FIXME: there is something wrong with stackwalking across
+        // adapter frames...this is likely to be the root cause of the
+        // failure with the simpler algorithm below.
+
+        for (long offset = 0;
+             offset < regionInBytesToSearch;
+             offset += vm.getAddressSize()) {
+          try {
+            Address curSP = sp.addOffsetTo(offset);
+            Frame frame = new AARCH64Frame(curSP, null, pc);
+            RegisterMap map = thread.newRegisterMap(false);
+            while (frame != null) {
+              if (frame.isEntryFrame() && frame.entryFrameIsFirst()) {
+                // We were able to traverse all the way to the
+                // bottommost Java frame.
+                // This sp looks good. Keep it.
+                if (DEBUG) {
+                  System.out.println("CurrentFrameGuess: Choosing sp = " + curSP + ", pc = " + pc);
+                }
+                setValues(curSP, null, pc);
+                return true;
+              }
+              frame = frame.sender(map);
+            }
+          } catch (Exception e) {
+            if (DEBUG) {
+              System.out.println("CurrentFrameGuess: Exception " + e + " at offset " + offset);
+            }
+            // Bad SP. Try another.
+          }
+        }
+
+        // Were not able to find a plausible SP to go with this PC.
+        // Bail out.
+        return false;
+
+        /*
+        // Original algorithm which does not work because SP was
+        // pointing beyond where it should have:
+
+        // For the server compiler, FP is not guaranteed to be valid
+        // for compiled code. We see whether the PC is in the
+        // interpreter and take care of that, otherwise we run code
+        // (unfortunately) duplicated from AARCH64Frame.senderForCompiledFrame.
+
+        CodeCache cc = vm.getCodeCache();
+        if (cc.contains(pc)) {
+          CodeBlob cb = cc.findBlob(pc);
+
+          // See if we can derive a frame pointer from SP and PC
+          // NOTE: This is the code duplicated from AARCH64Frame
+          Address saved_fp = null;
+          int llink_offset = cb.getLinkOffset();
+          if (llink_offset >= 0) {
+            // Restore base-pointer, since next frame might be an interpreter frame.
+            Address fp_addr = sp.addOffsetTo(VM.getVM().getAddressSize() * llink_offset);
+            saved_fp = fp_addr.getAddressAt(0);
+          }
+
+          setValues(sp, saved_fp, pc);
+          return true;
+        }
+        */
+      }
+    } else {
+      // If the current program counter was not known to us as a Java
+      // PC, we currently assume that we are in the run-time system
+      // and attempt to look to thread-local storage for saved SP and
+      // FP. Note that if these are null (because we were, in fact,
+      // in Java code, i.e., vtable stubs or similar, and the SA
+      // didn't have enough insight into the target VM to understand
+      // that) then we are going to lose the entire stack trace for
+      // the thread, which is sub-optimal. FIXME.
+
+      if (DEBUG) {
+        System.out.println("CurrentFrameGuess: choosing last Java frame: sp = " +
+                           thread.getLastJavaSP() + ", fp = " + thread.getLastJavaFP());
+      }
+      if (thread.getLastJavaSP() == null) {
+        return false; // No known Java frames on stack
+      }
+
+      // The runtime has a nasty habit of not saving fp in the frame
+      // anchor, leaving us to grovel about in the stack to find a
+      // plausible address.  Fortunately, this only happens in
+      // compiled code; there we always have a valid PC, and we always
+      // push LR and FP onto the stack as a pair, with FP at the lower
+      // address.
+      pc = thread.getLastJavaPC();
+      fp = thread.getLastJavaFP();
+      sp = thread.getLastJavaSP();
+
+      if (fp == null) {
+        CodeCache cc = vm.getCodeCache();
+        if (cc.contains(pc)) {
+          CodeBlob cb = cc.findBlob(pc);
+          if (DEBUG) {
+            System.out.println("FP is null.  Found blob frame size " + cb.getFrameSize());
+          }
+          // See if we can derive a frame pointer from SP and PC
+          long link_offset = cb.getFrameSize() - 2 * VM.getVM().getAddressSize();
+          if (link_offset >= 0) {
+            fp = sp.addOffsetTo(link_offset);
+          }
+        }
+      }
+
+      setValues(sp, fp, null);
+
+      return true;
+    }
+  }
+
+  public Address getSP() { return spFound; }
+  public Address getFP() { return fpFound; }
+  /** May be null if getting values from thread-local storage; take
+      care to call the correct AARCH64Frame constructor to recover this if
+      necessary */
+  public Address getPC() { return pcFound; }
+
+  private void setValues(Address sp, Address fp, Address pc) {
+    spFound = sp;
+    fpFound = fp;
+    pcFound = pc;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/aarch64/AARCH64Frame.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,555 @@
+/*
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.aarch64;
+
+import java.util.*;
+import sun.jvm.hotspot.code.*;
+import sun.jvm.hotspot.compiler.*;
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.oops.*;
+import sun.jvm.hotspot.runtime.*;
+import sun.jvm.hotspot.types.*;
+import sun.jvm.hotspot.utilities.*;
+
+/** Specialization of and implementation of abstract methods of the
+    Frame class for the aarch64 family of CPUs. */
+
+public class AARCH64Frame extends Frame {
+  private static final boolean DEBUG;
+  static {
+    DEBUG = System.getProperty("sun.jvm.hotspot.runtime.aarch64.AARCH64Frame.DEBUG") != null;
+  }
+
+  // All frames
+  private static final int LINK_OFFSET                =  0;
+  private static final int RETURN_ADDR_OFFSET         =  1;
+  private static final int SENDER_SP_OFFSET           =  2;
+
+  // Interpreter frames
+  private static final int INTERPRETER_FRAME_MIRROR_OFFSET    =  2; // for native calls only
+  private static final int INTERPRETER_FRAME_SENDER_SP_OFFSET = -1;
+  private static final int INTERPRETER_FRAME_LAST_SP_OFFSET   = INTERPRETER_FRAME_SENDER_SP_OFFSET - 1;
+  private static final int INTERPRETER_FRAME_METHOD_OFFSET    = INTERPRETER_FRAME_LAST_SP_OFFSET - 1;
+  private static       int INTERPRETER_FRAME_MDX_OFFSET;         // Non-core builds only
+  private static       int INTERPRETER_FRAME_CACHE_OFFSET;
+  private static       int INTERPRETER_FRAME_LOCALS_OFFSET;
+  private static       int INTERPRETER_FRAME_BCX_OFFSET;
+  private static       int INTERPRETER_FRAME_INITIAL_SP_OFFSET;
+  private static       int INTERPRETER_FRAME_MONITOR_BLOCK_TOP_OFFSET;
+  private static       int INTERPRETER_FRAME_MONITOR_BLOCK_BOTTOM_OFFSET;
+
+  // Entry frames
+  private static       int ENTRY_FRAME_CALL_WRAPPER_OFFSET = -8;
+
+  // Native frames
+  private static final int NATIVE_FRAME_INITIAL_PARAM_OFFSET =  2;
+
+  private static VMReg fp = new VMReg(29);
+
+  static {
+    VM.registerVMInitializedObserver(new Observer() {
+        public void update(Observable o, Object data) {
+          initialize(VM.getVM().getTypeDataBase());
+        }
+      });
+  }
+
+  private static synchronized void initialize(TypeDataBase db) {
+    INTERPRETER_FRAME_MDX_OFFSET                  = INTERPRETER_FRAME_METHOD_OFFSET - 1;
+    INTERPRETER_FRAME_CACHE_OFFSET                = INTERPRETER_FRAME_MDX_OFFSET - 1;
+    INTERPRETER_FRAME_LOCALS_OFFSET               = INTERPRETER_FRAME_CACHE_OFFSET - 1;
+    INTERPRETER_FRAME_BCX_OFFSET                  = INTERPRETER_FRAME_LOCALS_OFFSET - 1;
+    INTERPRETER_FRAME_INITIAL_SP_OFFSET           = INTERPRETER_FRAME_BCX_OFFSET - 1;
+    INTERPRETER_FRAME_MONITOR_BLOCK_TOP_OFFSET    = INTERPRETER_FRAME_INITIAL_SP_OFFSET;
+    INTERPRETER_FRAME_MONITOR_BLOCK_BOTTOM_OFFSET = INTERPRETER_FRAME_INITIAL_SP_OFFSET;
+  }
+
+
+  // an additional field beyond sp and pc:
+  Address raw_fp; // frame pointer
+  private Address raw_unextendedSP;
+
+  private AARCH64Frame() {
+  }
+
+  private void adjustForDeopt() {
+    if ( pc != null) {
+      // Look for a deopt pc and if it is deopted convert to original pc
+      CodeBlob cb = VM.getVM().getCodeCache().findBlob(pc);
+      if (cb != null && cb.isJavaMethod()) {
+        NMethod nm = (NMethod) cb;
+        if (pc.equals(nm.deoptHandlerBegin())) {
+          if (Assert.ASSERTS_ENABLED) {
+            Assert.that(this.getUnextendedSP() != null, "null SP in Java frame");
+          }
+          // adjust pc if frame is deoptimized.
+          pc = this.getUnextendedSP().getAddressAt(nm.origPCOffset());
+          deoptimized = true;
+        }
+      }
+    }
+  }
+
+  public AARCH64Frame(Address raw_sp, Address raw_fp, Address pc) {
+    this.raw_sp = raw_sp;
+    this.raw_unextendedSP = raw_sp;
+    this.raw_fp = raw_fp;
+    this.pc = pc;
+    adjustUnextendedSP();
+
+    // Frame must be fully constructed before this call
+    adjustForDeopt();
+
+    if (DEBUG) {
+      System.out.println("AARCH64Frame(sp, fp, pc): " + this);
+      dumpStack();
+    }
+  }
+
+  public AARCH64Frame(Address raw_sp, Address raw_fp) {
+    this.raw_sp = raw_sp;
+    this.raw_unextendedSP = raw_sp;
+    this.raw_fp = raw_fp;
+    this.pc = raw_sp.getAddressAt(-1 * VM.getVM().getAddressSize());
+    adjustUnextendedSP();
+
+    // Frame must be fully constructed before this call
+    adjustForDeopt();
+
+    if (DEBUG) {
+      System.out.println("AARCH64Frame(sp, fp): " + this);
+      dumpStack();
+    }
+  }
+
+  public AARCH64Frame(Address raw_sp, Address raw_unextendedSp, Address raw_fp, Address pc) {
+    this.raw_sp = raw_sp;
+    this.raw_unextendedSP = raw_unextendedSp;
+    this.raw_fp = raw_fp;
+    this.pc = pc;
+    adjustUnextendedSP();
+
+    // Frame must be fully constructed before this call
+    adjustForDeopt();
+
+    if (DEBUG) {
+      System.out.println("AARCH64Frame(sp, unextendedSP, fp, pc): " + this);
+      dumpStack();
+    }
+
+  }
+
+  public Object clone() {
+    AARCH64Frame frame = new AARCH64Frame();
+    frame.raw_sp = raw_sp;
+    frame.raw_unextendedSP = raw_unextendedSP;
+    frame.raw_fp = raw_fp;
+    frame.pc = pc;
+    frame.deoptimized = deoptimized;
+    return frame;
+  }
+
+  public boolean equals(Object arg) {
+    if (arg == null) {
+      return false;
+    }
+
+    if (!(arg instanceof AARCH64Frame)) {
+      return false;
+    }
+
+    AARCH64Frame other = (AARCH64Frame) arg;
+
+    return (AddressOps.equal(getSP(), other.getSP()) &&
+            AddressOps.equal(getUnextendedSP(), other.getUnextendedSP()) &&
+            AddressOps.equal(getFP(), other.getFP()) &&
+            AddressOps.equal(getPC(), other.getPC()));
+  }
+
+  public int hashCode() {
+    if (raw_sp == null) {
+      return 0;
+    }
+
+    return raw_sp.hashCode();
+  }
+
+  public String toString() {
+    return "sp: " + (getSP() == null? "null" : getSP().toString()) +
+         ", unextendedSP: " + (getUnextendedSP() == null? "null" : getUnextendedSP().toString()) +
+         ", fp: " + (getFP() == null? "null" : getFP().toString()) +
+         ", pc: " + (pc == null? "null" : pc.toString());
+  }
+
+  // accessors for the instance variables
+  public Address getFP() { return raw_fp; }
+  public Address getSP() { return raw_sp; }
+  public Address getID() { return raw_sp; }
+
+  // FIXME: not implemented yet
+  public boolean isSignalHandlerFrameDbg() { return false; }
+  public int     getSignalNumberDbg()      { return 0;     }
+  public String  getSignalNameDbg()        { return null;  }
+
+  public boolean isInterpretedFrameValid() {
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(isInterpretedFrame(), "Not an interpreted frame");
+    }
+
+    // These are reasonable sanity checks
+    if (getFP() == null || getFP().andWithMask(0x3) != null) {
+      return false;
+    }
+
+    if (getSP() == null || getSP().andWithMask(0x3) != null) {
+      return false;
+    }
+
+    if (getFP().addOffsetTo(INTERPRETER_FRAME_INITIAL_SP_OFFSET * VM.getVM().getAddressSize()).lessThan(getSP())) {
+      return false;
+    }
+
+    // These are hacks to keep us out of trouble.
+    // The problem with these is that they mask other problems
+    if (getFP().lessThanOrEqual(getSP())) {
+      // this attempts to deal with unsigned comparison above
+      return false;
+    }
+
+    if (getFP().minus(getSP()) > 4096 * VM.getVM().getAddressSize()) {
+      // stack frames shouldn't be large.
+      return false;
+    }
+
+    return true;
+  }
+
+  // FIXME: not applicable in current system
+  //  void    patch_pc(Thread* thread, address pc);
+
+  public Frame sender(RegisterMap regMap, CodeBlob cb) {
+    AARCH64RegisterMap map = (AARCH64RegisterMap) regMap;
+
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(map != null, "map must be set");
+    }
+
+    // Default is we done have to follow them. The sender_for_xxx will
+    // update it accordingly
+    map.setIncludeArgumentOops(false);
+
+    if (isEntryFrame())       return senderForEntryFrame(map);
+    if (isInterpretedFrame()) return senderForInterpreterFrame(map);
+
+    if(cb == null) {
+      cb = VM.getVM().getCodeCache().findBlob(getPC());
+    } else {
+      if (Assert.ASSERTS_ENABLED) {
+        Assert.that(cb.equals(VM.getVM().getCodeCache().findBlob(getPC())), "Must be the same");
+      }
+    }
+
+    if (cb != null) {
+      return senderForCompiledFrame(map, cb);
+    }
+
+    // Must be native-compiled frame, i.e. the marshaling code for native
+    // methods that exists in the core system.
+    return new AARCH64Frame(getSenderSP(), getLink(), getSenderPC());
+  }
+
+  private Frame senderForEntryFrame(AARCH64RegisterMap map) {
+    if (DEBUG) {
+      System.out.println("senderForEntryFrame");
+    }
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(map != null, "map must be set");
+    }
+    // Java frame called from C; skip all C frames and return top C
+    // frame of that chunk as the sender
+    AARCH64JavaCallWrapper jcw = (AARCH64JavaCallWrapper) getEntryFrameCallWrapper();
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(!entryFrameIsFirst(), "next Java fp must be non zero");
+      Assert.that(jcw.getLastJavaSP().greaterThan(getSP()), "must be above this frame on stack");
+    }
+    AARCH64Frame fr;
+    if (jcw.getLastJavaPC() != null) {
+      fr = new AARCH64Frame(jcw.getLastJavaSP(), jcw.getLastJavaFP(), jcw.getLastJavaPC());
+    } else {
+      fr = new AARCH64Frame(jcw.getLastJavaSP(), jcw.getLastJavaFP());
+    }
+    map.clear();
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(map.getIncludeArgumentOops(), "should be set by clear");
+    }
+    return fr;
+  }
+
+  //------------------------------------------------------------------------------
+  // frame::adjust_unextended_sp
+  private void adjustUnextendedSP() {
+    // If we are returning to a compiled MethodHandle call site, the
+    // saved_fp will in fact be a saved value of the unextended SP.  The
+    // simplest way to tell whether we are returning to such a call site
+    // is as follows:
+
+    CodeBlob cb = cb();
+    NMethod senderNm = (cb == null) ? null : cb.asNMethodOrNull();
+    if (senderNm != null) {
+      // If the sender PC is a deoptimization point, get the original
+      // PC.  For MethodHandle call site the unextended_sp is stored in
+      // saved_fp.
+      if (senderNm.isDeoptMhEntry(getPC())) {
+        // DEBUG_ONLY(verifyDeoptMhOriginalPc(senderNm, getFP()));
+        raw_unextendedSP = getFP();
+      }
+      else if (senderNm.isDeoptEntry(getPC())) {
+        // DEBUG_ONLY(verifyDeoptOriginalPc(senderNm, raw_unextendedSp));
+      }
+      else if (senderNm.isMethodHandleReturn(getPC())) {
+        raw_unextendedSP = getFP();
+      }
+    }
+  }
+
+  private Frame senderForInterpreterFrame(AARCH64RegisterMap map) {
+    if (DEBUG) {
+      System.out.println("senderForInterpreterFrame");
+    }
+    Address unextendedSP = addressOfStackSlot(INTERPRETER_FRAME_SENDER_SP_OFFSET).getAddressAt(0);
+    Address sp = addressOfStackSlot(SENDER_SP_OFFSET);
+    // We do not need to update the callee-save register mapping because above
+    // us is either another interpreter frame or a converter-frame, but never
+    // directly a compiled frame.
+    // 11/24/04 SFG. With the removal of adapter frames this is no longer true.
+    // However c2 no longer uses callee save register for java calls so there
+    // are no callee register to find.
+
+    if (map.getUpdateMap())
+      updateMapWithSavedLink(map, addressOfStackSlot(LINK_OFFSET));
+
+    return new AARCH64Frame(sp, unextendedSP, getLink(), getSenderPC());
+  }
+
+  private void updateMapWithSavedLink(RegisterMap map, Address savedFPAddr) {
+    map.setLocation(fp, savedFPAddr);
+  }
+
+  private Frame senderForCompiledFrame(AARCH64RegisterMap map, CodeBlob cb) {
+    if (DEBUG) {
+      System.out.println("senderForCompiledFrame");
+    }
+
+    //
+    // NOTE: some of this code is (unfortunately) duplicated  AARCH64CurrentFrameGuess
+    //
+
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(map != null, "map must be set");
+    }
+
+    // frame owned by optimizing compiler
+    if (Assert.ASSERTS_ENABLED) {
+        Assert.that(cb.getFrameSize() >= 0, "must have non-zero frame size");
+    }
+    Address senderSP = getUnextendedSP().addOffsetTo(cb.getFrameSize());
+
+    // The return_address is always the word on the stack
+    Address senderPC = senderSP.getAddressAt(-1 * VM.getVM().getAddressSize());
+
+    // This is the saved value of FP which may or may not really be an FP.
+    // It is only an FP if the sender is an interpreter frame.
+    Address savedFPAddr = senderSP.addOffsetTo(- SENDER_SP_OFFSET * VM.getVM().getAddressSize());
+
+    if (map.getUpdateMap()) {
+      // Tell GC to use argument oopmaps for some runtime stubs that need it.
+      // For C1, the runtime stub might not have oop maps, so set this flag
+      // outside of update_register_map.
+      map.setIncludeArgumentOops(cb.callerMustGCArguments());
+
+      if (cb.getOopMaps() != null) {
+        ImmutableOopMapSet.updateRegisterMap(this, cb, map, true);
+      }
+
+      // Since the prolog does the save and restore of FP there is no oopmap
+      // for it so we must fill in its location as if there was an oopmap entry
+      // since if our caller was compiled code there could be live jvm state in it.
+      updateMapWithSavedLink(map, savedFPAddr);
+    }
+
+    return new AARCH64Frame(senderSP, savedFPAddr.getAddressAt(0), senderPC);
+  }
+
+  protected boolean hasSenderPD() {
+    return true;
+  }
+
+  public long frameSize() {
+    return (getSenderSP().minus(getSP()) / VM.getVM().getAddressSize());
+  }
+
+    public Address getLink() {
+        try {
+            if (DEBUG) {
+                System.out.println("Reading link at " + addressOfStackSlot(LINK_OFFSET)
+                        + " = " + addressOfStackSlot(LINK_OFFSET).getAddressAt(0));
+            }
+            return addressOfStackSlot(LINK_OFFSET).getAddressAt(0);
+        } catch (Exception e) {
+            if (DEBUG)
+                System.out.println("Returning null");
+            return null;
+        }
+    }
+
+  // FIXME: not implementable yet
+  //inline void      frame::set_link(intptr_t* addr)  { *(intptr_t **)addr_at(link_offset) = addr; }
+
+  public Address getUnextendedSP() { return raw_unextendedSP; }
+
+  // Return address:
+  public Address getSenderPCAddr() { return addressOfStackSlot(RETURN_ADDR_OFFSET); }
+  public Address getSenderPC()     { return getSenderPCAddr().getAddressAt(0);      }
+
+  // return address of param, zero origin index.
+  public Address getNativeParamAddr(int idx) {
+    return addressOfStackSlot(NATIVE_FRAME_INITIAL_PARAM_OFFSET + idx);
+  }
+
+  public Address getSenderSP()     { return addressOfStackSlot(SENDER_SP_OFFSET); }
+
+  public Address addressOfInterpreterFrameLocals() {
+    return addressOfStackSlot(INTERPRETER_FRAME_LOCALS_OFFSET);
+  }
+
+  private Address addressOfInterpreterFrameBCX() {
+    return addressOfStackSlot(INTERPRETER_FRAME_BCX_OFFSET);
+  }
+
+  public int getInterpreterFrameBCI() {
+    // FIXME: this is not atomic with respect to GC and is unsuitable
+    // for use in a non-debugging, or reflective, system. Need to
+    // figure out how to express this.
+    Address bcp = addressOfInterpreterFrameBCX().getAddressAt(0);
+    Address methodHandle = addressOfInterpreterFrameMethod().getAddressAt(0);
+    Method method = (Method)Metadata.instantiateWrapperFor(methodHandle);
+    return bcpToBci(bcp, method);
+  }
+
+  public Address addressOfInterpreterFrameMDX() {
+    return addressOfStackSlot(INTERPRETER_FRAME_MDX_OFFSET);
+  }
+
+  // FIXME
+  //inline int frame::interpreter_frame_monitor_size() {
+  //  return BasicObjectLock::size();
+  //}
+
+  // expression stack
+  // (the max_stack arguments are used by the GC; see class FrameClosure)
+
+  public Address addressOfInterpreterFrameExpressionStack() {
+    Address monitorEnd = interpreterFrameMonitorEnd().address();
+    return monitorEnd.addOffsetTo(-1 * VM.getVM().getAddressSize());
+  }
+
+  public int getInterpreterFrameExpressionStackDirection() { return -1; }
+
+  // top of expression stack
+  public Address addressOfInterpreterFrameTOS() {
+    return getSP();
+  }
+
+  /** Expression stack from top down */
+  public Address addressOfInterpreterFrameTOSAt(int slot) {
+    return addressOfInterpreterFrameTOS().addOffsetTo(slot * VM.getVM().getAddressSize());
+  }
+
+  public Address getInterpreterFrameSenderSP() {
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(isInterpretedFrame(), "interpreted frame expected");
+    }
+    return addressOfStackSlot(INTERPRETER_FRAME_SENDER_SP_OFFSET).getAddressAt(0);
+  }
+
+  // Monitors
+  public BasicObjectLock interpreterFrameMonitorBegin() {
+    return new BasicObjectLock(addressOfStackSlot(INTERPRETER_FRAME_MONITOR_BLOCK_BOTTOM_OFFSET));
+  }
+
+  public BasicObjectLock interpreterFrameMonitorEnd() {
+    Address result = addressOfStackSlot(INTERPRETER_FRAME_MONITOR_BLOCK_TOP_OFFSET).getAddressAt(0);
+    if (Assert.ASSERTS_ENABLED) {
+      // make sure the pointer points inside the frame
+      Assert.that(AddressOps.gt(getFP(), result), "result must <  than frame pointer");
+      Assert.that(AddressOps.lte(getSP(), result), "result must >= than stack pointer");
+    }
+    return new BasicObjectLock(result);
+  }
+
+  public int interpreterFrameMonitorSize() {
+    return BasicObjectLock.size();
+  }
+
+  // Method
+  public Address addressOfInterpreterFrameMethod() {
+    return addressOfStackSlot(INTERPRETER_FRAME_METHOD_OFFSET);
+  }
+
+  // Constant pool cache
+  public Address addressOfInterpreterFrameCPCache() {
+    return addressOfStackSlot(INTERPRETER_FRAME_CACHE_OFFSET);
+  }
+
+  // Entry frames
+  public JavaCallWrapper getEntryFrameCallWrapper() {
+    return new AARCH64JavaCallWrapper(addressOfStackSlot(ENTRY_FRAME_CALL_WRAPPER_OFFSET).getAddressAt(0));
+  }
+
+  protected Address addressOfSavedOopResult() {
+    // offset is 2 for compiler2 and 3 for compiler1
+    return getSP().addOffsetTo((VM.getVM().isClientCompiler() ? 2 : 3) *
+                               VM.getVM().getAddressSize());
+  }
+
+  protected Address addressOfSavedReceiver() {
+    return getSP().addOffsetTo(-4 * VM.getVM().getAddressSize());
+  }
+
+  private void dumpStack() {
+    for (Address addr = getSP().addOffsetTo(-4 * VM.getVM().getAddressSize());
+         AddressOps.lt(addr, getSP());
+         addr = addr.addOffsetTo(VM.getVM().getAddressSize())) {
+      System.out.println(addr + ": " + addr.getAddressAt(0));
+    }
+    System.out.println("-----------------------");
+    for (Address addr = getSP();
+         AddressOps.lte(addr, getSP().addOffsetTo(20 * VM.getVM().getAddressSize()));
+         addr = addr.addOffsetTo(VM.getVM().getAddressSize())) {
+      System.out.println(addr + ": " + addr.getAddressAt(0));
+    }
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/aarch64/AARCH64JavaCallWrapper.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.aarch64;
+
+import java.util.*;
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.types.*;
+import sun.jvm.hotspot.runtime.*;
+
+public class AARCH64JavaCallWrapper extends JavaCallWrapper {
+  private static AddressField lastJavaFPField;
+
+  static {
+    VM.registerVMInitializedObserver(new Observer() {
+        public void update(Observable o, Object data) {
+          initialize(VM.getVM().getTypeDataBase());
+        }
+      });
+  }
+
+  private static synchronized void initialize(TypeDataBase db) {
+    Type type = db.lookupType("JavaFrameAnchor");
+
+    lastJavaFPField  = type.getAddressField("_last_Java_fp");
+  }
+
+  public AARCH64JavaCallWrapper(Address addr) {
+    super(addr);
+  }
+
+  public Address getLastJavaFP() {
+    return lastJavaFPField.getValue(addr.addOffsetTo(anchorField.getOffset()));
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/aarch64/AARCH64RegisterMap.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.runtime.*;
+
+public class AARCH64RegisterMap extends RegisterMap {
+
+  /** This is the only public constructor */
+  public AARCH64RegisterMap(JavaThread thread, boolean updateMap) {
+    super(thread, updateMap);
+  }
+
+  protected AARCH64RegisterMap(RegisterMap map) {
+    super(map);
+  }
+
+  public Object clone() {
+    AARCH64RegisterMap retval = new AARCH64RegisterMap(this);
+    return retval;
+  }
+
+  // no PD state to clear or copy:
+  protected void clearPD() {}
+  protected void initializePD() {}
+  protected void initializeFromPD(RegisterMap map) {}
+  protected Address getLocationPD(VMReg reg) { return null; }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/linux_aarch64/LinuxAARCH64JavaThreadPDAccess.java	Thu Jul 02 11:12:59 2015 +0100
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.linux_aarch64;
+
+import java.io.*;
+import java.util.*;
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.runtime.*;
+import sun.jvm.hotspot.runtime.aarch64.*;
+import sun.jvm.hotspot.types.*;
+import sun.jvm.hotspot.utilities.*;
+
+public class LinuxAARCH64JavaThreadPDAccess implements JavaThreadPDAccess {
+  private static AddressField  lastJavaFPField;
+  private static AddressField  osThreadField;
+
+  // Field from OSThread
+  private static CIntegerField osThreadThreadIDField;
+
+  // This is currently unneeded but is being kept in case we change
+  // the currentFrameGuess algorithm
+  private static final long GUESS_SCAN_RANGE = 128 * 1024;
+
+  static {
+    VM.registerVMInitializedObserver(new Observer() {
+        public void update(Observable o, Object data) {
+          initialize(VM.getVM().getTypeDataBase());
+        }
+      });
+  }
+
+  private static synchronized void initialize(TypeDataBase db) {
+    Type type = db.lookupType("JavaThread");
+    osThreadField           = type.getAddressField("_osthread");
+
+    Type anchorType = db.lookupType("JavaFrameAnchor");
+    lastJavaFPField         = anchorType.getAddressField("_last_Java_fp");
+
+    Type osThreadType = db.lookupType("OSThread");
+    osThreadThreadIDField   = osThreadType.getCIntegerField("_thread_id");
+  }
+
+  public Address getLastJavaFP(Address addr) {
+    return lastJavaFPField.getValue(addr.addOffsetTo(sun.jvm.hotspot.runtime.JavaThread.getAnchorField().getOffset()));
+  }
+
+  public Address getLastJavaPC(Address addr) {
+    return null;
+  }
+
+  public Address getBaseOfStackPointer(Address addr) {
+    return null;
+  }
+
+  public Frame getLastFramePD(JavaThread thread, Address addr) {
+    Address fp = thread.getLastJavaFP();
+    if (fp == null) {
+      return null; // no information
+    }
+    return new AARCH64Frame(thread.getLastJavaSP(), fp);
+  }
+
+  public RegisterMap newRegisterMap(JavaThread thread, boolean updateMap) {
+    return new AARCH64RegisterMap(thread, updateMap);
+  }
+
+  public Frame getCurrentFrameGuess(JavaThread thread, Address addr) {
+    ThreadProxy t = getThreadProxy(addr);
+    AARCH64ThreadContext context = (AARCH64ThreadContext) t.getContext();
+    AARCH64CurrentFrameGuess guesser = new AARCH64CurrentFrameGuess(context, thread);
+    if (!guesser.run(GUESS_SCAN_RANGE)) {
+      return null;
+    }
+    if (guesser.getPC() == null) {
+      return new AARCH64Frame(guesser.getSP(), guesser.getFP());
+    } else {
+      return new AARCH64Frame(guesser.getSP(), guesser.getFP(), guesser.getPC());
+    }
+  }
+
+  public void printThreadIDOn(Address addr, PrintStream tty) {
+    tty.print(getThreadProxy(addr));
+  }
+
+  public void printInfoOn(Address threadAddr, PrintStream tty) {
+    tty.print("Thread id: ");
+    printThreadIDOn(threadAddr, tty);
+//    tty.println("\nPostJavaState: " + getPostJavaState(threadAddr));
+  }
+
+  public Address getLastSP(Address addr) {
+    ThreadProxy t = getThreadProxy(addr);
+    AARCH64ThreadContext context = (AARCH64ThreadContext) t.getContext();
+    return context.getRegisterAsAddress(AARCH64ThreadContext.SP);
+  }
+
+  public ThreadProxy getThreadProxy(Address addr) {
+    // Addr is the address of the JavaThread.
+    // Fetch the OSThread (for now and for simplicity, not making a
+    // separate "OSThread" class in this package)
+    Address osThreadAddr = osThreadField.getValue(addr);
+    // Get the address of the _thread_id from the OSThread
+    Address threadIdAddr = osThreadAddr.addOffsetTo(osThreadThreadIDField.getOffset());
+
+    JVMDebugger debugger = VM.getVM().getDebugger();
+    return debugger.getThreadForIdentifierAddress(threadIdAddr);
+  }
+}
--- a/agent/src/share/classes/sun/jvm/hotspot/utilities/AltPlatformInfo.java	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/utilities/AltPlatformInfo.java	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -25,7 +25,10 @@
 package sun.jvm.hotspot.utilities;
 
 public interface AltPlatformInfo {
+
   // Additional cpu types can be tested via this interface
+  public boolean knownCPU(String cpu);
 
-  public boolean knownCPU(String cpu);
-}
\ No newline at end of file
+  // Mangle a cpu name if necessary
+  public String getCPU(String cpu);
+}
--- a/agent/src/share/classes/sun/jvm/hotspot/utilities/PlatformInfo.java	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/utilities/PlatformInfo.java	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -52,27 +52,54 @@
     }
   }
 
-  /* Returns "sparc" for SPARC based platforms and "x86" for x86 based
-     platforms. Otherwise returns the value of os.arch.  If the value
-     is not recognized as supported, an exception is thrown instead. */
+  public static boolean knownCPU(String cpu) {
+    final String[] KNOWN =
+        new String[] {"i386", "x86", "x86_64", "amd64", "sparc", "sparcv9", "ppc64", "aarch64"};
+
+    for(String s : KNOWN) {
+      if(s.equals(cpu))
+        return true;
+    }
+
+    return false;
+  }
+
+  /* Returns "sparc" for SPARC based platforms "x86" for x86 based
+     platforms and x86_64 for 64bit x86 based platform. Otherwise
+     returns the value of os.arch. If the value is not recognized as supported,
+     an exception is thrown instead. */
+
   public static String getCPU() throws UnsupportedPlatformException {
     String cpu = System.getProperty("os.arch");
-    if (cpu.equals("i386") || cpu.equals("x86")) {
+
+    // Let any additional CPU mangling fire first
+    try {
+      Class pic = Class.forName("sun.jvm.hotspot.utilities.PlatformInfoClosed");
+      AltPlatformInfo api = (AltPlatformInfo) pic.newInstance();
+      if (api.knownCPU(cpu)) {
+        return api.getCPU(cpu);
+      }
+    } catch (Exception e) {
+       // Ignored
+    }
+
+    // Check that CPU is supported
+    if (!knownCPU(cpu)) {
+       throw new UnsupportedPlatformException("CPU type " + cpu + " not yet supported");
+    }
+
+    // Tweeks
+    if (cpu.equals("i386"))
       return "x86";
-    } else if (cpu.equals("sparc") || cpu.equals("sparcv9")) {
+
+    if (cpu.equals("sparcv9"))
       return "sparc";
-    } else if (cpu.equals("ia64") || cpu.equals("amd64") || cpu.equals("x86_64") || cpu.equals("ppc64") || cpu.equals("aarch64")) {
-      return cpu;
-    } else {
-      try {
-        Class pic = Class.forName("sun.jvm.hotspot.utilities.PlatformInfoClosed");
-        AltPlatformInfo api = (AltPlatformInfo)pic.newInstance();
-        if (api.knownCPU(cpu)) {
-          return cpu;
-        }
-      } catch (Exception e) {}
-      throw new UnsupportedPlatformException("CPU type " + cpu + " not yet supported");
-    }
+
+    if (cpu.equals("x86_64"))
+      return "amd64";
+
+    return cpu;
+
   }
 
   // this main is invoked from Makefile to make platform specific agent Makefile(s).
--- a/agent/src/share/classes/sun/jvm/hotspot/utilities/PointerLocation.java	Tue Jun 16 17:31:53 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/utilities/PointerLocation.java	Thu Jul 02 11:12:59 2015 +0100
@@ -84,11 +84,11 @@
   }
 
   public boolean isInNewGen() {
-    return ((gen != null) && (gen.level() == 0));
+    return ((gen != null) && (gen == ((GenCollectedHeap)heap).getGen(0)));
   }
 
   public boolean isInOldGen() {
-    return ((gen != null) && (gen.level() == 1));
+    return ((gen != null) && (gen == ((GenCollectedHeap)heap).getGen(1)));
   }
 
   public boolean inOtherGen() {
@@ -207,8 +207,6 @@
           tty.print("In new generation ");
         } else if (isInOldGen()) {
           tty.print("In old generation ");
-        } else if (gen != null) {
-          tty.print("In Generation " + getGeneration().level());
         } else {
           tty.print("In unknown section of Java heap");
         }
--- a/make/bsd/makefiles/dtrace.make	Tue Jun 16 17:31:53 2015 +0100
+++ b/make/bsd/makefiles/dtrace.make	Thu Jul 02 11:12:59 2015 +0100
@@ -263,14 +263,19 @@
 $(DtraceOutDir):
 	mkdir $(DtraceOutDir)
 
+# When building using a devkit, dtrace cannot find the correct preprocessor so
+# we run it explicitly before runing dtrace.
 $(DtraceOutDir)/hotspot.h: $(DTRACE_COMMON_SRCDIR)/hotspot.d | $(DtraceOutDir)
-	$(QUIETLY) $(DTRACE_PROG) $(DTRACE_OPTS) -C -I. -h -o $@ -s $(DTRACE_COMMON_SRCDIR)/hotspot.d
+	$(QUIETLY) $(CC) -E $(DTRACE_OPTS) -I. -x c $(DTRACE_COMMON_SRCDIR)/hotspot.d > $(DtraceOutDir)/hotspot.d
+	$(QUIETLY) $(DTRACE_PROG) -h -o $@ -s $(DtraceOutDir)/hotspot.d
 
 $(DtraceOutDir)/hotspot_jni.h: $(DTRACE_COMMON_SRCDIR)/hotspot_jni.d | $(DtraceOutDir)
-	$(QUIETLY) $(DTRACE_PROG) $(DTRACE_OPTS) -C -I. -h -o $@ -s $(DTRACE_COMMON_SRCDIR)/hotspot_jni.d
+	$(QUIETLY) $(CC) -E $(DTRACE_OPTS) -I. -x c $(DTRACE_COMMON_SRCDIR)/hotspot_jni.d > $(DtraceOutDir)/hotspot_jni.d
+	$(QUIETLY) $(DTRACE_PROG) -h -o $@ -s $(DtraceOutDir)/hotspot_jni.d
 
 $(DtraceOutDir)/hs_private.h: $(DTRACE_COMMON_SRCDIR)/hs_private.d | $(DtraceOutDir)
-	$(QUIETLY) $(DTRACE_PROG) $(DTRACE_OPTS) -C -I. -h -o $@ -s $(DTRACE_COMMON_SRCDIR)/hs_private.d
+	$(QUIETLY) $(CC) -E $(DTRACE_OPTS) -I. -x c $(DTRACE_COMMON_SRCDIR)/hs_private.d > $(DtraceOutDir)/hs_private.d
+	$(QUIETLY) $(DTRACE_PROG) -h -o $@ -s $(DtraceOutDir)/hs_private.d
 
 dtrace_gen_headers: $(DtraceOutDir)/hotspot.h $(DtraceOutDir)/hotspot_jni.h $(DtraceOutDir)/hs_private.h 
 
--- a/make/bsd/makefiles/universal.gmk	Tue Jun 16 17:31:53 2015 +0100
+++ b/make/bsd/makefiles/universal.gmk	Thu Jul 02 11:12:59 2015 +0100
@@ -56,13 +56,14 @@
 universalize: $(UNIVERSAL_LIPO_LIST) $(UNIVERSAL_COPY_LIST)
 	$(RM) -r $(EXPORT_PATH)/lib/{i386,amd64}
 
+LIPO ?= lipo
 
 # Package built libraries in a universal binary
 $(UNIVERSAL_LIPO_LIST):
 	BUILT_LIPO_FILES="`find $(EXPORT_LIB_DIR)/{i386,amd64}/$(subst $(EXPORT_LIB_DIR)/,,$@) 2>/dev/null`" || test $$? = "1"; \
 	if [ -n "$${BUILT_LIPO_FILES}" ]; then \
 	  $(MKDIR) -p $(shell dirname $@); \
-	  lipo -create -output $@ $${BUILT_LIPO_FILES}; \
+	  $(LIPO) -create -output $@ $${BUILT_LIPO_FILES}; \
 	fi
 
 
--- a/make/sa.files	Tue Jun 16 17:31:53 2015 +0100
+++ b/make/sa.files	Thu Jul 02 11:12:59 2015 +0100
@@ -44,6 +44,7 @@
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/compiler/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/amd64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/bsd/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/bsd/amd64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/bsd/x86/*.java \
@@ -55,6 +56,7 @@
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/amd64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/ia64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/ppc64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/x86/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/posix/*.java \
@@ -63,6 +65,7 @@
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/amd64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/ppc64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/x86/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/*.java \
@@ -70,6 +73,7 @@
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/ppc64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/x86/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/win32/coff/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/windbg/*.java \
@@ -92,11 +96,13 @@
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/prims/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/amd64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/bsd/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/bsd_amd64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/bsd_x86/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_amd64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_x86/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_ppc64/*.java \
--- a/src/cpu/aarch64/vm/aarch64.ad	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/aarch64/vm/aarch64.ad	Thu Jul 02 11:12:59 2015 +0100
@@ -865,6 +865,42 @@
     V31, V31_H
 );
 
+// Class for all 64bit vector registers
+reg_class vectord_reg(
+    V0, V0_H,
+    V1, V1_H,
+    V2, V2_H,
+    V3, V3_H,
+    V4, V4_H,
+    V5, V5_H,
+    V6, V6_H,
+    V7, V7_H,
+    V8, V8_H,
+    V9, V9_H,
+    V10, V10_H,
+    V11, V11_H,
+    V12, V12_H,
+    V13, V13_H,
+    V14, V14_H,
+    V15, V15_H,
+    V16, V16_H,
+    V17, V17_H,
+    V18, V18_H,
+    V19, V19_H,
+    V20, V20_H,
+    V21, V21_H,
+    V22, V22_H,
+    V23, V23_H,
+    V24, V24_H,
+    V25, V25_H,
+    V26, V26_H,
+    V27, V27_H,
+    V28, V28_H,
+    V29, V29_H,
+    V30, V30_H,
+    V31, V31_H
+);
+
 // Class for all 128bit vector registers
 reg_class vectorx_reg(
     V0, V0_H, V0_J, V0_K,
@@ -2133,40 +2169,48 @@
 
   if (bottom_type()->isa_vect() != NULL) {
     uint len = 4;
+    uint ireg = ideal_reg();
+    assert(ireg == Op_VecD || ireg == Op_VecX, "must be 64 bit or 128 bit vector");
     if (cbuf) {
       MacroAssembler _masm(cbuf);
-      uint ireg = ideal_reg();
       assert((src_lo_rc != rc_int && dst_lo_rc != rc_int), "sanity");
-      assert(ireg == Op_VecX, "sanity");
       if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) {
         // stack->stack
         int src_offset = ra_->reg2offset(src_lo);
         int dst_offset = ra_->reg2offset(dst_lo);
         assert((src_offset & 7) && (dst_offset & 7), "unaligned stack offset");
         len = 8;
-        if (src_offset < 512) {
-          __ ldp(rscratch1, rscratch2, Address(sp, src_offset));
+        if (ireg == Op_VecD) {
+          __ ldr(rscratch1, Address(sp, src_offset));
+          __ str(rscratch1, Address(sp, dst_offset));
         } else {
-          __ ldr(rscratch1, Address(sp, src_offset));
-          __ ldr(rscratch2, Address(sp, src_offset+4));
-          len += 4;
-        }
-        if (dst_offset < 512) {
-          __ stp(rscratch1, rscratch2, Address(sp, dst_offset));
-        } else {
-          __ str(rscratch1, Address(sp, dst_offset));
-          __ str(rscratch2, Address(sp, dst_offset+4));
-          len += 4;
+          if (src_offset < 512) {
+            __ ldp(rscratch1, rscratch2, Address(sp, src_offset));
+          } else {
+            __ ldr(rscratch1, Address(sp, src_offset));
+            __ ldr(rscratch2, Address(sp, src_offset+4));
+            len += 4;
+          }
+          if (dst_offset < 512) {
+            __ stp(rscratch1, rscratch2, Address(sp, dst_offset));
+          } else {
+            __ str(rscratch1, Address(sp, dst_offset));
+            __ str(rscratch2, Address(sp, dst_offset+4));
+            len += 4;
+          }
         }
       } else if (src_lo_rc == rc_float && dst_lo_rc == rc_float) {
-        __ orr(as_FloatRegister(Matcher::_regEncode[dst_lo]), __ T16B,
+        __ orr(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+               ireg == Op_VecD ? __ T8B : __ T16B,
                as_FloatRegister(Matcher::_regEncode[src_lo]),
                as_FloatRegister(Matcher::_regEncode[src_lo]));
       } else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) {
-        __ str(as_FloatRegister(Matcher::_regEncode[src_lo]), __ Q,
+        __ str(as_FloatRegister(Matcher::_regEncode[src_lo]),
+               ireg == Op_VecD ? __ D : __ Q,
                Address(sp, ra_->reg2offset(dst_lo)));
       } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) {
-        __ ldr(as_FloatRegister(Matcher::_regEncode[dst_lo]), __ Q,
+        __ ldr(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+               ireg == Op_VecD ? __ D : __ Q,
                Address(sp, ra_->reg2offset(src_lo)));
       } else {
         ShouldNotReachHere();
@@ -2176,17 +2220,22 @@
         // stack->stack
         int src_offset = ra_->reg2offset(src_lo);
         int dst_offset = ra_->reg2offset(dst_lo);
-        if (src_offset < 512) {
-          st->print("ldp  rscratch1, rscratch2, [sp, #%d]", src_offset);
+        if (ireg == Op_VecD) {
+          st->print("ldr  rscratch1, [sp, #%d]", src_offset);
+          st->print("str  rscratch1, [sp, #%d]", dst_offset);
         } else {
-          st->print("ldr  rscratch1, [sp, #%d]", src_offset);
-          st->print("\nldr  rscratch2, [sp, #%d]", src_offset+4);
-        }
-        if (dst_offset < 512) {
-          st->print("\nstp  rscratch1, rscratch2, [sp, #%d]", dst_offset);
-        } else {
-          st->print("\nstr  rscratch1, [sp, #%d]", dst_offset);
-          st->print("\nstr  rscratch2, [sp, #%d]", dst_offset+4);
+          if (src_offset < 512) {
+            st->print("ldp  rscratch1, rscratch2, [sp, #%d]", src_offset);
+          } else {
+            st->print("ldr  rscratch1, [sp, #%d]", src_offset);
+            st->print("\nldr  rscratch2, [sp, #%d]", src_offset+4);
+          }
+          if (dst_offset < 512) {
+            st->print("\nstp  rscratch1, rscratch2, [sp, #%d]", dst_offset);
+          } else {
+            st->print("\nstr  rscratch1, [sp, #%d]", dst_offset);
+            st->print("\nstr  rscratch2, [sp, #%d]", dst_offset+4);
+          }
         }
         st->print("\t# vector spill, stack to stack");
       } else if (src_lo_rc == rc_float && dst_lo_rc == rc_float) {
@@ -2638,17 +2687,22 @@
   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 }
 const int Matcher::min_vector_size(const BasicType bt) {
-  //return (type2aelembytes(bt) == 1) ? 4 : 2;
-  // For the moment, only support 1 vector size, 128 bits
-  return max_vector_size(bt);
+//  For the moment limit the vector size to 8 bytes
+    int size = 8 / type2aelembytes(bt);
+    if (size < 2) size = 2;
+    return size;
 }
 
 // Vector ideal reg.
 const int Matcher::vector_ideal_reg(int len) {
-  return Op_VecX;
+  switch(len) {
+    case  8: return Op_VecD;
+    case 16: return Op_VecX;
+  }
+  ShouldNotReachHere();
+  return 0;
 }
 
-// Only lowest bits of xmm reg are used for vector shift count.
 const int Matcher::vector_shift_count_ideal_reg(int size) {
   return Op_VecX;
 }
@@ -2660,9 +2714,7 @@
 
 // x86 supports misaligned vectors store/load.
 const bool Matcher::misaligned_vectors_ok() {
-  // TODO fixme
-  // return !AlignVector; // can be changed by flag
-  return false;
+  return !AlignVector; // can be changed by flag
 }
 
 // false => size gets scaled to BytesPerLong, ok.
@@ -3073,13 +3125,13 @@
                as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
   %}
 
-  enc_class aarch64_enc_ldrvS(vecX dst, memory mem) %{
+  enc_class aarch64_enc_ldrvS(vecD dst, memory mem) %{
     FloatRegister dst_reg = as_FloatRegister($dst$$reg);
     loadStore(MacroAssembler(&cbuf), &MacroAssembler::ldr, dst_reg, MacroAssembler::S,
        $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
   %}
 
-  enc_class aarch64_enc_ldrvD(vecX dst, memory mem) %{
+  enc_class aarch64_enc_ldrvD(vecD dst, memory mem) %{
     FloatRegister dst_reg = as_FloatRegister($dst$$reg);
     loadStore(MacroAssembler(&cbuf), &MacroAssembler::ldr, dst_reg, MacroAssembler::D,
        $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
@@ -3159,13 +3211,13 @@
                as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
   %}
 
-  enc_class aarch64_enc_strvS(vecX src, memory mem) %{
+  enc_class aarch64_enc_strvS(vecD src, memory mem) %{
     FloatRegister src_reg = as_FloatRegister($src$$reg);
     loadStore(MacroAssembler(&cbuf), &MacroAssembler::str, src_reg, MacroAssembler::S,
        $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
   %}
 
-  enc_class aarch64_enc_strvD(vecX src, memory mem) %{
+  enc_class aarch64_enc_strvD(vecD src, memory mem) %{
     FloatRegister src_reg = as_FloatRegister($src$$reg);
     loadStore(MacroAssembler(&cbuf), &MacroAssembler::str, src_reg, MacroAssembler::D,
        $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
@@ -5187,6 +5239,16 @@
   interface(REG_INTER);
 %}
 
+operand vecD()
+%{
+  constraint(ALLOC_IN_RC(vectord_reg));
+  match(VecD);
+
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 operand vecX()
 %{
   constraint(ALLOC_IN_RC(vectorx_reg));
@@ -7402,6 +7464,96 @@
   ins_pipe(ialu_reg);
 %}
 
+//---------- Population Count Instructions -------------------------------------
+//
+
+instruct popCountI(iRegINoSp dst, iRegIorL2I src, vRegF tmp) %{
+  predicate(UsePopCountInstruction);
+  match(Set dst (PopCountI src));
+  effect(TEMP tmp);
+  ins_cost(INSN_COST * 13);
+
+  format %{ "movw   $src, $src\n\t"
+            "mov    $tmp, $src\t# vector (1D)\n\t"
+            "cnt    $tmp, $tmp\t# vector (8B)\n\t"
+            "addv   $tmp, $tmp\t# vector (8B)\n\t"
+            "mov    $dst, $tmp\t# vector (1D)" %}
+  ins_encode %{
+    __ movw($src$$Register, $src$$Register); // ensure top 32 bits 0
+    __ mov($tmp$$FloatRegister, __ T1D, 0, $src$$Register);
+    __ cnt($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ addv($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ mov($dst$$Register, $tmp$$FloatRegister, __ T1D, 0);
+  %}
+
+  ins_pipe(pipe_class_default);
+%}
+
+instruct popCountI_mem(iRegINoSp dst, memory mem, vRegF tmp) %{
+  predicate(UsePopCountInstruction);
+  match(Set dst (PopCountI (LoadI mem)));
+  effect(TEMP tmp);
+  ins_cost(INSN_COST * 13);
+
+  format %{ "ldrs   $tmp, $mem\n\t"
+            "cnt    $tmp, $tmp\t# vector (8B)\n\t"
+            "addv   $tmp, $tmp\t# vector (8B)\n\t"
+            "mov    $dst, $tmp\t# vector (1D)" %}
+  ins_encode %{
+    FloatRegister tmp_reg = as_FloatRegister($tmp$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::ldrs, tmp_reg, $mem->opcode(),
+               as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+    __ cnt($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ addv($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ mov($dst$$Register, $tmp$$FloatRegister, __ T1D, 0);
+  %}
+
+  ins_pipe(pipe_class_default);
+%}
+
+// Note: Long.bitCount(long) returns an int.
+instruct popCountL(iRegINoSp dst, iRegL src, vRegD tmp) %{
+  predicate(UsePopCountInstruction);
+  match(Set dst (PopCountL src));
+  effect(TEMP tmp);
+  ins_cost(INSN_COST * 13);
+
+  format %{ "mov    $tmp, $src\t# vector (1D)\n\t"
+            "cnt    $tmp, $tmp\t# vector (8B)\n\t"
+            "addv   $tmp, $tmp\t# vector (8B)\n\t"
+            "mov    $dst, $tmp\t# vector (1D)" %}
+  ins_encode %{
+    __ mov($tmp$$FloatRegister, __ T1D, 0, $src$$Register);
+    __ cnt($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ addv($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ mov($dst$$Register, $tmp$$FloatRegister, __ T1D, 0);
+  %}
+
+  ins_pipe(pipe_class_default);
+%}
+
+instruct popCountL_mem(iRegINoSp dst, memory mem, vRegD tmp) %{
+  predicate(UsePopCountInstruction);
+  match(Set dst (PopCountL (LoadL mem)));
+  effect(TEMP tmp);
+  ins_cost(INSN_COST * 13);
+
+  format %{ "ldrd   $tmp, $mem\n\t"
+            "cnt    $tmp, $tmp\t# vector (8B)\n\t"
+            "addv   $tmp, $tmp\t# vector (8B)\n\t"
+            "mov    $dst, $tmp\t# vector (1D)" %}
+  ins_encode %{
+    FloatRegister tmp_reg = as_FloatRegister($tmp$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::ldrd, tmp_reg, $mem->opcode(),
+               as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+    __ cnt($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ addv($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ mov($dst$$Register, $tmp$$FloatRegister, __ T1D, 0);
+  %}
+
+  ins_pipe(pipe_class_default);
+%}
+
 // ============================================================================
 // MemBar Instruction
 
@@ -13194,7 +13346,7 @@
 // ====================VECTOR INSTRUCTIONS=====================================
 
 // Load vector (32 bits)
-instruct loadV4(vecX dst, vmem mem)
+instruct loadV4(vecD dst, vmem mem)
 %{
   predicate(n->as_LoadVector()->memory_size() == 4);
   match(Set dst (LoadVector mem));
@@ -13205,7 +13357,7 @@
 %}
 
 // Load vector (64 bits)
-instruct loadV8(vecX dst, vmem mem)
+instruct loadV8(vecD dst, vmem mem)
 %{
   predicate(n->as_LoadVector()->memory_size() == 8);
   match(Set dst (LoadVector mem));
@@ -13227,7 +13379,7 @@
 %}
 
 // Store Vector (32 bits)
-instruct storeV4(vecX src, vmem mem)
+instruct storeV4(vecD src, vmem mem)
 %{
   predicate(n->as_StoreVector()->memory_size() == 4);
   match(Set mem (StoreVector mem src));
@@ -13238,7 +13390,7 @@
 %}
 
 // Store Vector (64 bits)
-instruct storeV8(vecX src, vmem mem)
+instruct storeV8(vecD src, vmem mem)
 %{
   predicate(n->as_StoreVector()->memory_size() == 8);
   match(Set mem (StoreVector mem src));
@@ -13259,8 +13411,22 @@
   ins_pipe(pipe_class_memory);
 %}
 
+instruct replicate8B(vecD dst, iRegIorL2I src)
+%{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (8B)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T8B, as_Register($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct replicate16B(vecX dst, iRegIorL2I src)
 %{
+  predicate(n->as_Vector()->length() == 16);
   match(Set dst (ReplicateB src));
   ins_cost(INSN_COST);
   format %{ "dup  $dst, $src\t# vector (16B)" %}
@@ -13270,19 +13436,47 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct replicate8B_imm(vecD dst, immI con)
+%{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB con));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $con\t# vector(8B)" %}
+  ins_encode %{
+    __ mov(as_FloatRegister($dst$$reg), __ T8B, $con$$constant & 0xff);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct replicate16B_imm(vecX dst, immI con)
 %{
+  predicate(n->as_Vector()->length() == 16);
   match(Set dst (ReplicateB con));
   ins_cost(INSN_COST);
   format %{ "movi  $dst, $con\t# vector(16B)" %}
   ins_encode %{
-    __ mov(as_FloatRegister($dst$$reg), __ T16B, $con$$constant);
+    __ mov(as_FloatRegister($dst$$reg), __ T16B, $con$$constant & 0xff);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate4S(vecD dst, iRegIorL2I src)
+%{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (4S)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T4H, as_Register($src$$reg));
   %}
   ins_pipe(pipe_class_default);
 %}
 
 instruct replicate8S(vecX dst, iRegIorL2I src)
 %{
+  predicate(n->as_Vector()->length() == 8);
   match(Set dst (ReplicateS src));
   ins_cost(INSN_COST);
   format %{ "dup  $dst, $src\t# vector (8S)" %}
@@ -13292,19 +13486,46 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct replicate4S_imm(vecD dst, immI con)
+%{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS con));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $con\t# vector(4H)" %}
+  ins_encode %{
+    __ mov(as_FloatRegister($dst$$reg), __ T4H, $con$$constant & 0xffff);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct replicate8S_imm(vecX dst, immI con)
 %{
+  predicate(n->as_Vector()->length() == 8);
   match(Set dst (ReplicateS con));
   ins_cost(INSN_COST);
   format %{ "movi  $dst, $con\t# vector(8H)" %}
   ins_encode %{
-    __ mov(as_FloatRegister($dst$$reg), __ T8H, $con$$constant);
+    __ mov(as_FloatRegister($dst$$reg), __ T8H, $con$$constant & 0xffff);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate2I(vecD dst, iRegIorL2I src)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (2I)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T2S, as_Register($src$$reg));
   %}
   ins_pipe(pipe_class_default);
 %}
 
 instruct replicate4I(vecX dst, iRegIorL2I src)
 %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (ReplicateI src));
   ins_cost(INSN_COST);
   format %{ "dup  $dst, $src\t# vector (4I)" %}
@@ -13314,8 +13535,21 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct replicate2I_imm(vecD dst, immI con)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI con));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $con\t# vector(2I)" %}
+  ins_encode %{
+    __ mov(as_FloatRegister($dst$$reg), __ T2S, $con$$constant);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct replicate4I_imm(vecX dst, immI con)
 %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (ReplicateI con));
   ins_cost(INSN_COST);
   format %{ "movi  $dst, $con\t# vector(4I)" %}
@@ -13327,6 +13561,7 @@
 
 instruct replicate2L(vecX dst, iRegL src)
 %{
+  predicate(n->as_Vector()->length() == 2);
   match(Set dst (ReplicateL src));
   ins_cost(INSN_COST);
   format %{ "dup  $dst, $src\t# vector (2L)" %}
@@ -13338,6 +13573,7 @@
 
 instruct replicate2L_zero(vecX dst, immI0 zero)
 %{
+  predicate(n->as_Vector()->length() == 2);
   match(Set dst (ReplicateI zero));
   ins_cost(INSN_COST);
   format %{ "movi  $dst, $zero\t# vector(4I)" %}
@@ -13349,8 +13585,22 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct replicate2F(vecD dst, vRegF src)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (2F)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T2S,
+           as_FloatRegister($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct replicate4F(vecX dst, vRegF src)
 %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (ReplicateF src));
   ins_cost(INSN_COST);
   format %{ "dup  $dst, $src\t# vector (4F)" %}
@@ -13363,6 +13613,7 @@
 
 instruct replicate2D(vecX dst, vRegD src)
 %{
+  predicate(n->as_Vector()->length() == 2);
   match(Set dst (ReplicateD src));
   ins_cost(INSN_COST);
   format %{ "dup  $dst, $src\t# vector (2D)" %}
@@ -13375,6 +13626,25 @@
 
 // ====================REDUCTION ARITHMETIC====================================
 
+instruct reduce_add2I(iRegINoSp dst, iRegIorL2I src1, vecD src2, iRegI tmp, iRegI tmp2)
+%{
+  match(Set dst (AddReductionVI src1 src2));
+  ins_cost(INSN_COST);
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "umov  $tmp, $src2, S, 0\n\t"
+            "umov  $tmp2, $src2, S, 1\n\t"
+            "addw  $dst, $src1, $tmp\n\t"
+            "addw  $dst, $dst, $tmp2\t add reduction2i"
+  %}
+  ins_encode %{
+    __ umov($tmp$$Register, as_FloatRegister($src2$$reg), __ S, 0);
+    __ umov($tmp2$$Register, as_FloatRegister($src2$$reg), __ S, 1);
+    __ addw($dst$$Register, $src1$$Register, $tmp$$Register);
+    __ addw($dst$$Register, $dst$$Register, $tmp2$$Register);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct reduce_add4I(iRegINoSp dst, iRegIorL2I src1, vecX src2, vecX tmp, iRegI tmp2)
 %{
   match(Set dst (AddReductionVI src1 src2));
@@ -13393,6 +13663,25 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct reduce_mul2I(iRegINoSp dst, iRegIorL2I src1, vecD src2, iRegI tmp)
+%{
+  match(Set dst (MulReductionVI src1 src2));
+  ins_cost(INSN_COST);
+  effect(TEMP tmp, TEMP dst);
+  format %{ "umov  $tmp, $src2, S, 0\n\t"
+            "mul   $dst, $tmp, $src1\n\t"
+            "umov  $tmp, $src2, S, 1\n\t"
+            "mul   $dst, $tmp, $dst\t mul reduction2i\n\t"
+  %}
+  ins_encode %{
+    __ umov($tmp$$Register, as_FloatRegister($src2$$reg), __ S, 0);
+    __ mul($dst$$Register, $tmp$$Register, $src1$$Register);
+    __ umov($tmp$$Register, as_FloatRegister($src2$$reg), __ S, 1);
+    __ mul($dst$$Register, $tmp$$Register, $dst$$Register);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct reduce_mul4I(iRegINoSp dst, iRegIorL2I src1, vecX src2, vecX tmp, iRegI tmp2)
 %{
   match(Set dst (MulReductionVI src1 src2));
@@ -13418,6 +13707,26 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct reduce_add2F(vRegF dst, vRegF src1, vecD src2, vecD tmp)
+%{
+  match(Set dst (AddReductionVF src1 src2));
+  ins_cost(INSN_COST);
+  effect(TEMP tmp, TEMP dst);
+  format %{ "fadds $dst, $src1, $src2\n\t"
+            "ins   $tmp, S, $src2, 0, 1\n\t"
+            "fadds $dst, $dst, $tmp\t add reduction2f"
+  %}
+  ins_encode %{
+    __ fadds(as_FloatRegister($dst$$reg),
+             as_FloatRegister($src1$$reg), as_FloatRegister($src2$$reg));
+    __ ins(as_FloatRegister($tmp$$reg), __ S,
+           as_FloatRegister($src2$$reg), 0, 1);
+    __ fadds(as_FloatRegister($dst$$reg),
+             as_FloatRegister($dst$$reg), as_FloatRegister($tmp$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct reduce_add4F(vRegF dst, vRegF src1, vecX src2, vecX tmp)
 %{
   match(Set dst (AddReductionVF src1 src2));
@@ -13450,6 +13759,26 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct reduce_mul2F(vRegF dst, vRegF src1, vecD src2, vecD tmp)
+%{
+  match(Set dst (MulReductionVF src1 src2));
+  ins_cost(INSN_COST);
+  effect(TEMP tmp, TEMP dst);
+  format %{ "fmuls $dst, $src1, $src2\n\t"
+            "ins   $tmp, S, $src2, 0, 1\n\t"
+            "fmuls $dst, $dst, $tmp\t add reduction4f"
+  %}
+  ins_encode %{
+    __ fmuls(as_FloatRegister($dst$$reg),
+             as_FloatRegister($src1$$reg), as_FloatRegister($src2$$reg));
+    __ ins(as_FloatRegister($tmp$$reg), __ S,
+           as_FloatRegister($src2$$reg), 0, 1);
+    __ fmuls(as_FloatRegister($dst$$reg),
+             as_FloatRegister($dst$$reg), as_FloatRegister($tmp$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct reduce_mul4F(vRegF dst, vRegF src1, vecX src2, vecX tmp)
 %{
   match(Set dst (MulReductionVF src1 src2));
@@ -13526,8 +13855,24 @@
 
 // --------------------------------- ADD --------------------------------------
 
+instruct vadd8B(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (AddVB src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (8B)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vadd16B(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 16);
   match(Set dst (AddVB src1 src2));
   ins_cost(INSN_COST);
   format %{ "addv  $dst,$src1,$src2\t# vector (16B)" %}
@@ -13539,8 +13884,24 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vadd4S(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (AddVS src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (4H)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T4H,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vadd8S(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 8);
   match(Set dst (AddVS src1 src2));
   ins_cost(INSN_COST);
   format %{ "addv  $dst,$src1,$src2\t# vector (8H)" %}
@@ -13552,8 +13913,23 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vadd2I(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVI src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vadd4I(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (AddVI src1 src2));
   ins_cost(INSN_COST);
   format %{ "addv  $dst,$src1,$src2\t# vector (4S)" %}
@@ -13567,6 +13943,7 @@
 
 instruct vadd2L(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 2);
   match(Set dst (AddVL src1 src2));
   ins_cost(INSN_COST);
   format %{ "addv  $dst,$src1,$src2\t# vector (2L)" %}
@@ -13578,8 +13955,23 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vadd2F(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fadd  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ fadd(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vadd4F(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (AddVF src1 src2));
   ins_cost(INSN_COST);
   format %{ "fadd  $dst,$src1,$src2\t# vector (4S)" %}
@@ -13606,8 +13998,24 @@
 
 // --------------------------------- SUB --------------------------------------
 
+instruct vsub8B(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (SubVB src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (8B)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsub16B(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 16);
   match(Set dst (SubVB src1 src2));
   ins_cost(INSN_COST);
   format %{ "subv  $dst,$src1,$src2\t# vector (16B)" %}
@@ -13619,8 +14027,24 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsub4S(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (SubVS src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (4H)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T4H,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsub8S(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 8);
   match(Set dst (SubVS src1 src2));
   ins_cost(INSN_COST);
   format %{ "subv  $dst,$src1,$src2\t# vector (8H)" %}
@@ -13632,8 +14056,23 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsub2I(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVI src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsub4I(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (SubVI src1 src2));
   ins_cost(INSN_COST);
   format %{ "subv  $dst,$src1,$src2\t# vector (4S)" %}
@@ -13647,6 +14086,7 @@
 
 instruct vsub2L(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 2);
   match(Set dst (SubVL src1 src2));
   ins_cost(INSN_COST);
   format %{ "subv  $dst,$src1,$src2\t# vector (2L)" %}
@@ -13658,8 +14098,23 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsub2F(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fsub  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ fsub(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsub4F(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (SubVF src1 src2));
   ins_cost(INSN_COST);
   format %{ "fsub  $dst,$src1,$src2\t# vector (4S)" %}
@@ -13673,6 +14128,7 @@
 
 instruct vsub2D(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 2);
   match(Set dst (SubVD src1 src2));
   ins_cost(INSN_COST);
   format %{ "fsub  $dst,$src1,$src2\t# vector (2D)" %}
@@ -13686,8 +14142,24 @@
 
 // --------------------------------- MUL --------------------------------------
 
+instruct vmul4S(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (MulVS src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "mulv  $dst,$src1,$src2\t# vector (4H)" %}
+  ins_encode %{
+    __ mulv(as_FloatRegister($dst$$reg), __ T4H,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vmul8S(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 8);
   match(Set dst (MulVS src1 src2));
   ins_cost(INSN_COST);
   format %{ "mulv  $dst,$src1,$src2\t# vector (8H)" %}
@@ -13699,8 +14171,23 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vmul2I(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (MulVI src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "mulv  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ mulv(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vmul4I(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (MulVI src1 src2));
   ins_cost(INSN_COST);
   format %{ "mulv  $dst,$src1,$src2\t# vector (4S)" %}
@@ -13712,8 +14199,23 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vmul2F(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (MulVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fmul  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ fmul(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vmul4F(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (MulVF src1 src2));
   ins_cost(INSN_COST);
   format %{ "fmul  $dst,$src1,$src2\t# vector (4S)" %}
@@ -13727,6 +14229,7 @@
 
 instruct vmul2D(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 2);
   match(Set dst (MulVD src1 src2));
   ins_cost(INSN_COST);
   format %{ "fmul  $dst,$src1,$src2\t# vector (2D)" %}
@@ -13740,8 +14243,23 @@
 
 // --------------------------------- DIV --------------------------------------
 
+instruct vdiv2F(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (DivVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fdiv  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ fdiv(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vdiv4F(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (DivVF src1 src2));
   ins_cost(INSN_COST);
   format %{ "fdiv  $dst,$src1,$src2\t# vector (4S)" %}
@@ -13755,6 +14273,7 @@
 
 instruct vdiv2D(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length() == 2);
   match(Set dst (DivVD src1 src2));
   ins_cost(INSN_COST);
   format %{ "fdiv  $dst,$src1,$src2\t# vector (2D)" %}
@@ -13768,8 +14287,24 @@
 
 // --------------------------------- AND --------------------------------------
 
+instruct vand8B(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length_in_bytes() == 4 ||
+            n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (AndV src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "and  $dst,$src1,$src2\t# vector (8B)" %}
+  ins_encode %{
+    __ andr(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vand16B(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (AndV src1 src2));
   ins_cost(INSN_COST);
   format %{ "and  $dst,$src1,$src2\t# vector (16B)" %}
@@ -13783,8 +14318,24 @@
 
 // --------------------------------- OR ---------------------------------------
 
+instruct vor8B(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length_in_bytes() == 4 ||
+            n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (OrV src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "and  $dst,$src1,$src2\t# vector (8B)" %}
+  ins_encode %{
+    __ orr(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vor16B(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (OrV src1 src2));
   ins_cost(INSN_COST);
   format %{ "orr  $dst,$src1,$src2\t# vector (16B)" %}
@@ -13798,8 +14349,24 @@
 
 // --------------------------------- XOR --------------------------------------
 
+instruct vxor8B(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length_in_bytes() == 4 ||
+            n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (XorV src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "xor  $dst,$src1,$src2\t# vector (8B)" %}
+  ins_encode %{
+    __ eor(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vxor16B(vecX dst, vecX src1, vecX src2)
 %{
+  predicate(n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (XorV src1 src2));
   ins_cost(INSN_COST);
   format %{ "xor  $dst,$src1,$src2\t# vector (16B)" %}
@@ -13833,7 +14400,23 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsll8B(vecD dst, vecD src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVB src shift));
+  match(Set dst (RShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (8B)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsll16B(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 16);
   match(Set dst (LShiftVB src shift));
   match(Set dst (RShiftVB src shift));
   ins_cost(INSN_COST);
@@ -13846,7 +14429,22 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsrl8B(vecD dst, vecD src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (8B)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsrl16B(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 16);
   match(Set dst (URShiftVB src shift));
   ins_cost(INSN_COST);
   format %{ "ushl  $dst,$src,$shift\t# vector (16B)" %}
@@ -13858,7 +14456,28 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsll8B_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (8B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 8) {
+      __ eor(as_FloatRegister($dst$$reg), __ T8B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ shl(as_FloatRegister($dst$$reg), __ T8B,
+             as_FloatRegister($src$$reg), sh);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsll16B_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 16);
   match(Set dst (LShiftVB src shift));
   ins_cost(INSN_COST);
   format %{ "shl    $dst, $src, $shift\t# vector (16B)" %}
@@ -13876,7 +14495,24 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsra8B_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (8B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 8) sh = 7;
+    sh = -sh & 7;
+    __ sshr(as_FloatRegister($dst$$reg), __ T8B,
+           as_FloatRegister($src$$reg), sh);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsra16B_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 16);
   match(Set dst (RShiftVB src shift));
   ins_cost(INSN_COST);
   format %{ "sshr    $dst, $src, $shift\t# vector (16B)" %}
@@ -13890,7 +14526,28 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsrl8B_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (8B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 8) {
+      __ eor(as_FloatRegister($dst$$reg), __ T8B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ ushr(as_FloatRegister($dst$$reg), __ T8B,
+             as_FloatRegister($src$$reg), -sh & 7);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsrl16B_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 16);
   match(Set dst (URShiftVB src shift));
   ins_cost(INSN_COST);
   format %{ "ushr    $dst, $src, $shift\t# vector (16B)" %}
@@ -13908,7 +14565,23 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsll4S(vecD dst, vecD src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS src shift));
+  match(Set dst (RShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (4H)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T4H,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsll8S(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 8);
   match(Set dst (LShiftVS src shift));
   match(Set dst (RShiftVS src shift));
   ins_cost(INSN_COST);
@@ -13921,7 +14594,22 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsrl4S(vecD dst, vecD src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (4H)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T4H,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsrl8S(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 8);
   match(Set dst (URShiftVS src shift));
   ins_cost(INSN_COST);
   format %{ "ushl  $dst,$src,$shift\t# vector (8H)" %}
@@ -13933,7 +14621,28 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsll4S_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (4H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 16) {
+      __ eor(as_FloatRegister($dst$$reg), __ T8B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ shl(as_FloatRegister($dst$$reg), __ T4H,
+             as_FloatRegister($src$$reg), sh);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsll8S_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 8);
   match(Set dst (LShiftVS src shift));
   ins_cost(INSN_COST);
   format %{ "shl    $dst, $src, $shift\t# vector (8H)" %}
@@ -13951,7 +14660,24 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsra4S_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (4H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 16) sh = 15;
+    sh = -sh & 15;
+    __ sshr(as_FloatRegister($dst$$reg), __ T4H,
+           as_FloatRegister($src$$reg), sh);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsra8S_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 8);
   match(Set dst (RShiftVS src shift));
   ins_cost(INSN_COST);
   format %{ "sshr    $dst, $src, $shift\t# vector (8H)" %}
@@ -13965,7 +14691,28 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsrl4S_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (4H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 16) {
+      __ eor(as_FloatRegister($dst$$reg), __ T8B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ ushr(as_FloatRegister($dst$$reg), __ T4H,
+             as_FloatRegister($src$$reg), -sh & 15);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsrl8S_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 8);
   match(Set dst (URShiftVS src shift));
   ins_cost(INSN_COST);
   format %{ "ushr    $dst, $src, $shift\t# vector (8H)" %}
@@ -13983,7 +14730,22 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsll2I(vecD dst, vecD src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVI src shift));
+  match(Set dst (RShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (2S)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsll4I(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (LShiftVI src shift));
   match(Set dst (RShiftVI src shift));
   ins_cost(INSN_COST);
@@ -13996,7 +14758,21 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsrl2I(vecD dst, vecD src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (2S)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsrl4I(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (URShiftVI src shift));
   ins_cost(INSN_COST);
   format %{ "ushl  $dst,$src,$shift\t# vector (4S)" %}
@@ -14008,7 +14784,21 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsll2I_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (2S)" %}
+  ins_encode %{
+    __ shl(as_FloatRegister($dst$$reg), __ T2S,
+           as_FloatRegister($src$$reg),
+           (int)$shift$$constant & 31);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsll4I_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (LShiftVI src shift));
   ins_cost(INSN_COST);
   format %{ "shl    $dst, $src, $shift\t# vector (4S)" %}
@@ -14020,7 +14810,21 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsra2I_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (2S)" %}
+  ins_encode %{
+    __ sshr(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src$$reg),
+            -(int)$shift$$constant & 31);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsra4I_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (RShiftVI src shift));
   ins_cost(INSN_COST);
   format %{ "sshr    $dst, $src, $shift\t# vector (4S)" %}
@@ -14032,7 +14836,21 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct vsrl2I_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (2S)" %}
+  ins_encode %{
+    __ ushr(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src$$reg),
+            -(int)$shift$$constant & 31);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct vsrl4I_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4);
   match(Set dst (URShiftVI src shift));
   ins_cost(INSN_COST);
   format %{ "ushr    $dst, $src, $shift\t# vector (4S)" %}
@@ -14045,6 +14863,7 @@
 %}
 
 instruct vsll2L(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 2);
   match(Set dst (LShiftVL src shift));
   match(Set dst (RShiftVL src shift));
   ins_cost(INSN_COST);
@@ -14058,6 +14877,7 @@
 %}
 
 instruct vsrl2L(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 2);
   match(Set dst (URShiftVL src shift));
   ins_cost(INSN_COST);
   format %{ "ushl  $dst,$src,$shift\t# vector (2D)" %}
@@ -14070,6 +14890,7 @@
 %}
 
 instruct vsll2L_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
   match(Set dst (LShiftVL src shift));
   ins_cost(INSN_COST);
   format %{ "shl    $dst, $src, $shift\t# vector (2D)" %}
@@ -14082,6 +14903,7 @@
 %}
 
 instruct vsra2L_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
   match(Set dst (RShiftVL src shift));
   ins_cost(INSN_COST);
   format %{ "sshr    $dst, $src, $shift\t# vector (2D)" %}
@@ -14094,6 +14916,7 @@
 %}
 
 instruct vsrl2L_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
   match(Set dst (URShiftVL src shift));
   ins_cost(INSN_COST);
   format %{ "ushr    $dst, $src, $shift\t# vector (2D)" %}
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -491,6 +491,11 @@
         i->rf(_index, 16);
         i->f(_ext.option(), 15, 13);
         unsigned size = i->get(31, 30);
+        if (i->get(26, 26) && i->get(23, 23)) {
+          // SIMD Q Type - Size = 128 bits
+          assert(size == 0, "bad size");
+          size = 0b100;
+        }
         if (size == 0) // It's a byte
           i->f(_ext.shift() >= 0, 12);
         else {
@@ -2050,6 +2055,9 @@
   INSN(negr,  1, 0b100000101110);
   INSN(notr,  1, 0b100000010110);
   INSN(addv,  0, 0b110001101110);
+  INSN(cls,   0, 0b100000010010);
+  INSN(clz,   1, 0b100000010010);
+  INSN(cnt,   0, 0b100000010110);
 
 #undef INSN
 
--- a/src/cpu/aarch64/vm/frame_aarch64.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/aarch64/vm/frame_aarch64.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -526,16 +526,6 @@
   return frame(sender_sp(), link(), sender_pc());
 }
 
-bool frame::interpreter_frame_equals_unpacked_fp(intptr_t* fp) {
-  assert(is_interpreted_frame(), "must be interpreter frame");
-  Method* method = interpreter_frame_method();
-  // When unpacking an optimized frame the frame pointer is
-  // adjusted with:
-  int diff = (method->max_locals() - method->size_of_parameters()) *
-             Interpreter::stackElementWords;
-  return _fp == (fp - diff);
-}
-
 bool frame::is_interpreted_frame_valid(JavaThread* thread) const {
 // QQQ
 #ifdef CC_INTERP
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/aarch64/vm/globals_aarch64.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -84,7 +84,7 @@
 
 #ifdef BUILTIN_SIM
 #define UseBuiltinSim           true
-#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
                                                                         \
   product(bool, NotifySimulator, UseBuiltinSim,                         \
          "tell the AArch64 sim where we are in method code")            \
@@ -112,7 +112,7 @@
 #define NotifySimulator         false
 #define UseSimulatorCache       false
 #define DisableBCCheck          true
-#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
                                                                         \
   product(bool, NearCpool, true,                                        \
          "constant pool is close to instructions")                      \
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1408,6 +1408,52 @@
   movk(r, imm64 & 0xffff, 32);
 }
 
+// Macro to mov replicated immediate to vector register.
+//  Vd will get the following values for different arrangements in T
+//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
+//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
+//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
+//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
+//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
+//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
+//   T1D/T2D: invalid
+void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
+  assert(T != T1D && T != T2D, "invalid arrangement");
+  if (T == T8B || T == T16B) {
+    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
+    movi(Vd, T, imm32 & 0xff, 0);
+    return;
+  }
+  u_int32_t nimm32 = ~imm32;
+  if (T == T4H || T == T8H) {
+    assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
+    imm32 &= 0xffff;
+    nimm32 &= 0xffff;
+  }
+  u_int32_t x = imm32;
+  int movi_cnt = 0;
+  int movn_cnt = 0;
+  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
+  x = nimm32;
+  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
+  if (movn_cnt < movi_cnt) imm32 = nimm32;
+  unsigned lsl = 0;
+  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
+  if (movn_cnt < movi_cnt)
+    mvni(Vd, T, imm32 & 0xff, lsl);
+  else
+    movi(Vd, T, imm32 & 0xff, lsl);
+  imm32 >>= 8; lsl += 8;
+  while (imm32) {
+    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
+    if (movn_cnt < movi_cnt)
+      bici(Vd, T, imm32 & 0xff, lsl);
+    else
+      orri(Vd, T, imm32 & 0xff, lsl);
+    lsl += 8; imm32 >>= 8;
+  }
+}
+
 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
 {
 #ifndef PRODUCT
@@ -2888,41 +2934,40 @@
   cmp(src1, rscratch1);
 }
 
+void MacroAssembler::store_check(Register obj, Address dst) {
+  store_check(obj);
+}
+
 void MacroAssembler::store_check(Register obj) {
   // Does a store check for the oop in register obj. The content of
   // register obj is destroyed afterwards.
-  store_check_part_1(obj);
-  store_check_part_2(obj);
-}
-
-void MacroAssembler::store_check(Register obj, Address dst) {
-  store_check(obj);
-}
-
-
-// split the store check operation so that other instructions can be scheduled inbetween
-void MacroAssembler::store_check_part_1(Register obj) {
+
   BarrierSet* bs = Universe::heap()->barrier_set();
   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
+
+  CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
+  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
   lsr(obj, obj, CardTableModRefBS::card_shift);
-}
-
-void MacroAssembler::store_check_part_2(Register obj) {
-  BarrierSet* bs = Universe::heap()->barrier_set();
-  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
-  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
-  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
-
-  // The calculation for byte_map_base is as follows:
-  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
-  // So this essentially converts an address to a displacement and
-  // it will never need to be relocated.
-
-  // FIXME: It's not likely that disp will fit into an offset so we
-  // don't bother to check, but it could save an instruction.
-  intptr_t disp = (intptr_t) ct->byte_map_base;
-  mov(rscratch1, disp);
-  strb(zr, Address(obj, rscratch1));
+
+  assert(CardTableModRefBS::dirty_card_val() == 0, "must be");
+
+  {
+    ExternalAddress cardtable((address) ct->byte_map_base);
+    unsigned long offset;
+    adrp(rscratch1, cardtable, offset);
+    assert(offset == 0, "byte_map_base is misaligned");
+  }
+
+  if (UseCondCardMark) {
+    Label L_already_dirty;
+    ldrb(rscratch2,  Address(obj, rscratch1));
+    cbz(rscratch2, L_already_dirty);
+    strb(zr, Address(obj, rscratch1));
+    bind(L_already_dirty);
+  } else {
+    strb(zr, Address(obj, rscratch1));
+  }
 }
 
 void MacroAssembler::load_klass(Register dst, Register src) {
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -36,6 +36,7 @@
 class MacroAssembler: public Assembler {
   friend class LIR_Assembler;
 
+ public:
   using Assembler::mov;
   using Assembler::movi;
 
@@ -465,44 +466,7 @@
 
   void movptr(Register r, uintptr_t imm64);
 
-  // Macro to mov replicated immediate to vector register.
-  // Where imm32 == hex abcdefgh, Vd will get the following values
-  // for different arrangements in T
-  //   T8B:  Vd = ghghghghghghghgh
-  //   T16B: Vd = ghghghghghghghghghghghghghghghgh
-  //   T4H:  Vd = efghefghefghefgh
-  //   T8H:  Vd = efghefghefghefghefghefghefghefgh
-  //   T2S:  Vd = abcdefghabcdefgh
-  //   T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
-  //   T1D/T2D: invalid
-  void mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
-    assert(T != T1D && T != T2D, "invalid arrangement");
-    u_int32_t nimm32 = ~imm32;
-    if (T == T8B || T == T16B) { imm32 &= 0xff; nimm32 &= 0xff; }
-    if (T == T4H || T == T8H) { imm32 &= 0xffff; nimm32 &= 0xffff; }
-    u_int32_t x = imm32;
-    int movi_cnt = 0;
-    int movn_cnt = 0;
-    while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
-    x = nimm32;
-    while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
-    if (movn_cnt < movi_cnt) imm32 = nimm32;
-    unsigned lsl = 0;
-    while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
-    if (movn_cnt < movi_cnt)
-      mvni(Vd, T, imm32 & 0xff, lsl);
-    else
-      movi(Vd, T, imm32 & 0xff, lsl);
-    imm32 >>= 8; lsl += 8;
-    while (imm32) {
-      while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
-      if (movn_cnt < movi_cnt)
-        bici(Vd, T, imm32 & 0xff, lsl);
-      else
-        orri(Vd, T, imm32 & 0xff, lsl);
-      lsl += 8; imm32 >>= 8;
-    }
-  }
+  void mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32);
 
   // macro instructions for accessing and updating floating point
   // status register
@@ -756,10 +720,6 @@
 
 #endif // INCLUDE_ALL_GCS
 
-  // split store_check(Register obj) to enhance instruction interleaving
-  void store_check_part_1(Register obj);
-  void store_check_part_2(Register obj);
-
   // oop manipulations
   void load_klass(Register dst, Register src);
   void store_klass(Register dst, Register src);
--- a/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -2120,6 +2120,7 @@
       save_native_result(masm, ret_type, stack_slots);
     }
 
+    __ mov(c_rarg2, rthread);
     __ lea(c_rarg1, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
     __ mov(c_rarg0, obj_reg);
 
@@ -2128,7 +2129,7 @@
     __ ldr(r19, Address(rthread, in_bytes(Thread::pending_exception_offset())));
     __ str(zr, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 
-    rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), 2, 0, 1);
+    rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), 3, 0, 1);
 
 #ifdef ASSERT
     {
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -190,10 +190,21 @@
     }
   }
 
+  if (UseGHASHIntrinsics) {
+    warning("GHASH intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  }
+
   if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
     UseCRC32Intrinsics = true;
   }
 
+  if (UseCRC32CIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
+      warning("CRC32C intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+  }
+
   if (auxv & (HWCAP_SHA1 | HWCAP_SHA2)) {
     if (FLAG_IS_DEFAULT(UseSHA)) {
       FLAG_SET_DEFAULT(UseSHA, true);
@@ -228,6 +239,9 @@
       warning("SHA512 instruction (for SHA-384 and SHA-512) is not available on this CPU.");
       FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
     }
+    if (!(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics)) {
+      FLAG_SET_DEFAULT(UseSHA, false);
+    }
   }
 
   // This machine allows unaligned memory accesses
@@ -243,6 +257,10 @@
     UseBarriersForVolatile = (_cpuFeatures & CPU_DMB_ATOMICS) != 0;
   }
 
+  if (FLAG_IS_DEFAULT(UsePopCountInstruction)) {
+    UsePopCountInstruction = true;
+  }
+
 #ifdef COMPILER2
   if (FLAG_IS_DEFAULT(OptoScheduling)) {
     OptoScheduling = true;
--- a/src/cpu/ppc/vm/globals_ppc.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/ppc/vm/globals_ppc.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -63,7 +63,7 @@
 define_pd_global(uintx, TypeProfileLevel, 111);
 
 // Platform dependent flag handling: flags only defined on this platform.
-#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct)  \
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint)  \
                                                                             \
   /* Load poll address from thread. This is used to implement per-thread */ \
   /* safepoints on platforms != IA64. */                                    \
--- a/src/cpu/ppc/vm/sharedRuntime_ppc.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/ppc/vm/sharedRuntime_ppc.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -2475,7 +2475,8 @@
 
     // Slow case of monitor enter.
     // Inline a special case of call_VM that disallows any pending_exception.
-    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), r_oop, r_box);
+    // Arguments are (oop obj, BasicLock* lock, JavaThread* thread).
+    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), r_oop, r_box, R16_thread);
 
     __ asm_assert_mem8_is_zero(thread_(pending_exception),
        "no pending exception allowed on exit from SharedRuntime::complete_monitor_unlocking_C", 0);
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -176,6 +176,11 @@
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
 
+  if (UseGHASHIntrinsics) {
+    warning("GHASH intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  }
+
   if (UseSHA) {
     warning("SHA instructions are not available on this CPU");
     FLAG_SET_DEFAULT(UseSHA, false);
@@ -186,6 +191,13 @@
     FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
     FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
   }
+
+  if (UseCRC32CIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
+      warning("CRC32C intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+  }
+
   // Adjust RTM (Restricted Transactional Memory) flags.
   if (!has_tcheck() && UseRTMLocking) {
     // Can't continue because UseRTMLocking affects UseBiasedLocking flag
@@ -505,7 +517,8 @@
 
 void VM_Version::determine_features() {
 #if defined(ABI_ELFv2)
-  const int code_size = (num_features+1+2*7)*BytesPerInstWord; // TODO(asmundak): calculation is incorrect.
+  // 1 InstWord per call for the blr instruction.
+  const int code_size = (num_features+1+2*1)*BytesPerInstWord;
 #else
   // 7 InstWords for each call (function descriptor + blr instruction).
   const int code_size = (num_features+1+2*7)*BytesPerInstWord;
@@ -540,7 +553,8 @@
   a->popcntw(R7, R5);                          // code[6]  -> popcntw
   a->fcfids(F3, F4);                           // code[7]  -> fcfids
   a->vand(VR0, VR0, VR0);                      // code[8]  -> vand
-  a->lqarx_unchecked(R7, R3_ARG1, R4_ARG2, 1); // code[9]  -> lqarx_m
+  // arg0 of lqarx must be an even register, (arg1 + arg2) must be a multiple of 16
+  a->lqarx_unchecked(R6, R3_ARG1, R4_ARG2, 1); // code[9]  -> lqarx_m
   a->vcipher(VR0, VR1, VR2);                   // code[10] -> vcipher
   a->vpmsumb(VR0, VR1, VR2);                   // code[11] -> vpmsumb
   a->tcheck(0);                                // code[12] -> tcheck
@@ -572,7 +586,8 @@
 
   // Execute code. Illegal instructions will be replaced by 0 in the signal handler.
   VM_Version::_is_determine_features_test_running = true;
-  (*test)((address)mid_of_test_area, (uint64_t)0);
+  // We must align the first argument to 16 bytes because of the lqarx check.
+  (*test)((address)align_size_up((intptr_t)mid_of_test_area, 16), (uint64_t)0);
   VM_Version::_is_determine_features_test_running = false;
 
   // determine which instructions are legal.
@@ -614,12 +629,12 @@
   MacroAssembler* a = new MacroAssembler(&cb);
 
   // Emit code.
-  uint64_t (*get_dscr)() = (uint64_t(*)())(void *)a->emit_fd();
+  uint64_t (*get_dscr)() = (uint64_t(*)())(void *)a->function_entry();
   uint32_t *code = (uint32_t *)a->pc();
   a->mfdscr(R3);
   a->blr();
 
-  void (*set_dscr)(long) = (void(*)(long))(void *)a->emit_fd();
+  void (*set_dscr)(long) = (void(*)(long))(void *)a->function_entry();
   a->mtdscr(R3);
   a->blr();
 
--- a/src/cpu/sparc/vm/assembler_sparc.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/sparc/vm/assembler_sparc.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -128,7 +128,11 @@
     faligndata_op3 = 0x36,
     flog3_op3    = 0x36,
     edge_op3     = 0x36,
+    fzero_op3    = 0x36,
     fsrc_op3     = 0x36,
+    fnot_op3     = 0x36,
+    xmulx_op3    = 0x36,
+    crc32c_op3   = 0x36,
     impdep2_op3  = 0x37,
     stpartialf_op3 = 0x37,
     jmpl_op3     = 0x38,
@@ -220,6 +224,8 @@
     mdtox_opf          = 0x110,
     mstouw_opf         = 0x111,
     mstosw_opf         = 0x113,
+    xmulx_opf          = 0x115,
+    xmulxhi_opf        = 0x116,
     mxtod_opf          = 0x118,
     mwtos_opf          = 0x119,
 
@@ -228,7 +234,9 @@
 
     sha1_opf           = 0x141,
     sha256_opf         = 0x142,
-    sha512_opf         = 0x143
+    sha512_opf         = 0x143,
+
+    crc32c_opf         = 0x147
   };
 
   enum op5s {
@@ -597,6 +605,11 @@
     return x & ((1 << 10) - 1);
   }
 
+  // create a low12 __value__ (not a field) for a given a 32-bit constant
+  static int low12( int x ) {
+    return x & ((1 << 12) - 1);
+  }
+
   // AES crypto instructions supported only on certain processors
   static void aes_only() { assert( VM_Version::has_aes(), "This instruction only works on SPARC with AES instructions support"); }
 
@@ -605,6 +618,9 @@
   static void sha256_only() { assert( VM_Version::has_sha256(), "This instruction only works on SPARC with SHA256"); }
   static void sha512_only() { assert( VM_Version::has_sha512(), "This instruction only works on SPARC with SHA512"); }
 
+  // CRC32C instruction supported only on certain processors
+  static void crc32c_only() { assert( VM_Version::has_crc32c(), "This instruction only works on SPARC with CRC32C"); }
+
   // instruction only in VIS1
   static void vis1_only() { assert( VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); }
 
@@ -1019,6 +1035,7 @@
 
   void nop() { emit_int32( op(branch_op) | op2(sethi_op2) ); }
 
+  void sw_count() { emit_int32( op(branch_op) | op2(sethi_op2) | 0x3f0 ); }
 
   // pp 202
 
@@ -1195,8 +1212,14 @@
 
   void faligndata( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(faligndata_op3) | fs1(s1, FloatRegisterImpl::D) | opf(faligndata_opf) | fs2(s2, FloatRegisterImpl::D)); }
 
+  void fzero( FloatRegisterImpl::Width w, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fzero_op3) | opf(0x62 - w)); }
+
   void fsrc2( FloatRegisterImpl::Width w, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fsrc_op3) | opf(0x7A - w) | fs2(s2, w)); }
 
+  void fnot1( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fnot_op3) | fs1(s1, w) | opf(0x6C - w)); }
+
+  void fpmerge( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(0x36) | fs1(s1, FloatRegisterImpl::S) | opf(0x4b) | fs2(s2, FloatRegisterImpl::S)); }
+
   void stpartialf( Register s1, Register s2, FloatRegister d, int ia = -1 ) { vis1_only(); emit_int32( op(ldst_op) | fd(d, FloatRegisterImpl::D) | op3(stpartialf_op3) | rs1(s1) | imm_asi(ia) | rs2(s2)); }
 
   //  VIS2 instructions
@@ -1212,12 +1235,19 @@
   void movwtos( Register s, FloatRegister d ) { vis3_only();  emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(mftoi_op3) | opf(mwtos_opf) | rs2(s)); }
   void movxtod( Register s, FloatRegister d ) { vis3_only();  emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(mftoi_op3) | opf(mxtod_opf) | rs2(s)); }
 
+  void xmulx(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulx_opf) | rs2(s2)); }
+  void xmulxhi(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2)); }
+
   // Crypto SHA instructions
 
   void sha1()   { sha1_only();    emit_int32( op(arith_op) | op3(sha_op3) | opf(sha1_opf)); }
   void sha256() { sha256_only();  emit_int32( op(arith_op) | op3(sha_op3) | opf(sha256_opf)); }
   void sha512() { sha512_only();  emit_int32( op(arith_op) | op3(sha_op3) | opf(sha512_opf)); }
 
+  // CRC32C instruction
+
+  void crc32c( FloatRegister s1, FloatRegister s2, FloatRegister d ) { crc32c_only();  emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(crc32c_op3) | fs1(s1, FloatRegisterImpl::D) | opf(crc32c_opf) | fs2(s2, FloatRegisterImpl::D)); }
+
   // Creation
   Assembler(CodeBuffer* code) : AbstractAssembler(code) {
 #ifdef CHECK_DELAY
--- a/src/cpu/sparc/vm/frame_sparc.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/sparc/vm/frame_sparc.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -599,12 +599,6 @@
   return next_younger_sp_or_null(valid_sp, sp) != NULL;
 }
 
-
-bool frame::interpreter_frame_equals_unpacked_fp(intptr_t* fp) {
-  assert(is_interpreted_frame(), "must be interpreter frame");
-  return this->fp() == fp;
-}
-
 bool frame::is_interpreted_frame_valid(JavaThread* thread) const {
 #ifdef CC_INTERP
   // Is there anything to do?
--- a/src/cpu/sparc/vm/globals_sparc.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/sparc/vm/globals_sparc.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -81,7 +81,7 @@
 
 define_pd_global(uintx, TypeProfileLevel, 111);
 
-#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
                                                                             \
   product(intx, UseVIS, 99,                                                 \
           "Highest supported VIS instructions set on Sparc")                \
--- a/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -956,6 +956,7 @@
 
   int hi = (int)(value >> 32);
   int lo = (int)(value & ~0);
+  int bits_33to2 = (int)((value >> 2) & ~0);
   // (Matcher::isSimpleConstant64 knows about the following optimizations.)
   if (Assembler::is_simm13(lo) && value == lo) {
     or3(G0, lo, d);
@@ -964,6 +965,12 @@
     if (low10(lo) != 0)
       or3(d, low10(lo), d);
   }
+  else if ((hi >> 2) == 0) {
+    Assembler::sethi(bits_33to2, d);  // hardware version zero-extends to upper 32
+    sllx(d, 2, d);
+    if (low12(lo) != 0)
+      or3(d, low12(lo), d);
+  }
   else if (hi == -1) {
     Assembler::sethi(~lo, d);  // hardware version zero-extends to upper 32
     xor3(d, low10(lo) ^ ~low10(~0), d);
@@ -4351,3 +4358,52 @@
   cmp_and_brx_short(to, end, Assembler::lessUnsigned, Assembler::pt, small_loop);
   nop(); // Separate short branches
 }
+
+/**
+ * Update CRC-32[C] with a byte value according to constants in table
+ *
+ * @param [in,out]crc   Register containing the crc.
+ * @param [in]val       Register containing the byte to fold into the CRC.
+ * @param [in]table     Register containing the table of crc constants.
+ *
+ * uint32_t crc;
+ * val = crc_table[(val ^ crc) & 0xFF];
+ * crc = val ^ (crc >> 8);
+ */
+void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
+  xor3(val, crc, val);
+  and3(val, 0xFF, val);
+  sllx(val, 2, val);
+  lduw(table, val, val);
+  srlx(crc, 8, crc);
+  xor3(val, crc, crc);
+}
+
+// Reverse byte order of lower 32 bits, assuming upper 32 bits all zeros
+void MacroAssembler::reverse_bytes_32(Register src, Register dst, Register tmp) {
+  srlx(src, 24, dst);
+
+  sllx(src, 32+8, tmp);
+  srlx(tmp, 32+24, tmp);
+  sllx(tmp, 8, tmp);
+  or3(dst, tmp, dst);
+
+  sllx(src, 32+16, tmp);
+  srlx(tmp, 32+24, tmp);
+  sllx(tmp, 16, tmp);
+  or3(dst, tmp, dst);
+
+  sllx(src, 32+24, tmp);
+  srlx(tmp, 32, tmp);
+  or3(dst, tmp, dst);
+}
+
+void MacroAssembler::movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2) {
+  reverse_bytes_32(src, tmp1, tmp2);
+  movxtod(tmp1, dst);
+}
+
+void MacroAssembler::movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2) {
+  movdtox(src, tmp1);
+  reverse_bytes_32(tmp1, dst, tmp2);
+}
--- a/src/cpu/sparc/vm/macroAssembler_sparc.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/sparc/vm/macroAssembler_sparc.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -903,6 +903,10 @@
   inline void ldf(FloatRegisterImpl::Width w, Register s1, RegisterOrConstant s2, FloatRegister d);
   inline void ldf(FloatRegisterImpl::Width w, const Address& a, FloatRegister d, int offset = 0);
 
+  // little-endian
+  inline void ldxl(Register s1, Register s2, Register d) { ldxa(s1, s2, ASI_PRIMARY_LITTLE, d); }
+  inline void ldfl(FloatRegisterImpl::Width w, Register s1, Register s2, FloatRegister d) { ldfa(w, s1, s2, ASI_PRIMARY_LITTLE, d); }
+
   // membar psuedo instruction.  takes into account target memory model.
   inline void membar( Assembler::Membar_mask_bits const7a );
 
@@ -1436,6 +1440,14 @@
   // Use BIS for zeroing
   void bis_zeroing(Register to, Register count, Register temp, Label& Ldone);
 
+  // Update CRC-32[C] with a byte value according to constants in table
+  void update_byte_crc32(Register crc, Register val, Register table);
+
+  // Reverse byte order of lower 32 bits, assuming upper 32 bits all zeros
+  void reverse_bytes_32(Register src, Register dst, Register tmp);
+  void movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2);
+  void movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2);
+
 #undef VIRTUAL
 };
 
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -4786,6 +4786,330 @@
     return start;
   }
 
+  /* Single and multi-block ghash operations */
+  address generate_ghash_processBlocks() {
+      __ align(CodeEntryAlignment);
+      Label L_ghash_loop, L_aligned, L_main;
+      StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+      address start = __ pc();
+
+      Register state = I0;
+      Register subkeyH = I1;
+      Register data = I2;
+      Register len = I3;
+
+      __ save_frame(0);
+
+      __ ldx(state, 0, O0);
+      __ ldx(state, 8, O1);
+
+      // Loop label for multiblock operations
+      __ BIND(L_ghash_loop);
+
+      // Check if 'data' is unaligned
+      __ andcc(data, 7, G1);
+      __ br(Assembler::zero, false, Assembler::pt, L_aligned);
+      __ delayed()->nop();
+
+      Register left_shift = L1;
+      Register right_shift = L2;
+      Register data_ptr = L3;
+
+      // Get left and right shift values in bits
+      __ sll(G1, LogBitsPerByte, left_shift);
+      __ mov(64, right_shift);
+      __ sub(right_shift, left_shift, right_shift);
+
+      // Align to read 'data'
+      __ sub(data, G1, data_ptr);
+
+      // Load first 8 bytes of 'data'
+      __ ldx(data_ptr, 0, O4);
+      __ sllx(O4, left_shift, O4);
+      __ ldx(data_ptr, 8, O5);
+      __ srlx(O5, right_shift, G4);
+      __ bset(G4, O4);
+
+      // Load second 8 bytes of 'data'
+      __ sllx(O5, left_shift, O5);
+      __ ldx(data_ptr, 16, G4);
+      __ srlx(G4, right_shift, G4);
+      __ ba(L_main);
+      __ delayed()->bset(G4, O5);
+
+      // If 'data' is aligned, load normally
+      __ BIND(L_aligned);
+      __ ldx(data, 0, O4);
+      __ ldx(data, 8, O5);
+
+      __ BIND(L_main);
+      __ ldx(subkeyH, 0, O2);
+      __ ldx(subkeyH, 8, O3);
+
+      __ xor3(O0, O4, O0);
+      __ xor3(O1, O5, O1);
+
+      __ xmulxhi(O0, O3, G3);
+      __ xmulx(O0, O2, O5);
+      __ xmulxhi(O1, O2, G4);
+      __ xmulxhi(O1, O3, G5);
+      __ xmulx(O0, O3, G1);
+      __ xmulx(O1, O3, G2);
+      __ xmulx(O1, O2, O3);
+      __ xmulxhi(O0, O2, O4);
+
+      __ mov(0xE1, O0);
+      __ sllx(O0, 56, O0);
+
+      __ xor3(O5, G3, O5);
+      __ xor3(O5, G4, O5);
+      __ xor3(G5, G1, G1);
+      __ xor3(G1, O3, G1);
+      __ srlx(G2, 63, O1);
+      __ srlx(G1, 63, G3);
+      __ sllx(G2, 63, O3);
+      __ sllx(G2, 58, O2);
+      __ xor3(O3, O2, O2);
+
+      __ sllx(G1, 1, G1);
+      __ or3(G1, O1, G1);
+
+      __ xor3(G1, O2, G1);
+
+      __ sllx(G2, 1, G2);
+
+      __ xmulxhi(G1, O0, O1);
+      __ xmulx(G1, O0, O2);
+      __ xmulxhi(G2, O0, O3);
+      __ xmulx(G2, O0, G1);
+
+      __ xor3(O4, O1, O4);
+      __ xor3(O5, O2, O5);
+      __ xor3(O5, O3, O5);
+
+      __ sllx(O4, 1, O2);
+      __ srlx(O5, 63, O3);
+
+      __ or3(O2, O3, O0);
+
+      __ sllx(O5, 1, O1);
+      __ srlx(G1, 63, O2);
+      __ or3(O1, O2, O1);
+      __ xor3(O1, G3, O1);
+
+      __ deccc(len);
+      __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
+      __ delayed()->add(data, 16, data);
+
+      __ stx(O0, I0, 0);
+      __ stx(O1, I0, 8);
+
+      __ ret();
+      __ delayed()->restore();
+
+      return start;
+  }
+
+#define CHUNK_LEN   128          /* 128 x 8B = 1KB */
+#define CHUNK_K1    0x1307a0206  /* reverseBits(pow(x, CHUNK_LEN*8*8*3 - 32) mod P(x)) << 1 */
+#define CHUNK_K2    0x1a0f717c4  /* reverseBits(pow(x, CHUNK_LEN*8*8*2 - 32) mod P(x)) << 1 */
+#define CHUNK_K3    0x0170076fa  /* reverseBits(pow(x, CHUNK_LEN*8*8*1 - 32) mod P(x)) << 1 */
+
+  /**
+   *  Arguments:
+   *
+   * Inputs:
+   *   O0   - int   crc
+   *   O1   - byte* buf
+   *   O2   - int   len
+   *   O3   - int*  table
+   *
+   * Output:
+   *   O0   - int crc result
+   */
+  address generate_updateBytesCRC32C() {
+    assert(UseCRC32CIntrinsics, "need CRC32C instruction");
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
+    address start = __ pc();
+
+    const Register crc   = O0;  // crc
+    const Register buf   = O1;  // source java byte array address
+    const Register len   = O2;  // number of bytes
+    const Register table = O3;  // byteTable
+
+    Label L_crc32c_head, L_crc32c_aligned;
+    Label L_crc32c_parallel, L_crc32c_parallel_loop;
+    Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop;
+    Label L_crc32c_done, L_crc32c_tail, L_crc32c_return;
+
+    __ cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return);
+
+    // clear upper 32 bits of crc
+    __ clruwu(crc);
+
+    __ and3(buf, 7, G4);
+    __ cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned);
+
+    __ mov(8, G1);
+    __ sub(G1, G4, G4);
+
+    // ------ process the misaligned head (7 bytes or less) ------
+    __ BIND(L_crc32c_head);
+
+    // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
+    __ ldub(buf, 0, G1);
+    __ update_byte_crc32(crc, G1, table);
+
+    __ inc(buf);
+    __ dec(len);
+    __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return);
+    __ dec(G4);
+    __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head);
+
+    // ------ process the 8-byte-aligned body ------
+    __ BIND(L_crc32c_aligned);
+    __ nop();
+    __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail);
+
+    // reverse the byte order of lower 32 bits to big endian, and move to FP side
+    __ movitof_revbytes(crc, F0, G1, G3);
+
+    __ set(CHUNK_LEN*8*4, G4);
+    __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial);
+
+    // ------ process four 1KB chunks in parallel ------
+    __ BIND(L_crc32c_parallel);
+
+    __ fzero(FloatRegisterImpl::D, F2);
+    __ fzero(FloatRegisterImpl::D, F4);
+    __ fzero(FloatRegisterImpl::D, F6);
+
+    __ mov(CHUNK_LEN - 1, G4);
+    __ BIND(L_crc32c_parallel_loop);
+    // schedule ldf's ahead of crc32c's to hide the load-use latency
+    __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
+    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
+    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
+    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14);
+    __ crc32c(F0, F8,  F0);
+    __ crc32c(F2, F10, F2);
+    __ crc32c(F4, F12, F4);
+    __ crc32c(F6, F14, F6);
+    __ inc(buf, 8);
+    __ dec(G4);
+    __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop);
+
+    __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
+    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
+    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
+    __ crc32c(F0, F8,  F0);
+    __ crc32c(F2, F10, F2);
+    __ crc32c(F4, F12, F4);
+
+    __ inc(buf, CHUNK_LEN*24);
+    __ ldfl(FloatRegisterImpl::D, buf, G0, F14);  // load in little endian
+    __ inc(buf, 8);
+
+    __ prefetch(buf, 0,            Assembler::severalReads);
+    __ prefetch(buf, CHUNK_LEN*8,  Assembler::severalReads);
+    __ prefetch(buf, CHUNK_LEN*16, Assembler::severalReads);
+    __ prefetch(buf, CHUNK_LEN*24, Assembler::severalReads);
+
+    // move to INT side, and reverse the byte order of lower 32 bits to little endian
+    __ movftoi_revbytes(F0, O4, G1, G4);
+    __ movftoi_revbytes(F2, O5, G1, G4);
+    __ movftoi_revbytes(F4, G5, G1, G4);
+
+    // combine the results of 4 chunks
+    __ set64(CHUNK_K1, G3, G1);
+    __ xmulx(O4, G3, O4);
+    __ set64(CHUNK_K2, G3, G1);
+    __ xmulx(O5, G3, O5);
+    __ set64(CHUNK_K3, G3, G1);
+    __ xmulx(G5, G3, G5);
+
+    __ movdtox(F14, G4);
+    __ xor3(O4, O5, O5);
+    __ xor3(G5, O5, O5);
+    __ xor3(G4, O5, O5);
+
+    // reverse the byte order to big endian, via stack, and move to FP side
+    __ add(SP, -8, G1);
+    __ srlx(G1, 3, G1);
+    __ sllx(G1, 3, G1);
+    __ stx(O5, G1, G0);
+    __ ldfl(FloatRegisterImpl::D, G1, G0, F2);  // load in little endian
+
+    __ crc32c(F6, F2, F0);
+
+    __ set(CHUNK_LEN*8*4, G4);
+    __ sub(len, G4, len);
+    __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel);
+    __ nop();
+    __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done);
+
+    __ BIND(L_crc32c_serial);
+
+    __ mov(32, G4);
+    __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8);
+
+    // ------ process 32B chunks ------
+    __ BIND(L_crc32c_x32_loop);
+    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+    __ inc(buf, 8);
+    __ crc32c(F0, F2, F0);
+    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+    __ inc(buf, 8);
+    __ crc32c(F0, F2, F0);
+    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+    __ inc(buf, 8);
+    __ crc32c(F0, F2, F0);
+    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+    __ inc(buf, 8);
+    __ crc32c(F0, F2, F0);
+    __ dec(len, 32);
+    __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop);
+
+    __ BIND(L_crc32c_x8);
+    __ nop();
+    __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done);
+
+    // ------ process 8B chunks ------
+    __ BIND(L_crc32c_x8_loop);
+    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+    __ inc(buf, 8);
+    __ crc32c(F0, F2, F0);
+    __ dec(len, 8);
+    __ cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop);
+
+    __ BIND(L_crc32c_done);
+
+    // move to INT side, and reverse the byte order of lower 32 bits to little endian
+    __ movftoi_revbytes(F0, crc, G1, G3);
+
+    __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return);
+
+    // ------ process the misaligned tail (7 bytes or less) ------
+    __ BIND(L_crc32c_tail);
+
+    // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
+    __ ldub(buf, 0, G1);
+    __ update_byte_crc32(crc, G1, table);
+
+    __ inc(buf);
+    __ dec(len);
+    __ cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail);
+
+    __ BIND(L_crc32c_return);
+    __ nop();
+    __ retl();
+    __ delayed()->nop();
+
+    return start;
+  }
+
   void generate_initial() {
     // Generates all stubs and initializes the entry points
 
@@ -4859,6 +5183,10 @@
       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
     }
+    // generate GHASH intrinsics code
+    if (UseGHASHIntrinsics) {
+      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+    }
 
     // generate SHA1/SHA256/SHA512 intrinsics code
     if (UseSHA1Intrinsics) {
@@ -4873,6 +5201,11 @@
       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
     }
+
+    // generate CRC32C intrinsic code
+    if (UseCRC32CIntrinsics) {
+      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
+    }
   }
 
 
--- a/src/cpu/sparc/vm/stubRoutines_sparc.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/sparc/vm/stubRoutines_sparc.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -41,7 +41,7 @@
 enum /* platform_dependent_constants */ {
   // %%%%%%%% May be able to shrink this a lot
   code_size1 = 20000,           // simply increase if too small (assembler will crash if too small)
-  code_size2 = 23000            // simply increase if too small (assembler will crash if too small)
+  code_size2 = 24000            // simply increase if too small (assembler will crash if too small)
 };
 
 class Sparc {
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -230,7 +230,7 @@
   assert((OptoLoopAlignment % relocInfo::addr_unit()) == 0, "alignment is not a multiple of NOP size");
 
   char buf[512];
-  jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                (has_v9() ? ", v9" : (has_v8() ? ", v8" : "")),
                (has_hardware_popc() ? ", popc" : ""),
                (has_vis1() ? ", vis1" : ""),
@@ -242,6 +242,7 @@
                (has_sha1() ? ", sha1" : ""),
                (has_sha256() ? ", sha256" : ""),
                (has_sha512() ? ", sha512" : ""),
+               (has_crc32c() ? ", crc32c" : ""),
                (is_ultra3() ? ", ultra3" : ""),
                (is_sun4v() ? ", sun4v" : ""),
                (is_niagara_plus() ? ", niagara_plus" : (is_niagara() ? ", niagara" : "")),
@@ -300,6 +301,17 @@
     }
   }
 
+  // GHASH/GCM intrinsics
+  if (has_vis3() && (UseVIS > 2)) {
+    if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+      UseGHASHIntrinsics = true;
+    }
+  } else if (UseGHASHIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+      warning("GHASH intrinsics require VIS3 insructions support. Intriniscs will be disabled");
+    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  }
+
   // SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times
   if (has_sha1() || has_sha256() || has_sha512()) {
     if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions
@@ -352,6 +364,23 @@
     }
   }
 
+  // SPARC T4 and above should have support for CRC32C instruction
+  if (has_crc32c()) {
+    if (UseVIS > 2) { // CRC32C intrinsics use VIS3 instructions
+      if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
+        FLAG_SET_DEFAULT(UseCRC32CIntrinsics, true);
+      }
+    } else {
+      if (UseCRC32CIntrinsics) {
+        warning("SPARC CRC32C intrinsics require VIS3 instruction support. Intrinsics will be disabled.");
+        FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+      }
+    }
+  } else if (UseCRC32CIntrinsics) {
+    warning("CRC32C instruction is not available on this CPU");
+    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+  }
+
   if (FLAG_IS_DEFAULT(ContendedPaddingWidth) &&
     (cache_line_size > ContendedPaddingWidth))
     ContendedPaddingWidth = cache_line_size;
--- a/src/cpu/sparc/vm/vm_version_sparc.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/sparc/vm/vm_version_sparc.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -53,7 +53,8 @@
     aes_instructions     = 19,
     sha1_instruction     = 20,
     sha256_instruction   = 21,
-    sha512_instruction   = 22
+    sha512_instruction   = 22,
+    crc32c_instruction   = 23
   };
 
   enum Feature_Flag_Set {
@@ -83,6 +84,7 @@
     sha1_instruction_m      = 1 << sha1_instruction,
     sha256_instruction_m    = 1 << sha256_instruction,
     sha512_instruction_m    = 1 << sha512_instruction,
+    crc32c_instruction_m    = 1 << crc32c_instruction,
 
     generic_v8_m        = v8_instructions_m | hardware_mul32_m | hardware_div32_m | hardware_fsmuld_m,
     generic_v9_m        = generic_v8_m | v9_instructions_m,
@@ -141,6 +143,7 @@
   static bool has_sha1()                { return (_features & sha1_instruction_m) != 0; }
   static bool has_sha256()              { return (_features & sha256_instruction_m) != 0; }
   static bool has_sha512()              { return (_features & sha512_instruction_m) != 0; }
+  static bool has_crc32c()              { return (_features & crc32c_instruction_m) != 0; }
 
   static bool supports_compare_and_exchange()
                                         { return has_v9(); }
--- a/src/cpu/x86/vm/assembler_x86.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1347,7 +1347,7 @@
 
 void Assembler::andnl(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(dst, src1, src2, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(dst, src1, src2, false);
   emit_int8((unsigned char)0xF2);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1355,7 +1355,7 @@
 void Assembler::andnl(Register dst, Register src1, Address src2) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(dst, src1, src2, false);
+  vex_prefix_0F38_legacy(dst, src1, src2, false);
   emit_int8((unsigned char)0xF2);
   emit_operand(dst, src2);
 }
@@ -1382,7 +1382,7 @@
 
 void Assembler::blsil(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(rbx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rbx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1390,14 +1390,14 @@
 void Assembler::blsil(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(rbx, dst, src, false);
+  vex_prefix_0F38_legacy(rbx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_operand(rbx, src);
 }
 
 void Assembler::blsmskl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(rdx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rdx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1412,7 +1412,7 @@
 
 void Assembler::blsrl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(rcx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rcx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1420,7 +1420,7 @@
 void Assembler::blsrl(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(rcx, dst, src, false);
+  vex_prefix_0F38_legacy(rcx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_operand(rcx, src);
 }
@@ -3095,26 +3095,35 @@
 void Assembler::psrldq(XMMRegister dst, int shift) {
   // Shift 128 bit value in xmm register by number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F,
-                                      false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift);
 }
 
+void Assembler::pslldq(XMMRegister dst, int shift) {
+  // Shift left 128 bit value in xmm register by number of bytes.
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  emit_int8(0x73);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(shift);
+}
+
 void Assembler::ptest(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false,
+              VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
 
 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                      false, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
+                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3126,7 +3135,7 @@
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len, true, false);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
@@ -3135,7 +3144,7 @@
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
   int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38);
+                                     vector_len, VEX_OPCODE_0F_38, true, false);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3146,12 +3155,12 @@
   if (VM_Version::supports_evex()) {
     tuple_type = EVEX_FVM;
   }
-  emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
 }
 
 void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
 }
 
 void Assembler::punpckldq(XMMRegister dst, Address src) {
@@ -4979,7 +4988,51 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-// duplicate 4-bytes integer data from src into 8 locations in dest
+// duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+                                     vector_len, VEX_OPCODE_0F_38, false);
+  emit_int8(0x78);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastb(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_8bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  emit_int8(0x78);
+  emit_operand(dst, src);
+}
+
+// duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+                                     vector_len, VEX_OPCODE_0F_38, false);
+  emit_int8(0x79);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastw(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_16bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  emit_int8(0x79);
+  emit_operand(dst, src);
+}
+
+// duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
@@ -4988,6 +5041,121 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::evpbroadcastd(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_32bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  emit_int8(0x58);
+  emit_operand(dst, src);
+}
+
+// duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  emit_int8(0x59);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastq(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_64bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  emit_int8(0x59);
+  emit_operand(dst, src);
+}
+
+// duplicate single precision fp from src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  emit_int8(0x18);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastss(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_32bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  emit_int8(0x18);
+  emit_operand(dst, src);
+}
+
+// duplicate double precision fp from src into 2|4|8 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  emit_int8(0x19);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastsd(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_64bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  emit_int8(0x19);
+  emit_operand(dst, src);
+}
+
+// duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  emit_int8(0x7A);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  emit_int8(0x7B);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastd(XMMRegister dst, Register src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  emit_int8(0x7C);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastq(XMMRegister dst, Register src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  emit_int8(0x7C);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 // Carry-Less Multiplication Quadword
 void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
   assert(VM_Version::supports_clmul(), "");
@@ -5598,7 +5766,7 @@
 
 void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre,
                            VexOpcode opc, bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg) {
-  bool vex_r = (xreg_enc >= 8);
+  bool vex_r = ((xreg_enc & 8) == 8) ? 1 : 0;
   bool vex_b = adr.base_needs_rex();
   bool vex_x = adr.index_needs_rex();
   avx_vector_len = vector_len;
@@ -5626,8 +5794,8 @@
 
 int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
                                      bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg ) {
-  bool vex_r = (dst_enc >= 8);
-  bool vex_b = (src_enc >= 8);
+  bool vex_r = ((dst_enc & 8) == 8) ? 1 : 0;
+  bool vex_b = ((src_enc & 8) == 8) ? 1 : 0;
   bool vex_x = false;
   avx_vector_len = vector_len;
 
@@ -6272,19 +6440,15 @@
 
 void Assembler::andnq(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_q(dst, src1, src2);
+  int encode = vex_prefix_0F38_and_encode_q_legacy(dst, src1, src2);
   emit_int8((unsigned char)0xF2);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::andnq(Register dst, Register src1, Address src2) {
-  if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-  }
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_q(dst, src1, src2);
+  vex_prefix_0F38_q_legacy(dst, src1, src2);
   emit_int8((unsigned char)0xF2);
   emit_operand(dst, src2);
 }
@@ -6311,7 +6475,7 @@
 
 void Assembler::blsiq(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_q(rbx, dst, src);
+  int encode = vex_prefix_0F38_and_encode_q_legacy(rbx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6319,14 +6483,14 @@
 void Assembler::blsiq(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_q(rbx, dst, src);
+  vex_prefix_0F38_q_legacy(rbx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rbx, src);
 }
 
 void Assembler::blsmskq(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_q(rdx, dst, src);
+  int encode = vex_prefix_0F38_and_encode_q_legacy(rdx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6334,14 +6498,14 @@
 void Assembler::blsmskq(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_q(rdx, dst, src);
+  vex_prefix_0F38_q_legacy(rdx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rdx, src);
 }
 
 void Assembler::blsrq(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_q(rcx, dst, src);
+  int encode = vex_prefix_0F38_and_encode_q_legacy(rcx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6349,7 +6513,7 @@
 void Assembler::blsrq(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_q(rcx, dst, src);
+  vex_prefix_0F38_q_legacy(rcx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rcx, src);
 }
--- a/src/cpu/x86/vm/assembler_x86.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -661,6 +661,14 @@
                vector_len, no_mask_reg);
   }
 
+  void vex_prefix_0F38_legacy(Register dst, Register nds, Address src, bool no_mask_reg = false) {
+    bool vex_w = false;
+    int vector_len = AVX_128bit;
+    vex_prefix(src, nds->encoding(), dst->encoding(),
+               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
+               vector_len, true, no_mask_reg);
+  }
+
   void vex_prefix_0F38_q(Register dst, Register nds, Address src, bool no_mask_reg = false) {
     bool vex_w = true;
     int vector_len = AVX_128bit;
@@ -668,6 +676,15 @@
                VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
                vector_len, no_mask_reg);
   }
+
+  void vex_prefix_0F38_q_legacy(Register dst, Register nds, Address src, bool no_mask_reg = false) {
+    bool vex_w = true;
+    int vector_len = AVX_128bit;
+    vex_prefix(src, nds->encoding(), dst->encoding(),
+               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
+               vector_len, true, no_mask_reg);
+  }
+
   int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
                              VexSimdPrefix pre, VexOpcode opc,
                              bool vex_w, int vector_len,
@@ -680,6 +697,15 @@
                                  VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
                                  false, no_mask_reg);
   }
+
+  int  vex_prefix_0F38_and_encode_legacy(Register dst, Register nds, Register src, bool no_mask_reg = false) {
+    bool vex_w = false;
+    int vector_len = AVX_128bit;
+    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
+      VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
+      true, no_mask_reg);
+  }
+
   int  vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src, bool no_mask_reg = false) {
     bool vex_w = true;
     int vector_len = AVX_128bit;
@@ -687,6 +713,15 @@
                                  VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
                                  false, no_mask_reg);
   }
+
+  int  vex_prefix_0F38_and_encode_q_legacy(Register dst, Register nds, Register src, bool no_mask_reg = false) {
+    bool vex_w = true;
+    int vector_len = AVX_128bit;
+    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
+                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
+                                 true, no_mask_reg);
+  }
+
   int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
                              VexSimdPrefix pre, int vector_len = AVX_128bit,
                              VexOpcode opc = VEX_OPCODE_0F, bool legacy_mode = false,
@@ -1666,6 +1701,8 @@
 
   // Shift Right by bytes Logical DoubleQuadword Immediate
   void psrldq(XMMRegister dst, int shift);
+  // Shift Left by bytes Logical DoubleQuadword Immediate
+  void pslldq(XMMRegister dst, int shift);
 
   // Logical Compare 128bit
   void ptest(XMMRegister dst, XMMRegister src);
@@ -2024,8 +2061,25 @@
   // duplicate 4-bytes integer data from src into 8 locations in dest
   void vpbroadcastd(XMMRegister dst, XMMRegister src);
 
-  // duplicate 4-bytes integer data from src into vector_len locations in dest
+  // duplicate n-bytes integer data from src into vector_len locations in dest
+  void evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastb(XMMRegister dst, Address src, int vector_len);
+  void evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastw(XMMRegister dst, Address src, int vector_len);
   void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastd(XMMRegister dst, Address src, int vector_len);
+  void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastq(XMMRegister dst, Address src, int vector_len);
+
+  void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastss(XMMRegister dst, Address src, int vector_len);
+  void evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastsd(XMMRegister dst, Address src, int vector_len);
+
+  void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
+  void evpbroadcastw(XMMRegister dst, Register src, int vector_len);
+  void evpbroadcastd(XMMRegister dst, Register src, int vector_len);
+  void evpbroadcastq(XMMRegister dst, Register src, int vector_len);
 
   // Carry-Less Multiplication Quadword
   void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
--- a/src/cpu/x86/vm/c2_init_x86.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/c2_init_x86.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -58,4 +58,6 @@
       OptoReg::invalidate(i);
     }
   }
+
+  SuperWordLoopUnrollAnalysis = true;
 }
--- a/src/cpu/x86/vm/frame_x86.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/frame_x86.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -524,17 +524,6 @@
   return frame(sender_sp(), link(), sender_pc());
 }
 
-
-bool frame::interpreter_frame_equals_unpacked_fp(intptr_t* fp) {
-  assert(is_interpreted_frame(), "must be interpreter frame");
-  Method* method = interpreter_frame_method();
-  // When unpacking an optimized frame the frame pointer is
-  // adjusted with:
-  int diff = (method->max_locals() - method->size_of_parameters()) *
-             Interpreter::stackElementWords;
-  return _fp == (fp - diff);
-}
-
 bool frame::is_interpreted_frame_valid(JavaThread* thread) const {
 // QQQ
 #ifdef CC_INTERP
--- a/src/cpu/x86/vm/globals_x86.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/globals_x86.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -84,7 +84,7 @@
 
 define_pd_global(bool, PreserveFramePointer, false);
 
-#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
                                                                             \
   develop(bool, IEEEPrecision, true,                                        \
           "Enables IEEE precision (for INTEL only)")                        \
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -4260,31 +4260,24 @@
 //////////////////////////////////////////////////////////////////////////////////
 
 
+void MacroAssembler::store_check(Register obj, Address dst) {
+  store_check(obj);
+}
+
 void MacroAssembler::store_check(Register obj) {
   // Does a store check for the oop in register obj. The content of
   // register obj is destroyed afterwards.
-  store_check_part_1(obj);
-  store_check_part_2(obj);
-}
-
-void MacroAssembler::store_check(Register obj, Address dst) {
-  store_check(obj);
-}
-
-
-// split the store check operation so that other instructions can be scheduled inbetween
-void MacroAssembler::store_check_part_1(Register obj) {
+
   BarrierSet* bs = Universe::heap()->barrier_set();
   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
-  shrptr(obj, CardTableModRefBS::card_shift);
-}
-
-void MacroAssembler::store_check_part_2(Register obj) {
-  BarrierSet* bs = Universe::heap()->barrier_set();
-  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
+
   CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 
+  shrptr(obj, CardTableModRefBS::card_shift);
+
+  Address card_addr;
+
   // The calculation for byte_map_base is as follows:
   // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
   // So this essentially converts an address to a displacement and it will
@@ -4292,8 +4285,7 @@
   // large for a 32bit displacement.
   intptr_t disp = (intptr_t) ct->byte_map_base;
   if (is_simm32(disp)) {
-    Address cardtable(noreg, obj, Address::times_1, disp);
-    movb(cardtable, 0);
+    card_addr = Address(noreg, obj, Address::times_1, disp);
   } else {
     // By doing it as an ExternalAddress 'disp' could be converted to a rip-relative
     // displacement and done in a single instruction given favorable mapping and a
@@ -4301,7 +4293,21 @@
     // entry and that entry is not properly handled by the relocation code.
     AddressLiteral cardtable((address)ct->byte_map_base, relocInfo::none);
     Address index(noreg, obj, Address::times_1);
-    movb(as_Address(ArrayAddress(cardtable, index)), 0);
+    card_addr = as_Address(ArrayAddress(cardtable, index));
+  }
+
+  int dirty = CardTableModRefBS::dirty_card_val();
+  if (UseCondCardMark) {
+    Label L_already_dirty;
+    if (UseConcMarkSweepGC) {
+      membar(Assembler::StoreLoad);
+    }
+    cmpb(card_addr, dirty);
+    jcc(Assembler::equal, L_already_dirty);
+    movb(card_addr, dirty);
+    bind(L_already_dirty);
+  } else {
+    movb(card_addr, dirty);
   }
 }
 
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -315,10 +315,6 @@
 
 #endif // INCLUDE_ALL_GCS
 
-  // split store_check(Register obj) to enhance instruction interleaving
-  void store_check_part_1(Register obj);
-  void store_check_part_2(Register obj);
-
   // C 'boolean' to Java boolean: x == 0 ? 0 : 1
   void c2bool(Register x);
 
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -368,22 +368,22 @@
     map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
     map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
     if (UseAVX > 2) {
-      map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg()->next());
     }
   }
 
@@ -469,7 +469,7 @@
       __ vinsertf64x4h(xmm29, Address(rsp, 928));
       __ vinsertf64x4h(xmm30, Address(rsp, 960));
       __ vinsertf64x4h(xmm31, Address(rsp, 992));
-      __ subptr(rsp, 1024);
+      __ addptr(rsp, 1024);
     }
   }
 #else
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -2727,6 +2727,167 @@
     return start;
   }
 
+  // byte swap x86 long
+  address generate_ghash_long_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+    address start = __ pc();
+    __ emit_data(0x0b0a0908, relocInfo::none, 0);
+    __ emit_data(0x0f0e0d0c, relocInfo::none, 0);
+    __ emit_data(0x03020100, relocInfo::none, 0);
+    __ emit_data(0x07060504, relocInfo::none, 0);
+
+  return start;
+  }
+
+  // byte swap x86 byte array
+  address generate_ghash_byte_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+    address start = __ pc();
+    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
+    __ emit_data(0x08090a0b, relocInfo::none, 0);
+    __ emit_data(0x04050607, relocInfo::none, 0);
+    __ emit_data(0x00010203, relocInfo::none, 0);
+  return start;
+  }
+
+  /* Single and multi-block ghash operations */
+  address generate_ghash_processBlocks() {
+    assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
+    __ align(CodeEntryAlignment);
+    Label L_ghash_loop, L_exit;
+    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+    address start = __ pc();
+
+    const Register state        = rdi;
+    const Register subkeyH      = rsi;
+    const Register data         = rdx;
+    const Register blocks       = rcx;
+
+    const Address  state_param(rbp, 8+0);
+    const Address  subkeyH_param(rbp, 8+4);
+    const Address  data_param(rbp, 8+8);
+    const Address  blocks_param(rbp, 8+12);
+
+    const XMMRegister xmm_temp0 = xmm0;
+    const XMMRegister xmm_temp1 = xmm1;
+    const XMMRegister xmm_temp2 = xmm2;
+    const XMMRegister xmm_temp3 = xmm3;
+    const XMMRegister xmm_temp4 = xmm4;
+    const XMMRegister xmm_temp5 = xmm5;
+    const XMMRegister xmm_temp6 = xmm6;
+    const XMMRegister xmm_temp7 = xmm7;
+
+    __ enter();
+
+    __ movptr(state, state_param);
+    __ movptr(subkeyH, subkeyH_param);
+    __ movptr(data, data_param);
+    __ movptr(blocks, blocks_param);
+
+    __ movdqu(xmm_temp0, Address(state, 0));
+    __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+    __ movdqu(xmm_temp1, Address(subkeyH, 0));
+    __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+    __ BIND(L_ghash_loop);
+    __ movdqu(xmm_temp2, Address(data, 0));
+    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+    __ pxor(xmm_temp0, xmm_temp2);
+
+    //
+    // Multiply with the hash key
+    //
+    __ movdqu(xmm_temp3, xmm_temp0);
+    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
+    __ movdqu(xmm_temp4, xmm_temp0);
+    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
+
+    __ movdqu(xmm_temp5, xmm_temp0);
+    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
+    __ movdqu(xmm_temp6, xmm_temp0);
+    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
+
+    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
+
+    __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
+    __ psrldq(xmm_temp4, 8);    // shift by xmm4 64 bits to the right
+    __ pslldq(xmm_temp5, 8);    // shift by xmm5 64 bits to the left
+    __ pxor(xmm_temp3, xmm_temp5);
+    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
+                                        // of the carry-less multiplication of
+                                        // xmm0 by xmm1.
+
+    // We shift the result of the multiplication by one bit position
+    // to the left to cope for the fact that the bits are reversed.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp4, xmm_temp6);
+    __ pslld (xmm_temp3, 1);
+    __ pslld(xmm_temp6, 1);
+    __ psrld(xmm_temp7, 31);
+    __ psrld(xmm_temp4, 31);
+    __ movdqu(xmm_temp5, xmm_temp7);
+    __ pslldq(xmm_temp4, 4);
+    __ pslldq(xmm_temp7, 4);
+    __ psrldq(xmm_temp5, 12);
+    __ por(xmm_temp3, xmm_temp7);
+    __ por(xmm_temp6, xmm_temp4);
+    __ por(xmm_temp6, xmm_temp5);
+
+    //
+    // First phase of the reduction
+    //
+    // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
+    // independently.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp4, xmm_temp3);
+    __ movdqu(xmm_temp5, xmm_temp3);
+    __ pslld(xmm_temp7, 31);    // packed right shift shifting << 31
+    __ pslld(xmm_temp4, 30);    // packed right shift shifting << 30
+    __ pslld(xmm_temp5, 25);    // packed right shift shifting << 25
+    __ pxor(xmm_temp7, xmm_temp4);      // xor the shifted versions
+    __ pxor(xmm_temp7, xmm_temp5);
+    __ movdqu(xmm_temp4, xmm_temp7);
+    __ pslldq(xmm_temp7, 12);
+    __ psrldq(xmm_temp4, 4);
+    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
+
+    //
+    // Second phase of the reduction
+    //
+    // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
+    // shift operations.
+    __ movdqu(xmm_temp2, xmm_temp3);
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp5, xmm_temp3);
+    __ psrld(xmm_temp2, 1);     // packed left shifting >> 1
+    __ psrld(xmm_temp7, 2);     // packed left shifting >> 2
+    __ psrld(xmm_temp5, 7);     // packed left shifting >> 7
+    __ pxor(xmm_temp2, xmm_temp7);      // xor the shifted versions
+    __ pxor(xmm_temp2, xmm_temp5);
+    __ pxor(xmm_temp2, xmm_temp4);
+    __ pxor(xmm_temp3, xmm_temp2);
+    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
+
+    __ decrement(blocks);
+    __ jcc(Assembler::zero, L_exit);
+    __ movdqu(xmm_temp0, xmm_temp6);
+    __ addptr(data, 16);
+    __ jmp(L_ghash_loop);
+
+    __ BIND(L_exit);
+       // Byte swap 16-byte result
+    __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+    __ movdqu(Address(state, 0), xmm_temp6);   // store the result
+
+    __ leave();
+    __ ret(0);
+    return start;
+  }
+
   /**
    *  Arguments:
    *
@@ -3026,6 +3187,13 @@
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
     }
 
+    // Generate GHASH intrinsics code
+    if (UseGHASHIntrinsics) {
+      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+    }
+
     // Safefetch stubs.
     generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                                                    &StubRoutines::_safefetch32_fault_pc,
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -382,7 +382,12 @@
 
     // restore regs belonging to calling function
 #ifdef _WIN64
-    for (int i = 15; i >= 6; i--) {
+    int xmm_ub = 15;
+    if (UseAVX > 2) {
+      xmm_ub = 31;
+    }
+    // emit the restores for xmm regs
+    for (int i = 6; i <= xmm_ub; i++) {
       __ movdqu(as_XMMRegister(i), xmm_save(i));
     }
 #endif
@@ -3681,6 +3686,175 @@
     return start;
   }
 
+
+  // byte swap x86 long
+  address generate_ghash_long_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+    address start = __ pc();
+    __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
+    __ emit_data64(0x0706050403020100, relocInfo::none );
+  return start;
+  }
+
+  // byte swap x86 byte array
+  address generate_ghash_byte_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+    address start = __ pc();
+    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
+    __ emit_data64(0x0001020304050607, relocInfo::none );
+  return start;
+  }
+
+  /* Single and multi-block ghash operations */
+  address generate_ghash_processBlocks() {
+    __ align(CodeEntryAlignment);
+    Label L_ghash_loop, L_exit;
+    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+    address start = __ pc();
+
+    const Register state        = c_rarg0;
+    const Register subkeyH      = c_rarg1;
+    const Register data         = c_rarg2;
+    const Register blocks       = c_rarg3;
+
+#ifdef _WIN64
+    const int XMM_REG_LAST  = 10;
+#endif
+
+    const XMMRegister xmm_temp0 = xmm0;
+    const XMMRegister xmm_temp1 = xmm1;
+    const XMMRegister xmm_temp2 = xmm2;
+    const XMMRegister xmm_temp3 = xmm3;
+    const XMMRegister xmm_temp4 = xmm4;
+    const XMMRegister xmm_temp5 = xmm5;
+    const XMMRegister xmm_temp6 = xmm6;
+    const XMMRegister xmm_temp7 = xmm7;
+    const XMMRegister xmm_temp8 = xmm8;
+    const XMMRegister xmm_temp9 = xmm9;
+    const XMMRegister xmm_temp10 = xmm10;
+
+    __ enter();
+
+#ifdef _WIN64
+    // save the xmm registers which must be preserved 6-10
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
+    for (int i = 6; i <= XMM_REG_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+#endif
+
+    __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+    __ movdqu(xmm_temp0, Address(state, 0));
+    __ pshufb(xmm_temp0, xmm_temp10);
+
+
+    __ BIND(L_ghash_loop);
+    __ movdqu(xmm_temp2, Address(data, 0));
+    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+    __ movdqu(xmm_temp1, Address(subkeyH, 0));
+    __ pshufb(xmm_temp1, xmm_temp10);
+
+    __ pxor(xmm_temp0, xmm_temp2);
+
+    //
+    // Multiply with the hash key
+    //
+    __ movdqu(xmm_temp3, xmm_temp0);
+    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
+    __ movdqu(xmm_temp4, xmm_temp0);
+    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
+
+    __ movdqu(xmm_temp5, xmm_temp0);
+    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
+    __ movdqu(xmm_temp6, xmm_temp0);
+    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
+
+    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
+
+    __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
+    __ psrldq(xmm_temp4, 8);    // shift by xmm4 64 bits to the right
+    __ pslldq(xmm_temp5, 8);    // shift by xmm5 64 bits to the left
+    __ pxor(xmm_temp3, xmm_temp5);
+    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
+                                        // of the carry-less multiplication of
+                                        // xmm0 by xmm1.
+
+    // We shift the result of the multiplication by one bit position
+    // to the left to cope for the fact that the bits are reversed.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp8, xmm_temp6);
+    __ pslld(xmm_temp3, 1);
+    __ pslld(xmm_temp6, 1);
+    __ psrld(xmm_temp7, 31);
+    __ psrld(xmm_temp8, 31);
+    __ movdqu(xmm_temp9, xmm_temp7);
+    __ pslldq(xmm_temp8, 4);
+    __ pslldq(xmm_temp7, 4);
+    __ psrldq(xmm_temp9, 12);
+    __ por(xmm_temp3, xmm_temp7);
+    __ por(xmm_temp6, xmm_temp8);
+    __ por(xmm_temp6, xmm_temp9);
+
+    //
+    // First phase of the reduction
+    //
+    // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
+    // independently.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp8, xmm_temp3);
+    __ movdqu(xmm_temp9, xmm_temp3);
+    __ pslld(xmm_temp7, 31);    // packed right shift shifting << 31
+    __ pslld(xmm_temp8, 30);    // packed right shift shifting << 30
+    __ pslld(xmm_temp9, 25);    // packed right shift shifting << 25
+    __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
+    __ pxor(xmm_temp7, xmm_temp9);
+    __ movdqu(xmm_temp8, xmm_temp7);
+    __ pslldq(xmm_temp7, 12);
+    __ psrldq(xmm_temp8, 4);
+    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
+
+    //
+    // Second phase of the reduction
+    //
+    // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
+    // shift operations.
+    __ movdqu(xmm_temp2, xmm_temp3);
+    __ movdqu(xmm_temp4, xmm_temp3);
+    __ movdqu(xmm_temp5, xmm_temp3);
+    __ psrld(xmm_temp2, 1);     // packed left shifting >> 1
+    __ psrld(xmm_temp4, 2);     // packed left shifting >> 2
+    __ psrld(xmm_temp5, 7);     // packed left shifting >> 7
+    __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
+    __ pxor(xmm_temp2, xmm_temp5);
+    __ pxor(xmm_temp2, xmm_temp8);
+    __ pxor(xmm_temp3, xmm_temp2);
+    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
+
+    __ decrement(blocks);
+    __ jcc(Assembler::zero, L_exit);
+    __ movdqu(xmm_temp0, xmm_temp6);
+    __ addptr(data, 16);
+    __ jmp(L_ghash_loop);
+
+    __ BIND(L_exit);
+    __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
+    __ movdqu(Address(state, 0), xmm_temp6);   // store the result
+
+#ifdef _WIN64
+    // restore xmm regs belonging to calling function
+    for (int i = 6; i <= XMM_REG_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+#endif
+    __ leave();
+    __ ret(0);
+    return start;
+  }
+
   /**
    *  Arguments:
    *
@@ -4120,6 +4294,13 @@
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
     }
 
+    // Generate GHASH intrinsics code
+    if (UseGHASHIntrinsics) {
+      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+    }
+
     // Safefetch stubs.
     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                        &StubRoutines::_safefetch32_fault_pc,
--- a/src/cpu/x86/vm/stubRoutines_x86.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/stubRoutines_x86.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -33,6 +33,8 @@
 
 address StubRoutines::x86::_verify_mxcsr_entry = NULL;
 address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
+address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
+address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
 
 uint64_t StubRoutines::x86::_crc_by128_masks[] =
 {
--- a/src/cpu/x86/vm/stubRoutines_x86.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/stubRoutines_x86.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -36,10 +36,15 @@
   // masks and table for CRC32
   static uint64_t _crc_by128_masks[];
   static juint    _crc_table[];
+  // swap mask for ghash
+  static address _ghash_long_swap_mask_addr;
+  static address _ghash_byte_swap_mask_addr;
 
  public:
   static address verify_mxcsr_entry()    { return _verify_mxcsr_entry; }
   static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
   static address crc_by128_masks_addr()  { return (address)_crc_by128_masks; }
+  static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
+  static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
 
 #endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
--- a/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -33,7 +33,7 @@
 
 enum platform_dependent_constants {
   code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 23000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 24000           // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -677,6 +677,17 @@
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
 
+  // GHASH/GCM intrinsics
+  if (UseCLMUL && (UseSSE > 2)) {
+    if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+      UseGHASHIntrinsics = true;
+    }
+  } else if (UseGHASHIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+      warning("GHASH intrinsic requires CLMUL and SSE2 instructions on this CPU");
+    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  }
+
   if (UseSHA) {
     warning("SHA instructions are not available on this CPU");
     FLAG_SET_DEFAULT(UseSHA, false);
@@ -688,6 +699,12 @@
     FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
   }
 
+  if (UseCRC32CIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
+      warning("CRC32C intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+  }
+
   // Adjust RTM (Restricted Transactional Memory) flags
   if (!supports_rtm() && UseRTMLocking) {
     // Can't continue because UseRTMLocking affects UseBiasedLocking flag
--- a/src/cpu/x86/vm/vm_version_x86.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/vm_version_x86.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -702,6 +702,7 @@
   static bool supports_avx512cd() { return (_cpuFeatures & CPU_AVX512CD) != 0; }
   static bool supports_avx512bw() { return (_cpuFeatures & CPU_AVX512BW) != 0; }
   static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; }
+  static bool supports_avx512vlbw() { return (supports_avx512bw() && supports_avx512vl()); }
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
                                        extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/src/cpu/x86/vm/x86.ad	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/x86/vm/x86.ad	Thu Jul 02 11:12:59 2015 +0100
@@ -2894,6 +2894,457 @@
   ins_pipe( pipe_slow );
 %}
 
+// ====================LEGACY REPLICATE=======================================
+
+instruct Repl4B_mem(vecS dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "punpcklbw $dst,$mem\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate4B" %}
+  ins_encode %{
+    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B_mem(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "punpcklbw $dst,$mem\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate8B" %}
+  ins_encode %{
+    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\t! replicate16B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "punpcklbw $dst,$mem\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\t! replicate16B" %}
+  ins_encode %{
+    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "punpcklbw $dst,$mem\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
+  ins_encode %{
+    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! lreplicate32B($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4S(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate4S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4S_mem(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\t! replicate8S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "pshuflw $dst,$mem,0x00\n\t"
+            "punpcklqdq $dst,$dst\t! replicate8S" %}
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "pshuflw $dst,$mem,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "pshufd  $dst,$mem,0x00\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
+            "punpcklqdq $dst,$dst" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Long could be loaded into xmm register directly from memory.
+instruct Repl2L_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "movq    $dst,$mem\n\t"
+            "punpcklqdq $dst,$dst\t! replicate2L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate long (8 byte) scalar to be vector
+#ifdef _LP64
+instruct Repl4L(vecY dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  format %{ "movdq   $dst,$src\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movdq($dst$$XMMRegister, $src$$Register);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#else // _LP64
+instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#endif // _LP64
+
+instruct Repl4L_imm(vecY dst, immL con) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress($con));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "movq    $dst,$mem\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl2F_mem(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4F_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F(vecY dst, regF src) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd  $dst,$src,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "pshufd  $dst,$mem,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl2D_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD (LoadD mem)));
+  format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D(vecY dst, regD src) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD src));
+  format %{ "pshufd  $dst,$src,0x44\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD (LoadD mem)));
+  format %{ "pshufd  $dst,$mem,0x44\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// ====================GENERIC REPLICATE==========================================
+
 // Replicate byte scalar to be vector
 instruct Repl4B(vecS dst, rRegI src) %{
   predicate(n->as_Vector()->length() == 4);
@@ -2923,60 +3374,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl16B(vecX dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateB src));
-  format %{ "movd    $dst,$src\n\t"
-            "punpcklbw $dst,$dst\n\t"
-            "pshuflw $dst,$dst,0x00\n\t"
-            "punpcklqdq $dst,$dst\t! replicate16B" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl32B(vecY dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 32);
-  match(Set dst (ReplicateB src));
-  format %{ "movd    $dst,$src\n\t"
-            "punpcklbw $dst,$dst\n\t"
-            "pshuflw $dst,$dst,0x00\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl64B(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 64);
-  match(Set dst (ReplicateB src));
-  format %{ "movd    $dst,$src\n\t"
-            "punpcklbw $dst,$dst\n\t"
-            "pshuflw $dst,$dst,0x00\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate32B\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate632B" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate byte scalar immediate to be vector by loading from const table.
 instruct Repl4B_imm(vecS dst, immI con) %{
   predicate(n->as_Vector()->length() == 4);
@@ -2998,48 +3395,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl16B_imm(vecX dst, immI con) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateB con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl32B_imm(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 32);
-  match(Set dst (ReplicateB con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lreplicate32B($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl64B_imm(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 64);
-  match(Set dst (ReplicateB con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate32B($con)\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate32B($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate byte scalar zero to be vector
 instruct Repl4B_zero(vecS dst, immI0 zero) %{
   predicate(n->as_Vector()->length() == 4);
@@ -3083,18 +3438,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl64B_zero(vecZ dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 64);
-  match(Set dst (ReplicateB zero));
-  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
-  ins_encode %{
-    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
-    int vector_len = 2;
-    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // Replicate char/short (2 byte) scalar to be vector
 instruct Repl2S(vecS dst, rRegI src) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3108,66 +3451,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl4S(vecD dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateS src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshuflw $dst,$dst,0x00\t! replicate4S" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl8S(vecX dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateS src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshuflw $dst,$dst,0x00\n\t"
-            "punpcklqdq $dst,$dst\t! replicate8S" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl16S(vecY dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateS src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshuflw $dst,$dst,0x00\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl32S(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 32);
-  match(Set dst (ReplicateS src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshuflw $dst,$dst,0x00\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate16S\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
 instruct Repl2S_imm(vecS dst, immI con) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3189,48 +3472,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl8S_imm(vecX dst, immI con) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateS con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl16S_imm(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateS con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl32S_imm(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 32);
-  match(Set dst (ReplicateS con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate16S($con)\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate char/short (2 byte) scalar zero to be vector
 instruct Repl2S_zero(vecS dst, immI0 zero) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3274,18 +3515,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl32S_zero(vecZ dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 32);
-  match(Set dst (ReplicateS zero));
-  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
-  ins_encode %{
-    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
-    int vector_len = 2;
-    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // Replicate integer (4 byte) scalar to be vector
 instruct Repl2I(vecD dst, rRegI src) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3299,101 +3528,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl4I(vecX dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateI src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8I(vecY dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateI src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshufd  $dst,$dst,0x00\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl16I(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateI src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshufd  $dst,$dst,0x00\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
-instruct Repl2I_imm(vecD dst, immI con) %{
-  predicate(n->as_Vector()->length() == 2);
-  match(Set dst (ReplicateI con));
-  format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl4I_imm(vecX dst, immI con) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateI con));
-  format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
-            "punpcklqdq $dst,$dst" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8I_imm(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateI con));
-  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl16I_imm(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateI con));
-  format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Integer could be loaded into xmm register directly from memory.
 instruct Repl2I_mem(vecD dst, memory mem) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3407,46 +3541,15 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl4I_mem(vecX dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateI (LoadI mem)));
-  format %{ "movd    $dst,$mem\n\t"
-            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $mem$$Address);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8I_mem(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateI (LoadI mem)));
-  format %{ "movd    $dst,$mem\n\t"
-            "pshufd  $dst,$dst,0x00\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $mem$$Address);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl16I_mem(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateI (LoadI mem)));
-  format %{ "movd    $dst,$mem\n\t"
-            "pshufd  $dst,$dst,0x00\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $mem$$Address);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
+// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2I_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+  %}
+  ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate integer (4 byte) scalar zero to be vector
@@ -3482,18 +3585,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl16I_zero(vecZ dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateI zero));
-  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
-  ins_encode %{
-    // Use vxorpd since AVX does not have vpxor for 512-bit (AVX2 will have it).
-    int vector_len = 2;
-    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // Replicate long (8 byte) scalar to be vector
 #ifdef _LP64
 instruct Repl2L(vecX dst, rRegL src) %{
@@ -3507,36 +3598,6 @@
   %}
   ins_pipe( pipe_slow );
 %}
-
-instruct Repl4L(vecY dst, rRegL src) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateL src));
-  format %{ "movdq   $dst,$src\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
-  ins_encode %{
-    __ movdq($dst$$XMMRegister, $src$$Register);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L(vecZ dst, rRegL src) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateL src));
-  format %{ "movdq   $dst,$src\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
-  ins_encode %{
-    __ movdq($dst$$XMMRegister, $src$$Register);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
 #else // _LP64
 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3554,45 +3615,6 @@
   %}
   ins_pipe( pipe_slow );
 %}
-
-instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateL src));
-  effect(TEMP dst, USE src, TEMP tmp);
-  format %{ "movdl   $dst,$src.lo\n\t"
-            "movdl   $tmp,$src.hi\n\t"
-            "punpckldq $dst,$tmp\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
-    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L(vecZ dst, eRegL src, regD tmp) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateL src));
-  effect(TEMP dst, USE src, TEMP tmp);
-  format %{ "movdl   $dst,$src.lo\n\t"
-            "movdl   $tmp,$src.hi\n\t"
-            "punpckldq $dst,$tmp\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
-    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
 #endif // _LP64
 
 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
@@ -3608,79 +3630,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl4L_imm(vecY dst, immL con) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateL con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress($con));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L_imm(vecZ dst, immL con) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateL con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate4L($con)\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress($con));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Long could be loaded into xmm register directly from memory.
-instruct Repl2L_mem(vecX dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 2);
-  match(Set dst (ReplicateL (LoadL mem)));
-  format %{ "movq    $dst,$mem\n\t"
-            "punpcklqdq $dst,$dst\t! replicate2L" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl4L_mem(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateL (LoadL mem)));
-  format %{ "movq    $dst,$mem\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L_mem(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateL (LoadL mem)));
-  format %{ "movq    $dst,$mem\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate long (8 byte) scalar zero to be vector
 instruct Repl2L_zero(vecX dst, immL0 zero) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3704,18 +3653,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl8L_zero(vecZ dst, immL0 zero) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateL zero));
-  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
-  ins_encode %{
-    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
-    int vector_len = 2;
-    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // Replicate float (4 byte) scalar to be vector
 instruct Repl2F(vecD dst, regF src) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3737,32 +3674,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl8F(vecY dst, regF src) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateF src));
-  format %{ "pshufd  $dst,$src,0x00\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl16F(vecZ dst, regF src) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateF src));
-  format %{ "pshufd  $dst,$src,0x00\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! lower replicate8F\n\t"
-            "vinsertf64x4h $dst k0,$dst,$dst\t! lower replicate8F" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate float (4 byte) scalar zero to be vector
 instruct Repl2F_zero(vecD dst, immF0 zero) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3795,17 +3706,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl16F_zero(vecZ dst, immF0 zero) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateF zero));
-  format %{ "vxorps  $dst k0,$dst,$dst\t! replicate16F zero" %}
-  ins_encode %{
-    int vector_len = 2;
-    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // Replicate double (8 bytes) scalar to be vector
 instruct Repl2D(vecX dst, regD src) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3817,32 +3717,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl4D(vecY dst, regD src) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateD src));
-  format %{ "pshufd  $dst,$src,0x44\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8D(vecZ dst, regD src) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateD src));
-  format %{ "pshufd  $dst,$src,0x44\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! lower replicate4D\n\t"
-            "vinsertf64x4h $dst k0,$dst,$dst\t! upper replicate4D" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate double (8 byte) scalar zero to be vector
 instruct Repl2D_zero(vecX dst, immD0 zero) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3865,8 +3739,636 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl8D_zero(vecZ dst, immD0 zero) %{
-  predicate(n->as_Vector()->length() == 8);
+// ====================EVEX REPLICATE=============================================
+
+instruct Repl4B_mem_evex(vecS dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B_mem_evex(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_evex(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB src));
+  format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
+  ins_encode %{
+   int vector_len = 0;
+    __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_mem_evex(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_evex(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB src));
+  format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
+  ins_encode %{
+   int vector_len = 1;
+    __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_mem_evex(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_evex(vecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+  match(Set dst (ReplicateB src));
+  format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %}
+  ins_encode %{
+   int vector_len = 2;
+    __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_imm_evex(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastb $dst,$dst\t! replicate16B" %}
+  ins_encode %{
+   int vector_len = 0;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_imm_evex(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastb $dst,$dst\t! replicate32B" %}
+  ins_encode %{
+   int vector_len = 1;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_imm_evex(vecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+  match(Set dst (ReplicateB con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
+  ins_encode %{
+   int vector_len = 2;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+  match(Set dst (ReplicateB zero));
+  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S_evex(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "vpbroadcastw $dst,$src\t! replicate4S" %}
+  ins_encode %{
+   int vector_len = 0;
+    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4S_mem_evex(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_evex(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
+  ins_encode %{
+   int vector_len = 0;
+    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_mem_evex(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_evex(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
+  ins_encode %{
+   int vector_len = 1;
+    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_mem_evex(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_evex(vecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  match(Set dst (ReplicateS src));
+  format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
+  ins_encode %{
+   int vector_len = 2;
+    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_imm_evex(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastw $dst,$dst\t! replicate8S" %}
+  ins_encode %{
+   int vector_len = 0;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_imm_evex(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastw $dst,$dst\t! replicate16S" %}
+  ins_encode %{
+   int vector_len = 1;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_imm_evex(vecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  match(Set dst (ReplicateS con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastw $dst,$dst\t! replicate32S" %}
+  ins_encode %{
+   int vector_len = 2;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  match(Set dst (ReplicateS zero));
+  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I_evex(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI src));
+  format %{ "vpbroadcastd  $dst,$src\t! replicate4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_mem_evex(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_evex(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI src));
+  format %{ "vpbroadcastd  $dst,$src\t! replicate8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_mem_evex(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_evex(vecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateI src));
+  format %{ "vpbroadcastd  $dst,$src\t! replicate16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_imm_evex(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+            "vpbroadcastd  $dst,$dst\t! replicate4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_imm_evex(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+            "vpbroadcastd  $dst,$dst\t! replicate8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_imm_evex(vecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
+            "vpbroadcastd  $dst,$dst\t! replicate16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateI zero));
+  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 512-bit (AVX2 will have it).
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate long (8 byte) scalar to be vector
+#ifdef _LP64
+instruct Repl4L_evex(vecY dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  format %{ "vpbroadcastq  $dst,$src\t! replicate4L" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_evex(vecZ dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateL src));
+  format %{ "vpbroadcastq  $dst,$src\t! replicate8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#else // _LP64
+instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "vpbroadcastq  $dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "vpbroadcastq  $dst,$dst\t! replicate8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#endif // _LP64
+
+instruct Repl4L_imm_evex(vecY dst, immL con) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastq  $dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ movq($dst$$XMMRegister, $constantaddress($con));
+    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_imm_evex(vecZ dst, immL con) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateL con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastq  $dst,$dst\t! replicate8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ movq($dst$$XMMRegister, $constantaddress($con));
+    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl2L_mem_evex(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_mem_evex(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateL zero));
+  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8F_evex(vecY dst, regF src) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF src));
+  format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F_mem_evex(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_evex(vecZ dst, regF src) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateF src));
+  format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "vxorps  $dst k0,$dst,$dst\t! replicate16F zero" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4D_evex(vecY dst, regD src) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD src));
+  format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D_mem_evex(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD (LoadD mem)));
+  format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_evex(vecZ dst, regD src) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateD src));
+  format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateD (LoadD mem)));
+  format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
   match(Set dst (ReplicateD zero));
   format %{ "vxorpd  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
   ins_encode %{
@@ -4972,6 +5474,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVB src (LoadVector mem)));
+  format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vadd8B(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length() == 8);
   match(Set dst (AddVB dst src));
@@ -4993,6 +5506,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVB src (LoadVector mem)));
+  format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vadd16B(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 16);
   match(Set dst (AddVB dst src));
@@ -5091,6 +5615,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVS src (LoadVector mem)));
+  format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vadd4S(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (AddVS dst src));
@@ -5112,6 +5647,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVS src (LoadVector mem)));
+  format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vadd8S(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 8);
   match(Set dst (AddVS dst src));
@@ -5210,6 +5756,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVI src (LoadVector mem)));
+  format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vadd4I(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (AddVI dst src));
@@ -5385,6 +5942,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVF src (LoadVector mem)));
+  format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vadd4F(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (AddVF dst src));
@@ -5562,6 +6130,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVB src (LoadVector mem)));
+  format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vsub8B(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length() == 8);
   match(Set dst (SubVB dst src));
@@ -5583,6 +6162,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVB src (LoadVector mem)));
+  format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vsub16B(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 16);
   match(Set dst (SubVB dst src));
@@ -5681,6 +6271,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVS src (LoadVector mem)));
+  format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vsub4S(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (SubVS dst src));
@@ -5702,6 +6303,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVS src (LoadVector mem)));
+  format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vsub8S(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 8);
   match(Set dst (SubVS dst src));
@@ -5800,6 +6412,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVI src (LoadVector mem)));
+  format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vsub4I(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (SubVI dst src));
@@ -5975,6 +6598,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVF src (LoadVector mem)));
+  format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vsub4F(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (SubVF dst src));
@@ -6152,6 +6786,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVS src (LoadVector mem)));
+  format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vmul4S(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (MulVS dst src));
@@ -6173,6 +6818,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVS src (LoadVector mem)));
+  format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vmul8S(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 8);
   match(Set dst (MulVS dst src));
@@ -6271,6 +6927,49 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVI src (LoadVector mem)));
+  format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4I(vecX dst, vecX src) %{
+  predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVI dst src));
+  format %{ "pmulld  $dst,$src\t! mul packed4I" %}
+  ins_encode %{
+    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVI src1 src2));
+  format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVI src (LoadVector mem)));
+  format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
   match(Set dst (MulVL src1 src2));
@@ -6282,34 +6981,13 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul4I(vecX dst, vecX src) %{
-  predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
-  match(Set dst (MulVI dst src));
-  format %{ "pmulld  $dst,$src\t! mul packed4I" %}
-  ins_encode %{
-    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
-  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
-  match(Set dst (MulVI src1 src2));
-  format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
-  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
-  match(Set dst (MulVI src (LoadVector mem)));
-  format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
+  match(Set dst (MulVL src (LoadVector mem)));
+  format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -6336,6 +7014,28 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
+  match(Set dst (MulVL src1 src2));
+  format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
+  match(Set dst (MulVL src (LoadVector mem)));
+  format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
   match(Set dst (MulVI src1 src2));
@@ -6347,13 +7047,13 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
-  match(Set dst (MulVL src1 src2));
-  format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
-  ins_encode %{
-    int vector_len = 2;
-    __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVI src (LoadVector mem)));
+  format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -6369,28 +7069,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
-  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
-  match(Set dst (MulVI src (LoadVector mem)));
-  format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
-  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
-  match(Set dst (MulVL src (LoadVector mem)));
-  format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
-  ins_encode %{
-    int vector_len = 2;
-    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
   match(Set dst (MulVI src (LoadVector mem)));
@@ -6424,6 +7102,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVF src (LoadVector mem)));
+  format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vmul4F(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (MulVF dst src));
@@ -6601,6 +7290,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (DivVF src (LoadVector mem)));
+  format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vdiv4F(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (DivVF dst src));
@@ -7878,6 +8578,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (AndV src (LoadVector mem)));
+  format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vand8B(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length_in_bytes() == 8);
   match(Set dst (AndV dst src));
@@ -7899,6 +8610,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (AndV src (LoadVector mem)));
+  format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vand16B(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (AndV dst src));
@@ -7998,6 +8720,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (OrV src (LoadVector mem)));
+  format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vor8B(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length_in_bytes() == 8);
   match(Set dst (OrV dst src));
@@ -8019,6 +8752,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (OrV src (LoadVector mem)));
+  format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vor16B(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (OrV dst src));
@@ -8118,6 +8862,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (XorV src (LoadVector mem)));
+  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vxor8B(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length_in_bytes() == 8);
   match(Set dst (XorV dst src));
@@ -8139,6 +8894,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (XorV src (LoadVector mem)));
+  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vxor16B(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (XorV dst src));
--- a/src/cpu/zero/vm/globals_zero.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/cpu/zero/vm/globals_zero.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -63,7 +63,8 @@
 
 define_pd_global(bool, PreserveFramePointer, false);
 
-#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct)  \
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint)  \
+                                                                            \
   product(bool, UseFastEmptyMethods, true,                                  \
           "Use fast method entry code for empty methods")                   \
                                                                             \
--- a/src/os/aix/vm/decoder_aix.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/os/aix/vm/decoder_aix.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2013 SAP AG. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -38,8 +38,8 @@
 
   virtual bool demangle(const char* symbol, char* buf, int buflen) { return false; } // demangled by getFuncName
 
-  virtual bool decode(address addr, char* buf, int buflen, int* offset, const char* modulepath) {
-    return (::getFuncName((codeptr_t)addr, buf, buflen, offset, 0, 0, 0) == 0);
+  virtual bool decode(address addr, char* buf, int buflen, int* offset, const char* modulepath, bool demangle) {
+    return (::getFuncName((codeptr_t)addr, buf, buflen, offset, 0, 0, 0, demangle) == 0);
   }
   virtual bool decode(address addr, char *buf, int buflen, int* offset, const void *base) {
     ShouldNotReachHere();
--- a/src/os/aix/vm/globals_aix.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/os/aix/vm/globals_aix.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -29,7 +29,7 @@
 //
 // Defines Aix specific flags. They are not available on other platforms.
 //
-#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct) \
+#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct, range, constraint) \
                                                                                     \
   /* Use 64K pages for virtual memory (shmat). */                                   \
   product(bool, Use64KPages, true,                                                  \
--- a/src/os/aix/vm/os_aix.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/os/aix/vm/os_aix.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1439,7 +1439,8 @@
 }
 
 bool os::dll_address_to_function_name(address addr, char *buf,
-                                      int buflen, int *offset) {
+                                      int buflen, int *offset,
+                                      bool demangle) {
   if (offset) {
     *offset = -1;
   }
@@ -1454,7 +1455,7 @@
   }
 
   // Go through Decoder::decode to call getFuncName which reads the name from the traceback table.
-  return Decoder::decode(addr, buf, buflen, offset);
+  return Decoder::decode(addr, buf, buflen, offset, demangle);
 }
 
 static int getModuleName(codeptr_t pc,                    // [in] program counter
@@ -1653,7 +1654,7 @@
   }
 }
 
-void os::pd_print_cpu_info(outputStream* st) {
+void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
   // cpu
   st->print("CPU:");
   st->print("total %d", os::processor_count());
@@ -3761,10 +3762,6 @@
   return fetcher.result();
 }
 
-// Not neede on Aix.
-// int os::Aix::safe_cond_timedwait(pthread_cond_t *_cond, pthread_mutex_t *_mutex, const struct timespec *_abstime) {
-// }
-
 ////////////////////////////////////////////////////////////////////////////////
 // debug support
 
--- a/src/os/aix/vm/porting_aix.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/os/aix/vm/porting_aix.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -114,7 +114,8 @@
     int* p_displacement,             // [out] optional: displacement (-1 if not available)
     const struct tbtable** p_tb,     // [out] optional: ptr to traceback table to get further
                                      //                 information (NULL if not available)
-    char* p_errmsg, size_t errmsglen // [out] optional: user provided buffer for error messages
+    char* p_errmsg, size_t errmsglen,// [out] optional: user provided buffer for error messages
+    bool demangle                    // [in] whether to demangle the name
   ) {
   struct tbtable* tb = 0;
   unsigned int searchcount = 0;
@@ -216,15 +217,17 @@
       p_name[0] = '\0';
 
       // If it is a C++ name, try and demangle it using the Demangle interface (see demangle.h).
-      char* rest;
-      Name* const name = Demangle(buf, rest);
-      if (name) {
-        const char* const demangled_name = name->Text();
-        if (demangled_name) {
-          strncpy(p_name, demangled_name, namelen-1);
-          p_name[namelen-1] = '\0';
+      if (demangle) {
+        char* rest;
+        Name* const name = Demangle(buf, rest);
+        if (name) {
+          const char* const demangled_name = name->Text();
+          if (demangled_name) {
+            strncpy(p_name, demangled_name, namelen-1);
+            p_name[namelen-1] = '\0';
+          }
+          delete name;
         }
-        delete name;
       }
 
       // Fallback: if demangling did not work, just provide the unmangled name.
@@ -325,7 +328,7 @@
       int displacement = 0;
 
       if (getFuncName((codeptr_t) p, funcname, sizeof(funcname), &displacement,
-                      NULL, NULL, 0) == 0) {
+                      NULL, NULL, 0, true /* demangle */) == 0) {
         if (funcname[0] != '\0') {
           const char* const interned = dladdr_fixed_strings.intern(funcname);
           info->dli_sname = interned;
--- a/src/os/aix/vm/porting_aix.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/os/aix/vm/porting_aix.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -87,7 +87,8 @@
       char* p_name, size_t namelen,    // [out] optional: user provided buffer for the function name
       int* p_displacement,             // [out] optional: displacement
       const struct tbtable** p_tb,     // [out] optional: ptr to traceback table to get further information
-      char* p_errmsg, size_t errmsglen // [out] optional: user provided buffer for error messages
+      char* p_errmsg, size_t errmsglen,// [out] optional: user provided buffer for error messages
+      bool demangle = true             // [in] whether to demangle the name
     );
 
 // -------------------------------------------------------------------------
--- a/src/os/bsd/vm/decoder_machO.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/os/bsd/vm/decoder_machO.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -42,7 +42,7 @@
   virtual bool decode(address pc, char* buf, int buflen, int* offset,
                       const void* base);
   virtual bool decode(address pc, char* buf, int buflen, int* offset,
-                      const char* module_path = NULL) {
+                      const char* module_path, bool demangle) {
     ShouldNotReachHere();
     return false;
   }
--- a/src/os/bsd/vm/globals_bsd.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/os/bsd/vm/globals_bsd.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -28,19 +28,20 @@
 //
 // Defines Bsd specific flags. They are not available on other platforms.
 //
-#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct) \
-  product(bool, UseOprofile, false,                                     \
-        "enable support for Oprofile profiler")                         \
-                                                                        \
-  product(bool, UseBsdPosixThreadCPUClocks, true,                     \
-          "enable fast Bsd Posix clocks where available")             \
-/*  NB: The default value of UseBsdPosixThreadCPUClocks may be        \
-    overridden in Arguments::parse_each_vm_init_arg.  */                \
-                                                                        \
-  product(bool, UseHugeTLBFS, false,                                    \
-          "Use MAP_HUGETLB for large pages")                            \
-                                                                        \
-  product(bool, UseSHM, false,                                          \
+#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct, range, constraint) \
+                                                                                \
+  product(bool, UseOprofile, false,                                             \
+        "enable support for Oprofile profiler")                                 \
+                                                                                \
+  /*  NB: The default value of UseBsdPosixThreadCPUClocks may be  */            \
+  /*  overridden in Arguments::parse_each_vm_init_arg.            */            \
+  product(bool, UseBsdPosixThreadCPUClocks, true,                               \
+          "enable fast Bsd Posix clocks where available")                       \
+                                                                                \
+  product(bool, UseHugeTLBFS, false,                                            \
+          "Use MAP_HUGETLB for large pages")                                    \
+                                                                                \
+  product(bool, UseSHM, false,                                                  \
           "Use SYSV shared memory for large pages")
 
 //
--- a/src/os/bsd/vm/os_bsd.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/os/bsd/vm/os_bsd.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -637,11 +637,6 @@
 //////////////////////////////////////////////////////////////////////////////
 // create new thread
 
-// check if it's safe to start a new thread
-static bool _thread_safety_check(Thread* thread) {
-  return true;
-}
-
 #ifdef __APPLE__
 // library handle for calling objc_registerThreadWithCollector()
 // without static linking to the libobjc library
@@ -681,15 +676,6 @@
   OSThread* osthread = thread->osthread();
   Monitor* sync = osthread->startThread_lock();
 
-  // non floating stack BsdThreads needs extra check, see above
-  if (!_thread_safety_check(thread)) {
-    // notify parent thread
-    MutexLockerEx ml(sync, Mutex::_no_safepoint_check_flag);
-    osthread->set_state(ZOMBIE);
-    sync->notify_all();
-    return NULL;
-  }
-
   osthread->set_thread_id(os::Bsd::gettid());
 
 #ifdef __APPLE__
@@ -1339,7 +1325,8 @@
 #define MACH_MAXSYMLEN 256
 
 bool os::dll_address_to_function_name(address addr, char *buf,
-                                      int buflen, int *offset) {
+                                      int buflen, int *offset,
+                                      bool demangle) {
   // buf is not optional, but offset is optional
   assert(buf != NULL, "sanity check");
 
@@ -1349,7 +1336,7 @@
   if (dladdr((void*)addr, &dlinfo) != 0) {
     // see if we have a matching symbol
     if (dlinfo.dli_saddr != NULL && dlinfo.dli_sname != NULL) {
-      if (!Decoder::demangle(dlinfo.dli_sname, buf, buflen)) {
+      if (!(demangle && Decoder::demangle(dlinfo.dli_sname, buf, buflen))) {
         jio_snprintf(buf, buflen, "%s", dlinfo.dli_sname);
       }
       if (offset != NULL) *offset = addr - (address)dlinfo.dli_saddr;
@@ -1358,15 +1345,16 @@
     // no matching symbol so try for just file info
     if (dlinfo.dli_fname != NULL && dlinfo.dli_fbase != NULL) {
       if (Decoder::decode((address)(addr - (address)dlinfo.dli_fbase),
-                          buf, buflen, offset, dlinfo.dli_fname)) {
+                          buf, buflen, offset, dlinfo.dli_fname, demangle)) {
         return true;
       }
     }
 
     // Handle non-dynamic manually:
     if (dlinfo.dli_fbase != NULL &&
-        Decoder::decode(addr, localbuf, MACH_MAXSYMLEN, offset, dlinfo.dli_fbase)) {
-      if (!Decoder::demangle(localbuf, buf, buflen)) {
+        Decoder::decode(addr, localbuf, MACH_MAXSYMLEN, offset,
+                        dlinfo.dli_fbase)) {
+      if (!(demangle && Decoder::demangle(localbuf, buf, buflen))) {
         jio_snprintf(buf, buflen, "%s", localbuf);
       }
       return true;
@@ -1706,7 +1694,7 @@
   os::Posix::print_load_average(st);
 }
 
-void os::pd_print_cpu_info(outputStream* st) {
+void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
   // Nothing to do for now.
 }
 
@@ -2276,8 +2264,6 @@
   return os::uncommit_memory(addr, size);
 }
 
-static address _highest_vm_reserved_address = NULL;
-
 // If 'fixed' is true, anon_mmap() will attempt to reserve anonymous memory
 // at 'requested_addr'. If there are existing memory mappings at the same
 // location, however, they will be overwritten. If 'fixed' is false,
@@ -2300,23 +2286,9 @@
   addr = (char*)::mmap(requested_addr, bytes, PROT_NONE,
                        flags, -1, 0);
 
-  if (addr != MAP_FAILED) {
-    // anon_mmap() should only get called during VM initialization,
-    // don't need lock (actually we can skip locking even it can be called
-    // from multiple threads, because _highest_vm_reserved_address is just a
-    // hint about the upper limit of non-stack memory regions.)
-    if ((address)addr + bytes > _highest_vm_reserved_address) {
-      _highest_vm_reserved_address = (address)addr + bytes;
-    }
-  }
-
   return addr == MAP_FAILED ? NULL : addr;
 }
 
-// Don't update _highest_vm_reserved_address, because there might be memory
-// regions above addr + size. If so, releasing a memory region only creates
-// a hole in the address space, it doesn't help prevent heap-stack collision.
-//
 static int anon_munmap(char * addr, size_t size) {
   return ::munmap(addr, size) == 0;
 }
@@ -2490,15 +2462,7 @@
   assert(bytes % os::vm_page_size() == 0, "reserving unexpected size block");
 
   // Repeatedly allocate blocks until the block is allocated at the
-  // right spot. Give up after max_tries. Note that reserve_memory() will
-  // automatically update _highest_vm_reserved_address if the call is
-  // successful. The variable tracks the highest memory address every reserved
-  // by JVM. It is used to detect heap-stack collision if running with
-  // fixed-stack BsdThreads. Because here we may attempt to reserve more
-  // space than needed, it could confuse the collision detecting code. To
-  // solve the problem, save current _highest_vm_reserved_address and
-  // calculate the correct value before return.
-  address old_highest = _highest_vm_reserved_address;
+  // right spot.
 
   // Bsd mmap allows caller to pass an address as hint; give it a try first,
   // if kernel honors the hint then we can return immediately.
@@ -2552,10 +2516,8 @@
   }
 
   if (i < max_tries) {
-    _highest_vm_reserved_address = MAX2(old_highest, (address)requested_addr + bytes);
     return requested_addr;
   } else {
-    _highest_vm_reserved_address = old_highest;
     return NULL;
   }
 }
@@ -3715,12 +3677,6 @@
   return fetcher.result();
 }
 
-int os::Bsd::safe_cond_timedwait(pthread_cond_t *_cond,
-                                 pthread_mutex_t *_mutex,
-                                 const struct timespec *_abstime) {
-  return pthread_cond_timedwait(_cond, _mutex, _abstime);
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 // debug support
 
@@ -4286,7 +4242,7 @@
   // In that case, we should propagate the notify to another waiter.
 
   while (_Event < 0) {
-    status = os::Bsd::safe_cond_timedwait(_cond, _mutex, &abst);
+    status = pthread_cond_timedwait(_cond, _mutex, &abst);
     if (status != 0 && WorkAroundNPTLTimedWaitHang) {
       pthread_cond_destroy(_cond);
       pthread_cond_init(_cond, NULL);
@@ -4492,7 +4448,7 @@
   if (time == 0) {
     status = pthread_cond_wait(_cond, _mutex);
   } else {
-    status = os::Bsd::safe_cond_timedwait(_cond, _mutex, &absTime);
+    status = pthread_cond_timedwait(_cond, _mutex, &absTime);
     if (status != 0 && WorkAroundNPTLTimedWaitHang) {
       pthread_cond_destroy(_cond);
       pthread_cond_init(_cond, NULL);
--- a/src/os/bsd/vm/os_bsd.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/os/bsd/vm/os_bsd.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -30,9 +30,6 @@
 // Information about the protection of the page at address '0' on this os.
 static bool zero_page_read_protected() { return true; }
 
-// pthread_getattr_np comes with BsdThreads-0.9-7 on RedHat 7.1
-typedef int (*pthread_getattr_func_type)(pthread_t, pthread_attr_t *);
-
 #ifdef __APPLE__
 // Mac OS X doesn't support clock_gettime. Stub out the type, it is
 // unused
@@ -145,9 +142,6 @@
 
   // none present
 
-  // BsdThreads work-around for 6292965
-  static int safe_cond_timedwait(pthread_cond_t *_cond, pthread_mutex_t *_mutex, const struct timespec *_abstime);
-
  private:
   typedef int (*sched_getcpu_func_t)(void);
   typedef int (*numa_node_to_cpus_func_t)(int node, unsigned long *buffer, int bufferlen);
--- a/src/os/linux/vm/globals_linux.hpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/os/linux/vm/globals_linux.hpp	Thu Jul 02 11:12:59 2015 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -28,14 +28,15 @@
 //
 // Defines Linux specific flags. They are not available on other platforms.
 //
-#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct) \
+#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct, range, constraint) \
+                                                                        \
   product(bool, UseOprofile, false,                                     \
         "enable support for Oprofile profiler")                         \
                                                                         \
+  /*  NB: The default value of UseLinuxPosixThreadCPUClocks may be   */ \
+  /* overridden in Arguments::parse_each_vm_init_arg.                */ \
   product(bool, UseLinuxPosixThreadCPUClocks, true,                     \
           "enable fast Linux Posix clocks where available")             \
-/*  NB: The default value of UseLinuxPosixThreadCPUClocks may be        \
-    overridden in Arguments::parse_each_vm_init_arg.  */                \
                                                                         \
   product(bool, UseHugeTLBFS, false,                                    \
           "Use MAP_HUGETLB for large pages")                            \
--- a/src/os/linux/vm/os_linux.cpp	Tue Jun 16 17:31:53 2015 +0100
+++ b/src/os/linux/vm/os_linux.cpp	Thu Jul 02 11:12:59 2015 +0100
@@ -135,8 +135,6 @@
 pthread_t os::Linux::_main_thread;
 int os::Linux::_page_size = -1;
 const int os::Linux::_vm_default_page_size = (8 * K);
-bool os::Linux::_is_floating_stack = false;
-bool os::Linux::_is_NPTL = false;
 bool os::Linux::_supports_fast_thread_cpu_time = false;
 const char * os::Linux::_glibc_version = NULL;
 const char * os::Linux::_libpthread_version = NULL;
@@ -150,8 +148,6 @@
 static sigset_t check_signal_done;
 static bool check_signals = true;
 
-static pid_t _initial_pid = 0;
-
 // Signal number used to suspend/resume a thread
 
 // do not use any signal number less than SIGSEGV, see 4355769
@@ -223,18 +219,10 @@
 //
 // Returns the kernel thread id of the currently running thread. Kernel
 // thread id is used to access /proc.
-//
-// (Note that getpid() on LinuxThreads returns kernel thread id too; but
-// on NPTL, it returns the same pid for all threads, as required by POSIX.)
-//
 pid_t os::Linux::gettid() {
   int rslt = syscall(SYS_gettid);
-  if (rslt == -1) {
-    // old kernel, no NPTL support
-    return getpid();
-  } else {
-    return (pid_t)rslt;
-  }
+  assert(rslt != -1, "must be."); // old linuxthreads implementation?
+  return (pid_t)rslt;
 }
 
 // Most versions of linux have a bug where the number of processors are
@@ -508,68 +496,48 @@
 // detecting pthread library
 
 void os::Linux::libpthread_init() {
-  // Save glibc and pthread version strings. Note that _CS_GNU_LIBC_VERSION
-  // and _CS_GNU_LIBPTHREAD_VERSION are supported in glibc >= 2.3.2. Use a
-  // generic name for earlier versions.
-  // Define macros here so we can build HotSpot on old systems.
-#ifndef _CS_GNU_LIBC_VERSION
-  #define _CS_GNU_LIBC_VERSION 2
+  // Save glibc and pthread version strings.
+#if !defined(_CS_GNU_LIBC_VERSION) || \
+    !defined(_CS_GNU_LIBPTHREAD_VERSION)
+  #error "glibc too old (< 2.3.2)"
 #endif
-#ifndef _CS_GNU_LIBPTHREAD_VERSION
-  #define _CS_GNU_LIBPTHREAD_VERSION 3
-#endif
 
   size_t n = confstr(_CS_GNU_LIBC_VERSION, NULL, 0);
-  if (n > 0) {
-    char *str = (char *)malloc(n, mtInternal);
-    confstr(_CS_GNU_LIBC_VERSION, str, n);
-    os::Linux::set_glibc_version(str);
-  } else {
-    // _CS_GNU_LIBC_VERSION is not supported, try gnu_get_libc_version()
-    static char _gnu_libc_version[32];
-    jio_snprintf(_gnu_libc_version, sizeof(_gnu_libc_version),
-                 "glibc %s %s", gnu_get_libc_version(), gnu_get_libc_release());
-    os::Linux::set_glibc_version(_gnu_libc_version);
-  }
+  assert(n > 0, "cannot retrieve glibc version");
+  char *str = (char *)malloc(n, mtInternal);
+  confstr(_CS_GNU_LIBC_VERSION, str, n);
+  os::Linux::set_glibc_version(str);
 
   n = confstr(_CS_GNU_LIBPTHREAD_VERSION, NULL, 0);
-  if (n > 0) {
-    char *str = (char *)malloc(n, mtInternal);
-    confstr(_CS_GNU_LIBPTHREAD_VERSION, str, n);
-    // Vanilla RH-9 (glibc 2.3.2) has a bug that confstr() always tells
-    // us "NPTL-0.29" even we are running with LinuxThreads. Check if this
-    // is the case. LinuxThreads has a hard limit on max number of threads.
-    // So sysconf(_SC_THREAD_THREADS_MAX) will return a positive value.
-    // On the other hand, NPTL does not have such a limit, sysconf()
-    // will return -1 and errno is not changed. Check if it is really NPTL.
-    if (strcmp(os::Linux::glibc_version(), "glibc 2.3.2") == 0 &&
-        strstr(str, "NPTL") &&
-        sysconf(_SC_THREAD_THREADS_MAX) > 0) {
-      free(str);
-      os::Linux::set_libpthread_version("linuxthreads");
-    } else {
-      os::Linux::set_libpthread_version(str);
-    }
-  } else {
-    // glibc before 2.3.2 only has LinuxThreads.
-    os::Linux::set_libpthread_version("linuxthreads");
-  }
-
-  if (strstr(libpthread_version(), "NPTL")) {
-    os::Linux::set_is_NPTL();
-  } else {
-    os::Linux::set_is_LinuxThreads();
-  }
-
-  // LinuxThreads have two flavors: floating-stack mode, which allows variable
-  // stack size; and fixed-stack mode. NPTL is always floating-stack.
-  if (os::Linux::is_NPTL() || os::Linux::supports_variable_stack_size()) {
-    os::Linux::set_is_floating_stack();
-  }
+  assert(n > 0, "cannot retrieve pthread version");
+  str = (char *)malloc(n, mtInternal);
+  confstr(_CS_GNU_LIBPTHREAD_VERSION, str, n);
+  os::Linux::set_libpthread_version(str);
 }
 
 /////////////////////////////////////////////////////////////////////////////
-// thread stack
+// thread stack expansion
+
+// os::Linux::manually_expand_stack() takes care of expanding the thread
+// stack. Note that this is normally not needed: pthread stacks allocate
+// thread stack using mmap() without MAP_NORESERVE, so the stack is already
+// committed. Therefore it is not necessary to expand the stack manually.
+//
+// Manually expanding the stack was historically needed on LinuxThreads
+// thread stacks, which were allocated with mmap(MAP_GROWSDOWN). Nowadays
+// it is kept to deal with very rare corner cases:
+//
+// For one, user may run the VM on an own implementation of threads
+// whose stacks are - like the old LinuxThreads - implemented using
+// mmap(MAP_GROWSDOWN).
+//
+// Also, this coding may be needed if the VM is running on the primordial
+// thread. Normally we avoid running on the primordial thread; however,
+// user may still invoke the VM on the primord