changeset 8614:c1b2825ef47e jdk9-b72

Merge
author lana
date Thu, 02 Jul 2015 16:09:16 -0700
parents e0e81b7ec6cd aec5456c3e72
children aa457465c1cd d49e3c587121
files
diffstat 293 files changed, 12232 insertions(+), 3390 deletions(-)
--- a/agent/make/Makefile	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/make/Makefile	Thu Jul 02 16:09:16 2015 -0700
@@ -58,6 +58,7 @@
 sun.jvm.hotspot.debugger.dummy \
 sun.jvm.hotspot.debugger.linux \
 sun.jvm.hotspot.debugger.linux.amd64 \
+sun.jvm.hotspot.debugger.linux.aarch64 \
 sun.jvm.hotspot.debugger.linux.ppc64 \
 sun.jvm.hotspot.debugger.linux.x86 \
 sun.jvm.hotspot.debugger.posix \
@@ -65,6 +66,7 @@
 sun.jvm.hotspot.debugger.ppc64 \
 sun.jvm.hotspot.debugger.proc \
 sun.jvm.hotspot.debugger.proc.amd64 \
+sun.jvm.hotspot.debugger.proc.aarch64 \
 sun.jvm.hotspot.debugger.proc.ppc64 \
 sun.jvm.hotspot.debugger.proc.sparc \
 sun.jvm.hotspot.debugger.proc.x86 \
@@ -91,11 +93,13 @@
 sun.jvm.hotspot.prims \
 sun.jvm.hotspot.runtime \
 sun.jvm.hotspot.runtime.amd64 \
+sun.jvm.hotspot.runtime.aarch64 \
 sun.jvm.hotspot.runtime.bsd \
 sun.jvm.hotspot.runtime.bsd_amd64 \
 sun.jvm.hotspot.runtime.bsd_x86 \
 sun.jvm.hotspot.runtime.linux \
 sun.jvm.hotspot.runtime.linux_amd64 \
+sun.jvm.hotspot.runtime.linux_aarch64 \
 sun.jvm.hotspot.runtime.linux_ppc64 \
 sun.jvm.hotspot.runtime.linux_sparc \
 sun.jvm.hotspot.runtime.linux_x86 \
@@ -149,16 +153,19 @@
 sun/jvm/hotspot/debugger/linux/*.java \
 sun/jvm/hotspot/debugger/linux/ppc64/*.java \
 sun/jvm/hotspot/debugger/linux/x86/*.java \
+sun/jvm/hotspot/debugger/linux/aarch64/*.java \
 sun/jvm/hotspot/debugger/posix/*.java \
 sun/jvm/hotspot/debugger/posix/elf/*.java \
 sun/jvm/hotspot/debugger/ppc64/*.java \
 sun/jvm/hotspot/debugger/proc/*.java \
 sun/jvm/hotspot/debugger/proc/amd64/*.java \
+sun/jvm/hotspot/debugger/proc/aarch64/*.java \
 sun/jvm/hotspot/debugger/proc/ppc64/*.java \
 sun/jvm/hotspot/debugger/proc/sparc/*.java \
 sun/jvm/hotspot/debugger/proc/x86/*.java \
 sun/jvm/hotspot/debugger/remote/*.java \
 sun/jvm/hotspot/debugger/remote/amd64/*.java \
+sun/jvm/hotspot/debugger/remote/aarch64/*.java \
 sun/jvm/hotspot/debugger/remote/ppc64/*.java \
 sun/jvm/hotspot/debugger/remote/sparc/*.java \
 sun/jvm/hotspot/debugger/remote/x86/*.java \
@@ -178,11 +185,13 @@
 sun/jvm/hotspot/prims/*.java \
 sun/jvm/hotspot/runtime/*.java \
 sun/jvm/hotspot/runtime/amd64/*.java \
+sun/jvm/hotspot/runtime/aarch64/*.java \
 sun/jvm/hotspot/runtime/bsd/*.java \
 sun/jvm/hotspot/runtime/bsd_amd64/*.java \
 sun/jvm/hotspot/runtime/bsd_x86/*.java \
 sun/jvm/hotspot/runtime/linux/*.java \
 sun/jvm/hotspot/runtime/linux_amd64/*.java \
+sun/jvm/hotspot/runtime/linux_aarch64/*.java \
 sun/jvm/hotspot/runtime/linux_ppc64/*.java \
 sun/jvm/hotspot/runtime/linux_sparc/*.java \
 sun/jvm/hotspot/runtime/linux_x86/*.java \
--- a/agent/src/os/linux/LinuxDebuggerLocal.c	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/src/os/linux/LinuxDebuggerLocal.c	Thu Jul 02 16:09:16 2015 -0700
@@ -53,6 +53,10 @@
 #include "sun_jvm_hotspot_debugger_ppc64_PPC64ThreadContext.h"
 #endif
 
+#ifdef aarch64
+#include "sun_jvm_hotspot_debugger_aarch64_AARCH64ThreadContext.h"
+#endif
+
 static jfieldID p_ps_prochandle_ID = 0;
 static jfieldID threadList_ID = 0;
 static jfieldID loadObjectList_ID = 0;
@@ -368,7 +372,7 @@
 #define NPRGREG sun_jvm_hotspot_debugger_amd64_AMD64ThreadContext_NPRGREG
 #endif
 #ifdef aarch64
-#define NPRGREG 32
+#define NPRGREG sun_jvm_hotspot_debugger_aarch64_AARCH64ThreadContext_NPRGREG
 #endif
 #if defined(sparc) || defined(sparcv9)
 #define NPRGREG sun_jvm_hotspot_debugger_sparc_SPARCThreadContext_NPRGREG
@@ -473,6 +477,13 @@
 
 #define REG_INDEX(reg) sun_jvm_hotspot_debugger_aarch64_AARCH64ThreadContext_##reg
 
+  {
+    int i;
+    for (i = 0; i < 31; i++)
+      regs[i] = gregs.regs[i];
+    regs[REG_INDEX(SP)] = gregs.sp;
+    regs[REG_INDEX(PC)] = gregs.pc;
+  }
 #endif /* aarch64 */
 
 #ifdef ppc64
--- a/agent/src/os/linux/Makefile	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/src/os/linux/Makefile	Thu Jul 02 16:09:16 2015 -0700
@@ -53,14 +53,15 @@
         $(JAVAH) -jni -classpath ../../../build/classes -d $(ARCH) \
 		sun.jvm.hotspot.debugger.x86.X86ThreadContext \
 		sun.jvm.hotspot.debugger.sparc.SPARCThreadContext \
-		sun.jvm.hotspot.debugger.amd64.AMD64ThreadContext 
+		sun.jvm.hotspot.debugger.amd64.AMD64ThreadContext \
+		sun.jvm.hotspot.debugger.aarch64.AARCH64ThreadContext 
         $(GCC) $(CFLAGS) $< -o $@
 
 $(ARCH)/sadis.o:  ../../share/native/sadis.c
         $(JAVAH) -jni -classpath ../../../build/classes -d $(ARCH) \
                 sun.jvm.hotspot.asm.Disassembler
         $(GCC) $(CFLAGS) $< -o $@
- 
+
 $(ARCH)/%.o: %.c
         $(GCC) $(CFLAGS) $< -o $@
 
--- a/agent/src/os/linux/libproc.h	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/src/os/linux/libproc.h	Thu Jul 02 16:09:16 2015 -0700
@@ -72,6 +72,7 @@
 #define user_regs_struct  pt_regs
 #endif
 #if defined(aarch64)
+#include <asm/ptrace.h>
 #define user_regs_struct user_pt_regs
 #endif
 
--- a/agent/src/os/linux/proc_service.h	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/src/os/linux/proc_service.h	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -30,7 +30,7 @@
 
 // Linux does not have the proc service library, though it does provide the
 // thread_db library which can be used to manipulate threads without having
-// to know the details of LinuxThreads or NPTL
+// to know the details of NPTL
 
 // copied from Solaris "proc_service.h"
 typedef enum {
--- a/agent/src/share/classes/sun/jvm/hotspot/HSDB.java	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/HSDB.java	Thu Jul 02 16:09:16 2015 -0700
@@ -983,19 +983,15 @@
                                                      curFrame.getFP(),
                                                      anno));
             } else {
-              if (VM.getVM().getCPU().equals("x86") || VM.getVM().getCPU().equals("amd64")) {
-                // For C2, which has null frame pointers on x86/amd64
-                CodeBlob cb = VM.getVM().getCodeCache().findBlob(curFrame.getPC());
-                Address sp = curFrame.getSP();
-                if (Assert.ASSERTS_ENABLED) {
-                  Assert.that(cb.getFrameSize() > 0, "CodeBlob must have non-zero frame size");
-                }
-                annoPanel.addAnnotation(new Annotation(sp,
-                                                       sp.addOffsetTo(cb.getFrameSize()),
-                                                       anno));
-              } else {
-                Assert.that(VM.getVM().getCPU().equals("ia64"), "only ia64 should reach here");
+              // For C2, which has null frame pointers on x86/amd64/aarch64
+              CodeBlob cb = VM.getVM().getCodeCache().findBlob(curFrame.getPC());
+              Address sp = curFrame.getSP();
+              if (Assert.ASSERTS_ENABLED) {
+                Assert.that(cb.getFrameSize() > 0, "CodeBlob must have non-zero frame size");
               }
+              annoPanel.addAnnotation(new Annotation(sp,
+                                                     sp.addOffsetTo(cb.getFrameSize()),
+                                                     anno));
             }
 
             // Add interpreter frame annotations
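The hunk above annotates a compiled frame by its stack extent, [SP, SP + frame size), because C2-compiled frames carry no usable frame pointer on x86, amd64, or aarch64. A small worked illustration of that range arithmetic, using assumed example values rather than anything taken from the changeset:

// Illustrative only: mirrors sp.addOffsetTo(cb.getFrameSize()) with assumed numbers.
public class FrameAnnotationRange {
    public static void main(String[] args) {
        long sp = 0x7f0000200000L;   // assumed stack pointer of the compiled frame
        long frameSize = 96;         // assumed CodeBlob frame size in bytes
        long lo = sp;                // annotation start
        long hi = sp + frameSize;    // annotation end (exclusive)
        System.out.printf("annotation covers [0x%x, 0x%x)%n", lo, hi);
    }
}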
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/aarch64/AARCH64ThreadContext.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.aarch64;
+
+import java.lang.annotation.Native;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.cdbg.*;
+
+/** Specifies the thread context on aarch64 platforms; only a sub-portion
+ * of the context is guaranteed to be present on all operating
+ * systems. */
+
+public abstract class AARCH64ThreadContext implements ThreadContext {
+    // Taken from /usr/include/asm/sigcontext.h on Linux/AARCH64.
+
+    // NOTE: the indices for the various registers must be maintained as
+    // listed across various operating systems. However, only a small
+    // subset of the registers' values are guaranteed to be present (and
+    // must be present for the SA's stack walking to work)
+
+    // One instance of the Native annotation is enough to trigger header generation
+    // for this file.
+    @Native
+    public static final int R0 = 0;
+    public static final int R1 = 1;
+    public static final int R2 = 2;
+    public static final int R3 = 3;
+    public static final int R4 = 4;
+    public static final int R5 = 5;
+    public static final int R6 = 6;
+    public static final int R7 = 7;
+    public static final int R8 = 8;
+    public static final int R9 = 9;
+    public static final int R10 = 10;
+    public static final int R11 = 11;
+    public static final int R12 = 12;
+    public static final int R13 = 13;
+    public static final int R14 = 14;
+    public static final int R15 = 15;
+    public static final int R16 = 16;
+    public static final int R17 = 17;
+    public static final int R18 = 18;
+    public static final int R19 = 19;
+    public static final int R20 = 20;
+    public static final int R21 = 21;
+    public static final int R22 = 22;
+    public static final int R23 = 23;
+    public static final int R24 = 24;
+    public static final int R25 = 25;
+    public static final int R26 = 26;
+    public static final int R27 = 27;
+    public static final int R28 = 28;
+    public static final int FP = 29;
+    public static final int LR = 30;
+    public static final int SP = 31;
+    public static final int PC = 32;
+
+    public static final int NPRGREG = 33;
+
+    private long[] data;
+
+    public AARCH64ThreadContext() {
+        data = new long[NPRGREG];
+    }
+
+    public int getNumRegisters() {
+        return NPRGREG;
+    }
+
+    public String getRegisterName(int index) {
+        switch (index) {
+        case LR: return "lr";
+        case SP: return "sp";
+        case PC: return "pc";
+        default:
+            return "r" + index;
+        }
+    }
+
+    public void setRegister(int index, long value) {
+        data[index] = value;
+    }
+
+    public long getRegister(int index) {
+        return data[index];
+    }
+
+    public CFrame getTopFrame(Debugger dbg) {
+        return null;
+    }
+
+    /** This can't be implemented in this class since we would have to
+     * tie the implementation to, for example, the debugging system */
+    public abstract void setRegisterAsAddress(int index, Address value);
+
+    /** This can't be implemented in this class since we would have to
+     * tie the implementation to, for example, the debugging system */
+    public abstract Address getRegisterAsAddress(int index);
+}
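The context class above exposes 33 integer registers (x0..x30, SP, PC) through fixed indices, and only a few of them are guaranteed to be populated. A minimal usage sketch of how a caller such as a stack walker reads the guaranteed slots by index; the helper class name is hypothetical, and the concrete context instance is assumed to come from one of the platform subclasses added in this changeset:

import sun.jvm.hotspot.debugger.Address;
import sun.jvm.hotspot.debugger.aarch64.AARCH64ThreadContext;

// Hypothetical helper for illustration only: prints the registers the
// SA stack walker depends on, using the index constants defined above.
class AARCH64ContextDump {
    static void dump(AARCH64ThreadContext ctx) {
        Address sp = ctx.getRegisterAsAddress(AARCH64ThreadContext.SP);
        Address fp = ctx.getRegisterAsAddress(AARCH64ThreadContext.FP);
        Address pc = ctx.getRegisterAsAddress(AARCH64ThreadContext.PC);
        System.out.println("sp=" + sp + " fp=" + fp + " pc=" + pc);
    }
}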
--- a/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/LinuxCDebugger.java	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/LinuxCDebugger.java	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -31,12 +32,14 @@
 import sun.jvm.hotspot.debugger.cdbg.*;
 import sun.jvm.hotspot.debugger.x86.*;
 import sun.jvm.hotspot.debugger.amd64.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
 import sun.jvm.hotspot.debugger.sparc.*;
 import sun.jvm.hotspot.debugger.ppc64.*;
 import sun.jvm.hotspot.debugger.linux.x86.*;
 import sun.jvm.hotspot.debugger.linux.amd64.*;
 import sun.jvm.hotspot.debugger.linux.sparc.*;
 import sun.jvm.hotspot.debugger.linux.ppc64.*;
+import sun.jvm.hotspot.debugger.linux.aarch64.*;
 import sun.jvm.hotspot.utilities.*;
 
 class LinuxCDebugger implements CDebugger {
@@ -106,6 +109,13 @@
         Address pc  = context.getRegisterAsAddress(PPC64ThreadContext.PC);
         if (pc == null) return null;
         return new LinuxPPC64CFrame(dbg, sp, pc, LinuxDebuggerLocal.getAddressSize());
+    } else if (cpu.equals("aarch64")) {
+       AARCH64ThreadContext context = (AARCH64ThreadContext) thread.getContext();
+       Address fp = context.getRegisterAsAddress(AARCH64ThreadContext.FP);
+       if (fp == null) return null;
+       Address pc  = context.getRegisterAsAddress(AARCH64ThreadContext.PC);
+       if (pc == null) return null;
+       return new LinuxAARCH64CFrame(dbg, fp, pc);
      } else {
        // Runtime exception thrown by LinuxThreadContextFactory if unknown cpu
        ThreadContext context = (ThreadContext) thread.getContext();
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/aarch64/LinuxAARCH64CFrame.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.linux.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.linux.*;
+import sun.jvm.hotspot.debugger.cdbg.*;
+import sun.jvm.hotspot.debugger.cdbg.basic.*;
+
+final public class LinuxAARCH64CFrame extends BasicCFrame {
+   public LinuxAARCH64CFrame(LinuxDebugger dbg, Address fp, Address pc) {
+      super(dbg.getCDebugger());
+      this.fp = fp;
+      this.pc = pc;
+      this.dbg = dbg;
+   }
+
+   // override base class impl to avoid ELF parsing
+   public ClosestSymbol closestSymbolToPC() {
+      // try native lookup in debugger.
+      return dbg.lookup(dbg.getAddressValue(pc()));
+   }
+
+   public Address pc() {
+      return pc;
+   }
+
+   public Address localVariableBase() {
+      return fp;
+   }
+
+   public CFrame sender(ThreadProxy thread) {
+      AARCH64ThreadContext context = (AARCH64ThreadContext) thread.getContext();
+      Address rsp = context.getRegisterAsAddress(AARCH64ThreadContext.SP);
+
+      if ((fp == null) || fp.lessThan(rsp)) {
+        return null;
+      }
+
+      // Check alignment of fp
+      if (dbg.getAddressValue(fp) % (2 * ADDRESS_SIZE) != 0) {
+        return null;
+      }
+
+      Address nextFP = fp.getAddressAt(0 * ADDRESS_SIZE);
+      if (nextFP == null || nextFP.lessThanOrEqual(fp)) {
+        return null;
+      }
+      Address nextPC  = fp.getAddressAt(1 * ADDRESS_SIZE);
+      if (nextPC == null) {
+        return null;
+      }
+      return new LinuxAARCH64CFrame(dbg, nextFP, nextPC);
+   }
+
+   // package/class internals only
+   private static final int ADDRESS_SIZE = 8;
+   private Address pc;
+   private Address sp;
+   private Address fp;
+   private LinuxDebugger dbg;
+}
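sender() above walks the native frame chain by treating FP as an AArch64 frame record: the caller's saved FP sits at [fp + 0] and the return address at [fp + ADDRESS_SIZE], and the walk stops once the chain stops growing toward higher addresses. A compact sketch of just that layout assumption, using a hypothetical helper class:

import sun.jvm.hotspot.debugger.Address;

// Hypothetical sketch for illustration only: one step of the
// frame-record walk performed by LinuxAARCH64CFrame.sender().
class FrameRecordStep {
    static final int ADDRESS_SIZE = 8; // AArch64 pointers are 8 bytes

    // Returns {nextFP, nextPC}, or null when the chain looks implausible.
    static Address[] next(Address fp) {
        Address nextFP = fp.getAddressAt(0 * ADDRESS_SIZE); // saved frame pointer
        Address nextPC = fp.getAddressAt(1 * ADDRESS_SIZE); // return address
        if (nextFP == null || nextFP.lessThanOrEqual(fp) || nextPC == null) {
            return null;
        }
        return new Address[] { nextFP, nextPC };
    }
}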
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/aarch64/LinuxAARCH64ThreadContext.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.linux.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.linux.*;
+
+public class LinuxAARCH64ThreadContext extends AARCH64ThreadContext {
+  private LinuxDebugger debugger;
+
+  public LinuxAARCH64ThreadContext(LinuxDebugger debugger) {
+    super();
+    this.debugger = debugger;
+  }
+
+  public void setRegisterAsAddress(int index, Address value) {
+    setRegister(index, debugger.getAddressValue(value));
+  }
+
+  public Address getRegisterAsAddress(int index) {
+    return debugger.newAddress(getRegister(index));
+  }
+}
--- a/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/ProcDebuggerLocal.java	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/ProcDebuggerLocal.java	Thu Jul 02 16:09:16 2015 -0700
@@ -31,11 +31,13 @@
 import sun.jvm.hotspot.debugger.*;
 import sun.jvm.hotspot.debugger.cdbg.*;
 import sun.jvm.hotspot.debugger.proc.amd64.*;
+import sun.jvm.hotspot.debugger.proc.aarch64.*;
 import sun.jvm.hotspot.debugger.proc.sparc.*;
 import sun.jvm.hotspot.debugger.proc.ppc64.*;
 import sun.jvm.hotspot.debugger.proc.x86.*;
 import sun.jvm.hotspot.debugger.ppc64.*;
 import sun.jvm.hotspot.debugger.amd64.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
 import sun.jvm.hotspot.debugger.sparc.*;
 import sun.jvm.hotspot.debugger.x86.*;
 import sun.jvm.hotspot.utilities.*;
@@ -88,6 +90,10 @@
             threadFactory = new ProcAMD64ThreadFactory(this);
             pcRegIndex = AMD64ThreadContext.RIP;
             fpRegIndex = AMD64ThreadContext.RBP;
+        } else if (cpu.equals("aarch64")) {
+            threadFactory = new ProcAARCH64ThreadFactory(this);
+            pcRegIndex = AARCH64ThreadContext.PC;
+            fpRegIndex = AARCH64ThreadContext.FP;
         } else if (cpu.equals("ppc64")) {
             threadFactory = new ProcPPC64ThreadFactory(this);
             pcRegIndex = PPC64ThreadContext.PC;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/aarch64/ProcAARCH64Thread.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.proc.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.proc.*;
+import sun.jvm.hotspot.utilities.*;
+
+public class ProcAARCH64Thread implements ThreadProxy {
+    private ProcDebugger debugger;
+    private int         id;
+
+    public ProcAARCH64Thread(ProcDebugger debugger, Address addr) {
+        this.debugger = debugger;
+
+        // FIXME: the size here should be configurable. However, making it
+        // so would produce a dependency on the "types" package from the
+        // debugger package, which is not desired.
+        this.id       = (int) addr.getCIntegerAt(0, 4, true);
+    }
+
+    public ProcAARCH64Thread(ProcDebugger debugger, long id) {
+        this.debugger = debugger;
+        this.id = (int) id;
+    }
+
+    public ThreadContext getContext() throws IllegalThreadStateException {
+        ProcAARCH64ThreadContext context = new ProcAARCH64ThreadContext(debugger);
+        long[] regs = debugger.getThreadIntegerRegisterSet(id);
+        if (Assert.ASSERTS_ENABLED) {
+            Assert.that(regs.length == AARCH64ThreadContext.NPRGREG, "size mismatch");
+        }
+        for (int i = 0; i < regs.length; i++) {
+            context.setRegister(i, regs[i]);
+        }
+        return context;
+    }
+
+    public boolean canSetContext() throws DebuggerException {
+        return false;
+    }
+
+    public void setContext(ThreadContext context)
+    throws IllegalThreadStateException, DebuggerException {
+        throw new DebuggerException("Unimplemented");
+    }
+
+    public String toString() {
+        return "t@" + id;
+    }
+
+    public boolean equals(Object obj) {
+        if ((obj == null) || !(obj instanceof ProcAARCH64Thread)) {
+            return false;
+        }
+
+        return (((ProcAARCH64Thread) obj).id == id);
+    }
+
+    public int hashCode() {
+        return id;
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/aarch64/ProcAARCH64ThreadContext.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.proc.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.proc.*;
+
+public class ProcAARCH64ThreadContext extends AARCH64ThreadContext {
+    private ProcDebugger debugger;
+
+    public ProcAARCH64ThreadContext(ProcDebugger debugger) {
+        super();
+        this.debugger = debugger;
+    }
+
+    public void setRegisterAsAddress(int index, Address value) {
+        setRegister(index, debugger.getAddressValue(value));
+    }
+
+    public Address getRegisterAsAddress(int index) {
+        return debugger.newAddress(getRegister(index));
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/aarch64/ProcAARCH64ThreadFactory.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.proc.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.proc.*;
+
+public class ProcAARCH64ThreadFactory implements ProcThreadFactory {
+    private ProcDebugger debugger;
+
+    public ProcAARCH64ThreadFactory(ProcDebugger debugger) {
+        this.debugger = debugger;
+    }
+
+    public ThreadProxy createThreadWrapper(Address threadIdentifierAddr) {
+        return new ProcAARCH64Thread(debugger, threadIdentifierAddr);
+    }
+
+    public ThreadProxy createThreadWrapper(long id) {
+        return new ProcAARCH64Thread(debugger, id);
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/aarch64/RemoteAARCH64Thread.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.remote.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.remote.*;
+import sun.jvm.hotspot.utilities.*;
+
+public class RemoteAARCH64Thread extends RemoteThread  {
+  public RemoteAARCH64Thread(RemoteDebuggerClient debugger, Address addr) {
+     super(debugger, addr);
+  }
+
+  public RemoteAARCH64Thread(RemoteDebuggerClient debugger, long id) {
+     super(debugger, id);
+  }
+
+  public ThreadContext getContext() throws IllegalThreadStateException {
+    RemoteAARCH64ThreadContext context = new RemoteAARCH64ThreadContext(debugger);
+    long[] regs = (addr != null)? debugger.getThreadIntegerRegisterSet(addr) :
+                                  debugger.getThreadIntegerRegisterSet(id);
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(regs.length == AARCH64ThreadContext.NPRGREG, "size of register set must match");
+    }
+    for (int i = 0; i < regs.length; i++) {
+      context.setRegister(i, regs[i]);
+    }
+    return context;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/aarch64/RemoteAARCH64ThreadContext.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.remote.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.remote.*;
+
+public class RemoteAARCH64ThreadContext extends AARCH64ThreadContext {
+  private RemoteDebuggerClient debugger;
+
+  public RemoteAARCH64ThreadContext(RemoteDebuggerClient debugger) {
+    super();
+    this.debugger = debugger;
+  }
+
+  public void setRegisterAsAddress(int index, Address value) {
+    setRegister(index, debugger.getAddressValue(value));
+  }
+
+  public Address getRegisterAsAddress(int index) {
+    return debugger.newAddress(getRegister(index));
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/aarch64/RemoteAARCH64ThreadFactory.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.remote.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.remote.*;
+
+public class RemoteAARCH64ThreadFactory implements RemoteThreadFactory {
+  private RemoteDebuggerClient debugger;
+
+  public RemoteAARCH64ThreadFactory(RemoteDebuggerClient debugger) {
+    this.debugger = debugger;
+  }
+
+  public ThreadProxy createThreadWrapper(Address threadIdentifierAddr) {
+    return new RemoteAARCH64Thread(debugger, threadIdentifierAddr);
+  }
+
+  public ThreadProxy createThreadWrapper(long id) {
+    return new RemoteAARCH64Thread(debugger, id);
+  }
+}
--- a/agent/src/share/classes/sun/jvm/hotspot/gc/shared/Generation.java	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/gc/shared/Generation.java	Thu Jul 02 16:09:16 2015 -0700
@@ -49,7 +49,6 @@
 public abstract class Generation extends VMObject {
   private static long          reservedFieldOffset;
   private static long          virtualSpaceFieldOffset;
-  private static CIntegerField levelField;
   protected static final int  K = 1024;
   // Fields for class StatRecord
   private static Field         statRecordField;
@@ -75,7 +74,6 @@
 
     reservedFieldOffset     = type.getField("_reserved").getOffset();
     virtualSpaceFieldOffset = type.getField("_virtual_space").getOffset();
-    levelField              = type.getCIntegerField("_level");
     // StatRecord
     statRecordField         = type.getField("_stat_record");
     type                    = db.lookupType("Generation::StatRecord");
@@ -130,14 +128,6 @@
      }
   }
 
-  public GenerationSpec spec() {
-    return ((GenCollectedHeap) VM.getVM().getUniverse().heap()).spec(level());
-  }
-
-  public int level() {
-    return (int) levelField.getValue(addr);
-  }
-
   public int invocations() {
     return getStatRecord().getInvocations();
   }
--- a/agent/src/share/classes/sun/jvm/hotspot/runtime/Frame.java	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/Frame.java	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -357,12 +357,6 @@
   // FIXME: avoiding implementing this for now if possible
   //  public void interpreter_frame_set_monitor_end(BasicObjectLock* value);
   //  public void interpreter_frame_verify_monitor(BasicObjectLock* value) const;
-  //
-  // Tells whether the current interpreter_frame frame pointer
-  // corresponds to the old compiled/deoptimized fp
-  // The receiver used to be a top level frame
-  // public boolean interpreter_frame_equals_unpacked_fp(intptr_t* fp);
-
   //--------------------------------------------------------------------------------
   // Method and constant pool cache:
   //
--- a/agent/src/share/classes/sun/jvm/hotspot/runtime/Threads.java	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/Threads.java	Thu Jul 02 16:09:16 2015 -0700
@@ -35,6 +35,7 @@
 import sun.jvm.hotspot.runtime.win32_x86.Win32X86JavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.linux_x86.LinuxX86JavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.linux_amd64.LinuxAMD64JavaThreadPDAccess;
+import sun.jvm.hotspot.runtime.linux_aarch64.LinuxAARCH64JavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.linux_ppc64.LinuxPPC64JavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.linux_sparc.LinuxSPARCJavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.bsd_x86.BsdX86JavaThreadPDAccess;
@@ -91,6 +92,8 @@
                 access = new LinuxSPARCJavaThreadPDAccess();
             } else if (cpu.equals("ppc64")) {
                 access = new LinuxPPC64JavaThreadPDAccess();
+            } else if (cpu.equals("aarch64")) {
+                access = new LinuxAARCH64JavaThreadPDAccess();
             } else {
               try {
                 access = (JavaThreadPDAccess)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/aarch64/AARCH64CurrentFrameGuess.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2003, 2006, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.code.*;
+import sun.jvm.hotspot.interpreter.*;
+import sun.jvm.hotspot.runtime.*;
+import sun.jvm.hotspot.runtime.aarch64.*;
+
+/** <P> Should be able to be used on all aarch64 platforms we support
+    (Linux/aarch64) to implement JavaThread's "currentFrameGuess()"
+    functionality. Input is an AARCH64ThreadContext; output is SP, FP,
+    and PC for an AARCH64Frame. Instantiation of the AARCH64Frame is
+    left to the caller, since we may need to subclass AARCH64Frame to
+    support signal handler frames on Unix platforms. </P>
+
+    <P> Algorithm is to walk up the stack within a given range (say,
+    512K at most) looking for a plausible PC and SP for a Java frame,
+    also considering those coming in from the context. If we find a PC
+    that belongs to the VM (i.e., in generated code like the
+    interpreter or CodeCache) then we try to find an associated FP.
+    We repeat this until we either find a complete frame or run out of
+    stack to look at. </P> */
+
+public class AARCH64CurrentFrameGuess {
+  private AARCH64ThreadContext context;
+  private JavaThread       thread;
+  private Address          spFound;
+  private Address          fpFound;
+  private Address          pcFound;
+
+  private static final boolean DEBUG = System.getProperty("sun.jvm.hotspot.runtime.aarch64.AARCH64Frame.DEBUG")
+                                       != null;
+
+  public AARCH64CurrentFrameGuess(AARCH64ThreadContext context,
+                              JavaThread thread) {
+    this.context = context;
+    this.thread  = thread;
+  }
+
+  /** Returns false if not able to find a frame within a reasonable range. */
+  public boolean run(long regionInBytesToSearch) {
+    Address sp  = context.getRegisterAsAddress(AARCH64ThreadContext.SP);
+    Address pc  = context.getRegisterAsAddress(AARCH64ThreadContext.PC);
+    Address fp  = context.getRegisterAsAddress(AARCH64ThreadContext.FP);
+    if (sp == null) {
+      // Bail out if no last java frame either
+      if (thread.getLastJavaSP() != null) {
+        setValues(thread.getLastJavaSP(), thread.getLastJavaFP(), null);
+        return true;
+      }
+      return false;
+    }
+    Address end = sp.addOffsetTo(regionInBytesToSearch);
+    VM vm       = VM.getVM();
+
+    setValues(null, null, null); // Assume we're not going to find anything
+
+    if (vm.isJavaPCDbg(pc)) {
+      if (vm.isClientCompiler()) {
+        // If the topmost frame is a Java frame, we are (pretty much)
+        // guaranteed to have a viable FP. We should be more robust
+        // than this (we have the potential for losing entire threads'
+        // stack traces) but need to see how much work we really have
+        // to do here. Searching the stack for an (SP, FP) pair is
+        // hard since it's easy to misinterpret inter-frame stack
+        // pointers as base-of-frame pointers; we also don't know the
+        // sizes of C1 frames (not registered in the nmethod) so can't
+        // derive them from SP.
+
+        setValues(sp, fp, pc);
+        return true;
+      } else {
+        if (vm.getInterpreter().contains(pc)) {
+          if (DEBUG) {
+            System.out.println("CurrentFrameGuess: choosing interpreter frame: sp = " +
+                               sp + ", fp = " + fp + ", pc = " + pc);
+          }
+          setValues(sp, fp, pc);
+          return true;
+        }
+
+        // For the server compiler, FP is not guaranteed to be valid
+        // for compiled code. In addition, an earlier attempt at a
+        // non-searching algorithm (see below) failed because the
+        // stack pointer from the thread context was pointing
+        // (considerably) beyond the ostensible end of the stack, into
+        // garbage; walking from the topmost frame back caused a crash.
+        //
+        // This algorithm takes the current PC as a given and tries to
+        // find the correct corresponding SP by walking up the stack
+        // and repeatedly performing stackwalks (very inefficient).
+        //
+        // FIXME: there is something wrong with stackwalking across
+        // adapter frames...this is likely to be the root cause of the
+        // failure with the simpler algorithm below.
+
+        for (long offset = 0;
+             offset < regionInBytesToSearch;
+             offset += vm.getAddressSize()) {
+          try {
+            Address curSP = sp.addOffsetTo(offset);
+            Frame frame = new AARCH64Frame(curSP, null, pc);
+            RegisterMap map = thread.newRegisterMap(false);
+            while (frame != null) {
+              if (frame.isEntryFrame() && frame.entryFrameIsFirst()) {
+                // We were able to traverse all the way to the
+                // bottommost Java frame.
+                // This sp looks good. Keep it.
+                if (DEBUG) {
+                  System.out.println("CurrentFrameGuess: Choosing sp = " + curSP + ", pc = " + pc);
+                }
+                setValues(curSP, null, pc);
+                return true;
+              }
+              frame = frame.sender(map);
+            }
+          } catch (Exception e) {
+            if (DEBUG) {
+              System.out.println("CurrentFrameGuess: Exception " + e + " at offset " + offset);
+            }
+            // Bad SP. Try another.
+          }
+        }
+
+        // We were not able to find a plausible SP to go with this PC.
+        // Bail out.
+        return false;
+
+        /*
+        // Original algorithm which does not work because SP was
+        // pointing beyond where it should have:
+
+        // For the server compiler, FP is not guaranteed to be valid
+        // for compiled code. We see whether the PC is in the
+        // interpreter and take care of that, otherwise we run code
+        // (unfortunately) duplicated from AARCH64Frame.senderForCompiledFrame.
+
+        CodeCache cc = vm.getCodeCache();
+        if (cc.contains(pc)) {
+          CodeBlob cb = cc.findBlob(pc);
+
+          // See if we can derive a frame pointer from SP and PC
+          // NOTE: This is the code duplicated from AARCH64Frame
+          Address saved_fp = null;
+          int llink_offset = cb.getLinkOffset();
+          if (llink_offset >= 0) {
+            // Restore base-pointer, since next frame might be an interpreter frame.
+            Address fp_addr = sp.addOffsetTo(VM.getVM().getAddressSize() * llink_offset);
+            saved_fp = fp_addr.getAddressAt(0);
+          }
+
+          setValues(sp, saved_fp, pc);
+          return true;
+        }
+        */
+      }
+    } else {
+      // If the current program counter was not known to us as a Java
+      // PC, we currently assume that we are in the run-time system
+      // and attempt to look to thread-local storage for saved SP and
+      // FP. Note that if these are null (because we were, in fact,
+      // in Java code, i.e., vtable stubs or similar, and the SA
+      // didn't have enough insight into the target VM to understand
+      // that) then we are going to lose the entire stack trace for
+      // the thread, which is sub-optimal. FIXME.
+
+      if (DEBUG) {
+        System.out.println("CurrentFrameGuess: choosing last Java frame: sp = " +
+                           thread.getLastJavaSP() + ", fp = " + thread.getLastJavaFP());
+      }
+      if (thread.getLastJavaSP() == null) {
+        return false; // No known Java frames on stack
+      }
+
+      // The runtime has a nasty habit of not saving fp in the frame
+      // anchor, leaving us to grovel about in the stack to find a
+      // plausible address.  Fortunately, this only happens in
+      // compiled code; there we always have a valid PC, and we always
+      // push LR and FP onto the stack as a pair, with FP at the lower
+      // address.
+      pc = thread.getLastJavaPC();
+      fp = thread.getLastJavaFP();
+      sp = thread.getLastJavaSP();
+
+      if (fp == null) {
+        CodeCache cc = vm.getCodeCache();
+        if (cc.contains(pc)) {
+          CodeBlob cb = cc.findBlob(pc);
+          if (DEBUG) {
+            System.out.println("FP is null.  Found blob frame size " + cb.getFrameSize());
+          }
+          // See if we can derive a frame pointer from SP and PC
+          long link_offset = cb.getFrameSize() - 2 * VM.getVM().getAddressSize();
+          if (link_offset >= 0) {
+            fp = sp.addOffsetTo(link_offset);
+          }
+        }
+      }
+
+      setValues(sp, fp, null);
+
+      return true;
+    }
+  }
+
+  public Address getSP() { return spFound; }
+  public Address getFP() { return fpFound; }
+  /** May be null if getting values from thread-local storage; take
+      care to call the correct AARCH64Frame constructor to recover this if
+      necessary */
+  public Address getPC() { return pcFound; }
+
+  private void setValues(Address sp, Address fp, Address pc) {
+    spFound = sp;
+    fpFound = fp;
+    pcFound = pc;
+  }
+}
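When the frame anchor holds no FP, the code above recovers it from the compiled frame layout: FP and LR are pushed as a pair at the top of the fixed frame, FP at the lower address, so the saved FP lives at sp + frameSize - 2 * wordSize. A worked arithmetic example with assumed values:

// Illustrative only: the fp-recovery arithmetic used in run(), with made-up numbers.
public class FpRecoveryExample {
    public static void main(String[] args) {
        long wordSize  = 8;                  // AArch64 address size
        long frameSize = 10 * wordSize;      // assumed CodeBlob frame size (80 bytes)
        long sp        = 0x7f0000100000L;    // assumed last-Java SP

        long linkOffset = frameSize - 2 * wordSize; // where the FP/LR pair sits
        long fp = sp + linkOffset;

        System.out.printf("fp = sp + %d = 0x%x%n", linkOffset, fp);
    }
}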
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/aarch64/AARCH64Frame.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,555 @@
+/*
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.aarch64;
+
+import java.util.*;
+import sun.jvm.hotspot.code.*;
+import sun.jvm.hotspot.compiler.*;
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.oops.*;
+import sun.jvm.hotspot.runtime.*;
+import sun.jvm.hotspot.types.*;
+import sun.jvm.hotspot.utilities.*;
+
+/** Specialization of and implementation of abstract methods of the
+    Frame class for the aarch64 family of CPUs. */
+
+public class AARCH64Frame extends Frame {
+  private static final boolean DEBUG;
+  static {
+    DEBUG = System.getProperty("sun.jvm.hotspot.runtime.aarch64.AARCH64Frame.DEBUG") != null;
+  }
+
+  // All frames
+  private static final int LINK_OFFSET                =  0;
+  private static final int RETURN_ADDR_OFFSET         =  1;
+  private static final int SENDER_SP_OFFSET           =  2;
+
+  // Interpreter frames
+  private static final int INTERPRETER_FRAME_MIRROR_OFFSET    =  2; // for native calls only
+  private static final int INTERPRETER_FRAME_SENDER_SP_OFFSET = -1;
+  private static final int INTERPRETER_FRAME_LAST_SP_OFFSET   = INTERPRETER_FRAME_SENDER_SP_OFFSET - 1;
+  private static final int INTERPRETER_FRAME_METHOD_OFFSET    = INTERPRETER_FRAME_LAST_SP_OFFSET - 1;
+  private static       int INTERPRETER_FRAME_MDX_OFFSET;         // Non-core builds only
+  private static       int INTERPRETER_FRAME_CACHE_OFFSET;
+  private static       int INTERPRETER_FRAME_LOCALS_OFFSET;
+  private static       int INTERPRETER_FRAME_BCX_OFFSET;
+  private static       int INTERPRETER_FRAME_INITIAL_SP_OFFSET;
+  private static       int INTERPRETER_FRAME_MONITOR_BLOCK_TOP_OFFSET;
+  private static       int INTERPRETER_FRAME_MONITOR_BLOCK_BOTTOM_OFFSET;
+
+  // Entry frames
+  private static       int ENTRY_FRAME_CALL_WRAPPER_OFFSET = -8;
+
+  // Native frames
+  private static final int NATIVE_FRAME_INITIAL_PARAM_OFFSET =  2;
+
+  private static VMReg fp = new VMReg(29);
+
+  static {
+    VM.registerVMInitializedObserver(new Observer() {
+        public void update(Observable o, Object data) {
+          initialize(VM.getVM().getTypeDataBase());
+        }
+      });
+  }
+
+  private static synchronized void initialize(TypeDataBase db) {
+    INTERPRETER_FRAME_MDX_OFFSET                  = INTERPRETER_FRAME_METHOD_OFFSET - 1;
+    INTERPRETER_FRAME_CACHE_OFFSET                = INTERPRETER_FRAME_MDX_OFFSET - 1;
+    INTERPRETER_FRAME_LOCALS_OFFSET               = INTERPRETER_FRAME_CACHE_OFFSET - 1;
+    INTERPRETER_FRAME_BCX_OFFSET                  = INTERPRETER_FRAME_LOCALS_OFFSET - 1;
+    INTERPRETER_FRAME_INITIAL_SP_OFFSET           = INTERPRETER_FRAME_BCX_OFFSET - 1;
+    INTERPRETER_FRAME_MONITOR_BLOCK_TOP_OFFSET    = INTERPRETER_FRAME_INITIAL_SP_OFFSET;
+    INTERPRETER_FRAME_MONITOR_BLOCK_BOTTOM_OFFSET = INTERPRETER_FRAME_INITIAL_SP_OFFSET;
+  }
+
+
+  // an additional field beyond sp and pc:
+  Address raw_fp; // frame pointer
+  private Address raw_unextendedSP;
+
+  private AARCH64Frame() {
+  }
+
+  private void adjustForDeopt() {
+    if ( pc != null) {
+      // Look for a deopt pc and if it is deopted convert to original pc
+      CodeBlob cb = VM.getVM().getCodeCache().findBlob(pc);
+      if (cb != null && cb.isJavaMethod()) {
+        NMethod nm = (NMethod) cb;
+        if (pc.equals(nm.deoptHandlerBegin())) {
+          if (Assert.ASSERTS_ENABLED) {
+            Assert.that(this.getUnextendedSP() != null, "null SP in Java frame");
+          }
+          // adjust pc if frame is deoptimized.
+          pc = this.getUnextendedSP().getAddressAt(nm.origPCOffset());
+          deoptimized = true;
+        }
+      }
+    }
+  }
+
+  public AARCH64Frame(Address raw_sp, Address raw_fp, Address pc) {
+    this.raw_sp = raw_sp;
+    this.raw_unextendedSP = raw_sp;
+    this.raw_fp = raw_fp;
+    this.pc = pc;
+    adjustUnextendedSP();
+
+    // Frame must be fully constructed before this call
+    adjustForDeopt();
+
+    if (DEBUG) {
+      System.out.println("AARCH64Frame(sp, fp, pc): " + this);
+      dumpStack();
+    }
+  }
+
+  public AARCH64Frame(Address raw_sp, Address raw_fp) {
+    this.raw_sp = raw_sp;
+    this.raw_unextendedSP = raw_sp;
+    this.raw_fp = raw_fp;
+    this.pc = raw_sp.getAddressAt(-1 * VM.getVM().getAddressSize());
+    adjustUnextendedSP();
+
+    // Frame must be fully constructed before this call
+    adjustForDeopt();
+
+    if (DEBUG) {
+      System.out.println("AARCH64Frame(sp, fp): " + this);
+      dumpStack();
+    }
+  }
+
+  public AARCH64Frame(Address raw_sp, Address raw_unextendedSp, Address raw_fp, Address pc) {
+    this.raw_sp = raw_sp;
+    this.raw_unextendedSP = raw_unextendedSp;
+    this.raw_fp = raw_fp;
+    this.pc = pc;
+    adjustUnextendedSP();
+
+    // Frame must be fully constructed before this call
+    adjustForDeopt();
+
+    if (DEBUG) {
+      System.out.println("AARCH64Frame(sp, unextendedSP, fp, pc): " + this);
+      dumpStack();
+    }
+
+  }
+
+  public Object clone() {
+    AARCH64Frame frame = new AARCH64Frame();
+    frame.raw_sp = raw_sp;
+    frame.raw_unextendedSP = raw_unextendedSP;
+    frame.raw_fp = raw_fp;
+    frame.pc = pc;
+    frame.deoptimized = deoptimized;
+    return frame;
+  }
+
+  public boolean equals(Object arg) {
+    if (arg == null) {
+      return false;
+    }
+
+    if (!(arg instanceof AARCH64Frame)) {
+      return false;
+    }
+
+    AARCH64Frame other = (AARCH64Frame) arg;
+
+    return (AddressOps.equal(getSP(), other.getSP()) &&
+            AddressOps.equal(getUnextendedSP(), other.getUnextendedSP()) &&
+            AddressOps.equal(getFP(), other.getFP()) &&
+            AddressOps.equal(getPC(), other.getPC()));
+  }
+
+  public int hashCode() {
+    if (raw_sp == null) {
+      return 0;
+    }
+
+    return raw_sp.hashCode();
+  }
+
+  public String toString() {
+    return "sp: " + (getSP() == null? "null" : getSP().toString()) +
+         ", unextendedSP: " + (getUnextendedSP() == null? "null" : getUnextendedSP().toString()) +
+         ", fp: " + (getFP() == null? "null" : getFP().toString()) +
+         ", pc: " + (pc == null? "null" : pc.toString());
+  }
+
+  // accessors for the instance variables
+  public Address getFP() { return raw_fp; }
+  public Address getSP() { return raw_sp; }
+  public Address getID() { return raw_sp; }
+
+  // FIXME: not implemented yet
+  public boolean isSignalHandlerFrameDbg() { return false; }
+  public int     getSignalNumberDbg()      { return 0;     }
+  public String  getSignalNameDbg()        { return null;  }
+
+  public boolean isInterpretedFrameValid() {
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(isInterpretedFrame(), "Not an interpreted frame");
+    }
+
+    // These are reasonable sanity checks
+    if (getFP() == null || getFP().andWithMask(0x3) != null) {
+      return false;
+    }
+
+    if (getSP() == null || getSP().andWithMask(0x3) != null) {
+      return false;
+    }
+
+    if (getFP().addOffsetTo(INTERPRETER_FRAME_INITIAL_SP_OFFSET * VM.getVM().getAddressSize()).lessThan(getSP())) {
+      return false;
+    }
+
+    // These are hacks to keep us out of trouble.
+    // The problem with these is that they mask other problems
+    if (getFP().lessThanOrEqual(getSP())) {
+      // this attempts to deal with unsigned comparison above
+      return false;
+    }
+
+    if (getFP().minus(getSP()) > 4096 * VM.getVM().getAddressSize()) {
+      // stack frames shouldn't be large.
+      return false;
+    }
+
+    return true;
+  }
+
+  // FIXME: not applicable in current system
+  //  void    patch_pc(Thread* thread, address pc);
+
+  public Frame sender(RegisterMap regMap, CodeBlob cb) {
+    AARCH64RegisterMap map = (AARCH64RegisterMap) regMap;
+
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(map != null, "map must be set");
+    }
+
+    // Default is we don't have to follow them. The sender_for_xxx will
+    // update it accordingly.
+    map.setIncludeArgumentOops(false);
+
+    if (isEntryFrame())       return senderForEntryFrame(map);
+    if (isInterpretedFrame()) return senderForInterpreterFrame(map);
+
+    if(cb == null) {
+      cb = VM.getVM().getCodeCache().findBlob(getPC());
+    } else {
+      if (Assert.ASSERTS_ENABLED) {
+        Assert.that(cb.equals(VM.getVM().getCodeCache().findBlob(getPC())), "Must be the same");
+      }
+    }
+
+    if (cb != null) {
+      return senderForCompiledFrame(map, cb);
+    }
+
+    // Must be native-compiled frame, i.e. the marshaling code for native
+    // methods that exists in the core system.
+    return new AARCH64Frame(getSenderSP(), getLink(), getSenderPC());
+  }
+
+  private Frame senderForEntryFrame(AARCH64RegisterMap map) {
+    if (DEBUG) {
+      System.out.println("senderForEntryFrame");
+    }
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(map != null, "map must be set");
+    }
+    // Java frame called from C; skip all C frames and return top C
+    // frame of that chunk as the sender
+    AARCH64JavaCallWrapper jcw = (AARCH64JavaCallWrapper) getEntryFrameCallWrapper();
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(!entryFrameIsFirst(), "next Java fp must be non zero");
+      Assert.that(jcw.getLastJavaSP().greaterThan(getSP()), "must be above this frame on stack");
+    }
+    AARCH64Frame fr;
+    if (jcw.getLastJavaPC() != null) {
+      fr = new AARCH64Frame(jcw.getLastJavaSP(), jcw.getLastJavaFP(), jcw.getLastJavaPC());
+    } else {
+      fr = new AARCH64Frame(jcw.getLastJavaSP(), jcw.getLastJavaFP());
+    }
+    map.clear();
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(map.getIncludeArgumentOops(), "should be set by clear");
+    }
+    return fr;
+  }
+
+  //------------------------------------------------------------------------------
+  // frame::adjust_unextended_sp
+  private void adjustUnextendedSP() {
+    // If we are returning to a compiled MethodHandle call site, the
+    // saved_fp will in fact be a saved value of the unextended SP.  The
+    // simplest way to tell whether we are returning to such a call site
+    // is as follows:
+
+    CodeBlob cb = cb();
+    NMethod senderNm = (cb == null) ? null : cb.asNMethodOrNull();
+    if (senderNm != null) {
+      // If the sender PC is a deoptimization point, get the original
+      // PC.  For MethodHandle call site the unextended_sp is stored in
+      // saved_fp.
+      if (senderNm.isDeoptMhEntry(getPC())) {
+        // DEBUG_ONLY(verifyDeoptMhOriginalPc(senderNm, getFP()));
+        raw_unextendedSP = getFP();
+      }
+      else if (senderNm.isDeoptEntry(getPC())) {
+        // DEBUG_ONLY(verifyDeoptOriginalPc(senderNm, raw_unextendedSp));
+      }
+      else if (senderNm.isMethodHandleReturn(getPC())) {
+        raw_unextendedSP = getFP();
+      }
+    }
+  }
+
+  private Frame senderForInterpreterFrame(AARCH64RegisterMap map) {
+    if (DEBUG) {
+      System.out.println("senderForInterpreterFrame");
+    }
+    Address unextendedSP = addressOfStackSlot(INTERPRETER_FRAME_SENDER_SP_OFFSET).getAddressAt(0);
+    Address sp = addressOfStackSlot(SENDER_SP_OFFSET);
+    // We do not need to update the callee-save register mapping because above
+    // us is either another interpreter frame or a converter-frame, but never
+    // directly a compiled frame.
+    // 11/24/04 SFG. With the removal of adapter frames this is no longer true.
+    // However C2 no longer uses callee-saved registers for Java calls so there
+    // are no callee registers to find.
+
+    if (map.getUpdateMap())
+      updateMapWithSavedLink(map, addressOfStackSlot(LINK_OFFSET));
+
+    return new AARCH64Frame(sp, unextendedSP, getLink(), getSenderPC());
+  }
+
+  private void updateMapWithSavedLink(RegisterMap map, Address savedFPAddr) {
+    map.setLocation(fp, savedFPAddr);
+  }
+
+  private Frame senderForCompiledFrame(AARCH64RegisterMap map, CodeBlob cb) {
+    if (DEBUG) {
+      System.out.println("senderForCompiledFrame");
+    }
+
+    //
+    // NOTE: some of this code is (unfortunately) duplicated in AARCH64CurrentFrameGuess
+    //
+
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(map != null, "map must be set");
+    }
+
+    // frame owned by optimizing compiler
+    if (Assert.ASSERTS_ENABLED) {
+        Assert.that(cb.getFrameSize() >= 0, "must have non-negative frame size");
+    }
+    Address senderSP = getUnextendedSP().addOffsetTo(cb.getFrameSize());
+
+    // The return address is stored in the word just below the sender SP.
+    Address senderPC = senderSP.getAddressAt(-1 * VM.getVM().getAddressSize());
+
+    // This is the saved value of FP which may or may not really be an FP.
+    // It is only an FP if the sender is an interpreter frame.
+    Address savedFPAddr = senderSP.addOffsetTo(- SENDER_SP_OFFSET * VM.getVM().getAddressSize());
+
+    if (map.getUpdateMap()) {
+      // Tell GC to use argument oopmaps for some runtime stubs that need it.
+      // For C1, the runtime stub might not have oop maps, so set this flag
+      // outside of update_register_map.
+      map.setIncludeArgumentOops(cb.callerMustGCArguments());
+
+      if (cb.getOopMaps() != null) {
+        ImmutableOopMapSet.updateRegisterMap(this, cb, map, true);
+      }
+
+      // Since the prolog saves and restores FP there is no oop map entry for it,
+      // so we must fill in its location as if there were an oop map entry,
+      // because if our caller was compiled code there could be live jvm state in it.
+      updateMapWithSavedLink(map, savedFPAddr);
+    }
+
+    return new AARCH64Frame(senderSP, savedFPAddr.getAddressAt(0), senderPC);
+  }
+
+  protected boolean hasSenderPD() {
+    return true;
+  }
+
+  public long frameSize() {
+    return (getSenderSP().minus(getSP()) / VM.getVM().getAddressSize());
+  }
+
+  public Address getLink() {
+    try {
+      if (DEBUG) {
+        System.out.println("Reading link at " + addressOfStackSlot(LINK_OFFSET)
+                           + " = " + addressOfStackSlot(LINK_OFFSET).getAddressAt(0));
+      }
+      return addressOfStackSlot(LINK_OFFSET).getAddressAt(0);
+    } catch (Exception e) {
+      if (DEBUG)
+        System.out.println("Returning null");
+      return null;
+    }
+  }
+
+  // FIXME: not implementable yet
+  //inline void      frame::set_link(intptr_t* addr)  { *(intptr_t **)addr_at(link_offset) = addr; }
+
+  public Address getUnextendedSP() { return raw_unextendedSP; }
+
+  // Return address:
+  public Address getSenderPCAddr() { return addressOfStackSlot(RETURN_ADDR_OFFSET); }
+  public Address getSenderPC()     { return getSenderPCAddr().getAddressAt(0);      }
+
+  // Return the address of the native parameter at the given zero-based index.
+  public Address getNativeParamAddr(int idx) {
+    return addressOfStackSlot(NATIVE_FRAME_INITIAL_PARAM_OFFSET + idx);
+  }
+
+  public Address getSenderSP()     { return addressOfStackSlot(SENDER_SP_OFFSET); }
+
+  public Address addressOfInterpreterFrameLocals() {
+    return addressOfStackSlot(INTERPRETER_FRAME_LOCALS_OFFSET);
+  }
+
+  private Address addressOfInterpreterFrameBCX() {
+    return addressOfStackSlot(INTERPRETER_FRAME_BCX_OFFSET);
+  }
+
+  public int getInterpreterFrameBCI() {
+    // FIXME: this is not atomic with respect to GC and is unsuitable
+    // for use in a non-debugging, or reflective, system. Need to
+    // figure out how to express this.
+    Address bcp = addressOfInterpreterFrameBCX().getAddressAt(0);
+    Address methodHandle = addressOfInterpreterFrameMethod().getAddressAt(0);
+    Method method = (Method)Metadata.instantiateWrapperFor(methodHandle);
+    return bcpToBci(bcp, method);
+  }
+
+  public Address addressOfInterpreterFrameMDX() {
+    return addressOfStackSlot(INTERPRETER_FRAME_MDX_OFFSET);
+  }
+
+  // FIXME
+  //inline int frame::interpreter_frame_monitor_size() {
+  //  return BasicObjectLock::size();
+  //}
+
+  // expression stack
+  // (the max_stack arguments are used by the GC; see class FrameClosure)
+
+  public Address addressOfInterpreterFrameExpressionStack() {
+    Address monitorEnd = interpreterFrameMonitorEnd().address();
+    return monitorEnd.addOffsetTo(-1 * VM.getVM().getAddressSize());
+  }
+
+  public int getInterpreterFrameExpressionStackDirection() { return -1; }
+
+  // top of expression stack
+  public Address addressOfInterpreterFrameTOS() {
+    return getSP();
+  }
+
+  /** Expression stack from top down */
+  public Address addressOfInterpreterFrameTOSAt(int slot) {
+    return addressOfInterpreterFrameTOS().addOffsetTo(slot * VM.getVM().getAddressSize());
+  }
+
+  public Address getInterpreterFrameSenderSP() {
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(isInterpretedFrame(), "interpreted frame expected");
+    }
+    return addressOfStackSlot(INTERPRETER_FRAME_SENDER_SP_OFFSET).getAddressAt(0);
+  }
+
+  // Monitors
+  public BasicObjectLock interpreterFrameMonitorBegin() {
+    return new BasicObjectLock(addressOfStackSlot(INTERPRETER_FRAME_MONITOR_BLOCK_BOTTOM_OFFSET));
+  }
+
+  public BasicObjectLock interpreterFrameMonitorEnd() {
+    Address result = addressOfStackSlot(INTERPRETER_FRAME_MONITOR_BLOCK_TOP_OFFSET).getAddressAt(0);
+    if (Assert.ASSERTS_ENABLED) {
+      // make sure the pointer points inside the frame
+      Assert.that(AddressOps.gt(getFP(), result), "result must be below the frame pointer");
+      Assert.that(AddressOps.lte(getSP(), result), "result must be at or above the stack pointer");
+    }
+    return new BasicObjectLock(result);
+  }
+
+  public int interpreterFrameMonitorSize() {
+    return BasicObjectLock.size();
+  }
+
+  // Method
+  public Address addressOfInterpreterFrameMethod() {
+    return addressOfStackSlot(INTERPRETER_FRAME_METHOD_OFFSET);
+  }
+
+  // Constant pool cache
+  public Address addressOfInterpreterFrameCPCache() {
+    return addressOfStackSlot(INTERPRETER_FRAME_CACHE_OFFSET);
+  }
+
+  // Entry frames
+  public JavaCallWrapper getEntryFrameCallWrapper() {
+    return new AARCH64JavaCallWrapper(addressOfStackSlot(ENTRY_FRAME_CALL_WRAPPER_OFFSET).getAddressAt(0));
+  }
+
+  protected Address addressOfSavedOopResult() {
+    // offset is 2 for compiler2 and 3 for compiler1
+    return getSP().addOffsetTo((VM.getVM().isClientCompiler() ? 2 : 3) *
+                               VM.getVM().getAddressSize());
+  }
+
+  protected Address addressOfSavedReceiver() {
+    return getSP().addOffsetTo(-4 * VM.getVM().getAddressSize());
+  }
+
+  private void dumpStack() {
+    for (Address addr = getSP().addOffsetTo(-4 * VM.getVM().getAddressSize());
+         AddressOps.lt(addr, getSP());
+         addr = addr.addOffsetTo(VM.getVM().getAddressSize())) {
+      System.out.println(addr + ": " + addr.getAddressAt(0));
+    }
+    System.out.println("-----------------------");
+    for (Address addr = getSP();
+         AddressOps.lte(addr, getSP().addOffsetTo(20 * VM.getVM().getAddressSize()));
+         addr = addr.addOffsetTo(VM.getVM().getAddressSize())) {
+      System.out.println(addr + ": " + addr.getAddressAt(0));
+    }
+  }
+}
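For context, AARCH64Frame is consumed by the SA's generic stack walker rather than used directly. A minimal sketch of such a walk, assuming a JavaThread obtained from the SA's Threads list (the class name StackWalkSketch is illustrative only, not part of this changeset):

import sun.jvm.hotspot.runtime.*;

// Illustrative only: walks a thread's Java frames through the generic SA API,
// which on Linux/AArch64 now resolves to AARCH64Frame underneath.
class StackWalkSketch {
  static void dumpFrames(JavaThread thread) {
    RegisterMap map = thread.newRegisterMap(false); // false: no oop locations needed
    for (Frame f = thread.getCurrentFrameGuess(); f != null; f = f.sender(map)) {
      System.out.println("sp=" + f.getSP() + " fp=" + f.getFP() + " pc=" + f.getPC());
    }
  }
}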
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/aarch64/AARCH64JavaCallWrapper.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.aarch64;
+
+import java.util.*;
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.types.*;
+import sun.jvm.hotspot.runtime.*;
+
+public class AARCH64JavaCallWrapper extends JavaCallWrapper {
+  private static AddressField lastJavaFPField;
+
+  static {
+    VM.registerVMInitializedObserver(new Observer() {
+        public void update(Observable o, Object data) {
+          initialize(VM.getVM().getTypeDataBase());
+        }
+      });
+  }
+
+  private static synchronized void initialize(TypeDataBase db) {
+    Type type = db.lookupType("JavaFrameAnchor");
+
+    lastJavaFPField  = type.getAddressField("_last_Java_fp");
+  }
+
+  public AARCH64JavaCallWrapper(Address addr) {
+    super(addr);
+  }
+
+  public Address getLastJavaFP() {
+    return lastJavaFPField.getValue(addr.addOffsetTo(anchorField.getOffset()));
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/aarch64/AARCH64RegisterMap.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.runtime.*;
+
+public class AARCH64RegisterMap extends RegisterMap {
+
+  /** This is the only public constructor */
+  public AARCH64RegisterMap(JavaThread thread, boolean updateMap) {
+    super(thread, updateMap);
+  }
+
+  protected AARCH64RegisterMap(RegisterMap map) {
+    super(map);
+  }
+
+  public Object clone() {
+    AARCH64RegisterMap retval = new AARCH64RegisterMap(this);
+    return retval;
+  }
+
+  // no PD state to clear or copy:
+  protected void clearPD() {}
+  protected void initializePD() {}
+  protected void initializeFromPD(RegisterMap map) {}
+  protected Address getLocationPD(VMReg reg) { return null; }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/linux_aarch64/LinuxAARCH64JavaThreadPDAccess.java	Thu Jul 02 16:09:16 2015 -0700
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.linux_aarch64;
+
+import java.io.*;
+import java.util.*;
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.runtime.*;
+import sun.jvm.hotspot.runtime.aarch64.*;
+import sun.jvm.hotspot.types.*;
+import sun.jvm.hotspot.utilities.*;
+
+public class LinuxAARCH64JavaThreadPDAccess implements JavaThreadPDAccess {
+  private static AddressField  lastJavaFPField;
+  private static AddressField  osThreadField;
+
+  // Field from OSThread
+  private static CIntegerField osThreadThreadIDField;
+
+  // This is currently unneeded but is being kept in case we change
+  // the currentFrameGuess algorithm
+  private static final long GUESS_SCAN_RANGE = 128 * 1024;
+
+  static {
+    VM.registerVMInitializedObserver(new Observer() {
+        public void update(Observable o, Object data) {
+          initialize(VM.getVM().getTypeDataBase());
+        }
+      });
+  }
+
+  private static synchronized void initialize(TypeDataBase db) {
+    Type type = db.lookupType("JavaThread");
+    osThreadField           = type.getAddressField("_osthread");
+
+    Type anchorType = db.lookupType("JavaFrameAnchor");
+    lastJavaFPField         = anchorType.getAddressField("_last_Java_fp");
+
+    Type osThreadType = db.lookupType("OSThread");
+    osThreadThreadIDField   = osThreadType.getCIntegerField("_thread_id");
+  }
+
+  public Address getLastJavaFP(Address addr) {
+    return lastJavaFPField.getValue(addr.addOffsetTo(sun.jvm.hotspot.runtime.JavaThread.getAnchorField().getOffset()));
+  }
+
+  public Address getLastJavaPC(Address addr) {
+    return null;
+  }
+
+  public Address getBaseOfStackPointer(Address addr) {
+    return null;
+  }
+
+  public Frame getLastFramePD(JavaThread thread, Address addr) {
+    Address fp = thread.getLastJavaFP();
+    if (fp == null) {
+      return null; // no information
+    }
+    return new AARCH64Frame(thread.getLastJavaSP(), fp);
+  }
+
+  public RegisterMap newRegisterMap(JavaThread thread, boolean updateMap) {
+    return new AARCH64RegisterMap(thread, updateMap);
+  }
+
+  public Frame getCurrentFrameGuess(JavaThread thread, Address addr) {
+    ThreadProxy t = getThreadProxy(addr);
+    AARCH64ThreadContext context = (AARCH64ThreadContext) t.getContext();
+    AARCH64CurrentFrameGuess guesser = new AARCH64CurrentFrameGuess(context, thread);
+    if (!guesser.run(GUESS_SCAN_RANGE)) {
+      return null;
+    }
+    if (guesser.getPC() == null) {
+      return new AARCH64Frame(guesser.getSP(), guesser.getFP());
+    } else {
+      return new AARCH64Frame(guesser.getSP(), guesser.getFP(), guesser.getPC());
+    }
+  }
+
+  public void printThreadIDOn(Address addr, PrintStream tty) {
+    tty.print(getThreadProxy(addr));
+  }
+
+  public void printInfoOn(Address threadAddr, PrintStream tty) {
+    tty.print("Thread id: ");
+    printThreadIDOn(threadAddr, tty);
+//    tty.println("\nPostJavaState: " + getPostJavaState(threadAddr));
+  }
+
+  public Address getLastSP(Address addr) {
+    ThreadProxy t = getThreadProxy(addr);
+    AARCH64ThreadContext context = (AARCH64ThreadContext) t.getContext();
+    return context.getRegisterAsAddress(AARCH64ThreadContext.SP);
+  }
+
+  public ThreadProxy getThreadProxy(Address addr) {
+    // Addr is the address of the JavaThread.
+    // Fetch the OSThread (for now and for simplicity, not making a
+    // separate "OSThread" class in this package)
+    Address osThreadAddr = osThreadField.getValue(addr);
+    // Get the address of the _thread_id from the OSThread
+    Address threadIdAddr = osThreadAddr.addOffsetTo(osThreadThreadIDField.getOffset());
+
+    JVMDebugger debugger = VM.getVM().getDebugger();
+    return debugger.getThreadForIdentifierAddress(threadIdAddr);
+  }
+}
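LinuxAARCH64JavaThreadPDAccess is picked up by the SA's platform dispatch in sun.jvm.hotspot.runtime.Threads. The following is only an illustrative reduction of that wiring, not the actual selection code:

import sun.jvm.hotspot.runtime.*;
import sun.jvm.hotspot.runtime.linux_aarch64.*;
import sun.jvm.hotspot.utilities.*;

// Sketch of the os/cpu dispatch; the real logic covers every supported pair.
class PDAccessSelectionSketch {
  static JavaThreadPDAccess select() throws UnsupportedPlatformException {
    String os  = PlatformInfo.getOS();    // e.g. "linux"
    String cpu = PlatformInfo.getCPU();   // e.g. "aarch64" (see the PlatformInfo change below)
    if (os.equals("linux") && cpu.equals("aarch64")) {
      return new LinuxAARCH64JavaThreadPDAccess();
    }
    return null;                          // other platforms elided in this sketch
  }
}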
--- a/agent/src/share/classes/sun/jvm/hotspot/utilities/AltPlatformInfo.java	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/utilities/AltPlatformInfo.java	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -25,7 +25,10 @@
 package sun.jvm.hotspot.utilities;
 
 public interface AltPlatformInfo {
+
   // Additional cpu types can be tested via this interface
+  public boolean knownCPU(String cpu);
 
-  public boolean knownCPU(String cpu);
-}
\ No newline at end of file
+  // Mangle a cpu name if necessary
+  public String getCPU(String cpu);
+}
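An implementation of the extended interface must now supply both methods. A hypothetical example is shown below; the class name PlatformInfoClosed is the one PlatformInfo.getCPU() looks up reflectively in the next hunk, while the CPU names used here are made up:

package sun.jvm.hotspot.utilities;

// Hypothetical alternate-platform implementation, for illustration only.
public class PlatformInfoClosed implements AltPlatformInfo {
  public boolean knownCPU(String cpu) {
    return "examplecpu".equals(cpu);      // assumed extra CPU name
  }
  public String getCPU(String cpu) {
    return "examplecpu64";                // mangle the name as the port requires
  }
}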
--- a/agent/src/share/classes/sun/jvm/hotspot/utilities/PlatformInfo.java	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/utilities/PlatformInfo.java	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -52,27 +52,54 @@
     }
   }
 
-  /* Returns "sparc" for SPARC based platforms and "x86" for x86 based
-     platforms. Otherwise returns the value of os.arch.  If the value
-     is not recognized as supported, an exception is thrown instead. */
+  public static boolean knownCPU(String cpu) {
+    final String[] KNOWN =
+        new String[] {"i386", "x86", "x86_64", "amd64", "sparc", "sparcv9", "ppc64", "aarch64"};
+
+    for(String s : KNOWN) {
+      if(s.equals(cpu))
+        return true;
+    }
+
+    return false;
+  }
+
+  /* Returns "sparc" for SPARC based platforms, "x86" for 32-bit x86 based
+     platforms and "amd64" for 64-bit x86 based platforms. Otherwise
+     returns the value of os.arch. If the value is not recognized as supported,
+     an exception is thrown instead. */
+
   public static String getCPU() throws UnsupportedPlatformException {
     String cpu = System.getProperty("os.arch");
-    if (cpu.equals("i386") || cpu.equals("x86")) {
+
+    // Let any additional CPU mangling fire first
+    try {
+      Class pic = Class.forName("sun.jvm.hotspot.utilities.PlatformInfoClosed");
+      AltPlatformInfo api = (AltPlatformInfo) pic.newInstance();
+      if (api.knownCPU(cpu)) {
+        return api.getCPU(cpu);
+      }
+    } catch (Exception e) {
+       // Ignored
+    }
+
+    // Check that CPU is supported
+    if (!knownCPU(cpu)) {
+       throw new UnsupportedPlatformException("CPU type " + cpu + " not yet supported");
+    }
+
+    // Tweaks
+    if (cpu.equals("i386"))
       return "x86";
-    } else if (cpu.equals("sparc") || cpu.equals("sparcv9")) {
+
+    if (cpu.equals("sparcv9"))
       return "sparc";
-    } else if (cpu.equals("ia64") || cpu.equals("amd64") || cpu.equals("x86_64") || cpu.equals("ppc64") || cpu.equals("aarch64")) {
-      return cpu;
-    } else {
-      try {
-        Class pic = Class.forName("sun.jvm.hotspot.utilities.PlatformInfoClosed");
-        AltPlatformInfo api = (AltPlatformInfo)pic.newInstance();
-        if (api.knownCPU(cpu)) {
-          return cpu;
-        }
-      } catch (Exception e) {}
-      throw new UnsupportedPlatformException("CPU type " + cpu + " not yet supported");
-    }
+
+    if (cpu.equals("x86_64"))
+      return "amd64";
+
+    return cpu;
+
   }
 
   // this main is invoked from Makefile to make platform specific agent Makefile(s).
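After this rewrite, getCPU() first defers to an AltPlatformInfo if one is present, then rejects unknown values, then canonicalizes a few names. A sketch of the resulting mapping and a typical call site (illustrative only; CpuNameSketch is not part of this changeset):

import sun.jvm.hotspot.utilities.*;

// Expected canonicalization (from the code above): i386 -> x86, sparcv9 -> sparc,
// x86_64 -> amd64; other known names (amd64, ppc64, aarch64, ...) pass through,
// and anything else throws UnsupportedPlatformException.
class CpuNameSketch {
  static String pdPackageSuffix() throws UnsupportedPlatformException {
    String os  = PlatformInfo.getOS();
    String cpu = PlatformInfo.getCPU();   // e.g. "aarch64" on an AArch64 JVM
    return os + "_" + cpu;                // matches e.g. the runtime.linux_aarch64 package above
  }
}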
--- a/agent/src/share/classes/sun/jvm/hotspot/utilities/PointerLocation.java	Thu Jul 02 08:53:58 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/utilities/PointerLocation.java	Thu Jul 02 16:09:16 2015 -0700
@@ -84,11 +84,11 @@
   }
 
   public boolean isInNewGen() {
-    return ((gen != null) && (gen.level() == 0));
+    return ((gen != null) && (gen == ((GenCollectedHeap)heap).getGen(0)));
   }
 
   public boolean isInOldGen() {
-    return ((gen != null) && (gen.level() == 1));
+    return ((gen != null) && (gen == ((GenCollectedHeap)heap).getGen(1)));
   }
 
   public boolean inOtherGen() {
@@ -207,8 +207,6 @@
           tty.print("In new generation ");
         } else if (isInOldGen()) {
           tty.print("In old generation ");
-        } else if (gen != null) {
-          tty.print("In Generation " + getGeneration().level());
         } else {
           tty.print("In unknown section of Java heap");
         }
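The generation test no longer relies on Generation.level(); it compares the pointer's generation against the heap's generations directly. A minimal sketch of the same check written outside this class, assuming a two-generation GenCollectedHeap:

import sun.jvm.hotspot.memory.*;
import sun.jvm.hotspot.runtime.VM;

// Illustrative only; mirrors the new isInNewGen()/isInOldGen() logic.
class GenerationCheckSketch {
  static boolean isInNewGen(Generation gen) {
    GenCollectedHeap gch = (GenCollectedHeap) VM.getVM().getUniverse().heap();
    return (gen != null) && (gen == gch.getGen(0));   // generation 0 is the young gen
  }
  static boolean isInOldGen(Generation gen) {
    GenCollectedHeap gch = (GenCollectedHeap) VM.getVM().getUniverse().heap();
    return (gen != null) && (gen == gch.getGen(1));   // generation 1 is the old gen
  }
}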
--- a/make/bsd/makefiles/dtrace.make	Thu Jul 02 08:53:58 2015 -0700
+++ b/make/bsd/makefiles/dtrace.make	Thu Jul 02 16:09:16 2015 -0700
@@ -263,14 +263,19 @@
 $(DtraceOutDir):
 	mkdir $(DtraceOutDir)
 
+# When building using a devkit, dtrace cannot find the correct preprocessor so
+# we run it explicitly before running dtrace.
 $(DtraceOutDir)/hotspot.h: $(DTRACE_COMMON_SRCDIR)/hotspot.d | $(DtraceOutDir)
-	$(QUIETLY) $(DTRACE_PROG) $(DTRACE_OPTS) -C -I. -h -o $@ -s $(DTRACE_COMMON_SRCDIR)/hotspot.d
+	$(QUIETLY) $(CC) -E $(DTRACE_OPTS) -I. -x c $(DTRACE_COMMON_SRCDIR)/hotspot.d > $(DtraceOutDir)/hotspot.d
+	$(QUIETLY) $(DTRACE_PROG) -h -o $@ -s $(DtraceOutDir)/hotspot.d
 
 $(DtraceOutDir)/hotspot_jni.h: $(DTRACE_COMMON_SRCDIR)/hotspot_jni.d | $(DtraceOutDir)
-	$(QUIETLY) $(DTRACE_PROG) $(DTRACE_OPTS) -C -I. -h -o $@ -s $(DTRACE_COMMON_SRCDIR)/hotspot_jni.d
+	$(QUIETLY) $(CC) -E $(DTRACE_OPTS) -I. -x c $(DTRACE_COMMON_SRCDIR)/hotspot_jni.d > $(DtraceOutDir)/hotspot_jni.d
+	$(QUIETLY) $(DTRACE_PROG) -h -o $@ -s $(DtraceOutDir)/hotspot_jni.d
 
 $(DtraceOutDir)/hs_private.h: $(DTRACE_COMMON_SRCDIR)/hs_private.d | $(DtraceOutDir)
-	$(QUIETLY) $(DTRACE_PROG) $(DTRACE_OPTS) -C -I. -h -o $@ -s $(DTRACE_COMMON_SRCDIR)/hs_private.d
+	$(QUIETLY) $(CC) -E $(DTRACE_OPTS) -I. -x c $(DTRACE_COMMON_SRCDIR)/hs_private.d > $(DtraceOutDir)/hs_private.d
+	$(QUIETLY) $(DTRACE_PROG) -h -o $@ -s $(DtraceOutDir)/hs_private.d
 
 dtrace_gen_headers: $(DtraceOutDir)/hotspot.h $(DtraceOutDir)/hotspot_jni.h $(DtraceOutDir)/hs_private.h 
 
--- a/make/bsd/makefiles/universal.gmk	Thu Jul 02 08:53:58 2015 -0700
+++ b/make/bsd/makefiles/universal.gmk	Thu Jul 02 16:09:16 2015 -0700
@@ -56,13 +56,14 @@
 universalize: $(UNIVERSAL_LIPO_LIST) $(UNIVERSAL_COPY_LIST)
 	$(RM) -r $(EXPORT_PATH)/lib/{i386,amd64}
 
+LIPO ?= lipo
 
 # Package built libraries in a universal binary
 $(UNIVERSAL_LIPO_LIST):
 	BUILT_LIPO_FILES="`find $(EXPORT_LIB_DIR)/{i386,amd64}/$(subst $(EXPORT_LIB_DIR)/,,$@) 2>/dev/null`" || test $$? = "1"; \
 	if [ -n "$${BUILT_LIPO_FILES}" ]; then \
 	  $(MKDIR) -p $(shell dirname $@); \
-	  lipo -create -output $@ $${BUILT_LIPO_FILES}; \
+	  $(LIPO) -create -output $@ $${BUILT_LIPO_FILES}; \
 	fi
 
 
--- a/make/sa.files	Thu Jul 02 08:53:58 2015 -0700
+++ b/make/sa.files	Thu Jul 02 16:09:16 2015 -0700
@@ -44,6 +44,7 @@
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/compiler/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/amd64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/bsd/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/bsd/amd64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/bsd/x86/*.java \
@@ -55,6 +56,7 @@
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/amd64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/ia64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/ppc64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/x86/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/posix/*.java \
@@ -63,6 +65,7 @@
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/amd64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/ppc64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/x86/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/*.java \
@@ -70,6 +73,7 @@
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/ppc64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/x86/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/win32/coff/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/windbg/*.java \
@@ -92,11 +96,13 @@
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/prims/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/amd64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/bsd/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/bsd_amd64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/bsd_x86/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_amd64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_x86/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_ppc64/*.java \
--- a/src/cpu/aarch64/vm/frame_aarch64.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/aarch64/vm/frame_aarch64.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -526,16 +526,6 @@
   return frame(sender_sp(), link(), sender_pc());
 }
 
-bool frame::interpreter_frame_equals_unpacked_fp(intptr_t* fp) {
-  assert(is_interpreted_frame(), "must be interpreter frame");
-  Method* method = interpreter_frame_method();
-  // When unpacking an optimized frame the frame pointer is
-  // adjusted with:
-  int diff = (method->max_locals() - method->size_of_parameters()) *
-             Interpreter::stackElementWords;
-  return _fp == (fp - diff);
-}
-
 bool frame::is_interpreted_frame_valid(JavaThread* thread) const {
 // QQQ
 #ifdef CC_INTERP
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/aarch64/vm/globals_aarch64.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -84,7 +84,7 @@
 
 #ifdef BUILTIN_SIM
 #define UseBuiltinSim           true
-#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
                                                                         \
   product(bool, NotifySimulator, UseBuiltinSim,                         \
          "tell the AArch64 sim where we are in method code")            \
@@ -112,7 +112,7 @@
 #define NotifySimulator         false
 #define UseSimulatorCache       false
 #define DisableBCCheck          true
-#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
                                                                         \
   product(bool, NearCpool, true,                                        \
          "constant pool is close to instructions")                      \
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -2934,41 +2934,40 @@
   cmp(src1, rscratch1);
 }
 
+void MacroAssembler::store_check(Register obj, Address dst) {
+  store_check(obj);
+}
+
 void MacroAssembler::store_check(Register obj) {
   // Does a store check for the oop in register obj. The content of
   // register obj is destroyed afterwards.
-  store_check_part_1(obj);
-  store_check_part_2(obj);
-}
-
-void MacroAssembler::store_check(Register obj, Address dst) {
-  store_check(obj);
-}
-
-
-// split the store check operation so that other instructions can be scheduled inbetween
-void MacroAssembler::store_check_part_1(Register obj) {
+
   BarrierSet* bs = Universe::heap()->barrier_set();
   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
+
+  CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
+  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
   lsr(obj, obj, CardTableModRefBS::card_shift);
-}
-
-void MacroAssembler::store_check_part_2(Register obj) {
-  BarrierSet* bs = Universe::heap()->barrier_set();
-  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
-  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
-  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
-
-  // The calculation for byte_map_base is as follows:
-  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
-  // So this essentially converts an address to a displacement and
-  // it will never need to be relocated.
-
-  // FIXME: It's not likely that disp will fit into an offset so we
-  // don't bother to check, but it could save an instruction.
-  intptr_t disp = (intptr_t) ct->byte_map_base;
-  mov(rscratch1, disp);
-  strb(zr, Address(obj, rscratch1));
+
+  assert(CardTableModRefBS::dirty_card_val() == 0, "must be");
+
+  {
+    ExternalAddress cardtable((address) ct->byte_map_base);
+    unsigned long offset;
+    adrp(rscratch1, cardtable, offset);
+    assert(offset == 0, "byte_map_base is misaligned");
+  }
+
+  if (UseCondCardMark) {
+    Label L_already_dirty;
+    ldrb(rscratch2,  Address(obj, rscratch1));
+    cbz(rscratch2, L_already_dirty);
+    strb(zr, Address(obj, rscratch1));
+    bind(L_already_dirty);
+  } else {
+    strb(zr, Address(obj, rscratch1));
+  }
 }
 
 void MacroAssembler::load_klass(Register dst, Register src) {
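The consolidated store_check now emits a single card-mark sequence, optionally guarded when UseCondCardMark is set. Schematically, the emitted code behaves like the following Java-flavoured sketch (the byte[] card table stands in for byte_map_base, and DIRTY mirrors dirty_card_val() == 0 as asserted above; this is not the assembler itself):

// Sketch of the card-marking logic emitted by the rewritten store_check.
class CardMarkSketch {
  static final byte DIRTY = 0;

  static void storeCheck(byte[] cardTable, long objAddr, int cardShift,
                         boolean useCondCardMark) {
    int card = (int) (objAddr >>> cardShift);   // index derived from obj >> card_shift
    if (useCondCardMark) {
      if (cardTable[card] != DIRTY) {           // skip the store if already dirty
        cardTable[card] = DIRTY;
      }
    } else {
      cardTable[card] = DIRTY;
    }
  }
}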
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -719,10 +719,6 @@
 
 #endif // INCLUDE_ALL_GCS
 
-  // split store_check(Register obj) to enhance instruction interleaving
-  void store_check_part_1(Register obj);
-  void store_check_part_2(Register obj);
-
   // oop manipulations
   void load_klass(Register dst, Register src);
   void store_klass(Register dst, Register src);
--- a/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -2120,6 +2120,7 @@
       save_native_result(masm, ret_type, stack_slots);
     }
 
+    __ mov(c_rarg2, rthread);
     __ lea(c_rarg1, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
     __ mov(c_rarg0, obj_reg);
 
@@ -2128,7 +2129,7 @@
     __ ldr(r19, Address(rthread, in_bytes(Thread::pending_exception_offset())));
     __ str(zr, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 
-    rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), 2, 0, 1);
+    rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), 3, 0, 1);
 
 #ifdef ASSERT
     {
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -190,6 +190,11 @@
     }
   }
 
+  if (UseGHASHIntrinsics) {
+    warning("GHASH intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  }
+
   if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
     UseCRC32Intrinsics = true;
   }
--- a/src/cpu/ppc/vm/globals_ppc.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/ppc/vm/globals_ppc.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -63,7 +63,7 @@
 define_pd_global(uintx, TypeProfileLevel, 111);
 
 // Platform dependent flag handling: flags only defined on this platform.
-#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct)  \
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint)  \
                                                                             \
   /* Load poll address from thread. This is used to implement per-thread */ \
   /* safepoints on platforms != IA64. */                                    \
--- a/src/cpu/ppc/vm/sharedRuntime_ppc.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/ppc/vm/sharedRuntime_ppc.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -2475,7 +2475,8 @@
 
     // Slow case of monitor enter.
     // Inline a special case of call_VM that disallows any pending_exception.
-    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), r_oop, r_box);
+    // Arguments are (oop obj, BasicLock* lock, JavaThread* thread).
+    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), r_oop, r_box, R16_thread);
 
     __ asm_assert_mem8_is_zero(thread_(pending_exception),
        "no pending exception allowed on exit from SharedRuntime::complete_monitor_unlocking_C", 0);
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -176,6 +176,11 @@
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
 
+  if (UseGHASHIntrinsics) {
+    warning("GHASH intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  }
+
   if (UseSHA) {
     warning("SHA instructions are not available on this CPU");
     FLAG_SET_DEFAULT(UseSHA, false);
@@ -505,7 +510,8 @@
 
 void VM_Version::determine_features() {
 #if defined(ABI_ELFv2)
-  const int code_size = (num_features+1+2*7)*BytesPerInstWord; // TODO(asmundak): calculation is incorrect.
+  // 1 InstWord per call for the blr instruction.
+  const int code_size = (num_features+1+2*1)*BytesPerInstWord;
 #else
   // 7 InstWords for each call (function descriptor + blr instruction).
   const int code_size = (num_features+1+2*7)*BytesPerInstWord;
@@ -540,7 +546,8 @@
   a->popcntw(R7, R5);                          // code[6]  -> popcntw
   a->fcfids(F3, F4);                           // code[7]  -> fcfids
   a->vand(VR0, VR0, VR0);                      // code[8]  -> vand
-  a->lqarx_unchecked(R7, R3_ARG1, R4_ARG2, 1); // code[9]  -> lqarx_m
+  // arg0 of lqarx must be an even register, (arg1 + arg2) must be a multiple of 16
+  a->lqarx_unchecked(R6, R3_ARG1, R4_ARG2, 1); // code[9]  -> lqarx_m
   a->vcipher(VR0, VR1, VR2);                   // code[10] -> vcipher
   a->vpmsumb(VR0, VR1, VR2);                   // code[11] -> vpmsumb
   a->tcheck(0);                                // code[12] -> tcheck
@@ -572,7 +579,8 @@
 
   // Execute code. Illegal instructions will be replaced by 0 in the signal handler.
   VM_Version::_is_determine_features_test_running = true;
-  (*test)((address)mid_of_test_area, (uint64_t)0);
+  // We must align the first argument to 16 bytes because of the lqarx check.
+  (*test)((address)align_size_up((intptr_t)mid_of_test_area, 16), (uint64_t)0);
   VM_Version::_is_determine_features_test_running = false;
 
   // determine which instructions are legal.
@@ -614,12 +622,12 @@
   MacroAssembler* a = new MacroAssembler(&cb);
 
   // Emit code.
-  uint64_t (*get_dscr)() = (uint64_t(*)())(void *)a->emit_fd();
+  uint64_t (*get_dscr)() = (uint64_t(*)())(void *)a->function_entry();
   uint32_t *code = (uint32_t *)a->pc();
   a->mfdscr(R3);
   a->blr();
 
-  void (*set_dscr)(long) = (void(*)(long))(void *)a->emit_fd();
+  void (*set_dscr)(long) = (void(*)(long))(void *)a->function_entry();
   a->mtdscr(R3);
   a->blr();
 
--- a/src/cpu/sparc/vm/assembler_sparc.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/sparc/vm/assembler_sparc.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -129,6 +129,7 @@
     flog3_op3    = 0x36,
     edge_op3     = 0x36,
     fsrc_op3     = 0x36,
+    xmulx_op3    = 0x36,
     impdep2_op3  = 0x37,
     stpartialf_op3 = 0x37,
     jmpl_op3     = 0x38,
@@ -220,6 +221,8 @@
     mdtox_opf          = 0x110,
     mstouw_opf         = 0x111,
     mstosw_opf         = 0x113,
+    xmulx_opf          = 0x115,
+    xmulxhi_opf        = 0x116,
     mxtod_opf          = 0x118,
     mwtos_opf          = 0x119,
 
@@ -1212,6 +1215,9 @@
   void movwtos( Register s, FloatRegister d ) { vis3_only();  emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(mftoi_op3) | opf(mwtos_opf) | rs2(s)); }
   void movxtod( Register s, FloatRegister d ) { vis3_only();  emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(mftoi_op3) | opf(mxtod_opf) | rs2(s)); }
 
+  void xmulx(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulx_opf) | rs2(s2)); }
+  void xmulxhi(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2)); }
+
   // Crypto SHA instructions
 
   void sha1()   { sha1_only();    emit_int32( op(arith_op) | op3(sha_op3) | opf(sha1_opf)); }
--- a/src/cpu/sparc/vm/frame_sparc.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/sparc/vm/frame_sparc.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -599,12 +599,6 @@
   return next_younger_sp_or_null(valid_sp, sp) != NULL;
 }
 
-
-bool frame::interpreter_frame_equals_unpacked_fp(intptr_t* fp) {
-  assert(is_interpreted_frame(), "must be interpreter frame");
-  return this->fp() == fp;
-}
-
 bool frame::is_interpreted_frame_valid(JavaThread* thread) const {
 #ifdef CC_INTERP
   // Is there anything to do?
--- a/src/cpu/sparc/vm/globals_sparc.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/sparc/vm/globals_sparc.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -81,7 +81,7 @@
 
 define_pd_global(uintx, TypeProfileLevel, 111);
 
-#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
                                                                             \
   product(intx, UseVIS, 99,                                                 \
           "Highest supported VIS instructions set on Sparc")                \
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -4786,6 +4786,130 @@
     return start;
   }
 
+  /* Single and multi-block ghash operations */
+  address generate_ghash_processBlocks() {
+      __ align(CodeEntryAlignment);
+      Label L_ghash_loop, L_aligned, L_main;
+      StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+      address start = __ pc();
+
+      Register state = I0;
+      Register subkeyH = I1;
+      Register data = I2;
+      Register len = I3;
+
+      __ save_frame(0);
+
+      __ ldx(state, 0, O0);
+      __ ldx(state, 8, O1);
+
+      // Loop label for multiblock operations
+      __ BIND(L_ghash_loop);
+
+      // Check if 'data' is unaligned
+      __ andcc(data, 7, G1);
+      __ br(Assembler::zero, false, Assembler::pt, L_aligned);
+      __ delayed()->nop();
+
+      Register left_shift = L1;
+      Register right_shift = L2;
+      Register data_ptr = L3;
+
+      // Get left and right shift values in bits
+      __ sll(G1, LogBitsPerByte, left_shift);
+      __ mov(64, right_shift);
+      __ sub(right_shift, left_shift, right_shift);
+
+      // Align to read 'data'
+      __ sub(data, G1, data_ptr);
+
+      // Load first 8 bytes of 'data'
+      __ ldx(data_ptr, 0, O4);
+      __ sllx(O4, left_shift, O4);
+      __ ldx(data_ptr, 8, O5);
+      __ srlx(O5, right_shift, G4);
+      __ bset(G4, O4);
+
+      // Load second 8 bytes of 'data'
+      __ sllx(O5, left_shift, O5);
+      __ ldx(data_ptr, 16, G4);
+      __ srlx(G4, right_shift, G4);
+      __ ba(L_main);
+      __ delayed()->bset(G4, O5);
+
+      // If 'data' is aligned, load normally
+      __ BIND(L_aligned);
+      __ ldx(data, 0, O4);
+      __ ldx(data, 8, O5);
+
+      __ BIND(L_main);
+      __ ldx(subkeyH, 0, O2);
+      __ ldx(subkeyH, 8, O3);
+
+      __ xor3(O0, O4, O0);
+      __ xor3(O1, O5, O1);
+
+      __ xmulxhi(O0, O3, G3);
+      __ xmulx(O0, O2, O5);
+      __ xmulxhi(O1, O2, G4);
+      __ xmulxhi(O1, O3, G5);
+      __ xmulx(O0, O3, G1);
+      __ xmulx(O1, O3, G2);
+      __ xmulx(O1, O2, O3);
+      __ xmulxhi(O0, O2, O4);
+
+      __ mov(0xE1, O0);
+      __ sllx(O0, 56, O0);
+
+      __ xor3(O5, G3, O5);
+      __ xor3(O5, G4, O5);
+      __ xor3(G5, G1, G1);
+      __ xor3(G1, O3, G1);
+      __ srlx(G2, 63, O1);
+      __ srlx(G1, 63, G3);
+      __ sllx(G2, 63, O3);
+      __ sllx(G2, 58, O2);
+      __ xor3(O3, O2, O2);
+
+      __ sllx(G1, 1, G1);
+      __ or3(G1, O1, G1);
+
+      __ xor3(G1, O2, G1);
+
+      __ sllx(G2, 1, G2);
+
+      __ xmulxhi(G1, O0, O1);
+      __ xmulx(G1, O0, O2);
+      __ xmulxhi(G2, O0, O3);
+      __ xmulx(G2, O0, G1);
+
+      __ xor3(O4, O1, O4);
+      __ xor3(O5, O2, O5);
+      __ xor3(O5, O3, O5);
+
+      __ sllx(O4, 1, O2);
+      __ srlx(O5, 63, O3);
+
+      __ or3(O2, O3, O0);
+
+      __ sllx(O5, 1, O1);
+      __ srlx(G1, 63, O2);
+      __ or3(O1, O2, O1);
+      __ xor3(O1, G3, O1);
+
+      __ deccc(len);
+      __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
+      __ delayed()->add(data, 16, data);
+
+      __ stx(O0, I0, 0);
+      __ stx(O1, I0, 8);
+
+      __ ret();
+      __ delayed()->restore();
+
+      return start;
+  }
+
   void generate_initial() {
     // Generates all stubs and initializes the entry points
 
@@ -4859,6 +4983,10 @@
       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
     }
+    // generate GHASH intrinsics code
+    if (UseGHASHIntrinsics) {
+      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+    }
 
     // generate SHA1/SHA256/SHA512 intrinsics code
     if (UseSHA1Intrinsics) {
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -300,6 +300,17 @@
     }
   }
 
+  // GHASH/GCM intrinsics
+  if (has_vis3() && (UseVIS > 2)) {
+    if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+      UseGHASHIntrinsics = true;
+    }
+  } else if (UseGHASHIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+      warning("GHASH intrinsics require VIS3 instruction support. Intrinsics will be disabled");
+    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  }
+
   // SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times
   if (has_sha1() || has_sha256() || has_sha512()) {
     if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions
--- a/src/cpu/x86/vm/assembler_x86.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1347,7 +1347,7 @@
 
 void Assembler::andnl(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(dst, src1, src2, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(dst, src1, src2, false);
   emit_int8((unsigned char)0xF2);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1355,7 +1355,7 @@
 void Assembler::andnl(Register dst, Register src1, Address src2) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(dst, src1, src2, false);
+  vex_prefix_0F38_legacy(dst, src1, src2, false);
   emit_int8((unsigned char)0xF2);
   emit_operand(dst, src2);
 }
@@ -1382,7 +1382,7 @@
 
 void Assembler::blsil(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(rbx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rbx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1390,14 +1390,14 @@
 void Assembler::blsil(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(rbx, dst, src, false);
+  vex_prefix_0F38_legacy(rbx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_operand(rbx, src);
 }
 
 void Assembler::blsmskl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(rdx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rdx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1412,7 +1412,7 @@
 
 void Assembler::blsrl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(rcx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rcx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1420,7 +1420,7 @@
 void Assembler::blsrl(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(rcx, dst, src, false);
+  vex_prefix_0F38_legacy(rcx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_operand(rcx, src);
 }
@@ -3095,26 +3095,35 @@
 void Assembler::psrldq(XMMRegister dst, int shift) {
   // Shift 128 bit value in xmm register by number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F,
-                                      false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift);
 }
 
+void Assembler::pslldq(XMMRegister dst, int shift) {
+  // Shift left 128 bit value in xmm register by number of bytes.
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  emit_int8(0x73);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(shift);
+}
+
 void Assembler::ptest(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false,
+              VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
 
 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                      false, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
+                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3126,7 +3135,7 @@
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len, true, false);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
@@ -3135,7 +3144,7 @@
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
   int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38);
+                                     vector_len, VEX_OPCODE_0F_38, true, false);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3146,12 +3155,12 @@
   if (VM_Version::supports_evex()) {
     tuple_type = EVEX_FVM;
   }
-  emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
 }
 
 void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
 }
 
 void Assembler::punpckldq(XMMRegister dst, Address src) {
@@ -4979,7 +4988,51 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-// duplicate 4-bytes integer data from src into 8 locations in dest
+// duplicate 1-byte integer data from src into 16|32|64 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+                                     vector_len, VEX_OPCODE_0F_38, false);
+  emit_int8(0x78);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastb(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_8bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  emit_int8(0x78);
+  emit_operand(dst, src);
+}
+
+// duplicate 2-byte integer data from src into 8|16|32 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+                                     vector_len, VEX_OPCODE_0F_38, false);
+  emit_int8(0x79);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastw(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_16bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  emit_int8(0x79);
+  emit_operand(dst, src);
+}
+
+// duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
@@ -4988,6 +5041,121 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::evpbroadcastd(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_32bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  emit_int8(0x58);
+  emit_operand(dst, src);
+}
+
+// duplicate 8-byte integer data from src into 2|4|8 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  emit_int8(0x59);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastq(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_64bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  emit_int8(0x59);
+  emit_operand(dst, src);
+}
+
+// duplicate single precision fp from src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  emit_int8(0x18);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastss(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_32bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  emit_int8(0x18);
+  emit_operand(dst, src);
+}
+
+// duplicate double precision fp from src into 2|4|8 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  emit_int8(0x19);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastsd(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_64bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  emit_int8(0x19);
+  emit_operand(dst, src);
+}
+
+// duplicate 1-byte integer data from src into 16|32|64 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  emit_int8(0x7A);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// duplicate 2-byte integer data from src into 8|16|32 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  emit_int8(0x7B);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastd(XMMRegister dst, Register src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  emit_int8(0x7C);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// duplicate 8-byte integer data from src into 2|4|8 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastq(XMMRegister dst, Register src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  emit_int8(0x7C);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
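A quick usage sketch for the new register-source broadcast forms (editor's illustration, not part of the changeset; Assembler::AVX_512bit is assumed to be the 512-bit vector-length constant used elsewhere in this series):

    // Fill a 512-bit vector with a scalar held in a general-purpose register.
    // Byte/word forms need AVX512BW+VL, dword/qword forms need AVX512VL.
    __ evpbroadcastb(xmm0, rax, Assembler::AVX_512bit);  // 64 copies of al
    __ evpbroadcastd(xmm1, rbx, Assembler::AVX_512bit);  // 16 copies of ebx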
 // Carry-Less Multiplication Quadword
 void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
   assert(VM_Version::supports_clmul(), "");
@@ -5598,7 +5766,7 @@
 
 void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre,
                            VexOpcode opc, bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg) {
-  bool vex_r = (xreg_enc >= 8);
+  bool vex_r = ((xreg_enc & 8) == 8) ? 1 : 0;
   bool vex_b = adr.base_needs_rex();
   bool vex_x = adr.index_needs_rex();
   avx_vector_len = vector_len;
@@ -5626,8 +5794,8 @@
 
 int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
                                      bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg ) {
-  bool vex_r = (dst_enc >= 8);
-  bool vex_b = (src_enc >= 8);
+  bool vex_r = ((dst_enc & 8) == 8) ? 1 : 0;
+  bool vex_b = ((src_enc & 8) == 8) ? 1 : 0;
   bool vex_x = false;
   avx_vector_len = vector_len;
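With EVEX the XMM encodings run 0..31, so only bit 3 of an encoding belongs in the VEX.R/VEX.B fields; bit 4 is carried by the EVEX R'/X' bits instead. A minimal sketch of the split (editor's illustration, not literal VM code):

    // enc in [0, 31]
    bool vex_r        = (enc & 0x08) != 0;  // bit 3 -> VEX/EVEX R (stored inverted in the prefix)
    bool evex_r_prime = (enc & 0x10) != 0;  // bit 4 -> EVEX R', emitted only on the EVEX path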
 
@@ -6272,19 +6440,15 @@
 
 void Assembler::andnq(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_q(dst, src1, src2);
+  int encode = vex_prefix_0F38_and_encode_q_legacy(dst, src1, src2);
   emit_int8((unsigned char)0xF2);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::andnq(Register dst, Register src1, Address src2) {
-  if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-  }
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_q(dst, src1, src2);
+  vex_prefix_0F38_q_legacy(dst, src1, src2);
   emit_int8((unsigned char)0xF2);
   emit_operand(dst, src2);
 }
@@ -6311,7 +6475,7 @@
 
 void Assembler::blsiq(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_q(rbx, dst, src);
+  int encode = vex_prefix_0F38_and_encode_q_legacy(rbx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6319,14 +6483,14 @@
 void Assembler::blsiq(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_q(rbx, dst, src);
+  vex_prefix_0F38_q_legacy(rbx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rbx, src);
 }
 
 void Assembler::blsmskq(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_q(rdx, dst, src);
+  int encode = vex_prefix_0F38_and_encode_q_legacy(rdx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6334,14 +6498,14 @@
 void Assembler::blsmskq(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_q(rdx, dst, src);
+  vex_prefix_0F38_q_legacy(rdx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rdx, src);
 }
 
 void Assembler::blsrq(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_q(rcx, dst, src);
+  int encode = vex_prefix_0F38_and_encode_q_legacy(rcx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6349,7 +6513,7 @@
 void Assembler::blsrq(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_q(rcx, dst, src);
+  vex_prefix_0F38_q_legacy(rcx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rcx, src);
 }
--- a/src/cpu/x86/vm/assembler_x86.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -661,6 +661,14 @@
                vector_len, no_mask_reg);
   }
 
+  void vex_prefix_0F38_legacy(Register dst, Register nds, Address src, bool no_mask_reg = false) {
+    bool vex_w = false;
+    int vector_len = AVX_128bit;
+    vex_prefix(src, nds->encoding(), dst->encoding(),
+               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
+               vector_len, true, no_mask_reg);
+  }
+
   void vex_prefix_0F38_q(Register dst, Register nds, Address src, bool no_mask_reg = false) {
     bool vex_w = true;
     int vector_len = AVX_128bit;
@@ -668,6 +676,15 @@
                VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
                vector_len, no_mask_reg);
   }
+
+  void vex_prefix_0F38_q_legacy(Register dst, Register nds, Address src, bool no_mask_reg = false) {
+    bool vex_w = true;
+    int vector_len = AVX_128bit;
+    vex_prefix(src, nds->encoding(), dst->encoding(),
+               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
+               vector_len, true, no_mask_reg);
+  }
+
   int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
                              VexSimdPrefix pre, VexOpcode opc,
                              bool vex_w, int vector_len,
@@ -680,6 +697,15 @@
                                  VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
                                  false, no_mask_reg);
   }
+
+  int  vex_prefix_0F38_and_encode_legacy(Register dst, Register nds, Register src, bool no_mask_reg = false) {
+    bool vex_w = false;
+    int vector_len = AVX_128bit;
+    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
+      VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
+      true, no_mask_reg);
+  }
+
   int  vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src, bool no_mask_reg = false) {
     bool vex_w = true;
     int vector_len = AVX_128bit;
@@ -687,6 +713,15 @@
                                  VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
                                  false, no_mask_reg);
   }
+
+  int  vex_prefix_0F38_and_encode_q_legacy(Register dst, Register nds, Register src, bool no_mask_reg = false) {
+    bool vex_w = true;
+    int vector_len = AVX_128bit;
+    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
+                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
+                                 true, no_mask_reg);
+  }
+
   int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
                              VexSimdPrefix pre, int vector_len = AVX_128bit,
                              VexOpcode opc = VEX_OPCODE_0F, bool legacy_mode = false,
@@ -1666,6 +1701,8 @@
 
   // Shift Right by bytes Logical DoubleQuadword Immediate
   void psrldq(XMMRegister dst, int shift);
+  // Shift Left by bytes Logical DoubleQuadword Immediate
+  void pslldq(XMMRegister dst, int shift);
 
   // Logical Compare 128bit
   void ptest(XMMRegister dst, XMMRegister src);
@@ -2024,8 +2061,25 @@
   // duplicate 4-bytes integer data from src into 8 locations in dest
   void vpbroadcastd(XMMRegister dst, XMMRegister src);
 
-  // duplicate 4-bytes integer data from src into vector_len locations in dest
+  // duplicate n-byte integer data from src into vector_len locations in dest
+  void evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastb(XMMRegister dst, Address src, int vector_len);
+  void evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastw(XMMRegister dst, Address src, int vector_len);
   void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastd(XMMRegister dst, Address src, int vector_len);
+  void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastq(XMMRegister dst, Address src, int vector_len);
+
+  void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastss(XMMRegister dst, Address src, int vector_len);
+  void evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastsd(XMMRegister dst, Address src, int vector_len);
+
+  void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
+  void evpbroadcastw(XMMRegister dst, Register src, int vector_len);
+  void evpbroadcastd(XMMRegister dst, Register src, int vector_len);
+  void evpbroadcastq(XMMRegister dst, Register src, int vector_len);
 
   // Carry-Less Multiplication Quadword
   void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
--- a/src/cpu/x86/vm/c2_init_x86.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/c2_init_x86.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -58,4 +58,6 @@
       OptoReg::invalidate(i);
     }
   }
+
+  SuperWordLoopUnrollAnalysis = true;
 }
--- a/src/cpu/x86/vm/frame_x86.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/frame_x86.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -524,17 +524,6 @@
   return frame(sender_sp(), link(), sender_pc());
 }
 
-
-bool frame::interpreter_frame_equals_unpacked_fp(intptr_t* fp) {
-  assert(is_interpreted_frame(), "must be interpreter frame");
-  Method* method = interpreter_frame_method();
-  // When unpacking an optimized frame the frame pointer is
-  // adjusted with:
-  int diff = (method->max_locals() - method->size_of_parameters()) *
-             Interpreter::stackElementWords;
-  return _fp == (fp - diff);
-}
-
 bool frame::is_interpreted_frame_valid(JavaThread* thread) const {
 // QQQ
 #ifdef CC_INTERP
--- a/src/cpu/x86/vm/globals_x86.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/globals_x86.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -84,7 +84,7 @@
 
 define_pd_global(bool, PreserveFramePointer, false);
 
-#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
                                                                             \
   develop(bool, IEEEPrecision, true,                                        \
           "Enables IEEE precision (for INTEL only)")                        \
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -4260,31 +4260,24 @@
 //////////////////////////////////////////////////////////////////////////////////
 
 
+void MacroAssembler::store_check(Register obj, Address dst) {
+  store_check(obj);
+}
+
 void MacroAssembler::store_check(Register obj) {
   // Does a store check for the oop in register obj. The content of
   // register obj is destroyed afterwards.
-  store_check_part_1(obj);
-  store_check_part_2(obj);
-}
-
-void MacroAssembler::store_check(Register obj, Address dst) {
-  store_check(obj);
-}
-
-
-// split the store check operation so that other instructions can be scheduled inbetween
-void MacroAssembler::store_check_part_1(Register obj) {
+
   BarrierSet* bs = Universe::heap()->barrier_set();
   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
-  shrptr(obj, CardTableModRefBS::card_shift);
-}
-
-void MacroAssembler::store_check_part_2(Register obj) {
-  BarrierSet* bs = Universe::heap()->barrier_set();
-  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
+
   CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 
+  shrptr(obj, CardTableModRefBS::card_shift);
+
+  Address card_addr;
+
   // The calculation for byte_map_base is as follows:
   // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
   // So this essentially converts an address to a displacement and it will
@@ -4292,8 +4285,7 @@
   // large for a 32bit displacement.
   intptr_t disp = (intptr_t) ct->byte_map_base;
   if (is_simm32(disp)) {
-    Address cardtable(noreg, obj, Address::times_1, disp);
-    movb(cardtable, 0);
+    card_addr = Address(noreg, obj, Address::times_1, disp);
   } else {
     // By doing it as an ExternalAddress 'disp' could be converted to a rip-relative
     // displacement and done in a single instruction given favorable mapping and a
@@ -4301,7 +4293,21 @@
     // entry and that entry is not properly handled by the relocation code.
     AddressLiteral cardtable((address)ct->byte_map_base, relocInfo::none);
     Address index(noreg, obj, Address::times_1);
-    movb(as_Address(ArrayAddress(cardtable, index)), 0);
+    card_addr = as_Address(ArrayAddress(cardtable, index));
+  }
+
+  int dirty = CardTableModRefBS::dirty_card_val();
+  if (UseCondCardMark) {
+    Label L_already_dirty;
+    if (UseConcMarkSweepGC) {
+      membar(Assembler::StoreLoad);
+    }
+    cmpb(card_addr, dirty);
+    jcc(Assembler::equal, L_already_dirty);
+    movb(card_addr, dirty);
+    bind(L_already_dirty);
+  } else {
+    movb(card_addr, dirty);
   }
 }
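The emitted sequence above is equivalent to the following pseudo-C (editor's sketch of the logic, not literal VM code; StoreLoad_barrier() stands in for membar(Assembler::StoreLoad)):

    jbyte* card = ct->byte_map_base + ((uintptr_t)obj >> CardTableModRefBS::card_shift);
    int dirty = CardTableModRefBS::dirty_card_val();
    if (UseCondCardMark) {
      if (UseConcMarkSweepGC) StoreLoad_barrier();  // order the oop store before the card read
      if (*card != dirty) *card = dirty;            // skip the write if the card is already dirty
    } else {
      *card = dirty;                                // unconditional card dirtying
    }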
 
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -315,10 +315,6 @@
 
 #endif // INCLUDE_ALL_GCS
 
-  // split store_check(Register obj) to enhance instruction interleaving
-  void store_check_part_1(Register obj);
-  void store_check_part_2(Register obj);
-
   // C 'boolean' to Java boolean: x == 0 ? 0 : 1
   void c2bool(Register x);
 
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -365,22 +365,22 @@
     map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
     map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
     if (UseAVX > 2) {
-      map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg());
-      map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg()->next());
+      map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg()->next());
     }
   }
 
@@ -466,7 +466,7 @@
       __ vinsertf64x4h(xmm29, Address(rsp, 928));
       __ vinsertf64x4h(xmm30, Address(rsp, 960));
       __ vinsertf64x4h(xmm31, Address(rsp, 992));
-      __ subptr(rsp, 1024);
+      __ addptr(rsp, 1024);
     }
   }
 #else
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -2727,6 +2727,167 @@
     return start;
   }
 
+  // byte swap x86 long
+  address generate_ghash_long_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+    address start = __ pc();
+    __ emit_data(0x0b0a0908, relocInfo::none, 0);
+    __ emit_data(0x0f0e0d0c, relocInfo::none, 0);
+    __ emit_data(0x03020100, relocInfo::none, 0);
+    __ emit_data(0x07060504, relocInfo::none, 0);
+
+    return start;
+  }
+
+  // byte swap x86 byte array
+  address generate_ghash_byte_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+    address start = __ pc();
+    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
+    __ emit_data(0x08090a0b, relocInfo::none, 0);
+    __ emit_data(0x04050607, relocInfo::none, 0);
+    __ emit_data(0x00010203, relocInfo::none, 0);
+    return start;
+  }
+
+  /* Single and multi-block ghash operations */
+  address generate_ghash_processBlocks() {
+    assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
+    __ align(CodeEntryAlignment);
+    Label L_ghash_loop, L_exit;
+    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+    address start = __ pc();
+
+    const Register state        = rdi;
+    const Register subkeyH      = rsi;
+    const Register data         = rdx;
+    const Register blocks       = rcx;
+
+    const Address  state_param(rbp, 8+0);
+    const Address  subkeyH_param(rbp, 8+4);
+    const Address  data_param(rbp, 8+8);
+    const Address  blocks_param(rbp, 8+12);
+
+    const XMMRegister xmm_temp0 = xmm0;
+    const XMMRegister xmm_temp1 = xmm1;
+    const XMMRegister xmm_temp2 = xmm2;
+    const XMMRegister xmm_temp3 = xmm3;
+    const XMMRegister xmm_temp4 = xmm4;
+    const XMMRegister xmm_temp5 = xmm5;
+    const XMMRegister xmm_temp6 = xmm6;
+    const XMMRegister xmm_temp7 = xmm7;
+
+    __ enter();
+
+    __ movptr(state, state_param);
+    __ movptr(subkeyH, subkeyH_param);
+    __ movptr(data, data_param);
+    __ movptr(blocks, blocks_param);
+
+    __ movdqu(xmm_temp0, Address(state, 0));
+    __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+    __ movdqu(xmm_temp1, Address(subkeyH, 0));
+    __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+    __ BIND(L_ghash_loop);
+    __ movdqu(xmm_temp2, Address(data, 0));
+    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+    __ pxor(xmm_temp0, xmm_temp2);
+
+    //
+    // Multiply with the hash key
+    //
+    __ movdqu(xmm_temp3, xmm_temp0);
+    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
+    __ movdqu(xmm_temp4, xmm_temp0);
+    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
+
+    __ movdqu(xmm_temp5, xmm_temp0);
+    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
+    __ movdqu(xmm_temp6, xmm_temp0);
+    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
+
+    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
+
+    __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
+    __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
+    __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
+    __ pxor(xmm_temp3, xmm_temp5);
+    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
+                                        // of the carry-less multiplication of
+                                        // xmm0 by xmm1.
+
+    // We shift the result of the multiplication by one bit position
+    // to the left to cope with the fact that the bits are reversed.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp4, xmm_temp6);
+    __ pslld(xmm_temp3, 1);
+    __ pslld(xmm_temp6, 1);
+    __ psrld(xmm_temp7, 31);
+    __ psrld(xmm_temp4, 31);
+    __ movdqu(xmm_temp5, xmm_temp7);
+    __ pslldq(xmm_temp4, 4);
+    __ pslldq(xmm_temp7, 4);
+    __ psrldq(xmm_temp5, 12);
+    __ por(xmm_temp3, xmm_temp7);
+    __ por(xmm_temp6, xmm_temp4);
+    __ por(xmm_temp6, xmm_temp5);
+
+    //
+    // First phase of the reduction
+    //
+    // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
+    // independently.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp4, xmm_temp3);
+    __ movdqu(xmm_temp5, xmm_temp3);
+    __ pslld(xmm_temp7, 31);    // packed left shift by 31
+    __ pslld(xmm_temp4, 30);    // packed left shift by 30
+    __ pslld(xmm_temp5, 25);    // packed left shift by 25
+    __ pxor(xmm_temp7, xmm_temp4);      // xor the shifted versions
+    __ pxor(xmm_temp7, xmm_temp5);
+    __ movdqu(xmm_temp4, xmm_temp7);
+    __ pslldq(xmm_temp7, 12);
+    __ psrldq(xmm_temp4, 4);
+    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
+
+    //
+    // Second phase of the reduction
+    //
+    // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
+    // shift operations.
+    __ movdqu(xmm_temp2, xmm_temp3);
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp5, xmm_temp3);
+    __ psrld(xmm_temp2, 1);     // packed right shift by 1
+    __ psrld(xmm_temp7, 2);     // packed right shift by 2
+    __ psrld(xmm_temp5, 7);     // packed right shift by 7
+    __ pxor(xmm_temp2, xmm_temp7);      // xor the shifted versions
+    __ pxor(xmm_temp2, xmm_temp5);
+    __ pxor(xmm_temp2, xmm_temp4);
+    __ pxor(xmm_temp3, xmm_temp2);
+    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
+
+    __ decrement(blocks);
+    __ jcc(Assembler::zero, L_exit);
+    __ movdqu(xmm_temp0, xmm_temp6);
+    __ addptr(data, 16);
+    __ jmp(L_ghash_loop);
+
+    __ BIND(L_exit);
+    // Byte swap 16-byte result
+    __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+    __ movdqu(Address(state, 0), xmm_temp6);   // store the result
+
+    __ leave();
+    __ ret(0);
+    return start;
+  }
+
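The four PCLMULQDQ results in the loop above combine exactly as a 128x128-bit schoolbook carry-less multiply built from 64-bit halves (editor's sketch; '^' is GF(2) addition, i.e. XOR):

    lo  = clmul64(a0, b0);                    // xmm3
    mid = clmul64(a0, b1) ^ clmul64(a1, b0);  // xmm4 after the pxor
    hi  = clmul64(a1, b1);                    // xmm6
    product256 = (hi << 128) ^ (mid << 64) ^ lo;  // then reduced mod the GHASH polynomial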
   /**
    *  Arguments:
    *
@@ -3026,6 +3187,13 @@
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
     }
 
+    // Generate GHASH intrinsics code
+    if (UseGHASHIntrinsics) {
+      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+    }
+
     // Safefetch stubs.
     generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                                                    &StubRoutines::_safefetch32_fault_pc,
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -382,7 +382,12 @@
 
     // restore regs belonging to calling function
 #ifdef _WIN64
-    for (int i = 15; i >= 6; i--) {
+    int xmm_ub = 15;
+    if (UseAVX > 2) {
+      xmm_ub = 31;
+    }
+    // emit the restores for xmm regs
+    for (int i = 6; i <= xmm_ub; i++) {
       __ movdqu(as_XMMRegister(i), xmm_save(i));
     }
 #endif
@@ -3681,6 +3686,175 @@
     return start;
   }
 
+
+  // byte swap x86 long
+  address generate_ghash_long_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+    address start = __ pc();
+    __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
+    __ emit_data64(0x0706050403020100, relocInfo::none );
+    return start;
+  }
+
+  // byte swap x86 byte array
+  address generate_ghash_byte_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+    address start = __ pc();
+    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
+    __ emit_data64(0x0001020304050607, relocInfo::none );
+    return start;
+  }
+
+  /* Single and multi-block ghash operations */
+  address generate_ghash_processBlocks() {
+    __ align(CodeEntryAlignment);
+    Label L_ghash_loop, L_exit;
+    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+    address start = __ pc();
+
+    const Register state        = c_rarg0;
+    const Register subkeyH      = c_rarg1;
+    const Register data         = c_rarg2;
+    const Register blocks       = c_rarg3;
+
+#ifdef _WIN64
+    const int XMM_REG_LAST  = 10;
+#endif
+
+    const XMMRegister xmm_temp0 = xmm0;
+    const XMMRegister xmm_temp1 = xmm1;
+    const XMMRegister xmm_temp2 = xmm2;
+    const XMMRegister xmm_temp3 = xmm3;
+    const XMMRegister xmm_temp4 = xmm4;
+    const XMMRegister xmm_temp5 = xmm5;
+    const XMMRegister xmm_temp6 = xmm6;
+    const XMMRegister xmm_temp7 = xmm7;
+    const XMMRegister xmm_temp8 = xmm8;
+    const XMMRegister xmm_temp9 = xmm9;
+    const XMMRegister xmm_temp10 = xmm10;
+
+    __ enter();
+
+#ifdef _WIN64
+    // save the xmm registers which must be preserved 6-10
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
+    for (int i = 6; i <= XMM_REG_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+#endif
+
+    __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+    __ movdqu(xmm_temp0, Address(state, 0));
+    __ pshufb(xmm_temp0, xmm_temp10);
+
+
+    __ BIND(L_ghash_loop);
+    __ movdqu(xmm_temp2, Address(data, 0));
+    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+    __ movdqu(xmm_temp1, Address(subkeyH, 0));
+    __ pshufb(xmm_temp1, xmm_temp10);
+
+    __ pxor(xmm_temp0, xmm_temp2);
+
+    //
+    // Multiply with the hash key
+    //
+    __ movdqu(xmm_temp3, xmm_temp0);
+    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
+    __ movdqu(xmm_temp4, xmm_temp0);
+    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
+
+    __ movdqu(xmm_temp5, xmm_temp0);
+    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
+    __ movdqu(xmm_temp6, xmm_temp0);
+    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
+
+    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
+
+    __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
+    __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
+    __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
+    __ pxor(xmm_temp3, xmm_temp5);
+    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
+                                        // of the carry-less multiplication of
+                                        // xmm0 by xmm1.
+
+    // We shift the result of the multiplication by one bit position
+    // to the left to cope with the fact that the bits are reversed.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp8, xmm_temp6);
+    __ pslld(xmm_temp3, 1);
+    __ pslld(xmm_temp6, 1);
+    __ psrld(xmm_temp7, 31);
+    __ psrld(xmm_temp8, 31);
+    __ movdqu(xmm_temp9, xmm_temp7);
+    __ pslldq(xmm_temp8, 4);
+    __ pslldq(xmm_temp7, 4);
+    __ psrldq(xmm_temp9, 12);
+    __ por(xmm_temp3, xmm_temp7);
+    __ por(xmm_temp6, xmm_temp8);
+    __ por(xmm_temp6, xmm_temp9);
+
+    //
+    // First phase of the reduction
+    //
+    // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
+    // independently.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp8, xmm_temp3);
+    __ movdqu(xmm_temp9, xmm_temp3);
+    __ pslld(xmm_temp7, 31);    // packed left shift by 31
+    __ pslld(xmm_temp8, 30);    // packed left shift by 30
+    __ pslld(xmm_temp9, 25);    // packed left shift by 25
+    __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
+    __ pxor(xmm_temp7, xmm_temp9);
+    __ movdqu(xmm_temp8, xmm_temp7);
+    __ pslldq(xmm_temp7, 12);
+    __ psrldq(xmm_temp8, 4);
+    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
+
+    //
+    // Second phase of the reduction
+    //
+    // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
+    // shift operations.
+    __ movdqu(xmm_temp2, xmm_temp3);
+    __ movdqu(xmm_temp4, xmm_temp3);
+    __ movdqu(xmm_temp5, xmm_temp3);
+    __ psrld(xmm_temp2, 1);     // packed right shift by 1
+    __ psrld(xmm_temp4, 2);     // packed right shift by 2
+    __ psrld(xmm_temp5, 7);     // packed right shift by 7
+    __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
+    __ pxor(xmm_temp2, xmm_temp5);
+    __ pxor(xmm_temp2, xmm_temp8);
+    __ pxor(xmm_temp3, xmm_temp2);
+    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
+
+    __ decrement(blocks);
+    __ jcc(Assembler::zero, L_exit);
+    __ movdqu(xmm_temp0, xmm_temp6);
+    __ addptr(data, 16);
+    __ jmp(L_ghash_loop);
+
+    __ BIND(L_exit);
+    __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
+    __ movdqu(Address(state, 0), xmm_temp6);   // store the result
+
+#ifdef _WIN64
+    // restore xmm regs belonging to calling function
+    for (int i = 6; i <= XMM_REG_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+#endif
+    __ leave();
+    __ ret(0);
+    return start;
+  }
+
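The two reduction phases fold the 256-bit product back into 128 bits modulo the GHASH field polynomial; the shift counts above (31/30/25 and 1/2/7) follow from the identity below, applied within 32-bit lanes after first propagating the bits that cross lane boundaries (editor's sketch):

    p(x) = x^128 + x^7 + x^2 + x + 1        // GHASH field polynomial
    // x^128 == x^7 + x^2 + x + 1 (mod p), so with product = hi:lo
    result ~= lo ^ hi ^ (hi << 1) ^ (hi << 2) ^ (hi << 7)   // over GF(2)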
   /**
    *  Arguments:
    *
@@ -4120,6 +4294,13 @@
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
     }
 
+    // Generate GHASH intrinsics code
+    if (UseGHASHIntrinsics) {
+      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+    }
+
     // Safefetch stubs.
     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                        &StubRoutines::_safefetch32_fault_pc,
--- a/src/cpu/x86/vm/stubRoutines_x86.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/stubRoutines_x86.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -33,6 +33,8 @@
 
 address StubRoutines::x86::_verify_mxcsr_entry = NULL;
 address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
+address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
+address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
 
 uint64_t StubRoutines::x86::_crc_by128_masks[] =
 {
--- a/src/cpu/x86/vm/stubRoutines_x86.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/stubRoutines_x86.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -36,10 +36,15 @@
   // masks and table for CRC32
   static uint64_t _crc_by128_masks[];
   static juint    _crc_table[];
+  // swap mask for ghash
+  static address _ghash_long_swap_mask_addr;
+  static address _ghash_byte_swap_mask_addr;
 
  public:
   static address verify_mxcsr_entry()    { return _verify_mxcsr_entry; }
   static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
   static address crc_by128_masks_addr()  { return (address)_crc_by128_masks; }
+  static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
+  static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
 
 #endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
--- a/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -33,7 +33,7 @@
 
 enum platform_dependent_constants {
   code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 23000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 24000           // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -677,6 +677,17 @@
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
 
+  // GHASH/GCM intrinsics
+  if (UseCLMUL && (UseSSE > 2)) {
+    if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+      UseGHASHIntrinsics = true;
+    }
+  } else if (UseGHASHIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+      warning("GHASH intrinsic requires CLMUL and SSE2 instructions on this CPU");
+    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  }
+
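With this change the intrinsic defaults to on for CLMUL-capable CPUs; like any boolean VM flag it can still be toggled explicitly on the command line (editor's usage note):

    java -XX:+UseGHASHIntrinsics ...   (request it; warns and reverts if CLMUL/SSE support is missing)
    java -XX:-UseGHASHIntrinsics ...   (disable it even when supported)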
   if (UseSHA) {
     warning("SHA instructions are not available on this CPU");
     FLAG_SET_DEFAULT(UseSHA, false);
--- a/src/cpu/x86/vm/vm_version_x86.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/vm_version_x86.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -702,6 +702,7 @@
   static bool supports_avx512cd() { return (_cpuFeatures & CPU_AVX512CD) != 0; }
   static bool supports_avx512bw() { return (_cpuFeatures & CPU_AVX512BW) != 0; }
   static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; }
+  static bool supports_avx512vlbw() { return (supports_avx512bw() && supports_avx512vl()); }
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
                                        extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/src/cpu/x86/vm/x86.ad	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/x86/vm/x86.ad	Thu Jul 02 16:09:16 2015 -0700
@@ -2894,6 +2894,457 @@
   ins_pipe( pipe_slow );
 %}
 
+// ====================LEGACY REPLICATE=======================================
+
+instruct Repl4B_mem(vecS dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "punpcklbw $dst,$mem\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate4B" %}
+  ins_encode %{
+    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B_mem(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "punpcklbw $dst,$mem\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate8B" %}
+  ins_encode %{
+    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\t! replicate16B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "punpcklbw $dst,$mem\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\t! replicate16B" %}
+  ins_encode %{
+    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "punpcklbw $dst,$mem\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
+  ins_encode %{
+    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! lreplicate32B($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4S(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate4S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4S_mem(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\t! replicate8S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "pshuflw $dst,$mem,0x00\n\t"
+            "punpcklqdq $dst,$dst\t! replicate8S" %}
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "pshuflw $dst,$mem,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "pshufd  $dst,$mem,0x00\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
+            "punpcklqdq $dst,$dst" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Long could be loaded into xmm register directly from memory.
+instruct Repl2L_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "movq    $dst,$mem\n\t"
+            "punpcklqdq $dst,$dst\t! replicate2L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate long (8 byte) scalar to be vector
+#ifdef _LP64
+instruct Repl4L(vecY dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  format %{ "movdq   $dst,$src\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movdq($dst$$XMMRegister, $src$$Register);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#else // _LP64
+instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#endif // _LP64
+
+instruct Repl4L_imm(vecY dst, immL con) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress($con));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "movq    $dst,$mem\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl2F_mem(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4F_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F(vecY dst, regF src) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd  $dst,$src,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "pshufd  $dst,$mem,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl2D_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD (LoadD mem)));
+  format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D(vecY dst, regD src) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD src));
+  format %{ "pshufd  $dst,$src,0x44\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD (LoadD mem)));
+  format %{ "pshufd  $dst,$mem,0x44\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// ====================GENERIC REPLICATE==========================================
+
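+// The rules kept in this section are the baseline forms that need no AVX-512 support;
+// the wider replicates that used to live here are handled by the !supports_avx512vl()
+// rules above and by the EVEX broadcast rules further below.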
 // Replicate byte scalar to be vector
 instruct Repl4B(vecS dst, rRegI src) %{
   predicate(n->as_Vector()->length() == 4);
@@ -2923,60 +3374,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl16B(vecX dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateB src));
-  format %{ "movd    $dst,$src\n\t"
-            "punpcklbw $dst,$dst\n\t"
-            "pshuflw $dst,$dst,0x00\n\t"
-            "punpcklqdq $dst,$dst\t! replicate16B" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl32B(vecY dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 32);
-  match(Set dst (ReplicateB src));
-  format %{ "movd    $dst,$src\n\t"
-            "punpcklbw $dst,$dst\n\t"
-            "pshuflw $dst,$dst,0x00\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl64B(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 64);
-  match(Set dst (ReplicateB src));
-  format %{ "movd    $dst,$src\n\t"
-            "punpcklbw $dst,$dst\n\t"
-            "pshuflw $dst,$dst,0x00\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate32B\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate632B" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate byte scalar immediate to be vector by loading from const table.
 instruct Repl4B_imm(vecS dst, immI con) %{
   predicate(n->as_Vector()->length() == 4);
@@ -2998,48 +3395,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl16B_imm(vecX dst, immI con) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateB con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl32B_imm(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 32);
-  match(Set dst (ReplicateB con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lreplicate32B($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl64B_imm(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 64);
-  match(Set dst (ReplicateB con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate32B($con)\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate32B($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate byte scalar zero to be vector
 instruct Repl4B_zero(vecS dst, immI0 zero) %{
   predicate(n->as_Vector()->length() == 4);
@@ -3083,18 +3438,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl64B_zero(vecZ dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 64);
-  match(Set dst (ReplicateB zero));
-  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
-  ins_encode %{
-    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
-    int vector_len = 2;
-    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // Replicate char/short (2 byte) scalar to be vector
 instruct Repl2S(vecS dst, rRegI src) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3108,66 +3451,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl4S(vecD dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateS src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshuflw $dst,$dst,0x00\t! replicate4S" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl8S(vecX dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateS src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshuflw $dst,$dst,0x00\n\t"
-            "punpcklqdq $dst,$dst\t! replicate8S" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl16S(vecY dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateS src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshuflw $dst,$dst,0x00\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl32S(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 32);
-  match(Set dst (ReplicateS src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshuflw $dst,$dst,0x00\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate16S\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
 instruct Repl2S_imm(vecS dst, immI con) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3189,48 +3472,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl8S_imm(vecX dst, immI con) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateS con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl16S_imm(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateS con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl32S_imm(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 32);
-  match(Set dst (ReplicateS con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate16S($con)\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate char/short (2 byte) scalar zero to be vector
 instruct Repl2S_zero(vecS dst, immI0 zero) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3274,18 +3515,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl32S_zero(vecZ dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 32);
-  match(Set dst (ReplicateS zero));
-  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
-  ins_encode %{
-    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
-    int vector_len = 2;
-    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // Replicate integer (4 byte) scalar to be vector
 instruct Repl2I(vecD dst, rRegI src) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3299,101 +3528,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl4I(vecX dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateI src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8I(vecY dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateI src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshufd  $dst,$dst,0x00\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl16I(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateI src));
-  format %{ "movd    $dst,$src\n\t"
-            "pshufd  $dst,$dst,0x00\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
-instruct Repl2I_imm(vecD dst, immI con) %{
-  predicate(n->as_Vector()->length() == 2);
-  match(Set dst (ReplicateI con));
-  format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl4I_imm(vecX dst, immI con) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateI con));
-  format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
-            "punpcklqdq $dst,$dst" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8I_imm(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateI con));
-  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl16I_imm(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateI con));
-  format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Integer could be loaded into xmm register directly from memory.
 instruct Repl2I_mem(vecD dst, memory mem) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3407,46 +3541,15 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl4I_mem(vecX dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateI (LoadI mem)));
-  format %{ "movd    $dst,$mem\n\t"
-            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $mem$$Address);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8I_mem(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateI (LoadI mem)));
-  format %{ "movd    $dst,$mem\n\t"
-            "pshufd  $dst,$dst,0x00\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $mem$$Address);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl16I_mem(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateI (LoadI mem)));
-  format %{ "movd    $dst,$mem\n\t"
-            "pshufd  $dst,$dst,0x00\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $mem$$Address);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
+// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2I_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+  %}
+  ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate integer (4 byte) scalar zero to be vector
@@ -3482,18 +3585,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl16I_zero(vecZ dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateI zero));
-  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
-  ins_encode %{
-    // Use vxorpd since AVX does not have vpxor for 512-bit (AVX2 will have it).
-    int vector_len = 2;
-    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // Replicate long (8 byte) scalar to be vector
 #ifdef _LP64
 instruct Repl2L(vecX dst, rRegL src) %{
@@ -3507,36 +3598,6 @@
   %}
   ins_pipe( pipe_slow );
 %}
-
-instruct Repl4L(vecY dst, rRegL src) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateL src));
-  format %{ "movdq   $dst,$src\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
-  ins_encode %{
-    __ movdq($dst$$XMMRegister, $src$$Register);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L(vecZ dst, rRegL src) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateL src));
-  format %{ "movdq   $dst,$src\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
-  ins_encode %{
-    __ movdq($dst$$XMMRegister, $src$$Register);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
 #else // _LP64
 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3554,45 +3615,6 @@
   %}
   ins_pipe( pipe_slow );
 %}
-
-instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateL src));
-  effect(TEMP dst, USE src, TEMP tmp);
-  format %{ "movdl   $dst,$src.lo\n\t"
-            "movdl   $tmp,$src.hi\n\t"
-            "punpckldq $dst,$tmp\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
-    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L(vecZ dst, eRegL src, regD tmp) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateL src));
-  effect(TEMP dst, USE src, TEMP tmp);
-  format %{ "movdl   $dst,$src.lo\n\t"
-            "movdl   $tmp,$src.hi\n\t"
-            "punpckldq $dst,$tmp\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
-    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
 #endif // _LP64
 
 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
@@ -3608,79 +3630,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl4L_imm(vecY dst, immL con) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateL con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress($con));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L_imm(vecZ dst, immL con) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateL con));
-  format %{ "movq    $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate4L($con)\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress($con));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Long could be loaded into xmm register directly from memory.
-instruct Repl2L_mem(vecX dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 2);
-  match(Set dst (ReplicateL (LoadL mem)));
-  format %{ "movq    $dst,$mem\n\t"
-            "punpcklqdq $dst,$dst\t! replicate2L" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl4L_mem(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateL (LoadL mem)));
-  format %{ "movq    $dst,$mem\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L_mem(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateL (LoadL mem)));
-  format %{ "movq    $dst,$mem\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate long (8 byte) scalar zero to be vector
 instruct Repl2L_zero(vecX dst, immL0 zero) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3704,18 +3653,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl8L_zero(vecZ dst, immL0 zero) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateL zero));
-  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
-  ins_encode %{
-    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
-    int vector_len = 2;
-    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // Replicate float (4 byte) scalar to be vector
 instruct Repl2F(vecD dst, regF src) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3737,32 +3674,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl8F(vecY dst, regF src) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateF src));
-  format %{ "pshufd  $dst,$src,0x00\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl16F(vecZ dst, regF src) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateF src));
-  format %{ "pshufd  $dst,$src,0x00\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! lower replicate8F\n\t"
-            "vinsertf64x4h $dst k0,$dst,$dst\t! lower replicate8F" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate float (4 byte) scalar zero to be vector
 instruct Repl2F_zero(vecD dst, immF0 zero) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3795,17 +3706,6 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl16F_zero(vecZ dst, immF0 zero) %{
-  predicate(n->as_Vector()->length() == 16);
-  match(Set dst (ReplicateF zero));
-  format %{ "vxorps  $dst k0,$dst,$dst\t! replicate16F zero" %}
-  ins_encode %{
-    int vector_len = 2;
-    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // Replicate double (8 bytes) scalar to be vector
 instruct Repl2D(vecX dst, regD src) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3817,32 +3717,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl4D(vecY dst, regD src) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateD src));
-  format %{ "pshufd  $dst,$src,0x44\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8D(vecZ dst, regD src) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateD src));
-  format %{ "pshufd  $dst,$src,0x44\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! lower replicate4D\n\t"
-            "vinsertf64x4h $dst k0,$dst,$dst\t! upper replicate4D" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Replicate double (8 byte) scalar zero to be vector
 instruct Repl2D_zero(vecX dst, immD0 zero) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3865,8 +3739,636 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl8D_zero(vecZ dst, immD0 zero) %{
-  predicate(n->as_Vector()->length() == 8);
+// ====================EVEX REPLICATE=============================================
+
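+// These rules use the EVEX-encoded broadcast instructions (evpbroadcastb/w/d/q and
+// vbroadcastss/sd).  The 128- and 256-bit forms require VM_Version::supports_avx512vl()
+// (plus avx512vlbw() for byte/short elements); the 512-bit vecZ forms only need UseAVX > 2.
+// vector_len selects the encoded width: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.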
+instruct Repl4B_mem_evex(vecS dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B_mem_evex(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_evex(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB src));
+  format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_mem_evex(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_evex(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB src));
+  format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_mem_evex(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_evex(vecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+  match(Set dst (ReplicateB src));
+  format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_imm_evex(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastb $dst,$dst\t! replicate16B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_imm_evex(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastb $dst,$dst\t! replicate32B" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_imm_evex(vecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+  match(Set dst (ReplicateB con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+  match(Set dst (ReplicateB zero));
+  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S_evex(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "vpbroadcastw $dst,$src\t! replicate4S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4S_mem_evex(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_evex(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_mem_evex(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_evex(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_mem_evex(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_evex(vecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  match(Set dst (ReplicateS src));
+  format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_imm_evex(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastw $dst,$dst\t! replicate8S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_imm_evex(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastw $dst,$dst\t! replicate16S" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_imm_evex(vecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  match(Set dst (ReplicateS con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastw $dst,$dst\t! replicate32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  match(Set dst (ReplicateS zero));
+  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I_evex(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI src));
+  format %{ "vpbroadcastd  $dst,$src\t! replicate4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_mem_evex(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_evex(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI src));
+  format %{ "vpbroadcastd  $dst,$src\t! replicate8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_mem_evex(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_evex(vecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateI src));
+  format %{ "vpbroadcastd  $dst,$src\t! replicate16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_imm_evex(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+            "vpbroadcastd  $dst,$dst\t! replicate4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_imm_evex(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+            "vpbroadcastd  $dst,$dst\t! replicate8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_imm_evex(vecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
+            "vpbroadcastd  $dst,$dst\t! replicate16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateI zero));
+  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate long (8 byte) scalar to be vector
+#ifdef _LP64
+instruct Repl4L_evex(vecY dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  format %{ "vpbroadcastq  $dst,$src\t! replicate4L" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_evex(vecZ dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateL src));
+  format %{ "vpbroadcastq  $dst,$src\t! replicate8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#else // _LP64
+instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "vpbroadcastq  $dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "vpbroadcastq  $dst,$dst\t! replicate8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#endif // _LP64
+
+instruct Repl4L_imm_evex(vecY dst, immL con) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastq  $dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ movq($dst$$XMMRegister, $constantaddress($con));
+    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_imm_evex(vecZ dst, immL con) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateL con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "vpbroadcastq  $dst,$dst\t! replicate8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ movq($dst$$XMMRegister, $constantaddress($con));
+    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl2L_mem_evex(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_mem_evex(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateL zero));
+  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8F_evex(vecY dst, regF src) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF src));
+  format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F_mem_evex(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_evex(vecZ dst, regF src) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateF src));
+  format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "vxorps  $dst k0,$dst,$dst\t! replicate16F zero" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4D_evex(vecY dst, regD src) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD src));
+  format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D_mem_evex(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD (LoadD mem)));
+  format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_evex(vecZ dst, regD src) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateD src));
+  format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateD (LoadD mem)));
+  format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
   match(Set dst (ReplicateD zero));
   format %{ "vxorpd  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
   ins_encode %{
@@ -4972,6 +5474,17 @@
   ins_pipe( pipe_slow );
 %}
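+// Memory-operand forms: with AVX the second vector input can be taken straight from
+// memory, so the LoadVector is folded into the arithmetic instruction instead of
+// requiring a separate load into a register.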
 
+instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVB src (LoadVector mem)));
+  format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vadd8B(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length() == 8);
   match(Set dst (AddVB dst src));
@@ -4993,6 +5506,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVB src (LoadVector mem)));
+  format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vadd16B(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 16);
   match(Set dst (AddVB dst src));
@@ -5091,6 +5615,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVS src (LoadVector mem)));
+  format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vadd4S(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (AddVS dst src));
@@ -5112,6 +5647,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVS src (LoadVector mem)));
+  format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vadd8S(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 8);
   match(Set dst (AddVS dst src));
@@ -5210,6 +5756,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVI src (LoadVector mem)));
+  format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vadd4I(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (AddVI dst src));
@@ -5385,6 +5942,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVF src (LoadVector mem)));
+  format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vadd4F(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (AddVF dst src));
@@ -5562,6 +6130,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVB src (LoadVector mem)));
+  format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vsub8B(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length() == 8);
   match(Set dst (SubVB dst src));
@@ -5583,6 +6162,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVB src (LoadVector mem)));
+  format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vsub16B(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 16);
   match(Set dst (SubVB dst src));
@@ -5681,6 +6271,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVS src (LoadVector mem)));
+  format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vsub4S(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (SubVS dst src));
@@ -5702,6 +6303,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVS src (LoadVector mem)));
+  format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vsub8S(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 8);
   match(Set dst (SubVS dst src));
@@ -5800,6 +6412,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVI src (LoadVector mem)));
+  format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vsub4I(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (SubVI dst src));
@@ -5975,6 +6598,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVF src (LoadVector mem)));
+  format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vsub4F(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (SubVF dst src));
@@ -6152,6 +6786,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVS src (LoadVector mem)));
+  format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vmul4S(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (MulVS dst src));
@@ -6173,6 +6818,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVS src (LoadVector mem)));
+  format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vmul8S(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 8);
   match(Set dst (MulVS dst src));
@@ -6271,6 +6927,49 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVI src (LoadVector mem)));
+  format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4I(vecX dst, vecX src) %{
+  predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVI dst src));
+  format %{ "pmulld  $dst,$src\t! mul packed4I" %}
+  ins_encode %{
+    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVI src1 src2));
+  format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVI src (LoadVector mem)));
+  format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
   match(Set dst (MulVL src1 src2));
@@ -6282,34 +6981,13 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul4I(vecX dst, vecX src) %{
-  predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
-  match(Set dst (MulVI dst src));
-  format %{ "pmulld  $dst,$src\t! mul packed4I" %}
-  ins_encode %{
-    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
-  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
-  match(Set dst (MulVI src1 src2));
-  format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
-  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
-  match(Set dst (MulVI src (LoadVector mem)));
-  format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
+  match(Set dst (MulVL src (LoadVector mem)));
+  format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -6336,6 +7014,28 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
+  match(Set dst (MulVL src1 src2));
+  format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
+  match(Set dst (MulVL src (LoadVector mem)));
+  format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
   match(Set dst (MulVI src1 src2));
@@ -6347,13 +7047,13 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
-  match(Set dst (MulVL src1 src2));
-  format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
-  ins_encode %{
-    int vector_len = 2;
-    __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVI src (LoadVector mem)));
+  format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -6369,28 +7069,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
-  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
-  match(Set dst (MulVI src (LoadVector mem)));
-  format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
-  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
-  match(Set dst (MulVL src (LoadVector mem)));
-  format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
-  ins_encode %{
-    int vector_len = 2;
-    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
   match(Set dst (MulVI src (LoadVector mem)));
@@ -6424,6 +7102,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVF src (LoadVector mem)));
+  format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vmul4F(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (MulVF dst src));
@@ -6601,6 +7290,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (DivVF src (LoadVector mem)));
+  format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vdiv4F(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (DivVF dst src));
@@ -7878,6 +8578,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (AndV src (LoadVector mem)));
+  format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vand8B(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length_in_bytes() == 8);
   match(Set dst (AndV dst src));
@@ -7899,6 +8610,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (AndV src (LoadVector mem)));
+  format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vand16B(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (AndV dst src));
@@ -7998,6 +8720,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (OrV src (LoadVector mem)));
+  format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vor8B(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length_in_bytes() == 8);
   match(Set dst (OrV dst src));
@@ -8019,6 +8752,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (OrV src (LoadVector mem)));
+  format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vor16B(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (OrV dst src));
@@ -8118,6 +8862,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (XorV src (LoadVector mem)));
+  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vxor8B(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length_in_bytes() == 8);
   match(Set dst (XorV dst src));
@@ -8139,6 +8894,17 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (XorV src (LoadVector mem)));
+  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vxor16B(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (XorV dst src));
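These new reg-mem rules let C2 fold the second vector load straight into the AVX instruction's memory operand instead of emitting a separate vector load. A minimal sketch of the loop shape that reaches rules such as vmul4I_mem above, assuming SuperWord unrolls and packs it (the function and names below are illustrative, not part of this change):

  // Illustrative C++ only: after vectorization the multiply becomes
  // MulVI (LoadVector a) (LoadVector b); with vmul4I_mem the load of b[i]
  // can be emitted as the vpmulld memory operand.
  void scale_by(int* a, const int* b, int n) {
    for (int i = 0; i < n; i++) {
      a[i] = a[i] * b[i];
    }
  }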
--- a/src/cpu/zero/vm/globals_zero.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/cpu/zero/vm/globals_zero.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -63,7 +63,8 @@
 
 define_pd_global(bool, PreserveFramePointer, false);
 
-#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct)  \
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint)  \
+                                                                            \
   product(bool, UseFastEmptyMethods, true,                                  \
           "Use fast method entry code for empty methods")                   \
                                                                             \
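The extra range and constraint parameters threaded through these flag-table macros support command-line flag validation: an entry can now attach a numeric range or a constraint function. A hypothetical entry using the new range parameter (flag name and bounds invented for illustration; not defined by this changeset):

  product(uintx, ExampleBufferSize, 64,                                     \
          "Illustrative flag only, not part of this change")                \
          range(16, 1024)                                                   \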
--- a/src/os/aix/vm/decoder_aix.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/aix/vm/decoder_aix.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2013 SAP AG. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -38,8 +38,8 @@
 
   virtual bool demangle(const char* symbol, char* buf, int buflen) { return false; } // demangled by getFuncName
 
-  virtual bool decode(address addr, char* buf, int buflen, int* offset, const char* modulepath) {
-    return (::getFuncName((codeptr_t)addr, buf, buflen, offset, 0, 0, 0) == 0);
+  virtual bool decode(address addr, char* buf, int buflen, int* offset, const char* modulepath, bool demangle) {
+    return (::getFuncName((codeptr_t)addr, buf, buflen, offset, 0, 0, 0, demangle) == 0);
   }
   virtual bool decode(address addr, char *buf, int buflen, int* offset, const void *base) {
     ShouldNotReachHere();
--- a/src/os/aix/vm/globals_aix.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/aix/vm/globals_aix.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -29,7 +29,7 @@
 //
 // Defines Aix specific flags. They are not available on other platforms.
 //
-#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct) \
+#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct, range, constraint) \
                                                                                     \
   /* Use 64K pages for virtual memory (shmat). */                                   \
   product(bool, Use64KPages, true,                                                  \
--- a/src/os/aix/vm/os_aix.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/aix/vm/os_aix.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1439,7 +1439,8 @@
 }
 
 bool os::dll_address_to_function_name(address addr, char *buf,
-                                      int buflen, int *offset) {
+                                      int buflen, int *offset,
+                                      bool demangle) {
   if (offset) {
     *offset = -1;
   }
@@ -1454,7 +1455,7 @@
   }
 
   // Go through Decoder::decode to call getFuncName which reads the name from the traceback table.
-  return Decoder::decode(addr, buf, buflen, offset);
+  return Decoder::decode(addr, buf, buflen, offset, demangle);
 }
 
 static int getModuleName(codeptr_t pc,                    // [in] program counter
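With the new demangle argument, callers of os::dll_address_to_function_name can ask for the raw (mangled) symbol name instead of the demangled one. A minimal caller sketch under that assumption (st, pc, and the buffer size are illustrative, not taken from this changeset):

  char name[256];
  int offset = 0;
  // Resolve pc to a symbol but keep the mangled name; fall back to "??".
  if (os::dll_address_to_function_name(pc, name, sizeof(name), &offset,
                                       false /* demangle */)) {
    st->print("%s+%d", name, offset);
  } else {
    st->print("??");
  }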
@@ -1653,7 +1654,7 @@
   }
 }
 
-void os::pd_print_cpu_info(outputStream* st) {
+void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
   // cpu
   st->print("CPU:");
   st->print("total %d", os::processor_count());
@@ -3761,10 +3762,6 @@
   return fetcher.result();
 }
 
-// Not neede on Aix.
-// int os::Aix::safe_cond_timedwait(pthread_cond_t *_cond, pthread_mutex_t *_mutex, const struct timespec *_abstime) {
-// }
-
 ////////////////////////////////////////////////////////////////////////////////
 // debug support
 
--- a/src/os/aix/vm/porting_aix.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/aix/vm/porting_aix.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -114,7 +114,8 @@
     int* p_displacement,             // [out] optional: displacement (-1 if not available)
     const struct tbtable** p_tb,     // [out] optional: ptr to traceback table to get further
                                      //                 information (NULL if not available)
-    char* p_errmsg, size_t errmsglen // [out] optional: user provided buffer for error messages
+    char* p_errmsg, size_t errmsglen,// [out] optional: user provided buffer for error messages
+    bool demangle                    // [in] whether to demangle the name
   ) {
   struct tbtable* tb = 0;
   unsigned int searchcount = 0;
@@ -216,15 +217,17 @@
       p_name[0] = '\0';
 
       // If it is a C++ name, try and demangle it using the Demangle interface (see demangle.h).
-      char* rest;
-      Name* const name = Demangle(buf, rest);
-      if (name) {
-        const char* const demangled_name = name->Text();
-        if (demangled_name) {
-          strncpy(p_name, demangled_name, namelen-1);
-          p_name[namelen-1] = '\0';
+      if (demangle) {
+        char* rest;
+        Name* const name = Demangle(buf, rest);
+        if (name) {
+          const char* const demangled_name = name->Text();
+          if (demangled_name) {
+            strncpy(p_name, demangled_name, namelen-1);
+            p_name[namelen-1] = '\0';
+          }
+          delete name;
         }
-        delete name;
       }
 
       // Fallback: if demangling did not work, just provide the unmangled name.
@@ -325,7 +328,7 @@
       int displacement = 0;
 
       if (getFuncName((codeptr_t) p, funcname, sizeof(funcname), &displacement,
-                      NULL, NULL, 0) == 0) {
+                      NULL, NULL, 0, true /* demangle */) == 0) {
         if (funcname[0] != '\0') {
           const char* const interned = dladdr_fixed_strings.intern(funcname);
           info->dli_sname = interned;
--- a/src/os/aix/vm/porting_aix.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/aix/vm/porting_aix.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -87,7 +87,8 @@
       char* p_name, size_t namelen,    // [out] optional: user provided buffer for the function name
       int* p_displacement,             // [out] optional: displacement
       const struct tbtable** p_tb,     // [out] optional: ptr to traceback table to get further information
-      char* p_errmsg, size_t errmsglen // [out] optional: user provided buffer for error messages
+      char* p_errmsg, size_t errmsglen,// [out] optional: user provided buffer for error messages
+      bool demangle = true             // [in] whether to demangle the name
     );
 
 // -------------------------------------------------------------------------
--- a/src/os/bsd/vm/decoder_machO.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/bsd/vm/decoder_machO.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -42,7 +42,7 @@
   virtual bool decode(address pc, char* buf, int buflen, int* offset,
                       const void* base);
   virtual bool decode(address pc, char* buf, int buflen, int* offset,
-                      const char* module_path = NULL) {
+                      const char* module_path, bool demangle) {
     ShouldNotReachHere();
     return false;
   }
--- a/src/os/bsd/vm/globals_bsd.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/bsd/vm/globals_bsd.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -28,19 +28,20 @@
 //
 // Defines Bsd specific flags. They are not available on other platforms.
 //
-#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct) \
-  product(bool, UseOprofile, false,                                     \
-        "enable support for Oprofile profiler")                         \
-                                                                        \
-  product(bool, UseBsdPosixThreadCPUClocks, true,                     \
-          "enable fast Bsd Posix clocks where available")             \
-/*  NB: The default value of UseBsdPosixThreadCPUClocks may be        \
-    overridden in Arguments::parse_each_vm_init_arg.  */                \
-                                                                        \
-  product(bool, UseHugeTLBFS, false,                                    \
-          "Use MAP_HUGETLB for large pages")                            \
-                                                                        \
-  product(bool, UseSHM, false,                                          \
+#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct, range, constraint) \
+                                                                                \
+  product(bool, UseOprofile, false,                                             \
+        "enable support for Oprofile profiler")                                 \
+                                                                                \
+  /*  NB: The default value of UseBsdPosixThreadCPUClocks may be  */            \
+  /*  overridden in Arguments::parse_each_vm_init_arg.            */            \
+  product(bool, UseBsdPosixThreadCPUClocks, true,                               \
+          "enable fast Bsd Posix clocks where available")                       \
+                                                                                \
+  product(bool, UseHugeTLBFS, false,                                            \
+          "Use MAP_HUGETLB for large pages")                                    \
+                                                                                \
+  product(bool, UseSHM, false,                                                  \
           "Use SYSV shared memory for large pages")
 
 //
--- a/src/os/bsd/vm/os_bsd.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/bsd/vm/os_bsd.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -637,11 +637,6 @@
 //////////////////////////////////////////////////////////////////////////////
 // create new thread
 
-// check if it's safe to start a new thread
-static bool _thread_safety_check(Thread* thread) {
-  return true;
-}
-
 #ifdef __APPLE__
 // library handle for calling objc_registerThreadWithCollector()
 // without static linking to the libobjc library
@@ -681,15 +676,6 @@
   OSThread* osthread = thread->osthread();
   Monitor* sync = osthread->startThread_lock();
 
-  // non floating stack BsdThreads needs extra check, see above
-  if (!_thread_safety_check(thread)) {
-    // notify parent thread
-    MutexLockerEx ml(sync, Mutex::_no_safepoint_check_flag);
-    osthread->set_state(ZOMBIE);
-    sync->notify_all();
-    return NULL;
-  }
-
   osthread->set_thread_id(os::Bsd::gettid());
 
 #ifdef __APPLE__
@@ -1339,7 +1325,8 @@
 #define MACH_MAXSYMLEN 256
 
 bool os::dll_address_to_function_name(address addr, char *buf,
-                                      int buflen, int *offset) {
+                                      int buflen, int *offset,
+                                      bool demangle) {
   // buf is not optional, but offset is optional
   assert(buf != NULL, "sanity check");
 
@@ -1349,7 +1336,7 @@
   if (dladdr((void*)addr, &dlinfo) != 0) {
     // see if we have a matching symbol
     if (dlinfo.dli_saddr != NULL && dlinfo.dli_sname != NULL) {
-      if (!Decoder::demangle(dlinfo.dli_sname, buf, buflen)) {
+      if (!(demangle && Decoder::demangle(dlinfo.dli_sname, buf, buflen))) {
         jio_snprintf(buf, buflen, "%s", dlinfo.dli_sname);
       }
       if (offset != NULL) *offset = addr - (address)dlinfo.dli_saddr;
@@ -1358,15 +1345,16 @@
     // no matching symbol so try for just file info
     if (dlinfo.dli_fname != NULL && dlinfo.dli_fbase != NULL) {
       if (Decoder::decode((address)(addr - (address)dlinfo.dli_fbase),
-                          buf, buflen, offset, dlinfo.dli_fname)) {
+                          buf, buflen, offset, dlinfo.dli_fname, demangle)) {
         return true;
       }
     }
 
     // Handle non-dynamic manually:
     if (dlinfo.dli_fbase != NULL &&
-        Decoder::decode(addr, localbuf, MACH_MAXSYMLEN, offset, dlinfo.dli_fbase)) {
-      if (!Decoder::demangle(localbuf, buf, buflen)) {
+        Decoder::decode(addr, localbuf, MACH_MAXSYMLEN, offset,
+                        dlinfo.dli_fbase)) {
+      if (!(demangle && Decoder::demangle(localbuf, buf, buflen))) {
         jio_snprintf(buf, buflen, "%s", localbuf);
       }
       return true;
@@ -1706,7 +1694,7 @@
   os::Posix::print_load_average(st);
 }
 
-void os::pd_print_cpu_info(outputStream* st) {
+void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
   // Nothing to do for now.
 }
 
@@ -2276,8 +2264,6 @@
   return os::uncommit_memory(addr, size);
 }
 
-static address _highest_vm_reserved_address = NULL;
-
 // If 'fixed' is true, anon_mmap() will attempt to reserve anonymous memory
 // at 'requested_addr'. If there are existing memory mappings at the same
 // location, however, they will be overwritten. If 'fixed' is false,
@@ -2300,23 +2286,9 @@
   addr = (char*)::mmap(requested_addr, bytes, PROT_NONE,
                        flags, -1, 0);
 
-  if (addr != MAP_FAILED) {
-    // anon_mmap() should only get called during VM initialization,
-    // don't need lock (actually we can skip locking even it can be called
-    // from multiple threads, because _highest_vm_reserved_address is just a
-    // hint about the upper limit of non-stack memory regions.)
-    if ((address)addr + bytes > _highest_vm_reserved_address) {
-      _highest_vm_reserved_address = (address)addr + bytes;
-    }
-  }
-
   return addr == MAP_FAILED ? NULL : addr;
 }
 
-// Don't update _highest_vm_reserved_address, because there might be memory
-// regions above addr + size. If so, releasing a memory region only creates
-// a hole in the address space, it doesn't help prevent heap-stack collision.
-//
 static int anon_munmap(char * addr, size_t size) {
   return ::munmap(addr, size) == 0;
 }
@@ -2490,15 +2462,7 @@
   assert(bytes % os::vm_page_size() == 0, "reserving unexpected size block");
 
   // Repeatedly allocate blocks until the block is allocated at the
-  // right spot. Give up after max_tries. Note that reserve_memory() will
-  // automatically update _highest_vm_reserved_address if the call is
-  // successful. The variable tracks the highest memory address every reserved
-  // by JVM. It is used to detect heap-stack collision if running with
-  // fixed-stack BsdThreads. Because here we may attempt to reserve more
-  // space than needed, it could confuse the collision detecting code. To
-  // solve the problem, save current _highest_vm_reserved_address and
-  // calculate the correct value before return.
-  address old_highest = _highest_vm_reserved_address;
+  // right spot.
 
   // Bsd mmap allows caller to pass an address as hint; give it a try first,
   // if kernel honors the hint then we can return immediately.
@@ -2552,10 +2516,8 @@
   }
 
   if (i < max_tries) {
-    _highest_vm_reserved_address = MAX2(old_highest, (address)requested_addr + bytes);
     return requested_addr;
   } else {
-    _highest_vm_reserved_address = old_highest;
     return NULL;
   }
 }
@@ -3715,12 +3677,6 @@
   return fetcher.result();
 }
 
-int os::Bsd::safe_cond_timedwait(pthread_cond_t *_cond,
-                                 pthread_mutex_t *_mutex,
-                                 const struct timespec *_abstime) {
-  return pthread_cond_timedwait(_cond, _mutex, _abstime);
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 // debug support
 
@@ -4286,7 +4242,7 @@
   // In that case, we should propagate the notify to another waiter.
 
   while (_Event < 0) {
-    status = os::Bsd::safe_cond_timedwait(_cond, _mutex, &abst);
+    status = pthread_cond_timedwait(_cond, _mutex, &abst);
     if (status != 0 && WorkAroundNPTLTimedWaitHang) {
       pthread_cond_destroy(_cond);
       pthread_cond_init(_cond, NULL);
@@ -4492,7 +4448,7 @@
   if (time == 0) {
     status = pthread_cond_wait(_cond, _mutex);
   } else {
-    status = os::Bsd::safe_cond_timedwait(_cond, _mutex, &absTime);
+    status = pthread_cond_timedwait(_cond, _mutex, &absTime);
     if (status != 0 && WorkAroundNPTLTimedWaitHang) {
       pthread_cond_destroy(_cond);
       pthread_cond_init(_cond, NULL);
--- a/src/os/bsd/vm/os_bsd.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/bsd/vm/os_bsd.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -30,9 +30,6 @@
 // Information about the protection of the page at address '0' on this os.
 static bool zero_page_read_protected() { return true; }
 
-// pthread_getattr_np comes with BsdThreads-0.9-7 on RedHat 7.1
-typedef int (*pthread_getattr_func_type)(pthread_t, pthread_attr_t *);
-
 #ifdef __APPLE__
 // Mac OS X doesn't support clock_gettime. Stub out the type, it is
 // unused
@@ -145,9 +142,6 @@
 
   // none present
 
-  // BsdThreads work-around for 6292965
-  static int safe_cond_timedwait(pthread_cond_t *_cond, pthread_mutex_t *_mutex, const struct timespec *_abstime);
-
  private:
   typedef int (*sched_getcpu_func_t)(void);
   typedef int (*numa_node_to_cpus_func_t)(int node, unsigned long *buffer, int bufferlen);
--- a/src/os/linux/vm/globals_linux.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/linux/vm/globals_linux.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -28,14 +28,15 @@
 //
 // Defines Linux specific flags. They are not available on other platforms.
 //
-#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct) \
+#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct, range, constraint) \
+                                                                        \
   product(bool, UseOprofile, false,                                     \
         "enable support for Oprofile profiler")                         \
                                                                         \
+  /*  NB: The default value of UseLinuxPosixThreadCPUClocks may be   */ \
+  /* overridden in Arguments::parse_each_vm_init_arg.                */ \
   product(bool, UseLinuxPosixThreadCPUClocks, true,                     \
           "enable fast Linux Posix clocks where available")             \
-/*  NB: The default value of UseLinuxPosixThreadCPUClocks may be        \
-    overridden in Arguments::parse_each_vm_init_arg.  */                \
                                                                         \
   product(bool, UseHugeTLBFS, false,                                    \
           "Use MAP_HUGETLB for large pages")                            \
--- a/src/os/linux/vm/os_linux.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/linux/vm/os_linux.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -135,8 +135,6 @@
 pthread_t os::Linux::_main_thread;
 int os::Linux::_page_size = -1;
 const int os::Linux::_vm_default_page_size = (8 * K);
-bool os::Linux::_is_floating_stack = false;
-bool os::Linux::_is_NPTL = false;
 bool os::Linux::_supports_fast_thread_cpu_time = false;
 const char * os::Linux::_glibc_version = NULL;
 const char * os::Linux::_libpthread_version = NULL;
@@ -150,8 +148,6 @@
 static sigset_t check_signal_done;
 static bool check_signals = true;
 
-static pid_t _initial_pid = 0;
-
 // Signal number used to suspend/resume a thread
 
 // do not use any signal number less than SIGSEGV, see 4355769
@@ -223,18 +219,10 @@
 //
 // Returns the kernel thread id of the currently running thread. Kernel
 // thread id is used to access /proc.
-//
-// (Note that getpid() on LinuxThreads returns kernel thread id too; but
-// on NPTL, it returns the same pid for all threads, as required by POSIX.)
-//
 pid_t os::Linux::gettid() {
   int rslt = syscall(SYS_gettid);
-  if (rslt == -1) {
-    // old kernel, no NPTL support
-    return getpid();
-  } else {
-    return (pid_t)rslt;
-  }
+  assert(rslt != -1, "must be."); // old linuxthreads implementation?
+  return (pid_t)rslt;
 }
 
 // Most versions of linux have a bug where the number of processors are
@@ -508,68 +496,48 @@
 // detecting pthread library
 
 void os::Linux::libpthread_init() {
-  // Save glibc and pthread version strings. Note that _CS_GNU_LIBC_VERSION
-  // and _CS_GNU_LIBPTHREAD_VERSION are supported in glibc >= 2.3.2. Use a
-  // generic name for earlier versions.
-  // Define macros here so we can build HotSpot on old systems.
-#ifndef _CS_GNU_LIBC_VERSION
-  #define _CS_GNU_LIBC_VERSION 2
+  // Save glibc and pthread version strings.
+#if !defined(_CS_GNU_LIBC_VERSION) || \
+    !defined(_CS_GNU_LIBPTHREAD_VERSION)
+  #error "glibc too old (< 2.3.2)"
 #endif
-#ifndef _CS_GNU_LIBPTHREAD_VERSION
-  #define _CS_GNU_LIBPTHREAD_VERSION 3
-#endif
 
   size_t n = confstr(_CS_GNU_LIBC_VERSION, NULL, 0);
-  if (n > 0) {
-    char *str = (char *)malloc(n, mtInternal);
-    confstr(_CS_GNU_LIBC_VERSION, str, n);
-    os::Linux::set_glibc_version(str);
-  } else {
-    // _CS_GNU_LIBC_VERSION is not supported, try gnu_get_libc_version()
-    static char _gnu_libc_version[32];
-    jio_snprintf(_gnu_libc_version, sizeof(_gnu_libc_version),
-                 "glibc %s %s", gnu_get_libc_version(), gnu_get_libc_release());
-    os::Linux::set_glibc_version(_gnu_libc_version);
-  }
+  assert(n > 0, "cannot retrieve glibc version");
+  char *str = (char *)malloc(n, mtInternal);
+  confstr(_CS_GNU_LIBC_VERSION, str, n);
+  os::Linux::set_glibc_version(str);
 
   n = confstr(_CS_GNU_LIBPTHREAD_VERSION, NULL, 0);
-  if (n > 0) {
-    char *str = (char *)malloc(n, mtInternal);
-    confstr(_CS_GNU_LIBPTHREAD_VERSION, str, n);
-    // Vanilla RH-9 (glibc 2.3.2) has a bug that confstr() always tells
-    // us "NPTL-0.29" even we are running with LinuxThreads. Check if this
-    // is the case. LinuxThreads has a hard limit on max number of threads.
-    // So sysconf(_SC_THREAD_THREADS_MAX) will return a positive value.
-    // On the other hand, NPTL does not have such a limit, sysconf()
-    // will return -1 and errno is not changed. Check if it is really NPTL.
-    if (strcmp(os::Linux::glibc_version(), "glibc 2.3.2") == 0 &&
-        strstr(str, "NPTL") &&
-        sysconf(_SC_THREAD_THREADS_MAX) > 0) {
-      free(str);
-      os::Linux::set_libpthread_version("linuxthreads");
-    } else {
-      os::Linux::set_libpthread_version(str);
-    }
-  } else {
-    // glibc before 2.3.2 only has LinuxThreads.
-    os::Linux::set_libpthread_version("linuxthreads");
-  }
-
-  if (strstr(libpthread_version(), "NPTL")) {
-    os::Linux::set_is_NPTL();
-  } else {
-    os::Linux::set_is_LinuxThreads();
-  }
-
-  // LinuxThreads have two flavors: floating-stack mode, which allows variable
-  // stack size; and fixed-stack mode. NPTL is always floating-stack.
-  if (os::Linux::is_NPTL() || os::Linux::supports_variable_stack_size()) {
-    os::Linux::set_is_floating_stack();
-  }
+  assert(n > 0, "cannot retrieve pthread version");
+  str = (char *)malloc(n, mtInternal);
+  confstr(_CS_GNU_LIBPTHREAD_VERSION, str, n);
+  os::Linux::set_libpthread_version(str);
 }
 
 /////////////////////////////////////////////////////////////////////////////
-// thread stack
+// thread stack expansion
+
+// os::Linux::manually_expand_stack() takes care of expanding the thread
+// stack. Note that this is normally not needed: pthread stacks allocate
+// thread stack using mmap() without MAP_NORESERVE, so the stack is already
+// committed. Therefore it is not necessary to expand the stack manually.
+//
+// Manually expanding the stack was historically needed on LinuxThreads
+// thread stacks, which were allocated with mmap(MAP_GROWSDOWN). Nowadays
+// it is kept to deal with very rare corner cases:
+//
+// For one, user may run the VM on an own implementation of threads
+// whose stacks are - like the old LinuxThreads - implemented using
+// mmap(MAP_GROWSDOWN).
+//
+// Also, this coding may be needed if the VM is running on the primordial
+// thread. Normally we avoid running on the primordial thread; however,
+// user may still invoke the VM on the primordial thread.
+//
+// The following historical comment describes the details about running
+// on a thread stack allocated with mmap(MAP_GROWSDOWN):
+
 
 // Force Linux kernel to expand current thread stack. If "bottom" is close
 // to the stack guard, caller should block all signals.
@@ -593,10 +561,7 @@
 //   stack overflow detection.
 //
 //   Newer version of LinuxThreads (since glibc-2.2, or, RH-7.x) and NPTL do
-//   not use this flag. However, the stack of initial thread is not created
-//   by pthread, it is still MAP_GROWSDOWN. Also it's possible (though
-//   unlikely) that user code can create a thread with MAP_GROWSDOWN stack
-//   and then attach the thread to JVM.
+//   not use MAP_GROWSDOWN.
 //
 // To get around the problem and allow stack banging on Linux, we need to
 // manually expand thread stack after receiving the SIGSEGV.
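In outline, the expansion grows the current frame down to the target address and touches it so the kernel commits the pages. A minimal sketch of that idea, not the actual HotSpot routine:

  #include <alloca.h>

  // Sketch only: commit stack pages down to 'bottom' by alloca'ing the gap
  // from (roughly) the current stack pointer and touching the low end.
  static void expand_stack_to(volatile char* bottom) {
    volatile char* sp = (volatile char*)&sp;   // approximate current sp
    if (sp > bottom) {
      volatile char* p = (volatile char*)alloca((size_t)(sp - bottom));
      p[0] = '\0';                             // fault in the lowest page
    }
  }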
@@ -671,45 +636,6 @@
 //////////////////////////////////////////////////////////////////////////////
 // create new thread
 
-static address highest_vm_reserved_address();
-
-// check if it's safe to start a new thread
-static bool _thread_safety_check(Thread* thread) {
-  if (os::Linux::is_LinuxThreads() && !os::Linux::is_floating_stack()) {
-    // Fixed stack LinuxThreads (SuSE Linux/x86, and some versions of Redhat)
-    //   Heap is mmap'ed at lower end of memory space. Thread stacks are
-    //   allocated (MAP_FIXED) from high address space. Every thread stack
-    //   occupies a fixed size slot (usually 2Mbytes, but user can change
-    //   it to other values if they rebuild LinuxThreads).
-    //
-    // Problem with MAP_FIXED is that mmap() can still succeed even part of
-    // the memory region has already been mmap'ed. That means if we have too
-    // many threads and/or very large heap, eventually thread stack will
-    // collide with heap.
-    //
-    // Here we try to prevent heap/stack collision by comparing current
-    // stack bottom with the highest address that has been mmap'ed by JVM
-    // plus a safety margin for memory maps created by native code.
-    //
-    // This feature can be disabled by setting ThreadSafetyMargin to 0
-    //
-    if (ThreadSafetyMargin > 0) {
-      address stack_bottom = os::current_stack_base() - os::current_stack_size();
-
-      // not safe if our stack extends below the safety margin
-      return stack_bottom - ThreadSafetyMargin >= highest_vm_reserved_address();
-    } else {
-      return true;
-    }
-  } else {
-    // Floating stack LinuxThreads or NPTL:
-    //   Unlike fixed stack LinuxThreads, thread stacks are not MAP_FIXED. When
-    //   there's not enough space left, pthread_create() will fail. If we come
-    //   here, that means enough space has been reserved for stack.
-    return true;
-  }
-}
-
 // Thread start routine for all newly created threads
 static void *java_start(Thread *thread) {
   // Try to randomize the cache line index of hot stack frames.
@@ -726,15 +652,6 @@
   OSThread* osthread = thread->osthread();
   Monitor* sync = osthread->startThread_lock();
 
-  // non floating stack LinuxThreads needs extra check, see above
-  if (!_thread_safety_check(thread)) {
-    // notify parent thread
-    MutexLockerEx ml(sync, Mutex::_no_safepoint_check_flag);
-    osthread->set_state(ZOMBIE);
-    sync->notify_all();
-    return NULL;
-  }
-
   // thread_id is kernel thread id (similar to Solaris LWP id)
   osthread->set_thread_id(os::Linux::gettid());
 
@@ -833,12 +750,6 @@
   ThreadState state;
 
   {
-    // Serialize thread creation if we are running with fixed stack LinuxThreads
-    bool lock = os::Linux::is_LinuxThreads() && !os::Linux::is_floating_stack();
-    if (lock) {
-      os::Linux::createThread_lock()->lock_without_safepoint_check();
-    }
-
     pthread_t tid;
     int ret = pthread_create(&tid, &attr, (void* (*)(void*)) java_start, thread);
 
@@ -851,7 +762,6 @@
       // Need to clean up stuff we've allocated so far
       thread->set_osthread(NULL);
       delete osthread;
-      if (lock) os::Linux::createThread_lock()->unlock();
       return false;
     }
 
@@ -866,10 +776,6 @@
         sync_with_child->wait(Mutex::_no_safepoint_check_flag);
       }
     }
-
-    if (lock) {
-      os::Linux::createThread_lock()->unlock();
-    }
   }
 
   // Aborted due to thread limit being reached
@@ -1497,7 +1403,6 @@
 
 // Die immediately, no exit hook, no abort hook, no cleanup.
 void os::die() {
-  // _exit() on LinuxThreads only kills current thread
   ::abort();
 }
 
@@ -1520,24 +1425,7 @@
 
 intx os::current_thread_id() { return (intx)pthread_self(); }
 int os::current_process_id() {
-
-  // Under the old linux thread library, linux gives each thread
-  // its own process id. Because of this each thread will return
-  // a different pid if this method were to return the result
-  // of getpid(2). Linux provides no api that returns the pid
-  // of the launcher thread for the vm. This implementation
-  // returns a unique pid, the pid of the launcher thread
-  // that starts the vm 'process'.
-
-  // Under the NPTL, getpid() returns the same pid as the
-  // launcher thread rather than a unique pid per thread.
-  // Use gettid() if you want the old pre NPTL behaviour.
-
-  // if you are looking for the result of a call to getpid() that
-  // returns a unique pid for the calling thread, then look at the
-  // OSThread::thread_id() method in osThread_linux.hpp file
-
-  return (int)(_initial_pid ? _initial_pid : getpid());
+  return ::getpid();
 }
 
 // DLL functions
@@ -1623,7 +1511,8 @@
 }
 
 bool os::dll_address_to_function_name(address addr, char *buf,
-                                      int buflen, int *offset) {
+                                      int buflen, int *offset,
+                                      bool demangle) {
   // buf is not optional, but offset is optional
   assert(buf != NULL, "sanity check");
 
@@ -1632,7 +1521,7 @@
   if (dladdr((void*)addr, &dlinfo) != 0) {
     // see if we have a matching symbol
     if (dlinfo.dli_saddr != NULL && dlinfo.dli_sname != NULL) {
-      if (!Decoder::demangle(dlinfo.dli_sname, buf, buflen)) {
+      if (!(demangle && Decoder::demangle(dlinfo.dli_sname, buf, buflen))) {
         jio_snprintf(buf, buflen, "%s", dlinfo.dli_sname);
       }
       if (offset != NULL) *offset = addr - (address)dlinfo.dli_saddr;
@@ -1641,7 +1530,7 @@
     // no matching symbol so try for just file info
     if (dlinfo.dli_fname != NULL && dlinfo.dli_fbase != NULL) {
       if (Decoder::decode((address)(addr - (address)dlinfo.dli_fbase),
-                          buf, buflen, offset, dlinfo.dli_fname)) {
+                          buf, buflen, offset, dlinfo.dli_fname, demangle)) {
         return true;
       }
     }
@@ -2183,9 +2072,6 @@
   st->print("libc:");
   st->print("%s ", os::Linux::glibc_version());
   st->print("%s ", os::Linux::libpthread_version());
-  if (os::Linux::is_LinuxThreads()) {
-    st->print("(%s stack)", os::Linux::is_floating_stack() ? "floating" : "fixed");
-  }
   st->cr();
 }
 
@@ -2215,12 +2101,52 @@
   st->cr();
 }
 
-void os::pd_print_cpu_info(outputStream* st) {
-  st->print("\n/proc/cpuinfo:\n");
-  if (!_print_ascii_file("/proc/cpuinfo", st)) {
-    st->print("  <Not Available>");
-  }
-  st->cr();
+// Print the first "model name" line and the first "flags" line
+// that we find and nothing more. We assume "model name" comes
+// before "flags" so if we find a second "model name", then the
+// "flags" field is considered missing.
+static bool print_model_name_and_flags(outputStream* st, char* buf, size_t buflen) {
+#if defined(IA32) || defined(AMD64)
+  // Other platforms have less repetitive cpuinfo files
+  FILE *fp = fopen("/proc/cpuinfo", "r");
+  if (fp) {
+    while (!feof(fp)) {
+      if (fgets(buf, buflen, fp)) {
+        // Assume model name comes before flags
+        bool model_name_printed = false;
+        if (strstr(buf, "model name") != NULL) {
+          if (!model_name_printed) {
+            st->print_raw("\nCPU Model and flags from /proc/cpuinfo:\n");
+            st->print_raw(buf);
+            model_name_printed = true;
+          } else {
+            // model name printed but not flags?  Odd, just return
+            fclose(fp);
+            return true;
+          }
+        }
+        // print the flags line too
+        if (strstr(buf, "flags") != NULL) {
+          st->print_raw(buf);
+          fclose(fp);
+          return true;
+        }
+      }
+    }
+    fclose(fp);
+  }
+#endif // x86 platforms
+  return false;
+}
+
+void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
+  // Only print the model name if the platform provides this as a summary
+  if (!print_model_name_and_flags(st, buf, buflen)) {
+    st->print("\n/proc/cpuinfo:\n");
+    if (!_print_ascii_file("/proc/cpuinfo", st)) {
+      st->print_cr("  <Not Available>");
+    }
+  }
 }
 
 void os::print_siginfo(outputStream* st, void* siginfo) {
@@ -3044,8 +2970,6 @@
   return os::uncommit_memory(addr, size);
 }
 
-static address _highest_vm_reserved_address = NULL;
-
 // If 'fixed' is true, anon_mmap() will attempt to reserve anonymous memory
 // at 'requested_addr'. If there are existing memory mappings at the same
 // location, however, they will be overwritten. If 'fixed' is false,
@@ -3068,23 +2992,9 @@
   addr = (char*)::mmap(requested_addr, bytes, PROT_NONE,
                        flags, -1, 0);
 
-  if (addr != MAP_FAILED) {
-    // anon_mmap() should only get called during VM initialization,
-    // don't need lock (actually we can skip locking even it can be called
-    // from multiple threads, because _highest_vm_reserved_address is just a
-    // hint about the upper limit of non-stack memory regions.)
-    if ((address)addr + bytes > _highest_vm_reserved_address) {
-      _highest_vm_reserved_address = (address)addr + bytes;
-    }
-  }
-
   return addr == MAP_FAILED ? NULL : addr;
 }
 
-// Don't update _highest_vm_reserved_address, because there might be memory
-// regions above addr + size. If so, releasing a memory region only creates
-// a hole in the address space, it doesn't help prevent heap-stack collision.
-//
 static int anon_munmap(char * addr, size_t size) {
   return ::munmap(addr, size) == 0;
 }
@@ -3098,10 +3008,6 @@
   return anon_munmap(addr, size);
 }
 
-static address highest_vm_reserved_address() {
-  return _highest_vm_reserved_address;
-}
-
 static bool linux_mprotect(char* addr, size_t size, int prot) {
   // Linux wants the mprotect address argument to be page aligned.
   char* bottom = (char*)align_size_down((intptr_t)addr, os::Linux::page_size());
@@ -3718,15 +3624,7 @@
   assert(bytes % os::vm_page_size() == 0, "reserving unexpected size block");
 
   // Repeatedly allocate blocks until the block is allocated at the
-  // right spot. Give up after max_tries. Note that reserve_memory() will
-  // automatically update _highest_vm_reserved_address if the call is
-  // successful. The variable tracks the highest memory address every reserved
-  // by JVM. It is used to detect heap-stack collision if running with
-  // fixed-stack LinuxThreads. Because here we may attempt to reserve more
-  // space than needed, it could confuse the collision detecting code. To
-  // solve the problem, save current _highest_vm_reserved_address and
-  // calculate the correct value before return.
-  address old_highest = _highest_vm_reserved_address;
+  // right spot.
 
   // Linux mmap allows caller to pass an address as hint; give it a try first,
   // if kernel honors the hint then we can return immediately.
@@ -3780,10 +3678,8 @@
   }
 
   if (i < max_tries) {
-    _highest_vm_reserved_address = MAX2(old_highest, (address)requested_addr + bytes);
     return requested_addr;
   } else {
-    _highest_vm_reserved_address = old_highest;
     return NULL;
   }
 }
@@ -4627,16 +4523,6 @@
   char dummy;   // used to get a guess on initial stack address
 //  first_hrtime = gethrtime();
 
-  // With LinuxThreads the JavaMain thread pid (primordial thread)
-  // is different than the pid of the java launcher thread.
-  // So, on Linux, the launcher thread pid is passed to the VM
-  // via the sun.java.launcher.pid property.
-  // Use this property instead of getpid() if it was correctly passed.
-  // See bug 6351349.
-  pid_t java_launcher_pid = (pid_t) Arguments::sun_java_launcher_pid();
-
-  _initial_pid = (java_launcher_pid > 0) ? java_launcher_pid : getpid();
-
   clock_tics_per_sec = sysconf(_SC_CLK_TCK);
 
   init_random(1234567);
@@ -4769,9 +4655,8 @@
 
   Linux::libpthread_init();
   if (PrintMiscellaneous && (Verbose || WizardMode)) {
-    tty->print_cr("[HotSpot is running with %s, %s(%s)]\n",
-                  Linux::glibc_version(), Linux::libpthread_version(),
-                  Linux::is_floating_stack() ? "floating stack" : "fixed stack");
+    tty->print_cr("[HotSpot is running with %s, %s]\n",
+                  Linux::glibc_version(), Linux::libpthread_version());
   }
 
   if (UseNUMA) {
@@ -4946,22 +4831,6 @@
   return fetcher.result();
 }
 
-int os::Linux::safe_cond_timedwait(pthread_cond_t *_cond,
-                                   pthread_mutex_t *_mutex,
-                                   const struct timespec *_abstime) {
-  if (is_NPTL()) {
-    return pthread_cond_timedwait(_cond, _mutex, _abstime);
-  } else {
-    // 6292965: LinuxThreads pthread_cond_timedwait() resets FPU control
-    // word back to default 64bit precision if condvar is signaled. Java
-    // wants 53bit precision.  Save and restore current value.
-    int fpu = get_fpu_control_word();
-    int status = pthread_cond_timedwait(_cond, _mutex, _abstime);
-    set_fpu_control_word(fpu);
-    return status;
-  }
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 // debug support
 
@@ -5585,7 +5454,7 @@
   // In that case, we should propagate the notify to another waiter.
 
   while (_Event < 0) {
-    status = os::Linux::safe_cond_timedwait(_cond, _mutex, &abst);
+    status = pthread_cond_timedwait(_cond, _mutex, &abst);
     if (status != 0 && WorkAroundNPTLTimedWaitHang) {
       pthread_cond_destroy(_cond);
       pthread_cond_init(_cond, os::Linux::condAttr());
@@ -5813,7 +5682,7 @@
     status = pthread_cond_wait(&_cond[_cur_index], _mutex);
   } else {
     _cur_index = isAbsolute ? ABS_INDEX : REL_INDEX;
-    status = os::Linux::safe_cond_timedwait(&_cond[_cur_index], _mutex, &absTime);
+    status = pthread_cond_timedwait(&_cond[_cur_index], _mutex, &absTime);
     if (status != 0 && WorkAroundNPTLTimedWaitHang) {
       pthread_cond_destroy(&_cond[_cur_index]);
       pthread_cond_init(&_cond[_cur_index], isAbsolute ? NULL : os::Linux::condAttr());
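pd_print_cpu_info now receives a scratch buffer from shared code (used above for the /proc/cpuinfo model-name and flags lines) instead of each platform allocating its own. A sketch of the assumed calling convention; the shared-code caller is paraphrased, not quoted from this changeset:

  // Paraphrased shared-code caller: one stack buffer reused by the
  // platform-specific hook.
  char buf[512];
  os::pd_print_cpu_info(st, buf, sizeof(buf));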
--- a/src/os/linux/vm/os_linux.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/linux/vm/os_linux.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -27,9 +27,6 @@
 
 // Linux_OS defines the interface to Linux operating systems
 
-// pthread_getattr_np comes with LinuxThreads-0.9-7 on RedHat 7.1
-typedef int (*pthread_getattr_func_type)(pthread_t, pthread_attr_t *);
-
 // Information about the protection of the page at address '0' on this os.
 static bool zero_page_read_protected() { return true; }
 
@@ -63,8 +60,6 @@
   static const char *_glibc_version;
   static const char *_libpthread_version;
 
-  static bool _is_floating_stack;
-  static bool _is_NPTL;
   static bool _supports_fast_thread_cpu_time;
 
   static GrowableArray<int>* _cpu_to_node;
@@ -90,10 +85,6 @@
 
   static bool supports_variable_stack_size();
 
-  static void set_is_NPTL()                   { _is_NPTL = true;  }
-  static void set_is_LinuxThreads()           { _is_NPTL = false; }
-  static void set_is_floating_stack()         { _is_floating_stack = true; }
-
   static void rebuild_cpu_to_node_map();
   static GrowableArray<int>* cpu_to_node()    { return _cpu_to_node; }
 
@@ -178,14 +169,6 @@
   static const char *glibc_version()          { return _glibc_version; }
   static const char *libpthread_version()     { return _libpthread_version; }
 
-  // NPTL or LinuxThreads?
-  static bool is_LinuxThreads()               { return !_is_NPTL; }
-  static bool is_NPTL()                       { return _is_NPTL;  }
-
-  // NPTL is always floating stack. LinuxThreads could be using floating
-  // stack or fixed stack.
-  static bool is_floating_stack()             { return _is_floating_stack; }
-
   static void libpthread_init();
   static bool libnuma_init();
   static void* libnuma_dlsym(void* handle, const char* name);
@@ -234,9 +217,6 @@
 
   // none present
 
-  // LinuxThreads work-around for 6292965
-  static int safe_cond_timedwait(pthread_cond_t *_cond, pthread_mutex_t *_mutex, const struct timespec *_abstime);
-
  private:
   typedef int (*sched_getcpu_func_t)(void);
   typedef int (*numa_node_to_cpus_func_t)(int node, unsigned long *buffer, int bufferlen);
--- a/src/os/solaris/vm/globals_solaris.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/solaris/vm/globals_solaris.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -28,7 +28,7 @@
 //
 // Defines Solaris specific flags. They are not available on other platforms.
 //
-#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct) \
+#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct, range, constraint) \
                                                                                \
   product(bool, UseExtendedFileIO, true,                                       \
           "Enable workaround for limitations of stdio FILE structure")
--- a/src/os/solaris/vm/os_solaris.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/solaris/vm/os_solaris.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1627,7 +1627,8 @@
 static dladdr1_func_type dladdr1_func = NULL;
 
 bool os::dll_address_to_function_name(address addr, char *buf,
-                                      int buflen, int * offset) {
+                                      int buflen, int * offset,
+                                      bool demangle) {
   // buf is not optional, but offset is optional
   assert(buf != NULL, "sanity check");
 
@@ -1655,7 +1656,7 @@
       if (dlinfo.dli_saddr != NULL &&
           (char *)dlinfo.dli_saddr + info->st_size > (char *)addr) {
         if (dlinfo.dli_sname != NULL) {
-          if (!Decoder::demangle(dlinfo.dli_sname, buf, buflen)) {
+          if (!(demangle && Decoder::demangle(dlinfo.dli_sname, buf, buflen))) {
             jio_snprintf(buf, buflen, "%s", dlinfo.dli_sname);
           }
           if (offset != NULL) *offset = addr - (address)dlinfo.dli_saddr;
@@ -1665,7 +1666,7 @@
       // no matching symbol so try for just file info
       if (dlinfo.dli_fname != NULL && dlinfo.dli_fbase != NULL) {
         if (Decoder::decode((address)(addr - (address)dlinfo.dli_fbase),
-                            buf, buflen, offset, dlinfo.dli_fname)) {
+                            buf, buflen, offset, dlinfo.dli_fname, demangle)) {
           return true;
         }
       }
@@ -1679,7 +1680,7 @@
   if (dladdr((void *)addr, &dlinfo) != 0) {
     // see if we have a matching symbol
     if (dlinfo.dli_saddr != NULL && dlinfo.dli_sname != NULL) {
-      if (!Decoder::demangle(dlinfo.dli_sname, buf, buflen)) {
+      if (!(demangle && Decoder::demangle(dlinfo.dli_sname, buf, buflen))) {
         jio_snprintf(buf, buflen, dlinfo.dli_sname);
       }
       if (offset != NULL) *offset = addr - (address)dlinfo.dli_saddr;
@@ -1688,7 +1689,7 @@
     // no matching symbol so try for just file info
     if (dlinfo.dli_fname != NULL && dlinfo.dli_fbase != NULL) {
       if (Decoder::decode((address)(addr - (address)dlinfo.dli_fbase),
-                          buf, buflen, offset, dlinfo.dli_fname)) {
+                          buf, buflen, offset, dlinfo.dli_fname, demangle)) {
         return true;
       }
     }
@@ -1996,7 +1997,7 @@
   return status;
 }
 
-void os::pd_print_cpu_info(outputStream* st) {
+void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
   // Nothing to do for now.
 }
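
The demangle flag threaded through os::dll_address_to_function_name() follows the same pattern on every platform: demangle only when the caller asks for it, and fall back to the raw symbol name when demangling is off or fails. A minimal standalone sketch of that control flow, with a hypothetical try_demangle() standing in for Decoder::demangle():

    #include <cstdio>

    // Hypothetical stand-in for Decoder::demangle(); always "fails" here so the
    // fallback path below is exercised.
    static bool try_demangle(const char* sym, char* buf, int buflen) {
      (void)sym; (void)buf; (void)buflen;
      return false;
    }

    // Mirrors the pattern in the hunks above: demangle only on request,
    // otherwise copy the mangled name verbatim.
    static void symbol_to_buf(const char* sym, char* buf, int buflen, bool demangle) {
      if (!(demangle && try_demangle(sym, buf, buflen))) {
        snprintf(buf, buflen, "%s", sym);
      }
    }

    int main() {
      char buf[64];
      symbol_to_buf("_ZN7Decoder8demangleEPKcPci", buf, sizeof(buf), false);
      printf("%s\n", buf);   // prints the mangled name unchanged
      return 0;
    }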
 
--- a/src/os/windows/vm/decoder_windows.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/windows/vm/decoder_windows.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -162,7 +162,7 @@
      // current function and comparing the result
      address addr = (address)Decoder::demangle;
      char buf[MAX_PATH];
-     if (decode(addr, buf, sizeof(buf), NULL)) {
+     if (decode(addr, buf, sizeof(buf), NULL, NULL, true /* demangle */)) {
        _can_decode_in_vm = !strcmp(buf, "Decoder::demangle");
      }
   }
@@ -187,7 +187,7 @@
 }
 
 
-bool WindowsDecoder::decode(address addr, char *buf, int buflen, int* offset, const char* modulepath)  {
+bool WindowsDecoder::decode(address addr, char *buf, int buflen, int* offset, const char* modulepath, bool demangle_name)  {
   if (_pfnSymGetSymFromAddr64 != NULL) {
     PIMAGEHLP_SYMBOL64 pSymbol;
     char symbolInfo[MAX_PATH + sizeof(IMAGEHLP_SYMBOL64)];
@@ -197,7 +197,7 @@
     DWORD64 displacement;
     if (_pfnSymGetSymFromAddr64(::GetCurrentProcess(), (DWORD64)addr, &displacement, pSymbol)) {
       if (buf != NULL) {
-        if (demangle(pSymbol->Name, buf, buflen)) {
+        if (!(demangle_name && demangle(pSymbol->Name, buf, buflen))) {
           jio_snprintf(buf, buflen, "%s", pSymbol->Name);
         }
       }
--- a/src/os/windows/vm/decoder_windows.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/windows/vm/decoder_windows.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -60,7 +60,7 @@
 
   bool can_decode_C_frame_in_vm() const;
   bool demangle(const char* symbol, char *buf, int buflen);
-  bool decode(address addr, char *buf, int buflen, int* offset, const char* modulepath = NULL);
+  bool decode(address addr, char *buf, int buflen, int* offset, const char* modulepath, bool demangle);
   bool decode(address addr, char *buf, int buflen, int* offset, const void* base) {
     ShouldNotReachHere();
     return false;
--- a/src/os/windows/vm/globals_windows.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/windows/vm/globals_windows.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -28,8 +28,7 @@
 //
 // Defines Windows specific flags. They are not available on other platforms.
 //
-#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd,       \
-                         diagnostic, notproduct)                         \
+#define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct, range, constraint) \
                                                                          \
   product(bool, UseUTCFileTimestamp, true,                               \
           "Adjust the timestamp returned from stat() to be UTC")
--- a/src/os/windows/vm/os_windows.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os/windows/vm/os_windows.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1369,11 +1369,12 @@
 }
 
 bool os::dll_address_to_function_name(address addr, char *buf,
-                                      int buflen, int *offset) {
+                                      int buflen, int *offset,
+                                      bool demangle) {
   // buf is not optional, but offset is optional
   assert(buf != NULL, "sanity check");
 
-  if (Decoder::decode(addr, buf, buflen, offset)) {
+  if (Decoder::decode(addr, buf, buflen, offset, demangle)) {
     return true;
   }
   if (offset != NULL)  *offset  = -1;
@@ -1732,7 +1733,7 @@
   st->cr();
 }
 
-void os::pd_print_cpu_info(outputStream* st) {
+void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
   // Nothing to do for now.
 }
 
--- a/src/os_cpu/linux_ppc/vm/os_linux_ppc.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os_cpu/linux_ppc/vm/os_linux_ppc.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -10,7 +10,7 @@
  * This code is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file hat
+ * version 2 for more details (a copy is included in the LICENSE file that
  * accompanied this code).
  *
  * You should have received a copy of the GNU General Public License version
--- a/src/os_cpu/linux_x86/vm/os_linux_x86.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os_cpu/linux_x86/vm/os_linux_x86.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -619,53 +619,14 @@
 
 #ifdef AMD64
 size_t os::Linux::min_stack_allowed  = 64 * K;
-
-// amd64: pthread on amd64 is always in floating stack mode
-bool os::Linux::supports_variable_stack_size() {  return true; }
 #else
 size_t os::Linux::min_stack_allowed  =  (48 DEBUG_ONLY(+4))*K;
+#endif // AMD64
 
-#ifdef __GNUC__
-#define GET_GS() ({int gs; __asm__ volatile("movw %%gs, %w0":"=q"(gs)); gs&0xffff;})
-#endif
-
-// Test if pthread library can support variable thread stack size. LinuxThreads
-// in fixed stack mode allocates 2M fixed slot for each thread. LinuxThreads
-// in floating stack mode and NPTL support variable stack size.
+// Test if pthread library can support variable thread stack size.
 bool os::Linux::supports_variable_stack_size() {
-  if (os::Linux::is_NPTL()) {
-     // NPTL, yes
-     return true;
-
-  } else {
-    // Note: We can't control default stack size when creating a thread.
-    // If we use non-default stack size (pthread_attr_setstacksize), both
-    // floating stack and non-floating stack LinuxThreads will return the
-    // same value. This makes it impossible to implement this function by
-    // detecting thread stack size directly.
-    //
-    // An alternative approach is to check %gs. Fixed-stack LinuxThreads
-    // do not use %gs, so its value is 0. Floating-stack LinuxThreads use
-    // %gs (either as LDT selector or GDT selector, depending on kernel)
-    // to access thread specific data.
-    //
-    // Note that %gs is a reserved glibc register since early 2001, so
-    // applications are not allowed to change its value (Ulrich Drepper from
-    // Redhat confirmed that all known offenders have been modified to use
-    // either %fs or TSD). In the worst case scenario, when VM is embedded in
-    // a native application that plays with %gs, we might see non-zero %gs
-    // even LinuxThreads is running in fixed stack mode. As the result, we'll
-    // return true and skip _thread_safety_check(), so we may not be able to
-    // detect stack-heap collisions. But otherwise it's harmless.
-    //
-#ifdef __GNUC__
-    return (GET_GS() != 0);
-#else
-    return false;
-#endif
-  }
+  return true;
 }
-#endif // AMD64
 
 // return default stack size for thr_type
 size_t os::Linux::default_stack_size(os::ThreadType thr_type) {
--- a/src/os_cpu/linux_x86/vm/threadLS_linux_x86.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/os_cpu/linux_x86/vm/threadLS_linux_x86.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -40,8 +40,7 @@
 // actual memory pages are committed on demand.
 //
 // If an application creates and destroys a lot of threads, usually the
-// stack space freed by a thread will soon get reused by new thread
-// (this is especially true in NPTL or LinuxThreads in fixed-stack mode).
+// stack space freed by a thread will soon get reused by a new thread.
 // No memory page in _sp_map is wasted.
 //
 // However, it's still possible that we might end up populating &
--- a/src/share/tools/ProjectCreator/WinGammaPlatformVC10.java	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/tools/ProjectCreator/WinGammaPlatformVC10.java	Thu Jul 02 16:09:16 2015 -0700
@@ -363,9 +363,6 @@
 
         // Set /On option
         addAttr(rv, "Optimization", opt);
-        // Set /FR option.
-        addAttr(rv, "BrowseInformation", "true");
-        addAttr(rv, "BrowseInformationFile", "$(IntDir)");
         // Set /MD option.
         addAttr(rv, "RuntimeLibrary", "MultiThreadedDLL");
         // Set /Oy- option
--- a/src/share/vm/c1/c1_LIRGenerator.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/c1/c1_LIRGenerator.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1619,6 +1619,9 @@
   LIR_Opr dirty = LIR_OprFact::intConst(CardTableModRefBS::dirty_card_val());
   if (UseCondCardMark) {
     LIR_Opr cur_value = new_register(T_INT);
+    if (UseConcMarkSweepGC) {
+      __ membar_storeload();
+    }
     __ move(card_addr, cur_value);
 
     LabelObj* L_already_dirty = new LabelObj();
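
The membar_storeload() added above addresses a CMS-specific ordering requirement: with conditional card marking, the card load must not be reordered ahead of the preceding reference store, or a concurrently cleaned card could be observed as already dirty and never re-dirtied. A rough sketch of the intended sequence, in plain C++ with stand-in names (card_table, DIRTY_CARD) rather than the LIR API:

    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    // Stand-ins for the real card table; sizes and values are illustrative.
    static const uint8_t DIRTY_CARD = 0;
    static uint8_t card_table[1 << 20];

    // Conditional card mark: only write the card if it is not already dirty.
    // With a concurrent collector (the CMS case above), a StoreLoad fence keeps
    // the preceding reference store ordered before the card read.
    static void post_reference_write_barrier(size_t card_index,
                                             bool use_cond_card_mark,
                                             bool concurrent_collector) {
      if (use_cond_card_mark) {
        if (concurrent_collector) {
          std::atomic_thread_fence(std::memory_order_seq_cst);  // StoreLoad
        }
        if (card_table[card_index] == DIRTY_CARD) {
          return;  // already dirty; skip the store to reduce cache-line traffic
        }
      }
      card_table[card_index] = DIRTY_CARD;
    }

    int main() {
      post_reference_write_barrier(42, /*use_cond_card_mark=*/true,
                                   /*concurrent_collector=*/true);
      return 0;
    }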
--- a/src/share/vm/c1/c1_globals.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/c1/c1_globals.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -25,4 +25,4 @@
 #include "precompiled.hpp"
 #include "c1/c1_globals.hpp"
 
-C1_FLAGS(MATERIALIZE_DEVELOPER_FLAG, MATERIALIZE_PD_DEVELOPER_FLAG, MATERIALIZE_PRODUCT_FLAG, MATERIALIZE_PD_PRODUCT_FLAG, MATERIALIZE_DIAGNOSTIC_FLAG, MATERIALIZE_NOTPRODUCT_FLAG)
+C1_FLAGS(MATERIALIZE_DEVELOPER_FLAG, MATERIALIZE_PD_DEVELOPER_FLAG, MATERIALIZE_PRODUCT_FLAG, MATERIALIZE_PD_PRODUCT_FLAG, MATERIALIZE_DIAGNOSTIC_FLAG, MATERIALIZE_NOTPRODUCT_FLAG, IGNORE_RANGE, IGNORE_CONSTRAINT)
--- a/src/share/vm/c1/c1_globals.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/c1/c1_globals.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -60,7 +60,7 @@
 //
 // Defines all global flags used by the client compiler.
 //
-#define C1_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct) \
+#define C1_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct, range, constraint) \
                                                                             \
   /* Printing */                                                            \
   notproduct(bool, PrintC1Statistics, false,                                \
@@ -148,6 +148,7 @@
                                                                             \
   product(intx, ValueMapInitialSize, 11,                                    \
           "Initial size of a value map")                                    \
+          range(1, NOT_LP64(1*K) LP64_ONLY(32*K))                           \
                                                                             \
   product(intx, ValueMapMaxLoopSize, 8,                                     \
           "maximum size of a loop optimized by global value numbering")     \
@@ -191,6 +192,7 @@
                                                                             \
   develop(intx, NestedInliningSizeRatio, 90,                                \
           "Percentage of prev. allowed inline size in recursive inlining")  \
+          range(0, 100)                                                     \
                                                                             \
   notproduct(bool, PrintIRWithLIR, false,                                   \
           "Print IR instructions with generated LIR")                       \
@@ -338,10 +340,15 @@
   diagnostic(bool, C1PatchInvokeDynamic, true,                              \
              "Patch invokedynamic appendix not known at compile time")      \
                                                                             \
-
-
 // Read default values for c1 globals
 
-C1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_NOTPRODUCT_FLAG)
+C1_FLAGS(DECLARE_DEVELOPER_FLAG, \
+         DECLARE_PD_DEVELOPER_FLAG, \
+         DECLARE_PRODUCT_FLAG, \
+         DECLARE_PD_PRODUCT_FLAG, \
+         DECLARE_DIAGNOSTIC_FLAG, \
+         DECLARE_NOTPRODUCT_FLAG, \
+         IGNORE_RANGE, \
+         IGNORE_CONSTRAINT)
 
 #endif // SHARE_VM_C1_C1_GLOBALS_HPP
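
The extra range/constraint parameters added to C1_FLAGS (and to RUNTIME_OS_FLAGS above) follow the usual X-macro scheme: a flag definition may be followed by an optional range(...) clause, and expansions that do not care about ranges pass a no-op macro such as IGNORE_RANGE. A stripped-down illustration of that scheme, reusing flag names from the hunk above but otherwise made up for the sketch:

    #include <cstdio>

    // A miniature flag table in the same X-macro style as C1_FLAGS.
    #define MY_FLAGS(product, range)                               \
      product(int, ValueMapInitialSize, 11, "Initial size")        \
        range(1, 32 * 1024)                                        \
      product(int, NestedInliningSizeRatio, 90, "Percentage")      \
        range(0, 100)

    // Expansion 1: materialize the globals, ignoring range information.
    #define DECLARE_FLAG(type, name, value, doc) type name = value;
    #define IGNORE_RANGE(min, max)
    MY_FLAGS(DECLARE_FLAG, IGNORE_RANGE)

    // Expansion 2: print the permitted range of each flag.
    #define PRINT_FLAG(type, name, value, doc) printf("%-26s default=%-4d ", #name, (int)(value));
    #define PRINT_RANGE(min, max) printf("range=[%d, %d]\n", (int)(min), (int)(max));

    int main() {
      MY_FLAGS(PRINT_FLAG, PRINT_RANGE)
      return 0;
    }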
--- a/src/share/vm/classfile/classFileParser.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/classfile/classFileParser.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -949,8 +949,7 @@
         assert(runtime_visible_annotations != NULL, "null visible annotations");
         parse_annotations(runtime_visible_annotations,
                           runtime_visible_annotations_length,
-                          parsed_annotations,
-                          CHECK);
+                          parsed_annotations);
         cfs->skip_u1(runtime_visible_annotations_length, CHECK);
       } else if (attribute_name == vmSymbols::tag_runtime_invisible_annotations()) {
         if (runtime_invisible_annotations_exists) {
@@ -1643,7 +1642,6 @@
     index = skip_annotation(buffer, limit, index);
     break;
   default:
-    assert(false, "annotation tag");
     return limit;  //  bad tag byte
   }
   return index;
@@ -1651,8 +1649,7 @@
 
 // Sift through annotations, looking for those significant to the VM:
 void ClassFileParser::parse_annotations(u1* buffer, int limit,
-                                        ClassFileParser::AnnotationCollector* coll,
-                                        TRAPS) {
+                                        ClassFileParser::AnnotationCollector* coll) {
   // annotations := do(nann:u2) {annotation}
   int index = 0;
   if ((index += 2) >= limit)  return;  // read nann
@@ -2286,8 +2283,7 @@
         runtime_visible_annotations = cfs->get_u1_buffer();
         assert(runtime_visible_annotations != NULL, "null visible annotations");
         parse_annotations(runtime_visible_annotations,
-            runtime_visible_annotations_length, &parsed_annotations,
-            CHECK_(nullHandle));
+            runtime_visible_annotations_length, &parsed_annotations);
         cfs->skip_u1(runtime_visible_annotations_length, CHECK_(nullHandle));
       } else if (method_attribute_name == vmSymbols::tag_runtime_invisible_annotations()) {
         if (runtime_invisible_annotations_exists) {
@@ -2951,8 +2947,7 @@
         assert(runtime_visible_annotations != NULL, "null visible annotations");
         parse_annotations(runtime_visible_annotations,
                           runtime_visible_annotations_length,
-                          parsed_annotations,
-                          CHECK);
+                          parsed_annotations);
         cfs->skip_u1(runtime_visible_annotations_length, CHECK);
       } else if (tag == vmSymbols::tag_runtime_invisible_annotations()) {
         if (runtime_invisible_annotations_exists) {
--- a/src/share/vm/classfile/classFileParser.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/classfile/classFileParser.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -295,8 +295,7 @@
   int skip_annotation_value(u1* buffer, int limit, int index);
   void parse_annotations(u1* buffer, int limit,
                          /* Results (currently, only one result is supported): */
-                         AnnotationCollector* result,
-                         TRAPS);
+                         AnnotationCollector* result);
 
   // Final setup
   unsigned int compute_oop_map_count(instanceKlassHandle super,
--- a/src/share/vm/classfile/compactHashtable.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/classfile/compactHashtable.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -25,6 +25,7 @@
 #include "precompiled.hpp"
 #include "classfile/javaClasses.hpp"
 #include "memory/metaspaceShared.hpp"
+#include "prims/jvm.h"
 #include "utilities/numberSeq.hpp"
 #include <sys/stat.h>
 
@@ -32,11 +33,11 @@
 //
 // The compact hash table writer implementations
 //
-CompactHashtableWriter::CompactHashtableWriter(const char* table_name,
+CompactHashtableWriter::CompactHashtableWriter(int table_type,
                                                int num_entries,
                                                CompactHashtableStats* stats) {
   assert(DumpSharedSpaces, "dump-time only");
-  _table_name = table_name;
+  _type = table_type;
   _num_entries = num_entries;
   _num_buckets = number_of_buckets(_num_entries);
   _buckets = NEW_C_HEAP_ARRAY(Entry*, _num_buckets, mtSymbol);
@@ -99,7 +100,7 @@
                                           NumberSeq* summary) {
   int index;
   juint* compact_table = p;
-  // Find the start of the buckets, skip the compact_bucket_infos table
+  // Compute the start of the buckets, including the compact_bucket_infos table
   // and the table end offset.
   juint offset = _num_buckets + 1;
   *first_bucket = compact_table + offset;
@@ -130,10 +131,17 @@
 // Write the compact table's entries
 juint* CompactHashtableWriter::dump_buckets(juint* compact_table, juint* p,
                                             NumberSeq* summary) {
-  uintx base_address = uintx(MetaspaceShared::shared_rs()->base());
-  uintx max_delta    = uintx(MetaspaceShared::shared_rs()->size());
-  assert(max_delta <= 0x7fffffff, "range check");
+  uintx base_address = 0;
+  uintx max_delta = 0;
   int num_compact_buckets = 0;
+  if (_type == CompactHashtable<Symbol*, char>::_symbol_table) {
+    base_address = uintx(MetaspaceShared::shared_rs()->base());
+    max_delta    = uintx(MetaspaceShared::shared_rs()->size());
+    assert(max_delta <= 0x7fffffff, "range check");
+  } else {
+    assert((_type == CompactHashtable<oop, char>::_string_table), "unknown table");
+    assert(UseCompressedOops, "UseCompressedOops is required");
+  }
 
   assert(p != NULL, "sanity");
   for (int index = 0; index < _num_buckets; index++) {
@@ -148,12 +156,16 @@
     for (Entry* tent = _buckets[index]; tent;
          tent = tent->next()) {
       if (bucket_type == REGULAR_BUCKET_TYPE) {
-        *p++ = juint(tent->hash()); // write symbol hash
+        *p++ = juint(tent->hash()); // write entry hash
       }
-      uintx deltax = uintx(tent->value()) - base_address;
-      assert(deltax < max_delta, "range check");
-      juint delta = juint(deltax);
-      *p++ = delta; // write symbol offset
+      if (_type == CompactHashtable<Symbol*, char>::_symbol_table) {
+        uintx deltax = uintx(tent->value()) - base_address;
+        assert(deltax < max_delta, "range check");
+        juint delta = juint(deltax);
+        *p++ = delta; // write entry offset
+      } else {
+        *p++ = oopDesc::encode_heap_oop(tent->string());
+      }
       count ++;
     }
     assert(count == _bucket_sizes[index], "sanity");
@@ -174,6 +186,10 @@
 
   uintx base_address = uintx(MetaspaceShared::shared_rs()->base());
 
+  // Now write the following at the beginning of the table:
+  //      base_address (uintx)
+  //      num_entries  (juint)
+  //      num_buckets  (juint)
   *p++ = high(base_address);
   *p++ = low (base_address); // base address
   *p++ = _num_entries;  // number of entries in the table
@@ -191,7 +207,8 @@
     if (_num_entries > 0) {
       avg_cost = double(_required_bytes)/double(_num_entries);
     }
-    tty->print_cr("Shared %s table stats -------- base: " PTR_FORMAT, _table_name, (intptr_t)base_address);
+    tty->print_cr("Shared %s table stats -------- base: " PTR_FORMAT,
+                  table_name(), (intptr_t)base_address);
     tty->print_cr("Number of entries       : %9d", _num_entries);
     tty->print_cr("Total bytes used        : %9d", (int)((*top) - old_top));
     tty->print_cr("Average bytes per entry : %9.3f", avg_cost);
@@ -202,12 +219,24 @@
   }
 }
 
+const char* CompactHashtableWriter::table_name() {
+  switch (_type) {
+  case CompactHashtable<Symbol*, char>::_symbol_table: return "symbol";
+  case CompactHashtable<oop, char>::_string_table: return "string";
+  default:
+    ;
+  }
+  return "unknown";
+}
+
 /////////////////////////////////////////////////////////////
 //
 // The CompactHashtable implementation
 //
-template <class T, class N> const char* CompactHashtable<T, N>::init(const char* buffer) {
+template <class T, class N> const char* CompactHashtable<T, N>::init(
+                           CompactHashtableType type, const char* buffer) {
   assert(!DumpSharedSpaces, "run-time only");
+  _type = type;
   juint*p = (juint*)buffer;
   juint upper = *p++;
   juint lower = *p++;
@@ -245,8 +274,34 @@
   }
 }
 
+template <class T, class N> void CompactHashtable<T, N>::oops_do(OopClosure* f) {
+  assert(!DumpSharedSpaces, "run-time only");
+  assert(_type == _string_table || _bucket_count == 0, "sanity");
+  for (juint i = 0; i < _bucket_count; i ++) {
+    juint bucket_info = _buckets[i];
+    juint bucket_offset = BUCKET_OFFSET(bucket_info);
+    int   bucket_type = BUCKET_TYPE(bucket_info);
+    juint* bucket = _buckets + bucket_offset;
+    juint* bucket_end = _buckets;
+
+    narrowOop o;
+    if (bucket_type == COMPACT_BUCKET_TYPE) {
+      o = (narrowOop)bucket[0];
+      f->do_oop(&o);
+    } else {
+      bucket_end += BUCKET_OFFSET(_buckets[i + 1]);
+      while (bucket < bucket_end) {
+        o = (narrowOop)bucket[1];
+        f->do_oop(&o);
+        bucket += 2;
+      }
+    }
+  }
+}
+
 // Explicitly instantiate these types
 template class CompactHashtable<Symbol*, char>;
+template class CompactHashtable<oop, char>;
 
 #ifndef O_BINARY       // if defined (Win32) use binary files.
 #define O_BINARY 0     // otherwise do nothing.
@@ -273,6 +328,8 @@
   _p = _base;
   _end = _base + st.st_size;
   _filename = filename;
+  _prefix_type = Unknown;
+  _line_no = 1;
 }
 
 HashtableTextDump::~HashtableTextDump() {
@@ -286,9 +343,11 @@
   vm_exit_during_initialization(err, msg);
 }
 
-void HashtableTextDump::corrupted(const char *p) {
-  char info[60];
-  sprintf(info, "corrupted at pos %d", (int)(p - _base));
+void HashtableTextDump::corrupted(const char *p, const char* msg) {
+  char info[100];
+  jio_snprintf(info, sizeof(info),
+               "%s. Corrupted at line %d (file pos %d)",
+               msg, _line_no, (int)(p - _base));
   quit(info, _filename);
 }
 
@@ -298,8 +357,9 @@
   } else if (_p[0] == '\n') {
     _p += 1;
   } else {
-    corrupted(_p);
+    corrupted(_p, "Unexpected character");
   }
+  _line_no ++;
   return true;
 }
 
@@ -328,26 +388,60 @@
   skip_newline();
 }
 
+void HashtableTextDump::scan_prefix_type() {
+  _p ++;
+  if (strncmp(_p, "SECTION: String", 15) == 0) {
+    _p += 15;
+    _prefix_type = StringPrefix;
+  } else if (strncmp(_p, "SECTION: Symbol", 15) == 0) {
+    _p += 15;
+    _prefix_type = SymbolPrefix;
+  } else {
+    _prefix_type = Unknown;
+  }
+  skip_newline();
+}
 
-int HashtableTextDump::scan_prefix() {
+int HashtableTextDump::scan_prefix(int* utf8_length) {
+  if (*_p == '@') {
+    scan_prefix_type();
+  }
+
+  switch (_prefix_type) {
+  case SymbolPrefix:
+    *utf8_length = scan_symbol_prefix(); break;
+  case StringPrefix:
+    *utf8_length = scan_string_prefix(); break;
+  default:
+    tty->print_cr("Shared input data type: Unknown.");
+    corrupted(_p, "Unknown data type");
+  }
+
+  return _prefix_type;
+}
+
+int HashtableTextDump::scan_string_prefix() {
   // Expect /[0-9]+: /
-  int utf8_length = get_num(':');
+  int utf8_length;
+  get_num(':', &utf8_length);
   if (*_p != ' ') {
-    corrupted(_p);
+    corrupted(_p, "Wrong prefix format for string");
   }
   _p++;
   return utf8_length;
 }
 
-int HashtableTextDump::scan_prefix2() {
+int HashtableTextDump::scan_symbol_prefix() {
   // Expect /[0-9]+ (-|)[0-9]+: /
-  int utf8_length = get_num(' ');
-  if (*_p == '-') {
-    _p++;
+  int utf8_length;
+  get_num(' ', &utf8_length);
+  if (*_p == '-') {
+    _p++;
   }
-  (void)get_num(':');
+  int ref_num;
+  (void)get_num(':', &ref_num);
   if (*_p != ' ') {
-    corrupted(_p);
+    corrupted(_p, "Wrong prefix format for symbol");
   }
   _p++;
   return utf8_length;
@@ -408,7 +502,7 @@
       case 'r':  *to++ = '\r'; break;
       case '\\': *to++ = '\\'; break;
       default:
-        ShouldNotReachHere();
+        corrupted(_p, "Unsupported character");
       }
     }
   }
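
The scanner changes above give the shared-archive dump file explicit sections: an '@SECTION: Symbol' or '@SECTION: String' line selects the prefix format, after which each symbol line begins with '<utf8-length> <refcount>: ' and each string line with '<utf8-length>: '. A hedged sketch of a reader for that shape (plain C++ and example data; escaping and other details beyond what the hunks show are assumptions):

    #include <cstdio>
    #include <sstream>
    #include <string>

    int main() {
      // Example dump text following the prefixes described above.
      const char* dump =
          "@SECTION: Symbol\n"
          "9 -1: Ljava/io;\n"
          "@SECTION: String\n"
          "5: hello\n";

      std::istringstream in(dump);
      std::string line, section;
      while (std::getline(in, line)) {
        if (line.compare(0, 10, "@SECTION: ") == 0) {
          section = line.substr(10);        // switch prefix type
          continue;
        }
        int utf8_len = 0, refcount = 0;
        if (section == "Symbol") {
          sscanf(line.c_str(), "%d %d:", &utf8_len, &refcount);
        } else {
          sscanf(line.c_str(), "%d:", &utf8_len);
        }
        printf("%-6s entry, utf8 length %d\n", section.c_str(), utf8_len);
      }
      return 0;
    }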
--- a/src/share/vm/classfile/compactHashtable.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/classfile/compactHashtable.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -28,6 +28,7 @@
 #include "classfile/stringTable.hpp"
 #include "classfile/symbolTable.hpp"
 #include "memory/allocation.inline.hpp"
+#include "oops/oop.inline.hpp"
 #include "oops/symbol.hpp"
 #include "services/diagnosticCommand.hpp"
 #include "utilities/hashtable.hpp"
@@ -49,7 +50,7 @@
 // the compact table to the shared archive.
 //
 // At dump time, the CompactHashtableWriter obtains all entries from the
-// symbol table and adds them to a new temporary hash table. The hash
+// symbol/string table and adds them to a new temporary hash table. The hash
 // table size (number of buckets) is calculated using
 // '(num_entries + bucket_size - 1) / bucket_size'. The default bucket
 // size is 4 and can be changed by -XX:SharedSymbolTableBucketSize option.
@@ -57,14 +58,14 @@
 // faster lookup. It also has relatively small number of empty buckets and
 // good distribution of the entries.
 //
-// We use a simple hash function (symbol_hash % num_bucket) for the table.
+// We use a simple hash function (hash % num_bucket) for the table.
 // The new table is compacted when written out. Please see comments
 // above the CompactHashtable class for the table layout detail. The bucket
 // offsets are written to the archive as part of the compact table. The
 // bucket offset is encoded in the low 30-bit (0-29) and the bucket type
 // (regular or compact) are encoded in bit[31, 30]. For buckets with more
-// than one entry, both symbol hash and symbol offset are written to the
-// table. For buckets with only one entry, only the symbol offset is written
+// than one entry, both hash and entry offset are written to the
+// table. For buckets with only one entry, only the entry offset is written
 // to the table and the buckets are tagged as compact in their type bits.
 // Buckets without entry are skipped from the table. Their offsets are
 // still written out for faster lookup.
@@ -78,6 +79,7 @@
 
   public:
     Entry(unsigned int hash, Symbol *symbol) : _next(NULL), _hash(hash), _literal(symbol) {}
+    Entry(unsigned int hash, oop string)     : _next(NULL), _hash(hash), _literal(string) {}
 
     void *value() {
       return _literal;
@@ -85,6 +87,9 @@
     Symbol *symbol() {
       return (Symbol*)_literal;
     }
+    oop string() {
+      return (oop)_literal;
+    }
     unsigned int hash() {
       return _hash;
     }
@@ -95,7 +100,7 @@
 private:
   static int number_of_buckets(int num_entries);
 
-  const char* _table_name;
+  int _type;
   int _num_entries;
   int _num_buckets;
   juint* _bucket_sizes;
@@ -105,7 +110,7 @@
 
 public:
   // This is called at dump-time only
-  CompactHashtableWriter(const char* table_name, int num_entries, CompactHashtableStats* stats);
+  CompactHashtableWriter(int table_type, int num_entries, CompactHashtableStats* stats);
   ~CompactHashtableWriter();
 
   int get_required_bytes() {
@@ -116,6 +121,10 @@
     add(hash, new Entry(hash, symbol));
   }
 
+  void add(unsigned int hash, oop string) {
+    add(hash, new Entry(hash, string));
+  }
+
 private:
   void add(unsigned int hash, Entry* entry);
   juint* dump_table(juint* p, juint** first_bucket, NumberSeq* summary);
@@ -123,6 +132,7 @@
 
 public:
   void dump(char** top, char* end);
+  const char* table_name();
 };
 
 #define REGULAR_BUCKET_TYPE       0
@@ -136,23 +146,23 @@
 
 /////////////////////////////////////////////////////////////////////////////
 //
-// CompactHashtable is used to stored the CDS archive's symbol table. Used
+// CompactHashtable is used to store the CDS archive's symbol/string table. Used
 // at runtime only to access the compact table from the archive.
 //
 // Because these tables are read-only (no entries can be added/deleted) at run-time
 // and tend to have large number of entries, we try to minimize the footprint
 // cost per entry.
 //
-// Layout of compact symbol table in the shared archive:
+// Layout of compact table in the shared archive:
 //
 //   uintx base_address;
-//   juint num_symbols;
+//   juint num_entries;
 //   juint num_buckets;
 //   juint bucket_infos[num_buckets+1]; // bit[31,30]: type; bit[29-0]: offset
 //   juint table[]
 //
 // -----------------------------------
-// | base_address  | num_symbols     |
+// | base_address  | num_entries     |
 // |---------------------------------|
 // | num_buckets   | bucket_info0    |
 // |---------------------------------|
@@ -177,9 +187,13 @@
 // compact buckets have '01' in their highest 2-bit, and regular buckets have
 // '00' in their highest 2-bit.
 //
-// For normal buckets, each symbol's entry is 8 bytes in the table[]:
-//   juint hash;    /* symbol hash */
-//   juint offset;  /* Symbol* sym = (Symbol*)(base_address + offset) */
+// For normal buckets, each entry is 8 bytes in the table[]:
+//   juint hash;    /* symbol/string hash */
+//   union {
+//     juint offset;  /* Symbol* sym = (Symbol*)(base_address + offset) */
+//     narrowOop str; /* String narrowOop encoding */
+//   }
+//
 //
 // For compact buckets, each entry has only the 4-byte 'offset' in the table[].
 //
@@ -189,19 +203,41 @@
 //
 template <class T, class N> class CompactHashtable VALUE_OBJ_CLASS_SPEC {
   friend class VMStructs;
+
+ public:
+  enum CompactHashtableType {
+    _symbol_table = 0,
+    _string_table = 1
+  };
+
+private:
+  CompactHashtableType _type;
   uintx  _base_address;
   juint  _entry_count;
   juint  _bucket_count;
   juint  _table_end_offset;
   juint* _buckets;
 
-  inline bool equals(T entry, const char* name, int len) {
-    if (entry->equals(name, len)) {
-      assert(entry->refcount() == -1, "must be shared");
-      return true;
-    } else {
-      return false;
+  inline Symbol* lookup_entry(CompactHashtable<Symbol*, char>* const t,
+                              juint* addr, const char* name, int len) {
+    Symbol* sym = (Symbol*)((void*)(_base_address + *addr));
+    if (sym->equals(name, len)) {
+      assert(sym->refcount() == -1, "must be shared");
+      return sym;
     }
+
+    return NULL;
+  }
+
+  inline oop lookup_entry(CompactHashtable<oop, char>* const t,
+                        juint* addr, const char* name, int len) {
+    narrowOop obj = (narrowOop)(*addr);
+    oop string = oopDesc::decode_heap_oop(obj);
+    if (java_lang_String::equals(string, (jchar*)name, len)) {
+      return string;
+    }
+
+    return NULL;
   }
 
 public:
@@ -211,7 +247,14 @@
     _table_end_offset = 0;
     _buckets = 0;
   }
-  const char* init(const char *buffer);
+  const char* init(CompactHashtableType type, const char *buffer);
+
+  void reset() {
+    _entry_count = 0;
+    _bucket_count = 0;
+    _table_end_offset = 0;
+    _buckets = 0;
+  }
 
   // Lookup an entry from the compact table
   inline T lookup(const N* name, unsigned int hash, int len) {
@@ -225,23 +268,22 @@
       juint* bucket_end = _buckets;
 
       if (bucket_type == COMPACT_BUCKET_TYPE) {
-        // the compact bucket has one entry with symbol offset only
-        T entry = (T)((void*)(_base_address + bucket[0]));
-        if (equals(entry, name, len)) {
-          return entry;
+        // the compact bucket has one entry with entry offset only
+        T res = lookup_entry(this, &bucket[0], name, len);
+        if (res != NULL) {
+          return res;
         }
       } else {
         // This is a regular bucket, which has more than one
-        // entries. Each entry is a pair of symbol (hash, offset).
+        // entries. Each entry is a (hash, offset) pair.
         // Seek until the end of the bucket.
         bucket_end += BUCKET_OFFSET(_buckets[index + 1]);
         while (bucket < bucket_end) {
           unsigned int h = (unsigned int)(bucket[0]);
           if (h == hash) {
-            juint offset = bucket[1];
-            T entry = (T)((void*)(_base_address + offset));
-            if (equals(entry, name, len)) {
-              return entry;
+            T res = lookup_entry(this, &bucket[1], name, len);
+            if (res != NULL) {
+              return res;
             }
           }
           bucket += 2;
@@ -253,12 +295,15 @@
 
   // iterate over symbols
   void symbols_do(SymbolClosure *cl);
+
+  // iterate over strings
+  void oops_do(OopClosure* f);
 };
 
 ////////////////////////////////////////////////////////////////////////
 //
 // Read/Write the contents of a hashtable textual dump (created by
-// SymbolTable::dump).
+// SymbolTable::dump and StringTable::dump).
 // Because the dump file may be big (hundreds of MB in extreme cases),
 // we use mmap for fast access when reading it.
 //
@@ -269,21 +314,29 @@
   const char* _end;
   const char* _filename;
   size_t      _size;
+  int         _prefix_type;
+  int         _line_no;
 public:
   HashtableTextDump(const char* filename);
   ~HashtableTextDump();
 
+  enum {
+    SymbolPrefix = 1 << 0,
+    StringPrefix = 1 << 1,
+    Unknown = 1 << 2
+  };
+
   void quit(const char* err, const char* msg);
 
   inline int remain() {
     return (int)(_end - _p);
   }
 
-  void corrupted(const char *p);
+  void corrupted(const char *p, const char *msg);
 
   inline void corrupted_if(bool cond) {
     if (cond) {
-      corrupted(_p);
+      corrupted(_p, NULL);
     }
   }
 
@@ -292,7 +345,7 @@
   void skip_past(char c);
   void check_version(const char* ver);
 
-  inline int get_num(char delim) {
+  inline bool get_num(char delim, int *utf8_length) {
     const char* p   = _p;
     const char* end = _end;
     int num = 0;
@@ -303,18 +356,22 @@
         num = num * 10 + (c - '0');
       } else if (c == delim) {
         _p = p;
-        return num;
+        *utf8_length = num;
+        return true;
       } else {
-        corrupted(p-1);
+        // Not [0-9], not 'delim'
+        return false;
       }
     }
-    corrupted(_end);
+    corrupted(_end, "Incorrect format");
     ShouldNotReachHere();
-    return 0;
+    return false;
   }
 
-  int scan_prefix();
-  int scan_prefix2();
+  void scan_prefix_type();
+  int scan_prefix(int* utf8_length);
+  int scan_string_prefix();
+  int scan_symbol_prefix();
 
   jchar unescape(const char* from, const char* end, int count);
   void get_utf8(char* utf8_buffer, int utf8_length);
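
As the updated layout comment describes, each bucket_info word keeps the bucket type in its top two bits and the bucket offset in the low 30 bits; compact buckets store a single 4-byte entry, regular buckets store (hash, offset) or (hash, narrowOop) pairs. A small self-contained sketch of that bit packing (constants are redefined locally to mirror the BUCKET_* macros, not included from HotSpot):

    #include <cassert>
    #include <cstdint>

    // Local mirror of the documented encoding: bits [31:30] = type, [29:0] = offset.
    enum BucketType { REGULAR_BUCKET = 0, COMPACT_BUCKET = 1 };

    static uint32_t make_bucket_info(uint32_t offset, uint32_t type) {
      assert(offset < (1u << 30));
      return (type << 30) | offset;
    }
    static uint32_t bucket_offset(uint32_t info) { return info & ((1u << 30) - 1); }
    static uint32_t bucket_type(uint32_t info)   { return info >> 30; }

    int main() {
      uint32_t info = make_bucket_info(12345, COMPACT_BUCKET);
      assert(bucket_offset(info) == 12345);
      assert(bucket_type(info) == COMPACT_BUCKET);
      return 0;
    }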
--- a/src/share/vm/classfile/javaClasses.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/classfile/javaClasses.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -118,6 +118,10 @@
     return hash_offset;
   }
 
+  static void set_value_raw(oop string, typeArrayOop buffer) {
+    assert(initialized, "Must be initialized");
+    string->obj_field_put_raw(value_offset, buffer);
+  }
   static void set_value(oop string, typeArrayOop buffer) {
     assert(initialized && (value_offset > 0), "Must be initialized");
     string->obj_field_put(value_offset, (oop)buffer);
@@ -210,6 +214,7 @@
   // Debugging
   static void print(oop java_string, outputStream* st);
   friend class JavaClasses;
+  friend class StringTable;
 };
 
 
--- a/src/share/vm/classfile/stringTable.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/classfile/stringTable.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -38,6 +38,7 @@
 #include "utilities/hashtable.inline.hpp"
 #include "utilities/macros.hpp"
 #if INCLUDE_ALL_GCS
+#include "gc/g1/g1CollectedHeap.hpp"
 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
 #include "gc/g1/g1StringDedup.hpp"
 #endif
@@ -87,19 +88,28 @@
 
 // --------------------------------------------------------------------------
 StringTable* StringTable::_the_table = NULL;
-
+bool StringTable::_ignore_shared_strings = false;
 bool StringTable::_needs_rehashing = false;
 
 volatile int StringTable::_parallel_claimed_idx = 0;
 
+CompactHashtable<oop, char> StringTable::_shared_table;
+
 // Pick hashing algorithm
 unsigned int StringTable::hash_string(const jchar* s, int len) {
   return use_alternate_hashcode() ? AltHashing::murmur3_32(seed(), s, len) :
                                     java_lang_String::hash_code(s, len);
 }
 
-oop StringTable::lookup(int index, jchar* name,
-                        int len, unsigned int hash) {
+oop StringTable::lookup_shared(jchar* name, int len) {
+  // java_lang_String::hash_code() was used to compute hash values in the shared table. Don't
+  // use the hash value from StringTable::hash_string() as it might use alternate hashcode.
+  return _shared_table.lookup((const char*)name,
+                              java_lang_String::hash_code(name, len), len);
+}
+
+oop StringTable::lookup_in_main_table(int index, jchar* name,
+                                int len, unsigned int hash) {
   int count = 0;
   for (HashtableEntry<oop, mtSymbol>* l = bucket(index); l != NULL; l = l->next()) {
     count++;
@@ -140,7 +150,8 @@
   // Since look-up was done lock-free, we need to check if another
   // thread beat us in the race to insert the symbol.
 
-  oop test = lookup(index, name, len, hashValue); // calls lookup(u1*, int)
+  // No need to look up the shared table here since the caller (intern()) already did
+  oop test = lookup_in_main_table(index, name, len, hashValue); // calls lookup(u1*, int)
   if (test != NULL) {
     // Entry already added
     return test;
@@ -172,9 +183,14 @@
 }
 
 oop StringTable::lookup(jchar* name, int len) {
+  oop string = lookup_shared(name, len);
+  if (string != NULL) {
+    return string;
+  }
+
   unsigned int hash = hash_string(name, len);
   int index = the_table()->hash_to_index(hash);
-  oop string = the_table()->lookup(index, name, len, hash);
+  string = the_table()->lookup_in_main_table(index, name, len, hash);
 
   ensure_string_alive(string);
 
@@ -184,9 +200,14 @@
 
 oop StringTable::intern(Handle string_or_null, jchar* name,
                         int len, TRAPS) {
+  oop found_string = lookup_shared(name, len);
+  if (found_string != NULL) {
+    return found_string;
+  }
+
   unsigned int hashValue = hash_string(name, len);
   int index = the_table()->hash_to_index(hashValue);
-  oop found_string = the_table()->lookup(index, name, len, hashValue);
+  found_string = the_table()->lookup_in_main_table(index, name, len, hashValue);
 
   // Found
   if (found_string != NULL) {
@@ -611,3 +632,131 @@
     return 0;
   }
 }
+
+// Sharing
+bool StringTable::copy_shared_string(GrowableArray<MemRegion> *string_space,
+                                     CompactHashtableWriter* ch_table) {
+#if INCLUDE_CDS && INCLUDE_ALL_GCS && defined(_LP64) && !defined(_WINDOWS)
+  assert(UseG1GC, "Only support G1 GC");
+  assert(UseCompressedOops && UseCompressedClassPointers,
+         "Only support UseCompressedOops and UseCompressedClassPointers enabled");
+
+  Thread* THREAD = Thread::current();
+  G1CollectedHeap::heap()->begin_archive_alloc_range();
+  for (int i = 0; i < the_table()->table_size(); ++i) {
+    HashtableEntry<oop, mtSymbol>* bucket = the_table()->bucket(i);
+    for ( ; bucket != NULL; bucket = bucket->next()) {
+      oop s = bucket->literal();
+      unsigned int hash = java_lang_String::hash_code(s);
+      if (hash == 0) {
+        continue;
+      }
+
+      // allocate the new 'value' array first
+      typeArrayOop v = java_lang_String::value(s);
+      int v_len = v->size();
+      typeArrayOop new_v;
+      if (G1CollectedHeap::heap()->is_archive_alloc_too_large(v_len)) {
+        continue; // skip the current String. The 'value' array is too large to handle
+      } else {
+        new_v = (typeArrayOop)G1CollectedHeap::heap()->archive_mem_allocate(v_len);
+        if (new_v == NULL) {
+          return false; // allocation failed
+        }
+      }
+      // now allocate the new String object
+      int s_len = s->size();
+      oop new_s = (oop)G1CollectedHeap::heap()->archive_mem_allocate(s_len);
+      if (new_s == NULL) {
+        return false;
+      }
+
+      s->identity_hash();
+      v->identity_hash();
+
+      // copy the objects' data
+      Copy::aligned_disjoint_words((HeapWord*)s, (HeapWord*)new_s, s_len);
+      Copy::aligned_disjoint_words((HeapWord*)v, (HeapWord*)new_v, v_len);
+
+      // adjust the pointer to the 'value' field in the new String oop. Also pre-compute and set the
+      // 'hash' field. That avoids writes to the shared strings at runtime by the deduplication process.
+      java_lang_String::set_value_raw(new_s, new_v);
+      if (java_lang_String::hash(new_s) == 0) {
+        java_lang_String::set_hash(new_s, hash);
+      }
+
+      // add to the compact table
+      ch_table->add(hash, new_s);
+    }
+  }
+
+  G1CollectedHeap::heap()->end_archive_alloc_range(string_space, os::vm_allocation_granularity());
+  assert(string_space->length() <= 2, "sanity");
+#endif
+  return true;
+}
+
+bool StringTable::copy_compact_table(char** top, char *end, GrowableArray<MemRegion> *string_space,
+                                     size_t* space_size) {
+#if INCLUDE_CDS && defined(_LP64) && !defined(_WINDOWS)
+  if (!(UseG1GC && UseCompressedOops && UseCompressedClassPointers)) {
+    if (PrintSharedSpaces) {
+      tty->print_cr("Shared strings are excluded from the archive as UseG1GC, "
+                    "UseCompressedOops and UseCompressedClassPointers are required.");
+    }
+    return true;
+  }
+
+  CompactHashtableWriter ch_table(CompactHashtable<oop, char>::_string_table,
+                                  the_table()->number_of_entries(),
+                                  &MetaspaceShared::stats()->string);
+
+  // Copy the interned strings into the "string space" within the java heap
+  if (!copy_shared_string(string_space, &ch_table)) {
+    return false;
+  }
+
+  for (int i = 0; i < string_space->length(); i++) {
+    *space_size += string_space->at(i).byte_size();
+  }
+
+  // Now dump the compact table
+  if (*top + ch_table.get_required_bytes() > end) {
+    // not enough space left
+    return false;
+  }
+  ch_table.dump(top, end);
+  *top = (char*)align_pointer_up(*top, sizeof(void*));
+
+#endif
+  return true;
+}
+
+void StringTable::shared_oops_do(OopClosure* f) {
+#if INCLUDE_CDS && defined(_LP64) && !defined(_WINDOWS)
+  _shared_table.oops_do(f);
+#endif
+}
+
+const char* StringTable::init_shared_table(FileMapInfo *mapinfo, char *buffer) {
+#if INCLUDE_CDS && defined(_LP64) && !defined(_WINDOWS)
+  if (mapinfo->space_capacity(MetaspaceShared::first_string) == 0) {
+    // no shared string data
+    return buffer;
+  }
+
+  // initialize the shared table
+  juint *p = (juint*)buffer;
+  const char* end = _shared_table.init(
+          CompactHashtable<oop, char>::_string_table, (char*)p);
+  const char* aligned_end = (const char*)align_pointer_up(end, sizeof(void*));
+
+  if (_ignore_shared_strings) {
+    _shared_table.reset();
+  }
+
+  return aligned_end;
+#endif
+
+  return buffer;
+}
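
With the shared table in place, lookup() and intern() consult the read-only archived table first and only fall back to the regular StringTable (which may be using the alternate hash) on a miss. A minimal sketch of that two-level lookup, with std::unordered_set standing in for both tables:

    #include <cstdio>
    #include <string>
    #include <unordered_set>

    // Stand-ins: 'shared' is immutable (built at dump time), 'live' is mutable.
    static const std::unordered_set<std::string> shared = { "java/lang/Object", "main" };
    static std::unordered_set<std::string> live;

    // Mirrors the ordering in StringTable::intern(): shared table first, then the
    // regular table, inserting only on a complete miss.
    static const std::string& intern(const std::string& s) {
      auto it = shared.find(s);
      if (it != shared.end()) return *it;   // served from the archived table
      return *live.insert(s).first;         // found or added in the live table
    }

    int main() {
      printf("%s\n", intern("main").c_str());     // shared hit
      printf("%s\n", intern("scratch").c_str());  // runtime insert
      return 0;
    }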
--- a/src/share/vm/classfile/stringTable.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/classfile/stringTable.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -28,6 +28,10 @@
 #include "memory/allocation.inline.hpp"
 #include "utilities/hashtable.hpp"
 
+template <class T, class N> class CompactHashtable;
+class CompactHashtableWriter;
+class FileMapInfo;
+
 class StringTable : public RehashableHashtable<oop, mtSymbol> {
   friend class VMStructs;
   friend class Symbol;
@@ -36,6 +40,10 @@
   // The string table
   static StringTable* _the_table;
 
+  // Shared string table
+  static CompactHashtable<oop, char> _shared_table;
+  static bool _ignore_shared_strings;
+
   // Set if one bucket is out of balance due to hash algorithm deficiency
   static bool _needs_rehashing;
 
@@ -46,7 +54,8 @@
   oop basic_add(int index, Handle string_or_null, jchar* name, int len,
                 unsigned int hashValue, TRAPS);
 
-  oop lookup(int index, jchar* chars, int length, unsigned int hashValue);
+  oop lookup_in_main_table(int index, jchar* chars, int length, unsigned int hashValue);
+  static oop lookup_shared(jchar* name, int len);
 
   // Apply the given oop closure to the entries in the buckets
   // in the range [start_idx, end_idx).
@@ -141,12 +150,14 @@
   static int verify_and_compare_entries();
 
   // Sharing
-  static void copy_buckets(char** top, char*end) {
-    the_table()->Hashtable<oop, mtSymbol>::copy_buckets(top, end);
-  }
-  static void copy_table(char** top, char*end) {
-    the_table()->Hashtable<oop, mtSymbol>::copy_table(top, end);
-  }
+  static void ignore_shared_strings(bool v) { _ignore_shared_strings = v; }
+  static bool shared_string_ignored()       { return _ignore_shared_strings; }
+  static void shared_oops_do(OopClosure* f);
+  static bool copy_shared_string(GrowableArray<MemRegion> *string_space,
+                                 CompactHashtableWriter* ch_table);
+  static bool copy_compact_table(char** top, char* end, GrowableArray<MemRegion> *string_space,
+                                 size_t* space_size);
+  static const char* init_shared_table(FileMapInfo *mapinfo, char* buffer);
   static void reverse() {
     the_table()->Hashtable<oop, mtSymbol>::reverse();
   }
--- a/src/share/vm/classfile/symbolTable.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/classfile/symbolTable.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -539,7 +539,8 @@
 
 bool SymbolTable::copy_compact_table(char** top, char*end) {
 #if INCLUDE_CDS
-  CompactHashtableWriter ch_table("symbol", the_table()->number_of_entries(),
+  CompactHashtableWriter ch_table(CompactHashtable<Symbol*, char>::_symbol_table,
+                                  the_table()->number_of_entries(),
                                   &MetaspaceShared::stats()->symbol);
   if (*top + ch_table.get_required_bytes() > end) {
     // not enough space left
@@ -556,7 +557,6 @@
     }
   }
 
-  char* old_top = *top;
   ch_table.dump(top, end);
 
   *top = (char*)align_pointer_up(*top, sizeof(void*));
@@ -565,7 +565,8 @@
 }
 
 const char* SymbolTable::init_shared_table(const char* buffer) {
-  const char* end = _shared_table.init(buffer);
+  const char* end = _shared_table.init(
+          CompactHashtable<Symbol*, char>::_symbol_table, buffer);
   return (const char*)align_pointer_up(end, sizeof(void*));
 }
 
--- a/src/share/vm/classfile/vmSymbols.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/classfile/vmSymbols.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -846,6 +846,12 @@
    do_name(     implCompressMB_name,                               "implCompressMultiBlock")                            \
    do_signature(implCompressMB_signature,                          "([BII)I")                                           \
                                                                                                                         \
+  /* support for com.sun.crypto.provider.GHASH */                                                                       \
+  do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH")                                              \
+  do_intrinsic(_ghash_processBlocks, com_sun_crypto_provider_ghash, processBlocks_name, ghash_processBlocks_signature, F_S) \
+   do_name(processBlocks_name, "processBlocks")                                                                         \
+   do_signature(ghash_processBlocks_signature, "([BII[J[J)V")                                                           \
+                                                                                                                        \
   /* support for java.util.zip */                                                                                       \
   do_class(java_util_zip_CRC32,           "java/util/zip/CRC32")                                                        \
   do_intrinsic(_updateCRC32,               java_util_zip_CRC32,   update_name, int2_int_signature,               F_SN)  \
--- a/src/share/vm/code/codeCache.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/code/codeCache.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -190,7 +190,12 @@
   static void set_needs_cache_clean(bool v)           { _needs_cache_clean = v;    }
   static void clear_inline_caches();                  // clear all inline caches
 
-  // Returns the CodeBlobType for nmethods of the given compilation level
+  // Returns the CodeBlobType for the given nmethod
+  static int get_code_blob_type(nmethod* nm) {
+    return get_code_heap(nm)->code_blob_type();
+  }
+
+  // Returns the CodeBlobType for the given compilation level
   static int get_code_blob_type(int comp_level) {
     if (comp_level == CompLevel_none ||
         comp_level == CompLevel_simple ||
@@ -287,7 +292,7 @@
       // Iterate over all CodeBlobs
       _code_blob_type = CodeBlobType::All;
     } else if (nm != NULL) {
-      _code_blob_type = CodeCache::get_code_blob_type(nm->comp_level());
+      _code_blob_type = CodeCache::get_code_blob_type(nm);
     } else {
       // Only iterate over method code heaps, starting with non-profiled
       _code_blob_type = CodeBlobType::MethodNonProfiled;
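
The new nmethod* overload of get_code_blob_type() simply asks the code heap that already contains the method for its type, rather than re-deriving it from the compilation level as the int overload does. For reference, a toy version of that level-to-heap mapping (enum values here are illustrative stand-ins, not HotSpot's CodeBlobType/CompLevel constants):

    #include <cstdio>

    // Illustrative stand-ins for CodeBlobType and CompLevel.
    enum BlobType { MethodNonProfiled, MethodProfiled, NonNMethod };
    enum CompLevel { None, Simple, LimitedProfile, FullProfile, FullOptimization };

    static BlobType blob_type_for(CompLevel level) {
      // Tiers 0, 1 and 4 go to the non-profiled heap; tiers 2 and 3 to the profiled one.
      if (level == None || level == Simple || level == FullOptimization) {
        return MethodNonProfiled;
      }
      return MethodProfiled;
    }

    int main() {
      printf("%d %d\n", blob_type_for(Simple), blob_type_for(FullProfile));
      return 0;
    }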
--- a/src/share/vm/code/debugInfo.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/code/debugInfo.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -26,6 +26,7 @@
 #include "code/debugInfo.hpp"
 #include "code/debugInfoRec.hpp"
 #include "code/nmethod.hpp"
+#include "oops/oop.inline.hpp"
 #include "runtime/handles.inline.hpp"
 
 PRAGMA_FORMAT_MUTE_WARNINGS_FOR_GCC
@@ -47,6 +48,12 @@
   write_int(recorder()->oop_recorder()->find_index(h));
 }
 
+oop DebugInfoReadStream::read_oop() {
+  oop o = code()->oop_at(read_int());
+  assert(o->is_oop_or_null(), "oop only");
+  return o;
+}
+
 ScopeValue* DebugInfoReadStream::read_object_value() {
   int id = read_int();
 #ifdef ASSERT
--- a/src/share/vm/code/debugInfo.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/code/debugInfo.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -266,11 +266,7 @@
 
   } ;
 
-  oop read_oop() {
-    oop o = code()->oop_at(read_int());
-    assert(o == NULL || o->is_oop(), "oop only");
-    return o;
-  }
+  oop read_oop();
   Method* read_method() {
     Method* o = (Method*)(code()->metadata_at(read_int()));
     // is_metadata() is a faster check than is_metaspace_object()
--- a/src/share/vm/code/nmethod.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/code/nmethod.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1421,7 +1421,7 @@
   Events::log(JavaThread::current(), "flushing nmethod " INTPTR_FORMAT, this);
   if (PrintMethodFlushing) {
     tty->print_cr("*flushing nmethod %3d/" INTPTR_FORMAT ". Live blobs:" UINT32_FORMAT "/Free CodeCache:" SIZE_FORMAT "Kb",
-        _compile_id, this, CodeCache::nof_blobs(), CodeCache::unallocated_capacity(CodeCache::get_code_blob_type(_comp_level))/1024);
+        _compile_id, this, CodeCache::nof_blobs(), CodeCache::unallocated_capacity(CodeCache::get_code_blob_type(this))/1024);
   }
 
   // We need to deallocate any ExceptionCache data.
--- a/src/share/vm/compiler/compileLog.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/compiler/compileLog.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -58,13 +58,15 @@
 CompileLog::~CompileLog() {
   delete _out; // Close fd in fileStream::~fileStream()
   _out = NULL;
+  // Remove partial file after merging in CompileLog::finish_log_on_error
+  unlink(_file);
   FREE_C_HEAP_ARRAY(char, _identities);
   FREE_C_HEAP_ARRAY(char, _file);
 }
 
 
 // see_tag, pop_tag:  Override the default do-nothing methods on xmlStream.
-// These methods provide a hook for managing the the extra context markup.
+// These methods provide a hook for managing the extra context markup.
 void CompileLog::see_tag(const char* tag, bool push) {
   if (_context.size() > 0 && _out != NULL) {
     _out->write(_context.base(), _context.size());
--- a/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -190,10 +190,10 @@
 };
 
 ConcurrentMarkSweepGeneration::ConcurrentMarkSweepGeneration(
-     ReservedSpace rs, size_t initial_byte_size, int level,
+     ReservedSpace rs, size_t initial_byte_size,
      CardTableRS* ct, bool use_adaptive_freelists,
      FreeBlockDictionary<FreeChunk>::DictionaryChoice dictionaryChoice) :
-  CardGeneration(rs, initial_byte_size, level, ct),
+  CardGeneration(rs, initial_byte_size, ct),
   _dilatation_factor(((double)MinChunkSize)/((double)(CollectedHeap::min_fill_size()))),
   _did_compact(false)
 {
@@ -285,9 +285,9 @@
     _ref_processor =
       new ReferenceProcessor(_span,                               // span
                              (ParallelGCThreads > 1) && ParallelRefProcEnabled, // mt processing
-                             (int) ParallelGCThreads,             // mt processing degree
+                             ParallelGCThreads,                   // mt processing degree
                              _cmsGen->refs_discovery_is_mt(),     // mt discovery
-                             (int) MAX2(ConcGCThreads, ParallelGCThreads), // mt discovery degree
+                             MAX2(ConcGCThreads, ParallelGCThreads), // mt discovery degree
                              _cmsGen->refs_discovery_is_atomic(), // discovery is not atomic
                              &_is_alive_closure);                 // closure for liveness info
     // Initialize the _ref_processor field of CMSGen
@@ -562,7 +562,7 @@
   // are not shared with parallel scavenge (ParNew).
   {
     uint i;
-    uint num_queues = (uint) MAX2(ParallelGCThreads, ConcGCThreads);
+    uint num_queues = MAX2(ParallelGCThreads, ConcGCThreads);
 
     if ((CMSParallelRemarkEnabled || CMSConcurrentMTEnabled
          || ParallelRefProcEnabled)
@@ -682,12 +682,17 @@
 void ConcurrentMarkSweepGeneration::printOccupancy(const char *s) {
   GenCollectedHeap* gch = GenCollectedHeap::heap();
   if (PrintGCDetails) {
+    // The logging below was left unchanged when the level concept was removed;
+    // it could print "old" here instead of the hard-coded "1".
+    assert(gch->is_old_gen(this),
+           "The CMS generation should be the old generation");
+    uint level = 1;
     if (Verbose) {
-      gclog_or_tty->print("[%d %s-%s: "SIZE_FORMAT"("SIZE_FORMAT")]",
-        level(), short_name(), s, used(), capacity());
+      gclog_or_tty->print("[%u %s-%s: "SIZE_FORMAT"("SIZE_FORMAT")]",
+        level, short_name(), s, used(), capacity());
     } else {
-      gclog_or_tty->print("[%d %s-%s: "SIZE_FORMAT"K("SIZE_FORMAT"K)]",
-        level(), short_name(), s, used() / K, capacity() / K);
+      gclog_or_tty->print("[%u %s-%s: "SIZE_FORMAT"K("SIZE_FORMAT"K)]",
+        level, short_name(), s, used() / K, capacity() / K);
     }
   }
   if (Verbose) {
@@ -797,27 +802,22 @@
       gclog_or_tty->print_cr("\nFrom compute_new_size: ");
       gclog_or_tty->print_cr("  Free fraction %f", free_percentage);
       gclog_or_tty->print_cr("  Desired free fraction %f",
-        desired_free_percentage);
+              desired_free_percentage);
       gclog_or_tty->print_cr("  Maximum free fraction %f",
-        maximum_free_percentage);
+              maximum_free_percentage);
       gclog_or_tty->print_cr("  Capacity "SIZE_FORMAT, capacity()/1000);
       gclog_or_tty->print_cr("  Desired capacity "SIZE_FORMAT,
-        desired_capacity/1000);
-      int prev_level = level() - 1;
-      if (prev_level >= 0) {
-        size_t prev_size = 0;
-        GenCollectedHeap* gch = GenCollectedHeap::heap();
-        Generation* prev_gen = gch->young_gen();
-        prev_size = prev_gen->capacity();
-          gclog_or_tty->print_cr("  Younger gen size "SIZE_FORMAT,
-                                 prev_size/1000);
-      }
+              desired_capacity/1000);
+      GenCollectedHeap* gch = GenCollectedHeap::heap();
+      assert(gch->is_old_gen(this), "The CMS generation should always be the old generation");
+      size_t young_size = gch->young_gen()->capacity();
+      gclog_or_tty->print_cr("  Young gen size " SIZE_FORMAT, young_size / 1000);
       gclog_or_tty->print_cr("  unsafe_max_alloc_nogc "SIZE_FORMAT,
-        unsafe_max_alloc_nogc()/1000);
+              unsafe_max_alloc_nogc()/1000);
       gclog_or_tty->print_cr("  contiguous available "SIZE_FORMAT,
-        contiguous_available()/1000);
+              contiguous_available()/1000);
       gclog_or_tty->print_cr("  Expand by "SIZE_FORMAT" (bytes)",
-        expand_bytes);
+              expand_bytes);
     }
     // safe if expansion fails
     expand_for_gc_cause(expand_bytes, 0, CMSExpansionCause::_satisfy_free_ratio);
@@ -1650,8 +1650,7 @@
                                             _intra_sweep_estimate.padded_average());
   }
 
-  GenMarkSweep::invoke_at_safepoint(_cmsGen->level(),
-    ref_processor(), clear_all_soft_refs);
+  GenMarkSweep::invoke_at_safepoint(ref_processor(), clear_all_soft_refs);
   #ifdef ASSERT
     CompactibleFreeListSpace* cms_space = _cmsGen->cmsSpace();
     size_t free_size = cms_space->free();
@@ -2432,7 +2431,7 @@
     StrongRootsScope srs(1);
 
     gch->gen_process_roots(&srs,
-                           _cmsGen->level(),
+                           GenCollectedHeap::OldGen,
                            true,   // younger gens are roots
                            GenCollectedHeap::ScanningOption(roots_scanning_options()),
                            should_unload_classes(),
@@ -2504,7 +2503,7 @@
     StrongRootsScope srs(1);
 
     gch->gen_process_roots(&srs,
-                           _cmsGen->level(),
+                           GenCollectedHeap::OldGen,
                            true,   // younger gens are roots
                            GenCollectedHeap::ScanningOption(roots_scanning_options()),
                            should_unload_classes(),
@@ -3031,7 +3030,7 @@
       StrongRootsScope srs(1);
 
       gch->gen_process_roots(&srs,
-                             _cmsGen->level(),
+                             GenCollectedHeap::OldGen,
                              true,   // younger gens are roots
                              GenCollectedHeap::ScanningOption(roots_scanning_options()),
                              should_unload_classes(),
@@ -4282,15 +4281,12 @@
       FlagSetting fl(gch->_is_gc_active, false);
       NOT_PRODUCT(GCTraceTime t("Scavenge-Before-Remark",
         PrintGCDetails && Verbose, true, _gc_timer_cm, _gc_tracer_cm->gc_id());)
-      int level = _cmsGen->level() - 1;
-      if (level >= 0) {
-        gch->do_collection(true,        // full (i.e. force, see below)
-                           false,       // !clear_all_soft_refs
-                           0,           // size
-                           false,       // is_tlab
-                           level        // max_level
-                          );
-      }
+      gch->do_collection(true,                      // full (i.e. force, see below)
+                         false,                     // !clear_all_soft_refs
+                         0,                         // size
+                         false,                     // is_tlab
+                         GenCollectedHeap::YoungGen // type
+        );
     }
     FreelistLocker x(this);
     MutexLockerEx y(bitMapLock(),
@@ -4464,7 +4460,7 @@
   CLDToOopClosure cld_closure(&par_mri_cl, true);
 
   gch->gen_process_roots(_strong_roots_scope,
-                         _collector->_cmsGen->level(),
+                         GenCollectedHeap::OldGen,
                          false,     // yg was scanned above
                          GenCollectedHeap::ScanningOption(_collector->CMSCollector::roots_scanning_options()),
                          _collector->should_unload_classes(),
@@ -4603,7 +4599,7 @@
   _timer.reset();
   _timer.start();
   gch->gen_process_roots(_strong_roots_scope,
-                         _collector->_cmsGen->level(),
+                         GenCollectedHeap::OldGen,
                          false,     // yg was scanned above
                          GenCollectedHeap::ScanningOption(_collector->CMSCollector::roots_scanning_options()),
                          _collector->should_unload_classes(),
@@ -5184,7 +5180,7 @@
     StrongRootsScope srs(1);
 
     gch->gen_process_roots(&srs,
-                           _cmsGen->level(),
+                           GenCollectedHeap::OldGen,
                            true,  // younger gens as roots
                            GenCollectedHeap::ScanningOption(roots_scanning_options()),
                            should_unload_classes(),
@@ -5322,8 +5318,8 @@
    _bit_map(bit_map),
    _work_queue(work_queue),
    _mark_and_push(collector, span, bit_map, work_queue),
-   _low_water_mark(MIN2((uint)(work_queue->max_elems()/4),
-                        (uint)(CMSWorkQueueDrainThreshold * ParallelGCThreads)))
+   _low_water_mark(MIN2((work_queue->max_elems()/4),
+                        ((uint)CMSWorkQueueDrainThreshold * ParallelGCThreads)))
 { }
 
 // . see if we can share work_queues with ParNew? XXX
@@ -5648,11 +5644,12 @@
   return _cmsSpace->find_chunk_at_end();
 }
 
-void ConcurrentMarkSweepGeneration::update_gc_stats(int current_level,
+void ConcurrentMarkSweepGeneration::update_gc_stats(Generation* current_generation,
                                                     bool full) {
-  // The next lower level has been collected.  Gather any statistics
+  // If the young generation has been collected, gather any statistics
   // that are of interest at this point.
-  if (!full && (current_level + 1) == level()) {
+  bool current_is_young = GenCollectedHeap::heap()->is_young_gen(current_generation);
+  if (!full && current_is_young) {
     // Gather statistics on the young generation collection.
     collector()->stats().record_gc0_end(used());
   }
@@ -6251,8 +6248,8 @@
   _span(span),
   _bit_map(bit_map),
   _work_queue(work_queue),
-  _low_water_mark(MIN2((uint)(work_queue->max_elems()/4),
-                       (uint)(CMSWorkQueueDrainThreshold * ParallelGCThreads))),
+  _low_water_mark(MIN2((work_queue->max_elems()/4),
+                       ((uint)CMSWorkQueueDrainThreshold * ParallelGCThreads))),
   _par_pushAndMarkClosure(collector, span, rp, bit_map, work_queue)
 {
   _ref_processor = rp;
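Taken together, the CMS hunks above drop the numeric generation "level" in favour of GenCollectedHeap's named generations, or of passing the Generation* itself. A rough before/after sketch of the pattern, based only on the call sites shown in this file (argument lists are elided):

  // before: the generation to scan from was identified by its integer level
  gch->gen_process_roots(&srs, _cmsGen->level(), true /* younger gens are roots */, ...);

  // after: callers name the generation directly
  gch->gen_process_roots(&srs, GenCollectedHeap::OldGen, true /* younger gens are roots */, ...);

  // position queries go through GenCollectedHeap instead of comparing levels
  assert(GenCollectedHeap::heap()->is_old_gen(this), "The CMS generation should be the old generation");

  // and statistics callbacks receive the just-collected generation itself
  virtual void update_gc_stats(Generation* current_generation, bool full);
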
--- a/src/share/vm/gc/cms/concurrentMarkSweepGeneration.hpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/gc/cms/concurrentMarkSweepGeneration.hpp	Thu Jul 02 16:09:16 2015 -0700
@@ -1063,7 +1063,7 @@
   void shrink_free_list_by(size_t bytes);
 
   // Update statistics for GC
-  virtual void update_gc_stats(int level, bool full);
+  virtual void update_gc_stats(Generation* current_generation, bool full);
 
   // Maximum available space in the generation (including uncommitted)
   // space.
@@ -1079,7 +1079,7 @@
 
  public:
   ConcurrentMarkSweepGeneration(ReservedSpace rs, size_t initial_byte_size,
-                                int level, CardTableRS* ct,
+                                CardTableRS* ct,
                                 bool use_adaptive_freelists,
                                 FreeBlockDictionary<FreeChunk>::DictionaryChoice);
 
--- a/src/share/vm/gc/cms/parCardTableModRefBS.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/gc/cms/parCardTableModRefBS.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -42,7 +42,7 @@
                                                              uint n_threads) {
   assert(n_threads > 0, "expected n_threads > 0");
   assert(n_threads <= ParallelGCThreads,
-         err_msg("n_threads: %u > ParallelGCThreads: " UINTX_FORMAT, n_threads, ParallelGCThreads));
+         err_msg("n_threads: %u > ParallelGCThreads: %u", n_threads, ParallelGCThreads));
 
   // Make sure the LNC array is valid for the space.
   jbyte**   lowest_non_clean;
--- a/src/share/vm/gc/cms/parNewGeneration.cpp	Thu Jul 02 08:53:58 2015 -0700
+++ b/src/share/vm/gc/cms/parNewGeneration.cpp	Thu Jul 02 16:09:16 2015 -0700
@@ -62,25 +62,25 @@
 #pragma warning( disable:4355 ) // 'this' : used in base member initializer list
 #endif
 ParScanThreadState::ParScanThreadState(Space* to_space_,
-                                       ParNewGeneration* gen_,
+                                       ParNewGeneration* young_gen_,
                                        Generation* old_gen_,
                                        int thread_num_,
                                        ObjToScanQueueSet* work_queue_set_,
                                        Stack<oop, mtGC>* overflow_stacks_,
                                        size_t desired_plab_sz_,
                                        ParallelTaskTerminator& term_) :
-  _to_space(to_space_), _old_gen(old_gen_), _young_gen(gen_), _thread_num(thread_num_),
+  _to_space(to_space_), _old_gen(old_gen_), _young_gen(young_gen_), _thread_num(thread_num_),
   _work_queue(work_queue_set_->queue(thread_num_)), _to_space_full(false),
   _overflow_stack(overflow_stacks_ ? overflow_stacks_ + thread_num_ : NULL),
   _ageTable(false), // false ==> not the global age table, no perf data.
   _to_space_alloc_buffer(desired_plab_sz_),
-  _to_space_closure(gen_, this), _old_gen_closure(gen_, this),
-  _to_space_root_closure(gen_, this), _old_gen_root_closure(gen_, this),
-  _older_gen_closure(gen_, this),
+  _to_space_closure(young_gen_, this), _old_gen_closure(young_gen_, this),
+  _to_space_root_closure(young_gen_, this), _old_gen_root_closure(young_gen_, this),
+  _older_gen_closure(young_gen_, this),
   _evacuate_followers(this, &_to_space_closure, &_old_gen_closure,
-                      &_to_space_root_closure, gen_, &_old_gen_root_closure,
+                      &_to_space_root_closure, young_gen_, &_old_gen_root_closure,
                       work_queue_set_, &term_),
-  _is_alive_closure(gen_), _scan_weak_ref_closure(gen_, this),
+  _is_alive_closure(young_gen_), _scan_weak_ref_closure(young_gen_, this),
   _keep_alive_closure(&_scan_weak_ref_closure),
   _strong_roots_time(0.0), _term_time(0.0)
 {
@@ -481,7 +481,6 @@
                                ParScanThreadState* par_scan_state) :
   OopsInKlassOrGenClosure(g), _par_scan_state(par_scan_state), _g(g)
 {
-  assert(_g->level() == 0, "Optimized for youngest generation");
   _boundary = _g->reserved().end();
 }
 
@@ -566,11 +565,11 @@
   par_scan_state()->end_term_time();
 }
 
-ParNewGenTask::ParNewGenTask(ParNewGeneration* gen, Generation* old_gen,
+ParNewGenTask::ParNewGenTask(ParNewGeneration* young_gen, Generation* old_gen,
                              HeapWord* young_old_boundary, ParScanThreadStateSet* state_set,
                              StrongRootsScope* strong_roots_scope) :
     AbstractGangTask("ParNewGeneration collection"),
-    _gen(gen), _old_gen(old_gen),
+    _young_gen(young_gen), _old_gen(old_gen),
     _young_old_boundary(young_old_boundary),
     _state_set(state_set),
     _strong_roots_scope(strong_roots_scope)
@@ -596,7 +595,7 @@
 
   par_scan_state.start_strong_roots();
   gch->gen_process_roots(_strong_roots_scope,
-                         _gen->level(),
+                         GenCollectedHeap::YoungGen,
                          true,  // Process younger gens, if any,
                                 // as strong roots.
                          GenCollectedHeap::SO_ScavengeCodeCache,
@@ -616,8 +615,8 @@
 #pragma warning( disable:4355 ) // 'this' : used in base member initializer list
 #endif
 ParNewGeneration::
-ParNewGeneration(ReservedSpace rs, size_t initial_byte_size, int level)
-  : DefNewGeneration(rs, initial_byte_size, level, "PCopy"),
+ParNewGeneration(ReservedSpace rs, size_t initial_byte_size)
+  : DefNewGeneration(rs, initial_byte_size, "PCopy"),
   _overflow_list(NULL),
   _is_alive_closure(this),
   _plab_stats(YoungPLABSize, PLABWeight)
@@ -752,7 +751,7 @@
 private:
   virtual void work(uint worker_id);
 private:
-  ParNewGeneration&      _gen;
+  ParNewGeneration&      _young_gen;
   ProcessTask&           _task;
   Generation&            _old_gen;
   HeapWord*              _young_old_boundary;
@@ -760,12 +759,12 @@
 };
 
 ParNewRefProcTaskProxy::ParNewRefProcTaskProxy(ProcessTask& task,
-                                               ParNewGeneration& gen,
+                                               ParNewGeneration& young_gen,
                                                Generation& old_gen,
                                                HeapWord* young_old_boundary,
                                                ParScanThreadStateSet& state_set)
   : AbstractGangTask("ParNewGeneration parallel reference processing"),
-    _gen(gen),
+    _young_gen(young_gen),
     _task(task),
     _old_gen(old_gen),
     _young_old_boundary(young_old_boundary),
@@ -806,12 +805,12 @@
   GenCollectedHeap* gch = GenCollectedHeap::heap();
   FlexibleWorkGang* workers = gch->workers();
   assert(workers != NULL, "Need parallel worker threads.");
-  _state_set.reset(workers->active_workers(), _generation.promotion_failed());
-  ParNewRefProcTaskProxy rp_task(task, _generation, *_generation.next_gen(),
-                                 _generation.reserved().end(), _state_set);
+  _state_set.reset(workers->active_workers(), _young_gen.promotion_failed());
+  ParNewRefProcTaskProxy rp_task(task, _young_gen, _old_gen,
+                                 _young_gen.reserved().end(), _state_set);
   workers->run_task(&rp_task);
   _state_set.reset(0 /* bad value in debug if not reset */,
-                   _generation.promotion_failed());
+                   _young_gen.promotion_failed());
 }
 
 void ParNewRefProcTaskExecutor::execute(EnqueueTask& task)
@@ -835,10 +834,10 @@
   ScanClosure(g, gc_barrier) {}
 
 EvacuateFollowersClosureGeneral::
-EvacuateFollowersClosureGeneral(GenCollectedHeap* gch, int level,
+EvacuateFollowersClosureGeneral(GenCollectedHeap* gch,
                                 OopsInGenClosure* cur,
                                 OopsInGenClosure* older) :
-  _gch(gch), _level(level),
+  _gch(gch),
   _scan_cur_or_nonheap(cur), _scan_older(older)
 {}
 
@@ -846,10 +845,10 @@
   do {
     // Beware: this call will lead to closure applications via virtual
     // calls.
-    _gch->oop_since_save_marks_iterate(_level,
+    _gch->oop_since_save_marks_iterate(GenCollectedHeap::YoungGen,
                                        _scan_cur_or_nonheap,
                                        _scan_older);
-  } while (!_gch->no_allocs_since_save_marks(_level));
+  } while (!_gch->no_allocs_since_save_marks(true /* include_young */));
 }
 
 
@@ -972,14 +971,14 @@
   ScanClosure               scan_without_gc_barrier(this, false);
   ScanClosureWithParBarrier scan_with_gc_barrier(this, true);
   set_promo_failure_scan_stack_closure(&scan_without_gc_barrier);
-  EvacuateFollowersClosureGeneral evacuate_followers(gch, _level,
+  EvacuateFollowersClosureGeneral evacuate_followers(gch,
     &scan_without_gc_barrier, &scan_with_gc_barrier);
   rp->setup_policy(clear_all_soft_refs);
   // Can  the mt_degree be set later (at run_task() time would be best)?
   rp->set_active_mt_degree(active_workers);
   ReferenceProcessorStats stats;
   if (rp->processing_is_mt()) {
-    ParNewRefProcTaskExecutor task_executor(*this, thread_state_set);
+    ParNewRefProcTaskExecutor task_executor(*this, *_old_gen, thread_state_set);
     stats = rp->process_discovered_references(&is_alive, &keep_alive,
                                               &evacuate_followers, &task_executor,
                                               _gc_timer, _gc_tracer.gc_id());
@@ -1045,7 +1044,7 @@
 
   rp->set_enqueuing_is_done(true);
   if (rp->processing_is_mt()) {
-    ParNewRefProcTaskExecutor task_executor(*this, thread_state_set);
+    ParNewRefProcTaskExecutor task_executor(*this, *_old_gen, thread_state_set);
     rp->enqueue_discovered_references(&task_executor);
   } else {
     rp->enqueue_discovered_references(NULL);
@@ -1349,7 +1348,7 @@
   oop prefix = cast_to_oop(Atomic::xchg_ptr(BUSY, &_overflow_list));
   // Trim off a prefix of at most objsFromOverflow items
   Thread* tid = Thread::current();
-  size_t spin_count = (size_t)ParallelGCThreads;
+  size_t spin_count = ParallelGCThreads;
   size_t sleep_time_millis = MAX2((size_t)1, objsFromOverflow/100);
   for (size_t spin = 0; prefix == BUSY && spin < spin_count; spin++) {
     // someone grabbed it before we did ...
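In parNewGeneration.cpp the ambiguous _gen/_generation members are renamed to _young_gen, and the parallel reference-processing executor is now handed the old generation explicitly rather than deriving it from the young generation via next_gen(). A sketch of how the executor is constructed after this change, pieced together from the call sites above (local variable names are as they appear there; the executor's exact constructor signature is an assumption):

  // inside ParNewGeneration::collect(), roughly:
  if (rp->processing_is_mt()) {
    // young gen, old gen, and the per-thread scan states are passed explicitly
    ParNewRefProcTaskExecutor task_executor(*this, *_old_gen, thread_state_set);
    stats = rp->process_discovered_references(&is_alive, &keep_alive,
                                              &evacuate_followers, &task_executor,
                                              _gc_timer, _gc_tracer.gc_id());
  }
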
@@ -1466,9 +1465,9 @@