changeset 2530:7c948af3e651

Merge
author asaha
date Tue, 24 May 2011 11:09:39 -0700
parents 66c17ec20ee2 c149193c768b
children ec7055a259a6
files src/os/windows/vm/os_windows.cpp
diffstat 175 files changed, 9689 insertions(+), 1806 deletions(-) [+]
line wrap: on
line diff
--- a/.hgtags	Fri May 06 14:32:44 2011 -0700
+++ b/.hgtags	Tue May 24 11:09:39 2011 -0700
@@ -170,3 +170,5 @@
 d283b82966712b353fa307845a1316da42a355f4 hs21-b10
 5d07913abd59261c77f24cc04a759cb75d804099 jdk7-b141
 3aea9e9feb073f5500e031be6186666bcae89aa2 hs21-b11
+9ad1548c6b63d596c411afc35147ffd5254426d9 jdk7-b142
+9ad1548c6b63d596c411afc35147ffd5254426d9 hs21-b12
--- a/agent/make/Makefile	Fri May 06 14:32:44 2011 -0700
+++ b/agent/make/Makefile	Tue May 24 11:09:39 2011 -0700
@@ -257,7 +257,7 @@
 all: filelist
 	@mkdir -p $(OUTPUT_DIR)
 	@echo "$(SA_BUILD_VERSION_PROP)" > $(SA_PROPERTIES)
-	$(JAVAC) -source 1.4 -classpath $(CLASSPATH) -deprecation -sourcepath $(SRC_DIR) -g -d $(OUTPUT_DIR) @filelist
+	$(JAVAC) -classpath $(CLASSPATH) -deprecation -sourcepath $(SRC_DIR) -g -d $(OUTPUT_DIR) @filelist
 	$(RMIC) -classpath $(OUTPUT_DIR) -d $(OUTPUT_DIR) sun.jvm.hotspot.debugger.remote.RemoteDebuggerServer
 	rm -f $(OUTPUT_DIR)/sun/jvm/hotspot/utilities/soql/sa.js
 	cp $(SRC_DIR)/sun/jvm/hotspot/utilities/soql/sa.js $(OUTPUT_DIR)/sun/jvm/hotspot/utilities/soql
@@ -269,7 +269,7 @@
 allprof: filelist
 	@mkdir -p $(OUTPUT_DIR)
 	@echo "$(SA_BUILD_VERSION_PROP)" > $(SA_PROPERTIES)
-	$(JAVAC) -source 1.4 -J-Xprof -classpath $(CLASSPATH) -deprecation -sourcepath $(SRC_DIR) -g -d $(OUTPUT_DIR) @filelist
+	$(JAVAC) -J-Xprof -classpath $(CLASSPATH) -deprecation -sourcepath $(SRC_DIR) -g -d $(OUTPUT_DIR) @filelist
 	$(RMIC) -classpath $(OUTPUT_DIR) -d $(OUTPUT_DIR) sun.jvm.hotspot.debugger.remote.RemoteDebuggerServer
 	rm -f $(OUTPUT_DIR)/sun/jvm/hotspot/utilities/soql/sa.js
 	cp $(SRC_DIR)/sun/jvm/hotspot/utilities/soql/sa.js $(OUTPUT_DIR)/sun/jvm/hotspot/utilities/soql
--- a/agent/src/os/solaris/proc/libproc.h	Fri May 06 14:32:44 2011 -0700
+++ b/agent/src/os/solaris/proc/libproc.h	Tue May 24 11:09:39 2011 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002, 2003, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -420,7 +420,22 @@
 /*
  * Stack frame iteration interface.
  */
+#ifdef SOLARIS_11_B159_OR_LATER
+/* building on Nevada-B159 or later so define the new callback */
+typedef int proc_stack_f(
+    void *,             /* the cookie given to Pstack_iter() */
+    const prgregset_t,  /* the frame's registers */
+    uint_t,             /* argc for the frame's function */
+    const long *,       /* argv for the frame's function */
+    int,                /* bitwise flags describing the frame (see below) */
+    int);               /* a signal number */
+
+#define PR_SIGNAL_FRAME    1    /* called by a signal handler */
+#define PR_FOUND_SIGNAL    2    /* we found the corresponding signal number */
+#else
+/* building on Nevada-B158 or earlier so define the old callback */
 typedef int proc_stack_f(void *, const prgregset_t, uint_t, const long *);
+#endif
 
 extern int Pstack_iter(struct ps_prochandle *,
     const prgregset_t, proc_stack_f *, void *);
--- a/agent/src/os/solaris/proc/salibproc.h	Fri May 06 14:32:44 2011 -0700
+++ b/agent/src/os/solaris/proc/salibproc.h	Tue May 24 11:09:39 2011 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -101,7 +101,23 @@
 /*
  * Stack frame iteration interface.
  */
+#ifdef SOLARIS_11_B159_OR_LATER
+/* building on Nevada-B159 or later so define the new callback */
+typedef int proc_stack_f(
+    void *,             /* the cookie given to Pstack_iter() */
+    const prgregset_t,  /* the frame's registers */
+    uint_t,             /* argc for the frame's function */
+    const long *,       /* argv for the frame's function */
+    int,                /* bitwise flags describing the frame (see below) */
+    int);               /* a signal number */
+
+#define PR_SIGNAL_FRAME    1    /* called by a signal handler */
+#define PR_FOUND_SIGNAL    2    /* we found the corresponding signal number */
+#else
+/* building on Nevada-B158 or earlier so define the old callback */
 typedef int proc_stack_f(void *, const prgregset_t, uint_t, const long *);
+#endif
+
 extern int Pstack_iter(struct ps_prochandle *,
     const prgregset_t, proc_stack_f *, void *);
 
--- a/agent/src/os/solaris/proc/saproc.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/agent/src/os/solaris/proc/saproc.cpp	Tue May 24 11:09:39 2011 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -24,6 +24,9 @@
 
 #include "salibproc.h"
 #include "sun_jvm_hotspot_debugger_proc_ProcDebuggerLocal.h"
+#ifndef SOLARIS_11_B159_OR_LATER
+#include <sys/utsname.h>
+#endif
 #include <thread_db.h>
 #include <strings.h>
 #include <limits.h>
@@ -40,8 +43,22 @@
 #define SYMBOL_BUF_SIZE  256
 #define ERR_MSG_SIZE     (PATH_MAX + 256)
 
-// debug mode
+// debug modes
 static int _libsaproc_debug = 0;
+#ifndef SOLARIS_11_B159_OR_LATER
+static bool _Pstack_iter_debug = false;
+
+static void dprintf_2(const char* format,...) {
+  if (_Pstack_iter_debug) {
+    va_list alist;
+
+    va_start(alist, format);
+    fputs("Pstack_iter DEBUG: ", stderr);
+    vfprintf(stderr, format, alist);
+    va_end(alist);
+  }
+}
+#endif // !SOLARIS_11_B159_OR_LATER
 
 static void print_debug(const char* format,...) {
   if (_libsaproc_debug) {
@@ -450,6 +467,7 @@
   return 0;
 }
 
+// Pstack_iter() proc_stack_f callback prior to Nevada-B159
 static int
 fill_cframe_list(void *cd, const prgregset_t regs, uint_t argc, const long *argv) {
   DebuggerWith2Objects* dbgo2 = (DebuggerWith2Objects*) cd;
@@ -472,6 +490,14 @@
   return 0;
 }
 
+// Pstack_iter() proc_stack_f callback in Nevada-B159 or later
+/*ARGSUSED*/
+static int
+wrapper_fill_cframe_list(void *cd, const prgregset_t regs, uint_t argc,
+                         const long *argv, int frame_flags, int sig) {
+  return(fill_cframe_list(cd, regs, argc, argv));
+}
+
 // part of the class sharing workaround
 
 // FIXME: !!HACK ALERT!!
@@ -970,6 +996,11 @@
                    TD_THR_ANY_STATE, TD_THR_LOWEST_PRIORITY, TD_SIGNO_MASK, TD_THR_ANY_USER_FLAGS);
 }
 
+#ifndef SOLARIS_11_B159_OR_LATER
+// building on Nevada-B158 or earlier so more hoops to jump through
+static bool has_newer_Pstack_iter = false;  // older version by default
+#endif
+
 /*
  * Class:       sun_jvm_hotspot_debugger_proc_ProcDebuggerLocal
  * Method:      fillCFrameList0
@@ -997,7 +1028,24 @@
 
   env->ReleaseLongArrayElements(regsArray, ptr, JNI_ABORT);
   CHECK_EXCEPTION_(0);
-  Pstack_iter((struct ps_prochandle*) p_ps_prochandle, gregs, fill_cframe_list, &dbgo2);
+
+#ifdef SOLARIS_11_B159_OR_LATER
+  // building on Nevada-B159 or later so use the new callback
+  Pstack_iter((struct ps_prochandle*) p_ps_prochandle, gregs,
+              wrapper_fill_cframe_list, &dbgo2);
+#else
+  // building on Nevada-B158 or earlier so figure out which callback to use
+
+  if (has_newer_Pstack_iter) {
+    // Since we're building on Nevada-B158 or earlier, we have to
+    // cast wrapper_fill_cframe_list to make the compiler happy.
+    Pstack_iter((struct ps_prochandle*) p_ps_prochandle, gregs,
+                (proc_stack_f *)wrapper_fill_cframe_list, &dbgo2);
+  } else {
+    Pstack_iter((struct ps_prochandle*) p_ps_prochandle, gregs,
+                fill_cframe_list, &dbgo2);
+  }
+#endif // SOLARIS_11_B159_OR_LATER
   return dbgo2.obj;
 }
 
@@ -1218,6 +1266,102 @@
   return res;
 }
 
+#ifndef SOLARIS_11_B159_OR_LATER
+// Determine if the OS we're running on has the newer version
+// of libproc's Pstack_iter.
+//
+// Set env var PSTACK_ITER_DEBUG=true to debug this logic.
+// Set env var PSTACK_ITER_DEBUG_RELEASE to simulate a 'release' value.
+// Set env var PSTACK_ITER_DEBUG_VERSION to simulate a 'version' value.
+//
+// frankenputer 'uname -r -v': 5.10 Generic_141445-09
+// jurassic 'uname -r -v':     5.11 snv_164
+// lonepeak 'uname -r -v':     5.11 snv_127
+//
+static void set_has_newer_Pstack_iter(JNIEnv *env) {
+  static bool done_set = false;
+
+  if (done_set) {
+    // already set has_newer_Pstack_iter
+    return;
+  }
+
+  struct utsname name;
+  if (uname(&name) == -1) {
+    THROW_NEW_DEBUGGER_EXCEPTION("uname() failed!");
+  }
+  dprintf_2("release='%s'  version='%s'\n", name.release, name.version);
+
+  if (_Pstack_iter_debug) {
+    char *override = getenv("PSTACK_ITER_DEBUG_RELEASE");
+    if (override != NULL) {
+      strncpy(name.release, override, SYS_NMLN - 1);
+      name.release[SYS_NMLN - 2] = '\0';
+      dprintf_2("overriding with release='%s'\n", name.release);
+    }
+    override = getenv("PSTACK_ITER_DEBUG_VERSION");
+    if (override != NULL) {
+      strncpy(name.version, override, SYS_NMLN - 1);
+      name.version[SYS_NMLN - 2] = '\0';
+      dprintf_2("overriding with version='%s'\n", name.version);
+    }
+  }
+
+  // the major number corresponds to the old SunOS major number
+  int major = atoi(name.release);
+  if (major >= 6) {
+    dprintf_2("release is SunOS 6 or later\n");
+    has_newer_Pstack_iter = true;
+    done_set = true;
+    return;
+  }
+  if (major < 5) {
+    dprintf_2("release is SunOS 4 or earlier\n");
+    done_set = true;
+    return;
+  }
+
+  // some SunOS 5.* build so now check for Solaris versions
+  char *dot = strchr(name.release, '.');
+  int minor = 0;
+  if (dot != NULL) {
+    // release is major.minor format
+    *dot = NULL;
+    minor = atoi(dot + 1);
+  }
+
+  if (minor <= 10) {
+    dprintf_2("release is Solaris 10 or earlier\n");
+    done_set = true;
+    return;
+  } else if (minor >= 12) {
+    dprintf_2("release is Solaris 12 or later\n");
+    has_newer_Pstack_iter = true;
+    done_set = true;
+    return;
+  }
+
+  // some Solaris 11 build so now check for internal build numbers
+  if (strncmp(name.version, "snv_", 4) != 0) {
+    dprintf_2("release is Solaris 11 post-GA or later\n");
+    has_newer_Pstack_iter = true;
+    done_set = true;
+    return;
+  }
+
+  // version begins with "snv_" so a pre-GA build of Solaris 11
+  int build = atoi(&name.version[4]);
+  if (build >= 159) {
+    dprintf_2("release is Nevada-B159 or later\n");
+    has_newer_Pstack_iter = true;
+  } else {
+    dprintf_2("release is Nevada-B158 or earlier\n");
+  }
+
+  done_set = true;
+}
+#endif // !SOLARIS_11_B159_OR_LATER
+
 /*
  * Class:       sun_jvm_hotspot_debugger_proc_ProcDebuggerLocal
  * Method:      initIDs
@@ -1237,6 +1381,14 @@
   if (libproc_handle == 0)
      THROW_NEW_DEBUGGER_EXCEPTION("can't load libproc.so, if you are using Solaris 5.7 or below, copy libproc.so from 5.8!");
 
+#ifndef SOLARIS_11_B159_OR_LATER
+  _Pstack_iter_debug = getenv("PSTACK_ITER_DEBUG") != NULL;
+
+  set_has_newer_Pstack_iter(env);
+  CHECK_EXCEPTION;
+  dprintf_2("has_newer_Pstack_iter=%d\n", has_newer_Pstack_iter);
+#endif
+
   p_ps_prochandle_ID = env->GetFieldID(clazz, "p_ps_prochandle", "J");
   CHECK_EXCEPTION;
 
--- a/make/altsrc.make	Fri May 06 14:32:44 2011 -0700
+++ b/make/altsrc.make	Tue May 24 11:09:39 2011 -0700
@@ -24,7 +24,8 @@
 
 # This file defines variables and macros which are used in the makefiles to 
 # allow distributions to augment or replace common hotspot code with 
-# distribution-specific source files.
+# distribution-specific source files. This capability is disabled when
+# an OPENJDK build is requested, unless HS_ALT_SRC_REL has been set externally.
 
 # Requires: GAMMADIR
 # Provides:
@@ -33,14 +34,17 @@
 
 HS_COMMON_SRC_REL=src
 
-# This needs to be changed to a more generic location, but we keep it as this 
-# for now for compatibility
-HS_ALT_SRC_REL=src/closed
+ifneq ($(OPENJDK),true)
+  # This needs to be changed to a more generic location, but we keep it 
+  # as this for now for compatibility
+  HS_ALT_SRC_REL=src/closed
+else
+  HS_ALT_SRC_REL=NO_SUCH_PATH
+endif
 
 HS_COMMON_SRC=$(GAMMADIR)/$(HS_COMMON_SRC_REL)
 HS_ALT_SRC=$(GAMMADIR)/$(HS_ALT_SRC_REL)
 
-
 ## altsrc-equiv 
 # 
 # Convert a common source path to an alternative source path
--- a/make/hotspot_version	Fri May 06 14:32:44 2011 -0700
+++ b/make/hotspot_version	Tue May 24 11:09:39 2011 -0700
@@ -35,7 +35,7 @@
 
 HS_MAJOR_VER=21
 HS_MINOR_VER=0
-HS_BUILD_NUMBER=11
+HS_BUILD_NUMBER=13
 
 JDK_MAJOR_VER=1
 JDK_MINOR_VER=7
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/make/jdk6_hotspot_distro	Tue May 24 11:09:39 2011 -0700
@@ -0,0 +1,32 @@
+# 
+# Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This code is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 only, as
+# published by the Free Software Foundation.
+#
+# This code is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# version 2 for more details (a copy is included in the LICENSE file that
+# accompanied this code).
+#
+# You should have received a copy of the GNU General Public License version
+# 2 along with this work; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+# or visit www.oracle.com if you need additional information or have any
+# questions.
+# 
+
+#
+# This file format must remain compatible with both
+# GNU Makefile and Microsoft nmake formats.
+#
+
+# Don't put quotes (fail windows build).
+HOTSPOT_VM_DISTRO=Java HotSpot(TM)
+COMPANY_NAME=Sun Microsystems, Inc.
+PRODUCT_NAME=Java(TM) Platform SE
--- a/make/linux/makefiles/gcc.make	Fri May 06 14:32:44 2011 -0700
+++ b/make/linux/makefiles/gcc.make	Tue May 24 11:09:39 2011 -0700
@@ -205,7 +205,7 @@
 SHARED_FLAG = -shared
 
 # Keep symbols even they are not used
-AOUT_FLAGS += -export-dynamic
+AOUT_FLAGS += -Xlinker -export-dynamic
 
 #------------------------------------------------------------------------
 # Debug flags
--- a/make/linux/makefiles/vm.make	Fri May 06 14:32:44 2011 -0700
+++ b/make/linux/makefiles/vm.make	Tue May 24 11:09:39 2011 -0700
@@ -102,6 +102,10 @@
 CFLAGS += $(EXTRA_CFLAGS)
 LFLAGS += $(EXTRA_CFLAGS)
 
+# Don't set excutable bit on stack segment
+# the same could be done by separate execstack command
+LFLAGS += -Xlinker -z -Xlinker noexecstack
+
 LIBS += -lm -ldl -lpthread
 
 # By default, link the *.o into the library, not the executable.
--- a/make/solaris/makefiles/saproc.make	Fri May 06 14:32:44 2011 -0700
+++ b/make/solaris/makefiles/saproc.make	Tue May 24 11:09:39 2011 -0700
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -56,6 +56,33 @@
 SA_LFLAGS += -mt -xnolib -norunpath
 endif
 
+# The libproc Pstack_iter() interface changed in Nevada-B159.
+# Use 'uname -r -v' to determine the Solaris version as per
+# Solaris Nevada team request. This logic needs to match:
+# agent/src/os/solaris/proc/saproc.cpp: set_has_newer_Pstack_iter():
+#   - skip SunOS 4 or older
+#   - skip Solaris 10 or older
+#   - skip two digit internal Nevada builds
+#   - skip three digit internal Nevada builds thru 149
+#   - skip internal Nevada builds 150-158
+#   - if not skipped, print define for Nevada-B159 or later
+SOLARIS_11_B159_OR_LATER := \
+$(shell uname -r -v \
+    | sed -n \
+          -e '/^[0-4]\. /b' \
+          -e '/^5\.[0-9] /b' \
+          -e '/^5\.10 /b' \
+          -e '/ snv_[0-9][0-9]$/b' \
+          -e '/ snv_[01][0-4][0-9]$/b' \
+          -e '/ snv_15[0-8]$/b' \
+          -e 's/.*/-DSOLARIS_11_B159_OR_LATER/' \
+          -e 'p' \
+          )
+
+# Uncomment the following to simulate building on Nevada-B159 or later
+# when actually building on Nevada-B158 or earlier:
+#SOLARIS_11_B159_OR_LATER=-DSOLARIS_11_B159_OR_LATER
+
 $(LIBSAPROC): $(SASRCFILES) $(SAMAPFILE)
 	$(QUIETLY) if [ "$(BOOT_JAVA_HOME)" = "" ]; then \
 	  echo "ALT_BOOTDIR, BOOTDIR or JAVA_HOME needs to be defined to build SA"; \
@@ -68,6 +95,7 @@
 	           -I$(GENERATED)                                       \
 	           -I$(BOOT_JAVA_HOME)/include                          \
 	           -I$(BOOT_JAVA_HOME)/include/$(Platform_os_family)    \
+	           $(SOLARIS_11_B159_OR_LATER)                          \
 	           $(SASRCFILES)                                        \
 	           $(SA_LFLAGS)                                         \
 	           -o $@                                                \
--- a/make/solaris/makefiles/sparcWorks.make	Fri May 06 14:32:44 2011 -0700
+++ b/make/solaris/makefiles/sparcWorks.make	Tue May 24 11:09:39 2011 -0700
@@ -100,11 +100,6 @@
 
 LINK_LIB.CC/PRE_HOOK += $(JVM_CHECK_SYMBOLS) || exit 1;
 
-# Some interfaces (_lwp_create) changed with LP64 and Solaris 7
-SOLARIS_7_OR_LATER := \
-$(shell uname -r | awk -F. '{ if ($$2 >= 7) print "-DSOLARIS_7_OR_LATER"; }')
-CFLAGS += ${SOLARIS_7_OR_LATER}
-
 # New architecture options started in SS12 (5.9), we need both styles to build.
 #   The older arch options for SS11 (5.8) or older and also for /usr/ccs/bin/as.
 #   Note: default for 32bit sparc is now the same as v8plus, so the
--- a/make/windows/build.make	Fri May 06 14:32:44 2011 -0700
+++ b/make/windows/build.make	Tue May 24 11:09:39 2011 -0700
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -125,7 +125,25 @@
 # or make/hotspot_distro.
 !ifndef HOTSPOT_VM_DISTRO
 !if exists($(WorkSpace)\src\closed)
+
+# if the build is for JDK6 or earlier version, it should include jdk6_hotspot_distro,
+# instead of hotspot_distro.
+JDK6_OR_EARLIER=0
+!if "$(JDK_MAJOR_VERSION)" != "" && "$(JDK_MINOR_VERSION)" != "" && "$(JDK_MICRO_VERSION)" != ""
+!if $(JDK_MAJOR_VERSION) == 1 && $(JDK_MINOR_VERSION) < 7
+JDK6_OR_EARLIER=1
+!endif
+!else
+!if $(JDK_MAJOR_VER) == 1 && $(JDK_MINOR_VER) < 7
+JDK6_OR_EARLIER=1
+!endif
+!endif
+
+!if $(JDK6_OR_EARLIER) == 1
+!include $(WorkSpace)\make\jdk6_hotspot_distro
+!else
 !include $(WorkSpace)\make\hotspot_distro
+!endif
 !else
 !include $(WorkSpace)\make\openjdk_distro
 !endif
@@ -260,7 +278,7 @@
 	@ echo Variant=$(realVariant)				>> $@
 	@ echo WorkSpace=$(WorkSpace)				>> $@
 	@ echo BootStrapDir=$(BootStrapDir)			>> $@
-        @ if "$(USERNAME)" NEQ "" echo BuildUser=$(USERNAME)	>> $@
+	@ if "$(USERNAME)" NEQ "" echo BuildUser=$(USERNAME)	>> $@
 	@ echo HS_VER=$(HS_VER)					>> $@
 	@ echo HS_DOTVER=$(HS_DOTVER)				>> $@
 	@ echo HS_COMPANY=$(COMPANY_NAME)			>> $@
--- a/src/cpu/sparc/vm/cppInterpreter_sparc.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/sparc/vm/cppInterpreter_sparc.cpp	Tue May 24 11:09:39 2011 -0700
@@ -2176,6 +2176,7 @@
                                            int tempcount, // Number of slots on java expression stack in use
                                            int popframe_extra_args,
                                            int moncount,  // Number of active monitors
+                                           int caller_actual_parameters,
                                            int callee_param_size,
                                            int callee_locals_size,
                                            frame* caller,
--- a/src/cpu/sparc/vm/frame_sparc.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/sparc/vm/frame_sparc.cpp	Tue May 24 11:09:39 2011 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -806,3 +806,34 @@
   int index = (Interpreter::expr_offset_in_bytes(offset)/wordSize) - 1;
   return &interpreter_frame_tos_address()[index];
 }
+
+
+#ifdef ASSERT
+
+#define DESCRIBE_FP_OFFSET(name) \
+  values.describe(frame_no, fp() + frame::name##_offset, #name)
+
+void frame::describe_pd(FrameValues& values, int frame_no) {
+  for (int w = 0; w < frame::register_save_words; w++) {
+    values.describe(frame_no, sp() + w, err_msg("register save area word %d", w), 1);
+  }
+
+  if (is_interpreted_frame()) {
+    DESCRIBE_FP_OFFSET(interpreter_frame_d_scratch_fp);
+    DESCRIBE_FP_OFFSET(interpreter_frame_l_scratch_fp);
+    DESCRIBE_FP_OFFSET(interpreter_frame_padding);
+    DESCRIBE_FP_OFFSET(interpreter_frame_oop_temp);
+  }
+
+  if (!is_compiled_frame()) {
+    if (frame::callee_aggregate_return_pointer_words != 0) {
+      values.describe(frame_no, sp() + frame::callee_aggregate_return_pointer_sp_offset, "callee_aggregate_return_pointer_word");
+    }
+    for (int w = 0; w < frame::callee_register_argument_save_area_words; w++) {
+      values.describe(frame_no, sp() + frame::callee_register_argument_save_area_sp_offset + w,
+                      err_msg("callee_register_argument_save_area_words %d", w));
+    }
+  }
+}
+
+#endif
--- a/src/cpu/sparc/vm/interpreter_sparc.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/sparc/vm/interpreter_sparc.cpp	Tue May 24 11:09:39 2011 -0700
@@ -423,25 +423,6 @@
   return true;
 }
 
-// This method tells the deoptimizer how big an interpreted frame must be:
-int AbstractInterpreter::size_activation(methodOop method,
-                                         int tempcount,
-                                         int popframe_extra_args,
-                                         int moncount,
-                                         int callee_param_count,
-                                         int callee_locals,
-                                         bool is_top_frame) {
-  return layout_activation(method,
-                           tempcount,
-                           popframe_extra_args,
-                           moncount,
-                           callee_param_count,
-                           callee_locals,
-                           (frame*)NULL,
-                           (frame*)NULL,
-                           is_top_frame);
-}
-
 void Deoptimization::unwind_callee_save_values(frame* f, vframeArray* vframe_array) {
 
   // This code is sort of the equivalent of C2IAdapter::setup_stack_frame back in
--- a/src/cpu/sparc/vm/methodHandles_sparc.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/sparc/vm/methodHandles_sparc.cpp	Tue May 24 11:09:39 2011 -0700
@@ -142,18 +142,8 @@
   Register O2_form    = O2_scratch;
   Register O3_adapter = O3_scratch;
   __ load_heap_oop(Address(O0_mtype, __ delayed_value(java_lang_invoke_MethodType::form_offset_in_bytes,               O1_scratch)), O2_form);
-  // load_heap_oop(Address(O2_form,  __ delayed_value(java_lang_invoke_MethodTypeForm::genericInvoker_offset_in_bytes, O1_scratch)), O3_adapter);
-  // deal with old JDK versions:
-  __ add(          Address(O2_form,  __ delayed_value(java_lang_invoke_MethodTypeForm::genericInvoker_offset_in_bytes, O1_scratch)), O3_adapter);
-  __ cmp(O3_adapter, O2_form);
-  Label sorry_no_invoke_generic;
-  __ brx(Assembler::lessUnsigned, false, Assembler::pn, sorry_no_invoke_generic);
-  __ delayed()->nop();
-
-  __ load_heap_oop(Address(O3_adapter, 0), O3_adapter);
-  __ tst(O3_adapter);
-  __ brx(Assembler::zero, false, Assembler::pn, sorry_no_invoke_generic);
-  __ delayed()->nop();
+  __ load_heap_oop(Address(O2_form,  __ delayed_value(java_lang_invoke_MethodTypeForm::genericInvoker_offset_in_bytes, O1_scratch)), O3_adapter);
+  __ verify_oop(O3_adapter);
   __ st_ptr(O3_adapter, Address(O4_argbase, 1 * Interpreter::stackElementSize));
   // As a trusted first argument, pass the type being called, so the adapter knows
   // the actual types of the arguments and return values.
@@ -164,12 +154,6 @@
   trace_method_handle(_masm, "invokeGeneric");
   __ jump_to_method_handle_entry(G3_method_handle, O1_scratch);
 
-  __ bind(sorry_no_invoke_generic); // no invokeGeneric implementation available!
-  __ mov(O0_mtype, G5_method_type);  // required by throw_WrongMethodType
-  // mov(G3_method_handle, G3_method_handle);  // already in this register
-  __ jump_to(AddressLiteral(Interpreter::throw_WrongMethodType_entry()), O1_scratch);
-  __ delayed()->nop();
-
   return entry_point;
 }
 
@@ -350,8 +334,9 @@
 #ifndef PRODUCT
 extern "C" void print_method_handle(oop mh);
 void trace_method_handle_stub(const char* adaptername,
-                              oopDesc* mh) {
-  printf("MH %s mh="INTPTR_FORMAT"\n", adaptername, (intptr_t) mh);
+                              oopDesc* mh,
+                              intptr_t* saved_sp) {
+  tty->print_cr("MH %s mh="INTPTR_FORMAT " saved_sp=" INTPTR_FORMAT, adaptername, (intptr_t) mh, saved_sp);
   print_method_handle(mh);
 }
 void MethodHandles::trace_method_handle(MacroAssembler* _masm, const char* adaptername) {
@@ -361,6 +346,7 @@
   __ save_frame(16);
   __ set((intptr_t) adaptername, O0);
   __ mov(G3_method_handle, O1);
+  __ mov(I5_savedSP, O2);
   __ mov(G3_method_handle, L3);
   __ mov(Gargs, L4);
   __ mov(G5_method_type, L5);
@@ -643,9 +629,10 @@
 
       // Live at this point:
       // - G5_klass        :  klass required by the target method
+      // - O0_argslot      :  argslot index in vmarg; may be required in the failing path
       // - O1_scratch      :  argument klass to test
       // - G3_method_handle:  adapter method handle
-      __ check_klass_subtype(O1_scratch, G5_klass, O0_argslot, O2_scratch, done);
+      __ check_klass_subtype(O1_scratch, G5_klass, O2_scratch, O3_scratch, done);
 
       // If we get here, the type check failed!
       __ load_heap_oop(G3_amh_argument,        O2_required);  // required class
--- a/src/cpu/sparc/vm/templateInterpreter_sparc.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/sparc/vm/templateInterpreter_sparc.cpp	Tue May 24 11:09:39 2011 -0700
@@ -1623,6 +1623,7 @@
                                            int tempcount,
                                            int popframe_extra_args,
                                            int moncount,
+                                           int caller_actual_parameters,
                                            int callee_param_count,
                                            int callee_local_count,
                                            frame* caller,
@@ -1698,7 +1699,6 @@
                      popframe_extra_args;
 
     int local_words = method->max_locals() * Interpreter::stackElementWords;
-    int parm_words  = method->size_of_parameters() * Interpreter::stackElementWords;
     NEEDS_CLEANUP;
     intptr_t* locals;
     if (caller->is_interpreted_frame()) {
@@ -1706,6 +1706,7 @@
       intptr_t* Lesp_ptr = caller->interpreter_frame_tos_address() - 1;
       // Note that this computation means we replace size_of_parameters() values from the caller
       // interpreter frame's expression stack with our argument locals
+      int parm_words  = caller_actual_parameters * Interpreter::stackElementWords;
       locals = Lesp_ptr + parm_words;
       int delta = local_words - parm_words;
       int computed_sp_adjustment = (delta > 0) ? round_to(delta, WordsPerLong) : 0;
--- a/src/cpu/x86/vm/assembler_x86.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Tue May 24 11:09:39 2011 -0700
@@ -6039,6 +6039,43 @@
   call_VM_leaf(entry_point, 3);
 }
 
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
+  pass_arg0(this, arg_0);
+  MacroAssembler::call_VM_leaf_base(entry_point, 1);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
+
+  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
+  pass_arg1(this, arg_1);
+  pass_arg0(this, arg_0);
+  MacroAssembler::call_VM_leaf_base(entry_point, 2);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
+  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
+  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
+  pass_arg2(this, arg_2);
+  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
+  pass_arg1(this, arg_1);
+  pass_arg0(this, arg_0);
+  MacroAssembler::call_VM_leaf_base(entry_point, 3);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
+  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
+  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
+  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
+  pass_arg3(this, arg_3);
+  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
+  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
+  pass_arg2(this, arg_2);
+  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
+  pass_arg1(this, arg_1);
+  pass_arg0(this, arg_0);
+  MacroAssembler::call_VM_leaf_base(entry_point, 4);
+}
+
 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
 }
 
--- a/src/cpu/x86/vm/assembler_x86.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Tue May 24 11:09:39 2011 -0700
@@ -234,6 +234,20 @@
     a._disp += disp;
     return a;
   }
+  Address plus_disp(RegisterOrConstant disp, ScaleFactor scale = times_1) const {
+    Address a = (*this);
+    a._disp += disp.constant_or_zero() * scale_size(scale);
+    if (disp.is_register()) {
+      assert(!a.index()->is_valid(), "competing indexes");
+      a._index = disp.as_register();
+      a._scale = scale;
+    }
+    return a;
+  }
+  bool is_same_address(Address a) const {
+    // disregard _rspec
+    return _base == a._base && _disp == a._disp && _index == a._index && _scale == a._scale;
+  }
 
   // The following two overloads are used in connection with the
   // ByteSize type (see sizes.hpp).  They simplify the use of
@@ -1655,6 +1669,14 @@
   void call_VM_leaf(address entry_point,
                     Register arg_1, Register arg_2, Register arg_3);
 
+  // These always tightly bind to MacroAssembler::call_VM_leaf_base
+  // bypassing the virtual implementation
+  void super_call_VM_leaf(address entry_point);
+  void super_call_VM_leaf(address entry_point, Register arg_1);
+  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
+  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
+  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);
+
   // last Java Frame (fills frame anchor)
   void set_last_Java_frame(Register thread,
                            Register last_java_sp,
@@ -2021,6 +2043,10 @@
   void addptr(Register dst, Address src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); }
   void addptr(Register dst, int32_t src);
   void addptr(Register dst, Register src);
+  void addptr(Register dst, RegisterOrConstant src) {
+    if (src.is_constant()) addptr(dst, (int) src.as_constant());
+    else                   addptr(dst,       src.as_register());
+  }
 
   void andptr(Register dst, int32_t src);
   void andptr(Register src1, Register src2) { LP64_ONLY(andq(src1, src2)) NOT_LP64(andl(src1, src2)) ; }
@@ -2082,7 +2108,10 @@
   void subptr(Register dst, Address src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
   void subptr(Register dst, int32_t src);
   void subptr(Register dst, Register src);
-
+  void subptr(Register dst, RegisterOrConstant src) {
+    if (src.is_constant()) subptr(dst, (int) src.as_constant());
+    else                   subptr(dst,       src.as_register());
+  }
 
   void sbbptr(Address dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
   void sbbptr(Register dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
@@ -2280,6 +2309,11 @@
 
   void movptr(Address dst, Register src);
 
+  void movptr(Register dst, RegisterOrConstant src) {
+    if (src.is_constant()) movptr(dst, src.as_constant());
+    else                   movptr(dst, src.as_register());
+  }
+
 #ifdef _LP64
   // Generally the next two are only used for moving NULL
   // Although there are situations in initializing the mark word where
--- a/src/cpu/x86/vm/cppInterpreter_x86.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/cppInterpreter_x86.cpp	Tue May 24 11:09:39 2011 -0700
@@ -2339,14 +2339,15 @@
 }
 
 int AbstractInterpreter::layout_activation(methodOop method,
-                                                int tempcount,  //
-                                                int popframe_extra_args,
-                                                int moncount,
-                                                int callee_param_count,
-                                                int callee_locals,
-                                                frame* caller,
-                                                frame* interpreter_frame,
-                                                bool is_top_frame) {
+                                           int tempcount,  //
+                                           int popframe_extra_args,
+                                           int moncount,
+                                           int caller_actual_parameters,
+                                           int callee_param_count,
+                                           int callee_locals,
+                                           frame* caller,
+                                           frame* interpreter_frame,
+                                           bool is_top_frame) {
 
   assert(popframe_extra_args == 0, "FIX ME");
   // NOTE this code must exactly mimic what InterpreterGenerator::generate_compute_interpreter_state()
--- a/src/cpu/x86/vm/frame_x86.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/frame_x86.cpp	Tue May 24 11:09:39 2011 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -339,7 +339,6 @@
   return fr;
 }
 
-
 //------------------------------------------------------------------------------
 // frame::verify_deopt_original_pc
 //
@@ -361,6 +360,55 @@
 }
 #endif
 
+//------------------------------------------------------------------------------
+// frame::adjust_unextended_sp
+void frame::adjust_unextended_sp() {
+  // If we are returning to a compiled MethodHandle call site, the
+  // saved_fp will in fact be a saved value of the unextended SP.  The
+  // simplest way to tell whether we are returning to such a call site
+  // is as follows:
+
+  nmethod* sender_nm = (_cb == NULL) ? NULL : _cb->as_nmethod_or_null();
+  if (sender_nm != NULL) {
+    // If the sender PC is a deoptimization point, get the original
+    // PC.  For MethodHandle call site the unextended_sp is stored in
+    // saved_fp.
+    if (sender_nm->is_deopt_mh_entry(_pc)) {
+      DEBUG_ONLY(verify_deopt_mh_original_pc(sender_nm, _fp));
+      _unextended_sp = _fp;
+    }
+    else if (sender_nm->is_deopt_entry(_pc)) {
+      DEBUG_ONLY(verify_deopt_original_pc(sender_nm, _unextended_sp));
+    }
+    else if (sender_nm->is_method_handle_return(_pc)) {
+      _unextended_sp = _fp;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// frame::update_map_with_saved_link
+void frame::update_map_with_saved_link(RegisterMap* map, intptr_t** link_addr) {
+  // The interpreter and compiler(s) always save EBP/RBP in a known
+  // location on entry. We must record where that location is
+  // so this if EBP/RBP was live on callout from c2 we can find
+  // the saved copy no matter what it called.
+
+  // Since the interpreter always saves EBP/RBP if we record where it is then
+  // we don't have to always save EBP/RBP on entry and exit to c2 compiled
+  // code, on entry will be enough.
+  map->set_location(rbp->as_VMReg(), (address) link_addr);
+#ifdef AMD64
+  // this is weird "H" ought to be at a higher address however the
+  // oopMaps seems to have the "H" regs at the same address and the
+  // vanilla register.
+  // XXXX make this go away
+  if (true) {
+    map->set_location(rbp->as_VMReg()->next(), (address) link_addr);
+  }
+#endif // AMD64
+}
+
 
 //------------------------------------------------------------------------------
 // frame::sender_for_interpreter_frame
@@ -372,54 +420,13 @@
   // This is the sp before any possible extension (adapter/locals).
   intptr_t* unextended_sp = interpreter_frame_sender_sp();
 
-  // Stored FP.
-  intptr_t* saved_fp = link();
-
-  address sender_pc = this->sender_pc();
-  CodeBlob* sender_cb = CodeCache::find_blob_unsafe(sender_pc);
-  assert(sender_cb, "sanity");
-  nmethod* sender_nm = sender_cb->as_nmethod_or_null();
-
-  if (sender_nm != NULL) {
-    // If the sender PC is a deoptimization point, get the original
-    // PC.  For MethodHandle call site the unextended_sp is stored in
-    // saved_fp.
-    if (sender_nm->is_deopt_mh_entry(sender_pc)) {
-      DEBUG_ONLY(verify_deopt_mh_original_pc(sender_nm, saved_fp));
-      unextended_sp = saved_fp;
-    }
-    else if (sender_nm->is_deopt_entry(sender_pc)) {
-      DEBUG_ONLY(verify_deopt_original_pc(sender_nm, unextended_sp));
-    }
-    else if (sender_nm->is_method_handle_return(sender_pc)) {
-      unextended_sp = saved_fp;
-    }
-  }
-
-  // The interpreter and compiler(s) always save EBP/RBP in a known
-  // location on entry. We must record where that location is
-  // so this if EBP/RBP was live on callout from c2 we can find
-  // the saved copy no matter what it called.
-
-  // Since the interpreter always saves EBP/RBP if we record where it is then
-  // we don't have to always save EBP/RBP on entry and exit to c2 compiled
-  // code, on entry will be enough.
 #ifdef COMPILER2
   if (map->update_map()) {
-    map->set_location(rbp->as_VMReg(), (address) addr_at(link_offset));
-#ifdef AMD64
-    // this is weird "H" ought to be at a higher address however the
-    // oopMaps seems to have the "H" regs at the same address and the
-    // vanilla register.
-    // XXXX make this go away
-    if (true) {
-      map->set_location(rbp->as_VMReg()->next(), (address)addr_at(link_offset));
-    }
-#endif // AMD64
+    update_map_with_saved_link(map, (intptr_t**) addr_at(link_offset));
   }
 #endif // COMPILER2
 
-  return frame(sender_sp, unextended_sp, saved_fp, sender_pc);
+  return frame(sender_sp, unextended_sp, link(), sender_pc());
 }
 
 
@@ -427,6 +434,7 @@
 // frame::sender_for_compiled_frame
 frame frame::sender_for_compiled_frame(RegisterMap* map) const {
   assert(map != NULL, "map must be set");
+  assert(!is_ricochet_frame(), "caller must handle this");
 
   // frame owned by optimizing compiler
   assert(_cb->frame_size() >= 0, "must have non-zero frame size");
@@ -438,31 +446,7 @@
 
   // This is the saved value of EBP which may or may not really be an FP.
   // It is only an FP if the sender is an interpreter frame (or C1?).
-  intptr_t* saved_fp = (intptr_t*) *(sender_sp - frame::sender_sp_offset);
-
-  // If we are returning to a compiled MethodHandle call site, the
-  // saved_fp will in fact be a saved value of the unextended SP.  The
-  // simplest way to tell whether we are returning to such a call site
-  // is as follows:
-  CodeBlob* sender_cb = CodeCache::find_blob_unsafe(sender_pc);
-  assert(sender_cb, "sanity");
-  nmethod* sender_nm = sender_cb->as_nmethod_or_null();
-
-  if (sender_nm != NULL) {
-    // If the sender PC is a deoptimization point, get the original
-    // PC.  For MethodHandle call site the unextended_sp is stored in
-    // saved_fp.
-    if (sender_nm->is_deopt_mh_entry(sender_pc)) {
-      DEBUG_ONLY(verify_deopt_mh_original_pc(sender_nm, saved_fp));
-      unextended_sp = saved_fp;
-    }
-    else if (sender_nm->is_deopt_entry(sender_pc)) {
-      DEBUG_ONLY(verify_deopt_original_pc(sender_nm, unextended_sp));
-    }
-    else if (sender_nm->is_method_handle_return(sender_pc)) {
-      unextended_sp = saved_fp;
-    }
-  }
+  intptr_t** saved_fp_addr = (intptr_t**) (sender_sp - frame::sender_sp_offset);
 
   if (map->update_map()) {
     // Tell GC to use argument oopmaps for some runtime stubs that need it.
@@ -472,23 +456,15 @@
     if (_cb->oop_maps() != NULL) {
       OopMapSet::update_register_map(this, map);
     }
+
     // Since the prolog does the save and restore of EBP there is no oopmap
     // for it so we must fill in its location as if there was an oopmap entry
     // since if our caller was compiled code there could be live jvm state in it.
-    map->set_location(rbp->as_VMReg(), (address) (sender_sp - frame::sender_sp_offset));
-#ifdef AMD64
-    // this is weird "H" ought to be at a higher address however the
-    // oopMaps seems to have the "H" regs at the same address and the
-    // vanilla register.
-    // XXXX make this go away
-    if (true) {
-      map->set_location(rbp->as_VMReg()->next(), (address) (sender_sp - frame::sender_sp_offset));
-    }
-#endif // AMD64
+    update_map_with_saved_link(map, saved_fp_addr);
   }
 
   assert(sender_sp != sp(), "must have changed");
-  return frame(sender_sp, unextended_sp, saved_fp, sender_pc);
+  return frame(sender_sp, unextended_sp, *saved_fp_addr, sender_pc);
 }
 
 
@@ -502,6 +478,7 @@
   if (is_entry_frame())       return sender_for_entry_frame(map);
   if (is_interpreted_frame()) return sender_for_interpreter_frame(map);
   assert(_cb == CodeCache::find_blob(pc()),"Must be the same");
+  if (is_ricochet_frame())    return sender_for_ricochet_frame(map);
 
   if (_cb != NULL) {
     return sender_for_compiled_frame(map);
@@ -669,3 +646,23 @@
   int index = (Interpreter::expr_offset_in_bytes(offset)/wordSize);
   return &interpreter_frame_tos_address()[index];
 }
+
+#ifdef ASSERT
+
+#define DESCRIBE_FP_OFFSET(name) \
+  values.describe(frame_no, fp() + frame::name##_offset, #name)
+
+void frame::describe_pd(FrameValues& values, int frame_no) {
+  if (is_interpreted_frame()) {
+    DESCRIBE_FP_OFFSET(interpreter_frame_sender_sp);
+    DESCRIBE_FP_OFFSET(interpreter_frame_last_sp);
+    DESCRIBE_FP_OFFSET(interpreter_frame_method);
+    DESCRIBE_FP_OFFSET(interpreter_frame_mdx);
+    DESCRIBE_FP_OFFSET(interpreter_frame_cache);
+    DESCRIBE_FP_OFFSET(interpreter_frame_locals);
+    DESCRIBE_FP_OFFSET(interpreter_frame_bcx);
+    DESCRIBE_FP_OFFSET(interpreter_frame_initial_sp);
+  }
+
+}
+#endif
--- a/src/cpu/x86/vm/frame_x86.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/frame_x86.hpp	Tue May 24 11:09:39 2011 -0700
@@ -164,6 +164,7 @@
   // original sp we use that convention.
 
   intptr_t*     _unextended_sp;
+  void adjust_unextended_sp();
 
   intptr_t* ptr_at_addr(int offset) const {
     return (intptr_t*) addr_at(offset);
@@ -197,6 +198,9 @@
   // expression stack tos if we are nested in a java call
   intptr_t* interpreter_frame_last_sp() const;
 
+  // helper to update a map with callee-saved RBP
+  static void update_map_with_saved_link(RegisterMap* map, intptr_t** link_addr);
+
 #ifndef CC_INTERP
   // deoptimization support
   void interpreter_frame_set_last_sp(intptr_t* sp);
--- a/src/cpu/x86/vm/frame_x86.inline.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/frame_x86.inline.hpp	Tue May 24 11:09:39 2011 -0700
@@ -62,6 +62,7 @@
   _pc = pc;
   assert(pc != NULL, "no pc?");
   _cb = CodeCache::find_blob(pc);
+  adjust_unextended_sp();
 
   address original_pc = nmethod::get_deopt_original_pc(this);
   if (original_pc != NULL) {
--- a/src/cpu/x86/vm/interp_masm_x86_32.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/interp_masm_x86_32.cpp	Tue May 24 11:09:39 2011 -0700
@@ -383,32 +383,6 @@
   movptr(Address(rsp, Interpreter::expr_offset_in_bytes(n)), val);
 }
 
-void InterpreterMacroAssembler::super_call_VM_leaf(address entry_point) {
-  MacroAssembler::call_VM_leaf_base(entry_point, 0);
-}
-
-
-void InterpreterMacroAssembler::super_call_VM_leaf(address entry_point, Register arg_1) {
-  push(arg_1);
-  MacroAssembler::call_VM_leaf_base(entry_point, 1);
-}
-
-
-void InterpreterMacroAssembler::super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
-  push(arg_2);
-  push(arg_1);
-  MacroAssembler::call_VM_leaf_base(entry_point, 2);
-}
-
-
-void InterpreterMacroAssembler::super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
-  push(arg_3);
-  push(arg_2);
-  push(arg_1);
-  MacroAssembler::call_VM_leaf_base(entry_point, 3);
-}
-
-
 void InterpreterMacroAssembler::prepare_to_jump_from_interpreted() {
   // set sender sp
   lea(rsi, Address(rsp, wordSize));
--- a/src/cpu/x86/vm/interp_masm_x86_32.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/interp_masm_x86_32.hpp	Tue May 24 11:09:39 2011 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -124,12 +124,6 @@
   void load_ptr(int n, Register val);
   void store_ptr(int n, Register val);
 
-  // Super call_VM calls - correspond to MacroAssembler::call_VM(_leaf) calls
-  void super_call_VM_leaf(address entry_point);
-  void super_call_VM_leaf(address entry_point, Register arg_1);
-  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
-  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
-
   // Generate a subtype check: branch to ok_is_subtype if sub_klass is
   // a subtype of super_klass.  EAX holds the super_klass.  Blows ECX
   // and EDI.  Register sub_klass cannot be any of the above.
--- a/src/cpu/x86/vm/interp_masm_x86_64.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/interp_masm_x86_64.cpp	Tue May 24 11:09:39 2011 -0700
@@ -381,56 +381,6 @@
 }
 
 
-void InterpreterMacroAssembler::super_call_VM_leaf(address entry_point) {
-  MacroAssembler::call_VM_leaf_base(entry_point, 0);
-}
-
-
-void InterpreterMacroAssembler::super_call_VM_leaf(address entry_point,
-                                                   Register arg_1) {
-  if (c_rarg0 != arg_1) {
-    mov(c_rarg0, arg_1);
-  }
-  MacroAssembler::call_VM_leaf_base(entry_point, 1);
-}
-
-
-void InterpreterMacroAssembler::super_call_VM_leaf(address entry_point,
-                                                   Register arg_1,
-                                                   Register arg_2) {
-  assert(c_rarg0 != arg_2, "smashed argument");
-  assert(c_rarg1 != arg_1, "smashed argument");
-  if (c_rarg0 != arg_1) {
-    mov(c_rarg0, arg_1);
-  }
-  if (c_rarg1 != arg_2) {
-    mov(c_rarg1, arg_2);
-  }
-  MacroAssembler::call_VM_leaf_base(entry_point, 2);
-}
-
-void InterpreterMacroAssembler::super_call_VM_leaf(address entry_point,
-                                                   Register arg_1,
-                                                   Register arg_2,
-                                                   Register arg_3) {
-  assert(c_rarg0 != arg_2, "smashed argument");
-  assert(c_rarg0 != arg_3, "smashed argument");
-  assert(c_rarg1 != arg_1, "smashed argument");
-  assert(c_rarg1 != arg_3, "smashed argument");
-  assert(c_rarg2 != arg_1, "smashed argument");
-  assert(c_rarg2 != arg_2, "smashed argument");
-  if (c_rarg0 != arg_1) {
-    mov(c_rarg0, arg_1);
-  }
-  if (c_rarg1 != arg_2) {
-    mov(c_rarg1, arg_2);
-  }
-  if (c_rarg2 != arg_3) {
-    mov(c_rarg2, arg_3);
-  }
-  MacroAssembler::call_VM_leaf_base(entry_point, 3);
-}
-
 void InterpreterMacroAssembler::prepare_to_jump_from_interpreted() {
   // set sender sp
   lea(r13, Address(rsp, wordSize));
--- a/src/cpu/x86/vm/interp_masm_x86_64.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/interp_masm_x86_64.hpp	Tue May 24 11:09:39 2011 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -136,13 +136,6 @@
   void load_ptr(int n, Register val);
   void store_ptr(int n, Register val);
 
-  // Super call_VM calls - correspond to MacroAssembler::call_VM(_leaf) calls
-  void super_call_VM_leaf(address entry_point);
-  void super_call_VM_leaf(address entry_point, Register arg_1);
-  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
-  void super_call_VM_leaf(address entry_point,
-                          Register arg_1, Register arg_2, Register arg_3);
-
   // Generate a subtype check: branch to ok_is_subtype if sub_klass is
   // a subtype of super_klass.
   void gen_subtype_check( Register sub_klass, Label &ok_is_subtype );
--- a/src/cpu/x86/vm/interpreter_x86.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/interpreter_x86.hpp	Tue May 24 11:09:39 2011 -0700
@@ -26,7 +26,9 @@
 #define CPU_X86_VM_INTERPRETER_X86_HPP
 
  public:
-  static Address::ScaleFactor stackElementScale() { return Address::times_4; }
+  static Address::ScaleFactor stackElementScale() {
+    return NOT_LP64(Address::times_4) LP64_ONLY(Address::times_8);
+  }
 
   // Offset from rsp (which points to the last stack element)
   static int expr_offset_in_bytes(int i) { return stackElementSize * i; }
--- a/src/cpu/x86/vm/interpreter_x86_32.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/interpreter_x86_32.cpp	Tue May 24 11:09:39 2011 -0700
@@ -242,26 +242,6 @@
   return entry_point;
 }
 
-
-// This method tells the deoptimizer how big an interpreted frame must be:
-int AbstractInterpreter::size_activation(methodOop method,
-                                         int tempcount,
-                                         int popframe_extra_args,
-                                         int moncount,
-                                         int callee_param_count,
-                                         int callee_locals,
-                                         bool is_top_frame) {
-  return layout_activation(method,
-                           tempcount,
-                           popframe_extra_args,
-                           moncount,
-                           callee_param_count,
-                           callee_locals,
-                           (frame*) NULL,
-                           (frame*) NULL,
-                           is_top_frame);
-}
-
 void Deoptimization::unwind_callee_save_values(frame* f, vframeArray* vframe_array) {
 
   // This code is sort of the equivalent of C2IAdapter::setup_stack_frame back in
--- a/src/cpu/x86/vm/interpreter_x86_64.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/interpreter_x86_64.cpp	Tue May 24 11:09:39 2011 -0700
@@ -362,20 +362,6 @@
 
 }
 
-// This method tells the deoptimizer how big an interpreted frame must be:
-int AbstractInterpreter::size_activation(methodOop method,
-                                         int tempcount,
-                                         int popframe_extra_args,
-                                         int moncount,
-                                         int callee_param_count,
-                                         int callee_locals,
-                                         bool is_top_frame) {
-  return layout_activation(method,
-                           tempcount, popframe_extra_args, moncount,
-                           callee_param_count, callee_locals,
-                           (frame*) NULL, (frame*) NULL, is_top_frame);
-}
-
 void Deoptimization::unwind_callee_save_values(frame* f, vframeArray* vframe_array) {
 
   // This code is sort of the equivalent of C2IAdapter::setup_stack_frame back in
--- a/src/cpu/x86/vm/methodHandles_x86.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/methodHandles_x86.cpp	Tue May 24 11:09:39 2011 -0700
@@ -69,23 +69,475 @@
   return me;
 }
 
+// stack walking support
+
+frame MethodHandles::ricochet_frame_sender(const frame& fr, RegisterMap *map) {
+  RicochetFrame* f = RicochetFrame::from_frame(fr);
+  if (map->update_map())
+    frame::update_map_with_saved_link(map, &f->_sender_link);
+  return frame(f->extended_sender_sp(), f->exact_sender_sp(), f->sender_link(), f->sender_pc());
+}
+
+void MethodHandles::ricochet_frame_oops_do(const frame& fr, OopClosure* blk, const RegisterMap* reg_map) {
+  RicochetFrame* f = RicochetFrame::from_frame(fr);
+
+  // pick up the argument type descriptor:
+  Thread* thread = Thread::current();
+  Handle cookie(thread, f->compute_saved_args_layout(true, true));
+
+  // process fixed part
+  blk->do_oop((oop*)f->saved_target_addr());
+  blk->do_oop((oop*)f->saved_args_layout_addr());
+
+  // process variable arguments:
+  if (cookie.is_null())  return;  // no arguments to describe
+
+  // the cookie is actually the invokeExact method for my target
+  // his argument signature is what I'm interested in
+  assert(cookie->is_method(), "");
+  methodHandle invoker(thread, methodOop(cookie()));
+  assert(invoker->name() == vmSymbols::invokeExact_name(), "must be this kind of method");
+  assert(!invoker->is_static(), "must have MH argument");
+  int slot_count = invoker->size_of_parameters();
+  assert(slot_count >= 1, "must include 'this'");
+  intptr_t* base = f->saved_args_base();
+  intptr_t* retval = NULL;
+  if (f->has_return_value_slot())
+    retval = f->return_value_slot_addr();
+  int slot_num = slot_count;
+  intptr_t* loc = &base[slot_num -= 1];
+  //blk->do_oop((oop*) loc);   // original target, which is irrelevant
+  int arg_num = 0;
+  for (SignatureStream ss(invoker->signature()); !ss.is_done(); ss.next()) {
+    if (ss.at_return_type())  continue;
+    BasicType ptype = ss.type();
+    if (ptype == T_ARRAY)  ptype = T_OBJECT; // fold all refs to T_OBJECT
+    assert(ptype >= T_BOOLEAN && ptype <= T_OBJECT, "not array or void");
+    loc = &base[slot_num -= type2size[ptype]];
+    bool is_oop = (ptype == T_OBJECT && loc != retval);
+    if (is_oop)  blk->do_oop((oop*)loc);
+    arg_num += 1;
+  }
+  assert(slot_num == 0, "must have processed all the arguments");
+}
+
+oop MethodHandles::RicochetFrame::compute_saved_args_layout(bool read_cache, bool write_cache) {
+  oop cookie = NULL;
+  if (read_cache) {
+    cookie = saved_args_layout();
+    if (cookie != NULL)  return cookie;
+  }
+  oop target = saved_target();
+  oop mtype  = java_lang_invoke_MethodHandle::type(target);
+  oop mtform = java_lang_invoke_MethodType::form(mtype);
+  cookie = java_lang_invoke_MethodTypeForm::vmlayout(mtform);
+  if (write_cache)  {
+    (*saved_args_layout_addr()) = cookie;
+  }
+  return cookie;
+}
+
+void MethodHandles::RicochetFrame::generate_ricochet_blob(MacroAssembler* _masm,
+                                                          // output params:
+                                                          int* frame_size_in_words,
+                                                          int* bounce_offset,
+                                                          int* exception_offset) {
+  (*frame_size_in_words) = RicochetFrame::frame_size_in_bytes() / wordSize;
+
+  address start = __ pc();
+
 #ifdef ASSERT
-static void verify_argslot(MacroAssembler* _masm, Register argslot_reg,
-                           const char* error_message) {
+  __ hlt(); __ hlt(); __ hlt();
+  // here's a hint of something special:
+  __ push(MAGIC_NUMBER_1);
+  __ push(MAGIC_NUMBER_2);
+#endif //ASSERT
+  __ hlt();  // not reached
+
+  // A return PC has just been popped from the stack.
+  // Return values are in registers.
+  // The ebp points into the RicochetFrame, which contains
+  // a cleanup continuation we must return to.
+
+  (*bounce_offset) = __ pc() - start;
+  BLOCK_COMMENT("ricochet_blob.bounce");
+
+  if (VerifyMethodHandles)  RicochetFrame::verify_clean(_masm);
+  trace_method_handle(_masm, "return/ricochet_blob.bounce");
+
+  __ jmp(frame_address(continuation_offset_in_bytes()));
+  __ hlt();
+  DEBUG_ONLY(__ push(MAGIC_NUMBER_2));
+
+  (*exception_offset) = __ pc() - start;
+  BLOCK_COMMENT("ricochet_blob.exception");
+
+  // compare this to Interpreter::rethrow_exception_entry, which is parallel code
+  // for example, see TemplateInterpreterGenerator::generate_throw_exception
+  // Live registers in:
+  //   rax: exception
+  //   rdx: return address/pc that threw exception (ignored, always equal to bounce addr)
+  __ verify_oop(rax);
+
+  // no need to empty_FPU_stack or reinit_heapbase, since caller frame will do the same if needed
+
+  // Take down the frame.
+
+  // Cf. InterpreterMacroAssembler::remove_activation.
+  leave_ricochet_frame(_masm, /*rcx_recv=*/ noreg,
+                       saved_last_sp_register(),
+                       /*sender_pc_reg=*/ rdx);
+
+  // In between activations - previous activation type unknown yet
+  // compute continuation point - the continuation point expects the
+  // following registers set up:
+  //
+  // rax: exception
+  // rdx: return address/pc that threw exception
+  // rsp: expression stack of caller
+  // rbp: ebp of caller
+  __ push(rax);                                  // save exception
+  __ push(rdx);                                  // save return address
+  Register thread_reg = LP64_ONLY(r15_thread) NOT_LP64(rdi);
+  NOT_LP64(__ get_thread(thread_reg));
+  __ call_VM_leaf(CAST_FROM_FN_PTR(address,
+                                   SharedRuntime::exception_handler_for_return_address),
+                  thread_reg, rdx);
+  __ mov(rbx, rax);                              // save exception handler
+  __ pop(rdx);                                   // restore return address
+  __ pop(rax);                                   // restore exception
+  __ jmp(rbx);                                   // jump to exception
+                                                 // handler of caller
+}
+
+void MethodHandles::RicochetFrame::enter_ricochet_frame(MacroAssembler* _masm,
+                                                        Register rcx_recv,
+                                                        Register rax_argv,
+                                                        address return_handler,
+                                                        Register rbx_temp) {
+  const Register saved_last_sp = saved_last_sp_register();
+  Address rcx_mh_vmtarget(    rcx_recv, java_lang_invoke_MethodHandle::vmtarget_offset_in_bytes() );
+  Address rcx_amh_conversion( rcx_recv, java_lang_invoke_AdapterMethodHandle::conversion_offset_in_bytes() );
+
+  // Push the RicochetFrame a word at a time.
+  // This creates something similar to an interpreter frame.
+  // Cf. TemplateInterpreterGenerator::generate_fixed_frame.
+  BLOCK_COMMENT("push RicochetFrame {");
+  DEBUG_ONLY(int rfo = (int) sizeof(RicochetFrame));
+  assert((rfo -= wordSize) == RicochetFrame::sender_pc_offset_in_bytes(), "");
+#define RF_FIELD(push_value, name)                                      \
+  { push_value;                                                         \
+    assert((rfo -= wordSize) == RicochetFrame::name##_offset_in_bytes(), ""); }
+  RF_FIELD(__ push(rbp),                   sender_link);
+  RF_FIELD(__ push(saved_last_sp),         exact_sender_sp);  // rsi/r13
+  RF_FIELD(__ pushptr(rcx_amh_conversion), conversion);
+  RF_FIELD(__ push(rax_argv),              saved_args_base);   // can be updated if args are shifted
+  RF_FIELD(__ push((int32_t) NULL_WORD),   saved_args_layout); // cache for GC layout cookie
+  if (UseCompressedOops) {
+    __ load_heap_oop(rbx_temp, rcx_mh_vmtarget);
+    RF_FIELD(__ push(rbx_temp),            saved_target);
+  } else {
+    RF_FIELD(__ pushptr(rcx_mh_vmtarget),  saved_target);
+  }
+  __ lea(rbx_temp, ExternalAddress(return_handler));
+  RF_FIELD(__ push(rbx_temp),              continuation);
+#undef RF_FIELD
+  assert(rfo == 0, "fully initialized the RicochetFrame");
+  // compute new frame pointer:
+  __ lea(rbp, Address(rsp, RicochetFrame::sender_link_offset_in_bytes()));
+  // Push guard word #1 in debug mode.
+  DEBUG_ONLY(__ push((int32_t) RicochetFrame::MAGIC_NUMBER_1));
+  // For debugging, leave behind an indication of which stub built this frame.
+  DEBUG_ONLY({ Label L; __ call(L, relocInfo::none); __ bind(L); });
+  BLOCK_COMMENT("} RicochetFrame");
+}
+
+void MethodHandles::RicochetFrame::leave_ricochet_frame(MacroAssembler* _masm,
+                                                        Register rcx_recv,
+                                                        Register new_sp_reg,
+                                                        Register sender_pc_reg) {
+  assert_different_registers(rcx_recv, new_sp_reg, sender_pc_reg);
+  const Register saved_last_sp = saved_last_sp_register();
+  // Take down the frame.
+  // Cf. InterpreterMacroAssembler::remove_activation.
+  BLOCK_COMMENT("end_ricochet_frame {");
+  // TO DO: If (exact_sender_sp - extended_sender_sp) > THRESH, compact the frame down.
+  // This will keep stack in bounds even with unlimited tailcalls, each with an adapter.
+  if (rcx_recv->is_valid())
+    __ movptr(rcx_recv,    RicochetFrame::frame_address(RicochetFrame::saved_target_offset_in_bytes()));
+  __ movptr(sender_pc_reg, RicochetFrame::frame_address(RicochetFrame::sender_pc_offset_in_bytes()));
+  __ movptr(saved_last_sp, RicochetFrame::frame_address(RicochetFrame::exact_sender_sp_offset_in_bytes()));
+  __ movptr(rbp,           RicochetFrame::frame_address(RicochetFrame::sender_link_offset_in_bytes()));
+  __ mov(rsp, new_sp_reg);
+  BLOCK_COMMENT("} end_ricochet_frame");
+}
+
+// Emit code to verify that RBP is pointing at a valid ricochet frame.
+#ifdef ASSERT
+enum {
+  ARG_LIMIT = 255, SLOP = 4,
+  // use this parameter for checking for garbage stack movements:
+  UNREASONABLE_STACK_MOVE = (ARG_LIMIT + SLOP)
+  // the slop defends against false alarms due to fencepost errors
+};
+
+void MethodHandles::RicochetFrame::verify_clean(MacroAssembler* _masm) {
+  // The stack should look like this:
+  //    ... keep1 | dest=42 | keep2 | RF | magic | handler | magic | recursive args |
+  // Check various invariants.
+  verify_offsets();
+
+  Register rdi_temp = rdi;
+  Register rcx_temp = rcx;
+  { __ push(rdi_temp); __ push(rcx_temp); }
+#define UNPUSH_TEMPS \
+  { __ pop(rcx_temp);  __ pop(rdi_temp); }
+
+  Address magic_number_1_addr  = RicochetFrame::frame_address(RicochetFrame::magic_number_1_offset_in_bytes());
+  Address magic_number_2_addr  = RicochetFrame::frame_address(RicochetFrame::magic_number_2_offset_in_bytes());
+  Address continuation_addr    = RicochetFrame::frame_address(RicochetFrame::continuation_offset_in_bytes());
+  Address conversion_addr      = RicochetFrame::frame_address(RicochetFrame::conversion_offset_in_bytes());
+  Address saved_args_base_addr = RicochetFrame::frame_address(RicochetFrame::saved_args_base_offset_in_bytes());
+
+  Label L_bad, L_ok;
+  BLOCK_COMMENT("verify_clean {");
+  // Magic numbers must check out:
+  __ cmpptr(magic_number_1_addr, (int32_t) MAGIC_NUMBER_1);
+  __ jcc(Assembler::notEqual, L_bad);
+  __ cmpptr(magic_number_2_addr, (int32_t) MAGIC_NUMBER_2);
+  __ jcc(Assembler::notEqual, L_bad);
+
+  // Arguments pointer must look reasonable:
+  __ movptr(rcx_temp, saved_args_base_addr);
+  __ cmpptr(rcx_temp, rbp);
+  __ jcc(Assembler::below, L_bad);
+  __ subptr(rcx_temp, UNREASONABLE_STACK_MOVE * Interpreter::stackElementSize);
+  __ cmpptr(rcx_temp, rbp);
+  __ jcc(Assembler::above, L_bad);
+
+  load_conversion_dest_type(_masm, rdi_temp, conversion_addr);
+  __ cmpl(rdi_temp, T_VOID);
+  __ jcc(Assembler::equal, L_ok);
+  __ movptr(rcx_temp, saved_args_base_addr);
+  load_conversion_vminfo(_masm, rdi_temp, conversion_addr);
+  __ cmpptr(Address(rcx_temp, rdi_temp, Interpreter::stackElementScale()),
+            (int32_t) RETURN_VALUE_PLACEHOLDER);
+  __ jcc(Assembler::equal, L_ok);
+  __ BIND(L_bad);
+  UNPUSH_TEMPS;
+  __ stop("damaged ricochet frame");
+  __ BIND(L_ok);
+  UNPUSH_TEMPS;
+  BLOCK_COMMENT("} verify_clean");
+
+#undef UNPUSH_TEMPS
+
+}
+#endif //ASSERT
+
+void MethodHandles::load_klass_from_Class(MacroAssembler* _masm, Register klass_reg) {
+  if (VerifyMethodHandles)
+    verify_klass(_masm, klass_reg, SystemDictionaryHandles::Class_klass(),
+                 "AMH argument is a Class");
+  __ load_heap_oop(klass_reg, Address(klass_reg, java_lang_Class::klass_offset_in_bytes()));
+}
+
+void MethodHandles::load_conversion_vminfo(MacroAssembler* _masm, Register reg, Address conversion_field_addr) {
+  int bits   = BitsPerByte;
+  int offset = (CONV_VMINFO_SHIFT / bits);
+  int shift  = (CONV_VMINFO_SHIFT % bits);
+  __ load_unsigned_byte(reg, conversion_field_addr.plus_disp(offset));
+  assert(CONV_VMINFO_MASK == right_n_bits(bits - shift), "else change type of previous load");
+  assert(shift == 0, "no shift needed");
+}
+
+void MethodHandles::load_conversion_dest_type(MacroAssembler* _masm, Register reg, Address conversion_field_addr) {
+  int bits   = BitsPerByte;
+  int offset = (CONV_DEST_TYPE_SHIFT / bits);
+  int shift  = (CONV_DEST_TYPE_SHIFT % bits);
+  __ load_unsigned_byte(reg, conversion_field_addr.plus_disp(offset));
+  assert(CONV_TYPE_MASK == right_n_bits(bits - shift), "else change type of previous load");
+  __ shrl(reg, shift);
+  DEBUG_ONLY(int conv_type_bits = (int) exact_log2(CONV_TYPE_MASK+1));
+  assert((shift + conv_type_bits) == bits, "left justified in byte");
+}
+
+void MethodHandles::load_stack_move(MacroAssembler* _masm,
+                                    Register rdi_stack_move,
+                                    Register rcx_amh,
+                                    bool might_be_negative) {
+  BLOCK_COMMENT("load_stack_move");
+  Address rcx_amh_conversion(rcx_amh, java_lang_invoke_AdapterMethodHandle::conversion_offset_in_bytes());
+  __ movl(rdi_stack_move, rcx_amh_conversion);
+  __ sarl(rdi_stack_move, CONV_STACK_MOVE_SHIFT);
+#ifdef _LP64
+  if (might_be_negative) {
+    // clean high bits of stack motion register (was loaded as an int)
+    __ movslq(rdi_stack_move, rdi_stack_move);
+  }
+#endif //_LP64
+  if (VerifyMethodHandles) {
+    Label L_ok, L_bad;
+    int32_t stack_move_limit = 0x4000;  // extra-large
+    __ cmpptr(rdi_stack_move, stack_move_limit);
+    __ jcc(Assembler::greaterEqual, L_bad);
+    __ cmpptr(rdi_stack_move, -stack_move_limit);
+    __ jcc(Assembler::greater, L_ok);
+    __ bind(L_bad);
+    __ stop("load_stack_move of garbage value");
+    __ BIND(L_ok);
+  }
+}
+
+#ifndef PRODUCT
+void MethodHandles::RicochetFrame::verify_offsets() {
+  // Check compatibility of this struct with the more generally used offsets of class frame:
+  int ebp_off = sender_link_offset_in_bytes();  // offset from struct base to local rbp value
+  assert(ebp_off + wordSize*frame::interpreter_frame_method_offset      == saved_args_base_offset_in_bytes(), "");
+  assert(ebp_off + wordSize*frame::interpreter_frame_last_sp_offset     == conversion_offset_in_bytes(), "");
+  assert(ebp_off + wordSize*frame::interpreter_frame_sender_sp_offset   == exact_sender_sp_offset_in_bytes(), "");
+  // These last two have to be exact:
+  assert(ebp_off + wordSize*frame::link_offset                          == sender_link_offset_in_bytes(), "");
+  assert(ebp_off + wordSize*frame::return_addr_offset                   == sender_pc_offset_in_bytes(), "");
+}
+
+void MethodHandles::RicochetFrame::verify() const {
+  verify_offsets();
+  assert(magic_number_1() == MAGIC_NUMBER_1, "");
+  assert(magic_number_2() == MAGIC_NUMBER_2, "");
+  if (!Universe::heap()->is_gc_active()) {
+    if (saved_args_layout() != NULL) {
+      assert(saved_args_layout()->is_method(), "must be valid oop");
+    }
+    if (saved_target() != NULL) {
+      assert(java_lang_invoke_MethodHandle::is_instance(saved_target()), "checking frame value");
+    }
+  }
+  int conv_op = adapter_conversion_op(conversion());
+  assert(conv_op == java_lang_invoke_AdapterMethodHandle::OP_COLLECT_ARGS ||
+         conv_op == java_lang_invoke_AdapterMethodHandle::OP_FOLD_ARGS ||
+         conv_op == java_lang_invoke_AdapterMethodHandle::OP_PRIM_TO_REF,
+         "must be a sane conversion");
+  if (has_return_value_slot()) {
+    assert(*return_value_slot_addr() == RETURN_VALUE_PLACEHOLDER, "");
+  }
+}
+#endif //PRODUCT
+
+#ifdef ASSERT
+void MethodHandles::verify_argslot(MacroAssembler* _masm,
+                                   Register argslot_reg,
+                                   const char* error_message) {
   // Verify that argslot lies within (rsp, rbp].
   Label L_ok, L_bad;
-  BLOCK_COMMENT("{ verify_argslot");
+  BLOCK_COMMENT("verify_argslot {");
   __ cmpptr(argslot_reg, rbp);
   __ jccb(Assembler::above, L_bad);
   __ cmpptr(rsp, argslot_reg);
   __ jccb(Assembler::below, L_ok);
   __ bind(L_bad);
   __ stop(error_message);
-  __ bind(L_ok);
+  __ BIND(L_ok);
   BLOCK_COMMENT("} verify_argslot");
 }
-#endif
 
+void MethodHandles::verify_argslots(MacroAssembler* _masm,
+                                    RegisterOrConstant arg_slots,
+                                    Register arg_slot_base_reg,
+                                    bool negate_argslots,
+                                    const char* error_message) {
+  // Verify that [argslot..argslot+size) lies within (rsp, rbp).
+  Label L_ok, L_bad;
+  Register rdi_temp = rdi;
+  BLOCK_COMMENT("verify_argslots {");
+  __ push(rdi_temp);
+  if (negate_argslots) {
+    if (arg_slots.is_constant()) {
+      arg_slots = -1 * arg_slots.as_constant();
+    } else {
+      __ movptr(rdi_temp, arg_slots);
+      __ negptr(rdi_temp);
+      arg_slots = rdi_temp;
+    }
+  }
+  __ lea(rdi_temp, Address(arg_slot_base_reg, arg_slots, Interpreter::stackElementScale()));
+  __ cmpptr(rdi_temp, rbp);
+  __ pop(rdi_temp);
+  __ jcc(Assembler::above, L_bad);
+  __ cmpptr(rsp, arg_slot_base_reg);
+  __ jcc(Assembler::below, L_ok);
+  __ bind(L_bad);
+  __ stop(error_message);
+  __ BIND(L_ok);
+  BLOCK_COMMENT("} verify_argslots");
+}
+
+// Make sure that arg_slots has the same sign as the given direction.
+// If (and only if) arg_slots is a assembly-time constant, also allow it to be zero.
+void MethodHandles::verify_stack_move(MacroAssembler* _masm,
+                                      RegisterOrConstant arg_slots, int direction) {
+  bool allow_zero = arg_slots.is_constant();
+  if (direction == 0) { direction = +1; allow_zero = true; }
+  assert(stack_move_unit() == -1, "else add extra checks here");
+  if (arg_slots.is_register()) {
+    Label L_ok, L_bad;
+    BLOCK_COMMENT("verify_stack_move {");
+    // testl(arg_slots.as_register(), -stack_move_unit() - 1);  // no need
+    // jcc(Assembler::notZero, L_bad);
+    __ cmpptr(arg_slots.as_register(), (int32_t) NULL_WORD);
+    if (direction > 0) {
+      __ jcc(allow_zero ? Assembler::less : Assembler::lessEqual, L_bad);
+      __ cmpptr(arg_slots.as_register(), (int32_t) UNREASONABLE_STACK_MOVE);
+      __ jcc(Assembler::less, L_ok);
+    } else {
+      __ jcc(allow_zero ? Assembler::greater : Assembler::greaterEqual, L_bad);
+      __ cmpptr(arg_slots.as_register(), (int32_t) -UNREASONABLE_STACK_MOVE);
+      __ jcc(Assembler::greater, L_ok);
+    }
+    __ bind(L_bad);
+    if (direction > 0)
+      __ stop("assert arg_slots > 0");
+    else
+      __ stop("assert arg_slots < 0");
+    __ BIND(L_ok);
+    BLOCK_COMMENT("} verify_stack_move");
+  } else {
+    intptr_t size = arg_slots.as_constant();
+    if (direction < 0)  size = -size;
+    assert(size >= 0, "correct direction of constant move");
+    assert(size < UNREASONABLE_STACK_MOVE, "reasonable size of constant move");
+  }
+}
+
+void MethodHandles::verify_klass(MacroAssembler* _masm,
+                                 Register obj, KlassHandle klass,
+                                 const char* error_message) {
+  oop* klass_addr = klass.raw_value();
+  assert(klass_addr >= SystemDictionaryHandles::Object_klass().raw_value() &&
+         klass_addr <= SystemDictionaryHandles::Long_klass().raw_value(),
+         "must be one of the SystemDictionaryHandles");
+  Register temp = rdi;
+  Label L_ok, L_bad;
+  BLOCK_COMMENT("verify_klass {");
+  __ verify_oop(obj);
+  __ testptr(obj, obj);
+  __ jcc(Assembler::zero, L_bad);
+  __ push(temp);
+  __ load_klass(temp, obj);
+  __ cmpptr(temp, ExternalAddress((address) klass_addr));
+  __ jcc(Assembler::equal, L_ok);
+  intptr_t super_check_offset = klass->super_check_offset();
+  __ movptr(temp, Address(temp, super_check_offset));
+  __ cmpptr(temp, ExternalAddress((address) klass_addr));
+  __ jcc(Assembler::equal, L_ok);
+  __ pop(temp);
+  __ bind(L_bad);
+  __ stop(error_message);
+  __ BIND(L_ok);
+  __ pop(temp);
+  BLOCK_COMMENT("} verify_klass");
+}
+#endif //ASSERT
 
 // Code generation
 address MethodHandles::generate_method_handle_interpreter_entry(MacroAssembler* _masm) {
@@ -116,6 +568,9 @@
   address entry_point = __ pc();
 
   // fetch the MethodType from the method handle into rax (the 'check' register)
+  // FIXME: Interpreter should transmit pre-popped stack pointer, to locate base of arg list.
+  // This would simplify several touchy bits of code.
+  // See 6984712: JSR 292 method handle calls need a clean argument base pointer
   {
     Register tem = rbx_method;
     for (jint* pchase = methodOopDesc::method_type_offsets_chain(); (*pchase) != -1; pchase++) {
@@ -128,17 +583,23 @@
   __ load_heap_oop(rdx_temp, Address(rax_mtype, __ delayed_value(java_lang_invoke_MethodType::form_offset_in_bytes, rdi_temp)));
   Register rdx_vmslots = rdx_temp;
   __ movl(rdx_vmslots, Address(rdx_temp, __ delayed_value(java_lang_invoke_MethodTypeForm::vmslots_offset_in_bytes, rdi_temp)));
-  __ movptr(rcx_recv, __ argument_address(rdx_vmslots));
+  Address mh_receiver_slot_addr = __ argument_address(rdx_vmslots);
+  __ movptr(rcx_recv, mh_receiver_slot_addr);
 
   trace_method_handle(_masm, "invokeExact");
 
   __ check_method_handle_type(rax_mtype, rcx_recv, rdi_temp, wrong_method_type);
+
+  // Nobody uses the MH receiver slot after this.  Make sure.
+  DEBUG_ONLY(__ movptr(mh_receiver_slot_addr, (int32_t)0x999999));
+
   __ jump_to_method_handle_entry(rcx_recv, rdi_temp);
 
   // for invokeGeneric (only), apply argument and result conversions on the fly
   __ bind(invoke_generic_slow_path);
 #ifdef ASSERT
-  { Label L;
+  if (VerifyMethodHandles) {
+    Label L;
     __ cmpb(Address(rbx_method, methodOopDesc::intrinsic_id_offset_in_bytes()), (int) vmIntrinsics::_invokeGeneric);
     __ jcc(Assembler::equal, L);
     __ stop("bad methodOop::intrinsic_id");
@@ -150,22 +611,14 @@
   // make room on the stack for another pointer:
   Register rcx_argslot = rcx_recv;
   __ lea(rcx_argslot, __ argument_address(rdx_vmslots, 1));
-  insert_arg_slots(_masm, 2 * stack_move_unit(), _INSERT_REF_MASK,
+  insert_arg_slots(_masm, 2 * stack_move_unit(),
                    rcx_argslot, rbx_temp, rdx_temp);
 
   // load up an adapter from the calling type (Java weaves this)
-  __ load_heap_oop(rdx_temp, Address(rax_mtype, __ delayed_value(java_lang_invoke_MethodType::form_offset_in_bytes, rdi_temp)));
   Register rdx_adapter = rdx_temp;
-  // __ load_heap_oop(rdx_adapter, Address(rdx_temp, java_lang_invoke_MethodTypeForm::genericInvoker_offset_in_bytes()));
-  // deal with old JDK versions:
-  __ lea(rdi_temp, Address(rdx_temp, __ delayed_value(java_lang_invoke_MethodTypeForm::genericInvoker_offset_in_bytes, rdi_temp)));
-  __ cmpptr(rdi_temp, rdx_temp);
-  Label sorry_no_invoke_generic;
-  __ jcc(Assembler::below, sorry_no_invoke_generic);
-
-  __ load_heap_oop(rdx_adapter, Address(rdi_temp, 0));
-  __ testptr(rdx_adapter, rdx_adapter);
-  __ jcc(Assembler::zero, sorry_no_invoke_generic);
+  __ load_heap_oop(rdx_temp,    Address(rax_mtype, __ delayed_value(java_lang_invoke_MethodType::form_offset_in_bytes,               rdi_temp)));
+  __ load_heap_oop(rdx_adapter, Address(rdx_temp,  __ delayed_value(java_lang_invoke_MethodTypeForm::genericInvoker_offset_in_bytes, rdi_temp)));
+  __ verify_oop(rdx_adapter);
   __ movptr(Address(rcx_argslot, 1 * Interpreter::stackElementSize), rdx_adapter);
   // As a trusted first argument, pass the type being called, so the adapter knows
   // the actual types of the arguments and return values.
@@ -176,49 +629,31 @@
   trace_method_handle(_masm, "invokeGeneric");
   __ jump_to_method_handle_entry(rcx, rdi_temp);
 
-  __ bind(sorry_no_invoke_generic); // no invokeGeneric implementation available!
-  __ movptr(rcx_recv, Address(rcx_argslot, -1 * Interpreter::stackElementSize));  // recover original MH
-  __ push(rax_mtype);       // required mtype
-  __ push(rcx_recv);        // bad mh (1st stacked argument)
-  __ jump(ExternalAddress(Interpreter::throw_WrongMethodType_entry()));
-
   return entry_point;
 }
 
+// Workaround for C++ overloading nastiness on '0' for RegisterOrConstant.
+static RegisterOrConstant constant(int value) {
+  return RegisterOrConstant(value);
+}
+
 // Helper to insert argument slots into the stack.
-// arg_slots must be a multiple of stack_move_unit() and <= 0
+// arg_slots must be a multiple of stack_move_unit() and < 0
+// rax_argslot is decremented to point to the new (shifted) location of the argslot
+// But, rdx_temp ends up holding the original value of rax_argslot.
 void MethodHandles::insert_arg_slots(MacroAssembler* _masm,
                                      RegisterOrConstant arg_slots,
-                                     int arg_mask,
                                      Register rax_argslot,
-                                     Register rbx_temp, Register rdx_temp, Register temp3_reg) {
-  assert(temp3_reg == noreg, "temp3 not required");
+                                     Register rbx_temp, Register rdx_temp) {
+  // allow constant zero
+  if (arg_slots.is_constant() && arg_slots.as_constant() == 0)
+    return;
   assert_different_registers(rax_argslot, rbx_temp, rdx_temp,
                              (!arg_slots.is_register() ? rsp : arg_slots.as_register()));
-
-#ifdef ASSERT
-  verify_argslot(_masm, rax_argslot, "insertion point must fall within current frame");
-  if (arg_slots.is_register()) {
-    Label L_ok, L_bad;
-    __ cmpptr(arg_slots.as_register(), (int32_t) NULL_WORD);
-    __ jccb(Assembler::greater, L_bad);
-    __ testl(arg_slots.as_register(), -stack_move_unit() - 1);
-    __ jccb(Assembler::zero, L_ok);
-    __ bind(L_bad);
-    __ stop("assert arg_slots <= 0 and clear low bits");
-    __ bind(L_ok);
-  } else {
-    assert(arg_slots.as_constant() <= 0, "");
-    assert(arg_slots.as_constant() % -stack_move_unit() == 0, "");
-  }
-#endif //ASSERT
-
-#ifdef _LP64
-  if (arg_slots.is_register()) {
-    // clean high bits of stack motion register (was loaded as an int)
-    __ movslq(arg_slots.as_register(), arg_slots.as_register());
-  }
-#endif
+  if (VerifyMethodHandles)
+    verify_argslot(_masm, rax_argslot, "insertion point must fall within current frame");
+  if (VerifyMethodHandles)
+    verify_stack_move(_masm, arg_slots, -1);
 
   // Make space on the stack for the inserted argument(s).
   // Then pull down everything shallower than rax_argslot.
@@ -230,59 +665,39 @@
   //   argslot -= size;
   BLOCK_COMMENT("insert_arg_slots {");
   __ mov(rdx_temp, rsp);                        // source pointer for copy
-  __ lea(rsp, Address(rsp, arg_slots, Address::times_ptr));
+  __ lea(rsp, Address(rsp, arg_slots, Interpreter::stackElementScale()));
   {
     Label loop;
     __ BIND(loop);
     // pull one word down each time through the loop
     __ movptr(rbx_temp, Address(rdx_temp, 0));
-    __ movptr(Address(rdx_temp, arg_slots, Address::times_ptr), rbx_temp);
+    __ movptr(Address(rdx_temp, arg_slots, Interpreter::stackElementScale()), rbx_temp);
     __ addptr(rdx_temp, wordSize);
     __ cmpptr(rdx_temp, rax_argslot);
-    __ jccb(Assembler::less, loop);
+    __ jcc(Assembler::below, loop);
   }
 
   // Now move the argslot down, to point to the opened-up space.
-  __ lea(rax_argslot, Address(rax_argslot, arg_slots, Address::times_ptr));
+  __ lea(rax_argslot, Address(rax_argslot, arg_slots, Interpreter::stackElementScale()));
   BLOCK_COMMENT("} insert_arg_slots");
 }
 
 // Helper to remove argument slots from the stack.
-// arg_slots must be a multiple of stack_move_unit() and >= 0
+// arg_slots must be a multiple of stack_move_unit() and > 0
 void MethodHandles::remove_arg_slots(MacroAssembler* _masm,
-                                    RegisterOrConstant arg_slots,
-                                    Register rax_argslot,
-                                     Register rbx_temp, Register rdx_temp, Register temp3_reg) {
-  assert(temp3_reg == noreg, "temp3 not required");
+                                     RegisterOrConstant arg_slots,
+                                     Register rax_argslot,
+                                     Register rbx_temp, Register rdx_temp) {
+  // allow constant zero
+  if (arg_slots.is_constant() && arg_slots.as_constant() == 0)
+    return;
   assert_different_registers(rax_argslot, rbx_temp, rdx_temp,
                              (!arg_slots.is_register() ? rsp : arg_slots.as_register()));
-
-#ifdef ASSERT
-  // Verify that [argslot..argslot+size) lies within (rsp, rbp).
-  __ lea(rbx_temp, Address(rax_argslot, arg_slots, Address::times_ptr));
-  verify_argslot(_masm, rbx_temp, "deleted argument(s) must fall within current frame");
-  if (arg_slots.is_register()) {
-    Label L_ok, L_bad;
-    __ cmpptr(arg_slots.as_register(), (int32_t) NULL_WORD);
-    __ jccb(Assembler::less, L_bad);
-    __ testl(arg_slots.as_register(), -stack_move_unit() - 1);
-    __ jccb(Assembler::zero, L_ok);
-    __ bind(L_bad);
-    __ stop("assert arg_slots >= 0 and clear low bits");
-    __ bind(L_ok);
-  } else {
-    assert(arg_slots.as_constant() >= 0, "");
-    assert(arg_slots.as_constant() % -stack_move_unit() == 0, "");
-  }
-#endif //ASSERT
-
-#ifdef _LP64
-  if (false) {                  // not needed, since register is positive
-    // clean high bits of stack motion register (was loaded as an int)
-    if (arg_slots.is_register())
-      __ movslq(arg_slots.as_register(), arg_slots.as_register());
-  }
-#endif
+  if (VerifyMethodHandles)
+    verify_argslots(_masm, arg_slots, rax_argslot, false,
+                    "deleted argument(s) must fall within current frame");
+  if (VerifyMethodHandles)
+    verify_stack_move(_masm, arg_slots, +1);
 
   BLOCK_COMMENT("remove_arg_slots {");
   // Pull up everything shallower than rax_argslot.
@@ -299,19 +714,249 @@
     __ BIND(loop);
     // pull one word up each time through the loop
     __ movptr(rbx_temp, Address(rdx_temp, 0));
-    __ movptr(Address(rdx_temp, arg_slots, Address::times_ptr), rbx_temp);
+    __ movptr(Address(rdx_temp, arg_slots, Interpreter::stackElementScale()), rbx_temp);
     __ addptr(rdx_temp, -wordSize);
     __ cmpptr(rdx_temp, rsp);
-    __ jccb(Assembler::greaterEqual, loop);
+    __ jcc(Assembler::aboveEqual, loop);
   }
 
   // Now move the argslot up, to point to the just-copied block.
-  __ lea(rsp, Address(rsp, arg_slots, Address::times_ptr));
+  __ lea(rsp, Address(rsp, arg_slots, Interpreter::stackElementScale()));
   // And adjust the argslot address to point at the deletion point.
-  __ lea(rax_argslot, Address(rax_argslot, arg_slots, Address::times_ptr));
+  __ lea(rax_argslot, Address(rax_argslot, arg_slots, Interpreter::stackElementScale()));
   BLOCK_COMMENT("} remove_arg_slots");
 }
 
+// Helper to copy argument slots to the top of the stack.
+// The sequence starts with rax_argslot and is counted by slot_count
+// slot_count must be a multiple of stack_move_unit() and >= 0
+// This function blows the temps but does not change rax_argslot.
+void MethodHandles::push_arg_slots(MacroAssembler* _masm,
+                                   Register rax_argslot,
+                                   RegisterOrConstant slot_count,
+                                   int skip_words_count,
+                                   Register rbx_temp, Register rdx_temp) {
+  assert_different_registers(rax_argslot, rbx_temp, rdx_temp,
+                             (!slot_count.is_register() ? rbp : slot_count.as_register()),
+                             rsp);
+  assert(Interpreter::stackElementSize == wordSize, "else change this code");
+
+  if (VerifyMethodHandles)
+    verify_stack_move(_masm, slot_count, 0);
+
+  // allow constant zero
+  if (slot_count.is_constant() && slot_count.as_constant() == 0)
+    return;
+
+  BLOCK_COMMENT("push_arg_slots {");
+
+  Register rbx_top = rbx_temp;
+
+  // There is at most 1 word to carry down with the TOS.
+  switch (skip_words_count) {
+  case 1: __ pop(rdx_temp); break;
+  case 0:                   break;
+  default: ShouldNotReachHere();
+  }
+
+  if (slot_count.is_constant()) {
+    for (int i = slot_count.as_constant() - 1; i >= 0; i--) {
+      __ pushptr(Address(rax_argslot, i * wordSize));
+    }
+  } else {
+    Label L_plural, L_loop, L_break;
+    // Emit code to dynamically check for the common cases, zero and one slot.
+    __ cmpl(slot_count.as_register(), (int32_t) 1);
+    __ jccb(Assembler::greater, L_plural);
+    __ jccb(Assembler::less, L_break);
+    __ pushptr(Address(rax_argslot, 0));
+    __ jmpb(L_break);
+    __ BIND(L_plural);
+
+    // Loop for 2 or more:
+    //   rbx = &rax[slot_count]
+    //   while (rbx > rax)  *(--rsp) = *(--rbx)
+    __ lea(rbx_top, Address(rax_argslot, slot_count, Address::times_ptr));
+    __ BIND(L_loop);
+    __ subptr(rbx_top, wordSize);
+    __ pushptr(Address(rbx_top, 0));
+    __ cmpptr(rbx_top, rax_argslot);
+    __ jcc(Assembler::above, L_loop);
+    __ bind(L_break);
+  }
+  switch (skip_words_count) {
+  case 1: __ push(rdx_temp); break;
+  case 0:                    break;
+  default: ShouldNotReachHere();
+  }
+  BLOCK_COMMENT("} push_arg_slots");
+}
+
+// in-place movement; no change to rsp
+// blows rax_temp, rdx_temp
+void MethodHandles::move_arg_slots_up(MacroAssembler* _masm,
+                                      Register rbx_bottom,  // invariant
+                                      Address  top_addr,     // can use rax_temp
+                                      RegisterOrConstant positive_distance_in_slots,
+                                      Register rax_temp, Register rdx_temp) {
+  BLOCK_COMMENT("move_arg_slots_up {");
+  assert_different_registers(rbx_bottom,
+                             rax_temp, rdx_temp,
+                             positive_distance_in_slots.register_or_noreg());
+  Label L_loop, L_break;
+  Register rax_top = rax_temp;
+  if (!top_addr.is_same_address(Address(rax_top, 0)))
+    __ lea(rax_top, top_addr);
+  // Detect empty (or broken) loop:
+#ifdef ASSERT
+  if (VerifyMethodHandles) {
+    // Verify that &bottom < &top (non-empty interval)
+    Label L_ok, L_bad;
+    if (positive_distance_in_slots.is_register()) {
+      __ cmpptr(positive_distance_in_slots.as_register(), (int32_t) 0);
+      __ jcc(Assembler::lessEqual, L_bad);
+    }
+    __ cmpptr(rbx_bottom, rax_top);
+    __ jcc(Assembler::below, L_ok);
+    __ bind(L_bad);
+    __ stop("valid bounds (copy up)");
+    __ BIND(L_ok);
+  }
+#endif
+  __ cmpptr(rbx_bottom, rax_top);
+  __ jccb(Assembler::aboveEqual, L_break);
+  // work rax down to rbx, copying contiguous data upwards
+  // In pseudo-code:
+  //   [rbx, rax) = &[bottom, top)
+  //   while (--rax >= rbx) *(rax + distance) = *(rax + 0), rax--;
+  __ BIND(L_loop);
+  __ subptr(rax_top, wordSize);
+  __ movptr(rdx_temp, Address(rax_top, 0));
+  __ movptr(          Address(rax_top, positive_distance_in_slots, Address::times_ptr), rdx_temp);
+  __ cmpptr(rax_top, rbx_bottom);
+  __ jcc(Assembler::above, L_loop);
+  assert(Interpreter::stackElementSize == wordSize, "else change loop");
+  __ bind(L_break);
+  BLOCK_COMMENT("} move_arg_slots_up");
+}
+
+// in-place movement; no change to rsp
+// blows rax_temp, rdx_temp
+void MethodHandles::move_arg_slots_down(MacroAssembler* _masm,
+                                        Address  bottom_addr,  // can use rax_temp
+                                        Register rbx_top,      // invariant
+                                        RegisterOrConstant negative_distance_in_slots,
+                                        Register rax_temp, Register rdx_temp) {
+  BLOCK_COMMENT("move_arg_slots_down {");
+  assert_different_registers(rbx_top,
+                             negative_distance_in_slots.register_or_noreg(),
+                             rax_temp, rdx_temp);
+  Label L_loop, L_break;
+  Register rax_bottom = rax_temp;
+  if (!bottom_addr.is_same_address(Address(rax_bottom, 0)))
+    __ lea(rax_bottom, bottom_addr);
+  // Detect empty (or broken) loop:
+#ifdef ASSERT
+  assert(!negative_distance_in_slots.is_constant() || negative_distance_in_slots.as_constant() < 0, "");
+  if (VerifyMethodHandles) {
+    // Verify that &bottom < &top (non-empty interval)
+    Label L_ok, L_bad;
+    if (negative_distance_in_slots.is_register()) {
+      __ cmpptr(negative_distance_in_slots.as_register(), (int32_t) 0);
+      __ jcc(Assembler::greaterEqual, L_bad);
+    }
+    __ cmpptr(rax_bottom, rbx_top);
+    __ jcc(Assembler::below, L_ok);
+    __ bind(L_bad);
+    __ stop("valid bounds (copy down)");
+    __ BIND(L_ok);
+  }
+#endif
+  __ cmpptr(rax_bottom, rbx_top);
+  __ jccb(Assembler::aboveEqual, L_break);
+  // work rax up to rbx, copying contiguous data downwards
+  // In pseudo-code:
+  //   [rax, rbx) = &[bottom, top)
+  //   while (rax < rbx) *(rax - distance) = *(rax + 0), rax++;
+  __ BIND(L_loop);
+  __ movptr(rdx_temp, Address(rax_bottom, 0));
+  __ movptr(          Address(rax_bottom, negative_distance_in_slots, Address::times_ptr), rdx_temp);
+  __ addptr(rax_bottom, wordSize);
+  __ cmpptr(rax_bottom, rbx_top);
+  __ jcc(Assembler::below, L_loop);
+  assert(Interpreter::stackElementSize == wordSize, "else change loop");
+  __ bind(L_break);
+  BLOCK_COMMENT("} move_arg_slots_down");
+}
+
+// Copy from a field or array element to a stacked argument slot.
+// is_element (ignored) says whether caller is loading an array element instead of an instance field.
+void MethodHandles::move_typed_arg(MacroAssembler* _masm,
+                                   BasicType type, bool is_element,
+                                   Address slot_dest, Address value_src,
+                                   Register rbx_temp, Register rdx_temp) {
+  BLOCK_COMMENT(!is_element ? "move_typed_arg {" : "move_typed_arg { (array element)");
+  if (type == T_OBJECT || type == T_ARRAY) {
+    __ load_heap_oop(rbx_temp, value_src);
+    __ movptr(slot_dest, rbx_temp);
+  } else if (type != T_VOID) {
+    int  arg_size      = type2aelembytes(type);
+    bool arg_is_signed = is_signed_subword_type(type);
+    int  slot_size     = (arg_size > wordSize) ? arg_size : wordSize;
+    __ load_sized_value(  rdx_temp,  value_src, arg_size, arg_is_signed, rbx_temp);
+    __ store_sized_value( slot_dest, rdx_temp,  slot_size,               rbx_temp);
+  }
+  BLOCK_COMMENT("} move_typed_arg");
+}
+
+void MethodHandles::move_return_value(MacroAssembler* _masm, BasicType type,
+                                      Address return_slot) {
+  BLOCK_COMMENT("move_return_value {");
+  // Old versions of the JVM must clean the FPU stack after every return.
+#ifndef _LP64
+#ifdef COMPILER2
+  // The FPU stack is clean if UseSSE >= 2 but must be cleaned in other cases
+  if ((type == T_FLOAT && UseSSE < 1) || (type == T_DOUBLE && UseSSE < 2)) {
+    for (int i = 1; i < 8; i++) {
+        __ ffree(i);
+    }
+  } else if (UseSSE < 2) {
+    __ empty_FPU_stack();
+  }
+#endif //COMPILER2
+#endif //!_LP64
+
+  // Look at the type and pull the value out of the corresponding register.
+  if (type == T_VOID) {
+    // nothing to do
+  } else if (type == T_OBJECT) {
+    __ movptr(return_slot, rax);
+  } else if (type == T_INT || is_subword_type(type)) {
+    // write the whole word, even if only 32 bits is significant
+    __ movptr(return_slot, rax);
+  } else if (type == T_LONG) {
+    // store the value by parts
+    // Note: We assume longs are continguous (if misaligned) on the interpreter stack.
+    __ store_sized_value(return_slot, rax, BytesPerLong, rdx);
+  } else if (NOT_LP64((type == T_FLOAT  && UseSSE < 1) ||
+                      (type == T_DOUBLE && UseSSE < 2) ||)
+             false) {
+    // Use old x86 FPU registers:
+    if (type == T_FLOAT)
+      __ fstp_s(return_slot);
+    else
+      __ fstp_d(return_slot);
+  } else if (type == T_FLOAT) {
+    __ movflt(return_slot, xmm0);
+  } else if (type == T_DOUBLE) {
+    __ movdbl(return_slot, xmm0);
+  } else {
+    ShouldNotReachHere();
+  }
+  BLOCK_COMMENT("} move_return_value");
+}
+
+
 #ifndef PRODUCT
 extern "C" void print_method_handle(oop mh);
 void trace_method_handle_stub(const char* adaptername,
@@ -321,48 +966,90 @@
                               intptr_t* saved_sp,
                               intptr_t* saved_bp) {
   // called as a leaf from native code: do not block the JVM!
+  bool has_mh = (strstr(adaptername, "return/") == NULL);  // return adapters don't have rcx_mh
   intptr_t* last_sp = (intptr_t*) saved_bp[frame::interpreter_frame_last_sp_offset];
-  intptr_t* base_sp = (intptr_t*) saved_bp[frame::interpreter_frame_monitor_block_top_offset];
-  printf("MH %s mh="INTPTR_FORMAT" sp=("INTPTR_FORMAT"+"INTX_FORMAT") stack_size="INTX_FORMAT" bp="INTPTR_FORMAT"\n",
-         adaptername, (intptr_t)mh, (intptr_t)entry_sp, (intptr_t)(saved_sp - entry_sp), (intptr_t)(base_sp - last_sp), (intptr_t)saved_bp);
-  if (last_sp != saved_sp && last_sp != NULL)
-    printf("*** last_sp="INTPTR_FORMAT"\n", (intptr_t)last_sp);
+  intptr_t* base_sp = last_sp;
+  typedef MethodHandles::RicochetFrame RicochetFrame;
+  RicochetFrame* rfp = (RicochetFrame*)((address)saved_bp - RicochetFrame::sender_link_offset_in_bytes());
+  if (!UseRicochetFrames || Universe::heap()->is_in((address) rfp->saved_args_base())) {
+    // Probably an interpreter frame.
+    base_sp = (intptr_t*) saved_bp[frame::interpreter_frame_monitor_block_top_offset];
+  }
+  intptr_t    mh_reg = (intptr_t)mh;
+  const char* mh_reg_name = "rcx_mh";
+  if (!has_mh)  mh_reg_name = "rcx";
+  tty->print_cr("MH %s %s="PTR_FORMAT" sp=("PTR_FORMAT"+"INTX_FORMAT") stack_size="INTX_FORMAT" bp="PTR_FORMAT,
+                adaptername, mh_reg_name, mh_reg,
+                (intptr_t)entry_sp, (intptr_t)(saved_sp - entry_sp), (intptr_t)(base_sp - last_sp), (intptr_t)saved_bp);
   if (Verbose) {
-    printf(" reg dump: ");
+    tty->print(" reg dump: ");
     int saved_regs_count = (entry_sp-1) - saved_regs;
     // 32 bit: rdi rsi rbp rsp; rbx rdx rcx (*) rax
     int i;
     for (i = 0; i <= saved_regs_count; i++) {
-      if (i > 0 && i % 4 == 0 && i != saved_regs_count)
-        printf("\n   + dump: ");
-      printf(" %d: "INTPTR_FORMAT, i, saved_regs[i]);
+      if (i > 0 && i % 4 == 0 && i != saved_regs_count) {
+        tty->cr();
+        tty->print("   + dump: ");
+      }
+      tty->print(" %d: "PTR_FORMAT, i, saved_regs[i]);
     }
-    printf("\n");
+    tty->cr();
+    if (last_sp != saved_sp && last_sp != NULL)
+      tty->print_cr("*** last_sp="PTR_FORMAT, (intptr_t)last_sp);
     int stack_dump_count = 16;
     if (stack_dump_count < (int)(saved_bp + 2 - saved_sp))
       stack_dump_count = (int)(saved_bp + 2 - saved_sp);
     if (stack_dump_count > 64)  stack_dump_count = 48;
     for (i = 0; i < stack_dump_count; i += 4) {
-      printf(" dump at SP[%d] "INTPTR_FORMAT": "INTPTR_FORMAT" "INTPTR_FORMAT" "INTPTR_FORMAT" "INTPTR_FORMAT"\n",
-             i, (intptr_t) &entry_sp[i+0], entry_sp[i+0], entry_sp[i+1], entry_sp[i+2], entry_sp[i+3]);
+      tty->print_cr(" dump at SP[%d] "PTR_FORMAT": "PTR_FORMAT" "PTR_FORMAT" "PTR_FORMAT" "PTR_FORMAT,
+                    i, (intptr_t) &entry_sp[i+0], entry_sp[i+0], entry_sp[i+1], entry_sp[i+2], entry_sp[i+3]);
     }
-    print_method_handle(mh);
+    if (has_mh)
+      print_method_handle(mh);
   }
 }
+
+// The stub wraps the arguments in a struct on the stack to avoid
+// dealing with the different calling conventions for passing 6
+// arguments.
+struct MethodHandleStubArguments {
+  const char* adaptername;
+  oopDesc* mh;
+  intptr_t* saved_regs;
+  intptr_t* entry_sp;
+  intptr_t* saved_sp;
+  intptr_t* saved_bp;
+};
+void trace_method_handle_stub_wrapper(MethodHandleStubArguments* args) {
+  trace_method_handle_stub(args->adaptername,
+                           args->mh,
+                           args->saved_regs,
+                           args->entry_sp,
+                           args->saved_sp,
+                           args->saved_bp);
+}
+
 void MethodHandles::trace_method_handle(MacroAssembler* _masm, const char* adaptername) {
   if (!TraceMethodHandles)  return;
   BLOCK_COMMENT("trace_method_handle {");
   __ push(rax);
-  __ lea(rax, Address(rsp, wordSize*6)); // entry_sp
+  __ lea(rax, Address(rsp, wordSize * NOT_LP64(6) LP64_ONLY(14))); // entry_sp  __ pusha();
   __ pusha();
-  // arguments:
-  __ push(rbp);               // interpreter frame pointer
+  __ mov(rbx, rsp);
+  __ enter();
+  // incoming state:
+  // rcx: method handle
+  // r13 or rsi: saved sp
+  // To avoid calling convention issues, build a record on the stack and pass the pointer to that instead.
+  __ push(rbp);               // saved_bp
   __ push(rsi);               // saved_sp
   __ push(rax);               // entry_sp
+  __ push(rbx);               // pusha saved_regs
   __ push(rcx);               // mh
-  __ push(rcx);
+  __ push(rcx);               // adaptername
   __ movptr(Address(rsp, 0), (intptr_t) adaptername);
-  __ call_VM_leaf(CAST_FROM_FN_PTR(address, trace_method_handle_stub), 5);
+  __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, trace_method_handle_stub_wrapper), rsp);
+  __ leave();
   __ popa();
   __ pop(rax);
   BLOCK_COMMENT("} trace_method_handle");
@@ -376,13 +1063,20 @@
          |(1<<java_lang_invoke_AdapterMethodHandle::OP_CHECK_CAST)
          |(1<<java_lang_invoke_AdapterMethodHandle::OP_PRIM_TO_PRIM)
          |(1<<java_lang_invoke_AdapterMethodHandle::OP_REF_TO_PRIM)
+          //OP_PRIM_TO_REF is below...
          |(1<<java_lang_invoke_AdapterMethodHandle::OP_SWAP_ARGS)
          |(1<<java_lang_invoke_AdapterMethodHandle::OP_ROT_ARGS)
          |(1<<java_lang_invoke_AdapterMethodHandle::OP_DUP_ARGS)
          |(1<<java_lang_invoke_AdapterMethodHandle::OP_DROP_ARGS)
-         //|(1<<java_lang_invoke_AdapterMethodHandle::OP_SPREAD_ARGS) //BUG!
+          //OP_COLLECT_ARGS is below...
+         |(1<<java_lang_invoke_AdapterMethodHandle::OP_SPREAD_ARGS)
+         |(!UseRicochetFrames ? 0 :
+           java_lang_invoke_MethodTypeForm::vmlayout_offset_in_bytes() <= 0 ? 0 :
+           ((1<<java_lang_invoke_AdapterMethodHandle::OP_PRIM_TO_REF)
+           |(1<<java_lang_invoke_AdapterMethodHandle::OP_COLLECT_ARGS)
+           |(1<<java_lang_invoke_AdapterMethodHandle::OP_FOLD_ARGS)
+            ))
          );
-  // FIXME: MethodHandlesTest gets a crash if we enable OP_SPREAD_ARGS.
 }
 
 //------------------------------------------------------------------------------
@@ -391,6 +1085,8 @@
 // Generate an "entry" field for a method handle.
 // This determines how the method handle will respond to calls.
 void MethodHandles::generate_method_handle_stub(MacroAssembler* _masm, MethodHandles::EntryKind ek) {
+  MethodHandles::EntryKind ek_orig = ek_original_kind(ek);
+
   // Here is the register state during an interpreted call,
   // as set up by generate_method_handle_interpreter_entry():
   // - rbx: garbage temp (was MethodHandle.invoke methodOop, unused)
@@ -403,10 +1099,11 @@
   const Register rax_argslot = rax;
   const Register rbx_temp    = rbx;
   const Register rdx_temp    = rdx;
+  const Register rdi_temp    = rdi;
 
   // This guy is set up by prepare_to_jump_from_interpreted (from interpreted calls)
   // and gen_c2i_adapter (from compiled calls):
-  const Register saved_last_sp = LP64_ONLY(r13) NOT_LP64(rsi);
+  const Register saved_last_sp = saved_last_sp_register();
 
   // Argument registers for _raise_exception.
   // 32-bit: Pass first two oop/int args in registers ECX and EDX.
@@ -439,6 +1136,13 @@
     return;
   }
 
+#ifdef ASSERT
+  __ push((int32_t) 0xEEEEEEEE);
+  __ push((int32_t) (intptr_t) entry_name(ek));
+  LP64_ONLY(__ push((int32_t) high((intptr_t) entry_name(ek))));
+  __ push((int32_t) 0x33333333);
+#endif //ASSERT
+
   address interp_entry = __ pc();
 
   trace_method_handle(_masm, entry_name(ek));
@@ -554,7 +1258,6 @@
       __ load_klass(rax_klass, rcx_recv);
       __ verify_oop(rax_klass);
 
-      Register rdi_temp   = rdi;
       Register rbx_method = rbx_index;
 
       // get interface klass
@@ -590,16 +1293,14 @@
   case _bound_long_direct_mh:
     {
       bool direct_to_method = (ek >= _bound_ref_direct_mh);
-      BasicType arg_type  = T_ILLEGAL;
-      int       arg_mask  = _INSERT_NO_MASK;
-      int       arg_slots = -1;
-      get_ek_bound_mh_info(ek, arg_type, arg_mask, arg_slots);
+      BasicType arg_type  = ek_bound_mh_arg_type(ek);
+      int       arg_slots = type2size[arg_type];
 
       // make room for the new argument:
       __ movl(rax_argslot, rcx_bmh_vmargslot);
       __ lea(rax_argslot, __ argument_address(rax_argslot));
 
-      insert_arg_slots(_masm, arg_slots * stack_move_unit(), arg_mask, rax_argslot, rbx_temp, rdx_temp);
+      insert_arg_slots(_masm, arg_slots * stack_move_unit(), rax_argslot, rbx_temp, rdx_temp);
 
       // store bound argument into the new stack slot:
       __ load_heap_oop(rbx_temp, rcx_bmh_argument);
@@ -607,9 +1308,10 @@
         __ movptr(Address(rax_argslot, 0), rbx_temp);
       } else {
         Address prim_value_addr(rbx_temp, java_lang_boxing_object::value_offset_in_bytes(arg_type));
-        const int arg_size = type2aelembytes(arg_type);
-        __ load_sized_value(rdx_temp, prim_value_addr, arg_size, is_signed_subword_type(arg_type), rbx_temp);
-        __ store_sized_value(Address(rax_argslot, 0), rdx_temp, arg_size, rbx_temp);
+        move_typed_arg(_masm, arg_type, false,
+                       Address(rax_argslot, 0),
+                       prim_value_addr,
+                       rbx_temp, rdx_temp);
       }
 
       if (direct_to_method) {
@@ -646,7 +1348,7 @@
 
       // What class are we casting to?
       __ load_heap_oop(rbx_klass, rcx_amh_argument); // this is a Class object!
-      __ load_heap_oop(rbx_klass, Address(rbx_klass, java_lang_Class::klass_offset_in_bytes()));
+      load_klass_from_Class(_masm, rbx_klass);
 
       Label done;
       __ movptr(rdx_temp, vmarg);
@@ -681,6 +1383,7 @@
 
   case _adapter_prim_to_prim:
   case _adapter_ref_to_prim:
+  case _adapter_prim_to_ref:
     // handled completely by optimized cases
     __ stop("init_AdapterMethodHandle should not issue this");
     break;
@@ -732,8 +1435,7 @@
 
       // Do the requested conversion and store the value.
       Register rbx_vminfo = rbx_temp;
-      __ movl(rbx_vminfo, rcx_amh_conversion);
-      assert(CONV_VMINFO_SHIFT == 0, "preshifted");
+      load_conversion_vminfo(_masm, rbx_vminfo, rcx_amh_conversion);
 
       // get the new MH:
       __ load_heap_oop(rcx_recv, rcx_mh_vmtarget);
@@ -771,7 +1473,7 @@
 
       // on a little-endian machine we keep the first slot and add another after
       __ lea(rax_argslot, __ argument_address(rax_argslot, 1));
-      insert_arg_slots(_masm, stack_move_unit(), _INSERT_INT_MASK,
+      insert_arg_slots(_masm, stack_move_unit(),
                        rax_argslot, rbx_temp, rdx_temp);
       Address vmarg1(rax_argslot, -Interpreter::stackElementSize);
       Address vmarg2 = vmarg1.plus_disp(Interpreter::stackElementSize);
@@ -823,7 +1525,7 @@
       __ movl(rax_argslot, rcx_amh_vmargslot);
       __ lea(rax_argslot, __ argument_address(rax_argslot, 1));
       if (ek == _adapter_opt_f2d) {
-        insert_arg_slots(_masm, stack_move_unit(), _INSERT_INT_MASK,
+        insert_arg_slots(_masm, stack_move_unit(),
                          rax_argslot, rbx_temp, rdx_temp);
       }
       Address vmarg(rax_argslot, -Interpreter::stackElementSize);
@@ -841,7 +1543,7 @@
 #else //_LP64
       if (ek == _adapter_opt_f2d) {
         __ fld_s(vmarg);        // load float to ST0
-        __ fstp_s(vmarg);       // store single
+        __ fstp_d(vmarg);       // store double
       } else {
         __ fld_d(vmarg);        // load double to ST0
         __ fstp_s(vmarg);       // store single
@@ -858,10 +1560,6 @@
     }
     break;
 
-  case _adapter_prim_to_ref:
-    __ unimplemented(entry_name(ek)); // %%% FIXME: NYI
-    break;
-
   case _adapter_swap_args:
   case _adapter_rot_args:
     // handled completely by optimized cases
@@ -875,8 +1573,8 @@
   case _adapter_opt_rot_2_up:
   case _adapter_opt_rot_2_down:
     {
-      int swap_bytes = 0, rotate = 0;
-      get_ek_adapter_opt_swap_rot_info(ek, swap_bytes, rotate);
+      int swap_slots = ek_adapter_opt_swap_slots(ek);
+      int rotate     = ek_adapter_opt_swap_mode(ek);
 
       // 'argslot' is the position of the first argument to swap
       __ movl(rax_argslot, rcx_amh_vmargslot);
@@ -884,83 +1582,69 @@
 
       // 'vminfo' is the second
       Register rbx_destslot = rbx_temp;
-      __ movl(rbx_destslot, rcx_amh_conversion);
-      assert(CONV_VMINFO_SHIFT == 0, "preshifted");
-      __ andl(rbx_destslot, CONV_VMINFO_MASK);
+      load_conversion_vminfo(_masm, rbx_destslot, rcx_amh_conversion);
       __ lea(rbx_destslot, __ argument_address(rbx_destslot));
-      DEBUG_ONLY(verify_argslot(_masm, rbx_destslot, "swap point must fall within current frame"));
+      if (VerifyMethodHandles)
+        verify_argslot(_masm, rbx_destslot, "swap point must fall within current frame");
 
+      assert(Interpreter::stackElementSize == wordSize, "else rethink use of wordSize here");
       if (!rotate) {
-        for (int i = 0; i < swap_bytes; i += wordSize) {
-          __ movptr(rdx_temp, Address(rax_argslot , i));
-          __ push(rdx_temp);
-          __ movptr(rdx_temp, Address(rbx_destslot, i));
-          __ movptr(Address(rax_argslot, i), rdx_temp);
-          __ pop(rdx_temp);
-          __ movptr(Address(rbx_destslot, i), rdx_temp);
+        // simple swap
+        for (int i = 0; i < swap_slots; i++) {
+          __ movptr(rdi_temp, Address(rax_argslot,  i * wordSize));
+          __ movptr(rdx_temp, Address(rbx_destslot, i * wordSize));
+          __ movptr(Address(rax_argslot,  i * wordSize), rdx_temp);
+          __ movptr(Address(rbx_destslot, i * wordSize), rdi_temp);
         }
       } else {
-        // push the first chunk, which is going to get overwritten
-        for (int i = swap_bytes; (i -= wordSize) >= 0; ) {
-          __ movptr(rdx_temp, Address(rax_argslot, i));
-          __ push(rdx_temp);
+        // A rotate is actually pair of moves, with an "odd slot" (or pair)
+        // changing place with a series of other slots.
+        // First, push the "odd slot", which is going to get overwritten
+        for (int i = swap_slots - 1; i >= 0; i--) {
+          // handle one with rdi_temp instead of a push:
+          if (i == 0)  __ movptr(rdi_temp, Address(rax_argslot, i * wordSize));
+          else         __ pushptr(         Address(rax_argslot, i * wordSize));
         }
-
         if (rotate > 0) {
-          // rotate upward
-          __ subptr(rax_argslot, swap_bytes);
-#ifdef ASSERT
-          {
-            // Verify that argslot > destslot, by at least swap_bytes.
-            Label L_ok;
-            __ cmpptr(rax_argslot, rbx_destslot);
-            __ jccb(Assembler::aboveEqual, L_ok);
-            __ stop("source must be above destination (upward rotation)");
-            __ bind(L_ok);
-          }
-#endif
+          // Here is rotate > 0:
+          // (low mem)                                          (high mem)
+          //     | dest:     more_slots...     | arg: odd_slot :arg+1 |
+          // =>
+          //     | dest: odd_slot | dest+1: more_slots...      :arg+1 |
           // work argslot down to destslot, copying contiguous data upwards
           // pseudo-code:
           //   rax = src_addr - swap_bytes
           //   rbx = dest_addr
           //   while (rax >= rbx) *(rax + swap_bytes) = *(rax + 0), rax--;
-          Label loop;
-          __ bind(loop);
-          __ movptr(rdx_temp, Address(rax_argslot, 0));
-          __ movptr(Address(rax_argslot, swap_bytes), rdx_temp);
-          __ addptr(rax_argslot, -wordSize);
-          __ cmpptr(rax_argslot, rbx_destslot);
-          __ jccb(Assembler::aboveEqual, loop);
+          move_arg_slots_up(_masm,
+                            rbx_destslot,
+                            Address(rax_argslot, 0),
+                            swap_slots,
+                            rax_argslot, rdx_temp);
         } else {
-          __ addptr(rax_argslot, swap_bytes);
-#ifdef ASSERT
-          {
-            // Verify that argslot < destslot, by at least swap_bytes.
-            Label L_ok;
-            __ cmpptr(rax_argslot, rbx_destslot);
-            __ jccb(Assembler::belowEqual, L_ok);
-            __ stop("source must be below destination (downward rotation)");
-            __ bind(L_ok);
-          }
-#endif
+          // Here is the other direction, rotate < 0:
+          // (low mem)                                          (high mem)
+          //     | arg: odd_slot | arg+1: more_slots...       :dest+1 |
+          // =>
+          //     | arg:    more_slots...     | dest: odd_slot :dest+1 |
           // work argslot up to destslot, copying contiguous data downwards
           // pseudo-code:
           //   rax = src_addr + swap_bytes
           //   rbx = dest_addr
           //   while (rax <= rbx) *(rax - swap_bytes) = *(rax + 0), rax++;
-          Label loop;
-          __ bind(loop);
-          __ movptr(rdx_temp, Address(rax_argslot, 0));
-          __ movptr(Address(rax_argslot, -swap_bytes), rdx_temp);
-          __ addptr(rax_argslot, wordSize);
-          __ cmpptr(rax_argslot, rbx_destslot);
-          __ jccb(Assembler::belowEqual, loop);
+          __ addptr(rbx_destslot, wordSize);
+          move_arg_slots_down(_masm,
+                              Address(rax_argslot, swap_slots * wordSize),
+                              rbx_destslot,
+                              -swap_slots,
+                              rax_argslot, rdx_temp);
+
+          __ subptr(rbx_destslot, wordSize);
         }
-
         // pop the original first chunk into the destination slot, now free
-        for (int i = 0; i < swap_bytes; i += wordSize) {
-          __ pop(rdx_temp);
-          __ movptr(Address(rbx_destslot, i), rdx_temp);
+        for (int i = 0; i < swap_slots; i++) {
+          if (i == 0)  __ movptr(Address(rbx_destslot, i * wordSize), rdi_temp);
+          else         __ popptr(Address(rbx_destslot, i * wordSize));
         }
       }
 
@@ -976,53 +1660,22 @@
       __ lea(rax_argslot, __ argument_address(rax_argslot));
 
       // 'stack_move' is negative number of words to duplicate
-      Register rdx_stack_move = rdx_temp;
-      __ movl2ptr(rdx_stack_move, rcx_amh_conversion);
-      __ sarptr(rdx_stack_move, CONV_STACK_MOVE_SHIFT);
+      Register rdi_stack_move = rdi_temp;
+      load_stack_move(_masm, rdi_stack_move, rcx_recv, true);
 
-      int argslot0_num = 0;
-      Address argslot0 = __ argument_address(RegisterOrConstant(argslot0_num));
-      assert(argslot0.base() == rsp, "");
-      int pre_arg_size = argslot0.disp();
-      assert(pre_arg_size % wordSize == 0, "");
-      assert(pre_arg_size > 0, "must include PC");
-
-      // remember the old rsp+1 (argslot[0])
-      Register rbx_oldarg = rbx_temp;
-      __ lea(rbx_oldarg, argslot0);
-
-      // move rsp down to make room for dups
-      __ lea(rsp, Address(rsp, rdx_stack_move, Address::times_ptr));
-
-      // compute the new rsp+1 (argslot[0])
-      Register rdx_newarg = rdx_temp;
-      __ lea(rdx_newarg, argslot0);
-
-      __ push(rdi);             // need a temp
-      // (preceding push must be done after arg addresses are taken!)
-
-      // pull down the pre_arg_size data (PC)
-      for (int i = -pre_arg_size; i < 0; i += wordSize) {
-        __ movptr(rdi, Address(rbx_oldarg, i));
-        __ movptr(Address(rdx_newarg, i), rdi);
+      if (VerifyMethodHandles) {
+        verify_argslots(_masm, rdi_stack_move, rax_argslot, true,
+                        "copied argument(s) must fall within current frame");
       }
 
-      // copy from rax_argslot[0...] down to new_rsp[1...]
-      // pseudo-code:
-      //   rbx = old_rsp+1
-      //   rdx = new_rsp+1
-      //   rax = argslot
-      //   while (rdx < rbx) *rdx++ = *rax++
-      Label loop;
-      __ bind(loop);
-      __ movptr(rdi, Address(rax_argslot, 0));
-      __ movptr(Address(rdx_newarg, 0), rdi);
-      __ addptr(rax_argslot, wordSize);
-      __ addptr(rdx_newarg, wordSize);
-      __ cmpptr(rdx_newarg, rbx_oldarg);
-      __ jccb(Assembler::less, loop);
+      // insert location is always the bottom of the argument list:
+      Address insert_location = __ argument_address(constant(0));
+      int pre_arg_words = insert_location.disp() / wordSize;   // return PC is pushed
+      assert(insert_location.base() == rsp, "");
 
-      __ pop(rdi);              // restore temp
+      __ negl(rdi_stack_move);
+      push_arg_slots(_masm, rax_argslot, rdi_stack_move,
+                     pre_arg_words, rbx_temp, rdx_temp);
 
       __ load_heap_oop(rcx_recv, rcx_mh_vmtarget);
       __ jump_to_method_handle_entry(rcx_recv, rdx_temp);
@@ -1035,63 +1688,583 @@
       __ movl(rax_argslot, rcx_amh_vmargslot);
       __ lea(rax_argslot, __ argument_address(rax_argslot));
 
-      __ push(rdi);             // need a temp
       // (must do previous push after argslot address is taken)
 
       // 'stack_move' is number of words to drop
-      Register rdi_stack_move = rdi;
-      __ movl2ptr(rdi_stack_move, rcx_amh_conversion);
-      __ sarptr(rdi_stack_move, CONV_STACK_MOVE_SHIFT);
+      Register rdi_stack_move = rdi_temp;
+      load_stack_move(_masm, rdi_stack_move, rcx_recv, false);
       remove_arg_slots(_masm, rdi_stack_move,
                        rax_argslot, rbx_temp, rdx_temp);
 
-      __ pop(rdi);              // restore temp
-
       __ load_heap_oop(rcx_recv, rcx_mh_vmtarget);
       __ jump_to_method_handle_entry(rcx_recv, rdx_temp);
     }
     break;
 
   case _adapter_collect_args:
-    __ unimplemented(entry_name(ek)); // %%% FIXME: NYI
-    break;
-
+  case _adapter_fold_args:
   case _adapter_spread_args:
     // handled completely by optimized cases
     __ stop("init_AdapterMethodHandle should not issue this");
     break;
 
+  case _adapter_opt_collect_ref:
+  case _adapter_opt_collect_int:
+  case _adapter_opt_collect_long:
+  case _adapter_opt_collect_float:
+  case _adapter_opt_collect_double:
+  case _adapter_opt_collect_void:
+  case _adapter_opt_collect_0_ref:
+  case _adapter_opt_collect_1_ref:
+  case _adapter_opt_collect_2_ref:
+  case _adapter_opt_collect_3_ref:
+  case _adapter_opt_collect_4_ref:
+  case _adapter_opt_collect_5_ref:
+  case _adapter_opt_filter_S0_ref:
+  case _adapter_opt_filter_S1_ref:
+  case _adapter_opt_filter_S2_ref:
+  case _adapter_opt_filter_S3_ref:
+  case _adapter_opt_filter_S4_ref:
+  case _adapter_opt_filter_S5_ref:
+  case _adapter_opt_collect_2_S0_ref:
+  case _adapter_opt_collect_2_S1_ref:
+  case _adapter_opt_collect_2_S2_ref:
+  case _adapter_opt_collect_2_S3_ref:
+  case _adapter_opt_collect_2_S4_ref:
+  case _adapter_opt_collect_2_S5_ref:
+  case _adapter_opt_fold_ref:
+  case _adapter_opt_fold_int:
+  case _adapter_opt_fold_long:
+  case _adapter_opt_fold_float:
+  case _adapter_opt_fold_double:
+  case _adapter_opt_fold_void:
+  case _adapter_opt_fold_1_ref:
+  case _adapter_opt_fold_2_ref:
+  case _adapter_opt_fold_3_ref:
+  case _adapter_opt_fold_4_ref:
+  case _adapter_opt_fold_5_ref:
+    {
+      // Given a fresh incoming stack frame, build a new ricochet frame.
+      // On entry, TOS points at a return PC, and RBP is the callers frame ptr.
+      // RSI/R13 has the caller's exact stack pointer, which we must also preserve.
+      // RCX contains an AdapterMethodHandle of the indicated kind.
+
+      // Relevant AMH fields:
+      // amh.vmargslot:
+      //   points to the trailing edge of the arguments
+      //   to filter, collect, or fold.  For a boxing operation,
+      //   it points just after the single primitive value.
+      // amh.argument:
+      //   recursively called MH, on |collect| arguments
+      // amh.vmtarget:
+      //   final destination MH, on return value, etc.
+      // amh.conversion.dest:
+      //   tells what is the type of the return value
+      //   (not needed here, since dest is also derived from ek)
+      // amh.conversion.vminfo:
+      //   points to the trailing edge of the return value
+      //   when the vmtarget is to be called; this is
+      //   equal to vmargslot + (retained ? |collect| : 0)
+
+      // Pass 0 or more argument slots to the recursive target.
+      int collect_count_constant = ek_adapter_opt_collect_count(ek);
+
+      // The collected arguments are copied from the saved argument list:
+      int collect_slot_constant = ek_adapter_opt_collect_slot(ek);
+
+      assert(ek_orig == _adapter_collect_args ||
+             ek_orig == _adapter_fold_args, "");
+      bool retain_original_args = (ek_orig == _adapter_fold_args);
+
+      // The return value is replaced (or inserted) at the 'vminfo' argslot.
+      // Sometimes we can compute this statically.
+      int dest_slot_constant = -1;
+      if (!retain_original_args)
+        dest_slot_constant = collect_slot_constant;
+      else if (collect_slot_constant >= 0 && collect_count_constant >= 0)
+        // We are preserving all the arguments, and the return value is prepended,
+        // so the return slot is to the left (above) the |collect| sequence.
+        dest_slot_constant = collect_slot_constant + collect_count_constant;
+
+      // Replace all those slots by the result of the recursive call.
+      // The result type can be one of ref, int, long, float, double, void.
+      // In the case of void, nothing is pushed on the stack after return.
+      BasicType dest = ek_adapter_opt_collect_type(ek);
+      assert(dest == type2wfield[dest], "dest is a stack slot type");
+      int dest_count = type2size[dest];
+      assert(dest_count == 1 || dest_count == 2 || (dest_count == 0 && dest == T_VOID), "dest has a size");
+
+      // Choose a return continuation.
+      EntryKind ek_ret = _adapter_opt_return_any;
+      if (dest != T_CONFLICT && OptimizeMethodHandles) {
+        switch (dest) {
+        case T_INT    : ek_ret = _adapter_opt_return_int;     break;
+        case T_LONG   : ek_ret = _adapter_opt_return_long;    break;
+        case T_FLOAT  : ek_ret = _adapter_opt_return_float;   break;
+        case T_DOUBLE : ek_ret = _adapter_opt_return_double;  break;
+        case T_OBJECT : ek_ret = _adapter_opt_return_ref;     break;
+        case T_VOID   : ek_ret = _adapter_opt_return_void;    break;
+        default       : ShouldNotReachHere();
+        }
+        if (dest == T_OBJECT && dest_slot_constant >= 0) {
+          EntryKind ek_try = EntryKind(_adapter_opt_return_S0_ref + dest_slot_constant);
+          if (ek_try <= _adapter_opt_return_LAST &&
+              ek_adapter_opt_return_slot(ek_try) == dest_slot_constant) {
+            ek_ret = ek_try;
+          }
+        }
+        assert(ek_adapter_opt_return_type(ek_ret) == dest, "");
+      }
+
+      // Already pushed:  ... keep1 | collect | keep2 | sender_pc |
+      // push(sender_pc);
+
+      // Compute argument base:
+      Register rax_argv = rax_argslot;
+      __ lea(rax_argv, __ argument_address(constant(0)));
+
+      // Push a few extra argument words, if we need them to store the return value.
+      {
+        int extra_slots = 0;
+        if (retain_original_args) {
+          extra_slots = dest_count;
+        } else if (collect_count_constant == -1) {
+          extra_slots = dest_count;  // collect_count might be zero; be generous
+        } else if (dest_count > collect_count_constant) {
+          extra_slots = (dest_count - collect_count_constant);
+        } else {
+          // else we know we have enough dead space in |collect| to repurpose for return values
+        }
+        DEBUG_ONLY(extra_slots += 1);
+        if (extra_slots > 0) {
+          __ pop(rbx_temp);   // return value
+          __ subptr(rsp, (extra_slots * Interpreter::stackElementSize));
+          // Push guard word #2 in debug mode.
+          DEBUG_ONLY(__ movptr(Address(rsp, 0), (int32_t) RicochetFrame::MAGIC_NUMBER_2));
+          __ push(rbx_temp);
+        }
+      }
+
+      RicochetFrame::enter_ricochet_frame(_masm, rcx_recv, rax_argv,
+                                          entry(ek_ret)->from_interpreted_entry(), rbx_temp);
+
+      // Now pushed:  ... keep1 | collect | keep2 | RF |
+      // some handy frame slots:
+      Address exact_sender_sp_addr = RicochetFrame::frame_address(RicochetFrame::exact_sender_sp_offset_in_bytes());
+      Address conversion_addr      = RicochetFrame::frame_address(RicochetFrame::conversion_offset_in_bytes());
+      Address saved_args_base_addr = RicochetFrame::frame_address(RicochetFrame::saved_args_base_offset_in_bytes());
+
+#ifdef ASSERT
+      if (VerifyMethodHandles && dest != T_CONFLICT) {
+        BLOCK_COMMENT("verify AMH.conv.dest");
+        load_conversion_dest_type(_masm, rbx_temp, conversion_addr);
+        Label L_dest_ok;
+        __ cmpl(rbx_temp, (int) dest);
+        __ jcc(Assembler::equal, L_dest_ok);
+        if (dest == T_INT) {
+          for (int bt = T_BOOLEAN; bt < T_INT; bt++) {
+            if (is_subword_type(BasicType(bt))) {
+              __ cmpl(rbx_temp, (int) bt);
+              __ jcc(Assembler::equal, L_dest_ok);
+            }
+          }
+        }
+        __ stop("bad dest in AMH.conv");
+        __ BIND(L_dest_ok);
+      }
+#endif //ASSERT
+
+      // Find out where the original copy of the recursive argument sequence begins.
+      Register rax_coll = rax_argv;
+      {
+        RegisterOrConstant collect_slot = collect_slot_constant;
+        if (collect_slot_constant == -1) {
+          __ movl(rdi_temp, rcx_amh_vmargslot);
+          collect_slot = rdi_temp;
+        }
+        if (collect_slot_constant != 0)
+          __ lea(rax_coll, Address(rax_argv, collect_slot, Interpreter::stackElementScale()));
+        // rax_coll now points at the trailing edge of |collect| and leading edge of |keep2|
+      }
+
+      // Replace the old AMH with the recursive MH.  (No going back now.)
+      // In the case of a boxing call, the recursive call is to a 'boxer' method,
+      // such as Integer.valueOf or Long.valueOf.  In the case of a filter
+      // or collect call, it will take one or more arguments, transform them,
+      // and return some result, to store back into argument_base[vminfo].
+      __ load_heap_oop(rcx_recv, rcx_amh_argument);
+      if (VerifyMethodHandles)  verify_method_handle(_masm, rcx_recv);
+
+      // Push a space for the recursively called MH first:
+      __ push((int32_t)NULL_WORD);
+
+      // Calculate |collect|, the number of arguments we are collecting.
+      Register rdi_collect_count = rdi_temp;
+      RegisterOrConstant collect_count;
+      if (collect_count_constant >= 0) {
+        collect_count = collect_count_constant;
+      } else {
+        __ load_method_handle_vmslots(rdi_collect_count, rcx_recv, rdx_temp);
+        collect_count = rdi_collect_count;
+      }
+#ifdef ASSERT
+      if (VerifyMethodHandles && collect_count_constant >= 0) {
+        __ load_method_handle_vmslots(rbx_temp, rcx_recv, rdx_temp);
+        Label L_count_ok;
+        __ cmpl(rbx_temp, collect_count_constant);
+        __ jcc(Assembler::equal, L_count_ok);
+        __ stop("bad vminfo in AMH.conv");
+        __ BIND(L_count_ok);
+      }
+#endif //ASSERT
+
+      // copy |collect| slots directly to TOS:
+      push_arg_slots(_masm, rax_coll, collect_count, 0, rbx_temp, rdx_temp);
+      // Now pushed:  ... keep1 | collect | keep2 | RF... | collect |
+      // rax_coll still points at the trailing edge of |collect| and leading edge of |keep2|
+
+      // If necessary, adjust the saved arguments to make room for the eventual return value.
+      // Normal adjustment:  ... keep1 | +dest+ | -collect- | keep2 | RF... | collect |
+      // If retaining args:  ... keep1 | +dest+ |  collect  | keep2 | RF... | collect |
+      // In the non-retaining case, this might move keep2 either up or down.
+      // We don't have to copy the whole | RF... collect | complex,
+      // but we must adjust RF.saved_args_base.
+      // Also, from now on, we will forget about the origial copy of |collect|.
+      // If we are retaining it, we will treat it as part of |keep2|.
+      // For clarity we will define |keep3| = |collect|keep2| or |keep2|.
+
+      BLOCK_COMMENT("adjust trailing arguments {");
+      // Compare the sizes of |+dest+| and |-collect-|, which are opposed opening and closing movements.
+      int                open_count  = dest_count;
+      RegisterOrConstant close_count = collect_count_constant;
+      Register rdi_close_count = rdi_collect_count;
+      if (retain_original_args) {
+        close_count = constant(0);
+      } else if (collect_count_constant == -1) {
+        close_count = rdi_collect_count;
+      }
+
+      // How many slots need moving?  This is simply dest_slot (0 => no |keep3|).
+      RegisterOrConstant keep3_count;
+      Register rsi_keep3_count = rsi;  // can repair from RF.exact_sender_sp
+      if (dest_slot_constant >= 0) {
+        keep3_count = dest_slot_constant;
+      } else  {
+        load_conversion_vminfo(_masm, rsi_keep3_count, conversion_addr);
+        keep3_count = rsi_keep3_count;
+      }
+#ifdef ASSERT
+      if (VerifyMethodHandles && dest_slot_constant >= 0) {
+        load_conversion_vminfo(_masm, rbx_temp, conversion_addr);
+        Label L_vminfo_ok;
+        __ cmpl(rbx_temp, dest_slot_constant);
+        __ jcc(Assembler::equal, L_vminfo_ok);
+        __ stop("bad vminfo in AMH.conv");
+        __ BIND(L_vminfo_ok);
+      }
+#endif //ASSERT
+
+      // tasks remaining:
+      bool move_keep3 = (!keep3_count.is_constant() || keep3_count.as_constant() != 0);
+      bool stomp_dest = (NOT_DEBUG(dest == T_OBJECT) DEBUG_ONLY(dest_count != 0));
+      bool fix_arg_base = (!close_count.is_constant() || open_count != close_count.as_constant());
+
+      if (stomp_dest | fix_arg_base) {
+        // we will probably need an updated rax_argv value
+        if (collect_slot_constant >= 0) {
+          // rax_coll already holds the leading edge of |keep2|, so tweak it
+          assert(rax_coll == rax_argv, "elided a move");
+          if (collect_slot_constant != 0)
+            __ subptr(rax_argv, collect_slot_constant * Interpreter::stackElementSize);
+        } else {
+          // Just reload from RF.saved_args_base.
+          __ movptr(rax_argv, saved_args_base_addr);
+        }
+      }
+
+      // Old and new argument locations (based at slot 0).
+      // Net shift (&new_argv - &old_argv) is (close_count - open_count).
+      bool zero_open_count = (open_count == 0);  // remember this bit of info
+      if (move_keep3 && fix_arg_base) {
+        // It will be easier t have everything in one register:
+        if (close_count.is_register()) {
+          // Deduct open_count from close_count register to get a clean +/- value.
+          __ subptr(close_count.as_register(), open_count);
+        } else {
+          close_count = close_count.as_constant() - open_count;
+        }
+        open_count = 0;
+      }
+      Address old_argv(rax_argv, 0);
+      Address new_argv(rax_argv, close_count,  Interpreter::stackElementScale(),
+                                - open_count * Interpreter::stackElementSize);
+
+      // First decide if any actual data are to be moved.
+      // We can skip if (a) |keep3| is empty, or (b) the argument list size didn't change.
+      // (As it happens, all movements involve an argument list size change.)
+
+      // If there are variable parameters, use dynamic checks to skip around the whole mess.
+      Label L_done;
+      if (!keep3_count.is_constant()) {
+        __ testl(keep3_count.as_register(), keep3_count.as_register());
+        __ jcc(Assembler::zero, L_done);
+      }
+      if (!close_count.is_constant()) {
+        __ cmpl(close_count.as_register(), open_count);
+        __ jcc(Assembler::equal, L_done);
+      }
+
+      if (move_keep3 && fix_arg_base) {
+        bool emit_move_down = false, emit_move_up = false, emit_guard = false;
+        if (!close_count.is_constant()) {
+          emit_move_down = emit_guard = !zero_open_count;
+          emit_move_up   = true;
+        } else if (open_count != close_count.as_constant()) {
+          emit_move_down = (open_count > close_count.as_constant());
+          emit_move_up   = !emit_move_down;
+        }
+        Label L_move_up;
+        if (emit_guard) {
+          __ cmpl(close_count.as_register(), open_count);
+          __ jcc(Assembler::greater, L_move_up);
+        }
+
+        if (emit_move_down) {
+          // Move arguments down if |+dest+| > |-collect-|
+          // (This is rare, except when arguments are retained.)
+          // This opens space for the return value.
+          if (keep3_count.is_constant()) {
+            for (int i = 0; i < keep3_count.as_constant(); i++) {
+              __ movptr(rdx_temp, old_argv.plus_disp(i * Interpreter::stackElementSize));
+              __ movptr(          new_argv.plus_disp(i * Interpreter::stackElementSize), rdx_temp);
+            }
+          } else {
+            Register rbx_argv_top = rbx_temp;
+            __ lea(rbx_argv_top, old_argv.plus_disp(keep3_count, Interpreter::stackElementScale()));
+            move_arg_slots_down(_masm,
+                                old_argv,     // beginning of old argv
+                                rbx_argv_top, // end of old argv
+                                close_count,  // distance to move down (must be negative)
+                                rax_argv, rdx_temp);
+            // Used argv as an iteration variable; reload from RF.saved_args_base.
+            __ movptr(rax_argv, saved_args_base_addr);
+          }
+        }
+
+        if (emit_guard) {
+          __ jmp(L_done);  // assumes emit_move_up is true also
+          __ BIND(L_move_up);
+        }
+
+        if (emit_move_up) {
+
+          // Move arguments up if |+dest+| < |-collect-|
+          // (This is usual, except when |keep3| is empty.)
+          // This closes up the space occupied by the now-deleted collect values.
+          if (keep3_count.is_constant()) {
+            for (int i = keep3_count.as_constant() - 1; i >= 0; i--) {
+              __ movptr(rdx_temp, old_argv.plus_disp(i * Interpreter::stackElementSize));
+              __ movptr(          new_argv.plus_disp(i * Interpreter::stackElementSize), rdx_temp);
+            }
+          } else {
+            Address argv_top = old_argv.plus_disp(keep3_count, Interpreter::stackElementScale());
+            move_arg_slots_up(_masm,
+                              rax_argv,     // beginning of old argv
+                              argv_top,     // end of old argv
+                              close_count,  // distance to move up (must be positive)
+                              rbx_temp, rdx_temp);
+          }
+        }
+      }
+      __ BIND(L_done);
+
+      if (fix_arg_base) {
+        // adjust RF.saved_args_base by adding (close_count - open_count)
+        if (!new_argv.is_same_address(Address(rax_argv, 0)))
+          __ lea(rax_argv, new_argv);
+        __ movptr(saved_args_base_addr, rax_argv);
+      }
+
+      if (stomp_dest) {
+        // Stomp the return slot, so it doesn't hold garbage.
+        // This isn't strictly necessary, but it may help detect bugs.
+        int forty_two = RicochetFrame::RETURN_VALUE_PLACEHOLDER;
+        __ movptr(Address(rax_argv, keep3_count, Address::times_ptr),
+                  (int32_t) forty_two);
+        // uses rsi_keep3_count
+      }
+      BLOCK_COMMENT("} adjust trailing arguments");
+
+      BLOCK_COMMENT("do_recursive_call");
+      __ mov(saved_last_sp, rsp);    // set rsi/r13 for callee
+      __ pushptr(ExternalAddress(SharedRuntime::ricochet_blob()->bounce_addr()).addr());
+      // The globally unique bounce address has two purposes:
+      // 1. It helps the JVM recognize this frame (frame::is_ricochet_frame).
+      // 2. When returned to, it cuts back the stack and redirects control flow
+      //    to the return handler.
+      // The return handler will further cut back the stack when it takes
+      // down the RF.  Perhaps there is a way to streamline this further.
+
+      // State during recursive call:
+      // ... keep1 | dest | dest=42 | keep3 | RF... | collect | bounce_pc |
+      __ jump_to_method_handle_entry(rcx_recv, rdx_temp);
+
+      break;
+    }
+
+  case _adapter_opt_return_ref:
+  case _adapter_opt_return_int:
+  case _adapter_opt_return_long:
+  case _adapter_opt_return_float:
+  case _adapter_opt_return_double:
+  case _adapter_opt_return_void:
+  case _adapter_opt_return_S0_ref:
+  case _adapter_opt_return_S1_ref:
+  case _adapter_opt_return_S2_ref:
+  case _adapter_opt_return_S3_ref:
+  case _adapter_opt_return_S4_ref:
+  case _adapter_opt_return_S5_ref:
+    {
+      BasicType dest_type_constant = ek_adapter_opt_return_type(ek);
+      int       dest_slot_constant = ek_adapter_opt_return_slot(ek);
+
+      if (VerifyMethodHandles)  RicochetFrame::verify_clean(_masm);
+
+      if (dest_slot_constant == -1) {
+        // The current stub is a general handler for this dest_type.
+        // It can be called from _adapter_opt_return_any below.
+        // Stash the address in a little table.
+        assert((dest_type_constant & CONV_TYPE_MASK) == dest_type_constant, "oob");
+        address return_handler = __ pc();
+        _adapter_return_handlers[dest_type_constant] = return_handler;
+        if (dest_type_constant == T_INT) {
+          // do the subword types too
+          for (int bt = T_BOOLEAN; bt < T_INT; bt++) {
+            if (is_subword_type(BasicType(bt)) &&
+                _adapter_return_handlers[bt] == NULL) {
+              _adapter_return_handlers[bt] = return_handler;
+            }
+          }
+        }
+      }
+
+      Register rbx_arg_base = rbx_temp;
+      assert_different_registers(rax, rdx,  // possibly live return value registers
+                                 rdi_temp, rbx_arg_base);
+
+      Address conversion_addr      = RicochetFrame::frame_address(RicochetFrame::conversion_offset_in_bytes());
+      Address saved_args_base_addr = RicochetFrame::frame_address(RicochetFrame::saved_args_base_offset_in_bytes());
+
+      __ movptr(rbx_arg_base, saved_args_base_addr);
+      RegisterOrConstant dest_slot = dest_slot_constant;
+      if (dest_slot_constant == -1) {
+        load_conversion_vminfo(_masm, rdi_temp, conversion_addr);
+        dest_slot = rdi_temp;
+      }
+      // Store the result back into the argslot.
+      // This code uses the interpreter calling sequence, in which the return value
+      // is usually left in the TOS register, as defined by InterpreterMacroAssembler::pop.
+      // There are certain irregularities with floating point values, which can be seen
+      // in TemplateInterpreterGenerator::generate_return_entry_for.
+      move_return_value(_masm, dest_type_constant, Address(rbx_arg_base, dest_slot, Interpreter::stackElementScale()));
+
+      RicochetFrame::leave_ricochet_frame(_masm, rcx_recv, rbx_arg_base, rdx_temp);
+      __ push(rdx_temp);  // repush the return PC
+
+      // Load the final target and go.
+      if (VerifyMethodHandles)  verify_method_handle(_masm, rcx_recv);
+      __ jump_to_method_handle_entry(rcx_recv, rdx_temp);
+      __ hlt(); // --------------------
+      break;
+    }
+
+  case _adapter_opt_return_any:
+    {
+      if (VerifyMethodHandles)  RicochetFrame::verify_clean(_masm);
+      Register rdi_conv = rdi_temp;
+      assert_different_registers(rax, rdx,  // possibly live return value registers
+                                 rdi_conv, rbx_temp);
+
+      Address conversion_addr = RicochetFrame::frame_address(RicochetFrame::conversion_offset_in_bytes());
+      load_conversion_dest_type(_masm, rdi_conv, conversion_addr);
+      __ lea(rbx_temp, ExternalAddress((address) &_adapter_return_handlers[0]));
+      __ movptr(rbx_temp, Address(rbx_temp, rdi_conv, Address::times_ptr));
+
+#ifdef ASSERT
+      { Label L_badconv;
+        __ testptr(rbx_temp, rbx_temp);
+        __ jccb(Assembler::zero, L_badconv);
+        __ jmp(rbx_temp);
+        __ bind(L_badconv);
+        __ stop("bad method handle return");
+      }
+#else //ASSERT
+      __ jmp(rbx_temp);
+#endif //ASSERT
+      break;
+    }
+
   case _adapter_opt_spread_0:
-  case _adapter_opt_spread_1:
-  case _adapter_opt_spread_more:
+  case _adapter_opt_spread_1_ref:
+  case _adapter_opt_spread_2_ref:
+  case _adapter_opt_spread_3_ref:
+  case _adapter_opt_spread_4_ref:
+  case _adapter_opt_spread_5_ref:
+  case _adapter_opt_spread_ref:
+  case _adapter_opt_spread_byte:
+  case _adapter_opt_spread_char:
+  case _adapter_opt_spread_short:
+  case _adapter_opt_spread_int:
+  case _adapter_opt_spread_long:
+  case _adapter_opt_spread_float:
+  case _adapter_opt_spread_double:
     {
       // spread an array out into a group of arguments
-      int length_constant = get_ek_adapter_opt_spread_info(ek);
+      int length_constant = ek_adapter_opt_spread_count(ek);
+      bool length_can_be_zero = (length_constant == 0);
+      if (length_constant < 0) {
+        // some adapters with variable length must handle the zero case
+        if (!OptimizeMethodHandles ||
+            ek_adapter_opt_spread_type(ek) != T_OBJECT)
+          length_can_be_zero = true;
+      }
 
       // find the address of the array argument
       __ movl(rax_argslot, rcx_amh_vmargslot);
       __ lea(rax_argslot, __ argument_address(rax_argslot));
 
-      // grab some temps
-      { __ push(rsi); __ push(rdi); }
-      // (preceding pushes must be done after argslot address is taken!)
-#define UNPUSH_RSI_RDI \
-      { __ pop(rdi); __ pop(rsi); }
+      // grab another temp
+      Register rsi_temp = rsi;
+      { if (rsi_temp == saved_last_sp)  __ push(saved_last_sp); }
+      // (preceding push must be done after argslot address is taken!)
+#define UNPUSH_RSI \
+      { if (rsi_temp == saved_last_sp)  __ pop(saved_last_sp); }
 
       // arx_argslot points both to the array and to the first output arg
       vmarg = Address(rax_argslot, 0);
 
       // Get the array value.
-      Register  rsi_array       = rsi;
+      Register  rsi_array       = rsi_temp;
       Register  rdx_array_klass = rdx_temp;
-      BasicType elem_type       = T_OBJECT;
+      BasicType elem_type = ek_adapter_opt_spread_type(ek);
+      int       elem_slots = type2size[elem_type];  // 1 or 2
+      int       array_slots = 1;  // array is always a T_OBJECT
       int       length_offset   = arrayOopDesc::length_offset_in_bytes();
       int       elem0_offset    = arrayOopDesc::base_offset_in_bytes(elem_type);
       __ movptr(rsi_array, vmarg);
-      Label skip_array_check;
-      if (length_constant == 0) {
+
+      Label L_array_is_empty, L_insert_arg_space, L_copy_args, L_args_done;
+      if (length_can_be_zero) {
+        // handle the null pointer case, if zero is allowed
+        Label L_skip;
+        if (length_constant < 0) {
+          load_conversion_vminfo(_masm, rbx_temp, rcx_amh_conversion);
+          __ testl(rbx_temp, rbx_temp);
+          __ jcc(Assembler::notZero, L_skip);
+        }
         __ testptr(rsi_array, rsi_array);
-        __ jcc(Assembler::zero, skip_array_check);
+        __ jcc(Assembler::zero, L_array_is_empty);
+        __ bind(L_skip);
       }
       __ null_check(rsi_array, oopDesc::klass_offset_in_bytes());
       __ load_klass(rdx_array_klass, rsi_array);
@@ -1099,22 +2272,20 @@
       // Check the array type.
       Register rbx_klass = rbx_temp;
       __ load_heap_oop(rbx_klass, rcx_amh_argument); // this is a Class object!
-      __ load_heap_oop(rbx_klass, Address(rbx_klass, java_lang_Class::klass_offset_in_bytes()));
+      load_klass_from_Class(_masm, rbx_klass);
 
       Label ok_array_klass, bad_array_klass, bad_array_length;
-      __ check_klass_subtype(rdx_array_klass, rbx_klass, rdi, ok_array_klass);
+      __ check_klass_subtype(rdx_array_klass, rbx_klass, rdi_temp, ok_array_klass);
       // If we get here, the type check failed!
       __ jmp(bad_array_klass);
-      __ bind(ok_array_klass);
+      __ BIND(ok_array_klass);
 
       // Check length.
       if (length_constant >= 0) {
         __ cmpl(Address(rsi_array, length_offset), length_constant);
       } else {
         Register rbx_vminfo = rbx_temp;
-        __ movl(rbx_vminfo, rcx_amh_conversion);
-        assert(CONV_VMINFO_SHIFT == 0, "preshifted");
-        __ andl(rbx_vminfo, CONV_VMINFO_MASK);
+        load_conversion_vminfo(_masm, rbx_vminfo, rcx_amh_conversion);
         __ cmpl(rbx_vminfo, Address(rsi_array, length_offset));
       }
       __ jcc(Assembler::notEqual, bad_array_length);
@@ -1126,90 +2297,104 @@
         // Form a pointer to the end of the affected region.
         __ lea(rdx_argslot_limit, Address(rax_argslot, Interpreter::stackElementSize));
         // 'stack_move' is negative number of words to insert
-        Register rdi_stack_move = rdi;
-        __ movl2ptr(rdi_stack_move, rcx_amh_conversion);
-        __ sarptr(rdi_stack_move, CONV_STACK_MOVE_SHIFT);
+        // This number already accounts for elem_slots.
+        Register rdi_stack_move = rdi_temp;
+        load_stack_move(_masm, rdi_stack_move, rcx_recv, true);
+        __ cmpptr(rdi_stack_move, 0);
+        assert(stack_move_unit() < 0, "else change this comparison");
+        __ jcc(Assembler::less, L_insert_arg_space);
+        __ jcc(Assembler::equal, L_copy_args);
+        // single argument case, with no array movement
+        __ BIND(L_array_is_empty);
+        remove_arg_slots(_masm, -stack_move_unit() * array_slots,
+                         rax_argslot, rbx_temp, rdx_temp);
+        __ jmp(L_args_done);  // no spreading to do
+        __ BIND(L_insert_arg_space);
+        // come here in the usual case, stack_move < 0 (2 or more spread arguments)
         Register rsi_temp = rsi_array;  // spill this
-        insert_arg_slots(_masm, rdi_stack_move, -1,
+        insert_arg_slots(_masm, rdi_stack_move,
                          rax_argslot, rbx_temp, rsi_temp);
-        // reload the array (since rsi was killed)
-        __ movptr(rsi_array, vmarg);
-      } else if (length_constant > 1) {
-        int arg_mask = 0;
-        int new_slots = (length_constant - 1);
-        for (int i = 0; i < new_slots; i++) {
-          arg_mask <<= 1;
-          arg_mask |= _INSERT_REF_MASK;
-        }
-        insert_arg_slots(_masm, new_slots * stack_move_unit(), arg_mask,
+        // reload the array since rsi was killed
+        // reload from rdx_argslot_limit since rax_argslot is now decremented
+        __ movptr(rsi_array, Address(rdx_argslot_limit, -Interpreter::stackElementSize));
+      } else if (length_constant >= 1) {
+        int new_slots = (length_constant * elem_slots) - array_slots;
+        insert_arg_slots(_masm, new_slots * stack_move_unit(),
                          rax_argslot, rbx_temp, rdx_temp);
-      } else if (length_constant == 1) {
-        // no stack resizing required
       } else if (length_constant == 0) {
-        remove_arg_slots(_masm, -stack_move_unit(),
+        __ BIND(L_array_is_empty);
+        remove_arg_slots(_masm, -stack_move_unit() * array_slots,
                          rax_argslot, rbx_temp, rdx_temp);
+      } else {
+        ShouldNotReachHere();
       }
 
       // Copy from the array to the new slots.
       // Note: Stack change code preserves integrity of rax_argslot pointer.
       // So even after slot insertions, rax_argslot still points to first argument.
+      // Beware:  Arguments that are shallow on the stack are deep in the array,
+      // and vice versa.  So a downward-growing stack (the usual) has to be copied
+      // elementwise in reverse order from the source array.
+      __ BIND(L_copy_args);
       if (length_constant == -1) {
         // [rax_argslot, rdx_argslot_limit) is the area we are inserting into.
+        // Array element [0] goes at rdx_argslot_limit[-wordSize].
         Register rsi_source = rsi_array;
         __ lea(rsi_source, Address(rsi_array, elem0_offset));
+        Register rdx_fill_ptr = rdx_argslot_limit;
         Label loop;
-        __ bind(loop);
-        __ movptr(rbx_temp, Address(rsi_source, 0));
-        __ movptr(Address(rax_argslot, 0), rbx_temp);
+        __ BIND(loop);
+        __ addptr(rdx_fill_ptr, -Interpreter::stackElementSize * elem_slots);
+        move_typed_arg(_masm, elem_type, true,
+                       Address(rdx_fill_ptr, 0), Address(rsi_source, 0),
+                       rbx_temp, rdi_temp);
         __ addptr(rsi_source, type2aelembytes(elem_type));
-        __ addptr(rax_argslot, Interpreter::stackElementSize);
-        __ cmpptr(rax_argslot, rdx_argslot_limit);
-        __ jccb(Assembler::less, loop);
+        __ cmpptr(rdx_fill_ptr, rax_argslot);
+        __ jcc(Assembler::above, loop);
       } else if (length_constant == 0) {
-        __ bind(skip_array_check);
         // nothing to copy
       } else {
         int elem_offset = elem0_offset;
-        int slot_offset = 0;
+        int slot_offset = length_constant * Interpreter::stackElementSize;
         for (int index = 0; index < length_constant; index++) {
-          __ movptr(rbx_temp, Address(rsi_array, elem_offset));
-          __ movptr(Address(rax_argslot, slot_offset), rbx_temp);
+          slot_offset -= Interpreter::stackElementSize * elem_slots;  // fill backward
+          move_typed_arg(_masm, elem_type, true,
+                         Address(rax_argslot, slot_offset), Address(rsi_array, elem_offset),
+                         rbx_temp, rdi_temp);
           elem_offset += type2aelembytes(elem_type);
-           slot_offset += Interpreter::stackElementSize;
         }
       }
+      __ BIND(L_args_done);
 
       // Arguments are spread.  Move to next method handle.
-      UNPUSH_RSI_RDI;
+      UNPUSH_RSI;
       __ load_heap_oop(rcx_recv, rcx_mh_vmtarget);
       __ jump_to_method_handle_entry(rcx_recv, rdx_temp);
 
       __ bind(bad_array_klass);
-      UNPUSH_RSI_RDI;
+      UNPUSH_RSI;
       assert(!vmarg.uses(rarg2_required), "must be different registers");
-      __ movptr(rarg2_required, Address(rdx_array_klass, java_mirror_offset));  // required type
-      __ movptr(rarg1_actual,   vmarg);                                         // bad array
-      __ movl(  rarg0_code,     (int) Bytecodes::_aaload);                      // who is complaining?
+      __ load_heap_oop( rarg2_required, Address(rdx_array_klass, java_mirror_offset));  // required type
+      __ movptr(        rarg1_actual,   vmarg);                                         // bad array
+      __ movl(          rarg0_code,     (int) Bytecodes::_aaload);                      // who is complaining?
       __ jump(ExternalAddress(from_interpreted_entry(_raise_exception)));
 
       __ bind(bad_array_length);
-      UNPUSH_RSI_RDI;
+      UNPUSH_RSI;
       assert(!vmarg.uses(rarg2_required), "must be different registers");
-      __ mov   (rarg2_required, rcx_recv);                       // AMH requiring a certain length
-      __ movptr(rarg1_actual,   vmarg);                          // bad array
-      __ movl(  rarg0_code,     (int) Bytecodes::_arraylength);  // who is complaining?
+      __ mov(    rarg2_required, rcx_recv);                       // AMH requiring a certain length
+      __ movptr( rarg1_actual,   vmarg);                          // bad array
+      __ movl(   rarg0_code,     (int) Bytecodes::_arraylength);  // who is complaining?
       __ jump(ExternalAddress(from_interpreted_entry(_raise_exception)));
+#undef UNPUSH_RSI
 
-#undef UNPUSH_RSI_RDI
+      break;
     }
-    break;
 
-  case _adapter_flyby:
-  case _adapter_ricochet:
-    __ unimplemented(entry_name(ek)); // %%% FIXME: NYI
-    break;
-
-  default:  ShouldNotReachHere();
+  default:
+    // do not require all platforms to recognize all adapter types
+    __ nop();
+    return;
   }
   __ hlt();
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cpu/x86/vm/methodHandles_x86.hpp	Tue May 24 11:09:39 2011 -0700
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+// Platform-specific definitions for method handles.
+// These definitions are inlined into class MethodHandles.
+
+public:
+
+// The stack just after the recursive call from a ricochet frame
+// looks something like this.  Offsets are marked in words, not bytes.
+// rsi (r13 on LP64) is part of the interpreter calling sequence
+// which tells the callee where is my real rsp (for frame walking).
+// (...lower memory addresses)
+// rsp:     [ return pc                 ]   always the global RicochetBlob::bounce_addr
+// rsp+1:   [ recursive arg N           ]
+// rsp+2:   [ recursive arg N-1         ]
+// ...
+// rsp+N:   [ recursive arg 1           ]
+// rsp+N+1: [ recursive method handle   ]
+// ...
+// rbp-6:   [ cleanup continuation pc   ]   <-- (struct RicochetFrame)
+// rbp-5:   [ saved target MH           ]   the MH we will call on the saved args
+// rbp-4:   [ saved args layout oop     ]   an int[] array which describes argument layout
+// rbp-3:   [ saved args pointer        ]   address of transformed adapter arg M (slot 0)
+// rbp-2:   [ conversion                ]   information about how the return value is used
+// rbp-1:   [ exact sender sp           ]   exact TOS (rsi/r13) of original sender frame
+// rbp+0:   [ saved sender fp           ]   (for original sender of AMH)
+// rbp+1:   [ saved sender pc           ]   (back to original sender of AMH)
+// rbp+2:   [ transformed adapter arg M ]   <-- (extended TOS of original sender)
+// rbp+3:   [ transformed adapter arg M-1]
+// ...
+// rbp+M+1: [ transformed adapter arg 1 ]
+// rbp+M+2: [ padding                   ] <-- (rbp + saved args base offset)
+// ...      [ optional padding]
+// (higher memory addresses...)
+//
+// The arguments originally passed by the original sender
+// are lost, and arbitrary amounts of stack motion might have
+// happened due to argument transformation.
+// (This is done by C2I/I2C adapters and non-direct method handles.)
+// This is why there is an unpredictable amount of memory between
+// the extended and exact TOS of the sender.
+// The ricochet adapter itself will also (in general) perform
+// transformations before the recursive call.
+//
+// The transformed and saved arguments, immediately above the saved
+// return PC, are a well-formed method handle invocation ready to execute.
+// When the GC needs to walk the stack, these arguments are described
+// via the saved arg types oop, an int[] array with a private format.
+// This array is derived from the type of the transformed adapter
+// method handle, which also sits at the base of the saved argument
+// bundle.  Since the GC may not be able to fish out the int[]
+// array, so it is pushed explicitly on the stack.  This may be
+// an unnecessary expense.
+//
+// The following register conventions are significant at this point:
+// rsp       the thread stack, as always; preserved by caller
+// rsi/r13   exact TOS of recursive frame (contents of [rbp-2])
+// rcx       recursive method handle (contents of [rsp+N+1])
+// rbp       preserved by caller (not used by caller)
+// Unless otherwise specified, all registers can be blown by the call.
+//
+// If this frame must be walked, the transformed adapter arguments
+// will be found with the help of the saved arguments descriptor.
+//
+// Therefore, the descriptor must match the referenced arguments.
+// The arguments must be followed by at least one word of padding,
+// which will be necessary to complete the final method handle call.
+// That word is not treated as holding an oop.  Neither is the word
+//
+// The word pointed to by the return argument pointer is not
+// treated as an oop, even if points to a saved argument.
+// This allows the saved argument list to have a "hole" in it
+// to receive an oop from the recursive call.
+// (The hole might temporarily contain RETURN_VALUE_PLACEHOLDER.)
+//
+// When the recursive callee returns, RicochetBlob::bounce_addr will
+// immediately jump to the continuation stored in the RF.
+// This continuation will merge the recursive return value
+// into the saved argument list.  At that point, the original
+// rsi, rbp, and rsp will be reloaded, the ricochet frame will
+// disappear, and the final target of the adapter method handle
+// will be invoked on the transformed argument list.
+
+class RicochetFrame {
+  friend class MethodHandles;
+
+ private:
+  intptr_t* _continuation;          // what to do when control gets back here
+  oopDesc*  _saved_target;          // target method handle to invoke on saved_args
+  oopDesc*  _saved_args_layout;     // caching point for MethodTypeForm.vmlayout cookie
+  intptr_t* _saved_args_base;       // base of pushed arguments (slot 0, arg N) (-3)
+  intptr_t  _conversion;            // misc. information from original AdapterMethodHandle (-2)
+  intptr_t* _exact_sender_sp;       // parallel to interpreter_frame_sender_sp (-1)
+  intptr_t* _sender_link;           // *must* coincide with frame::link_offset (0)
+  address   _sender_pc;             // *must* coincide with frame::return_addr_offset (1)
+
+ public:
+  intptr_t* continuation() const        { return _continuation; }
+  oop       saved_target() const        { return _saved_target; }
+  oop       saved_args_layout() const   { return _saved_args_layout; }
+  intptr_t* saved_args_base() const     { return _saved_args_base; }
+  intptr_t  conversion() const          { return _conversion; }
+  intptr_t* exact_sender_sp() const     { return _exact_sender_sp; }
+  intptr_t* sender_link() const         { return _sender_link; }
+  address   sender_pc() const           { return _sender_pc; }
+
+  intptr_t* extended_sender_sp() const  { return saved_args_base(); }
+
+  intptr_t  return_value_slot_number() const {
+    return adapter_conversion_vminfo(conversion());
+  }
+  BasicType return_value_type() const {
+    return adapter_conversion_dest_type(conversion());
+  }
+  bool has_return_value_slot() const {
+    return return_value_type() != T_VOID;
+  }
+  intptr_t* return_value_slot_addr() const {
+    assert(has_return_value_slot(), "");
+    return saved_arg_slot_addr(return_value_slot_number());
+  }
+  intptr_t* saved_target_slot_addr() const {
+    return saved_arg_slot_addr(saved_args_length());
+  }
+  intptr_t* saved_arg_slot_addr(int slot) const {
+    assert(slot >= 0, "");
+    return (intptr_t*)( (address)saved_args_base() + (slot * Interpreter::stackElementSize) );
+  }
+
+  jint      saved_args_length() const;
+  jint      saved_arg_offset(int arg) const;
+
+  // GC interface
+  oop*  saved_target_addr()                     { return (oop*)&_saved_target; }
+  oop*  saved_args_layout_addr()                { return (oop*)&_saved_args_layout; }
+
+  oop  compute_saved_args_layout(bool read_cache, bool write_cache);
+
+  // Compiler/assembler interface.
+  static int continuation_offset_in_bytes()     { return offset_of(RicochetFrame, _continuation); }
+  static int saved_target_offset_in_bytes()     { return offset_of(RicochetFrame, _saved_target); }
+  static int saved_args_layout_offset_in_bytes(){ return offset_of(RicochetFrame, _saved_args_layout); }
+  static int saved_args_base_offset_in_bytes()  { return offset_of(RicochetFrame, _saved_args_base); }
+  static int conversion_offset_in_bytes()       { return offset_of(RicochetFrame, _conversion); }
+  static int exact_sender_sp_offset_in_bytes()  { return offset_of(RicochetFrame, _exact_sender_sp); }
+  static int sender_link_offset_in_bytes()      { return offset_of(RicochetFrame, _sender_link); }
+  static int sender_pc_offset_in_bytes()        { return offset_of(RicochetFrame, _sender_pc); }
+
+  // This value is not used for much, but it apparently must be nonzero.
+  static int frame_size_in_bytes()              { return sender_link_offset_in_bytes(); }
+
+#ifdef ASSERT
+  // The magic number is supposed to help find ricochet frames within the bytes of stack dumps.
+  enum { MAGIC_NUMBER_1 = 0xFEED03E, MAGIC_NUMBER_2 = 0xBEEF03E };
+  static int magic_number_1_offset_in_bytes()   { return -wordSize; }
+  static int magic_number_2_offset_in_bytes()   { return sizeof(RicochetFrame); }
+  intptr_t magic_number_1() const               { return *(intptr_t*)((address)this + magic_number_1_offset_in_bytes()); };
+  intptr_t magic_number_2() const               { return *(intptr_t*)((address)this + magic_number_2_offset_in_bytes()); };
+#endif //ASSERT
+
+  enum { RETURN_VALUE_PLACEHOLDER = (NOT_DEBUG(0) DEBUG_ONLY(42)) };
+
+  static void verify_offsets() NOT_DEBUG_RETURN;
+  void verify() const NOT_DEBUG_RETURN; // check for MAGIC_NUMBER, etc.
+  void zap_arguments() NOT_DEBUG_RETURN;
+
+  static void generate_ricochet_blob(MacroAssembler* _masm,
+                                     // output params:
+                                     int* frame_size_in_words, int* bounce_offset, int* exception_offset);
+
+  static void enter_ricochet_frame(MacroAssembler* _masm,
+                                   Register rcx_recv,
+                                   Register rax_argv,
+                                   address return_handler,
+                                   Register rbx_temp);
+  static void leave_ricochet_frame(MacroAssembler* _masm,
+                                   Register rcx_recv,
+                                   Register new_sp_reg,
+                                   Register sender_pc_reg);
+
+  static Address frame_address(int offset = 0) {
+    // The RicochetFrame is found by subtracting a constant offset from rbp.
+    return Address(rbp, - sender_link_offset_in_bytes() + offset);
+  }
+
+  static RicochetFrame* from_frame(const frame& fr) {
+    address bp = (address) fr.fp();
+    RicochetFrame* rf = (RicochetFrame*)(bp - sender_link_offset_in_bytes());
+    rf->verify();
+    return rf;
+  }
+
+  static void verify_clean(MacroAssembler* _masm) NOT_DEBUG_RETURN;
+};
+
+// Additional helper methods for MethodHandles code generation:
+public:
+  static void load_klass_from_Class(MacroAssembler* _masm, Register klass_reg);
+  static void load_conversion_vminfo(MacroAssembler* _masm, Register reg, Address conversion_field_addr);
+  static void load_conversion_dest_type(MacroAssembler* _masm, Register reg, Address conversion_field_addr);
+
+  static void load_stack_move(MacroAssembler* _masm,
+                              Register rdi_stack_move,
+                              Register rcx_amh,
+                              bool might_be_negative);
+
+  static void insert_arg_slots(MacroAssembler* _masm,
+                               RegisterOrConstant arg_slots,
+                               Register rax_argslot,
+                               Register rbx_temp, Register rdx_temp);
+
+  static void remove_arg_slots(MacroAssembler* _masm,
+                               RegisterOrConstant arg_slots,
+                               Register rax_argslot,
+                               Register rbx_temp, Register rdx_temp);
+
+  static void push_arg_slots(MacroAssembler* _masm,
+                                   Register rax_argslot,
+                                   RegisterOrConstant slot_count,
+                                   int skip_words_count,
+                                   Register rbx_temp, Register rdx_temp);
+
+  static void move_arg_slots_up(MacroAssembler* _masm,
+                                Register rbx_bottom,  // invariant
+                                Address  top_addr,    // can use rax_temp
+                                RegisterOrConstant positive_distance_in_slots,
+                                Register rax_temp, Register rdx_temp);
+
+  static void move_arg_slots_down(MacroAssembler* _masm,
+                                  Address  bottom_addr,  // can use rax_temp
+                                  Register rbx_top,      // invariant
+                                  RegisterOrConstant negative_distance_in_slots,
+                                  Register rax_temp, Register rdx_temp);
+
+  static void move_typed_arg(MacroAssembler* _masm,
+                             BasicType type, bool is_element,
+                             Address slot_dest, Address value_src,
+                             Register rbx_temp, Register rdx_temp);
+
+  static void move_return_value(MacroAssembler* _masm, BasicType type,
+                                Address return_slot);
+
+  static void verify_argslot(MacroAssembler* _masm, Register argslot_reg,
+                             const char* error_message) NOT_DEBUG_RETURN;
+
+  static void verify_argslots(MacroAssembler* _masm,
+                              RegisterOrConstant argslot_count,
+                              Register argslot_reg,
+                              bool negate_argslot,
+                              const char* error_message) NOT_DEBUG_RETURN;
+
+  static void verify_stack_move(MacroAssembler* _masm,
+                                RegisterOrConstant arg_slots,
+                                int direction) NOT_DEBUG_RETURN;
+
+  static void verify_klass(MacroAssembler* _masm,
+                           Register obj, KlassHandle klass,
+                           const char* error_message = "wrong klass") NOT_DEBUG_RETURN;
+
+  static void verify_method_handle(MacroAssembler* _masm, Register mh_reg) {
+    verify_klass(_masm, mh_reg, SystemDictionaryHandles::MethodHandle_klass(),
+                 "reference is a MH");
+  }
+
+  static void trace_method_handle(MacroAssembler* _masm, const char* adaptername) PRODUCT_RETURN;
+
+  static Register saved_last_sp_register() {
+    // Should be in sharedRuntime, not here.
+    return LP64_ONLY(r13) NOT_LP64(rsi);
+  }
--- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Tue May 24 11:09:39 2011 -0700
@@ -2253,6 +2253,31 @@
   return 0;
 }
 
+//----------------------------generate_ricochet_blob---------------------------
+void SharedRuntime::generate_ricochet_blob() {
+  if (!EnableInvokeDynamic)  return;  // leave it as a null
+
+  // allocate space for the code
+  ResourceMark rm;
+  // setup code generation tools
+  CodeBuffer   buffer("ricochet_blob", 256, 256);
+  MacroAssembler* masm = new MacroAssembler(&buffer);
+
+  int frame_size_in_words = -1, bounce_offset = -1, exception_offset = -1;
+  MethodHandles::RicochetFrame::generate_ricochet_blob(masm, &frame_size_in_words, &bounce_offset, &exception_offset);
+
+  // -------------
+  // make sure all code is generated
+  masm->flush();
+
+  // failed to generate?
+  if (frame_size_in_words < 0 || bounce_offset < 0 || exception_offset < 0) {
+    assert(false, "bad ricochet blob");
+    return;
+  }
+
+  _ricochet_blob = RicochetBlob::create(&buffer, bounce_offset, exception_offset, frame_size_in_words);
+}
 
 //------------------------------generate_deopt_blob----------------------------
 void SharedRuntime::generate_deopt_blob() {
@@ -2996,6 +3021,8 @@
     generate_handler_blob(CAST_FROM_FN_PTR(address,
                    SafepointSynchronize::handle_polling_page_exception), true);
 
+  generate_ricochet_blob();
+
   generate_deopt_blob();
 #ifdef COMPILER2
   generate_uncommon_trap_blob();
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Tue May 24 11:09:39 2011 -0700
@@ -2530,6 +2530,32 @@
 }
 
 
+//----------------------------generate_ricochet_blob---------------------------
+void SharedRuntime::generate_ricochet_blob() {
+  if (!EnableInvokeDynamic)  return;  // leave it as a null
+
+  // allocate space for the code
+  ResourceMark rm;
+  // setup code generation tools
+  CodeBuffer   buffer("ricochet_blob", 512, 512);
+  MacroAssembler* masm = new MacroAssembler(&buffer);
+
+  int frame_size_in_words = -1, bounce_offset = -1, exception_offset = -1;
+  MethodHandles::RicochetFrame::generate_ricochet_blob(masm, &frame_size_in_words, &bounce_offset, &exception_offset);
+
+  // -------------
+  // make sure all code is generated
+  masm->flush();
+
+  // failed to generate?
+  if (frame_size_in_words < 0 || bounce_offset < 0 || exception_offset < 0) {
+    assert(false, "bad ricochet blob");
+    return;
+  }
+
+  _ricochet_blob = RicochetBlob::create(&buffer, bounce_offset, exception_offset, frame_size_in_words);
+}
+
 //------------------------------generate_deopt_blob----------------------------
 void SharedRuntime::generate_deopt_blob() {
   // Allocate space for the code
@@ -3205,6 +3231,8 @@
     generate_handler_blob(CAST_FROM_FN_PTR(address,
                    SafepointSynchronize::handle_polling_page_exception), true);
 
+  generate_ricochet_blob();
+
   generate_deopt_blob();
 
 #ifdef COMPILER2
--- a/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Tue May 24 11:09:39 2011 -0700
@@ -36,7 +36,7 @@
 
 // MethodHandles adapters
 enum method_handles_platform_dependent_constants {
-  method_handles_adapters_code_size = 10000
+  method_handles_adapters_code_size = 30000 DEBUG_ONLY(+ 10000)
 };
 
 class x86 {
--- a/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Tue May 24 11:09:39 2011 -0700
@@ -38,7 +38,7 @@
 
 // MethodHandles adapters
 enum method_handles_platform_dependent_constants {
-  method_handles_adapters_code_size = 40000
+  method_handles_adapters_code_size = 80000 DEBUG_ONLY(+ 120000)
 };
 
 class x86 {
--- a/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Tue May 24 11:09:39 2011 -0700
@@ -1589,6 +1589,7 @@
                                            int tempcount,
                                            int popframe_extra_args,
                                            int moncount,
+                                           int caller_actual_parameters,
                                            int callee_param_count,
                                            int callee_locals,
                                            frame* caller,
--- a/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Tue May 24 11:09:39 2011 -0700
@@ -1603,6 +1603,7 @@
                                            int tempcount,
                                            int popframe_extra_args,
                                            int moncount,
+                                           int caller_actual_parameters,
                                            int callee_param_count,
                                            int callee_locals,
                                            frame* caller,
--- a/src/cpu/x86/vm/x86_32.ad	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/x86/vm/x86_32.ad	Tue May 24 11:09:39 2011 -0700
@@ -12989,6 +12989,53 @@
 %}
 
 // ============================================================================
+// Counted Loop limit node which represents exact final iterator value.
+// Note: the resulting value should fit into integer range since
+// counted loops have limit check on overflow.
+instruct loopLimit_eReg(eAXRegI limit, nadxRegI init, immI stride, eDXRegI limit_hi, nadxRegI tmp, eFlagsReg flags) %{
+  match(Set limit (LoopLimit (Binary init limit) stride));
+  effect(TEMP limit_hi, TEMP tmp, KILL flags);
+  ins_cost(300);
+
+  format %{ "loopLimit $init,$limit,$stride  # $limit = $init + $stride *( $limit - $init + $stride -1)/ $stride, kills $limit_hi" %}
+  ins_encode %{
+    int strd = (int)$stride$$constant;
+    assert(strd != 1 && strd != -1, "sanity");
+    int m1 = (strd > 0) ? 1 : -1;
+    // Convert limit to long (EAX:EDX)
+    __ cdql();
+    // Convert init to long (init:tmp)
+    __ movl($tmp$$Register, $init$$Register);
+    __ sarl($tmp$$Register, 31);
+    // $limit - $init
+    __ subl($limit$$Register, $init$$Register);
+    __ sbbl($limit_hi$$Register, $tmp$$Register);
+    // + ($stride - 1)
+    if (strd > 0) {
+      __ addl($limit$$Register, (strd - 1));
+      __ adcl($limit_hi$$Register, 0);
+      __ movl($tmp$$Register, strd);
+    } else {
+      __ addl($limit$$Register, (strd + 1));
+      __ adcl($limit_hi$$Register, -1);
+      __ lneg($limit_hi$$Register, $limit$$Register);
+      __ movl($tmp$$Register, -strd);
+    }
+    // signed devision: (EAX:EDX) / pos_stride
+    __ idivl($tmp$$Register);
+    if (strd < 0) {
+      // restore sign
+      __ negl($tmp$$Register);
+    }
+    // (EAX) * stride
+    __ mull($tmp$$Register);
+    // + init (ignore upper bits)
+    __ addl($limit$$Register, $init$$Register);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// ============================================================================
 // Branch Instructions
 // Jump Table
 instruct jumpXtnd(eRegI switch_val) %{
--- a/src/cpu/zero/vm/cppInterpreter_zero.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/zero/vm/cppInterpreter_zero.cpp	Tue May 24 11:09:39 2011 -0700
@@ -1427,6 +1427,7 @@
                                            int       tempcount,
                                            int       popframe_extra_args,
                                            int       moncount,
+                                           int       caller_actual_parameters,
                                            int       callee_param_count,
                                            int       callee_locals,
                                            frame*    caller,
--- a/src/cpu/zero/vm/interpreter_zero.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/cpu/zero/vm/interpreter_zero.cpp	Tue May 24 11:09:39 2011 -0700
@@ -82,24 +82,6 @@
   return true;
 }
 
-int AbstractInterpreter::size_activation(methodOop method,
-                                         int tempcount,
-                                         int popframe_extra_args,
-                                         int moncount,
-                                         int callee_param_count,
-                                         int callee_locals,
-                                         bool is_top_frame) {
-  return layout_activation(method,
-                           tempcount,
-                           popframe_extra_args,
-                           moncount,
-                           callee_param_count,
-                           callee_locals,
-                           (frame*) NULL,
-                           (frame*) NULL,
-                           is_top_frame);
-}
-
 void Deoptimization::unwind_callee_save_values(frame* f,
                                                vframeArray* vframe_array) {
 }
--- a/src/os/linux/vm/globals_linux.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/os/linux/vm/globals_linux.hpp	Tue May 24 11:09:39 2011 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -47,7 +47,7 @@
 // Defines Linux-specific default values. The flags are available on all
 // platforms, but they may have different default values on other platforms.
 //
-define_pd_global(bool, UseLargePages, false);
+define_pd_global(bool, UseLargePages, true);
 define_pd_global(bool, UseLargePagesIndividualAllocation, false);
 define_pd_global(bool, UseOSErrorReporting, false);
 define_pd_global(bool, UseThreadPriorities, true) ;
--- a/src/os/linux/vm/os_linux.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/os/linux/vm/os_linux.cpp	Tue May 24 11:09:39 2011 -0700
@@ -2850,7 +2850,7 @@
         char chars[257];
         long x = 0;
         if (fgets(chars, sizeof(chars), fp)) {
-          if (sscanf(chars, "%lx-%*lx", &x) == 1
+          if (sscanf(chars, "%lx-%*x", &x) == 1
               && x == (long)p) {
             if (strstr (chars, "hugepage")) {
               result = true;
@@ -2914,16 +2914,21 @@
 
 static size_t _large_page_size = 0;
 
-bool os::large_page_init() {
+void os::large_page_init() {
   if (!UseLargePages) {
     UseHugeTLBFS = false;
     UseSHM = false;
-    return false;
+    return;
   }
 
   if (FLAG_IS_DEFAULT(UseHugeTLBFS) && FLAG_IS_DEFAULT(UseSHM)) {
-    // Our user has not expressed a preference, so we'll try both.
-    UseHugeTLBFS = UseSHM = true;
+    // If UseLargePages is specified on the command line try both methods,
+    // if it's default, then try only HugeTLBFS.
+    if (FLAG_IS_DEFAULT(UseLargePages)) {
+      UseHugeTLBFS = true;
+    } else {
+      UseHugeTLBFS = UseSHM = true;
+    }
   }
 
   if (LargePageSizeInBytes) {
@@ -2978,7 +2983,6 @@
     _page_sizes[1] = default_page_size;
     _page_sizes[2] = 0;
   }
-
   UseHugeTLBFS = UseHugeTLBFS &&
                  Linux::hugetlbfs_sanity_check(warn_on_failure, _large_page_size);
 
@@ -2988,12 +2992,6 @@
   UseLargePages = UseHugeTLBFS || UseSHM;
 
   set_coredump_filter();
-
-  // Large page support is available on 2.6 or newer kernel, some vendors
-  // (e.g. Redhat) have backported it to their 2.4 based distributions.
-  // We optimistically assume the support is available. If later it turns out
-  // not true, VM will automatically switch to use regular page size.
-  return true;
 }
 
 #ifndef SHM_HUGETLB
@@ -4118,7 +4116,7 @@
 #endif
   }
 
-  FLAG_SET_DEFAULT(UseLargePages, os::large_page_init());
+  os::large_page_init();
 
   // initialize suspend/resume support - must do this before signal_sets_init()
   if (SR_initialize() != 0) {
--- a/src/os/solaris/vm/os_solaris.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/os/solaris/vm/os_solaris.cpp	Tue May 24 11:09:39 2011 -0700
@@ -3336,11 +3336,11 @@
   return true;
 }
 
-bool os::large_page_init() {
+void os::large_page_init() {
   if (!UseLargePages) {
     UseISM = false;
     UseMPSS = false;
-    return false;
+    return;
   }
 
   // print a warning if any large page related flag is specified on command line
@@ -3361,7 +3361,6 @@
             Solaris::mpss_sanity_check(warn_on_failure, &_large_page_size);
 
   UseLargePages = UseISM || UseMPSS;
-  return UseLargePages;
 }
 
 bool os::Solaris::set_mpss_range(caddr_t start, size_t bytes, size_t align) {
@@ -4992,7 +4991,7 @@
 #endif
 }
 
-  FLAG_SET_DEFAULT(UseLargePages, os::large_page_init());
+  os::large_page_init();
 
   // Check minimum allowable stack size for thread creation and to initialize
   // the java system classes, including StackOverflowError - depends on page
--- a/src/os/windows/vm/os_windows.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/os/windows/vm/os_windows.cpp	Tue May 24 11:09:39 2011 -0700
@@ -2649,8 +2649,8 @@
   _hToken = NULL;
 }
 
-bool os::large_page_init() {
-  if (!UseLargePages) return false;
+void os::large_page_init() {
+  if (!UseLargePages) return;
 
   // print a warning if any large page related flag is specified on command line
   bool warn_on_failure = !FLAG_IS_DEFAULT(UseLargePages) ||
@@ -2695,7 +2695,7 @@
   }
 
   cleanup_after_large_page_init();
-  return success;
+  UseLargePages = success;
 }
 
 // On win32, one cannot release just a part of reserved memory, it's an
@@ -3478,7 +3478,7 @@
 #endif
 }
 
-  FLAG_SET_DEFAULT(UseLargePages, os::large_page_init());
+  os::large_page_init();
 
   // Setup Windows Exceptions
 
--- a/src/os_cpu/linux_x86/vm/orderAccess_linux_x86.inline.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/os_cpu/linux_x86/vm/orderAccess_linux_x86.inline.hpp	Tue May 24 11:09:39 2011 -0700
@@ -93,7 +93,7 @@
 
 inline void     OrderAccess::store_fence(jbyte*  p, jbyte  v) {
   __asm__ volatile (  "xchgb (%2),%0"
-                    : "=r" (v)
+                    : "=q" (v)
                     : "0" (v), "r" (p)
                     : "memory");
 }
@@ -155,7 +155,7 @@
 // Must duplicate definitions instead of calling store_fence because we don't want to cast away volatile.
 inline void     OrderAccess::release_store_fence(volatile jbyte*  p, jbyte  v) {
   __asm__ volatile (  "xchgb (%2),%0"
-                    : "=r" (v)
+                    : "=q" (v)
                     : "0" (v), "r" (p)
                     : "memory");
 }
--- a/src/share/vm/c1/c1_InstructionPrinter.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/c1/c1_InstructionPrinter.cpp	Tue May 24 11:09:39 2011 -0700
@@ -132,17 +132,22 @@
     if (value->is_null_object()) {
       output()->print("null");
     } else if (!value->is_loaded()) {
-      output()->print("<unloaded object 0x%x>", value);
+      output()->print("<unloaded object " PTR_FORMAT ">", value);
     } else if (value->is_method()) {
       ciMethod* m = (ciMethod*)value;
       output()->print("<method %s.%s>", m->holder()->name()->as_utf8(), m->name()->as_utf8());
     } else {
-      output()->print("<object 0x%x>", value->constant_encoding());
+      output()->print("<object " PTR_FORMAT ">", value->constant_encoding());
     }
   } else if (type->as_InstanceConstant() != NULL) {
-    output()->print("<instance 0x%x>", type->as_InstanceConstant()->value()->constant_encoding());
+    ciInstance* value = type->as_InstanceConstant()->value();
+    if (value->is_loaded()) {
+      output()->print("<instance " PTR_FORMAT ">", value->constant_encoding());
+    } else {
+      output()->print("<unloaded instance " PTR_FORMAT ">", value);
+    }
   } else if (type->as_ArrayConstant() != NULL) {
-    output()->print("<array 0x%x>", type->as_ArrayConstant()->value()->constant_encoding());
+    output()->print("<array " PTR_FORMAT ">", type->as_ArrayConstant()->value()->constant_encoding());
   } else if (type->as_ClassConstant() != NULL) {
     ciInstanceKlass* klass = type->as_ClassConstant()->value();
     if (!klass->is_loaded()) {
--- a/src/share/vm/c1/c1_Optimizer.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/c1/c1_Optimizer.cpp	Tue May 24 11:09:39 2011 -0700
@@ -252,26 +252,28 @@
         Constant::CompareResult t_compare_res = x_tval_const->compare(cond, y_const);
         Constant::CompareResult f_compare_res = x_fval_const->compare(cond, y_const);
 
-        guarantee(t_compare_res != Constant::not_comparable && f_compare_res != Constant::not_comparable, "incomparable constants in IfOp");
+        // not_comparable here is a valid return in case we're comparing unloaded oop constants
+        if (t_compare_res != Constant::not_comparable && f_compare_res != Constant::not_comparable) {
+          Value new_tval = t_compare_res == Constant::cond_true ? tval : fval;
+          Value new_fval = f_compare_res == Constant::cond_true ? tval : fval;
 
-        Value new_tval = t_compare_res == Constant::cond_true ? tval : fval;
-        Value new_fval = f_compare_res == Constant::cond_true ? tval : fval;
-
-        _ifop_count++;
-        if (new_tval == new_fval) {
-          return new_tval;
-        } else {
-          return new IfOp(x_ifop->x(), x_ifop_cond, x_ifop->y(), new_tval, new_fval);
+          _ifop_count++;
+          if (new_tval == new_fval) {
+            return new_tval;
+          } else {
+            return new IfOp(x_ifop->x(), x_ifop_cond, x_ifop->y(), new_tval, new_fval);
+          }
         }
       }
     } else {
       Constant* x_const = x->as_Constant();
       if (x_const != NULL) {         // x and y are constants
         Constant::CompareResult x_compare_res = x_const->compare(cond, y_const);
-        guarantee(x_compare_res != Constant::not_comparable, "incomparable constants in IfOp");
-
-        _ifop_count++;
-        return x_compare_res == Constant::cond_true ? tval : fval;
+        // not_comparable here is a valid return in case we're comparing unloaded oop constants
+        if (x_compare_res != Constant::not_comparable) {
+          _ifop_count++;
+          return x_compare_res == Constant::cond_true ? tval : fval;
+        }
       }
     }
   }
--- a/src/share/vm/ci/ciMethodData.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/ci/ciMethodData.hpp	Tue May 24 11:09:39 2011 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -233,7 +233,10 @@
 
 public:
   bool is_method_data()  { return true; }
-  bool is_empty() { return _state == empty_state; }
+
+  void set_mature() { _state = mature_state; }
+
+  bool is_empty()  { return _state == empty_state; }
   bool is_mature() { return _state == mature_state; }
 
   int creation_mileage() { return _orig.creation_mileage(); }
--- a/src/share/vm/ci/ciMethodHandle.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/ci/ciMethodHandle.cpp	Tue May 24 11:09:39 2011 -0700
@@ -25,6 +25,7 @@
 #include "precompiled.hpp"
 #include "ci/ciClassList.hpp"
 #include "ci/ciInstance.hpp"
+#include "ci/ciMethodData.hpp"
 #include "ci/ciMethodHandle.hpp"
 #include "ci/ciUtilities.hpp"
 #include "prims/methodHandleWalk.hpp"
@@ -36,13 +37,13 @@
 // ciMethodHandle::get_adapter
 //
 // Return an adapter for this MethodHandle.
-ciMethod* ciMethodHandle::get_adapter(bool is_invokedynamic) const {
+ciMethod* ciMethodHandle::get_adapter_impl(bool is_invokedynamic) const {
   VM_ENTRY_MARK;
   Handle h(get_oop());
   methodHandle callee(_callee->get_methodOop());
   // We catch all exceptions here that could happen in the method
   // handle compiler and stop the VM.
-  MethodHandleCompiler mhc(h, callee, is_invokedynamic, THREAD);
+  MethodHandleCompiler mhc(h, callee, _profile->count(), is_invokedynamic, THREAD);
   if (!HAS_PENDING_EXCEPTION) {
     methodHandle m = mhc.compile(THREAD);
     if (!HAS_PENDING_EXCEPTION) {
@@ -58,6 +59,22 @@
   return NULL;
 }
 
+// ------------------------------------------------------------------
+// ciMethodHandle::get_adapter
+//
+// Return an adapter for this MethodHandle.
+ciMethod* ciMethodHandle::get_adapter(bool is_invokedynamic) const {
+  ciMethod* result = get_adapter_impl(is_invokedynamic);
+  if (result) {
+    // Fake up the MDO maturity.
+    ciMethodData* mdo = result->method_data();
+    if (mdo != NULL && _caller->method_data() != NULL && _caller->method_data()->is_mature()) {
+      mdo->set_mature();
+    }
+  }
+  return result;
+}
+
 
 // ------------------------------------------------------------------
 // ciMethodHandle::print_impl
--- a/src/share/vm/ci/ciMethodHandle.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/ci/ciMethodHandle.hpp	Tue May 24 11:09:39 2011 -0700
@@ -25,6 +25,7 @@
 #ifndef SHARE_VM_CI_CIMETHODHANDLE_HPP
 #define SHARE_VM_CI_CIMETHODHANDLE_HPP
 
+#include "ci/ciCallProfile.hpp"
 #include "ci/ciInstance.hpp"
 #include "prims/methodHandles.hpp"
 
@@ -33,32 +34,37 @@
 // The class represents a java.lang.invoke.MethodHandle object.
 class ciMethodHandle : public ciInstance {
 private:
-  ciMethod* _callee;
+  ciMethod*      _callee;
+  ciMethod*      _caller;
+  ciCallProfile* _profile;
 
   // Return an adapter for this MethodHandle.
-  ciMethod* get_adapter(bool is_invokedynamic) const;
+  ciMethod* get_adapter_impl(bool is_invokedynamic) const;
+  ciMethod* get_adapter(     bool is_invokedynamic) const;
 
 protected:
   void print_impl(outputStream* st);
 
 public:
-  ciMethodHandle(instanceHandle h_i) : ciInstance(h_i) {};
+  ciMethodHandle(instanceHandle h_i) :
+    ciInstance(h_i),
+    _callee(NULL),
+    _caller(NULL),
+    _profile(NULL)
+  {}
 
   // What kind of ciObject is this?
   bool is_method_handle() const { return true; }
 
-  ciMethod* callee() const { return _callee; }
-  void  set_callee(ciMethod* m) { _callee = m; }
+  void set_callee(ciMethod* m)                  { _callee  = m;       }
+  void set_caller(ciMethod* m)                  { _caller  = m;       }
+  void set_call_profile(ciCallProfile* profile) { _profile = profile; }
 
   // Return an adapter for a MethodHandle call.
-  ciMethod* get_method_handle_adapter() const {
-    return get_adapter(false);
-  }
+  ciMethod* get_method_handle_adapter() const { return get_adapter(false); }
 
   // Return an adapter for an invokedynamic call.
-  ciMethod* get_invokedynamic_adapter() const {
-    return get_adapter(true);
-  }
+  ciMethod* get_invokedynamic_adapter() const { return get_adapter(true);  }
 };
 
 #endif // SHARE_VM_CI_CIMETHODHANDLE_HPP
--- a/src/share/vm/classfile/javaClasses.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/classfile/javaClasses.cpp	Tue May 24 11:09:39 2011 -0700
@@ -2602,6 +2602,7 @@
 // Support for java_lang_invoke_MethodTypeForm
 
 int java_lang_invoke_MethodTypeForm::_vmslots_offset;
+int java_lang_invoke_MethodTypeForm::_vmlayout_offset;
 int java_lang_invoke_MethodTypeForm::_erasedType_offset;
 int java_lang_invoke_MethodTypeForm::_genericInvoker_offset;
 
@@ -2609,6 +2610,7 @@
   klassOop k = SystemDictionary::MethodTypeForm_klass();
   if (k != NULL) {
     compute_optional_offset(_vmslots_offset,    k, vmSymbols::vmslots_name(),    vmSymbols::int_signature(), true);
+    compute_optional_offset(_vmlayout_offset,   k, vmSymbols::vmlayout_name(),   vmSymbols::object_signature());
     compute_optional_offset(_erasedType_offset, k, vmSymbols::erasedType_name(), vmSymbols::java_lang_invoke_MethodType_signature(), true);
     compute_optional_offset(_genericInvoker_offset, k, vmSymbols::genericInvoker_name(), vmSymbols::java_lang_invoke_MethodHandle_signature(), true);
     if (_genericInvoker_offset == 0)  _genericInvoker_offset = -1;  // set to explicit "empty" value
@@ -2617,9 +2619,31 @@
 
 int java_lang_invoke_MethodTypeForm::vmslots(oop mtform) {
   assert(mtform->klass() == SystemDictionary::MethodTypeForm_klass(), "MTForm only");
+  assert(_vmslots_offset > 0, "");
   return mtform->int_field(_vmslots_offset);
 }
 
+oop java_lang_invoke_MethodTypeForm::vmlayout(oop mtform) {
+  assert(mtform->klass() == SystemDictionary::MethodTypeForm_klass(), "MTForm only");
+  assert(_vmlayout_offset > 0, "");
+  return mtform->obj_field(_vmlayout_offset);
+}
+
+oop java_lang_invoke_MethodTypeForm::init_vmlayout(oop mtform, oop cookie) {
+  assert(mtform->klass() == SystemDictionary::MethodTypeForm_klass(), "MTForm only");
+  oop previous = vmlayout(mtform);
+  if (previous != NULL) {
+    return previous;  // someone else beat us to it
+  }
+  HeapWord* cookie_addr = (HeapWord*) mtform->obj_field_addr<oop>(_vmlayout_offset);
+  OrderAccess::storestore();  // make sure our copy is fully committed
+  previous = oopDesc::atomic_compare_exchange_oop(cookie, cookie_addr, previous);
+  if (previous != NULL) {
+    return previous;  // someone else beat us to it
+  }
+  return cookie;
+}
+
 oop java_lang_invoke_MethodTypeForm::erasedType(oop mtform) {
   assert(mtform->klass() == SystemDictionary::MethodTypeForm_klass(), "MTForm only");
   return mtform->obj_field(_erasedType_offset);
--- a/src/share/vm/classfile/javaClasses.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/classfile/javaClasses.hpp	Tue May 24 11:09:39 2011 -0700
@@ -949,18 +949,19 @@
     OP_CHECK_CAST    = 0x2, // ref-to-ref conversion; requires a Class argument
     OP_PRIM_TO_PRIM  = 0x3, // converts from one primitive to another
     OP_REF_TO_PRIM   = 0x4, // unboxes a wrapper to produce a primitive
-    OP_PRIM_TO_REF   = 0x5, // boxes a primitive into a wrapper (NYI)
+    OP_PRIM_TO_REF   = 0x5, // boxes a primitive into a wrapper
     OP_SWAP_ARGS     = 0x6, // swap arguments (vminfo is 2nd arg)
     OP_ROT_ARGS      = 0x7, // rotate arguments (vminfo is displaced arg)
     OP_DUP_ARGS      = 0x8, // duplicates one or more arguments (at TOS)
     OP_DROP_ARGS     = 0x9, // remove one or more argument slots
-    OP_COLLECT_ARGS  = 0xA, // combine one or more arguments into a varargs (NYI)
+    OP_COLLECT_ARGS  = 0xA, // combine arguments using an auxiliary function
     OP_SPREAD_ARGS   = 0xB, // expand in place a varargs array (of known size)
-    OP_FLYBY         = 0xC, // operate first on reified argument list (NYI)
-    OP_RICOCHET      = 0xD, // run an adapter chain on the return value (NYI)
+    OP_FOLD_ARGS     = 0xC, // combine but do not remove arguments; prepend result
+    //OP_UNUSED_13   = 0xD, // unused code, perhaps for reified argument lists
     CONV_OP_LIMIT    = 0xE, // limit of CONV_OP enumeration
 
     CONV_OP_MASK     = 0xF00, // this nybble contains the conversion op field
+    CONV_TYPE_MASK   = 0x0F,  // fits T_ADDRESS and below
     CONV_VMINFO_MASK = 0x0FF, // LSB is reserved for JVM use
     CONV_VMINFO_SHIFT     =  0, // position of bits in CONV_VMINFO_MASK
     CONV_OP_SHIFT         =  8, // position of bits in CONV_OP_MASK
@@ -1089,6 +1090,7 @@
 
  private:
   static int _vmslots_offset;           // number of argument slots needed
+  static int _vmlayout_offset;          // object describing internal calling sequence
   static int _erasedType_offset;        // erasedType = canonical MethodType
   static int _genericInvoker_offset;    // genericInvoker = adapter for invokeGeneric
 
@@ -1100,8 +1102,12 @@
   static oop            erasedType(oop mtform);
   static oop            genericInvoker(oop mtform);
 
+  static oop            vmlayout(oop mtform);
+  static oop       init_vmlayout(oop mtform, oop cookie);
+
   // Accessors for code generation:
   static int vmslots_offset_in_bytes()          { return _vmslots_offset; }
+  static int vmlayout_offset_in_bytes()         { return _vmlayout_offset; }
   static int erasedType_offset_in_bytes()       { return _erasedType_offset; }
   static int genericInvoker_offset_in_bytes()   { return _genericInvoker_offset; }
 };
--- a/src/share/vm/classfile/systemDictionary.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/classfile/systemDictionary.cpp	Tue May 24 11:09:39 2011 -0700
@@ -2362,8 +2362,15 @@
       spe = invoke_method_table()->find_entry(index, hash, signature, name_id);
       if (spe == NULL)
         spe = invoke_method_table()->add_entry(index, hash, signature, name_id);
-      if (spe->property_oop() == NULL)
+      if (spe->property_oop() == NULL) {
         spe->set_property_oop(m());
+        // Link m to his method type, if it is suitably generic.
+        oop mtform = java_lang_invoke_MethodType::form(mt());
+        if (mtform != NULL && mt() == java_lang_invoke_MethodTypeForm::erasedType(mtform)
+            && java_lang_invoke_MethodTypeForm::vmlayout_offset_in_bytes() > 0) {
+          java_lang_invoke_MethodTypeForm::init_vmlayout(mtform, m());
+        }
+      }
     } else {
       non_cached_result = m;
     }
--- a/src/share/vm/classfile/vmSymbols.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/classfile/vmSymbols.hpp	Tue May 24 11:09:39 2011 -0700
@@ -341,6 +341,7 @@
   template(vmtarget_name,                             "vmtarget")                                 \
   template(vmentry_name,                              "vmentry")                                  \
   template(vmslots_name,                              "vmslots")                                  \
+  template(vmlayout_name,                             "vmlayout")                                 \
   template(vmindex_name,                              "vmindex")                                  \
   template(vmargslot_name,                            "vmargslot")                                \
   template(flags_name,                                "flags")                                    \
@@ -393,6 +394,7 @@
   template(void_signature,                            "V")                                        \
   template(byte_array_signature,                      "[B")                                       \
   template(char_array_signature,                      "[C")                                       \
+  template(int_array_signature,                       "[I")                                       \
   template(object_void_signature,                     "(Ljava/lang/Object;)V")                    \
   template(object_int_signature,                      "(Ljava/lang/Object;)I")                    \
   template(object_boolean_signature,                  "(Ljava/lang/Object;)Z")                    \
@@ -471,6 +473,13 @@
   template(sun_management_ManagementFactory,           "sun/management/ManagementFactory")                        \
   template(sun_management_Sensor,                      "sun/management/Sensor")                                   \
   template(sun_management_Agent,                       "sun/management/Agent")                                    \
+  template(sun_management_GarbageCollectorImpl,        "sun/management/GarbageCollectorImpl")                     \
+  template(getGcInfoBuilder_name,                      "getGcInfoBuilder")                                        \
+  template(getGcInfoBuilder_signature,                 "()Lsun/management/GcInfoBuilder;")                        \
+  template(com_sun_management_GcInfo,                  "com/sun/management/GcInfo")                               \
+  template(com_sun_management_GcInfo_constructor_signature, "(Lsun/management/GcInfoBuilder;JJJ[Ljava/lang/management/MemoryUsage;[Ljava/lang/management/MemoryUsage;[Ljava/lang/Object;)V") \
+  template(createGCNotification_name,                  "createGCNotification")                                    \
+  template(createGCNotification_signature,             "(JLjava/lang/String;Ljava/lang/String;Ljava/lang/String;Lcom/sun/management/GcInfo;)V") \
   template(createMemoryPoolMBean_name,                 "createMemoryPoolMBean")                                   \
   template(createMemoryManagerMBean_name,              "createMemoryManagerMBean")                                \
   template(createGarbageCollectorMBean_name,           "createGarbageCollectorMBean")                             \
@@ -488,6 +497,7 @@
   template(java_lang_management_MemoryPoolMXBean,      "java/lang/management/MemoryPoolMXBean")                   \
   template(java_lang_management_MemoryManagerMXBean,   "java/lang/management/MemoryManagerMXBean")                \
   template(java_lang_management_GarbageCollectorMXBean,"java/lang/management/GarbageCollectorMXBean")             \
+  template(gcInfoBuilder_name,                         "gcInfoBuilder")                                           \
   template(createMemoryPool_name,                      "createMemoryPool")                                        \
   template(createMemoryManager_name,                   "createMemoryManager")                                     \
   template(createGarbageCollector_name,                "createGarbageCollector")                                  \
--- a/src/share/vm/code/codeBlob.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/code/codeBlob.cpp	Tue May 24 11:09:39 2011 -0700
@@ -152,6 +152,32 @@
 }
 
 
+void CodeBlob::trace_new_stub(CodeBlob* stub, const char* name1, const char* name2) {
+  // Do not hold the CodeCache lock during name formatting.
+  assert(!CodeCache_lock->owned_by_self(), "release CodeCache before registering the stub");
+
+  if (stub != NULL) {
+    char stub_id[256];
+    assert(strlen(name1) + strlen(name2) < sizeof(stub_id), "");
+    jio_snprintf(stub_id, sizeof(stub_id), "%s%s", name1, name2);
+    if (PrintStubCode) {
+      tty->print_cr("Decoding %s " INTPTR_FORMAT, stub_id, (intptr_t) stub);
+      Disassembler::decode(stub->code_begin(), stub->code_end());
+    }
+    Forte::register_stub(stub_id, stub->code_begin(), stub->code_end());
+
+    if (JvmtiExport::should_post_dynamic_code_generated()) {
+      const char* stub_name = name2;
+      if (name2[0] == '\0')  stub_name = name1;
+      JvmtiExport::post_dynamic_code_generated(stub_name, stub->code_begin(), stub->code_end());
+    }
+  }
+
+  // Track memory usage statistic after releasing CodeCache_lock
+  MemoryService::track_code_cache_memory_usage();
+}
+
+
 void CodeBlob::flush() {
   if (_oop_maps) {
     FREE_C_HEAP_ARRAY(unsigned char, _oop_maps);
@@ -312,23 +338,7 @@
     stub = new (size) RuntimeStub(stub_name, cb, size, frame_complete, frame_size, oop_maps, caller_must_gc_arguments);
   }
 
-  // Do not hold the CodeCache lock during name formatting.
-  if (stub != NULL) {
-    char stub_id[256];
-    jio_snprintf(stub_id, sizeof(stub_id), "RuntimeStub - %s", stub_name);
-    if (PrintStubCode) {
-      tty->print_cr("Decoding %s " INTPTR_FORMAT, stub_id, stub);
-      Disassembler::decode(stub->code_begin(), stub->code_end());
-    }
-    Forte::register_stub(stub_id, stub->code_begin(), stub->code_end());
-
-    if (JvmtiExport::should_post_dynamic_code_generated()) {
-      JvmtiExport::post_dynamic_code_generated(stub_name, stub->code_begin(), stub->code_end());
-    }
-  }
-
-  // Track memory usage statistic after releasing CodeCache_lock
-  MemoryService::track_code_cache_memory_usage();
+  trace_new_stub(stub, "RuntimeStub - ", stub_name);
 
   return stub;
 }
@@ -340,6 +350,50 @@
   return p;
 }
 
+// operator new shared by all singletons:
+void* SingletonBlob::operator new(size_t s, unsigned size) {
+  void* p = CodeCache::allocate(size);
+  if (!p) fatal("Initial size of CodeCache is too small");
+  return p;
+}
+
+
+//----------------------------------------------------------------------------------------------------
+// Implementation of RicochetBlob
+
+RicochetBlob::RicochetBlob(
+  CodeBuffer* cb,
+  int         size,
+  int         bounce_offset,
+  int         exception_offset,
+  int         frame_size
+)
+: SingletonBlob("RicochetBlob", cb, sizeof(RicochetBlob), size, frame_size, (OopMapSet*) NULL)
+{
+  _bounce_offset = bounce_offset;
+  _exception_offset = exception_offset;
+}
+
+
+RicochetBlob* RicochetBlob::create(
+  CodeBuffer* cb,
+  int         bounce_offset,
+  int         exception_offset,
+  int         frame_size)
+{
+  RicochetBlob* blob = NULL;
+  ThreadInVMfromUnknown __tiv;  // get to VM state in case we block on CodeCache_lock
+  {
+    MutexLockerEx mu(CodeCache_lock, Mutex::_no_safepoint_check_flag);
+    unsigned int size = allocation_size(cb, sizeof(RicochetBlob));
+    blob = new (size) RicochetBlob(cb, size, bounce_offset, exception_offset, frame_size);
+  }
+
+  trace_new_stub(blob, "RicochetBlob");
+
+  return blob;
+}
+
 
 //----------------------------------------------------------------------------------------------------
 // Implementation of DeoptimizationBlob
@@ -386,34 +440,12 @@
                                          frame_size);
   }
 
-  // Do not hold the CodeCache lock during name formatting.
-  if (blob != NULL) {
-    char blob_id[256];
-    jio_snprintf(blob_id, sizeof(blob_id), "DeoptimizationBlob@" PTR_FORMAT, blob->code_begin());
-    if (PrintStubCode) {
-      tty->print_cr("Decoding %s " INTPTR_FORMAT, blob_id, blob);
-      Disassembler::decode(blob->code_begin(), blob->code_end());
-    }
-    Forte::register_stub(blob_id, blob->code_begin(), blob->code_end());
-
-    if (JvmtiExport::should_post_dynamic_code_generated()) {
-      JvmtiExport::post_dynamic_code_generated("DeoptimizationBlob", blob->code_begin(), blob->code_end());
-    }
-  }
-
-  // Track memory usage statistic after releasing CodeCache_lock
-  MemoryService::track_code_cache_memory_usage();
+  trace_new_stub(blob, "DeoptimizationBlob");
 
   return blob;
 }
 
 
-void* DeoptimizationBlob::operator new(size_t s, unsigned size) {
-  void* p = CodeCache::allocate(size);
-  if (!p) fatal("Initial size of CodeCache is too small");
-  return p;
-}
-
 //----------------------------------------------------------------------------------------------------
 // Implementation of UncommonTrapBlob
 
@@ -441,33 +473,12 @@
     blob = new (size) UncommonTrapBlob(cb, size, oop_maps, frame_size);
   }
 
-  // Do not hold the CodeCache lock during name formatting.
-  if (blob != NULL) {
-    char blob_id[256];
-    jio_snprintf(blob_id, sizeof(blob_id), "UncommonTrapBlob@" PTR_FORMAT, blob->code_begin());
-    if (PrintStubCode) {
-      tty->print_cr("Decoding %s " INTPTR_FORMAT, blob_id, blob);
-      Disassembler::decode(blob->code_begin(), blob->code_end());
-    }
-    Forte::register_stub(blob_id, blob->code_begin(), blob->code_end());
-
-    if (JvmtiExport::should_post_dynamic_code_generated()) {
-      JvmtiExport::post_dynamic_code_generated("UncommonTrapBlob", blob->code_begin(), blob->code_end());
-    }
-  }
-
-  // Track memory usage statistic after releasing CodeCache_lock
-  MemoryService::track_code_cache_memory_usage();
+  trace_new_stub(blob, "UncommonTrapBlob");
 
   return blob;
 }
 
 
-void* UncommonTrapBlob::operator new(size_t s, unsigned size) {
-  void* p = CodeCache::allocate(size);
-  if (!p) fatal("Initial size of CodeCache is too small");
-  return p;
-}
 #endif // COMPILER2
 
 
@@ -498,33 +509,12 @@
     blob = new (size) ExceptionBlob(cb, size, oop_maps, frame_size);
   }
 
-  // We do not need to hold the CodeCache lock during name formatting
-  if (blob != NULL) {
-    char blob_id[256];
-    jio_snprintf(blob_id, sizeof(blob_id), "ExceptionBlob@" PTR_FORMAT, blob->code_begin());
-    if (PrintStubCode) {
-      tty->print_cr("Decoding %s " INTPTR_FORMAT, blob_id, blob);
-      Disassembler::decode(blob->code_begin(), blob->code_end());
-    }
-    Forte::register_stub(blob_id, blob->code_begin(), blob->code_end());
-
-    if (JvmtiExport::should_post_dynamic_code_generated()) {
-      JvmtiExport::post_dynamic_code_generated("ExceptionBlob", blob->code_begin(), blob->code_end());
-    }
-  }
-
-  // Track memory usage statistic after releasing CodeCache_lock
-  MemoryService::track_code_cache_memory_usage();
+  trace_new_stub(blob, "ExceptionBlob");
 
   return blob;
 }
 
 
-void* ExceptionBlob::operator new(size_t s, unsigned size) {
-  void* p = CodeCache::allocate(size);
-  if (!p) fatal("Initial size of CodeCache is too small");
-  return p;
-}
 #endif // COMPILER2
 
 
@@ -554,35 +544,12 @@
     blob = new (size) SafepointBlob(cb, size, oop_maps, frame_size);
   }
 
-  // We do not need to hold the CodeCache lock during name formatting.
-  if (blob != NULL) {
-    char blob_id[256];
-    jio_snprintf(blob_id, sizeof(blob_id), "SafepointBlob@" PTR_FORMAT, blob->code_begin());
-    if (PrintStubCode) {
-      tty->print_cr("Decoding %s " INTPTR_FORMAT, blob_id, blob);
-      Disassembler::decode(blob->code_begin(), blob->code_end());
-    }
-    Forte::register_stub(blob_id, blob->code_begin(), blob->code_end());
-
-    if (JvmtiExport::should_post_dynamic_code_generated()) {
-      JvmtiExport::post_dynamic_code_generated("SafepointBlob", blob->code_begin(), blob->code_end());
-    }
-  }
-
-  // Track memory usage statistic after releasing CodeCache_lock
-  MemoryService::track_code_cache_memory_usage();
+  trace_new_stub(blob, "SafepointBlob");
 
   return blob;
 }
 
 
-void* SafepointBlob::operator new(size_t s, unsigned size) {
-  void* p = CodeCache::allocate(size);
-  if (!p) fatal("Initial size of CodeCache is too small");
-  return p;
-}
-
-
 //----------------------------------------------------------------------------------------------------
 // Verification and printing
 
--- a/src/share/vm/code/codeBlob.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/code/codeBlob.hpp	Tue May 24 11:09:39 2011 -0700
@@ -35,6 +35,7 @@
 // Suptypes are:
 //   nmethod            : Compiled Java methods (include method that calls to native code)
 //   RuntimeStub        : Call to VM runtime methods
+//   RicochetBlob       : Used for blocking MethodHandle adapters
 //   DeoptimizationBlob : Used for deoptimizatation
 //   ExceptionBlob      : Used for stack unrolling
 //   SafepointBlob      : Used to handle illegal instruction exceptions
@@ -95,12 +96,13 @@
   void flush();
 
   // Typing
-  virtual bool is_buffer_blob() const                 { return false; }
-  virtual bool is_nmethod() const                     { return false; }
-  virtual bool is_runtime_stub() const                { return false; }
-  virtual bool is_deoptimization_stub() const         { return false; }
-  virtual bool is_uncommon_trap_stub() const          { return false; }
-  virtual bool is_exception_stub() const              { return false; }
+  virtual bool is_buffer_blob() const            { return false; }
+  virtual bool is_nmethod() const                { return false; }
+  virtual bool is_runtime_stub() const           { return false; }
+  virtual bool is_ricochet_stub() const          { return false; }
+  virtual bool is_deoptimization_stub() const    { return false; }
+  virtual bool is_uncommon_trap_stub() const     { return false; }
+  virtual bool is_exception_stub() const         { return false; }
   virtual bool is_safepoint_stub() const              { return false; }
   virtual bool is_adapter_blob() const                { return false; }
   virtual bool is_method_handles_adapter_blob() const { return false; }
@@ -182,6 +184,9 @@
   virtual void print_on(outputStream* st) const;
   virtual void print_value_on(outputStream* st) const;
 
+  // Deal with Disassembler, VTune, Forte, JvmtiExport, MemoryService.
+  static void trace_new_stub(CodeBlob* blob, const char* name1, const char* name2 = "");
+
   // Print the comment associated with offset on stream, if there is one
   virtual void print_block_comment(outputStream* stream, address block_begin) {
     intptr_t offset = (intptr_t)(block_begin - code_begin());
@@ -318,7 +323,11 @@
 
 class SingletonBlob: public CodeBlob {
   friend class VMStructs;
-  public:
+
+ protected:
+  void* operator new(size_t s, unsigned size);
+
+ public:
    SingletonBlob(
      const char* name,
      CodeBuffer* cb,
@@ -341,6 +350,50 @@
 
 
 //----------------------------------------------------------------------------------------------------
+// RicochetBlob
+// Holds an arbitrary argument list indefinitely while Java code executes recursively.
+
+class RicochetBlob: public SingletonBlob {
+  friend class VMStructs;
+ private:
+
+  int _bounce_offset;
+  int _exception_offset;
+
+  // Creation support
+  RicochetBlob(
+    CodeBuffer* cb,
+    int         size,
+    int         bounce_offset,
+    int         exception_offset,
+    int         frame_size
+  );
+
+ public:
+  // Creation
+  static RicochetBlob* create(
+    CodeBuffer* cb,
+    int         bounce_offset,
+    int         exception_offset,
+    int         frame_size
+  );
+
+  // Typing
+  bool is_ricochet_stub() const { return true; }
+
+  // GC for args
+  void preserve_callee_argument_oops(frame fr, const RegisterMap *reg_map, OopClosure* f) { /* Nothing to do */ }
+
+  address bounce_addr() const           { return code_begin() + _bounce_offset; }
+  address exception_addr() const        { return code_begin() + _exception_offset; }
+  bool returns_to_bounce_addr(address pc) const {
+    address bounce_pc = bounce_addr();
+    return (pc == bounce_pc || (pc + frame::pc_return_offset) == bounce_pc);
+  }
+};
+
+
+//----------------------------------------------------------------------------------------------------
 // DeoptimizationBlob
 
 class DeoptimizationBlob: public SingletonBlob {
@@ -363,8 +416,6 @@
     int         frame_size
   );
 
-  void* operator new(size_t s, unsigned size);
-
  public:
   // Creation
   static DeoptimizationBlob* create(
@@ -378,7 +429,6 @@
 
   // Typing
   bool is_deoptimization_stub() const { return true; }
-  const DeoptimizationBlob *as_deoptimization_stub() const { return this; }
   bool exception_address_is_unpack_entry(address pc) const {
     address unpack_pc = unpack();
     return (pc == unpack_pc || (pc + frame::pc_return_offset) == unpack_pc);
@@ -426,8 +476,6 @@
     int         frame_size
   );
 
-  void* operator new(size_t s, unsigned size);
-
  public:
   // Creation
   static UncommonTrapBlob* create(
@@ -458,8 +506,6 @@
     int         frame_size
   );
 
-  void* operator new(size_t s, unsigned size);
-
  public:
   // Creation
   static ExceptionBlob* create(
@@ -491,8 +537,6 @@
     int         frame_size
   );
 
-  void* operator new(size_t s, unsigned size);
-
  public:
   // Creation
   static SafepointBlob* create(
--- a/src/share/vm/code/codeCache.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/code/codeCache.cpp	Tue May 24 11:09:39 2011 -0700
@@ -796,6 +796,7 @@
   int nmethodCount = 0;
   int runtimeStubCount = 0;
   int adapterCount = 0;
+  int ricochetStubCount = 0;
   int deoptimizationStubCount = 0;
   int uncommonTrapStubCount = 0;
   int bufferBlobCount = 0;
@@ -840,6 +841,8 @@
       }
     } else if (cb->is_runtime_stub()) {
       runtimeStubCount++;
+    } else if (cb->is_ricochet_stub()) {
+      ricochetStubCount++;
     } else if (cb->is_deoptimization_stub()) {
       deoptimizationStubCount++;
     } else if (cb->is_uncommon_trap_stub()) {
@@ -876,6 +879,7 @@
   tty->print_cr("runtime_stubs: %d",runtimeStubCount);
   tty->print_cr("adapters: %d",adapterCount);
   tty->print_cr("buffer blobs: %d",bufferBlobCount);
+  tty->print_cr("ricochet_stubs: %d",ricochetStubCount);
   tty->print_cr("deoptimization_stubs: %d",deoptimizationStubCount);
   tty->print_cr("uncommon_traps: %d",uncommonTrapStubCount);
   tty->print_cr("\nnmethod size distribution (non-zombie java)");
--- a/src/share/vm/compiler/disassembler.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/compiler/disassembler.cpp	Tue May 24 11:09:39 2011 -0700
@@ -283,10 +283,10 @@
         st->print("Stub::%s", desc->name());
         if (desc->begin() != adr)
           st->print("%+d 0x%p",adr - desc->begin(), adr);
-        else if (WizardMode) st->print(" " INTPTR_FORMAT, adr);
+        else if (WizardMode) st->print(" " PTR_FORMAT, adr);
         return;
       }
-      st->print("Stub::<unknown> " INTPTR_FORMAT, adr);
+      st->print("Stub::<unknown> " PTR_FORMAT, adr);
       return;
     }
 
@@ -314,8 +314,8 @@
     }
   }
 
-  // Fall through to a simple numeral.
-  st->print(INTPTR_FORMAT, (intptr_t)adr);
+  // Fall through to a simple (hexadecimal) numeral.
+  st->print(PTR_FORMAT, adr);
 }
 
 void decode_env::print_insn_labels() {
@@ -326,7 +326,7 @@
     cb->print_block_comment(st, p);
   }
   if (_print_pc) {
-    st->print("  " INTPTR_FORMAT ": ", (intptr_t) p);
+    st->print("  " PTR_FORMAT ": ", p);
   }
 }
 
@@ -432,7 +432,7 @@
 void Disassembler::decode(CodeBlob* cb, outputStream* st) {
   if (!load_library())  return;
   decode_env env(cb, st);
-  env.output()->print_cr("Decoding CodeBlob " INTPTR_FORMAT, cb);
+  env.output()->print_cr("Decoding CodeBlob " PTR_FORMAT, cb);
   env.decode_instructions(cb->code_begin(), cb->code_end());
 }
 
@@ -446,7 +446,7 @@
 void Disassembler::decode(nmethod* nm, outputStream* st) {
   if (!load_library())  return;
   decode_env env(nm, st);
-  env.output()->print_cr("Decoding compiled method " INTPTR_FORMAT ":", nm);
+  env.output()->print_cr("Decoding compiled method " PTR_FORMAT ":", nm);
   env.output()->print_cr("Code:");
 
 #ifdef SHARK
@@ -478,9 +478,9 @@
     int offset = 0;
     for (address p = nm->consts_begin(); p < nm->consts_end(); p += 4, offset += 4) {
       if ((offset % 8) == 0) {
-        env.output()->print_cr("  " INTPTR_FORMAT " (offset: %4d): " PTR32_FORMAT "   " PTR64_FORMAT, (intptr_t) p, offset, *((int32_t*) p), *((int64_t*) p));
+        env.output()->print_cr("  " PTR_FORMAT " (offset: %4d): " PTR32_FORMAT "   " PTR64_FORMAT, p, offset, *((int32_t*) p), *((int64_t*) p));
       } else {
-        env.output()->print_cr("  " INTPTR_FORMAT " (offset: %4d): " PTR32_FORMAT,                    (intptr_t) p, offset, *((int32_t*) p));
+        env.output()->print_cr("  " PTR_FORMAT " (offset: %4d): " PTR32_FORMAT,                    p, offset, *((int32_t*) p));
       }
     }
   }
--- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Tue May 24 11:09:39 2011 -0700
@@ -2026,7 +2026,7 @@
   }
 
   {
-    TraceCMSMemoryManagerStats();
+    TraceCMSMemoryManagerStats tmms(gch->gc_cause());
   }
   GenMarkSweep::invoke_at_safepoint(_cmsGen->level(),
     ref_processor(), clear_all_soft_refs);
@@ -3479,7 +3479,7 @@
 void CMSCollector::checkpointRootsInitial(bool asynch) {
   assert(_collectorState == InitialMarking, "Wrong collector state");
   check_correct_thread_executing();
-  TraceCMSMemoryManagerStats tms(_collectorState);
+  TraceCMSMemoryManagerStats tms(_collectorState,GenCollectedHeap::heap()->gc_cause());
 
   ReferenceProcessor* rp = ref_processor();
   SpecializationStats::clear();
@@ -4858,7 +4858,8 @@
   // world is stopped at this checkpoint
   assert(SafepointSynchronize::is_at_safepoint(),
          "world should be stopped");
-  TraceCMSMemoryManagerStats tms(_collectorState);
+  TraceCMSMemoryManagerStats tms(_collectorState,GenCollectedHeap::heap()->gc_cause());
+
   verify_work_stacks_empty();
   verify_overflow_empty();
 
@@ -5993,7 +5994,7 @@
   verify_work_stacks_empty();
   verify_overflow_empty();
   increment_sweep_count();
-  TraceCMSMemoryManagerStats tms(_collectorState);
+  TraceCMSMemoryManagerStats tms(_collectorState,GenCollectedHeap::heap()->gc_cause());
 
   _inter_sweep_timer.stop();
   _inter_sweep_estimate.sample(_inter_sweep_timer.seconds());
@@ -9235,11 +9236,12 @@
   return res;
 }
 
-TraceCMSMemoryManagerStats::TraceCMSMemoryManagerStats(CMSCollector::CollectorState phase): TraceMemoryManagerStats() {
+TraceCMSMemoryManagerStats::TraceCMSMemoryManagerStats(CMSCollector::CollectorState phase, GCCause::Cause cause): TraceMemoryManagerStats() {
 
   switch (phase) {
     case CMSCollector::InitialMarking:
       initialize(true  /* fullGC */ ,
+                 cause /* cause of the GC */,
                  true  /* recordGCBeginTime */,
                  true  /* recordPreGCUsage */,
                  false /* recordPeakUsage */,
@@ -9251,6 +9253,7 @@
 
     case CMSCollector::FinalMarking:
       initialize(true  /* fullGC */ ,
+                 cause /* cause of the GC */,
                  false /* recordGCBeginTime */,
                  false /* recordPreGCUsage */,
                  false /* recordPeakUsage */,
@@ -9262,6 +9265,7 @@
 
     case CMSCollector::Sweeping:
       initialize(true  /* fullGC */ ,
+                 cause /* cause of the GC */,
                  false /* recordGCBeginTime */,
                  false /* recordPreGCUsage */,
                  true  /* recordPeakUsage */,
@@ -9277,8 +9281,9 @@
 }
 
 // when bailing out of cms in concurrent mode failure
-TraceCMSMemoryManagerStats::TraceCMSMemoryManagerStats(): TraceMemoryManagerStats() {
+TraceCMSMemoryManagerStats::TraceCMSMemoryManagerStats(GCCause::Cause cause): TraceMemoryManagerStats() {
   initialize(true /* fullGC */ ,
+             cause /* cause of the GC */,
              true /* recordGCBeginTime */,
              true /* recordPreGCUsage */,
              true /* recordPeakUsage */,
--- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp	Tue May 24 11:09:39 2011 -0700
@@ -1895,8 +1895,8 @@
 class TraceCMSMemoryManagerStats : public TraceMemoryManagerStats {
 
  public:
-  TraceCMSMemoryManagerStats(CMSCollector::CollectorState phase);
-  TraceCMSMemoryManagerStats();
+  TraceCMSMemoryManagerStats(CMSCollector::CollectorState phase, GCCause::Cause cause);
+  TraceCMSMemoryManagerStats(GCCause::Cause cause);
 };
 
 
--- a/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Tue May 24 11:09:39 2011 -0700
@@ -826,6 +826,14 @@
 void ConcurrentMark::checkpointRootsInitialPost() {
   G1CollectedHeap*   g1h = G1CollectedHeap::heap();
 
+  // If we force an overflow during remark, the remark operation will
+  // actually abort and we'll restart concurrent marking. If we always
+  // force an oveflow during remark we'll never actually complete the
+  // marking phase. So, we initilize this here, at the start of the
+  // cycle, so that at the remaining overflow number will decrease at
+  // every remark and we'll eventually not need to cause one.
+  force_overflow_stw()->init();
+
   // For each region note start of marking.
   NoteStartOfMarkHRClosure startcl;
   g1h->heap_region_iterate(&startcl);
@@ -893,27 +901,37 @@
 }
 
 /*
-   Notice that in the next two methods, we actually leave the STS
-   during the barrier sync and join it immediately afterwards. If we
-   do not do this, this then the following deadlock can occur: one
-   thread could be in the barrier sync code, waiting for the other
-   thread to also sync up, whereas another one could be trying to
-   yield, while also waiting for the other threads to sync up too.
-
-   Because the thread that does the sync barrier has left the STS, it
-   is possible to be suspended for a Full GC or an evacuation pause
-   could occur. This is actually safe, since the entering the sync
-   barrier is one of the last things do_marking_step() does, and it
-   doesn't manipulate any data structures afterwards.
-*/
+ * Notice that in the next two methods, we actually leave the STS
+ * during the barrier sync and join it immediately afterwards. If we
+ * do not do this, the following deadlock can occur: one thread could
+ * be in the barrier sync code, waiting for the other thread to also
+ * sync up, whereas another one could be trying to yield, while also
+ * waiting for the other threads to sync up too.
+ *
+ * Note, however, that this code is also used during remark and in
+ * this case we should not attempt to leave / enter the STS, otherwise
+ * we'll either hit an asseert (debug / fastdebug) or deadlock
+ * (product). So we should only leave / enter the STS if we are
+ * operating concurrently.
+ *
+ * Because the thread that does the sync barrier has left the STS, it
+ * is possible to be suspended for a Full GC or an evacuation pause
+ * could occur. This is actually safe, since the entering the sync
+ * barrier is one of the last things do_marking_step() does, and it
+ * doesn't manipulate any data structures afterwards.
+ */
 
 void ConcurrentMark::enter_first_sync_barrier(int task_num) {
   if (verbose_low())
     gclog_or_tty->print_cr("[%d] entering first barrier", task_num);
 
-  ConcurrentGCThread::stsLeave();
+  if (concurrent()) {
+    ConcurrentGCThread::stsLeave();
+  }
   _first_overflow_barrier_sync.enter();
-  ConcurrentGCThread::stsJoin();
+  if (concurrent()) {
+    ConcurrentGCThread::stsJoin();
+  }
   // at this point everyone should have synced up and not be doing any
   // more work
 
@@ -923,7 +941,12 @@
   // let task 0 do this
   if (task_num == 0) {
     // task 0 is responsible for clearing the global data structures
-    clear_marking_state();
+    // We should be here because of an overflow. During STW we should
+    // not clear the overflow flag since we rely on it being true when
+    // we exit this method to abort the pause and restart concurent
+    // marking.
+    clear_marking_state(concurrent() /* clear_overflow */);
+    force_overflow()->update();
 
     if (PrintGC) {
       gclog_or_tty->date_stamp(PrintGCDateStamps);
@@ -940,15 +963,45 @@
   if (verbose_low())
     gclog_or_tty->print_cr("[%d] entering second barrier", task_num);
 
-  ConcurrentGCThread::stsLeave();
+  if (concurrent()) {
+    ConcurrentGCThread::stsLeave();
+  }
   _second_overflow_barrier_sync.enter();
-  ConcurrentGCThread::stsJoin();
+  if (concurrent()) {
+    ConcurrentGCThread::stsJoin();
+  }
   // at this point everything should be re-initialised and ready to go
 
   if (verbose_low())
     gclog_or_tty->print_cr("[%d] leaving second barrier", task_num);
 }
 
+#ifndef PRODUCT
+void ForceOverflowSettings::init() {
+  _num_remaining = G1ConcMarkForceOverflow;
+  _force = false;
+  update();
+}
+
+void ForceOverflowSettings::update() {
+  if (_num_remaining > 0) {
+    _num_remaining -= 1;
+    _force = true;
+  } else {
+    _force = false;
+  }
+}
+
+bool ForceOverflowSettings::should_force() {
+  if (_force) {
+    _force = false;
+    return true;
+  } else {
+    return false;
+  }
+}
+#endif // !PRODUCT
+
 void ConcurrentMark::grayRoot(oop p) {
   HeapWord* addr = (HeapWord*) p;
   // We can't really check against _heap_start and _heap_end, since it
@@ -1117,6 +1170,7 @@
   _restart_for_overflow = false;
 
   size_t active_workers = MAX2((size_t) 1, parallel_marking_threads());
+  force_overflow_conc()->init();
   set_phase(active_workers, true /* concurrent */);
 
   CMConcurrentMarkingTask markingTask(this, cmThread());
@@ -1845,7 +1899,7 @@
   while (!_cleanup_list.is_empty()) {
     HeapRegion* hr = _cleanup_list.remove_head();
     assert(hr != NULL, "the list was not empty");
-    hr->rem_set()->clear();
+    hr->par_clear();
     tmp_free_list.add_as_tail(hr);
 
     // Instead of adding one region at a time to the secondary_free_list,
@@ -2703,12 +2757,16 @@
 
 }
 
-void ConcurrentMark::clear_marking_state() {
+void ConcurrentMark::clear_marking_state(bool clear_overflow) {
   _markStack.setEmpty();
   _markStack.clear_overflow();
   _regionStack.setEmpty();
   _regionStack.clear_overflow();
-  clear_has_overflown();
+  if (clear_overflow) {
+    clear_has_overflown();
+  } else {
+    assert(has_overflown(), "pre-condition");
+  }
   _finger = _heap_start;
 
   for (int i = 0; i < (int)_max_task_num; ++i) {
@@ -4279,6 +4337,15 @@
     }
   }
 
+  // If we are about to wrap up and go into termination, check if we
+  // should raise the overflow flag.
+  if (do_termination && !has_aborted()) {
+    if (_cm->force_overflow()->should_force()) {
+      _cm->set_has_overflown();
+      regular_clock_call();
+    }
+  }
+
   // We still haven't aborted. Now, let's try to get into the
   // termination protocol.
   if (do_termination && !has_aborted()) {
--- a/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Tue May 24 11:09:39 2011 -0700
@@ -316,6 +316,19 @@
   void setEmpty()   { _index = 0; clear_overflow(); }
 };
 
+class ForceOverflowSettings VALUE_OBJ_CLASS_SPEC {
+private:
+#ifndef PRODUCT
+  uintx _num_remaining;
+  bool _force;
+#endif // !defined(PRODUCT)
+
+public:
+  void init() PRODUCT_RETURN;
+  void update() PRODUCT_RETURN;
+  bool should_force() PRODUCT_RETURN_( return false; );
+};
+
 // this will enable a variety of different statistics per GC task
 #define _MARKING_STATS_       0
 // this will enable the higher verbose levels
@@ -462,6 +475,9 @@
 
   WorkGang* _parallel_workers;
 
+  ForceOverflowSettings _force_overflow_conc;
+  ForceOverflowSettings _force_overflow_stw;
+
   void weakRefsWork(bool clear_all_soft_refs);
 
   void swapMarkBitMaps();
@@ -470,7 +486,7 @@
   // task local ones; should be called during initial mark.
   void reset();
   // It resets all the marking data structures.
-  void clear_marking_state();
+  void clear_marking_state(bool clear_overflow = true);
 
   // It should be called to indicate which phase we're in (concurrent
   // mark or remark) and how many threads are currently active.
@@ -547,6 +563,22 @@
   void enter_first_sync_barrier(int task_num);
   void enter_second_sync_barrier(int task_num);
 
+  ForceOverflowSettings* force_overflow_conc() {
+    return &_force_overflow_conc;
+  }
+
+  ForceOverflowSettings* force_overflow_stw() {
+    return &_force_overflow_stw;
+  }
+
+  ForceOverflowSettings* force_overflow() {
+    if (concurrent()) {
+      return force_overflow_conc();
+    } else {
+      return force_overflow_stw();
+    }
+  }
+
 public:
   // Manipulation of the global mark stack.
   // Notice that the first mark_stack_push is CAS-based, whereas the
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Tue May 24 11:09:39 2011 -0700
@@ -1162,7 +1162,7 @@
                 PrintGC, true, gclog_or_tty);
 
     TraceCollectorStats tcs(g1mm()->full_collection_counters());
-    TraceMemoryManagerStats tms(true /* fullGC */);
+    TraceMemoryManagerStats tms(true /* fullGC */, gc_cause());
 
     double start = os::elapsedTime();
     g1_policy()->record_full_collection_start();
@@ -3202,7 +3202,7 @@
     TraceTime t(verbose_str, PrintGC && !PrintGCDetails, true, gclog_or_tty);
 
     TraceCollectorStats tcs(g1mm()->incremental_collection_counters());
-    TraceMemoryManagerStats tms(false /* fullGC */);
+    TraceMemoryManagerStats tms(false /* fullGC */, gc_cause());
 
     // If the secondary_free_list is not empty, append it to the
     // free_list. No need to wait for the cleanup operation to finish;
@@ -3975,6 +3975,9 @@
 oop
 G1CollectedHeap::handle_evacuation_failure_par(OopsInHeapRegionClosure* cl,
                                                oop old) {
+  assert(obj_in_cs(old),
+         err_msg("obj: "PTR_FORMAT" should still be in the CSet",
+                 (HeapWord*) old));
   markOop m = old->mark();
   oop forward_ptr = old->forward_to_atomic(old);
   if (forward_ptr == NULL) {
@@ -3997,7 +4000,13 @@
     }
     return old;
   } else {
-    // Someone else had a place to copy it.
+    // Forward-to-self failed. Either someone else managed to allocate
+    // space for this object (old != forward_ptr) or they beat us in
+    // self-forwarding it (old == forward_ptr).
+    assert(old == forward_ptr || !obj_in_cs(forward_ptr),
+           err_msg("obj: "PTR_FORMAT" forwarded to: "PTR_FORMAT" "
+                   "should not be in the CSet",
+                   (HeapWord*) old, (HeapWord*) forward_ptr));
     return forward_ptr;
   }
 }
@@ -4308,11 +4317,10 @@
   T heap_oop = oopDesc::load_heap_oop(p);
   if (!oopDesc::is_null(heap_oop)) {
     oop obj = oopDesc::decode_heap_oop(heap_oop);
-    assert((_g1->evacuation_failed()) || (!_g1->obj_in_cs(obj)),
-           "shouldn't still be in the CSet if evacuation didn't fail.");
     HeapWord* addr = (HeapWord*)obj;
-    if (_g1->is_in_g1_reserved(addr))
+    if (_g1->is_in_g1_reserved(addr)) {
       _cm->grayRoot(oop(addr));
+    }
   }
 }
 
@@ -4961,36 +4969,45 @@
 
 #ifndef PRODUCT
 class G1VerifyCardTableCleanup: public HeapRegionClosure {
+  G1CollectedHeap* _g1h;
   CardTableModRefBS* _ct_bs;
 public:
-  G1VerifyCardTableCleanup(CardTableModRefBS* ct_bs)
-    : _ct_bs(ct_bs) { }
+  G1VerifyCardTableCleanup(G1CollectedHeap* g1h, CardTableModRefBS* ct_bs)
+    : _g1h(g1h), _ct_bs(ct_bs) { }
   virtual bool doHeapRegion(HeapRegion* r) {
-    MemRegion mr(r->bottom(), r->end());
     if (r->is_survivor()) {
-      _ct_bs->verify_dirty_region(mr);
+      _g1h->verify_dirty_region(r);
     } else {
-      _ct_bs->verify_clean_region(mr);
+      _g1h->verify_not_dirty_region(r);
     }
     return false;
   }
 };
 
+void G1CollectedHeap::verify_not_dirty_region(HeapRegion* hr) {
+  // All of the region should be clean.
+  CardTableModRefBS* ct_bs = (CardTableModRefBS*)barrier_set();
+  MemRegion mr(hr->bottom(), hr->end());
+  ct_bs->verify_not_dirty_region(mr);
+}
+
+void G1CollectedHeap::verify_dirty_region(HeapRegion* hr) {
+  // We cannot guarantee that [bottom(),end()] is dirty.  Threads
+  // dirty allocated blocks as they allocate them. The thread that
+  // retires each region and replaces it with a new one will do a
+  // maximal allocation to fill in [pre_dummy_top(),end()] but will
+  // not dirty that area (one less thing to have to do while holding
+  // a lock). So we can only verify that [bottom(),pre_dummy_top()]
+  // is dirty.
+  CardTableModRefBS* ct_bs = (CardTableModRefBS*) barrier_set();
+  MemRegion mr(hr->bottom(), hr->pre_dummy_top());
+  ct_bs->verify_dirty_region(mr);
+}
+
 void G1CollectedHeap::verify_dirty_young_list(HeapRegion* head) {
-  CardTableModRefBS* ct_bs = (CardTableModRefBS*) (barrier_set());
+  CardTableModRefBS* ct_bs = (CardTableModRefBS*) barrier_set();
   for (HeapRegion* hr = head; hr != NULL; hr = hr->get_next_young_region()) {
-    // We cannot guarantee that [bottom(),end()] is dirty.  Threads
-    // dirty allocated blocks as they allocate them. The thread that
-    // retires each region and replaces it with a new one will do a
-    // maximal allocation to fill in [pre_dummy_top(),end()] but will
-    // not dirty that area (one less thing to have to do while holding
-    // a lock). So we can only verify that [bottom(),pre_dummy_top()]
-    // is dirty. Also note that verify_dirty_region() requires
-    // mr.start() and mr.end() to be card aligned and pre_dummy_top()
-    // is not guaranteed to be.
-    MemRegion mr(hr->bottom(),
-                 ct_bs->align_to_card_boundary(hr->pre_dummy_top()));
-    ct_bs->verify_dirty_region(mr);
+    verify_dirty_region(hr);
   }
 }
 
@@ -5033,7 +5050,7 @@
   g1_policy()->record_clear_ct_time( elapsed * 1000.0);
 #ifndef PRODUCT
   if (G1VerifyCTCleanup || VerifyAfterGC) {
-    G1VerifyCardTableCleanup cleanup_verifier(ct_bs);
+    G1VerifyCardTableCleanup cleanup_verifier(this, ct_bs);
     heap_region_iterate(&cleanup_verifier);
   }
 #endif
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Tue May 24 11:09:39 2011 -0700
@@ -970,6 +970,8 @@
   // The number of regions available for "regular" expansion.
   size_t expansion_regions() { return _expansion_regions; }
 
+  void verify_not_dirty_region(HeapRegion* hr) PRODUCT_RETURN;
+  void verify_dirty_region(HeapRegion* hr) PRODUCT_RETURN;
   void verify_dirty_young_list(HeapRegion* head) PRODUCT_RETURN;
   void verify_dirty_young_regions() PRODUCT_RETURN;
 
--- a/src/share/vm/gc_implementation/g1/g1RemSet.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/g1/g1RemSet.cpp	Tue May 24 11:09:39 2011 -0700
@@ -157,7 +157,6 @@
   void set_try_claimed() { _try_claimed = true; }
 
   void scanCard(size_t index, HeapRegion *r) {
-    _cards_done++;
     DirtyCardToOopClosure* cl =
       r->new_dcto_closure(_oc,
                          CardTableModRefBS::Precise,
@@ -168,17 +167,14 @@
     HeapWord* card_start = _bot_shared->address_for_index(index);
     HeapWord* card_end = card_start + G1BlockOffsetSharedArray::N_words;
     Space *sp = SharedHeap::heap()->space_containing(card_start);
-    MemRegion sm_region;
-    if (ParallelGCThreads > 0) {
-      // first find the used area
-      sm_region = sp->used_region_at_save_marks();
-    } else {
-      // The closure is not idempotent.  We shouldn't look at objects
-      // allocated during the GC.
-      sm_region = sp->used_region_at_save_marks();
-    }
+    MemRegion sm_region = sp->used_region_at_save_marks();
     MemRegion mr = sm_region.intersection(MemRegion(card_start,card_end));
-    if (!mr.is_empty()) {
+    if (!mr.is_empty() && !_ct_bs->is_card_claimed(index)) {
+      // We make the card as "claimed" lazily (so races are possible
+      // but they're benign), which reduces the number of duplicate
+      // scans (the rsets of the regions in the cset can intersect).
+      _ct_bs->set_card_claimed(index);
+      _cards_done++;
       cl->do_MemRegion(mr);
     }
   }
@@ -199,6 +195,9 @@
     HeapRegionRemSet* hrrs = r->rem_set();
     if (hrrs->iter_is_complete()) return false; // All done.
     if (!_try_claimed && !hrrs->claim_iter()) return false;
+    // If we ever free the collection set concurrently, we should also
+    // clear the card table concurrently therefore we won't need to
+    // add regions of the collection set to the dirty cards region.
     _g1h->push_dirty_cards_region(r);
     // If we didn't return above, then
     //   _try_claimed || r->claim_iter()
@@ -230,15 +229,10 @@
         _g1h->push_dirty_cards_region(card_region);
       }
 
-       // If the card is dirty, then we will scan it during updateRS.
-      if (!card_region->in_collection_set() && !_ct_bs->is_card_dirty(card_index)) {
-        // We make the card as "claimed" lazily (so races are possible but they're benign),
-        // which reduces the number of duplicate scans (the rsets of the regions in the cset
-        // can intersect).
-        if (!_ct_bs->is_card_claimed(card_index)) {
-          _ct_bs->set_card_claimed(card_index);
-          scanCard(card_index, card_region);
-        }
+      // If the card is dirty, then we will scan it during updateRS.
+      if (!card_region->in_collection_set() &&
+          !_ct_bs->is_card_dirty(card_index)) {
+        scanCard(card_index, card_region);
       }
     }
     if (!_try_claimed) {
@@ -246,8 +240,6 @@
     }
     return false;
   }
-  // Set all cards back to clean.
-  void cleanup() {_g1h->cleanUpCardTable();}
   size_t cards_done() { return _cards_done;}
   size_t cards_looked_up() { return _cards;}
 };
@@ -566,8 +558,9 @@
     update_rs_cl.set_region(r);
     HeapWord* stop_point =
       r->oops_on_card_seq_iterate_careful(scanRegion,
-                                        &filter_then_update_rs_cset_oop_cl,
-                                        false /* filter_young */);
+                                          &filter_then_update_rs_cset_oop_cl,
+                                          false /* filter_young */,
+                                          NULL  /* card_ptr */);
 
     // Since this is performed in the event of an evacuation failure, we
     // we shouldn't see a non-null stop point
@@ -735,12 +728,6 @@
                                 (OopClosure*)&mux :
                                 (OopClosure*)&update_rs_oop_cl));
 
-  // Undirty the card.
-  *card_ptr = CardTableModRefBS::clean_card_val();
-  // We must complete this write before we do any of the reads below.
-  OrderAccess::storeload();
-  // And process it, being careful of unallocated portions of TLAB's.
-
   // The region for the current card may be a young region. The
   // current card may have been a card that was evicted from the
   // card cache. When the card was inserted into the cache, we had
@@ -749,7 +736,7 @@
   // and tagged as young.
   //
   // We wish to filter out cards for such a region but the current
-  // thread, if we're running conucrrently, may "see" the young type
+  // thread, if we're running concurrently, may "see" the young type
   // change at any time (so an earlier "is_young" check may pass or
   // fail arbitrarily). We tell the iteration code to perform this
   // filtering when it has been determined that there has been an actual
@@ -759,7 +746,8 @@
   HeapWord* stop_point =
     r->oops_on_card_seq_iterate_careful(dirtyRegion,
                                         &filter_then_update_rs_oop_cl,
-                                        filter_young);
+                                        filter_young,
+                                        card_ptr);
 
   // If stop_point is non-null, then we encountered an unallocated region
   // (perhaps the unfilled portion of a TLAB.)  For now, we'll dirty the
--- a/src/share/vm/gc_implementation/g1/g1_globals.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp	Tue May 24 11:09:39 2011 -0700
@@ -311,7 +311,11 @@
                                                                             \
   develop(bool, G1ExitOnExpansionFailure, false,                            \
           "Raise a fatal VM exit out of memory failure in the event "       \
-          " that heap expansion fails due to running out of swap.")
+          " that heap expansion fails due to running out of swap.")         \
+                                                                            \
+  develop(uintx, G1ConcMarkForceOverflow, 0,                                \
+          "The number of times we'll force an overflow during "             \
+          "concurrent marking")
 
 G1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG, DECLARE_MANAGEABLE_FLAG, DECLARE_PRODUCT_RW_FLAG)
 
--- a/src/share/vm/gc_implementation/g1/heapRegion.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/g1/heapRegion.cpp	Tue May 24 11:09:39 2011 -0700
@@ -376,6 +376,17 @@
   if (clear_space) clear(SpaceDecorator::Mangle);
 }
 
+void HeapRegion::par_clear() {
+  assert(used() == 0, "the region should have been already cleared");
+  assert(capacity() == (size_t) HeapRegion::GrainBytes,
+         "should be back to normal");
+  HeapRegionRemSet* hrrs = rem_set();
+  hrrs->clear();
+  CardTableModRefBS* ct_bs =
+                   (CardTableModRefBS*)G1CollectedHeap::heap()->barrier_set();
+  ct_bs->clear(MemRegion(bottom(), end()));
+}
+
 // <PREDICTION>
 void HeapRegion::calc_gc_efficiency() {
   G1CollectedHeap* g1h = G1CollectedHeap::heap();
@@ -600,7 +611,15 @@
 HeapRegion::
 oops_on_card_seq_iterate_careful(MemRegion mr,
                                  FilterOutOfRegionClosure* cl,
-                                 bool filter_young) {
+                                 bool filter_young,
+                                 jbyte* card_ptr) {
+  // Currently, we should only have to clean the card if filter_young
+  // is true and vice versa.
+  if (filter_young) {
+    assert(card_ptr != NULL, "pre-condition");
+  } else {
+    assert(card_ptr == NULL, "pre-condition");
+  }
   G1CollectedHeap* g1h = G1CollectedHeap::heap();
 
   // If we're within a stop-world GC, then we might look at a card in a
@@ -626,6 +645,15 @@
 
   assert(!is_young(), "check value of filter_young");
 
+  // We can only clean the card here, after we make the decision that
+  // the card is not young. And we only clean the card if we have been
+  // asked to (i.e., card_ptr != NULL).
+  if (card_ptr != NULL) {
+    *card_ptr = CardTableModRefBS::clean_card_val();
+    // We must complete this write before we do any of the reads below.
+    OrderAccess::storeload();
+  }
+
   // We used to use "block_start_careful" here.  But we're actually happy
   // to update the BOT while we do this...
   HeapWord* cur = block_start(mr.start());
--- a/src/share/vm/gc_implementation/g1/heapRegion.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/g1/heapRegion.hpp	Tue May 24 11:09:39 2011 -0700
@@ -584,6 +584,7 @@
 
   // Reset HR stuff to default values.
   void hr_clear(bool par, bool clear_space);
+  void par_clear();
 
   void initialize(MemRegion mr, bool clear_space, bool mangle_space);
 
@@ -802,12 +803,16 @@
   HeapWord*
   object_iterate_mem_careful(MemRegion mr, ObjectClosure* cl);
 
-  // In this version - if filter_young is true and the region
-  // is a young region then we skip the iteration.
+  // filter_young: if true and the region is a young region then we
+  // skip the iteration.
+  // card_ptr: if not NULL, and we decide that the card is not young
+  // and we iterate over it, we'll clean the card before we start the
+  // iteration.
   HeapWord*
   oops_on_card_seq_iterate_careful(MemRegion mr,
                                    FilterOutOfRegionClosure* cl,
-                                   bool filter_young);
+                                   bool filter_young,
+                                   jbyte* card_ptr);
 
   // A version of block start that is guaranteed to find *some* block
   // boundary at or before "p", but does not object iteration, and may
--- a/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp	Tue May 24 11:09:39 2011 -0700
@@ -29,13 +29,14 @@
 #include "memory/sharedHeap.hpp"
 #include "memory/space.inline.hpp"
 #include "memory/universe.hpp"
+#include "oops/oop.inline.hpp"
 #include "runtime/java.hpp"
 #include "runtime/mutexLocker.hpp"
 #include "runtime/virtualspace.hpp"
 
 void CardTableModRefBS::non_clean_card_iterate_parallel_work(Space* sp, MemRegion mr,
-                                                             DirtyCardToOopClosure* dcto_cl,
-                                                             ClearNoncleanCardWrapper* cl,
+                                                             OopsInGenClosure* cl,
+                                                             CardTableRS* ct,
                                                              int n_threads) {
   assert(n_threads > 0, "Error: expected n_threads > 0");
   assert((n_threads == 1 && ParallelGCThreads == 0) ||
@@ -49,14 +50,14 @@
                           lowest_non_clean_base_chunk_index,
                           lowest_non_clean_chunk_size);
 
-  int n_strides = n_threads * StridesPerThread;
+  int n_strides = n_threads * ParGCStridesPerThread;
   SequentialSubTasksDone* pst = sp->par_seq_tasks();
   pst->set_n_threads(n_threads);
   pst->set_n_tasks(n_strides);
 
   int stride = 0;
   while (!pst->is_task_claimed(/* reference */ stride)) {
-    process_stride(sp, mr, stride, n_strides, dcto_cl, cl,
+    process_stride(sp, mr, stride, n_strides, cl, ct,
                    lowest_non_clean,
                    lowest_non_clean_base_chunk_index,
                    lowest_non_clean_chunk_size);
@@ -79,13 +80,13 @@
 process_stride(Space* sp,
                MemRegion used,
                jint stride, int n_strides,
-               DirtyCardToOopClosure* dcto_cl,
-               ClearNoncleanCardWrapper* cl,
+               OopsInGenClosure* cl,
+               CardTableRS* ct,
                jbyte** lowest_non_clean,
                uintptr_t lowest_non_clean_base_chunk_index,
                size_t    lowest_non_clean_chunk_size) {
-  // We don't have to go downwards here; it wouldn't help anyway,
-  // because of parallelism.
+  // We go from higher to lower addresses here; it wouldn't help that much
+  // because of the strided parallelism pattern used here.
 
   // Find the first card address of the first chunk in the stride that is
   // at least "bottom" of the used region.
@@ -98,25 +99,35 @@
   if ((uintptr_t)stride >= start_chunk_stride_num) {
     chunk_card_start = (jbyte*)(start_card +
                                 (stride - start_chunk_stride_num) *
-                                CardsPerStrideChunk);
+                                ParGCCardsPerStrideChunk);
   } else {
     // Go ahead to the next chunk group boundary, then to the requested stride.
     chunk_card_start = (jbyte*)(start_card +
                                 (n_strides - start_chunk_stride_num + stride) *
-                                CardsPerStrideChunk);
+                                ParGCCardsPerStrideChunk);
   }
 
   while (chunk_card_start < end_card) {
-    // We don't have to go downwards here; it wouldn't help anyway,
-    // because of parallelism.  (We take care with "min_done"; see below.)
+    // Even though we go from lower to higher addresses below, the
+    // strided parallelism can interleave the actual processing of the
+    // dirty pages in various ways. For a specific chunk within this
+    // stride, we take care to avoid double scanning or missing a card
+    // by suitably initializing the "min_done" field in process_chunk_boundaries()
+    // below, together with the dirty region extension accomplished in
+    // DirtyCardToOopClosure::do_MemRegion().
+    jbyte*    chunk_card_end = chunk_card_start + ParGCCardsPerStrideChunk;
     // Invariant: chunk_mr should be fully contained within the "used" region.
-    jbyte*    chunk_card_end = chunk_card_start + CardsPerStrideChunk;
     MemRegion chunk_mr       = MemRegion(addr_for(chunk_card_start),
                                          chunk_card_end >= end_card ?
                                            used.end() : addr_for(chunk_card_end));
     assert(chunk_mr.word_size() > 0, "[chunk_card_start > used_end)");
     assert(used.contains(chunk_mr), "chunk_mr should be subset of used");
 
+    DirtyCardToOopClosure* dcto_cl = sp->new_dcto_cl(cl, precision(),
+                                                     cl->gen_boundary());
+    ClearNoncleanCardWrapper clear_cl(dcto_cl, ct);
+
+
     // Process the chunk.
     process_chunk_boundaries(sp,
                              dcto_cl,
@@ -126,17 +137,30 @@
                              lowest_non_clean_base_chunk_index,
                              lowest_non_clean_chunk_size);
 
+    // We want the LNC array updates above in process_chunk_boundaries
+    // to be visible before any of the card table value changes as a
+    // result of the dirty card iteration below.
+    OrderAccess::storestore();
+
     // We do not call the non_clean_card_iterate_serial() version because
-    // we want to clear the cards, and the ClearNoncleanCardWrapper closure
-    // itself does the work of finding contiguous dirty ranges of cards to
-    // process (and clear).
-    cl->do_MemRegion(chunk_mr);
+    // we want to clear the cards: clear_cl here does the work of finding
+    // contiguous dirty ranges of cards to process and clear.
+    clear_cl.do_MemRegion(chunk_mr);
 
     // Find the next chunk of the stride.
-    chunk_card_start += CardsPerStrideChunk * n_strides;
+    chunk_card_start += ParGCCardsPerStrideChunk * n_strides;
   }
 }
 
+
+// If you want a talkative process_chunk_boundaries,
+// then #define NOISY(x) x
+#ifdef NOISY
+#error "Encountered a global preprocessor flag, NOISY, which might clash with local definition to follow"
+#else
+#define NOISY(x)
+#endif
+
 void
 CardTableModRefBS::
 process_chunk_boundaries(Space* sp,
@@ -147,126 +171,232 @@
                          uintptr_t lowest_non_clean_base_chunk_index,
                          size_t    lowest_non_clean_chunk_size)
 {
-  // We must worry about the chunk boundaries.
+  // We must worry about non-array objects that cross chunk boundaries,
+  // because such objects are both precisely and imprecisely marked:
+  // .. if the head of such an object is dirty, the entire object
+  //    needs to be scanned, under the interpretation that this
+  //    was an imprecise mark
+  // .. if the head of such an object is not dirty, we can assume
+  //    precise marking and it's efficient to scan just the dirty
+  //    cards.
+  // In either case, each scanned reference must be scanned precisely
+  // once so as to avoid cloning of a young referent. For efficiency,
+  // our closures depend on this property and do not protect against
+  // double scans.
 
-  // First, set our max_to_do:
-  HeapWord* max_to_do = NULL;
   uintptr_t cur_chunk_index = addr_to_chunk_index(chunk_mr.start());
   cur_chunk_index           = cur_chunk_index - lowest_non_clean_base_chunk_index;
 
+  NOISY(tty->print_cr("===========================================================================");)
+  NOISY(tty->print_cr(" process_chunk_boundary: Called with [" PTR_FORMAT "," PTR_FORMAT ")",
+                      chunk_mr.start(), chunk_mr.end());)
+
+  // First, set "our" lowest_non_clean entry, which would be
+  // used by the thread scanning an adjoining left chunk with
+  // a non-array object straddling the mutual boundary.
+  // Find the object that spans our boundary, if one exists.
+  // first_block is the block possibly straddling our left boundary.
+  HeapWord* first_block = sp->block_start(chunk_mr.start());
+  assert((chunk_mr.start() != used.start()) || (first_block == chunk_mr.start()),
+         "First chunk should always have a co-initial block");
+  // Does the block straddle the chunk's left boundary, and is it
+  // a non-array object?
+  if (first_block < chunk_mr.start()        // first block straddles left bdry
+      && sp->block_is_obj(first_block)      // first block is an object
+      && !(oop(first_block)->is_objArray()  // first block is not an array (arrays are precisely dirtied)
+           || oop(first_block)->is_typeArray())) {
+    // Find our least non-clean card, so that a left neighbour
+    // does not scan an object straddling the mutual boundary
+    // too far to the right, and attempt to scan a portion of
+    // that object twice.
+    jbyte* first_dirty_card = NULL;
+    jbyte* last_card_of_first_obj =
+        byte_for(first_block + sp->block_size(first_block) - 1);
+    jbyte* first_card_of_cur_chunk = byte_for(chunk_mr.start());
+    jbyte* last_card_of_cur_chunk = byte_for(chunk_mr.last());
+    jbyte* last_card_to_check =
+      (jbyte*) MIN2((intptr_t) last_card_of_cur_chunk,
+                    (intptr_t) last_card_of_first_obj);
+    // Note that this does not need to go beyond our last card
+    // if our first object completely straddles this chunk.
+    for (jbyte* cur = first_card_of_cur_chunk;
+         cur <= last_card_to_check; cur++) {
+      jbyte val = *cur;
+      if (card_will_be_scanned(val)) {
+        first_dirty_card = cur; break;
+      } else {
+        assert(!card_may_have_been_dirty(val), "Error");
+      }
+    }
+    if (first_dirty_card != NULL) {
+      NOISY(tty->print_cr(" LNC: Found a dirty card at " PTR_FORMAT " in current chunk",
+                    first_dirty_card);)
+      assert(0 <= cur_chunk_index && cur_chunk_index < lowest_non_clean_chunk_size,
+             "Bounds error.");
+      assert(lowest_non_clean[cur_chunk_index] == NULL,
+             "Write exactly once : value should be stable hereafter for this round");
+      lowest_non_clean[cur_chunk_index] = first_dirty_card;
+    } NOISY(else {
+      tty->print_cr(" LNC: Found no dirty card in current chunk; leaving LNC entry NULL");
+      // In the future, we could have this thread look for a non-NULL value to copy from its
+      // right neighbour (up to the end of the first object).
+      if (last_card_of_cur_chunk < last_card_of_first_obj) {
+        tty->print_cr(" LNC: BEWARE!!! first obj straddles past right end of chunk:\n"
+                      "   might be efficient to get value from right neighbour?");
+      }
+    })
+  } else {
+    // In this case we can help our neighbour by just asking them
+    // to stop at our first card (even though it may not be dirty).
+    NOISY(tty->print_cr(" LNC: first block is not a non-array object; setting LNC to first card of current chunk");)
+    assert(lowest_non_clean[cur_chunk_index] == NULL, "Write once : value should be stable hereafter");
+    jbyte* first_card_of_cur_chunk = byte_for(chunk_mr.start());
+    lowest_non_clean[cur_chunk_index] = first_card_of_cur_chunk;
+  }
+  NOISY(tty->print_cr(" process_chunk_boundary: lowest_non_clean[" INTPTR_FORMAT "] = " PTR_FORMAT
+                "   which corresponds to the heap address " PTR_FORMAT,
+                cur_chunk_index, lowest_non_clean[cur_chunk_index],
+                (lowest_non_clean[cur_chunk_index] != NULL)
+                ? addr_for(lowest_non_clean[cur_chunk_index])
+                : NULL);)
+  NOISY(tty->print_cr("---------------------------------------------------------------------------");)
+
+  // Next, set our own max_to_do, which will strictly/exclusively bound
+  // the highest address that we will scan past the right end of our chunk.
+  HeapWord* max_to_do = NULL;
   if (chunk_mr.end() < used.end()) {
-    // This is not the last chunk in the used region.  What is the last
-    // object?
-    HeapWord* last_block = sp->block_start(chunk_mr.end());
+    // This is not the last chunk in the used region.
+    // What is our last block? We check the first block of
+    // the next (right) chunk rather than strictly check our last block
+    // because it's potentially more efficient to do so.
+    HeapWord* const last_block = sp->block_start(chunk_mr.end());
     assert(last_block <= chunk_mr.end(), "In case this property changes.");
-    if (last_block == chunk_mr.end()
-        || !sp->block_is_obj(last_block)) {
+    if ((last_block == chunk_mr.end())     // our last block does not straddle boundary
+        || !sp->block_is_obj(last_block)   // last_block isn't an object
+        || oop(last_block)->is_objArray()  // last_block is an array (precisely marked)
+        || oop(last_block)->is_typeArray()) {
       max_to_do = chunk_mr.end();
-
+      NOISY(tty->print_cr(" process_chunk_boundary: Last block on this card is not a non-array object;\n"
+                         "   max_to_do left at " PTR_FORMAT, max_to_do);)
     } else {
-      // It is an object and starts before the end of the current chunk.
+      assert(last_block < chunk_mr.end(), "Tautology");
+      // It is a non-array object that straddles the right boundary of this chunk.
       // last_obj_card is the card corresponding to the start of the last object
       // in the chunk.  Note that the last object may not start in
       // the chunk.
-      jbyte* last_obj_card = byte_for(last_block);
-      if (!card_may_have_been_dirty(*last_obj_card)) {
-        // The card containing the head is not dirty.  Any marks in
+      jbyte* const last_obj_card = byte_for(last_block);
+      const jbyte val = *last_obj_card;
+      if (!card_will_be_scanned(val)) {
+        assert(!card_may_have_been_dirty(val), "Error");
+        // The card containing the head is not dirty.  Any marks on
         // subsequent cards still in this chunk must have been made
-        // precisely; we can cap processing at the end.
+        // precisely; we can cap processing at the end of our chunk.
         max_to_do = chunk_mr.end();
+        NOISY(tty->print_cr(" process_chunk_boundary: Head of last object on this card is not dirty;\n"
+                            "   max_to_do left at " PTR_FORMAT,
+                            max_to_do);)
       } else {
         // The last object must be considered dirty, and extends onto the
         // following chunk.  Look for a dirty card in that chunk that will
         // bound our processing.
         jbyte* limit_card = NULL;
-        size_t last_block_size = sp->block_size(last_block);
-        jbyte* last_card_of_last_obj =
+        const size_t last_block_size = sp->block_size(last_block);
+        jbyte* const last_card_of_last_obj =
           byte_for(last_block + last_block_size - 1);
-        jbyte* first_card_of_next_chunk = byte_for(chunk_mr.end());
+        jbyte* const first_card_of_next_chunk = byte_for(chunk_mr.end());
         // This search potentially goes a long distance looking
-        // for the next card that will be scanned.  For example,
-        // an object that is an array of primitives will not
-        // have any cards covering regions interior to the array
-        // that will need to be scanned. The scan can be terminated
-        // at the last card of the next chunk.  That would leave
-        // limit_card as NULL and would result in "max_to_do"
-        // being set with the LNC value or with the end
-        // of the last block.
-        jbyte* last_card_of_next_chunk = first_card_of_next_chunk +
-          CardsPerStrideChunk;
-        assert(byte_for(chunk_mr.end()) - byte_for(chunk_mr.start())
-          == CardsPerStrideChunk, "last card of next chunk may be wrong");
-        jbyte* last_card_to_check = (jbyte*) MIN2(last_card_of_last_obj,
-                                                  last_card_of_next_chunk);
+        // for the next card that will be scanned, terminating
+        // at the end of the last_block, if no earlier dirty card
+        // is found.
+        assert(byte_for(chunk_mr.end()) - byte_for(chunk_mr.start()) == ParGCCardsPerStrideChunk,
+               "last card of next chunk may be wrong");
         for (jbyte* cur = first_card_of_next_chunk;
-             cur <= last_card_to_check; cur++) {
-          if (card_will_be_scanned(*cur)) {
+             cur <= last_card_of_last_obj; cur++) {
+          const jbyte val = *cur;
+          if (card_will_be_scanned(val)) {
+            NOISY(tty->print_cr(" Found a non-clean card " PTR_FORMAT " with value 0x%x",
+                                cur, (int)val);)
             limit_card = cur; break;
+          } else {
+            assert(!card_may_have_been_dirty(val), "Error: card can't be skipped");
           }
         }
-        assert(0 <= cur_chunk_index+1 &&
-               cur_chunk_index+1 < lowest_non_clean_chunk_size,
+        if (limit_card != NULL) {
+          max_to_do = addr_for(limit_card);
+          assert(limit_card != NULL && max_to_do != NULL, "Error");
+          NOISY(tty->print_cr(" process_chunk_boundary: Found a dirty card at " PTR_FORMAT
+                        "   max_to_do set at " PTR_FORMAT " which is before end of last block in chunk: "
+                        PTR_FORMAT " + " PTR_FORMAT " = " PTR_FORMAT,
+                        limit_card, max_to_do, last_block, last_block_size, (last_block+last_block_size));)
+        } else {
+          // The following is a pessimistic value, because it's possible
+          // that a dirty card on a subsequent chunk has been cleared by
+          // the time we get to look at it; we'll correct for that further below,
+          // using the LNC array which records the least non-clean card
+          // before cards were cleared in a particular chunk.
+          limit_card = last_card_of_last_obj;
+          max_to_do = last_block + last_block_size;
+          assert(limit_card != NULL && max_to_do != NULL, "Error");
+          NOISY(tty->print_cr(" process_chunk_boundary: Found no dirty card before end of last block in chunk\n"
+                              "   Setting limit_card to " PTR_FORMAT
+                              " and max_to_do " PTR_FORMAT " + " PTR_FORMAT " = " PTR_FORMAT,
+                              limit_card, last_block, last_block_size, max_to_do);)
+        }
+        assert(0 < cur_chunk_index+1 && cur_chunk_index+1 < lowest_non_clean_chunk_size,
                "Bounds error.");
-        // LNC for the next chunk
-        jbyte* lnc_card = lowest_non_clean[cur_chunk_index+1];
-        if (limit_card == NULL) {
-          limit_card = lnc_card;
+        // It is possible that a dirty card for the last object may have been
+        // cleared before we had a chance to examine it. In that case, the value
+        // will have been logged in the LNC for that chunk.
+        // We need to examine as many chunks to the right as this object
+        // covers.
+        const uintptr_t last_chunk_index_to_check = addr_to_chunk_index(last_block + last_block_size - 1)
+                                                    - lowest_non_clean_base_chunk_index;
+        DEBUG_ONLY(const uintptr_t last_chunk_index = addr_to_chunk_index(used.last())
+                                                      - lowest_non_clean_base_chunk_index;)
+        assert(last_chunk_index_to_check <= last_chunk_index,
+               err_msg("Out of bounds: last_chunk_index_to_check " INTPTR_FORMAT
+                       " exceeds last_chunk_index " INTPTR_FORMAT,
+                       last_chunk_index_to_check, last_chunk_index));
+        for (uintptr_t lnc_index = cur_chunk_index + 1;
+             lnc_index <= last_chunk_index_to_check;
+             lnc_index++) {
+          jbyte* lnc_card = lowest_non_clean[lnc_index];
+          if (lnc_card != NULL) {
+            // we can stop at the first non-NULL entry we find
+            if (lnc_card <= limit_card) {
+              NOISY(tty->print_cr(" process_chunk_boundary: LNC card " PTR_FORMAT " is lower than limit_card " PTR_FORMAT,
+                                  "   max_to_do will be lowered to " PTR_FORMAT " from " PTR_FORMAT,
+                                  lnc_card, limit_card, addr_for(lnc_card), max_to_do);)
+              limit_card = lnc_card;
+              max_to_do = addr_for(limit_card);
+              assert(limit_card != NULL && max_to_do != NULL, "Error");
+            }
+            // In any case, we break now
+            break;
+          }  // else continue to look for a non-NULL entry if any
         }
-        if (limit_card != NULL) {
-          if (lnc_card != NULL) {
-            limit_card = (jbyte*)MIN2((intptr_t)limit_card,
-                                      (intptr_t)lnc_card);
-          }
-          max_to_do = addr_for(limit_card);
-        } else {
-          max_to_do = last_block + last_block_size;
-        }
+        assert(limit_card != NULL && max_to_do != NULL, "Error");
       }
+      assert(max_to_do != NULL, "OOPS 1 !");
     }
-    assert(max_to_do != NULL, "OOPS!");
+    assert(max_to_do != NULL, "OOPS 2!");
   } else {
     max_to_do = used.end();
+    NOISY(tty->print_cr(" process_chunk_boundary: Last chunk of this space;\n"
+                  "   max_to_do left at " PTR_FORMAT,
+                  max_to_do);)
   }
+  assert(max_to_do != NULL, "OOPS 3!");
   // Now we can set the closure we're using so it doesn't to beyond
   // max_to_do.
   dcto_cl->set_min_done(max_to_do);
 #ifndef PRODUCT
   dcto_cl->set_last_bottom(max_to_do);
 #endif
+  NOISY(tty->print_cr("===========================================================================\n");)
+}
 
-  // Now we set *our" lowest_non_clean entry.
-  // Find the object that spans our boundary, if one exists.
-  // Nothing to do on the first chunk.
-  if (chunk_mr.start() > used.start()) {
-    // first_block is the block possibly spanning the chunk start
-    HeapWord* first_block = sp->block_start(chunk_mr.start());
-    // Does the block span the start of the chunk and is it
-    // an object?
-    if (first_block < chunk_mr.start() &&
-        sp->block_is_obj(first_block)) {
-      jbyte* first_dirty_card = NULL;
-      jbyte* last_card_of_first_obj =
-          byte_for(first_block + sp->block_size(first_block) - 1);
-      jbyte* first_card_of_cur_chunk = byte_for(chunk_mr.start());
-      jbyte* last_card_of_cur_chunk = byte_for(chunk_mr.last());
-      jbyte* last_card_to_check =
-        (jbyte*) MIN2((intptr_t) last_card_of_cur_chunk,
-                      (intptr_t) last_card_of_first_obj);
-      for (jbyte* cur = first_card_of_cur_chunk;
-           cur <= last_card_to_check; cur++) {
-        if (card_will_be_scanned(*cur)) {
-          first_dirty_card = cur; break;
-        }
-      }
-      if (first_dirty_card != NULL) {
-        assert(0 <= cur_chunk_index &&
-                 cur_chunk_index < lowest_non_clean_chunk_size,
-               "Bounds error.");
-        lowest_non_clean[cur_chunk_index] = first_dirty_card;
-      }
-    }
-  }
-}
+#undef NOISY
 
 void
 CardTableModRefBS::
@@ -283,8 +413,8 @@
   // LNC array for the covered region.  Any later expansion can't affect
   // the used_at_save_marks region.
   // (I observed a bug in which the first thread to execute this would
-  // resize, and then it would cause "expand_and_allocates" that would
-  // Increase the number of chunks in the covered region.  Then a second
+  // resize, and then it would cause "expand_and_allocate" that would
+  // increase the number of chunks in the covered region.  Then a second
   // thread would come and execute this, see that the size didn't match,
   // and free and allocate again.  So the first thread would be using a
   // freed "_lowest_non_clean" array.)
--- a/src/share/vm/gc_implementation/parNew/parOopClosures.inline.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/parNew/parOopClosures.inline.hpp	Tue May 24 11:09:39 2011 -0700
@@ -77,7 +77,23 @@
   if (!oopDesc::is_null(heap_oop)) {
     oop obj = oopDesc::decode_heap_oop_not_null(heap_oop);
     if ((HeapWord*)obj < _boundary) {
-      assert(!_g->to()->is_in_reserved(obj), "Scanning field twice?");
+#ifndef PRODUCT
+      if (_g->to()->is_in_reserved(obj)) {
+        tty->print_cr("Scanning field (" PTR_FORMAT ") twice?", p);
+        GenCollectedHeap* gch =  (GenCollectedHeap*)Universe::heap();
+        Space* sp = gch->space_containing(p);
+        oop obj = oop(sp->block_start(p));
+        assert((HeapWord*)obj < (HeapWord*)p, "Error");
+        tty->print_cr("Object: " PTR_FORMAT, obj);
+        tty->print_cr("-------");
+        obj->print();
+        tty->print_cr("-----");
+        tty->print_cr("Heap:");
+        tty->print_cr("-----");
+        gch->print();
+        ShouldNotReachHere();
+      }
+#endif
       // OK, we need to ensure that it is copied.
       // We read the klass and mark in this order, so that we can reliably
       // get the size of the object: if the mark we read is not a
--- a/src/share/vm/gc_implementation/parallelScavenge/psMarkSweep.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/parallelScavenge/psMarkSweep.cpp	Tue May 24 11:09:39 2011 -0700
@@ -173,7 +173,7 @@
     TraceCPUTime tcpu(PrintGCDetails, true, gclog_or_tty);
     TraceTime t1(gc_cause_str, PrintGC, !PrintGCDetails, gclog_or_tty);
     TraceCollectorStats tcs(counters());
-    TraceMemoryManagerStats tms(true /* Full GC */);
+    TraceMemoryManagerStats tms(true /* Full GC */,gc_cause);
 
     if (TraceGen1Time) accumulated_time()->start();
 
--- a/src/share/vm/gc_implementation/parallelScavenge/psOldGen.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/parallelScavenge/psOldGen.cpp	Tue May 24 11:09:39 2011 -0700
@@ -224,6 +224,12 @@
   const size_t alignment = virtual_space()->alignment();
   size_t aligned_bytes  = align_size_up(bytes, alignment);
   size_t aligned_expand_bytes = align_size_up(MinHeapDeltaBytes, alignment);
+
+  if (UseNUMA) {
+    // With NUMA we use round-robin page allocation for the old gen. Expand by at least
+    // providing a page per lgroup. Alignment is larger or equal to the page size.
+    aligned_expand_bytes = MAX2(aligned_expand_bytes, alignment * os::numa_get_groups_num());
+  }
   if (aligned_bytes == 0){
     // The alignment caused the number of bytes to wrap.  An expand_by(0) will
     // return true with the implication that and expansion was done when it
--- a/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp	Tue May 24 11:09:39 2011 -0700
@@ -2057,7 +2057,7 @@
     TraceCPUTime tcpu(PrintGCDetails, true, gclog_or_tty);
     TraceTime t1(gc_cause_str, PrintGC, !PrintGCDetails, gclog_or_tty);
     TraceCollectorStats tcs(counters());
-    TraceMemoryManagerStats tms(true /* Full GC */);
+    TraceMemoryManagerStats tms(true /* Full GC */,gc_cause);
 
     if (TraceGen1Time) accumulated_time()->start();
 
--- a/src/share/vm/gc_implementation/parallelScavenge/psScavenge.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/gc_implementation/parallelScavenge/psScavenge.cpp	Tue May 24 11:09:39 2011 -0700
@@ -322,7 +322,7 @@
     TraceCPUTime tcpu(PrintGCDetails, true, gclog_or_tty);
     TraceTime t1("GC", PrintGC, !PrintGCDetails, gclog_or_tty);
     TraceCollectorStats tcs(counters());
-    TraceMemoryManagerStats tms(false /* not full GC */);
+    TraceMemoryManagerStats tms(false /* not full GC */,gc_cause);
 
     if (TraceGen0Time) accumulated_time()->start();
 
--- a/src/share/vm/interpreter/abstractInterpreter.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/interpreter/abstractInterpreter.hpp	Tue May 24 11:09:39 2011 -0700
@@ -175,19 +175,32 @@
                                     int temps,
                                     int popframe_args,
                                     int monitors,
+                                    int caller_actual_parameters,
                                     int callee_params,
                                     int callee_locals,
-                                    bool is_top_frame);
+                                    bool is_top_frame) {
+    return layout_activation(method,
+                             temps,
+                             popframe_args,
+                             monitors,
+                             caller_actual_parameters,
+                             callee_params,
+                             callee_locals,
+                             (frame*)NULL,
+                             (frame*)NULL,
+                             is_top_frame);
+  }
 
   static int       layout_activation(methodOop method,
-                                      int temps,
-                                      int popframe_args,
-                                      int monitors,
-                                      int callee_params,
-                                      int callee_locals,
-                                      frame* caller,
-                                      frame* interpreter_frame,
-                                      bool is_top_frame);
+                                     int temps,
+                                     int popframe_args,
+                                     int monitors,
+                                     int caller_actual_parameters,
+                                     int callee_params,
+                                     int callee_locals,
+                                     frame* caller,
+                                     frame* interpreter_frame,
+                                     bool is_top_frame);
 
   // Runtime support
   static bool       is_not_reached(                       methodHandle method, int bci);
--- a/src/share/vm/interpreter/linkResolver.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/interpreter/linkResolver.cpp	Tue May 24 11:09:39 2011 -0700
@@ -327,6 +327,7 @@
 
   // 1. check if klass is not interface
   if (resolved_klass->is_interface()) {
+    ResourceMark rm(THREAD);
     char buf[200];
     jio_snprintf(buf, sizeof(buf), "Found interface %s, but class was expected", Klass::cast(resolved_klass())->external_name());
     THROW_MSG(vmSymbols::java_lang_IncompatibleClassChangeError(), buf);
@@ -413,6 +414,7 @@
 
  // check if klass is interface
   if (!resolved_klass->is_interface()) {
+    ResourceMark rm(THREAD);
     char buf[200];
     jio_snprintf(buf, sizeof(buf), "Found class %s, but interface was expected", Klass::cast(resolved_klass())->external_name());
     THROW_MSG(vmSymbols::java_lang_IncompatibleClassChangeError(), buf);
@@ -534,6 +536,7 @@
 
   // check for errors
   if (is_static != fd.is_static()) {
+    ResourceMark rm(THREAD);
     char msg[200];
     jio_snprintf(msg, sizeof(msg), "Expected %s field %s.%s", is_static ? "static" : "non-static", Klass::cast(resolved_klass())->external_name(), fd.name()->as_C_string());
     THROW_MSG(vmSymbols::java_lang_IncompatibleClassChangeError(), msg);
@@ -631,6 +634,7 @@
 
   // check if static
   if (!resolved_method->is_static()) {
+    ResourceMark rm(THREAD);
     char buf[200];
     jio_snprintf(buf, sizeof(buf), "Expected static method %s", methodOopDesc::name_and_sig_as_C_string(Klass::cast(resolved_klass()),
                                                       resolved_method->name(),
@@ -671,6 +675,7 @@
 
   // check if not static
   if (resolved_method->is_static()) {
+    ResourceMark rm(THREAD);
     char buf[200];
     jio_snprintf(buf, sizeof(buf),
                  "Expecting non-static method %s",
@@ -717,6 +722,7 @@
 
   // check if not static
   if (sel_method->is_static()) {
+    ResourceMark rm(THREAD);
     char buf[200];
     jio_snprintf(buf, sizeof(buf), "Expecting non-static method %s", methodOopDesc::name_and_sig_as_C_string(Klass::cast(resolved_klass()),
                                                                                                              resolved_method->name(),
@@ -757,6 +763,7 @@
 
   // check if not static
   if (resolved_method->is_static()) {
+    ResourceMark rm(THREAD);
     char buf[200];
     jio_snprintf(buf, sizeof(buf), "Expecting non-static method %s", methodOopDesc::name_and_sig_as_C_string(Klass::cast(resolved_klass()),
                                                                                                              resolved_method->name(),
@@ -873,6 +880,7 @@
 
   // check if receiver klass implements the resolved interface
   if (!recv_klass->is_subtype_of(resolved_klass())) {
+    ResourceMark rm(THREAD);
     char buf[200];
     jio_snprintf(buf, sizeof(buf), "Class %s does not implement the requested interface %s",
                  (Klass::cast(recv_klass()))->external_name(),
--- a/src/share/vm/memory/allocation.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/memory/allocation.cpp	Tue May 24 11:09:39 2011 -0700
@@ -44,6 +44,14 @@
   return (void *) AllocateHeap(size, "CHeapObj-new");
 }
 
+void* CHeapObj::operator new (size_t size, const std::nothrow_t&  nothrow_constant) {
+  char* p = (char*) os::malloc(size);
+#ifdef ASSERT
+  if (PrintMallocFree) trace_heap_malloc(size, "CHeapObj-new", p);
+#endif
+  return p;
+}
+
 void CHeapObj::operator delete(void* p){
  FreeHeap(p);
 }
--- a/src/share/vm/memory/allocation.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/memory/allocation.hpp	Tue May 24 11:09:39 2011 -0700
@@ -34,6 +34,8 @@
 #include "opto/c2_globals.hpp"
 #endif
 
+#include <new>
+
 #define ARENA_ALIGN_M1 (((size_t)(ARENA_AMALLOC_ALIGNMENT)) - 1)
 #define ARENA_ALIGN_MASK (~((size_t)ARENA_ALIGN_M1))
 #define ARENA_ALIGN(x) ((((size_t)(x)) + ARENA_ALIGN_M1) & ARENA_ALIGN_MASK)
@@ -99,6 +101,7 @@
 class CHeapObj ALLOCATION_SUPER_CLASS_SPEC {
  public:
   void* operator new(size_t size);
+  void* operator new (size_t size, const std::nothrow_t&  nothrow_constant);
   void  operator delete(void* p);
   void* new_array(size_t size);
 };
--- a/src/share/vm/memory/blockOffsetTable.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/memory/blockOffsetTable.cpp	Tue May 24 11:09:39 2011 -0700
@@ -541,20 +541,33 @@
     // to go back by.
     size_t n_cards_back = entry_to_cards_back(offset);
     q -= (N_words * n_cards_back);
-    assert(q >= _sp->bottom(), "Went below bottom!");
+    assert(q >= _sp->bottom(),
+           err_msg("q = " PTR_FORMAT " crossed below bottom = " PTR_FORMAT,
+                   q, _sp->bottom()));
+    assert(q < _sp->end(),
+           err_msg("q = " PTR_FORMAT " crossed above end = " PTR_FORMAT,
+                   q, _sp->end()));
     index -= n_cards_back;
     offset = _array->offset_array(index);
   }
   assert(offset < N_words, "offset too large");
   index--;
   q -= offset;
+  assert(q >= _sp->bottom(),
+         err_msg("q = " PTR_FORMAT " crossed below bottom = " PTR_FORMAT,
+                 q, _sp->bottom()));
+  assert(q < _sp->end(),
+         err_msg("q = " PTR_FORMAT " crossed above end = " PTR_FORMAT,
+                 q, _sp->end()));
   HeapWord* n = q;
 
   while (n <= addr) {
     debug_only(HeapWord* last = q);   // for debugging
     q = n;
     n += _sp->block_size(n);
-    assert(n > q, err_msg("Looping at: " INTPTR_FORMAT, n));
+    assert(n > q,
+           err_msg("Looping at n = " PTR_FORMAT " with last = " PTR_FORMAT " _sp = [" PTR_FORMAT "," PTR_FORMAT ")",
+                   n, last, _sp->bottom(), _sp->end()));
   }
   assert(q <= addr, err_msg("wrong order for current (" INTPTR_FORMAT ") <= arg (" INTPTR_FORMAT ")", q, addr));
   assert(addr <= n, err_msg("wrong order for arg (" INTPTR_FORMAT ") <= next (" INTPTR_FORMAT ")", addr, n));
--- a/src/share/vm/memory/cardTableModRefBS.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/memory/cardTableModRefBS.cpp	Tue May 24 11:09:39 2011 -0700
@@ -455,25 +455,29 @@
   return true;
 }
 
-
 void CardTableModRefBS::non_clean_card_iterate_possibly_parallel(Space* sp,
                                                                  MemRegion mr,
-                                                                 DirtyCardToOopClosure* dcto_cl,
-                                                                 ClearNoncleanCardWrapper* cl) {
+                                                                 OopsInGenClosure* cl,
+                                                                 CardTableRS* ct) {
   if (!mr.is_empty()) {
     int n_threads = SharedHeap::heap()->n_par_threads();
     if (n_threads > 0) {
 #ifndef SERIALGC
-      non_clean_card_iterate_parallel_work(sp, mr, dcto_cl, cl, n_threads);
+      non_clean_card_iterate_parallel_work(sp, mr, cl, ct, n_threads);
 #else  // SERIALGC
       fatal("Parallel gc not supported here.");
 #endif // SERIALGC
     } else {
       // We do not call the non_clean_card_iterate_serial() version below because
       // we want to clear the cards (which non_clean_card_iterate_serial() does not
-      // do for us), and the ClearNoncleanCardWrapper closure itself does the work
-      // of finding contiguous dirty ranges of cards to process (and clear).
-      cl->do_MemRegion(mr);
+      // do for us): clear_cl here does the work of finding contiguous dirty ranges
+      // of cards to process and clear.
+
+      DirtyCardToOopClosure* dcto_cl = sp->new_dcto_cl(cl, precision(),
+                                                       cl->gen_boundary());
+      ClearNoncleanCardWrapper clear_cl(dcto_cl, ct);
+
+      clear_cl.do_MemRegion(mr);
     }
   }
 }
@@ -652,43 +656,37 @@
 }
 
 #ifndef PRODUCT
-class GuaranteeNotModClosure: public MemRegionClosure {
-  CardTableModRefBS* _ct;
-public:
-  GuaranteeNotModClosure(CardTableModRefBS* ct) : _ct(ct) {}
-  void do_MemRegion(MemRegion mr) {
-    jbyte* entry = _ct->byte_for(mr.start());
-    guarantee(*entry != CardTableModRefBS::clean_card,
-              "Dirty card in region that should be clean");
+void CardTableModRefBS::verify_region(MemRegion mr,
+                                      jbyte val, bool val_equals) {
+  jbyte* start    = byte_for(mr.start());
+  jbyte* end      = byte_for(mr.last());
+  bool   failures = false;
+  for (jbyte* curr = start; curr <= end; ++curr) {
+    jbyte curr_val = *curr;
+    bool failed = (val_equals) ? (curr_val != val) : (curr_val == val);
+    if (failed) {
+      if (!failures) {
+        tty->cr();
+        tty->print_cr("== CT verification failed: ["PTR_FORMAT","PTR_FORMAT"]");
+        tty->print_cr("==   %sexpecting value: %d",
+                      (val_equals) ? "" : "not ", val);
+        failures = true;
+      }
+      tty->print_cr("==   card "PTR_FORMAT" ["PTR_FORMAT","PTR_FORMAT"], "
+                    "val: %d", curr, addr_for(curr),
+                    (HeapWord*) (((size_t) addr_for(curr)) + card_size),
+                    (int) curr_val);
+    }
   }
-};
-
-void CardTableModRefBS::verify_clean_region(MemRegion mr) {
-  GuaranteeNotModClosure blk(this);
-  non_clean_card_iterate_serial(mr, &blk);
+  guarantee(!failures, "there should not have been any failures");
 }
 
-// To verify a MemRegion is entirely dirty this closure is passed to
-// dirty_card_iterate. If the region is dirty do_MemRegion will be
-// invoked only once with a MemRegion equal to the one being
-// verified.
-class GuaranteeDirtyClosure: public MemRegionClosure {
-  CardTableModRefBS* _ct;
-  MemRegion _mr;
-  bool _result;
-public:
-  GuaranteeDirtyClosure(CardTableModRefBS* ct, MemRegion mr)
-    : _ct(ct), _mr(mr), _result(false) {}
-  void do_MemRegion(MemRegion mr) {
-    _result = _mr.equals(mr);
-  }
-  bool result() const { return _result; }
-};
+void CardTableModRefBS::verify_not_dirty_region(MemRegion mr) {
+  verify_region(mr, dirty_card, false /* val_equals */);
+}
 
 void CardTableModRefBS::verify_dirty_region(MemRegion mr) {
-  GuaranteeDirtyClosure blk(this, mr);
-  dirty_card_iterate(mr, &blk);
-  guarantee(blk.result(), "Non-dirty cards in region that should be dirty");
+  verify_region(mr, dirty_card, true /* val_equals */);
 }
 #endif
 
--- a/src/share/vm/memory/cardTableModRefBS.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/memory/cardTableModRefBS.hpp	Tue May 24 11:09:39 2011 -0700
@@ -150,7 +150,9 @@
   // Mapping from address to card marking array entry
   jbyte* byte_for(const void* p) const {
     assert(_whole_heap.contains(p),
-           "out of bounds access to card marking array");
+           err_msg("Attempt to access p = "PTR_FORMAT" out of bounds of "
+                   " card marking array's _whole_heap = ["PTR_FORMAT","PTR_FORMAT")",
+                   p, _whole_heap.start(), _whole_heap.end()));
     jbyte* result = &byte_map_base[uintptr_t(p) >> card_shift];
     assert(result >= _byte_map && result < _byte_map + _byte_map_size,
            "out of bounds accessor for card marking array");
@@ -173,18 +175,17 @@
   // A variant of the above that will operate in a parallel mode if
   // worker threads are available, and clear the dirty cards as it
   // processes them.
-  // ClearNoncleanCardWrapper cl must wrap the DirtyCardToOopClosure dcto_cl,
-  // which may itself be modified by the method.
+  // XXX ??? MemRegionClosure above vs OopsInGenClosure below XXX
+  // XXX some new_dcto_cl's take OopClosure's, plus as above there are
+  // some MemRegionClosures. Clean this up everywhere. XXX
   void non_clean_card_iterate_possibly_parallel(Space* sp, MemRegion mr,
-                                                DirtyCardToOopClosure* dcto_cl,
-                                                ClearNoncleanCardWrapper* cl);
+                                                OopsInGenClosure* cl, CardTableRS* ct);
 
  private:
   // Work method used to implement non_clean_card_iterate_possibly_parallel()
   // above in the parallel case.
   void non_clean_card_iterate_parallel_work(Space* sp, MemRegion mr,
-                                            DirtyCardToOopClosure* dcto_cl,
-                                            ClearNoncleanCardWrapper* cl,
+                                            OopsInGenClosure* cl, CardTableRS* ct,
                                             int n_threads);
 
  protected:
@@ -198,11 +199,6 @@
 
   // *** Support for parallel card scanning.
 
-  enum SomeConstantsForParallelism {
-    StridesPerThread    = 2,
-    CardsPerStrideChunk = 256
-  };
-
   // This is an array, one element per covered region of the card table.
   // Each entry is itself an array, with one element per chunk in the
   // covered region.  Each entry of these arrays is the lowest non-clean
@@ -235,7 +231,7 @@
   // covers the given address.
   uintptr_t addr_to_chunk_index(const void* addr) {
     uintptr_t card = (uintptr_t) byte_for(addr);
-    return card / CardsPerStrideChunk;
+    return card / ParGCCardsPerStrideChunk;
   }
 
   // Apply cl, which must either itself apply dcto_cl or be dcto_cl,
@@ -243,8 +239,8 @@
   void process_stride(Space* sp,
                       MemRegion used,
                       jint stride, int n_strides,
-                      DirtyCardToOopClosure* dcto_cl,
-                      ClearNoncleanCardWrapper* cl,
+                      OopsInGenClosure* cl,
+                      CardTableRS* ct,
                       jbyte** lowest_non_clean,
                       uintptr_t lowest_non_clean_base_chunk_index,
                       size_t lowest_non_clean_chunk_size);
@@ -457,14 +453,18 @@
     size_t delta = pointer_delta(p, byte_map_base, sizeof(jbyte));
     HeapWord* result = (HeapWord*) (delta << card_shift);
     assert(_whole_heap.contains(result),
-           "out of bounds accessor from card marking array");
+           err_msg("Returning result = "PTR_FORMAT" out of bounds of "
+                   " card marking array's _whole_heap = ["PTR_FORMAT","PTR_FORMAT")",
+                   result, _whole_heap.start(), _whole_heap.end()));
     return result;
   }
 
   // Mapping from address to card marking array index.
   size_t index_for(void* p) {
     assert(_whole_heap.contains(p),
-           "out of bounds access to card marking array");
+           err_msg("Attempt to access p = "PTR_FORMAT" out of bounds of "
+                   " card marking array's _whole_heap = ["PTR_FORMAT","PTR_FORMAT")",
+                   p, _whole_heap.start(), _whole_heap.end()));
     return byte_for(p) - _byte_map;
   }
 
@@ -475,11 +475,14 @@
   void verify();
   void verify_guard();
 
-  void verify_clean_region(MemRegion mr) PRODUCT_RETURN;
+  // val_equals -> it will check that all cards covered by mr equal val
+  // !val_equals -> it will check that all cards covered by mr do not equal val
+  void verify_region(MemRegion mr, jbyte val, bool val_equals) PRODUCT_RETURN;
+  void verify_not_dirty_region(MemRegion mr) PRODUCT_RETURN;
   void verify_dirty_region(MemRegion mr) PRODUCT_RETURN;
 
   static size_t par_chunk_heapword_alignment() {
-    return CardsPerStrideChunk * card_size_in_words;
+    return ParGCCardsPerStrideChunk * card_size_in_words;
   }
 
 };
--- a/src/share/vm/memory/cardTableRS.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/memory/cardTableRS.cpp	Tue May 24 11:09:39 2011 -0700
@@ -162,7 +162,7 @@
 }
 
 ClearNoncleanCardWrapper::ClearNoncleanCardWrapper(
-  MemRegionClosure* dirty_card_closure, CardTableRS* ct) :
+  DirtyCardToOopClosure* dirty_card_closure, CardTableRS* ct) :
     _dirty_card_closure(dirty_card_closure), _ct(ct) {
     _is_par = (SharedHeap::heap()->n_par_threads() > 0);
 }
@@ -246,10 +246,6 @@
 
 void CardTableRS::younger_refs_in_space_iterate(Space* sp,
                                                 OopsInGenClosure* cl) {
-  DirtyCardToOopClosure* dcto_cl = sp->new_dcto_cl(cl, _ct_bs->precision(),
-                                                   cl->gen_boundary());
-  ClearNoncleanCardWrapper clear_cl(dcto_cl, this);
-
   const MemRegion urasm = sp->used_region_at_save_marks();
 #ifdef ASSERT
   // Convert the assertion check to a warning if we are running
@@ -275,10 +271,10 @@
     if (!urasm.equals(urasm2)) {
       warning("CMS+ParNew: Flickering used_region_at_save_marks()!!");
     }
+    ShouldNotReachHere();
   }
 #endif
-  _ct_bs->non_clean_card_iterate_possibly_parallel(sp, urasm,
-                                                   dcto_cl, &clear_cl);
+  _ct_bs->non_clean_card_iterate_possibly_parallel(sp, urasm, cl, this);
 }
 
 void CardTableRS::clear_into_younger(Generation* gen, bool clear_perm) {
--- a/src/share/vm/memory/cardTableRS.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/memory/cardTableRS.hpp	Tue May 24 11:09:39 2011 -0700
@@ -31,7 +31,6 @@
 
 class Space;
 class OopsInGenClosure;
-class DirtyCardToOopClosure;
 
 // This kind of "GenRemSet" uses a card table both as shared data structure
 // for a mod ref barrier set and for the rem set information.
@@ -167,7 +166,7 @@
 };
 
 class ClearNoncleanCardWrapper: public MemRegionClosure {
-  MemRegionClosure* _dirty_card_closure;
+  DirtyCardToOopClosure* _dirty_card_closure;
   CardTableRS* _ct;
   bool _is_par;
 private:
@@ -179,7 +178,7 @@
   inline bool clear_card_parallel(jbyte* entry);
 
 public:
-  ClearNoncleanCardWrapper(MemRegionClosure* dirty_card_closure, CardTableRS* ct);
+  ClearNoncleanCardWrapper(DirtyCardToOopClosure* dirty_card_closure, CardTableRS* ct);
   void do_MemRegion(MemRegion mr);
 };
 
--- a/src/share/vm/memory/collectorPolicy.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/memory/collectorPolicy.cpp	Tue May 24 11:09:39 2011 -0700
@@ -265,8 +265,6 @@
   MaxHeapSize = align_size_up(MaxHeapSize, max_alignment());
 
   always_do_update_barrier = UseConcMarkSweepGC;
-  BlockOffsetArrayUseUnallocatedBlock =
-      BlockOffsetArrayUseUnallocatedBlock || ParallelGCThreads > 0;
 
   // Check validity of heap flags
   assert(OldSize     % min_alignment() == 0, "old space alignment");
--- a/src/share/vm/memory/genCollectedHeap.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/memory/genCollectedHeap.cpp	Tue May 24 11:09:39 2011 -0700
@@ -537,7 +537,7 @@
         // Timer for individual generations. Last argument is false: no CR
         TraceTime t1(_gens[i]->short_name(), PrintGCDetails, false, gclog_or_tty);
         TraceCollectorStats tcs(_gens[i]->counters());
-        TraceMemoryManagerStats tmms(_gens[i]->kind());
+        TraceMemoryManagerStats tmms(_gens[i]->kind(),gc_cause());
 
         size_t prev_used = _gens[i]->used();
         _gens[i]->stat_record()->invocations++;
--- a/src/share/vm/memory/genOopClosures.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/memory/genOopClosures.hpp	Tue May 24 11:09:39 2011 -0700
@@ -175,7 +175,7 @@
  protected:
   template <class T> inline void do_oop_work(T* p) {
     oop obj = oopDesc::load_decode_heap_oop(p);
-    guarantee(obj->is_oop_or_null(), err_msg("invalid oop: " INTPTR_FORMAT, obj));
+    guarantee(obj->is_oop_or_null(), err_msg("invalid oop: " INTPTR_FORMAT, (oopDesc*) obj));
   }
  public:
   virtual void do_oop(oop* p);
--- a/src/share/vm/memory/modRefBarrierSet.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/memory/modRefBarrierSet.hpp	Tue May 24 11:09:39 2011 -0700
@@ -100,12 +100,6 @@
   // Pass along the argument to the superclass.
   ModRefBarrierSet(int max_covered_regions) :
     BarrierSet(max_covered_regions) {}
-
-#ifndef PRODUCT
-  // Verifies that the given region contains no modified references.
-  virtual void verify_clean_region(MemRegion mr) = 0;
-#endif
-
 };
 
 #endif // SHARE_VM_MEMORY_MODREFBARRIERSET_HPP
--- a/src/share/vm/memory/space.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/memory/space.cpp	Tue May 24 11:09:39 2011 -0700
@@ -97,6 +97,14 @@
   }
 }
 
+// We get called with "mr" representing the dirty region
+// that we want to process. Because of imprecise marking,
+// we may need to extend the incoming "mr" to the right,
+// and scan more. However, because we may already have
+// scanned some of that extended region, we may need to
+// trim its right-end back some so we do not scan what
+// we (or another worker thread) may already have scanned
+// or planning to scan.
 void DirtyCardToOopClosure::do_MemRegion(MemRegion mr) {
 
   // Some collectors need to do special things whenever their dirty
@@ -148,7 +156,7 @@
   // e.g. the dirty card region is entirely in a now free object
   // -- something that could happen with a concurrent sweeper.
   bottom = MIN2(bottom, top);
-  mr     = MemRegion(bottom, top);
+  MemRegion extended_mr = MemRegion(bottom, top);
   assert(bottom <= top &&
          (_precision != CardTableModRefBS::ObjHeadPreciseArray ||
           _min_done == NULL ||
@@ -156,8 +164,8 @@
          "overlap!");
 
   // Walk the region if it is not empty; otherwise there is nothing to do.
-  if (!mr.is_empty()) {
-    walk_mem_region(mr, bottom_obj, top);
+  if (!extended_mr.is_empty()) {
+    walk_mem_region(extended_mr, bottom_obj, top);
   }
 
   // An idempotent closure might be applied in any order, so we don't
--- a/src/share/vm/oops/constantPoolKlass.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/oops/constantPoolKlass.cpp	Tue May 24 11:09:39 2011 -0700
@@ -285,10 +285,9 @@
 void constantPoolKlass::oop_push_contents(PSPromotionManager* pm, oop obj) {
   assert(obj->is_constantPool(), "should be constant pool");
   constantPoolOop cp = (constantPoolOop) obj;
-  if (cp->tags() != NULL &&
-      (!JavaObjectsInPerm || (EnableInvokeDynamic && cp->has_pseudo_string()))) {
+  if (cp->tags() != NULL) {
     for (int i = 1; i < cp->length(); ++i) {
-      if (cp->tag_at(i).is_string()) {
+      if (cp->is_pointer_entry(i)) {
         oop* base = cp->obj_at_addr_raw(i);
         if (PSScavenge::should_scavenge(base)) {
           pm->claim_or_forward_depth(base);
@@ -342,6 +341,11 @@
         anObj->print_value_on(st);
         st->print(" {0x%lx}", (address)anObj);
         break;
+      case JVM_CONSTANT_Object :
+        anObj = cp->object_at(index);
+        anObj->print_value_on(st);
+        st->print(" {0x%lx}", (address)anObj);
+        break;
       case JVM_CONSTANT_Integer :
         st->print("%d", cp->int_at(index));
         break;
@@ -432,23 +436,21 @@
   guarantee(cp->is_perm(), "should be in permspace");
   if (!cp->partially_loaded()) {
     for (int i = 0; i< cp->length();  i++) {
+      constantTag tag = cp->tag_at(i);
       CPSlot entry = cp->slot_at(i);
-      if (cp->tag_at(i).is_klass()) {
+      if (tag.is_klass()) {
         if (entry.is_oop()) {
           guarantee(entry.get_oop()->is_perm(),     "should be in permspace");
           guarantee(entry.get_oop()->is_klass(),    "should be klass");
         }
-      }
-      if (cp->tag_at(i).is_unresolved_klass()) {
+      } else if (tag.is_unresolved_klass()) {
         if (entry.is_oop()) {
           guarantee(entry.get_oop()->is_perm(),     "should be in permspace");
           guarantee(entry.get_oop()->is_klass(),    "should be klass");
         }
-      }
-      if (cp->tag_at(i).is_symbol()) {
+      } else if (tag.is_symbol()) {
         guarantee(entry.get_symbol()->refcount() != 0, "should have nonzero reference count");
-      }
-      if (cp->tag_at(i).is_unresolved_string()) {
+      } else if (tag.is_unresolved_string()) {
         if (entry.is_oop()) {
           guarantee(entry.get_oop()->is_perm(),     "should be in permspace");
           guarantee(entry.get_oop()->is_instance(), "should be instance");
@@ -456,8 +458,7 @@
         else {
           guarantee(entry.get_symbol()->refcount() != 0, "should have nonzero reference count");
         }
-      }
-      if (cp->tag_at(i).is_string()) {
+      } else if (tag.is_string()) {
         if (!cp->has_pseudo_string()) {
           if (entry.is_oop()) {
             guarantee(!JavaObjectsInPerm || entry.get_oop()->is_perm(),
@@ -467,8 +468,11 @@
         } else {
           // can be non-perm, can be non-instance (array)
         }
+      } else if (tag.is_object()) {
+        assert(entry.get_oop()->is_oop(), "should be some valid oop");
+      } else {
+        assert(!cp->is_pointer_entry(i), "unhandled oop type in constantPoolKlass::verify_on");
       }
-      // FIXME: verify JSR 292 tags JVM_CONSTANT_MethodHandle, etc.
     }
     guarantee(cp->tags()->is_perm(),         "should be in permspace");
     guarantee(cp->tags()->is_typeArray(),    "should be type array");
--- a/src/share/vm/oops/methodDataOop.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/oops/methodDataOop.hpp	Tue May 24 11:09:39 2011 -0700
@@ -1194,7 +1194,7 @@
   // Whole-method sticky bits and flags
 public:
   enum {
-    _trap_hist_limit    = 16,   // decoupled from Deoptimization::Reason_LIMIT
+    _trap_hist_limit    = 17,   // decoupled from Deoptimization::Reason_LIMIT
     _trap_hist_mask     = max_jubyte,
     _extra_data_count   = 4     // extra DataLayout headers, for trap history
   }; // Public flag values
--- a/src/share/vm/opto/bytecodeInfo.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/bytecodeInfo.cpp	Tue May 24 11:09:39 2011 -0700
@@ -89,7 +89,7 @@
 }
 
 // positive filter: should send be inlined?  returns NULL, if yes, or rejection msg
-const char* InlineTree::shouldInline(ciMethod* callee_method, ciMethod* caller_method, int caller_bci, ciCallProfile& profile, WarmCallInfo* wci_result) const {
+const char* InlineTree::should_inline(ciMethod* callee_method, ciMethod* caller_method, int caller_bci, ciCallProfile& profile, WarmCallInfo* wci_result) const {
   // Allows targeted inlining
   if(callee_method->should_inline()) {
     *wci_result = *(WarmCallInfo::always_hot());
@@ -102,8 +102,7 @@
 
   // positive filter: should send be inlined?  returns NULL (--> yes)
   // or rejection msg
-  int max_size = C->max_inline_size();
-  int size     = callee_method->code_size();
+  int size = callee_method->code_size();
 
   // Check for too many throws (and not too huge)
   if(callee_method->interpreter_throwout_count() > InlineThrowCount &&
@@ -120,18 +119,36 @@
     return NULL;  // size and frequency are represented in a new way
   }
 
+  int default_max_inline_size = C->max_inline_size();
+  int inline_small_code_size  = InlineSmallCode / 4;
+  int max_inline_size         = default_max_inline_size;
+
   int call_site_count  = method()->scale_count(profile.count());
   int invoke_count     = method()->interpreter_invocation_count();
-  assert( invoke_count != 0, "Require invokation count greater than zero");
-  int freq = call_site_count/invoke_count;
+
+  // Bytecoded method handle adapters do not have interpreter
+  // profiling data but only made up MDO data.  Get the counter from
+  // there.
+  if (caller_method->is_method_handle_adapter()) {
+    assert(method()->method_data_or_null(), "must have an MDO");
+    ciMethodData* mdo = method()->method_data();
+    ciProfileData* mha_profile = mdo->bci_to_data(caller_bci);
+    assert(mha_profile, "must exist");
+    CounterData* cd = mha_profile->as_CounterData();
+    invoke_count = cd->count();
+    call_site_count = invoke_count;  // use the same value
+  }
+
+  assert(invoke_count != 0, "require invocation count greater than zero");
+  int freq = call_site_count / invoke_count;
 
   // bump the max size if the call is frequent
   if ((freq >= InlineFrequencyRatio) ||
       (call_site_count >= InlineFrequencyCount) ||
       is_init_with_ea(callee_method, caller_method, C)) {
 
-    max_size = C->freq_inline_size();
-    if (size <= max_size && TraceFrequencyInlining) {
+    max_inline_size = C->freq_inline_size();
+    if (size <= max_inline_size && TraceFrequencyInlining) {
       CompileTask::print_inline_indent(inline_depth());
       tty->print_cr("Inlined frequent method (freq=%d count=%d):", freq, call_site_count);
       CompileTask::print_inline_indent(inline_depth());
@@ -141,11 +158,11 @@
   } else {
     // Not hot.  Check for medium-sized pre-existing nmethod at cold sites.
     if (callee_method->has_compiled_code() &&
-        callee_method->instructions_size(CompLevel_full_optimization) > InlineSmallCode/4)
+        callee_method->instructions_size(CompLevel_full_optimization) > inline_small_code_size)
       return "already compiled into a medium method";
   }
-  if (size > max_size) {
-    if (max_size > C->max_inline_size())
+  if (size > max_inline_size) {
+    if (max_inline_size > default_max_inline_size)
       return "hot method too big";
     return "too big";
   }
@@ -154,7 +171,7 @@
 
 
 // negative filter: should send NOT be inlined?  returns NULL, ok to inline, or rejection msg
-const char* InlineTree::shouldNotInline(ciMethod *callee_method, ciMethod* caller_method, WarmCallInfo* wci_result) const {
+const char* InlineTree::should_not_inline(ciMethod *callee_method, ciMethod* caller_method, WarmCallInfo* wci_result) const {
   // negative filter: should send NOT be inlined?  returns NULL (--> inline) or rejection msg
   if (!UseOldInlining) {
     const char* fail = NULL;
@@ -269,14 +286,13 @@
   }
 
   const char *msg = NULL;
-  if ((msg = shouldInline(callee_method, caller_method, caller_bci,
-                          profile, wci_result)) != NULL) {
+  msg = should_inline(callee_method, caller_method, caller_bci, profile, wci_result);
+  if (msg != NULL)
     return msg;
-  }
-  if ((msg = shouldNotInline(callee_method, caller_method,
-                             wci_result)) != NULL) {
+
+  msg = should_not_inline(callee_method, caller_method, wci_result);
+  if (msg != NULL)
     return msg;
-  }
 
   if (InlineAccessors && callee_method->is_accessor()) {
     // accessor methods are not subject to any of the following limits.
@@ -310,13 +326,14 @@
     return "inlining too deep";
   }
 
-  // We need to detect recursive inlining of method handle targets: if
-  // the current method is a method handle adapter and one of the
-  // callers is the same method as the callee, we bail out if
-  // MaxRecursiveInlineLevel is hit.
-  if (method()->is_method_handle_adapter()) {
+  // detect direct and indirect recursive inlining
+  {
+    // count the current method and the callee
+    int inline_level = (method() == callee_method) ? 1 : 0;
+    if (inline_level > MaxRecursiveInlineLevel)
+      return "recursively inlining too deep";
+    // count callers of current method and callee
     JVMState* jvms = caller_jvms();
-    int inline_level = 0;
     while (jvms != NULL && jvms->has_method()) {
       if (jvms->method() == callee_method) {
         inline_level++;
@@ -327,10 +344,6 @@
     }
   }
 
-  if (method() == callee_method && inline_depth() > MaxRecursiveInlineLevel) {
-    return "recursively inlining too deep";
-  }
-
   int size = callee_method->code_size();
 
   if (UseOldInlining && ClipInlining
@@ -376,7 +389,6 @@
   return true;
 }
 
-#ifndef PRODUCT
 //------------------------------print_inlining---------------------------------
 // Really, the failure_msg can be a success message also.
 void InlineTree::print_inlining(ciMethod* callee_method, int caller_bci, const char* failure_msg) const {
@@ -388,7 +400,6 @@
     tty->print("  bcs: %d+%d  invoked: %d", top->count_inline_bcs(), callee_method->code_size(), callee_method->interpreter_invocation_count());
   }
 }
-#endif
 
 //------------------------------ok_to_inline-----------------------------------
 WarmCallInfo* InlineTree::ok_to_inline(ciMethod* callee_method, JVMState* jvms, ciCallProfile& profile, WarmCallInfo* initial_wci) {
@@ -497,9 +508,8 @@
       new_depth_adjust -= 1;  // don't count method handle calls from java.lang.invoke implem
     }
     if (new_depth_adjust != 0 && PrintInlining) {
-      stringStream nm1; caller_jvms->method()->print_name(&nm1);
-      stringStream nm2; callee_method->print_name(&nm2);
-      tty->print_cr("discounting inlining depth from %s to %s", nm1.base(), nm2.base());
+      CompileTask::print_inline_indent(inline_depth());
+      tty->print_cr(" \\-> discounting inline depth");
     }
     if (new_depth_adjust != 0 && C->log()) {
       int id1 = C->log()->identify(caller_jvms->method());
--- a/src/share/vm/opto/c2_globals.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/c2_globals.hpp	Tue May 24 11:09:39 2011 -0700
@@ -183,6 +183,21 @@
   develop(bool, TraceLoopOpts, false,                                       \
           "Trace executed loop optimizations")                              \
                                                                             \
+  diagnostic(bool, LoopLimitCheck, true,                                    \
+          "Generate a loop limits check for overflow")                      \
+                                                                            \
+  develop(bool, TraceLoopLimitCheck, false,                                 \
+          "Trace generation of loop limits checks")                         \
+                                                                            \
+  diagnostic(bool, RangeLimitCheck, true,                                   \
+          "Additional overflow checks during range check elimination")      \
+                                                                            \
+  develop(bool, TraceRangeLimitCheck, false,                                \
+          "Trace additional overflow checks in RCE")                        \
+                                                                            \
+  diagnostic(bool, UnrollLimitCheck, true,                                  \
+          "Additional overflow checks during loop unroll")                  \
+                                                                            \
   product(bool, OptimizeFill, false,                                        \
           "convert fill/copy loops into intrinsic")                         \
                                                                             \
--- a/src/share/vm/opto/cfgnode.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/cfgnode.cpp	Tue May 24 11:09:39 2011 -0700
@@ -1373,7 +1373,7 @@
 
   // Clone loop predicates
   if (predicate_proj != NULL) {
-    newn = igvn->clone_loop_predicates(predicate_proj, newn);
+    newn = igvn->clone_loop_predicates(predicate_proj, newn, !n->is_CountedLoop());
   }
 
   // Now I can point to the new node.
--- a/src/share/vm/opto/classes.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/classes.hpp	Tue May 24 11:09:39 2011 -0700
@@ -156,6 +156,7 @@
 macro(LogD)
 macro(Log10D)
 macro(Loop)
+macro(LoopLimit)
 macro(Mach)
 macro(MachProj)
 macro(MaxI)
--- a/src/share/vm/opto/doCall.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/doCall.cpp	Tue May 24 11:09:39 2011 -0700
@@ -62,7 +62,10 @@
 CallGenerator* Compile::call_generator(ciMethod* call_method, int vtable_index, bool call_is_virtual,
                                        JVMState* jvms, bool allow_inline,
                                        float prof_factor) {
-  CallGenerator* cg;
+  CallGenerator*  cg;
+  ciMethod*       caller   = jvms->method();
+  int             bci      = jvms->bci();
+  Bytecodes::Code bytecode = caller->java_code_at_bci(bci);
   guarantee(call_method != NULL, "failed method resolution");
 
   // Dtrace currently doesn't work unless all calls are vanilla
@@ -73,7 +76,7 @@
   // Note: When we get profiling during stage-1 compiles, we want to pull
   // from more specific profile data which pertains to this inlining.
   // Right now, ignore the information in jvms->caller(), and do method[bci].
-  ciCallProfile profile = jvms->method()->call_profile_at_bci(jvms->bci());
+  ciCallProfile profile = caller->call_profile_at_bci(bci);
 
   // See how many times this site has been invoked.
   int site_count = profile.count();
@@ -116,7 +119,7 @@
   // MethodHandle.invoke* are native methods which obviously don't
   // have bytecodes and so normal inlining fails.
   if (call_method->is_method_handle_invoke()) {
-    if (jvms->method()->java_code_at_bci(jvms->bci()) != Bytecodes::_invokedynamic) {
+    if (bytecode != Bytecodes::_invokedynamic) {
       GraphKit kit(jvms);
       Node* n = kit.argument(0);
 
@@ -125,17 +128,19 @@
         ciObject* const_oop = oop_ptr->const_oop();
         ciMethodHandle* method_handle = const_oop->as_method_handle();
 
-        // Set the actually called method to have access to the class
-        // and signature in the MethodHandleCompiler.
+        // Set the callee to have access to the class and signature in
+        // the MethodHandleCompiler.
         method_handle->set_callee(call_method);
+        method_handle->set_caller(caller);
+        method_handle->set_call_profile(&profile);
 
         // Get an adapter for the MethodHandle.
         ciMethod* target_method = method_handle->get_method_handle_adapter();
-        CallGenerator* hit_cg = NULL;
-        if (target_method != NULL)
-          hit_cg = this->call_generator(target_method, vtable_index, false, jvms, true, prof_factor);
-        if (hit_cg != NULL && hit_cg->is_inline())
-          return hit_cg;
+        if (target_method != NULL) {
+          CallGenerator* hit_cg = this->call_generator(target_method, vtable_index, false, jvms, true, prof_factor);
+          if (hit_cg != NULL && hit_cg->is_inline())
+            return hit_cg;
+        }
       }
 
       return CallGenerator::for_direct_call(call_method);
@@ -148,18 +153,20 @@
       ciCallSite*     call_site     = str.get_call_site();
       ciMethodHandle* method_handle = call_site->get_target();
 
-      // Set the actually called method to have access to the class
-      // and signature in the MethodHandleCompiler.
+      // Set the callee to have access to the class and signature in
+      // the MethodHandleCompiler.
       method_handle->set_callee(call_method);
+      method_handle->set_caller(caller);
+      method_handle->set_call_profile(&profile);
 
       // Get an adapter for the MethodHandle.
       ciMethod* target_method = method_handle->get_invokedynamic_adapter();
-      CallGenerator* hit_cg = NULL;
-      if (target_method != NULL)
-        hit_cg = this->call_generator(target_method, vtable_index, false, jvms, true, prof_factor);
-      if (hit_cg != NULL && hit_cg->is_inline()) {
-        CallGenerator* miss_cg = CallGenerator::for_dynamic_call(call_method);
-        return CallGenerator::for_predicted_dynamic_call(method_handle, miss_cg, hit_cg, prof_factor);
+      if (target_method != NULL) {
+        CallGenerator* hit_cg = this->call_generator(target_method, vtable_index, false, jvms, true, prof_factor);
+        if (hit_cg != NULL && hit_cg->is_inline()) {
+          CallGenerator* miss_cg = CallGenerator::for_dynamic_call(call_method);
+          return CallGenerator::for_predicted_dynamic_call(method_handle, miss_cg, hit_cg, prof_factor);
+        }
       }
 
       // If something failed, generate a normal dynamic call.
--- a/src/share/vm/opto/graphKit.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/graphKit.cpp	Tue May 24 11:09:39 2011 -0700
@@ -3378,6 +3378,10 @@
   if (UseLoopPredicate) {
     add_predicate_impl(Deoptimization::Reason_predicate, nargs);
   }
+  // loop's limit check predicate should be near the loop.
+  if (LoopLimitCheck) {
+    add_predicate_impl(Deoptimization::Reason_loop_limit_check, nargs);
+  }
 }
 
 //----------------------------- store barriers ----------------------------
--- a/src/share/vm/opto/ifnode.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/ifnode.cpp	Tue May 24 11:09:39 2011 -0700
@@ -236,6 +236,7 @@
   }
   Node* predicate_c = NULL;
   Node* predicate_x = NULL;
+  bool counted_loop = r->is_CountedLoop();
 
   Node *region_c = new (igvn->C, req_c + 1) RegionNode(req_c + 1);
   Node *phi_c    = con1;
@@ -294,16 +295,16 @@
   if (predicate_c != NULL) {
     assert(predicate_x == NULL, "only one predicate entry expected");
     // Clone loop predicates to each path
-    iff_c_t = igvn->clone_loop_predicates(predicate_c, iff_c_t);
-    iff_c_f = igvn->clone_loop_predicates(predicate_c, iff_c_f);
+    iff_c_t = igvn->clone_loop_predicates(predicate_c, iff_c_t, !counted_loop);
+    iff_c_f = igvn->clone_loop_predicates(predicate_c, iff_c_f, !counted_loop);
   }
   Node *iff_x_t = phase->transform(new (igvn->C, 1) IfTrueNode (iff_x));
   Node *iff_x_f = phase->transform(new (igvn->C, 1) IfFalseNode(iff_x));
   if (predicate_x != NULL) {
     assert(predicate_c == NULL, "only one predicate entry expected");
     // Clone loop predicates to each path
-    iff_x_t = igvn->clone_loop_predicates(predicate_x, iff_x_t);
-    iff_x_f = igvn->clone_loop_predicates(predicate_x, iff_x_f);
+    iff_x_t = igvn->clone_loop_predicates(predicate_x, iff_x_t, !counted_loop);
+    iff_x_f = igvn->clone_loop_predicates(predicate_x, iff_x_f, !counted_loop);
   }
 
   // Merge the TRUE paths
@@ -545,6 +546,7 @@
   Node *new_bol = gvn->transform( new (gvn->C, 2) BoolNode( new_cmp, bol->as_Bool()->_test._test ) );
   igvn->hash_delete( iff );
   iff->set_req_X( 1, new_bol, igvn );
+  igvn->_worklist.push( iff );
 }
 
 //------------------------------up_one_dom-------------------------------------
--- a/src/share/vm/opto/library_call.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/library_call.cpp	Tue May 24 11:09:39 2011 -0700
@@ -867,12 +867,10 @@
   Node* str1_offset  = make_load(no_ctrl, str1_offseta, TypeInt::INT, T_INT, string_type->add_offset(offset_offset));
   Node* str1_start   = array_element_address(str1_value, str1_offset, T_CHAR);
 
-  // Pin loads from String::equals() argument since it could be NULL.
-  Node* str2_ctrl = (opcode == Op_StrEquals) ? control() : no_ctrl;
   Node* str2_valuea  = basic_plus_adr(str2, str2, value_offset);
-  Node* str2_value   = make_load(str2_ctrl, str2_valuea, value_type, T_OBJECT, string_type->add_offset(value_offset));
+  Node* str2_value   = make_load(no_ctrl, str2_valuea, value_type, T_OBJECT, string_type->add_offset(value_offset));
   Node* str2_offseta = basic_plus_adr(str2, str2, offset_offset);
-  Node* str2_offset  = make_load(str2_ctrl, str2_offseta, TypeInt::INT, T_INT, string_type->add_offset(offset_offset));
+  Node* str2_offset  = make_load(no_ctrl, str2_offseta, TypeInt::INT, T_INT, string_type->add_offset(offset_offset));
   Node* str2_start   = array_element_address(str2_value, str2_offset, T_CHAR);
 
   Node* result = NULL;
@@ -1012,14 +1010,15 @@
   if (!stopped()) {
     // Properly cast the argument to String
     argument = _gvn.transform(new (C, 2) CheckCastPPNode(control(), argument, string_type));
+    // This path is taken only when argument's type is String:NotNull.
+    argument = cast_not_null(argument, false);
 
     // Get counts for string and argument
     Node* receiver_cnta = basic_plus_adr(receiver, receiver, count_offset);
     receiver_cnt  = make_load(no_ctrl, receiver_cnta, TypeInt::INT, T_INT, string_type->add_offset(count_offset));
 
-    // Pin load from argument string since it could be NULL.
     Node* argument_cnta = basic_plus_adr(argument, argument, count_offset);
-    argument_cnt  = make_load(control(), argument_cnta, TypeInt::INT, T_INT, string_type->add_offset(count_offset));
+    argument_cnt  = make_load(no_ctrl, argument_cnta, TypeInt::INT, T_INT, string_type->add_offset(count_offset));
 
     // Check for receiver count != argument count
     Node* cmp = _gvn.transform( new(C, 3) CmpINode(receiver_cnt, argument_cnt) );
--- a/src/share/vm/opto/loopPredicate.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/loopPredicate.cpp	Tue May 24 11:09:39 2011 -0700
@@ -341,7 +341,7 @@
   // Cut predicate from old place.
   Node* old = predicate_proj;
   igvn->_worklist.push(old);
-  for (DUIterator_Last imin, i = old->last_outs(imin); i >= imin; ) {
+  for (DUIterator_Last imin, i = old->last_outs(imin); i >= imin;) {
     Node* use = old->last_out(i);  // for each use...
     igvn->hash_delete(use);
     igvn->_worklist.push(use);
@@ -384,24 +384,25 @@
 
 //--------------------------clone_loop_predicates-----------------------
 // Interface from IGVN
-Node* PhaseIterGVN::clone_loop_predicates(Node* old_entry, Node* new_entry) {
-  return PhaseIdealLoop::clone_loop_predicates(old_entry, new_entry, false, NULL, this);
+Node* PhaseIterGVN::clone_loop_predicates(Node* old_entry, Node* new_entry, bool clone_limit_check) {
+  return PhaseIdealLoop::clone_loop_predicates(old_entry, new_entry, false, clone_limit_check, NULL, this);
 }
-Node* PhaseIterGVN::move_loop_predicates(Node* old_entry, Node* new_entry) {
-  return PhaseIdealLoop::clone_loop_predicates(old_entry, new_entry, true, NULL, this);
+Node* PhaseIterGVN::move_loop_predicates(Node* old_entry, Node* new_entry, bool clone_limit_check) {
+  return PhaseIdealLoop::clone_loop_predicates(old_entry, new_entry, true, clone_limit_check, NULL, this);
 }
 
 // Interface from PhaseIdealLoop
-Node* PhaseIdealLoop::clone_loop_predicates(Node* old_entry, Node* new_entry) {
-  return clone_loop_predicates(old_entry, new_entry, false, this, &this->_igvn);
+Node* PhaseIdealLoop::clone_loop_predicates(Node* old_entry, Node* new_entry, bool clone_limit_check) {
+  return clone_loop_predicates(old_entry, new_entry, false, clone_limit_check, this, &this->_igvn);
 }
-Node* PhaseIdealLoop::move_loop_predicates(Node* old_entry, Node* new_entry) {
-  return clone_loop_predicates(old_entry, new_entry, true, this, &this->_igvn);
+Node* PhaseIdealLoop::move_loop_predicates(Node* old_entry, Node* new_entry, bool clone_limit_check) {
+  return clone_loop_predicates(old_entry, new_entry, true, clone_limit_check, this, &this->_igvn);
 }
 
 // Clone loop predicates to cloned loops (peeled, unswitched, split_if).
 Node* PhaseIdealLoop::clone_loop_predicates(Node* old_entry, Node* new_entry,
                                                 bool move_predicates,
+                                                bool clone_limit_check,
                                                 PhaseIdealLoop* loop_phase,
                                                 PhaseIterGVN* igvn) {
 #ifdef ASSERT
@@ -413,10 +414,16 @@
 #endif
   // Search original predicates
   Node* entry = old_entry;
+  ProjNode* limit_check_proj = NULL;
+  if (LoopLimitCheck) {
+    limit_check_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+    if (limit_check_proj != NULL) {
+      entry = entry->in(0)->in(0);
+    }
+  }
   if (UseLoopPredicate) {
     ProjNode* predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
     if (predicate_proj != NULL) { // right pattern that can be used by loop predication
-      assert(entry->in(0)->in(1)->in(1)->Opcode()==Op_Opaque1, "must be");
       if (move_predicates) {
         new_entry =  move_predicate(predicate_proj, new_entry,
                                     Deoptimization::Reason_predicate,
@@ -435,11 +442,37 @@
       }
     }
   }
+  if (limit_check_proj != NULL && clone_limit_check) {
+    // Clone loop limit check last to insert it before loop.
+    // Don't clone a limit check which was already finalized
+    // for this counted loop (only one limit check is needed).
+    if (move_predicates) {
+      new_entry =  move_predicate(limit_check_proj, new_entry,
+                                  Deoptimization::Reason_loop_limit_check,
+                                  loop_phase, igvn);
+      assert(new_entry == limit_check_proj, "old limit check fall through projection");
+    } else {
+      new_entry = clone_predicate(limit_check_proj, new_entry,
+                                  Deoptimization::Reason_loop_limit_check,
+                                  loop_phase, igvn);
+      assert(new_entry != NULL && new_entry->is_Proj(), "IfTrue or IfFalse after clone limit check");
+    }
+    if (TraceLoopLimitCheck) {
+      tty->print_cr("Loop Limit Check %s: ", move_predicates ? "moved" : "cloned");
+      debug_only( new_entry->in(0)->dump(); )
+    }
+  }
   return new_entry;
 }
 
 //--------------------------eliminate_loop_predicates-----------------------
 void PhaseIdealLoop::eliminate_loop_predicates(Node* entry) {
+  if (LoopLimitCheck) {
+    Node* predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+    if (predicate != NULL) {
+      entry = entry->in(0)->in(0);
+    }
+  }
   if (UseLoopPredicate) {
     ProjNode* predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
     if (predicate_proj != NULL) { // right pattern that can be used by loop predication
@@ -456,10 +489,15 @@
 // Skip related predicates.
 Node* PhaseIdealLoop::skip_loop_predicates(Node* entry) {
   Node* predicate = NULL;
+  if (LoopLimitCheck) {
+    predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+    if (predicate != NULL) {
+      entry = entry->in(0)->in(0);
+    }
+  }
   if (UseLoopPredicate) {
     predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
     if (predicate != NULL) { // right pattern that can be used by loop predication
-      assert(entry->is_Proj() && entry->in(0)->in(1)->in(1)->Opcode()==Op_Opaque1, "must be");
       IfNode* iff = entry->in(0)->as_If();
       ProjNode* uncommon_proj = iff->proj_out(1 - entry->as_Proj()->_con);
       Node* rgn = uncommon_proj->unique_ctrl_out();
@@ -491,10 +529,15 @@
 // Find a predicate
 Node* PhaseIdealLoop::find_predicate(Node* entry) {
   Node* predicate = NULL;
+  if (LoopLimitCheck) {
+    predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+    if (predicate != NULL) { // right pattern that can be used by loop predication
+      return entry;
+    }
+  }
   if (UseLoopPredicate) {
     predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
     if (predicate != NULL) { // right pattern that can be used by loop predication
-      assert(entry->in(0)->in(1)->in(1)->Opcode()==Op_Opaque1, "must be");
       return entry;
     }
   }
@@ -658,7 +701,7 @@
   Node* range = cmp->in(2);
   if (range->Opcode() != Op_LoadRange) {
     const TypeInt* tint = phase->_igvn.type(range)->isa_int();
-    if (!OptimizeFill || tint == NULL || tint->empty() || tint->_lo < 0) {
+    if (tint == NULL || tint->empty() || tint->_lo < 0) {
       // Allow predication on positive values that aren't LoadRanges.
       // This allows optimization of loops where the length of the
       // array is a known value and doesn't need to be loaded back
@@ -696,36 +739,49 @@
 //   max(scale*i + offset) = scale*(limit-stride) + offset
 // (2) stride*scale < 0
 //   max(scale*i + offset) = scale*init + offset
-BoolNode* PhaseIdealLoop::rc_predicate(Node* ctrl,
+BoolNode* PhaseIdealLoop::rc_predicate(IdealLoopTree *loop, Node* ctrl,
                                        int scale, Node* offset,
                                        Node* init, Node* limit, Node* stride,
                                        Node* range, bool upper) {
-  DEBUG_ONLY(ttyLocker ttyl);
-  if (TraceLoopPredicate) tty->print("rc_predicate ");
+  stringStream* predString = NULL;
+  if (TraceLoopPredicate) {
+    predString = new stringStream();
+    predString->print("rc_predicate ");
+  }
 
   Node* max_idx_expr  = init;
   int stride_con = stride->get_int();
   if ((stride_con > 0) == (scale > 0) == upper) {
-    max_idx_expr = new (C, 3) SubINode(limit, stride);
-    register_new_node(max_idx_expr, ctrl);
-    if (TraceLoopPredicate) tty->print("(limit - stride) ");
+    if (LoopLimitCheck) {
+      // With LoopLimitCheck limit is not exact.
+      // Calculate exact limit here.
+      // Note, counted loop's test is '<' or '>'.
+      limit = exact_limit(loop);
+      max_idx_expr = new (C, 3) SubINode(limit, stride);
+      register_new_node(max_idx_expr, ctrl);
+      if (TraceLoopPredicate) predString->print("(limit - stride) ");
+    } else {
+      max_idx_expr = new (C, 3) SubINode(limit, stride);
+      register_new_node(max_idx_expr, ctrl);
+      if (TraceLoopPredicate) predString->print("(limit - stride) ");
+    }
   } else {
-    if (TraceLoopPredicate) tty->print("init ");
+    if (TraceLoopPredicate) predString->print("init ");
   }
 
   if (scale != 1) {
     ConNode* con_scale = _igvn.intcon(scale);
     max_idx_expr = new (C, 3) MulINode(max_idx_expr, con_scale);
     register_new_node(max_idx_expr, ctrl);
-    if (TraceLoopPredicate) tty->print("* %d ", scale);
+    if (TraceLoopPredicate) predString->print("* %d ", scale);
   }
 
   if (offset && (!offset->is_Con() || offset->get_int() != 0)){
     max_idx_expr = new (C, 3) AddINode(max_idx_expr, offset);
     register_new_node(max_idx_expr, ctrl);
     if (TraceLoopPredicate)
-      if (offset->is_Con()) tty->print("+ %d ", offset->get_int());
-      else tty->print("+ offset ");
+      if (offset->is_Con()) predString->print("+ %d ", offset->get_int());
+      else predString->print("+ offset ");
   }
 
   CmpUNode* cmp = new (C, 3) CmpUNode(max_idx_expr, range);
@@ -733,7 +789,10 @@
   BoolNode* bol = new (C, 2) BoolNode(cmp, BoolTest::lt);
   register_new_node(bol, ctrl);
 
-  if (TraceLoopPredicate) tty->print_cr("<u range");
+  if (TraceLoopPredicate) {
+    predString->print_cr("<u range");
+    tty->print(predString->as_string());
+  }
   return bol;
 }
 
@@ -746,29 +805,36 @@
     // Could be a simple region when irreducible loops are present.
     return false;
   }
+  LoopNode* head = loop->_head->as_Loop();
 
-  if (loop->_head->unique_ctrl_out()->Opcode() == Op_NeverBranch) {
+  if (head->unique_ctrl_out()->Opcode() == Op_NeverBranch) {
     // do nothing for infinite loops
     return false;
   }
 
   CountedLoopNode *cl = NULL;
-  if (loop->_head->is_CountedLoop()) {
-    cl = loop->_head->as_CountedLoop();
+  if (head->is_CountedLoop()) {
+    cl = head->as_CountedLoop();
     // do nothing for iteration-splitted loops
     if (!cl->is_normal_loop()) return false;
   }
 
-  LoopNode *lpn  = loop->_head->as_Loop();
-  Node* entry = lpn->in(LoopNode::EntryControl);
+  Node* entry = head->in(LoopNode::EntryControl);
+  ProjNode *predicate_proj = NULL;
+  // Loop limit check predicate should be near the loop.
+  if (LoopLimitCheck) {
+    predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+    if (predicate_proj != NULL)
+      entry = predicate_proj->in(0)->in(0);
+  }
 
-  ProjNode *predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
+  predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
   if (!predicate_proj) {
 #ifndef PRODUCT
     if (TraceLoopPredicate) {
       tty->print("missing predicate:");
       loop->dump_head();
-      lpn->dump(1);
+      head->dump(1);
     }
 #endif
     return false;
@@ -782,7 +848,6 @@
   // Create list of if-projs such that a newer proj dominates all older
   // projs in the list, and they all dominate loop->tail()
   Node_List if_proj_list(area);
-  LoopNode *head  = loop->_head->as_Loop();
   Node *current_proj = loop->tail(); //start from tail
   while (current_proj != head) {
     if (loop == get_loop(current_proj) && // still in the loop ?
@@ -856,8 +921,8 @@
       const Node*    cmp    = bol->in(1)->as_Cmp();
       Node*          idx    = cmp->in(1);
       assert(!invar.is_invariant(idx), "index is variant");
-      assert(cmp->in(2)->Opcode() == Op_LoadRange || OptimizeFill, "must be");
       Node* rng = cmp->in(2);
+      assert(rng->Opcode() == Op_LoadRange || _igvn.type(rng)->is_int() >= 0, "must be");
       assert(invar.is_invariant(rng), "range must be invariant");
       int scale    = 1;
       Node* offset = zero;
@@ -886,14 +951,14 @@
       }
 
       // Test the lower bound
-      Node*  lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, false);
+      Node*  lower_bound_bol = rc_predicate(loop, ctrl, scale, offset, init, limit, stride, rng, false);
       IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If();
       _igvn.hash_delete(lower_bound_iff);
       lower_bound_iff->set_req(1, lower_bound_bol);
       if (TraceLoopPredicate) tty->print_cr("lower bound check if: %d", lower_bound_iff->_idx);
 
       // Test the upper bound
-      Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, true);
+      Node* upper_bound_bol = rc_predicate(loop, ctrl, scale, offset, init, limit, stride, rng, true);
       IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If();
       _igvn.hash_delete(upper_bound_iff);
       upper_bound_iff->set_req(1, upper_bound_bol);
@@ -957,4 +1022,3 @@
 
   return hoisted;
 }
-
--- a/src/share/vm/opto/loopTransform.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/loopTransform.cpp	Tue May 24 11:09:39 2011 -0700
@@ -83,7 +83,7 @@
 #ifdef ASSERT
   BoolTest::mask bt = cl->loopexit()->test_trip();
   assert(bt == BoolTest::lt || bt == BoolTest::gt ||
-         bt == BoolTest::ne, "canonical test is expected");
+         (bt == BoolTest::ne && !LoopLimitCheck), "canonical test is expected");
 #endif
 
   Node* init_n = cl->init_trip();
@@ -510,7 +510,7 @@
   //         the pre-loop with only 1 user (the new peeled iteration), but the
   //         peeled-loop backedge has 2 users.
   Node* new_exit_value = old_new[head->in(LoopNode::LoopBackControl)->_idx];
-  new_exit_value = move_loop_predicates(entry, new_exit_value);
+  new_exit_value = move_loop_predicates(entry, new_exit_value, !counted_loop);
   _igvn.hash_delete(head);
   head->set_req(LoopNode::EntryControl, new_exit_value);
   for (DUIterator_Fast jmax, j = head->fast_outs(jmax); j < jmax; j++) {
@@ -593,6 +593,12 @@
     return false;
   }
 
+  // Fully unroll a loop with few iterations regardless next
+  // conditions since following loop optimizations will split
+  // such loop anyway (pre-main-post).
+  if (trip_count <= 3)
+    return true;
+
   // Take into account that after unroll conjoined heads and tails will fold,
   // otherwise policy_unroll() may allow more unrolling than max unrolling.
   uint new_body_size = EMPTY_LOOP_SIZE + (body_size - EMPTY_LOOP_SIZE) * trip_count;
@@ -605,15 +611,6 @@
     return false;
   }
 
-  // Currently we don't have policy to optimize one iteration loops.
-  // Maximally unrolling transformation is used for that:
-  // it is peeled and the original loop become non reachable (dead).
-  // Also fully unroll a loop with few iterations regardless next
-  // conditions since following loop optimizations will split
-  // such loop anyway (pre-main-post).
-  if (trip_count <= 3)
-    return true;
-
   // Do not unroll a loop with String intrinsics code.
   // String intrinsics are large and have loops.
   for (uint k = 0; k < _body.size(); k++) {
@@ -632,6 +629,8 @@
 }
 
 
+#define MAX_UNROLL 16 // maximum number of unrolls for main loop
+
 //------------------------------policy_unroll----------------------------------
 // Return TRUE or FALSE if the loop should be unrolled or not.  Unroll if
 // the loop is a CountedLoop and the body is small enough.
@@ -643,13 +642,15 @@
   if (!cl->is_valid_counted_loop())
     return false; // Malformed counted loop
 
-  // protect against over-unrolling
-  if (cl->trip_count() <= 1) return false;
-
-  // Check for stride being a small enough constant
-  if (abs(cl->stride_con()) > (1<<3)) return false;
+  // Protect against over-unrolling.
+  // After split at least one iteration will be executed in pre-loop.
+  if (cl->trip_count() <= (uint)(cl->is_normal_loop() ? 2 : 1)) return false;
 
   int future_unroll_ct = cl->unrolled_count() * 2;
+  if (future_unroll_ct > MAX_UNROLL) return false;
+
+  // Check for initial stride being a small enough constant
+  if (abs(cl->stride_con()) > (1<<2)*future_unroll_ct) return false;
 
   // Don't unroll if the next round of unrolling would push us
   // over the expected trip count of the loop.  One is subtracted
@@ -675,6 +676,7 @@
 
   Node *init_n = cl->init_trip();
   Node *limit_n = cl->limit();
+  int stride_con = cl->stride_con();
   // Non-constant bounds.
   // Protect against over-unrolling when init or/and limit are not constant
   // (so that trip_count's init value is maxint) but iv range is known.
@@ -684,7 +686,7 @@
     if (phi != NULL) {
       assert(phi->is_Phi() && phi->in(0) == _head, "Counted loop should have iv phi.");
       const TypeInt* iv_type = phase->_igvn.type(phi)->is_int();
-      int next_stride = cl->stride_con() * 2; // stride after this unroll
+      int next_stride = stride_con * 2; // stride after this unroll
       if (next_stride > 0) {
         if (iv_type->_lo + next_stride <= iv_type->_lo || // overflow
             iv_type->_lo + next_stride >  iv_type->_hi) {
@@ -699,15 +701,19 @@
     }
   }
 
+  // After unroll limit will be adjusted: new_limit = limit-stride.
+  // Bailout if adjustment overflow.
+  const TypeInt* limit_type = phase->_igvn.type(limit_n)->is_int();
+  if (stride_con > 0 && ((limit_type->_hi - stride_con) >= limit_type->_hi) ||
+      stride_con < 0 && ((limit_type->_lo - stride_con) <= limit_type->_lo))
+    return false;  // overflow
+
   // Adjust body_size to determine if we unroll or not
   uint body_size = _body.size();
-  // Key test to unroll CaffeineMark's Logic test
-  int xors_in_loop = 0;
   // Also count ModL, DivL and MulL which expand mightly
   for (uint k = 0; k < _body.size(); k++) {
     Node* n = _body.at(k);
     switch (n->Opcode()) {
-      case Op_XorI: xors_in_loop++; break; // CaffeineMark's Logic test
       case Op_ModL: body_size += 30; break;
       case Op_DivL: body_size += 30; break;
       case Op_MulL: body_size += 10; break;
@@ -724,8 +730,7 @@
 
   // Check for being too big
   if (body_size > (uint)LoopUnrollLimit) {
-    if (xors_in_loop >= 4 && body_size < (uint)LoopUnrollLimit*4) return true;
-    // Normal case: loop too big
+     // Normal case: loop too big
     return false;
   }
 
@@ -747,28 +752,31 @@
 // Return TRUE or FALSE if the loop should be range-check-eliminated.
 // Actually we do iteration-splitting, a more powerful form of RCE.
 bool IdealLoopTree::policy_range_check( PhaseIdealLoop *phase ) const {
-  if( !RangeCheckElimination ) return false;
+  if (!RangeCheckElimination) return false;
 
   CountedLoopNode *cl = _head->as_CountedLoop();
   // If we unrolled with no intention of doing RCE and we later
   // changed our minds, we got no pre-loop.  Either we need to
   // make a new pre-loop, or we gotta disallow RCE.
-  if( cl->is_main_no_pre_loop() ) return false; // Disallowed for now.
+  if (cl->is_main_no_pre_loop()) return false; // Disallowed for now.
   Node *trip_counter = cl->phi();
 
   // Check loop body for tests of trip-counter plus loop-invariant vs
   // loop-invariant.
-  for( uint i = 0; i < _body.size(); i++ ) {
+  for (uint i = 0; i < _body.size(); i++) {
     Node *iff = _body[i];
-    if( iff->Opcode() == Op_If ) { // Test?
+    if (iff->Opcode() == Op_If) { // Test?
 
       // Comparing trip+off vs limit
       Node *bol = iff->in(1);
-      if( bol->req() != 2 ) continue; // dead constant test
+      if (bol->req() != 2) continue; // dead constant test
       if (!bol->is_Bool()) {
         assert(UseLoopPredicate && bol->Opcode() == Op_Conv2B, "predicate check only");
         continue;
       }
+      if (bol->as_Bool()->_test._test == BoolTest::ne)
+        continue; // not RC
+
       Node *cmp = bol->in(1);
 
       Node *rc_exp = cmp->in(1);
@@ -1064,6 +1072,7 @@
   // negative stride use >
 
   if (pre_end->in(CountedLoopEndNode::TestValue)->as_Bool()->_test._test == BoolTest::ne) {
+    assert(!LoopLimitCheck, "only canonical tests (lt or gt) are expected");
 
     BoolTest::mask new_test = (main_end->stride_con() > 0) ? BoolTest::lt : BoolTest::gt;
     // Modify pre loop end condition
@@ -1090,6 +1099,9 @@
   main_head->set_main_loop();
   if( peel_only ) main_head->set_main_no_pre_loop();
 
+  // Subtract a trip count for the pre-loop.
+  main_head->set_trip_count(main_head->trip_count() - 1);
+
   // It's difficult to be precise about the trip-counts
   // for the pre/post loops.  They are usually very short,
   // so guess that 4 trips is a reasonable value.
@@ -1123,9 +1135,9 @@
     loop->dump_head();
   } else if (TraceLoopOpts) {
     if (loop_head->trip_count() < (uint)LoopUnrollLimit) {
-      tty->print("Unroll  %d(%2d) ", loop_head->unrolled_count()*2, loop_head->trip_count());
+      tty->print("Unroll %d(%2d) ", loop_head->unrolled_count()*2, loop_head->trip_count());
     } else {
-      tty->print("Unroll  %d     ", loop_head->unrolled_count()*2);
+      tty->print("Unroll %d     ", loop_head->unrolled_count()*2);
     }
     loop->dump_head();
   }
@@ -1141,7 +1153,8 @@
   Node *stride = loop_head->stride();
 
   Node *opaq = NULL;
-  if( adjust_min_trip ) {       // If not maximally unrolling, need adjustment
+  if (adjust_min_trip) {       // If not maximally unrolling, need adjustment
+    // Search for zero-trip guard.
     assert( loop_head->is_main_loop(), "" );
     assert( ctrl->Opcode() == Op_IfTrue || ctrl->Opcode() == Op_IfFalse, "" );
     Node *iff = ctrl->in(0);
@@ -1151,63 +1164,210 @@
     Node *cmp = bol->in(1);
     assert( cmp->Opcode() == Op_CmpI, "" );
     opaq = cmp->in(2);
-    // Occasionally it's possible for a pre-loop Opaque1 node to be
+    // Occasionally it's possible for a zero-trip guard Opaque1 node to be
     // optimized away and then another round of loop opts attempted.
     // We can not optimize this particular loop in that case.
-    if( opaq->Opcode() != Op_Opaque1 )
-      return;                   // Cannot find pre-loop!  Bail out!
+    if (opaq->Opcode() != Op_Opaque1)
+      return; // Cannot find zero-trip guard!  Bail out!
+    // Zero-trip test uses an 'opaque' node which is not shared.
+    assert(opaq->outcnt() == 1 && opaq->in(1) == limit, "");
   }
 
   C->set_major_progress();
 
-  // Adjust max trip count. The trip count is intentionally rounded
-  // down here (e.g. 15-> 7-> 3-> 1) because if we unwittingly over-unroll,
-  // the main, unrolled, part of the loop will never execute as it is protected
-  // by the min-trip test.  See bug 4834191 for a case where we over-unrolled
-  // and later determined that part of the unrolled loop was dead.
-  loop_head->set_trip_count(loop_head->trip_count() / 2);
+  Node* new_limit = NULL;
+  if (UnrollLimitCheck) {
+    int stride_con = stride->get_int();
+    int stride_p = (stride_con > 0) ? stride_con : -stride_con;
+    uint old_trip_count = loop_head->trip_count();
+    // Verify that unroll policy result is still valid.
+    assert(old_trip_count > 1 &&
+           (!adjust_min_trip || stride_p <= (1<<3)*loop_head->unrolled_count()), "sanity");
 
-  // Double the count of original iterations in the unrolled loop body.
-  loop_head->double_unrolled_count();
+    // Adjust loop limit to keep valid iterations number after unroll.
+    // Use (limit - stride) instead of (((limit - init)/stride) & (-2))*stride
+    // which may overflow.
+    if (!adjust_min_trip) {
+      assert(old_trip_count > 1 && (old_trip_count & 1) == 0,
+             "odd trip count for maximally unroll");
+      // Don't need to adjust limit for maximally unroll since trip count is even.
+    } else if (loop_head->has_exact_trip_count() && init->is_Con()) {
+      // Loop's limit is constant. Loop's init could be constant when pre-loop
+      // become peeled iteration.
+      long init_con = init->get_int();
+      // We can keep old loop limit if iterations count stays the same:
+      //   old_trip_count == new_trip_count * 2
+      // Note: since old_trip_count >= 2 then new_trip_count >= 1
+      // so we also don't need to adjust zero trip test.
+      long limit_con  = limit->get_int();
+      // (stride_con*2) not overflow since stride_con <= 8.
+      int new_stride_con = stride_con * 2;
+      int stride_m    = new_stride_con - (stride_con > 0 ? 1 : -1);
+      long trip_count = (limit_con - init_con + stride_m)/new_stride_con;
+      // New trip count should satisfy next conditions.
+      assert(trip_count > 0 && (julong)trip_count < (julong)max_juint/2, "sanity");
+      uint new_trip_count = (uint)trip_count;
+      adjust_min_trip = (old_trip_count != new_trip_count*2);
+    }
 
-  // -----------
-  // Step 2: Cut back the trip counter for an unroll amount of 2.
-  // Loop will normally trip (limit - init)/stride_con.  Since it's a
-  // CountedLoop this is exact (stride divides limit-init exactly).
-  // We are going to double the loop body, so we want to knock off any
-  // odd iteration: (trip_cnt & ~1).  Then back compute a new limit.
-  Node *span = new (C, 3) SubINode( limit, init );
-  register_new_node( span, ctrl );
-  Node *trip = new (C, 3) DivINode( 0, span, stride );
-  register_new_node( trip, ctrl );
-  Node *mtwo = _igvn.intcon(-2);
-  set_ctrl(mtwo, C->root());
-  Node *rond = new (C, 3) AndINode( trip, mtwo );
-  register_new_node( rond, ctrl );
-  Node *spn2 = new (C, 3) MulINode( rond, stride );
-  register_new_node( spn2, ctrl );
-  Node *lim2 = new (C, 3) AddINode( spn2, init );
-  register_new_node( lim2, ctrl );
+    if (adjust_min_trip) {
+      // Step 2: Adjust the trip limit if it is called for.
+      // The adjustment amount is -stride. Need to make sure if the
+      // adjustment underflows or overflows, then the main loop is skipped.
+      Node* cmp = loop_end->cmp_node();
+      assert(cmp->in(2) == limit, "sanity");
+      assert(opaq != NULL && opaq->in(1) == limit, "sanity");
 
-  // Hammer in the new limit
-  Node *ctrl2 = loop_end->in(0);
-  Node *cmp2 = new (C, 3) CmpINode( loop_head->incr(), lim2 );
-  register_new_node( cmp2, ctrl2 );
-  Node *bol2 = new (C, 2) BoolNode( cmp2, loop_end->test_trip() );
-  register_new_node( bol2, ctrl2 );
-  _igvn.hash_delete(loop_end);
-  loop_end->set_req(CountedLoopEndNode::TestValue, bol2);
+      // Verify that policy_unroll result is still valid.
+      const TypeInt* limit_type = _igvn.type(limit)->is_int();
+      assert(stride_con > 0 && ((limit_type->_hi - stride_con) < limit_type->_hi) ||
+             stride_con < 0 && ((limit_type->_lo - stride_con) > limit_type->_lo), "sanity");
 
-  // Step 3: Find the min-trip test guaranteed before a 'main' loop.
-  // Make it a 1-trip test (means at least 2 trips).
-  if( adjust_min_trip ) {
-    // Guard test uses an 'opaque' node which is not shared.  Hence I
-    // can edit it's inputs directly.  Hammer in the new limit for the
-    // minimum-trip guard.
-    assert( opaq->outcnt() == 1, "" );
-    _igvn.hash_delete(opaq);
-    opaq->set_req(1, lim2);
-  }
+      if (limit->is_Con()) {
+        // The check in policy_unroll and the assert above guarantee
+        // no underflow if limit is constant.
+        new_limit = _igvn.intcon(limit->get_int() - stride_con);
+        set_ctrl(new_limit, C->root());
+      } else {
+        // Limit is not constant.
+        if (loop_head->unrolled_count() == 1) { // only for first unroll
+          // Separate limit by Opaque node in case it is an incremented
+          // variable from previous loop to avoid using pre-incremented
+          // value which could increase register pressure.
+          // Otherwise reorg_offsets() optimization will create a separate
+          // Opaque node for each use of trip-counter and as result
+          // zero trip guard limit will be different from loop limit.
+          assert(has_ctrl(opaq), "should have it");
+          Node* opaq_ctrl = get_ctrl(opaq);
+          limit = new (C, 2) Opaque2Node( C, limit );
+          register_new_node( limit, opaq_ctrl );
+        }
+        if (stride_con > 0 && ((limit_type->_lo - stride_con) < limit_type->_lo) ||
+                   stride_con < 0 && ((limit_type->_hi - stride_con) > limit_type->_hi)) {
+          // No underflow.
+          new_limit = new (C, 3) SubINode(limit, stride);
+        } else {
+          // (limit - stride) may underflow.
+          // Clamp the adjustment value with MININT or MAXINT:
+          //
+          //   new_limit = limit-stride
+          //   if (stride > 0)
+          //     new_limit = (limit < new_limit) ? MININT : new_limit;
+          //   else
+          //     new_limit = (limit > new_limit) ? MAXINT : new_limit;
+          //
+          BoolTest::mask bt = loop_end->test_trip();
+          assert(bt == BoolTest::lt || bt == BoolTest::gt, "canonical test is expected");
+          Node* adj_max = _igvn.intcon((stride_con > 0) ? min_jint : max_jint);
+          set_ctrl(adj_max, C->root());
+          Node* old_limit = NULL;
+          Node* adj_limit = NULL;
+          Node* bol = limit->is_CMove() ? limit->in(CMoveNode::Condition) : NULL;
+          if (loop_head->unrolled_count() > 1 &&
+              limit->is_CMove() && limit->Opcode() == Op_CMoveI &&
+              limit->in(CMoveNode::IfTrue) == adj_max &&
+              bol->as_Bool()->_test._test == bt &&
+              bol->in(1)->Opcode() == Op_CmpI &&
+              bol->in(1)->in(2) == limit->in(CMoveNode::IfFalse)) {
+            // Loop was unrolled before.
+            // Optimize the limit to avoid nested CMove:
+            // use original limit as old limit.
+            old_limit = bol->in(1)->in(1);
+            // Adjust previous adjusted limit.
+            adj_limit = limit->in(CMoveNode::IfFalse);
+            adj_limit = new (C, 3) SubINode(adj_limit, stride);
+          } else {
+            old_limit = limit;
+            adj_limit = new (C, 3) SubINode(limit, stride);
+          }
+          assert(old_limit != NULL && adj_limit != NULL, "");
+          register_new_node( adj_limit, ctrl ); // adjust amount
+          Node* adj_cmp = new (C, 3) CmpINode(old_limit, adj_limit);
+          register_new_node( adj_cmp, ctrl );
+          Node* adj_bool = new (C, 2) BoolNode(adj_cmp, bt);
+          register_new_node( adj_bool, ctrl );
+          new_limit = new (C, 4) CMoveINode(adj_bool, adj_limit, adj_max, TypeInt::INT);
+        }
+        register_new_node(new_limit, ctrl);
+      }
+      assert(new_limit != NULL, "");
+      // Replace in loop test.
+      _igvn.hash_delete(cmp);
+      cmp->set_req(2, new_limit);
+
+      // Step 3: Find the min-trip test guaranteed before a 'main' loop.
+      // Make it a 1-trip test (means at least 2 trips).
+
+      // Guard test uses an 'opaque' node which is not shared.  Hence I
+      // can edit it's inputs directly.  Hammer in the new limit for the
+      // minimum-trip guard.
+      assert(opaq->outcnt() == 1, "");
+      _igvn.hash_delete(opaq);
+      opaq->set_req(1, new_limit);
+    }
+
+    // Adjust max trip count. The trip count is intentionally rounded
+    // down here (e.g. 15-> 7-> 3-> 1) because if we unwittingly over-unroll,
+    // the main, unrolled, part of the loop will never execute as it is protected
+    // by the min-trip test.  See bug 4834191 for a case where we over-unrolled
+    // and later determined that part of the unrolled loop was dead.
+    loop_head->set_trip_count(old_trip_count / 2);
+
+    // Double the count of original iterations in the unrolled loop body.
+    loop_head->double_unrolled_count();
+
+  } else { // LoopLimitCheck
+
+    // Adjust max trip count. The trip count is intentionally rounded
+    // down here (e.g. 15-> 7-> 3-> 1) because if we unwittingly over-unroll,
+    // the main, unrolled, part of the loop will never execute as it is protected
+    // by the min-trip test.  See bug 4834191 for a case where we over-unrolled
+    // and later determined that part of the unrolled loop was dead.
+    loop_head->set_trip_count(loop_head->trip_count() / 2);
+
+    // Double the count of original iterations in the unrolled loop body.
+    loop_head->double_unrolled_count();
+
+    // -----------
+    // Step 2: Cut back the trip counter for an unroll amount of 2.
+    // Loop will normally trip (limit - init)/stride_con.  Since it's a
+    // CountedLoop this is exact (stride divides limit-init exactly).
+    // We are going to double the loop body, so we want to knock off any
+    // odd iteration: (trip_cnt & ~1).  Then back compute a new limit.
+    Node *span = new (C, 3) SubINode( limit, init );
+    register_new_node( span, ctrl );
+    Node *trip = new (C, 3) DivINode( 0, span, stride );
+    register_new_node( trip, ctrl );
+    Node *mtwo = _igvn.intcon(-2);
+    set_ctrl(mtwo, C->root());
+    Node *rond = new (C, 3) AndINode( trip, mtwo );
+    register_new_node( rond, ctrl );
+    Node *spn2 = new (C, 3) MulINode( rond, stride );
+    register_new_node( spn2, ctrl );
+    new_limit = new (C, 3) AddINode( spn2, init );
+    register_new_node( new_limit, ctrl );
+
+    // Hammer in the new limit
+    Node *ctrl2 = loop_end->in(0);
+    Node *cmp2 = new (C, 3) CmpINode( loop_head->incr(), new_limit );
+    register_new_node( cmp2, ctrl2 );
+    Node *bol2 = new (C, 2) BoolNode( cmp2, loop_end->test_trip() );
+    register_new_node( bol2, ctrl2 );
+    _igvn.hash_delete(loop_end);
+    loop_end->set_req(CountedLoopEndNode::TestValue, bol2);
+
+    // Step 3: Find the min-trip test guaranteed before a 'main' loop.
+    // Make it a 1-trip test (means at least 2 trips).
+    if( adjust_min_trip ) {
+      assert( new_limit != NULL, "" );
+      // Guard test uses an 'opaque' node which is not shared.  Hence I
+      // can edit it's inputs directly.  Hammer in the new limit for the
+      // minimum-trip guard.
+      assert( opaq->outcnt() == 1, "" );
+      _igvn.hash_delete(opaq);
+      opaq->set_req(1, new_limit);
+    }
+  } // LoopLimitCheck
 
   // ---------
   // Step 4: Clone the loop body.  Move it inside the loop.  This loop body
@@ -1263,6 +1423,7 @@
 
 void PhaseIdealLoop::do_maximally_unroll( IdealLoopTree *loop, Node_List &old_new ) {
   CountedLoopNode *cl = loop->_head->as_CountedLoop();
+  assert(cl->has_exact_trip_count(), "trip count is not exact");
   assert(cl->trip_count() > 0, "");
 #ifndef PRODUCT
   if (TraceLoopOpts) {
@@ -1279,6 +1440,7 @@
   // Now its tripping an even number of times remaining.  Double loop body.
   // Do not adjust pre-guards; they are not needed and do not exist.
   if (cl->trip_count() > 0) {
+    assert((cl->trip_count() & 1) == 0, "missed peeling");
     do_unroll(loop, old_new, false);
   }
 }
@@ -1292,22 +1454,13 @@
 }
 
 //------------------------------add_constraint---------------------------------
-// Constrain the main loop iterations so the condition:
-//    scale_con * I + offset  <  limit
+// Constrain the main loop iterations so the conditions:
+//    low_limit <= scale_con * I + offset  <  upper_limit
 // always holds true.  That is, either increase the number of iterations in
 // the pre-loop or the post-loop until the condition holds true in the main
 // loop.  Stride, scale, offset and limit are all loop invariant.  Further,
 // stride and scale are constants (offset and limit often are).
-void PhaseIdealLoop::add_constraint( int stride_con, int scale_con, Node *offset, Node *limit, Node *pre_ctrl, Node **pre_limit, Node **main_limit ) {
-
-  // Compute "I :: (limit-offset)/scale_con"
-  Node *con = new (C, 3) SubINode( limit, offset );
-  register_new_node( con, pre_ctrl );
-  Node *scale = _igvn.intcon(scale_con);
-  set_ctrl(scale, C->root());
-  Node *X = new (C, 3) DivINode( 0, con, scale );
-  register_new_node( X, pre_ctrl );
-
+void PhaseIdealLoop::add_constraint( int stride_con, int scale_con, Node *offset, Node *low_limit, Node *upper_limit, Node *pre_ctrl, Node **pre_limit, Node **main_limit ) {
   // For positive stride, the pre-loop limit always uses a MAX function
   // and the main loop a MIN function.  For negative stride these are
   // reversed.
@@ -1316,48 +1469,143 @@
   // pre-loop must check for underflow and the post-loop for overflow.
   // Negative stride*scale reverses this; pre-loop checks for overflow and
   // post-loop for underflow.
-  if( stride_con*scale_con > 0 ) {
-    // Compute I < (limit-offset)/scale_con
-    // Adjust main-loop last iteration to be MIN/MAX(main_loop,X)
-    *main_limit = (stride_con > 0)
-      ? (Node*)(new (C, 3) MinINode( *main_limit, X ))
-      : (Node*)(new (C, 3) MaxINode( *main_limit, X ));
-    register_new_node( *main_limit, pre_ctrl );
+  if (stride_con*scale_con > 0) {
+    // The overflow limit: scale*I+offset < upper_limit
+    // For main-loop compute
+    //   ( if (scale > 0) /* and stride > 0 */
+    //       I < (upper_limit-offset)/scale
+    //     else /* scale < 0 and stride < 0 */
+    //       I > (upper_limit-offset)/scale
+    //   )
+    //
+    // (upper_limit-offset) may overflow when offset < 0.
+    // But it is fine since main loop will either have
+    // less iterations or will be skipped in such case.
+    Node *con = new (C, 3) SubINode(upper_limit, offset);
+    register_new_node(con, pre_ctrl);
+    Node *scale = _igvn.intcon(scale_con);
+    set_ctrl(scale, C->root());
+    Node *X = new (C, 3) DivINode(0, con, scale);
+    register_new_node(X, pre_ctrl);
 
-  } else {
-    // Compute (limit-offset)/scale_con + SGN(-scale_con) <= I
-    // Add the negation of the main-loop constraint to the pre-loop.
-    // See footnote [++] below for a derivation of the limit expression.
-    Node *incr = _igvn.intcon(scale_con > 0 ? -1 : 1);
-    set_ctrl(incr, C->root());
-    Node *adj = new (C, 3) AddINode( X, incr );
-    register_new_node( adj, pre_ctrl );
-    *pre_limit = (scale_con > 0)
-      ? (Node*)new (C, 3) MinINode( *pre_limit, adj )
-      : (Node*)new (C, 3) MaxINode( *pre_limit, adj );
-    register_new_node( *pre_limit, pre_ctrl );
+    // Adjust main-loop last iteration
+    Node *loop_limit = *main_limit;
+    loop_limit = (stride_con > 0) // scale > 0
+      ? (Node*)(new (C, 3) MinINode(loop_limit, X))
+      : (Node*)(new (C, 3) MaxINode(loop_limit, X));
+    register_new_node(loop_limit, pre_ctrl);
+    *main_limit = loop_limit;
 
-//   [++] Here's the algebra that justifies the pre-loop limit expression:
-//
-//   NOT( scale_con * I + offset  <  limit )
-//      ==
-//   scale_con * I + offset  >=  limit
-//      ==
-//   SGN(scale_con) * I  >=  (limit-offset)/|scale_con|
-//      ==
-//   (limit-offset)/|scale_con|   <=  I * SGN(scale_con)
-//      ==
-//   (limit-offset)/|scale_con|-1  <  I * SGN(scale_con)
-//      ==
-//   ( if (scale_con > 0) /*common case*/
-//       (limit-offset)/scale_con - 1  <  I
-//     else
-//       (limit-offset)/scale_con + 1  >  I
-//    )
-//   ( if (scale_con > 0) /*common case*/
-//       (limit-offset)/scale_con + SGN(-scale_con)  <  I
-//     else
-//       (limit-offset)/scale_con + SGN(-scale_con)  >  I
+    // The underflow limit: low_limit <= scale*I+offset.
+    // For pre-loop compute
+    //   NOT(scale*I+offset >= low_limit)
+    //   scale*I+offset < low_limit
+    //   ( if (scale > 0) /* and stride > 0 */
+    //       I < (low_limit-offset)/scale
+    //     else /* scale < 0 and stride < 0 */
+    //       I > (low_limit-offset)/scale
+    //   )
+
+    if (low_limit->get_int() == -max_jint) {
+      if (!RangeLimitCheck) return;
+      // We need this guard when scale*pre_limit+offset >= limit
+      // due to underflow so we need execute pre-loop until
+      // scale*I+offset >= min_int. But (low_limit-offset) will
+      // underflow when offset > 0 and X will be > original_limit.
+      // To avoid it we replace offset = offset > 0 ? 0 : offset
+      // and add min(pre_limit, original_limit).
+      Node* shift = _igvn.intcon(31);
+      set_ctrl(shift, C->root());
+      Node *neg_off = new (C, 3) RShiftINode(offset, shift);
+      register_new_node(neg_off, pre_ctrl);
+      offset = new (C, 3) AndINode(offset, neg_off);
+      register_new_node(offset, pre_ctrl);
+    } else {
+      assert(low_limit->get_int() == 0, "wrong low limit for range check");
+      // The only problem we have here when offset == min_int
+      // since (0-min_int) == min_int. It may be fine for scale > 0
+      // but for scale < 0 X will be < original_limit.
+    }
+    con = new (C, 3) SubINode(low_limit, offset);
+    register_new_node(con, pre_ctrl);
+    scale = _igvn.intcon(scale_con);
+    set_ctrl(scale, C->root());
+    X = new (C, 3) DivINode(0, con, scale);
+    register_new_node(X, pre_ctrl);
+
+    // Adjust pre-loop last iteration
+    loop_limit = *pre_limit;
+    loop_limit = (stride_con > 0) // scale > 0
+      ? (Node*)(new (C, 3) MaxINode(loop_limit, X))
+      : (Node*)(new (C, 3) MinINode(loop_limit, X));
+    register_new_node( loop_limit, pre_ctrl );
+    *pre_limit = loop_limit;
+
+  } else { // stride_con*scale_con < 0
+    // For negative stride*scale pre-loop checks for overflow and
+    // post-loop for underflow.
+    //
+    // The underflow limit: low_limit <= scale*I+offset.
+    // For main-loop compute
+    //   scale*I+offset+1 > low_limit
+    //   ( if (scale < 0) /* and stride > 0 */
+    //       I < (low_limit-(offset+1))/scale
+    //     else /* scale < 0 and stride < 0 */
+    //       I > (low_limit-(offset+1))/scale
+    //   )
+
+    if (low_limit->get_int() == -max_jint) {
+      if (!RangeLimitCheck) return;
+    } else {
+      assert(low_limit->get_int() == 0, "wrong low limit for range check");
+    }
+
+    Node *one  = _igvn.intcon(1);
+    set_ctrl(one, C->root());
+    Node *plus_one = new (C, 3) AddINode(offset, one);
+    register_new_node( plus_one, pre_ctrl );
+    Node *con = new (C, 3) SubINode(low_limit, plus_one);
+    register_new_node(con, pre_ctrl);
+    Node *scale = _igvn.intcon(scale_con);
+    set_ctrl(scale, C->root());
+    Node *X = new (C, 3) DivINode(0, con, scale);
+    register_new_node(X, pre_ctrl);
+
+    // Adjust main-loop last iteration
+    Node *loop_limit = *main_limit;
+    loop_limit = (stride_con > 0) // scale < 0
+      ? (Node*)(new (C, 3) MinINode(loop_limit, X))
+      : (Node*)(new (C, 3) MaxINode(loop_limit, X));
+    register_new_node(loop_limit, pre_ctrl);
+    *main_limit = loop_limit;
+
+    // The overflow limit: scale*I+offset < upper_limit
+    // For pre-loop compute
+    //   NOT(scale*I+offset < upper_limit)
+    //   scale*I+offset >= upper_limit
+    //   scale*I+offset+1 > upper_limit
+    //   ( if (scale < 0) /* and stride > 0 */
+    //       I < (upper_limit-(offset+1))/scale
+    //     else /* scale < 0 and stride < 0 */
+    //       I > (upper_limit-(offset+1))/scale
+    //   )
+    plus_one = new (C, 3) AddINode(offset, one);
+    register_new_node( plus_one, pre_ctrl );
+    con = new (C, 3) SubINode(upper_limit, plus_one);
+    register_new_node(con, pre_ctrl);
+    scale = _igvn.intcon(scale_con);
+    set_ctrl(scale, C->root());
+    X = new (C, 3) DivINode(0, con, scale);
+    register_new_node(X, pre_ctrl);
+
+    // Adjust pre-loop last iteration
+    loop_limit = *pre_limit;
+    loop_limit = (stride_con > 0) // scale < 0
+      ? (Node*)(new (C, 3) MaxINode(loop_limit, X))
+      : (Node*)(new (C, 3) MinINode(loop_limit, X));
+    register_new_node( loop_limit, pre_ctrl );
+    *pre_limit = loop_limit;
+
   }
 }
 
@@ -1488,7 +1736,7 @@
   Node *cmpzm = bolzm->in(1);
   assert(cmpzm->is_Cmp(), "");
   Node *opqzm = cmpzm->in(2);
-  // Can not optimize a loop if pre-loop Opaque1 node is optimized
+  // Can not optimize a loop if zero-trip Opaque1 node is optimized
   // away and then another round of loop opts attempted.
   if (opqzm->Opcode() != Op_Opaque1)
     return;
@@ -1523,8 +1771,11 @@
   int stride_con = cl->stride_con();
   Node *zero = _igvn.intcon(0);
   Node *one  = _igvn.intcon(1);
+  // Use symmetrical int range [-max_jint,max_jint]
+  Node *mini = _igvn.intcon(-max_jint);
   set_ctrl(zero, C->root());
   set_ctrl(one,  C->root());
+  set_ctrl(mini, C->root());
 
   // Range checks that do not dominate the loop backedge (ie.
   // conditionally executed) can lengthen the pre loop limit beyond
@@ -1599,7 +1850,12 @@
       if( offset_c == ctrl ) {
         continue; // Don't rce this check but continue looking for other candidates.
       }
-
+#ifdef ASSERT
+      if (TraceRangeLimitCheck) {
+        tty->print_cr("RC bool node%s", flip ? " flipped:" : ":");
+        bol->dump(2);
+      }
+#endif
       // At this point we have the expression as:
       //   scale_con * trip_counter + offset :: limit
       // where scale_con, offset and limit are loop invariant.  Trip_counter
@@ -1610,17 +1866,16 @@
       // Adjust pre and main loop limits to guard the correct iteration set
       if( cmp->Opcode() == Op_CmpU ) {// Unsigned compare is really 2 tests
         if( b_test._test == BoolTest::lt ) { // Range checks always use lt
-          // The overflow limit: scale*I+offset < limit
-          add_constraint( stride_con, scale_con, offset, limit, pre_ctrl, &pre_limit, &main_limit );
-          // The underflow limit: 0 <= scale*I+offset.
-          // Some math yields: -scale*I-(offset+1) < 0
-          Node *plus_one = new (C, 3) AddINode( offset, one );
-          register_new_node( plus_one, pre_ctrl );
-          Node *neg_offset = new (C, 3) SubINode( zero, plus_one );
-          register_new_node( neg_offset, pre_ctrl );
-          add_constraint( stride_con, -scale_con, neg_offset, zero, pre_ctrl, &pre_limit, &main_limit );
+          // The underflow and overflow limits: 0 <= scale*I+offset < limit
+          add_constraint( stride_con, scale_con, offset, zero, limit, pre_ctrl, &pre_limit, &main_limit );
           if (!conditional_rc) {
             conditional_rc = !loop->dominates_backedge(iff);
+            // It is also needed if offset->_lo == min_int since
+            // (0-min_int) == min_int. It may be fine for stride > 0
+            // but for stride < 0 pre_limit will be < original_limit.
+            const TypeInt* offset_t = _igvn.type(offset)->is_int();
+            conditional_rc |= RangeLimitCheck && (offset_t->_lo == min_jint) &&
+                              (scale_con<0) && (stride_con<0);
           }
         } else {
 #ifndef PRODUCT
@@ -1631,21 +1886,35 @@
         }
       } else {                  // Otherwise work on normal compares
         switch( b_test._test ) {
-        case BoolTest::ge:      // Convert X >= Y to -X <= -Y
+        case BoolTest::gt:
+          // Fall into GE case
+        case BoolTest::ge:
+          // Convert (I*scale+offset) >= Limit to (I*(-scale)+(-offset)) <= -Limit
           scale_con = -scale_con;
           offset = new (C, 3) SubINode( zero, offset );
           register_new_node( offset, pre_ctrl );
           limit  = new (C, 3) SubINode( zero, limit  );
           register_new_node( limit, pre_ctrl );
           // Fall into LE case
-        case BoolTest::le:      // Convert X <= Y to X < Y+1
-          limit = new (C, 3) AddINode( limit, one );
-          register_new_node( limit, pre_ctrl );
+        case BoolTest::le:
+          if (b_test._test != BoolTest::gt) {
+            // Convert X <= Y to X < Y+1
+            limit = new (C, 3) AddINode( limit, one );
+            register_new_node( limit, pre_ctrl );
+          }
           // Fall into LT case
         case BoolTest::lt:
-          add_constraint( stride_con, scale_con, offset, limit, pre_ctrl, &pre_limit, &main_limit );
+          // The underflow and overflow limits: MIN_INT <= scale*I+offset < limit
+          add_constraint( stride_con, scale_con, offset, mini, limit, pre_ctrl, &pre_limit, &main_limit );
           if (!conditional_rc) {
             conditional_rc = !loop->dominates_backedge(iff);
+            // It is also needed if scale*pre_limit+offset >= limit
+            // due to underflow so we need execute pre-loop until
+            // scale*I+offset >= min_int. But (low_limit-offset) will
+            // underflow when offset > 0 and X will be > original_limit.
+            const TypeInt* offset_t = _igvn.type(offset)->is_int();
+            conditional_rc |= RangeLimitCheck && (offset_t->_hi > 0) &&
+                              (scale_con>0) && (stride_con>0);
           }
           break;
         default:
@@ -1696,7 +1965,8 @@
 
   // Note:: we are making the main loop limit no longer precise;
   // need to round up based on stride.
-  if( stride_con != 1 && stride_con != -1 ) { // Cutout for common case
+  cl->set_nonexact_trip_count();
+  if (!LoopLimitCheck && stride_con != 1 && stride_con != -1) { // Cutout for common case
     // "Standard" round-up logic:  ([main_limit-init+(y-1)]/y)*y+init
     // Hopefully, compiler will optimize for powers of 2.
     Node *ctrl = get_ctrl(main_limit);
@@ -1876,7 +2146,19 @@
   // iteration.  Then the CountedLoopEnd will collapse (backedge never
   // taken) and all loop-invariant uses of the exit values will be correct.
   Node *phi = cl->phi();
-  Node *final = new (phase->C, 3) SubINode( cl->limit(), cl->stride() );
+  Node *exact_limit = phase->exact_limit(this);
+  if (exact_limit != cl->limit()) {
+    // We also need to replace the original limit to collapse loop exit.
+    Node* cmp = cl->loopexit()->cmp_node();
+    assert(cl->limit() == cmp->in(2), "sanity");
+    phase->_igvn._worklist.push(cmp->in(2)); // put limit on worklist
+    phase->_igvn.hash_delete(cmp);
+    cmp->set_req(2, exact_limit);
+    phase->_igvn._worklist.push(cmp);        // put cmp on worklist
+  }
+  // Note: the final value after increment should not overflow since
+  // counted loop has limit check predicate.
+  Node *final = new (phase->C, 3) SubINode( exact_limit, cl->stride() );
   phase->register_new_node(final,cl->in(LoopNode::EntryControl));
   phase->_igvn.replace_node(phi,final);
   phase->C->set_major_progress();
--- a/src/share/vm/opto/loopUnswitch.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/loopUnswitch.cpp	Tue May 24 11:09:39 2011 -0700
@@ -130,6 +130,11 @@
   Node* uniqc = proj_true->unique_ctrl_out();
   Node* entry = head->in(LoopNode::EntryControl);
   Node* predicate = find_predicate(entry);
+  if (predicate != NULL && LoopLimitCheck && UseLoopPredicate) {
+    // We may have two predicates, find first.
+    entry = find_predicate(entry->in(0)->in(0));
+    if (entry != NULL) predicate = entry;
+  }
   if (predicate != NULL) predicate = predicate->in(0);
   assert(proj_true->is_IfTrue() &&
          (predicate == NULL && uniqc == head ||
@@ -217,6 +222,7 @@
 ProjNode* PhaseIdealLoop::create_slow_version_of_loop(IdealLoopTree *loop,
                                                       Node_List &old_new) {
   LoopNode* head  = loop->_head->as_Loop();
+  bool counted_loop = head->is_CountedLoop();
   Node*     entry = head->in(LoopNode::EntryControl);
   _igvn.hash_delete(entry);
   _igvn._worklist.push(entry);
@@ -242,14 +248,14 @@
   assert(old_new[head->_idx]->is_Loop(), "" );
 
   // Fast (true) control
-  Node* iffast_pred = clone_loop_predicates(entry, iffast);
+  Node* iffast_pred = clone_loop_predicates(entry, iffast, !counted_loop);
   _igvn.hash_delete(head);
   head->set_req(LoopNode::EntryControl, iffast_pred);
   set_idom(head, iffast_pred, dom_depth(head));
   _igvn._worklist.push(head);
 
   // Slow (false) control
-  Node* ifslow_pred = move_loop_predicates(entry, ifslow);
+  Node* ifslow_pred = move_loop_predicates(entry, ifslow, !counted_loop);
   LoopNode* slow_head = old_new[head->_idx]->as_Loop();
   _igvn.hash_delete(slow_head);
   slow_head->set_req(LoopNode::EntryControl, ifslow_pred);
--- a/src/share/vm/opto/loopnode.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/loopnode.cpp	Tue May 24 11:09:39 2011 -0700
@@ -206,7 +206,7 @@
   // Get backedge compare
   Node *cmp = test->in(1);
   int cmp_op = cmp->Opcode();
-  if( cmp_op != Op_CmpI )
+  if (cmp_op != Op_CmpI)
     return false;                // Avoid pointer & float compares
 
   // Find the trip-counter increment & limit.  Limit must be loop invariant.
@@ -259,7 +259,8 @@
   }
   // Stride must be constant
   int stride_con = stride->get_int();
-  assert(stride_con != 0, "missed some peephole opt");
+  if (stride_con == 0)
+    return false; // missed some peephole opt
 
   if (!xphi->is_Phi())
     return false; // Too much math on the trip counter
@@ -319,7 +320,7 @@
       // Count down loop rolls through MAXINT
       (bt == BoolTest::le || bt == BoolTest::lt) && stride_con < 0 ||
       // Count up loop rolls through MININT
-      (bt == BoolTest::ge || bt == BoolTest::gt) && stride_con > 0 ) {
+      (bt == BoolTest::ge || bt == BoolTest::gt) && stride_con > 0) {
     return false; // Bail out
   }
 
@@ -341,12 +342,137 @@
   //
   assert(x->Opcode() == Op_Loop, "regular loops only");
   C->print_method("Before CountedLoop", 3);
+
+  Node *hook = new (C, 6) Node(6);
+
+  if (LoopLimitCheck) {
+
+  // ===================================================
+  // Generate loop limit check to avoid integer overflow
+  // in cases like next (cyclic loops):
+  //
+  // for (i=0; i <= max_jint; i++) {}
+  // for (i=0; i <  max_jint; i+=2) {}
+  //
+  //
+  // Limit check predicate depends on the loop test:
+  //
+  // for(;i != limit; i++)       --> limit <= (max_jint)
+  // for(;i <  limit; i+=stride) --> limit <= (max_jint - stride + 1)
+  // for(;i <= limit; i+=stride) --> limit <= (max_jint - stride    )
+  //
+
+  // Check if limit is excluded to do more precise int overflow check.
+  bool incl_limit = (bt == BoolTest::le || bt == BoolTest::ge);
+  int stride_m  = stride_con - (incl_limit ? 0 : (stride_con > 0 ? 1 : -1));
+
+  // If compare points directly to the phi we need to adjust
+  // the compare so that it points to the incr. Limit have
+  // to be adjusted to keep trip count the same and the
+  // adjusted limit should be checked for int overflow.
+  if (phi_incr != NULL) {
+    stride_m  += stride_con;
+  }
+
+  if (limit->is_Con()) {
+    int limit_con = limit->get_int();
+    if ((stride_con > 0 && limit_con > (max_jint - stride_m)) ||
+        (stride_con < 0 && limit_con < (min_jint - stride_m))) {
+      // Bailout: it could be integer overflow.
+      return false;
+    }
+  } else if ((stride_con > 0 && limit_t->_hi <= (max_jint - stride_m)) ||
+             (stride_con < 0 && limit_t->_lo >= (min_jint - stride_m))) {
+      // Limit's type may satisfy the condition, for example,
+      // when it is an array length.
+  } else {
+    // Generate loop's limit check.
+    // Loop limit check predicate should be near the loop.
+    ProjNode *limit_check_proj = find_predicate_insertion_point(init_control, Deoptimization::Reason_loop_limit_check);
+    if (!limit_check_proj) {
+      // The limit check predicate is not generated if this method trapped here before.
+#ifdef ASSERT
+      if (TraceLoopLimitCheck) {
+        tty->print("missing loop limit check:");
+        loop->dump_head();
+        x->dump(1);
+      }
+#endif
+      return false;
+    }
+
+    IfNode* check_iff = limit_check_proj->in(0)->as_If();
+    Node* cmp_limit;
+    Node* bol;
+
+    if (stride_con > 0) {
+      cmp_limit = new (C, 3) CmpINode(limit, _igvn.intcon(max_jint - stride_m));
+      bol = new (C, 2) BoolNode(cmp_limit, BoolTest::le);
+    } else {
+      cmp_limit = new (C, 3) CmpINode(limit, _igvn.intcon(min_jint - stride_m));
+      bol = new (C, 2) BoolNode(cmp_limit, BoolTest::ge);
+    }
+    cmp_limit = _igvn.register_new_node_with_optimizer(cmp_limit);
+    bol = _igvn.register_new_node_with_optimizer(bol);
+    set_subtree_ctrl(bol);
+
+    // Replace condition in original predicate but preserve Opaque node
+    // so that previous predicates could be found.
+    assert(check_iff->in(1)->Opcode() == Op_Conv2B &&
+           check_iff->in(1)->in(1)->Opcode() == Op_Opaque1, "");
+    Node* opq = check_iff->in(1)->in(1);
+    _igvn.hash_delete(opq);
+    opq->set_req(1, bol);
+    // Update ctrl.
+    set_ctrl(opq, check_iff->in(0));
+    set_ctrl(check_iff->in(1), check_iff->in(0));
+
 #ifndef PRODUCT
-  if (TraceLoopOpts) {
-    tty->print("Counted      ");
-    loop->dump_head();
+    // report that the loop predication has been actually performed
+    // for this loop
+    if (TraceLoopLimitCheck) {
+      tty->print_cr("Counted Loop Limit Check generated:");
+      debug_only( bol->dump(2); )
+    }
+#endif
   }
-#endif
+
+  if (phi_incr != NULL) {
+    // If compare points directly to the phi we need to adjust
+    // the compare so that it points to the incr. Limit have
+    // to be adjusted to keep trip count the same and we
+    // should avoid int overflow.
+    //
+    //   i = init; do {} while(i++ < limit);
+    // is converted to
+    //   i = init; do {} while(++i < limit+1);
+    //
+    limit = gvn->transform(new (C, 3) AddINode(limit, stride));
+  }
+
+  // Now we need to canonicalize loop condition.
+  if (bt == BoolTest::ne) {
+    assert(stride_con == 1 || stride_con == -1, "simple increment only");
+    bt = (stride_con > 0) ? BoolTest::lt : BoolTest::gt;
+  }
+
+  if (incl_limit) {
+    // The limit check guaranties that 'limit <= (max_jint - stride)' so
+    // we can convert 'i <= limit' to 'i < limit+1' since stride != 0.
+    //
+    Node* one = (stride_con > 0) ? gvn->intcon( 1) : gvn->intcon(-1);
+    limit = gvn->transform(new (C, 3) AddINode(limit, one));
+    if (bt == BoolTest::le)
+      bt = BoolTest::lt;
+    else if (bt == BoolTest::ge)
+      bt = BoolTest::gt;
+    else
+      ShouldNotReachHere();
+  }
+  set_subtree_ctrl( limit );
+
+  } else { // LoopLimitCheck
+
   // If compare points to incr, we are ok.  Otherwise the compare
   // can directly point to the phi; in this case adjust the compare so that
   // it points to the incr by adjusting the limit.
@@ -359,7 +485,6 @@
   Node *one_m = gvn->intcon(-1);
 
   Node *trip_count = NULL;
-  Node *hook = new (C, 6) Node(6);
   switch( bt ) {
   case BoolTest::eq:
     ShouldNotReachHere();
@@ -441,6 +566,8 @@
   limit = gvn->transform(new (C, 3) AddINode(span,init_trip));
   set_subtree_ctrl( limit );
 
+  } // LoopLimitCheck
+
   // Check for SafePoint on backedge and remove
   Node *sfpt = x->in(LoopNode::LoopBackControl);
   if (sfpt->Opcode() == Op_SafePoint && is_deleteable_safept(sfpt)) {
@@ -531,7 +658,7 @@
 
   // Check for immediately preceding SafePoint and remove
   Node *sfpt2 = le->in(0);
-  if( sfpt2->Opcode() == Op_SafePoint && is_deleteable_safept(sfpt2))
+  if (sfpt2->Opcode() == Op_SafePoint && is_deleteable_safept(sfpt2))
     lazy_replace( sfpt2, sfpt2->in(TypeFunc::Control));
 
   // Free up intermediate goo
@@ -541,12 +668,56 @@
   assert(l->is_valid_counted_loop(), "counted loop shape is messed up");
   assert(l == loop->_head && l->phi() == phi && l->loopexit() == lex, "" );
 #endif
+#ifndef PRODUCT
+  if (TraceLoopOpts) {
+    tty->print("Counted      ");
+    loop->dump_head();
+  }
+#endif
 
   C->print_method("After CountedLoop", 3);
 
   return true;
 }
 
+//----------------------exact_limit-------------------------------------------
+Node* PhaseIdealLoop::exact_limit( IdealLoopTree *loop ) {
+  assert(loop->_head->is_CountedLoop(), "");
+  CountedLoopNode *cl = loop->_head->as_CountedLoop();
+
+  if (!LoopLimitCheck || ABS(cl->stride_con()) == 1 ||
+      cl->limit()->Opcode() == Op_LoopLimit) {
+    // Old code has exact limit (it could be incorrect in case of int overflow).
+    // Loop limit is exact with stride == 1. And loop may already have exact limit.
+    return cl->limit();
+  }
+  Node *limit = NULL;
+#ifdef ASSERT
+  BoolTest::mask bt = cl->loopexit()->test_trip();
+  assert(bt == BoolTest::lt || bt == BoolTest::gt, "canonical test is expected");
+#endif
+  if (cl->has_exact_trip_count()) {
+    // Simple case: loop has constant boundaries.
+    // Use longs to avoid integer overflow.
+    int stride_con = cl->stride_con();
+    long  init_con = cl->init_trip()->get_int();
+    long limit_con = cl->limit()->get_int();
+    julong trip_cnt = cl->trip_count();
+    long final_con = init_con + trip_cnt*stride_con;
+    final_con -= stride_con;
+    int final_int = (int)final_con;
+    // The final value should be in integer range since the loop
+    // is counted and the limit was checked for overflow.
+    assert(final_con == (long)final_int, "final value should be integer");
+    limit = _igvn.intcon(final_int);
+  } else {
+    // Create new LoopLimit node to get exact limit (final iv value).
+    limit = new (C, 4) LoopLimitNode(C, cl->init_trip(), cl->limit(), cl->stride());
+    register_new_node(limit, cl->in(LoopNode::EntryControl));
+  }
+  assert(limit != NULL, "sanity");
+  return limit;
+}
 
 //------------------------------Ideal------------------------------------------
 // Return a node which is more "ideal" than the current node.
@@ -572,14 +743,12 @@
 #ifndef PRODUCT
 void CountedLoopNode::dump_spec(outputStream *st) const {
   LoopNode::dump_spec(st);
-  if( stride_is_con() ) {
+  if (stride_is_con()) {
     st->print("stride: %d ",stride_con());
-  } else {
-    st->print("stride: not constant ");
   }
-  if( is_pre_loop () ) st->print("pre of N%d" , _main_idx );
-  if( is_main_loop() ) st->print("main of N%d", _idx );
-  if( is_post_loop() ) st->print("post of N%d", _main_idx );
+  if (is_pre_loop ()) st->print("pre of N%d" , _main_idx);
+  if (is_main_loop()) st->print("main of N%d", _idx);
+  if (is_post_loop()) st->print("post of N%d", _main_idx);
 }
 #endif
 
@@ -588,7 +757,130 @@
   return stride()->bottom_type()->is_int()->get_con();
 }
 
-
+//=============================================================================
+//------------------------------Value-----------------------------------------
+const Type *LoopLimitNode::Value( PhaseTransform *phase ) const {
+  const Type* init_t   = phase->type(in(Init));
+  const Type* limit_t  = phase->type(in(Limit));
+  const Type* stride_t = phase->type(in(Stride));
+  // Either input is TOP ==> the result is TOP
+  if (init_t   == Type::TOP) return Type::TOP;
+  if (limit_t  == Type::TOP) return Type::TOP;
+  if (stride_t == Type::TOP) return Type::TOP;
+
+  int stride_con = stride_t->is_int()->get_con();
+  if (stride_con == 1)
+    return NULL;  // Identity
+
+  if (init_t->is_int()->is_con() && limit_t->is_int()->is_con()) {
+    // Use longs to avoid integer overflow.
+    long init_con   =  init_t->is_int()->get_con();
+    long limit_con  = limit_t->is_int()->get_con();
+    int  stride_m   = stride_con - (stride_con > 0 ? 1 : -1);
+    long trip_count = (limit_con - init_con + stride_m)/stride_con;
+    long final_con  = init_con + stride_con*trip_count;
+    int final_int = (int)final_con;
+    // The final value should be in integer range since the loop
+    // is counted and the limit was checked for overflow.
+    assert(final_con == (long)final_int, "final value should be integer");
+    return TypeInt::make(final_int);
+  }
+
+  return bottom_type(); // TypeInt::INT
+}
+
+//------------------------------Ideal------------------------------------------
+// Return a node which is more "ideal" than the current node.
+Node *LoopLimitNode::Ideal(PhaseGVN *phase, bool can_reshape) {
+  if (phase->type(in(Init))   == Type::TOP ||
+      phase->type(in(Limit))  == Type::TOP ||
+      phase->type(in(Stride)) == Type::TOP)
+    return NULL;  // Dead
+
+  int stride_con = phase->type(in(Stride))->is_int()->get_con();
+  if (stride_con == 1)
+    return NULL;  // Identity
+
+  if (in(Init)->is_Con() && in(Limit)->is_Con())
+    return NULL;  // Value
+
+  // Delay following optimizations until all loop optimizations
+  // done to keep Ideal graph simple.
+  if (!can_reshape || phase->C->major_progress())
+    return NULL;
+
+  const TypeInt* init_t  = phase->type(in(Init) )->is_int();
+  const TypeInt* limit_t = phase->type(in(Limit))->is_int();
+  int stride_p;
+  long lim, ini;
+  julong max;
+  if (stride_con > 0) {
+    stride_p = stride_con;
+    lim = limit_t->_hi;
+    ini = init_t->_lo;
+    max = (julong)max_jint;
+  } else {
+    stride_p = -stride_con;
+    lim = init_t->_hi;
+    ini = limit_t->_lo;
+    max = (julong)min_jint;
+  }
+  julong range = lim - ini + stride_p;
+  if (range <= max) {
+    // Convert to integer expression if it is not overflow.
+    Node* stride_m = phase->intcon(stride_con - (stride_con > 0 ? 1 : -1));
+    Node *range = phase->transform(new (phase->C, 3) SubINode(in(Limit), in(Init)));
+    Node *bias  = phase->transform(new (phase->C, 3) AddINode(range, stride_m));
+    Node *trip  = phase->transform(new (phase->C, 3) DivINode(0, bias, in(Stride)));
+    Node *span  = phase->transform(new (phase->C, 3) MulINode(trip, in(Stride)));
+    return new (phase->C, 3) AddINode(span, in(Init)); // exact limit
+  }
+
+  if (is_power_of_2(stride_p) ||                // divisor is 2^n
+      !Matcher::has_match_rule(Op_LoopLimit)) { // or no specialized Mach node?
+    // Convert to long expression to avoid integer overflow
+    // and let igvn optimizer convert this division.
+    //
+    Node*   init   = phase->transform( new (phase->C, 2) ConvI2LNode(in(Init)));
+    Node*  limit   = phase->transform( new (phase->C, 2) ConvI2LNode(in(Limit)));
+    Node* stride   = phase->longcon(stride_con);
+    Node* stride_m = phase->longcon(stride_con - (stride_con > 0 ? 1 : -1));
+
+    Node *range = phase->transform(new (phase->C, 3) SubLNode(limit, init));
+    Node *bias  = phase->transform(new (phase->C, 3) AddLNode(range, stride_m));
+    Node *span;
+    if (stride_con > 0 && is_power_of_2(stride_p)) {
+      // bias >= 0 if stride >0, so if stride is 2^n we can use &(-stride)
+      // and avoid generating rounding for division. Zero trip guard should
+      // guarantee that init < limit but sometimes the guard is missing and
+      // we can get situation when init > limit. Note, for the empty loop
+      // optimization zero trip guard is generated explicitly which leaves
+      // only RCE predicate where exact limit is used and the predicate
+      // will simply fail forcing recompilation.
+      Node* neg_stride   = phase->longcon(-stride_con);
+      span = phase->transform(new (phase->C, 3) AndLNode(bias, neg_stride));
+    } else {
+      Node *trip  = phase->transform(new (phase->C, 3) DivLNode(0, bias, stride));
+      span = phase->transform(new (phase->C, 3) MulLNode(trip, stride));
+    }
+    // Convert back to int
+    Node *span_int = phase->transform(new (phase->C, 2) ConvL2INode(span));
+    return new (phase->C, 3) AddINode(span_int, in(Init)); // exact limit
+  }
+
+  return NULL;    // No progress
+}
+
+//------------------------------Identity---------------------------------------
+// If stride == 1 return limit node.
+Node *LoopLimitNode::Identity( PhaseTransform *phase ) {
+  int stride_con = phase->type(in(Stride))->is_int()->get_con();
+  if (stride_con == 1 || stride_con == -1)
+    return in(Limit);
+  return this;
+}
+
+//=============================================================================
 //----------------------match_incr_with_optional_truncation--------------------
 // Match increment with optional truncation:
 // CHAR: (i+1)&0x7fff, BYTE: ((i+1)<<8)>>8, or SHORT: ((i+1)<<16)>>16
@@ -870,7 +1162,7 @@
   outer = igvn.register_new_node_with_optimizer(outer, _head);
   phase->set_created_loop_node();
 
-  Node* pred = phase->clone_loop_predicates(ctl, outer);
+  Node* pred = phase->clone_loop_predicates(ctl, outer, true);
   // Outermost loop falls into '_head' loop
   _head->set_req(LoopNode::EntryControl, pred);
   _head->del_req(outer_idx);
@@ -1440,9 +1732,16 @@
     tty->print("  ");
   tty->print("Loop: N%d/N%d ",_head->_idx,_tail->_idx);
   if (_irreducible) tty->print(" IRREDUCIBLE");
+  Node* entry = _head->in(LoopNode::EntryControl);
+  if (LoopLimitCheck) {
+    Node* predicate = PhaseIdealLoop::find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+    if (predicate != NULL ) {
+      tty->print(" limit_check");
+      entry = entry->in(0)->in(0);
+    }
+  }
   if (UseLoopPredicate) {
-    Node* entry = PhaseIdealLoop::find_predicate_insertion_point(_head->in(LoopNode::EntryControl),
-                                                                 Deoptimization::Reason_predicate);
+    entry = PhaseIdealLoop::find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
     if (entry != NULL) {
       tty->print(" predicated");
     }
@@ -1528,10 +1827,15 @@
       !loop->tail()->is_top()) {
     LoopNode* lpn = loop->_head->as_Loop();
     Node* entry = lpn->in(LoopNode::EntryControl);
-    Node* predicate_proj = find_predicate(entry);
+    Node* predicate_proj = find_predicate(entry); // loop_limit_check first
     if (predicate_proj != NULL ) { // right pattern that can be used by loop predication
       assert(entry->in(0)->in(1)->in(1)->Opcode() == Op_Opaque1, "must be");
       useful_predicates.push(entry->in(0)->in(1)->in(1)); // good one
+      entry = entry->in(0)->in(0);
+    }
+    predicate_proj = find_predicate(entry); // Predicate
+    if (predicate_proj != NULL ) {
+      useful_predicates.push(entry->in(0)->in(1)->in(1)); // good one
     }
   }
 
@@ -1542,6 +1846,8 @@
 
 //------------------------eliminate_useless_predicates-----------------------------
 // Eliminate all inserted predicates if they could not be used by loop predication.
+// Note: it will also eliminates loop limits check predicate since it also uses
+// Opaque1 node (see Parse::add_predicate()).
 void PhaseIdealLoop::eliminate_useless_predicates() {
   if (C->predicate_count() == 0)
     return; // no predicate left
@@ -1731,7 +2037,7 @@
   // Some parser-inserted loop predicates could never be used by loop
   // predication or they were moved away from loop during some optimizations.
   // For example, peeling. Eliminate them before next loop optimizations.
-  if (UseLoopPredicate) {
+  if (UseLoopPredicate || LoopLimitCheck) {
     eliminate_useless_predicates();
   }
 
--- a/src/share/vm/opto/loopnode.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/loopnode.hpp	Tue May 24 11:09:39 2011 -0700
@@ -289,6 +289,28 @@
 inline Node *CountedLoopNode::incr() const { return loopexit() ? loopexit()->incr() : NULL; }
 inline Node *CountedLoopNode::phi() const { return loopexit() ? loopexit()->phi() : NULL; }
 
+//------------------------------LoopLimitNode-----------------------------
+// Counted Loop limit node which represents exact final iterator value:
+// trip_count = (limit - init_trip + stride - 1)/stride
+// final_value= trip_count * stride + init_trip.
+// Use HW instructions to calculate it when it can overflow in integer.
+// Note, final_value should fit into integer since counted loop has
+// limit check: limit <= max_int-stride.
+class LoopLimitNode : public Node {
+  enum { Init=1, Limit=2, Stride=3 };
+ public:
+  LoopLimitNode( Compile* C, Node *init, Node *limit, Node *stride ) : Node(0,init,limit,stride) {
+    // Put it on the Macro nodes list to optimize during macro nodes expansion.
+    init_flags(Flag_is_macro);
+    C->add_macro_node(this);
+  }
+  virtual int Opcode() const;
+  virtual const Type *bottom_type() const { return TypeInt::INT; }
+  virtual uint ideal_reg() const { return Op_RegI; }
+  virtual const Type *Value( PhaseTransform *phase ) const;
+  virtual Node *Ideal(PhaseGVN *phase, bool can_reshape);
+  virtual Node *Identity( PhaseTransform *phase );
+};
 
 // -----------------------------IdealLoopTree----------------------------------
 class IdealLoopTree : public ResourceObj {
@@ -775,6 +797,8 @@
 
   bool is_counted_loop( Node *x, IdealLoopTree *loop );
 
+  Node* exact_limit( IdealLoopTree *loop );
+
   // Return a post-walked LoopNode
   IdealLoopTree *get_loop( Node *n ) const {
     // Dead nodes have no loop, so return the top level loop instead
@@ -837,7 +861,6 @@
   bool is_scaled_iv_plus_offset(Node* exp, Node* iv, int* p_scale, Node** p_offset, int depth = 0);
 
   // Return true if proj is for "proj->[region->..]call_uct"
-  // Return true if proj is for "proj->[region->..]call_uct"
   static bool is_uncommon_trap_proj(ProjNode* proj, Deoptimization::DeoptReason reason);
   // Return true for    "if(test)-> proj -> ...
   //                          |
@@ -860,10 +883,11 @@
                                    PhaseIterGVN* igvn);
   static Node* clone_loop_predicates(Node* old_entry, Node* new_entry,
                                          bool move_predicates,
+                                         bool clone_limit_check,
                                          PhaseIdealLoop* loop_phase,
                                          PhaseIterGVN* igvn);
-  Node* clone_loop_predicates(Node* old_entry, Node* new_entry);
-  Node*  move_loop_predicates(Node* old_entry, Node* new_entry);
+  Node* clone_loop_predicates(Node* old_entry, Node* new_entry, bool clone_limit_check);
+  Node*  move_loop_predicates(Node* old_entry, Node* new_entry, bool clone_limit_check);
 
   void eliminate_loop_predicates(Node* entry);
   static Node* skip_loop_predicates(Node* entry);
@@ -873,7 +897,7 @@
   // Find a predicate
   static Node* find_predicate(Node* entry);
   // Construct a range check for a predicate if
-  BoolNode* rc_predicate(Node* ctrl,
+  BoolNode* rc_predicate(IdealLoopTree *loop, Node* ctrl,
                          int scale, Node* offset,
                          Node* init, Node* limit, Node* stride,
                          Node* range, bool upper);
@@ -903,11 +927,11 @@
 
   // Range Check Elimination uses this function!
   // Constrain the main loop iterations so the affine function:
-  //    scale_con * I + offset  <  limit
+  //    low_limit <= scale_con * I + offset  <  upper_limit
   // always holds true.  That is, either increase the number of iterations in
   // the pre-loop or the post-loop until the condition holds true in the main
   // loop.  Scale_con, offset and limit are all loop invariant.
-  void add_constraint( int stride_con, int scale_con, Node *offset, Node *limit, Node *pre_ctrl, Node **pre_limit, Node **main_limit );
+  void add_constraint( int stride_con, int scale_con, Node *offset, Node *low_limit, Node *upper_limit, Node *pre_ctrl, Node **pre_limit, Node **main_limit );
 
   // Partially peel loop up through last_peel node.
   bool partial_peel( IdealLoopTree *loop, Node_List &old_new );
--- a/src/share/vm/opto/macro.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/macro.cpp	Tue May 24 11:09:39 2011 -0700
@@ -2154,6 +2154,11 @@
       debug_only(int old_macro_count = C->macro_count(););
       if (n->is_AbstractLock()) {
         success = eliminate_locking_node(n->as_AbstractLock());
+      } else if (n->Opcode() == Op_LoopLimit) {
+        // Remove it from macro list and put on IGVN worklist to optimize.
+        C->remove_macro_node(n);
+        _igvn._worklist.push(n);
+        success = true;
       } else if (n->Opcode() == Op_Opaque1 || n->Opcode() == Op_Opaque2) {
         _igvn.replace_node(n, n->in(1));
         success = true;
--- a/src/share/vm/opto/matcher.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/matcher.cpp	Tue May 24 11:09:39 2011 -0700
@@ -2086,6 +2086,13 @@
         n->del_req(3);
         break;
       }
+      case Op_LoopLimit: {
+        Node *pair1 = new (C, 3) BinaryNode(n->in(1),n->in(2));
+        n->set_req(1,pair1);
+        n->set_req(2,n->in(3));
+        n->del_req(3);
+        break;
+      }
       case Op_StrEquals: {
         Node *pair1 = new (C, 3) BinaryNode(n->in(2),n->in(3));
         n->set_req(2,pair1);
--- a/src/share/vm/opto/parse.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/parse.hpp	Tue May 24 11:09:39 2011 -0700
@@ -68,9 +68,9 @@
                                            JVMState* caller_jvms,
                                            int caller_bci);
   const char* try_to_inline(ciMethod* callee_method, ciMethod* caller_method, int caller_bci, ciCallProfile& profile, WarmCallInfo* wci_result);
-  const char* shouldInline(ciMethod* callee_method, ciMethod* caller_method, int caller_bci, ciCallProfile& profile, WarmCallInfo* wci_result) const;
-  const char* shouldNotInline(ciMethod* callee_method, ciMethod* caller_method, WarmCallInfo* wci_result) const;
-  void        print_inlining(ciMethod *callee_method, int caller_bci, const char *failure_msg) const PRODUCT_RETURN;
+  const char* should_inline(ciMethod* callee_method, ciMethod* caller_method, int caller_bci, ciCallProfile& profile, WarmCallInfo* wci_result) const;
+  const char* should_not_inline(ciMethod* callee_method, ciMethod* caller_method, WarmCallInfo* wci_result) const;
+  void        print_inlining(ciMethod *callee_method, int caller_bci, const char *failure_msg) const;
 
   InlineTree *caller_tree()       const { return _caller_tree;  }
   InlineTree* callee_at(int bci, ciMethod* m) const;
--- a/src/share/vm/opto/parse1.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/parse1.cpp	Tue May 24 11:09:39 2011 -0700
@@ -638,7 +638,7 @@
         ensure_phis_everywhere();
 
         if (block->is_SEL_head() &&
-            UseLoopPredicate) {
+            (UseLoopPredicate || LoopLimitCheck)) {
           // Add predicate to single entry (not irreducible) loop head.
           assert(!block->has_merged_backedge(), "only entry paths should be merged for now");
           // Need correct bci for predicate.
--- a/src/share/vm/opto/phaseX.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/phaseX.hpp	Tue May 24 11:09:39 2011 -0700
@@ -472,8 +472,8 @@
   }
 
   // Clone loop predicates. Defined in loopTransform.cpp.
-  Node* clone_loop_predicates(Node* old_entry, Node* new_entry);
-  Node*  move_loop_predicates(Node* old_entry, Node* new_entry);
+  Node* clone_loop_predicates(Node* old_entry, Node* new_entry, bool clone_limit_check);
+  Node*  move_loop_predicates(Node* old_entry, Node* new_entry, bool clone_limit_check);
   // Create a new if below new_entry for the predicate to be cloned
   ProjNode* create_new_if_for_predicate(ProjNode* cont_proj, Node* new_entry,
                                         Deoptimization::DeoptReason reason);
--- a/src/share/vm/opto/subnode.cpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/subnode.cpp	Tue May 24 11:09:39 2011 -0700
@@ -1223,21 +1223,6 @@
 }
 
 //=============================================================================
-//------------------------------NegNode----------------------------------------
-Node *NegFNode::Ideal(PhaseGVN *phase, bool can_reshape) {
-  if( in(1)->Opcode() == Op_SubF )
-    return new (phase->C, 3) SubFNode( in(1)->in(2), in(1)->in(1) );
-  return NULL;
-}
-
-Node *NegDNode::Ideal(PhaseGVN *phase, bool can_reshape) {
-  if( in(1)->Opcode() == Op_SubD )
-    return new (phase->C, 3) SubDNode( in(1)->in(2), in(1)->in(1) );
-  return NULL;
-}
-
-
-//=============================================================================
 //------------------------------Value------------------------------------------
 // Compute sqrt
 const Type *SqrtDNode::Value( PhaseTransform *phase ) const {
--- a/src/share/vm/opto/subnode.hpp	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/opto/subnode.hpp	Tue May 24 11:09:39 2011 -0700
@@ -377,7 +377,6 @@
 public:
   NegFNode( Node *in1 ) : NegNode(in1) {}
   virtual int Opcode() const;
-  virtual Node *Ideal(PhaseGVN *phase, bool can_reshape);
   const Type *bottom_type() const { return Type::FLOAT; }
   virtual uint ideal_reg() const { return Op_RegF; }
 };
@@ -391,7 +390,6 @@
 public:
   NegDNode( Node *in1 ) : NegNode(in1) {}
   virtual int Opcode() const;
-  virtual Node *Ideal(PhaseGVN *phase, bool can_reshape);
   const Type *bottom_type() const { return Type::DOUBLE; }
   virtual uint ideal_reg() const { return Op_RegD; }
 };
--- a/src/share/vm/prims/jvmti.xml	Fri May 06 14:32:44 2011 -0700
+++ b/src/share/vm/prims/jvmti.xml	Tue May 24 11:09:39 2011 -0700
@@ -280,10 +280,8 @@
    <!ELEMENT externallink (#PCDATA|jvmti|code|i|b|tm)*>
    <!ATTLIST externallink id CDATA #REQUIRED>
 
-   <!ELEMENT vmspeclink EMPTY>
-   <!ATTLIST vmspeclink id CDATA #IMPLIED>
-   <!ATTLIST vmspeclink name CDATA #IMPLIED>
-   <!ATTLIST vmspeclink preposition CDATA #IMPLIED>
+   <!ELEMENT vmspec EMPTY>
+   <!ATTLIST vmspec chapter CDATA #IMPLIED>
 
    <!ELEMENT internallink (#PCDATA|jvmti|code|i|b)*>
    <!ATTLIST internallink id CDATA #REQUIRED>
@@ -2285,9 +2283,8 @@
         Stack frames are referenced by depth.
         The frame at depth zero is the current frame.
         <p/>
-        Stack frames are as described in the 
-        <vmspeclink id="Overview.doc.html#17257"
-                    name="Frames section"/>.  
+        Stack frames are as described in
+        <vmspec chapter="3.6"/>,
         That is, they correspond to method 
         invocations (including native methods) but do not correspond to platform native or 
         VM internal frames.
@@ -2627,7 +2624,7 @@
         <param id="use_java_stack">
 	  <jboolean/>
 	  <description>
-	    Return the stack showing the <vmspeclink/>
+	    Return the stack showing <vmspec/>
 	    model of the stack; 
 	    otherwise, show the internal representation of the stack with
 	    inlined and optimized methods missing.  If the virtual machine
@@ -2707,7 +2704,7 @@
 	When the thread is resumed, the execution 
 	state of the thread is reset to the state
 	immediately before the called method was invoked.
-	That is (using the <vmspeclink/> terminology):
+	That is (using <vmspec/> terminology):
 	  <ul>
 	    <li>the current frame is discarded as the previous frame becomes the current one</li>
 	    <li>the operand stack is restored--the argument values are added back
@@ -2868,9 +2865,8 @@
       to return at any point during its execution.
       The method which will return early is referred to as the <i>called method</i>.
       The called method is the current method
-      (as defined by the 
-      <vmspeclink id="Overview.doc.html#17257"
-                  name="Frames section"/>) 
+      (as defined by
+      <vmspec chapter="3.6"/>) 
       for the specified thread at
       the time the function is called.
       <p/>
@@ -3576,10 +3572,8 @@
 	<field id="index">
 	  <jint/>
 	  <description>	    
-	    The index into the constant pool of the class. See the
-            <vmspeclink id="ClassFile.doc.html#20080"
-                        name="Constant Pool section"/>
-	    description.
+	    The index into the constant pool of the class. See the description in 
+      <vmspec chapter="4.4"/>.
 	  </description>
 	</field>
       </typedef>
@@ -5006,9 +5000,8 @@
 	    For references of this kind the <code>referrer_index</code>
             parameter to the <internallink id="jvmtiObjectReferenceCallback">
             jvmtiObjectReferenceCallback</internallink> is the index into
-            constant pool table of the class, starting at 1. See the
-            <vmspeclink id="ClassFile.doc.html#20080"
-                        name="Constant Pool section"/>
+            constant pool table of the class, starting at 1. See
+            <vmspec chapter="4.4"/>.
 	  </constant>
 	</constants>