changeset 4345:b9e0f2c87dd6 hs24-b26

Merge
author amurillo
date Thu, 29 Nov 2012 22:32:44 -0800
parents 8e459e9615fd 1ba2ed1c07df
children ed9b424d5e43
files
diffstat 70 files changed, 3549 insertions(+), 668 deletions(-) [+]
line wrap: on
line diff
--- a/make/hotspot_version	Thu Nov 29 19:41:00 2012 -0800
+++ b/make/hotspot_version	Thu Nov 29 22:32:44 2012 -0800
@@ -35,7 +35,7 @@
 
 HS_MAJOR_VER=24
 HS_MINOR_VER=0
-HS_BUILD_NUMBER=25
+HS_BUILD_NUMBER=26
 
 JDK_MAJOR_VER=1
 JDK_MINOR_VER=7
--- a/make/jprt.properties	Thu Nov 29 19:41:00 2012 -0800
+++ b/make/jprt.properties	Thu Nov 29 22:32:44 2012 -0800
@@ -38,7 +38,7 @@
 
 # This tells jprt what default release we want to build
 
-jprt.hotspot.default.release=jdk7u10
+jprt.hotspot.default.release=jdk7u12
 
 jprt.tools.default.release=${jprt.submit.option.release?${jprt.submit.option.release}:${jprt.hotspot.default.release}}
 
@@ -54,77 +54,77 @@
 # Define the Solaris platforms we want for the various releases
 jprt.my.solaris.sparc.jdk8=solaris_sparc_5.10
 jprt.my.solaris.sparc.jdk7=solaris_sparc_5.10
-jprt.my.solaris.sparc.jdk7u10=${jprt.my.solaris.sparc.jdk7}
+jprt.my.solaris.sparc.jdk7u12=${jprt.my.solaris.sparc.jdk7}
 jprt.my.solaris.sparc=${jprt.my.solaris.sparc.${jprt.tools.default.release}}
 
 jprt.my.solaris.sparcv9.jdk8=solaris_sparcv9_5.10
 jprt.my.solaris.sparcv9.jdk7=solaris_sparcv9_5.10
-jprt.my.solaris.sparcv9.jdk7u10=${jprt.my.solaris.sparcv9.jdk7}
+jprt.my.solaris.sparcv9.jdk7u12=${jprt.my.solaris.sparcv9.jdk7}
 jprt.my.solaris.sparcv9=${jprt.my.solaris.sparcv9.${jprt.tools.default.release}}
 
 jprt.my.solaris.i586.jdk8=solaris_i586_5.10
 jprt.my.solaris.i586.jdk7=solaris_i586_5.10
-jprt.my.solaris.i586.jdk7u10=${jprt.my.solaris.i586.jdk7}
+jprt.my.solaris.i586.jdk7u12=${jprt.my.solaris.i586.jdk7}
 jprt.my.solaris.i586=${jprt.my.solaris.i586.${jprt.tools.default.release}}
 
 jprt.my.solaris.x64.jdk8=solaris_x64_5.10
 jprt.my.solaris.x64.jdk7=solaris_x64_5.10
-jprt.my.solaris.x64.jdk7u10=${jprt.my.solaris.x64.jdk7}
+jprt.my.solaris.x64.jdk7u12=${jprt.my.solaris.x64.jdk7}
 jprt.my.solaris.x64=${jprt.my.solaris.x64.${jprt.tools.default.release}}
 
 jprt.my.linux.i586.jdk8=linux_i586_2.6
 jprt.my.linux.i586.jdk7=linux_i586_2.6
-jprt.my.linux.i586.jdk7u10=${jprt.my.linux.i586.jdk7}
+jprt.my.linux.i586.jdk7u12=${jprt.my.linux.i586.jdk7}
 jprt.my.linux.i586=${jprt.my.linux.i586.${jprt.tools.default.release}}
 
 jprt.my.linux.x64.jdk8=linux_x64_2.6
 jprt.my.linux.x64.jdk7=linux_x64_2.6
-jprt.my.linux.x64.jdk7u10=${jprt.my.linux.x64.jdk7}
+jprt.my.linux.x64.jdk7u12=${jprt.my.linux.x64.jdk7}
 jprt.my.linux.x64=${jprt.my.linux.x64.${jprt.tools.default.release}}
 
 jprt.my.linux.ppc.jdk8=linux_ppc_2.6
 jprt.my.linux.ppc.jdk7=linux_ppc_2.6
-jprt.my.linux.ppc.jdk7u10=${jprt.my.linux.ppc.jdk7}
+jprt.my.linux.ppc.jdk7u12=${jprt.my.linux.ppc.jdk7}
 jprt.my.linux.ppc=${jprt.my.linux.ppc.${jprt.tools.default.release}}
 
 jprt.my.linux.ppcv2.jdk8=linux_ppcv2_2.6
 jprt.my.linux.ppcv2.jdk7=linux_ppcv2_2.6
-jprt.my.linux.ppcv2.jdk7u10=${jprt.my.linux.ppcv2.jdk7}
+jprt.my.linux.ppcv2.jdk7u12=${jprt.my.linux.ppcv2.jdk7}
 jprt.my.linux.ppcv2=${jprt.my.linux.ppcv2.${jprt.tools.default.release}}
 
 jprt.my.linux.ppcsflt.jdk8=linux_ppcsflt_2.6
 jprt.my.linux.ppcsflt.jdk7=linux_ppcsflt_2.6
-jprt.my.linux.ppcsflt.jdk7u10=${jprt.my.linux.ppcsflt.jdk7}
+jprt.my.linux.ppcsflt.jdk7u12=${jprt.my.linux.ppcsflt.jdk7}
 jprt.my.linux.ppcsflt=${jprt.my.linux.ppcsflt.${jprt.tools.default.release}}
 
 jprt.my.linux.armvfp.jdk8=linux_armvfp_2.6
 jprt.my.linux.armvfp.jdk7=linux_armvfp_2.6
-jprt.my.linux.armvfp.jdk7u10=${jprt.my.linux.armvfp.jdk7}
+jprt.my.linux.armvfp.jdk7u12=${jprt.my.linux.armvfp.jdk7}
 jprt.my.linux.armvfp=${jprt.my.linux.armvfp.${jprt.tools.default.release}}
 
 jprt.my.linux.armv6.jdk8=linux_armv6_2.6
 jprt.my.linux.armv6.jdk7=linux_armv6_2.6
-jprt.my.linux.armv6.jdk7u10=${jprt.my.linux.armv6.jdk7}
+jprt.my.linux.armv6.jdk7u12=${jprt.my.linux.armv6.jdk7}
 jprt.my.linux.armv6=${jprt.my.linux.armv6.${jprt.tools.default.release}}
 
 jprt.my.linux.armsflt.jdk8=linux_armsflt_2.6
 jprt.my.linux.armsflt.jdk7=linux_armsflt_2.6
-jprt.my.linux.armsflt.jdk7u10=${jprt.my.linux.armsflt.jdk7}
+jprt.my.linux.armsflt.jdk7u12=${jprt.my.linux.armsflt.jdk7}
 jprt.my.linux.armsflt=${jprt.my.linux.armsflt.${jprt.tools.default.release}}
 
 jprt.my.macosx.x64.jdk8=macosx_x64_10.7
 jprt.my.macosx.x64.jdk7=macosx_x64_10.7
-jprt.my.macosx.x64.jdk7u10=${jprt.my.macosx.x64.jdk7}
+jprt.my.macosx.x64.jdk7u12=${jprt.my.macosx.x64.jdk7}
 jprt.my.macosx.x64=${jprt.my.macosx.x64.${jprt.tools.default.release}}
 
 jprt.my.windows.i586.jdk8=windows_i586_5.1
 jprt.my.windows.i586.jdk7=windows_i586_5.1
-jprt.my.windows.i586.jdk7u10=${jprt.my.windows.i586.jdk7}
+jprt.my.windows.i586.jdk7u12=${jprt.my.windows.i586.jdk7}
 jprt.my.windows.i586=${jprt.my.windows.i586.${jprt.tools.default.release}}
 
 jprt.my.windows.x64.jdk8=windows_x64_5.2
 jprt.my.windows.x64.jdk7=windows_x64_5.2
-jprt.my.windows.x64.jdk7u10=${jprt.my.windows.x64.jdk7}
+jprt.my.windows.x64.jdk7u12=${jprt.my.windows.x64.jdk7}
 jprt.my.windows.x64=${jprt.my.windows.x64.${jprt.tools.default.release}}
 
 # Standard list of jprt build targets for this source tree
@@ -159,7 +159,7 @@
 
 jprt.build.targets.jdk8=${jprt.build.targets.all}
 jprt.build.targets.jdk7=${jprt.build.targets.all}
-jprt.build.targets.jdk7u10=${jprt.build.targets.all}
+jprt.build.targets.jdk7u12=${jprt.build.targets.all}
 jprt.build.targets=${jprt.build.targets.${jprt.tools.default.release}}
 
 # Subset lists of test targets for this source tree
@@ -452,7 +452,7 @@
 
 jprt.test.targets.jdk8=${jprt.test.targets.standard}
 jprt.test.targets.jdk7=${jprt.test.targets.standard}
-jprt.test.targets.jdk7u10=${jprt.test.targets.jdk7}
+jprt.test.targets.jdk7u12=${jprt.test.targets.jdk7}
 jprt.test.targets=${jprt.test.targets.${jprt.tools.default.release}}
 
 # The default test/Makefile targets that should be run
@@ -512,7 +512,7 @@
 
 jprt.make.rule.test.targets.jdk8=${jprt.make.rule.test.targets.standard}
 jprt.make.rule.test.targets.jdk7=${jprt.make.rule.test.targets.standard}
-jprt.make.rule.test.targets.jdk7u10=${jprt.make.rule.test.targets.jdk7}
+jprt.make.rule.test.targets.jdk7u12=${jprt.make.rule.test.targets.jdk7}
 jprt.make.rule.test.targets=${jprt.make.rule.test.targets.${jprt.tools.default.release}}
 
 # 7155453: Work-around to prevent popups on OSX from blocking test completion
--- a/src/cpu/x86/vm/assembler_x86.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -1017,6 +1017,67 @@
   emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
 }
 
+void Assembler::aesdec(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_aes(), "");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xde);
+  emit_operand(dst, src);
+}
+
+void Assembler::aesdec(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_aes(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xde);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::aesdeclast(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_aes(), "");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xdf);
+  emit_operand(dst, src);
+}
+
+void Assembler::aesdeclast(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_aes(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xdf);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::aesenc(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_aes(), "");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xdc);
+  emit_operand(dst, src);
+}
+
+void Assembler::aesenc(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_aes(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xdc);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::aesenclast(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_aes(), "");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xdd);
+  emit_operand(dst, src);
+}
+
+void Assembler::aesenclast(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_aes(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xdd);
+  emit_byte(0xC0 | encode);
+}
+
+
 void Assembler::andl(Address dst, int32_t imm32) {
   InstructionMark im(this);
   prefix(dst);
@@ -2337,6 +2398,22 @@
   a_byte(p);
 }
 
+void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_ssse3(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0x00);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::pshufb(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_ssse3(), "");
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0x00);
+  emit_operand(dst, src);
+}
+
 void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@@ -8049,6 +8126,15 @@
   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
 }
 
+void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::movdqu(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::movdqu(dst, Address(rscratch1, 0));
+  }
+}
+
 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
   if (reachable(src)) {
     Assembler::movsd(dst, as_Address(src));
@@ -8339,6 +8425,17 @@
   }
 }
 
+void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
+  // Byte shuffle with the mask read from memory; without AVX the mask address must be 16-byte aligned.
+  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
+  if (reachable(src)) {
+    Assembler::pshufb(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::pshufb(dst, Address(rscratch1, 0));
+  }
+}
+
 // AVX 3-operands instructions
 
 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
--- a/src/cpu/x86/vm/assembler_x86.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -885,6 +885,17 @@
   void addss(XMMRegister dst, Address src);
   void addss(XMMRegister dst, XMMRegister src);
 
+  // AES instructions
+  void aesdec(XMMRegister dst, Address src);
+  void aesdec(XMMRegister dst, XMMRegister src);
+  void aesdeclast(XMMRegister dst, Address src);
+  void aesdeclast(XMMRegister dst, XMMRegister src);
+  void aesenc(XMMRegister dst, Address src);
+  void aesenc(XMMRegister dst, XMMRegister src);
+  void aesenclast(XMMRegister dst, Address src);
+  void aesenclast(XMMRegister dst, XMMRegister src);
+
+
   void andl(Address  dst, int32_t imm32);
   void andl(Register dst, int32_t imm32);
   void andl(Register dst, Address src);
@@ -1434,6 +1445,10 @@
   void prefetcht2(Address src);
   void prefetchw(Address src);
 
+  // Shuffle Bytes
+  void pshufb(XMMRegister dst, XMMRegister src);
+  void pshufb(XMMRegister dst, Address src);
+
   // Shuffle Packed Doublewords
   void pshufd(XMMRegister dst, XMMRegister src, int mode);
   void pshufd(XMMRegister dst, Address src,     int mode);
@@ -2596,6 +2611,12 @@
   void divss(XMMRegister dst, Address src)        { Assembler::divss(dst, src); }
   void divss(XMMRegister dst, AddressLiteral src);
 
+  // Move Unaligned Double Quadword
+  void movdqu(Address     dst, XMMRegister src)   { Assembler::movdqu(dst, src); }
+  void movdqu(XMMRegister dst, Address src)       { Assembler::movdqu(dst, src); }
+  void movdqu(XMMRegister dst, XMMRegister src)   { Assembler::movdqu(dst, src); }
+  void movdqu(XMMRegister dst, AddressLiteral src);
+
   void movsd(XMMRegister dst, XMMRegister src) { Assembler::movsd(dst, src); }
   void movsd(Address dst, XMMRegister src)     { Assembler::movsd(dst, src); }
   void movsd(XMMRegister dst, Address src)     { Assembler::movsd(dst, src); }
@@ -2643,6 +2664,10 @@
   void xorps(XMMRegister dst, Address src)     { Assembler::xorps(dst, src); }
   void xorps(XMMRegister dst, AddressLiteral src);
 
+  // Shuffle Bytes
+  void pshufb(XMMRegister dst, XMMRegister src) { Assembler::pshufb(dst, src); }
+  void pshufb(XMMRegister dst, Address src)     { Assembler::pshufb(dst, src); }
+  void pshufb(XMMRegister dst, AddressLiteral src);
   // AVX 3-operands instructions
 
   void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddsd(dst, nds, src); }
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -2155,6 +2155,529 @@
     }
   }
 
+  // AES intrinsic stubs
+  enum {AESBlockSize = 16};
+
+  address generate_key_shuffle_mask() {
+    __ align(16);
+    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
+    address start = __ pc();
+    __ emit_data(0x00010203, relocInfo::none, 0 );
+    __ emit_data(0x04050607, relocInfo::none, 0 );
+    __ emit_data(0x08090a0b, relocInfo::none, 0 );
+    __ emit_data(0x0c0d0e0f, relocInfo::none, 0 );
+    return start;
+  }
+
+  // Utility routine for loading a 128-bit key word in little endian format
+  // can optionally specify that the shuffle mask is already in an xmmregister
+  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    __ movdqu(xmmdst, Address(key, offset));
+    if (xmm_shuf_mask != NULL) {
+      __ pshufb(xmmdst, xmm_shuf_mask);
+    } else {
+      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    }
+  }
+
+  // aesenc using specified key+offset
+  // can optionally specify that the shuffle mask is already in an xmmregister
+  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    load_key(xmmtmp, key, offset, xmm_shuf_mask);
+    __ aesenc(xmmdst, xmmtmp);
+  }
+
+  // aesdec using specified key+offset
+  // can optionally specify that the shuffle mask is already in an xmmregister
+  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    load_key(xmmtmp, key, offset, xmm_shuf_mask);
+    __ aesdec(xmmdst, xmmtmp);
+  }
+
+
+  // Arguments:
+  //
+  // Inputs:
+  //   arg0 (stack, rbp+8)   - source byte array address
+  //   arg1 (stack, rbp+12)  - destination byte array address
+  //   arg2 (stack, rbp+16)  - K (key) in little endian int array
+  //
+  address generate_aescrypt_encryptBlock() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+    Label L_doLast;
+    address start = __ pc();
+
+    const Register from        = rsi;      // source array address
+    const Register to          = rdx;      // destination array address
+    const Register key         = rcx;      // key array address
+    const Register keylen      = rax;
+    const Address  from_param(rbp, 8+0);
+    const Address  to_param  (rbp, 8+4);
+    const Address  key_param (rbp, 8+8);
+
+    const XMMRegister xmm_result = xmm0;
+    const XMMRegister xmm_temp   = xmm1;
+    const XMMRegister xmm_key_shuf_mask = xmm2;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    __ push(rsi);
+    __ movptr(from , from_param);
+    __ movptr(to   , to_param);
+    __ movptr(key  , key_param);
+
+    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    // keylen = # of 32-bit words, convert to 128-bit words
+    __ shrl(keylen, 2);
+    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
+
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
+
+    // For encryption, the java expanded key ordering is just what we need
+
+    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+    __ pxor(xmm_result, xmm_temp);
+    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
+      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+    }
+    load_key  (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
+    __ cmpl(keylen, 0);
+    __ jcc(Assembler::equal, L_doLast);
+    __ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
+    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+    __ subl(keylen, 2);
+    __ jcc(Assembler::equal, L_doLast);
+    __ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
+    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+
+    __ BIND(L_doLast);
+    __ aesenclast(xmm_result, xmm_temp);
+    __ movdqu(Address(to, 0), xmm_result);        // store the result
+    __ xorptr(rax, rax); // return 0
+    __ pop(rsi);
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
+
+  // Arguments:
+  //
+  // Inputs:
+  //   arg0 (stack, rbp+8)   - source byte array address
+  //   arg1 (stack, rbp+12)  - destination byte array address
+  //   arg2 (stack, rbp+16)  - K (key) in little endian int array
+  //
+  address generate_aescrypt_decryptBlock() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+    Label L_doLast;
+    address start = __ pc();
+
+    const Register from        = rsi;      // source array address
+    const Register to          = rdx;      // destination array address
+    const Register key         = rcx;      // key array address
+    const Register keylen      = rax;
+    const Address  from_param(rbp, 8+0);
+    const Address  to_param  (rbp, 8+4);
+    const Address  key_param (rbp, 8+8);
+
+    const XMMRegister xmm_result = xmm0;
+    const XMMRegister xmm_temp   = xmm1;
+    const XMMRegister xmm_key_shuf_mask = xmm2;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    __ push(rsi);
+    __ movptr(from , from_param);
+    __ movptr(to   , to_param);
+    __ movptr(key  , key_param);
+
+    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    // keylen = # of 32-bit words, convert to 128-bit words
+    __ shrl(keylen, 2);
+    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
+
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movdqu(xmm_result, Address(from, 0));
+
+    // for decryption java expanded key ordering is rotated one position from what we want
+    // so we start from 0x10 here and hit 0x00 last
+    // we don't know if the key is aligned, hence not using load-execute form
+    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
+    __ pxor  (xmm_result, xmm_temp);
+    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
+      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+    }
+    __ cmpl(keylen, 0);
+    __ jcc(Assembler::equal, L_doLast);
+    // only in 192 and 256 bit keys
+    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+    __ subl(keylen, 2);
+    __ jcc(Assembler::equal, L_doLast);
+    // only in 256 bit keys
+    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+
+    __ BIND(L_doLast);
+    // for decryption the aesdeclast operation is always on key+0x00
+    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+    __ aesdeclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, 0), xmm_result);  // store the result
+
+    __ xorptr(rax, rax); // return 0
+    __ pop(rsi);
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
+  void handleSOERegisters(bool saving) {
+    const int saveFrameSizeInBytes = 4 * wordSize;
+    const Address saved_rbx     (rbp, -3 * wordSize);
+    const Address saved_rsi     (rbp, -2 * wordSize);
+    const Address saved_rdi     (rbp, -1 * wordSize);
+
+    if (saving) {
+      __ subptr(rsp, saveFrameSizeInBytes);
+      __ movptr(saved_rsi, rsi);
+      __ movptr(saved_rdi, rdi);
+      __ movptr(saved_rbx, rbx);
+    } else {
+      // restoring
+      __ movptr(rsi, saved_rsi);
+      __ movptr(rdi, saved_rdi);
+      __ movptr(rbx, saved_rbx);
+    }
+  }
+
+  // Arguments:
+  //
+  // Inputs:
+  //   arg0 (stack, rbp+8)   - source byte array address
+  //   arg1 (stack, rbp+12)  - destination byte array address
+  //   arg2 (stack, rbp+16)  - K (key) in little endian int array
+  //   arg3 (stack, rbp+20)  - r vector byte array address
+  //   arg4 (stack, rbp+24)  - input length
+  //
+  address generate_cipherBlockChaining_encryptAESCrypt() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+    address start = __ pc();
+
+    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
+    const Register from        = rsi;      // source array address
+    const Register to          = rdx;      // destination array address
+    const Register key         = rcx;      // key array address
+    const Register rvec        = rdi;      // r byte array initialized from initvector array address
+                                           // and left with the results of the last encryption block
+    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
+    const Register pos         = rax;
+
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_result = xmm0;
+    const XMMRegister xmm_temp   = xmm1;
+    // first 6 keys preloaded into xmm2-xmm7
+    const int XMM_REG_NUM_KEY_FIRST = 2;
+    const int XMM_REG_NUM_KEY_LAST  = 7;
+    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    handleSOERegisters(true /*saving*/);
+
+    // load registers from incoming parameters
+    const Address  from_param(rbp, 8+0);
+    const Address  to_param  (rbp, 8+4);
+    const Address  key_param (rbp, 8+8);
+    const Address  rvec_param (rbp, 8+12);
+    const Address  len_param  (rbp, 8+16);
+    __ movptr(from , from_param);
+    __ movptr(to   , to_param);
+    __ movptr(key  , key_param);
+    __ movptr(rvec , rvec_param);
+    __ movptr(len_reg , len_param);
+
+    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    // load up xmm regs 2 thru 7 with keys 0-5
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+      offset += 0x10;
+    }
+
+    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
+
+    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (44=128, 52=192, or 60=256))
+    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rax, 44);
+    __ jcc(Assembler::notEqual, L_key_192_256);
+
+    // 128 bit code follows here
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_128);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
+      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
+    }
+    load_key(xmm_temp, key, 0xa0);
+    __ aesenclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_128);
+
+    __ BIND(L_exit);
+    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
+
+    handleSOERegisters(false /*restoring*/);
+    __ movl(rax, 0);                             // return 0 (why?)
+    __ leave();                                  // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+  __ BIND(L_key_192_256);
+  // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    __ cmpl(rax, 52);
+    __ jcc(Assembler::notEqual, L_key_256);
+
+    // 192-bit code follows here (could be changed to use more xmm registers)
+    __ movptr(pos, 0);
+  __ align(OptoLoopAlignment);
+  __ BIND(L_loopTop_192);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
+      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
+    }
+    load_key(xmm_temp, key, 0xc0);
+    __ aesenclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_192);
+    __ jmp(L_exit);
+
+  __ BIND(L_key_256);
+    // 256-bit code follows here (could be changed to use more xmm registers)
+    __ movptr(pos, 0);
+  __ align(OptoLoopAlignment);
+  __ BIND(L_loopTop_256);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
+      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
+    }
+    load_key(xmm_temp, key, 0xe0);
+    __ aesenclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_256);
+    __ jmp(L_exit);
+
+    return start;
+  }
+
+
+  // CBC AES Decryption.
+  // In 32-bit stub, because of lack of registers we do not try to parallelize 4 blocks at a time.
+  //
+  // Arguments:
+  //
+  // Inputs:
+  //   arg0 (stack, rbp+8)   - source byte array address
+  //   arg1 (stack, rbp+12)  - destination byte array address
+  //   arg2 (stack, rbp+16)  - K (key) in little endian int array
+  //   arg3 (stack, rbp+20)  - r vector byte array address
+  //   arg4 (stack, rbp+24)  - input length
+  //
+
+  address generate_cipherBlockChaining_decryptAESCrypt() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+    address start = __ pc();
+
+    Label L_exit, L_key_192_256, L_key_256;
+    Label L_singleBlock_loopTop_128;
+    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
+    const Register from        = rsi;      // source array address
+    const Register to          = rdx;      // destination array address
+    const Register key         = rcx;      // key array address
+    const Register rvec        = rdi;      // r byte array initialized from initvector array address
+                                           // and left with the results of the last encryption block
+    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
+    const Register pos         = rax;
+
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_result = xmm0;
+    const XMMRegister xmm_temp   = xmm1;
+    // first 6 keys preloaded into xmm2-xmm7
+    const int XMM_REG_NUM_KEY_FIRST = 2;
+    const int XMM_REG_NUM_KEY_LAST  = 7;
+    const int FIRST_NON_REG_KEY_offset = 0x70;
+    const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    handleSOERegisters(true /*saving*/);
+
+    // load registers from incoming parameters
+    const Address  from_param(rbp, 8+0);
+    const Address  to_param  (rbp, 8+4);
+    const Address  key_param (rbp, 8+8);
+    const Address  rvec_param (rbp, 8+12);
+    const Address  len_param  (rbp, 8+16);
+    __ movptr(from , from_param);
+    __ movptr(to   , to_param);
+    __ movptr(key  , key_param);
+    __ movptr(rvec , rvec_param);
+    __ movptr(len_reg , len_param);
+
+    // the java expanded key ordering is rotated one position from what we want
+    // so we start from 0x10 here and hit 0x00 last
+    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    // load up xmm regs 2 thru 7 with the first 6 keys (key offsets 0x10 thru 0x60)
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+      offset += 0x10;
+    }
+
+    // inside here, use the rvec register to point to previous block cipher
+    // with which we xor at the end of each newly decrypted block
+    const Register  prev_block_cipher_ptr = rvec;
+
+    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (44=128, 52=192, or 60=256))
+    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rax, 44);
+    __ jcc(Assembler::notEqual, L_key_192_256);
+
+
+    // 128-bit code follows here (one block per iteration, not parallelized)
+    __ movptr(pos, 0);
+  __ align(OptoLoopAlignment);
+  __ BIND(L_singleBlock_loopTop_128);
+    __ cmpptr(len_reg, 0);           // any blocks left??
+    __ jcc(Assembler::equal, L_exit);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xa0; key_offset += 0x10) {   // 128-bit runs up to key offset a0
+      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
+    }
+    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
+    __ aesdeclast(xmm_result, xmm_temp);
+    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
+    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jmp(L_singleBlock_loopTop_128);
+
+
+    __ BIND(L_exit);
+    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
+    __ movptr(rvec , rvec_param);                                     // restore this since used in loop
+    __ movdqu(Address(rvec, 0), xmm_temp);                            // final value of r stored in rvec of CipherBlockChaining object
+    handleSOERegisters(false /*restoring*/);
+    __ movl(rax, 0);                                                  // return 0 (why?)
+    __ leave();                                                       // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+
+    __ BIND(L_key_192_256);
+    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    __ cmpl(rax, 52);
+    __ jcc(Assembler::notEqual, L_key_256);
+
+    // 192-bit code follows here (could be optimized to use parallelism)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_192);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xc0; key_offset += 0x10) {   // 192-bit runs up to key offset c0
+      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
+    }
+    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
+    __ aesdeclast(xmm_result, xmm_temp);
+    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
+    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
+    __ jmp(L_exit);
+
+    __ BIND(L_key_256);
+    // 256-bit code follows here (could be optimized to use parallelism)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_256);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xe0; key_offset += 0x10) {   // 256-bit runs up to key offset e0
+      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
+    }
+    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
+    __ aesdeclast(xmm_result, xmm_temp);
+    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
+    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
+    __ jmp(L_exit);
+
+    return start;
+  }
+
+
  public:
   // Information about frame layout at time of blocking runtime call.
   // Note that we only have to preserve callee-saved registers since
@@ -2350,6 +2873,16 @@
     generate_arraycopy_stubs();
 
     generate_math_stubs();
+
+    // don't bother generating these AES intrinsic stubs unless global flag is set
+    if (UseAESIntrinsics) {
+      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // might be needed by the others
+
+      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
+      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
+    }
   }
 
 
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -2958,6 +2958,548 @@
     }
   }
 
+  // AES intrinsic stubs
+  enum {AESBlockSize = 16};  // byte stride the CBC loops below advance per block
+
+  address generate_key_shuffle_mask() {  // emits the 16-byte pshufb control mask used by load_key; returns its address
+    __ align(16);  // presumably so the mask can serve as a 128-bit memory operand -- TODO confirm
+    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
+    address start = __ pc();
+    __ emit_data64( 0x0405060700010203, relocInfo::none );  // little-endian memory order: 03 02 01 00 07 06 05 04
+    __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );  // 0b 0a 09 08 0f 0e 0d 0c, i.e. byte-reverse each 32-bit word
+    return start;
+  }
+
+  // Emit code that loads the 16 bytes at key+offset into xmmdst (unaligned load) and permutes them with the key shuffle mask.
+  // If xmm_shuf_mask is supplied the mask is taken from that register; otherwise it is read from key_shuffle_mask_addr().
+  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    __ movdqu(xmmdst, Address(key, offset));  // movdqu: no alignment assumption on the key array
+    if (xmm_shuf_mask != NULL) {
+      __ pshufb(xmmdst, xmm_shuf_mask);  // mask already resident in a register
+    } else {
+      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));  // mask read from the emitted constant
+    }
+  }
+
+  // Emit one aesenc round on xmmdst using the round key that load_key fetches from key+offset (xmmtmp is overwritten).
+  // xmm_shuf_mask, if supplied, is forwarded to load_key as the register that already holds the shuffle mask.
+  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    load_key(xmmtmp, key, offset, xmm_shuf_mask);
+    __ aesenc(xmmdst, xmmtmp);
+  }
+
+  // Emit one aesdec round on xmmdst using the round key that load_key fetches from key+offset (xmmtmp is overwritten).
+  // xmm_shuf_mask, if supplied, is forwarded to load_key as the register that already holds the shuffle mask.
+  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    load_key(xmmtmp, key, offset, xmm_shuf_mask);
+    __ aesdec(xmmdst, xmmtmp);
+  }
+
+
+  // Emits the stub that AES-encrypts a single 16-byte block and returns the stub's entry address.
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  // The generated code returns 0 in rax.
+  address generate_aescrypt_encryptBlock() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+    Label L_doLast;
+    address start = __ pc();
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register keylen      = rax;      // scratch: key-size discriminator computed below
+
+    const XMMRegister xmm_result = xmm0;   // the block being encrypted
+    const XMMRegister xmm_temp   = xmm1;   // current round key
+    const XMMRegister xmm_key_shuf_mask = xmm2;  // shuffle mask kept in a register for every load_key call
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));  // key points at the int data; step back to the array length field
+    // keylen = # of 32-bit words, convert to 128-bit words
+    __ shrl(keylen, 2);
+    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more (so 0 means a 44-int key)
+
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
+
+    // For encryption, the java expanded key ordering is just what we need
+    // we don't know if the key is aligned, hence not using load-execute form
+
+    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+    __ pxor(xmm_result, xmm_temp);  // initial xor with the key at offset 0x00
+    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {  // nine aesenc rounds common to every key size
+      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+    }
+    load_key  (xmm_temp, key, 0xa0, xmm_key_shuf_mask);  // becomes the aesenclast key when keylen == 0
+    __ cmpl(keylen, 0);
+    __ jcc(Assembler::equal, L_doLast);
+    __ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
+    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);  // becomes the aesenclast key when the subl below reaches 0
+    __ subl(keylen, 2);
+    __ jcc(Assembler::equal, L_doLast);
+    __ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
+    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);  // aesenclast key on the fall-through path
+
+    __ BIND(L_doLast);
+    __ aesenclast(xmm_result, xmm_temp);          // xmm_temp holds whichever key was loaded last before reaching here
+    __ movdqu(Address(to, 0), xmm_result);        // store the result
+    __ xorptr(rax, rax); // return 0
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
+
+  // Emits the stub that AES-decrypts a single 16-byte block and returns the stub's entry address.
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  // The generated code returns 0 in rax.
+  address generate_aescrypt_decryptBlock() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+    Label L_doLast;
+    address start = __ pc();
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register keylen      = rax;      // scratch: key-size discriminator computed below
+
+    const XMMRegister xmm_result = xmm0;   // the block being decrypted
+    const XMMRegister xmm_temp   = xmm1;   // current round key
+    const XMMRegister xmm_key_shuf_mask = xmm2;  // shuffle mask kept in a register for every load_key call
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));  // key points at the int data; step back to the array length field
+    // keylen = # of 32-bit words, convert to 128-bit words
+    __ shrl(keylen, 2);
+    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more (so 0 means a 44-int key)
+
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of cipher input
+
+    // for decryption java expanded key ordering is rotated one position from what we want
+    // so we start from 0x10 here and hit 0x00 last
+    // we don't know if the key is aligned, hence not using load-execute form
+    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
+    __ pxor  (xmm_result, xmm_temp);  // initial xor with the key at offset 0x10
+    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {  // nine aesdec rounds common to every key size
+      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+    }
+    __ cmpl(keylen, 0);
+    __ jcc(Assembler::equal, L_doLast);
+    // only in 192 and 256 bit keys
+    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+    __ subl(keylen, 2);  // reaches 0 for a 52-int key
+    __ jcc(Assembler::equal, L_doLast);
+    // only in 256 bit keys
+    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+
+    __ BIND(L_doLast);
+    // for decryption the aesdeclast operation is always on key+0x00
+    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+    __ aesdeclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, 0), xmm_result);  // store the result
+
+    __ xorptr(rax, rax); // return 0
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
+
+  // Emits the CBC/AES encrypt stub (one block per loop iteration) and returns the stub's entry address.
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  // NOTE: each loop tests the remaining length only after processing a block, so a zero length is not handled here.
+  address generate_cipherBlockChaining_encryptAESCrypt() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+    address start = __ pc();
+
+    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
+                                           // and left with the results of the last encryption block
+#ifndef _WIN64
+    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
+#else
+    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
+    const Register len_reg     = r10;      // pick the first volatile windows register
+#endif
+    const Register pos         = rax;      // byte offset into from/to; also used briefly for the key length below
+
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_result = xmm0;
+    const XMMRegister xmm_temp   = xmm1;
+    // keys 0-10 preloaded into xmm2-xmm12
+    const int XMM_REG_NUM_KEY_FIRST = 2;
+    const int XMM_REG_NUM_KEY_LAST  = 12;
+    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WIN64
+    // on win64, fill len_reg from stack position
+    __ movl(len_reg, len_mem);
+    // save the xmm registers which must be preserved 6-12
+    __ subptr(rsp, -rsp_after_call_off * wordSize);  // reserve stack for the save slots (rsp_after_call_off is defined outside this block)
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+#endif
+
+    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    // load up xmm regs 2 thru 12 with key 0x00 - 0xa0
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+      offset += 0x10;
+    }
+
+    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
+
+    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
+    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));  // rax (== pos) is free until a loop zeroes it
+    __ cmpl(rax, 44);
+    __ jcc(Assembler::notEqual, L_key_192_256);
+
+    // 128 bit code follows here
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_128);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    __ aesenclast(xmm_result, xmm_key10);          // key at 0xa0 is the last round key on this path
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_128);    // loop until len_reg reaches exactly 0
+
+    __ BIND(L_exit);
+    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
+
+#ifdef _WIN64
+    // restore xmm regs belonging to calling function
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+#endif
+    __ movl(rax, 0); // return 0 (why?)
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    __ BIND(L_key_192_256);
+    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    __ cmpl(rax, 52);
+    __ jcc(Assembler::notEqual, L_key_256);
+
+    // 192-bit code follows here (could be changed to use more xmm registers)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_192);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);  // keys beyond 0xa0 are not preloaded; reloaded from memory each iteration
+    load_key(xmm_temp, key, 0xc0);
+    __ aesenclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_192);
+    __ jmp(L_exit);
+
+    __ BIND(L_key_256);
+    // 256-bit code follows here (could be changed to use more xmm registers)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_256);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);  // keys beyond 0xa0 are not preloaded; reloaded from memory each iteration
+    aes_enc_key(xmm_result, xmm_temp, key, 0xc0);
+    aes_enc_key(xmm_result, xmm_temp, key, 0xd0);
+    load_key(xmm_temp, key, 0xe0);
+    __ aesenclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_256);
+    __ jmp(L_exit);
+
+    return start;
+  }
+
+
+
+  // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
+  // to hide instruction latency
+  // (only the 44-int-key path has the 4-block loop; the other key sizes decrypt one block per iteration)
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  // Returns the stub's entry address; the generated code returns 0 in rax.
+
+  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+    address start = __ pc();
+
+    Label L_exit, L_key_192_256, L_key_256;
+    Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128;
+    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
+                                           // and left holding the last cipher block read from the input
+#ifndef _WIN64
+    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
+#else
+    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
+    const Register len_reg     = r10;      // pick the first volatile windows register
+#endif
+    const Register pos         = rax;      // byte offset into from/to; also used briefly for the key length below
+
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_result = xmm0;
+    // round keys preloaded into xmm5-xmm15: key+0x10..0xa0 in xmm5-xmm14, key+0x00 in xmm15
+    const int XMM_REG_NUM_KEY_FIRST = 5;
+    const int XMM_REG_NUM_KEY_LAST  = 15;
+    const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WIN64
+    // on win64, fill len_reg from stack position
+    __ movl(len_reg, len_mem);
+    // save the xmm registers which must be preserved 6-15
+    __ subptr(rsp, -rsp_after_call_off * wordSize);  // reserve stack for the save slots (rsp_after_call_off is defined outside this block)
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+#endif
+    // the java expanded key ordering is rotated one position from what we want
+    // so we start from 0x10 here and hit 0x00 last
+    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00;  // the last register gets the key at offset 0, used by aesdeclast
+      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+      offset += 0x10;
+    }
+
+    const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block (reuses xmm1; the shuffle mask is no longer in a register)
+    // registers holding the four results in the parallelized loop
+    const XMMRegister xmm_result0 = xmm0;
+    const XMMRegister xmm_result1 = xmm2;
+    const XMMRegister xmm_result2 = xmm3;
+    const XMMRegister xmm_result3 = xmm4;
+
+    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
+
+    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
+    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));  // rax (== pos) is free until a loop zeroes it
+    __ cmpl(rax, 44);
+    __ jcc(Assembler::notEqual, L_key_192_256);
+
+
+    // 128-bit code follows here, parallelized
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_multiBlock_loopTop_128);
+    __ cmpptr(len_reg, 4*AESBlockSize);           // see if at least 4 blocks left
+    __ jcc(Assembler::less, L_singleBlock_loopTop_128);  // fewer than 4 blocks: finish one block at a time
+
+    __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize));   // get next 4 blocks into xmmresult registers
+    __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize));
+    __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize));
+    __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize));
+
+#define DoFour(opc, src_reg)                    \
+    __ opc(xmm_result0, src_reg);               \
+    __ opc(xmm_result1, src_reg);               \
+    __ opc(xmm_result2, src_reg);               \
+    __ opc(xmm_result3, src_reg);
+
+    DoFour(pxor, xmm_key_first);
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      DoFour(aesdec, as_XMMRegister(rnum));
+    }
+    DoFour(aesdeclast, xmm_key_last);
+    // for each result, xor with the r vector of previous cipher block
+    // (the cipher blocks are re-read from 'from' here, before any of the stores to 'to' below)
+    __ pxor(xmm_result0, xmm_prev_block_cipher);
+    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize));
+    __ pxor(xmm_result1, xmm_prev_block_cipher);
+    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize));
+    __ pxor(xmm_result2, xmm_prev_block_cipher);
+    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize));
+    __ pxor(xmm_result3, xmm_prev_block_cipher);
+    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize));   // this will carry over to next set of blocks
+
+    __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
+    __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1);
+    __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2);
+    __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3);
+
+    __ addptr(pos, 4*AESBlockSize);
+    __ subptr(len_reg, 4*AESBlockSize);
+    __ jmp(L_multiBlock_loopTop_128);
+
+    // registers used in the non-parallelized loops (xmm2/xmm3 are result registers only inside the 4-block loop)
+    const XMMRegister xmm_prev_block_cipher_save = xmm2;
+    const XMMRegister xmm_temp   = xmm3;
+
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_128);
+    __ cmpptr(len_reg, 0);           // any blocks left??
+    __ jcc(Assembler::equal, L_exit);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
+    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    __ aesdeclast(xmm_result, xmm_key_last);
+    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
+
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jmp(L_singleBlock_loopTop_128);
+
+
+    __ BIND(L_exit);
+    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
+#ifdef _WIN64
+    // restore regs belonging to calling function
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+#endif
+    __ movl(rax, 0); // return 0 (why?)
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+
+    __ BIND(L_key_192_256);
+    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    __ cmpl(rax, 52);
+    __ jcc(Assembler::notEqual, L_key_256);
+
+    // 192-bit code follows here (could be optimized to use parallelism)
+    // NOTE: unlike the 128-bit single-block loop, the length is tested only after a block has been processed
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_192);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
+    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 192-bit key goes up to c0 (no mask register passed: load_key reads the mask from memory)
+    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
+    __ aesdeclast(xmm_result, xmm_key_last);                    // xmm15 always came from key+0
+    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
+
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
+    __ jmp(L_exit);
+
+    __ BIND(L_key_256);
+    // 256-bit code follows here (could be optimized to use parallelism)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_256);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
+    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 256-bit key goes up to e0 (no mask register passed: load_key reads the mask from memory)
+    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xd0);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xe0);
+    __ aesdeclast(xmm_result, xmm_key_last);             // xmm15 came from key+0
+    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
+
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
+    __ jmp(L_exit);
+
+    return start;
+  }
+
+
+
 #undef __
 #define __ masm->
 
@@ -3152,6 +3694,16 @@
     generate_arraycopy_stubs();
 
     generate_math_stubs();
+
+    // don't bother generating these AES intrinsic stubs unless global flag is set
+    if (UseAESIntrinsics) {
+      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
+
+      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
+      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
+    }
   }
 
  public:
--- a/src/cpu/x86/vm/stubRoutines_x86_32.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/cpu/x86/vm/stubRoutines_x86_32.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -44,3 +44,4 @@
 
 address StubRoutines::x86::_verify_mxcsr_entry         = NULL;
 address StubRoutines::x86::_verify_fpu_cntrl_wrd_entry = NULL;
+address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
--- a/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -41,10 +41,14 @@
  private:
   static address _verify_mxcsr_entry;
   static address _verify_fpu_cntrl_wrd_entry;
+  // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers
+  static address _key_shuffle_mask_addr;
 
  public:
   static address verify_mxcsr_entry()                        { return _verify_mxcsr_entry; }
   static address verify_fpu_cntrl_wrd_entry()                { return _verify_fpu_cntrl_wrd_entry; }
+  static address key_shuffle_mask_addr()                     { return _key_shuffle_mask_addr; }
+
 };
 
   static bool    returns_to_call_stub(address return_pc)     { return return_pc == _call_stub_return_address; }
--- a/src/cpu/x86/vm/stubRoutines_x86_64.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/cpu/x86/vm/stubRoutines_x86_64.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -56,3 +56,4 @@
 address StubRoutines::x86::_double_sign_mask = NULL;
 address StubRoutines::x86::_double_sign_flip = NULL;
 address StubRoutines::x86::_mxcsr_std = NULL;
+address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
--- a/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -54,6 +54,8 @@
   static address _double_sign_mask;
   static address _double_sign_flip;
   static address _mxcsr_std;
+  // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers
+  static address _key_shuffle_mask_addr;
 
  public:
 
@@ -116,6 +118,9 @@
   {
     return _mxcsr_std;
   }
+
+  static address key_shuffle_mask_addr()                     { return _key_shuffle_mask_addr; }
+
 };
 
 #endif // CPU_X86_VM_STUBROUTINES_X86_64_HPP
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -419,13 +419,16 @@
   if (UseAVX < 1)
     _cpuFeatures &= ~CPU_AVX;
 
+  if (!UseAES && !FLAG_IS_DEFAULT(UseAES))
+    _cpuFeatures &= ~CPU_AES;
+
   if (logical_processors_per_package() == 1) {
     // HT processor could be installed on a system which doesn't support HT.
     _cpuFeatures &= ~CPU_HT;
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -441,6 +444,7 @@
                (supports_popcnt() ? ", popcnt" : ""),
                (supports_avx()    ? ", avx" : ""),
                (supports_avx2()   ? ", avx2" : ""),
+               (supports_aes()    ? ", aes" : ""),
                (supports_mmx_ext() ? ", mmxext" : ""),
                (supports_3dnow_prefetch() ? ", 3dnowpref" : ""),
                (supports_lzcnt()   ? ", lzcnt": ""),
@@ -472,6 +476,29 @@
   if (!supports_avx ()) // Drop to 0 if no AVX  support
     UseAVX = 0;
 
+  // Use AES instructions if available.
+  if (supports_aes()) {
+    if (FLAG_IS_DEFAULT(UseAES)) {
+      UseAES = true;
+    }
+  } else if (UseAES) {
+    if (!FLAG_IS_DEFAULT(UseAES))
+      warning("AES instructions not available on this CPU");
+    FLAG_SET_DEFAULT(UseAES, false);
+  }
+
+  // The AES intrinsic stubs require AES instruction support (of course)
+  // but also require AVX and SSE3 modes for the instructions they use.
+  if (UseAES && (UseAVX > 0) && (UseSSE > 2)) {
+    if (FLAG_IS_DEFAULT(UseAESIntrinsics)) {
+      UseAESIntrinsics = true;
+    }
+  } else if (UseAESIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseAESIntrinsics))
+      warning("AES intrinsics not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESIntrinsics, false);
+  }
+
 #ifdef COMPILER2
   if (UseFPUForSpilling) {
     if (UseSSE < 2) {
@@ -714,6 +741,9 @@
     if (UseAVX > 0) {
       tty->print("  UseAVX=%d",UseAVX);
     }
+    if (UseAES) {
+      tty->print("  UseAES=1");
+    }
     tty->cr();
     tty->print("Allocation");
     if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) {
--- a/src/cpu/x86/vm/vm_version_x86.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/cpu/x86/vm/vm_version_x86.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -78,7 +78,9 @@
                sse4_2   : 1,
                         : 2,
                popcnt   : 1,
-                        : 3,
+                        : 1,
+               aes      : 1,
+                        : 1,
                osxsave  : 1,
                avx      : 1,
                         : 3;
@@ -244,7 +246,8 @@
     CPU_TSC    = (1 << 15),
     CPU_TSCINV = (1 << 16),
     CPU_AVX    = (1 << 17),
-    CPU_AVX2   = (1 << 18)
+    CPU_AVX2   = (1 << 18),
+    CPU_AES    = (1 << 19)
   } cpuFeatureFlags;
 
   enum {
@@ -420,6 +423,8 @@
       result |= CPU_TSC;
     if (_cpuid_info.ext_cpuid7_edx.bits.tsc_invariance != 0)
       result |= CPU_TSCINV;
+    if (_cpuid_info.std_cpuid1_ecx.bits.aes != 0)
+      result |= CPU_AES;
 
     // AMD features.
     if (is_amd()) {
@@ -544,6 +549,7 @@
   static bool supports_avx()      { return (_cpuFeatures & CPU_AVX) != 0; }
   static bool supports_avx2()     { return (_cpuFeatures & CPU_AVX2) != 0; }
   static bool supports_tsc()      { return (_cpuFeatures & CPU_TSC)    != 0; }
+  static bool supports_aes()      { return (_cpuFeatures & CPU_AES) != 0; }
 
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
--- a/src/os/bsd/vm/perfMemory_bsd.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/os/bsd/vm/perfMemory_bsd.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -30,6 +30,7 @@
 #include "os_bsd.inline.hpp"
 #include "runtime/handles.inline.hpp"
 #include "runtime/perfMemory.hpp"
+#include "services/memTracker.hpp"
 #include "utilities/exceptions.hpp"
 
 // put OS-includes here
@@ -753,6 +754,10 @@
   // clear the shared memory region
   (void)::memset((void*) mapAddress, 0, size);
 
+  // it does not go through os api, the operation has to record from here
+  MemTracker::record_virtual_memory_reserve((address)mapAddress, size, CURRENT_PC);
+  MemTracker::record_virtual_memory_type((address)mapAddress, mtInternal);
+
   return mapAddress;
 }
 
@@ -912,6 +917,10 @@
               "Could not map PerfMemory");
   }
 
+  // it does not go through os api, the operation has to record from here
+  MemTracker::record_virtual_memory_reserve((address)mapAddress, size, CURRENT_PC);
+  MemTracker::record_virtual_memory_type((address)mapAddress, mtInternal);
+
   *addr = mapAddress;
   *sizep = size;
 
--- a/src/os/linux/vm/perfMemory_linux.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/os/linux/vm/perfMemory_linux.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -30,6 +30,7 @@
 #include "os_linux.inline.hpp"
 #include "runtime/handles.inline.hpp"
 #include "runtime/perfMemory.hpp"
+#include "services/memTracker.hpp"
 #include "utilities/exceptions.hpp"
 
 // put OS-includes here
@@ -753,6 +754,10 @@
   // clear the shared memory region
   (void)::memset((void*) mapAddress, 0, size);
 
+  // it does not go through os api, the operation has to record from here
+  MemTracker::record_virtual_memory_reserve((address)mapAddress, size, CURRENT_PC);
+  MemTracker::record_virtual_memory_type((address)mapAddress, mtInternal);
+
   return mapAddress;
 }
 
@@ -912,6 +917,10 @@
               "Could not map PerfMemory");
   }
 
+  // it does not go through os api, the operation has to record from here
+  MemTracker::record_virtual_memory_reserve((address)mapAddress, size, CURRENT_PC);
+  MemTracker::record_virtual_memory_type((address)mapAddress, mtInternal);
+
   *addr = mapAddress;
   *sizep = size;
 
--- a/src/os/solaris/vm/os_solaris.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/os/solaris/vm/os_solaris.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -55,6 +55,7 @@
 #include "runtime/threadCritical.hpp"
 #include "runtime/timer.hpp"
 #include "services/attachListener.hpp"
+#include "services/memTracker.hpp"
 #include "services/runtimeService.hpp"
 #include "thread_solaris.inline.hpp"
 #include "utilities/decoder.hpp"
@@ -1482,11 +1483,11 @@
 
 
 // First crack at OS-specific initialization, from inside the new thread.
-void os::initialize_thread() {
+void os::initialize_thread(Thread* thr) {
   int r = thr_main() ;
   guarantee (r == 0 || r == 1, "CR6501650 or CR6493689") ;
   if (r) {
-    JavaThread* jt = (JavaThread *)Thread::current();
+    JavaThread* jt = (JavaThread *)thr;
     assert(jt != NULL,"Sanity check");
     size_t stack_size;
     address base = jt->stack_base();
@@ -3072,11 +3073,12 @@
   // Since snv_84, Solaris attempts to honor the address hint - see 5003415.
   // Give it a try, if the kernel honors the hint we can return immediately.
   char* addr = Solaris::anon_mmap(requested_addr, bytes, 0, false);
+
   volatile int err = errno;
   if (addr == requested_addr) {
     return addr;
   } else if (addr != NULL) {
-    unmap_memory(addr, bytes);
+    pd_unmap_memory(addr, bytes);
   }
 
   if (PrintMiscellaneous && Verbose) {
--- a/src/os/solaris/vm/perfMemory_solaris.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/os/solaris/vm/perfMemory_solaris.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -30,6 +30,7 @@
 #include "os_solaris.inline.hpp"
 #include "runtime/handles.inline.hpp"
 #include "runtime/perfMemory.hpp"
+#include "services/memTracker.hpp"
 #include "utilities/exceptions.hpp"
 
 // put OS-includes here
@@ -768,6 +769,10 @@
   // clear the shared memory region
   (void)::memset((void*) mapAddress, 0, size);
 
+  // it does not go through os api, the operation has to record from here
+  MemTracker::record_virtual_memory_reserve((address)mapAddress, size, CURRENT_PC);
+  MemTracker::record_virtual_memory_type((address)mapAddress, mtInternal);
+
   return mapAddress;
 }
 
@@ -927,6 +932,10 @@
               "Could not map PerfMemory");
   }
 
+  // it does not go through os api, the operation has to record from here
+  MemTracker::record_virtual_memory_reserve((address)mapAddress, size, CURRENT_PC);
+  MemTracker::record_virtual_memory_type((address)mapAddress, mtInternal);
+
   *addr = mapAddress;
   *sizep = size;
 
--- a/src/os/windows/vm/perfMemory_windows.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/os/windows/vm/perfMemory_windows.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -30,6 +30,7 @@
 #include "os_windows.inline.hpp"
 #include "runtime/handles.inline.hpp"
 #include "runtime/perfMemory.hpp"
+#include "services/memTracker.hpp"
 #include "utilities/exceptions.hpp"
 
 #include <windows.h>
@@ -1496,6 +1497,10 @@
   // clear the shared memory region
   (void)memset(mapAddress, '\0', size);
 
+  // it does not go through os api, the operation has to record from here
+  MemTracker::record_virtual_memory_reserve((address)mapAddress, size, CURRENT_PC);
+  MemTracker::record_virtual_memory_type((address)mapAddress, mtInternal);
+
   return (char*) mapAddress;
 }
 
@@ -1672,6 +1677,11 @@
               "Could not map PerfMemory");
   }
 
+  // it does not go through os api, the operation has to record from here
+  MemTracker::record_virtual_memory_reserve((address)mapAddress, size, CURRENT_PC);
+  MemTracker::record_virtual_memory_type((address)mapAddress, mtInternal);
+
+
   *addrp = (char*)mapAddress;
   *sizep = size;
 
@@ -1824,6 +1834,8 @@
   }
 
   remove_file_mapping(addr);
+  // it does not go through os api, the operation has to record from here
+  MemTracker::record_virtual_memory_release((address)addr, bytes);
 }
 
 char* PerfMemory::backing_store_filename() {
--- a/src/os_cpu/bsd_x86/vm/os_bsd_x86.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/os_cpu/bsd_x86/vm/os_bsd_x86.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -291,7 +291,7 @@
   return (char*) -1;
 }
 
-void os::initialize_thread() {
+void os::initialize_thread(Thread* thr) {
 // Nothing to do.
 }
 
--- a/src/os_cpu/bsd_zero/vm/os_bsd_zero.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/os_cpu/bsd_zero/vm/os_bsd_zero.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -97,7 +97,7 @@
 #endif // SPARC
 }
 
-void os::initialize_thread() {
+void os::initialize_thread(Thread* thr) {
   // Nothing to do.
 }
 
--- a/src/os_cpu/linux_sparc/vm/os_linux_sparc.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/os_cpu/linux_sparc/vm/os_linux_sparc.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -218,7 +218,7 @@
   return (char*) 0;
 }
 
-void os::initialize_thread() {}
+void os::initialize_thread(Thread* thr) {}
 
 void os::print_context(outputStream *st, void *context) {
   if (context == NULL) return;
--- a/src/os_cpu/linux_x86/vm/os_linux_x86.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/os_cpu/linux_x86/vm/os_linux_x86.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -108,7 +108,7 @@
   return (char*) -1;
 }
 
-void os::initialize_thread() {
+void os::initialize_thread(Thread* thr) {
 // Nothing to do.
 }
 
--- a/src/os_cpu/linux_zero/vm/os_linux_zero.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/os_cpu/linux_zero/vm/os_linux_zero.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -92,7 +92,7 @@
 #endif // SPARC
 }
 
-void os::initialize_thread() {
+void os::initialize_thread(Thread * thr){
   // Nothing to do.
 }
 
--- a/src/os_cpu/windows_x86/vm/os_windows_x86.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/os_cpu/windows_x86/vm/os_windows_x86.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -213,7 +213,7 @@
   return true;
 }
 
-void os::initialize_thread() {
+void os::initialize_thread(Thread* thr) {
 // Nothing to do.
 }
 
--- a/src/share/vm/classfile/vmSymbols.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/classfile/vmSymbols.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -110,10 +110,12 @@
   template(sun_jkernel_DownloadManager,               "sun/jkernel/DownloadManager")              \
   template(getBootClassPathEntryForClass_name,        "getBootClassPathEntryForClass")            \
   template(sun_misc_PostVMInitHook,                   "sun/misc/PostVMInitHook")                  \
+  template(sun_misc_Launcher_ExtClassLoader,          "sun/misc/Launcher$ExtClassLoader")         \
                                                                                                   \
   /* Java runtime version access */                                                               \
   template(sun_misc_Version,                          "sun/misc/Version")                         \
   template(java_runtime_name_name,                    "java_runtime_name")                        \
+  template(java_runtime_version_name,                 "java_runtime_version")                     \
                                                                                                   \
   /* class file format tags */                                                                    \
   template(tag_source_file,                           "SourceFile")                               \
@@ -719,6 +721,21 @@
   /* java/lang/ref/Reference */                                                                                         \
   do_intrinsic(_Reference_get,            java_lang_ref_Reference, get_name,    void_object_signature, F_R)             \
                                                                                                                         \
+  /* support for com.sun.crypto.provider.AESCrypt and some of its callers */                                            \
+  do_class(com_sun_crypto_provider_aescrypt,      "com/sun/crypto/provider/AESCrypt")                                   \
+  do_intrinsic(_aescrypt_encryptBlock, com_sun_crypto_provider_aescrypt, encryptBlock_name, byteArray_int_byteArray_int_signature, F_R)   \
+  do_intrinsic(_aescrypt_decryptBlock, com_sun_crypto_provider_aescrypt, decryptBlock_name, byteArray_int_byteArray_int_signature, F_R)   \
+   do_name(     encryptBlock_name,                                 "encryptBlock")                                      \
+   do_name(     decryptBlock_name,                                 "decryptBlock")                                      \
+   do_signature(byteArray_int_byteArray_int_signature,             "([BI[BI)V")                                         \
+                                                                                                                        \
+  do_class(com_sun_crypto_provider_cipherBlockChaining,            "com/sun/crypto/provider/CipherBlockChaining")       \
+   do_intrinsic(_cipherBlockChaining_encryptAESCrypt, com_sun_crypto_provider_cipherBlockChaining, encrypt_name, byteArray_int_int_byteArray_int_signature, F_R)   \
+   do_intrinsic(_cipherBlockChaining_decryptAESCrypt, com_sun_crypto_provider_cipherBlockChaining, decrypt_name, byteArray_int_int_byteArray_int_signature, F_R)   \
+   do_name(     encrypt_name,                                      "encrypt")                                           \
+   do_name(     decrypt_name,                                      "decrypt")                                           \
+   do_signature(byteArray_int_int_byteArray_int_signature,         "([BII[BI)V")                                        \
+                                                                                                                        \
   /* support for sun.misc.Unsafe */                                                                                     \
   do_class(sun_misc_Unsafe,               "sun/misc/Unsafe")                                                            \
                                                                                                                         \
--- a/src/share/vm/memory/allocation.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/memory/allocation.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -389,19 +389,18 @@
   NOT_PRODUCT(Atomic::inc(&_instance_count);)
 }
 
-Arena::Arena(Arena *a) : _chunk(a->_chunk), _hwm(a->_hwm), _max(a->_max), _first(a->_first) {
-  set_size_in_bytes(a->size_in_bytes());
-  NOT_PRODUCT(Atomic::inc(&_instance_count);)
-}
-
-
 Arena *Arena::move_contents(Arena *copy) {
   copy->destruct_contents();
   copy->_chunk = _chunk;
   copy->_hwm   = _hwm;
   copy->_max   = _max;
   copy->_first = _first;
-  copy->set_size_in_bytes(size_in_bytes());
+
+  // work around a rare race condition in which native memory tracking
+  // could double count the arena size
+  size_t size = size_in_bytes();
+  set_size_in_bytes(0);
+  copy->set_size_in_bytes(size);
   // Destroy original arena
   reset();
   return copy;            // Return Arena with contents
@@ -453,6 +452,9 @@
     char* end = _first->next() ? _first->top() : _hwm;
     free_malloced_objects(_first, _first->bottom(), end, _hwm);
   }
+  // reset size before chop to avoid a rare race condition
+  // that can have total arena memory exceed total chunk memory
+  set_size_in_bytes(0);
   _first->chop();
   reset();
 }
--- a/src/share/vm/memory/allocation.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/memory/allocation.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -134,8 +134,10 @@
   mtNMT               = 0x0A00,  // memory used by native memory tracking
   mtChunk             = 0x0B00,  // chunk that holds content of arenas
   mtJavaHeap          = 0x0C00,  // Java heap
-  mtDontTrack         = 0x0D00,  // memory we donot or cannot track
-  mt_number_of_types  = 0x000C,  // number of memory types
+  mtClassShared       = 0x0D00,  // class data sharing
+  mt_number_of_types  = 0x000D,  // number of memory types (mtDontTrack
+                                 // is not included as a valid type)
+  mtDontTrack         = 0x0E00,  // memory we do not or cannot track
   mt_masks            = 0x7F00,
 
   // object type mask
@@ -299,7 +301,6 @@
  public:
   Arena();
   Arena(size_t init_size);
-  Arena(Arena *old);
   ~Arena();
   void  destruct_contents();
   char* hwm() const             { return _hwm; }
--- a/src/share/vm/memory/filemap.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/memory/filemap.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -29,6 +29,7 @@
 #include "runtime/arguments.hpp"
 #include "runtime/java.hpp"
 #include "runtime/os.hpp"
+#include "services/memTracker.hpp"
 #include "utilities/defaultStream.hpp"
 
 # include <sys/stat.h>
@@ -358,7 +359,13 @@
   ReservedSpace unmapped_rs = rs.last_part(size);
   mapped_rs.release();
 
-  return map_region(i, true);
+  // This memory still belongs to JavaHeap
+  MemTracker::record_virtual_memory_type((address)unmapped_rs.base(), mtJavaHeap);
+  char* mapped_addr = map_region(i, true);
+  if (mapped_addr != NULL) {
+    MemTracker::record_virtual_memory_type((address)mapped_addr, mtJavaHeap);
+  }
+  return mapped_addr;
 }
 
 
--- a/src/share/vm/memory/genCollectedHeap.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/memory/genCollectedHeap.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -51,6 +51,7 @@
 #include "runtime/java.hpp"
 #include "runtime/vmThread.hpp"
 #include "services/memoryService.hpp"
+#include "services/memTracker.hpp"
 #include "utilities/vmError.hpp"
 #include "utilities/workgroup.hpp"
 #ifndef SERIALGC
@@ -171,9 +172,13 @@
     ReservedSpace this_rs = heap_rs.first_part(_gen_specs[i]->max_size(),
                                               UseSharedSpaces, UseSharedSpaces);
     _gens[i] = _gen_specs[i]->init(this_rs, i, rem_set());
+    // tag generations in JavaHeap
+    MemTracker::record_virtual_memory_type((address)this_rs.base(), mtJavaHeap);
     heap_rs = heap_rs.last_part(_gen_specs[i]->max_size());
   }
   _perm_gen = perm_gen_spec->init(heap_rs, PermSize, rem_set());
+  // tag PermGen
+  MemTracker::record_virtual_memory_type((address)heap_rs.base(), mtJavaHeap);
 
   clear_incremental_collection_failed();
 
--- a/src/share/vm/memory/resourceArea.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/memory/resourceArea.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -127,15 +127,21 @@
   void reset_to_mark() {
     if (UseMallocOnly) free_malloced_objects();
 
-    if( _chunk->next() )        // Delete later chunks
+    if( _chunk->next() ) {       // Delete later chunks
+      // reset arena size before deleting chunks. Otherwise, the total
+      // arena size could exceed total chunk size
+      assert(_area->size_in_bytes() > size_in_bytes(), "Sanity check");
+      _area->set_size_in_bytes(size_in_bytes());
       _chunk->next_chop();
+    } else {
+      assert(_area->size_in_bytes() == size_in_bytes(), "Sanity check");
+    }
     _area->_chunk = _chunk;     // Roll back arena to saved chunk
     _area->_hwm = _hwm;
     _area->_max = _max;
 
     // clear out this chunk (to detect allocation bugs)
     if (ZapResourceArea) memset(_hwm, badResourceValue, _max - _hwm);
-    _area->set_size_in_bytes(size_in_bytes());
   }
 
   ~ResourceMark() {
@@ -219,15 +225,21 @@
   void reset_to_mark() {
     if (UseMallocOnly) free_malloced_objects();
 
-    if( _chunk->next() )        // Delete later chunks
+    if( _chunk->next() ) {        // Delete later chunks
+      // reset arena size before deleting chunks. Otherwise, the total
+      // arena size could exceed total chunk size
+      assert(_area->size_in_bytes() > size_in_bytes(), "Sanity check");
+      _area->set_size_in_bytes(size_in_bytes());
       _chunk->next_chop();
+    } else {
+      assert(_area->size_in_bytes() == size_in_bytes(), "Sanity check");
+    }
     _area->_chunk = _chunk;     // Roll back arena to saved chunk
     _area->_hwm = _hwm;
     _area->_max = _max;
 
     // clear out this chunk (to detect allocation bugs)
     if (ZapResourceArea) memset(_hwm, badResourceValue, _max - _hwm);
-    _area->set_size_in_bytes(size_in_bytes());
   }
 
   ~DeoptResourceMark() {
--- a/src/share/vm/oops/methodOop.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/oops/methodOop.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -1097,8 +1097,12 @@
 vmSymbols::SID methodOopDesc::klass_id_for_intrinsics(klassOop holder) {
   // if loader is not the default loader (i.e., != NULL), we can't know the intrinsics
   // because we are not loading from core libraries
-  if (instanceKlass::cast(holder)->class_loader() != NULL)
+  // exception: the AES intrinsics come from lib/ext/sunjce_provider.jar
+  // which is not loaded by the default class loader, so we check for its loader here
+  if ((instanceKlass::cast(holder)->class_loader() != NULL) &&
+       instanceKlass::cast(holder)->class_loader()->klass()->klass_part()->name() != vmSymbols::sun_misc_Launcher_ExtClassLoader()) {
     return vmSymbols::NO_SID;   // regardless of name, no intrinsics here
+  }
 
   // see if the klass name is well-known:
   Symbol* klass_name = instanceKlass::cast(holder)->name();
--- a/src/share/vm/opto/callGenerator.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/opto/callGenerator.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -670,6 +670,129 @@
 }
 
 
+//------------------------PredictedIntrinsicGenerator------------------------------
+// Internal class which handles all predicted intrinsic calls by pairing an intrinsic with a fallback call generator.
+class PredictedIntrinsicGenerator : public CallGenerator {
+  CallGenerator* _intrinsic;  // supplies generate_predicate() and the intrinsic code
+  CallGenerator* _cg;         // generates the non-intrinsic (slow) path; its method() is passed to the base class
+
+public:
+  PredictedIntrinsicGenerator(CallGenerator* intrinsic,
+                              CallGenerator* cg)
+    : CallGenerator(cg->method())
+  {
+    _intrinsic = intrinsic;
+    _cg        = cg;
+  }
+
+  virtual bool      is_virtual()   const    { return true; }
+  virtual bool      is_inlined()   const    { return true; }
+  virtual bool      is_intrinsic() const    { return true; }
+
+  virtual JVMState* generate(JVMState* jvms);  // defined out of line below
+};
+
+
+CallGenerator* CallGenerator::for_predicted_intrinsic(CallGenerator* intrinsic,
+                                                      CallGenerator* cg) {
+  return new PredictedIntrinsicGenerator(intrinsic, cg);  // factory: wraps the intrinsic together with its fallback generator cg
+}
+
+
+JVMState* PredictedIntrinsicGenerator::generate(JVMState* jvms) {  // emits the predicate, the fallback path and the intrinsic path, then merges them
+  GraphKit kit(jvms);
+  PhaseGVN& gvn = kit.gvn();
+
+  CompileLog* log = kit.C->log();
+  if (log != NULL) {
+    log->elem("predicted_intrinsic bci='%d' method='%d'",
+              jvms->bci(), log->identify(method()));
+  }
+
+  Node* slow_ctl = _intrinsic->generate_predicate(kit.sync_jvms());  // control input for the fallback (_cg) path, if any
+  if (kit.failing())
+    return NULL;  // might happen because of NodeCountInliningCutoff
+
+  SafePointNode* slow_map = NULL;
+  JVMState* slow_jvms = NULL;  // stays NULL unless the fallback path below is actually generated
+  if (slow_ctl != NULL) {
+    PreserveJVMState pjvms(&kit);
+    kit.set_control(slow_ctl);
+    if (!kit.stopped()) {
+      slow_jvms = _cg->generate(kit.sync_jvms());
+      if (kit.failing())
+        return NULL;  // might happen because of NodeCountInliningCutoff
+      assert(slow_jvms != NULL, "must be");
+      kit.add_exception_states_from(slow_jvms);
+      kit.set_map(slow_jvms->map());
+      if (!kit.stopped())
+        slow_map = kit.stop();
+    }
+  }
+
+  if (kit.stopped()) {
+    // Predicate is always false.
+    kit.set_jvms(slow_jvms);
+    return kit.transfer_exceptions_into_jvms();
+  }
+
+  // Generate intrinsic code:
+  JVMState* new_jvms = _intrinsic->generate(kit.sync_jvms());
+  if (new_jvms == NULL) {
+    // Intrinsic failed, so use slow code or make a direct call.
+    if (slow_map == NULL) {
+      CallGenerator* cg = CallGenerator::for_direct_call(method());
+      new_jvms = cg->generate(kit.sync_jvms());
+    } else {
+      kit.set_jvms(slow_jvms);
+      return kit.transfer_exceptions_into_jvms();
+    }
+  }
+  kit.add_exception_states_from(new_jvms);
+  kit.set_jvms(new_jvms);
+
+  // Need to merge slow and fast?
+  if (slow_map == NULL) {
+    // The fast path is the only path remaining.
+    return kit.transfer_exceptions_into_jvms();
+  }
+
+  if (kit.stopped()) {
+    // Intrinsic method threw an exception, so it's just the slow path after all.
+    kit.set_jvms(slow_jvms);
+    return kit.transfer_exceptions_into_jvms();
+  }
+
+  // Finish the diamond.
+  kit.C->set_has_split_ifs(true); // Has chance for split-if optimization
+  RegionNode* region = new (kit.C) RegionNode(3);
+  region->init_req(1, kit.control());        // slot 1: intrinsic (fast) path
+  region->init_req(2, slow_map->control());  // slot 2: fallback (slow) path
+  kit.set_control(gvn.transform(region));
+  Node* iophi = PhiNode::make(region, kit.i_o(), Type::ABIO);
+  iophi->set_req(2, slow_map->i_o());
+  kit.set_i_o(gvn.transform(iophi));
+  kit.merge_memory(slow_map->merged_memory(), region, 2);
+  uint tos = kit.jvms()->stkoff() + kit.sp();
+  uint limit = slow_map->req();
+  for (uint i = TypeFunc::Parms; i < limit; i++) {
+    // Skip unused stack slots; fast forward to monoff();
+    if (i == tos) {
+      i = kit.jvms()->monoff();
+      if( i >= limit ) break;
+    }
+    Node* m = kit.map()->in(i);
+    Node* n = slow_map->in(i);
+    if (m != n) {  // the two paths disagree on this map input, so merge them with a phi
+      const Type* t = gvn.type(m)->meet(gvn.type(n));
+      Node* phi = PhiNode::make(region, m, t);
+      phi->set_req(2, n);
+      kit.map()->set_req(i, gvn.transform(phi));
+    }
+  }
+  return kit.transfer_exceptions_into_jvms();
+}
+
 //-------------------------UncommonTrapCallGenerator-----------------------------
 // Internal class which handles all out-of-line calls checking receiver type.
 class UncommonTrapCallGenerator : public CallGenerator {
--- a/src/share/vm/opto/callGenerator.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/opto/callGenerator.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -143,6 +143,9 @@
   // Registry for intrinsics:
   static CallGenerator* for_intrinsic(ciMethod* m);
   static void register_intrinsic(ciMethod* m, CallGenerator* cg);
+  static CallGenerator* for_predicted_intrinsic(CallGenerator* intrinsic,
+                                                CallGenerator* cg);
+  virtual Node* generate_predicate(JVMState* jvms) { return NULL; };
 
   static void print_inlining(ciMethod* callee, int inline_level, int bci, const char* msg) {
     if (PrintInlining)
--- a/src/share/vm/opto/doCall.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/opto/doCall.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -108,7 +108,17 @@
   // intrinsics handle strict f.p. correctly.
   if (allow_inline && allow_intrinsics) {
     CallGenerator* cg = find_intrinsic(callee, call_is_virtual);
-    if (cg != NULL)  return cg;
+    if (cg != NULL) {
+      if (cg->is_predicted()) {
+        // Code without intrinsic but, hopefully, inlined.
+        CallGenerator* inline_cg = this->call_generator(callee,
+              vtable_index, call_is_virtual, jvms, allow_inline, prof_factor, false);
+        if (inline_cg != NULL) {
+          cg = CallGenerator::for_predicted_intrinsic(cg, inline_cg);
+        }
+      }
+      return cg;
+    }
   }
 
   // Do method handle calls.
--- a/src/share/vm/opto/escape.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/opto/escape.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -886,12 +886,16 @@
                                        arg_has_oops && (i > TypeFunc::Parms);
 #ifdef ASSERT
           if (!(is_arraycopy ||
-                call->as_CallLeaf()->_name != NULL &&
-                (strcmp(call->as_CallLeaf()->_name, "g1_wb_pre")  == 0 ||
-                 strcmp(call->as_CallLeaf()->_name, "g1_wb_post") == 0 ))
-          ) {
+                (call->as_CallLeaf()->_name != NULL &&
+                 (strcmp(call->as_CallLeaf()->_name, "g1_wb_pre")  == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "g1_wb_post") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "aescrypt_encryptBlock") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0)
+                  ))) {
             call->dump();
-            assert(false, "EA: unexpected CallLeaf");
+            fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
           }
 #endif
           // Always process arraycopy's destination object since
--- a/src/share/vm/opto/library_call.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/opto/library_call.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -44,18 +44,22 @@
  public:
  private:
   bool             _is_virtual;
+  bool             _is_predicted;
   vmIntrinsics::ID _intrinsic_id;
 
  public:
-  LibraryIntrinsic(ciMethod* m, bool is_virtual, vmIntrinsics::ID id)
+  LibraryIntrinsic(ciMethod* m, bool is_virtual, bool is_predicted, vmIntrinsics::ID id)
     : InlineCallGenerator(m),
       _is_virtual(is_virtual),
+      _is_predicted(is_predicted),
       _intrinsic_id(id)
   {
   }
   virtual bool is_intrinsic() const { return true; }
   virtual bool is_virtual()   const { return _is_virtual; }
+  virtual bool is_predicted()   const { return _is_predicted; }
   virtual JVMState* generate(JVMState* jvms);
+  virtual Node* generate_predicate(JVMState* jvms);
   vmIntrinsics::ID intrinsic_id() const { return _intrinsic_id; }
 };
 
@@ -83,6 +87,7 @@
   int               arg_size()  const    { return callee()->arg_size(); }
 
   bool try_to_inline();
+  Node* try_to_predicate();
 
   // Helper functions to inline natives
   void push_result(RegionNode* region, PhiNode* value);
@@ -148,6 +153,7 @@
   CallJavaNode* generate_method_call_virtual(vmIntrinsics::ID method_id) {
     return generate_method_call(method_id, true, false);
   }
+  Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static);
 
   Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2);
   Node* make_string_method_node(int opcode, Node* str1, Node* str2);
@@ -253,6 +259,10 @@
   bool inline_reverseBytes(vmIntrinsics::ID id);
 
   bool inline_reference_get();
+  bool inline_aescrypt_Block(vmIntrinsics::ID id);
+  bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id);
+  Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting);
+  Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
 };
 
 
@@ -306,6 +316,8 @@
     }
   }
 
+  bool is_predicted = false;
+
   switch (id) {
   case vmIntrinsics::_compareTo:
     if (!SpecialStringCompareTo)  return NULL;
@@ -413,6 +425,18 @@
     break;
 #endif
 
+  case vmIntrinsics::_aescrypt_encryptBlock:
+  case vmIntrinsics::_aescrypt_decryptBlock:
+    if (!UseAESIntrinsics) return NULL;
+    break;
+
+  case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
+  case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
+    if (!UseAESIntrinsics) return NULL;
+    // these two require the predicated logic
+    is_predicted = true;
+    break;
+
  default:
     assert(id <= vmIntrinsics::LAST_COMPILER_INLINE, "caller responsibility");
     assert(id != vmIntrinsics::_Object_init && id != vmIntrinsics::_invoke, "enum out of order?");
@@ -444,7 +468,7 @@
     if (!InlineUnsafeOps)  return NULL;
   }
 
-  return new LibraryIntrinsic(m, is_virtual, (vmIntrinsics::ID) id);
+  return new LibraryIntrinsic(m, is_virtual, is_predicted, (vmIntrinsics::ID) id);
 }
 
 //----------------------register_library_intrinsics-----------------------
@@ -496,6 +520,47 @@
   return NULL;
 }
 
+// Emit the predicate for a predicted intrinsic: returns try_to_predicate()'s node, or NULL if the kit is failing.
+Node* LibraryIntrinsic::generate_predicate(JVMState* jvms) {
+  LibraryCallKit kit(jvms, this);
+  Compile* C = kit.C;
+  int nodes = C->unique(); // node count before; the log below reports the delta
+#ifndef PRODUCT
+  assert(is_predicted(), "sanity");
+  if ((PrintIntrinsics || PrintInlining NOT_PRODUCT( || PrintOptoInlining) ) && Verbose) {
+    char buf[1000];
+    const char* str = vmIntrinsics::short_name_as_C_string(intrinsic_id(), buf, sizeof(buf));
+    tty->print_cr("Predicate for intrinsic %s", str);
+  }
+#endif
+  Node* slow_ctl = kit.try_to_predicate();
+  if (!kit.failing()) {
+    if (C->log()) {
+      C->log()->elem("predicate_intrinsic id='%s'%s nodes='%d'",
+                     vmIntrinsics::name_at(intrinsic_id()),
+                     (is_virtual() ? " virtual='1'" : ""),
+                     C->unique() - nodes);
+    }
+    return slow_ctl; // Could be NULL if the check folds.
+  }
+
+  // The intrinsic bailed out: optionally report it, count the failure, and return NULL
+  if (PrintIntrinsics || PrintInlining NOT_PRODUCT( || PrintOptoInlining) ) {
+    if (jvms->has_method()) {
+      // Not a root compile.
+      const char* msg = "failed to generate predicate for intrinsic";
+      CompileTask::print_inlining(kit.callee(), jvms->depth() - 1, kit.bci(), msg);
+    } else {
+      // Root compile
+      tty->print("Did not generate predicate for intrinsic %s%s at bci:%d in",
+               vmIntrinsics::name_at(intrinsic_id()),
+               (is_virtual() ? " (virtual)" : ""), kit.bci());
+    }
+  }
+  C->gather_intrinsic_statistics(intrinsic_id(), is_virtual(), Compile::_intrinsic_failed);
+  return NULL;
+}
+
 bool LibraryCallKit::try_to_inline() {
   // Handle symbolic names for otherwise undistinguished boolean switches:
   const bool is_store       = true;
@@ -767,6 +832,14 @@
   case vmIntrinsics::_Reference_get:
     return inline_reference_get();
 
+  case vmIntrinsics::_aescrypt_encryptBlock:
+  case vmIntrinsics::_aescrypt_decryptBlock:
+    return inline_aescrypt_Block(intrinsic_id());
+
+  case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
+  case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
+    return inline_cipherBlockChaining_AESCrypt(intrinsic_id());
+
   default:
     // If you get here, it may be that someone has added a new intrinsic
     // to the list in vmSymbols.hpp without implementing it here.
@@ -780,6 +853,36 @@
   }
 }
 
+// Dispatch on intrinsic_id() to its predicate builder; unknown ids return the current control and set control to top.
+Node* LibraryCallKit::try_to_predicate() {
+  if (!jvms()->has_method()) {
+    // Root JVMState has a null method.
+    assert(map()->memory()->Opcode() == Op_Parm, "");
+    // Insert the memory aliasing node
+    set_all_memory(reset_memory());
+  }
+  assert(merged_memory(), "");
+
+  switch (intrinsic_id()) {
+  case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
+    return inline_cipherBlockChaining_AESCrypt_predicate(false);
+  case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
+    return inline_cipherBlockChaining_AESCrypt_predicate(true);
+  default:
+    // If you get here, it may be that someone has added a new intrinsic
+    // to the list in vmSymbols.hpp without implementing it here.
+#ifndef PRODUCT
+    if ((PrintMiscellaneous && (Verbose || WizardMode)) || PrintOpto) {
+      tty->print_cr("*** Warning: Unimplemented predicate for intrinsic %s(%d)",
+                    vmIntrinsics::name_at(intrinsic_id()), intrinsic_id());
+    }
+#endif
+    Node* slow_ctl = control();
+    set_control(top()); // No fast path intrinsic
+    return slow_ctl;
+  }
+}
+
 //------------------------------push_result------------------------------
 // Helper function for finishing intrinsics.
 void LibraryCallKit::push_result(RegionNode* region, PhiNode* value) {
@@ -5613,3 +5716,265 @@
   push(result);
   return true;
 }
+
+
+// Load field 'fieldName' (signature 'fieldTypeString', static if is_static) from fromObj's klass.
+// Returns the load node, or NULL when the klass has no such field; is_exact requires an exact klass.
+Node * LibraryCallKit::load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString,
+                                              bool is_exact=true, bool is_static=false) {
+  const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr();
+  assert(tinst != NULL, "obj is null");
+  assert(tinst->klass()->is_loaded(), "obj is not loaded");
+  assert(!is_exact || tinst->klass_is_exact(), "klass not exact");
+
+  ciField* field = tinst->klass()->as_instance_klass()->get_field_by_name(ciSymbol::make(fieldName),
+                                                                          ciSymbol::make(fieldTypeString),
+                                                                          is_static);
+  if (field == NULL) return (Node *) NULL;  // callers treat NULL as "field not available"
+
+  // Next code  copied from Parse::do_get_xxx():
+
+  // Compute address and memory type.
+  int offset  = field->offset_in_bytes();
+  bool is_vol = field->is_volatile();
+  ciType* field_klass = field->type();
+  assert(field_klass->is_loaded(), "should be loaded");
+  const TypePtr* adr_type = C->alias_type(field)->adr_type();
+  Node *adr = basic_plus_adr(fromObj, fromObj, offset);
+  BasicType bt = field->layout_type();
+
+  // Build the resultant type of the load (NOTE(review): presumes a reference-typed field - confirm)
+  const Type *type = TypeOopPtr::make_from_klass(field_klass->as_klass());
+
+  // Build the load.
+  Node* loadedField = make_load(NULL, adr, type, bt, adr_type, is_vol);
+  return loadedField;
+}
+
+
+//------------------------------inline_aescrypt_Block-----------------------
+// Emit a leaf call to the AES single-block encrypt/decrypt stub selected by 'id'.
+// Returns false (nothing emitted for the stub) when the stub or the key array is unavailable.
+bool LibraryCallKit::inline_aescrypt_Block(vmIntrinsics::ID id) {
+  // Start from NULL so the stubAddr test below is well defined even if 'id' matches no case.
+  address stubAddr = NULL;
+  const char *stubName = NULL;
+  assert(UseAES, "need AES instruction support");
+
+  switch(id) {
+  case vmIntrinsics::_aescrypt_encryptBlock:
+    stubAddr = StubRoutines::aescrypt_encryptBlock();
+    stubName = "aescrypt_encryptBlock";
+    break;
+  case vmIntrinsics::_aescrypt_decryptBlock:
+    stubAddr = StubRoutines::aescrypt_decryptBlock();
+    stubName = "aescrypt_decryptBlock";
+    break;
+  default:
+    break;
+  }
+  if (stubAddr == NULL) return false;
+
+  // Pick up the arguments; the operand stack is not adjusted here.
+  int nargs = 5;  // this + 2 oop/offset combos
+  assert(callee()->signature()->size() == nargs-1, "encryptBlock has 4 arguments");
+  Node *aescrypt_object  = argument(0);
+  Node *src         = argument(1);
+  Node *src_offset  = argument(2);
+  Node *dest        = argument(3);
+  Node *dest_offset = argument(4);
+  // (1) src and dest are arrays.
+  const Type* src_type = src->Value(&_gvn);
+  const Type* dest_type = dest->Value(&_gvn);
+  const TypeAryPtr* top_src = src_type->isa_aryptr();
+  const TypeAryPtr* top_dest = dest_type->isa_aryptr();
+  assert (top_src  != NULL && top_src->klass()  != NULL &&  top_dest != NULL && top_dest->klass() != NULL, "args are strange");
+  // No range checks are emitted here; only the element addresses for the stub call are computed.
+  Node* src_start  = src;
+  Node* dest_start = dest;
+  if (src_offset != NULL || dest_offset != NULL) {
+    assert(src_offset != NULL && dest_offset != NULL, "");
+    src_start  = array_element_address(src,  src_offset,  T_BYTE);
+    dest_start = array_element_address(dest, dest_offset, T_BYTE);
+  }
+
+  // now need to get the start of its expanded key array
+  // this requires a newer class file that has this array as littleEndian ints, otherwise we revert to java
+  Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object);
+  if (k_start == NULL) return false;
+
+  // Call the stub.
+  make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::aescrypt_block_Type(),
+                    stubAddr, stubName, TypePtr::BOTTOM,
+                    src_start, dest_start, k_start);
+  return true;
+}
+
+//------------------------------inline_cipherBlockChaining_AESCrypt-----------------------
+// Emit a leaf call to the CBC AES encrypt/decrypt stub selected by 'id'.
+// Returns false when the stub, the AESCrypt klass, or a required field is unavailable.
+bool LibraryCallKit::inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id) {
+  // Start from NULL so the stubAddr test below is well defined even if 'id' matches no case.
+  address stubAddr = NULL;
+  const char *stubName = NULL;
+  assert(UseAES, "need AES instruction support");
+
+  switch(id) {
+  case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
+    stubAddr = StubRoutines::cipherBlockChaining_encryptAESCrypt();
+    stubName = "cipherBlockChaining_encryptAESCrypt";
+    break;
+  case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
+    stubAddr = StubRoutines::cipherBlockChaining_decryptAESCrypt();
+    stubName = "cipherBlockChaining_decryptAESCrypt";
+    break;
+  default:
+    break;
+  }
+  if (stubAddr == NULL) return false;
+
+  // Pick up the arguments; the operand stack is not adjusted here.
+  int nargs = 6;  // this + oop/offset + len + oop/offset
+  assert(callee()->signature()->size() == nargs-1, "wrong number of arguments");
+  Node *cipherBlockChaining_object  = argument(0);
+  Node *src         = argument(1);
+  Node *src_offset  = argument(2);
+  Node *len         = argument(3);
+  Node *dest        = argument(4);
+  Node *dest_offset = argument(5);
+
+  // (1) src and dest are arrays.
+  const Type* src_type = src->Value(&_gvn);
+  const Type* dest_type = dest->Value(&_gvn);
+  const TypeAryPtr* top_src = src_type->isa_aryptr();
+  const TypeAryPtr* top_dest = dest_type->isa_aryptr();
+  assert (top_src  != NULL && top_src->klass()  != NULL
+          &&  top_dest != NULL && top_dest->klass() != NULL, "args are strange");
+
+  // checks are the responsibility of the caller
+  Node* src_start  = src;
+  Node* dest_start = dest;
+  if (src_offset != NULL || dest_offset != NULL) {
+    assert(src_offset != NULL && dest_offset != NULL, "");
+    src_start  = array_element_address(src,  src_offset,  T_BYTE);
+    dest_start = array_element_address(dest, dest_offset, T_BYTE);
+  }
+
+  // if we are in this set of code, we "know" the embeddedCipher is an AESCrypt object
+  // (because of the predicated logic executed earlier).
+  // so we cast it here safely.
+  // this requires a newer class file that has this array as littleEndian ints, otherwise we revert to java
+  Node* embeddedCipherObj = load_field_from_object(cipherBlockChaining_object, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false);
+  if (embeddedCipherObj == NULL) return false;
+
+  // cast it to what we know it will be at runtime
+  const TypeInstPtr* tinst = _gvn.type(cipherBlockChaining_object)->isa_instptr();
+  assert(tinst != NULL, "CBC obj is null");
+  assert(tinst->klass()->is_loaded(), "CBC obj is not loaded");
+  ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt"));
+  if (!klass_AESCrypt->is_loaded()) return false;
+
+  ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass();
+  const TypeKlassPtr* aklass = TypeKlassPtr::make(instklass_AESCrypt);
+  const TypeOopPtr* xtype = aklass->as_instance_type();
+  Node* aescrypt_object = new(C) CheckCastPPNode(control(), embeddedCipherObj, xtype);
+  aescrypt_object = _gvn.transform(aescrypt_object);
+
+  // we need to get the start of the aescrypt_object's expanded key array
+  Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object);
+  if (k_start == NULL) return false;
+
+  // similarly, get the start address of the r vector
+  Node* objRvec = load_field_from_object(cipherBlockChaining_object, "r", "[B", /*is_exact*/ false);
+  if (objRvec == NULL) return false;
+  Node* r_start = array_element_address(objRvec, intcon(0), T_BYTE);
+
+  // Call the stub, passing src_start, dest_start, k_start, r_start and src_len
+  make_runtime_call(RC_LEAF|RC_NO_FP,
+                    OptoRuntime::cipherBlockChaining_aescrypt_Type(),
+                    stubAddr, stubName, TypePtr::BOTTOM,
+                    src_start, dest_start, k_start, r_start, len);
+  // return is void so no result needs to be pushed
+  return true;
+}
+
+//------------------------------get_key_start_from_aescrypt_object-----------------------
+// Address of element 0 of the int[] field "K" of aescrypt_object, or NULL if that field cannot be loaded.
+Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) {
+  Node* key_array = load_field_from_object(aescrypt_object, "K", "[I", /*is_exact*/ false);
+  assert (key_array != NULL, "wrong version of com.sun.crypto.provider.AESCrypt");
+  if (key_array != NULL) {
+    return array_element_address(key_array, intcon(0), T_INT);
+  }
+  return (Node *) NULL;
+}
+
+//----------------------------inline_cipherBlockChaining_AESCrypt_predicate----------------------------
+// Return node representing slow path of predicate check.
+// the pseudo code we want to emulate with this predicate is:
+// for encryption:
+//    if (embeddedCipherObj instanceof AESCrypt) do_intrinsic, else do_javapath
+// for decryption:
+//    if ((embeddedCipherObj instanceof AESCrypt) && (cipher!=plain)) do_intrinsic, else do_javapath
+//    note cipher==plain is more conservative than the original java code but that's OK
+//
+Node* LibraryCallKit::inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting) {
+  // First, check receiver for NULL since it is virtual method.
+  int nargs = arg_size();
+  Node* objCBC = argument(0);
+  _sp += nargs;
+  objCBC = do_null_check(objCBC, T_OBJECT);
+  _sp -= nargs;
+
+  if (stopped()) return NULL; // Always NULL
+
+  // Load embeddedCipher field of CipherBlockChaining object (NULL if the field is not found).
+  Node* embeddedCipherObj = load_field_from_object(objCBC, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false);
+
+  // get AESCrypt klass for instanceOf check
+  // AESCrypt might not be loaded yet if some other SymmetricCipher got us to this compile point
+  // will have same classloader as CipherBlockChaining object
+  const TypeInstPtr* tinst = _gvn.type(objCBC)->isa_instptr();
+  assert(tinst != NULL, "CBCobj is null");
+  assert(tinst->klass()->is_loaded(), "CBCobj is not loaded");
+
+  // we want to do an instanceof comparison against the AESCrypt class
+  ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt"));
+  if (embeddedCipherObj == NULL || !klass_AESCrypt->is_loaded()) {
+    // if the field is missing or AESCrypt is not even loaded, we never take the intrinsic fast path
+    Node* ctrl = control();
+    set_control(top()); // no regular fast path
+    return ctrl;
+  }
+  ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass();
+
+  _sp += nargs;          // gen_instanceof might do an uncommon trap
+  Node* instof = gen_instanceof(embeddedCipherObj, makecon(TypeKlassPtr::make(instklass_AESCrypt)));
+  _sp -= nargs;
+  Node* cmp_instof  = _gvn.transform(new (C) CmpINode(instof, intcon(1)));
+  Node* bool_instof  = _gvn.transform(new (C) BoolNode(cmp_instof, BoolTest::ne));
+
+  Node* instof_false = generate_guard(bool_instof, NULL, PROB_MIN);
+
+  // for encryption, we are done
+  if (!decrypting)
+    return instof_false;  // even if it is NULL
+
+  // for decryption, we need to add a further check to avoid
+  // taking the intrinsic path when cipher and plain are the same
+  // see the original java code for why.
+  RegionNode* region = new(C) RegionNode(3);
+  region->init_req(1, instof_false);
+  Node* src = argument(1);
+  Node *dest = argument(4);
+  Node* cmp_src_dest = _gvn.transform(new (C) CmpPNode(src, dest));
+  Node* bool_src_dest = _gvn.transform(new (C) BoolNode(cmp_src_dest, BoolTest::eq));
+  Node* src_dest_conjoint = generate_guard(bool_src_dest, NULL, PROB_MIN);
+  region->init_req(2, src_dest_conjoint);
+
+  record_for_igvn(region);
+  return _gvn.transform(region);
+
+}
+
+
--- a/src/share/vm/opto/runtime.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/opto/runtime.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -811,6 +811,48 @@
   return TypeFunc::make(domain, range);
 }
 
+// Signature of the single-block aescrypt encrypt/decrypt stubs:
+// (src, dest, k array) -> void; the block length is constant so no length is passed.
+const TypeFunc* OptoRuntime::aescrypt_block_Type() {
+  // domain: three non-null pointers
+  const int argcnt = 3;
+  const Type** in_fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  in_fields[argp++] = TypePtr::NOTNULL;    // src
+  in_fields[argp++] = TypePtr::NOTNULL;    // dest
+  in_fields[argp++] = TypePtr::NOTNULL;    // k array
+  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, in_fields);
+
+  // range: nothing is returned
+  const Type** out_fields = TypeTuple::fields(1);
+  out_fields[TypeFunc::Parms+0] = NULL; // void
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, out_fields);
+  return TypeFunc::make(domain, range);
+}
+
+// Signature of the cipherBlockChaining aescrypt encrypt/decrypt stubs:
+// (src, dest, k array, r array, src len) -> void.
+const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() {
+  // domain: four non-null pointers followed by an int length
+  const int argcnt = 5;
+  const Type** in_fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  in_fields[argp++] = TypePtr::NOTNULL;    // src
+  in_fields[argp++] = TypePtr::NOTNULL;    // dest
+  in_fields[argp++] = TypePtr::NOTNULL;    // k array
+  in_fields[argp++] = TypePtr::NOTNULL;    // r array
+  in_fields[argp++] = TypeInt::INT;        // src len
+  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, in_fields);
+
+  // range: nothing is returned
+  const Type** out_fields = TypeTuple::fields(1);
+  out_fields[TypeFunc::Parms+0] = NULL; // void
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, out_fields);
+  return TypeFunc::make(domain, range);
+}
+
 //------------- Interpreter state access for on stack replacement
 const TypeFunc* OptoRuntime::osr_end_Type() {
   // create input type (domain)
--- a/src/share/vm/opto/runtime.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/opto/runtime.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -280,6 +280,9 @@
 
   static const TypeFunc* array_fill_Type();
 
+  static const TypeFunc* aescrypt_block_Type();
+  static const TypeFunc* cipherBlockChaining_aescrypt_Type();
+
   // leaf on stack replacement interpreter accessor types
   static const TypeFunc* osr_end_Type();
 
--- a/src/share/vm/runtime/arguments.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/runtime/arguments.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -1959,6 +1959,12 @@
   }
 #endif // SPARC
 
+  // check native memory tracking flags
+  if (PrintNMTStatistics && MemTracker::tracking_level() == MemTracker::NMT_off) {
+    warning("PrintNMTStatistics is disabled, because native memory tracking is not enabled");
+    PrintNMTStatistics = false;
+  }
+
   return status;
 }
 
--- a/src/share/vm/runtime/globals.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/runtime/globals.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -530,6 +530,9 @@
   product(intx, UseSSE, 99,                                                 \
           "Highest supported SSE instructions set on x86/x64")              \
                                                                             \
+  product(bool, UseAES, false,                                               \
+          "Control whether AES instructions can be used on x86/x64")        \
+                                                                            \
   product(uintx, LargePageSizeInBytes, 0,                                   \
           "Large page size (0 to let VM choose the page size")              \
                                                                             \
@@ -632,6 +635,9 @@
   product(bool, UseSSE42Intrinsics, false,                                  \
           "SSE4.2 versions of intrinsics")                                  \
                                                                             \
+  product(bool, UseAESIntrinsics, false,                                    \
+          "use intrinsics for AES versions of crypto")                      \
+                                                                            \
   develop(bool, TraceCallFixup, false,                                      \
           "traces all call fixups")                                         \
                                                                             \
@@ -860,6 +866,9 @@
   product(ccstr, NativeMemoryTracking, "off",                               \
           "Native memory tracking options")                                 \
                                                                             \
+  diagnostic(bool, PrintNMTStatistics, false,                               \
+          "Print native memory tracking summary data if it is on")          \
+                                                                            \
   diagnostic(bool, LogCompilation, false,                                   \
           "Log compilation activity in detail to hotspot.log or LogFile")   \
                                                                             \
--- a/src/share/vm/runtime/handles.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/runtime/handles.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -153,13 +153,18 @@
 
   // Delete later chunks
   if( _chunk->next() ) {
+    // reset arena size before delete chunks. Otherwise, the total
+    // arena size could exceed total chunk size
+    assert(area->size_in_bytes() > size_in_bytes(), "Sanity check");
+    area->set_size_in_bytes(size_in_bytes());
     _chunk->next_chop();
+  } else {
+    assert(area->size_in_bytes() == size_in_bytes(), "Sanity check");
   }
   // Roll back arena to saved top markers
   area->_chunk = _chunk;
   area->_hwm = _hwm;
   area->_max = _max;
-  area->set_size_in_bytes(_size_in_bytes);
 #ifdef ASSERT
   // clear out first chunk (to detect allocation bugs)
   if (ZapVMHandleArea) {
--- a/src/share/vm/runtime/handles.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/runtime/handles.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -319,6 +319,7 @@
   void set_previous_handle_mark(HandleMark* mark) { _previous_handle_mark = mark; }
   HandleMark* previous_handle_mark() const        { return _previous_handle_mark; }
 
+  size_t size_in_bytes() const { return _size_in_bytes; }
  public:
   HandleMark();                            // see handles_inline.hpp
   HandleMark(Thread* thread)                      { initialize(thread); }
--- a/src/share/vm/runtime/handles.inline.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/runtime/handles.inline.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -79,13 +79,18 @@
   HandleArea* area = _area;   // help compilers with poor alias analysis
   // Delete later chunks
   if( _chunk->next() ) {
+    // reset arena size before delete chunks. Otherwise, the total
+    // arena size could exceed total chunk size
+    assert(area->size_in_bytes() > size_in_bytes(), "Sanity check");
+    area->set_size_in_bytes(size_in_bytes());
     _chunk->next_chop();
+  } else {
+    assert(area->size_in_bytes() == size_in_bytes(), "Sanity check");
   }
   // Roll back arena to saved top markers
   area->_chunk = _chunk;
   area->_hwm = _hwm;
   area->_max = _max;
-  area->set_size_in_bytes(_size_in_bytes);
   debug_only(area->_handle_mark_nesting--);
 }
 
--- a/src/share/vm/runtime/java.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/runtime/java.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -57,6 +57,8 @@
 #include "runtime/task.hpp"
 #include "runtime/timer.hpp"
 #include "runtime/vm_operations.hpp"
+#include "services/memReporter.hpp"
+#include "services/memTracker.hpp"
 #include "trace/tracing.hpp"
 #include "trace/traceEventTypes.hpp"
 #include "utilities/dtrace.hpp"
@@ -356,6 +358,15 @@
   }
 #endif // COMPILER2
 #endif // ENABLE_ZAP_DEAD_LOCALS
+  // Native memory tracking data
+  if (PrintNMTStatistics) {
+    if (MemTracker::is_on()) {
+      BaselineTTYOutputer outputer(tty);
+      MemTracker::print_memory_usage(outputer, K, false);
+    } else {
+      tty->print_cr(MemTracker::reason());
+    }
+  }
 }
 
 #else // PRODUCT MODE STATISTICS
@@ -373,6 +384,16 @@
   if (PrintBiasedLockingStatistics) {
     BiasedLocking::print_counters();
   }
+
+  // Native memory tracking data
+  if (PrintNMTStatistics) {
+    if (MemTracker::is_on()) {
+      BaselineTTYOutputer outputer(tty);
+      MemTracker::print_memory_usage(outputer, K, false);
+    } else {
+      tty->print_cr(MemTracker::reason());
+    }
+  }
 }
 
 #endif
@@ -661,6 +682,7 @@
 
 JDK_Version JDK_Version::_current;
 const char* JDK_Version::_runtime_name;
+const char* JDK_Version::_runtime_version;
 
 void JDK_Version::initialize() {
   jdk_version_info info;
--- a/src/share/vm/runtime/java.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/runtime/java.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -75,6 +75,7 @@
 
   static JDK_Version _current;
   static const char* _runtime_name;
+  static const char* _runtime_version;
 
   // In this class, we promote the minor version of release to be the
   // major version for releases >= 5 in anticipation of the JDK doing the
@@ -189,6 +190,13 @@
     _runtime_name = name;
   }
 
+  static const char* runtime_version() {  // accessor for the static _runtime_version string
+    return _runtime_version;
+  }
+  static void set_runtime_version(const char* version) {  // stores the pointer as-is; no copy is made
+    _runtime_version = version;
+  }
+
   // Convenience methods for queries on the current major/minor version
   static bool is_jdk12x_version() {
     return current().compare_major(2) == 0;
--- a/src/share/vm/runtime/os.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/runtime/os.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -593,9 +593,7 @@
   if (PrintMalloc && tty != NULL) tty->print_cr("os::malloc " SIZE_FORMAT " bytes --> " PTR_FORMAT, size, memblock);
 
   // we do not track MallocCushion memory
-  if (MemTracker::is_on()) {
     MemTracker::record_malloc((address)memblock, size, memflags, caller == 0 ? CALLER_PC : caller);
-  }
 
   return memblock;
 }
@@ -606,7 +604,7 @@
   NOT_PRODUCT(inc_stat_counter(&num_mallocs, 1));
   NOT_PRODUCT(inc_stat_counter(&alloc_bytes, size));
   void* ptr = ::realloc(memblock, size);
-  if (ptr != NULL && MemTracker::is_on()) {
+  if (ptr != NULL) {
     MemTracker::record_realloc((address)memblock, (address)ptr, size, memflags,
      caller == 0 ? CALLER_PC : caller);
   }
@@ -1389,7 +1387,7 @@
 
 char* os::reserve_memory(size_t bytes, char* addr, size_t alignment_hint) {
   char* result = pd_reserve_memory(bytes, addr, alignment_hint);
-  if (result != NULL && MemTracker::is_on()) {
+  if (result != NULL) {
     MemTracker::record_virtual_memory_reserve((address)result, bytes, CALLER_PC);
   }
 
@@ -1397,7 +1395,7 @@
 }
 char* os::attempt_reserve_memory_at(size_t bytes, char* addr) {
   char* result = pd_attempt_reserve_memory_at(bytes, addr);
-  if (result != NULL && MemTracker::is_on()) {
+  if (result != NULL) {
     MemTracker::record_virtual_memory_reserve((address)result, bytes, CALLER_PC);
   }
   return result;
@@ -1410,7 +1408,7 @@
 
 bool os::commit_memory(char* addr, size_t bytes, bool executable) {
   bool res = pd_commit_memory(addr, bytes, executable);
-  if (res && MemTracker::is_on()) {
+  if (res) {
     MemTracker::record_virtual_memory_commit((address)addr, bytes, CALLER_PC);
   }
   return res;
@@ -1419,7 +1417,7 @@
 bool os::commit_memory(char* addr, size_t size, size_t alignment_hint,
                               bool executable) {
   bool res = os::pd_commit_memory(addr, size, alignment_hint, executable);
-  if (res && MemTracker::is_on()) {
+  if (res) {
     MemTracker::record_virtual_memory_commit((address)addr, size, CALLER_PC);
   }
   return res;
@@ -1446,8 +1444,9 @@
                            char *addr, size_t bytes, bool read_only,
                            bool allow_exec) {
   char* result = pd_map_memory(fd, file_name, file_offset, addr, bytes, read_only, allow_exec);
-  if (result != NULL && MemTracker::is_on()) {
+  if (result != NULL) {
     MemTracker::record_virtual_memory_reserve((address)result, bytes, CALLER_PC);
+    MemTracker::record_virtual_memory_commit((address)result, bytes, CALLER_PC);
   }
   return result;
 }
@@ -1462,6 +1461,7 @@
 bool os::unmap_memory(char *addr, size_t bytes) {
   bool result = pd_unmap_memory(addr, bytes);
   if (result) {
+    MemTracker::record_virtual_memory_uncommit((address)addr, bytes);
     MemTracker::record_virtual_memory_release((address)addr, bytes);
   }
   return result;
--- a/src/share/vm/runtime/os.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/runtime/os.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -387,7 +387,7 @@
   static void pd_start_thread(Thread* thread);
   static void start_thread(Thread* thread);
 
-  static void initialize_thread();
+  static void initialize_thread(Thread* thr);
   static void free_thread(OSThread* osthread);
 
   // thread id on Linux/64bit is 64bit, on Windows and Solaris, it's 32bit
--- a/src/share/vm/runtime/stubRoutines.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/runtime/stubRoutines.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -120,6 +120,10 @@
 address StubRoutines::_arrayof_jshort_fill;
 address StubRoutines::_arrayof_jint_fill;
 
+address StubRoutines::_aescrypt_encryptBlock               = NULL;
+address StubRoutines::_aescrypt_decryptBlock               = NULL;
+address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL;
+address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL;
 
 double (* StubRoutines::_intrinsic_log   )(double) = NULL;
 double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
--- a/src/share/vm/runtime/stubRoutines.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/runtime/stubRoutines.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -199,6 +199,11 @@
   // zero heap space aligned to jlong (8 bytes)
   static address _zero_aligned_words;
 
+  static address _aescrypt_encryptBlock;
+  static address _aescrypt_decryptBlock;
+  static address _cipherBlockChaining_encryptAESCrypt;
+  static address _cipherBlockChaining_decryptAESCrypt;
+
   // These are versions of the java.lang.Math methods which perform
   // the same operations as the intrinsic version.  They are used for
   // constant folding in the compiler to ensure equivalence.  If the
@@ -330,6 +335,11 @@
   static address arrayof_jshort_fill() { return _arrayof_jshort_fill; }
   static address arrayof_jint_fill()   { return _arrayof_jint_fill; }
 
+  static address aescrypt_encryptBlock()                { return _aescrypt_encryptBlock; }
+  static address aescrypt_decryptBlock()                { return _aescrypt_decryptBlock; }
+  static address cipherBlockChaining_encryptAESCrypt()  { return _cipherBlockChaining_encryptAESCrypt; }
+  static address cipherBlockChaining_decryptAESCrypt()  { return _cipherBlockChaining_decryptAESCrypt; }
+
   static address select_fill_function(BasicType t, bool aligned, const char* &name);
 
   static address zero_aligned_words()   { return _zero_aligned_words; }
--- a/src/share/vm/runtime/thread.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/runtime/thread.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -306,20 +306,25 @@
 
   // initialize structure dependent on thread local storage
   ThreadLocalStorage::set_thread(this);
-
-  // set up any platform-specific state.
-  os::initialize_thread();
 }
 
 void Thread::record_stack_base_and_size() {
   set_stack_base(os::current_stack_base());
   set_stack_size(os::current_stack_size());
-
-  // record thread's native stack, stack grows downward
-  address vm_base = _stack_base - _stack_size;
-  MemTracker::record_virtual_memory_reserve(vm_base, _stack_size,
-    CURRENT_PC, this);
-  MemTracker::record_virtual_memory_type(vm_base, mtThreadStack);
+  // CR 7190089: on Solaris, the primordial thread's stack is adjusted
+  // in initialize_thread(). Without the adjustment, the stack size is
+  // incorrect if the stack is set to unlimited (ulimit -s unlimited).
+  // So far, only Solaris has a real implementation of initialize_thread().
+  //
+  // set up any platform-specific state.
+  os::initialize_thread(this);
+
+  // record the thread's native stack; the stack grows downward
+  if (MemTracker::is_on()) {
+    address stack_low_addr = stack_base() - stack_size();
+    MemTracker::record_thread_stack(stack_low_addr, stack_size(), this,
+      CURRENT_PC);
+  }
 }
 
 
@@ -327,8 +332,17 @@
   // Reclaim the objectmonitors from the omFreeList of the moribund thread.
   ObjectSynchronizer::omFlush (this) ;
 
-  MemTracker::record_virtual_memory_release((_stack_base - _stack_size),
-    _stack_size, this);
+  // stack_base can be NULL if the thread was never started, or exited before
+  // record_stack_base_and_size() was called. Although we would like to ensure
+  // that all started threads do call record_stack_base_and_size(), there is
+  // no proper way to enforce that.
+  if (_stack_base != NULL) {
+    address low_stack_addr = stack_base() - stack_size();
+    MemTracker::release_thread_stack(low_stack_addr, stack_size(), this);
+#ifdef ASSERT
+    set_stack_base(NULL);
+#endif
+  }
 
   // deallocate data structures
   delete resource_area();
@@ -1008,6 +1022,7 @@
 }
 
 char java_runtime_name[128] = "";
+char java_runtime_version[128] = "";
 
 // extract the JRE name from sun.misc.Version.java_runtime_name
 static const char* get_java_runtime_name(TRAPS) {
@@ -1030,6 +1045,27 @@
   }
 }
 
+// extract the JRE version from sun.misc.Version.java_runtime_version
+static const char* get_java_runtime_version(TRAPS) {
+  klassOop k = SystemDictionary::find(vmSymbols::sun_misc_Version(),
+                                      Handle(), Handle(), CHECK_AND_CLEAR_NULL);
+  fieldDescriptor fd;
+  bool found = k != NULL &&
+               instanceKlass::cast(k)->find_local_field(vmSymbols::java_runtime_version_name(),
+                                                        vmSymbols::string_signature(), &fd);
+  if (found) {
+    oop name_oop = k->java_mirror()->obj_field(fd.offset());
+    if (name_oop == NULL)
+      return NULL;
+    const char* name = java_lang_String::as_utf8_string(name_oop,
+                                                        java_runtime_version,
+                                                        sizeof(java_runtime_version));
+    return name;
+  } else {
+    return NULL;
+  }
+}
+
 // General purpose hook into Java code, run once when the VM is initialized.
 // The Java library method itself may be changed independently from the VM.
 static void call_postVMInitHook(TRAPS) {
@@ -1527,10 +1563,12 @@
       tty->print_cr("terminate thread %p", this);
   }
 
-  // Info NMT that this JavaThread is exiting, its memory
-  // recorder should be collected
+  // By now, this thread should already be invisible to safepoints,
+  // and its per-thread recorder should already have been collected.
   assert(!is_safepoint_visible(), "wrong state");
-  MemTracker::thread_exiting(this);
+#if INCLUDE_NMT
+  assert(get_recorder() == NULL, "Already collected");
+#endif // INCLUDE_NMT
 
   // JSR166 -- return the parker to the free list
   Parker::Release(_parker);
@@ -2431,6 +2469,7 @@
 }
 
 void JavaThread::remove_stack_guard_pages() {
+  assert(Thread::current() == this, "from different thread");
   if (_stack_guard_state == stack_guard_unused) return;
   address low_addr = stack_base() - stack_size();
   size_t len = (StackYellowPages + StackRedPages) * os::vm_page_size();
@@ -3454,6 +3493,7 @@
 
       // get the Java runtime name after java.lang.System is initialized
       JDK_Version::set_runtime_name(get_java_runtime_name(THREAD));
+      JDK_Version::set_runtime_version(get_java_runtime_version(THREAD));
     } else {
       warning("java.lang.System not initialized");
     }
@@ -4070,7 +4110,10 @@
 
     // Now, this thread is not visible to safepoint
     p->set_safepoint_visible(false);
-
+    // Once the thread becomes safepoint-invisible, we cannot use its per-thread
+    // recorder, and Threads::do_threads() no longer walks this thread, so we have
+    // to release its per-thread recorder here.
+    MemTracker::thread_exiting(p);
   } // unlock Threads_lock
 
   // Since Events::log uses a lock, we grab it outside the Threads_lock
--- a/src/share/vm/runtime/vm_version.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/runtime/vm_version.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -241,19 +241,21 @@
 
   #ifndef FLOAT_ARCH
     #if defined(__SOFTFP__)
-      #define FLOAT_ARCH "-sflt"
+      #define FLOAT_ARCH_STR "-sflt"
     #elif defined(E500V2)
-      #define FLOAT_ARCH "-e500v2"
+      #define FLOAT_ARCH_STR "-e500v2"
     #elif defined(ARM)
-      #define FLOAT_ARCH "-vfp"
+      #define FLOAT_ARCH_STR "-vfp"
     #elif defined(PPC)
-      #define FLOAT_ARCH "-hflt"
+      #define FLOAT_ARCH_STR "-hflt"
     #else
-      #define FLOAT_ARCH ""
+      #define FLOAT_ARCH_STR ""
     #endif
+  #else
+    #define FLOAT_ARCH_STR XSTR(FLOAT_ARCH)
   #endif
 
-  return VMNAME " (" VM_RELEASE ") for " OS "-" CPU FLOAT_ARCH
+  return VMNAME " (" VM_RELEASE ") for " OS "-" CPU FLOAT_ARCH_STR
          " JRE (" JRE_RELEASE_VERSION "), built on " __DATE__ " " __TIME__
          " by " XSTR(HOTSPOT_BUILD_USER) " with " HOTSPOT_BUILD_COMPILER;
 }
--- a/src/share/vm/services/attachListener.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/attachListener.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -404,6 +404,8 @@
 static void attach_listener_thread_entry(JavaThread* thread, TRAPS) {
   os::set_priority(thread, NearMaxPriority);
 
+  thread->record_stack_base_and_size();
+
   if (AttachListener::pd_init() != 0) {
     return;
   }
--- a/src/share/vm/services/memBaseline.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memBaseline.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -40,6 +40,7 @@
   {mtSymbol,     "Symbol"},
   {mtNMT,        "Memory Tracking"},
   {mtChunk,      "Pooled Free Chunks"},
+  {mtClassShared,"Shared spaces for classes"},
   {mtNone,       "Unknown"}  // It can happen when type tagging records are lagging
                              // behind
 };
@@ -55,6 +56,7 @@
 
   _malloc_cs = NULL;
   _vm_cs = NULL;
+  _vm_map = NULL;
 
   _number_of_classes = 0;
   _number_of_threads = 0;
@@ -72,6 +74,11 @@
     _vm_cs = NULL;
   }
 
+  if (_vm_map != NULL) {
+    delete _vm_map;
+    _vm_map = NULL;
+  }
+
   reset();
 }
 
@@ -85,6 +92,7 @@
 
   if (_malloc_cs != NULL) _malloc_cs->clear();
   if (_vm_cs != NULL) _vm_cs->clear();
+  if (_vm_map != NULL) _vm_map->clear();
 
   for (int index = 0; index < NUMBER_OF_MEMORY_TYPE; index ++) {
     _malloc_data[index].clear();
@@ -94,39 +102,41 @@
 }
 
 MemBaseline::~MemBaseline() {
-  if (_malloc_cs != NULL) {
-    delete _malloc_cs;
-  }
-
-  if (_vm_cs != NULL) {
-    delete _vm_cs;
-  }
+  clear();
 }
 
 // baseline malloc'd memory records, generate overall summary and summaries by
 // memory types
 bool MemBaseline::baseline_malloc_summary(const MemPointerArray* malloc_records) {
-  MemPointerArrayIteratorImpl mItr((MemPointerArray*)malloc_records);
-  MemPointerRecord* mptr = (MemPointerRecord*)mItr.current();
+  MemPointerArrayIteratorImpl malloc_itr((MemPointerArray*)malloc_records);
+  MemPointerRecord* malloc_ptr = (MemPointerRecord*)malloc_itr.current();
   size_t used_arena_size = 0;
   int index;
-  while (mptr != NULL) {
-    index = flag2index(FLAGS_TO_MEMORY_TYPE(mptr->flags()));
-    size_t size = mptr->size();
-    _total_malloced += size;
-    _malloc_data[index].inc(size);
-    if (MemPointerRecord::is_arena_record(mptr->flags())) {
-      // see if arena size record present
-      MemPointerRecord* next_p = (MemPointerRecordEx*)mItr.peek_next();
-      if (MemPointerRecord::is_arena_size_record(next_p->flags())) {
-        assert(next_p->is_size_record_of_arena(mptr), "arena records do not match");
-        size = next_p->size();
-        _arena_data[index].inc(size);
-        used_arena_size += size;
-        mItr.next();
+  while (malloc_ptr != NULL) {
+    index = flag2index(FLAGS_TO_MEMORY_TYPE(malloc_ptr->flags()));
+    size_t size = malloc_ptr->size();
+    if (malloc_ptr->is_arena_memory_record()) {
+      // We do have anonymous arenas; they are used either as value objects,
+      // which are embedded inside other objects, or as stack objects.
+      _arena_data[index].inc(size);
+      used_arena_size += size;
+    } else {
+      _total_malloced += size;
+      _malloc_data[index].inc(size);
+      if (malloc_ptr->is_arena_record()) {
+        // see if arena memory record present
+        MemPointerRecord* next_malloc_ptr = (MemPointerRecordEx*)malloc_itr.peek_next();
+        if (next_malloc_ptr->is_arena_memory_record()) {
+          assert(next_malloc_ptr->is_memory_record_of_arena(malloc_ptr),
+             "Arena records do not match");
+          size = next_malloc_ptr->size();
+          _arena_data[index].inc(size);
+          used_arena_size += size;
+          malloc_itr.next();
+        }
       }
     }
-    mptr = (MemPointerRecordEx*)mItr.next();
+    malloc_ptr = (MemPointerRecordEx*)malloc_itr.next();
   }
 
   // substract used arena size to get size of arena chunk in free list
@@ -142,20 +152,23 @@
 // baseline mmap'd memory records, generate overall summary and summaries by
 // memory types
 bool MemBaseline::baseline_vm_summary(const MemPointerArray* vm_records) {
-  MemPointerArrayIteratorImpl vItr((MemPointerArray*)vm_records);
-  VMMemRegion* vptr = (VMMemRegion*)vItr.current();
+  MemPointerArrayIteratorImpl vm_itr((MemPointerArray*)vm_records);
+  VMMemRegion* vm_ptr = (VMMemRegion*)vm_itr.current();
   int index;
-  while (vptr != NULL) {
-    index = flag2index(FLAGS_TO_MEMORY_TYPE(vptr->flags()));
-
+  while (vm_ptr != NULL) {
+    if (vm_ptr->is_reserved_region()) {
+      index = flag2index(FLAGS_TO_MEMORY_TYPE(vm_ptr->flags()));
     // we use the number of thread stack to count threads
-    if (IS_MEMORY_TYPE(vptr->flags(), mtThreadStack)) {
+      if (IS_MEMORY_TYPE(vm_ptr->flags(), mtThreadStack)) {
       _number_of_threads ++;
     }
-    _total_vm_reserved += vptr->reserved_size();
-    _total_vm_committed += vptr->committed_size();
-    _vm_data[index].inc(vptr->reserved_size(), vptr->committed_size());
-    vptr = (VMMemRegion*)vItr.next();
+      _total_vm_reserved += vm_ptr->size();
+      _vm_data[index].inc(vm_ptr->size(), 0);
+    } else {
+      _total_vm_committed += vm_ptr->size();
+      _vm_data[index].inc(0, vm_ptr->size());
+    }
+    vm_ptr = (VMMemRegion*)vm_itr.next();
   }
   return true;
 }
@@ -165,41 +178,57 @@
 bool MemBaseline::baseline_malloc_details(const MemPointerArray* malloc_records) {
   assert(MemTracker::track_callsite(), "detail tracking is off");
 
-  MemPointerArrayIteratorImpl mItr((MemPointerArray*)malloc_records);
-  MemPointerRecordEx* mptr = (MemPointerRecordEx*)mItr.current();
-  MallocCallsitePointer mp;
+  MemPointerArrayIteratorImpl malloc_itr(const_cast<MemPointerArray*>(malloc_records));
+  MemPointerRecordEx* malloc_ptr = (MemPointerRecordEx*)malloc_itr.current();
+  MallocCallsitePointer malloc_callsite;
 
+  // initialize malloc callsite array
   if (_malloc_cs == NULL) {
     _malloc_cs = new (std::nothrow) MemPointerArrayImpl<MallocCallsitePointer>(64);
     // out of native memory
-    if (_malloc_cs == NULL) {
+    if (_malloc_cs == NULL || _malloc_cs->out_of_memory()) {
       return false;
     }
   } else {
     _malloc_cs->clear();
   }
 
+  MemPointerArray* malloc_data = const_cast<MemPointerArray*>(malloc_records);
+
+  // sort into callsite pc order. Details are aggregated by callsites
+  malloc_data->sort((FN_SORT)malloc_sort_by_pc);
+  bool ret = true;
+
   // baseline memory that is totaled over 1 KB
-  while (mptr != NULL) {
-    if (!MemPointerRecord::is_arena_size_record(mptr->flags())) {
+  while (malloc_ptr != NULL) {
+    if (!MemPointerRecord::is_arena_memory_record(malloc_ptr->flags())) {
       // skip thread stacks
-      if (!IS_MEMORY_TYPE(mptr->flags(), mtThreadStack)) {
-        if (mp.addr() != mptr->pc()) {
-          if ((mp.amount()/K) > 0) {
-            if (!_malloc_cs->append(&mp)) {
+      if (!IS_MEMORY_TYPE(malloc_ptr->flags(), mtThreadStack)) {
+        if (malloc_callsite.addr() != malloc_ptr->pc()) {
+          if ((malloc_callsite.amount()/K) > 0) {
+            if (!_malloc_cs->append(&malloc_callsite)) {
+              ret = false;
+              break;
+            }
+          }
+          malloc_callsite = MallocCallsitePointer(malloc_ptr->pc());
+        }
+        malloc_callsite.inc(malloc_ptr->size());
+      }
+    }
+    malloc_ptr = (MemPointerRecordEx*)malloc_itr.next();
+  }
+
+  // restore to address order. Snapshot malloc data is maintained in memory
+  // address order.
+  malloc_data->sort((FN_SORT)malloc_sort_by_addr);
+
+  if (!ret) {
               return false;
             }
-          }
-          mp = MallocCallsitePointer(mptr->pc());
-        }
-        mp.inc(mptr->size());
-      }
-    }
-    mptr = (MemPointerRecordEx*)mItr.next();
-  }
-
-  if (mp.addr() != 0 && (mp.amount()/K) > 0) {
-    if (!_malloc_cs->append(&mp)) {
+  // deal with last record
+  if (malloc_callsite.addr() != 0 && (malloc_callsite.amount()/K) > 0) {
+    if (!_malloc_cs->append(&malloc_callsite)) {
       return false;
     }
   }
@@ -210,34 +239,106 @@
 bool MemBaseline::baseline_vm_details(const MemPointerArray* vm_records) {
   assert(MemTracker::track_callsite(), "detail tracking is off");
 
-  VMCallsitePointer vp;
-  MemPointerArrayIteratorImpl vItr((MemPointerArray*)vm_records);
-  VMMemRegionEx* vptr = (VMMemRegionEx*)vItr.current();
+  VMCallsitePointer  vm_callsite;
+  VMCallsitePointer* cur_callsite = NULL;
+  MemPointerArrayIteratorImpl vm_itr((MemPointerArray*)vm_records);
+  VMMemRegionEx* vm_ptr = (VMMemRegionEx*)vm_itr.current();
 
+  // initialize virtual memory map array
+  if (_vm_map == NULL) {
+    _vm_map = new (std::nothrow) MemPointerArrayImpl<VMMemRegionEx>(vm_records->length());
+   if (_vm_map == NULL || _vm_map->out_of_memory()) {
+     return false;
+   }
+  } else {
+    _vm_map->clear();
+  }
+
+  // initialize virtual memory callsite array
   if (_vm_cs == NULL) {
     _vm_cs = new (std::nothrow) MemPointerArrayImpl<VMCallsitePointer>(64);
-    if (_vm_cs == NULL) {
+    if (_vm_cs == NULL || _vm_cs->out_of_memory()) {
       return false;
     }
   } else {
     _vm_cs->clear();
   }
 
-  while (vptr != NULL) {
-    if (vp.addr() != vptr->pc()) {
-      if (!_vm_cs->append(&vp)) {
+  // consolidate virtual memory data
+  VMMemRegionEx*     reserved_rec = NULL;
+  VMMemRegionEx*     committed_rec = NULL;
+
+  // vm_ptr is coming in increasing base address order
+  while (vm_ptr != NULL) {
+    if (vm_ptr->is_reserved_region()) {
+      // consolidate reserved memory regions for virtual memory map.
+      // The criteria for consolidation are:
+      // 1. two adjacent reserved memory regions
+      // 2. belong to the same memory type
+      // 3. reserved from the same callsite
+      if (reserved_rec == NULL ||
+        reserved_rec->base() + reserved_rec->size() != vm_ptr->addr() ||
+        FLAGS_TO_MEMORY_TYPE(reserved_rec->flags()) != FLAGS_TO_MEMORY_TYPE(vm_ptr->flags()) ||
+        reserved_rec->pc() != vm_ptr->pc()) {
+        if (!_vm_map->append(vm_ptr)) {
         return false;
       }
-      vp = VMCallsitePointer(vptr->pc());
+        // A reserved region was inserted; we need the pointer to its element
+        // in the virtual memory map array.
+        reserved_rec = (VMMemRegionEx*)_vm_map->at(_vm_map->length() - 1);
+      } else {
+        reserved_rec->expand_region(vm_ptr->addr(), vm_ptr->size());
     }
-    vp.inc(vptr->size(), vptr->committed_size());
-    vptr = (VMMemRegionEx*)vItr.next();
-  }
-  if (vp.addr() != 0) {
-    if (!_vm_cs->append(&vp)) {
+
+      if (cur_callsite != NULL && !_vm_cs->append(cur_callsite)) {
       return false;
     }
+      vm_callsite = VMCallsitePointer(vm_ptr->pc());
+      cur_callsite = &vm_callsite;
+      vm_callsite.inc(vm_ptr->size(), 0);
+    } else {
+      // consolidate committed memory regions for virtual memory map.
+      // The criteria for consolidation are:
+      // 1. two adjacent committed memory regions
+      // 2. committed from the same callsite
+      if (committed_rec == NULL ||
+        committed_rec->base() + committed_rec->size() != vm_ptr->addr() ||
+        committed_rec->pc() != vm_ptr->pc()) {
+        if (!_vm_map->append(vm_ptr)) {
+          return false;
   }
+        committed_rec = (VMMemRegionEx*)_vm_map->at(_vm_map->length() - 1);
+    } else {
+        committed_rec->expand_region(vm_ptr->addr(), vm_ptr->size());
+      }
+      vm_callsite.inc(0, vm_ptr->size());
+    }
+    vm_ptr = (VMMemRegionEx*)vm_itr.next();
+  }
+  // deal with last record
+  if (cur_callsite != NULL && !_vm_cs->append(cur_callsite)) {
+    return false;
+  }
+
+  // sort it into callsite pc order. Details are aggregated by callsites
+  _vm_cs->sort((FN_SORT)bl_vm_sort_by_pc);
+
+  // walk the array to consolidate record by pc
+  MemPointerArrayIteratorImpl itr(_vm_cs);
+  VMCallsitePointer* callsite_rec = (VMCallsitePointer*)itr.current();
+  VMCallsitePointer* next_rec = (VMCallsitePointer*)itr.next();
+  while (next_rec != NULL) {
+    assert(callsite_rec != NULL, "Sanity check");
+    if (next_rec->addr() == callsite_rec->addr()) {
+      callsite_rec->inc(next_rec->reserved_amount(), next_rec->committed_amount());
+      itr.remove();
+      next_rec = (VMCallsitePointer*)itr.current();
+    } else {
+      callsite_rec = next_rec;
+      next_rec = (VMCallsitePointer*)itr.next();
+    }
+  }
+
   return true;
 }
 
@@ -251,12 +352,8 @@
   _number_of_classes = SystemDictionary::number_of_classes();
 
   if (!summary_only && MemTracker::track_callsite() && _baselined) {
-    ((MemPointerArray*)snapshot._alloc_ptrs)->sort((FN_SORT)malloc_sort_by_pc);
-    ((MemPointerArray*)snapshot._vm_ptrs)->sort((FN_SORT)vm_sort_by_pc);
     _baselined =  baseline_malloc_details(snapshot._alloc_ptrs) &&
       baseline_vm_details(snapshot._vm_ptrs);
-    ((MemPointerArray*)snapshot._alloc_ptrs)->sort((FN_SORT)malloc_sort_by_addr);
-    ((MemPointerArray*)snapshot._vm_ptrs)->sort((FN_SORT)vm_sort_by_addr);
   }
   return _baselined;
 }
@@ -278,7 +375,7 @@
       return MemType2NameMap[index]._name;
     }
   }
-  assert(false, "no type");
+  assert(false, err_msg("bad type %x", type));
   return NULL;
 }
 
@@ -341,13 +438,6 @@
   return UNSIGNED_COMPARE(mp1->addr(), mp2->addr());
 }
 
-// sort snapshot mmap'd records in callsite pc order
-int MemBaseline::vm_sort_by_pc(const void* p1, const void* p2) {
-  assert(MemTracker::track_callsite(),"Just check");
-  const VMMemRegionEx* mp1 = (const VMMemRegionEx*)p1;
-  const VMMemRegionEx* mp2 = (const VMMemRegionEx*)p2;
-  return UNSIGNED_COMPARE(mp1->pc(), mp2->pc());
-}
 
 // sort baselined mmap'd records in size (reserved size) order
 int MemBaseline::bl_vm_sort_by_size(const void* p1, const void* p2) {
@@ -376,12 +466,3 @@
   return delta;
 }
 
-// sort snapshot mmap'd records in memory block address order
-int MemBaseline::vm_sort_by_addr(const void* p1, const void* p2) {
-  assert(MemTracker::is_on(), "Just check");
-  const VMMemRegion* mp1 = (const VMMemRegion*)p1;
-  const VMMemRegion* mp2 = (const VMMemRegion*)p2;
-  int delta = UNSIGNED_COMPARE(mp1->addr(), mp2->addr());
-  assert(delta != 0, "dup pointer");
-  return delta;
-}
--- a/src/share/vm/services/memBaseline.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memBaseline.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -320,6 +320,8 @@
   // only available when detail tracking is on.
   MemPointerArray*  _malloc_cs;
   MemPointerArray*  _vm_cs;
+  // virtual memory map
+  MemPointerArray*  _vm_map;
 
  private:
   static MemType2Name  MemType2NameMap[NUMBER_OF_MEMORY_TYPE];
@@ -432,9 +434,6 @@
   static int malloc_sort_by_pc(const void* p1, const void* p2);
   static int malloc_sort_by_addr(const void* p1, const void* p2);
 
-  static int vm_sort_by_pc(const void* p1, const void* p2);
-  static int vm_sort_by_addr(const void* p1, const void* p2);
-
  private:
   // sorting functions for baselined records
   static int bl_malloc_sort_by_size(const void* p1, const void* p2);
--- a/src/share/vm/services/memPtr.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memPtr.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -40,35 +40,3 @@
   return seq;
 }
 
-
-
-bool VMMemRegion::contains(const VMMemRegion* mr) const {
-  assert(base() != 0, "no base address");
-  assert(size() != 0 || committed_size() != 0,
-    "no range");
-  address base_addr = base();
-  address end_addr = base_addr +
-    (is_reserve_record()? reserved_size(): committed_size());
-  if (mr->is_reserve_record()) {
-    if (mr->base() == base_addr && mr->size() == size()) {
-      // the same range
-      return true;
-    }
-    return false;
-  } else if (mr->is_commit_record() || mr->is_uncommit_record()) {
-    assert(mr->base() != 0 && mr->committed_size() > 0,
-      "bad record");
-    return (mr->base() >= base_addr &&
-      (mr->base() + mr->committed_size()) <= end_addr);
-  } else if (mr->is_type_tagging_record()) {
-    assert(mr->base() != 0, "no base");
-    return mr->base() == base_addr;
-  } else if (mr->is_release_record()) {
-    assert(mr->base() != 0 && mr->size() > 0,
-      "bad record");
-    return (mr->base() == base_addr && mr->size() == size());
-  } else {
-    assert(false, "what happened?");
-    return false;
-  }
-}
--- a/src/share/vm/services/memPtr.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memPtr.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -165,7 +165,7 @@
     return (flags & (otArena | tag_size)) == otArena;
   }
 
-  inline static bool is_arena_size_record(MEMFLAGS flags) {
+  inline static bool is_arena_memory_record(MEMFLAGS flags) {
     return (flags & (otArena | tag_size)) == (otArena | tag_size);
   }
 
@@ -256,8 +256,8 @@
   }
 
   // if this record records a size information of an arena
-  inline bool is_arena_size_record() const {
-    return is_arena_size_record(_flags);
+  inline bool is_arena_memory_record() const {
+    return is_arena_memory_record(_flags);
   }
 
   // if this pointer represents an address to an arena object
@@ -266,8 +266,8 @@
   }
 
   // if this record represents a size information of specific arena
-  inline bool is_size_record_of_arena(const MemPointerRecord* arena_rc) {
-    assert(is_arena_size_record(), "not size record");
+  inline bool is_memory_record_of_arena(const MemPointerRecord* arena_rc) {
+    assert(is_arena_memory_record(), "not size record");
     assert(arena_rc->is_arena_record(), "not arena record");
     return (arena_rc->addr() + sizeof(void*)) == addr();
   }
@@ -291,6 +291,37 @@
   inline bool is_type_tagging_record() const {
     return is_virtual_memory_type_record(_flags);
   }
+
+  // if the two memory pointer records actually represent the same
+  // memory block
+  inline bool is_same_region(const MemPointerRecord* other) const {
+    return (addr() == other->addr() && size() == other->size());
+  }
+
+  // if this memory region fully contains another one
+  inline bool contains_region(const MemPointerRecord* other) const {
+    return contains_region(other->addr(), other->size());
+  }
+
+  // if this memory region fully contains specified memory range
+  inline bool contains_region(address add, size_t sz) const {
+    return (addr() <= add && addr() + size() >= add + sz);
+  }
+
+  inline bool contains_address(address add) const {
+    return (addr() <= add && addr() + size() > add);
+  }
+
+  // if this memory region overlaps another region
+  inline bool overlaps_region(const MemPointerRecord* other) const {
+    assert(other != NULL, "Just check");
+    assert(size() > 0 && other->size() > 0, "empty range");
+    return contains_address(other->addr()) ||
+           contains_address(other->addr() + other->size() - 1) || // exclude end address
+           other->contains_address(addr()) ||
+           other->contains_address(addr() + size() - 1); // exclude end address
+  }
+
 };
 
 // MemPointerRecordEx also records callsite pc, from where
@@ -321,66 +352,32 @@
   }
 };
 
-// a virtual memory region
+// a virtual memory region. The region can represent a reserved
+// virtual memory region or a committed memory region
 class VMMemRegion : public MemPointerRecord {
- private:
-  // committed size
-  size_t       _committed_size;
-
 public:
-  VMMemRegion(): _committed_size(0) { }
+  VMMemRegion() { }
 
   void init(const MemPointerRecord* mp) {
-    assert(mp->is_vm_pointer(), "not virtual memory pointer");
+    assert(mp->is_vm_pointer(), "Sanity check");
     _addr = mp->addr();
-    if (mp->is_commit_record() || mp->is_uncommit_record()) {
-      _committed_size = mp->size();
-      set_size(_committed_size);
-    } else {
       set_size(mp->size());
-      _committed_size = 0;
-    }
     set_flags(mp->flags());
   }
 
   VMMemRegion& operator=(const VMMemRegion& other) {
     MemPointerRecord::operator=(other);
-    _committed_size = other.committed_size();
     return *this;
   }
 
-  inline bool is_reserve_record() const {
-    return is_virtual_memory_reserve_record(flags());
+  inline bool is_reserved_region() const {
+    return is_allocation_record();
   }
 
-  inline bool is_release_record() const {
-    return is_virtual_memory_release_record(flags());
+  inline bool is_committed_region() const {
+    return is_commit_record();
   }
 
-  // resize reserved VM range
-  inline void set_reserved_size(size_t new_size) {
-    assert(new_size >= committed_size(), "resize");
-    set_size(new_size);
-  }
-
-  inline void commit(size_t size) {
-    _committed_size += size;
-  }
-
-  inline void uncommit(size_t size) {
-    if (_committed_size >= size) {
-      _committed_size -= size;
-    } else {
-      _committed_size = 0;
-    }
-  }
-
-  /*
-   * if this virtual memory range covers whole range of
-   * the other VMMemRegion
-   */
-  bool contains(const VMMemRegion* mr) const;
-
   /* base address of this virtual memory range */
   inline address base() const {
     return addr();
@@ -391,13 +388,28 @@
     set_flags(flags() | (f & mt_masks));
   }
 
-  // release part of memory range
-  inline void partial_release(address add, size_t sz) {
-    assert(add >= addr() && add < addr() + size(), "not valid address");
-    // for now, it can partially release from the both ends,
-    // but not in the middle
+  // expand this region to also cover specified range.
+  // The range has to be on either end of the memory region.
+  void expand_region(address addr, size_t sz) {
+    if (addr < base()) {
+      assert(addr + sz == base(), "Sanity check");
+      _addr = addr;
+      set_size(size() + sz);
+    } else {
+      assert(base() + size() == addr, "Sanity check");
+      set_size(size() + sz);
+    }
+  }
+
+  // exclude the specified address range from this region.
+  // The excluded memory range has to be on either end of this memory
+  // region.
+  inline void exclude_region(address add, size_t sz) {
+    assert(is_reserved_region() || is_committed_region(), "Sanity check");
+    assert(addr() != NULL && size() != 0, "Sanity check");
+    assert(add >= addr() && add < addr() + size(), "Sanity check");
     assert(add == addr() || (add + sz) == (addr() + size()),
-      "release in the middle");
+      "exclude in the middle");
     if (add == addr()) {
       set_addr(add + sz);
       set_size(size() - sz);
@@ -405,16 +417,6 @@
       set_size(size() - sz);
     }
   }
-
-  // the committed size of the virtual memory block
-  inline size_t committed_size() const {
-    return _committed_size;
-  }
-
-  // the reserved size of the virtual memory block
-  inline size_t reserved_size() const {
-    return size();
-  }
 };
 
 class VMMemRegionEx : public VMMemRegion {
--- a/src/share/vm/services/memPtrArray.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memPtrArray.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -84,11 +84,7 @@
 
 // implementation class
 class MemPointerArrayIteratorImpl : public MemPointerArrayIterator {
-#ifdef ASSERT
  protected:
-#else
- private:
-#endif
   MemPointerArray*  _array;
   int               _pos;
 
--- a/src/share/vm/services/memRecorder.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memRecorder.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -31,14 +31,19 @@
 #include "services/memTracker.hpp"
 
 MemPointer* SequencedRecordIterator::next_record() {
-  MemPointer* itr_cur = _itr.current();
-  if (itr_cur == NULL) return NULL;
-  MemPointer* itr_next = _itr.next();
+  MemPointerRecord* itr_cur = (MemPointerRecord*)_itr.current();
+  if (itr_cur == NULL)  {
+    return itr_cur;
+  }
 
-  while (itr_next != NULL &&
-    same_kind((MemPointerRecord*)itr_cur, (MemPointerRecord*)itr_next)) {
+  MemPointerRecord* itr_next = (MemPointerRecord*)_itr.next();
+
+  // don't collapse virtual memory records
+  while (itr_next != NULL && !itr_cur->is_vm_pointer() &&
+    !itr_next->is_vm_pointer() &&
+    same_kind(itr_cur, itr_next)) {
     itr_cur = itr_next;
-    itr_next = _itr.next();
+    itr_next = (MemPointerRecord*)_itr.next();
   }
 
   return itr_cur;
--- a/src/share/vm/services/memRecorder.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memRecorder.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -188,6 +188,7 @@
   // Test if the two records are the same kind: the same memory block and allocation
   // type.
   inline bool same_kind(const MemPointerRecord* p1, const MemPointerRecord* p2) const {
+    assert(!p1->is_vm_pointer() && !p2->is_vm_pointer(), "malloc pointer only");
     return (p1->addr() == p2->addr() &&
       (p1->flags() &MemPointerRecord::tag_masks) ==
       (p2->flags() & MemPointerRecord::tag_masks));
--- a/src/share/vm/services/memReporter.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memReporter.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -51,6 +51,7 @@
 
   report_summaries(baseline);
   if (!summary_only && MemTracker::track_callsite()) {
+    report_virtual_memory_map(baseline);
     report_callsites(baseline);
   }
   _outputer.done();
@@ -74,6 +75,25 @@
   _outputer.done_category_summary();
 }
 
+void BaselineReporter::report_virtual_memory_map(const MemBaseline& baseline) {
+  _outputer.start_virtual_memory_map();
+  MemBaseline* pBL = const_cast<MemBaseline*>(&baseline);
+  MemPointerArrayIteratorImpl itr = MemPointerArrayIteratorImpl(pBL->_vm_map);
+  VMMemRegionEx* rgn = (VMMemRegionEx*)itr.current();
+  while (rgn != NULL) {
+    if (rgn->is_reserved_region()) {
+      _outputer.reserved_memory_region(FLAGS_TO_MEMORY_TYPE(rgn->flags()),
+        rgn->base(), rgn->base() + rgn->size(), amount_in_current_scale(rgn->size()), rgn->pc());
+    } else {
+      _outputer.committed_memory_region(rgn->base(), rgn->base() + rgn->size(),
+        amount_in_current_scale(rgn->size()), rgn->pc());
+    }
+    rgn = (VMMemRegionEx*)itr.next();
+  }
+
+  _outputer.done_virtual_memory_map();
+}
+
 void BaselineReporter::report_callsites(const MemBaseline& baseline) {
   _outputer.start_callsite();
   MemBaseline* pBL = const_cast<MemBaseline*>(&baseline);
@@ -324,6 +344,40 @@
   _output->print_cr(" ");
 }
 
+
+void BaselineTTYOutputer::start_virtual_memory_map() {
+  _output->print_cr("Virtual memory map:");
+}
+
+void BaselineTTYOutputer::reserved_memory_region(MEMFLAGS type, address base, address end,
+                                                 size_t size, address pc) {
+  const char* unit = memory_unit(_scale);  // unit string printed right after the size
+  char buf[128];                           // receives the function name resolved from pc
+  int  offset;                             // receives the offset of pc within that function
+  _output->print_cr(" ");
+  _output->print_cr("[" PTR_FORMAT " - " PTR_FORMAT "] reserved " SIZE_FORMAT "%s for %s", base, end, size, unit,  // size is size_t: SIZE_FORMAT, not %d
+            MemBaseline::type2name(type));
+  if (os::dll_address_to_function_name(pc, buf, sizeof(buf), &offset)) {  // call site line is printed only when pc resolves
+      _output->print_cr("\t\tfrom [%s+0x%x]", buf, offset);
+  }
+}
+
+void BaselineTTYOutputer::committed_memory_region(address base, address end, size_t size, address pc) {
+  char buf[128];  // receives the function name resolved from pc
+  int  offset;    // receives the offset of pc within that function
+  _output->print("\t[" PTR_FORMAT " - " PTR_FORMAT "] committed " SIZE_FORMAT "%s", base, end, size, memory_unit(_scale));  // size is size_t: SIZE_FORMAT, not %d
+  if (os::dll_address_to_function_name(pc, buf, sizeof(buf), &offset)) {
+      _output->print(" from [%s+0x%x]", buf, offset);
+  }
+  _output->cr();  // always terminate the line, even when pc could not be symbolized
+}
+
+void BaselineTTYOutputer::done_virtual_memory_map() {
+  _output->print_cr(" ");
+}
+
+
+
 void BaselineTTYOutputer::start_callsite() {
   _output->print_cr("Details:");
   _output->print_cr(" ");
@@ -337,7 +391,7 @@
   size_t malloc_count) {
   if (malloc_amt > 0) {
     const char* unit = memory_unit(_scale);
-    char buf[64];
+    char buf[128];
     int  offset;
     if (pc == 0) {
       _output->print("[BOOTSTRAP]%18s", " ");
@@ -357,7 +411,7 @@
   size_t committed_amt) {
   if (reserved_amt > 0) {
     const char* unit = memory_unit(_scale);
-    char buf[64];
+    char buf[128];
     int  offset;
     if (pc == 0) {
       _output->print("[BOOTSTRAP]%18s", " ");
@@ -502,7 +556,7 @@
     int malloc_diff, int malloc_count_diff) {
   if (malloc_diff != 0) {
     const char* unit = memory_unit(_scale);
-    char buf[64];
+    char buf[128];
     int  offset;
     if (pc == 0) {
       _output->print_cr("[BOOTSTRAP]%18s", " ");
--- a/src/share/vm/services/memReporter.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memReporter.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -91,6 +91,11 @@
 
   virtual void done_category_summary() = 0;
 
+  virtual void start_virtual_memory_map() = 0;
+  virtual void reserved_memory_region(MEMFLAGS type, address base, address end, size_t size, address pc) = 0;
+  virtual void committed_memory_region(address base, address end, size_t size, address pc) = 0;
+  virtual void done_virtual_memory_map() = 0;
+
   /*
    *  Report callsite information
    */
@@ -134,6 +139,7 @@
 
  private:
   void report_summaries(const MemBaseline& baseline);
+  void report_virtual_memory_map(const MemBaseline& baseline);
   void report_callsites(const MemBaseline& baseline);
 
   void diff_summaries(const MemBaseline& cur, const MemBaseline& prev);
@@ -249,6 +255,13 @@
 
   void done_category_summary();
 
+  // virtual memory map
+  void start_virtual_memory_map();
+  void reserved_memory_region(MEMFLAGS type, address base, address end, size_t size, address pc);
+  void committed_memory_region(address base, address end, size_t size, address pc);
+  void done_virtual_memory_map();
+
+
   /*
    *  Report callsite information
    */
--- a/src/share/vm/services/memSnapshot.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memSnapshot.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -31,148 +31,357 @@
 #include "services/memSnapshot.hpp"
 #include "services/memTracker.hpp"
 
+#ifdef ASSERT
 
-// stagging data groups the data of a VM memory range, so we can consolidate
-// them into one record during the walk
-bool StagingWalker::consolidate_vm_records(VMMemRegionEx* vm_rec) {
-  MemPointerRecord* cur = (MemPointerRecord*)_itr.current();
-  assert(cur != NULL && cur->is_vm_pointer(), "not a virtual memory pointer");
+void decode_pointer_record(MemPointerRecord* rec) {
+  tty->print("Pointer: [" PTR_FORMAT " - " PTR_FORMAT  "] size = %d bytes", rec->addr(),
+    rec->addr() + rec->size(), (int)rec->size());
+  tty->print(" type = %s", MemBaseline::type2name(FLAGS_TO_MEMORY_TYPE(rec->flags())));
+  if (rec->is_vm_pointer()) {
+    if (rec->is_allocation_record()) {
+      tty->print_cr(" (reserve)");
+    } else if (rec->is_commit_record()) {
+      tty->print_cr(" (commit)");
+    } else if (rec->is_uncommit_record()) {
+      tty->print_cr(" (uncommit)");
+    } else if (rec->is_deallocation_record()) {
+      tty->print_cr(" (release)");
+    } else {
+      tty->print_cr(" (tag)");
+    }
+  } else {
+    if (rec->is_arena_memory_record()) {
+      tty->print_cr(" (arena size)");
+    } else if (rec->is_allocation_record()) {
+      tty->print_cr(" (malloc)");
+    } else {
+      tty->print_cr(" (free)");
+    }
+  }
+  if (MemTracker::track_callsite()) {
+    char buf[1024];
+    address pc = ((MemPointerRecordEx*)rec)->pc();
+    if (pc != NULL && os::dll_address_to_function_name(pc, buf, sizeof(buf), NULL)) {
+      tty->print_cr("\tfrom %s", buf);
+    } else {
+      tty->print_cr("\tcould not decode pc = " PTR_FORMAT "", pc);
+    }
+  }
+}
 
-  jint cur_seq;
-  jint next_seq;
+void decode_vm_region_record(VMMemRegion* rec) {
+  tty->print("VM Region [" PTR_FORMAT " - " PTR_FORMAT "]", rec->addr(),
+    rec->addr() + rec->size());
+  tty->print(" type = %s", MemBaseline::type2name(FLAGS_TO_MEMORY_TYPE(rec->flags())));
+  if (rec->is_allocation_record()) {
+    tty->print_cr(" (reserved)");
+  } else if (rec->is_commit_record()) {
+    tty->print_cr(" (committed)");
+  } else {
+    ShouldNotReachHere();
+  }
+  if (MemTracker::track_callsite()) {
+    char buf[1024];
+    address pc = ((VMMemRegionEx*)rec)->pc();
+    if (pc != NULL && os::dll_address_to_function_name(pc, buf, sizeof(buf), NULL)) {
+      tty->print_cr("\tfrom %s", buf);
+    } else {
+      tty->print_cr("\tcould not decode pc = " PTR_FORMAT "", pc);
+    }
 
-  bool trackCallsite = MemTracker::track_callsite();
+  }
+}
 
-  if (trackCallsite) {
-    vm_rec->init((MemPointerRecordEx*)cur);
-    cur_seq = ((SeqMemPointerRecordEx*)cur)->seq();
+#endif
+
+
+bool VMMemPointerIterator::insert_record(MemPointerRecord* rec) {
+  VMMemRegionEx new_rec;
+  assert(rec->is_allocation_record() || rec->is_commit_record(),
+    "Sanity check");
+  if (MemTracker::track_callsite()) {
+    new_rec.init((MemPointerRecordEx*)rec);
   } else {
-    vm_rec->init((MemPointerRecord*)cur);
-    cur_seq = ((SeqMemPointerRecord*)cur)->seq();
+    new_rec.init(rec);
+  }
+  return insert(&new_rec);
+}
+
+bool VMMemPointerIterator::insert_record_after(MemPointerRecord* rec) {
+  VMMemRegionEx new_rec;
+  assert(rec->is_allocation_record() || rec->is_commit_record(),
+    "Sanity check");
+  if (MemTracker::track_callsite()) {
+    new_rec.init((MemPointerRecordEx*)rec);
+  } else {
+    new_rec.init(rec);
+  }
+  return insert_after(&new_rec);
+}
+
+// we don't consolidate reserved regions, since they may be categorized
+// in different types.
+bool VMMemPointerIterator::add_reserved_region(MemPointerRecord* rec) {
+  assert(rec->is_allocation_record(), "Sanity check");
+  VMMemRegion* reserved_region = (VMMemRegion*)current();
+
+  // we don't have anything yet
+  if (reserved_region == NULL) {
+    return insert_record(rec);
   }
 
-  // only can consolidate when we have allocation record,
-  // which contains virtual memory range
-  if (!cur->is_allocation_record()) {
-    _itr.next();
+  assert(reserved_region->is_reserved_region(), "Sanity check");
+  // duplicated records
+  if (reserved_region->is_same_region(rec)) {
+    return true;
+  }
+  // Overlapping stack regions indicate that a JNI thread failed to
+  // detach from the VM before exiting. This leaks the JavaThread object.
+  if (CheckJNICalls)  {
+      guarantee(FLAGS_TO_MEMORY_TYPE(reserved_region->flags()) != mtThreadStack ||
+         !reserved_region->overlaps_region(rec),
+         "Attached JNI thread exited without being detached");
+  }
+  // otherwise, we should not have overlapping reserved regions
+  assert(FLAGS_TO_MEMORY_TYPE(reserved_region->flags()) == mtThreadStack ||
+    reserved_region->base() > rec->addr(), "Just check: locate()");
+  assert(FLAGS_TO_MEMORY_TYPE(reserved_region->flags()) == mtThreadStack ||
+    !reserved_region->overlaps_region(rec), "overlapping reserved regions");
+
+  return insert_record(rec);
+}
+
+// we do consolidate committed regions
+bool VMMemPointerIterator::add_committed_region(MemPointerRecord* rec) {
+  assert(rec->is_commit_record(), "Sanity check");
+  VMMemRegion* reserved_rgn = (VMMemRegion*)current();
+  assert(reserved_rgn->is_reserved_region() && reserved_rgn->contains_region(rec),
+    "Sanity check");
+
+  // thread's native stack is always marked as "committed", ignore
+  // the "commit" operation for creating stack guard pages
+  if (FLAGS_TO_MEMORY_TYPE(reserved_rgn->flags()) == mtThreadStack &&
+      FLAGS_TO_MEMORY_TYPE(rec->flags()) != mtThreadStack) {
     return true;
   }
 
-  // allocation range
-  address base = cur->addr();
-  address end = base + cur->size();
-
-  MemPointerRecord* next = (MemPointerRecord*)_itr.peek_next();
-  // if the memory range is alive
-  bool live_vm_rec = true;
-  while (next != NULL && next->is_vm_pointer()) {
-    if (next->is_allocation_record()) {
-      assert(next->addr() >= base, "sorting order or overlapping");
-      break;
+  // if the reserved region has any committed regions
+  VMMemRegion* committed_rgn  = (VMMemRegion*)next();
+  while (committed_rgn != NULL && committed_rgn->is_committed_region()) {
+    // duplicated commit records
+    if(committed_rgn->contains_region(rec)) {
+      return true;
+    } else if (committed_rgn->overlaps_region(rec)) {
+      // overlaps front part
+      if (rec->addr() < committed_rgn->addr()) {
+        committed_rgn->expand_region(rec->addr(),
+          committed_rgn->addr() - rec->addr());
+      } else {
+        // overlaps tail part
+        address committed_rgn_end = committed_rgn->addr() +
+              committed_rgn->size();
+        assert(committed_rgn_end < rec->addr() + rec->size(),
+             "overlap tail part");
+        committed_rgn->expand_region(committed_rgn_end,
+          (rec->addr() + rec->size()) - committed_rgn_end);
+      }
+    } else if (committed_rgn->base() + committed_rgn->size() == rec->addr()) {
+      // adjacent to each other
+      committed_rgn->expand_region(rec->addr(), rec->size());
+      VMMemRegion* next_reg = (VMMemRegion*)next();
+      // see if we can consolidate next committed region
+      if (next_reg != NULL && next_reg->is_committed_region() &&
+        next_reg->base() == committed_rgn->base() + committed_rgn->size()) {
+          committed_rgn->expand_region(next_reg->base(), next_reg->size());
+          // delete merged region
+          remove();
+      }
+      return true;
+    } else if (committed_rgn->base() > rec->addr()) {
+      // found the location, insert this committed region
+      return insert_record(rec);
     }
-
-    if (trackCallsite) {
-      next_seq = ((SeqMemPointerRecordEx*)next)->seq();
-    } else {
-      next_seq = ((SeqMemPointerRecord*)next)->seq();
-    }
-
-    if (next_seq < cur_seq) {
-      _itr.next();
-      next = (MemPointerRecord*)_itr.peek_next();
-      continue;
-    }
-
-    if (next->is_deallocation_record()) {
-      if (next->addr() == base && next->size() == cur->size()) {
-        // the virtual memory range has been released
-        _itr.next();
-        live_vm_rec = false;
-        break;
-      } else if (next->addr() < end) { // partial release
-        vm_rec->partial_release(next->addr(), next->size());
-        _itr.next();
-      } else {
-        break;
-      }
-    } else if (next->is_commit_record()) {
-      if (next->addr() >= base && next->addr() + next->size() <= end) {
-        vm_rec->commit(next->size());
-        _itr.next();
-      } else {
-        assert(next->addr() >= base, "sorting order or overlapping");
-        break;
-      }
-    } else if (next->is_uncommit_record()) {
-      if (next->addr() >= base && next->addr() + next->size() <= end) {
-        vm_rec->uncommit(next->size());
-        _itr.next();
-      } else {
-        assert(next->addr() >= end, "sorting order or overlapping");
-        break;
-      }
-    } else if (next->is_type_tagging_record()) {
-      if (next->addr() >= base && next->addr() < end ) {
-        vm_rec->tag(next->flags());
-        _itr.next();
-      } else {
-          break;
-      }
-    } else {
-      assert(false, "unknown record type");
-    }
-    next = (MemPointerRecord*)_itr.peek_next();
+    committed_rgn = (VMMemRegion*)next();
   }
-  _itr.next();
-  return live_vm_rec;
+  return insert_record(rec);
 }
 
-MemPointer* StagingWalker::next() {
-  MemPointerRecord* cur_p = (MemPointerRecord*)_itr.current();
-  if (cur_p == NULL) {
-    _end_of_array = true;
-    return NULL;
+bool VMMemPointerIterator::remove_uncommitted_region(MemPointerRecord* rec) {
+  assert(rec->is_uncommit_record(), "sanity check");
+  VMMemRegion* cur;
+  cur = (VMMemRegion*)current();
+  assert(cur->is_reserved_region() && cur->contains_region(rec),
+    "Sanity check");
+  // thread's native stack is always marked as "committed", ignore
+  // the "uncommit" operation for removing stack guard pages
+  if (FLAGS_TO_MEMORY_TYPE(cur->flags()) == mtThreadStack &&
+      FLAGS_TO_MEMORY_TYPE(rec->flags()) != mtThreadStack) {
+    return true;
   }
 
-  MemPointerRecord* next_p;
-  if (cur_p->is_vm_pointer()) {
-    _is_vm_record = true;
-    if (!consolidate_vm_records(&_vm_record)) {
-      return next();
-    }
-  } else { // malloc-ed pointer
-    _is_vm_record = false;
-    next_p = (MemPointerRecord*)_itr.peek_next();
-    if (next_p != NULL && next_p->addr() == cur_p->addr()) {
-      assert(cur_p->is_allocation_record(), "sorting order");
-      assert(!next_p->is_allocation_record(), "sorting order");
-      _itr.next();
-      if (cur_p->seq() < next_p->seq()) {
-        cur_p = next_p;
+  cur = (VMMemRegion*)next();
+  while (cur != NULL && cur->is_committed_region()) {
+    // region already uncommitted, must be due to duplicated record
+    if (cur->addr() >= rec->addr() + rec->size()) {
+      break;
+    } else if (cur->contains_region(rec)) {
+      // uncommit whole region
+      if (cur->is_same_region(rec)) {
+        remove();
+        break;
+      } else if (rec->addr() == cur->addr() ||
+        rec->addr() + rec->size() == cur->addr() + cur->size()) {
+        // uncommitted from either end of current memory region.
+        cur->exclude_region(rec->addr(), rec->size());
+        break;
+      } else { // split the committed region and release the middle
+        address high_addr = cur->addr() + cur->size();
+        size_t sz = high_addr - rec->addr();
+        cur->exclude_region(rec->addr(), sz);
+        sz = high_addr - (rec->addr() + rec->size());
+        if (MemTracker::track_callsite()) {
+          MemPointerRecordEx tmp(rec->addr() + rec->size(), cur->flags(), sz,
+             ((VMMemRegionEx*)cur)->pc());
+          return insert_record_after(&tmp);
+        } else {
+          MemPointerRecord tmp(rec->addr() + rec->size(), cur->flags(), sz);
+          return insert_record_after(&tmp);
+        }
       }
     }
+    cur = (VMMemRegion*)next();
+  }
+
+  // we may not find committed record due to duplicated records
+  return true;
+}
+
+bool VMMemPointerIterator::remove_released_region(MemPointerRecord* rec) {
+  assert(rec->is_deallocation_record(), "Sanity check");
+  VMMemRegion* cur = (VMMemRegion*)current();
+  assert(cur->is_reserved_region() && cur->contains_region(rec),
+    "Sanity check");
+  if (rec->is_same_region(cur)) {
+    // release whole reserved region
+#ifdef ASSERT
+    VMMemRegion* next_region = (VMMemRegion*)peek_next();
+    // should not have any committed memory in this reserved region
+    assert(next_region == NULL || !next_region->is_committed_region(), "Sanity check");
+#endif
+    remove();
+  } else if (rec->addr() == cur->addr() ||
+    rec->addr() + rec->size() == cur->addr() + cur->size()) {
+    // released region is at either end of this region
+    cur->exclude_region(rec->addr(), rec->size());
+    assert(check_reserved_region(), "Integrity check");
+  } else { // split the reserved region and release the middle
+    address high_addr = cur->addr() + cur->size();
+    size_t sz = high_addr - rec->addr();
+    cur->exclude_region(rec->addr(), sz);
+    sz = high_addr - rec->addr() - rec->size();
     if (MemTracker::track_callsite()) {
-      _malloc_record.init((MemPointerRecordEx*)cur_p);
+      MemPointerRecordEx tmp(rec->addr() + rec->size(), cur->flags(), sz,
+        ((VMMemRegionEx*)cur)->pc());
+      bool ret = insert_reserved_region(&tmp);
+      assert(!ret || check_reserved_region(), "Integrity check");
+      return ret;
     } else {
-      _malloc_record.init((MemPointerRecord*)cur_p);
+      MemPointerRecord tmp(rec->addr() + rec->size(), cur->flags(), sz);
+      bool ret = insert_reserved_region(&tmp);
+      assert(!ret || check_reserved_region(), "Integrity check");
+      return ret;
     }
+  }
+  return true;
+}
 
-    _itr.next();
+bool VMMemPointerIterator::insert_reserved_region(MemPointerRecord* rec) {
+  // skip all 'commit' records associated with previous reserved region
+  VMMemRegion* p = (VMMemRegion*)next();
+  while (p != NULL && p->is_committed_region() &&
+         p->base() + p->size() < rec->addr()) {
+    p = (VMMemRegion*)next();
   }
-  return current();
+  return insert_record(rec);
 }
 
+bool VMMemPointerIterator::split_reserved_region(VMMemRegion* rgn, address new_rgn_addr, size_t new_rgn_size) {
+  assert(rgn->contains_region(new_rgn_addr, new_rgn_size), "Not fully contained");
+  address pc = (MemTracker::track_callsite() ? ((VMMemRegionEx*)rgn)->pc() : NULL);
+  if (rgn->base() == new_rgn_addr) { // new region is at the beginning of the region
+    size_t sz = rgn->size() - new_rgn_size;
+    // the original region becomes 'new' region
+    rgn->exclude_region(new_rgn_addr + new_rgn_size, sz);
+     // remaining becomes next region
+    MemPointerRecordEx next_rgn(new_rgn_addr + new_rgn_size, rgn->flags(), sz, pc);
+    return insert_reserved_region(&next_rgn);
+  } else if (rgn->base() + rgn->size() == new_rgn_addr + new_rgn_size) {
+    rgn->exclude_region(new_rgn_addr, new_rgn_size);
+    MemPointerRecordEx next_rgn(new_rgn_addr, rgn->flags(), new_rgn_size, pc);
+    return insert_reserved_region(&next_rgn);
+  } else {
+    // the original region will be split into three
+    address rgn_high_addr = rgn->base() + rgn->size();
+    // first region
+    rgn->exclude_region(new_rgn_addr, (rgn_high_addr - new_rgn_addr));
+    // the second region is the new region
+    MemPointerRecordEx new_rgn(new_rgn_addr, rgn->flags(), new_rgn_size, pc);
+    if (!insert_reserved_region(&new_rgn)) return false;
+    // the remaining region
+    MemPointerRecordEx rem_rgn(new_rgn_addr + new_rgn_size, rgn->flags(),
+      rgn_high_addr - (new_rgn_addr + new_rgn_size), pc);
+    return insert_reserved_region(&rem_rgn);
+  }
+}
+
+static int sort_in_seq_order(const void* p1, const void* p2) {
+  assert(p1 != NULL && p2 != NULL, "Sanity check");
+  const MemPointerRecord* mp1 = (MemPointerRecord*)p1;
+  const MemPointerRecord* mp2 = (MemPointerRecord*)p2;
+  return (mp1->seq() - mp2->seq());
+}
+
+bool StagingArea::init() {
+  if (MemTracker::track_callsite()) {
+    _malloc_data = new (std::nothrow)MemPointerArrayImpl<SeqMemPointerRecordEx>();
+    _vm_data = new (std::nothrow)MemPointerArrayImpl<SeqMemPointerRecordEx>();
+  } else {
+    _malloc_data = new (std::nothrow)MemPointerArrayImpl<SeqMemPointerRecord>();
+    _vm_data = new (std::nothrow)MemPointerArrayImpl<SeqMemPointerRecord>();
+  }
+
+  if (_malloc_data != NULL && _vm_data != NULL &&
+      !_malloc_data->out_of_memory() &&
+      !_vm_data->out_of_memory()) {
+    return true;
+  } else {
+    if (_malloc_data != NULL) delete _malloc_data;
+    if (_vm_data != NULL) delete _vm_data;
+    _malloc_data = NULL;
+    _vm_data = NULL;
+    return false;
+  }
+}
+
+
+VMRecordIterator StagingArea::virtual_memory_record_walker() {
+  MemPointerArray* arr = vm_data();
+  // sort into seq number order
+  arr->sort((FN_SORT)sort_in_seq_order);
+  return VMRecordIterator(arr);
+}
+
+
 MemSnapshot::MemSnapshot() {
   if (MemTracker::track_callsite()) {
     _alloc_ptrs = new (std::nothrow) MemPointerArrayImpl<MemPointerRecordEx>();
     _vm_ptrs = new (std::nothrow)MemPointerArrayImpl<VMMemRegionEx>(64, true);
-    _staging_area = new (std::nothrow)MemPointerArrayImpl<SeqMemPointerRecordEx>();
   } else {
     _alloc_ptrs = new (std::nothrow) MemPointerArrayImpl<MemPointerRecord>();
     _vm_ptrs = new (std::nothrow)MemPointerArrayImpl<VMMemRegion>(64, true);
-    _staging_area = new (std::nothrow)MemPointerArrayImpl<SeqMemPointerRecord>();
   }
 
+  _staging_area.init();
   _lock = new (std::nothrow) Mutex(Monitor::max_nonleaf - 1, "memSnapshotLock");
   NOT_PRODUCT(_untracked_count = 0;)
 }
@@ -181,11 +390,6 @@
   assert(MemTracker::shutdown_in_progress(), "native memory tracking still on");
   {
     MutexLockerEx locker(_lock);
-    if (_staging_area != NULL) {
-      delete _staging_area;
-      _staging_area = NULL;
-    }
-
     if (_alloc_ptrs != NULL) {
       delete _alloc_ptrs;
       _alloc_ptrs = NULL;
@@ -203,207 +407,226 @@
   }
 }
 
-void MemSnapshot::copy_pointer(MemPointerRecord* dest, const MemPointerRecord* src) {
+
+void MemSnapshot::copy_seq_pointer(MemPointerRecord* dest, const MemPointerRecord* src) {
   assert(dest != NULL && src != NULL, "Just check");
   assert(dest->addr() == src->addr(), "Just check");
+  assert(dest->seq() > 0 && src->seq() > 0, "not sequenced");
 
-  MEMFLAGS flags = dest->flags();
+  if (MemTracker::track_callsite()) {
+    *(SeqMemPointerRecordEx*)dest = *(SeqMemPointerRecordEx*)src;
+  } else {
+    *(SeqMemPointerRecord*)dest = *(SeqMemPointerRecord*)src;
+  }
+}
+
+void MemSnapshot::assign_pointer(MemPointerRecord*dest, const MemPointerRecord* src) {
+  assert(src != NULL && dest != NULL, "Just check");
+  assert(dest->seq() == 0 && src->seq() >0, "cast away sequence");
 
   if (MemTracker::track_callsite()) {
     *(MemPointerRecordEx*)dest = *(MemPointerRecordEx*)src;
   } else {
-    *dest = *src;
+    *(MemPointerRecord*)dest = *(MemPointerRecord*)src;
   }
 }
 
-
-// merge a per-thread memory recorder to the staging area
+// merge a recorder to the staging area
 bool MemSnapshot::merge(MemRecorder* rec) {
   assert(rec != NULL && !rec->out_of_memory(), "Just check");
 
-  // out of memory
-  if (_staging_area == NULL || _staging_area->out_of_memory()) {
-    return false;
-  }
-
   SequencedRecordIterator itr(rec->pointer_itr());
 
   MutexLockerEx lock(_lock, true);
-  MemPointerIterator staging_itr(_staging_area);
-  MemPointerRecord *p1, *p2;
-  p1 = (MemPointerRecord*) itr.current();
-  while (p1 != NULL) {
-    p2 = (MemPointerRecord*)staging_itr.locate(p1->addr());
-    // we have not seen this memory block, so just add to staging area
-    if (p2 == NULL) {
-      if (!staging_itr.insert(p1)) {
+  MemPointerIterator malloc_staging_itr(_staging_area.malloc_data());
+  MemPointerRecord* incoming_rec = (MemPointerRecord*) itr.current();
+  MemPointerRecord* matched_rec;
+
+  while (incoming_rec != NULL) {
+    if (incoming_rec->is_vm_pointer()) {
+      // we don't do anything with virtual memory records during merge
+      if (!_staging_area.vm_data()->append(incoming_rec)) {
         return false;
       }
-    } else if (p1->addr() == p2->addr()) {
-      MemPointerRecord* staging_next = (MemPointerRecord*)staging_itr.peek_next();
-      // a memory block can have many tagging records, find right one to replace or
-      // right position to insert
-      while (staging_next != NULL && staging_next->addr() == p1->addr()) {
-        if ((staging_next->flags() & MemPointerRecord::tag_masks) <=
-          (p1->flags() & MemPointerRecord::tag_masks)) {
-          p2 = (MemPointerRecord*)staging_itr.next();
-          staging_next = (MemPointerRecord*)staging_itr.peek_next();
-        } else {
-          break;
+    } else {
+      // locate matched record and/or also position the iterator to proper
+      // location for this incoming record.
+      matched_rec = (MemPointerRecord*)malloc_staging_itr.locate(incoming_rec->addr());
+      // we have not seen this memory block in this generation,
+      // so just add to staging area
+      if (matched_rec == NULL) {
+        if (!malloc_staging_itr.insert(incoming_rec)) {
+          return false;
         }
-      }
-      int df = (p1->flags() & MemPointerRecord::tag_masks) -
-        (p2->flags() & MemPointerRecord::tag_masks);
-      if (df == 0) {
-        assert(p1->seq() > 0, "not sequenced");
-        assert(p2->seq() > 0, "not sequenced");
-        if (p1->seq() > p2->seq()) {
-          copy_pointer(p2, p1);
+      } else if (incoming_rec->addr() == matched_rec->addr()) {
+        // whoever has higher sequence number wins
+        if (incoming_rec->seq() > matched_rec->seq()) {
+          copy_seq_pointer(matched_rec, incoming_rec);
         }
-      } else if (df < 0) {
-        if (!staging_itr.insert(p1)) {
+      } else if (incoming_rec->addr() < matched_rec->addr()) {
+        if (!malloc_staging_itr.insert(incoming_rec)) {
           return false;
         }
       } else {
-        if (!staging_itr.insert_after(p1)) {
-          return false;
-        }
-      }
-    } else if (p1->addr() < p2->addr()) {
-      if (!staging_itr.insert(p1)) {
-        return false;
-      }
-    } else {
-      if (!staging_itr.insert_after(p1)) {
-        return false;
+        ShouldNotReachHere();
       }
     }
-    p1 = (MemPointerRecord*)itr.next();
+    incoming_rec = (MemPointerRecord*)itr.next();
   }
   NOT_PRODUCT(void check_staging_data();)
   return true;
 }
 
 
+// promote data to next generation
+bool MemSnapshot::promote() {
+  assert(_alloc_ptrs != NULL && _vm_ptrs != NULL, "Just check");
+  assert(_staging_area.malloc_data() != NULL && _staging_area.vm_data() != NULL,
+         "Just check");
+  MutexLockerEx lock(_lock, true);
 
-// promote data to next generation
-void MemSnapshot::promote() {
-  assert(_alloc_ptrs != NULL && _staging_area != NULL && _vm_ptrs != NULL,
-    "Just check");
-  MutexLockerEx lock(_lock, true);
-  StagingWalker walker(_staging_area);
-  MemPointerIterator malloc_itr(_alloc_ptrs);
-  VMMemPointerIterator vm_itr(_vm_ptrs);
-  MemPointer* cur = walker.current();
-  while (cur != NULL) {
-    if (walker.is_vm_record()) {
-      VMMemRegion* cur_vm = (VMMemRegion*)cur;
-      VMMemRegion* p = (VMMemRegion*)vm_itr.locate(cur_vm->addr());
-      cur_vm = (VMMemRegion*)cur;
-      if (p != NULL && (p->contains(cur_vm) || p->base() == cur_vm->base())) {
-        assert(p->is_reserve_record() ||
-          p->is_commit_record(), "wrong vm record type");
-        // resize existing reserved range
-        if (cur_vm->is_reserve_record() && p->base() == cur_vm->base()) {
-          assert(cur_vm->size() >= p->committed_size(), "incorrect resizing");
-          p->set_reserved_size(cur_vm->size());
-        } else if (cur_vm->is_commit_record()) {
-          p->commit(cur_vm->committed_size());
-        } else if (cur_vm->is_uncommit_record()) {
-          p->uncommit(cur_vm->committed_size());
-          if (!p->is_reserve_record() && p->committed_size() == 0) {
-            vm_itr.remove();
+  MallocRecordIterator  malloc_itr = _staging_area.malloc_record_walker();
+  bool promoted = false;
+  if (promote_malloc_records(&malloc_itr)) {
+    VMRecordIterator vm_itr = _staging_area.virtual_memory_record_walker();
+    if (promote_virtual_memory_records(&vm_itr)) {
+      promoted = true;
+    }
+  }
+
+  NOT_PRODUCT(check_malloc_pointers();)
+  _staging_area.clear();
+  return promoted;
+}
+
+bool MemSnapshot::promote_malloc_records(MemPointerArrayIterator* itr) {
+  MemPointerIterator malloc_snapshot_itr(_alloc_ptrs);
+  MemPointerRecord* new_rec = (MemPointerRecord*)itr->current();
+  MemPointerRecord* matched_rec;
+  while (new_rec != NULL) {
+    matched_rec = (MemPointerRecord*)malloc_snapshot_itr.locate(new_rec->addr());
+    // found matched memory block
+    if (matched_rec != NULL && new_rec->addr() == matched_rec->addr()) {
+      // snapshot already contains 'live' records
+      assert(matched_rec->is_allocation_record() || matched_rec->is_arena_memory_record(),
+             "Sanity check");
+      // update block states
+      if (new_rec->is_allocation_record()) {
+        assign_pointer(matched_rec, new_rec);
+      } else if (new_rec->is_arena_memory_record()) {
+        if (new_rec->size() == 0) {
+          // remove size record once size drops to 0
+          malloc_snapshot_itr.remove();
+        } else {
+          assign_pointer(matched_rec, new_rec);
+        }
+      } else {
+        // a deallocation record
+        assert(new_rec->is_deallocation_record(), "Sanity check");
+        // an arena record can be followed by a size record (none if it is the last entry); remove both
+        if (matched_rec->is_arena_record()) {
+          MemPointerRecord* next = (MemPointerRecord*)malloc_snapshot_itr.peek_next();
+          if (next != NULL && next->is_arena_memory_record() && next->is_memory_record_of_arena(matched_rec)) {
+            malloc_snapshot_itr.remove();
           }
-        } else if (cur_vm->is_type_tagging_record()) {
-          p->tag(cur_vm->flags());
-        } else if (cur_vm->is_release_record()) {
-          if (cur_vm->base() == p->base() && cur_vm->size() == p->size()) {
-            // release the whole range
-            vm_itr.remove();
+        }
+        // the memory is deallocated, remove related record(s)
+        malloc_snapshot_itr.remove();
+      }
+    } else {
+      // don't insert size 0 record
+      if (new_rec->is_arena_memory_record() && new_rec->size() == 0) {
+        new_rec = NULL;
+      }
+
+      if (new_rec != NULL) {
+        if  (new_rec->is_allocation_record() || new_rec->is_arena_memory_record()) {
+          if (matched_rec != NULL && new_rec->addr() > matched_rec->addr()) {
+            if (!malloc_snapshot_itr.insert_after(new_rec)) {
+              return false;
+            }
           } else {
-            // partial release
-            p->partial_release(cur_vm->base(), cur_vm->size());
+            if (!malloc_snapshot_itr.insert(new_rec)) {
+              return false;
+            }
+          }
+        }
+#ifndef PRODUCT
+        else if (!has_allocation_record(new_rec->addr())) {
+          // NMT can not track some startup memory, which is allocated before NMT is on
+          _untracked_count ++;
+        }
+#endif
+      }
+    }
+    new_rec = (MemPointerRecord*)itr->next();
+  }
+  return true;
+}
+
+bool MemSnapshot::promote_virtual_memory_records(MemPointerArrayIterator* itr) {
+  VMMemPointerIterator vm_snapshot_itr(_vm_ptrs);
+  MemPointerRecord* new_rec = (MemPointerRecord*)itr->current();
+  VMMemRegion*  reserved_rec;
+  while (new_rec != NULL) {
+    assert(new_rec->is_vm_pointer(), "Sanity check");
+
+    // locate a reserved region that contains the specified address, or
+    // the nearest reserved region whose base address is just above the specified
+    // address
+    reserved_rec = (VMMemRegion*)vm_snapshot_itr.locate(new_rec->addr());
+    if (reserved_rec != NULL && reserved_rec->contains_region(new_rec)) {
+      // snapshot can only have 'live' records
+      assert(reserved_rec->is_reserved_region(), "Sanity check");
+      if (new_rec->is_allocation_record()) {
+        if (!reserved_rec->is_same_region(new_rec)) {
+          // only deals with splitting a bigger reserved region into smaller regions.
+          // So far, CDS is the only use case.
+          if (!vm_snapshot_itr.split_reserved_region(reserved_rec, new_rec->addr(), new_rec->size())) {
+            return false;
+          }
+        }
+      } else if (new_rec->is_uncommit_record()) {
+        if (!vm_snapshot_itr.remove_uncommitted_region(new_rec)) {
+          return false;
+        }
+      } else if (new_rec->is_commit_record()) {
+        // insert or expand existing committed region to cover this
+        // newly committed region
+        if (!vm_snapshot_itr.add_committed_region(new_rec)) {
+          return false;
+        }
+      } else if (new_rec->is_deallocation_record()) {
+        // release part or all memory region
+        if (!vm_snapshot_itr.remove_released_region(new_rec)) {
+          return false;
+        }
+      } else if (new_rec->is_type_tagging_record()) {
+        // tag this reserved virtual memory range to a memory type. Can not re-tag a memory range
+        // to different type.
+        assert(FLAGS_TO_MEMORY_TYPE(reserved_rec->flags()) == mtNone ||
+               FLAGS_TO_MEMORY_TYPE(reserved_rec->flags()) == FLAGS_TO_MEMORY_TYPE(new_rec->flags()),
+               "Sanity check");
+        reserved_rec->tag(new_rec->flags());
+    } else {
+        ShouldNotReachHere();
           }
         } else {
-          // we do see multiple reserver on the same vm range
-          assert((cur_vm->is_commit_record() || cur_vm->is_reserve_record()) &&
-             cur_vm->base() == p->base() && cur_vm->size() == p->size(), "bad record");
-          p->tag(cur_vm->flags());
-        }
-      } else {
-        if(cur_vm->is_reserve_record()) {
-          if (p == NULL || p->base() > cur_vm->base()) {
-            vm_itr.insert(cur_vm);
-          } else {
-            vm_itr.insert_after(cur_vm);
+      /*
+       * The assertion failure indicates mismatched virtual memory records. The likely
+       * scenario is that some virtual memory operations do not go through the os::xxxx_memory()
+       * API and therefore have to be tracked manually (perfMemory is an example).
+      */
+      assert(new_rec->is_allocation_record(), "Sanity check");
+      if (!vm_snapshot_itr.add_reserved_region(new_rec)) {
+            return false;
           }
-        } else {
-          // In theory, we should assert without conditions. However, in case of native
-          // thread stack, NMT explicitly releases the thread stack in Thread's destructor,
-          // due to platform dependent behaviors. On some platforms, we see uncommit/release
-          // native thread stack, but some, we don't.
-          assert(cur_vm->is_uncommit_record() || cur_vm->is_deallocation_record(),
-            err_msg("Should not reach here, pointer addr = [" INTPTR_FORMAT "], flags = [%x]",
-               cur_vm->addr(), cur_vm->flags()));
-        }
-      }
-    } else {
-      MemPointerRecord* cur_p = (MemPointerRecord*)cur;
-      MemPointerRecord* p = (MemPointerRecord*)malloc_itr.locate(cur->addr());
-      if (p != NULL && cur_p->addr() == p->addr()) {
-        assert(p->is_allocation_record() || p->is_arena_size_record(), "untracked");
-        if (cur_p->is_allocation_record() || cur_p->is_arena_size_record()) {
-          copy_pointer(p, cur_p);
-        } else {   // deallocation record
-          assert(cur_p->is_deallocation_record(), "wrong record type");
-
-          // we are removing an arena record, we also need to remove its 'size'
-          // record behind it
-          if (p->is_arena_record()) {
-            MemPointerRecord* next_p = (MemPointerRecord*)malloc_itr.peek_next();
-            if (next_p->is_arena_size_record()) {
-              assert(next_p->is_size_record_of_arena(p), "arena records dont match");
-              malloc_itr.remove();
-            }
-          }
-          malloc_itr.remove();
-        }
-      } else {
-        if (cur_p->is_arena_size_record()) {
-          MemPointerRecord* prev_p = (MemPointerRecord*)malloc_itr.peek_prev();
-          if (prev_p != NULL &&
-             (!prev_p->is_arena_record() || !cur_p->is_size_record_of_arena(prev_p))) {
-            // arena already deallocated
-            cur_p = NULL;
-          }
-        }
-        if (cur_p != NULL) {
-          if (cur_p->is_allocation_record() || cur_p->is_arena_size_record()) {
-            if (p != NULL && cur_p->addr() > p->addr()) {
-              malloc_itr.insert_after(cur);
-            } else {
-              malloc_itr.insert(cur);
-            }
-          }
-#ifndef PRODUCT
-          else if (!has_allocation_record(cur_p->addr())){
-            // NMT can not track some startup memory, which allocated before NMT
-            // is enabled
-            _untracked_count ++;
-          }
-#endif
-        }
-      }
-    }
-
-    cur = walker.next();
   }
-  NOT_PRODUCT(check_malloc_pointers();)
-  _staging_area->shrink();
-  _staging_area->clear();
+    new_rec = (MemPointerRecord*)itr->next();
+  }
+  return true;
 }
 
-
 #ifndef PRODUCT
 void MemSnapshot::print_snapshot_stats(outputStream* st) {
   st->print_cr("Snapshot:");
@@ -413,8 +636,15 @@
   st->print_cr("\tVM: %d/%d [%5.2f%%] %dKB", _vm_ptrs->length(), _vm_ptrs->capacity(),
     (100.0 * (float)_vm_ptrs->length()) / (float)_vm_ptrs->capacity(), _vm_ptrs->instance_size()/K);
 
-  st->print_cr("\tStaging:     %d/%d [%5.2f%%] %dKB", _staging_area->length(), _staging_area->capacity(),
-    (100.0 * (float)_staging_area->length()) / (float)_staging_area->capacity(), _staging_area->instance_size()/K);
+  st->print_cr("\tMalloc staging Area:     %d/%d [%5.2f%%] %dKB", _staging_area.malloc_data()->length(),
+    _staging_area.malloc_data()->capacity(),
+    (100.0 * (float)_staging_area.malloc_data()->length()) / (float)_staging_area.malloc_data()->capacity(),
+    _staging_area.malloc_data()->instance_size()/K);
+
+  st->print_cr("\tVirtual memory staging Area:     %d/%d [%5.2f%%] %dKB", _staging_area.vm_data()->length(),
+    _staging_area.vm_data()->capacity(),
+    (100.0 * (float)_staging_area.vm_data()->length()) / (float)_staging_area.vm_data()->capacity(),
+    _staging_area.vm_data()->instance_size()/K);
 
   st->print_cr("\tUntracked allocation: %d", _untracked_count);
 }
@@ -433,7 +663,7 @@
 }
 
 bool MemSnapshot::has_allocation_record(address addr) {
-  MemPointerArrayIteratorImpl itr(_staging_area);
+  MemPointerArrayIteratorImpl itr(_staging_area.malloc_data());
   MemPointerRecord* cur = (MemPointerRecord*)itr.current();
   while (cur != NULL) {
     if (cur->addr() == addr && cur->is_allocation_record()) {
@@ -447,7 +677,7 @@
 
 #ifdef ASSERT
 void MemSnapshot::check_staging_data() {
-  MemPointerArrayIteratorImpl itr(_staging_area);
+  MemPointerArrayIteratorImpl itr(_staging_area.malloc_data());
   MemPointerRecord* cur = (MemPointerRecord*)itr.current();
   MemPointerRecord* next = (MemPointerRecord*)itr.next();
   while (next != NULL) {
@@ -458,6 +688,41 @@
     cur = next;
     next = (MemPointerRecord*)itr.next();
   }
+
+  MemPointerArrayIteratorImpl vm_itr(_staging_area.vm_data());
+  cur = (MemPointerRecord*)vm_itr.current();
+  while (cur != NULL) {
+    assert(cur->is_vm_pointer(), "virtual memory pointer only");
+    cur = (MemPointerRecord*)vm_itr.next();
+  }
+}
+
+void MemSnapshot::dump_all_vm_pointers() {
+  MemPointerArrayIteratorImpl itr(_vm_ptrs);
+  VMMemRegion* ptr = (VMMemRegion*)itr.current();
+  tty->print_cr("dump virtual memory pointers:");
+  while (ptr != NULL) {
+    if (ptr->is_committed_region()) {
+      tty->print("\t");
+    }
+    tty->print("[" PTR_FORMAT " - " PTR_FORMAT "] [%x]", ptr->addr(),
+      (ptr->addr() + ptr->size()), ptr->flags());
+
+    if (MemTracker::track_callsite()) {
+      VMMemRegionEx* ex = (VMMemRegionEx*)ptr;
+      if (ex->pc() != NULL) {
+        char buf[1024];
+        if (os::dll_address_to_function_name(ex->pc(), buf, sizeof(buf), NULL)) {
+          tty->print_cr("\t%s", buf);
+        } else {
+          tty->print_cr("");
+        }
+      }
+    }
+
+    ptr = (VMMemRegion*)itr.next();
+  }
+  tty->flush();
 }
 #endif // ASSERT
 
--- a/src/share/vm/services/memSnapshot.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memSnapshot.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -31,7 +31,6 @@
 #include "services/memBaseline.hpp"
 #include "services/memPtrArray.hpp"
 
-
 // Snapshot pointer array iterator
 
 // The pointer array contains malloc-ed pointers
@@ -111,40 +110,59 @@
       MemPointerIterator(arr) {
   }
 
-  // locate an exiting record that contains specified address, or
-  // the record, where the record with specified address, should
-  // be inserted
+  // locate an existing reserved memory region that contains specified address,
+  // or the reserved region just above this address, where the incoming
+  // reserved region should be inserted.
   virtual MemPointer* locate(address addr) {
-    VMMemRegion* cur = (VMMemRegion*)current();
-    VMMemRegion* next_p;
+    reset();
+    VMMemRegion* reg = (VMMemRegion*)current();
+    while (reg != NULL) {
+      if (reg->is_reserved_region()) {
+        if (reg->contains_address(addr) || addr < reg->base()) {
+          return reg;
+      }
+    }
+      reg = (VMMemRegion*)next();
+    }
+      return NULL;
+    }
 
-    while (cur != NULL) {
-      if (cur->base() > addr) {
-        return cur;
-      } else {
-        // find nearest existing range that has base address <= addr
-        next_p = (VMMemRegion*)peek_next();
-        if (next_p != NULL && next_p->base() <= addr) {
-          cur = (VMMemRegion*)next();
-          continue;
-        }
+  // following methods update virtual memory in the context
+  // of 'current' position, which is properly positioned by
+  // callers via locate method.
+  bool add_reserved_region(MemPointerRecord* rec);
+  bool add_committed_region(MemPointerRecord* rec);
+  bool remove_uncommitted_region(MemPointerRecord* rec);
+  bool remove_released_region(MemPointerRecord* rec);
+
+  // split a reserved region to create a new memory region with specified base and size
+  bool split_reserved_region(VMMemRegion* rgn, address new_rgn_addr, size_t new_rgn_size);
+ private:
+  bool insert_record(MemPointerRecord* rec);
+  bool insert_record_after(MemPointerRecord* rec);
+
+  bool insert_reserved_region(MemPointerRecord* rec);
+
+  // reset current position
+  inline void reset() { _pos = 0; }
+#ifdef ASSERT
+  // check integrity of records on current reserved memory region.
+  bool check_reserved_region() {
+    VMMemRegion* reserved_region = (VMMemRegion*)current();
+    assert(reserved_region != NULL && reserved_region->is_reserved_region(),
+          "Sanity check");
+    // all committed regions that follow current reserved region, should all
+    // belong to the reserved region.
+    VMMemRegion* next_region = (VMMemRegion*)next();
+    for (; next_region != NULL && next_region->is_committed_region();
+         next_region = (VMMemRegion*)next() ) {
+      if(!reserved_region->contains_region(next_region)) {
+        return false;
       }
-
-      if (cur->is_reserve_record() &&
-        cur->base() <= addr &&
-        (cur->base() + cur->size() > addr)) {
-          return cur;
-      } else if (cur->is_commit_record() &&
-        cur->base() <= addr &&
-        (cur->base() + cur->committed_size() > addr)) {
-          return cur;
-      }
-      cur = (VMMemRegion*)next();
     }
-    return NULL;
+    return true;
   }
 
-#ifdef ASSERT
   virtual bool is_dup_pointer(const MemPointer* ptr1,
     const MemPointer* ptr2) const {
     VMMemRegion* p1 = (VMMemRegion*)ptr1;
@@ -162,73 +180,168 @@
 #endif
 };
 
-class StagingWalker : public MemPointerArrayIterator {
+class MallocRecordIterator : public MemPointerArrayIterator {
  private:
   MemPointerArrayIteratorImpl  _itr;
-  bool                         _is_vm_record;
-  bool                         _end_of_array;
-  VMMemRegionEx                _vm_record;
-  MemPointerRecordEx           _malloc_record;
+
+
 
  public:
-  StagingWalker(MemPointerArray* arr): _itr(arr) {
-    _end_of_array = false;
-    next();
+  MallocRecordIterator(MemPointerArray* arr) : _itr(arr) {
   }
 
-  // return the pointer at current position
-  MemPointer* current() const {
-    if (_end_of_array) {
-      return NULL;
+  virtual MemPointer* current() const {
+#ifdef ASSERT
+    MemPointer* cur_rec = _itr.current();
+    if (cur_rec != NULL) {
+      MemPointer* prev_rec = _itr.peek_prev();
+      MemPointer* next_rec = _itr.peek_next();
+      assert(prev_rec == NULL || prev_rec->addr() < cur_rec->addr(), "Sorting order");
+      assert(next_rec == NULL || next_rec->addr() > cur_rec->addr(), "Sorting order");
     }
-    if (is_vm_record()) {
-      return (MemPointer*)&_vm_record;
-    } else {
-      return (MemPointer*)&_malloc_record;
+#endif
+    return _itr.current();
+  }
+  virtual MemPointer* next() {
+    MemPointerRecord* next_rec = (MemPointerRecord*)_itr.next();
+    // arena memory record is a special case, which we have to compare
+    // sequence number against its associated arena record.
+    if (next_rec != NULL && next_rec->is_arena_memory_record()) {
+      MemPointerRecord* prev_rec = (MemPointerRecord*)_itr.peek_prev();
+      // if there is an associated arena record, it has to be previous
+      // record because of sorting order (by address) - NMT generates a pseudo address
+      // for arena's size record by offsetting arena's address, that guarantees
+      // the order of arena record and its size record.
+      if (prev_rec != NULL && prev_rec->is_arena_record() &&
+        next_rec->is_memory_record_of_arena(prev_rec)) {
+        if (prev_rec->seq() > next_rec->seq()) {
+          // Skip this arena memory record
+          // Two scenarios:
+          //   - if the arena record is an allocation record, this early
+          //     size record must be leftover by previous arena,
+          //     and the last size record should have size = 0.
+          //   - if the arena record is a deallocation record, this
+          //     size record should be its cleanup record, which should
+          //     also have size = 0. In other words, an arena always resets
+          //     its size before it is gone (see Arena's destructor)
+          assert(next_rec->size() == 0, "size not reset");
+          return _itr.next();
+        } else {
+          assert(prev_rec->is_allocation_record(),
+            "Arena size record ahead of allocation record");
+        }
+      }
+    }
+    return next_rec;
+  }
+
+  MemPointer* peek_next() const      { ShouldNotReachHere(); return NULL; }
+  MemPointer* peek_prev() const      { ShouldNotReachHere(); return NULL; }
+  void remove()                      { ShouldNotReachHere(); }
+  bool insert(MemPointer* ptr)       { ShouldNotReachHere(); return false; }
+  bool insert_after(MemPointer* ptr) { ShouldNotReachHere(); return false; }
+};
+
+// collapse duplicated records. Eliminating duplicated records here is much
+// cheaper than during promotion phase. However, it does have a limitation - it
+// can only eliminate duplicated records within the generation, so there are
+// still chances of seeing duplicated records during promotion.
+// We want to use the record with higher sequence number, because it has
+// more accurate callsite pc.
+class VMRecordIterator : public MemPointerArrayIterator {
+ private:
+  MemPointerArrayIteratorImpl  _itr;
+
+ public:
+  VMRecordIterator(MemPointerArray* arr) : _itr(arr) {
+    MemPointerRecord* cur = (MemPointerRecord*)_itr.current();
+    MemPointerRecord* next = (MemPointerRecord*)_itr.peek_next();
+    while (next != NULL) {
+      assert(cur != NULL, "Sanity check");
+      assert(((SeqMemPointerRecord*)next)->seq() > ((SeqMemPointerRecord*)cur)->seq(),
+        "pre-sort order");
+
+      if (is_duplicated_record(cur, next)) {
+        _itr.next();
+        next = (MemPointerRecord*)_itr.peek_next();
+      } else {
+        break;
+      }
     }
   }
 
-  // return the next pointer and advance current position
-  MemPointer* next();
-
-  // type of 'current' record
-  bool is_vm_record() const {
-    return _is_vm_record;
+  virtual MemPointer* current() const {
+    return _itr.current();
   }
 
-  // return the next poinger without advancing current position
-  MemPointer* peek_next() const {
-    assert(false, "not supported");
-    return NULL;
+  // get next record, but skip the duplicated records
+  virtual MemPointer* next() {
+    MemPointerRecord* cur = (MemPointerRecord*)_itr.next();
+    MemPointerRecord* next = (MemPointerRecord*)_itr.peek_next();
+    while (next != NULL) {
+      assert(cur != NULL, "Sanity check");
+      assert(((SeqMemPointerRecord*)next)->seq() > ((SeqMemPointerRecord*)cur)->seq(),
+        "pre-sort order");
+
+      if (is_duplicated_record(cur, next)) {
+        _itr.next();
+        cur = next;
+        next = (MemPointerRecord*)_itr.peek_next();
+      } else {
+        break;
+      }
+    }
+    return cur;
   }
 
-  MemPointer* peek_prev() const {
-    assert(false, "not supported");
-    return NULL;
+  MemPointer* peek_next() const      { ShouldNotReachHere(); return NULL; }
+  MemPointer* peek_prev() const      { ShouldNotReachHere(); return NULL; }
+  void remove()                      { ShouldNotReachHere(); }
+  bool insert(MemPointer* ptr)       { ShouldNotReachHere(); return false; }
+  bool insert_after(MemPointer* ptr) { ShouldNotReachHere(); return false; }
+
+ private:
+  bool is_duplicated_record(MemPointerRecord* p1, MemPointerRecord* p2) const {
+    bool ret = (p1->addr() == p2->addr() && p1->size() == p2->size() && p1->flags() == p2->flags());
+    assert(!(ret && FLAGS_TO_MEMORY_TYPE(p1->flags()) == mtThreadStack), "dup on stack record");
+    return ret;
   }
-  // remove the pointer at current position
-  void remove() {
-    assert(false, "not supported");
+};
+
+class StagingArea : public _ValueObj {
+ private:
+  MemPointerArray*   _malloc_data;
+  MemPointerArray*   _vm_data;
+
+ public:
+  StagingArea() : _malloc_data(NULL), _vm_data(NULL) {
+    init();
   }
 
-  // insert the pointer at current position
-  bool insert(MemPointer* ptr) {
-    assert(false, "not supported");
-    return false;
+  ~StagingArea() {
+    if (_malloc_data != NULL) delete _malloc_data;
+    if (_vm_data != NULL) delete _vm_data;
   }
 
-  bool insert_after(MemPointer* ptr) {
-    assert(false, "not supported");
-    return false;
+  MallocRecordIterator malloc_record_walker() {
+    return MallocRecordIterator(malloc_data());
   }
 
- private:
-  // consolidate all records referring to this vm region
-  bool consolidate_vm_records(VMMemRegionEx* vm_rec);
+  VMRecordIterator virtual_memory_record_walker();
+
+  bool init();
+  void clear() {
+    assert(_malloc_data != NULL && _vm_data != NULL, "Just check");
+    _malloc_data->shrink();
+    _malloc_data->clear();
+    _vm_data->clear();
+  }
+
+  inline MemPointerArray* malloc_data() { return _malloc_data; }
+  inline MemPointerArray* vm_data()     { return _vm_data; }
 };
 
 class MemBaseline;
-
 class MemSnapshot : public CHeapObj<mtNMT> {
  private:
   // the following two arrays contain records of all known lived memory blocks
@@ -237,9 +350,7 @@
   // live virtual memory pointers
   MemPointerArray*      _vm_ptrs;
 
-  // stagging a generation's data, before
-  // it can be prompted to snapshot
-  MemPointerArray*      _staging_area;
+  StagingArea           _staging_area;
 
   // the lock to protect this snapshot
   Monitor*              _lock;
@@ -252,18 +363,19 @@
   virtual ~MemSnapshot();
 
   // if we are running out of native memory
-  bool out_of_memory() const {
-    return (_alloc_ptrs == NULL || _staging_area == NULL ||
+  bool out_of_memory() {
+    return (_alloc_ptrs == NULL ||
+      _staging_area.malloc_data() == NULL ||
+      _staging_area.vm_data() == NULL ||
       _vm_ptrs == NULL || _lock == NULL ||
       _alloc_ptrs->out_of_memory() ||
-      _staging_area->out_of_memory() ||
       _vm_ptrs->out_of_memory());
   }
 
   // merge a per-thread memory recorder into staging area
   bool merge(MemRecorder* rec);
   // promote staged data to snapshot
-  void promote();
+  bool promote();
 
 
   void wait(long timeout) {
@@ -276,11 +388,17 @@
   NOT_PRODUCT(void check_staging_data();)
   NOT_PRODUCT(void check_malloc_pointers();)
   NOT_PRODUCT(bool has_allocation_record(address addr);)
+  // dump all virtual memory pointers in snapshot
+  DEBUG_ONLY( void dump_all_vm_pointers();)
 
  private:
-   // copy pointer data from src to dest
-   void copy_pointer(MemPointerRecord* dest, const MemPointerRecord* src);
+   // copy sequenced pointer from src to dest
+   void copy_seq_pointer(MemPointerRecord* dest, const MemPointerRecord* src);
+   // assign a sequenced pointer to non-sequenced pointer
+   void assign_pointer(MemPointerRecord*dest, const MemPointerRecord* src);
+
+   bool promote_malloc_records(MemPointerArrayIterator* itr);
+   bool promote_virtual_memory_records(MemPointerArrayIterator* itr);
 };
 
-
 #endif // SHARE_VM_SERVICES_MEM_SNAPSHOT_HPP
--- a/src/share/vm/services/memTrackWorker.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memTrackWorker.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -118,7 +118,10 @@
           _head = (_head + 1) % MAX_GENERATIONS;
         }
         // promote this generation data to snapshot
-        snapshot->promote();
+        if (!snapshot->promote()) {
+          // failed to promote, means out of memory
+          MemTracker::shutdown(MemTracker::NMT_out_of_memory);
+        }
       } else {
         snapshot->wait(1000);
         ThreadCritical tc;
--- a/src/share/vm/services/memTracker.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memTracker.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -69,10 +69,12 @@
 
 void MemTracker::init_tracking_options(const char* option_line) {
   _tracking_level = NMT_off;
-  if (strncmp(option_line, "=summary", 8) == 0) {
+  if (strcmp(option_line, "=summary") == 0) {
     _tracking_level = NMT_summary;
-  } else if (strncmp(option_line, "=detail", 8) == 0) {
+  } else if (strcmp(option_line, "=detail") == 0) {
     _tracking_level = NMT_detail;
+  } else if (strcmp(option_line, "=off") != 0) {
+    vm_exit_during_initialization("Syntax error, expecting -XX:NativeMemoryTracking=[off|summary|detail]", NULL);
   }
 }
 
@@ -341,6 +343,7 @@
  */
 void MemTracker::create_memory_record(address addr, MEMFLAGS flags,
     size_t size, address pc, Thread* thread) {
+  assert(addr != NULL, "Sanity check");
   if (!shutdown_in_progress()) {
     // single thread, we just write records direct to global recorder,'
     // with any lock
@@ -358,7 +361,7 @@
 
     if (thread != NULL) {
       if (thread->is_Java_thread() && ((JavaThread*)thread)->is_safepoint_visible()) {
-        JavaThread*      java_thread = static_cast<JavaThread*>(thread);
+        JavaThread*      java_thread = (JavaThread*)thread;
         JavaThreadState  state = java_thread->thread_state();
         if (SafepointSynchronize::safepoint_safe(java_thread, state)) {
           // JavaThreads that are safepoint safe, can run through safepoint,
@@ -466,6 +469,8 @@
       // it should guarantee that NMT is fully sync-ed.
       ThreadCritical tc;
 
+      SequenceGenerator::reset();
+
       // walk all JavaThreads to collect recorders
       SyncThreadRecorderClosure stc;
       Threads::threads_do(&stc);
@@ -478,11 +483,12 @@
         pending_recorders = _global_recorder;
         _global_recorder = NULL;
       }
-      SequenceGenerator::reset();
       // check _worker_thread with lock to avoid racing condition
       if (_worker_thread != NULL) {
         _worker_thread->at_sync_point(pending_recorders);
       }
+
+      assert(SequenceGenerator::peek() == 1, "Should not have memory activities during sync-point");
     }
   }
 
--- a/src/share/vm/services/memTracker.hpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/services/memTracker.hpp	Thu Nov 29 22:32:44 2012 -0800
@@ -39,8 +39,10 @@
 #include "thread_solaris.inline.hpp"
 #endif
 
-#ifdef _DEBUG_
-  #define DEBUG_CALLER_PC  os::get_caller_pc(3)
+extern bool NMT_track_callsite;
+
+#ifdef ASSERT
+  #define DEBUG_CALLER_PC  (NMT_track_callsite ? os::get_caller_pc(2) : 0)
 #else
   #define DEBUG_CALLER_PC  0
 #endif
@@ -85,7 +87,7 @@
     NMT_shutdown                         // shutdown
   };
 
-
+ public:
   // native memory tracking level
   enum NMTLevel {
     NMT_off,              // native memory tracking is off
@@ -93,7 +95,6 @@
     NMT_detail            // track callsite also
   };
 
- public:
    enum ShutdownReason {
      NMT_shutdown_none,     // no shutdown requested
      NMT_shutdown_user,     // user requested shutdown
@@ -117,6 +118,10 @@
       _state >= NMT_bootstrapping_single_thread);
   }
 
+  static inline enum NMTLevel tracking_level() {
+    return _tracking_level;
+  }
+
   // user readable reason for shutting down NMT
   static const char* reason() {
     switch(_reason) {
@@ -184,7 +189,8 @@
   // record a 'malloc' call
   static inline void record_malloc(address addr, size_t size, MEMFLAGS flags,
                             address pc = 0, Thread* thread = NULL) {
-    if (NMT_CAN_TRACK(flags)) {
+    if (is_on() && NMT_CAN_TRACK(flags)) {
+      assert(size > 0, "Sanity check");
       create_memory_record(addr, (flags|MemPointerRecord::malloc_tag()), size, pc, thread);
     }
   }
@@ -197,19 +203,21 @@
   // record a 'realloc' call
   static inline void record_realloc(address old_addr, address new_addr, size_t size,
        MEMFLAGS flags, address pc = 0, Thread* thread = NULL) {
-    if (is_on()) {
+    if (is_on() && NMT_CAN_TRACK(flags)) {
+      assert(size > 0, "Sanity check");
       record_free(old_addr, flags, thread);
       record_malloc(new_addr, size, flags, pc, thread);
     }
   }
 
-  // record arena size
+  // record arena memory size
   static inline void record_arena_size(address addr, size_t size) {
-    // we add a positive offset to arena address, so we can have arena size record
+    // we add a positive offset to arena address, so we can have arena memory record
     // sorted after arena record
     if (is_on() && !UseMallocOnly) {
+      assert(addr != NULL, "Sanity check");
       create_memory_record((addr + sizeof(void*)), MemPointerRecord::arena_size_tag(), size,
-        0, NULL);
+        DEBUG_CALLER_PC, NULL);
     }
   }
 
@@ -217,16 +225,39 @@
   static inline void record_virtual_memory_reserve(address addr, size_t size,
                             address pc = 0, Thread* thread = NULL) {
     if (is_on()) {
-      assert(size > 0, "reserve szero size");
+      assert(size > 0, "Sanity check");
       create_memory_record(addr, MemPointerRecord::virtual_memory_reserve_tag(),
                            size, pc, thread);
     }
   }
 
+  static inline void record_thread_stack(address addr, size_t size, Thread* thr,
+                           address pc = 0) {
+    if (is_on()) {
+      assert(size > 0 && thr != NULL, "Sanity check");
+      create_memory_record(addr, MemPointerRecord::virtual_memory_reserve_tag() | mtThreadStack,
+                          size, pc, thr);
+      create_memory_record(addr, MemPointerRecord::virtual_memory_commit_tag() | mtThreadStack,
+                          size, pc, thr);
+    }
+  }
+
+  static inline void release_thread_stack(address addr, size_t size, Thread* thr) {
+    if (is_on()) {
+      assert(size > 0 && thr != NULL, "Sanity check");
+      assert(!thr->is_Java_thread(), "too early");
+      create_memory_record(addr, MemPointerRecord::virtual_memory_uncommit_tag() | mtThreadStack,
+                          size, DEBUG_CALLER_PC, thr);
+      create_memory_record(addr, MemPointerRecord::virtual_memory_release_tag() | mtThreadStack,
+                          size, DEBUG_CALLER_PC, thr);
+    }
+  }
+
   // record a virtual memory 'commit' call
   static inline void record_virtual_memory_commit(address addr, size_t size,
-                            address pc = 0, Thread* thread = NULL) {
+                            address pc, Thread* thread = NULL) {
     if (is_on()) {
+      assert(size > 0, "Sanity check");
       create_memory_record(addr, MemPointerRecord::virtual_memory_commit_tag(),
                            size, pc, thread);
     }
@@ -236,8 +267,9 @@
   static inline void record_virtual_memory_uncommit(address addr, size_t size,
                             Thread* thread = NULL) {
     if (is_on()) {
+      assert(size > 0, "Sanity check");
       create_memory_record(addr, MemPointerRecord::virtual_memory_uncommit_tag(),
-                           size, 0, thread);
+                           size, DEBUG_CALLER_PC, thread);
     }
   }
 
@@ -245,8 +277,9 @@
   static inline void record_virtual_memory_release(address addr, size_t size,
                             Thread* thread = NULL) {
     if (is_on()) {
+      assert(size > 0, "Sanity check");
       create_memory_record(addr, MemPointerRecord::virtual_memory_release_tag(),
-                           size, 0, thread);
+                           size, DEBUG_CALLER_PC, thread);
     }
   }
 
@@ -257,7 +290,7 @@
       assert(base > 0, "wrong base address");
       assert((flags & (~mt_masks)) == 0, "memory type only");
       create_memory_record(base, (flags | MemPointerRecord::virtual_memory_type_tag()),
-                           0, 0, thread);
+                           0, DEBUG_CALLER_PC, thread);
     }
   }
 
--- a/src/share/vm/utilities/vmError.cpp	Thu Nov 29 19:41:00 2012 -0800
+++ b/src/share/vm/utilities/vmError.cpp	Thu Nov 29 22:32:44 2012 -0800
@@ -453,7 +453,9 @@
      JDK_Version::current().to_string(buf, sizeof(buf));
      const char* runtime_name = JDK_Version::runtime_name() != NULL ?
                                   JDK_Version::runtime_name() : "";
-     st->print_cr("# JRE version: %s (%s)", runtime_name, buf);
+     const char* runtime_version = JDK_Version::runtime_version() != NULL ?
+                                  JDK_Version::runtime_version() : "";
+     st->print_cr("# JRE version: %s (%s) (build %s)", runtime_name, buf, runtime_version);
      st->print_cr("# Java VM: %s (%s %s %s %s)",
                    Abstract_VM_Version::vm_name(),
                    Abstract_VM_Version::vm_release(),
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7184394/TestAESBase.java	Thu Nov 29 22:32:44 2012 -0800
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @author Tom Deneau
+ */
+
+import javax.crypto.Cipher;
+import javax.crypto.KeyGenerator;
+import javax.crypto.SecretKey;
+import javax.crypto.spec.IvParameterSpec;
+import javax.crypto.spec.SecretKeySpec;
+import java.security.AlgorithmParameters;
+
+import java.util.Random;
+import java.util.Arrays;
+
+abstract public class TestAESBase {
+  int msgSize = Integer.getInteger("msgSize", 646);
+  boolean checkOutput = Boolean.getBoolean("checkOutput");
+  boolean noReinit = Boolean.getBoolean("noReinit");
+  int keySize = Integer.getInteger("keySize", 128);
+  String algorithm = System.getProperty("algorithm", "AES");
+  String mode = System.getProperty("mode", "CBC");
+  byte[] input;
+  byte[] encode;
+  byte[] expectedEncode;
+  byte[] decode;
+  byte[] expectedDecode;
+  Random random = new Random(0);
+  Cipher cipher;
+  Cipher dCipher;
+  String paddingStr = "PKCS5Padding";
+  AlgorithmParameters algParams;
+  SecretKey key;
+  int ivLen;
+
+  static int numThreads = 0;
+  int  threadId;
+  static synchronized int getThreadId() {
+    int id = numThreads;
+    numThreads++;
+    return id;
+  }
+
+  abstract public void run();
+
+  public void prepare() {
+    try {
+    System.out.println("\nmsgSize=" + msgSize + ", key size=" + keySize + ", reInit=" + !noReinit + ", checkOutput=" + checkOutput);
+
+      int keyLenBytes = (keySize == 0 ? 16 : keySize/8);
+      byte keyBytes[] = new byte[keyLenBytes];
+      if (keySize == 128)
+        keyBytes = new byte[] {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7};
+      else
+        random.nextBytes(keyBytes);
+
+      key = new SecretKeySpec(keyBytes, algorithm);
+      if (threadId == 0) {
+        System.out.println("Algorithm: " + key.getAlgorithm() + "("
+                           + key.getEncoded().length * 8 + "bit)");
+      }
+      input = new byte[msgSize];
+      for (int i=0; i<input.length; i++) {
+        input[i] = (byte) (i & 0xff);
+      }
+
+      cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
+      dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
+
+      ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0);
+      IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]);
+
+      cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
+      algParams = cipher.getParameters();
+      dCipher.init(Cipher.DECRYPT_MODE, key, algParams);
+      if (threadId == 0) {
+        childShowCipher();
+      }
+
+      // do one encode and decode in preparation
+      // this will also create the encode buffer and decode buffer
+      encode = cipher.doFinal(input);
+      decode = dCipher.doFinal(encode);
+      if (checkOutput) {
+        expectedEncode = (byte[]) encode.clone();
+        expectedDecode = (byte[]) decode.clone();
+        showArray(key.getEncoded()  ,  "key:    ");
+        showArray(input,  "input:  ");
+        showArray(encode, "encode: ");
+        showArray(decode, "decode: ");
+      }
+    }
+    catch (Exception e) {
+      e.printStackTrace();
+      System.exit(1);
+    }
+  }
+
+  void showArray(byte b[], String name) {
+    System.out.format("%s [%d]: ", name, b.length);
+    for (int i=0; i<Math.min(b.length, 32); i++) {
+      System.out.format("%02x ", b[i] & 0xff);
+    }
+    System.out.println();
+  }
+
+  void compareArrays(byte b[], byte exp[]) {
+    if (b.length != exp.length) {
+      System.out.format("different lengths for actual and expected output arrays\n");
+      showArray(b, "test: ");
+      showArray(exp, "exp : ");
+      System.exit(1);
+    }
+    for (int i=0; i< exp.length; i++) {
+      if (b[i] != exp[i]) {
+        System.out.format("output error at index %d: got %02x, expected %02x\n", i, b[i] & 0xff, exp[i] & 0xff);
+        showArray(b, "test: ");
+        showArray(exp, "exp : ");
+        System.exit(1);
+      }
+    }
+  }
+
+
+  void showCipher(Cipher c, String kind) {
+    System.out.println(kind + " cipher provider: " + cipher.getProvider());
+    System.out.println(kind + " cipher algorithm: " + cipher.getAlgorithm());
+  }
+
+  abstract void childShowCipher();
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7184394/TestAESDecode.java	Thu Nov 29 22:32:44 2012 -0800
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @author Tom Deneau
+ */
+
+import javax.crypto.Cipher;
+
+public class TestAESDecode extends TestAESBase {
+  @Override
+  public void run() {  // one decryption pass over the bytes currently held in 'encode'
+    try {
+      if (!noReinit) {
+        dCipher.init(Cipher.DECRYPT_MODE, key, algParams);
+      }
+      if (!checkOutput) {
+        // unchecked path: decrypt into a buffer we size ourselves via getOutputSize()
+        decode = new byte[dCipher.getOutputSize(encode.length)];
+        dCipher.doFinal(encode, 0, encode.length, decode);
+      } else {
+        // checked path: doFinal returns a fresh array, which is compared with expectedDecode
+        decode = dCipher.doFinal(encode, 0, encode.length);
+        compareArrays(decode, expectedDecode);
+      }
+    } catch (Exception ex) {
+      ex.printStackTrace();
+      System.exit(1);
+    }
+  }
+
+  @Override
+  void childShowCipher() {  // describes the decrypting cipher
+    showCipher(dCipher, "Decryption");
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7184394/TestAESEncode.java	Thu Nov 29 22:32:44 2012 -0800
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @author Tom Deneau
+ */
+
+import javax.crypto.Cipher;
+
+public class TestAESEncode extends TestAESBase {
+  @Override
+  public void run() {  // one encryption pass over the first msgSize bytes of 'input'
+    try {
+      if (!noReinit) {
+        cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
+      }
+      if (!checkOutput) {
+        // unchecked path: encrypt into a buffer we size ourselves via getOutputSize()
+        encode = new byte[cipher.getOutputSize(msgSize)];
+        cipher.doFinal(input, 0, msgSize, encode);
+      } else {
+        // checked path: doFinal returns a fresh array, which is compared with expectedEncode
+        encode = cipher.doFinal(input, 0, msgSize);
+        compareArrays(encode, expectedEncode);
+      }
+    } catch (Exception ex) {
+      ex.printStackTrace();
+      System.exit(1);
+    }
+  }
+
+  @Override
+  void childShowCipher() {  // describes the encrypting cipher
+    showCipher(cipher, "Encryption");
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7184394/TestAESMain.java	Thu Nov 29 22:32:44 2012 -0800
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7184394
+ * @summary add intrinsics to use AES instructions
+ *
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true TestAESMain
+ *
+ * @author Tom Deneau
+ */
+
+public class TestAESMain {  // test entry point: times TestAESEncode.run() and then TestAESDecode.run()
+  public static void main(String[] args) {  // args[0], when given, replaces the default 1000000 iterations
+    int iters = (args.length > 0 ? Integer.parseInt(args[0]) : 1000000);
+    System.out.println(iters + " iterations");
+    TestAESEncode etest = new TestAESEncode();
+    etest.prepare();
+    long start = System.nanoTime();
+    for (int i=0; i<iters; i++) {
+      etest.run();
+    }
+    long end = System.nanoTime();  // nanoseconds; 1 ms = 1000000 ns, so divide by 1e6 to match the "ms" label
+    System.out.println("TestAESEncode runtime was " + ((end - start) / 1000000.0) + " ms");
+
+    TestAESDecode dtest = new TestAESDecode();
+    dtest.prepare();
+    start = System.nanoTime();
+    for (int i=0; i<iters; i++) {
+      dtest.run();
+    }
+    end = System.nanoTime();
+    System.out.println("TestAESDecode runtime was " + ((end - start) / 1000000.0) + " ms");
+  }
+}