changeset 1687:dee553c74493

Merge
author never
date Wed, 01 Sep 2010 00:40:05 -0700
parents f208bf19192d 02f0a9b6f654
children 40d7b43b6fe0 179464550c7d f353275af40e
files
diffstat 27 files changed, 1121 insertions(+), 154 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -1588,6 +1588,185 @@
   }
 
   //
+  //  Generate stub for filling a byte, short or int array with a value.
+  //  If "aligned" is true, the "to" address is assumed to be heapword aligned.
+  //
+  // Arguments for generated stub:
+  //      to:    O0
+  //      value: O1
+  //      count: O2 treated as signed
+  //
+  address generate_fill(BasicType t, bool aligned, const char* name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    const Register to        = O0;   // destination array address
+    const Register value     = O1;   // fill value
+    const Register count     = O2;   // elements count
+    // O3 is used as a temp register
+
+    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
+
+    Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
+    Label L_fill_2_bytes, L_fill_4_bytes, L_fill_32_bytes;
+
+    int shift = -1;                  // (1 << shift) elements occupy 4 bytes
+    switch (t) {
+       case T_BYTE:
+        shift = 2;
+        break;
+       case T_SHORT:
+        shift = 1;
+        break;
+      case T_INT:
+         shift = 0;
+        break;
+      default: ShouldNotReachHere();
+    }
+
+    BLOCK_COMMENT("Entry:");
+
+    if (t == T_BYTE) {
+      // Zero extend value, then replicate the byte into the low 16 bits
+      __ and3(value, 0xff, value);
+      __ sllx(value, 8, O3);
+      __ or3(value, O3, value);
+    }
+    if (t == T_SHORT) {
+      // Zero extend value
+      __ sethi(0xffff0000, O3);
+      __ andn(value, O3, value);
+    }
+    if (t == T_BYTE || t == T_SHORT) {
+      __ sllx(value, 16, O3);        // replicate the 16-bit pattern into the low 32 bits
+      __ or3(value, O3, value);
+    }
+
+    __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
+    __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_4_bytes); // use unsigned cmp
+    __ delayed()->andcc(count, 1<<shift, G0);
+
+    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
+      // align destination address at 4 bytes address boundary
+      if (t == T_BYTE) {
+        // One byte misalignment happens only for byte arrays
+        __ andcc(to, 1, G0);
+        __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
+        __ delayed()->nop();
+        __ stb(value, to, 0);
+        __ inc(to, 1);
+        __ dec(count, 1);
+        __ BIND(L_skip_align1);
+      }
+      // Two bytes misalignment happens only for byte and short (char) arrays
+      __ andcc(to, 2, G0);
+      __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
+      __ delayed()->nop();
+      __ sth(value, to, 0);
+      __ inc(to, 2);
+      __ dec(count, 1 << (shift - 1));
+      __ BIND(L_skip_align2);
+    }
+#ifdef _LP64
+    if (!aligned) {
+#endif
+    // align to 8 bytes, we know we are 4 byte aligned to start
+    __ andcc(to, 7, G0);
+    __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
+    __ delayed()->nop();
+    __ stw(value, to, 0);
+    __ inc(to, 4);
+    __ dec(count, 1 << shift);
+    __ BIND(L_fill_32_bytes);
+#ifdef _LP64
+    }
+#endif
+
+    if (t == T_INT) {
+      // Zero extend value; must precede the size check below, the 8-byte loop also uses stx
+      __ srl(value, 0, value);
+    }
+    if (t == T_BYTE || t == T_SHORT || t == T_INT) {
+      __ sllx(value, 32, O3);        // replicate the 32-bit pattern into all 64 bits
+      __ or3(value, O3, value);
+    }
+
+    Label L_check_fill_8_bytes;
+    // Fill 32-byte chunks
+    __ subcc(count, 8 << shift, count);
+    __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
+    __ delayed()->nop();
+
+    Label L_fill_32_bytes_loop;
+    __ align(16);
+    __ BIND(L_fill_32_bytes_loop);
+
+    __ stx(value, to, 0);
+    __ stx(value, to, 8);
+    __ stx(value, to, 16);
+    __ stx(value, to, 24);
+
+    __ subcc(count, 8 << shift, count);
+    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
+    __ delayed()->add(to, 32, to);
+
+    __ BIND(L_check_fill_8_bytes);
+    __ addcc(count, 8 << shift, count);   // undo the last subtraction: count = elements left (< 32 bytes)
+    __ brx(Assembler::zero, false, Assembler::pn, L_exit);
+    __ delayed()->subcc(count, 1 << (shift + 1), count);
+    __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
+    __ delayed()->andcc(count, 1<<shift, G0);
+
+    //
+    // length is too short, just fill 8 bytes at a time
+    //
+    Label L_fill_8_bytes_loop;
+    __ BIND(L_fill_8_bytes_loop);
+    __ stx(value, to, 0);                 // relies on 'value' holding the full 64-bit pattern
+    __ subcc(count, 1 << (shift + 1), count);
+    __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
+    __ delayed()->add(to, 8, to);
+
+    // fill trailing 4 bytes
+    __ andcc(count, 1<<shift, G0);  // in delay slot of branches
+    __ BIND(L_fill_4_bytes);        // NOTE(review): also reached directly for short arrays, skipping the alignment code above - confirm 'to' is suitably aligned for stw/sth
+    __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
+    if (t == T_BYTE || t == T_SHORT) {
+      __ delayed()->andcc(count, 1<<(shift-1), G0);
+    } else {
+      __ delayed()->nop();
+    }
+    __ stw(value, to, 0);
+    if (t == T_BYTE || t == T_SHORT) {
+      __ inc(to, 4);
+      // fill trailing 2 bytes
+      __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
+      __ BIND(L_fill_2_bytes);
+      __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
+      __ delayed()->andcc(count, 1, count);
+      __ sth(value, to, 0);
+      if (t == T_BYTE) {
+        __ inc(to, 2);
+        // fill trailing byte
+        __ andcc(count, 1, count);  // in delay slot of branches
+        __ BIND(L_fill_byte);
+        __ brx(Assembler::zero, false, Assembler::pt, L_exit);
+        __ delayed()->nop();
+        __ stb(value, to, 0);
+      } else {
+        __ BIND(L_fill_byte);
+      }
+    } else {
+      __ BIND(L_fill_2_bytes);
+    }
+    __ BIND(L_exit);
+    __ retl();
+    __ delayed()->mov(G0, O0); // return 0
+    return start;
+  }
+
+  //
   //  Generate stub for conjoint short copy.  If "aligned" is true, the
   //  "from" and "to" addresses are assumed to be heapword aligned.
   //
@@ -2855,6 +3034,13 @@
     StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy");
     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy");
     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy");
+
+    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
+    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
+    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
+    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
+    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
+    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
   }
 
   void generate_initial() {
--- a/src/cpu/x86/vm/assembler_x86.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -8767,6 +8767,186 @@
   bind(DONE);
 }
 
+#ifdef PRODUCT
+#define BLOCK_COMMENT(str) /* nothing */
+#else
+#define BLOCK_COMMENT(str) block_comment(str)
+#endif
+
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
+void MacroAssembler::generate_fill(BasicType t, bool aligned,
+                                   Register to, Register value, Register count,
+                                   Register rtmp, XMMRegister xtmp) { // emits code storing 'value' into 'count' elements of type 't' starting at 'to'
+  assert_different_registers(to, value, count, rtmp);
+  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
+  Label L_fill_2_bytes, L_fill_4_bytes;
+
+  int shift = -1; // (1 << shift) elements occupy 4 bytes
+  switch (t) {
+    case T_BYTE:
+      shift = 2;
+      break;
+    case T_SHORT:
+      shift = 1;
+      break;
+    case T_INT:
+      shift = 0;
+      break;
+    default: ShouldNotReachHere();
+  }
+
+  if (t == T_BYTE) { // zero extend the byte and replicate it into the low 16 bits
+    andl(value, 0xff);
+    movl(rtmp, value);
+    shll(rtmp, 8);
+    orl(value, rtmp);
+  }
+  if (t == T_SHORT) { // zero extend the short
+    andl(value, 0xffff);
+  }
+  if (t == T_BYTE || t == T_SHORT) { // replicate the 16-bit pattern into all 32 bits
+    movl(rtmp, value);
+    shll(rtmp, 16);
+    orl(value, rtmp);
+  }
+
+  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
+  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
+  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
+    // align destination address at 4 bytes address boundary
+    if (t == T_BYTE) {
+      // One byte misalignment happens only for byte arrays
+      testptr(to, 1);
+      jccb(Assembler::zero, L_skip_align1);
+      movb(Address(to, 0), value);
+      increment(to);
+      decrement(count);
+      BIND(L_skip_align1);
+    }
+    // Two bytes misalignment happens only for byte and short (char) arrays
+    testptr(to, 2);
+    jccb(Assembler::zero, L_skip_align2);
+    movw(Address(to, 0), value);
+    addptr(to, 2);
+    subl(count, 1<<(shift-1));
+    BIND(L_skip_align2);
+  }
+  if (UseSSE < 2) { // no SSE2: fill with 32-bit general-purpose register stores only
+    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
+    // Fill 32-byte chunks
+    subl(count, 8 << shift);
+    jcc(Assembler::less, L_check_fill_8_bytes);
+    align(16);
+
+    BIND(L_fill_32_bytes_loop);
+
+    for (int i = 0; i < 32; i += 4) { // unrolled at code-generation time: eight 4-byte stores
+      movl(Address(to, i), value);
+    }
+
+    addptr(to, 32);
+    subl(count, 8 << shift);
+    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
+    BIND(L_check_fill_8_bytes);
+    addl(count, 8 << shift); // undo the last subtraction: count = elements left (< 32 bytes)
+    jccb(Assembler::zero, L_exit);
+    jmpb(L_fill_8_bytes); // enter the 8-byte loop at its count check
+
+    //
+    // length is too short, just fill qwords
+    //
+    BIND(L_fill_8_bytes_loop);
+    movl(Address(to, 0), value);
+    movl(Address(to, 4), value);
+    addptr(to, 8);
+    BIND(L_fill_8_bytes);
+    subl(count, 1 << (shift + 1));
+    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
+    // fall through to fill 4 bytes
+  } else {
+    Label L_fill_32_bytes;
+    if (!UseUnalignedLoadStores) {
+      // align to 8 bytes, we know we are 4 byte aligned to start
+      testptr(to, 4);
+      jccb(Assembler::zero, L_fill_32_bytes);
+      movl(Address(to, 0), value);
+      addptr(to, 4);
+      subl(count, 1<<shift);
+    }
+    BIND(L_fill_32_bytes);
+    {
+      assert( UseSSE >= 2, "supported cpu only" );
+      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
+      // Fill 32-byte chunks
+      movdl(xtmp, value);
+      pshufd(xtmp, xtmp, 0); // broadcast the low dword (the 32-bit pattern) to every dword of xtmp
+
+      subl(count, 8 << shift);
+      jcc(Assembler::less, L_check_fill_8_bytes);
+      align(16);
+
+      BIND(L_fill_32_bytes_loop);
+
+      if (UseUnalignedLoadStores) {
+        movdqu(Address(to, 0), xtmp);
+        movdqu(Address(to, 16), xtmp);
+      } else {
+        movq(Address(to, 0), xtmp);
+        movq(Address(to, 8), xtmp);
+        movq(Address(to, 16), xtmp);
+        movq(Address(to, 24), xtmp);
+      }
+
+      addptr(to, 32);
+      subl(count, 8 << shift);
+      jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
+      BIND(L_check_fill_8_bytes);
+      addl(count, 8 << shift); // undo the last subtraction: count = elements left (< 32 bytes)
+      jccb(Assembler::zero, L_exit);
+      jmpb(L_fill_8_bytes); // enter the 8-byte loop at its count check
+
+      //
+      // length is too short, just fill qwords
+      //
+      BIND(L_fill_8_bytes_loop);
+      movq(Address(to, 0), xtmp);
+      addptr(to, 8);
+      BIND(L_fill_8_bytes);
+      subl(count, 1 << (shift + 1));
+      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
+    }
+  }
+  // fill trailing 4 bytes
+  BIND(L_fill_4_bytes); // also the entry point for short arrays (< 8 bytes)
+  testl(count, 1<<shift);
+  jccb(Assembler::zero, L_fill_2_bytes);
+  movl(Address(to, 0), value);
+  if (t == T_BYTE || t == T_SHORT) {
+    addptr(to, 4);
+    BIND(L_fill_2_bytes);
+    // fill trailing 2 bytes
+    testl(count, 1<<(shift-1));
+    jccb(Assembler::zero, L_fill_byte);
+    movw(Address(to, 0), value);
+    if (t == T_BYTE) {
+      addptr(to, 2);
+      BIND(L_fill_byte);
+      // fill trailing byte
+      testl(count, 1);
+      jccb(Assembler::zero, L_exit);
+      movb(Address(to, 0), value);
+    } else {
+      BIND(L_fill_byte); // shorts have no odd trailing byte; bind the label only
+    }
+  } else {
+    BIND(L_fill_2_bytes); // ints have no 2-byte tail; bind the label only
+  }
+  BIND(L_exit);
+}
+#undef BIND
+#undef BLOCK_COMMENT
+
+
 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
   switch (cond) {
     // Note some conditions are synonyms for others
--- a/src/cpu/x86/vm/assembler_x86.hpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Wed Sep 01 00:40:05 2010 -0700
@@ -2242,6 +2242,11 @@
                           Register limit, Register result, Register chr,
                           XMMRegister vec1, XMMRegister vec2);
 
+  // Fill primitive arrays
+  void generate_fill(BasicType t, bool aligned,
+                     Register to, Register value, Register count,
+                     Register rtmp, XMMRegister xtmp);
+
 #undef VIRTUAL
 
 };
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -1039,6 +1039,33 @@
   }
 
 
+  address generate_fill(BasicType t, bool aligned, const char *name) { // generates the 32-bit x86 fill stub named 'name'; returns its entry address
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    BLOCK_COMMENT("Entry:");
+
+    const Register to       = rdi;  // destination array address
+    const Register value    = rdx;  // value
+    const Register count    = rsi;  // elements count
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    __ push(rsi); // rsi and rdi are saved here and restored before returning
+    __ push(rdi);
+    __ movptr(to   , Address(rsp, 12+ 4)); // presumably 12 skips saved rdi, rsi and the frame pointer pushed by enter(), +4 the return address - TODO confirm
+    __ movl(value, Address(rsp, 12+ 8));
+    __ movl(count, Address(rsp, 12+ 12));
+
+    __ generate_fill(t, aligned, to, value, count, rax, xmm0); // rax and xmm0 serve as temporaries
+
+    __ pop(rdi);
+    __ pop(rsi);
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+    return start;
+  }
+
   address generate_conjoint_copy(BasicType t, bool aligned,
                                  Address::ScaleFactor sf,
                                  address nooverlap_target,
@@ -2001,6 +2028,13 @@
         generate_conjoint_long_copy(entry, &entry_jlong_arraycopy,
                                     "jlong_arraycopy");
 
+    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
+    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
+    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
+    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
+    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
+    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
+
     StubRoutines::_arrayof_jint_disjoint_arraycopy  =
         StubRoutines::_jint_disjoint_arraycopy;
     StubRoutines::_arrayof_oop_disjoint_arraycopy   =
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -1625,6 +1625,26 @@
     return start;
   }
 
+  address generate_fill(BasicType t, bool aligned, const char *name) { // generates the 64-bit x86 fill stub named 'name'; returns its entry address
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    BLOCK_COMMENT("Entry:");
+
+    const Register to       = c_rarg0;  // destination array address
+    const Register value    = c_rarg1;  // value
+    const Register count    = c_rarg2;  // elements count
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    __ generate_fill(t, aligned, to, value, count, rax, xmm0); // rax and xmm0 serve as temporaries
+
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+    return start;
+  }
+
   // Arguments:
   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   //             ignored
@@ -2712,6 +2732,13 @@
     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy");
     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy");
 
+    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
+    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
+    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
+    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
+    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
+    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
+
     // We don't generate specialized code for HeapWord-aligned source
     // arrays, so just use the code we've already generated
     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
--- a/src/share/vm/asm/codeBuffer.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/asm/codeBuffer.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -143,13 +143,6 @@
 
 void CodeBuffer::initialize_section_size(CodeSection* cs, csize_t size) {
   assert(cs != &_insts, "insts is the memory provider, not the consumer");
-#ifdef ASSERT
-  for (int n = (int)SECT_INSTS+1; n < (int)SECT_LIMIT; n++) {
-    CodeSection* prevCS = code_section(n);
-    if (prevCS == cs)  break;
-    assert(!prevCS->is_allocated(), "section allocation must be in reverse order");
-  }
-#endif
   csize_t slop = CodeSection::end_slop();  // margin between sections
   int align = cs->alignment();
   assert(is_power_of_2(align), "sanity");
@@ -199,13 +192,13 @@
     _total_start = start;
     _total_size  = end - start;
   } else {
-    #ifdef ASSERT
+#ifdef ASSERT
     // Clean out dangling pointers.
     _total_start    = badAddress;
+    _consts._start  = _consts._end  = badAddress;
     _insts._start   = _insts._end   = badAddress;
     _stubs._start   = _stubs._end   = badAddress;
-    _consts._start  = _consts._end  = badAddress;
-    #endif //ASSERT
+#endif //ASSERT
   }
 }
 
@@ -221,9 +214,9 @@
   return NULL;
 #else //PRODUCT
   switch (n) {
+  case SECT_CONSTS:            return "consts";
   case SECT_INSTS:             return "insts";
   case SECT_STUBS:             return "stubs";
-  case SECT_CONSTS:            return "consts";
   default:                     return NULL;
   }
 #endif //PRODUCT
@@ -445,12 +438,11 @@
 
   const CodeSection* prev_cs      = NULL;
   CodeSection*       prev_dest_cs = NULL;
-  for (int n = 0; n < (int)SECT_LIMIT; n++) {
+
+  for (int n = (int) SECT_FIRST; n < (int) SECT_LIMIT; n++) {
     // figure compact layout of each section
     const CodeSection* cs = code_section(n);
-    address cstart = cs->start();
-    address cend   = cs->end();
-    csize_t csize  = cend - cstart;
+    csize_t csize = cs->size();
 
     CodeSection* dest_cs = dest->code_section(n);
     if (!cs->is_empty()) {
@@ -463,7 +455,7 @@
         prev_dest_cs->_limit += padding;
       }
       #ifdef ASSERT
-      if (prev_cs != NULL && prev_cs->is_frozen() && n < SECT_CONSTS) {
+      if (prev_cs != NULL && prev_cs->is_frozen() && n < (SECT_LIMIT - 1)) {
         // Make sure the ends still match up.
         // This is important because a branch in a frozen section
         // might target code in a following section, via a Label,
@@ -492,22 +484,18 @@
   assert(dest->verify_section_allocation(), "final configuration works");
 }
 
-csize_t CodeBuffer::total_offset_of(address addr) const {
-  csize_t code_size_so_far = 0;
-  for (int n = 0; n < (int)SECT_LIMIT; n++) {
-    const CodeSection* cs = code_section(n);
-    if (!cs->is_empty()) {
-      code_size_so_far = cs->align_at_start(code_size_so_far);
+csize_t CodeBuffer::total_offset_of(CodeSection* cs) const {
+  csize_t size_so_far = 0;
+  for (int n = (int) SECT_FIRST; n < (int) SECT_LIMIT; n++) {
+    const CodeSection* cur_cs = code_section(n);
+    if (!cur_cs->is_empty()) {
+      size_so_far = cur_cs->align_at_start(size_so_far);
     }
-    if (cs->contains2(addr)) {
-      return code_size_so_far + (addr - cs->start());
+    if (cur_cs->index() == cs->index()) {
+      return size_so_far;
     }
-    code_size_so_far += cs->size();
+    size_so_far += cur_cs->size();
   }
-#ifndef PRODUCT
-  tty->print_cr("Dangling address " PTR_FORMAT " in:", addr);
-  ((CodeBuffer*)this)->print();
-#endif
   ShouldNotReachHere();
   return -1;
 }
@@ -533,7 +521,7 @@
 
   csize_t code_end_so_far = 0;
   csize_t code_point_so_far = 0;
-  for (int n = 0; n < (int)SECT_LIMIT; n++) {
+  for (int n = (int) SECT_FIRST; n < (int)SECT_LIMIT; n++) {
     // pull relocs out of each section
     const CodeSection* cs = code_section(n);
     assert(!(cs->is_empty() && cs->locs_count() > 0), "sanity");
@@ -635,11 +623,14 @@
   ICache::invalidate_range(dest_blob->code_begin(), dest_blob->code_size());
 }
 
-// Move all my code into another code buffer.
-// Consult applicable relocs to repair embedded addresses.
+// Move all my code into another code buffer.  Consult applicable
+// relocs to repair embedded addresses.  The layout in the destination
+// CodeBuffer is different to the source CodeBuffer: the destination
+// CodeBuffer gets the final layout (consts, insts, stubs in order of
+// ascending address).
 void CodeBuffer::relocate_code_to(CodeBuffer* dest) const {
   DEBUG_ONLY(address dest_end = dest->_total_start + dest->_total_size);
-  for (int n = 0; n < (int)SECT_LIMIT; n++) {
+  for (int n = (int) SECT_FIRST; n < (int) SECT_LIMIT; n++) {
     // pull code out of each section
     const CodeSection* cs = code_section(n);
     if (cs->is_empty())  continue;  // skip trivial section
@@ -681,20 +672,19 @@
                                                csize_t* new_capacity) {
   csize_t new_total_cap = 0;
 
-  int prev_n = -1;
-  for (int n = 0; n < (int)SECT_LIMIT; n++) {
+  for (int n = (int) SECT_FIRST; n < (int) SECT_LIMIT; n++) {
     const CodeSection* sect = code_section(n);
 
     if (!sect->is_empty()) {
-      // Compute initial padding; assign it to the previous non-empty guy.
-      // Cf. compute_final_layout.
+      // Compute initial padding; assign it to the previous section,
+      // even if it's empty (e.g. consts section can be empty).
+      // Cf. compute_final_layout
       csize_t padding = sect->align_at_start(new_total_cap) - new_total_cap;
       if (padding != 0) {
         new_total_cap += padding;
-        assert(prev_n >= 0, "sanity");
-        new_capacity[prev_n] += padding;
+        assert(n - 1 >= SECT_FIRST, "sanity");
+        new_capacity[n - 1] += padding;
       }
-      prev_n = n;
     }
 
     csize_t exp = sect->size();  // 100% increase
@@ -774,11 +764,11 @@
   this->_before_expand = bxp;
 
   // Give each section its required (expanded) capacity.
-  for (int n = (int)SECT_LIMIT-1; n >= SECT_INSTS; n--) {
+  for (int n = (int)SECT_LIMIT-1; n >= SECT_FIRST; n--) {
     CodeSection* cb_sect   = cb.code_section(n);
     CodeSection* this_sect = code_section(n);
     if (new_capacity[n] == 0)  continue;  // already nulled out
-    if (n > SECT_INSTS) {
+    if (n != SECT_INSTS) {
       cb.initialize_section_size(cb_sect, new_capacity[n]);
     }
     assert(cb_sect->capacity() >= new_capacity[n], "big enough");
@@ -844,17 +834,22 @@
     assert(tstart >= _blob->content_begin(), "sanity");
     assert(tend   <= _blob->content_end(),   "sanity");
   }
-  address tcheck = tstart;  // advancing pointer to verify disjointness
-  for (int n = 0; n < (int)SECT_LIMIT; n++) {
+  // Verify disjointness.
+  for (int n = (int) SECT_FIRST; n < (int) SECT_LIMIT; n++) {
     CodeSection* sect = code_section(n);
-    if (!sect->is_allocated())  continue;
-    assert(sect->start() >= tcheck, "sanity");
-    tcheck = sect->start();
-    assert((intptr_t)tcheck % sect->alignment() == 0
+    if (!sect->is_allocated() || sect->is_empty())  continue;
+    assert((intptr_t)sect->start() % sect->alignment() == 0
            || sect->is_empty() || _blob == NULL,
            "start is aligned");
-    assert(sect->end()   >= tcheck, "sanity");
-    assert(sect->end()   <= tend,   "sanity");
+    for (int m = (int) SECT_FIRST; m < (int) SECT_LIMIT; m++) {
+      CodeSection* other = code_section(m);
+      if (!other->is_allocated() || other == sect)  continue;
+      assert(!other->contains(sect->start()    ), "sanity");
+      // limit is an exclusive address and can be the start of another
+      // section.
+      assert(!other->contains(sect->limit() - 1), "sanity");
+    }
+    assert(sect->end() <= tend, "sanity");
   }
   return true;
 }
--- a/src/share/vm/asm/codeBuffer.hpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/asm/codeBuffer.hpp	Wed Sep 01 00:40:05 2010 -0700
@@ -289,10 +289,12 @@
  public:
   typedef int csize_t;  // code size type; would be size_t except for history
   enum {
-    // Here is the list of all possible sections, in order of ascending address.
+    // Here is the list of all possible sections.  The order reflects
+    // the final layout.
+    SECT_FIRST = 0,
+    SECT_CONSTS = SECT_FIRST, // Non-instruction data:  Floats, jump tables, etc.
     SECT_INSTS,               // Executable instructions.
     SECT_STUBS,               // Outbound trampolines for supporting call sites.
-    SECT_CONSTS,              // Non-instruction data:  Floats, jump tables, etc.
     SECT_LIMIT, SECT_NONE = -1
   };
 
@@ -304,9 +306,9 @@
 
   const char*  _name;
 
+  CodeSection  _consts;             // constants, jump tables
   CodeSection  _insts;              // instructions (the main section)
   CodeSection  _stubs;              // stubs (call site support), deopt, exception handling
-  CodeSection  _consts;             // constants, jump tables
 
   CodeBuffer*  _before_expand;  // dead buffer, from before the last expansion
 
@@ -334,9 +336,9 @@
   }
 
   void initialize(address code_start, csize_t code_size) {
+    _consts.initialize_outer(this,  SECT_CONSTS);
     _insts.initialize_outer(this,   SECT_INSTS);
     _stubs.initialize_outer(this,   SECT_STUBS);
-    _consts.initialize_outer(this,  SECT_CONSTS);
     _total_start = code_start;
     _total_size  = code_size;
     // Initialize the main section:
@@ -414,16 +416,16 @@
   // construction.
   void initialize(csize_t code_size, csize_t locs_size);
 
+  CodeSection* consts()            { return &_consts; }
   CodeSection* insts()             { return &_insts; }
   CodeSection* stubs()             { return &_stubs; }
-  CodeSection* consts()            { return &_consts; }
 
-  // present sections in order; return NULL at end; insts is #0, etc.
+  // present sections in order; return NULL at end; consts is #0, etc.
   CodeSection* code_section(int n) {
-    // This makes the slightly questionable but portable assumption that
-    // the various members (_insts, _stubs, etc.) are adjacent in the
-    // layout of CodeBuffer.
-    CodeSection* cs = &_insts + n;
+    // This makes the slightly questionable but portable assumption
+    // that the various members (_consts, _insts, _stubs, etc.) are
+    // adjacent in the layout of CodeBuffer.
+    CodeSection* cs = &_consts + n;
     assert(cs->index() == n || !cs->is_allocated(), "sanity");
     return cs;
   }
@@ -484,9 +486,9 @@
   // CodeBlob).
   csize_t total_content_size() const;
 
-  // combined offset (relative to start of insts) of given address,
-  // as eventually found in the final CodeBlob
-  csize_t total_offset_of(address addr) const;
+  // Combined offset (relative to start of first section) of given
+  // section, as eventually found in the final CodeBlob.
+  csize_t total_offset_of(CodeSection* cs) const;
 
   // allocated size of all relocation data, including index, rounded up
   csize_t total_relocation_size() const;
--- a/src/share/vm/code/codeBlob.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/code/codeBlob.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -92,7 +92,7 @@
   _header_size           = header_size;
   _relocation_size       = round_to(cb->total_relocation_size(), oopSize);
   _content_offset        = align_code_offset(header_size + _relocation_size);
-  _code_offset           = _content_offset + cb->total_offset_of(cb->insts()->start());
+  _code_offset           = _content_offset + cb->total_offset_of(cb->insts());
   _data_offset           = _content_offset + round_to(cb->total_content_size(), oopSize);
   assert(_data_offset <= size, "codeBlob is too small");
 
--- a/src/share/vm/code/nmethod.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/code/nmethod.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -87,9 +87,9 @@
   int nmethod_count;
   int total_size;
   int relocation_size;
+  int consts_size;
   int insts_size;
   int stub_size;
-  int consts_size;
   int scopes_data_size;
   int scopes_pcs_size;
   int dependencies_size;
@@ -101,9 +101,9 @@
     nmethod_count += 1;
     total_size          += nm->size();
     relocation_size     += nm->relocation_size();
+    consts_size         += nm->consts_size();
     insts_size          += nm->insts_size();
     stub_size           += nm->stub_size();
-    consts_size         += nm->consts_size();
     oops_size           += nm->oops_size();
     scopes_data_size    += nm->scopes_data_size();
     scopes_pcs_size     += nm->scopes_pcs_size();
@@ -116,9 +116,9 @@
     tty->print_cr("Statistics for %d bytecoded nmethods:", nmethod_count);
     if (total_size != 0)          tty->print_cr(" total in heap  = %d", total_size);
     if (relocation_size != 0)     tty->print_cr(" relocation     = %d", relocation_size);
+    if (consts_size != 0)         tty->print_cr(" constants      = %d", consts_size);
     if (insts_size != 0)          tty->print_cr(" main code      = %d", insts_size);
     if (stub_size != 0)           tty->print_cr(" stub code      = %d", stub_size);
-    if (consts_size != 0)         tty->print_cr(" constants      = %d", consts_size);
     if (oops_size != 0)           tty->print_cr(" oops           = %d", oops_size);
     if (scopes_data_size != 0)    tty->print_cr(" scopes data    = %d", scopes_data_size);
     if (scopes_pcs_size != 0)     tty->print_cr(" scopes pcs     = %d", scopes_pcs_size);
@@ -404,9 +404,9 @@
 
 int nmethod::total_size() const {
   return
+    consts_size()        +
     insts_size()         +
     stub_size()          +
-    consts_size()        +
     scopes_data_size()   +
     scopes_pcs_size()    +
     handler_table_size() +
@@ -789,13 +789,17 @@
     _orig_pc_offset          = orig_pc_offset;
 
     // Section offsets
-    _consts_offset           = content_offset()      + code_buffer->total_offset_of(code_buffer->consts()->start());
-    _stub_offset             = content_offset()      + code_buffer->total_offset_of(code_buffer->stubs()->start());
+    _consts_offset           = content_offset()      + code_buffer->total_offset_of(code_buffer->consts());
+    _stub_offset             = content_offset()      + code_buffer->total_offset_of(code_buffer->stubs());
 
     // Exception handler and deopt handler are in the stub section
     _exception_offset        = _stub_offset          + offsets->value(CodeOffsets::Exceptions);
     _deoptimize_offset       = _stub_offset          + offsets->value(CodeOffsets::Deopt);
-    _deoptimize_mh_offset    = _stub_offset          + offsets->value(CodeOffsets::DeoptMH);
+    if (has_method_handle_invokes()) {
+      _deoptimize_mh_offset  = _stub_offset          + offsets->value(CodeOffsets::DeoptMH);
+    } else {
+      _deoptimize_mh_offset  = -1;
+    }
     if (offsets->value(CodeOffsets::UnwindHandler) != -1) {
       _unwind_handler_offset = code_offset()         + offsets->value(CodeOffsets::UnwindHandler);
     } else {
@@ -885,9 +889,9 @@
     xtty->print(" address='" INTPTR_FORMAT "'", (intptr_t) this);
 
     LOG_OFFSET(xtty, relocation);
+    LOG_OFFSET(xtty, consts);
     LOG_OFFSET(xtty, insts);
     LOG_OFFSET(xtty, stub);
-    LOG_OFFSET(xtty, consts);
     LOG_OFFSET(xtty, scopes_data);
     LOG_OFFSET(xtty, scopes_pcs);
     LOG_OFFSET(xtty, dependencies);
@@ -2336,6 +2340,10 @@
                                               relocation_begin(),
                                               relocation_end(),
                                               relocation_size());
+  if (consts_size       () > 0) tty->print_cr(" constants      [" INTPTR_FORMAT "," INTPTR_FORMAT "] = %d",
+                                              consts_begin(),
+                                              consts_end(),
+                                              consts_size());
   if (insts_size        () > 0) tty->print_cr(" main code      [" INTPTR_FORMAT "," INTPTR_FORMAT "] = %d",
                                               insts_begin(),
                                               insts_end(),
@@ -2344,10 +2352,6 @@
                                               stub_begin(),
                                               stub_end(),
                                               stub_size());
-  if (consts_size       () > 0) tty->print_cr(" constants      [" INTPTR_FORMAT "," INTPTR_FORMAT "] = %d",
-                                              consts_begin(),
-                                              consts_end(),
-                                              consts_size());
   if (oops_size         () > 0) tty->print_cr(" oops           [" INTPTR_FORMAT "," INTPTR_FORMAT "] = %d",
                                               oops_begin(),
                                               oops_end(),
@@ -2372,10 +2376,6 @@
                                               nul_chk_table_begin(),
                                               nul_chk_table_end(),
                                               nul_chk_table_size());
-  if (oops_size         () > 0) tty->print_cr(" oops           [" INTPTR_FORMAT "," INTPTR_FORMAT "] = %d",
-                                              oops_begin(),
-                                              oops_end(),
-                                              oops_size());
 }
 
 void nmethod::print_code() {
--- a/src/share/vm/code/nmethod.hpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/code/nmethod.hpp	Wed Sep 01 00:40:05 2010 -0700
@@ -143,8 +143,8 @@
 #ifdef HAVE_DTRACE_H
   int _trap_offset;
 #endif // def HAVE_DTRACE_H
+  int _consts_offset;
   int _stub_offset;
-  int _consts_offset;
   int _oops_offset;                       // offset to where embedded oop table begins (inside data)
   int _scopes_data_offset;
   int _scopes_pcs_offset;
@@ -336,16 +336,16 @@
   bool is_compiled_by_shark() const;
 
   // boundaries for different parts
-  address insts_begin           () const          { return code_begin(); }
+  address consts_begin          () const          { return           header_begin() + _consts_offset        ; }
+  address consts_end            () const          { return           header_begin() +  code_offset()        ; }
+  address insts_begin           () const          { return           header_begin() +  code_offset()        ; }
   address insts_end             () const          { return           header_begin() + _stub_offset          ; }
+  address stub_begin            () const          { return           header_begin() + _stub_offset          ; }
+  address stub_end              () const          { return           header_begin() + _oops_offset          ; }
   address exception_begin       () const          { return           header_begin() + _exception_offset     ; }
   address deopt_handler_begin   () const          { return           header_begin() + _deoptimize_offset    ; }
   address deopt_mh_handler_begin() const          { return           header_begin() + _deoptimize_mh_offset ; }
   address unwind_handler_begin  () const          { return _unwind_handler_offset != -1 ? (header_begin() + _unwind_handler_offset) : NULL; }
-  address stub_begin            () const          { return           header_begin() + _stub_offset          ; }
-  address stub_end              () const          { return           header_begin() + _consts_offset        ; }
-  address consts_begin          () const          { return           header_begin() + _consts_offset        ; }
-  address consts_end            () const          { return           header_begin() + _oops_offset          ; }
   oop*    oops_begin            () const          { return (oop*)   (header_begin() + _oops_offset)         ; }
   oop*    oops_end              () const          { return (oop*)   (header_begin() + _scopes_data_offset)  ; }
 
@@ -361,9 +361,9 @@
   address nul_chk_table_end     () const          { return           header_begin() + _nmethod_end_offset   ; }
 
   // Sizes
+  int consts_size       () const                  { return            consts_end       () -            consts_begin       (); }
   int insts_size        () const                  { return            insts_end        () -            insts_begin        (); }
   int stub_size         () const                  { return            stub_end         () -            stub_begin         (); }
-  int consts_size       () const                  { return            consts_end       () -            consts_begin       (); }
   int oops_size         () const                  { return (address)  oops_end         () - (address)  oops_begin         (); }
   int scopes_data_size  () const                  { return            scopes_data_end  () -            scopes_data_begin  (); }
   int scopes_pcs_size   () const                  { return (intptr_t) scopes_pcs_end   () - (intptr_t) scopes_pcs_begin   (); }
@@ -374,9 +374,9 @@
   int total_size        () const;
 
   // Containment
+  bool consts_contains       (address addr) const { return consts_begin       () <= addr && addr < consts_end       (); }
   bool insts_contains        (address addr) const { return insts_begin        () <= addr && addr < insts_end        (); }
   bool stub_contains         (address addr) const { return stub_begin         () <= addr && addr < stub_end         (); }
-  bool consts_contains       (address addr) const { return consts_begin       () <= addr && addr < consts_end       (); }
   bool oops_contains         (oop*    addr) const { return oops_begin         () <= addr && addr < oops_end         (); }
   bool scopes_data_contains  (address addr) const { return scopes_data_begin  () <= addr && addr < scopes_data_end  (); }
   bool scopes_pcs_contains   (PcDesc* addr) const { return scopes_pcs_begin   () <= addr && addr < scopes_pcs_end   (); }
--- a/src/share/vm/code/relocInfo.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/code/relocInfo.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -128,7 +128,16 @@
   _code    = nm;
   _current = nm->relocation_begin() - 1;
   _end     = nm->relocation_end();
-  _addr    = (address) nm->code_begin();
+  _addr    = nm->content_begin();
+
+  // Initialize code sections.
+  _section_start[CodeBuffer::SECT_CONSTS] = nm->consts_begin();
+  _section_start[CodeBuffer::SECT_INSTS ] = nm->insts_begin() ;
+  _section_start[CodeBuffer::SECT_STUBS ] = nm->stub_begin()  ;
+
+  _section_end  [CodeBuffer::SECT_CONSTS] = nm->consts_end()  ;
+  _section_end  [CodeBuffer::SECT_INSTS ] = nm->insts_end()   ;
+  _section_end  [CodeBuffer::SECT_STUBS ] = nm->stub_end()    ;
 
   assert(!has_current(), "just checking");
   assert(begin == NULL || begin >= nm->code_begin(), "in bounds");
@@ -146,9 +155,11 @@
   _code    = NULL; // Not cb->blob();
 
   CodeBuffer* cb = cs->outer();
-  assert((int)SECT_LIMIT == CodeBuffer::SECT_LIMIT, "my copy must be equal");
-  for (int n = 0; n < (int)SECT_LIMIT; n++) {
-    _section_start[n] = cb->code_section(n)->start();
+  assert((int) SECT_LIMIT == CodeBuffer::SECT_LIMIT, "my copy must be equal");
+  for (int n = (int) CodeBuffer::SECT_FIRST; n < (int) CodeBuffer::SECT_LIMIT; n++) {
+    CodeSection* cs = cb->code_section(n);
+    _section_start[n] = cs->start();
+    _section_end  [n] = cs->end();
   }
 
   assert(!has_current(), "just checking");
@@ -166,6 +177,12 @@
 };
 
 
+bool RelocIterator::addr_in_const() const {
+  const int n = CodeBuffer::SECT_CONSTS;
+  return section_start(n) <= addr() && addr() < section_end(n);
+}
+
+
 static inline int num_cards(int code_size) {
   return (code_size-1) / indexCardSize;
 }
@@ -360,31 +377,12 @@
 }
 
 
-address RelocIterator::compute_section_start(int n) const {
-// This routine not only computes a section start, but also
-// memoizes it for later.
-#define CACHE ((RelocIterator*)this)->_section_start[n]
-  CodeBlob* cb = code();
-  guarantee(cb != NULL, "must have a code blob");
-  if (n == CodeBuffer::SECT_INSTS)
-    return CACHE = cb->code_begin();
-  assert(cb->is_nmethod(), "only nmethods have these sections");
-  nmethod* nm = (nmethod*) cb;
-  address res = NULL;
-  switch (n) {
-  case CodeBuffer::SECT_STUBS:
-    res = nm->stub_begin();
-    break;
-  case CodeBuffer::SECT_CONSTS:
-    res = nm->consts_begin();
-    break;
-  default:
-    ShouldNotReachHere();
+void RelocIterator::initialize_misc() {
+  set_has_current(false);
+  for (int i = (int) CodeBuffer::SECT_FIRST; i < (int) CodeBuffer::SECT_LIMIT; i++) {
+    _section_start[i] = NULL;  // filled in by initialize(); accessors assert non-NULL
+    _section_end  [i] = NULL;
   }
-  assert(nm->contains(res) || res == nm->code_end(), "tame pointer");
-  CACHE = res;
-  return res;
-#undef CACHE
 }
 
 
--- a/src/share/vm/code/relocInfo.hpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/code/relocInfo.hpp	Wed Sep 01 00:40:05 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2008, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -502,8 +502,7 @@
 //   }
 
 class RelocIterator : public StackObj {
-  enum { SECT_CONSTS = 2,
-         SECT_LIMIT = 3 };  // must be equal to CodeBuffer::SECT_LIMIT
+  enum { SECT_LIMIT = 3 };  // must be equal to CodeBuffer::SECT_LIMIT, checked in ctor
   friend class Relocation;
   friend class relocInfo;       // for change_reloc_info_for_address only
   typedef relocInfo::relocType relocType;
@@ -521,6 +520,7 @@
 
   // Base addresses needed to compute targets of section_word_type relocs.
   address    _section_start[SECT_LIMIT];
+  address    _section_end  [SECT_LIMIT];
 
   void set_has_current(bool b) {
     _datalen = !b ? -1 : 0;
@@ -540,14 +540,7 @@
 
   void advance_over_prefix();    // helper method
 
-  void initialize_misc() {
-    set_has_current(false);
-    for (int i = 0; i < SECT_LIMIT; i++) {
-      _section_start[i] = NULL;  // these will be lazily computed, if needed
-    }
-  }
-
-  address compute_section_start(int n) const;  // out-of-line helper
+  void initialize_misc();
 
   void initialize(nmethod* nm, address begin, address limit);
 
@@ -598,11 +591,15 @@
   bool     has_current()      const { return _datalen >= 0; }
 
   void       set_addr(address addr) { _addr = addr; }
-  bool   addr_in_const()      const { return addr() >= section_start(SECT_CONSTS); }
+  bool   addr_in_const()      const;
 
   address section_start(int n) const {
-    address res = _section_start[n];
-    return (res != NULL) ? res : compute_section_start(n);
+    assert(_section_start[n], "must be initialized");
+    return _section_start[n];
+  }
+  address section_end(int n) const {
+    assert(_section_end[n], "must be initialized");
+    return _section_end[n];
   }
 
   // The address points to the affected displacement part of the instruction.
--- a/src/share/vm/includeDB_compiler2	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/includeDB_compiler2	Wed Sep 01 00:40:05 2010 -0700
@@ -625,6 +625,7 @@
 loopTransform.cpp                       loopnode.hpp
 loopTransform.cpp                       mulnode.hpp
 loopTransform.cpp                       rootnode.hpp
+loopTransform.cpp                       runtime.hpp
 loopTransform.cpp                       subnode.hpp
 
 loopUnswitch.cpp                        allocation.inline.hpp
--- a/src/share/vm/opto/addnode.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/opto/addnode.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -705,6 +705,9 @@
     }
     addr = addr->in(AddPNode::Address);
   }
+  if (addr != base) {
+    return -1;
+  }
   return count;
 }
 
--- a/src/share/vm/opto/c2_globals.hpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/opto/c2_globals.hpp	Wed Sep 01 00:40:05 2010 -0700
@@ -157,6 +157,12 @@
   develop(bool, TraceLoopPredicate, false,                                  \
           "Trace generation of loop predicates")                            \
                                                                             \
+  product(bool, OptimizeFill, false,                                        \
+          "convert fill/copy loops into intrinsic")                         \
+                                                                            \
+  develop(bool, TraceOptimizeFill, false,                                   \
+          "print detailed information about fill conversion")               \
+                                                                            \
   develop(bool, OptoCoalesce, true,                                         \
           "Use Conservative Copy Coalescing in the Register Allocator")     \
                                                                             \
--- a/src/share/vm/opto/loopTransform.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/opto/loopTransform.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -2049,11 +2049,18 @@
   if (cmp->Opcode() != Op_CmpU ) {
     return false;
   }
-  if (cmp->in(2)->Opcode() != Op_LoadRange) {
-    return false;
+  Node* range = cmp->in(2);
+  if (range->Opcode() != Op_LoadRange) {
+    const TypeInt* tint = phase->_igvn.type(range)->isa_int();
+    if (!OptimizeFill || tint == NULL || tint->empty() || tint->_lo < 0) {
+      // Allow predication on positive values that aren't LoadRanges.
+      // This allows optimization of loops where the length of the
+      // array is a known value and doesn't need to be loaded back
+      // from the array.
+      return false;
+    }
   }
-  LoadRangeNode* lr = (LoadRangeNode*)cmp->in(2);
-  if (!invar.is_invariant(lr)) { // loadRange must be invariant
+  if (!invar.is_invariant(range)) {
     return false;
   }
   Node *iv     = _head->as_CountedLoop()->phi();
@@ -2248,9 +2255,9 @@
       const Node*    cmp    = bol->in(1)->as_Cmp();
       Node*          idx    = cmp->in(1);
       assert(!invar.is_invariant(idx), "index is variant");
-      assert(cmp->in(2)->Opcode() == Op_LoadRange, "must be");
-      Node* ld_rng = cmp->in(2); // LoadRangeNode
-      assert(invar.is_invariant(ld_rng), "load range must be invariant");
+      assert(cmp->in(2)->Opcode() == Op_LoadRange || OptimizeFill, "must be");
+      Node* rng = cmp->in(2);
+      assert(invar.is_invariant(rng), "range must be invariant");
       int scale    = 1;
       Node* offset = zero;
       bool ok = is_scaled_iv_plus_offset(idx, cl->phi(), &scale, &offset);
@@ -2271,21 +2278,21 @@
 
       // Perform cloning to keep Invariance state correct since the
       // late schedule will place invariant things in the loop.
-      ld_rng = invar.clone(ld_rng, ctrl);
+      rng = invar.clone(rng, ctrl);
       if (offset && offset != zero) {
         assert(invar.is_invariant(offset), "offset must be loop invariant");
         offset = invar.clone(offset, ctrl);
       }
 
       // Test the lower bound
-      Node*  lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, false);
+      Node*  lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, false);
       IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If();
       _igvn.hash_delete(lower_bound_iff);
       lower_bound_iff->set_req(1, lower_bound_bol);
       if (TraceLoopPredicate) tty->print_cr("lower bound check if: %d", lower_bound_iff->_idx);
 
       // Test the upper bound
-      Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, true);
+      Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, true);
       IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If();
       _igvn.hash_delete(upper_bound_iff);
       upper_bound_iff->set_req(1, upper_bound_bol);
@@ -2366,3 +2373,348 @@
 
   return hoisted;
 }
+
+
+// Process all the loops in the loop tree and replace any fill
+// patterns with an intrinsic version.
+bool PhaseIdealLoop::do_intrinsify_fill() {
+  bool changed = false;
+  for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
+    IdealLoopTree* lpt = iter.current();
+    changed |= intrinsify_fill(lpt);
+  }
+  return changed;
+}
+
+
+// Examine an inner loop looking for a single store of an invariant
+// value in a unit stride loop.
+bool PhaseIdealLoop::match_fill_loop(IdealLoopTree* lpt, Node*& store, Node*& store_value,
+                                     Node*& shift, Node*& con) {
+  const char* msg = NULL;
+  Node* msg_node = NULL;
+
+  store_value = NULL;
+  con = NULL;
+  shift = NULL;
+
+  // Process the loop looking for stores.  If there are multiple
+  // stores or extra control flow give up at this point.
+  CountedLoopNode* head = lpt->_head->as_CountedLoop();
+  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
+    Node* n = lpt->_body.at(i);
+    if (n->outcnt() == 0) continue; // Ignore dead
+    if (n->is_Store()) {
+      if (store != NULL) {
+        msg = "multiple stores";
+        break;
+      }
+      int opc = n->Opcode();
+      if (opc == Op_StoreP || opc == Op_StoreN || opc == Op_StoreCM) {
+        msg = "oop fills not handled";
+        break;
+      }
+      Node* value = n->in(MemNode::ValueIn);
+      if (!lpt->is_invariant(value)) {
+        msg  = "variant store value";
+      }
+      store = n;
+      store_value = value;
+    } else if (n->is_If() && n != head->loopexit()) {
+      msg = "extra control flow";
+      msg_node = n;
+    }
+  }
+
+  if (store == NULL) {
+    // No store in loop
+    return false;
+  }
+
+  if (msg == NULL && head->stride_con() != 1) {
+    // could handle negative strides too
+    if (head->stride_con() < 0) {
+      msg = "negative stride";
+    } else {
+      msg = "non-unit stride";
+    }
+  }
+
+  if (msg == NULL && !store->in(MemNode::Address)->is_AddP()) {
+    msg = "can't handle store address";
+    msg_node = store->in(MemNode::Address);
+  }
+
+  // Make sure there is an appropriate fill routine
+  BasicType t = store->as_Mem()->memory_type();
+  const char* fill_name;
+  if (msg == NULL &&
+      StubRoutines::select_fill_function(t, false, fill_name) == NULL) {
+    msg = "unsupported store";
+    msg_node = store;
+  }
+
+  if (msg != NULL) {
+#ifndef PRODUCT
+    if (TraceOptimizeFill) {
+      tty->print_cr("not fill intrinsic candidate: %s", msg);
+      if (msg_node != NULL) msg_node->dump();
+    }
+#endif
+    return false;
+  }
+
+  // Make sure the address expression can be handled.  It should be
+  // head->phi * elsize + con.  head->phi might have a ConvI2L.
+  Node* elements[4];
+  Node* conv = NULL;
+  int count = store->in(MemNode::Address)->as_AddP()->unpack_offsets(elements, ARRAY_SIZE(elements));
+  for (int e = 0; e < count; e++) {
+    Node* n = elements[e];
+    if (n->is_Con() && con == NULL) {
+      con = n;
+    } else if (n->Opcode() == Op_LShiftX && shift == NULL) {
+      Node* value = n->in(1);
+#ifdef _LP64
+      if (value->Opcode() == Op_ConvI2L) {
+        conv = value;
+        value = value->in(1);
+      }
+#endif
+      if (value != head->phi()) {
+        msg = "unhandled shift in address";
+      } else {
+        shift = n;
+        assert(type2aelembytes(store->as_Mem()->memory_type(), true) == 1 << shift->in(2)->get_int(), "scale should match");
+      }
+    } else if (n->Opcode() == Op_ConvI2L && conv == NULL) {
+      if (n->in(1) == head->phi()) {
+        conv = n;
+      } else {
+        msg = "unhandled input to ConvI2L";
+      }
+    } else if (n == head->phi()) {
+      // no shift, check below for allowed cases
+    } else {
+      msg = "unhandled node in address";
+      msg_node = n;
+    }
+  }
+
+  if (count == -1) {
+    msg = "malformed address expression";
+    msg_node = store;
+  }
+
+  // byte sized items won't have a shift
+  if (msg == NULL && shift == NULL && t != T_BYTE && t != T_BOOLEAN) {
+    msg = "can't find shift";
+    msg_node = store;
+  }
+
+  if (msg != NULL) {
+#ifndef PRODUCT
+    if (TraceOptimizeFill) {
+      tty->print_cr("not fill intrinsic: %s", msg);
+      if (msg_node != NULL) msg_node->dump();
+    }
+#endif
+    return false;
+  }
+
+  // Now make sure all the other nodes in the loop can be handled
+  VectorSet ok(Thread::current()->resource_area());
+
+  // store related values are ok
+  ok.set(store->_idx);
+  ok.set(store->in(MemNode::Memory)->_idx);
+
+  // Loop structure is ok
+  ok.set(head->_idx);
+  ok.set(head->loopexit()->_idx);
+  ok.set(head->phi()->_idx);
+  ok.set(head->incr()->_idx);
+  ok.set(head->loopexit()->cmp_node()->_idx);
+  ok.set(head->loopexit()->in(1)->_idx);
+
+  // Address elements are ok
+  if (con)   ok.set(con->_idx);
+  if (shift) ok.set(shift->_idx);
+  if (conv)  ok.set(conv->_idx);
+
+  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
+    Node* n = lpt->_body.at(i);
+    if (n->outcnt() == 0) continue; // Ignore dead
+    if (ok.test(n->_idx)) continue;
+    // Backedge projection is ok
+    if (n->is_IfTrue() && n->in(0) == head->loopexit()) continue;
+    if (!n->is_AddP()) {
+      msg = "unhandled node";
+      msg_node = n;
+      break;
+    }
+  }
+
+  // Make sure no unexpected values are used outside the loop
+  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
+    Node* n = lpt->_body.at(i);
+    // These values can be replaced with other nodes if they are used
+    // outside the loop.
+    if (n == store || n == head->loopexit() || n == head->incr()) continue;
+    for (SimpleDUIterator iter(n); iter.has_next(); iter.next()) {
+      Node* use = iter.get();
+      if (!lpt->_body.contains(use)) {
+        msg = "node is used outside loop";
+        // lpt->_body.dump();
+        msg_node = n;
+        break;
+      }
+    }
+  }
+
+#ifdef ASSERT
+  if (TraceOptimizeFill) {
+    if (msg != NULL) {
+      tty->print_cr("no fill intrinsic: %s", msg);
+      if (msg_node != NULL) msg_node->dump();
+    } else {
+      tty->print_cr("fill intrinsic for:");
+    }
+    store->dump();
+    if (Verbose) {
+      lpt->_body.dump();
+    }
+  }
+#endif
+
+  return msg == NULL;
+}
+
+
+
+bool PhaseIdealLoop::intrinsify_fill(IdealLoopTree* lpt) {
+  // Only for counted inner loops
+  if (!lpt->is_counted() || !lpt->is_inner()) {
+    return false;
+  }
+
+  // Must have constant stride
+  CountedLoopNode* head = lpt->_head->as_CountedLoop();
+  if (!head->stride_is_con() || !head->is_normal_loop()) {
+    return false;
+  }
+
+  // Check that the body only contains a store of a loop invariant
+  // value that is indexed by the loop phi.
+  Node* store = NULL;
+  Node* store_value = NULL;
+  Node* shift = NULL;
+  Node* offset = NULL;
+  if (!match_fill_loop(lpt, store, store_value, shift, offset)) {
+    return false;
+  }
+
+  // Now replace the whole loop body by a call to a fill routine that
+  // covers the same region as the loop.
+  Node* base = store->in(MemNode::Address)->as_AddP()->in(AddPNode::Base);
+
+  // Build an expression for the beginning of the fill region
+  Node* index = head->init_trip();
+#ifdef _LP64
+  index = new (C, 2) ConvI2LNode(index);
+  _igvn.register_new_node_with_optimizer(index);
+#endif
+  if (shift != NULL) {
+    // byte arrays don't require a shift but others do.
+    index = new (C, 3) LShiftXNode(index, shift->in(2));
+    _igvn.register_new_node_with_optimizer(index);
+  }
+  index = new (C, 4) AddPNode(base, base, index);
+  _igvn.register_new_node_with_optimizer(index);
+  Node* from = new (C, 4) AddPNode(base, index, offset);
+  _igvn.register_new_node_with_optimizer(from);
+  // Compute the number of elements to fill
+  Node* len = new (C, 3) SubINode(head->limit(), head->init_trip());
+  _igvn.register_new_node_with_optimizer(len);
+
+  BasicType t = store->as_Mem()->memory_type();
+  bool aligned = false;
+  if (offset != NULL && head->init_trip()->is_Con()) {
+    int element_size = type2aelembytes(t);
+    aligned = (offset->find_intptr_t_type()->get_con() + head->init_trip()->get_int() * element_size) % HeapWordSize == 0;
+  }
+
+  // Build a call to the fill routine
+  const char* fill_name;
+  address fill = StubRoutines::select_fill_function(t, aligned, fill_name);
+  assert(fill != NULL, "what?");
+
+  // Convert float/double to int/long for fill routines
+  if (t == T_FLOAT) {
+    store_value = new (C, 2) MoveF2INode(store_value);
+    _igvn.register_new_node_with_optimizer(store_value);
+  } else if (t == T_DOUBLE) {
+    store_value = new (C, 2) MoveD2LNode(store_value);
+    _igvn.register_new_node_with_optimizer(store_value);
+  }
+
+  Node* mem_phi = store->in(MemNode::Memory);
+  Node* result_ctrl;
+  Node* result_mem;
+  const TypeFunc* call_type = OptoRuntime::array_fill_Type();
+  int size = call_type->domain()->cnt();
+  CallLeafNode *call = new (C, size) CallLeafNoFPNode(call_type, fill,
+                                                      fill_name, TypeAryPtr::get_array_body_type(t));
+  call->init_req(TypeFunc::Parms+0, from);
+  call->init_req(TypeFunc::Parms+1, store_value);
+  call->init_req(TypeFunc::Parms+2, len);
+  call->init_req( TypeFunc::Control, head->init_control());
+  call->init_req( TypeFunc::I_O    , C->top() )        ;   // does no i/o
+  call->init_req( TypeFunc::Memory ,  mem_phi->in(LoopNode::EntryControl) );
+  call->init_req( TypeFunc::ReturnAdr, C->start()->proj_out(TypeFunc::ReturnAdr) );
+  call->init_req( TypeFunc::FramePtr, C->start()->proj_out(TypeFunc::FramePtr) );
+  _igvn.register_new_node_with_optimizer(call);
+  result_ctrl = new (C, 1) ProjNode(call,TypeFunc::Control);
+  _igvn.register_new_node_with_optimizer(result_ctrl);
+  result_mem = new (C, 1) ProjNode(call,TypeFunc::Memory);
+  _igvn.register_new_node_with_optimizer(result_mem);
+
+  // If this fill is tightly coupled to an allocation and overwrites
+  // the whole body, allow it to take over the zeroing.
+  AllocateNode* alloc = AllocateNode::Ideal_allocation(base, this);
+  if (alloc != NULL && alloc->is_AllocateArray()) {
+    Node* length = alloc->as_AllocateArray()->Ideal_length();
+    if (head->limit() == length &&
+        head->init_trip() == _igvn.intcon(0)) {
+      if (TraceOptimizeFill) {
+        tty->print_cr("Eliminated zeroing in allocation");
+      }
+      alloc->maybe_set_complete(&_igvn);
+    } else {
+#ifdef ASSERT
+      if (TraceOptimizeFill) {
+        tty->print_cr("filling array but bounds don't match");
+        alloc->dump();
+        head->init_trip()->dump();
+        head->limit()->dump();
+        length->dump();
+      }
+#endif
+    }
+  }
+
+  // Redirect the old control and memory edges that are outside the loop.
+  Node* exit = head->loopexit()->proj_out(0);
+  _igvn.replace_node(exit, result_ctrl);
+  _igvn.replace_node(store, result_mem);
+  // Any uses of the increment outside of the loop become the loop limit.
+  _igvn.replace_node(head->incr(), head->limit());
+
+  // Disconnect the head from the loop.
+  for (uint i = 0; i < lpt->_body.size(); i++) {
+    Node* n = lpt->_body.at(i);
+    _igvn.replace_node(n, C->top());
+  }
+
+  return true;
+}
--- a/src/share/vm/opto/loopnode.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/opto/loopnode.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -1673,6 +1673,12 @@
     _ltree_root->_child->loop_predication(this);
   }
 
+  if (OptimizeFill && UseLoopPredicate && C->has_loops() && !C->major_progress()) {
+    if (do_intrinsify_fill()) {
+      C->set_major_progress();
+    }
+  }
+
   // Perform iteration-splitting on inner loops.  Split iterations to avoid
   // range checks or one-shot null checks.
 
--- a/src/share/vm/opto/loopnode.hpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/opto/loopnode.hpp	Wed Sep 01 00:40:05 2010 -0700
@@ -937,6 +937,12 @@
   // same block.  Split thru the Region.
   void do_split_if( Node *iff );
 
+  // Conversion of fill/copy patterns into intrinsic versions
+  bool do_intrinsify_fill();
+  bool intrinsify_fill(IdealLoopTree* lpt);
+  bool match_fill_loop(IdealLoopTree* lpt, Node*& store, Node*& store_value,
+                       Node*& shift, Node*& offset);
+
 private:
   // Return a type based on condition control flow
   const TypeInt* filtered_type( Node *n, Node* n_ctrl);
--- a/src/share/vm/opto/memnode.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/opto/memnode.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -1547,8 +1547,8 @@
         adr->is_AddP() && off != Type::OffsetBot) {
       // For constant Strings treat the fields as compile time constants.
       Node* base = adr->in(AddPNode::Base);
-      if (base->Opcode() == Op_ConP) {
-        const TypeOopPtr* t = phase->type(base)->isa_oopptr();
+      const TypeOopPtr* t = phase->type(base)->isa_oopptr();
+      if (t != NULL && t->singleton()) {
         ciObject* string = t->const_oop();
         ciConstant constant = string->as_instance()->field_value_by_offset(off);
         if (constant.basic_type() == T_INT) {
--- a/src/share/vm/opto/runtime.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/opto/runtime.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -645,6 +645,22 @@
 }
 
 
+const TypeFunc* OptoRuntime::array_fill_Type() {
+  // create input type (domain)
+  const Type** fields = TypeTuple::fields(3);
+  fields[TypeFunc::Parms+0] = TypePtr::NOTNULL;
+  fields[TypeFunc::Parms+1] = TypeInt::INT;
+  fields[TypeFunc::Parms+2] = TypeInt::INT;
+  const TypeTuple *domain = TypeTuple::make(TypeFunc::Parms + 3, fields);
+
+  // create result type
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms+0] = NULL; // void
+  const TypeTuple *range = TypeTuple::make(TypeFunc::Parms, fields);
+
+  return TypeFunc::make(domain, range);
+}
+
 //------------- Interpreter state access for on stack replacement
 const TypeFunc* OptoRuntime::osr_end_Type() {
   // create input type (domain)
--- a/src/share/vm/opto/runtime.hpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/opto/runtime.hpp	Wed Sep 01 00:40:05 2010 -0700
@@ -260,6 +260,8 @@
   static const TypeFunc* generic_arraycopy_Type();
   static const TypeFunc* slow_arraycopy_Type();   // the full routine
 
+  static const TypeFunc* array_fill_Type();
+
   // leaf on stack replacement interpreter accessor types
   static const TypeFunc* osr_end_Type();
 
--- a/src/share/vm/opto/type.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/opto/type.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -314,7 +314,7 @@
   mreg2type[Op_RegL] = TypeLong::LONG;
   mreg2type[Op_RegFlags] = TypeInt::CC;
 
-  TypeAryPtr::RANGE   = TypeAryPtr::make( TypePtr::BotPTR, TypeAry::make(Type::BOTTOM,TypeInt::POS), current->env()->Object_klass(), false, arrayOopDesc::length_offset_in_bytes());
+  TypeAryPtr::RANGE   = TypeAryPtr::make( TypePtr::BotPTR, TypeAry::make(Type::BOTTOM,TypeInt::POS), NULL /* current->env()->Object_klass() */, false, arrayOopDesc::length_offset_in_bytes());
 
   TypeAryPtr::NARROWOOPS = TypeAryPtr::make(TypePtr::BotPTR, TypeAry::make(TypeNarrowOop::BOTTOM, TypeInt::POS), NULL /*ciArrayKlass::make(o)*/,  false,  Type::OffsetBot);
 
@@ -3369,7 +3369,7 @@
         tary = TypeAry::make(Type::BOTTOM, tary->_size);
       }
     }
-    bool xk;
+    bool xk = false;
     switch (tap->ptr()) {
     case AnyNull:
     case TopPTR:
@@ -3391,9 +3391,10 @@
         o = tap->const_oop();
         xk = true;
       } else {
-        xk = this->_klass_is_exact;
+        // Only precise for identical arrays
+        xk = this->_klass_is_exact && (klass() == tap->klass());
       }
-      return TypeAryPtr::make( ptr, o, tary, tap->_klass, xk, off, instance_id );
+      return TypeAryPtr::make( ptr, o, tary, lazy_klass, xk, off, instance_id );
     }
     case NotNull:
     case BotPTR:
@@ -3683,12 +3684,10 @@
 }
 
 
-//------------------------------klass------------------------------------------
-// Return the defining klass for this class
-ciKlass* TypeAryPtr::klass() const {
-  if( _klass ) return _klass;   // Return cached value, if possible
-
-  // Oops, need to compute _klass and cache it
+//----------------------compute_klass------------------------------------------
+// Compute the defining klass for this class
+ciKlass* TypeAryPtr::compute_klass(DEBUG_ONLY(bool verify)) const {
+  // Compute _klass based on element type.
   ciKlass* k_ary = NULL;
   const TypeInstPtr *tinst;
   const TypeAryPtr *tary;
@@ -3715,11 +3714,39 @@
   } else {
     // Cannot compute array klass directly from basic type,
     // since subtypes of TypeInt all have basic type T_INT.
+#ifdef ASSERT
+    if (verify && el->isa_int()) {
+      // Check simple cases when verifying klass.
+      BasicType bt = T_ILLEGAL;
+      if (el == TypeInt::BYTE) {
+        bt = T_BYTE;
+      } else if (el == TypeInt::SHORT) {
+        bt = T_SHORT;
+      } else if (el == TypeInt::CHAR) {
+        bt = T_CHAR;
+      } else if (el == TypeInt::INT) {
+        bt = T_INT;
+      } else {
+        return _klass; // just return specified klass
+      }
+      return ciTypeArrayKlass::make(bt);
+    }
+#endif
     assert(!el->isa_int(),
            "integral arrays must be pre-equipped with a class");
     // Compute array klass directly from basic type
     k_ary = ciTypeArrayKlass::make(el->basic_type());
   }
+  return k_ary;
+}
+
+//------------------------------klass------------------------------------------
+// Return the defining klass for this class
+ciKlass* TypeAryPtr::klass() const {
+  if( _klass ) return _klass;   // Return cached value, if possible
+
+  // Oops, need to compute _klass and cache it
+  ciKlass* k_ary = compute_klass();
 
   if( this != TypeAryPtr::OOPS ) {
     // The _klass field acts as a cache of the underlying
--- a/src/share/vm/opto/type.hpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/opto/type.hpp	Wed Sep 01 00:40:05 2010 -0700
@@ -831,11 +831,30 @@
 //------------------------------TypeAryPtr-------------------------------------
 // Class of Java array pointers
 class TypeAryPtr : public TypeOopPtr {
-  TypeAryPtr( PTR ptr, ciObject* o, const TypeAry *ary, ciKlass* k, bool xk, int offset, int instance_id ) : TypeOopPtr(AryPtr,ptr,k,xk,o,offset, instance_id), _ary(ary) {};
+  // Private constructor: forwards to TypeOopPtr and records the array type in _ary.
+  // In ASSERT builds a non-NULL klass 'k' must be the same klass that
+  // compute_klass(true) returns; otherwise both are printed and the assert fires.
+  TypeAryPtr( PTR ptr, ciObject* o, const TypeAry *ary, ciKlass* k, bool xk, int offset, int instance_id ) : TypeOopPtr(AryPtr,ptr,k,xk,o,offset, instance_id), _ary(ary) {
+#ifdef ASSERT
+    if (k != NULL) {
+      // Verify that specified klass and TypeAryPtr::klass() follow the same rules.
+      ciKlass* ck = compute_klass(true);
+      // Report and fail only when the two klasses actually differ.
+      if (k != ck) {
+        this->dump(); tty->cr();
+        tty->print(" k: ");
+        k->print(); tty->cr();
+        tty->print("ck: ");
+        if (ck != NULL) ck->print();
+        else tty->print("<NULL>");
+        tty->cr();
+        assert(false, "unexpected TypeAryPtr::_klass");
+      }
+    }
+#endif
+  }
   virtual bool eq( const Type *t ) const;
   virtual int hash() const;     // Type specific hashing
   const TypeAry *_ary;          // Array we point into
 
+  ciKlass* compute_klass(DEBUG_ONLY(bool verify = false)) const;
+
 public:
   // Accessors
   ciKlass* klass() const;
--- a/src/share/vm/runtime/arguments.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/runtime/arguments.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -1513,6 +1513,9 @@
   if (AggressiveOpts && FLAG_IS_DEFAULT(OptimizeStringConcat)) {
     FLAG_SET_DEFAULT(OptimizeStringConcat, true);
   }
+  if (AggressiveOpts && FLAG_IS_DEFAULT(OptimizeFill)) {
+    FLAG_SET_DEFAULT(OptimizeFill, true);
+  }
 #endif
 
   if (AggressiveOpts) {
--- a/src/share/vm/runtime/stubRoutines.cpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/runtime/stubRoutines.cpp	Wed Sep 01 00:40:05 2010 -0700
@@ -97,6 +97,15 @@
 address StubRoutines::_unsafe_arraycopy                  = NULL;
 address StubRoutines::_generic_arraycopy                 = NULL;
 
+
+// Entry points of the array fill stubs; NULL means no stub is available
+// (the TEST_FILL self-test skips a type whose entry is NULL).
+address StubRoutines::_jbyte_fill                        = NULL;
+address StubRoutines::_jshort_fill                       = NULL;
+address StubRoutines::_jint_fill                         = NULL;
+address StubRoutines::_arrayof_jbyte_fill                = NULL;
+address StubRoutines::_arrayof_jshort_fill               = NULL;
+address StubRoutines::_arrayof_jint_fill                 = NULL;
+
+
 double (* StubRoutines::_intrinsic_log   )(double) = NULL;
 double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
 double (* StubRoutines::_intrinsic_exp   )(double) = NULL;
@@ -193,6 +202,46 @@
 
 #undef TEST_ARRAYCOPY
 
+// Self-test for the fill stubs of one element type. For start offsets 6..10 in
+// a 96-element buffer it fills 80 elements with v and asserts that exactly the
+// requested range was written. The plain stub is always exercised; the arrayof_
+// stub only when it exists and 'start' is HeapWord aligned.
+// NOTE(review): the 'double d' union member presumably forces 8-byte alignment
+// of 'body' - confirm.
+#define TEST_FILL(type)                                                                      \
+  if (_##type##_fill != NULL) {                                                              \
+    union {                                                                                  \
+      double d;                                                                              \
+      type body[96];                                                                         \
+    } s;                                                                                     \
+                                                                                             \
+    int v = 32;                                                                              \
+    for (int offset = -2; offset <= 2; offset++) {                                           \
+      type* start = s.body + 8 + offset;                                                     \
+      for (int aligned = 0; aligned < 2; aligned++) {                                        \
+        /* Skip the arrayof variant when it is missing or start is misaligned. */            \
+        if (aligned &&                                                                       \
+            (StubRoutines::_arrayof_##type##_fill == NULL ||                                 \
+             ((intptr_t)start) % HeapWordSize != 0)) {                                       \
+          continue;                                                                          \
+        }                                                                                    \
+        /* Refill before every call so each stub is verified on its own. */                  \
+        for (int i = 0; i < 96; i++) {                                                       \
+          s.body[i] = 1;                                                                     \
+        }                                                                                    \
+        if (aligned) {                                                                       \
+          ((void (*)(type*, int, int))StubRoutines::_arrayof_##type##_fill)(start, v, 80);   \
+        } else {                                                                             \
+          ((void (*)(type*, int, int))StubRoutines::_##type##_fill)(start, v, 80);           \
+        }                                                                                    \
+        /* Only [8 + offset, 88 + offset) may be overwritten, and only with v. */            \
+        for (int i = 0; i < 96; i++) {                                                       \
+          if (i < (8 + offset) || i >= (88 + offset)) {                                      \
+            assert(s.body[i] == 1, "what?");                                                 \
+          } else {                                                                           \
+            assert(s.body[i] == v, "what?");                                                 \
+          }                                                                                  \
+        }                                                                                    \
+      }                                                                                      \
+    }                                                                                        \
+  }
+
+  TEST_FILL(jbyte);
+  TEST_FILL(jshort);
+  TEST_FILL(jint);
+
+#undef TEST_FILL
+
 #define TEST_COPYRTN(type) \
   test_arraycopy_func(CAST_FROM_FN_PTR(address, Copy::conjoint_##type##s_atomic),  sizeof(type)); \
   test_arraycopy_func(CAST_FROM_FN_PTR(address, Copy::arrayof_conjoint_##type##s), (int)MAX2(sizeof(HeapWord), sizeof(type)))
@@ -313,3 +362,39 @@
   Copy::arrayof_conjoint_oops(src, dest, count);
   gen_arraycopy_barrier((oop *) dest, count);
 JRT_END
+
+
+// Picks the fill stub for element type 't'.
+//   t       - element type; boolean shares the byte stub, char the short stub,
+//             float the int stub
+//   aligned - selects the arrayof_ variant when true
+//   name    - set to the chosen stub's name; left untouched when NULL is returned
+// Returns the stub's entry point, or NULL for element types without a fill stub.
+address StubRoutines::select_fill_function(BasicType t, bool aligned, const char* &name) {
+  switch (t) {
+  case T_BOOLEAN:
+  case T_BYTE:
+    if (aligned) {
+      name = "arrayof_jbyte_fill";
+      return StubRoutines::arrayof_jbyte_fill();
+    }
+    name = "jbyte_fill";
+    return StubRoutines::jbyte_fill();
+
+  case T_SHORT:
+  case T_CHAR:
+    if (aligned) {
+      name = "arrayof_jshort_fill";
+      return StubRoutines::arrayof_jshort_fill();
+    }
+    name = "jshort_fill";
+    return StubRoutines::jshort_fill();
+
+  case T_FLOAT:
+  case T_INT:
+    if (aligned) {
+      name = "arrayof_jint_fill";
+      return StubRoutines::arrayof_jint_fill();
+    }
+    name = "jint_fill";
+    return StubRoutines::jint_fill();
+
+  // No fill stubs exist for these element types yet.
+  case T_LONG:
+  case T_DOUBLE:
+  case T_OBJECT:
+  case T_ARRAY:
+  case T_NARROWOOP:
+  case T_ADDRESS:
+    return NULL;
+
+  default:
+    ShouldNotReachHere();
+    return NULL;
+  }
+}
--- a/src/share/vm/runtime/stubRoutines.hpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/runtime/stubRoutines.hpp	Wed Sep 01 00:40:05 2010 -0700
@@ -148,6 +148,13 @@
   static address _unsafe_arraycopy;
   static address _generic_arraycopy;
 
+  static address _jbyte_fill;          // fill stub entry points, NULL-checked by the stub self-test before use
+  static address _jshort_fill;
+  static address _jint_fill;
+  static address _arrayof_jbyte_fill;  // variants the self-test calls only for HeapWord-aligned destinations
+  static address _arrayof_jshort_fill;
+  static address _arrayof_jint_fill;
+
   // These are versions of the java.lang.Math methods which perform
   // the same operations as the intrinsic version.  They are used for
   // constant folding in the compiler to ensure equivalence.  If the
@@ -259,6 +266,16 @@
   static address unsafe_arraycopy()        { return _unsafe_arraycopy; }
   static address generic_arraycopy()       { return _generic_arraycopy; }
 
+  static address jbyte_fill()          { return _jbyte_fill; }          // fill stub entries; select_fill_function() picks among these
+  static address jshort_fill()         { return _jshort_fill; }
+  static address jint_fill()           { return _jint_fill; }
+  static address arrayof_jbyte_fill()  { return _arrayof_jbyte_fill; }  // arrayof_ variants are chosen when 'aligned' is true
+  static address arrayof_jshort_fill() { return _arrayof_jshort_fill; }
+  static address arrayof_jint_fill()   { return _arrayof_jint_fill; }
+
+  static address select_fill_function(BasicType t, bool aligned, const char* &name);  // NULL for unsupported types; otherwise also sets 'name'
+
+
   static double  intrinsic_log(double d) {
     assert(_intrinsic_log != NULL, "must be defined");
     return _intrinsic_log(d);
--- a/src/share/vm/utilities/globalDefinitions.hpp	Mon Aug 30 10:58:13 2010 -0700
+++ b/src/share/vm/utilities/globalDefinitions.hpp	Wed Sep 01 00:40:05 2010 -0700
@@ -529,7 +529,7 @@
 #ifdef ASSERT
 extern int type2aelembytes(BasicType t, bool allow_address = false); // asserts
 #else
-inline int type2aelembytes(BasicType t) { return _type2aelembytes[t]; }
+inline int type2aelembytes(BasicType t, bool allow_address = false) { return _type2aelembytes[t]; } // allow_address is unused here; it mirrors the ASSERT declaration's signature
 #endif