changeset 1371:fc3cd2277dc7

Merge
author jrose
date Tue, 13 Apr 2010 13:01:37 -0700
parents e4c77b879561 213fbcf54799
children e16cca0aa5e1 ef74d6d1ac1e
files
diffstat 26 files changed, 282 insertions(+), 103 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -1728,9 +1728,13 @@
       ShouldNotReachHere();
     }
   } else if (code == lir_cmp_l2i) {
+#ifdef _LP64
+    __ lcmp(left->as_register_lo(), right->as_register_lo(), dst->as_register());
+#else
     __ lcmp(left->as_register_hi(),  left->as_register_lo(),
             right->as_register_hi(), right->as_register_lo(),
             dst->as_register());
+#endif
   } else {
     ShouldNotReachHere();
   }
@@ -2849,7 +2853,7 @@
 
 
 void LIR_Assembler::align_backward_branch_target() {
-  __ align(16);
+  __ align(OptoLoopAlignment);
 }
 
 
--- a/src/cpu/sparc/vm/c2_globals_sparc.hpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/sparc/vm/c2_globals_sparc.hpp	Tue Apr 13 13:01:37 2010 -0700
@@ -60,9 +60,6 @@
 define_pd_global(intx, INTPRESSURE,                  48);  // large register set
 define_pd_global(intx, InteriorEntryAlignment,       16);  // = CodeEntryAlignment
 define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K));
-// The default setting 16/16 seems to work best.
-// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.)
-define_pd_global(intx, OptoLoopAlignment,            16);  // = 4*wordSize
 define_pd_global(intx, RegisterCostAreaRatio,        12000);
 define_pd_global(bool, UseTLAB,                      true);
 define_pd_global(bool, ResizeTLAB,                   true);
--- a/src/cpu/sparc/vm/globals_sparc.hpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/sparc/vm/globals_sparc.hpp	Tue Apr 13 13:01:37 2010 -0700
@@ -40,6 +40,9 @@
 define_pd_global(bool, UncommonNullCast,            true);  // Uncommon-trap NULLs past to check cast
 
 define_pd_global(intx, CodeEntryAlignment,    32);
+// The default setting 16/16 seems to work best.
+// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.)
+define_pd_global(intx, OptoLoopAlignment,     16);  // = 4*wordSize
 define_pd_global(intx, InlineFrequencyCount,  50);  // we can use more inlining on the SPARC
 define_pd_global(intx, InlineSmallCode,       1500);
 #ifdef _LP64
--- a/src/cpu/sparc/vm/sparc.ad	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/sparc/vm/sparc.ad	Tue Apr 13 13:01:37 2010 -0700
@@ -471,6 +471,9 @@
 source %{
 #define __ _masm.
 
+// Block initializing store
+#define ASI_BLK_INIT_QUAD_LDD_P    0xE2
+
 // tertiary op of a LoadP or StoreP encoding
 #define REGP_OP true
 
@@ -6147,6 +6150,7 @@
 %}
 
 instruct prefetchw( memory mem ) %{
+  predicate(AllocatePrefetchStyle != 3 );
   match( PrefetchWrite mem );
   ins_cost(MEMORY_REF_COST);
 
@@ -6156,6 +6160,23 @@
   ins_pipe(iload_mem);
 %}
 
+// Use BIS instruction to prefetch.
+instruct prefetchw_bis( memory mem ) %{
+  predicate(AllocatePrefetchStyle == 3);
+  match( PrefetchWrite mem );
+  ins_cost(MEMORY_REF_COST);
+
+  format %{ "STXA   G0,$mem\t! // Block initializing store" %}
+  ins_encode %{
+     Register base = as_Register($mem$$base);
+     int disp = $mem$$disp;
+     if (disp != 0) {
+       __ add(base, AllocatePrefetchStepSize, base);
+     }
+     __ stxa(G0, base, G0, ASI_BLK_INIT_QUAD_LDD_P);
+  %}
+  ins_pipe(istore_mem_reg);
+%}
 
 //----------Store Instructions-------------------------------------------------
 // Store Byte
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -1148,7 +1148,7 @@
       __ andn(from, 7, from);     // Align address
       __ ldx(from, 0, O3);
       __ inc(from, 8);
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_loop);
       __ ldx(from, 0, O4);
       __ deccc(count, count_dec); // Can we do next iteration after this one?
@@ -1220,7 +1220,7 @@
     //
       __ andn(end_from, 7, end_from);     // Align address
       __ ldx(end_from, 0, O3);
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_loop);
       __ ldx(end_from, -8, O4);
       __ deccc(count, count_dec); // Can we do next iteration after this one?
@@ -1349,7 +1349,7 @@
     __ BIND(L_copy_byte);
       __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
       __ delayed()->nop();
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_copy_byte_loop);
       __ ldub(from, offset, O3);
       __ deccc(count);
@@ -1445,7 +1445,7 @@
                                         L_aligned_copy, L_copy_byte);
     }
     // copy 4 elements (16 bytes) at a time
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_aligned_copy);
       __ dec(end_from, 16);
       __ ldx(end_from, 8, O3);
@@ -1461,7 +1461,7 @@
     __ BIND(L_copy_byte);
       __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
       __ delayed()->nop();
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_copy_byte_loop);
       __ dec(end_from);
       __ dec(end_to);
@@ -1577,7 +1577,7 @@
     __ BIND(L_copy_2_bytes);
       __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
       __ delayed()->nop();
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_copy_2_bytes_loop);
       __ lduh(from, offset, O3);
       __ deccc(count);
@@ -1684,7 +1684,7 @@
                                         L_aligned_copy, L_copy_2_bytes);
     }
     // copy 4 elements (16 bytes) at a time
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_aligned_copy);
       __ dec(end_from, 16);
       __ ldx(end_from, 8, O3);
@@ -1781,7 +1781,7 @@
     // copy with shift 4 elements (16 bytes) at a time
       __ dec(count, 4);   // The cmp at the beginning guaranty count >= 4
 
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_copy_16_bytes);
       __ ldx(from, 4, O4);
       __ deccc(count, 4); // Can we do next iteration after this one?
@@ -1907,7 +1907,7 @@
     // to form 2 aligned 8-bytes chunks to store.
     //
       __ ldx(end_from, -4, O3);
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_copy_16_bytes);
       __ ldx(end_from, -12, O4);
       __ deccc(count, 4);
@@ -1929,7 +1929,7 @@
       __ delayed()->inc(count, 4);
 
     // copy 4 elements (16 bytes) at a time
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_aligned_copy);
       __ dec(end_from, 16);
       __ ldx(end_from, 8, O3);
@@ -2000,6 +2000,27 @@
   //      to:    O1
   //      count: O2 treated as signed
   //
+  // count -= 2;
+  // if ( count >= 0 ) { // >= 2 elements
+  //   if ( count > 6) { // >= 8 elements
+  //     count -= 6; // original count - 8
+  //     do {
+  //       copy_8_elements;
+  //       count -= 8;
+  //     } while ( count >= 0 );
+  //     count += 6;
+  //   }
+  //   if ( count >= 0 ) { // >= 2 elements
+  //     do {
+  //       copy_2_elements;
+  //     } while ( (count=count-2) >= 0 );
+  //   }
+  // }
+  // count += 2;
+  // if ( count != 0 ) { // 1 element left
+  //   copy_1_element;
+  // }
+  //
   void generate_disjoint_long_copy_core(bool aligned) {
     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
     const Register from    = O0;  // source array address
@@ -2012,7 +2033,39 @@
       __ mov(G0, offset0);   // offset from start of arrays (0)
       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
       __ delayed()->add(offset0, 8, offset8);
-      __ align(16);
+
+    // Copy by 64 bytes chunks
+    Label L_copy_64_bytes;
+    const Register from64 = O3;  // source address
+    const Register to64   = G3;  // destination address
+      __ subcc(count, 6, O3);
+      __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
+      __ delayed()->mov(to,   to64);
+      // Now we can use O4(offset0), O5(offset8) as temps
+      __ mov(O3, count);
+      __ mov(from, from64);
+
+      __ align(OptoLoopAlignment);
+    __ BIND(L_copy_64_bytes);
+      for( int off = 0; off < 64; off += 16 ) {
+        __ ldx(from64,  off+0, O4);
+        __ ldx(from64,  off+8, O5);
+        __ stx(O4, to64,  off+0);
+        __ stx(O5, to64,  off+8);
+      }
+      __ deccc(count, 8);
+      __ inc(from64, 64);
+      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes);
+      __ delayed()->inc(to64, 64);
+
+      // Restore O4(offset0), O5(offset8)
+      __ sub(from64, from, offset0);
+      __ inccc(count, 6);
+      __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
+      __ delayed()->add(offset0, 8, offset8);
+
+      // Copy by 16 bytes chunks
+      __ align(OptoLoopAlignment);
     __ BIND(L_copy_16_bytes);
       __ ldx(from, offset0, O3);
       __ ldx(from, offset8, G3);
@@ -2023,6 +2076,7 @@
       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
       __ delayed()->inc(offset8, 16);
 
+      // Copy last 8 bytes
     __ BIND(L_copy_8_bytes);
       __ inccc(count, 2);
       __ brx(Assembler::zero, true, Assembler::pn, L_exit );
@@ -2085,7 +2139,7 @@
       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
       __ delayed()->sllx(count, LogBytesPerLong, offset8);
       __ sub(offset8, 8, offset0);
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_copy_16_bytes);
       __ ldx(from, offset8, O2);
       __ ldx(from, offset0, O3);
@@ -2351,7 +2405,7 @@
     //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
     //   (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
     //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
-    __ align(16);
+    __ align(OptoLoopAlignment);
 
     __ BIND(store_element);
     __ deccc(G1_remain);                // decrement the count
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -86,14 +86,24 @@
     if (FLAG_IS_DEFAULT(InteriorEntryAlignment)) {
       FLAG_SET_DEFAULT(InteriorEntryAlignment, 4);
     }
+    if (is_niagara1_plus()) {
+      if (AllocatePrefetchStyle > 0 && FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
+        // Use BIS instruction for allocation prefetch.
+        FLAG_SET_DEFAULT(AllocatePrefetchStyle, 3);
+        if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
+          // Use smaller prefetch distance on N2 with BIS
+          FLAG_SET_DEFAULT(AllocatePrefetchDistance, 64);
+        }
+      }
+      if (AllocatePrefetchStyle != 3 && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
+        // Use different prefetch distance without BIS
+        FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
+      }
+    }
+#endif
     if (FLAG_IS_DEFAULT(OptoLoopAlignment)) {
       FLAG_SET_DEFAULT(OptoLoopAlignment, 4);
     }
-    if (is_niagara1_plus() && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
-      // Use smaller prefetch distance on N2
-      FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
-    }
-#endif
   }
 
   // Use hardware population count instruction if available.
--- a/src/cpu/x86/vm/assembler_x86.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -3365,6 +3365,13 @@
 
 #else // LP64
 
+void Assembler::set_byte_if_not_zero(Register dst) {
+  int enc = prefix_and_encode(dst->encoding(), true);
+  emit_byte(0x0F);
+  emit_byte(0x95);
+  emit_byte(0xE0 | enc);
+}
+
 // 64bit only pieces of the assembler
 // This should only be used by 64bit instructions that can use rip-relative
 // it cannot be used by instructions that want an immediate value.
--- a/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -2690,19 +2690,14 @@
   } else {
     assert(code == lir_cmp_l2i, "check");
 #ifdef _LP64
-      Register dest = dst->as_register();
-      __ xorptr(dest, dest);
-      Label high, done;
-      __ cmpptr(left->as_register_lo(), right->as_register_lo());
-      __ jcc(Assembler::equal, done);
-      __ jcc(Assembler::greater, high);
-      __ decrement(dest);
-      __ jmp(done);
-      __ bind(high);
-      __ increment(dest);
-
-      __ bind(done);
-
+    Label done;
+    Register dest = dst->as_register();
+    __ cmpptr(left->as_register_lo(), right->as_register_lo());
+    __ movl(dest, -1);
+    __ jccb(Assembler::less, done);
+    __ set_byte_if_not_zero(dest);
+    __ movzbl(dest, dest);
+    __ bind(done);
 #else
     __ lcmp2int(left->as_register_hi(),
                 left->as_register_lo(),
--- a/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -781,7 +781,7 @@
 
   // Restore SP from BP if the exception PC is a MethodHandle call site.
   NOT_LP64(__ get_thread(thread);)
-  __ cmpl(Address(thread, JavaThread::is_method_handle_exception_offset()), 0);
+  __ cmpl(Address(thread, JavaThread::is_method_handle_return_offset()), 0);
   __ cmovptr(Assembler::notEqual, rsp, rbp);
 
   // continue at exception handler (return address removed)
--- a/src/cpu/x86/vm/c2_globals_x86.hpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/x86/vm/c2_globals_x86.hpp	Tue Apr 13 13:01:37 2010 -0700
@@ -80,7 +80,6 @@
 // Ergonomics related flags
 define_pd_global(uint64_t,MaxRAM,                    4ULL*G);
 #endif // AMD64
-define_pd_global(intx, OptoLoopAlignment,            16);
 define_pd_global(intx, RegisterCostAreaRatio,        16000);
 
 // Peephole and CISC spilling both break the graph, and so makes the
--- a/src/cpu/x86/vm/globals_x86.hpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/x86/vm/globals_x86.hpp	Tue Apr 13 13:01:37 2010 -0700
@@ -45,6 +45,7 @@
 #else
 define_pd_global(intx, CodeEntryAlignment,       16);
 #endif // COMPILER2
+define_pd_global(intx, OptoLoopAlignment,        16);
 define_pd_global(intx, InlineFrequencyCount,     100);
 define_pd_global(intx, InlineSmallCode,          1000);
 
--- a/src/cpu/x86/vm/runtime_x86_32.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/x86/vm/runtime_x86_32.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -115,8 +115,8 @@
 
   // rax: exception handler for given <exception oop/exception pc>
 
-  // Restore SP from BP if the exception PC is a MethodHandle call.
-  __ cmpl(Address(rcx, JavaThread::is_method_handle_exception_offset()), 0);
+  // Restore SP from BP if the exception PC is a MethodHandle call site.
+  __ cmpl(Address(rcx, JavaThread::is_method_handle_return_offset()), 0);
   __ cmovptr(Assembler::notEqual, rsp, rbp);
 
   // We have a handler in rax, (could be deopt blob)
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -3328,8 +3328,8 @@
 
   // rax: exception handler
 
-  // Restore SP from BP if the exception PC is a MethodHandle call.
-  __ cmpl(Address(r15_thread, JavaThread::is_method_handle_exception_offset()), 0);
+  // Restore SP from BP if the exception PC is a MethodHandle call site.
+  __ cmpl(Address(r15_thread, JavaThread::is_method_handle_return_offset()), 0);
   __ cmovptr(Assembler::notEqual, rsp, rbp);
 
   // We have a handler in rax (could be deopt blob).
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -430,7 +430,7 @@
     __ verify_oop(exception_oop);
 
     // Restore SP from BP if the exception PC is a MethodHandle call site.
-    __ cmpl(Address(thread, JavaThread::is_method_handle_exception_offset()), 0);
+    __ cmpl(Address(thread, JavaThread::is_method_handle_return_offset()), 0);
     __ cmovptr(Assembler::notEqual, rsp, rbp);
 
     // continue at exception handler (return address removed)
@@ -812,7 +812,7 @@
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_copy_64_bytes_loop);
 
     if(UseUnalignedLoadStores) {
@@ -874,7 +874,7 @@
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_copy_64_bytes_loop);
     __ movq(mmx0, Address(from, 0));
     __ movq(mmx1, Address(from, 8));
@@ -1144,7 +1144,7 @@
       __ movl(Address(to, count, sf, 0), rdx);
       __ jmpb(L_copy_8_bytes);
 
-      __ align(16);
+      __ align(OptoLoopAlignment);
       // Move 8 bytes
     __ BIND(L_copy_8_bytes_loop);
       if (UseXMMForArrayCopy) {
@@ -1235,7 +1235,7 @@
       }
     } else {
       __ jmpb(L_copy_8_bytes);
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_copy_8_bytes_loop);
       __ fild_d(Address(from, 0));
       __ fistp_d(Address(from, to_from, Address::times_1));
@@ -1282,7 +1282,7 @@
 
     __ jmpb(L_copy_8_bytes);
 
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_copy_8_bytes_loop);
     if (VM_Version::supports_mmx()) {
       if (UseXMMForArrayCopy) {
@@ -1454,7 +1454,7 @@
     // Loop control:
     //   for (count = -count; count != 0; count++)
     // Base pointers src, dst are biased by 8*count,to last element.
-    __ align(16);
+    __ align(OptoLoopAlignment);
 
     __ BIND(L_store_element);
     __ movptr(to_element_addr, elem);     // store the oop
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -871,9 +871,8 @@
   }
 
   address generate_fp_mask(const char *stub_name, int64_t mask) {
+    __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", stub_name);
-
-    __ align(16);
     address start = __ pc();
 
     __ emit_data64( mask, relocInfo::none );
@@ -1268,7 +1267,7 @@
                              Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
     DEBUG_ONLY(__ stop("enter at entry label, not here"));
     Label L_loop;
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_loop);
     if(UseUnalignedLoadStores) {
       __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
@@ -1309,7 +1308,7 @@
                               Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
     DEBUG_ONLY(__ stop("enter at entry label, not here"));
     Label L_loop;
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_loop);
     if(UseUnalignedLoadStores) {
       __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
@@ -2229,7 +2228,7 @@
     // Loop control:
     //   for (count = -count; count != 0; count++)
     // Base pointers src, dst are biased by 8*(count-1),to last element.
-    __ align(16);
+    __ align(OptoLoopAlignment);
 
     __ BIND(L_store_element);
     __ store_heap_oop(to_element_addr, rax_oop);  // store the oop
--- a/src/share/vm/c1/c1_LinearScan.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/share/vm/c1/c1_LinearScan.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -2608,6 +2608,46 @@
     } else if (opr->is_double_xmm()) {
       assert(opr->fpu_regnrLo() == opr->fpu_regnrHi(), "assumed in calculation");
       VMReg rname_first  = opr->as_xmm_double_reg()->as_VMReg();
+#  ifdef _LP64
+      first = new LocationValue(Location::new_reg_loc(Location::dbl, rname_first));
+      second = &_int_0_scope_value;
+#  else
+      first = new LocationValue(Location::new_reg_loc(Location::normal, rname_first));
+      // %%% This is probably a waste but we'll keep things as they were for now
+      if (true) {
+        VMReg rname_second = rname_first->next();
+        second = new LocationValue(Location::new_reg_loc(Location::normal, rname_second));
+      }
+#  endif
+#endif
+
+    } else if (opr->is_double_fpu()) {
+      // On SPARC, fpu_regnrLo/fpu_regnrHi represents the two halves of
+      // the double as float registers in the native ordering. On X86,
+      // fpu_regnrLo is a FPU stack slot whose VMReg represents
+      // the low-order word of the double and fpu_regnrLo + 1 is the
+      // name for the other half.  *first and *second must represent the
+      // least and most significant words, respectively.
+
+#ifdef X86
+      // the exact location of fpu stack values is only known
+      // during fpu stack allocation, so the stack allocator object
+      // must be present
+      assert(use_fpu_stack_allocation(), "should not have float stack values without fpu stack allocation (all floats must be SSE2)");
+      assert(_fpu_stack_allocator != NULL, "must be present");
+      opr = _fpu_stack_allocator->to_fpu_stack(opr);
+
+      assert(opr->fpu_regnrLo() == opr->fpu_regnrHi(), "assumed in calculation (only fpu_regnrHi is used)");
+#endif
+#ifdef SPARC
+      assert(opr->fpu_regnrLo() == opr->fpu_regnrHi() + 1, "assumed in calculation (only fpu_regnrHi is used)");
+#endif
+
+      VMReg rname_first = frame_map()->fpu_regname(opr->fpu_regnrHi());
+#ifdef _LP64
+      first = new LocationValue(Location::new_reg_loc(Location::dbl, rname_first));
+      second = &_int_0_scope_value;
+#else
       first = new LocationValue(Location::new_reg_loc(Location::normal, rname_first));
       // %%% This is probably a waste but we'll keep things as they were for now
       if (true) {
@@ -2616,37 +2656,6 @@
       }
 #endif
 
-    } else if (opr->is_double_fpu()) {
-      // On SPARC, fpu_regnrLo/fpu_regnrHi represents the two halves of
-      // the double as float registers in the native ordering. On X86,
-      // fpu_regnrLo is a FPU stack slot whose VMReg represents
-      // the low-order word of the double and fpu_regnrLo + 1 is the
-      // name for the other half.  *first and *second must represent the
-      // least and most significant words, respectively.
-
-#ifdef X86
-      // the exact location of fpu stack values is only known
-      // during fpu stack allocation, so the stack allocator object
-      // must be present
-      assert(use_fpu_stack_allocation(), "should not have float stack values without fpu stack allocation (all floats must be SSE2)");
-      assert(_fpu_stack_allocator != NULL, "must be present");
-      opr = _fpu_stack_allocator->to_fpu_stack(opr);
-
-      assert(opr->fpu_regnrLo() == opr->fpu_regnrHi(), "assumed in calculation (only fpu_regnrHi is used)");
-#endif
-#ifdef SPARC
-      assert(opr->fpu_regnrLo() == opr->fpu_regnrHi() + 1, "assumed in calculation (only fpu_regnrHi is used)");
-#endif
-
-      VMReg rname_first = frame_map()->fpu_regname(opr->fpu_regnrHi());
-
-      first = new LocationValue(Location::new_reg_loc(Location::normal, rname_first));
-      // %%% This is probably a waste but we'll keep things as they were for now
-      if (true) {
-        VMReg rname_second = rname_first->next();
-        second = new LocationValue(Location::new_reg_loc(Location::normal, rname_second));
-      }
-
     } else {
       ShouldNotReachHere();
       first = NULL;
--- a/src/share/vm/classfile/classFileParser.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/share/vm/classfile/classFileParser.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -2956,8 +2956,8 @@
 #endif
     bool compact_fields   = CompactFields;
     int  allocation_style = FieldsAllocationStyle;
-    if( allocation_style < 0 || allocation_style > 1 ) { // Out of range?
-      assert(false, "0 <= FieldsAllocationStyle <= 1");
+    if( allocation_style < 0 || allocation_style > 2 ) { // Out of range?
+      assert(false, "0 <= FieldsAllocationStyle <= 2");
       allocation_style = 1; // Optimistic
     }
 
@@ -2993,6 +2993,25 @@
     } else if( allocation_style == 1 ) {
       // Fields order: longs/doubles, ints, shorts/chars, bytes, oops
       next_nonstatic_double_offset = next_nonstatic_field_offset;
+    } else if( allocation_style == 2 ) {
+      // Fields allocation: oops fields in super and sub classes are together.
+      if( nonstatic_field_size > 0 && super_klass() != NULL &&
+          super_klass->nonstatic_oop_map_size() > 0 ) {
+        int map_size = super_klass->nonstatic_oop_map_size();
+        OopMapBlock* first_map = super_klass->start_of_nonstatic_oop_maps();
+        OopMapBlock* last_map = first_map + map_size - 1;
+        int next_offset = last_map->offset() + (last_map->count() * heapOopSize);
+        if (next_offset == next_nonstatic_field_offset) {
+          allocation_style = 0;   // allocate oops first
+          next_nonstatic_oop_offset    = next_nonstatic_field_offset;
+          next_nonstatic_double_offset = next_nonstatic_oop_offset +
+                                         (nonstatic_oop_count * heapOopSize);
+        }
+      }
+      if( allocation_style == 2 ) {
+        allocation_style = 1;     // allocate oops last
+        next_nonstatic_double_offset = next_nonstatic_field_offset;
+      }
     } else {
       ShouldNotReachHere();
     }
--- a/src/share/vm/code/codeCache.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/share/vm/code/codeCache.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright 1997-2007 Sun Microsystems, Inc.  All Rights Reserved.
+ * Copyright 1997-2010 Sun Microsystems, Inc.  All Rights Reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -284,9 +284,11 @@
       cur->print_on(tty, is_live ? "scavenge root" : "dead scavenge root"); tty->cr();
     }
 #endif //PRODUCT
-    if (is_live)
+    if (is_live) {
       // Perform cur->oops_do(f), maybe just once per nmethod.
       f->do_code_blob(cur);
+      cur->fix_oop_relocations();
+    }
   }
 
   // Check for stray marks.
--- a/src/share/vm/memory/threadLocalAllocBuffer.hpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/share/vm/memory/threadLocalAllocBuffer.hpp	Tue Apr 13 13:01:37 2010 -0700
@@ -111,7 +111,22 @@
 
   // Allocate size HeapWords. The memory is NOT initialized to zero.
   inline HeapWord* allocate(size_t size);
-  static size_t alignment_reserve()              { return align_object_size(typeArrayOopDesc::header_size(T_INT)); }
+
+  // Reserve space at the end of TLAB
+  static size_t end_reserve() {
+    int reserve_size = typeArrayOopDesc::header_size(T_INT);
+    if (AllocatePrefetchStyle == 3) {
+      // BIS is used to prefetch - we need a space for it.
+      // +1 for rounding up to next cache line +1 to be safe
+      int lines = AllocatePrefetchLines + 2;
+      int step_size = AllocatePrefetchStepSize;
+      int distance = AllocatePrefetchDistance;
+      int prefetch_end = (distance + step_size*lines)/(int)HeapWordSize;
+      reserve_size = MAX2(reserve_size, prefetch_end);
+    }
+    return reserve_size;
+  }
+  static size_t alignment_reserve()              { return align_object_size(end_reserve()); }
   static size_t alignment_reserve_in_bytes()     { return alignment_reserve() * HeapWordSize; }
 
   // Return tlab size or remaining space in eden such that the
--- a/src/share/vm/opto/c2_globals.hpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/share/vm/opto/c2_globals.hpp	Tue Apr 13 13:01:37 2010 -0700
@@ -52,9 +52,6 @@
           "Code alignment for interior entry points "                       \
           "in generated code (in bytes)")                                   \
                                                                             \
-  product_pd(intx, OptoLoopAlignment,                                       \
-          "Align inner loops to zero relative to this modulus")             \
-                                                                            \
   product(intx, MaxLoopPad, (OptoLoopAlignment-1),                          \
           "Align a loop if padding size in bytes is less or equal to this value") \
                                                                             \
--- a/src/share/vm/opto/macro.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/share/vm/opto/macro.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -1487,11 +1487,11 @@
                                         Node*& contended_phi_rawmem,
                                         Node* old_eden_top, Node* new_eden_top,
                                         Node* length) {
+   enum { fall_in_path = 1, pf_path = 2 };
    if( UseTLAB && AllocatePrefetchStyle == 2 ) {
       // Generate prefetch allocation with watermark check.
       // As an allocation hits the watermark, we will prefetch starting
       // at a "distance" away from watermark.
-      enum { fall_in_path = 1, pf_path = 2 };
 
       Node *pf_region = new (C, 3) RegionNode(3);
       Node *pf_phi_rawmem = new (C, 3) PhiNode( pf_region, Type::MEMORY,
@@ -1570,6 +1570,45 @@
       needgc_false = pf_region;
       contended_phi_rawmem = pf_phi_rawmem;
       i_o = pf_phi_abio;
+   } else if( UseTLAB && AllocatePrefetchStyle == 3 ) {
+      // Insert a prefetch for each allocation only on the fast-path
+      Node *pf_region = new (C, 3) RegionNode(3);
+      Node *pf_phi_rawmem = new (C, 3) PhiNode( pf_region, Type::MEMORY,
+                                                TypeRawPtr::BOTTOM );
+
+      // Generate several prefetch instructions only for arrays.
+      uint lines = (length != NULL) ? AllocatePrefetchLines : 1;
+      uint step_size = AllocatePrefetchStepSize;
+      uint distance = AllocatePrefetchDistance;
+
+      // Next cache address.
+      Node *cache_adr = new (C, 4) AddPNode(old_eden_top, old_eden_top,
+                                            _igvn.MakeConX(distance));
+      transform_later(cache_adr);
+      cache_adr = new (C, 2) CastP2XNode(needgc_false, cache_adr);
+      transform_later(cache_adr);
+      Node* mask = _igvn.MakeConX(~(intptr_t)(step_size-1));
+      cache_adr = new (C, 3) AndXNode(cache_adr, mask);
+      transform_later(cache_adr);
+      cache_adr = new (C, 2) CastX2PNode(cache_adr);
+      transform_later(cache_adr);
+
+      // Prefetch
+      Node *prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, cache_adr );
+      prefetch->set_req(0, needgc_false);
+      transform_later(prefetch);
+      contended_phi_rawmem = prefetch;
+      Node *prefetch_adr;
+      distance = step_size;
+      for ( uint i = 1; i < lines; i++ ) {
+        prefetch_adr = new (C, 4) AddPNode( cache_adr, cache_adr,
+                                            _igvn.MakeConX(distance) );
+        transform_later(prefetch_adr);
+        prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, prefetch_adr );
+        transform_later(prefetch);
+        distance += step_size;
+        contended_phi_rawmem = prefetch;
+      }
    } else if( AllocatePrefetchStyle > 0 ) {
       // Insert a prefetch for each allocation only on the fast-path
       Node *prefetch_adr;
--- a/src/share/vm/opto/memnode.hpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/share/vm/opto/memnode.hpp	Tue Apr 13 13:01:37 2010 -0700
@@ -1244,5 +1244,5 @@
   virtual int Opcode() const;
   virtual uint ideal_reg() const { return NotAMachineReg; }
   virtual uint match_edge(uint idx) const { return idx==2; }
-  virtual const Type *bottom_type() const { return Type::ABIO; }
+  virtual const Type *bottom_type() const { return ( AllocatePrefetchStyle == 3 ) ? Type::MEMORY : Type::ABIO; }
 };
--- a/src/share/vm/opto/runtime.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/share/vm/opto/runtime.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -865,7 +865,7 @@
     thread->set_exception_stack_size(0);
 
     // Check if the exception PC is a MethodHandle call site.
-    thread->set_is_method_handle_exception(nm->is_method_handle_return(pc));
+    thread->set_is_method_handle_return(nm->is_method_handle_return(pc));
   }
 
   // Restore correct return pc.  Was saved above.
--- a/src/share/vm/runtime/globals.hpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/share/vm/runtime/globals.hpp	Tue Apr 13 13:01:37 2010 -0700
@@ -1052,7 +1052,8 @@
           "Use SSE2 MOVDQU instruction for Arraycopy")                      \
                                                                             \
   product(intx, FieldsAllocationStyle, 1,                                   \
-          "0 - type based with oops first, 1 - with oops last")             \
+          "0 - type based with oops first, 1 - with oops last, "            \
+          "2 - oops in super and sub classes are together")                 \
                                                                             \
   product(bool, CompactFields, true,                                        \
           "Allocate nonstatic fields in gaps between previous fields")      \
@@ -2707,7 +2708,8 @@
   product(intx,  AllocatePrefetchStyle, 1,                                  \
           "0 = no prefetch, "                                               \
           "1 = prefetch instructions for each allocation, "                 \
-          "2 = use TLAB watermark to gate allocation prefetch")             \
+          "2 = use TLAB watermark to gate allocation prefetch, "            \
+          "3 = use BIS instruction on Sparc for allocation prefetch")       \
                                                                             \
   product(intx,  AllocatePrefetchDistance, -1,                              \
           "Distance to prefetch ahead of allocation pointer")               \
@@ -3110,6 +3112,9 @@
   develop_pd(intx, CodeEntryAlignment,                                      \
           "Code entry alignment for generated code (in bytes)")             \
                                                                             \
+  product_pd(intx, OptoLoopAlignment,                                       \
+          "Align inner loops to zero relative to this modulus")             \
+                                                                            \
   product_pd(uintx, InitialCodeCacheSize,                                   \
           "Initial code cache size (in bytes)")                             \
                                                                             \
--- a/src/share/vm/runtime/sharedRuntime.cpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/share/vm/runtime/sharedRuntime.cpp	Tue Apr 13 13:01:37 2010 -0700
@@ -259,13 +259,16 @@
 address SharedRuntime::raw_exception_handler_for_return_address(JavaThread* thread, address return_address) {
   assert(frame::verify_return_pc(return_address), "must be a return pc");
 
+  // Reset MethodHandle flag.
+  thread->set_is_method_handle_return(false);
+
   // the fastest case first
   CodeBlob* blob = CodeCache::find_blob(return_address);
   if (blob != NULL && blob->is_nmethod()) {
     nmethod* code = (nmethod*)blob;
     assert(code != NULL, "nmethod must be present");
     // Check if the return address is a MethodHandle call site.
-    thread->set_is_method_handle_exception(code->is_method_handle_return(return_address));
+    thread->set_is_method_handle_return(code->is_method_handle_return(return_address));
     // native nmethods don't have exception handlers
     assert(!code->is_native_method(), "no exception handler");
     assert(code->header_begin() != code->exception_begin(), "no exception handler");
@@ -292,7 +295,7 @@
       nmethod* code = (nmethod*)blob;
       assert(code != NULL, "nmethod must be present");
       // Check if the return address is a MethodHandle call site.
-      thread->set_is_method_handle_exception(code->is_method_handle_return(return_address));
+      thread->set_is_method_handle_return(code->is_method_handle_return(return_address));
       assert(code->header_begin() != code->exception_begin(), "no exception handler");
       return code->exception_begin();
     }
--- a/src/share/vm/runtime/thread.hpp	Fri Apr 09 15:01:49 2010 -0700
+++ b/src/share/vm/runtime/thread.hpp	Tue Apr 13 13:01:37 2010 -0700
@@ -772,7 +772,7 @@
   volatile address _exception_pc;                // PC where exception happened
   volatile address _exception_handler_pc;        // PC for handler of exception
   volatile int     _exception_stack_size;        // Size of frame where exception happened
-  volatile int     _is_method_handle_exception;  // True if the current exception PC is at a MethodHandle call.
+  volatile int     _is_method_handle_return;     // true (== 1) if the current exception PC is a MethodHandle call site.
 
   // support for compilation
   bool    _is_compiling;                         // is true if a compilation is active inthis thread (one compilation per thread possible)
@@ -1108,13 +1108,13 @@
   int      exception_stack_size() const          { return _exception_stack_size; }
   address  exception_pc() const                  { return _exception_pc; }
   address  exception_handler_pc() const          { return _exception_handler_pc; }
-  int      is_method_handle_exception() const    { return _is_method_handle_exception; }
+  bool     is_method_handle_return() const       { return _is_method_handle_return == 1; }
 
   void set_exception_oop(oop o)                  { _exception_oop = o; }
   void set_exception_pc(address a)               { _exception_pc = a; }
   void set_exception_handler_pc(address a)       { _exception_handler_pc = a; }
   void set_exception_stack_size(int size)        { _exception_stack_size = size; }
-  void set_is_method_handle_exception(int value) { _is_method_handle_exception = value; }
+  void set_is_method_handle_return(bool value)   { _is_method_handle_return = value ? 1 : 0; }
 
   // Stack overflow support
   inline size_t stack_available(address cur_sp);
@@ -1188,7 +1188,7 @@
   static ByteSize exception_pc_offset()          { return byte_offset_of(JavaThread, _exception_pc        ); }
   static ByteSize exception_handler_pc_offset()  { return byte_offset_of(JavaThread, _exception_handler_pc); }
   static ByteSize exception_stack_size_offset()  { return byte_offset_of(JavaThread, _exception_stack_size); }
-  static ByteSize is_method_handle_exception_offset() { return byte_offset_of(JavaThread, _is_method_handle_exception); }
+  static ByteSize is_method_handle_return_offset() { return byte_offset_of(JavaThread, _is_method_handle_return); }
   static ByteSize stack_guard_state_offset()     { return byte_offset_of(JavaThread, _stack_guard_state   ); }
   static ByteSize suspend_flags_offset()         { return byte_offset_of(JavaThread, _suspend_flags       ); }