changeset 2602:3d42f82cd811

7063628: Use cbcond on T4 Summary: Add new short branch instruction to Hotspot sparc assembler. Reviewed-by: never, twisti, jrose
author kvn
date Thu, 21 Jul 2011 11:25:07 -0700
parents 6a991dcb52bb
children 4e761e7e6e12
files src/cpu/sparc/vm/assembler_sparc.cpp src/cpu/sparc/vm/assembler_sparc.hpp src/cpu/sparc/vm/assembler_sparc.inline.hpp src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp src/cpu/sparc/vm/c1_Runtime1_sparc.cpp src/cpu/sparc/vm/cppInterpreter_sparc.cpp src/cpu/sparc/vm/interp_masm_sparc.cpp src/cpu/sparc/vm/interpreter_sparc.cpp src/cpu/sparc/vm/methodHandles_sparc.cpp src/cpu/sparc/vm/sharedRuntime_sparc.cpp src/cpu/sparc/vm/sparc.ad src/cpu/sparc/vm/stubGenerator_sparc.cpp src/cpu/sparc/vm/templateInterpreter_sparc.cpp src/cpu/sparc/vm/templateTable_sparc.cpp src/cpu/sparc/vm/vm_version_sparc.cpp src/cpu/sparc/vm/vm_version_sparc.hpp src/cpu/sparc/vm/vtableStubs_sparc.cpp src/cpu/x86/vm/x86_32.ad src/cpu/x86/vm/x86_64.ad src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp src/share/vm/adlc/formssel.cpp src/share/vm/adlc/output_c.cpp src/share/vm/adlc/output_h.cpp src/share/vm/opto/compile.cpp src/share/vm/opto/machnode.cpp src/share/vm/opto/machnode.hpp src/share/vm/opto/output.cpp src/share/vm/runtime/globals.hpp
diffstat 30 files changed, 984 insertions(+), 1115 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/sparc/vm/assembler_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/assembler_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -100,12 +100,19 @@
   case call_op:    s = "call"; break;
   case branch_op:
     switch (inv_op2(inst)) {
-      case bpr_op2:    s = "bpr";  break;
       case fb_op2:     s = "fb";   break;
       case fbp_op2:    s = "fbp";  break;
       case br_op2:     s = "br";   break;
       case bp_op2:     s = "bp";   break;
       case cb_op2:     s = "cb";   break;
+      case bpr_op2: {
+        if (is_cbcond(inst)) {
+          s = is_cxb(inst) ? "cxb" : "cwb";
+        } else {
+          s = "bpr";
+        }
+        break;
+      }
       default:         s = "????"; break;
     }
   }
@@ -127,12 +134,21 @@
   case call_op:    m = wdisp(word_aligned_ones, 0, 30);  v = wdisp(dest_pos, inst_pos, 30); break;
   case branch_op:
     switch (inv_op2(inst)) {
-      case bpr_op2:    m = wdisp16(word_aligned_ones, 0);      v = wdisp16(dest_pos, inst_pos);     break;
       case fbp_op2:    m = wdisp(  word_aligned_ones, 0, 19);  v = wdisp(  dest_pos, inst_pos, 19); break;
       case bp_op2:     m = wdisp(  word_aligned_ones, 0, 19);  v = wdisp(  dest_pos, inst_pos, 19); break;
       case fb_op2:     m = wdisp(  word_aligned_ones, 0, 22);  v = wdisp(  dest_pos, inst_pos, 22); break;
       case br_op2:     m = wdisp(  word_aligned_ones, 0, 22);  v = wdisp(  dest_pos, inst_pos, 22); break;
       case cb_op2:     m = wdisp(  word_aligned_ones, 0, 22);  v = wdisp(  dest_pos, inst_pos, 22); break;
+      case bpr_op2: {
+        if (is_cbcond(inst)) {
+          m = wdisp10(word_aligned_ones, 0);
+          v = wdisp10(dest_pos, inst_pos);
+        } else {
+          m = wdisp16(word_aligned_ones, 0);
+          v = wdisp16(dest_pos, inst_pos);
+        }
+        break;
+      }
       default: ShouldNotReachHere();
     }
   }
@@ -149,12 +165,19 @@
   case call_op:        r = inv_wdisp(inst, pos, 30);  break;
   case branch_op:
     switch (inv_op2(inst)) {
-      case bpr_op2:    r = inv_wdisp16(inst, pos);    break;
       case fbp_op2:    r = inv_wdisp(  inst, pos, 19);  break;
       case bp_op2:     r = inv_wdisp(  inst, pos, 19);  break;
       case fb_op2:     r = inv_wdisp(  inst, pos, 22);  break;
       case br_op2:     r = inv_wdisp(  inst, pos, 22);  break;
       case cb_op2:     r = inv_wdisp(  inst, pos, 22);  break;
+      case bpr_op2: {
+        if (is_cbcond(inst)) {
+          r = inv_wdisp10(inst, pos);
+        } else {
+          r = inv_wdisp16(inst, pos);
+        }
+        break;
+      }
       default: ShouldNotReachHere();
     }
   }
@@ -968,13 +991,7 @@
   Label PcOk;
   save_frame(0);                // to avoid clobbering O0
   ld_ptr(pc_addr, L0);
-  tst(L0);
-#ifdef _LP64
-  brx(Assembler::zero, false, Assembler::pt, PcOk);
-#else
-  br(Assembler::zero, false, Assembler::pt, PcOk);
-#endif // _LP64
-  delayed() -> nop();
+  br_null_short(L0, Assembler::pt, PcOk);
   stop("last_Java_pc not zeroed before leaving Java");
   bind(PcOk);
 
@@ -1003,7 +1020,7 @@
   Label StackOk;
   andcc(last_java_sp, 0x01, G0);
   br(Assembler::notZero, false, Assembler::pt, StackOk);
-  delayed() -> nop();
+  delayed()->nop();
   stop("Stack Not Biased in set_last_Java_frame");
   bind(StackOk);
 #endif // ASSERT
@@ -1099,8 +1116,7 @@
 
   Address exception_addr(G2_thread, Thread::pending_exception_offset());
   ld_ptr(exception_addr, scratch_reg);
-  br_null(scratch_reg,false,pt,L);
-  delayed()->nop();
+  br_null_short(scratch_reg, pt, L);
   // we use O7 linkage so that forward_exception_entry has the issuing PC
   call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
   delayed()->nop();
@@ -1874,14 +1890,11 @@
 
   // assert((obj & oop_mask) == oop_bits);
   and3(O0_obj, O2_mask, O4_temp);
-  cmp(O4_temp, O3_bits);
-  brx(notEqual, false, pn, null_or_fail);
-  delayed()->nop();
+  cmp_and_brx_short(O4_temp, O3_bits, notEqual, pn, null_or_fail);
 
   if ((NULL_WORD & Universe::verify_oop_mask()) == Universe::verify_oop_bits()) {
     // the null_or_fail case is useless; must test for null separately
-    br_null(O0_obj, false, pn, succeed);
-    delayed()->nop();
+    br_null_short(O0_obj, pn, succeed);
   }
 
   // Check the klassOop of this object for being in the right area of memory.
@@ -1893,9 +1906,7 @@
   if( Universe::verify_klass_bits() != Universe::verify_oop_bits() )
     set(Universe::verify_klass_bits(), O3_bits);
   and3(O0_obj, O2_mask, O4_temp);
-  cmp(O4_temp, O3_bits);
-  brx(notEqual, false, pn, fail);
-  delayed()->nop();
+  cmp_and_brx_short(O4_temp, O3_bits, notEqual, pn, fail);
   // Check the klass's klass
   load_klass(O0_obj, O0_obj);
   and3(O0_obj, O2_mask, O4_temp);
@@ -2122,13 +2133,12 @@
   return Assembler::rc_z;
 }
 
-// compares register with zero and branches.  NOT FOR USE WITH 64-bit POINTERS
-void MacroAssembler::br_zero( Condition c, bool a, Predict p, Register s1, Label& L) {
+// compares (32 bit) register with zero and branches.  NOT FOR USE WITH 64-bit POINTERS
+void MacroAssembler::cmp_zero_and_br(Condition c, Register s1, Label& L, bool a, Predict p) {
   tst(s1);
   br (c, a, p, L);
 }
 
-
 // Compares a pointer register with zero and branches on null.
 // Does a test & branch on 32-bit systems and a register-branch on 64-bit.
 void MacroAssembler::br_null( Register s1, bool a, Predict p, Label& L ) {
@@ -2154,6 +2164,7 @@
 void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p,
                                      Register s1, address d,
                                      relocInfo::relocType rt ) {
+  assert_not_delayed();
   if (VM_Version::v9_instructions_work()) {
     bpr(rc, a, p, s1, d, rt);
   } else {
@@ -2164,6 +2175,7 @@
 
 void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p,
                                      Register s1, Label& L ) {
+  assert_not_delayed();
   if (VM_Version::v9_instructions_work()) {
     bpr(rc, a, p, s1, L);
   } else {
@@ -2172,6 +2184,91 @@
   }
 }
 
+// Compare and branch: emits either cmp + branch with a nop in the delay slot, or a single cbcond (no delay slot).
+
+// Compare integer (32 bit) values (icc only).
+void MacroAssembler::cmp_and_br_short(Register s1, Register s2, Condition c,
+                                      Predict p, Label& L) {
+  assert_not_delayed();
+  if (use_cbcond(L)) {
+    Assembler::cbcond(c, icc, s1, s2, L);
+  } else {
+    cmp(s1, s2);
+    br(c, false, p, L);
+    delayed()->nop();
+  }
+}
+
+// Compare integer (32 bit) values (icc only).
+void MacroAssembler::cmp_and_br_short(Register s1, int simm13a, Condition c,
+                                      Predict p, Label& L) {
+  assert_not_delayed();
+  if (is_simm(simm13a,5) && use_cbcond(L)) {
+    Assembler::cbcond(c, icc, s1, simm13a, L);
+  } else {
+    cmp(s1, simm13a);
+    br(c, false, p, L);
+    delayed()->nop();
+  }
+}
+
+// Branch that tests xcc in LP64 and icc in !LP64
+void MacroAssembler::cmp_and_brx_short(Register s1, Register s2, Condition c,
+                                       Predict p, Label& L) {
+  assert_not_delayed();
+  if (use_cbcond(L)) {
+    Assembler::cbcond(c, ptr_cc, s1, s2, L);
+  } else {
+    cmp(s1, s2);
+    brx(c, false, p, L);
+    delayed()->nop();
+  }
+}
+
+// Branch that tests xcc in LP64 and icc in !LP64
+void MacroAssembler::cmp_and_brx_short(Register s1, int simm13a, Condition c,
+                                       Predict p, Label& L) {
+  assert_not_delayed();
+  if (is_simm(simm13a,5) && use_cbcond(L)) {
+    Assembler::cbcond(c, ptr_cc, s1, simm13a, L);
+  } else {
+    cmp(s1, simm13a);
+    brx(c, false, p, L);
+    delayed()->nop();
+  }
+}
+
+// Short branch versions for comparing a pointer with zero.
+
+void MacroAssembler::br_null_short(Register s1, Predict p, Label& L) {
+  assert_not_delayed();
+  if (use_cbcond(L)) {
+    Assembler::cbcond(zero, ptr_cc, s1, 0, L);
+    return;
+  }
+  br_null(s1, false, p, L);
+  delayed()->nop();
+}
+
+void MacroAssembler::br_notnull_short(Register s1, Predict p, Label& L) {
+  assert_not_delayed();
+  if (use_cbcond(L)) {
+    Assembler::cbcond(notZero, ptr_cc, s1, 0, L);
+    return;
+  }
+  br_notnull(s1, false, p, L);
+  delayed()->nop();
+}
+
+// Unconditional short branch
+void MacroAssembler::ba_short(Label& L) {
+  if (use_cbcond(L)) {
+    Assembler::cbcond(equal, icc, G0, G0, L);
+    return;
+  }
+  br(always, false, pt, L);
+  delayed()->nop();
+}
 
 // instruction sequences factored across compiler & interpreter
 
@@ -2197,11 +2294,9 @@
   // since that triplet is reached only after finding the high halves differ.
 
   if (VM_Version::v9_instructions_work()) {
-
-                                    mov  (                     -1, Rresult);
-    ba( false, done );  delayed()-> movcc(greater, false, icc,  1, Rresult);
-  }
-  else {
+    mov(-1, Rresult);
+    ba(done);  delayed()-> movcc(greater, false, icc,  1, Rresult);
+  } else {
     br(less,    true, pt, done); delayed()-> set(-1, Rresult);
     br(greater, true, pt, done); delayed()-> set( 1, Rresult);
   }
@@ -2212,9 +2307,8 @@
     mov(                               -1, Rresult);
     movcc(equal,           false, icc,  0, Rresult);
     movcc(greaterUnsigned, false, icc,  1, Rresult);
-  }
-  else {
-                                                    set(-1, Rresult);
+  } else {
+    set(-1, Rresult);
     br(equal,           true, pt, done); delayed()->set( 0, Rresult);
     br(greaterUnsigned, true, pt, done); delayed()->set( 1, Rresult);
   }
@@ -2250,11 +2344,10 @@
   // This code can be optimized to use the 64 bit shifts in V9.
   // Here we use the 32 bit shifts.
 
-  and3( Rcount,         0x3f,           Rcount);     // take least significant 6 bits
-  subcc(Rcount,         31,             Ralt_count);
+  and3( Rcount, 0x3f, Rcount);     // take least significant 6 bits
+  subcc(Rcount,   31, Ralt_count);
   br(greater, true, pn, big_shift);
-  delayed()->
-  dec(Ralt_count);
+  delayed()->dec(Ralt_count);
 
   // shift < 32 bits, Ralt_count = Rcount-31
 
@@ -2263,28 +2356,27 @@
   // more to take care of the special (rare) case where count is zero
   // (shifting by 32 would not work).
 
-  neg(  Ralt_count                                 );
+  neg(Ralt_count);
 
   // The order of the next two instructions is critical in the case where
   // Rin and Rout are the same and should not be reversed.
 
-  srl(  Rin_low,        Ralt_count,     Rxfer_bits ); // shift right by 31-count
+  srl(Rin_low, Ralt_count, Rxfer_bits); // shift right by 31-count
   if (Rcount != Rout_low) {
-    sll(        Rin_low,        Rcount,         Rout_low   ); // low half
+    sll(Rin_low, Rcount, Rout_low); // low half
   }
-  sll(  Rin_high,       Rcount,         Rout_high  );
+  sll(Rin_high, Rcount, Rout_high);
   if (Rcount == Rout_low) {
-    sll(        Rin_low,        Rcount,         Rout_low   ); // low half
+    sll(Rin_low, Rcount, Rout_low); // low half
   }
-  srl(  Rxfer_bits,     1,              Rxfer_bits ); // shift right by one more
-  ba (false, done);
-  delayed()->
-  or3(  Rout_high,      Rxfer_bits,     Rout_high);   // new hi value: or in shifted old hi part and xfer from low
+  srl(Rxfer_bits, 1, Rxfer_bits ); // shift right by one more
+  ba(done);
+  delayed()->or3(Rout_high, Rxfer_bits, Rout_high);   // new hi value: or in shifted old hi part and xfer from low
 
   // shift >= 32 bits, Ralt_count = Rcount-32
   bind(big_shift);
-  sll(  Rin_low,        Ralt_count,     Rout_high  );
-  clr(  Rout_low                                   );
+  sll(Rin_low, Ralt_count, Rout_high  );
+  clr(Rout_low);
 
   bind(done);
 }
@@ -2313,8 +2405,8 @@
   // This code can be optimized to use the 64 bit shifts in V9.
   // Here we use the 32 bit shifts.
 
-  and3( Rcount,         0x3f,           Rcount);     // take least significant 6 bits
-  subcc(Rcount,         31,             Ralt_count);
+  and3( Rcount, 0x3f, Rcount);     // take least significant 6 bits
+  subcc(Rcount,   31, Ralt_count);
   br(greater, true, pn, big_shift);
   delayed()->dec(Ralt_count);
 
@@ -2325,29 +2417,28 @@
   // more to take care of the special (rare) case where count is zero
   // (shifting by 32 would not work).
 
-  neg(  Ralt_count                                  );
+  neg(Ralt_count);
   if (Rcount != Rout_low) {
-    srl(        Rin_low,        Rcount,         Rout_low    );
+    srl(Rin_low, Rcount, Rout_low);
   }
 
   // The order of the next two instructions is critical in the case where
   // Rin and Rout are the same and should not be reversed.
 
-  sll(  Rin_high,       Ralt_count,     Rxfer_bits  ); // shift left by 31-count
-  sra(  Rin_high,       Rcount,         Rout_high   ); // high half
-  sll(  Rxfer_bits,     1,              Rxfer_bits  ); // shift left by one more
+  sll(Rin_high, Ralt_count, Rxfer_bits); // shift left by 31-count
+  sra(Rin_high,     Rcount, Rout_high ); // high half
+  sll(Rxfer_bits,        1, Rxfer_bits); // shift left by one more
   if (Rcount == Rout_low) {
-    srl(        Rin_low,        Rcount,         Rout_low    );
+    srl(Rin_low, Rcount, Rout_low);
   }
-  ba (false, done);
-  delayed()->
-  or3(  Rout_low,       Rxfer_bits,     Rout_low    ); // new low value: or shifted old low part and xfer from high
+  ba(done);
+  delayed()->or3(Rout_low, Rxfer_bits, Rout_low); // new low value: or shifted old low part and xfer from high
 
   // shift >= 32 bits, Ralt_count = Rcount-32
   bind(big_shift);
 
-  sra(  Rin_high,       Ralt_count,     Rout_low    );
-  sra(  Rin_high,       31,             Rout_high   ); // sign into hi
+  sra(Rin_high, Ralt_count, Rout_low);
+  sra(Rin_high,         31, Rout_high); // sign into hi
 
   bind( done );
 }
@@ -2377,8 +2468,8 @@
   // This code can be optimized to use the 64 bit shifts in V9.
   // Here we use the 32 bit shifts.
 
-  and3( Rcount,         0x3f,           Rcount);     // take least significant 6 bits
-  subcc(Rcount,         31,             Ralt_count);
+  and3( Rcount, 0x3f, Rcount);     // take least significant 6 bits
+  subcc(Rcount,   31, Ralt_count);
   br(greater, true, pn, big_shift);
   delayed()->dec(Ralt_count);
 
@@ -2389,29 +2480,28 @@
   // more to take care of the special (rare) case where count is zero
   // (shifting by 32 would not work).
 
-  neg(  Ralt_count                                  );
+  neg(Ralt_count);
   if (Rcount != Rout_low) {
-    srl(        Rin_low,        Rcount,         Rout_low    );
+    srl(Rin_low, Rcount, Rout_low);
   }
 
   // The order of the next two instructions is critical in the case where
   // Rin and Rout are the same and should not be reversed.
 
-  sll(  Rin_high,       Ralt_count,     Rxfer_bits  ); // shift left by 31-count
-  srl(  Rin_high,       Rcount,         Rout_high   ); // high half
-  sll(  Rxfer_bits,     1,              Rxfer_bits  ); // shift left by one more
+  sll(Rin_high, Ralt_count, Rxfer_bits); // shift left by 31-count
+  srl(Rin_high,     Rcount, Rout_high ); // high half
+  sll(Rxfer_bits,        1, Rxfer_bits); // shift left by one more
   if (Rcount == Rout_low) {
-    srl(        Rin_low,        Rcount,         Rout_low    );
+    srl(Rin_low, Rcount, Rout_low);
   }
-  ba (false, done);
-  delayed()->
-  or3(  Rout_low,       Rxfer_bits,     Rout_low    ); // new low value: or shifted old low part and xfer from high
+  ba(done);
+  delayed()->or3(Rout_low, Rxfer_bits, Rout_low); // new low value: or shifted old low part and xfer from high
 
   // shift >= 32 bits, Ralt_count = Rcount-32
   bind(big_shift);
 
-  srl(  Rin_high,       Ralt_count,     Rout_low    );
-  clr(  Rout_high                                   );
+  srl(Rin_high, Ralt_count, Rout_low);
+  clr(Rout_high);
 
   bind( done );
 }
@@ -2419,7 +2509,7 @@
 #ifdef _LP64
 void MacroAssembler::lcmp( Register Ra, Register Rb, Register Rresult) {
   cmp(Ra, Rb);
-  mov(                       -1, Rresult);
+  mov(-1, Rresult);
   movcc(equal,   false, xcc,  0, Rresult);
   movcc(greater, false, xcc,  1, Rresult);
 }
@@ -2459,14 +2549,14 @@
 
   if (VM_Version::v9_instructions_work()) {
 
-    mov(                   -1, Rresult );
-    movcc( eq, true, fcc0,  0, Rresult );
-    movcc( gt, true, fcc0,  1, Rresult );
+    mov(-1, Rresult);
+    movcc(eq, true, fcc0, 0, Rresult);
+    movcc(gt, true, fcc0, 1, Rresult);
 
   } else {
     Label done;
 
-                                         set( -1, Rresult );
+    set( -1, Rresult );
     //fb(lt, true, pn, done); delayed()->set( -1, Rresult );
     fb( eq, true, pn, done);  delayed()->set(  0, Rresult );
     fb( gt, true, pn, done);  delayed()->set(  1, Rresult );
@@ -2668,9 +2758,7 @@
     set(StubRoutines::Sparc::locked, lock_reg);
 
     bind(retry_get_lock);
-    cmp(yield_reg, V8AtomicOperationUnderLockSpinCount);
-    br(Assembler::less, false, Assembler::pt, dont_yield);
-    delayed()->nop();
+    cmp_and_br_short(yield_reg, V8AtomicOperationUnderLockSpinCount, Assembler::less, Assembler::pt, dont_yield);
 
     if(use_call_vm) {
       Untested("Need to verify global reg consistancy");
@@ -2700,9 +2788,7 @@
 
     // yes, got lock.  do we have the same top?
     ld(top_ptr_reg_after_save, 0, value_reg);
-    cmp(value_reg, top_reg_after_save);
-    br(Assembler::notEqual, false, Assembler::pn, not_same);
-    delayed()->nop();
+    cmp_and_br_short(value_reg, top_reg_after_save, Assembler::notEqual, Assembler::pn, not_same);
 
     // yes, same top.
     st(ptr_reg_after_save, top_ptr_reg_after_save, 0);
@@ -2952,8 +3038,7 @@
 
   // on success:
   restore();
-  ba(false, L_success);
-  delayed()->nop();
+  ba_short(L_success);
 
   // on failure:
   bind(L_pop_to_failure);
@@ -2969,8 +3054,7 @@
                                                    Label* L_success,
                                                    Label* L_failure,
                                                    Label* L_slow_path,
-                                        RegisterOrConstant super_check_offset,
-                                        Register instanceof_hack) {
+                                        RegisterOrConstant super_check_offset) {
   int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
                    Klass::secondary_super_cache_offset_in_bytes());
   int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
@@ -2993,29 +3077,10 @@
   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
-  assert(label_nulls <= 1 || instanceof_hack != noreg ||
+  assert(label_nulls <= 1 ||
          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
          "at most one NULL in the batch, usually");
 
-  // Support for the instanceof hack, which uses delay slots to
-  // set a destination register to zero or one.
-  bool do_bool_sets = (instanceof_hack != noreg);
-#define BOOL_SET(bool_value)                            \
-  if (do_bool_sets && bool_value >= 0)                  \
-    set(bool_value, instanceof_hack)
-#define DELAYED_BOOL_SET(bool_value)                    \
-  if (do_bool_sets && bool_value >= 0)                  \
-    delayed()->set(bool_value, instanceof_hack);        \
-  else delayed()->nop()
-  // Hacked ba(), which may only be used just before L_fallthrough.
-#define FINAL_JUMP(label, bool_value)                   \
-  if (&(label) == &L_fallthrough) {                     \
-    BOOL_SET(bool_value);                               \
-  } else {                                              \
-    ba((do_bool_sets && bool_value >= 0), label);       \
-    DELAYED_BOOL_SET(bool_value);                       \
-  }
-
   // If the pointers are equal, we are done (e.g., String[] elements).
   // This self-check enables sharing of secondary supertype arrays among
   // non-primary types such as array-of-interface.  Otherwise, each such
@@ -3024,8 +3089,8 @@
   // type checks are in fact trivially successful in this manner,
   // so we get a nicely predicted branch right at the start of the check.
   cmp(super_klass, sub_klass);
-  brx(Assembler::equal, do_bool_sets, Assembler::pn, *L_success);
-  DELAYED_BOOL_SET(1);
+  brx(Assembler::equal, false, Assembler::pn, *L_success);
+  delayed()->nop();
 
   // Check the supertype display:
   if (must_load_sco) {
@@ -3049,50 +3114,49 @@
   // So if it was a primary super, we can just fail immediately.
   // Otherwise, it's the slow path for us (no success at this point).
 
+  // Hacked ba(), which may only be used just before L_fallthrough.
+#define FINAL_JUMP(label)            \
+  if (&(label) != &L_fallthrough) {  \
+    ba(label);  delayed()->nop();    \
+  }
+
   if (super_check_offset.is_register()) {
-    brx(Assembler::equal, do_bool_sets, Assembler::pn, *L_success);
-    delayed(); if (do_bool_sets)  BOOL_SET(1);
-    // if !do_bool_sets, sneak the next cmp into the delay slot:
-    cmp(super_check_offset.as_register(), sc_offset);
+    brx(Assembler::equal, false, Assembler::pn, *L_success);
+    delayed()->cmp(super_check_offset.as_register(), sc_offset);
 
     if (L_failure == &L_fallthrough) {
-      brx(Assembler::equal, do_bool_sets, Assembler::pt, *L_slow_path);
+      brx(Assembler::equal, false, Assembler::pt, *L_slow_path);
       delayed()->nop();
-      BOOL_SET(0);  // fallthrough on failure
     } else {
-      brx(Assembler::notEqual, do_bool_sets, Assembler::pn, *L_failure);
-      DELAYED_BOOL_SET(0);
-      FINAL_JUMP(*L_slow_path, -1);  // -1 => vanilla delay slot
+      brx(Assembler::notEqual, false, Assembler::pn, *L_failure);
+      delayed()->nop();
+      FINAL_JUMP(*L_slow_path);
     }
   } else if (super_check_offset.as_constant() == sc_offset) {
     // Need a slow path; fast failure is impossible.
     if (L_slow_path == &L_fallthrough) {
-      brx(Assembler::equal, do_bool_sets, Assembler::pt, *L_success);
-      DELAYED_BOOL_SET(1);
+      brx(Assembler::equal, false, Assembler::pt, *L_success);
+      delayed()->nop();
     } else {
       brx(Assembler::notEqual, false, Assembler::pn, *L_slow_path);
       delayed()->nop();
-      FINAL_JUMP(*L_success, 1);
+      FINAL_JUMP(*L_success);
     }
   } else {
     // No slow path; it's a fast decision.
     if (L_failure == &L_fallthrough) {
-      brx(Assembler::equal, do_bool_sets, Assembler::pt, *L_success);
-      DELAYED_BOOL_SET(1);
-      BOOL_SET(0);
+      brx(Assembler::equal, false, Assembler::pt, *L_success);
+      delayed()->nop();
     } else {
-      brx(Assembler::notEqual, do_bool_sets, Assembler::pn, *L_failure);
-      DELAYED_BOOL_SET(0);
-      FINAL_JUMP(*L_success, 1);
+      brx(Assembler::notEqual, false, Assembler::pn, *L_failure);
+      delayed()->nop();
+      FINAL_JUMP(*L_success);
     }
   }
 
   bind(L_fallthrough);
 
-#undef final_jump
-#undef bool_set
-#undef DELAYED_BOOL_SET
-#undef final_jump
+#undef FINAL_JUMP
 }
 
 
@@ -3185,7 +3249,7 @@
   st_ptr(super_klass, sub_klass, sc_offset);
 
   if (L_success != &L_fallthrough) {
-    ba(false, *L_success);
+    ba(*L_success);
     delayed()->nop();
   }
 
@@ -3200,9 +3264,7 @@
   // compare method type against that of the receiver
   RegisterOrConstant mhtype_offset = delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg);
   load_heap_oop(mh_reg, mhtype_offset, temp_reg);
-  cmp(temp_reg, mtype_reg);
-  br(Assembler::notEqual, false, Assembler::pn, wrong_method_type);
-  delayed()->nop();
+  cmp_and_brx_short(temp_reg, mtype_reg, Assembler::notEqual, Assembler::pn, wrong_method_type);
 }
 
 
@@ -3295,9 +3357,7 @@
   // pointers to allow age to be placed into low bits
   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
   and3(mark_reg, markOopDesc::biased_lock_mask_in_place, temp_reg);
-  cmp(temp_reg, markOopDesc::biased_lock_pattern);
-  brx(Assembler::notEqual, false, Assembler::pn, cas_label);
-  delayed()->nop();
+  cmp_and_brx_short(temp_reg, markOopDesc::biased_lock_pattern, Assembler::notEqual, Assembler::pn, cas_label);
 
   load_klass(obj_reg, temp_reg);
   ld_ptr(Address(temp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
@@ -3364,8 +3424,7 @@
     brx(Assembler::notEqual, true, Assembler::pn, *slow_case);
     delayed()->nop();
   }
-  br(Assembler::always, false, Assembler::pt, done);
-  delayed()->nop();
+  ba_short(done);
 
   bind(try_rebias);
   // At this point we know the epoch has expired, meaning that the
@@ -3393,8 +3452,7 @@
     brx(Assembler::notEqual, true, Assembler::pn, *slow_case);
     delayed()->nop();
   }
-  br(Assembler::always, false, Assembler::pt, done);
-  delayed()->nop();
+  ba_short(done);
 
   bind(try_revoke_bias);
   // The prototype mark in the klass doesn't have the bias bit set any
@@ -3445,7 +3503,7 @@
 // Solaris/SPARC's "as".  Another apt name would be cas_ptr()
 
 void MacroAssembler::casn (Register addr_reg, Register cmp_reg, Register set_reg ) {
-  casx_under_lock (addr_reg, cmp_reg, set_reg, (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr()) ;
+  casx_under_lock (addr_reg, cmp_reg, set_reg, (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr());
 }
 
 
@@ -3486,9 +3544,9 @@
    }
 
    if (EmitSync & 1) {
-     mov    (3, Rscratch) ;
-     st_ptr (Rscratch, Rbox, BasicLock::displaced_header_offset_in_bytes());
-     cmp    (SP, G0) ;
+     mov(3, Rscratch);
+     st_ptr(Rscratch, Rbox, BasicLock::displaced_header_offset_in_bytes());
+     cmp(SP, G0);
      return ;
    }
 
@@ -3529,7 +3587,7 @@
      assert(os::vm_page_size() > 0xfff, "page size too small - change the constant");
      andcc(Rscratch, 0xfffff003, Rscratch);
      st_ptr(Rscratch, Rbox, BasicLock::displaced_header_offset_in_bytes());
-     bind (done) ;
+     bind (done);
      return ;
    }
 
@@ -3538,7 +3596,7 @@
    if (EmitSync & 256) {
       Label IsInflated ;
 
-      ld_ptr (mark_addr, Rmark);           // fetch obj->mark
+      ld_ptr(mark_addr, Rmark);           // fetch obj->mark
       // Triage: biased, stack-locked, neutral, inflated
       if (try_bias) {
         biased_locking_enter(Roop, Rmark, Rscratch, done, NULL, counters);
@@ -3549,49 +3607,49 @@
       // Store mark into displaced mark field in the on-stack basic-lock "box"
       // Critically, this must happen before the CAS
       // Maximize the ST-CAS distance to minimize the ST-before-CAS penalty.
-      st_ptr (Rmark, Rbox, BasicLock::displaced_header_offset_in_bytes());
-      andcc  (Rmark, 2, G0) ;
-      brx    (Assembler::notZero, false, Assembler::pn, IsInflated) ;
-      delayed() ->
+      st_ptr(Rmark, Rbox, BasicLock::displaced_header_offset_in_bytes());
+      andcc(Rmark, 2, G0);
+      brx(Assembler::notZero, false, Assembler::pn, IsInflated);
+      delayed()->
 
       // Try stack-lock acquisition.
       // Beware: the 1st instruction is in a delay slot
-      mov    (Rbox,  Rscratch);
-      or3    (Rmark, markOopDesc::unlocked_value, Rmark);
-      assert (mark_addr.disp() == 0, "cas must take a zero displacement");
-      casn   (mark_addr.base(), Rmark, Rscratch) ;
-      cmp    (Rmark, Rscratch);
-      brx    (Assembler::equal, false, Assembler::pt, done);
+      mov(Rbox,  Rscratch);
+      or3(Rmark, markOopDesc::unlocked_value, Rmark);
+      assert(mark_addr.disp() == 0, "cas must take a zero displacement");
+      casn(mark_addr.base(), Rmark, Rscratch);
+      cmp(Rmark, Rscratch);
+      brx(Assembler::equal, false, Assembler::pt, done);
       delayed()->sub(Rscratch, SP, Rscratch);
 
       // Stack-lock attempt failed - check for recursive stack-lock.
       // See the comments below about how we might remove this case.
 #ifdef _LP64
-      sub    (Rscratch, STACK_BIAS, Rscratch);
+      sub(Rscratch, STACK_BIAS, Rscratch);
 #endif
       assert(os::vm_page_size() > 0xfff, "page size too small - change the constant");
-      andcc  (Rscratch, 0xfffff003, Rscratch);
-      br     (Assembler::always, false, Assembler::pt, done) ;
-      delayed()-> st_ptr (Rscratch, Rbox, BasicLock::displaced_header_offset_in_bytes());
-
-      bind   (IsInflated) ;
+      andcc(Rscratch, 0xfffff003, Rscratch);
+      br(Assembler::always, false, Assembler::pt, done);
+      delayed()-> st_ptr(Rscratch, Rbox, BasicLock::displaced_header_offset_in_bytes());
+
+      bind(IsInflated);
       if (EmitSync & 64) {
          // If m->owner != null goto IsLocked
          // Pessimistic form: Test-and-CAS vs CAS
          // The optimistic form avoids RTS->RTO cache line upgrades.
-         ld_ptr (Rmark, ObjectMonitor::owner_offset_in_bytes() - 2, Rscratch);
-         andcc  (Rscratch, Rscratch, G0) ;
-         brx    (Assembler::notZero, false, Assembler::pn, done) ;
-         delayed()->nop() ;
+         ld_ptr(Rmark, ObjectMonitor::owner_offset_in_bytes() - 2, Rscratch);
+         andcc(Rscratch, Rscratch, G0);
+         brx(Assembler::notZero, false, Assembler::pn, done);
+         delayed()->nop();
          // m->owner == null : it's unlocked.
       }
 
       // Try to CAS m->owner from null to Self
       // Invariant: if we acquire the lock then _recursions should be 0.
-      add    (Rmark, ObjectMonitor::owner_offset_in_bytes()-2, Rmark) ;
-      mov    (G2_thread, Rscratch) ;
-      casn   (Rmark, G0, Rscratch) ;
-      cmp    (Rscratch, G0) ;
+      add(Rmark, ObjectMonitor::owner_offset_in_bytes()-2, Rmark);
+      mov(G2_thread, Rscratch);
+      casn(Rmark, G0, Rscratch);
+      cmp(Rscratch, G0);
       // Intentional fall-through into done
    } else {
       // Aggressively avoid the Store-before-CAS penalty
@@ -3599,9 +3657,9 @@
       Label IsInflated, Recursive ;
 
 // Anticipate CAS -- Avoid RTS->RTO upgrade
-// prefetch (mark_addr, Assembler::severalWritesAndPossiblyReads) ;
-
-      ld_ptr (mark_addr, Rmark);           // fetch obj->mark
+// prefetch (mark_addr, Assembler::severalWritesAndPossiblyReads);
+
+      ld_ptr(mark_addr, Rmark);           // fetch obj->mark
       // Triage: biased, stack-locked, neutral, inflated
 
       if (try_bias) {
@@ -3609,8 +3667,8 @@
         // Invariant: if control reaches this point in the emitted stream
         // then Rmark has not been modified.
       }
-      andcc  (Rmark, 2, G0) ;
-      brx    (Assembler::notZero, false, Assembler::pn, IsInflated) ;
+      andcc(Rmark, 2, G0);
+      brx(Assembler::notZero, false, Assembler::pn, IsInflated);
       delayed()->                         // Beware - dangling delay-slot
 
       // Try stack-lock acquisition.
@@ -3620,23 +3678,21 @@
       //   ST obj->mark = box    -- overwrite transient 0 value
       // This presumes TSO, of course.
 
-      mov    (0, Rscratch) ;
-      or3    (Rmark, markOopDesc::unlocked_value, Rmark);
-      assert (mark_addr.disp() == 0, "cas must take a zero displacement");
-      casn   (mark_addr.base(), Rmark, Rscratch) ;
-// prefetch (mark_addr, Assembler::severalWritesAndPossiblyReads) ;
-      cmp    (Rscratch, Rmark) ;
-      brx    (Assembler::notZero, false, Assembler::pn, Recursive) ;
-      delayed() ->
-        st_ptr (Rmark, Rbox, BasicLock::displaced_header_offset_in_bytes());
+      mov(0, Rscratch);
+      or3(Rmark, markOopDesc::unlocked_value, Rmark);
+      assert(mark_addr.disp() == 0, "cas must take a zero displacement");
+      casn(mark_addr.base(), Rmark, Rscratch);
+// prefetch (mark_addr, Assembler::severalWritesAndPossiblyReads);
+      cmp(Rscratch, Rmark);
+      brx(Assembler::notZero, false, Assembler::pn, Recursive);
+      delayed()->st_ptr(Rmark, Rbox, BasicLock::displaced_header_offset_in_bytes());
       if (counters != NULL) {
         cond_inc(Assembler::equal, (address) counters->fast_path_entry_count_addr(), Rmark, Rscratch);
       }
-      br     (Assembler::always, false, Assembler::pt, done);
-      delayed() ->
-        st_ptr (Rbox, mark_addr) ;
-
-      bind   (Recursive) ;
+      ba(done);
+      delayed()->st_ptr(Rbox, mark_addr);
+
+      bind(Recursive);
       // Stack-lock attempt failed - check for recursive stack-lock.
       // Tests show that we can remove the recursive case with no impact
       // on refworkload 0.83.  If we need to reduce the size of the code
@@ -3653,49 +3709,48 @@
 
       // RScratch contains the fetched obj->mark value from the failed CASN.
 #ifdef _LP64
-      sub    (Rscratch, STACK_BIAS, Rscratch);
+      sub(Rscratch, STACK_BIAS, Rscratch);
 #endif
       sub(Rscratch, SP, Rscratch);
       assert(os::vm_page_size() > 0xfff, "page size too small - change the constant");
-      andcc  (Rscratch, 0xfffff003, Rscratch);
+      andcc(Rscratch, 0xfffff003, Rscratch);
       if (counters != NULL) {
         // Accounting needs the Rscratch register
-        st_ptr (Rscratch, Rbox, BasicLock::displaced_header_offset_in_bytes());
+        st_ptr(Rscratch, Rbox, BasicLock::displaced_header_offset_in_bytes());
         cond_inc(Assembler::equal, (address) counters->fast_path_entry_count_addr(), Rmark, Rscratch);
-        br     (Assembler::always, false, Assembler::pt, done) ;
-        delayed()->nop() ;
+        ba_short(done);
       } else {
-        br     (Assembler::always, false, Assembler::pt, done) ;
-        delayed()-> st_ptr (Rscratch, Rbox, BasicLock::displaced_header_offset_in_bytes());
+        ba(done);
+        delayed()->st_ptr(Rscratch, Rbox, BasicLock::displaced_header_offset_in_bytes());
       }
 
-      bind   (IsInflated) ;
+      bind   (IsInflated);
       if (EmitSync & 64) {
          // If m->owner != null goto IsLocked
          // Test-and-CAS vs CAS
          // Pessimistic form avoids futile (doomed) CAS attempts
          // The optimistic form avoids RTS->RTO cache line upgrades.
-         ld_ptr (Rmark, ObjectMonitor::owner_offset_in_bytes() - 2, Rscratch);
-         andcc  (Rscratch, Rscratch, G0) ;
-         brx    (Assembler::notZero, false, Assembler::pn, done) ;
-         delayed()->nop() ;
+         ld_ptr(Rmark, ObjectMonitor::owner_offset_in_bytes() - 2, Rscratch);
+         andcc(Rscratch, Rscratch, G0);
+         brx(Assembler::notZero, false, Assembler::pn, done);
+         delayed()->nop();
          // m->owner == null : it's unlocked.
       }
 
       // Try to CAS m->owner from null to Self
       // Invariant: if we acquire the lock then _recursions should be 0.
-      add    (Rmark, ObjectMonitor::owner_offset_in_bytes()-2, Rmark) ;
-      mov    (G2_thread, Rscratch) ;
-      casn   (Rmark, G0, Rscratch) ;
-      cmp    (Rscratch, G0) ;
+      add(Rmark, ObjectMonitor::owner_offset_in_bytes()-2, Rmark);
+      mov(G2_thread, Rscratch);
+      casn(Rmark, G0, Rscratch);
+      cmp(Rscratch, G0);
       // ST box->displaced_header = NonZero.
       // Any non-zero value suffices:
       //    unused_mark(), G2_thread, RBox, RScratch, rsp, etc.
-      st_ptr (Rbox, Rbox, BasicLock::displaced_header_offset_in_bytes());
+      st_ptr(Rbox, Rbox, BasicLock::displaced_header_offset_in_bytes());
       // Intentional fall-through into done
    }
 
-   bind   (done) ;
+   bind   (done);
 }
 
 void MacroAssembler::compiler_unlock_object(Register Roop, Register Rmark,
@@ -3706,7 +3761,7 @@
    Label done ;
 
    if (EmitSync & 4) {
-     cmp  (SP, G0) ;
+     cmp(SP, G0);
      return ;
    }
 
@@ -3717,18 +3772,16 @@
 
      // Test first if it is a fast recursive unlock
      ld_ptr(Rbox, BasicLock::displaced_header_offset_in_bytes(), Rmark);
-     cmp(Rmark, G0);
-     brx(Assembler::equal, false, Assembler::pt, done);
-     delayed()->nop();
+     br_null_short(Rmark, Assembler::pt, done);
 
      // Check if it is still a light weight lock, this is is true if we see
      // the stack address of the basicLock in the markOop of the object
      assert(mark_addr.disp() == 0, "cas must take a zero displacement");
      casx_under_lock(mark_addr.base(), Rbox, Rmark,
        (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr());
-     br (Assembler::always, false, Assembler::pt, done);
+     ba(done);
      delayed()->cmp(Rbox, Rmark);
-     bind (done) ;
+     bind(done);
      return ;
    }
 
@@ -3743,14 +3796,14 @@
       biased_locking_exit(mark_addr, Rscratch, done);
    }
 
-   ld_ptr (Roop, oopDesc::mark_offset_in_bytes(), Rmark) ;
-   ld_ptr (Rbox, BasicLock::displaced_header_offset_in_bytes(), Rscratch);
-   andcc  (Rscratch, Rscratch, G0);
-   brx    (Assembler::zero, false, Assembler::pn, done);
-   delayed()-> nop() ;      // consider: relocate fetch of mark, above, into this DS
-   andcc  (Rmark, 2, G0) ;
-   brx    (Assembler::zero, false, Assembler::pt, LStacked) ;
-   delayed()-> nop() ;
+   ld_ptr(Roop, oopDesc::mark_offset_in_bytes(), Rmark);
+   ld_ptr(Rbox, BasicLock::displaced_header_offset_in_bytes(), Rscratch);
+   andcc(Rscratch, Rscratch, G0);
+   brx(Assembler::zero, false, Assembler::pn, done);
+   delayed()->nop();      // consider: relocate fetch of mark, above, into this DS
+   andcc(Rmark, 2, G0);
+   brx(Assembler::zero, false, Assembler::pt, LStacked);
+   delayed()->nop();
 
    // It's inflated
    // Conceptually we need a #loadstore|#storestore "release" MEMBAR before
@@ -3761,48 +3814,45 @@
    // Note that we use 1-0 locking by default for the inflated case.  We
    // close the resultant (and rare) race by having contented threads in
    // monitorenter periodically poll _owner.
-   ld_ptr (Rmark, ObjectMonitor::owner_offset_in_bytes() - 2, Rscratch);
-   ld_ptr (Rmark, ObjectMonitor::recursions_offset_in_bytes() - 2, Rbox);
-   xor3   (Rscratch, G2_thread, Rscratch) ;
-   orcc   (Rbox, Rscratch, Rbox) ;
-   brx    (Assembler::notZero, false, Assembler::pn, done) ;
+   ld_ptr(Rmark, ObjectMonitor::owner_offset_in_bytes() - 2, Rscratch);
+   ld_ptr(Rmark, ObjectMonitor::recursions_offset_in_bytes() - 2, Rbox);
+   xor3(Rscratch, G2_thread, Rscratch);
+   orcc(Rbox, Rscratch, Rbox);
+   brx(Assembler::notZero, false, Assembler::pn, done);
    delayed()->
-   ld_ptr (Rmark, ObjectMonitor::EntryList_offset_in_bytes() - 2, Rscratch);
-   ld_ptr (Rmark, ObjectMonitor::cxq_offset_in_bytes() - 2, Rbox);
-   orcc   (Rbox, Rscratch, G0) ;
+   ld_ptr(Rmark, ObjectMonitor::EntryList_offset_in_bytes() - 2, Rscratch);
+   ld_ptr(Rmark, ObjectMonitor::cxq_offset_in_bytes() - 2, Rbox);
+   orcc(Rbox, Rscratch, G0);
    if (EmitSync & 65536) {
       Label LSucc ;
-      brx    (Assembler::notZero, false, Assembler::pn, LSucc) ;
-      delayed()->nop() ;
-      br     (Assembler::always, false, Assembler::pt, done) ;
-      delayed()->
-      st_ptr (G0, Rmark, ObjectMonitor::owner_offset_in_bytes() - 2);
-
-      bind   (LSucc) ;
-      st_ptr (G0, Rmark, ObjectMonitor::owner_offset_in_bytes() - 2);
-      if (os::is_MP()) { membar (StoreLoad) ; }
-      ld_ptr (Rmark, ObjectMonitor::succ_offset_in_bytes() - 2, Rscratch);
-      andcc  (Rscratch, Rscratch, G0) ;
-      brx    (Assembler::notZero, false, Assembler::pt, done) ;
-      delayed()-> andcc (G0, G0, G0) ;
-      add    (Rmark, ObjectMonitor::owner_offset_in_bytes()-2, Rmark) ;
-      mov    (G2_thread, Rscratch) ;
-      casn   (Rmark, G0, Rscratch) ;
-      cmp    (Rscratch, G0) ;
+      brx(Assembler::notZero, false, Assembler::pn, LSucc);
+      delayed()->nop();
+      ba(done);
+      delayed()->st_ptr(G0, Rmark, ObjectMonitor::owner_offset_in_bytes() - 2);
+
+      bind(LSucc);
+      st_ptr(G0, Rmark, ObjectMonitor::owner_offset_in_bytes() - 2);
+      if (os::is_MP()) { membar (StoreLoad); }
+      ld_ptr(Rmark, ObjectMonitor::succ_offset_in_bytes() - 2, Rscratch);
+      andcc(Rscratch, Rscratch, G0);
+      brx(Assembler::notZero, false, Assembler::pt, done);
+      delayed()->andcc(G0, G0, G0);
+      add(Rmark, ObjectMonitor::owner_offset_in_bytes()-2, Rmark);
+      mov(G2_thread, Rscratch);
+      casn(Rmark, G0, Rscratch);
       // invert icc.zf and goto done
-      brx    (Assembler::notZero, false, Assembler::pt, done) ;
-      delayed() -> cmp (G0, G0) ;
-      br     (Assembler::always, false, Assembler::pt, done);
-      delayed() -> cmp (G0, 1) ;
+      br_notnull(Rscratch, false, Assembler::pt, done);
+      delayed()->cmp(G0, G0);
+      ba(done);
+      delayed()->cmp(G0, 1);
    } else {
-      brx    (Assembler::notZero, false, Assembler::pn, done) ;
-      delayed()->nop() ;
-      br     (Assembler::always, false, Assembler::pt, done) ;
-      delayed()->
-      st_ptr (G0, Rmark, ObjectMonitor::owner_offset_in_bytes() - 2);
+      brx(Assembler::notZero, false, Assembler::pn, done);
+      delayed()->nop();
+      ba(done);
+      delayed()->st_ptr(G0, Rmark, ObjectMonitor::owner_offset_in_bytes() - 2);
    }
 
-   bind   (LStacked) ;
+   bind   (LStacked);
    // Consider: we could replace the expensive CAS in the exit
    // path with a simple ST of the displaced mark value fetched from
    // the on-stack basiclock box.  That admits a race where a thread T2
@@ -3831,11 +3881,11 @@
    // A prototype implementation showed excellent results, although
    // the scavenger and timeout code was rather involved.
 
-   casn   (mark_addr.base(), Rbox, Rscratch) ;
-   cmp    (Rbox, Rscratch);
+   casn(mark_addr.base(), Rbox, Rscratch);
+   cmp(Rbox, Rscratch);
    // Intentional fall through into done ...
 
-   bind   (done) ;
+   bind(done);
 }
 
 
@@ -3891,9 +3941,7 @@
     ld_ptr(G2_thread, in_bytes(JavaThread::tlab_top_offset()), t1);
     ld_ptr(G2_thread, in_bytes(JavaThread::tlab_start_offset()), t2);
     or3(t1, t2, t3);
-    cmp(t1, t2);
-    br(Assembler::greaterEqual, false, Assembler::pn, next);
-    delayed()->nop();
+    cmp_and_br_short(t1, t2, Assembler::greaterEqual, Assembler::pn, next);
     stop("assert(top >= start)");
     should_not_reach_here();
 
@@ -3901,17 +3949,13 @@
     ld_ptr(G2_thread, in_bytes(JavaThread::tlab_top_offset()), t1);
     ld_ptr(G2_thread, in_bytes(JavaThread::tlab_end_offset()), t2);
     or3(t3, t2, t3);
-    cmp(t1, t2);
-    br(Assembler::lessEqual, false, Assembler::pn, next2);
-    delayed()->nop();
+    cmp_and_br_short(t1, t2, Assembler::lessEqual, Assembler::pn, next2);
     stop("assert(top <= end)");
     should_not_reach_here();
 
     bind(next2);
     and3(t3, MinObjAlignmentInBytesMask, t3);
-    cmp(t3, 0);
-    br(Assembler::lessEqual, false, Assembler::pn, ok);
-    delayed()->nop();
+    cmp_and_br_short(t3, 0, Assembler::lessEqual, Assembler::pn, ok);
     stop("assert(aligned)");
     should_not_reach_here();
 
@@ -3937,8 +3981,7 @@
 
   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
     // No allocation in the shared eden.
-    br(Assembler::always, false, Assembler::pt, slow_case);
-    delayed()->nop();
+    ba_short(slow_case);
   } else {
     // get eden boundaries
     // note: we need both top & top_addr!
@@ -4072,8 +4115,7 @@
 
   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
     // No allocation in the shared eden.
-    br(Assembler::always, false, Assembler::pt, slow_case);
-    delayed()->nop();
+    ba_short(slow_case);
   }
 
   ld_ptr(G2_thread, in_bytes(JavaThread::tlab_top_offset()), top);
@@ -4098,8 +4140,7 @@
     add(t2, 1, t2);
     stw(t2, G2_thread, in_bytes(JavaThread::tlab_slow_allocations_offset()));
   }
-  br(Assembler::always, false, Assembler::pt, try_eden);
-  delayed()->nop();
+  ba_short(try_eden);
 
   bind(discard_tlab);
   if (TLABStats) {
@@ -4115,8 +4156,7 @@
 
   // if tlab is currently allocated (top or end != null) then
   // fill [top, end + alignment_reserve) with array object
-  br_null(top, false, Assembler::pn, do_refill);
-  delayed()->nop();
+  br_null_short(top, Assembler::pn, do_refill);
 
   set((intptr_t)markOopDesc::prototype()->copy_set_hash(0x2), t2);
   st_ptr(t2, top, oopDesc::mark_offset_in_bytes()); // set up the mark word
@@ -4151,9 +4191,7 @@
     Label ok;
     ld_ptr(G2_thread, in_bytes(JavaThread::tlab_size_offset()), t2);
     sll_ptr(t2, LogHeapWordSize, t2);
-    cmp(t1, t2);
-    br(Assembler::equal, false, Assembler::pt, ok);
-    delayed()->nop();
+    cmp_and_br_short(t1, t2, Assembler::equal, Assembler::pt, ok);
     stop("assert(t1 == tlab_size)");
     should_not_reach_here();
 
@@ -4164,8 +4202,7 @@
   sub(top, ThreadLocalAllocBuffer::alignment_reserve_in_bytes(), top);
   st_ptr(top, G2_thread, in_bytes(JavaThread::tlab_end_offset()));
   verify_tlab();
-  br(Assembler::always, false, Assembler::pt, retry);
-  delayed()->nop();
+  ba_short(retry);
 }
 
 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes,
@@ -4290,12 +4327,15 @@
   BufferBlob* bb = BufferBlob::create("enqueue_with_frame", EnqueueCodeSize);
   CodeBuffer buf(bb);
   MacroAssembler masm(&buf);
-  address start = masm.pc();
+
+#define __ masm.
+
+  address start = __ pc();
   Register pre_val;
 
   Label refill, restart;
   if (with_frame) {
-    masm.save_frame(0);
+    __ save_frame(0);
     pre_val = I0;  // Was O0 before the save.
   } else {
     pre_val = O0;
@@ -4310,57 +4350,59 @@
          in_bytes(PtrQueue::byte_width_of_buf()) == sizeof(intptr_t),
          "check sizes in assembly below");
 
-  masm.bind(restart);
-  masm.ld_ptr(G2_thread, satb_q_index_byte_offset, L0);
-
-  masm.br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn, L0, refill);
+  __ bind(restart);
+  __ ld_ptr(G2_thread, satb_q_index_byte_offset, L0);
+
+  __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn, L0, refill);
   // If the branch is taken, no harm in executing this in the delay slot.
-  masm.delayed()->ld_ptr(G2_thread, satb_q_buf_byte_offset, L1);
-  masm.sub(L0, oopSize, L0);
-
-  masm.st_ptr(pre_val, L1, L0);  // [_buf + index] := I0
+  __ delayed()->ld_ptr(G2_thread, satb_q_buf_byte_offset, L1);
+  __ sub(L0, oopSize, L0);
+
+  __ st_ptr(pre_val, L1, L0);  // [_buf + index] := pre_val (I0 with a frame, O0 without)
   if (!with_frame) {
     // Use return-from-leaf
-    masm.retl();
-    masm.delayed()->st_ptr(L0, G2_thread, satb_q_index_byte_offset);
+    __ retl();
+    __ delayed()->st_ptr(L0, G2_thread, satb_q_index_byte_offset);
   } else {
     // Not delayed.
-    masm.st_ptr(L0, G2_thread, satb_q_index_byte_offset);
+    __ st_ptr(L0, G2_thread, satb_q_index_byte_offset);
   }
   if (with_frame) {
-    masm.ret();
-    masm.delayed()->restore();
+    __ ret();
+    __ delayed()->restore();
   }
-  masm.bind(refill);
+  __ bind(refill);
 
   address handle_zero =
     CAST_FROM_FN_PTR(address,
                      &SATBMarkQueueSet::handle_zero_index_for_thread);
   // This should be rare enough that we can afford to save all the
   // scratch registers that the calling context might be using.
-  masm.mov(G1_scratch, L0);
-  masm.mov(G3_scratch, L1);
-  masm.mov(G4, L2);
+  __ mov(G1_scratch, L0);
+  __ mov(G3_scratch, L1);
+  __ mov(G4, L2);
   // We need the value of O0 above (for the write into the buffer), so we
   // save and restore it.
-  masm.mov(O0, L3);
+  __ mov(O0, L3);
   // Since the call will overwrite O7, we save and restore that, as well.
-  masm.mov(O7, L4);
-  masm.call_VM_leaf(L5, handle_zero, G2_thread);
-  masm.mov(L0, G1_scratch);
-  masm.mov(L1, G3_scratch);
-  masm.mov(L2, G4);
-  masm.mov(L3, O0);
-  masm.br(Assembler::always, /*annul*/false, Assembler::pt, restart);
-  masm.delayed()->mov(L4, O7);
+  __ mov(O7, L4);
+  __ call_VM_leaf(L5, handle_zero, G2_thread);
+  __ mov(L0, G1_scratch);
+  __ mov(L1, G3_scratch);
+  __ mov(L2, G4);
+  __ mov(L3, O0);
+  __ br(Assembler::always, /*annul*/false, Assembler::pt, restart);
+  __ delayed()->mov(L4, O7);
 
   if (with_frame) {
     satb_log_enqueue_with_frame = start;
-    satb_log_enqueue_with_frame_end = masm.pc();
+    satb_log_enqueue_with_frame_end = __ pc();
   } else {
     satb_log_enqueue_frameless = start;
-    satb_log_enqueue_frameless_end = masm.pc();
+    satb_log_enqueue_frameless_end = __ pc();
   }
+
+#undef __
 }
 
 static inline void generate_satb_log_enqueue_if_necessary(bool with_frame) {
@@ -4426,7 +4468,7 @@
 
   // Check on whether to annul.
   br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, tmp, filtered);
-  delayed() -> nop();
+  delayed()->nop();
 
   // Do we need to load the previous value?
   if (obj != noreg) {
@@ -4450,7 +4492,7 @@
   // Is the previous value null?
   // Check on whether to annul.
   br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, pre_val, filtered);
-  delayed() -> nop();
+  delayed()->nop();
 
   // OK, it's not filtered, so we'll need to call enqueue.  In the normal
   // case, pre_val will be a scratch G-reg, but there are some cases in
@@ -4518,79 +4560,83 @@
   BufferBlob* bb = BufferBlob::create("dirty_card_enqueue", EnqueueCodeSize*2);
   CodeBuffer buf(bb);
   MacroAssembler masm(&buf);
-  address start = masm.pc();
+#define __ masm.
+  address start = __ pc();
 
   Label not_already_dirty, restart, refill;
 
 #ifdef _LP64
-  masm.srlx(O0, CardTableModRefBS::card_shift, O0);
+  __ srlx(O0, CardTableModRefBS::card_shift, O0);
 #else
-  masm.srl(O0, CardTableModRefBS::card_shift, O0);
+  __ srl(O0, CardTableModRefBS::card_shift, O0);
 #endif
   AddressLiteral addrlit(byte_map_base);
-  masm.set(addrlit, O1); // O1 := <card table base>
-  masm.ldub(O0, O1, O2); // O2 := [O0 + O1]
-
-  masm.br_on_reg_cond(Assembler::rc_nz, /*annul*/false, Assembler::pt,
+  __ set(addrlit, O1); // O1 := <card table base>
+  __ ldub(O0, O1, O2); // O2 := [O0 + O1]
+
+  __ br_on_reg_cond(Assembler::rc_nz, /*annul*/false, Assembler::pt,
                       O2, not_already_dirty);
   // Get O1 + O2 into a reg by itself -- useful in the take-the-branch
   // case, harmless if not.
-  masm.delayed()->add(O0, O1, O3);
+  __ delayed()->add(O0, O1, O3);
 
   // We didn't take the branch, so we're already dirty: return.
   // Use return-from-leaf
-  masm.retl();
-  masm.delayed()->nop();
+  __ retl();
+  __ delayed()->nop();
 
   // Not dirty.
-  masm.bind(not_already_dirty);
+  __ bind(not_already_dirty);
   // First, dirty it.
-  masm.stb(G0, O3, G0);  // [cardPtr] := 0  (i.e., dirty).
+  __ stb(G0, O3, G0);  // [cardPtr] := 0  (i.e., dirty).
   int dirty_card_q_index_byte_offset =
     in_bytes(JavaThread::dirty_card_queue_offset() +
              PtrQueue::byte_offset_of_index());
   int dirty_card_q_buf_byte_offset =
     in_bytes(JavaThread::dirty_card_queue_offset() +
              PtrQueue::byte_offset_of_buf());
-  masm.bind(restart);
-  masm.ld_ptr(G2_thread, dirty_card_q_index_byte_offset, L0);
-
-  masm.br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn,
+  __ bind(restart);
+  __ ld_ptr(G2_thread, dirty_card_q_index_byte_offset, L0);
+
+  __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn,
                       L0, refill);
   // If the branch is taken, no harm in executing this in the delay slot.
-  masm.delayed()->ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, L1);
-  masm.sub(L0, oopSize, L0);
-
-  masm.st_ptr(O3, L1, L0);  // [_buf + index] := I0
+  __ delayed()->ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, L1);
+  __ sub(L0, oopSize, L0);
+
+  __ st_ptr(O3, L1, L0);  // [_buf + index] := O3 (the card pointer)
   // Use return-from-leaf
-  masm.retl();
-  masm.delayed()->st_ptr(L0, G2_thread, dirty_card_q_index_byte_offset);
-
-  masm.bind(refill);
+  __ retl();
+  __ delayed()->st_ptr(L0, G2_thread, dirty_card_q_index_byte_offset);
+
+  __ bind(refill);
   address handle_zero =
     CAST_FROM_FN_PTR(address,
                      &DirtyCardQueueSet::handle_zero_index_for_thread);
   // This should be rare enough that we can afford to save all the
   // scratch registers that the calling context might be using.
-  masm.mov(G1_scratch, L3);
-  masm.mov(G3_scratch, L5);
+  __ mov(G1_scratch, L3);
+  __ mov(G3_scratch, L5);
   // We need the value of O3 above (for the write into the buffer), so we
   // save and restore it.
-  masm.mov(O3, L6);
+  __ mov(O3, L6);
   // Since the call will overwrite O7, we save and restore that, as well.
-  masm.mov(O7, L4);
-
-  masm.call_VM_leaf(L7_thread_cache, handle_zero, G2_thread);
-  masm.mov(L3, G1_scratch);
-  masm.mov(L5, G3_scratch);
-  masm.mov(L6, O3);
-  masm.br(Assembler::always, /*annul*/false, Assembler::pt, restart);
-  masm.delayed()->mov(L4, O7);
+  __ mov(O7, L4);
+
+  __ call_VM_leaf(L7_thread_cache, handle_zero, G2_thread);
+  __ mov(L3, G1_scratch);
+  __ mov(L5, G3_scratch);
+  __ mov(L6, O3);
+  __ br(Assembler::always, /*annul*/false, Assembler::pt, restart);
+  __ delayed()->mov(L4, O7);
 
   dirty_card_log_enqueue = start;
-  dirty_card_log_enqueue_end = masm.pc();
+  dirty_card_log_enqueue_end = __ pc();
   // XXX Should have a guarantee here about not going off the end!
   // Does it already do so?  Do an experiment...
+
+#undef __
+
 }
 
 static inline void
@@ -4903,7 +4949,7 @@
   delayed()->mov(G0, result);     // not equal
 
   // only one char ?
-  br_on_reg_cond(rc_z, true, Assembler::pn, limit, Ldone);
+  cmp_zero_and_br(zero, limit, Ldone, true, Assembler::pn);
   delayed()->add(G0, 1, result); // zero-length arrays are equal
 
   // word by word compare, dont't need alignment check
--- a/src/cpu/sparc/vm/assembler_sparc.hpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/assembler_sparc.hpp	Thu Jul 21 11:25:07 2011 -0700
@@ -761,7 +761,7 @@
     mwtos_opf   = 0x119
   };
 
-  enum RCondition {  rc_z = 1,  rc_lez = 2,  rc_lz = 3, rc_nz = 5, rc_gz = 6, rc_gez = 7  };
+  enum RCondition {  rc_z = 1,  rc_lez = 2,  rc_lz = 3, rc_nz = 5, rc_gz = 6, rc_gez = 7, rc_last = rc_gez  };
 
   enum Condition {
      // for FBfcc & FBPfcc instruction
@@ -866,9 +866,18 @@
     return is_simm(d, nbits + 2);
   }
 
+  address target_distance(Label& L) {
+    // Assembler::target(L) should be called only when
+    // a branch instruction is emitted since non-bound
+    // labels record current pc() as a branch address.
+    if (L.is_bound()) return target(L);
+    // Return current address for non-bound labels.
+    return pc();
+  }
+
   // test if label is in simm16 range in words (wdisp16).
   bool is_in_wdisp16_range(Label& L) {
-    return is_in_wdisp_range(target(L), pc(), 16);
+    return is_in_wdisp_range(target_distance(L), pc(), 16);
   }
   // test if the distance between two addresses fits in simm30 range in words
   static bool is_in_wdisp30_range(address a, address b) {
@@ -975,6 +984,20 @@
   static int sx(       int         i)  { return  u_field(i,             12, 12); } // shift x=1 means 64-bit
   static int opf(      int         x)  { return  u_field(x,             13,  5); }
 
+  static bool is_cbcond( int x ) {
+    return (VM_Version::has_cbcond() && (inv_cond(x) > rc_last) &&
+            inv_op(x) == branch_op && inv_op2(x) == bpr_op2);
+  }
+  static bool is_cxb( int x ) {
+    assert(is_cbcond(x), "wrong instruction");
+    return (x & (1<<21)) != 0;
+  }
+  static int cond_cbcond( int         x)  { return  u_field((((x & 8)<<1) + 8 + (x & 7)), 29, 25); }
+  static int inv_cond_cbcond(int      x)  {
+    assert(is_cbcond(x), "wrong instruction");
+    return inv_u_field(x, 27, 25) | (inv_u_field(x, 29, 29)<<3);
+  }
+
   static int opf_cc(   CC          c, bool useFloat ) { return u_field((useFloat ? 0 : 4) + c, 13, 11); }
   static int mov_cc(   CC          c, bool useFloat ) { return u_field(useFloat ? 0 : 1,  18, 18) | u_field(c, 12, 11); }
 
@@ -1026,6 +1049,26 @@
     return r;
   }
 
+  // compute inverse of wdisp10
+  static intptr_t inv_wdisp10(int x, intptr_t pos) {
+    assert(is_cbcond(x), "wrong instruction");
+    int lo = inv_u_field(x, 12, 5);
+    int hi = (x >> 19) & 3;
+    if (hi >= 2) hi |= ~1;
+    return (((hi << 8) | lo) << 2) + pos;
+  }
+
+  // word offset for cbcond, 8 bits at [B12,B5], 2 bits at [B20,B19]
+  static int wdisp10(intptr_t x, intptr_t off) {
+    assert(VM_Version::has_cbcond(), "This CPU does not have CBCOND instruction");
+    intptr_t xx = x - off;
+    assert_signed_word_disp_range(xx, 10);
+    int r =  ( ( (xx >>  2   ) & ((1 << 8) - 1) ) <<  5 )
+           | ( ( (xx >> (2+8)) & 3              ) << 19 );
+    // Have to fake cbcond instruction to pass assert in inv_wdisp10()
+    assert(inv_wdisp10((r | op(branch_op) | cond_cbcond(rc_last+1) | op2(bpr_op2)), off) == x,  "inverse is not inverse");
+    return r;
+  }
 
   // word displacement in low-order nbits bits
 
@@ -1138,6 +1181,24 @@
 #endif
   }
 
+  // cbcond instructions should not be generated one after another
+  bool cbcond_before() {
+    if (offset() == 0) return false; // it is first instruction
+    int x = *(int*)(intptr_t(pc()) - 4); // previous instruction
+    return is_cbcond(x);
+  }
+
+  void no_cbcond_before() {
+    assert(offset() == 0 || !cbcond_before(), "cbcond should not follow an other cbcond");
+  }
+
+  bool use_cbcond(Label& L) {
+    if (!UseCBCond || cbcond_before()) return false;
+    intptr_t x = intptr_t(target_distance(L)) - intptr_t(pc());
+    assert( (x & 3) == 0, "not word aligned");
+    return is_simm(x, 12);
+  }
+
 public:
   // Tells assembler you know that next instruction is delayed
   Assembler* delayed() {
@@ -1181,10 +1242,11 @@
   void addccc( Register s1, Register s2, Register d ) { emit_long( op(arith_op) | rd(d) | op3(addc_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); }
   void addccc( Register s1, int simm13a, Register d ) { emit_long( op(arith_op) | rd(d) | op3(addc_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
 
+
   // pp 136
 
-  inline void bpr( RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt = relocInfo::none );
-  inline void bpr( RCondition c, bool a, Predict p, Register s1, Label& L);
+  inline void bpr(RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt = relocInfo::none);
+  inline void bpr(RCondition c, bool a, Predict p, Register s1, Label& L);
 
  protected: // use MacroAssembler::br instead
 
@@ -1198,8 +1260,6 @@
   inline void fbp( Condition c, bool a, CC cc, Predict p, address d, relocInfo::relocType rt = relocInfo::none );
   inline void fbp( Condition c, bool a, CC cc, Predict p, Label& L );
 
- public:
-
   // pp 144
 
   inline void br( Condition c, bool a, address d, relocInfo::relocType rt = relocInfo::none );
@@ -1215,11 +1275,17 @@
   inline void cb( Condition c, bool a, address d, relocInfo::relocType rt = relocInfo::none );
   inline void cb( Condition c, bool a, Label& L );
 
+  // compare and branch
+  inline void cbcond(Condition c, CC cc, Register s1, Register s2, Label& L);
+  inline void cbcond(Condition c, CC cc, Register s1, int simm5, Label& L);
+
   // pp 149
 
   inline void call( address d,  relocInfo::relocType rt = relocInfo::runtime_call_type );
   inline void call( Label& L,   relocInfo::relocType rt = relocInfo::runtime_call_type );
 
+ public:
+
   // pp 150
 
   // These instructions compare the contents of s2 with the contents of
@@ -1862,8 +1928,8 @@
   inline void fb( Condition c, bool a, Predict p, address d, relocInfo::relocType rt = relocInfo::none );
   inline void fb( Condition c, bool a, Predict p, Label& L );
 
-  // compares register with zero and branches (V9 and V8 instructions)
-  void br_zero( Condition c, bool a, Predict p, Register s1, Label& L);
+  // compares register with zero (32 bit) and branches (V9 and V8 instructions)
+  void cmp_zero_and_br( Condition c, Register s1, Label& L, bool a = false, Predict p = pn );
   // Compares a pointer register with zero and branches on (not)null.
   // Does a test & branch on 32-bit systems and a register-branch on 64-bit.
   void br_null   ( Register s1, bool a, Predict p, Label& L );
@@ -1875,6 +1941,26 @@
   void br_on_reg_cond( RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt = relocInfo::none );
   void br_on_reg_cond( RCondition c, bool a, Predict p, Register s1, Label& L);
 
+  //
+  // Compare registers and branch with nop in delay slot or cbcond without delay slot.
+  //
+  // ATTENTION: use these instructions with caution because the cbcond instruction
+  //            has a very short branch distance: 512 instructions (2Kbyte).
+
+  // Compare integer (32 bit) values (icc only).
+  void cmp_and_br_short(Register s1, Register s2, Condition c, Predict p, Label& L);
+  void cmp_and_br_short(Register s1, int simm13a, Condition c, Predict p, Label& L);
+  // Platform-dependent version for pointer compares (icc on !LP64 and xcc on LP64).
+  void cmp_and_brx_short(Register s1, Register s2, Condition c, Predict p, Label& L);
+  void cmp_and_brx_short(Register s1, int simm13a, Condition c, Predict p, Label& L);
+
+  // Short branch versions that compare a pointer with zero.
+  void br_null_short   ( Register s1, Predict p, Label& L );
+  void br_notnull_short( Register s1, Predict p, Label& L );
+
+  // unconditional short branch
+  void ba_short(Label& L);
+
   inline void bp( Condition c, bool a, CC cc, Predict p, address d, relocInfo::relocType rt = relocInfo::none );
   inline void bp( Condition c, bool a, CC cc, Predict p, Label& L );
 
@@ -1882,8 +1968,8 @@
   inline void brx( Condition c, bool a, Predict p, address d, relocInfo::relocType rt = relocInfo::none );
   inline void brx( Condition c, bool a, Predict p, Label& L );
 
-  // unconditional short branch
-  inline void ba( bool a, Label& L );
+  // unconditional branch
+  inline void ba( Label& L );
 
   // Branch that tests fp condition codes
   inline void fbp( Condition c, bool a, CC cc, Predict p, address d, relocInfo::relocType rt = relocInfo::none );
@@ -2167,7 +2253,6 @@
 
   inline void stbool(Register d, const Address& a) { stb(d, a); }
   inline void ldbool(const Address& a, Register d) { ldsb(a, d); }
-  inline void tstbool( Register s ) { tst(s); }
   inline void movbool( bool boolconst, Register d) { mov( (int) boolconst, d); }
 
   // klass oop manipulations if compressed
@@ -2469,8 +2554,7 @@
                                      Label* L_success,
                                      Label* L_failure,
                                      Label* L_slow_path,
-                RegisterOrConstant super_check_offset = RegisterOrConstant(-1),
-                Register instanceof_hack = noreg);
+                RegisterOrConstant super_check_offset = RegisterOrConstant(-1));
 
   // The rest of the type check; must be wired to a corresponding fast path.
   // It does not repeat the fast path logic, so don't use it standalone.
--- a/src/cpu/sparc/vm/assembler_sparc.inline.hpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/assembler_sparc.inline.hpp	Thu Jul 21 11:25:07 2011 -0700
@@ -80,32 +80,36 @@
 inline void Assembler::add(Register s1, int simm13a, Register d, relocInfo::relocType rtype ) { emit_data( op(arith_op) | rd(d) | op3(add_op3) | rs1(s1) | immed(true) | simm(simm13a, 13), rtype ); }
 inline void Assembler::add(Register s1, int simm13a, Register d, RelocationHolder const& rspec ) { emit_data( op(arith_op) | rd(d) | op3(add_op3) | rs1(s1) | immed(true) | simm(simm13a, 13), rspec ); }
 
-inline void Assembler::bpr( RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt ) { v9_only();  emit_data( op(branch_op) | annul(a) | cond(c) | op2(bpr_op2) | wdisp16(intptr_t(d), intptr_t(pc())) | predict(p) | rs1(s1), rt);  has_delay_slot(); }
+inline void Assembler::bpr( RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt ) { v9_only();  cti();  emit_data( op(branch_op) | annul(a) | cond(c) | op2(bpr_op2) | wdisp16(intptr_t(d), intptr_t(pc())) | predict(p) | rs1(s1), rt);  has_delay_slot(); }
 inline void Assembler::bpr( RCondition c, bool a, Predict p, Register s1, Label& L) { bpr( c, a, p, s1, target(L)); }
 
-inline void Assembler::fb( Condition c, bool a, address d, relocInfo::relocType rt ) { v9_dep();  emit_data( op(branch_op) | annul(a) | cond(c) | op2(fb_op2) | wdisp(intptr_t(d), intptr_t(pc()), 22), rt);  has_delay_slot(); }
+inline void Assembler::fb( Condition c, bool a, address d, relocInfo::relocType rt ) { v9_dep();  cti();  emit_data( op(branch_op) | annul(a) | cond(c) | op2(fb_op2) | wdisp(intptr_t(d), intptr_t(pc()), 22), rt);  has_delay_slot(); }
 inline void Assembler::fb( Condition c, bool a, Label& L ) { fb(c, a, target(L)); }
 
-inline void Assembler::fbp( Condition c, bool a, CC cc, Predict p, address d, relocInfo::relocType rt ) { v9_only();  emit_data( op(branch_op) | annul(a) | cond(c) | op2(fbp_op2) | branchcc(cc) | predict(p) | wdisp(intptr_t(d), intptr_t(pc()), 19), rt);  has_delay_slot(); }
+inline void Assembler::fbp( Condition c, bool a, CC cc, Predict p, address d, relocInfo::relocType rt ) { v9_only();  cti();  emit_data( op(branch_op) | annul(a) | cond(c) | op2(fbp_op2) | branchcc(cc) | predict(p) | wdisp(intptr_t(d), intptr_t(pc()), 19), rt);  has_delay_slot(); }
 inline void Assembler::fbp( Condition c, bool a, CC cc, Predict p, Label& L ) { fbp(c, a, cc, p, target(L)); }
 
-inline void Assembler::cb( Condition c, bool a, address d, relocInfo::relocType rt ) { v8_only();  emit_data( op(branch_op) | annul(a) | cond(c) | op2(cb_op2) | wdisp(intptr_t(d), intptr_t(pc()), 22), rt);  has_delay_slot(); }
+inline void Assembler::cb( Condition c, bool a, address d, relocInfo::relocType rt ) { v8_only();  cti();  emit_data( op(branch_op) | annul(a) | cond(c) | op2(cb_op2) | wdisp(intptr_t(d), intptr_t(pc()), 22), rt);  has_delay_slot(); }
 inline void Assembler::cb( Condition c, bool a, Label& L ) { cb(c, a, target(L)); }
 
-inline void Assembler::br( Condition c, bool a, address d, relocInfo::relocType rt ) { v9_dep();   emit_data( op(branch_op) | annul(a) | cond(c) | op2(br_op2) | wdisp(intptr_t(d), intptr_t(pc()), 22), rt);  has_delay_slot(); }
+inline void Assembler::br( Condition c, bool a, address d, relocInfo::relocType rt ) { v9_dep();  cti();   emit_data( op(branch_op) | annul(a) | cond(c) | op2(br_op2) | wdisp(intptr_t(d), intptr_t(pc()), 22), rt);  has_delay_slot(); }
 inline void Assembler::br( Condition c, bool a, Label& L ) { br(c, a, target(L)); }
 
-inline void Assembler::bp( Condition c, bool a, CC cc, Predict p, address d, relocInfo::relocType rt ) { v9_only();  emit_data( op(branch_op) | annul(a) | cond(c) | op2(bp_op2) | branchcc(cc) | predict(p) | wdisp(intptr_t(d), intptr_t(pc()), 19), rt);  has_delay_slot(); }
+inline void Assembler::bp( Condition c, bool a, CC cc, Predict p, address d, relocInfo::relocType rt ) { v9_only();  cti();  emit_data( op(branch_op) | annul(a) | cond(c) | op2(bp_op2) | branchcc(cc) | predict(p) | wdisp(intptr_t(d), intptr_t(pc()), 19), rt);  has_delay_slot(); }
 inline void Assembler::bp( Condition c, bool a, CC cc, Predict p, Label& L ) { bp(c, a, cc, p, target(L)); }
 
-inline void Assembler::call( address d,  relocInfo::relocType rt ) { emit_data( op(call_op) | wdisp(intptr_t(d), intptr_t(pc()), 30), rt);  has_delay_slot(); assert(rt != relocInfo::virtual_call_type, "must use virtual_call_Relocation::spec"); }
+// compare and branch (cbcond): register and 5-bit immediate forms, 10-bit displacement (wdisp10), no has_delay_slot()
+inline void Assembler::cbcond(Condition c, CC cc, Register s1, Register s2, Label& L) { cti();  no_cbcond_before();  emit_data(op(branch_op) | cond_cbcond(c) | op2(bpr_op2) | branchcc(cc) | wdisp10(intptr_t(target(L)), intptr_t(pc())) | rs1(s1) | rs2(s2)); }
+inline void Assembler::cbcond(Condition c, CC cc, Register s1, int simm5, Label& L)   { cti();  no_cbcond_before();  emit_data(op(branch_op) | cond_cbcond(c) | op2(bpr_op2) | branchcc(cc) | wdisp10(intptr_t(target(L)), intptr_t(pc())) | rs1(s1) | immed(true) | simm(simm5, 5)); }
+
+inline void Assembler::call( address d,  relocInfo::relocType rt ) { cti();  emit_data( op(call_op) | wdisp(intptr_t(d), intptr_t(pc()), 30), rt);  has_delay_slot(); assert(rt != relocInfo::virtual_call_type, "must use virtual_call_Relocation::spec"); }
 inline void Assembler::call( Label& L,   relocInfo::relocType rt ) { call( target(L), rt); }
 
 inline void Assembler::flush( Register s1, Register s2) { emit_long( op(arith_op) | op3(flush_op3) | rs1(s1) | rs2(s2)); }
 inline void Assembler::flush( Register s1, int simm13a) { emit_data( op(arith_op) | op3(flush_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); }
 
-inline void Assembler::jmpl( Register s1, Register s2, Register d                          ) { emit_long( op(arith_op) | rd(d) | op3(jmpl_op3) | rs1(s1) | rs2(s2));  has_delay_slot(); }
-inline void Assembler::jmpl( Register s1, int simm13a, Register d, RelocationHolder const& rspec ) { emit_data( op(arith_op) | rd(d) | op3(jmpl_op3) | rs1(s1) | immed(true) | simm(simm13a, 13), rspec);  has_delay_slot(); }
+inline void Assembler::jmpl( Register s1, Register s2, Register d ) { cti();  emit_long( op(arith_op) | rd(d) | op3(jmpl_op3) | rs1(s1) | rs2(s2));  has_delay_slot(); }
+inline void Assembler::jmpl( Register s1, int simm13a, Register d, RelocationHolder const& rspec ) { cti();  emit_data( op(arith_op) | rd(d) | op3(jmpl_op3) | rs1(s1) | immed(true) | simm(simm13a, 13), rspec);  has_delay_slot(); }
 
 inline void Assembler::ldf(FloatRegisterImpl::Width w, Register s1, RegisterOrConstant s2, FloatRegister d) {
   if (s2.is_register()) ldf(w, s1, s2.as_register(), d);
@@ -240,8 +244,8 @@
 inline void Assembler::prefetch(const Address& a, PrefetchFcn f, int offset) { v9_only(); relocate(a.rspec(offset)); prefetch(a.base(), a.disp() + offset, f); }
 
 
-inline void Assembler::rett( Register s1, Register s2                         ) { emit_long( op(arith_op) | op3(rett_op3) | rs1(s1) | rs2(s2));  has_delay_slot(); }
-inline void Assembler::rett( Register s1, int simm13a, relocInfo::relocType rt) { emit_data( op(arith_op) | op3(rett_op3) | rs1(s1) | immed(true) | simm(simm13a, 13), rt);  has_delay_slot(); }
+inline void Assembler::rett( Register s1, Register s2                         ) { cti();  emit_long( op(arith_op) | op3(rett_op3) | rs1(s1) | rs2(s2));  has_delay_slot(); }
+inline void Assembler::rett( Register s1, int simm13a, relocInfo::relocType rt) { cti();  emit_data( op(arith_op) | op3(rett_op3) | rs1(s1) | immed(true) | simm(simm13a, 13), rt);  has_delay_slot(); }
 
 inline void Assembler::sethi( int imm22a, Register d, RelocationHolder const& rspec ) { emit_data( op(branch_op) | rd(d) | op2(sethi_op2) | hi22(imm22a), rspec); }
 
@@ -557,8 +561,8 @@
   brx(c, a, p, target(L));
 }
 
-inline void MacroAssembler::ba( bool a, Label& L ) {
-  br(always, a, pt, L);
+inline void MacroAssembler::ba( Label& L ) {
+  br(always, false, pt, L);
 }
 
 // Warning: V9 only functions
--- a/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -303,9 +303,7 @@
     assert(_oop_index >= 0, "must have oop index");
     __ load_heap_oop(_obj, java_lang_Class::klass_offset_in_bytes(), G3);
     __ ld_ptr(G3, instanceKlass::init_thread_offset_in_bytes() + sizeof(klassOopDesc), G3);
-    __ cmp(G2_thread, G3);
-    __ br(Assembler::notEqual, false, Assembler::pn, call_patch);
-    __ delayed()->nop();
+    __ cmp_and_brx_short(G2_thread, G3, Assembler::notEqual, Assembler::pn, call_patch);
 
     // load_klass patches may execute the patched code before it's
     // copied back into place so we need to jump back into the main
--- a/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -217,9 +217,7 @@
       {
         Label L;
         __ ld_ptr(OSR_buf, slot_offset + 1*BytesPerWord, O7);
-        __ cmp(G0, O7);
-        __ br(Assembler::notEqual, false, Assembler::pt, L);
-        __ delayed()->nop();
+        __ cmp_and_br_short(O7, G0, Assembler::notEqual, Assembler::pt, L);
         __ stop("locked object is NULL");
         __ bind(L);
       }
@@ -2096,10 +2094,10 @@
       __ xor3(O0, -1, tmp);
       __ sub(length, tmp, length);
       __ add(src_pos, tmp, src_pos);
-      __ br_zero(Assembler::less, false, Assembler::pn, O0, *stub->entry());
+      __ cmp_zero_and_br(Assembler::less, O0, *stub->entry());
       __ delayed()->add(dst_pos, tmp, dst_pos);
     } else {
-      __ br_zero(Assembler::less, false, Assembler::pn, O0, *stub->entry());
+      __ cmp_zero_and_br(Assembler::less, O0, *stub->entry());
       __ delayed()->nop();
     }
     __ bind(*stub->continuation());
@@ -2123,22 +2121,19 @@
 
   if (flags & LIR_OpArrayCopy::src_pos_positive_check) {
     // test src_pos register
-    __ tst(src_pos);
-    __ br(Assembler::less, false, Assembler::pn, *stub->entry());
+    __ cmp_zero_and_br(Assembler::less, src_pos, *stub->entry());
     __ delayed()->nop();
   }
 
   if (flags & LIR_OpArrayCopy::dst_pos_positive_check) {
     // test dst_pos register
-    __ tst(dst_pos);
-    __ br(Assembler::less, false, Assembler::pn, *stub->entry());
+    __ cmp_zero_and_br(Assembler::less, dst_pos, *stub->entry());
     __ delayed()->nop();
   }
 
   if (flags & LIR_OpArrayCopy::length_positive_check) {
     // make sure length isn't negative
-    __ tst(length);
-    __ br(Assembler::less, false, Assembler::pn, *stub->entry());
+    __ cmp_zero_and_br(Assembler::less, length, *stub->entry());
     __ delayed()->nop();
   }
 
@@ -2261,8 +2256,7 @@
 #ifndef PRODUCT
         if (PrintC1Statistics) {
           Label failed;
-          __ br_notnull(O0, false, Assembler::pn,  failed);
-          __ delayed()->nop();
+          __ br_notnull_short(O0, Assembler::pn, failed);
           __ inc_counter((address)&Runtime1::_arraycopy_checkcast_cnt, G1, G3);
           __ bind(failed);
         }
@@ -2314,9 +2308,7 @@
         __ br(Assembler::notEqual, false, Assembler::pn, halt);
         // load the raw value of the src klass.
         __ delayed()->lduw(src, oopDesc::klass_offset_in_bytes(), tmp2);
-        __ cmp(tmp, tmp2);
-        __ br(Assembler::equal, false, Assembler::pn, known_ok);
-        __ delayed()->nop();
+        __ cmp_and_br_short(tmp, tmp2, Assembler::equal, Assembler::pn, known_ok);
       } else {
         __ cmp(tmp, tmp2);
         __ br(Assembler::equal, false, Assembler::pn, known_ok);
@@ -2330,9 +2322,7 @@
         __ cmp(tmp, tmp2);
         __ brx(Assembler::notEqual, false, Assembler::pn, halt);
         __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), tmp2);
-        __ cmp(tmp, tmp2);
-        __ brx(Assembler::equal, false, Assembler::pn, known_ok);
-        __ delayed()->nop();
+        __ cmp_and_brx_short(tmp, tmp2, Assembler::equal, Assembler::pn, known_ok);
       } else {
         __ cmp(tmp, tmp2);
         __ brx(Assembler::equal, false, Assembler::pn, known_ok);
@@ -2530,15 +2520,13 @@
                           mdo_offset_bias);
     __ ld_ptr(receiver_addr, tmp1);
     __ verify_oop(tmp1);
-    __ cmp(recv, tmp1);
-    __ brx(Assembler::notEqual, false, Assembler::pt, next_test);
-    __ delayed()->nop();
+    __ cmp_and_brx_short(recv, tmp1, Assembler::notEqual, Assembler::pt, next_test);
     Address data_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)) -
                       mdo_offset_bias);
     __ ld_ptr(data_addr, tmp1);
     __ add(tmp1, DataLayout::counter_increment, tmp1);
     __ st_ptr(tmp1, data_addr);
-    __ ba(false, *update_done);
+    __ ba(*update_done);
     __ delayed()->nop();
     __ bind(next_test);
   }
@@ -2549,13 +2537,12 @@
     Address recv_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)) -
                       mdo_offset_bias);
     __ ld_ptr(recv_addr, tmp1);
-    __ br_notnull(tmp1, false, Assembler::pt, next_test);
-    __ delayed()->nop();
+    __ br_notnull_short(tmp1, Assembler::pt, next_test);
     __ st_ptr(recv, recv_addr);
     __ set(DataLayout::counter_increment, tmp1);
     __ st_ptr(tmp1, mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)) -
               mdo_offset_bias);
-    __ ba(false, *update_done);
+    __ ba(*update_done);
     __ delayed()->nop();
     __ bind(next_test);
   }
@@ -2601,8 +2588,7 @@
     setup_md_access(method, op->profiled_bci(), md, data, mdo_offset_bias);
 
     Label not_null;
-    __ br_notnull(obj, false, Assembler::pn, not_null);
-    __ delayed()->nop();
+    __ br_notnull_short(obj, Assembler::pn, not_null);
     Register mdo      = k_RInfo;
     Register data_val = Rtmp1;
     jobject2reg(md->constant_encoding(), mdo);
@@ -2614,7 +2600,7 @@
     __ ldub(flags_addr, data_val);
     __ or3(data_val, BitData::null_seen_byte_constant(), data_val);
     __ stb(data_val, flags_addr);
-    __ ba(false, *obj_is_null);
+    __ ba(*obj_is_null);
     __ delayed()->nop();
     __ bind(not_null);
   } else {
@@ -2682,7 +2668,7 @@
     __ load_klass(obj, recv);
     type_profile_helper(mdo, mdo_offset_bias, md, data, recv, tmp1, success);
     // Jump over the failure case
-    __ ba(false, *success);
+    __ ba(*success);
     __ delayed()->nop();
     // Cast failure case
     __ bind(profile_cast_failure);
@@ -2695,10 +2681,10 @@
     __ ld_ptr(data_addr, tmp1);
     __ sub(tmp1, DataLayout::counter_increment, tmp1);
     __ st_ptr(tmp1, data_addr);
-    __ ba(false, *failure);
+    __ ba(*failure);
     __ delayed()->nop();
   }
-  __ ba(false, *success);
+  __ ba(*success);
   __ delayed()->nop();
 }
 
@@ -2728,8 +2714,7 @@
 
     if (op->should_profile()) {
       Label not_null;
-      __ br_notnull(value, false, Assembler::pn, not_null);
-      __ delayed()->nop();
+      __ br_notnull_short(value, Assembler::pn, not_null);
       Register mdo      = k_RInfo;
       Register data_val = Rtmp1;
       jobject2reg(md->constant_encoding(), mdo);
@@ -2741,12 +2726,10 @@
       __ ldub(flags_addr, data_val);
       __ or3(data_val, BitData::null_seen_byte_constant(), data_val);
       __ stb(data_val, flags_addr);
-      __ ba(false, done);
-      __ delayed()->nop();
+      __ ba_short(done);
       __ bind(not_null);
     } else {
-      __ br_null(value, false, Assembler::pn, done);
-      __ delayed()->nop();
+      __ br_null_short(value, Assembler::pn, done);
     }
     add_debug_info_for_null_check_here(op->info_for_exception());
     __ load_klass(array, k_RInfo);
@@ -2777,8 +2760,7 @@
       }
       __ load_klass(value, recv);
       type_profile_helper(mdo, mdo_offset_bias, md, data, recv, tmp1, &done);
-      __ ba(false, done);
-      __ delayed()->nop();
+      __ ba_short(done);
       // Cast failure case
       __ bind(profile_cast_failure);
       jobject2reg(md->constant_encoding(), mdo);
@@ -2790,7 +2772,7 @@
       __ ld_ptr(data_addr, tmp1);
       __ sub(tmp1, DataLayout::counter_increment, tmp1);
       __ st_ptr(tmp1, data_addr);
-      __ ba(false, *stub->entry());
+      __ ba(*stub->entry());
       __ delayed()->nop();
     }
     __ bind(done);
@@ -2808,8 +2790,7 @@
     emit_typecheck_helper(op, &success, &failure, &failure);
     __ bind(failure);
     __ set(0, dst);
-    __ ba(false, done);
-    __ delayed()->nop();
+    __ ba_short(done);
     __ bind(success);
     __ set(1, dst);
     __ bind(done);
--- a/src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -41,9 +41,7 @@
   // Note: needs more testing of out-of-line vs. inline slow case
   verify_oop(receiver);
   load_klass(receiver, temp_reg);
-  cmp(temp_reg, iCache);
-  brx(Assembler::equal, true, Assembler::pt, L);
-  delayed()->nop();
+  cmp_and_brx_short(temp_reg, iCache, Assembler::equal, Assembler::pt, L);
   AddressLiteral ic_miss(SharedRuntime::get_ic_miss_stub());
   jump_to(ic_miss, temp_reg);
   delayed()->nop();
@@ -142,8 +140,7 @@
   }
   // Test first it it is a fast recursive unlock
   ld_ptr(Rbox, BasicLock::displaced_header_offset_in_bytes(), Rmark);
-  br_null(Rmark, false, Assembler::pt, done);
-  delayed()->nop();
+  br_null_short(Rmark, Assembler::pt, done);
   if (!UseBiasedLocking) {
     // load object
     ld_ptr(Rbox, BasicObjectLock::obj_offset_in_bytes(), Roop);
@@ -231,7 +228,7 @@
   if (!is_simm13(obj_size * wordSize)) {
     // would need to use extra register to load
     // object size => go the slow case for now
-    br(Assembler::always, false, Assembler::pt, slow_case);
+    ba(slow_case);
     delayed()->nop();
     return;
   }
@@ -257,12 +254,10 @@
     Label ok;
     ld(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes(), t1);
     if (var_size_in_bytes != noreg) {
-      cmp(t1, var_size_in_bytes);
+      cmp_and_brx_short(t1, var_size_in_bytes, Assembler::equal, Assembler::pt, ok);
     } else {
-      cmp(t1, con_size_in_bytes);
+      cmp_and_brx_short(t1, con_size_in_bytes, Assembler::equal, Assembler::pt, ok);
     }
-    brx(Assembler::equal, false, Assembler::pt, ok);
-    delayed()->nop();
     stop("bad size in initialize_object");
     should_not_reach_here();
 
@@ -387,8 +382,7 @@
 
 void C1_MacroAssembler::verify_not_null_oop(Register r) {
   Label not_null;
-  br_notnull(r, false, Assembler::pt, not_null);
-  delayed()->nop();
+  br_notnull_short(r, Assembler::pt, not_null);
   stop("non-null oop required");
   bind(not_null);
   if (!VerifyOops) return;
--- a/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -71,8 +71,7 @@
   { Label L;
     Address exception_addr(G2_thread, Thread::pending_exception_offset());
     ld_ptr(exception_addr, Gtemp);
-    br_null(Gtemp, false, pt, L);
-    delayed()->nop();
+    br_null_short(Gtemp, pt, L);
     Address vm_result_addr(G2_thread, JavaThread::vm_result_offset());
     st_ptr(G0, vm_result_addr);
     Address vm_result_addr_2(G2_thread, JavaThread::vm_result_2_offset());
@@ -333,9 +332,7 @@
   assert(deopt_blob != NULL, "deoptimization blob must have been created");
 
   Label no_deopt;
-  __ tst(O0);
-  __ brx(Assembler::equal, false, Assembler::pt, no_deopt);
-  __ delayed()->nop();
+  __ br_null_short(O0, Assembler::pt, no_deopt);
 
   // return to the deoptimization handler entry for unpacking and rexecute
   // if we simply returned the we'd deopt as if any call we patched had just
@@ -402,18 +399,15 @@
           if (id == fast_new_instance_init_check_id) {
             // make sure the klass is initialized
             __ ld(G5_klass, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc), G3_t1);
-            __ cmp(G3_t1, instanceKlass::fully_initialized);
-            __ br(Assembler::notEqual, false, Assembler::pn, slow_path);
-            __ delayed()->nop();
+            __ cmp_and_br_short(G3_t1, instanceKlass::fully_initialized, Assembler::notEqual, Assembler::pn, slow_path);
           }
 #ifdef ASSERT
           // assert object can be fast path allocated
           {
             Label ok, not_ok;
           __ ld(G5_klass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc), G1_obj_size);
-          __ cmp(G1_obj_size, 0);  // make sure it's an instance (LH > 0)
-          __ br(Assembler::lessEqual, false, Assembler::pn, not_ok);
-          __ delayed()->nop();
+          // make sure it's an instance (LH > 0)
+          __ cmp_and_br_short(G1_obj_size, 0, Assembler::lessEqual, Assembler::pn, not_ok);
           __ btst(Klass::_lh_instance_slow_path_bit, G1_obj_size);
           __ br(Assembler::zero, false, Assembler::pn, ok);
           __ delayed()->nop();
@@ -501,9 +495,7 @@
           int tag = ((id == new_type_array_id)
                      ? Klass::_lh_array_tag_type_value
                      : Klass::_lh_array_tag_obj_value);
-          __ cmp(G3_t1, tag);
-          __ brx(Assembler::equal, false, Assembler::pt, ok);
-          __ delayed()->nop();
+          __ cmp_and_brx_short(G3_t1, tag, Assembler::equal, Assembler::pt, ok);
           __ stop("assert(is an array klass)");
           __ should_not_reach_here();
           __ bind(ok);
@@ -519,9 +511,7 @@
 
           // check that array length is small enough for fast path
           __ set(C1_MacroAssembler::max_array_allocation_length, G3_t1);
-          __ cmp(G4_length, G3_t1);
-          __ br(Assembler::greaterUnsigned, false, Assembler::pn, slow_path);
-          __ delayed()->nop();
+          __ cmp_and_br_short(G4_length, G3_t1, Assembler::greaterUnsigned, Assembler::pn, slow_path);
 
           // if we got here then the TLAB allocation failed, so try
           // refilling the TLAB or allocating directly from eden.
--- a/src/cpu/sparc/vm/cppInterpreter_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/cppInterpreter_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -544,7 +544,7 @@
 
     // Generate regular method entry
     __ bind(slow_path);
-    __ ba(false, fast_accessor_slow_entry_path);
+    __ ba(fast_accessor_slow_entry_path);
     __ delayed()->nop();
     return entry;
   }
@@ -719,8 +719,7 @@
 
     Address exception_addr(G2_thread, 0, in_bytes(Thread::pending_exception_offset()));
     __ ld_ptr(exception_addr, G3_scratch);
-    __ br_notnull(G3_scratch, false, Assembler::pn, pending_exception_present);
-    __ delayed()->nop();
+    __ br_notnull_short(G3_scratch, Assembler::pn, pending_exception_present);
     __ ld_ptr(Address(G5_method, 0, in_bytes(methodOopDesc::signature_handler_offset())), G3_scratch);
     __ bind(L);
   }
@@ -1292,7 +1291,7 @@
   deopt_frame_manager_return_atos  = __ pc();
 
   // O0/O1 live
-  __ ba(false, return_from_deopt_common);
+  __ ba(return_from_deopt_common);
   __ delayed()->set(AbstractInterpreter::BasicType_as_index(T_OBJECT), L3_scratch);    // Result stub address array index
 
 
@@ -1300,14 +1299,14 @@
   deopt_frame_manager_return_btos  = __ pc();
 
   // O0/O1 live
-  __ ba(false, return_from_deopt_common);
+  __ ba(return_from_deopt_common);
   __ delayed()->set(AbstractInterpreter::BasicType_as_index(T_BOOLEAN), L3_scratch);    // Result stub address array index
 
   // deopt needs to jump to here to enter the interpreter (return a result)
   deopt_frame_manager_return_itos  = __ pc();
 
   // O0/O1 live
-  __ ba(false, return_from_deopt_common);
+  __ ba(return_from_deopt_common);
   __ delayed()->set(AbstractInterpreter::BasicType_as_index(T_INT), L3_scratch);    // Result stub address array index
 
   // deopt needs to jump to here to enter the interpreter (return a result)
@@ -1327,21 +1326,21 @@
   __ srlx(G1,32,O0);
 #endif /* !_LP64 && COMPILER2 */
   // O0/O1 live
-  __ ba(false, return_from_deopt_common);
+  __ ba(return_from_deopt_common);
   __ delayed()->set(AbstractInterpreter::BasicType_as_index(T_LONG), L3_scratch);    // Result stub address array index
 
   // deopt needs to jump to here to enter the interpreter (return a result)
 
   deopt_frame_manager_return_ftos  = __ pc();
   // O0/O1 live
-  __ ba(false, return_from_deopt_common);
+  __ ba(return_from_deopt_common);
   __ delayed()->set(AbstractInterpreter::BasicType_as_index(T_FLOAT), L3_scratch);    // Result stub address array index
 
   // deopt needs to jump to here to enter the interpreter (return a result)
   deopt_frame_manager_return_dtos  = __ pc();
 
   // O0/O1 live
-  __ ba(false, return_from_deopt_common);
+  __ ba(return_from_deopt_common);
   __ delayed()->set(AbstractInterpreter::BasicType_as_index(T_DOUBLE), L3_scratch);    // Result stub address array index
 
   // deopt needs to jump to here to enter the interpreter (return a result)
@@ -1398,7 +1397,7 @@
   __ ld_ptr(STATE(_stack), L1_scratch);                // Get current stack top
   __ sub(L1_scratch, entry_size, L1_scratch);
   __ st_ptr(L1_scratch, STATE(_stack));
-  __ ba(false, entry);
+  __ ba(entry);
   __ delayed()->add(L1_scratch, wordSize, L1_scratch);        // first real entry (undo prepush)
 
   // 2. move expression stack
@@ -1651,7 +1650,7 @@
 
   __ set((int)BytecodeInterpreter::got_monitors, L1_scratch);
   VALIDATE_STATE(G3_scratch, 5);
-  __ ba(false, call_interpreter);
+  __ ba(call_interpreter);
   __ delayed()->st(L1_scratch, STATE(_msg));
 
   // uncommon trap needs to jump to here to enter the interpreter (re-execute current bytecode)
@@ -1659,7 +1658,7 @@
 
   // QQQ what message do we send
 
-  __ ba(false, call_interpreter);
+  __ ba(call_interpreter);
   __ delayed()->ld_ptr(STATE(_frame_bottom), SP);                  // restore to full stack frame
 
   //=============================================================================
@@ -1675,7 +1674,7 @@
   // ready to resume the interpreter
 
   __ set((int)BytecodeInterpreter::deopt_resume, L1_scratch);
-  __ ba(false, call_interpreter);
+  __ ba(call_interpreter);
   __ delayed()->st(L1_scratch, STATE(_msg));
 
   // Current frame has caught an exception we need to dispatch to the
@@ -1763,7 +1762,7 @@
 
   // L1_scratch points to top of stack (prepushed)
 
-  __ ba(false, resume_interpreter);
+  __ ba(resume_interpreter);
   __ delayed()->mov(L1_scratch, O1);
 
   // An exception is being caught on return to a vanilla interpreter frame.
@@ -1773,7 +1772,7 @@
 
   __ ld_ptr(STATE(_frame_bottom), SP);                             // restore to full stack frame
   __ ld_ptr(STATE(_stack_base), O1);                               // empty java expression stack
-  __ ba(false, resume_interpreter);
+  __ ba(resume_interpreter);
   __ delayed()->sub(O1, wordSize, O1);                             // account for prepush
 
   // Return from interpreted method we return result appropriate to the caller (i.e. "recursive"
@@ -1852,7 +1851,7 @@
 
   __ set((int)BytecodeInterpreter::method_resume, L1_scratch);
   __ st(L1_scratch, STATE(_msg));
-  __ ba(false, call_interpreter_2);
+  __ ba(call_interpreter_2);
   __ delayed()->st_ptr(O1, STATE(_stack));
 
 
@@ -1867,8 +1866,8 @@
     __ cmp(Gtmp1, O7);                                                // returning to interpreter?
     __ brx(Assembler::equal, true, Assembler::pt, re_dispatch);       // yep
     __ delayed()->nop();
-    __ ba(false, re_dispatch);
-    __ delayed()->mov(G0, prevState);                                   // initial entry
+    __ ba(re_dispatch);
+    __ delayed()->mov(G0, prevState);                                 // initial entry
 
   }
 
@@ -2031,8 +2030,8 @@
   __ brx(Assembler::zero, false, Assembler::pt, unwind_and_forward);
   __ delayed()->nop();
 
-  __ ld_ptr(STATE(_locals), O1);                                   // get result of popping callee's args
-  __ ba(false, unwind_recursive_activation);
+  __ ld_ptr(STATE(_locals), O1); // get result of popping callee's args
+  __ ba(unwind_recursive_activation);
   __ delayed()->nop();
 
   interpreter_frame_manager = entry_point;
--- a/src/cpu/sparc/vm/interp_masm_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/interp_masm_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -236,17 +236,13 @@
     Label L;
     Register thr_state = G3_scratch;
     ld_ptr(G2_thread, JavaThread::jvmti_thread_state_offset(), thr_state);
-    tst(thr_state);
-    br(zero, false, pt, L); // if (thread->jvmti_thread_state() == NULL) exit;
-    delayed()->nop();
+    br_null_short(thr_state, pt, L); // if (thread->jvmti_thread_state() == NULL) exit;
 
     // Initiate earlyret handling only if it is not already being processed.
     // If the flag has the earlyret_processing bit set, it means that this code
     // is called *during* earlyret handling - we don't want to reenter.
     ld(thr_state, JvmtiThreadState::earlyret_state_offset(), G4_scratch);
-    cmp(G4_scratch, JvmtiThreadState::earlyret_pending);
-    br(Assembler::notEqual, false, pt, L);
-    delayed()->nop();
+    cmp_and_br_short(G4_scratch, JvmtiThreadState::earlyret_pending, Assembler::notEqual, pt, L);
 
     // Call Interpreter::remove_activation_early_entry() to get the address of the
     // same-named entrypoint in the generated interpreter code
@@ -566,9 +562,7 @@
 #ifdef _LP64
   sub(Rtemp, STACK_BIAS, Rtemp);  // Bias Rtemp before cmp to FP
 #endif
-  cmp(Rtemp, FP);
-  brx(Assembler::greaterUnsigned, false, Assembler::pn, Bad);
-  delayed()->nop();
+  cmp_and_brx_short(Rtemp, FP, Assembler::greaterUnsigned, Assembler::pn, Bad);
 
   // Saved SP must not be ridiculously below current SP.
   size_t maxstack = MAX2(JavaThread::stack_size_at_create(), (size_t) 4*K*K);
@@ -577,12 +571,9 @@
 #ifdef _LP64
   add(Rtemp, STACK_BIAS, Rtemp);  // Unbias Rtemp before cmp to Rsp
 #endif
-  cmp(Rsp, Rtemp);
-  brx(Assembler::lessUnsigned, false, Assembler::pn, Bad);
-  delayed()->nop();
-
-  br(Assembler::always, false, Assembler::pn, OK);
-  delayed()->nop();
+  cmp_and_brx_short(Rsp, Rtemp, Assembler::lessUnsigned, Assembler::pn, Bad);
+
+  ba_short(OK);
 
   bind(Bad);
   stop("on return to interpreted call, restored SP is corrupted");
@@ -630,8 +621,7 @@
 
     const Address interp_only(G2_thread, JavaThread::interp_only_mode_offset());
     ld(interp_only, scratch);
-    tst(scratch);
-    br(Assembler::notZero, true, Assembler::pn, skip_compiled_code);
+    cmp_zero_and_br(Assembler::notZero, scratch, skip_compiled_code, true, Assembler::pn);
     delayed()->ld_ptr(G5_method, in_bytes(methodOopDesc::interpreter_entry_offset()), target);
     bind(skip_compiled_code);
   }
@@ -641,8 +631,7 @@
 #ifdef ASSERT
   {
     Label ok;
-    br_notnull(target, false, Assembler::pt, ok);
-    delayed()->nop();
+    br_notnull_short(target, Assembler::pt, ok);
     stop("null entry point");
     bind(ok);
   }
@@ -982,8 +971,7 @@
 
   // Don't unlock anything if the _do_not_unlock_if_synchronized flag
   // is set.
-  tstbool(G1_scratch);
-  br(Assembler::notZero, false, pn, no_unlock);
+  cmp_zero_and_br(Assembler::notZero, G1_scratch, no_unlock);
   delayed()->nop();
 
   // BasicObjectLock will be first in list, since this is a synchronized method. However, need
@@ -997,8 +985,7 @@
   add( top_most_monitor(), O1 );
 
   ld_ptr(O1, BasicObjectLock::obj_offset_in_bytes(), G3_scratch);
-  br_notnull(G3_scratch, false, pt, unlock);
-  delayed()->nop();
+  br_notnull_short(G3_scratch, pt, unlock);
 
   if (throw_monitor_exception) {
     // Entry already unlocked need to throw an exception
@@ -1011,8 +998,7 @@
     if (install_monitor_exception) {
       MacroAssembler::call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::new_illegal_monitor_state_exception));
     }
-    ba(false, unlocked);
-    delayed()->nop();
+    ba_short(unlocked);
   }
 
   bind(unlock);
@@ -1037,15 +1023,13 @@
     add(top_most_monitor(), Rmptr, delta);
     { Label L;
       // ensure that Rmptr starts out above (or at) Rlimit
-      cmp(Rmptr, Rlimit);
-      brx(Assembler::greaterEqualUnsigned, false, pn, L);
-      delayed()->nop();
+      cmp_and_brx_short(Rmptr, Rlimit, Assembler::greaterEqualUnsigned, pn, L);
       stop("monitor stack has negative size");
       bind(L);
     }
     #endif
     bind(restart);
-    ba(false, entry);
+    ba(entry);
     delayed()->
     add(top_most_monitor(), Rmptr, delta);      // points to current entry, starting with bottom-most entry
 
@@ -1061,8 +1045,7 @@
       if (install_monitor_exception) {
         MacroAssembler::call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::new_illegal_monitor_state_exception));
       }
-      ba(false, restart);
-      delayed()->nop();
+      ba_short(restart);
     }
 
     bind(loop);
@@ -1073,9 +1056,7 @@
     #ifdef ASSERT
     { Label L;
       // ensure that Rmptr has not somehow stepped below Rlimit
-      cmp(Rmptr, Rlimit);
-      brx(Assembler::greaterEqualUnsigned, false, pn, L);
-      delayed()->nop();
+      cmp_and_brx_short(Rmptr, Rlimit, Assembler::greaterEqualUnsigned, pn, L);
       stop("ran off the end of the monitor stack");
       bind(L);
     }
@@ -1196,9 +1177,7 @@
       (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr());
 
     // if the compare and exchange succeeded we are done (we saw an unlocked object)
-    cmp(mark_reg, temp_reg);
-    brx(Assembler::equal, true, Assembler::pt, done);
-    delayed()->nop();
+    cmp_and_brx_short(mark_reg, temp_reg, Assembler::equal, Assembler::pt, done);
 
     // We did not see an unlocked object so try the fast recursive case
 
@@ -1324,13 +1303,7 @@
 
 void InterpreterMacroAssembler::test_method_data_pointer(Label& zero_continue) {
   assert(ProfileInterpreter, "must be profiling interpreter");
-#ifdef _LP64
-  bpr(Assembler::rc_z, false, Assembler::pn, ImethodDataPtr, zero_continue);
-#else
-  tst(ImethodDataPtr);
-  br(Assembler::zero, false, Assembler::pn, zero_continue);
-#endif
-  delayed()->nop();
+  br_null_short(ImethodDataPtr, Assembler::pn, zero_continue);
 }
 
 void InterpreterMacroAssembler::verify_method_data_pointer() {
@@ -1376,31 +1349,18 @@
   Label done;
 
   // if no method data exists, and the counter is high enough, make one
-#ifdef _LP64
-  bpr(Assembler::rc_nz, false, Assembler::pn, ImethodDataPtr, done);
-#else
-  tst(ImethodDataPtr);
-  br(Assembler::notZero, false, Assembler::pn, done);
-#endif
+  br_notnull_short(ImethodDataPtr, Assembler::pn, done);
 
   // Test to see if we should create a method data oop
   AddressLiteral profile_limit((address) &InvocationCounter::InterpreterProfileLimit);
-#ifdef _LP64
-  delayed()->nop();
   sethi(profile_limit, Rtmp);
-#else
-  delayed()->sethi(profile_limit, Rtmp);
-#endif
   ld(Rtmp, profile_limit.low10(), Rtmp);
-  cmp(invocation_count, Rtmp);
-  br(Assembler::lessUnsigned, false, Assembler::pn, profile_continue);
-  delayed()->nop();
+  cmp_and_br_short(invocation_count, Rtmp, Assembler::lessUnsigned, Assembler::pn, profile_continue);
 
   // Build it now.
   call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::profile_method));
   set_method_data_pointer_for_bcp();
-  ba(false, profile_continue);
-  delayed()->nop();
+  ba_short(profile_continue);
   bind(done);
 }
 
@@ -1632,13 +1592,10 @@
     Label skip_receiver_profile;
     if (receiver_can_be_null) {
       Label not_null;
-      tst(receiver);
-      brx(Assembler::notZero, false, Assembler::pt, not_null);
-      delayed()->nop();
+      br_notnull_short(receiver, Assembler::pt, not_null);
       // We are making a call.  Increment the count for null receiver.
       increment_mdp_data_at(in_bytes(CounterData::count_offset()), scratch);
-      ba(false, skip_receiver_profile);
-      delayed()->nop();
+      ba_short(skip_receiver_profile);
       bind(not_null);
     }
 
@@ -1682,8 +1639,7 @@
     // The receiver is receiver[n].  Increment count[n].
     int count_offset = in_bytes(VirtualCallData::receiver_count_offset(row));
     increment_mdp_data_at(count_offset, scratch);
-    ba(false, done);
-    delayed()->nop();
+    ba_short(done);
     bind(next_test);
 
     if (test_for_null_also) {
@@ -1697,8 +1653,7 @@
           // Receiver did not match any saved receiver and there is no empty row for it.
           // Increment total counter to indicate polymorphic case.
           increment_mdp_data_at(in_bytes(CounterData::count_offset()), scratch);
-          ba(false, done);
-          delayed()->nop();
+          ba_short(done);
           bind(found_null);
         } else {
           brx(Assembler::notZero, false, Assembler::pt, done);
@@ -1729,8 +1684,7 @@
   mov(DataLayout::counter_increment, scratch);
   set_mdp_data_at(count_offset, scratch);
   if (start_row > 0) {
-    ba(false, done);
-    delayed()->nop();
+    ba_short(done);
   }
 }
 
@@ -1772,8 +1726,7 @@
 
       // The method data pointer needs to be updated to reflect the new target.
       update_mdp_by_offset(in_bytes(RetData::bci_displacement_offset(row)), scratch);
-      ba(false, profile_continue);
-      delayed()->nop();
+      ba_short(profile_continue);
       bind(next_test);
     }
 
@@ -1922,8 +1875,8 @@
 
     // untested("monitor stack expansion");
     compute_stack_base(Rtemp);
-    ba( false, start_copying );
-    delayed()->cmp( Rtemp, Rlimit); // done? duplicated below
+    ba(start_copying);
+    delayed()->cmp(Rtemp, Rlimit); // done? duplicated below
 
     // note: must copy from low memory upwards
     // On entry to loop,
@@ -2010,9 +1963,7 @@
   // untested("reg area corruption");
   add(Rindex, offset, Rscratch);
   add(Rlimit, 64 + STACK_BIAS, Rscratch1);
-  cmp(Rscratch, Rscratch1);
-  brx(Assembler::greaterEqualUnsigned, false, pn, L);
-  delayed()->nop();
+  cmp_and_brx_short(Rscratch, Rscratch1, Assembler::greaterEqualUnsigned, pn, L);
   stop("regsave area is being clobbered");
   bind(L);
 }
@@ -2174,9 +2125,7 @@
 
   AddressLiteral limit(&InvocationCounter::InterpreterBackwardBranchLimit);
   load_contents(limit, Rtmp);
-  cmp(backedge_count, Rtmp);
-  br(Assembler::lessUnsigned, false, Assembler::pt, did_not_overflow);
-  delayed()->nop();
+  cmp_and_br_short(backedge_count, Rtmp, Assembler::lessUnsigned, Assembler::pt, did_not_overflow);
 
   // When ProfileInterpreter is on, the backedge_count comes from the
   // methodDataOop, which value does not get reset on the call to
@@ -2196,15 +2145,11 @@
 
   // Was an OSR adapter generated?
   // O0 = osr nmethod
-  tst(O0);
-  brx(Assembler::zero, false, Assembler::pn, overflow_with_error);
-  delayed()->nop();
+  br_null_short(O0, Assembler::pn, overflow_with_error);
 
   // Has the nmethod been invalidated already?
   ld(O0, nmethod::entry_bci_offset(), O2);
-  cmp(O2, InvalidOSREntryBci);
-  br(Assembler::equal, false, Assembler::pn, overflow_with_error);
-  delayed()->nop();
+  cmp_and_br_short(O2, InvalidOSREntryBci, Assembler::equal, Assembler::pn, overflow_with_error);
 
   // migrate the interpreter frame off of the stack
 
@@ -2270,8 +2215,7 @@
   mov(reg, Rtmp);
   const int log2_bytecode_size_limit = 16;
   srl(Rtmp, log2_bytecode_size_limit, Rtmp);
-  br_notnull( Rtmp, false, pt, test );
-  delayed()->nop();
+  br_notnull_short( Rtmp, pt, test );
 
   // %%% should use call_VM_leaf here?
   save_frame_and_mov(0, Lmethod, O0, reg, O1);
@@ -2320,9 +2264,7 @@
     Register temp_reg = O5;
     const Address interp_only(G2_thread, JavaThread::interp_only_mode_offset());
     ld(interp_only, temp_reg);
-    tst(temp_reg);
-    br(zero, false, pt, L);
-    delayed()->nop();
+    cmp_and_br_short(temp_reg, 0, equal, pt, L);
     call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::post_method_entry));
     bind(L);
   }
@@ -2372,9 +2314,7 @@
     Register temp_reg = O5;
     const Address interp_only(G2_thread, JavaThread::interp_only_mode_offset());
     ld(interp_only, temp_reg);
-    tst(temp_reg);
-    br(zero, false, pt, L);
-    delayed()->nop();
+    cmp_and_br_short(temp_reg, 0, equal, pt, L);
 
     // Note: frame::interpreter_frame_result has a dependency on how the
     // method result is saved across the call to post_method_exit. For
--- a/src/cpu/sparc/vm/interpreter_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/interpreter_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -191,22 +191,19 @@
     // Optimization, see if there are any more args and get out prior to checking
     // all 16 float registers.  My guess is that this is rare.
     // If is_register is false, then we are done the first six integer args.
-      __ tst(G4_scratch);
-      __ brx(Assembler::zero, false, Assembler::pt, done);
-      __ delayed()->nop();
-
+      __ br_null_short(G4_scratch, Assembler::pt, done);
     }
-    __ ba(false, NextArg);
+    __ ba(NextArg);
     __ delayed()->srl( G4_scratch, 2, G4_scratch );
 
     __ bind(LoadFloatArg);
     __ ldf( FloatRegisterImpl::S, a, ldarg.as_float_register(), 4);
-    __ ba(false, NextArg);
+    __ ba(NextArg);
     __ delayed()->srl( G4_scratch, 2, G4_scratch );
 
     __ bind(LoadDoubleArg);
     __ ldf( FloatRegisterImpl::D, a, ldarg.as_double_register() );
-    __ ba(false, NextArg);
+    __ ba(NextArg);
     __ delayed()->srl( G4_scratch, 2, G4_scratch );
 
     __ bind(NextArg);
@@ -234,8 +231,7 @@
   __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::frequency_counter_overflow), O2, O2, true);
   // returns verified_entry_point or NULL
   // we ignore it in any case
-  __ ba(false, Lcontinue);
-  __ delayed()->nop();
+  __ ba_short(Lcontinue);
 
 }
 
--- a/src/cpu/sparc/vm/methodHandles_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/methodHandles_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -287,9 +287,7 @@
   BLOCK_COMMENT("verify_clean {");
   // Magic numbers must check out:
   __ set((int32_t) MAGIC_NUMBER_1, O7_temp);
-  __ cmp(O7_temp, L0_magic_number_1);
-  __ br(Assembler::equal, false, Assembler::pt, L_ok_1);
-  __ delayed()->nop();
+  __ cmp_and_br_short(O7_temp, L0_magic_number_1, Assembler::equal, Assembler::pt, L_ok_1);
   __ stop("damaged ricochet frame: MAGIC_NUMBER_1 not found");
 
   __ BIND(L_ok_1);
@@ -301,9 +299,7 @@
 #else
   Register FP_temp = FP;
 #endif
-  __ cmp(L4_saved_args_base, FP_temp);
-  __ br(Assembler::greaterEqualUnsigned, false, Assembler::pt, L_ok_2);
-  __ delayed()->nop();
+  __ cmp_and_brx_short(L4_saved_args_base, FP_temp, Assembler::greaterEqualUnsigned, Assembler::pt, L_ok_2);
   __ stop("damaged ricochet frame: L4 < FP");
 
   __ BIND(L_ok_2);
@@ -316,15 +312,11 @@
 
   __ BIND(L_ok_3);
   extract_conversion_dest_type(_masm, L5_conversion, O7_temp);
-  __ cmp(O7_temp, T_VOID);
-  __ br(Assembler::equal, false, Assembler::pt, L_ok_4);
-  __ delayed()->nop();
+  __ cmp_and_br_short(O7_temp, T_VOID, Assembler::equal, Assembler::pt, L_ok_4);
   extract_conversion_vminfo(_masm, L5_conversion, O5_temp);
   __ ld_ptr(L4_saved_args_base, __ argument_offset(O5_temp, O5_temp), O7_temp);
   assert(__ is_simm13(RETURN_VALUE_PLACEHOLDER), "must be simm13");
-  __ cmp(O7_temp, (int32_t) RETURN_VALUE_PLACEHOLDER);
-  __ brx(Assembler::equal, false, Assembler::pt, L_ok_4);
-  __ delayed()->nop();
+  __ cmp_and_brx_short(O7_temp, (int32_t) RETURN_VALUE_PLACEHOLDER, Assembler::equal, Assembler::pt, L_ok_4);
   __ stop("damaged ricochet frame: RETURN_VALUE_PLACEHOLDER not found");
   __ BIND(L_ok_4);
   BLOCK_COMMENT("} verify_clean");
@@ -363,9 +355,7 @@
   if (VerifyMethodHandles) {
     Label L_ok, L_bad;
     int32_t stack_move_limit = 0x0800;  // extra-large
-    __ cmp(stack_move_reg, stack_move_limit);
-    __ br(Assembler::greaterEqual, false, Assembler::pn, L_bad);
-    __ delayed()->nop();
+    __ cmp_and_br_short(stack_move_reg, stack_move_limit, Assembler::greaterEqual, Assembler::pn, L_bad);
     __ cmp(stack_move_reg, -stack_move_limit);
     __ br(Assembler::greater, false, Assembler::pt, L_ok);
     __ delayed()->nop();
@@ -401,13 +391,9 @@
   // Verify that argslot lies within (Gargs, FP].
   Label L_ok, L_bad;
   BLOCK_COMMENT("verify_argslot {");
+  __ cmp_and_brx_short(Gargs, argslot_reg, Assembler::greaterUnsigned, Assembler::pn, L_bad);
   __ add(FP, STACK_BIAS, temp_reg);  // STACK_BIAS is zero on !_LP64
-  __ cmp(argslot_reg, temp_reg);
-  __ brx(Assembler::greaterUnsigned, false, Assembler::pn, L_bad);
-  __ delayed()->nop();
-  __ cmp(Gargs, argslot_reg);
-  __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, L_ok);
-  __ delayed()->nop();
+  __ cmp_and_brx_short(argslot_reg, temp_reg, Assembler::lessEqualUnsigned, Assembler::pt, L_ok);
   __ BIND(L_bad);
   __ stop(error_message);
   __ BIND(L_ok);
@@ -434,14 +420,10 @@
   }
   __ add(arg_slot_base_reg, __ argument_offset(arg_slots, temp_reg), temp_reg);
   __ add(FP, STACK_BIAS, temp2_reg);  // STACK_BIAS is zero on !_LP64
-  __ cmp(temp_reg, temp2_reg);
-  __ brx(Assembler::greaterUnsigned, false, Assembler::pn, L_bad);
-  __ delayed()->nop();
+  __ cmp_and_brx_short(temp_reg, temp2_reg, Assembler::greaterUnsigned, Assembler::pn, L_bad);
   // Gargs points to the first word so adjust by BytesPerWord
   __ add(arg_slot_base_reg, BytesPerWord, temp_reg);
-  __ cmp(Gargs, temp_reg);
-  __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, L_ok);
-  __ delayed()->nop();
+  __ cmp_and_brx_short(Gargs, temp_reg, Assembler::lessEqualUnsigned, Assembler::pt, L_ok);
   __ BIND(L_bad);
   __ stop(error_message);
   __ BIND(L_ok);
@@ -502,21 +484,16 @@
   Label L_ok, L_bad;
   BLOCK_COMMENT("verify_klass {");
   __ verify_oop(obj_reg);
-  __ br_null(obj_reg, false, Assembler::pn, L_bad);
-  __ delayed()->nop();
+  __ br_null_short(obj_reg, Assembler::pn, L_bad);
   __ load_klass(obj_reg, temp_reg);
   __ set(ExternalAddress(klass_addr), temp2_reg);
   __ ld_ptr(Address(temp2_reg, 0), temp2_reg);
-  __ cmp(temp_reg, temp2_reg);
-  __ brx(Assembler::equal, false, Assembler::pt, L_ok);
-  __ delayed()->nop();
+  __ cmp_and_brx_short(temp_reg, temp2_reg, Assembler::equal, Assembler::pt, L_ok);
   intptr_t super_check_offset = klass->super_check_offset();
   __ ld_ptr(Address(temp_reg, super_check_offset), temp_reg);
   __ set(ExternalAddress(klass_addr), temp2_reg);
   __ ld_ptr(Address(temp2_reg, 0), temp2_reg);
-  __ cmp(temp_reg, temp2_reg);
-  __ brx(Assembler::equal, false, Assembler::pt, L_ok);
-  __ delayed()->nop();
+  __ cmp_and_brx_short(temp_reg, temp2_reg, Assembler::equal, Assembler::pt, L_ok);
   __ BIND(L_bad);
   __ stop(error_message);
   __ BIND(L_ok);
@@ -671,9 +648,7 @@
 #ifdef ASSERT
     {
       Label L_ok;
-      __ cmp(arg_slots.as_register(), 0);
-      __ br(Assembler::greaterEqual, false, Assembler::pt, L_ok);
-      __ delayed()->nop();
+      __ cmp_and_br_short(arg_slots.as_register(), 0, Assembler::greaterEqual, Assembler::pt, L_ok);
       __ stop("negative arg_slots");
       __ bind(L_ok);
     }
@@ -748,9 +723,7 @@
     __ ld_ptr(           Address(temp_reg, 0     ), temp2_reg);
     __ st_ptr(temp2_reg, Address(temp_reg, offset)           );
     __ add(temp_reg, wordSize, temp_reg);
-    __ cmp(temp_reg, argslot_reg);
-    __ brx(Assembler::lessUnsigned, false, Assembler::pt, loop);
-    __ delayed()->nop();  // FILLME
+    __ cmp_and_brx_short(temp_reg, argslot_reg, Assembler::lessUnsigned, Assembler::pt, loop);
   }
 
   // Now move the argslot down, to point to the opened-up space.
@@ -797,9 +770,7 @@
     __ ld_ptr(           Address(temp_reg, 0     ), temp2_reg);
     __ st_ptr(temp2_reg, Address(temp_reg, offset)           );
     __ sub(temp_reg, wordSize, temp_reg);
-    __ cmp(temp_reg, Gargs);
-    __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, L_loop);
-    __ delayed()->nop();  // FILLME
+    __ cmp_and_brx_short(temp_reg, Gargs, Assembler::greaterEqualUnsigned, Assembler::pt, L_loop);
   }
 
   // And adjust the argslot address to point at the deletion point.
@@ -848,8 +819,7 @@
     __ delayed()->nop();
     __ ld_ptr(          Address(argslot_reg, 0), temp_reg);
     __ st_ptr(temp_reg, Address(Gargs,       0));
-    __ ba(false, L_break);
-    __ delayed()->nop();  // FILLME
+    __ ba_short(L_break);
     __ BIND(L_plural);
 
     // Loop for 2 or more:
@@ -863,9 +833,7 @@
     __ sub(Gargs,   wordSize, Gargs  );
     __ ld_ptr(           Address(top_reg, 0), temp2_reg);
     __ st_ptr(temp2_reg, Address(Gargs,   0));
-    __ cmp(top_reg, argslot_reg);
-    __ brx(Assembler::greaterUnsigned, false, Assembler::pt, L_loop);
-    __ delayed()->nop();  // FILLME
+    __ cmp_and_brx_short(top_reg, argslot_reg, Assembler::greaterUnsigned, Assembler::pt, L_loop);
     __ BIND(L_break);
   }
   BLOCK_COMMENT("} push_arg_slots");
@@ -897,17 +865,13 @@
       __ br(Assembler::lessEqual, false, Assembler::pn, L_bad);
       __ delayed()->nop();
     }
-    __ cmp(bottom_reg, top_reg);
-    __ brx(Assembler::lessUnsigned, false, Assembler::pt, L_ok);
-    __ delayed()->nop();
+    __ cmp_and_brx_short(bottom_reg, top_reg, Assembler::lessUnsigned, Assembler::pt, L_ok);
     __ BIND(L_bad);
     __ stop("valid bounds (copy up)");
     __ BIND(L_ok);
   }
 #endif
-  __ cmp(bottom_reg, top_reg);
-  __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pn, L_break);
-  __ delayed()->nop();
+  __ cmp_and_brx_short(bottom_reg, top_reg, Assembler::greaterEqualUnsigned, Assembler::pn, L_break);
   // work top down to bottom, copying contiguous data upwards
   // In pseudo-code:
   //   while (--top >= bottom) *(top + distance) = *(top + 0);
@@ -916,9 +880,7 @@
   __ sub(top_reg, wordSize, top_reg);
   __ ld_ptr(           Address(top_reg, 0     ), temp2_reg);
   __ st_ptr(temp2_reg, Address(top_reg, offset)           );
-  __ cmp(top_reg, bottom_reg);
-  __ brx(Assembler::greaterUnsigned, false, Assembler::pt, L_loop);
-  __ delayed()->nop();  // FILLME
+  __ cmp_and_brx_short(top_reg, bottom_reg, Assembler::greaterUnsigned, Assembler::pt, L_loop);
   assert(Interpreter::stackElementSize == wordSize, "else change loop");
   __ BIND(L_break);
   BLOCK_COMMENT("} move_arg_slots_up");
@@ -951,17 +913,13 @@
       __ br(Assembler::greaterEqual, false, Assembler::pn, L_bad);
       __ delayed()->nop();
     }
-    __ cmp(bottom_reg, top_reg);
-    __ brx(Assembler::lessUnsigned, false, Assembler::pt, L_ok);
-    __ delayed()->nop();
+    __ cmp_and_brx_short(bottom_reg, top_reg, Assembler::lessUnsigned, Assembler::pt, L_ok);
     __ BIND(L_bad);
     __ stop("valid bounds (copy down)");
     __ BIND(L_ok);
   }
 #endif
-  __ cmp(bottom_reg, top_reg);
-  __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pn, L_break);
-  __ delayed()->nop();
+  __ cmp_and_brx_short(bottom_reg, top_reg, Assembler::greaterEqualUnsigned, Assembler::pn, L_break);
   // work bottom up to top, copying contiguous data downwards
   // In pseudo-code:
   //   while (bottom < top) *(bottom - distance) = *(bottom + 0), bottom++;
@@ -970,9 +928,7 @@
   __ ld_ptr(           Address(bottom_reg, 0     ), temp2_reg);
   __ st_ptr(temp2_reg, Address(bottom_reg, offset)           );
   __ add(bottom_reg, wordSize, bottom_reg);
-  __ cmp(bottom_reg, top_reg);
-  __ brx(Assembler::lessUnsigned, false, Assembler::pt, L_loop);
-  __ delayed()->nop();  // FILLME
+  __ cmp_and_brx_short(bottom_reg, top_reg, Assembler::lessUnsigned, Assembler::pt, L_loop);
   assert(Interpreter::stackElementSize == wordSize, "else change loop");
   __ BIND(L_break);
   BLOCK_COMMENT("} move_arg_slots_down");
@@ -1329,9 +1285,7 @@
 
       Label L_done;
       __ ld_ptr(vmarg, O2_scratch);
-      __ tst(O2_scratch);
-      __ brx(Assembler::zero, false, Assembler::pn, L_done);  // No cast if null.
-      __ delayed()->nop();
+      __ br_null_short(O2_scratch, Assembler::pn, L_done);  // No cast if null.
       __ load_klass(O2_scratch, O2_scratch);
 
       // Live at this point:
@@ -1436,8 +1390,7 @@
 
       // this path is taken for int->byte, int->short
       __ sra(O1_scratch, G5_vminfo, O1_scratch);
-      __ ba(false, done);
-      __ delayed()->nop();
+      __ ba_short(done);
 
       __ bind(zero_extend);
       // this is taken for int->char
@@ -1860,9 +1813,7 @@
           BLOCK_COMMENT("verify collect_count_constant {");
           __ load_method_handle_vmslots(O3_scratch, G3_method_handle, O2_scratch);
           Label L_count_ok;
-          __ cmp(O3_scratch, collect_count_constant);
-          __ br(Assembler::equal, false, Assembler::pt, L_count_ok);
-          __ delayed()->nop();
+          __ cmp_and_br_short(O3_scratch, collect_count_constant, Assembler::equal, Assembler::pt, L_count_ok);
           __ stop("bad vminfo in AMH.conv");
           __ BIND(L_count_ok);
           BLOCK_COMMENT("} verify collect_count_constant");
@@ -1909,9 +1860,7 @@
           BLOCK_COMMENT("verify dest_slot_constant {");
           extract_conversion_vminfo(_masm, RicochetFrame::L5_conversion, O3_scratch);
           Label L_vminfo_ok;
-          __ cmp(O3_scratch, dest_slot_constant);
-          __ br(Assembler::equal, false, Assembler::pt, L_vminfo_ok);
-          __ delayed()->nop();
+          __ cmp_and_br_short(O3_scratch, dest_slot_constant, Assembler::equal, Assembler::pt, L_vminfo_ok);
           __ stop("bad vminfo in AMH.conv");
           __ BIND(L_vminfo_ok);
           BLOCK_COMMENT("} verify dest_slot_constant");
@@ -1951,14 +1900,10 @@
       // If there are variable parameters, use dynamic checks to skip around the whole mess.
       Label L_done;
       if (keep3_count.is_register()) {
-        __ tst(keep3_count.as_register());
-        __ br(Assembler::zero, false, Assembler::pn, L_done);
-        __ delayed()->nop();
+        __ cmp_and_br_short(keep3_count.as_register(), 0, Assembler::equal, Assembler::pn, L_done);
       }
       if (close_count.is_register()) {
-        __ cmp(close_count.as_register(), open_count);
-        __ br(Assembler::equal, false, Assembler::pn, L_done);
-        __ delayed()->nop();
+        __ cmp_and_br_short(close_count.as_register(), open_count, Assembler::equal, Assembler::pn, L_done);
       }
 
       if (move_keep3 && fix_arg_base) {
@@ -1999,8 +1944,7 @@
         }
 
         if (emit_guard) {
-          __ ba(false, L_done);  // assumes emit_move_up is true also
-          __ delayed()->nop();
+          __ ba_short(L_done);  // assumes emit_move_up is true also
           __ BIND(L_move_up);
         }
 
@@ -2133,8 +2077,7 @@
 
 #ifdef ASSERT
       { Label L_ok;
-        __ br_notnull(O7_temp, false, Assembler::pt, L_ok);
-        __ delayed()->nop();
+        __ br_notnull_short(O7_temp, Assembler::pt, L_ok);
         __ stop("bad method handle return");
         __ BIND(L_ok);
       }
@@ -2192,11 +2135,10 @@
         Label L_skip;
         if (length_constant < 0) {
           load_conversion_vminfo(_masm, G3_amh_conversion, O3_scratch);
-          __ br_zero(Assembler::notZero, false, Assembler::pn, O3_scratch, L_skip);
-          __ delayed()->nop();
+          __ cmp_zero_and_br(Assembler::notZero, O3_scratch, L_skip);
+          __ delayed()->nop(); // to avoid back-to-back cbcond instructions
         }
-        __ br_null(O1_array, false, Assembler::pn, L_array_is_empty);
-        __ delayed()->nop();
+        __ br_null_short(O1_array, Assembler::pn, L_array_is_empty);
         __ BIND(L_skip);
       }
       __ null_check(O1_array, oopDesc::klass_offset_in_bytes());
@@ -2210,8 +2152,7 @@
       Label L_ok_array_klass, L_bad_array_klass, L_bad_array_length;
       __ check_klass_subtype(O2_array_klass, O3_klass, O4_scratch, G5_scratch, L_ok_array_klass);
       // If we get here, the type check failed!
-      __ ba(false, L_bad_array_klass);
-      __ delayed()->nop();
+      __ ba_short(L_bad_array_klass);
       __ BIND(L_ok_array_klass);
 
       // Check length.
@@ -2247,8 +2188,7 @@
         __ BIND(L_array_is_empty);
         remove_arg_slots(_masm, -stack_move_unit() * array_slots,
                          O0_argslot, O1_scratch, O2_scratch, O3_scratch);
-        __ ba(false, L_args_done);  // no spreading to do
-        __ delayed()->nop();
+        __ ba_short(L_args_done);  // no spreading to do
         __ BIND(L_insert_arg_space);
         // come here in the usual case, stack_move < 0 (2 or more spread arguments)
         // Live: O1_array, O2_argslot_limit, O3_stack_move
@@ -2289,9 +2229,7 @@
                        Address(O1_source, 0), Address(O4_fill_ptr, 0),
                        O2_scratch);  // must be an even register for !_LP64 long moves (uses O2/O3)
         __ add(O1_source, type2aelembytes(elem_type), O1_source);
-        __ cmp(O4_fill_ptr, O0_argslot);
-        __ brx(Assembler::greaterUnsigned, false, Assembler::pt, L_loop);
-        __ delayed()->nop();  // FILLME
+        __ cmp_and_brx_short(O4_fill_ptr, O0_argslot, Assembler::greaterUnsigned, Assembler::pt, L_loop);
       } else if (length_constant == 0) {
         // nothing to copy
       } else {
--- a/src/cpu/sparc/vm/sharedRuntime_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/sharedRuntime_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -600,7 +600,7 @@
 void AdapterGenerator::patch_callers_callsite() {
   Label L;
   __ ld_ptr(G5_method, in_bytes(methodOopDesc::code_offset()), G3_scratch);
-  __ br_null(G3_scratch, false, __ pt, L);
+  __ br_null(G3_scratch, false, Assembler::pt, L);
   // Schedule the branch target address early.
   __ delayed()->ld_ptr(G5_method, in_bytes(methodOopDesc::interpreter_entry_offset()), G3_scratch);
   // Call into the VM to patch the caller, then jump to compiled callee
@@ -1127,8 +1127,7 @@
       Label loop;
       __ bind(loop);
       __ sub(L0, 1, L0);
-      __ br_null(L0, false, Assembler::pt, loop);
-      __ delayed()->nop();
+      __ br_null_short(L0, Assembler::pt, loop);
 
       __ restore();
     }
@@ -1202,7 +1201,7 @@
     // the call site corrected.
     __ ld_ptr(G5_method, in_bytes(methodOopDesc::code_offset()), G3_scratch);
     __ bind(ok2);
-    __ br_null(G3_scratch, false, __ pt, skip_fixup);
+    __ br_null(G3_scratch, false, Assembler::pt, skip_fixup);
     __ delayed()->ld_ptr(G5_method, in_bytes(methodOopDesc::interpreter_entry_offset()), G3_scratch);
     __ jump_to(ic_miss, G3_scratch);
     __ delayed()->nop();
@@ -1779,9 +1778,7 @@
     AddressLiteral ic_miss(SharedRuntime::get_ic_miss_stub());
     __ verify_oop(O0);
     __ load_klass(O0, temp_reg);
-    __ cmp(temp_reg, G5_inline_cache_reg);
-    __ brx(Assembler::equal, true, Assembler::pt, L);
-    __ delayed()->nop();
+    __ cmp_and_brx_short(temp_reg, G5_inline_cache_reg, Assembler::equal, Assembler::pt, L);
 
     __ jump_to(ic_miss, temp_reg);
     __ delayed()->nop();
@@ -2182,8 +2179,7 @@
 #ifdef ASSERT
     { Label L;
     __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), O0);
-    __ br_null(O0, false, Assembler::pt, L);
-    __ delayed()->nop();
+    __ br_null_short(O0, Assembler::pt, L);
     __ stop("no pending exception allowed on exit from IR::monitorenter");
     __ bind(L);
     }
@@ -2298,9 +2294,7 @@
     Address suspend_state(G2_thread, JavaThread::suspend_flags_offset());
     __ br(Assembler::notEqual, false, Assembler::pn, L);
     __ delayed()->ld(suspend_state, G3_scratch);
-    __ cmp(G3_scratch, 0);
-    __ br(Assembler::equal, false, Assembler::pt, no_block);
-    __ delayed()->nop();
+    __ cmp_and_br_short(G3_scratch, 0, Assembler::equal, Assembler::pt, no_block);
     __ bind(L);
 
     // Block.  Save any potential method result value before the operation and
@@ -2328,9 +2322,7 @@
 
   Label no_reguard;
   __ ld(G2_thread, JavaThread::stack_guard_state_offset(), G3_scratch);
-  __ cmp(G3_scratch, JavaThread::stack_guard_yellow_disabled);
-  __ br(Assembler::notEqual, false, Assembler::pt, no_reguard);
-  __ delayed()->nop();
+  __ cmp_and_br_short(G3_scratch, JavaThread::stack_guard_yellow_disabled, Assembler::notEqual, Assembler::pt, no_reguard);
 
     save_native_result(masm, ret_type, stack_slots);
   __ call(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages));
@@ -2382,8 +2374,7 @@
 #ifdef ASSERT
     { Label L;
     __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), O0);
-    __ br_null(O0, false, Assembler::pt, L);
-    __ delayed()->nop();
+    __ br_null_short(O0, Assembler::pt, L);
     __ stop("no pending exception allowed on exit from IR::monitorexit");
     __ bind(L);
     }
@@ -2639,9 +2630,7 @@
     AddressLiteral ic_miss(SharedRuntime::get_ic_miss_stub());
     __ verify_oop(O0);
     __ ld_ptr(O0, oopDesc::klass_offset_in_bytes(), temp_reg);
-    __ cmp(temp_reg, G5_inline_cache_reg);
-    __ brx(Assembler::equal, true, Assembler::pt, L);
-    __ delayed()->nop();
+    __ cmp_and_brx_short(temp_reg, G5_inline_cache_reg, Assembler::equal, Assembler::pt, L);
 
     __ jump_to(ic_miss, temp_reg);
     __ delayed()->nop();
@@ -3143,8 +3132,7 @@
 
   gen_new_frame(masm, deopt);        // allocate an interpreter frame
 
-  __ tst(O4array_size);
-  __ br(Assembler::notZero, false, Assembler::pn, loop);
+  __ cmp_zero_and_br(Assembler::notZero, O4array_size, loop);
   __ delayed()->add(O3array, wordSize, O3array);
   __ ld_ptr(G3pcs, 0, O7);                      // load final frame new pc
 
@@ -3221,7 +3209,7 @@
   // pc is now in O7. Return values are still in the expected places
 
   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_words);
-  __ ba(false, cont);
+  __ ba(cont);
   __ delayed()->mov(Deoptimization::Unpack_deopt, L0deopt_mode);
 
   int exception_offset = __ offset() - start;
@@ -3256,8 +3244,7 @@
     // verify that there is really an exception oop in exception_oop
     Label has_exception;
     __ ld_ptr(G2_thread, JavaThread::exception_oop_offset(), Oexception);
-    __ br_notnull(Oexception, false, Assembler::pt, has_exception);
-    __ delayed()-> nop();
+    __ br_notnull_short(Oexception, Assembler::pt, has_exception);
     __ stop("no exception in thread");
     __ bind(has_exception);
 
@@ -3265,14 +3252,13 @@
     Label no_pending_exception;
     Address exception_addr(G2_thread, Thread::pending_exception_offset());
     __ ld_ptr(exception_addr, Oexception);
-    __ br_null(Oexception, false, Assembler::pt, no_pending_exception);
-    __ delayed()->nop();
+    __ br_null_short(Oexception, Assembler::pt, no_pending_exception);
     __ stop("must not have pending exception here");
     __ bind(no_pending_exception);
   }
 #endif
 
-  __ ba(false, cont);
+  __ ba(cont);
   __ delayed()->mov(Deoptimization::Unpack_exception, L0deopt_mode);;
 
   //
@@ -3313,9 +3299,7 @@
   RegisterSaver::restore_result_registers(masm);
 
   Label noException;
-  __ cmp(G4deopt_mode, Deoptimization::Unpack_exception);   // Was exception pending?
-  __ br(Assembler::notEqual, false, Assembler::pt, noException);
-  __ delayed()->nop();
+  __ cmp_and_br_short(G4deopt_mode, Deoptimization::Unpack_exception, Assembler::notEqual, Assembler::pt, noException);
 
   // Move the pending exception from exception_oop to Oexception so
   // the pending exception will be picked up the interpreter.
@@ -3359,9 +3343,7 @@
   // In 32 bit, C2 returns longs in G1 so restore the saved G1 into
   // I0/I1 if the return value is long.
   Label not_long;
-  __ cmp(O0,T_LONG);
-  __ br(Assembler::notEqual, false, Assembler::pt, not_long);
-  __ delayed()->nop();
+  __ cmp_and_br_short(O0,T_LONG, Assembler::notEqual, Assembler::pt, not_long);
   __ ldd(saved_Greturn1_addr,I0);
   __ bind(not_long);
 #endif
@@ -3534,9 +3516,7 @@
   Label pending;
 
   __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), O1);
-  __ tst(O1);
-  __ brx(Assembler::notEqual, true, Assembler::pn, pending);
-  __ delayed()->nop();
+  __ br_notnull_short(O1, Assembler::pn, pending);
 
   RegisterSaver::restore_live_registers(masm);
 
@@ -3623,9 +3603,7 @@
   Label pending;
 
   __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), O1);
-  __ tst(O1);
-  __ brx(Assembler::notEqual, true, Assembler::pn, pending);
-  __ delayed()->nop();
+  __ br_notnull_short(O1, Assembler::pn, pending);
 
   // get the returned methodOop
 
--- a/src/cpu/sparc/vm/sparc.ad	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/sparc.ad	Thu Jul 21 11:25:07 2011 -0700
@@ -1693,7 +1693,6 @@
 
 void MachUEPNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
   MacroAssembler _masm(&cbuf);
-  Label L;
   Register G5_ic_reg  = reg_to_register_object(Matcher::inline_cache_reg_encode());
   Register temp_reg   = G3;
   assert( G5_ic_reg != temp_reg, "conflicting registers" );
@@ -2315,60 +2314,23 @@
     __ delayed()->nop();
   %}
 
-  enc_class enc_bp( Label labl, cmpOp cmp, flagsReg cc ) %{
+  enc_class enc_bp( label labl, cmpOp cmp, flagsReg cc ) %{
     MacroAssembler _masm(&cbuf);
-    Label &L = *($labl$$label);
+    Label* L = $labl$$label;
     Assembler::Predict predict_taken =
-      cbuf.is_backward_branch(L) ? Assembler::pt : Assembler::pn;
-
-    __ bp( (Assembler::Condition)($cmp$$cmpcode), false, Assembler::icc, predict_taken, L);
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+
+    __ bp( (Assembler::Condition)($cmp$$cmpcode), false, Assembler::icc, predict_taken, *L);
     __ delayed()->nop();
   %}
 
-  enc_class enc_bpl( Label labl, cmpOp cmp, flagsRegL cc ) %{
+  enc_class enc_bpr( label labl, cmpOp_reg cmp, iRegI op1 ) %{
     MacroAssembler _masm(&cbuf);
-    Label &L = *($labl$$label);
+    Label* L = $labl$$label;
     Assembler::Predict predict_taken =
-      cbuf.is_backward_branch(L) ? Assembler::pt : Assembler::pn;
-
-    __ bp( (Assembler::Condition)($cmp$$cmpcode), false, Assembler::xcc, predict_taken, L);
-    __ delayed()->nop();
-  %}
-
-  enc_class enc_bpx( Label labl, cmpOp cmp, flagsRegP cc ) %{
-    MacroAssembler _masm(&cbuf);
-    Label &L = *($labl$$label);
-    Assembler::Predict predict_taken =
-      cbuf.is_backward_branch(L) ? Assembler::pt : Assembler::pn;
-
-    __ bp( (Assembler::Condition)($cmp$$cmpcode), false, Assembler::ptr_cc, predict_taken, L);
-    __ delayed()->nop();
-  %}
-
-  enc_class enc_fbp( Label labl, cmpOpF cmp, flagsRegF cc ) %{
-    MacroAssembler _masm(&cbuf);
-    Label &L = *($labl$$label);
-    Assembler::Predict predict_taken =
-      cbuf.is_backward_branch(L) ? Assembler::pt : Assembler::pn;
-
-    __ fbp( (Assembler::Condition)($cmp$$cmpcode), false, (Assembler::CC)($cc$$reg), predict_taken, L);
-    __ delayed()->nop();
-  %}
-
-  enc_class enc_ba( Label labl ) %{
-    MacroAssembler _masm(&cbuf);
-    Label &L = *($labl$$label);
-    __ ba(false, L);
-    __ delayed()->nop();
-  %}
-
-  enc_class enc_bpr( Label labl, cmpOp_reg cmp, iRegI op1 ) %{
-    MacroAssembler _masm(&cbuf);
-    Label &L = *$labl$$label;
-    Assembler::Predict predict_taken =
-      cbuf.is_backward_branch(L) ? Assembler::pt : Assembler::pn;
-
-    __ bpr( (Assembler::RCondition)($cmp$$cmpcode), false, predict_taken, as_Register($op1$$reg), L);
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+
+    __ bpr( (Assembler::RCondition)($cmp$$cmpcode), false, predict_taken, as_Register($op1$$reg), *L);
     __ delayed()->nop();
   %}
 
@@ -2986,7 +2948,7 @@
     __ brx(Assembler::equal, true, Assembler::pn, Ldone);
     __ delayed()->add(G0, 1, result_reg);
 
-    __ br_on_reg_cond(Assembler::rc_z, true, Assembler::pn, cnt_reg, Ldone);
+    __ cmp_zero_and_br(Assembler::zero, cnt_reg, Ldone, true, Assembler::pn);
     __ delayed()->add(G0, 1, result_reg); // count == 0
 
     //rename registers
@@ -3006,7 +2968,7 @@
     // Compare char[] arrays aligned to 4 bytes.
     __ char_arrays_equals(str1_reg, str2_reg, limit_reg, result_reg,
                           chr1_reg, chr2_reg, Ldone);
-    __ ba(false,Ldone);
+    __ ba(Ldone);
     __ delayed()->add(G0, 1, result_reg);
 
     // char by char compare
@@ -3065,7 +3027,7 @@
     __ br(Assembler::notEqual, true, Assembler::pn, Ldone);
     __ delayed()->mov(G0, result_reg);     // not equal
 
-    __ br_on_reg_cond(Assembler::rc_z, true, Assembler::pn, tmp1_reg, Ldone);
+    __ cmp_zero_and_br(Assembler::zero, tmp1_reg, Ldone, true, Assembler::pn);
     __ delayed()->add(G0, 1, result_reg); // zero-length arrays are equal
 
     // load array addresses
@@ -9232,9 +9194,11 @@
   size(8);
   ins_cost(BRANCH_COST);
   format %{ "BA     $labl" %}
-  // Prim = bits 24-22, Secnd = bits 31-30, Tert = cond
-  opcode(Assembler::br_op2, Assembler::branch_op, Assembler::always);
-  ins_encode( enc_ba( labl ) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ ba(*L);
+    __ delayed()->nop();
+  %}
   ins_pc_relative(1);
   ins_pipe(br);
 %}
@@ -9314,8 +9278,14 @@
   size(8);
   ins_cost(BRANCH_COST);
   format %{ "BP$cmp  $pcc,$labl" %}
-  // Prim = bits 24-22, Secnd = bits 31-30
-  ins_encode( enc_bpx( labl, cmp, pcc ) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+
+    __ bp( (Assembler::Condition)($cmp$$cmpcode), false, Assembler::ptr_cc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
   ins_pc_relative(1);
   ins_pipe(br_cc);
 %}
@@ -9327,8 +9297,14 @@
   size(8);
   ins_cost(BRANCH_COST);
   format %{ "FBP$cmp $fcc,$labl" %}
-  // Prim = bits 24-22, Secnd = bits 31-30
-  ins_encode( enc_fbp( labl, cmp, fcc ) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+
+    __ fbp( (Assembler::Condition)($cmp$$cmpcode), false, (Assembler::CC)($fcc$$reg), predict_taken, *L);
+    __ delayed()->nop();
+  %}
   ins_pc_relative(1);
   ins_pipe(br_fcc);
 %}
@@ -9387,8 +9363,14 @@
   size(8);
   ins_cost(BRANCH_COST);
   format %{ "BP$cmp   $xcc,$labl" %}
-  // Prim = bits 24-22, Secnd = bits 31-30
-  ins_encode( enc_bpl( labl, cmp, xcc ) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+
+    __ bp( (Assembler::Condition)($cmp$$cmpcode), false, Assembler::xcc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
   ins_pc_relative(1);
   ins_pipe(br_cc);
 %}
@@ -9707,7 +9689,6 @@
   effect(KILL scratch, TEMP scratch2);
   ins_cost(100);
 
-  size(4*112);       // conservative overestimation ...
   format %{ "FASTLOCK  $object, $box; KILL $scratch, $scratch2, $box" %}
   ins_encode( Fast_Lock(object, box, scratch, scratch2) );
   ins_pipe(long_memory_op);
@@ -9719,7 +9700,6 @@
   effect(KILL scratch, TEMP scratch2);
   ins_cost(100);
 
-  size(4*120);       // conservative overestimation ...
   format %{ "FASTUNLOCK  $object, $box; KILL $scratch, $scratch2, $box" %}
   ins_encode( Fast_Unlock(object, box, scratch, scratch2) );
   ins_pipe(long_memory_op);
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -150,8 +150,7 @@
     { const Register t = G3_scratch;
       Label L;
       __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
-      __ br_null(t, false, Assembler::pt, L);
-      __ delayed()->nop();
+      __ br_null_short(t, Assembler::pt, L);
       __ stop("StubRoutines::call_stub: entered with pending exception");
       __ bind(L);
     }
@@ -207,8 +206,7 @@
       Label exit;
       __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
       __ add( FP, STACK_BIAS, dst );
-      __ tst(cnt);
-      __ br(Assembler::zero, false, Assembler::pn, exit);
+      __ cmp_zero_and_br(Assembler::zero, cnt, exit);
       __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args
 
       // copy parameters if any
@@ -282,20 +280,20 @@
       __ delayed()->restore();
 
       __ BIND(is_object);
-      __ ba(false, exit);
+      __ ba(exit);
       __ delayed()->st_ptr(O0, addr, G0);
 
       __ BIND(is_float);
-      __ ba(false, exit);
+      __ ba(exit);
       __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
 
       __ BIND(is_double);
-      __ ba(false, exit);
+      __ ba(exit);
       __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
 
       __ BIND(is_long);
 #ifdef _LP64
-      __ ba(false, exit);
+      __ ba(exit);
       __ delayed()->st_long(O0, addr, G0);      // store entire long
 #else
 #if defined(COMPILER2)
@@ -307,11 +305,11 @@
   // do this here. Unfortunately if we did a rethrow we'd see an machepilog node
   // first which would move g1 -> O0/O1 and destroy the exception we were throwing.
 
-      __ ba(false, exit);
+      __ ba(exit);
       __ delayed()->stx(G1, addr, G0);  // store entire long
 #else
       __ st(O1, addr, BytesPerInt);
-      __ ba(false, exit);
+      __ ba(exit);
       __ delayed()->st(O0, addr, G0);
 #endif /* COMPILER2 */
 #endif /* _LP64 */
@@ -382,8 +380,7 @@
     // make sure that this code is only executed if there is a pending exception
     { Label L;
       __ ld_ptr(exception_addr, Gtemp);
-      __ br_notnull(Gtemp, false, Assembler::pt, L);
-      __ delayed()->nop();
+      __ br_notnull_short(Gtemp, Assembler::pt, L);
       __ stop("StubRoutines::forward exception: no pending exception (1)");
       __ bind(L);
     }
@@ -406,8 +403,7 @@
 #ifdef ASSERT
     // make sure exception is set
     { Label L;
-      __ br_notnull(Oexception, false, Assembler::pt, L);
-      __ delayed()->nop();
+      __ br_notnull_short(Oexception, Assembler::pt, L);
       __ stop("StubRoutines::forward exception: no pending exception (2)");
       __ bind(L);
     }
@@ -501,8 +497,7 @@
     Address exception_addr(G2_thread, Thread::pending_exception_offset());
     Register scratch_reg = Gtemp;
     __ ld_ptr(exception_addr, scratch_reg);
-    __ br_notnull(scratch_reg, false, Assembler::pt, L);
-    __ delayed()->nop();
+    __ br_notnull_short(scratch_reg, Assembler::pt, L);
     __ should_not_reach_here();
     __ bind(L);
 #endif // ASSERT
@@ -614,9 +609,7 @@
     __ mov(G0,yield_reg);
 
     __ BIND(retry);
-    __ cmp(yield_reg, V8AtomicOperationUnderLockSpinCount);
-    __ br(Assembler::less, false, Assembler::pt, dontyield);
-    __ delayed()->nop();
+    __ cmp_and_br_short(yield_reg, V8AtomicOperationUnderLockSpinCount, Assembler::less, Assembler::pt, dontyield);
 
     // This code can only be called from inside the VM, this
     // stub is only invoked from Atomic::add().  We do not
@@ -676,9 +669,7 @@
       // try to replace O2 with O3
       __ cas_under_lock(O1, O2, O3,
       (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
-      __ cmp(O2, O3);
-      __ br(Assembler::notEqual, false, Assembler::pn, retry);
-      __ delayed()->nop();
+      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
 
       __ retl(false);
       __ delayed()->mov(O2, O0);  // report previous value to caller
@@ -798,11 +789,9 @@
       __ BIND(retry);
 
       __ lduw(O1, 0, O2);
-      __ add(O0,   O2, O3);
-      __ cas(O1,   O2, O3);
-      __ cmp(      O2, O3);
-      __ br(Assembler::notEqual, false, Assembler::pn, retry);
-      __ delayed()->nop();
+      __ add(O0, O2, O3);
+      __ cas(O1, O2, O3);
+      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
       __ retl(false);
       __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
     } else {
@@ -1370,8 +1359,7 @@
 
     // copy tailing bytes
     __ BIND(L_copy_byte);
-      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
-      __ delayed()->nop();
+      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
       __ align(OptoLoopAlignment);
     __ BIND(L_copy_byte_loop);
       __ ldub(from, offset, O3);
@@ -1482,8 +1470,7 @@
 
     // copy 1 element (2 bytes) at a time
     __ BIND(L_copy_byte);
-      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
-      __ delayed()->nop();
+      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
       __ align(OptoLoopAlignment);
     __ BIND(L_copy_byte_loop);
       __ dec(end_from);
@@ -1600,8 +1587,7 @@
 
     // copy 1 element at a time
     __ BIND(L_copy_2_bytes);
-      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
-      __ delayed()->nop();
+      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
       __ align(OptoLoopAlignment);
     __ BIND(L_copy_2_bytes_loop);
       __ lduh(from, offset, O3);
@@ -1946,8 +1932,7 @@
 
     // copy 1 element (2 bytes) at a time
     __ BIND(L_copy_2_bytes);
-      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
-      __ delayed()->nop();
+      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
     __ BIND(L_copy_2_bytes_loop);
       __ dec(end_from, 2);
       __ dec(end_to, 2);
@@ -2060,8 +2045,7 @@
 
     // copy 1 element at a time
     __ BIND(L_copy_4_bytes);
-      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
-      __ delayed()->nop();
+      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
     __ BIND(L_copy_4_bytes_loop);
       __ ld(from, offset, O3);
       __ deccc(count);
@@ -2193,8 +2177,7 @@
 
     // copy 1 element (4 bytes) at a time
     __ BIND(L_copy_4_bytes);
-      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
-      __ delayed()->nop();
+      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
     __ BIND(L_copy_4_bytes_loop);
       __ dec(end_from, 4);
       __ dec(end_to, 4);
@@ -2576,7 +2559,7 @@
                                      super_klass->after_save(),
                                      L0, L1, L2, L4,
                                      NULL, &L_pop_to_miss);
-    __ ba(false, L_success);
+    __ ba(L_success);
     __ delayed()->restore();
 
     __ bind(L_pop_to_miss);
@@ -2673,8 +2656,7 @@
     // ======== loop entry is here ========
     __ BIND(load_element);
     __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
-    __ br_null(G3_oop, true, Assembler::pt, store_element);
-    __ delayed()->nop();
+    __ br_null_short(G3_oop, Assembler::pt, store_element);
 
     __ load_klass(G3_oop, G4_klass); // query the object klass
 
@@ -2896,8 +2878,7 @@
     //  assert(src->klass() != NULL);
     BLOCK_COMMENT("assert klasses not null");
     { Label L_a, L_b;
-      __ br_notnull(G3_src_klass, false, Assembler::pt, L_b); // it is broken if klass is NULL
-      __ delayed()->nop();
+      __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
       __ bind(L_a);
       __ stop("broken null klass");
       __ bind(L_b);
@@ -2937,9 +2918,7 @@
     }
 
     //  if (src->klass() != dst->klass()) return -1;
-    __ cmp(G3_src_klass, G4_dst_klass);
-    __ brx(Assembler::notEqual, false, Assembler::pn, L_failed);
-    __ delayed()->nop();
+    __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);
 
     //  if (!src->is_Array()) return -1;
     __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
@@ -3007,9 +2986,7 @@
     __ delayed()->signx(length, count); // length
 #ifdef ASSERT
     { Label L;
-      __ cmp(G3_elsize, LogBytesPerLong);
-      __ br(Assembler::equal, false, Assembler::pt, L);
-      __ delayed()->nop();
+      __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
       __ stop("must be long copy, but elsize is wrong");
       __ bind(L);
     }
--- a/src/cpu/sparc/vm/templateInterpreter_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/templateInterpreter_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -190,9 +190,7 @@
   const Register size  = G1_scratch;
   if (EnableInvokeDynamic) {
     __ ldub(Address(Lbcp, 0), G1_scratch);  // Load current bytecode.
-    __ cmp(G1_scratch, Bytecodes::_invokedynamic);
-    __ br(Assembler::equal, false, Assembler::pn, L_giant_index);
-    __ delayed()->nop();
+    __ cmp_and_br_short(G1_scratch, Bytecodes::_invokedynamic, Assembler::equal, Assembler::pn, L_giant_index);
   }
   __ get_cache_and_index_at_bcp(cache, G1_scratch, 1);
   __ bind(L_got_cache);
@@ -207,8 +205,7 @@
   if (EnableInvokeDynamic) {
     __ bind(L_giant_index);
     __ get_cache_and_index_at_bcp(cache, G1_scratch, 1, sizeof(u4));
-    __ ba(false, L_got_cache);
-    __ delayed()->nop();
+    __ ba_short(L_got_cache);
   }
 
   return entry;
@@ -221,9 +218,7 @@
   { Label L;
     Address exception_addr(G2_thread, Thread::pending_exception_offset());
     __ ld_ptr(exception_addr, Gtemp);  // Load pending exception.
-    __ tst(Gtemp);
-    __ brx(Assembler::equal, false, Assembler::pt, L);
-    __ delayed()->nop();
+    __ br_null_short(Gtemp, Assembler::pt, L);
     __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_pending_exception));
     __ should_not_reach_here();
     __ bind(L);
@@ -304,8 +299,7 @@
     if (ProfileInterpreter) {
       // If no method data exists, go to profile_continue.
       __ ld_ptr(Lmethod, methodOopDesc::method_data_offset(), G4_scratch);
-      __ br_null(G4_scratch, false, Assembler::pn, no_mdo);
-      __ delayed()->nop();
+      __ br_null_short(G4_scratch, Assembler::pn, no_mdo);
       // Increment counter
       Address mdo_invocation_counter(G4_scratch,
                                      in_bytes(methodDataOopDesc::invocation_counter_offset()) +
@@ -313,8 +307,7 @@
       __ increment_mask_and_jump(mdo_invocation_counter, increment, mask,
                                  G3_scratch, Lscratch,
                                  Assembler::zero, overflow);
-      __ ba(false, done);
-      __ delayed()->nop();
+      __ ba_short(done);
     }
 
     // Increment counter in methodOop
@@ -340,9 +333,7 @@
       // Test to see if we should create a method data oop
       AddressLiteral profile_limit((address)&InvocationCounter::InterpreterProfileLimit);
       __ load_contents(profile_limit, G3_scratch);
-      __ cmp(O0, G3_scratch);
-      __ br(Assembler::lessUnsigned, false, Assembler::pn, *profile_method_continue);
-      __ delayed()->nop();
+      __ cmp_and_br_short(O0, G3_scratch, Assembler::lessUnsigned, Assembler::pn, *profile_method_continue);
 
       // if no method data exists, go to profile_method
       __ test_method_data_pointer(*profile_method);
@@ -351,7 +342,7 @@
     AddressLiteral invocation_limit((address)&InvocationCounter::InterpreterInvocationLimit);
     __ load_contents(invocation_limit, G3_scratch);
     __ cmp(O0, G3_scratch);
-    __ br(Assembler::greaterEqualUnsigned, false, Assembler::pn, *overflow);
+    __ br(Assembler::greaterEqualUnsigned, false, Assembler::pn, *overflow); // Far distance
     __ delayed()->nop();
   }
 
@@ -410,19 +401,14 @@
 
   assert_different_registers(Rframe_size, Rscratch, Rscratch2);
 
-  __ set( page_size,   Rscratch );
-  __ cmp( Rframe_size, Rscratch );
-
-  __ br( Assembler::lessEqual, false, Assembler::pt, after_frame_check );
-  __ delayed()->nop();
+  __ set(page_size, Rscratch);
+  __ cmp_and_br_short(Rframe_size, Rscratch, Assembler::lessEqual, Assembler::pt, after_frame_check);
 
   // get the stack base, and in debug, verify it is non-zero
   __ ld_ptr( G2_thread, Thread::stack_base_offset(), Rscratch );
 #ifdef ASSERT
   Label base_not_zero;
-  __ cmp( Rscratch, G0 );
-  __ brx( Assembler::notEqual, false, Assembler::pn, base_not_zero );
-  __ delayed()->nop();
+  __ br_notnull_short(Rscratch, Assembler::pn, base_not_zero);
   __ stop("stack base is zero in generate_stack_overflow_check");
   __ bind(base_not_zero);
 #endif
@@ -432,9 +418,7 @@
   __ ld_ptr( G2_thread, Thread::stack_size_offset(), Rscratch2 );
 #ifdef ASSERT
   Label size_not_zero;
-  __ cmp( Rscratch2, G0 );
-  __ brx( Assembler::notEqual, false, Assembler::pn, size_not_zero );
-  __ delayed()->nop();
+  __ br_notnull_short(Rscratch2, Assembler::pn, size_not_zero);
   __ stop("stack size is zero in generate_stack_overflow_check");
   __ bind(size_not_zero);
 #endif
@@ -450,9 +434,7 @@
 
   // the frame is greater than one page in size, so check against
   // the bottom of the stack
-  __ cmp( SP, Rscratch );
-  __ brx( Assembler::greater, false, Assembler::pt, after_frame_check );
-  __ delayed()->nop();
+  __ cmp_and_brx_short(SP, Rscratch, Assembler::greater, Assembler::pt, after_frame_check);
 
   // Save the return address as the exception pc
   __ st_ptr(O7, saved_exception_pc);
@@ -624,9 +606,7 @@
     // If we need a safepoint check, generate full interpreter entry.
     AddressLiteral sync_state(SafepointSynchronize::address_of_state());
-    __ set(sync_state, G3_scratch);
-    __ cmp(G3_scratch, SafepointSynchronize::_not_synchronized);
-    __ br(Assembler::notEqual, false, Assembler::pn, slow_path);
-    __ delayed()->nop();
+    __ load_contents(sync_state, G3_scratch);
+    __ cmp_and_br_short(G3_scratch, SafepointSynchronize::_not_synchronized, Assembler::notEqual, Assembler::pn, slow_path);
 
     // Code: _return
     __ retl();
@@ -664,14 +644,12 @@
     AddressLiteral sync_state(SafepointSynchronize::address_of_state());
     __ load_contents(sync_state, G3_scratch);
-    __ cmp(G3_scratch, SafepointSynchronize::_not_synchronized);
-    __ br(Assembler::notEqual, false, Assembler::pn, slow_path);
-    __ delayed()->nop();
+    // Safepoint state is not _not_synchronized: take the slow path.
+    __ cmp_and_br_short(G3_scratch, SafepointSynchronize::_not_synchronized, Assembler::notEqual, Assembler::pn, slow_path);
 
     // Check if local 0 != NULL
     __ ld_ptr(Gargs, G0, Otos_i ); // get local 0
-    __ tst(Otos_i);  // check if local 0 == NULL and go the slow path
-    __ brx(Assembler::zero, false, Assembler::pn, slow_path);
-    __ delayed()->nop();
+    // if local 0 is NULL, take the slow path
+    __ br_null_short(Otos_i, Assembler::pn, slow_path);
 
 
     // read first instruction word and extract bytecode @ 1 and index @ 2
@@ -697,9 +675,7 @@
     __ ld_ptr(G3_scratch, cp_base_offset + ConstantPoolCacheEntry::indices_offset(), G1_scratch);
     __ srl(G1_scratch, 2*BitsPerByte, G1_scratch);
     __ and3(G1_scratch, 0xFF, G1_scratch);
-    __ cmp(G1_scratch, Bytecodes::_getfield);
-    __ br(Assembler::notEqual, false, Assembler::pn, slow_path);
-    __ delayed()->nop();
+    __ cmp_and_br_short(G1_scratch, Bytecodes::_getfield, Assembler::notEqual, Assembler::pn, slow_path);
 
     // Get the type and return field offset from the constant pool cache
     __ ld_ptr(G3_scratch, cp_base_offset + ConstantPoolCacheEntry::flags_offset(), G1_scratch);
@@ -787,9 +763,8 @@
     // Check if local 0 != NULL
     // If the receiver is null then it is OK to jump to the slow path.
     __ ld_ptr(Gargs, G0, Otos_i ); // get local 0
-    __ tst(Otos_i);  // check if local 0 == NULL and go the slow path
-    __ brx(Assembler::zero, false, Assembler::pn, slow_path);
-    __ delayed()->nop();
+    // if local 0 is NULL, take the slow path
+    __ br_null_short(Otos_i, Assembler::pn, slow_path);
 
 
     // Load the value of the referent field.
@@ -952,9 +927,7 @@
   { Label L;
     Address signature_handler(Lmethod, methodOopDesc::signature_handler_offset());
     __ ld_ptr(signature_handler, G3_scratch);
-    __ tst(G3_scratch);
-    __ brx(Assembler::notZero, false, Assembler::pt, L);
-    __ delayed()->nop();
+    __ br_notnull_short(G3_scratch, Assembler::pt, L);
     __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::prepare_native_call), Lmethod);
     __ ld_ptr(signature_handler, G3_scratch);
     __ bind(L);
@@ -1019,9 +992,7 @@
 #ifdef ASSERT
     if (!PrintSignatureHandlers)  // do not dirty the output with this
     { Label L;
-      __ tst(O1);
-      __ brx(Assembler::notZero, false, Assembler::pt, L);
-      __ delayed()->nop();
+      __ br_notnull_short(O1, Assembler::pt, L);
       __ stop("mirror is missing");
       __ bind(L);
     }
@@ -1038,9 +1009,7 @@
 
 #ifdef ASSERT
   { Label L;
-    __ tst(O0);
-    __ brx(Assembler::notZero, false, Assembler::pt, L);
-    __ delayed()->nop();
+    __ br_notnull_short(O0, Assembler::pt, L);
     __ stop("native entry point is missing");
     __ bind(L);
   }
@@ -1079,9 +1048,7 @@
 #ifdef ASSERT
   { Label L;
     __ ld(thread_state, G3_scratch);
-    __ cmp(G3_scratch, _thread_in_Java);
-    __ br(Assembler::equal, false, Assembler::pt, L);
-    __ delayed()->nop();
+    __ cmp_and_br_short(G3_scratch, _thread_in_Java, Assembler::equal, Assembler::pt, L);
     __ stop("Wrong thread state in native stub");
     __ bind(L);
   }
@@ -1134,9 +1101,7 @@
     Label L;
     __ br(Assembler::notEqual, false, Assembler::pn, L);
     __ delayed()->ld(G2_thread, JavaThread::suspend_flags_offset(), G3_scratch);
-    __ cmp(G3_scratch, 0);
-    __ br(Assembler::equal, false, Assembler::pt, no_block);
-    __ delayed()->nop();
+    __ cmp_and_br_short(G3_scratch, 0, Assembler::equal, Assembler::pt, no_block);
     __ bind(L);
 
     // Block.  Save any potential method result value before the operation and
@@ -1185,9 +1150,7 @@
     Label no_oop, store_result;
 
     __ set((intptr_t)AbstractInterpreter::result_handler(T_OBJECT), G3_scratch);
-    __ cmp(G3_scratch, Lscratch);
-    __ brx(Assembler::notEqual, false, Assembler::pt, no_oop);
-    __ delayed()->nop();
+    __ cmp_and_brx_short(G3_scratch, Lscratch, Assembler::notEqual, Assembler::pt, no_oop);
     __ addcc(G0, O0, O0);
     __ brx(Assembler::notZero, true, Assembler::pt, store_result);     // if result is not NULL:
     __ delayed()->ld_ptr(O0, 0, O0);                                   // unbox it
@@ -1206,9 +1169,7 @@
   { Label L;
     Address exception_addr(G2_thread, Thread::pending_exception_offset());
     __ ld_ptr(exception_addr, Gtemp);
-    __ tst(Gtemp);
-    __ brx(Assembler::equal, false, Assembler::pt, L);
-    __ delayed()->nop();
+    __ br_null_short(Gtemp, Assembler::pt, L);
     // Note: This could be handled more efficiently since we know that the native
     //       method doesn't have an exception handler. We could directly return
     //       to the exception handler for the caller.
@@ -1245,9 +1206,7 @@
 #ifdef ASSERT
   {
     Label ok;
-    __ cmp(I5_savedSP, FP);
-    __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, ok);
-    __ delayed()->nop();
+    __ cmp_and_brx_short(I5_savedSP, FP, Assembler::greaterEqualUnsigned, Assembler::pt, ok);
     __ stop("bad I5_savedSP value");
     __ should_not_reach_here();
     __ bind(ok);
@@ -1429,8 +1388,7 @@
 
       __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::profile_method));
       __ set_method_data_pointer_for_bcp();
-      __ ba(false, profile_method_continue);
-      __ delayed()->nop();
+      __ ba_short(profile_method_continue);
     }
 
     // handle invocation counter overflow
@@ -1856,9 +1814,7 @@
     // adapter frames in C2.
     Label caller_not_deoptimized;
     __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, InterpreterRuntime::interpreter_contains), I7);
-    __ tst(O0);
-    __ brx(Assembler::notEqual, false, Assembler::pt, caller_not_deoptimized);
-    __ delayed()->nop();
+    __ br_notnull_short(O0, Assembler::pt, caller_not_deoptimized);
 
     const Register Gtmp1 = G3_scratch;
     const Register Gtmp2 = G1_scratch;
@@ -1992,10 +1948,10 @@
 void TemplateInterpreterGenerator::set_vtos_entry_points(Template* t, address& bep, address& cep, address& sep, address& aep, address& iep, address& lep, address& fep, address& dep, address& vep) {
   assert(t->is_valid() && t->tos_in() == vtos, "illegal template");
   Label L;
-  aep = __ pc(); __ push_ptr(); __ ba(false, L); __ delayed()->nop();
-  fep = __ pc(); __ push_f();   __ ba(false, L); __ delayed()->nop();
-  dep = __ pc(); __ push_d();   __ ba(false, L); __ delayed()->nop();
-  lep = __ pc(); __ push_l();   __ ba(false, L); __ delayed()->nop();
+  aep = __ pc(); __ push_ptr(); __ ba_short(L);
+  fep = __ pc(); __ push_f();   __ ba_short(L);
+  dep = __ pc(); __ push_d();   __ ba_short(L);
+  lep = __ pc(); __ push_l();   __ ba_short(L);
   iep = __ pc(); __ push_i();
   bep = cep = sep = iep;                        // there aren't any
   vep = __ pc(); __ bind(L);                    // fall through
--- a/src/cpu/sparc/vm/templateTable_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/templateTable_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -159,13 +159,10 @@
   if (JvmtiExport::can_post_breakpoint()) {
     Label fast_patch;
     __ ldub(at_bcp(0), Rscratch);
-    __ cmp(Rscratch, Bytecodes::_breakpoint);
-    __ br(Assembler::notEqual, false, Assembler::pt, fast_patch);
-    __ delayed()->nop();  // don't bother to hoist the stb here
+    __ cmp_and_br_short(Rscratch, Bytecodes::_breakpoint, Assembler::notEqual, Assembler::pt, fast_patch);
     // perform the quickening, slowly, in the bowels of the breakpoint table
     __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::set_original_bytecode_at), Lmethod, Lbcp, Rbyte_code);
-    __ ba(false, patch_done);
-    __ delayed()->nop();
+    __ ba_short(patch_done);
     __ bind(fast_patch);
   }
 #ifdef ASSERT
@@ -281,17 +278,14 @@
   // get type from tags
   __ add(O2, tags_offset, O2);
   __ ldub(O2, O1, O2);
-  __ cmp(O2, JVM_CONSTANT_UnresolvedString);    // unresolved string? If so, must resolve
-  __ brx(Assembler::equal, true, Assembler::pt, call_ldc);
-  __ delayed()->nop();
-
-  __ cmp(O2, JVM_CONSTANT_UnresolvedClass);     // unresolved class? If so, must resolve
-  __ brx(Assembler::equal, true, Assembler::pt, call_ldc);
-  __ delayed()->nop();
-
-  __ cmp(O2, JVM_CONSTANT_UnresolvedClassInError);     // unresolved class in error state
-  __ brx(Assembler::equal, true, Assembler::pn, call_ldc);
-  __ delayed()->nop();
+  // unresolved string? If so, must resolve
+  __ cmp_and_brx_short(O2, JVM_CONSTANT_UnresolvedString, Assembler::equal, Assembler::pt, call_ldc);
+
+  // unresolved class? If so, must resolve
+  __ cmp_and_brx_short(O2, JVM_CONSTANT_UnresolvedClass, Assembler::equal, Assembler::pt, call_ldc);
+
+  // unresolved class in error state
+  __ cmp_and_brx_short(O2, JVM_CONSTANT_UnresolvedClassInError, Assembler::equal, Assembler::pn, call_ldc);
 
   __ cmp(O2, JVM_CONSTANT_Class);      // need to call vm to get java mirror of the class
   __ brx(Assembler::notEqual, true, Assembler::pt, notClass);
@@ -301,8 +295,7 @@
   __ set(wide, O1);
   call_VM(Otos_i, CAST_FROM_FN_PTR(address, InterpreterRuntime::ldc), O1);
   __ push(atos);
-  __ ba(false, exit);
-  __ delayed()->nop();
+  __ ba_short(exit);
 
   __ bind(notClass);
  // __ add(O0, base_offset, O0);
@@ -312,8 +305,7 @@
   __ delayed()->cmp(O2, JVM_CONSTANT_String);
   __ ld(O0, O1, Otos_i);
   __ push(itos);
-  __ ba(false, exit);
-  __ delayed()->nop();
+  __ ba_short(exit);
 
   __ bind(notInt);
  // __ cmp(O2, JVM_CONSTANT_String);
@@ -325,8 +317,7 @@
   __ ld_ptr(O0, O1, Otos_i);
   __ verify_oop(Otos_i);
   __ push(atos);
-  __ ba(false, exit);
-  __ delayed()->nop();
+  __ ba_short(exit);
 
   __ bind(notString);
  // __ ldf(FloatRegisterImpl::S, O0, O1, Ftos_f);
@@ -365,9 +356,7 @@
   __ load_klass(Otos_i, Rcon_klass);
   AddressLiteral array_klass_addr((address)Universe::systemObjArrayKlassObj_addr());
   __ load_contents(array_klass_addr, Rarray_klass);
-  __ cmp(Rarray_klass, Rcon_klass);
-  __ brx(Assembler::notEqual, false, Assembler::pt, L_done);
-  __ delayed()->nop();
+  __ cmp_and_brx_short(Rarray_klass, Rcon_klass, Assembler::notEqual, Assembler::pt, L_done);
   __ ld(Address(Otos_i, arrayOopDesc::length_offset_in_bytes()), Rcon_klass);
   __ tst(Rcon_klass);
   __ brx(Assembler::zero, true, Assembler::pt, L_done);
@@ -397,9 +386,7 @@
   __ sll(O1, LogBytesPerWord, O1);
   __ add(O0, O1, G3_scratch);
 
-  __ cmp(O2, JVM_CONSTANT_Double);
-  __ brx(Assembler::notEqual, false, Assembler::pt, Long);
-  __ delayed()->nop();
+  __ cmp_and_brx_short(O2, JVM_CONSTANT_Double, Assembler::notEqual, Assembler::pt, Long);
   // A double can be placed at word-aligned locations in the constant pool.
   // Check out Conversions.java for an example.
   // Also constantPoolOopDesc::header_size() is 20, which makes it very difficult
@@ -413,8 +400,7 @@
          f->successor());
 #endif
   __ push(dtos);
-  __ ba(false, exit);
-  __ delayed()->nop();
+  __ ba_short(exit);
 
   __ bind(Long);
 #ifdef _LP64
@@ -453,9 +439,7 @@
     // last two iloads in a pair.  Comparing against fast_iload means that
     // the next bytecode is neither an iload or a caload, and therefore
     // an iload pair.
-    __ cmp(G3_scratch, (int)Bytecodes::_iload);
-    __ br(Assembler::equal, false, Assembler::pn, done);
-    __ delayed()->nop();
+    __ cmp_and_br_short(G3_scratch, (int)Bytecodes::_iload, Assembler::equal, Assembler::pn, done);
 
     __ cmp(G3_scratch, (int)Bytecodes::_fast_iload);
     __ br(Assembler::equal, false, Assembler::pn, rewrite);
@@ -697,9 +681,7 @@
     aload(0);
 
     // if _getfield then wait with rewrite
-    __ cmp(G3_scratch, (int)Bytecodes::_getfield);
-    __ br(Assembler::equal, false, Assembler::pn, done);
-    __ delayed()->nop();
+    __ cmp_and_br_short(G3_scratch, (int)Bytecodes::_getfield, Assembler::equal, Assembler::pn, done);
 
     // if _igetfield then rewrite to _fast_iaccess_0
     assert(Bytecodes::java_code(Bytecodes::_fast_iaccess_0) == Bytecodes::_aload_0, "adjust fast bytecode def");
@@ -867,8 +849,7 @@
   __ index_check_without_pop(O3, O2, UseCompressedOops ? 2 : LogBytesPerWord, G3_scratch, O1);
 
   // do array store check - check for NULL value first
-  __ br_null( Otos_i, false, Assembler::pn, is_null );
-  __ delayed()->nop();
+  __ br_null_short( Otos_i, Assembler::pn, is_null );
 
   __ load_klass(O3, O4); // get array klass
   __ load_klass(Otos_i, O5); // get value klass
@@ -899,7 +880,7 @@
   __ bind(store_ok);
   do_oop_store(_masm, O1, noreg, arrayOopDesc::base_offset_in_bytes(T_OBJECT), Otos_i, G3_scratch, _bs->kind(), true);
 
-  __ ba(false,done);
+  __ ba(done);
   __ delayed()->inc(Lesp, 3* Interpreter::stackElementSize); // adj sp (pops array, index and value)
 
   __ bind(is_null);
@@ -1633,16 +1614,14 @@
       if (ProfileInterpreter) {
         // If no method data exists, go to profile_continue.
         __ ld_ptr(Lmethod, methodOopDesc::method_data_offset(), G4_scratch);
-        __ br_null(G4_scratch, false, Assembler::pn, Lno_mdo);
-        __ delayed()->nop();
+        __ br_null_short(G4_scratch, Assembler::pn, Lno_mdo);
 
         // Increment backedge counter in the MDO
         Address mdo_backedge_counter(G4_scratch, in_bytes(methodDataOopDesc::backedge_counter_offset()) +
                                                  in_bytes(InvocationCounter::counter_offset()));
         __ increment_mask_and_jump(mdo_backedge_counter, increment, mask, G3_scratch, Lscratch,
                                    Assembler::notZero, &Lforward);
-        __ ba(false, Loverflow);
-        __ delayed()->nop();
+        __ ba_short(Loverflow);
       }
 
       // If there's no MDO, increment counter in methodOop
@@ -1658,14 +1637,11 @@
 
       // Was an OSR adapter generated?
       // O0 = osr nmethod
-      __ br_null(O0, false, Assembler::pn, Lforward);
-      __ delayed()->nop();
+      __ br_null_short(O0, Assembler::pn, Lforward);
 
       // Has the nmethod been invalidated already?
       __ ld(O0, nmethod::entry_bci_offset(), O2);
-      __ cmp(O2, InvalidOSREntryBci);
-      __ br(Assembler::equal, false, Assembler::pn, Lforward);
-      __ delayed()->nop();
+      __ cmp_and_br_short(O2, InvalidOSREntryBci, Assembler::equal, Assembler::pn, Lforward);
 
       // migrate the interpreter frame off of the stack
 
@@ -1830,7 +1806,7 @@
   __ profile_switch_case(O2, O3, G3_scratch, G4_scratch);
   __ sll(O2, LogBytesPerInt, O2);
   __ add(O2, 3 * BytesPerInt, O2);
-  __ ba(false, continue_execution);
+  __ ba(continue_execution);
   __ delayed()->ld(O1, O2, O2);
   // handle default
   __ bind(default_case);
@@ -1858,7 +1834,7 @@
   __ ld(O1, BytesPerInt, O2);
   __ sll(O2, LogBytesPerInt + 1, O2); // in word-pairs
   __ add(O1, 2 * BytesPerInt, O3); // set first pair addr
-  __ ba(false, loop_entry);
+  __ ba(loop_entry);
   __ delayed()->add(O3, O2, O2); // counter now points past last pair
 
   // table search
@@ -1877,8 +1853,7 @@
   __ ld(O1, 0, O4); // get default offset
   if (ProfileInterpreter) {
     __ profile_switch_default(O3);
-    __ ba(false, continue_execution);
-    __ delayed()->nop();
+    __ ba_short(continue_execution);
   }
 
   // entry found -> get offset
@@ -1944,7 +1919,7 @@
 
   // and start
   Label entry;
-  __ ba(false, entry);
+  __ ba(entry);
   __ delayed()->ld( Rarray, -BytesPerInt, Rj);
   // (Rj is already in the native byte-ordering.)
 
@@ -2002,8 +1977,7 @@
   // (Rj is already in the native byte-ordering.)
 
   if (ProfileInterpreter) {
-    __ ba(false, continue_execution);
-    __ delayed()->nop();
+    __ ba_short(continue_execution);
   }
 
   __ bind(default_case); // fall through (if not profiling)
@@ -2216,9 +2190,7 @@
     assert_different_registers(Rcache, index, G1_scratch);
     AddressLiteral get_field_access_count_addr(JvmtiExport::get_field_access_count_addr());
     __ load_contents(get_field_access_count_addr, G1_scratch);
-    __ tst(G1_scratch);
-    __ br(Assembler::zero, false, Assembler::pt, Label1);
-    __ delayed()->nop();
+    __ cmp_and_br_short(G1_scratch, 0, Assembler::equal, Assembler::pt, Label1);
 
     __ add(Rcache, in_bytes(cp_base_offset), Rcache);
 
@@ -2298,7 +2270,7 @@
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_agetfield, G3_scratch, G4_scratch);
   }
-  __ ba(false, checkVolatile);
+  __ ba(checkVolatile);
   __ delayed()->tst(Lscratch);
 
   __ bind(notObj);
@@ -2313,7 +2285,7 @@
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_igetfield, G3_scratch, G4_scratch);
   }
-  __ ba(false, checkVolatile);
+  __ ba(checkVolatile);
   __ delayed()->tst(Lscratch);
 
   __ bind(notInt);
@@ -2329,7 +2301,7 @@
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_lgetfield, G3_scratch, G4_scratch);
   }
-  __ ba(false, checkVolatile);
+  __ ba(checkVolatile);
   __ delayed()->tst(Lscratch);
 
   __ bind(notLong);
@@ -2344,7 +2316,7 @@
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_bgetfield, G3_scratch, G4_scratch);
   }
-  __ ba(false, checkVolatile);
+  __ ba(checkVolatile);
   __ delayed()->tst(Lscratch);
 
   __ bind(notByte);
@@ -2359,7 +2331,7 @@
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_cgetfield, G3_scratch, G4_scratch);
   }
-  __ ba(false, checkVolatile);
+  __ ba(checkVolatile);
   __ delayed()->tst(Lscratch);
 
   __ bind(notChar);
@@ -2374,7 +2346,7 @@
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_sgetfield, G3_scratch, G4_scratch);
   }
-  __ ba(false, checkVolatile);
+  __ ba(checkVolatile);
   __ delayed()->tst(Lscratch);
 
   __ bind(notShort);
@@ -2390,7 +2362,7 @@
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_fgetfield, G3_scratch, G4_scratch);
   }
-  __ ba(false, checkVolatile);
+  __ ba(checkVolatile);
   __ delayed()->tst(Lscratch);
 
   __ bind(notFloat);
@@ -2499,9 +2471,7 @@
     Label done;
     AddressLiteral get_field_modification_count_addr(JvmtiExport::get_field_modification_count_addr());
     __ load_contents(get_field_modification_count_addr, G4_scratch);
-    __ tst(G4_scratch);
-    __ br(Assembler::zero, false, Assembler::pt, done);
-    __ delayed()->nop();
+    __ cmp_and_br_short(G4_scratch, 0, Assembler::equal, Assembler::pt, done);
     __ pop_ptr(G4_scratch);     // copy the object pointer from tos
     __ verify_oop(G4_scratch);
     __ push_ptr(G4_scratch);    // put the object pointer back on tos
@@ -2552,9 +2522,7 @@
     assert_different_registers(Rcache, index, G1_scratch);
     AddressLiteral get_field_modification_count_addr(JvmtiExport::get_field_modification_count_addr());
     __ load_contents(get_field_modification_count_addr, G1_scratch);
-    __ tst(G1_scratch);
-    __ br(Assembler::zero, false, Assembler::pt, Label1);
-    __ delayed()->nop();
+    __ cmp_and_br_short(G1_scratch, 0, Assembler::equal, Assembler::pt, Label1);
 
     // The Rcache and index registers have been already set.
     // This allows to eliminate this call but the Rcache and index
@@ -2584,8 +2552,7 @@
       __ br(Assembler::equal, false, Assembler::pt, two_word);
       __ delayed()->nop();
       __ inc(G4_scratch, Interpreter::expr_offset_in_bytes(1));
-      __ br(Assembler::always, false, Assembler::pt, valsizeknown);
-      __ delayed()->nop();
+      __ ba_short(valsizeknown);
       __ bind(two_word);
 
       __ inc(G4_scratch, Interpreter::expr_offset_in_bytes(2));
@@ -2636,9 +2603,7 @@
     __ and3(Rflags, Lscratch, Lscratch);
 
     if (__ membar_has_effect(read_bits)) {
-      __ tst(Lscratch);
-      __ br(Assembler::zero, false, Assembler::pt, notVolatile);
-      __ delayed()->nop();
+      __ cmp_and_br_short(Lscratch, 0, Assembler::equal, Assembler::pt, notVolatile);
       volatile_barrier(read_bits);
       __ bind(notVolatile);
     }
@@ -2663,7 +2628,7 @@
 
     do_oop_store(_masm, Rclass, Roffset, 0, Otos_i, G1_scratch, _bs->kind(), false);
 
-    __ ba(false, checkVolatile);
+    __ ba(checkVolatile);
     __ delayed()->tst(Lscratch);
 
     __ bind(notObj);
@@ -2675,7 +2640,7 @@
     // itos
     __ pop_i();
     __ st(Otos_i, Rclass, Roffset);
-    __ ba(false, checkVolatile);
+    __ ba(checkVolatile);
     __ delayed()->tst(Lscratch);
 
     __ bind(notInt);
@@ -2691,7 +2656,7 @@
     pop_and_check_object(Rclass);
     __ st(Otos_i, Rclass, Roffset);
     patch_bytecode(Bytecodes::_fast_iputfield, G3_scratch, G4_scratch);
-    __ ba(false, checkVolatile);
+    __ ba(checkVolatile);
     __ delayed()->tst(Lscratch);
 
     __ bind(notInt);
@@ -2707,7 +2672,7 @@
     do_oop_store(_masm, Rclass, Roffset, 0, Otos_i, G1_scratch, _bs->kind(), false);
 
     patch_bytecode(Bytecodes::_fast_aputfield, G3_scratch, G4_scratch);
-    __ ba(false, checkVolatile);
+    __ ba(checkVolatile);
     __ delayed()->tst(Lscratch);
 
     __ bind(notObj);
@@ -2724,7 +2689,7 @@
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_bputfield, G3_scratch, G4_scratch);
   }
-  __ ba(false, checkVolatile);
+  __ ba(checkVolatile);
   __ delayed()->tst(Lscratch);
 
   __ bind(notByte);
@@ -2740,7 +2705,7 @@
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_lputfield, G3_scratch, G4_scratch);
   }
-  __ ba(false, checkVolatile);
+  __ ba(checkVolatile);
   __ delayed()->tst(Lscratch);
 
   __ bind(notLong);
@@ -2756,7 +2721,7 @@
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_cputfield, G3_scratch, G4_scratch);
   }
-  __ ba(false, checkVolatile);
+  __ ba(checkVolatile);
   __ delayed()->tst(Lscratch);
 
   __ bind(notChar);
@@ -2771,7 +2736,7 @@
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_sputfield, G3_scratch, G4_scratch);
   }
-  __ ba(false, checkVolatile);
+  __ ba(checkVolatile);
   __ delayed()->tst(Lscratch);
 
   __ bind(notShort);
@@ -2786,7 +2751,7 @@
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_fputfield, G3_scratch, G4_scratch);
   }
-  __ ba(false, checkVolatile);
+  __ ba(checkVolatile);
   __ delayed()->tst(Lscratch);
 
   __ bind(notFloat);
@@ -2833,9 +2798,7 @@
     __ set((1 << ConstantPoolCacheEntry::volatileField), Lscratch);
     __ and3(Rflags, Lscratch, Lscratch);
     if (__ membar_has_effect(read_bits)) {
-      __ tst(Lscratch);
-      __ br(Assembler::zero, false, Assembler::pt, notVolatile);
-      __ delayed()->nop();
+      __ cmp_and_br_short(Lscratch, 0, Assembler::equal, Assembler::pt, notVolatile);
       volatile_barrier(read_bits);
       __ bind(notVolatile);
     }
@@ -2864,9 +2827,7 @@
   }
 
   if (__ membar_has_effect(write_bits)) {
-    __ tst(Lscratch);
-    __ br(Assembler::zero, false, Assembler::pt, exit);
-    __ delayed()->nop();
+    __ cmp_and_br_short(Lscratch, 0, Assembler::equal, Assembler::pt, exit);
     volatile_barrier(Assembler::StoreLoad);
     __ bind(exit);
   }
@@ -3226,8 +3187,7 @@
     // the VM should throw IncompatibleClassChangeError.  linkResolver checks
     // this too but that's only if the entry isn't already resolved, so we
     // need to check again.
-    __ br_notnull( Rtemp, false, Assembler::pt, ok);
-    __ delayed()->nop();
+    __ br_notnull_short( Rtemp, Assembler::pt, ok);
     call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_IncompatibleClassChangeError));
     __ should_not_reach_here();
     __ bind(ok);
@@ -3251,9 +3211,7 @@
   // Check for abstract method error.
   {
     Label ok;
-    __ tst(G5_method);
-    __ brx(Assembler::notZero, false, Assembler::pt, ok);
-    __ delayed()->nop();
+    __ br_notnull_short(G5_method, Assembler::pt, ok);
     call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_AbstractMethodError));
     __ should_not_reach_here();
     __ bind(ok);
@@ -3408,17 +3366,14 @@
 #else
       __ srl(RfreeValue, LogHeapWordSize, RfreeValue);
 #endif
-      __ cmp(RtlabWasteLimitValue, RfreeValue);
-      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, slow_case); // tlab waste is small
-      __ delayed()->nop();
+      __ cmp_and_brx_short(RtlabWasteLimitValue, RfreeValue, Assembler::greaterEqualUnsigned, Assembler::pt, slow_case); // tlab waste is small
 
       // increment waste limit to prevent getting stuck on this slow path
       __ add(RtlabWasteLimitValue, ThreadLocalAllocBuffer::refill_waste_limit_increment(), RtlabWasteLimitValue);
       __ st_ptr(RtlabWasteLimitValue, G2_thread, in_bytes(JavaThread::tlab_refill_waste_limit_offset()));
     } else {
       // No allocation in the shared eden.
-      __ br(Assembler::always, false, Assembler::pt, slow_case);
-      __ delayed()->nop();
+      __ ba_short(slow_case);
     }
   }
 
@@ -3440,18 +3395,14 @@
 
     // RnewTopValue contains the top address after the new object
     // has been allocated.
-    __ cmp(RnewTopValue, RendValue);
-    __ brx(Assembler::greaterUnsigned, false, Assembler::pn, slow_case);
-    __ delayed()->nop();
+    __ cmp_and_brx_short(RnewTopValue, RendValue, Assembler::greaterUnsigned, Assembler::pn, slow_case);
 
     __ casx_under_lock(RtopAddr, RoldTopValue, RnewTopValue,
       VM_Version::v9_instructions_work() ? NULL :
       (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr());
 
     // if someone beat us on the allocation, try again, otherwise continue
-    __ cmp(RoldTopValue, RnewTopValue);
-    __ brx(Assembler::notEqual, false, Assembler::pn, retry);
-    __ delayed()->nop();
+    __ cmp_and_brx_short(RoldTopValue, RnewTopValue, Assembler::notEqual, Assembler::pn, retry);
 
     // bump total bytes allocated by this thread
     // RoldTopValue and RtopAddr are dead, so can use G1 and G3
@@ -3474,8 +3425,7 @@
       __ br(Assembler::notEqual, false, Assembler::pt, loop);
       __ delayed()->subcc(Roffset, wordSize, Roffset);
     }
-    __ br(Assembler::always, false, Assembler::pt, initialize_header);
-    __ delayed()->nop();
+    __ ba_short(initialize_header);
   }
 
   // slow case
@@ -3485,8 +3435,7 @@
 
   call_VM(Otos_i, CAST_FROM_FN_PTR(address, InterpreterRuntime::_new), O1, O2);
 
-  __ ba(false, done);
-  __ delayed()->nop();
+  __ ba_short(done);
 
   // Initialize the header: mark, klass
   __ bind(initialize_header);
@@ -3550,8 +3499,7 @@
   Register RspecifiedKlass = O4;
 
   // Check for casting a NULL
-  __ br_null(Otos_i, false, Assembler::pn, is_null);
-  __ delayed()->nop();
+  __ br_null_short(Otos_i, Assembler::pn, is_null);
 
   // Get value klass in RobjKlass
   __ load_klass(Otos_i, RobjKlass); // get value klass
@@ -3571,8 +3519,7 @@
   call_VM(RspecifiedKlass, CAST_FROM_FN_PTR(address, InterpreterRuntime::quicken_io_cc) );
   __ pop_ptr(Otos_i, G3_scratch); // restore receiver
 
-  __ br(Assembler::always, false, Assembler::pt, resolved);
-  __ delayed()->nop();
+  __ ba_short(resolved);
 
   // Extract target class from constant pool
   __ bind(quicked);
@@ -3591,8 +3538,7 @@
   __ bind(cast_ok);
 
   if (ProfileInterpreter) {
-    __ ba(false, done);
-    __ delayed()->nop();
+    __ ba_short(done);
   }
   __ bind(is_null);
   __ profile_null_seen(G3_scratch);
@@ -3608,8 +3554,7 @@
   Register RspecifiedKlass = O4;
 
   // Check for casting a NULL
-  __ br_null(Otos_i, false, Assembler::pt, is_null);
-  __ delayed()->nop();
+  __ br_null_short(Otos_i, Assembler::pt, is_null);
 
   // Get value klass in RobjKlass
   __ load_klass(Otos_i, RobjKlass); // get value klass
@@ -3629,9 +3574,7 @@
   call_VM(RspecifiedKlass, CAST_FROM_FN_PTR(address, InterpreterRuntime::quicken_io_cc) );
   __ pop_ptr(Otos_i, G3_scratch); // restore receiver
 
-  __ br(Assembler::always, false, Assembler::pt, resolved);
-  __ delayed()->nop();
-
+  __ ba_short(resolved);
 
   // Extract target class from constant pool
   __ bind(quicked);
@@ -3649,8 +3592,7 @@
   __ clr( Otos_i );
 
   if (ProfileInterpreter) {
-    __ ba(false, done);
-    __ delayed()->nop();
+    __ ba_short(done);
   }
   __ bind(is_null);
   __ profile_null_seen(G3_scratch);
@@ -3724,7 +3666,7 @@
   {
     Label entry, loop, exit;
     __ add( __ top_most_monitor(), O2 ); // last one to check
-    __ ba( false, entry );
+    __ ba( entry );
     __ delayed()->mov( Lmonitors, O3 ); // first one to check
 
 
@@ -3757,8 +3699,7 @@
   { Label allocated;
 
     // found free slot?
-    __ br_notnull(O1, false, Assembler::pn, allocated);
-    __ delayed()->nop();
+    __ br_notnull_short(O1, Assembler::pn, allocated);
 
     __ add_monitor_to_stack( false, O2, O3 );
     __ mov(Lmonitors, O1);
@@ -3791,7 +3732,7 @@
 
   { Label entry, loop, found;
     __ add( __ top_most_monitor(), O2 ); // last one to check
-    __ ba(false, entry );
+    __ ba(entry);
     // use Lscratch to hold monitor elem to check, start with most recent monitor,
     // By using a local it survives the call to the C routine.
     __ delayed()->mov( Lmonitors, Lscratch );
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -59,6 +59,11 @@
 
   assert(AllocatePrefetchDistance % AllocatePrefetchStepSize == 0, "invalid value");
 
+  if (AllocatePrefetchStyle == 3 && !has_blk_init()) {
+    warning("BIS instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(AllocatePrefetchStyle, 1);
+  }
+
   UseSSE = 0; // Only on x86 and x64
 
   _supports_cx8               = has_v9();
@@ -116,27 +121,44 @@
     if (FLAG_IS_DEFAULT(UsePopCountInstruction)) {
       FLAG_SET_DEFAULT(UsePopCountInstruction, true);
     }
+  } else if (UsePopCountInstruction) {
+    warning("POPC instruction is not available on this CPU");
+    FLAG_SET_DEFAULT(UsePopCountInstruction, false);
+  }
+
+  // T4 and newer SPARC CPUs have a new compare-and-branch (CBCOND) instruction.
+  if (has_cbcond()) {
+    if (FLAG_IS_DEFAULT(UseCBCond)) {
+      FLAG_SET_DEFAULT(UseCBCond, true);
+    }
+  } else if (UseCBCond) {
+    warning("CBCOND instruction is not available on this CPU");
+    FLAG_SET_DEFAULT(UseCBCond, false);
   }
 
 #ifdef COMPILER2
+  // T4 and newer SPARC CPUs have a fast RDPC instruction.
+  if (has_fast_rdpc() && FLAG_IS_DEFAULT(UseRDPCForConstantTableBase)) {
+//    FLAG_SET_DEFAULT(UseRDPCForConstantTableBase, true);
+  }
+
   // Currently not supported anywhere.
   FLAG_SET_DEFAULT(UseFPUForSpilling, false);
 #endif
 
   char buf[512];
-  jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
-               (has_v8() ? ", has_v8" : ""),
-               (has_v9() ? ", has_v9" : ""),
+  jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+               (has_v9() ? ", v9" : (has_v8() ? ", v8" : "")),
                (has_hardware_popc() ? ", popc" : ""),
-               (has_vis1() ? ", has_vis1" : ""),
-               (has_vis2() ? ", has_vis2" : ""),
-               (has_vis3() ? ", has_vis3" : ""),
-               (has_blk_init() ? ", has_blk_init" : ""),
-               (is_ultra3() ? ", is_ultra3" : ""),
-               (is_sun4v() ? ", is_sun4v" : ""),
-               (is_niagara() ? ", is_niagara" : ""),
-               (is_niagara_plus() ? ", is_niagara_plus" : ""),
-               (is_sparc64() ? ", is_sparc64" : ""),
+               (has_vis1() ? ", vis1" : ""),
+               (has_vis2() ? ", vis2" : ""),
+               (has_vis3() ? ", vis3" : ""),
+               (has_blk_init() ? ", blk_init" : ""),
+               (has_cbcond() ? ", cbcond" : ""),
+               (is_ultra3() ? ", ultra3" : ""),
+               (is_sun4v() ? ", sun4v" : ""),
+               (is_niagara_plus() ? ", niagara_plus" : (is_niagara() ? ", niagara" : "")),
+               (is_sparc64() ? ", sparc64" : ""),
                (!has_hardware_mul32() ? ", no-mul32" : ""),
                (!has_hardware_div32() ? ", no-div32" : ""),
                (!has_hardware_fsmuld() ? ", no-fsmuld" : ""));
--- a/src/cpu/sparc/vm/vm_version_sparc.hpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/vm_version_sparc.hpp	Thu Jul 21 11:25:07 2011 -0700
@@ -31,44 +31,46 @@
 class VM_Version: public Abstract_VM_Version {
 protected:
   enum Feature_Flag {
-    v8_instructions    = 0,
-    hardware_mul32     = 1,
-    hardware_div32     = 2,
-    hardware_fsmuld    = 3,
-    hardware_popc      = 4,
-    v9_instructions    = 5,
-    vis1_instructions  = 6,
-    vis2_instructions  = 7,
-    sun4v_instructions = 8,
+    v8_instructions      = 0,
+    hardware_mul32       = 1,
+    hardware_div32       = 2,
+    hardware_fsmuld      = 3,
+    hardware_popc        = 4,
+    v9_instructions      = 5,
+    vis1_instructions    = 6,
+    vis2_instructions    = 7,
+    sun4v_instructions   = 8,
     blk_init_instructions = 9,
-    fmaf_instructions  = 10,
-    fmau_instructions  = 11,
-    vis3_instructions  = 12,
-    sparc64_family     = 13,
-    T_family           = 14,
-    T1_model           = 15
+    fmaf_instructions    = 10,
+    fmau_instructions    = 11,
+    vis3_instructions    = 12,
+    sparc64_family       = 13,
+    T_family             = 14,
+    T1_model             = 15,
+    cbcond_instructions  = 16
   };
 
   enum Feature_Flag_Set {
     unknown_m           = 0,
     all_features_m      = -1,
 
-    v8_instructions_m   = 1 << v8_instructions,
-    hardware_mul32_m    = 1 << hardware_mul32,
-    hardware_div32_m    = 1 << hardware_div32,
-    hardware_fsmuld_m   = 1 << hardware_fsmuld,
-    hardware_popc_m     = 1 << hardware_popc,
-    v9_instructions_m   = 1 << v9_instructions,
-    vis1_instructions_m = 1 << vis1_instructions,
-    vis2_instructions_m = 1 << vis2_instructions,
-    sun4v_m             = 1 << sun4v_instructions,
+    v8_instructions_m       = 1 << v8_instructions,
+    hardware_mul32_m        = 1 << hardware_mul32,
+    hardware_div32_m        = 1 << hardware_div32,
+    hardware_fsmuld_m       = 1 << hardware_fsmuld,
+    hardware_popc_m         = 1 << hardware_popc,
+    v9_instructions_m       = 1 << v9_instructions,
+    vis1_instructions_m     = 1 << vis1_instructions,
+    vis2_instructions_m     = 1 << vis2_instructions,
+    sun4v_m                 = 1 << sun4v_instructions,
     blk_init_instructions_m = 1 << blk_init_instructions,
-    fmaf_instructions_m = 1 << fmaf_instructions,
-    fmau_instructions_m = 1 << fmau_instructions,
-    vis3_instructions_m = 1 << vis3_instructions,
-    sparc64_family_m    = 1 << sparc64_family,
-    T_family_m          = 1 << T_family,
-    T1_model_m          = 1 << T1_model,
+    fmaf_instructions_m     = 1 << fmaf_instructions,
+    fmau_instructions_m     = 1 << fmau_instructions,
+    vis3_instructions_m     = 1 << vis3_instructions,
+    sparc64_family_m        = 1 << sparc64_family,
+    T_family_m              = 1 << T_family,
+    T1_model_m              = 1 << T1_model,
+    cbcond_instructions_m   = 1 << cbcond_instructions,
 
     generic_v8_m        = v8_instructions_m | hardware_mul32_m | hardware_div32_m | hardware_fsmuld_m,
     generic_v9_m        = generic_v8_m | v9_instructions_m,
@@ -111,20 +113,25 @@
   static bool has_vis2()                { return (_features & vis2_instructions_m) != 0; }
   static bool has_vis3()                { return (_features & vis3_instructions_m) != 0; }
   static bool has_blk_init()            { return (_features & blk_init_instructions_m) != 0; }
+  static bool has_cbcond()              { return (_features & cbcond_instructions_m) != 0; }
 
   static bool supports_compare_and_exchange()
                                         { return has_v9(); }
 
-  static bool is_ultra3()               { return (_features & ultra3_m) == ultra3_m; }
-  static bool is_sun4v()                { return (_features & sun4v_m) != 0; }
   // Returns true if the platform is in the niagara line (T series)
   // and newer than the niagara1.
   static bool is_niagara_plus()         { return is_T_family(_features) && !is_T1_model(_features); }
+
   // Fujitsu SPARC64
   static bool is_sparc64()              { return (_features & sparc64_family_m) != 0; }
 
+  static bool is_sun4v()                { return (_features & sun4v_m) != 0; }
+  static bool is_ultra3()               { return (_features & ultra3_m) == ultra3_m && !is_sun4v() && !is_sparc64(); }
+
   static bool has_fast_fxtof()          { return is_niagara() || is_sparc64() || has_v9() && !is_ultra3(); }
   static bool has_fast_idiv()           { return is_niagara_plus() || is_sparc64(); }
+  // T4 and newer SPARC CPUs have a fast RDPC instruction.
+  static bool has_fast_rdpc()           { return is_niagara_plus() && has_cbcond(); }
 
   static const char* cpu_features()     { return _features_str; }
 
--- a/src/cpu/sparc/vm/vtableStubs_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/sparc/vm/vtableStubs_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -76,9 +76,7 @@
     Label L;
     // check offset vs vtable length
     __ ld(G3_scratch, instanceKlass::vtable_length_offset()*wordSize, G5);
-    __ cmp(G5, vtable_index*vtableEntry::size());
-    __ br(Assembler::greaterUnsigned, false, Assembler::pt, L);
-    __ delayed()->nop();
+    __ cmp_and_br_short(G5, vtable_index*vtableEntry::size(), Assembler::greaterUnsigned, Assembler::pt, L);
     __ set(vtable_index, O2);
     __ call_VM(noreg, CAST_FROM_FN_PTR(address, bad_compiled_vtable_index), O0, O2);
     __ bind(L);
@@ -95,8 +93,7 @@
 #ifndef PRODUCT
   if (DebugVtables) {
     Label L;
-    __ br_notnull(G5_method, false, Assembler::pt, L);
-    __ delayed()->nop();
+    __ br_notnull_short(G5_method, Assembler::pt, L);
     __ stop("Vtable entry is ZERO");
     __ bind(L);
   }
@@ -177,8 +174,7 @@
 #ifndef PRODUCT
   if (DebugVtables) {
     Label L01;
-    __ bpr(Assembler::rc_nz, false, Assembler::pt, L5_method, L01);
-    __ delayed()->nop();
+    __ br_notnull_short(L5_method, Assembler::pt, L01);
     __ stop("methodOop is null");
     __ bind(L01);
     __ verify_oop(L5_method);
--- a/src/cpu/x86/vm/x86_32.ad	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/x86/vm/x86_32.ad	Thu Jul 21 11:25:07 2011 -0700
@@ -1713,14 +1713,14 @@
     else                               emit_d32(cbuf,con);
   %}
 
-  enc_class Lbl (label labl) %{ // JMP, CALL
+  enc_class Lbl (label labl) %{ // GOTO
     Label *l = $labl$$label;
-    emit_d32(cbuf, l ? (l->loc_pos() - (cbuf.insts_size()+4)) : 0);
-  %}
-
-  enc_class LblShort (label labl) %{ // JMP, CALL
+    emit_d32(cbuf, (l->loc_pos() - (cbuf.insts_size()+4)));
+  %}
+
+  enc_class LblShort (label labl) %{ // GOTO
     Label *l = $labl$$label;
-    int disp = l ? (l->loc_pos() - (cbuf.insts_size()+1)) : 0;
+    int disp = l->loc_pos() - (cbuf.insts_size()+1);
     assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
     emit_d8(cbuf, disp);
   %}
@@ -1751,13 +1751,13 @@
     Label *l = $labl$$label;
     $$$emit8$primary;
     emit_cc(cbuf, $secondary, $cop$$cmpcode);
-    emit_d32(cbuf, l ? (l->loc_pos() - (cbuf.insts_size()+4)) : 0);
+    emit_d32(cbuf, (l->loc_pos() - (cbuf.insts_size()+4)));
   %}
 
   enc_class JccShort (cmpOp cop, label labl) %{    // JCC
     Label *l = $labl$$label;
     emit_cc(cbuf, $primary, $cop$$cmpcode);
-    int disp = l ? (l->loc_pos() - (cbuf.insts_size()+1)) : 0;
+    int disp = l->loc_pos() - (cbuf.insts_size()+1);
     assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
     emit_d8(cbuf, disp);
   %}
@@ -13172,7 +13172,7 @@
     bool ok = false;
     if ($cop$$cmpcode == Assembler::notEqual) {
        // the two jumps 6 bytes apart so the jump distances are too
-       parity_disp = l ? (l->loc_pos() - (cbuf.insts_size() + 4)) : 0;
+       parity_disp = l->loc_pos() - (cbuf.insts_size() + 4);
     } else if ($cop$$cmpcode == Assembler::equal) {
        parity_disp = 6;
        ok = true;
@@ -13182,7 +13182,7 @@
     emit_d32(cbuf, parity_disp);
     $$$emit8$primary;
     emit_cc(cbuf, $secondary, $cop$$cmpcode);
-    int disp = l ? (l->loc_pos() - (cbuf.insts_size() + 4)) : 0;
+    int disp = l->loc_pos() - (cbuf.insts_size() + 4);
     emit_d32(cbuf, disp);
   %}
   ins_pipe(pipe_jcc);
@@ -13368,7 +13368,7 @@
     emit_cc(cbuf, $primary, Assembler::parity);
     int parity_disp = -1;
     if ($cop$$cmpcode == Assembler::notEqual) {
-      parity_disp = l ? (l->loc_pos() - (cbuf.insts_size() + 1)) : 0;
+      parity_disp = l->loc_pos() - (cbuf.insts_size() + 1);
     } else if ($cop$$cmpcode == Assembler::equal) {
       parity_disp = 2;
     } else {
@@ -13376,7 +13376,7 @@
     }
     emit_d8(cbuf, parity_disp);
     emit_cc(cbuf, $primary, $cop$$cmpcode);
-    int disp = l ? (l->loc_pos() - (cbuf.insts_size() + 1)) : 0;
+    int disp = l->loc_pos() - (cbuf.insts_size() + 1);
     emit_d8(cbuf, disp);
     assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
     assert(-128 <= parity_disp && parity_disp <= 127, "Displacement too large for short jmp");
--- a/src/cpu/x86/vm/x86_64.ad	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/cpu/x86/vm/x86_64.ad	Thu Jul 21 11:25:07 2011 -0700
@@ -2428,16 +2428,16 @@
 
   enc_class Lbl(label labl)
   %{
-    // JMP, CALL
+    // GOTO
     Label* l = $labl$$label;
-    emit_d32(cbuf, l ? (l->loc_pos() - (cbuf.insts_size() + 4)) : 0);
+    emit_d32(cbuf, (l->loc_pos() - (cbuf.insts_size() + 4)));
   %}
 
   enc_class LblShort(label labl)
   %{
-    // JMP, CALL
+    // GOTO
     Label* l = $labl$$label;
-    int disp = l ? (l->loc_pos() - (cbuf.insts_size() + 1)) : 0;
+    int disp = l->loc_pos() - (cbuf.insts_size() + 1);
     assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
     emit_d8(cbuf, disp);
   %}
@@ -2466,7 +2466,7 @@
     Label* l = $labl$$label;
     $$$emit8$primary;
     emit_cc(cbuf, $secondary, $cop$$cmpcode);
-    emit_d32(cbuf, l ? (l->loc_pos() - (cbuf.insts_size() + 4)) : 0);
+    emit_d32(cbuf, (l->loc_pos() - (cbuf.insts_size() + 4)));
   %}
 
   enc_class JccShort (cmpOp cop, label labl)
@@ -2474,7 +2474,7 @@
   // JCC
     Label *l = $labl$$label;
     emit_cc(cbuf, $primary, $cop$$cmpcode);
-    int disp = l ? (l->loc_pos() - (cbuf.insts_size() + 1)) : 0;
+    int disp = l->loc_pos() - (cbuf.insts_size() + 1);
     assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
     emit_d8(cbuf, disp);
   %}
@@ -12131,7 +12131,7 @@
     int parity_disp = -1;
     if ($cop$$cmpcode == Assembler::notEqual) {
        // the two jumps 6 bytes apart so the jump distances are too
-       parity_disp = l ? (l->loc_pos() - (cbuf.insts_size() + 4)) : 0;
+       parity_disp = l->loc_pos() - (cbuf.insts_size() + 4);
     } else if ($cop$$cmpcode == Assembler::equal) {
        parity_disp = 6;
     } else {
@@ -12140,7 +12140,7 @@
     emit_d32(cbuf, parity_disp);
     $$$emit8$primary;
     emit_cc(cbuf, $secondary, $cop$$cmpcode);
-    int disp = l ? (l->loc_pos() - (cbuf.insts_size() + 4)) : 0;
+    int disp = l->loc_pos() - (cbuf.insts_size() + 4);
     emit_d32(cbuf, disp);
   %}
   ins_pipe(pipe_jcc);
@@ -12335,7 +12335,7 @@
     emit_cc(cbuf, $primary, Assembler::parity);
     int parity_disp = -1;
     if ($cop$$cmpcode == Assembler::notEqual) {
-      parity_disp = l ? (l->loc_pos() - (cbuf.insts_size() + 1)) : 0;
+      parity_disp = l->loc_pos() - (cbuf.insts_size() + 1);
     } else if ($cop$$cmpcode == Assembler::equal) {
       parity_disp = 2;
     } else {
@@ -12343,7 +12343,7 @@
     }
     emit_d8(cbuf, parity_disp);
     emit_cc(cbuf, $primary, $cop$$cmpcode);
-    int disp = l ? (l->loc_pos() - (cbuf.insts_size() + 1)) : 0;
+    int disp = l->loc_pos() - (cbuf.insts_size() + 1);
     emit_d8(cbuf, disp);
     assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
     assert(-128 <= parity_disp && parity_disp <= 127, "Displacement too large for short jmp");
--- a/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -114,6 +114,11 @@
 #endif
     if (av & AV_SPARC_VIS3)         features |= vis3_instructions_m;
 
+#ifndef AV_SPARC_CBCOND
+#define AV_SPARC_CBCOND 0x10000000  /* compare and branch instrs supported */
+#endif
+    if (av & AV_SPARC_CBCOND)       features |= cbcond_instructions_m;
+
   } else {
     // getisax(2) failed, use the old legacy code.
 #ifndef PRODUCT
--- a/src/share/vm/adlc/formssel.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/share/vm/adlc/formssel.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -3623,7 +3623,27 @@
   assert( mNode2->_opType, "Must have _opType");
   const Form *form  = globals[_opType];
   const Form *form2 = globals[mNode2->_opType];
-  return (form == form2);
+  if( form != form2 ) {
+    return false;
+  }
+
+  // Check that their children also match. NOTE(review): mNode2's child is passed on without a NULL check when mine is non-NULL - confirm the callee accepts NULL.
+  if (_lChild ) {
+    if( !_lChild->equivalent(globals, mNode2->_lChild) )
+      return false;
+  } else if (mNode2->_lChild) {
+    return false; // I have NULL left child, mNode2 has non-NULL left child.
+  }
+
+  if (_rChild ) {
+    if( !_rChild->equivalent(globals, mNode2->_rChild) )
+      return false;
+  } else if (mNode2->_rChild) {
+    return false; // I have NULL right child, mNode2 has non-NULL right child.
+  }
+
+  // Same form, and both the left and right child comparisons passed.
+  return true;
 }
 
 //-------------------------- has_commutative_op -------------------------------
--- a/src/share/vm/adlc/output_c.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/share/vm/adlc/output_c.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -3088,10 +3088,10 @@
     int label_position = instr->label_position();
     if( label_position != -1 ) {
       // Set the label
-      fprintf(fp,"void %sNode::label_set( Label& label, uint block_num ) {\n", instr->_ident);
+      fprintf(fp,"void %sNode::label_set( Label* label, uint block_num ) {\n", instr->_ident);
       fprintf(fp,"  labelOper* oper  = (labelOper*)(opnd_array(%d));\n",
               label_position );
-      fprintf(fp,"  oper->_label     = &label;\n");
+      fprintf(fp,"  oper->_label     = label;\n");
       fprintf(fp,"  oper->_block_num = block_num;\n");
       fprintf(fp,"}\n");
     }
--- a/src/share/vm/adlc/output_h.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/share/vm/adlc/output_h.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -1520,7 +1520,7 @@
     int label_position = instr->label_position();
     if( label_position != -1 ) {
       // Set the label, stored in labelOper::_branch_label
-      fprintf(fp,"  virtual void           label_set( Label& label, uint block_num );\n");
+      fprintf(fp,"  virtual void           label_set( Label* label, uint block_num );\n");
     }
 
     // If this instruction contains a methodOper
--- a/src/share/vm/opto/compile.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/share/vm/opto/compile.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -517,7 +517,17 @@
   buf.stubs()->initialize_shared_locs( &locs_buf[lsize * 2], lsize);
 
   // Do the emission.
+
+  Label fakeL; // Fake label for branch instructions; bound below before the node is emitted.
+  bool is_branch = n->is_Branch() && n->as_Mach()->ideal_Opcode() != Op_Jump;
+  if (is_branch) {
+    MacroAssembler masm(&buf);
+    masm.bind(fakeL);
+    n->as_Mach()->label_set(&fakeL, 0);
+  }
   n->emit(buf, this->regalloc());
+  if (is_branch) // fakeL is a local, so drop the pointer to it. NOTE(review): this sets NULL rather than restoring any label set earlier - confirm no caller relies on the old value.
+    n->as_Mach()->label_set(NULL, 0);
 
   // End scratch_emit_size section.
   set_in_scratch_emit_size(false);
--- a/src/share/vm/opto/machnode.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/share/vm/opto/machnode.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -409,7 +409,7 @@
 
 //------------------------------label_set--------------------------------------
 // Set the Label for a LabelOper, if an operand for this instruction
-void MachNode::label_set( Label& label, uint block_num ) {
+void MachNode::label_set( Label* label, uint block_num ) {  // base version: reaching it is treated as an error
   ShouldNotCallThis();
 }
 
@@ -514,6 +514,9 @@
 void MachNullCheckNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
   // only emits entries in the null-pointer exception handler table
 }
+void MachNullCheckNode::label_set(Label* label, uint block_num) {
+  // Intentionally empty: the label and block number are ignored.
+}
 
 const RegMask &MachNullCheckNode::in_RegMask( uint idx ) const {
   if( idx == 0 ) return RegMask::Empty;
--- a/src/share/vm/opto/machnode.hpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/share/vm/opto/machnode.hpp	Thu Jul 21 11:25:07 2011 -0700
@@ -282,7 +282,7 @@
   virtual int ideal_Opcode()     const { return Op_Node; }
 
   // Set the branch inside jump MachNodes.  Error for non-branch Nodes.
-  virtual void label_set( Label& label, uint block_num );
+  virtual void label_set( Label* label, uint block_num );
 
   // Adds the label for the case
   virtual void add_case_label( int switch_val, Label* blockLabel);
@@ -531,6 +531,7 @@
   }
 
   virtual void emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const;
+  virtual void label_set(Label* label, uint block_num);
   virtual bool pinned() const { return true; };
   virtual void negate() { }
   virtual const class Type *bottom_type() const { return TypeTuple::IFBOTH; }
@@ -853,7 +854,7 @@
 
   virtual MachOper *clone(Compile* C) const;
 
-  virtual Label *label() const { return _label; }
+  virtual Label *label() const { assert(_label != NULL, "need Label"); return _label; }
 
   virtual uint           opcode() const;
 
--- a/src/share/vm/opto/output.cpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/share/vm/opto/output.cpp	Thu Jul 21 11:25:07 2011 -0700
@@ -1346,7 +1346,7 @@
             // For Branchs
             // This requires the TRUE branch target be in succs[0]
             uint block_num = b->non_connector_successor(0)->_pre_order;
-            mach->label_set( blk_labels[block_num], block_num );
+            mach->label_set( &blk_labels[block_num], block_num );
           }
         }
 
--- a/src/share/vm/runtime/globals.hpp	Thu Jul 21 08:38:25 2011 -0700
+++ b/src/share/vm/runtime/globals.hpp	Thu Jul 21 11:25:07 2011 -0700
@@ -1205,6 +1205,9 @@
   product(bool, UseUnalignedLoadStores, false,                              \
           "Use SSE2 MOVDQU instruction for Arraycopy")                      \
                                                                             \
+  product(bool, UseCBCond, false,                                           \
+          "Use compare and branch instruction on SPARC")                    \
+                                                                            \
   product(intx, FieldsAllocationStyle, 1,                                   \
           "0 - type based with oops first, 1 - with oops last, "            \
           "2 - oops in super and sub classes are together")                 \