changeset 421:72c5366e5d86

6743900: frequency based block layout Summary: post-register allocation pass that drives block layout by edge frequencies Reviewed-by: never, kvn
author rasbold
date Thu, 06 Nov 2008 14:59:10 -0800
parents f4fe12e429a4
children 0bf25c4807f9
files src/share/vm/opto/block.cpp src/share/vm/opto/block.hpp src/share/vm/opto/c2_globals.hpp src/share/vm/opto/compile.cpp src/share/vm/opto/compile.hpp src/share/vm/opto/gcm.cpp src/share/vm/opto/output.cpp src/share/vm/opto/phase.cpp src/share/vm/opto/phase.hpp
diffstat 9 files changed, 1003 insertions(+), 134 deletions(-) [+]
line wrap: on
line diff
--- a/src/share/vm/opto/block.cpp	Thu Oct 30 17:08:48 2008 -0700
+++ b/src/share/vm/opto/block.cpp	Thu Nov 06 14:59:10 2008 -0800
@@ -57,6 +57,14 @@
   _blocks[i] = b;
 }
 
+#ifndef PRODUCT
+// Non-PRODUCT helper: print the pre-order number of every block held in
+// this list, followed by the number of blocks in the list.
+void Block_List::print() {
+  const uint count = size();
+  uint idx = 0;
+  while (idx < count) {
+    tty->print("B%d ", _blocks[idx]->_pre_order);
+    idx++;
+  }
+  tty->print("size = %d\n", size());
+}
+#endif
 
 //=============================================================================
 
@@ -66,6 +74,12 @@
   // Check for Start block
   if( _pre_order == 1 ) return InteriorEntryAlignment;
   // Check for loop alignment
+  if (has_loop_alignment())  return loop_alignment();
+
+  return 1;                     // no particular alignment
+}
+
+uint Block::compute_loop_alignment() {
   Node *h = head();
   if( h->is_Loop() && h->as_Loop()->is_inner_loop() )  {
     // Pre- and post-loops have low trip count so do not bother with
@@ -83,13 +97,15 @@
     }
     return OptoLoopAlignment; // Otherwise align loop head
   }
+
   return 1;                     // no particular alignment
 }
 
 //-----------------------------------------------------------------------------
 // Compute the size of first 'inst_cnt' instructions in this block.
 // Return the number of instructions left to compute if the block has
-// less then 'inst_cnt' instructions.
+// less than 'inst_cnt' instructions. Stop, and return 0 if sum_size
+// exceeds OptoLoopAlignment.
 uint Block::compute_first_inst_size(uint& sum_size, uint inst_cnt,
                                     PhaseRegAlloc* ra) {
   uint last_inst = _nodes.size();
@@ -307,6 +323,8 @@
     tty->print("\tLoop: B%d-B%d ", bhead->_pre_order, bx->_pre_order);
     // Dump any loop-specific bits, especially for CountedLoops.
     loop->dump_spec(tty);
+  } else if (has_loop_alignment()) {
+    tty->print(" top-of-loop");
   }
   tty->print(" Freq: %g",_freq);
   if( Verbose || WizardMode ) {
@@ -509,9 +527,11 @@
   int branch_idx = b->_nodes.size() - b->_num_succs-1;
   if( branch_idx < 1 ) return false;
   Node *bra = b->_nodes[branch_idx];
-  if( bra->is_Catch() ) return true;
+  if( bra->is_Catch() )
+    return true;
   if( bra->is_Mach() ) {
-    if( bra->is_MachNullCheck() ) return true;
+    if( bra->is_MachNullCheck() )
+      return true;
     int iop = bra->as_Mach()->ideal_Opcode();
     if( iop == Op_FastLock || iop == Op_FastUnlock )
       return true;
@@ -557,10 +577,10 @@
     dead->_nodes[k]->del_req(j);
 }
 
-//------------------------------MoveToNext-------------------------------------
+//------------------------------move_to_next-----------------------------------
 // Helper function to move block bx to the slot following b_index. Return
 // true if the move is successful, otherwise false
-bool PhaseCFG::MoveToNext(Block* bx, uint b_index) {
+bool PhaseCFG::move_to_next(Block* bx, uint b_index) {
   if (bx == NULL) return false;
 
   // Return false if bx is already scheduled.
@@ -591,9 +611,9 @@
   return true;
 }
 
-//------------------------------MoveToEnd--------------------------------------
+//------------------------------move_to_end------------------------------------
 // Move empty and uncommon blocks to the end.
-void PhaseCFG::MoveToEnd(Block *b, uint i) {
+void PhaseCFG::move_to_end(Block *b, uint i) {
   int e = b->is_Empty();
   if (e != Block::not_empty) {
     if (e == Block::empty_with_goto) {
@@ -609,15 +629,31 @@
   _blocks.push(b);
 }
 
-//------------------------------RemoveEmpty------------------------------------
-// Remove empty basic blocks and useless branches.
-void PhaseCFG::RemoveEmpty() {
+//---------------------------set_loop_alignment--------------------------------
+// Record loop alignment on every block, other than the root block in slot 0,
+// whose head node is a Loop.
+void PhaseCFG::set_loop_alignment() {
+  assert( _blocks[0] == _broot, "" );
+
+  const uint block_count = _num_blocks;
+  for (uint idx = 1; idx < block_count; idx++) {
+    Block *blk = _blocks[idx];
+    if (!blk->head()->is_Loop()) continue;
+    blk->set_loop_alignment(blk);
+  }
+}
+
+//-----------------------------remove_empty------------------------------------
+// Make empty basic blocks to be "connector" blocks, Move uncommon blocks
+// to the end.
+void PhaseCFG::remove_empty() {
   // Move uncommon blocks to the end
   uint last = _num_blocks;
-  uint i;
   assert( _blocks[0] == _broot, "" );
-  for( i = 1; i < last; i++ ) {
+
+  for (uint i = 1; i < last; i++) {
     Block *b = _blocks[i];
+    if (b->is_connector()) break;
 
     // Check for NeverBranch at block end.  This needs to become a GOTO to the
     // true target.  NeverBranch are treated as a conditional branch that
@@ -629,37 +665,40 @@
       convert_NeverBranch_to_Goto(b);
 
     // Look for uncommon blocks and move to end.
-    if( b->is_uncommon(_bbs) ) {
-      MoveToEnd(b, i);
-      last--;                   // No longer check for being uncommon!
-      if( no_flip_branch(b) ) { // Fall-thru case must follow?
-        b = _blocks[i];         // Find the fall-thru block
-        MoveToEnd(b, i);
-        last--;
+    if (!C->do_freq_based_layout()) {
+      if( b->is_uncommon(_bbs) ) {
+        move_to_end(b, i);
+        last--;                   // No longer check for being uncommon!
+        if( no_flip_branch(b) ) { // Fall-thru case must follow?
+          b = _blocks[i];         // Find the fall-thru block
+          move_to_end(b, i);
+          last--;
+        }
+        i--;                      // backup block counter post-increment
       }
-      i--;                      // backup block counter post-increment
     }
   }
 
-  // Remove empty blocks
-  uint j1;
+  // Move empty blocks to the end
   last = _num_blocks;
-  for( i=0; i < last; i++ ) {
+  for (uint i = 1; i < last; i++) {
     Block *b = _blocks[i];
-    if (i > 0) {
-      if (b->is_Empty() != Block::not_empty) {
-        MoveToEnd(b, i);
-        last--;
-        i--;
-      }
+    if (b->is_Empty() != Block::not_empty) {
+      move_to_end(b, i);
+      last--;
+      i--;
     }
   } // End of for all blocks
+}
 
+//-----------------------------fixup_flow--------------------------------------
+// Fix up the final control flow for basic blocks.
+void PhaseCFG::fixup_flow() {
   // Fixup final control flow for the blocks.  Remove jump-to-next
   // block.  If neither arm of a IF follows the conditional branch, we
   // have to add a second jump after the conditional.  We place the
   // TRUE branch target in succs[0] for both GOTOs and IFs.
-  for( i=0; i < _num_blocks; i++ ) {
+  for (uint i=0; i < _num_blocks; i++) {
     Block *b = _blocks[i];
     b->_pre_order = i;          // turn pre-order into block-index
 
@@ -700,7 +739,7 @@
         }
       }
       // Remove all CatchProjs
-      for (j1 = 0; j1 < b->_num_succs; j1++) b->_nodes.pop();
+      for (uint j1 = 0; j1 < b->_num_succs; j1++) b->_nodes.pop();
 
     } else if (b->_num_succs == 1) {
       // Block ends in a Goto?
@@ -730,8 +769,7 @@
       // successors after the current one, provided that the
       // successor was previously unscheduled, but moveable
       // (i.e., all paths to it involve a branch).
-      if( bnext != bs0 && bnext != bs1 ) {
-
+      if( !C->do_freq_based_layout() && bnext != bs0 && bnext != bs1 ) {
         // Choose the more common successor based on the probability
         // of the conditional branch.
         Block *bx = bs0;
@@ -751,9 +789,9 @@
         }
 
         // Attempt the more common successor first
-        if (MoveToNext(bx, i)) {
+        if (move_to_next(bx, i)) {
           bnext = bx;
-        } else if (MoveToNext(by, i)) {
+        } else if (move_to_next(by, i)) {
           bnext = by;
         }
       }
@@ -774,10 +812,8 @@
         // Flip projection for each target
         { ProjNode *tmp = proj0; proj0 = proj1; proj1 = tmp; }
 
-      } else if( bnext == bs1 ) { // Fall-thru is already in succs[1]
-
-      } else {                  // Else need a double-branch
-
+      } else if( bnext != bs1 ) {
+        // Need a double-branch
         // The existing conditional branch need not change.
         // Add a unconditional branch to the false target.
         // Alas, it must appear in its own block and adding a
@@ -786,8 +822,9 @@
       }
 
       // Make sure we TRUE branch to the target
-      if( proj0->Opcode() == Op_IfFalse )
+      if( proj0->Opcode() == Op_IfFalse ) {
         iff->negate();
+      }
 
       b->_nodes.pop();          // Remove IfFalse & IfTrue projections
       b->_nodes.pop();
@@ -796,9 +833,7 @@
       // Multi-exit block, e.g. a switch statement
       // But we don't need to do anything here
     }
-
   } // End of for all blocks
-
 }
 
 
@@ -905,7 +940,7 @@
   // Force the Union-Find mapping to be at least this large
   extend(max,0);
   // Initialize to be the ID mapping.
-  for( uint i=0; i<_max; i++ ) map(i,i);
+  for( uint i=0; i<max; i++ ) map(i,i);
 }
 
 //------------------------------Find_compress----------------------------------
@@ -937,7 +972,6 @@
   if( idx >= _max ) return idx;
   uint next = lookup(idx);
   while( next != idx ) {        // Scan chain of equivalences
-    assert( next < idx, "always union smaller" );
     idx = next;                 // until find a fixed-point
     next = lookup(idx);
   }
@@ -956,3 +990,491 @@
   assert( src < dst, "always union smaller" );
   map(dst,src);
 }
+
+#ifndef PRODUCT
+// Non-PRODUCT helper: print a banner, then dump every non-NULL edge of
+// 'edges' in list order.
+static void edge_dump(GrowableArray<CFGEdge *> *edges) {
+  tty->print_cr("---- Edges ----");
+  const int edge_count = edges->length();
+  for (int idx = 0; idx < edge_count; idx++) {
+    CFGEdge *edge = edges->at(idx);
+    if (edge == NULL) continue;
+    edge->dump();
+  }
+}
+
+// Non-PRODUCT helper: print a banner, then dump every non-NULL entry among
+// the first 'count' slots of 'traces'.
+static void trace_dump(Trace *traces[], int count) {
+  tty->print_cr("---- Traces ----");
+  int idx = 0;
+  while (idx < count) {
+    Trace *current = traces[idx++];
+    if (current == NULL) continue;
+    current->dump();
+  }
+}
+
+// Print this trace: a header line carrying the first block's frequency,
+// then every block in trace order.  A block whose head is a Loop is tagged
+// "(L<n>)" with its computed loop alignment; a block that has loop alignment
+// set is tagged "(T<n>)" with its code alignment.
+void Trace::dump( ) const {
+  tty->print_cr("Trace (freq %f)", first_block()->_freq);
+  Block *blk = first_block();
+  while (blk != NULL) {
+    tty->print("  B%d", blk->_pre_order);
+    if (blk->head()->is_Loop()) {
+      tty->print(" (L%d)", blk->compute_loop_alignment());
+    }
+    if (blk->has_loop_alignment()) {
+      tty->print(" (T%d)", blk->code_alignment());
+    }
+    blk = next(blk);
+  }
+  tty->cr();
+}
+
+// Print one line describing this edge: source and destination block
+// numbers, frequency, the outgoing/incoming percentages, the edge state,
+// and an "infrequent" marker when the edge is flagged as such.
+void CFGEdge::dump( ) const {
+  tty->print(" B%d  -->  B%d  Freq: %f  out:%3d%%  in:%3d%%  State: ",
+             from()->_pre_order, to()->_pre_order, freq(), _from_pct, _to_pct);
+  const int current_state = state();
+  if (current_state == connected) {
+    tty->print("connected");
+  } else if (current_state == open) {
+    tty->print("open");
+  } else if (current_state == interior) {
+    tty->print("interior");
+  }
+  if (infrequent()) {
+    tty->print("  infrequent");
+  }
+  tty->cr();
+}
+#endif
+
+//=============================================================================
+
+//------------------------------edge_order-------------------------------------
+// Sort comparator for CFGEdge pointers.  Edges with a higher frequency order
+// first; edges of equal frequency are ordered by descending distance in
+// reverse post order (to->_rpo minus from->_rpo).
+static int edge_order(CFGEdge **e0, CFGEdge **e1) {
+  CFGEdge *lhs = *e0;
+  CFGEdge *rhs = *e1;
+
+  float lhs_freq = lhs->freq();
+  float rhs_freq = rhs->freq();
+  if (lhs_freq > rhs_freq) return -1;
+  if (lhs_freq != rhs_freq) return 1;
+
+  // Frequencies tie: fall back to the rpo span of each edge.
+  int lhs_dist = lhs->to()->_rpo - lhs->from()->_rpo;
+  int rhs_dist = rhs->to()->_rpo - rhs->from()->_rpo;
+
+  return rhs_dist - lhs_dist;
+}
+
+//------------------------------trace_frequency_order--------------------------
+// Comparison function for traces, in the form expected by qsort: p0 and p1
+// each point at a Trace* element.  Ordering, in priority:
+//   1. a trace whose first block is a connector sorts after one whose
+//      first block is not;
+//   2. a trace whose first block has the higher frequency sorts first;
+//   3. remaining ties are broken by the first blocks' _rpo, ascending.
+static int trace_frequency_order(const void *p0, const void *p1) {
+  Trace *tr0 = *(Trace **) p0;
+  Trace *tr1 = *(Trace **) p1;
+  Block *b0 = tr0->first_block();
+  Block *b1 = tr1->first_block();
+
+  // The trace of connector blocks goes at the end;
+  // we only expect one such trace
+  if (b0->is_connector() != b1->is_connector()) {
+    return b1->is_connector() ? -1 : 1;
+  }
+
+  // Pull more frequently executed blocks to the beginning
+  float freq0 = b0->_freq;
+  float freq1 = b1->_freq;
+  if (freq0 != freq1) {
+    return freq0 > freq1 ? -1 : 1;
+  }
+
+  // Equal frequency: order by the first blocks' reverse-post-order numbers.
+  int diff = tr0->first_block()->_rpo - tr1->first_block()->_rpo;
+
+  return diff;
+}
+
+//------------------------------find_edges-------------------------------------
+// Find edges of interest, i.e., those which can fall through. Presumes that
+// edges which don't fall through are of low frequency and can be generally
+// ignored.  Initialize the list of traces.
+//
+// Walks _cfg._blocks in order.  Each block visited starts a new Trace, which
+// is stored in traces[] under the trace's id.  Straight-line successors are
+// appended to that same trace, and a CFGEdge is recorded for every outgoing
+// edge of the trace's final block that is a fall-through candidate.  The walk
+// stops at the first connector block; that block's trace then absorbs all
+// remaining (connector) blocks.
+void PhaseBlockLayout::find_edges()
+{
+  // Walk the blocks, creating edges and Traces
+  uint i;
+  Trace *tr = NULL;
+  for (i = 0; i < _cfg._num_blocks; i++) {
+    Block *b = _cfg._blocks[i];
+    tr = new Trace(b, next, prev);
+    traces[tr->id()] = tr;
+
+    // All connector blocks should be at the end of the list
+    if (b->is_connector()) break;
+
+    // If this block and the next one have a one-to-one successor
+    // predecessor relationship, simply append the next block
+    int nfallthru = b->num_fall_throughs();
+    while (nfallthru == 1 &&
+           b->succ_fall_through(0)) {
+      Block *n = b->_succs[0];
+
+      // Skip over single-entry connector blocks, we don't want to
+      // add them to the trace.
+      while (n->is_connector() && n->num_preds() == 1) {
+        n = n->_succs[0];
+      }
+
+      // We see a merge point, so stop search for the next block
+      if (n->num_preds() != 1) break;
+
+      i++;
+      // Compare, do not assign: an assignment here would rebind 'n' only
+      // where asserts are compiled in and could never detect a mismatch
+      // between the successor and the next block in the list.
+      assert(n == _cfg._blocks[i], "expecting next block");
+      tr->append(n);
+      uf->map(n->_pre_order, tr->id());
+      traces[n->_pre_order] = NULL;
+      // NOTE(review): nfallthru is refreshed from the block we are leaving,
+      // before 'b' advances to 'n' -- confirm this ordering is intended.
+      nfallthru = b->num_fall_throughs();
+      b = n;
+    }
+
+    if (nfallthru > 0) {
+      // Create a CFGEdge for each outgoing
+      // edge that could be a fall-through.
+      for (uint j = 0; j < b->_num_succs; j++ ) {
+        if (b->succ_fall_through(j)) {
+          Block *target = b->non_connector_successor(j);
+          float freq = b->_freq * b->succ_prob(j);
+          // Edge frequency as a percentage of the source block's frequency
+          // and of the target block's frequency, respectively.
+          int from_pct = (int) ((100 * freq) / b->_freq);
+          int to_pct = (int) ((100 * freq) / target->_freq);
+          edges->append(new CFGEdge(b, target, freq, from_pct, to_pct));
+        }
+      }
+    }
+  }
+
+  // Group connector blocks into one trace
+  for (i++; i < _cfg._num_blocks; i++) {
+    Block *b = _cfg._blocks[i];
+    assert(b->is_connector(), "connector blocks at the end");
+    tr->append(b);
+    uf->map(b->_pre_order, tr->id());
+    traces[b->_pre_order] = NULL;
+  }
+}
+
+//------------------------------union_traces----------------------------------
+// Record in uf that old_trace has been folded into updated_trace, and clear
+// the traces[] slot that no longer names a live trace.  Union is always
+// called with the smaller id first; when updated_trace holds the larger id
+// it is first re-homed under old_trace's smaller id.
+void PhaseBlockLayout::union_traces(Trace* updated_trace, Trace* old_trace)
+{
+  const uint old_id = old_trace->id();
+  const uint updated_id = updated_trace->id();
+
+  if (updated_id <= old_id) {
+    // Ids are already in order: the updated trace keeps its id.
+    uf->Union(updated_id, old_id);
+    traces[old_id] = NULL;
+    return;
+  }
+
+  // The surviving trace carries the larger id: move it into the smaller
+  // slot, then union and retire the larger one.
+  traces[old_id] = traces[updated_id];
+  updated_trace->set_id(old_id);
+  uf->Union(old_id, updated_id);
+  traces[updated_id] = NULL;
+}
+
+//------------------------------grow_traces-------------------------------------
+// Append traces together via the most frequently executed edges.
+// Edges are sorted with edge_order and visited in that order.  Only edges
+// still in the 'open' state are considered, and only when the edge's source
+// block is the last block of its trace.
+void PhaseBlockLayout::grow_traces()
+{
+  // Order the edges, and drive the growth of Traces via the most
+  // frequently executed edges.
+  edges->sort(edge_order);
+  for (int i = 0; i < edges->length(); i++) {
+    CFGEdge *e = edges->at(i);
+
+    if (e->state() != CFGEdge::open) continue;
+
+    Block *src_block = e->from();
+    Block *targ_block = e->to();
+
+    // Don't grow traces along backedges?
+    // With loop rotation disabled, an edge whose target does not come later
+    // in reverse post order than its source is skipped; its target only
+    // gets loop alignment set.
+    if (!BlockLayoutRotateLoops) {
+      if (targ_block->_rpo <= src_block->_rpo) {
+        targ_block->set_loop_alignment(targ_block);
+        continue;
+      }
+    }
+
+    Trace *src_trace = trace(src_block);
+    Trace *targ_trace = trace(targ_block);
+
+    // If the edge in question can join two traces at their ends,
+    // append one trace to the other.
+   if (src_trace->last_block() == src_block) {
+      if (src_trace == targ_trace) {
+        // Both ends already live in the same trace: the edge closes a loop.
+        e->set_state(CFGEdge::interior);
+        if (targ_trace->backedge(e)) {
+          // Reset i to catch any newly eligible edge
+          // (Or we could remember the first "open" edge, and reset there)
+          // NOTE(review): the loop's i++ still runs after this assignment,
+          // so the scan resumes at index 1 and edge 0 is not revisited.
+          i = 0;
+        }
+      } else if (targ_trace->first_block() == targ_block) {
+        // Tail of the source trace meets head of the target trace.
+        e->set_state(CFGEdge::connected);
+        src_trace->append(targ_trace);
+        union_traces(src_trace, targ_trace);
+      }
+    }
+  }
+}
+
+//------------------------------merge_traces-----------------------------------
+// Embed one trace into another, if the fork or join points are sufficiently
+// balanced.
+//
+// fall_thru_only == true:  only edges not flagged infrequent are considered,
+//   and a trace is spliced into another only where the edge touches the tail
+//   of the source trace or the head of the target trace.
+// fall_thru_only == false: the target trace is appended to the source trace
+//   for every remaining open edge, except that the trace holding the root
+//   block is never appended to another trace.
+void PhaseBlockLayout::merge_traces(bool fall_thru_only)
+{
+  // Walk the edge list another time, looking at unprocessed edges.
+  // Fold in diamonds
+  for (int i = 0; i < edges->length(); i++) {
+    CFGEdge *e = edges->at(i);
+
+    if (e->state() != CFGEdge::open) continue;
+    if (fall_thru_only) {
+      if (e->infrequent()) continue;
+    }
+
+    Block *src_block = e->from();
+    Trace *src_trace = trace(src_block);
+    bool src_at_tail = src_trace->last_block() == src_block;
+
+    Block *targ_block  = e->to();
+    Trace *targ_trace  = trace(targ_block);
+    bool targ_at_start = targ_trace->first_block() == targ_block;
+
+    if (src_trace == targ_trace) {
+      // This may be a loop, but we can't do much about it.
+      e->set_state(CFGEdge::interior);
+      continue;
+    }
+
+    if (fall_thru_only) {
+      // If the edge links the middle of two traces, we can't do anything.
+      // Leave the edge open and continue.
+      if (!src_at_tail && !targ_at_start) {
+        continue;
+      }
+
+      // Don't grow traces along backedges?
+      if (!BlockLayoutRotateLoops && (targ_block->_rpo <= src_block->_rpo)) {
+          continue;
+      }
+
+      // If both ends of the edge are available, why didn't we handle it earlier?
+      assert(src_at_tail ^ targ_at_start, "Should have caught this edge earlier.");
+
+      if (targ_at_start) {
+        // Insert the "targ" trace in the "src" trace if the insertion point
+        // is a two way branch.
+        // Better profitability check possible, but may not be worth it.
+        // Someday, see if this "fork" has an associated "join";
+        // then make a policy on merging this trace at the fork or join.
+        // For example, other things being equal, it may be better to place this
+        // trace at the join point if the "src" trace ends in a two-way, but
+        // the insertion point is one-way.
+        assert(src_block->num_fall_throughs() == 2, "unexpected diamond");
+        e->set_state(CFGEdge::connected);
+        src_trace->insert_after(src_block, targ_trace);
+        union_traces(src_trace, targ_trace);
+      } else if (src_at_tail) {
+        // Splice the source trace in front of the target block, unless the
+        // source trace is the one holding the root block.
+        if (src_trace != trace(_cfg._broot)) {
+          e->set_state(CFGEdge::connected);
+          targ_trace->insert_before(targ_block, src_trace);
+          union_traces(targ_trace, src_trace);
+        }
+      }
+    } else if (e->state() == CFGEdge::open) {
+      // Append traces, even without a fall-thru connection.
+      // But leave root entry at the beginning of the block list.
+      if (targ_trace != trace(_cfg._broot)) {
+        e->set_state(CFGEdge::connected);
+        src_trace->append(targ_trace);
+        union_traces(src_trace, targ_trace);
+      }
+    }
+  }
+}
+
+//----------------------------reorder_traces-----------------------------------
+// Order the sequence of the traces in some desirable way, and fixup the
+// jumps at the end of each block.
+//
+// 'count' is the number of slots of traces[] to scan.  The non-NULL entries
+// are compacted into a temporary array, every entry after the first is
+// sorted with trace_frequency_order, and the CFG block list is then rebuilt
+// by letting each trace push its blocks in that order.
+void PhaseBlockLayout::reorder_traces(int count)
+{
+  ResourceArea *area = Thread::current()->resource_area();
+  Trace ** new_traces = NEW_ARENA_ARRAY(area, Trace *, count);
+  int new_count = 0;
+
+  // Compact the traces.
+  for (int i = 0; i < count; i++) {
+    Trace *tr = traces[i];
+    if (tr != NULL) {
+      new_traces[new_count++] = tr;
+    }
+  }
+
+  // The entry block should be first on the new trace list.
+  Trace *tr = trace(_cfg._broot);
+  assert(tr == new_traces[0], "entry trace misplaced");
+
+  // Sort the new trace list by frequency; slot 0 (the entry trace) is
+  // excluded from the sort so it stays first.
+  qsort(new_traces + 1, new_count - 1, sizeof(new_traces[0]), trace_frequency_order);
+
+  // Patch up the successor blocks
+  _cfg._blocks.reset();
+  _cfg._num_blocks = 0;
+  for (int i = 0; i < new_count; i++) {
+    Trace *sorted_trace = new_traces[i];
+    if (sorted_trace != NULL) {
+      sorted_trace->fixup_blocks(_cfg);
+    }
+  }
+}
+
+//------------------------------PhaseBlockLayout-------------------------------
+// Order basic blocks based on frequency.
+// The whole layout pass runs inside this constructor: it allocates the
+// working tables, builds the initial traces and edges, grows and merges the
+// traces, and finally rewrites cfg's block list in the new order.
+PhaseBlockLayout::PhaseBlockLayout(PhaseCFG &cfg) :
+  Phase(BlockLayout),
+  _cfg(cfg)
+{
+  // NOTE(review): the tables below are allocated from the resource area
+  // under this ResourceMark, so they are presumably only valid while the
+  // constructor is running -- confirm nothing reads them afterwards.
+  ResourceMark rm;
+  ResourceArea *area = Thread::current()->resource_area();
+
+  // List of traces
+  // All three tables get one slot more than the current number of blocks
+  // and start out zeroed.
+  int size = _cfg._num_blocks + 1;
+  traces = NEW_ARENA_ARRAY(area, Trace *, size);
+  memset(traces, 0, size*sizeof(Trace*));
+  next = NEW_ARENA_ARRAY(area, Block *, size);
+  memset(next,   0, size*sizeof(Block *));
+  prev = NEW_ARENA_ARRAY(area, Block *, size);
+  memset(prev  , 0, size*sizeof(Block *));
+
+  // List of edges
+  edges = new GrowableArray<CFGEdge*>;
+
+  // Mapping block index --> block_trace
+  uf = new UnionFind(size);
+  uf->reset(size);
+
+  // Find edges and create traces.
+  find_edges();
+
+  // Grow traces at their ends via most frequent edges.
+  grow_traces();
+
+  // Merge one trace into another, but only at fall-through points.
+  // This may make diamonds and other related shapes in a trace.
+  merge_traces(true);
+
+  // Run merge again, allowing two traces to be catenated, even if
+  // one does not fall through into the other. This appends loosely
+  // related traces to be near each other.
+  merge_traces(false);
+
+  // Re-order all the remaining traces by frequency
+  reorder_traces(size);
+
+  // reorder_traces rebuilt _cfg's block list; it must hold at least as many
+  // blocks as it did on entry (size - 1).
+  assert(_cfg._num_blocks >= (uint) (size - 1), "number of blocks can not shrink");
+}
+
+
+//------------------------------backedge---------------------------------------
+// Edge e completes a loop in a trace. If the target block is head of the
+// loop, rotate the loop block so that the loop ends in a conditional branch.
+// Also sets loop alignment, either on the trace's first block or on the
+// edge's target block.  Returns true only if the trace was rotated.
+bool Trace::backedge(CFGEdge *e) {
+  bool loop_rotated = false;
+  Block *src_block  = e->from();
+  Block *targ_block    = e->to();
+
+  assert(last_block() == src_block, "loop discovery at back branch");
+  if (first_block() == targ_block) {
+    // Rotation is attempted only when enabled and the trace does not
+    // already end in a block with two fall-through candidates.
+    if (BlockLayoutRotateLoops && last_block()->num_fall_throughs() < 2) {
+      // Find the last block in the trace that has a conditional
+      // branch.
+      Block *b;
+      for (b = last_block(); b != NULL; b = prev(b)) {
+        if (b->num_fall_throughs() == 2) {
+          break;
+        }
+      }
+
+      if (b != last_block() && b != NULL) {
+        loop_rotated = true;
+
+        // Rotate the loop by doing two-part linked-list surgery.
+        append(first_block());
+        break_loop_after(b);
+      }
+    }
+
+    // Backbranch to the top of a trace
+    // Scroll forward through the trace from the targ_block. If we find
+    // a loop head before another loop top, use the loop head alignment.
+    for (Block *b = targ_block; b != NULL; b = next(b)) {
+      if (b->has_loop_alignment()) {
+        break;
+      }
+      if (b->head()->is_Loop()) {
+        targ_block = b;
+        break;
+      }
+    }
+
+    // The alignment is computed from targ_block (possibly updated by the
+    // scan above) but recorded on the trace's current first block.
+    first_block()->set_loop_alignment(targ_block);
+
+  } else {
+    // Backbranch into the middle of a trace
+    targ_block->set_loop_alignment(targ_block);
+  }
+
+  return loop_rotated;
+}
+
+//------------------------------fixup_blocks-----------------------------------
+// Append every block of this trace, in trace order, to cfg's block list.
+// For each non-connector block, other than the trace's last block, that has
+// two fall-through candidates: if the block that follows it in the trace is
+// its successor 0 (after forwarding through connectors), swap the two
+// successors and their projections so the fall-through ends up in succs[1].
+void Trace::fixup_blocks(PhaseCFG &cfg) {
+  Block *tail = last_block();
+  for (Block *blk = first_block(); blk != NULL; blk = next(blk)) {
+    cfg._blocks.push(blk);
+    cfg._num_blocks++;
+
+    if (blk->is_connector()) continue;
+
+    int nfallthru = blk->num_fall_throughs();
+    if (blk == tail || nfallthru != 2) continue;
+
+    // Ensure that the sense of the branch is correct
+    Block *following = next(blk);
+    Block *succ0 = blk->non_connector_successor(0);
+
+    // The last three nodes are read as the branch and its two projections.
+    MachNode *iff = blk->_nodes[blk->_nodes.size()-3]->as_Mach();
+    ProjNode *proj0 = blk->_nodes[blk->_nodes.size()-2]->as_Proj();
+    ProjNode *proj1 = blk->_nodes[blk->_nodes.size()-1]->as_Proj();
+
+    if (following != succ0) continue;
+
+    // Fall-thru case in succs[0], should be in succs[1]
+
+    // Flip targets in _succs map
+    Block *old_succ0 = blk->_succs[0];
+    Block *old_succ1 = blk->_succs[1];
+    blk->_succs.map( 0, old_succ1 );
+    blk->_succs.map( 1, old_succ0 );
+
+    // Flip projections to match targets
+    blk->_nodes.map(blk->_nodes.size()-2, proj1);
+    blk->_nodes.map(blk->_nodes.size()-1, proj0);
+  }
+}
--- a/src/share/vm/opto/block.hpp	Thu Oct 30 17:08:48 2008 -0700
+++ b/src/share/vm/opto/block.hpp	Thu Nov 06 14:59:10 2008 -0800
@@ -75,6 +75,7 @@
   void insert( uint i, Block *n );
   uint size() const { return _cnt; }
   void reset() { _cnt = 0; }
+  void print();
 };
 
 
@@ -129,7 +130,11 @@
   uint _rpo;                    // Number in reverse post order walk
 
   virtual bool is_block() { return true; }
-  float succ_prob(uint i); // return probability of i'th successor
+  float succ_prob(uint i);      // return probability of i'th successor
+  int num_fall_throughs();      // How many fall-through candidate this block has
+  void update_uncommon_branch(Block* un); // Lower branch prob to uncommon code
+  bool succ_fall_through(uint i); // Is successor "i" is a fall-through candidate
+  Block* lone_fall_through();   // Return lone fall-through Block or null
 
   Block* dom_lca(Block* that);  // Compute LCA in dominator tree.
 #ifdef ASSERT
@@ -144,6 +149,7 @@
   // Report the alignment required by this block.  Must be a power of 2.
   // The previous block will insert nops to get this alignment.
   uint code_alignment();
+  uint compute_loop_alignment();
 
   // BLOCK_FREQUENCY is a sentinel to mark uses of constant block frequencies.
   // It is currently also used to scale such frequencies relative to
@@ -184,11 +190,12 @@
       int current_alignment = current_offset & max_pad;
       if( current_alignment != 0 ) {
         uint padding = (block_alignment-current_alignment) & max_pad;
-        if( !head()->is_Loop() ||
-            padding <= (uint)MaxLoopPad ||
-            first_inst_size() > padding ) {
-          return padding;
+        if( has_loop_alignment() &&
+            padding > (uint)MaxLoopPad &&
+            first_inst_size() <= padding ) {
+          return 0;
         }
+        return padding;
       }
     }
     return 0;
@@ -202,6 +209,21 @@
   void set_connector() { _connector = true; }
   bool is_connector() const { return _connector; };
 
+  // Loop_alignment will be set for blocks which are at the top of loops.
+  // The block layout pass may rotate loops such that the loop head may not
+  // be the sequentially first block of the loop encountered in the linear
+  // list of blocks.  If the layout pass is not run, loop alignment is set
+  // for each block which is the head of a loop.
+  uint _loop_alignment;
+  // Raise this block's loop alignment to the alignment computed for
+  // 'loop_top'.  An existing alignment that is already at least as large
+  // is left untouched.
+  void set_loop_alignment(Block *loop_top) {
+    const uint candidate = loop_top->compute_loop_alignment();
+    if (candidate <= _loop_alignment) return;
+    _loop_alignment = candidate;
+  }
+  // Loop alignment recorded for this block; 0 when none has been set.
+  uint loop_alignment() const { return _loop_alignment; }
+  // True once a non-zero loop alignment has been recorded for this block.
+  bool has_loop_alignment() const { return loop_alignment() > 0; }
+
   // Create a new Block with given head Node.
   // Creates the (empty) predecessor arrays.
   Block( Arena *a, Node *headnode )
@@ -219,7 +241,8 @@
       _raise_LCA_mark(0),
       _raise_LCA_visited(0),
       _first_inst_size(999999),
-      _connector(false) {
+      _connector(false),
+      _loop_alignment(0) {
     _nodes.push(headnode);
   }
 
@@ -275,6 +298,16 @@
     return s;
   }
 
+  // Return true if b is one of this block's successors, where each
+  // successor is first forwarded through any connector blocks.
+  bool has_successor(Block* b) const {
+    uint idx = 0;
+    while (idx < _num_succs) {
+      if (non_connector_successor(idx) == b) return true;
+      idx++;
+    }
+    return false;
+  }
+
   // Successor block, after forwarding through connectors
   Block* non_connector_successor(int i) const {
     return _succs[i]->non_connector();
@@ -319,7 +352,6 @@
 
   // I'll need a few machine-specific GotoNodes.  Clone from this one.
   MachNode *_goto;
-  void insert_goto_at(uint block_no, uint succ_no);
 
   Block* insert_anti_dependences(Block* LCA, Node* load, bool verify = false);
   void verify_anti_dependences(Block* LCA, Node* load) {
@@ -379,10 +411,15 @@
   // Compute the instruction global latency with a backwards walk
   void ComputeLatenciesBackwards(VectorSet &visited, Node_List &stack);
 
+  // Set loop alignment
+  void set_loop_alignment();
+
   // Remove empty basic blocks
-  void RemoveEmpty();
-  bool MoveToNext(Block* bx, uint b_index);
-  void MoveToEnd(Block* bx, uint b_index);
+  void remove_empty();
+  void fixup_flow();
+  bool move_to_next(Block* bx, uint b_index);
+  void move_to_end(Block* bx, uint b_index);
+  void insert_goto_at(uint block_no, uint succ_no);
 
   // Check for NeverBranch at block end.  This needs to become a GOTO to the
   // true target.  NeverBranch are treated as a conditional branch that always
@@ -413,7 +450,7 @@
 };
 
 
-//------------------------------UnionFindInfo----------------------------------
+//------------------------------UnionFind--------------------------------------
 // Map Block indices to a block-index for a cfg-cover.
 // Array lookup in the optimized case.
 class UnionFind : public ResourceObj {
@@ -508,3 +545,166 @@
   void dump_tree() const;
 #endif
 };
+
+
+//----------------------------------CFGEdge------------------------------------
+// An edge between two basic blocks that will be embodied by a branch or a
+// fall-through.
+class CFGEdge : public ResourceObj {
+ private:
+  Block * _from;        // Source basic block
+  Block * _to;          // Destination basic block
+  float _freq;          // Execution frequency (estimate)
+  int   _state;         // One of open, connected or interior (see enum below)
+  bool  _infrequent;    // True if either percentage is below BlockLayoutMinDiamondPercentage
+  int   _from_pct;      // Percentage for the source side (NOTE(review): basis not visible here)
+  int   _to_pct;        // Percentage for the destination side (NOTE(review): basis not visible here)
+
+  // Private accessors
+  int  from_pct() const { return _from_pct; }
+  int  to_pct()   const { return _to_pct;   }
+  bool from_infrequent() const { return from_pct() < BlockLayoutMinDiamondPercentage; }
+  bool to_infrequent()   const { return to_pct()   < BlockLayoutMinDiamondPercentage; }
+
+ public:
+  enum {
+    open,               // initial edge state; unprocessed
+    connected,          // edge used to connect two traces together
+    interior            // edge is interior to trace (could be backedge)
+  };
+
+  // Initializers follow declaration order; _infrequent is derived in the body.
+  CFGEdge(Block *from, Block *to, float freq, int from_pct, int to_pct) :
+    _from(from), _to(to), _freq(freq), _state(open),
+    _from_pct(from_pct), _to_pct(to_pct) {
+    _infrequent = from_infrequent() || to_infrequent();
+  }
+
+  float  freq() const { return _freq; }
+  Block* from() const { return _from; }
+  Block* to  () const { return _to;   }
+  int  infrequent() const { return _infrequent; }
+  int state() const { return _state; }
+  void set_state(int state) { _state = state; }
+
+#ifndef PRODUCT
+  void dump( ) const;
+#endif
+};
+
+
+//-----------------------------------Trace-------------------------------------
+// An ordered list of basic blocks.
+class Trace : public ResourceObj {
+ private:
+  uint _id;             // Unique Trace id (derived from initial block)
+  Block ** _next_list;  // Array mapping index to next block
+  Block ** _prev_list;  // Array mapping index to previous block
+  Block * _first;       // First block in the trace
+  Block * _last;        // Last block in the trace
+
+  // Return the block that follows "b" in the trace.
+  Block * next(Block *b) const { return _next_list[b->_pre_order]; }
+  void set_next(Block *b, Block *n) const { _next_list[b->_pre_order] = n; }
+
+  // Return the block that precedes "b" in the trace.
+  Block * prev(Block *b) const { return _prev_list[b->_pre_order]; }
+  void set_prev(Block *b, Block *p) const { _prev_list[b->_pre_order] = p; }
+
+  // We've discovered a loop in this trace. Reset last to be "b", and first as
+  // the block following "b".
+  void break_loop_after(Block *b) {
+    _last = b;
+    _first = next(b);
+    set_prev(_first, NULL);
+    set_next(_last, NULL);
+  }
+
+ public:
+  // Start a trace holding only b. Initializers follow member declaration order.
+  Trace(Block *b, Block **next_list, Block **prev_list) :
+    _id(b->_pre_order),
+    _next_list(next_list),
+    _prev_list(prev_list),
+    _first(b),
+    _last(b) {
+    set_next(b, NULL);
+    set_prev(b, NULL);
+  }
+
+  // Return the id number
+  uint id() const { return _id; }
+  void set_id(uint id) { _id = id; }
+
+  // Return the first block in the trace
+  Block * first_block() const { return _first; }
+
+  // Return the last block in the trace
+  Block * last_block() const { return _last; }
+
+  // Insert a trace in the middle of this one after b
+  void insert_after(Block *b, Trace *tr) {
+    set_next(tr->last_block(), next(b));
+    if (next(b) != NULL) {
+      set_prev(next(b), tr->last_block());
+    }
+
+    set_next(b, tr->first_block());
+    set_prev(tr->first_block(), b);
+    if (b == _last) {
+      _last = tr->last_block();
+    }
+  }
+
+  // Insert a trace in the middle of this one before b; b needs a predecessor
+  void insert_before(Block *b, Trace *tr) {
+    Block *p = prev(b);
+    assert(p != NULL, "use append instead");
+    insert_after(p, tr);
+  }
+
+  // Append another trace to this one.
+  void append(Trace *tr) {
+    insert_after(_last, tr);
+  }
+
+  // Append a block at the end of this trace
+  void append(Block *b) {
+    set_next(_last, b);
+    set_prev(b, _last);
+    _last = b;
+  }
+
+  // Adjust the blocks in this trace
+  void fixup_blocks(PhaseCFG &cfg);
+  bool backedge(CFGEdge *e);
+
+#ifndef PRODUCT
+  void dump( ) const;
+#endif
+};
+
+//------------------------------PhaseBlockLayout-------------------------------
+// Rearrange blocks into some canonical order, based on edges and their frequencies
+class PhaseBlockLayout : public Phase {
+  PhaseCFG &_cfg;               // Control flow graph
+
+  GrowableArray<CFGEdge *> *edges;  // List of CFGEdge pointers (presumably filled by find_edges() -- confirm)
+  Trace **traces;                   // Traces, indexed by the union-find result used in trace()
+  Block **next;                     // NOTE(review): presumably the next-block array handed to Trace -- confirm
+  Block **prev;                     // NOTE(review): presumably the prev-block array handed to Trace -- confirm
+  UnionFind *uf;                    // Maps a block's _pre_order to its index in traces
+
+  // Given a block, find its encompassing Trace
+  Trace * trace(Block *b) {
+    return traces[uf->Find_compress(b->_pre_order)];
+  }
+ public:
+  PhaseBlockLayout(PhaseCFG &cfg);
+
+  void find_edges();
+  void grow_traces();
+  void merge_traces(bool loose_connections);
+  void reorder_traces(int count);
+  void union_traces(Trace* from, Trace* to);
+};
--- a/src/share/vm/opto/c2_globals.hpp	Thu Oct 30 17:08:48 2008 -0700
+++ b/src/share/vm/opto/c2_globals.hpp	Thu Nov 06 14:59:10 2008 -0800
@@ -396,5 +396,15 @@
                                                                             \
   diagnostic(intx, DominatorSearchLimit, 1000,                              \
           "Iterations limit in Node::dominates")                            \
+                                                                            \
+  product(bool, BlockLayoutByFrequency, true,                               \
+          "Use edge frequencies to drive block ordering")                   \
+                                                                            \
+  product(intx, BlockLayoutMinDiamondPercentage, 20,                        \
+          "Minimum %% of a successor (predecessor) for which block layout " \
+          "will allow a fork (join) in a single chain")                     \
+                                                                            \
+  product(bool, BlockLayoutRotateLoops, false,                              \
+          "Allow back branches to be fall throughs in the block layout")    \
 
 C2_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_NOTPRODUCT_FLAG)
--- a/src/share/vm/opto/compile.cpp	Thu Oct 30 17:08:48 2008 -0700
+++ b/src/share/vm/opto/compile.cpp	Thu Nov 06 14:59:10 2008 -0800
@@ -822,6 +822,7 @@
   Copy::zero_to_bytes(_trap_hist, sizeof(_trap_hist));
   set_decompile_count(0);
 
+  set_do_freq_based_layout(BlockLayoutByFrequency || method_has_option("BlockLayoutByFrequency"));
   // Compilation level related initialization
   if (env()->comp_level() == CompLevel_fast_compile) {
     set_num_loop_opts(Tier1LoopOptsCount);
@@ -1701,8 +1702,14 @@
   // are not adding any new instructions.  If any basic block is empty, we
   // can now safely remove it.
   {
-    NOT_PRODUCT( TracePhase t2("removeEmpty", &_t_removeEmptyBlocks, TimeCompiler); )
-    cfg.RemoveEmpty();
+    NOT_PRODUCT( TracePhase t2("blockOrdering", &_t_blockOrdering, TimeCompiler); )
+    cfg.remove_empty();
+    if (do_freq_based_layout()) {
+      PhaseBlockLayout layout(cfg);
+    } else {
+      cfg.set_loop_alignment();
+    }
+    cfg.fixup_flow();
   }
 
   // Perform any platform dependent postallocation verifications.
--- a/src/share/vm/opto/compile.hpp	Thu Oct 30 17:08:48 2008 -0700
+++ b/src/share/vm/opto/compile.hpp	Thu Nov 06 14:59:10 2008 -0800
@@ -154,6 +154,7 @@
   uint                  _decompile_count;       // Cumulative decompilation counts.
   bool                  _do_inlining;           // True if we intend to do inlining
   bool                  _do_scheduling;         // True if we intend to do scheduling
+  bool                  _do_freq_based_layout;  // True if we intend to do frequency based block layout
   bool                  _do_count_invocations;  // True if we generate code to count invocations
   bool                  _do_method_data_update; // True if we generate code to update methodDataOops
   int                   _AliasLevel;            // Locally-adjusted version of AliasLevel flag.
@@ -307,6 +308,8 @@
   void          set_do_inlining(bool z)         { _do_inlining = z; }
   bool              do_scheduling() const       { return _do_scheduling; }
   void          set_do_scheduling(bool z)       { _do_scheduling = z; }
+  bool              do_freq_based_layout() const{ return _do_freq_based_layout; }
+  void          set_do_freq_based_layout(bool z){ _do_freq_based_layout = z; }
   bool              do_count_invocations() const{ return _do_count_invocations; }
   void          set_do_count_invocations(bool z){ _do_count_invocations = z; }
   bool              do_method_data_update() const { return _do_method_data_update; }
--- a/src/share/vm/opto/gcm.cpp	Thu Oct 30 17:08:48 2008 -0700
+++ b/src/share/vm/opto/gcm.cpp	Thu Nov 06 14:59:10 2008 -0800
@@ -1319,11 +1319,33 @@
 //------------------------------Estimate_Block_Frequency-----------------------
 // Estimate block frequencies based on IfNode probabilities.
 void PhaseCFG::Estimate_Block_Frequency() {
-  int cnts = C->method() ? C->method()->interpreter_invocation_count() : 1;
-  // Most of our algorithms will die horribly if frequency can become
-  // negative so make sure cnts is a sane value.
-  if( cnts <= 0 ) cnts = 1;
-  float f = (float)cnts/(float)FreqCountInvocations;
+
+  // Force conditional branches leading to uncommon traps to be unlikely,
+  // not because we get to the uncommon_trap with less relative frequency,
+  // but because an uncommon_trap typically causes a deopt, so we only get
+  // there once.
+  if (C->do_freq_based_layout()) {
+    Block_List worklist;
+    Block* root_blk = _blocks[0];
+    for (uint i = 1; i < root_blk->num_preds(); i++) {
+      Block *pb = _bbs[root_blk->pred(i)->_idx];
+      if (pb->has_uncommon_code()) {
+        worklist.push(pb);
+      }
+    }
+    while (worklist.size() > 0) {
+      Block* uct = worklist.pop();
+      if (uct == _broot) continue;
+      for (uint i = 1; i < uct->num_preds(); i++) {
+        Block *pb = _bbs[uct->pred(i)->_idx];
+        if (pb->_num_succs == 1) {
+          worklist.push(pb);
+        } else if (pb->num_fall_throughs() == 2) {
+          pb->update_uncommon_branch(uct);
+        }
+      }
+    }
+  }
 
   // Create the loop tree and calculate loop depth.
   _root_loop = create_loop_tree();
@@ -1333,25 +1355,27 @@
   _root_loop->compute_freq();
 
   // Adjust all frequencies to be relative to a single method entry
-  _root_loop->_freq = f * 1.0;
+  _root_loop->_freq = 1.0;
   _root_loop->scale_freq();
 
   // force paths ending at uncommon traps to be infrequent
-  Block_List worklist;
-  Block* root_blk = _blocks[0];
-  for (uint i = 0; i < root_blk->num_preds(); i++) {
-    Block *pb = _bbs[root_blk->pred(i)->_idx];
-    if (pb->has_uncommon_code()) {
-      worklist.push(pb);
+  if (!C->do_freq_based_layout()) {
+    Block_List worklist;
+    Block* root_blk = _blocks[0];
+    for (uint i = 1; i < root_blk->num_preds(); i++) {
+      Block *pb = _bbs[root_blk->pred(i)->_idx];
+      if (pb->has_uncommon_code()) {
+        worklist.push(pb);
+      }
     }
-  }
-  while (worklist.size() > 0) {
-    Block* uct = worklist.pop();
-    uct->_freq = PROB_MIN;
-    for (uint i = 0; i < uct->num_preds(); i++) {
-      Block *pb = _bbs[uct->pred(i)->_idx];
-      if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) {
-        worklist.push(pb);
+    while (worklist.size() > 0) {
+      Block* uct = worklist.pop();
+      uct->_freq = PROB_MIN;
+      for (uint i = 1; i < uct->num_preds(); i++) {
+        Block *pb = _bbs[uct->pred(i)->_idx];
+        if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) {
+          worklist.push(pb);
+        }
       }
     }
   }
@@ -1556,22 +1580,6 @@
     }
   }
 
-#if 0
-  // Raise frequency of the loop backedge block, in an effort
-  // to keep it empty.  Skip the method level "loop".
-  if (_parent != NULL) {
-    CFGElement* s = _members.at(_members.length() - 1);
-    if (s->is_block()) {
-      Block* bk = s->as_Block();
-      if (bk->_num_succs == 1 && bk->_succs[0] == hd) {
-        // almost any value >= 1.0f works
-        // FIXME: raw constant
-        bk->_freq = 1.05f;
-      }
-    }
-  }
-#endif
-
   // For all loops other than the outer, "method" loop,
   // sum and normalize the exit probability. The "method" loop
   // should keep the initial exit probability of 1, so that
@@ -1589,12 +1597,15 @@
     // the probability of exit per loop entry.
     for (int i = 0; i < _exits.length(); i++) {
       Block* et = _exits.at(i).get_target();
-      float new_prob = _exits.at(i).get_prob() / exits_sum;
+      float new_prob = 0.0f;
+      if (_exits.at(i).get_prob() > 0.0f) {
+        new_prob = _exits.at(i).get_prob() / exits_sum;
+      }
       BlockProbPair bpp(et, new_prob);
       _exits.at_put(i, bpp);
     }
 
-    // Save the total, but guard against unreasoable probability,
+    // Save the total, but guard against unreasonable probability,
     // as the value is used to estimate the loop trip count.
     // An infinite trip count would blur relative block
     // frequencies.
@@ -1688,6 +1699,137 @@
   return 0.0f;
 }
 
+//------------------------------num_fall_throughs-----------------------------
+// Count how many of this block's successors are fall-through candidates,
+// judged by the kind of node that ends the block.
+int Block::num_fall_throughs() {
+  int last = end_idx();
+  Node *tail = _nodes[last];    // node that terminates the block
+
+  int opcode = tail->Opcode();
+  if (tail->is_Mach()) {
+    if (tail->is_MachNullCheck()) {
+      // Either side could fall through in theory; to keep things simple
+      // only the false branch is treated as a candidate for now.
+      return 1;
+    }
+    opcode = tail->as_Mach()->ideal_Opcode();
+  }
+
+  int candidates = 0;
+  switch (opcode) {
+  case Op_CountedLoopEnd:
+  case Op_If:
+    candidates = 2;             // both sides are candidates
+    break;
+
+  case Op_Root:
+  case Op_Goto:
+    candidates = 1;
+    break;
+
+  case Op_Catch:
+    // A candidate exists only if some projection is the fall-through one
+    for (uint s = 0; s < _num_succs && candidates == 0; s++) {
+      const CatchProjNode *proj = _nodes[s + last + 1]->as_CatchProj();
+      if (proj->_con == CatchProjNode::fall_through_index) candidates = 1;
+    }
+    break;
+
+  case Op_Jump:
+  case Op_NeverBranch:
+  case Op_TailCall:
+  case Op_TailJump:
+  case Op_Return:
+  case Op_Halt:
+  case Op_Rethrow:
+    break;                      // no fall-through candidate
+
+  default:
+    ShouldNotReachHere();
+  }
+  return candidates;
+}
+
+//------------------------------succ_fall_through-----------------------------
+// Return true if successor number i could be this block's fall-through target.
+bool Block::succ_fall_through(uint i) {
+  int last = end_idx();
+  Node *tail = _nodes[last];    // node that terminates the block
+
+  int opcode = tail->Opcode();
+  if (tail->is_Mach()) {
+    if (tail->is_MachNullCheck()) {
+      // Either side could fall through in theory; to keep things simple
+      // only the false branch is treated as a candidate for now.
+      return _nodes[i + last + 1]->Opcode() == Op_IfFalse;
+    }
+    opcode = tail->as_Mach()->ideal_Opcode();
+  }
+
+  bool can_fall_through = false;
+  switch (opcode) {
+  case Op_CountedLoopEnd:
+  case Op_If:
+  case Op_Root:
+  case Op_Goto:
+    can_fall_through = true;
+    break;
+
+  case Op_Catch:
+    // Only the fall-through projection qualifies
+    can_fall_through = (_nodes[i + last + 1]->as_CatchProj()->_con == CatchProjNode::fall_through_index);
+    break;
+
+  case Op_Jump:
+  case Op_NeverBranch:
+  case Op_TailCall:
+  case Op_TailJump:
+  case Op_Return:
+  case Op_Halt:
+  case Op_Rethrow:
+    break;                      // no successor qualifies
+
+  default:
+    ShouldNotReachHere();
+  }
+  return can_fall_through;
+}
+
+//------------------------------update_uncommon_branch------------------------
+// Adjust the probability stored on the two-way branch that ends this block
+// so that the path leading to successor ub becomes uncommon.
+void Block::update_uncommon_branch(Block* ub) {
+  int last = end_idx();
+  Node *branch = _nodes[last];  // node that terminates the block
+
+  int op = branch->as_Mach()->ideal_Opcode();
+
+  assert(op == Op_CountedLoopEnd || op == Op_If, "must be a If");
+  assert(num_fall_throughs() == 2, "must be a two way branch block");
+
+  // Locate ub among the successors
+  uint idx = 0;
+  while (idx < _num_succs && _succs[idx] != ub) {
+    idx++;
+  }
+  assert(idx < 2, "uncommon successor must be found");
+
+  // When ub is the true path the probability is made small; when ub is
+  // the false path the probability is made large instead.
+  bool on_false_path = (_nodes[idx + last + 1]->Opcode() == Op_IfFalse);
+
+  float p = branch->as_MachIf()->_prob;
+
+  if (on_false_path) p = 1.0 - p;   // work on the complement for the false path
+  if (p > PROB_MIN) {
+    p = PROB_MIN;
+  }
+  if (on_false_path) p = 1.0 - p;   // undo the complement before storing
+
+  branch->as_MachIf()->_prob = p;
+}
+
 //------------------------------update_succ_freq-------------------------------
 // Update the appropriate frequency associated with block 'b', a succesor of
 // a block in this loop.
--- a/src/share/vm/opto/output.cpp	Thu Oct 30 17:08:48 2008 -0700
+++ b/src/share/vm/opto/output.cpp	Thu Nov 06 14:59:10 2008 -0800
@@ -263,7 +263,7 @@
 # endif // ENABLE_ZAP_DEAD_LOCALS
 
 //------------------------------compute_loop_first_inst_sizes------------------
-// Compute the size of first NumberOfLoopInstrToAlign instructions at head
+// Compute the size of first NumberOfLoopInstrToAlign instructions at the top
 // of a loop. When aligning a loop we need to provide enough instructions
 // in cpu's fetch buffer to feed decoders. The loop alignment could be
 // avoided if we have enough instructions in fetch buffer at the head of a loop.
@@ -284,34 +284,23 @@
     for( uint i=1; i <= last_block; i++ ) {
       Block *b = _cfg->_blocks[i];
       // Check the first loop's block which requires an alignment.
-      if( b->head()->is_Loop() &&
-          b->code_alignment() > (uint)relocInfo::addr_unit() ) {
+      if( b->loop_alignment() > (uint)relocInfo::addr_unit() ) {
         uint sum_size = 0;
         uint inst_cnt = NumberOfLoopInstrToAlign;
-        inst_cnt = b->compute_first_inst_size(sum_size, inst_cnt,
-                                              _regalloc);
-        // Check the next fallthrough block if first loop's block does not have
-        // enough instructions.
-        if( inst_cnt > 0 && i < last_block ) {
-          // First, check if the first loop's block contains whole loop.
-          // LoopNode::LoopBackControl == 2.
-          Block *bx = _cfg->_bbs[b->pred(2)->_idx];
-          // Skip connector blocks (with limit in case of irreducible loops).
-          int search_limit = 16;
-          while( bx->is_connector() && search_limit-- > 0) {
-            bx = _cfg->_bbs[bx->pred(1)->_idx];
-          }
-          if( bx != b ) { // loop body is in several blocks.
-            Block *nb = NULL;
-            while( inst_cnt > 0 && i < last_block && nb != bx &&
-                  !_cfg->_blocks[i+1]->head()->is_Loop() ) {
-              i++;
-              nb = _cfg->_blocks[i];
-              inst_cnt  = nb->compute_first_inst_size(sum_size, inst_cnt,
-                                                      _regalloc);
-            } // while( inst_cnt > 0 && i < last_block  )
-          } // if( bx != b )
-        } // if( inst_cnt > 0 && i < last_block )
+        inst_cnt = b->compute_first_inst_size(sum_size, inst_cnt, _regalloc);
+
+        // Check subsequent fallthrough blocks if the loop's first
+        // block(s) does not have enough instructions.
+        Block *nb = b;
+        while( inst_cnt > 0 &&
+               i < last_block &&
+               !_cfg->_blocks[i+1]->has_loop_alignment() &&
+               !nb->has_successor(b) ) {
+          i++;
+          nb = _cfg->_blocks[i];
+          inst_cnt  = nb->compute_first_inst_size(sum_size, inst_cnt, _regalloc);
+        } // while( inst_cnt > 0 && i < last_block  )
+
         b->set_first_inst_size(sum_size);
       } // f( b->head()->is_Loop() )
     } // for( i <= last_block )
@@ -512,7 +501,7 @@
       // Get the size of the block
       uint blk_size = adr - blk_starts[i];
 
-      // When the next block starts a loop, we may insert pad NOP
+      // When the next block is the top of a loop, we may insert pad NOP
       // instructions.
       Block *nb = _cfg->_blocks[i+1];
       int current_offset = blk_starts[i] + blk_size;
@@ -1382,8 +1371,8 @@
 
     } // End for all instructions in block
 
-    // If the next block _starts_ a loop, pad this block out to align
-    // the loop start a little. Helps prevent pipe stalls at loop starts
+    // If the next block is the top of a loop, pad this block out to align
+    // the loop top a little. Helps prevent pipe stalls at loop back branches.
     int nop_size = (new (this) MachNopNode())->size(_regalloc);
     if( i<_cfg->_num_blocks-1 ) {
       Block *nb = _cfg->_blocks[i+1];
--- a/src/share/vm/opto/phase.cpp	Thu Oct 30 17:08:48 2008 -0700
+++ b/src/share/vm/opto/phase.cpp	Thu Nov 06 14:59:10 2008 -0800
@@ -46,7 +46,7 @@
 #ifndef PRODUCT
 elapsedTimer Phase::_t_graphReshaping;
 elapsedTimer Phase::_t_scheduler;
-elapsedTimer Phase::_t_removeEmptyBlocks;
+elapsedTimer Phase::_t_blockOrdering;
 elapsedTimer Phase::_t_macroExpand;
 elapsedTimer Phase::_t_peephole;
 elapsedTimer Phase::_t_codeGeneration;
@@ -128,7 +128,7 @@
     tty->print_cr ("      subtotal     : %3.3f sec,  %3.2f %%", regalloc_subtotal, percent_of_regalloc);
   }
   tty->print_cr ("    macroExpand  : %3.3f sec", Phase::_t_macroExpand.seconds());
-  tty->print_cr ("    removeEmpty  : %3.3f sec", Phase::_t_removeEmptyBlocks.seconds());
+  tty->print_cr ("    blockOrdering: %3.3f sec", Phase::_t_blockOrdering.seconds());
   tty->print_cr ("    peephole     : %3.3f sec", Phase::_t_peephole.seconds());
   tty->print_cr ("    codeGen      : %3.3f sec", Phase::_t_codeGeneration.seconds());
   tty->print_cr ("    install_code : %3.3f sec", Phase::_t_registerMethod.seconds());
@@ -137,7 +137,7 @@
     (DoEscapeAnalysis ? Phase::_t_escapeAnalysis.seconds() : 0.0) +
     Phase::_t_optimizer.seconds() + Phase::_t_graphReshaping.seconds() +
     Phase::_t_matcher.seconds() + Phase::_t_scheduler.seconds() +
-    Phase::_t_registerAllocation.seconds() + Phase::_t_removeEmptyBlocks.seconds() +
+    Phase::_t_registerAllocation.seconds() + Phase::_t_blockOrdering.seconds() +
     Phase::_t_macroExpand.seconds() + Phase::_t_peephole.seconds() +
     Phase::_t_codeGeneration.seconds() + Phase::_t_registerMethod.seconds();
   double percent_of_method_compile = ((phase_subtotal == 0.0) ? 0.0 : phase_subtotal / Phase::_t_methodCompilation.seconds()) * 100.0;
--- a/src/share/vm/opto/phase.hpp	Thu Oct 30 17:08:48 2008 -0700
+++ b/src/share/vm/opto/phase.hpp	Thu Nov 06 14:59:10 2008 -0800
@@ -40,16 +40,12 @@
     Optimistic,                 // Optimistic analysis phase
     GVN,                        // Pessimistic global value numbering phase
     Ins_Select,                 // Instruction selection phase
-    Copy_Elimination,           // Copy Elimination
-    Dead_Code_Elimination,      // DCE and compress Nodes
-    Conditional_Constant,       // Conditional Constant Propagation
     CFG,                        // Build a CFG
-    DefUse,                     // Build Def->Use chains
+    BlockLayout,                // Linear ordering of blocks
     Register_Allocation,        // Register allocation, duh
     LIVE,                       // Dragon-book LIVE range problem
     Interference_Graph,         // Building the IFG
     Coalesce,                   // Coalescing copies
-    Conditional_CProp,          // Conditional Constant Propagation
     Ideal_Loop,                 // Find idealized trip-counted loops
     Macro_Expand,               // Expand macro nodes
     Peephole,                   // Apply peephole optimizations
@@ -80,7 +76,7 @@
 #ifndef PRODUCT
   static elapsedTimer _t_graphReshaping;
   static elapsedTimer _t_scheduler;
-  static elapsedTimer _t_removeEmptyBlocks;
+  static elapsedTimer _t_blockOrdering;
   static elapsedTimer _t_macroExpand;
   static elapsedTimer _t_peephole;
   static elapsedTimer _t_codeGeneration;