changeset 31772:718fc367468d

8129920: Vectorized loop unrolling Summary: optimize loop opts for vectorizable loops. Reviewed-by: kvn, roland
author kvn
date Fri, 10 Jul 2015 11:59:09 -0700
parents c9f593020799
children f874931cbae7 13420c0a3ad5
files hotspot/src/share/vm/opto/loopTransform.cpp hotspot/src/share/vm/opto/loopUnswitch.cpp hotspot/src/share/vm/opto/loopnode.cpp hotspot/src/share/vm/opto/loopnode.hpp hotspot/src/share/vm/opto/superword.cpp hotspot/src/share/vm/opto/superword.hpp
diffstat 6 files changed, 83 insertions(+), 27 deletions(-) [+]
line wrap: on
line diff
--- a/hotspot/src/share/vm/opto/loopTransform.cpp	Fri Jul 10 11:31:49 2015 -0700
+++ b/hotspot/src/share/vm/opto/loopTransform.cpp	Fri Jul 10 11:59:09 2015 -0700
@@ -280,6 +280,10 @@
       || (body_size * body_size + phase->C->live_nodes()) > phase->C->max_node_limit() ) {
     return false;           // too large to safely clone
   }
+
+  // check for vectorized loops, any peeling done was already applied
+  if (_head->is_CountedLoop() && _head->as_CountedLoop()->do_unroll_only()) return false;
+
   while( test != _head ) {      // Scan till run off top of loop
     if( test->is_If() ) {       // Test?
       Node *ctrl = phase->get_ctrl(test->in(1));
@@ -656,7 +660,12 @@
   _local_loop_unroll_limit = LoopUnrollLimit;
   _local_loop_unroll_factor = 4;
   int future_unroll_ct = cl->unrolled_count() * 2;
-  if (future_unroll_ct > LoopMaxUnroll) return false;
+  if (!cl->do_unroll_only()) {
+    if (future_unroll_ct > LoopMaxUnroll) return false;
+  } else {
+    // obey user constraints on vector mapped loops with additional unrolling applied
+    if ((future_unroll_ct / cl->slp_max_unroll()) > LoopMaxUnroll) return false;
+  }
 
   // Check for initial stride being a small enough constant
   if (abs(cl->stride_con()) > (1<<2)*future_unroll_ct) return false;
@@ -759,13 +768,19 @@
     if (LoopMaxUnroll > _local_loop_unroll_factor) {
       // Once policy_slp_analysis succeeds, mark the loop with the
       // maximal unroll factor so that we minimize analysis passes
-      if ((future_unroll_ct > _local_loop_unroll_factor) ||
-          (body_size > (uint)_local_loop_unroll_limit)) {
+      if (future_unroll_ct >= _local_loop_unroll_factor) {
         policy_unroll_slp_analysis(cl, phase, future_unroll_ct);
       }
     }
   }
 
+  int slp_max_unroll_factor = cl->slp_max_unroll();
+  if (cl->has_passed_slp()) {
+    if (slp_max_unroll_factor >= future_unroll_ct) return true;
+    // Normal case: loop too big
+    return false;
+  }
+
   // Check for being too big
   if (body_size > (uint)_local_loop_unroll_limit) {
     if (xors_in_loop >= 4 && body_size < (uint)LoopUnrollLimit*4) return true;
@@ -773,6 +788,10 @@
     return false;
   }
 
+  if(cl->do_unroll_only()) {
+    NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("policy_unroll passed vector loop(vlen=%d,factor = %d)\n", slp_max_unroll_factor, future_unroll_ct));
+  }
+
   // Unroll once!  (Each trip will soon do double iterations)
   return true;
 }
@@ -780,28 +799,24 @@
 void IdealLoopTree::policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLoop *phase, int future_unroll_ct) {
   // Enable this functionality target by target as needed
   if (SuperWordLoopUnrollAnalysis) {
-    if (!cl->has_passed_slp()) {
+    if (!cl->was_slp_analyzed()) {
       SuperWord sw(phase);
       sw.transform_loop(this, false);
 
       // If the loop is slp canonical analyze it
       if (sw.early_return() == false) {
-        sw.unrolling_analysis(cl, _local_loop_unroll_factor);
+        sw.unrolling_analysis(_local_loop_unroll_factor);
       }
     }
 
-    int slp_max_unroll_factor = cl->slp_max_unroll();
-    if ((slp_max_unroll_factor > 4) &&
-        (slp_max_unroll_factor >= future_unroll_ct)) {
-      int new_limit = cl->node_count_before_unroll() * slp_max_unroll_factor;
-      if (new_limit > LoopUnrollLimit) {
-#ifndef PRODUCT
-        if (TraceSuperWordLoopUnrollAnalysis) {
-          tty->print_cr("slp analysis is applying unroll limit  %d, the original limit was %d\n",
-            new_limit, _local_loop_unroll_limit);
+    if (cl->has_passed_slp()) {
+      int slp_max_unroll_factor = cl->slp_max_unroll();
+      if (slp_max_unroll_factor >= future_unroll_ct) {
+        int new_limit = cl->node_count_before_unroll() * slp_max_unroll_factor;
+        if (new_limit > LoopUnrollLimit) {
+          NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("slp analysis unroll=%d, default limit=%d\n", new_limit, _local_loop_unroll_limit));
+          _local_loop_unroll_limit = new_limit;
         }
-#endif
-        _local_loop_unroll_limit = new_limit;
       }
     }
   }
@@ -830,6 +845,9 @@
   if (cl->is_main_no_pre_loop()) return false; // Disallowed for now.
   Node *trip_counter = cl->phi();
 
+  // check for vectorized loops, some opts are no longer needed
+  if (cl->do_unroll_only()) return false;
+
   // Check loop body for tests of trip-counter plus loop-invariant vs
   // loop-invariant.
   for (uint i = 0; i < _body.size(); i++) {
@@ -880,6 +898,8 @@
 // Return TRUE or FALSE if the loop should NEVER be RCE'd or aligned.  Useful
 // for unrolling loops with NO array accesses.
 bool IdealLoopTree::policy_peel_only( PhaseIdealLoop *phase ) const {
+  // check for vectorized loops, any peeling done was already applied
+  if (_head->is_CountedLoop() && _head->as_CountedLoop()->do_unroll_only()) return false;
 
   for( uint i = 0; i < _body.size(); i++ )
     if( _body[i]->is_Mem() )
--- a/hotspot/src/share/vm/opto/loopUnswitch.cpp	Fri Jul 10 11:31:49 2015 -0700
+++ b/hotspot/src/share/vm/opto/loopUnswitch.cpp	Fri Jul 10 11:59:09 2015 -0700
@@ -61,6 +61,12 @@
   if (!_head->is_Loop()) {
     return false;
   }
+
+  // check for vectorized loops, any unswitching was already applied
+  if (_head->is_CountedLoop() && _head->as_CountedLoop()->do_unroll_only()) {
+    return false;
+  }
+
   int nodes_left = phase->C->max_node_limit() - phase->C->live_nodes();
   if ((int)(2 * _body.size()) > nodes_left) {
     return false; // Too speculative if running low on nodes.
--- a/hotspot/src/share/vm/opto/loopnode.cpp	Fri Jul 10 11:31:49 2015 -0700
+++ b/hotspot/src/share/vm/opto/loopnode.cpp	Fri Jul 10 11:59:09 2015 -0700
@@ -2317,7 +2317,11 @@
     // Reassociate invariants and prep for split_thru_phi
     for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
       IdealLoopTree* lpt = iter.current();
-      if (!lpt->is_counted() || !lpt->is_inner()) continue;
+      bool is_counted = lpt->is_counted();
+      if (!is_counted || !lpt->is_inner()) continue;
+
+      // check for vectorized loops, any reassociation of invariants was already done
+      if (is_counted && lpt->_head->as_CountedLoop()->do_unroll_only()) continue;
 
       lpt->reassociate_invariants(this);
 
--- a/hotspot/src/share/vm/opto/loopnode.hpp	Fri Jul 10 11:31:49 2015 -0700
+++ b/hotspot/src/share/vm/opto/loopnode.hpp	Fri Jul 10 11:59:09 2015 -0700
@@ -64,7 +64,9 @@
          PartialPeelLoop=32,
          PartialPeelFailed=64,
          HasReductions=128,
-         PassedSlpAnalysis=256 };
+         WasSlpAnalyzed=256,
+         PassedSlpAnalysis=512,
+         DoUnrollOnly=1024 };
   char _unswitch_count;
   enum { _unswitch_max=3 };
 
@@ -80,7 +82,9 @@
   int partial_peel_has_failed() const { return _loop_flags & PartialPeelFailed; }
   void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; }
   void mark_has_reductions() { _loop_flags |= HasReductions; }
+  void mark_was_slp() { _loop_flags |= WasSlpAnalyzed; }
   void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; }
+  void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; }
 
   int unswitch_max() { return _unswitch_max; }
   int unswitch_count() { return _unswitch_count; }
@@ -212,7 +216,9 @@
   int is_main_loop     () const { return (_loop_flags&PreMainPostFlagsMask) == Main;   }
   int is_post_loop     () const { return (_loop_flags&PreMainPostFlagsMask) == Post;   }
   int is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; }
+  int was_slp_analyzed () const { return (_loop_flags&WasSlpAnalyzed) == WasSlpAnalyzed; }
   int has_passed_slp   () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
+  int do_unroll_only      () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; }
   int is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
   void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }
 
@@ -235,6 +241,9 @@
   void set_nonexact_trip_count() {
     _loop_flags &= ~HasExactTripCount;
   }
+  void set_notpassed_slp() {
+    _loop_flags &= ~PassedSlpAnalysis;
+  }
 
   void set_profile_trip_cnt(float ptc) { _profile_trip_cnt = ptc; }
   float profile_trip_cnt()             { return _profile_trip_cnt; }
--- a/hotspot/src/share/vm/opto/superword.cpp	Fri Jul 10 11:31:49 2015 -0700
+++ b/hotspot/src/share/vm/opto/superword.cpp	Fri Jul 10 11:59:09 2015 -0700
@@ -100,6 +100,10 @@
     return;
   }
 
+  // We only re-enter slp when we have vector mapped a queried loop and want to
+  // continue unrolling; in this case, slp is not subsequently done.
+  if (cl->do_unroll_only()) return;
+
   // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
   CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
   if (pre_end == NULL) return;
@@ -121,12 +125,13 @@
 }
 
 //------------------------------early unrolling analysis------------------------------
-void SuperWord::unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor) {
+void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
   bool is_slp = true;
   ResourceMark rm;
   size_t ignored_size = lpt()->_body.size();
   int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
   Node_Stack nstack((int)ignored_size);
+  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
   Node *cl_exit = cl->loopexit();
 
   // First clear the entries
@@ -249,13 +254,9 @@
 
       // If a max vector exists which is not larger than _local_loop_unroll_factor
       // stop looking, we already have the max vector to map to.
-      if (cur_max_vector <= local_loop_unroll_factor) {
+      if (cur_max_vector < local_loop_unroll_factor) {
         is_slp = false;
-#ifndef PRODUCT
-        if (TraceSuperWordLoopUnrollAnalysis) {
-          tty->print_cr("slp analysis fails: unroll limit equals max vector\n");
-        }
-#endif
+        NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("slp analysis fails: unroll limit greater than max vector\n"));
         break;
       }
 
@@ -268,8 +269,9 @@
     }
     if (is_slp) {
       local_loop_unroll_factor = max_vector;
+      cl->mark_passed_slp();
     }
-    cl->mark_passed_slp();
+    cl->mark_was_slp();
     cl->set_slp_max_unroll(local_loop_unroll_factor);
   }
 }
@@ -1758,7 +1760,9 @@
   }
 
   Compile* C = _phase->C;
+  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
   uint max_vlen_in_bytes = 0;
+  uint max_vlen = 0;
   for (int i = 0; i < _block.length(); i++) {
     Node* n = _block.at(i);
     Node_List* p = my_pack(n);
@@ -1841,6 +1845,7 @@
       _igvn._worklist.push(vn);
 
       if (vlen_in_bytes > max_vlen_in_bytes) {
+        max_vlen = vlen;
         max_vlen_in_bytes = vlen_in_bytes;
       }
 #ifdef ASSERT
@@ -1852,6 +1857,18 @@
     }
   }
   C->set_max_vector_size(max_vlen_in_bytes);
+  if (SuperWordLoopUnrollAnalysis) {
+    if (cl->has_passed_slp()) {
+      uint slp_max_unroll_factor = cl->slp_max_unroll();
+      if (slp_max_unroll_factor == max_vlen) {
+        NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte));
+        // For atomic unrolled loops which are vector mapped, instigate more unrolling.
+        cl->set_notpassed_slp();
+        C->set_major_progress();
+        cl->mark_do_unroll_only();
+      }
+    }
+  }
 }
 
 //------------------------------vector_opd---------------------------
--- a/hotspot/src/share/vm/opto/superword.hpp	Fri Jul 10 11:31:49 2015 -0700
+++ b/hotspot/src/share/vm/opto/superword.hpp	Fri Jul 10 11:59:09 2015 -0700
@@ -241,7 +241,7 @@
 
   void transform_loop(IdealLoopTree* lpt, bool do_optimization);
 
-  void unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor);
+  void unrolling_analysis(int &local_loop_unroll_factor);
 
   // Accessors for SWPointer
   PhaseIdealLoop* phase()          { return _phase; }