changeset 8724:a97452742de1

8085932: Fixing bugs in detecting memory alignments in SuperWord Reviewed-by: kvn Contributed-by: jan.civlin@intel.com
author kvn
date Tue, 14 Jul 2015 09:55:22 -0700
parents 94403236f303
children 2545ed8e99c5
files src/share/vm/opto/compile.hpp src/share/vm/opto/loopnode.cpp src/share/vm/opto/superword.cpp src/share/vm/opto/superword.hpp
diffstat 4 files changed, 567 insertions(+), 60 deletions(-) [+]
line wrap: on
line diff
--- a/src/share/vm/opto/compile.hpp	Fri Jul 10 11:59:09 2015 -0700
+++ b/src/share/vm/opto/compile.hpp	Tue Jul 14 09:55:22 2015 -0700
@@ -140,6 +140,9 @@
   bool is_debug()                 const          { return _debug; }
   void set_debug(bool debug)                     { _debug = debug; }
   static const char* debug_option_name;
+
+  bool same_idx(node_idx_t k1, node_idx_t k2)  const { return idx(k1) == idx(k2); }
+  bool same_gen(node_idx_t k1, node_idx_t k2)  const { return gen(k1) == gen(k2); }
 };
 
 //------------------------------Compile----------------------------------------
--- a/src/share/vm/opto/loopnode.cpp	Fri Jul 10 11:59:09 2015 -0700
+++ b/src/share/vm/opto/loopnode.cpp	Tue Jul 14 09:55:22 2015 -0700
@@ -3682,7 +3682,6 @@
 }
 
 void PhaseIdealLoop::dump( IdealLoopTree *loop, uint idx, Node_List &rpo_list ) const {
-  CloneMap& cm = C->clone_map();
   loop->dump_head();
 
   // Now scan for CFG nodes in the same loop
@@ -3714,7 +3713,6 @@
         cached_idom = find_non_split_ctrl(cached_idom);
       }
       tty->print(" ID:%d",computed_idom->_idx);
-      cm.dump(n->_idx);
       n->dump();
       if( cached_idom != computed_idom ) {
         tty->print_cr("*** BROKEN IDOM!  Computed as: %d, cached as: %d",
@@ -3734,7 +3732,6 @@
           for( uint j = 0; j < loop->_nest; j++ )
             tty->print("  ");
           tty->print(" ");
-          cm.dump(m->_idx);
           m->dump();
         }
       }
--- a/src/share/vm/opto/superword.cpp	Fri Jul 10 11:59:09 2015 -0700
+++ b/src/share/vm/opto/superword.cpp	Tue Jul 14 09:55:22 2015 -0700
@@ -74,9 +74,15 @@
   _do_vector_loop(phase->C->do_vector_loop()),  // whether to do vectorization/simd style
   _ii_first(-1),                          // first loop generation index - only if do_vector_loop()
   _ii_last(-1),                           // last loop generation index - only if do_vector_loop()
-  _ii_order(arena(), 8, 0, 0),
-  _vector_loop_debug(phase->C->has_method() && phase->C->method_has_option("VectorizeDebug"))
-{}
+  _ii_order(arena(), 8, 0, 0)
+{
+#ifndef PRODUCT
+  _vector_loop_debug = 0;
+  if (_phase->C->method() != NULL) {
+    _phase->C->method()->has_option_value("VectorizeDebug", _vector_loop_debug);
+  }
+#endif
+}
 
 //------------------------------transform_loop---------------------------
 void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
@@ -90,7 +96,6 @@
   if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
 
   if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops
-
   // Check for no control flow in body (other than exit)
   Node *cl_exit = cl->loopexit();
   if (cl_exit->in(0) != lpt->_head) return;
@@ -425,13 +430,15 @@
       // this reference to a vector-aligned address.
       best_align_to_mem_ref = mem_ref;
       best_iv_adjustment = iv_adjustment;
+      NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
     }
 
     SWPointer align_to_ref_p(mem_ref, this, NULL, false);
     // Set alignment relative to "align_to_ref" for all related memory operations.
     for (int i = memops.size() - 1; i >= 0; i--) {
       MemNode* s = memops.at(i)->as_Mem();
-      if (isomorphic(s, mem_ref)) {
+      if (isomorphic(s, mem_ref) &&
+           (!_do_vector_loop || same_origin_idx(s, mem_ref))) {
         SWPointer p2(s, this, NULL, false);
         if (p2.comparable(align_to_ref_p)) {
           int align = memory_alignment(s, iv_adjustment);
@@ -496,7 +503,7 @@
               Node_List* pair = new Node_List();
               pair->push(s1);
               pair->push(s2);
-              if (!_do_vector_loop || _clone_map.idx(s1->_idx) == _clone_map.idx(s2->_idx)) {
+              if (!_do_vector_loop || same_origin_idx(s1, s2)) {
                 _packset.append(pair);
               }
             }
@@ -533,8 +540,12 @@
           memops.push(s);
         }
         MemNode* best_align_to_mem_ref = find_align_to_ref(memops);
-        if (best_align_to_mem_ref == NULL) break;
+        if (best_align_to_mem_ref == NULL) {
+          NOT_PRODUCT(if (TraceSuperWord) tty->print_cr("SuperWord::find_adjacent_refs(): best_align_to_mem_ref == NULL");)
+          break;
+        }
         best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
+        NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
         // Restore list.
         while (memops.size() > orig_msize)
           (void)memops.pop();
@@ -560,6 +571,16 @@
 #endif
 }
 
+#ifndef PRODUCT
+void SuperWord::find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best_iv_adjustment) {
+  if (is_trace_adjacent()) {
+    tty->print("SuperWord::find_adjacent_refs best_align_to_mem_ref = %d, best_iv_adjustment = %d",
+       best_align_to_mem_ref->_idx, best_iv_adjustment);
+       best_align_to_mem_ref->dump();
+  }
+}
+#endif
+
 //------------------------------find_align_to_ref---------------------------
 // Find a memory reference to align the loop induction variable to.
 // Looks first at stores then at loads, looking for a memory reference
@@ -756,9 +777,11 @@
   }
 
 #ifndef PRODUCT
-  if (TraceSuperWord)
-    tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
-                  offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
+  if (TraceSuperWord) {
+    tty->print("SuperWord::get_iv_adjustment: n = %d, noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d: ",
+      mem_ref->_idx, offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
+    mem_ref->dump();
+  }
 #endif
   return iv_adjustment;
 }
@@ -863,12 +886,14 @@
   Node* n = start;
   Node* prev = NULL;
   while (true) {
+    NOT_PRODUCT( if(is_trace_mem_slice()) tty->print_cr("SuperWord::mem_slice_preds: n %d", n->_idx);)
     assert(in_bb(n), "must be in block");
     for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
       Node* out = n->fast_out(i);
       if (out->is_Load()) {
         if (in_bb(out)) {
           preds.push(out);
+          NOT_PRODUCT(if (TraceSuperWord && Verbose) tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", out->_idx);)
         }
       } else {
         // FIXME
@@ -883,10 +908,11 @@
         } else {
           assert(out == prev || prev == NULL, "no branches off of store slice");
         }
-      }
-    }
+      }//else
+    }//for
     if (n == stop) break;
     preds.push(n);
+    NOT_PRODUCT(if (TraceSuperWord && Verbose) tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", n->_idx);)
     prev = n;
     assert(n->is_Mem(), err_msg_res("unexpected node %s", n->Name()));
     n = n->in(MemNode::Memory);
@@ -2140,18 +2166,38 @@
     } else {
       _stk.pop(); // Remove post-visited node from stack
     }
-  }
-
+  }//while
+
+  int ii_current = -1;
+  unsigned int load_idx = -1;
+  _ii_order.clear();
   // Create real map of block indices for nodes
   for (int j = 0; j < _block.length(); j++) {
     Node* n = _block.at(j);
     set_bb_idx(n, j);
-  }
+    if (_do_vector_loop && n->is_Load()) {
+      if (ii_current == -1) {
+        ii_current = _clone_map.gen(n->_idx);
+        _ii_order.push(ii_current);
+        load_idx = _clone_map.idx(n->_idx);
+      } else if (_clone_map.idx(n->_idx) == load_idx && _clone_map.gen(n->_idx) != ii_current) {
+        ii_current = _clone_map.gen(n->_idx);
+        _ii_order.push(ii_current);
+      }
+    }
+  }//for
 
   // Ensure extra info is allocated.
   initialize_bb();
 
 #ifndef PRODUCT
+  if (_vector_loop_debug && _ii_order.length() > 0) {
+    tty->print("SuperWord::construct_bb: List of generations: ");
+    for (int jj = 0; jj < _ii_order.length(); ++jj) {
+      tty->print("  %d:%d", jj, _ii_order.at(jj));
+    }
+    tty->print_cr(" ");
+  }
   if (TraceSuperWord) {
     print_bb();
     tty->print_cr("\ndata entry nodes: %s", _data_entry.length() > 0 ? "" : "NONE");
@@ -2312,18 +2358,27 @@
 //------------------------------memory_alignment---------------------------
 // Alignment within a vector memory reference
 int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
+  #ifndef PRODUCT
+    if(TraceSuperWord && Verbose) {
+      tty->print("SuperWord::memory_alignment within a vector memory reference for %d:  ", s->_idx); s->dump();
+    }
+  #endif
+  NOT_PRODUCT(SWPointer::Tracer::Depth ddd(0);)
   SWPointer p(s, this, NULL, false);
   if (!p.valid()) {
+    NOT_PRODUCT(if(is_trace_alignment()) tty->print("SWPointer::memory_alignment: SWPointer p invalid, return bottom_align");)
     return bottom_align;
   }
   int vw = vector_width_in_bytes(s);
   if (vw < 2) {
+    NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SWPointer::memory_alignment: vector_width_in_bytes < 2, return bottom_align");)
     return bottom_align; // No vectors for this type
   }
   int offset  = p.offset_in_bytes();
   offset     += iv_adjust*p.memory_size();
   int off_rem = offset % vw;
   int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
+  NOT_PRODUCT(if(TraceSuperWord && Verbose) tty->print_cr("SWPointer::memory_alignment: off_rem = %d, off_mod = %d", off_rem, off_mod);)
   return off_mod;
 }
 
@@ -2732,13 +2787,20 @@
 
 
 //==============================SWPointer===========================
-
+#ifndef PRODUCT
+int SWPointer::Tracer::_depth = 0;
+#endif
 //----------------------------SWPointer------------------------
 SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only) :
   _mem(mem), _slp(slp),  _base(NULL),  _adr(NULL),
   _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
   _nstack(nstack), _analyze_only(analyze_only),
-  _stack_idx(0) {
+  _stack_idx(0)
+#ifndef PRODUCT
+  , _tracer(slp)
+#endif
+{
+  NOT_PRODUCT(_tracer.ctor_1(mem);)
 
   Node* adr = mem->in(MemNode::Address);
   if (!adr->is_AddP()) {
@@ -2757,16 +2819,29 @@
     assert(!valid(), "unsafe access");
     return;
   }
-  for (int i = 0; i < 3; i++) {
+
+  NOT_PRODUCT(if(_slp->is_trace_alignment()) _tracer.store_depth();)
+  NOT_PRODUCT(_tracer.ctor_2(adr);)
+
+  int i;
+  for (i = 0; i < 3; i++) {
+    NOT_PRODUCT(_tracer.ctor_3(adr, i);)
+
     if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
       assert(!valid(), "too complex");
       return;
     }
     adr = adr->in(AddPNode::Address);
+    NOT_PRODUCT(_tracer.ctor_4(adr, i);)
+
     if (base == adr || !adr->is_AddP()) {
+      NOT_PRODUCT(_tracer.ctor_5(adr, base, i);)
       break; // stop looking at addp's
     }
   }
+  NOT_PRODUCT(if(_slp->is_trace_alignment()) _tracer.restore_depth();)
+  NOT_PRODUCT(_tracer.ctor_6(mem);)
+
   _base = base;
   _adr  = adr;
   assert(valid(), "Usable");
@@ -2778,68 +2853,103 @@
   _mem(p->_mem), _slp(p->_slp),  _base(NULL),  _adr(NULL),
   _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
   _nstack(p->_nstack), _analyze_only(p->_analyze_only),
-  _stack_idx(p->_stack_idx) {}
-
+  _stack_idx(p->_stack_idx)
+  #ifndef PRODUCT
+  , _tracer(p->_slp)
+  #endif
+{}
+
+
+bool SWPointer::invariant(Node* n) {
+  NOT_PRODUCT(Tracer::Depth dd;)
+  Node *n_c = phase()->get_ctrl(n);
+  NOT_PRODUCT(_tracer.invariant_1(n, n_c);)
+  return !lpt()->is_member(phase()->get_loop(n_c));
+}
 //------------------------scaled_iv_plus_offset--------------------
 // Match: k*iv + offset
 // where: k is a constant that maybe zero, and
 //        offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional
 bool SWPointer::scaled_iv_plus_offset(Node* n) {
+  NOT_PRODUCT(Tracer::Depth ddd;)
+  NOT_PRODUCT(_tracer.scaled_iv_plus_offset_1(n);)
+
   if (scaled_iv(n)) {
+    NOT_PRODUCT(_tracer.scaled_iv_plus_offset_2(n);)
     return true;
   }
+
   if (offset_plus_k(n)) {
+    NOT_PRODUCT(_tracer.scaled_iv_plus_offset_3(n);)
     return true;
   }
+
   int opc = n->Opcode();
   if (opc == Op_AddI) {
     if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2))) {
+      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_4(n);)
       return true;
     }
     if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
+      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_5(n);)
       return true;
     }
   } else if (opc == Op_SubI) {
     if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2), true)) {
+      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_6(n);)
       return true;
     }
     if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
       _scale *= -1;
+      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_7(n);)
       return true;
     }
   }
+
+  NOT_PRODUCT(_tracer.scaled_iv_plus_offset_8(n);)
   return false;
 }
 
 //----------------------------scaled_iv------------------------
 // Match: k*iv where k is a constant that's not zero
 bool SWPointer::scaled_iv(Node* n) {
-  if (_scale != 0) {
-    return false;  // already found a scale
+  NOT_PRODUCT(Tracer::Depth ddd;)
+  NOT_PRODUCT(_tracer.scaled_iv_1(n);)
+
+  if (_scale != 0) { // already found a scale
+    NOT_PRODUCT(_tracer.scaled_iv_2(n, _scale);)
+    return false;
   }
+
   if (n == iv()) {
     _scale = 1;
+    NOT_PRODUCT(_tracer.scaled_iv_3(n, _scale);)
     return true;
   }
   if (_analyze_only && (invariant(n) == false)) {
     _nstack->push(n, _stack_idx++);
   }
+
   int opc = n->Opcode();
   if (opc == Op_MulI) {
     if (n->in(1) == iv() && n->in(2)->is_Con()) {
       _scale = n->in(2)->get_int();
+      NOT_PRODUCT(_tracer.scaled_iv_4(n, _scale);)
       return true;
     } else if (n->in(2) == iv() && n->in(1)->is_Con()) {
       _scale = n->in(1)->get_int();
+      NOT_PRODUCT(_tracer.scaled_iv_5(n, _scale);)
       return true;
     }
   } else if (opc == Op_LShiftI) {
     if (n->in(1) == iv() && n->in(2)->is_Con()) {
       _scale = 1 << n->in(2)->get_int();
+      NOT_PRODUCT(_tracer.scaled_iv_6(n, _scale);)
       return true;
     }
   } else if (opc == Op_ConvI2L) {
     if (scaled_iv_plus_offset(n->in(1))) {
+      NOT_PRODUCT(_tracer.scaled_iv_7(n);)
       return true;
     }
   } else if (opc == Op_LShiftL) {
@@ -2847,17 +2957,22 @@
       // Need to preserve the current _offset value, so
       // create a temporary object for this expression subtree.
       // Hacky, so should re-engineer the address pattern match.
+      NOT_PRODUCT(Tracer::Depth dddd;)
       SWPointer tmp(this);
+      NOT_PRODUCT(_tracer.scaled_iv_8(n, &tmp);)
+
       if (tmp.scaled_iv_plus_offset(n->in(1))) {
-        if (tmp._invar == NULL) {
+        if (tmp._invar == NULL || _slp->do_vector_loop()) {
           int mult = 1 << n->in(2)->get_int();
           _scale   = tmp._scale  * mult;
           _offset += tmp._offset * mult;
+          NOT_PRODUCT(_tracer.scaled_iv_9(n, _scale, _offset, mult);)
           return true;
         }
       }
     }
   }
+  NOT_PRODUCT(_tracer.scaled_iv_10(n);)
   return false;
 }
 
@@ -2865,9 +2980,13 @@
 // Match: offset is (k [+/- invariant])
 // where k maybe zero and invariant is optional, but not both.
 bool SWPointer::offset_plus_k(Node* n, bool negate) {
+  NOT_PRODUCT(Tracer::Depth ddd;)
+  NOT_PRODUCT(_tracer.offset_plus_k_1(n);)
+
   int opc = n->Opcode();
   if (opc == Op_ConI) {
     _offset += negate ? -(n->get_int()) : n->get_int();
+    NOT_PRODUCT(_tracer.offset_plus_k_2(n, _offset);)
     return true;
   } else if (opc == Op_ConL) {
     // Okay if value fits into an int
@@ -2876,11 +2995,17 @@
       jlong loff = n->get_long();
       jint  off  = (jint)loff;
       _offset += negate ? -off : loff;
+      NOT_PRODUCT(_tracer.offset_plus_k_3(n, _offset);)
       return true;
     }
+    NOT_PRODUCT(_tracer.offset_plus_k_4(n);)
     return false;
   }
-  if (_invar != NULL) return false; // already have an invariant
+  if (_invar != NULL) { // already has an invariant
+    NOT_PRODUCT(_tracer.offset_plus_k_5(n, _invar);)
+    return false;
+  }
+
   if (_analyze_only && (invariant(n) == false)) {
     _nstack->push(n, _stack_idx++);
   }
@@ -2889,11 +3014,13 @@
       _negate_invar = negate;
       _invar = n->in(1);
       _offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
+      NOT_PRODUCT(_tracer.offset_plus_k_6(n, _invar, _negate_invar, _offset);)
       return true;
     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
       _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
       _negate_invar = negate;
       _invar = n->in(2);
+      NOT_PRODUCT(_tracer.offset_plus_k_7(n, _invar, _negate_invar, _offset);)
       return true;
     }
   }
@@ -2902,19 +3029,24 @@
       _negate_invar = negate;
       _invar = n->in(1);
       _offset += !negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
+      NOT_PRODUCT(_tracer.offset_plus_k_8(n, _invar, _negate_invar, _offset);)
       return true;
     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
       _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
       _negate_invar = !negate;
       _invar = n->in(2);
+      NOT_PRODUCT(_tracer.offset_plus_k_9(n, _invar, _negate_invar, _offset);)
       return true;
     }
   }
   if (invariant(n)) {
     _negate_invar = negate;
     _invar = n;
+    NOT_PRODUCT(_tracer.offset_plus_k_10(n, _invar, _negate_invar, _offset);)
     return true;
   }
+
+  NOT_PRODUCT(_tracer.offset_plus_k_11(n);)
   return false;
 }
 
@@ -2930,6 +3062,287 @@
 #endif
 }
 
+//----------------------------tracing------------------------
+#ifndef PRODUCT
+void SWPointer::Tracer::print_depth() {
+  for (int ii = 0; ii<_depth; ++ii) tty->print("  ");
+}
+
+void SWPointer::Tracer::ctor_1 (Node* mem) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print(" %d SWPointer::SWPointer: start alignment analysis", mem->_idx); mem->dump();
+  }
+}
+
+void SWPointer::Tracer::ctor_2(Node* adr) {
+  if(_slp->is_trace_alignment()) {
+    //store_depth();
+    inc_depth();
+    print_depth(); tty->print(" %d (adr) SWPointer::SWPointer: ", adr->_idx); adr->dump();
+    inc_depth();
+    print_depth(); tty->print(" %d (base) SWPointer::SWPointer: ", adr->in(AddPNode::Base)->_idx); adr->in(AddPNode::Base)->dump();
+  }
+}
+
+void SWPointer::Tracer::ctor_3(Node* adr, int i) {
+  if(_slp->is_trace_alignment()) {
+    inc_depth();
+    Node* offset = adr->in(AddPNode::Offset);
+    print_depth(); tty->print(" %d (offset) SWPointer::SWPointer: i = %d: ", offset->_idx, i); offset->dump();
+  }
+}
+
+void SWPointer::Tracer::ctor_4(Node* adr, int i) {
+  if(_slp->is_trace_alignment()) {
+    inc_depth();
+    print_depth(); tty->print(" %d (adr) SWPointer::SWPointer: i = %d: ", adr->_idx, i); adr->dump();
+  }
+}
+
+void SWPointer::Tracer::ctor_5(Node* adr, Node* base, int i) {
+  if(_slp->is_trace_alignment()) {
+    inc_depth();
+    if (base == adr) {
+      print_depth(); tty->print_cr("  \\ %d (adr) == %d (base) SWPointer::SWPointer: breaking analysis at i = %d", adr->_idx, base->_idx, i);
+    } else if (!adr->is_AddP()) {
+      print_depth(); tty->print_cr("  \\ %d (adr) is NOT Addp SWPointer::SWPointer: breaking analysis at i = %d", adr->_idx, i);
+    }
+  }
+}
+
+void SWPointer::Tracer::ctor_6(Node* mem) {
+  if(_slp->is_trace_alignment()) {
+    //restore_depth();
+    print_depth(); tty->print_cr(" %d (adr) SWPointer::SWPointer: stop analysis", mem->_idx);
+  }
+}
+
+void SWPointer::Tracer::invariant_1(Node *n, Node *n_c) {
+  if (_slp->do_vector_loop() && _slp->is_debug() && _slp->_lpt->is_member(_slp->_phase->get_loop(n_c)) != (int)_slp->in_bb(n)) {
+    int is_member =  _slp->_lpt->is_member(_slp->_phase->get_loop(n_c));
+    int in_bb     =  _slp->in_bb(n);
+    print_depth(); tty->print("  \\ ");  tty->print_cr(" %d SWPointer::invariant  conditions differ: n_c %d", n->_idx, n_c->_idx);
+    print_depth(); tty->print("  \\ ");  tty->print_cr("is_member %d, in_bb %d", is_member, in_bb);
+    print_depth(); tty->print("  \\ ");  n->dump();
+    print_depth(); tty->print("  \\ ");  n_c->dump();
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_plus_offset_1(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset testing node: ", n->_idx);
+    n->dump();
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_plus_offset_2(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: PASSED", n->_idx);
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_plus_offset_3(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: PASSED", n->_idx);
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_plus_offset_4(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
+    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
+    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_plus_offset_5(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
+    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
+    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_plus_offset_6(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI PASSED", n->_idx);
+    print_depth(); tty->print("  \\  %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
+    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_plus_offset_7(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI PASSED", n->_idx);
+    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
+    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_plus_offset_8(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: FAILED", n->_idx);
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_1(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print(" %d SWPointer::scaled_iv: testing node: ", n->_idx); n->dump();
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_2(Node* n, int scale) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: FAILED since another _scale has been detected before", n->_idx);
+    print_depth(); tty->print_cr("  \\ SWPointer::scaled_iv: _scale (%d) != 0", scale);
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_3(Node* n, int scale) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: is iv, setting _scale = %d", n->_idx, scale);
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_4(Node* n, int scale) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
+    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
+    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_5(Node* n, int scale) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
+    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(2) is iv: ", n->in(2)->_idx); n->in(2)->dump();
+    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_6(Node* n, int scale) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftI PASSED, setting _scale = %d", n->_idx, scale);
+    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
+    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_7(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_ConvI2L PASSED", n->_idx);
+    print_depth(); tty->print_cr("  \\ SWPointer::scaled_iv: in(1) %d is scaled_iv_plus_offset: ", n->in(1)->_idx);
+    inc_depth(); inc_depth();
+    print_depth(); n->in(1)->dump();
+    dec_depth(); dec_depth();
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_8(Node* n, SWPointer* tmp) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print(" %d SWPointer::scaled_iv: Op_LShiftL, creating tmp SWPointer: ", n->_idx); tmp->print();
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_9(Node* n, int scale, int _offset, int mult) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftL PASSED, setting _scale = %d, _offset = %d", n->_idx, scale, _offset);
+    print_depth(); tty->print_cr("  \\ SWPointer::scaled_iv: in(1) %d is scaled_iv_plus_offset, in(2) %d used to get mult = %d: _scale = %d, _offset = %d",
+    n->in(1)->_idx, n->in(2)->_idx, mult, scale, _offset);
+    inc_depth(); inc_depth();
+    print_depth(); n->in(1)->dump();
+    print_depth(); n->in(2)->dump();
+    dec_depth(); dec_depth();
+  }
+}
+
+void SWPointer::Tracer::scaled_iv_10(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: FAILED", n->_idx);
+  }
+}
+
+void SWPointer::Tracer::offset_plus_k_1(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print(" %d SWPointer::offset_plus_k: testing node: ", n->_idx); n->dump();
+  }
+}
+
+void SWPointer::Tracer::offset_plus_k_2(Node* n, int _offset) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConI PASSED, setting _offset = %d", n->_idx, _offset);
+  }
+}
+
+void SWPointer::Tracer::offset_plus_k_3(Node* n, int _offset) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConL PASSED, setting _offset = %d", n->_idx, _offset);
+  }
+}
+
+void SWPointer::Tracer::offset_plus_k_4(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED", n->_idx);
+    print_depth(); tty->print_cr("  \\ " JLONG_FORMAT " SWPointer::offset_plus_k: Op_ConL FAILED, k is too big", n->get_long());
+  }
+}
+
+void SWPointer::Tracer::offset_plus_k_5(Node* n, Node* _invar) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED since another invariant has been detected before", n->_idx);
+    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: _invar != NULL: ", _invar->_idx); _invar->dump();
+  }
+}
+
+void SWPointer::Tracer::offset_plus_k_6(Node* n, Node* _invar, bool _negate_invar, int _offset) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
+    n->_idx, _negate_invar, _invar->_idx, _offset);
+    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
+    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
+  }
+}
+
+void SWPointer::Tracer::offset_plus_k_7(Node* n, Node* _invar, bool _negate_invar, int _offset) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
+    n->_idx, _negate_invar, _invar->_idx, _offset);
+    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
+    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
+  }
+}
+
+void SWPointer::Tracer::offset_plus_k_8(Node* n, Node* _invar, bool _negate_invar, int _offset) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI is PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
+    n->_idx, _negate_invar, _invar->_idx, _offset);
+    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
+    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
+  }
+}
+
+void SWPointer::Tracer::offset_plus_k_9(Node* n, Node* _invar, bool _negate_invar, int _offset) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
+    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
+    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
+  }
+}
+
+void SWPointer::Tracer::offset_plus_k_10(Node* n, Node* _invar, bool _negate_invar, int _offset) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
+    print_depth(); tty->print_cr("  \\ %d SWPointer::offset_plus_k: is invariant", n->_idx);
+  }
+}
+
+void SWPointer::Tracer::offset_plus_k_11(Node* n) {
+  if(_slp->is_trace_alignment()) {
+    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED", n->_idx);
+  }
+}
+
+#endif
 // ========================= OrderedPair =====================
 
 const OrderedPair OrderedPair::initial;
@@ -3076,13 +3489,20 @@
 //
 // --------------------------------- vectorization/simd -----------------------------------
 //
+bool SuperWord::same_origin_idx(Node* a, Node* b) const {
+  return a != NULL && b != NULL && _clone_map.same_idx(a->_idx, b->_idx);
+}
+bool SuperWord::same_generation(Node* a, Node* b) const {
+  return a != NULL && b != NULL && _clone_map.same_gen(a->_idx, b->_idx);
+}
+
 Node*  SuperWord::find_phi_for_mem_dep(LoadNode* ld) {
   assert(in_bb(ld), "must be in block");
   if (_clone_map.gen(ld->_idx) == _ii_first) {
 #ifndef PRODUCT
     if (_vector_loop_debug) {
       tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(ld->_idx)=%d",
-                    _clone_map.gen(ld->_idx));
+        _clone_map.gen(ld->_idx));
     }
 #endif
     return NULL; //we think that any ld in the first gen being vectorizable
@@ -3094,18 +3514,18 @@
 #ifndef PRODUCT
     if (_vector_loop_debug) {
       tty->print_cr("SuperWord::find_phi_for_mem_dep input node %d to load %d has no other outputs and edge mem->load cannot be removed",
-                    mem->_idx, ld->_idx);
+        mem->_idx, ld->_idx);
       ld->dump();
       mem->dump();
     }
 #endif
     return NULL;
   }
-  if (!in_bb(mem) || _clone_map.gen(mem->_idx) == _clone_map.gen(ld->_idx)) {
+  if (!in_bb(mem) || same_generation(mem, ld)) {
 #ifndef PRODUCT
     if (_vector_loop_debug) {
       tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(mem->_idx)=%d",
-                    _clone_map.gen(mem->_idx));
+        _clone_map.gen(mem->_idx));
     }
 #endif
     return NULL; // does not depend on loop volatile node or depends on the same generation
@@ -3136,7 +3556,7 @@
 #ifndef PRODUCT
     if (_vector_loop_debug) {
       tty->print_cr("SuperWord::find_phi_for_mem_dep load %d is not vectorizable node, its phi %d is not _mem_slice_head",
-                    ld->_idx, phi->_idx);
+        ld->_idx, phi->_idx);
       ld->dump();
       phi->dump();
     }
@@ -3151,11 +3571,11 @@
 Node* SuperWord::first_node(Node* nd) {
   for (int ii = 0; ii < _iteration_first.length(); ii++) {
     Node* nnn = _iteration_first.at(ii);
-    if (_clone_map.idx(nnn->_idx) == _clone_map.idx(nd->_idx)) {
+    if (same_origin_idx(nnn, nd)) {
 #ifndef PRODUCT
       if (_vector_loop_debug) {
         tty->print_cr("SuperWord::first_node: %d is the first iteration node for %d (_clone_map.idx(nnn->_idx) = %d)",
-                      nnn->_idx, nd->_idx, _clone_map.idx(nnn->_idx));
+          nnn->_idx, nd->_idx, _clone_map.idx(nnn->_idx));
       }
 #endif
       return nnn;
@@ -3165,7 +3585,7 @@
 #ifndef PRODUCT
   if (_vector_loop_debug) {
     tty->print_cr("SuperWord::first_node: did not find first iteration node for %d (_clone_map.idx(nd->_idx)=%d)",
-                  nd->_idx, _clone_map.idx(nd->_idx));
+      nd->_idx, _clone_map.idx(nd->_idx));
   }
 #endif
   return 0;
@@ -3174,11 +3594,11 @@
 Node* SuperWord::last_node(Node* nd) {
   for (int ii = 0; ii < _iteration_last.length(); ii++) {
     Node* nnn = _iteration_last.at(ii);
-    if (_clone_map.idx(nnn->_idx) == _clone_map.idx(nd->_idx)) {
+    if (same_origin_idx(nnn, nd)) {
 #ifndef PRODUCT
       if (_vector_loop_debug) {
         tty->print_cr("SuperWord::last_node _clone_map.idx(nnn->_idx)=%d, _clone_map.idx(nd->_idx)=%d",
-                      _clone_map.idx(nnn->_idx), _clone_map.idx(nd->_idx));
+          _clone_map.idx(nnn->_idx), _clone_map.idx(nd->_idx));
       }
 #endif
       return nnn;
@@ -3219,9 +3639,11 @@
         } else if (_ii_first != _clone_map.gen(ii->_idx)) {
 #ifndef PRODUCT
           if (TraceSuperWord && Verbose) {
-            tty->print_cr("SuperWord::mark_generations _ii_first error - found different generations in two nodes ");
+            tty->print_cr("SuperWord::mark_generations: _ii_first was found before and not equal to one in this node (%d)", _ii_first);
             ii->dump();
-            ii_err->dump();
+            if (ii_err!= 0) {
+              ii_err->dump();
+            }
           }
 #endif
           return -1; // this phi has Stores from different generations of unroll and cannot be simd/vectorized
@@ -3252,8 +3674,7 @@
   }
 
   // building order of iterations
-  assert(_ii_order.length() == 0, "should be empty");
-  if (ii_err != 0) {
+  if (_ii_order.length() == 0 && ii_err != 0) {
     assert(in_bb(ii_err) && ii_err->is_Store(), "should be Store in bb");
     Node* nd = ii_err;
     while(_clone_map.gen(nd->_idx) != _ii_last) {
@@ -3261,7 +3682,7 @@
       bool found = false;
       for (DUIterator_Fast imax, i = nd->fast_outs(imax); i < imax; i++) {
         Node* use = nd->fast_out(i);
-        if (_clone_map.idx(use->_idx) == _clone_map.idx(nd->_idx) && use->as_Store()->in(MemNode::Memory) == nd) {
+        if (same_origin_idx(use, nd) && use->as_Store()->in(MemNode::Memory) == nd) {
           found = true;
           nd = use;
           break;
@@ -3303,7 +3724,7 @@
 
 bool SuperWord::fix_commutative_inputs(Node* gold, Node* fix) {
   assert(gold->is_Add() && fix->is_Add() || gold->is_Mul() && fix->is_Mul(), "should be only Add or Mul nodes");
-  assert(_clone_map.idx(gold->_idx) == _clone_map.idx(fix->_idx), "should be clones of the same node");
+  assert(same_origin_idx(gold, fix), "should be clones of the same node");
   Node* gin1 = gold->in(1);
   Node* gin2 = gold->in(2);
   Node* fin1 = fix->in(1);
@@ -3311,12 +3732,12 @@
   bool swapped = false;
 
   if (in_bb(gin1) && in_bb(gin2) && in_bb(fin1) && in_bb(fin1)) {
-    if (_clone_map.idx(gin1->_idx) == _clone_map.idx(fin1->_idx) &&
-        _clone_map.idx(gin2->_idx) == _clone_map.idx(fin2->_idx)) {
+    if (same_origin_idx(gin1, fin1) &&
+        same_origin_idx(gin2, fin2)) {
       return true; // nothing to fix
     }
-    if (_clone_map.idx(gin1->_idx) == _clone_map.idx(fin2->_idx) &&
-        _clone_map.idx(gin2->_idx) == _clone_map.idx(fin1->_idx)) {
+    if (same_origin_idx(gin1, fin2) &&
+        same_origin_idx(gin2, fin1)) {
       fix->swap_edges(1, 2);
       swapped = true;
     }
@@ -3364,7 +3785,7 @@
       for (int gen = 1; gen < _ii_order.length(); ++gen) {
         for (int kk = 0; kk < _block.length(); kk++) {
           Node* clone = _block.at(kk);
-          if (_clone_map.idx(clone->_idx) == _clone_map.idx(nd->_idx) &&
+          if (same_origin_idx(clone, nd) &&
               _clone_map.gen(clone->_idx) == _ii_order.at(gen)) {
             if (nd->is_Add() || nd->is_Mul()) {
               fix_commutative_inputs(nd, clone);
@@ -3429,13 +3850,12 @@
       if (ld->is_Load() && ld->as_Load()->in(MemNode::Memory) == n && in_bb(ld)) {
         for (int i = 0; i < _block.length(); i++) {
           Node* ld2 = _block.at(i);
-          if (ld2->is_Load() &&
-              _clone_map.idx(ld->_idx) == _clone_map.idx(ld2->_idx) &&
-              _clone_map.gen(ld->_idx) != _clone_map.gen(ld2->_idx)) { // <= do not collect the first generation ld
+          if (ld2->is_Load() && same_origin_idx(ld, ld2) &&
+              !same_generation(ld, ld2)) { // <= do not collect the first generation ld
 #ifndef PRODUCT
             if (_vector_loop_debug) {
               tty->print_cr("SuperWord::hoist_loads_in_graph: will try to hoist load ld2->_idx=%d, cloned from %d (ld->_idx=%d)",
-                            ld2->_idx, _clone_map.idx(ld->_idx), ld->_idx);
+                ld2->_idx, _clone_map.idx(ld->_idx), ld->_idx);
             }
 #endif
             // could not do on-the-fly, since iterator is immutable
@@ -3453,7 +3873,7 @@
 #ifndef PRODUCT
       if (_vector_loop_debug) {
         tty->print_cr("SuperWord::hoist_loads_in_graph replacing MemNode::Memory(%d) edge in %d with one from %d",
-                      MemNode::Memory, ld->_idx, phi->_idx);
+          MemNode::Memory, ld->_idx, phi->_idx);
       }
 #endif
       _igvn.replace_input_of(ld, MemNode::Memory, phi);
--- a/src/share/vm/opto/superword.hpp	Fri Jul 10 11:59:09 2015 -0700
+++ b/src/share/vm/opto/superword.hpp	Tue Jul 14 09:55:22 2015 -0700
@@ -203,6 +203,7 @@
 // -----------------------------SuperWord---------------------------------
 // Transforms scalar operations into packed (superword) operations.
 class SuperWord : public ResourceObj {
+ friend class SWPointer;
  private:
   PhaseIdealLoop* _phase;
   Arena*          _arena;
@@ -247,8 +248,17 @@
   PhaseIdealLoop* phase()          { return _phase; }
   IdealLoopTree* lpt()             { return _lpt; }
   PhiNode* iv()                    { return _iv; }
+
   bool early_return()              { return _early_return; }
 
+#ifndef PRODUCT
+  bool     is_debug()              { return _vector_loop_debug > 0; }
+  bool     is_trace_alignment()    { return (_vector_loop_debug & 2) > 0; }
+  bool     is_trace_mem_slice()    { return (_vector_loop_debug & 4) > 0; }
+  bool     is_trace_loop()         { return (_vector_loop_debug & 8) > 0; }
+  bool     is_trace_adjacent()     { return (_vector_loop_debug & 16) > 0; }
+#endif
+  bool     do_vector_loop()        { return _do_vector_loop; }
  private:
   IdealLoopTree* _lpt;             // Current loop tree node
   LoopNode*      _lp;              // Current LoopNode
@@ -257,12 +267,14 @@
   bool           _race_possible;   // In cases where SDMU is true
   bool           _early_return;    // True if we do not initialize
   bool           _do_vector_loop;  // whether to do vectorization/simd style
-  bool           _vector_loop_debug; // provide more printing in debug mode
   int            _num_work_vecs;   // Number of non memory vector operations
   int            _num_reductions;  // Number of reduction expressions applied
   int            _ii_first;        // generation with direct deps from mem phi
   int            _ii_last;         // generation with direct deps to   mem phi
   GrowableArray<int> _ii_order;
+#ifndef PRODUCT
+  uintx          _vector_loop_debug; // provide more printing in debug mode
+#endif
 
   // Accessors
   Arena* arena()                   { return _arena; }
@@ -325,12 +337,20 @@
   Node_List* my_pack(Node* n)                { return !in_bb(n) ? NULL : _node_info.adr_at(bb_idx(n))->_my_pack; }
   void set_my_pack(Node* n, Node_List* p)    { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_my_pack = p; }
 
+  // CloneMap utilities
+  bool same_origin_idx(Node* a, Node* b) const;
+  bool same_generation(Node* a, Node* b) const;
+
   // methods
 
   // Extract the superword level parallelism
   void SLP_extract();
   // Find the adjacent memory references and create pack pairs for them.
   void find_adjacent_refs();
+  // Tracing support
+  #ifndef PRODUCT
+  void find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best_iv_adjustment);
+  #endif
   // Find a memory reference to align the loop induction variable to.
   MemNode* find_align_to_ref(Node_List &memops);
   // Calculate loop's iv adjustment for this memory ops.
@@ -340,13 +360,13 @@
   // rebuild the graph so all loads in different iterations of cloned loop become dependant on phi node (in _do_vector_loop only)
   bool hoist_loads_in_graph();
   // Test whether MemNode::Memory dependency to the same load but in the first iteration of this loop is coming from memory phi
-  // Return false if failed.
+  // Return false if failed
   Node* find_phi_for_mem_dep(LoadNode* ld);
   // Return same node but from the first generation. Return 0, if not found
   Node* first_node(Node* nd);
   // Return same node as this but from the last generation. Return 0, if not found
   Node* last_node(Node* n);
-  // Mark nodes belonging to first and last generation,
+  // Mark nodes belonging to first and last generation
   // returns first generation index or -1 if vectorization/simd is impossible
   int mark_generations();
   // swapping inputs of commutative instruction (Add or Mul)
@@ -483,10 +503,7 @@
   IdealLoopTree*  lpt()   { return _slp->lpt(); }
   PhiNode*        iv()    { return _slp->iv();  } // Induction var
 
-  bool invariant(Node* n) {
-    Node *n_c = phase()->get_ctrl(n);
-    return !lpt()->is_member(phase()->get_loop(n_c));
-  }
+  bool invariant(Node* n);
 
   // Match: k*iv + offset
   bool scaled_iv_plus_offset(Node* n);
@@ -545,6 +562,76 @@
   static bool comparable(int cmp) { return cmp < NotComparable; }
 
   void print();
+
+#ifndef PRODUCT
+  class Tracer {
+    friend class SuperWord;
+    friend class SWPointer;
+    SuperWord*   _slp;
+    static int   _depth;
+    int _depth_save;
+    void print_depth();
+    int  depth() const    { return _depth; }
+    void set_depth(int d) { _depth = d; }
+    void inc_depth()      { _depth++;}
+    void dec_depth()      { if (_depth > 0) _depth--;}
+    void store_depth()    {_depth_save = _depth;}
+    void restore_depth()  {_depth = _depth_save;}
+
+    class Depth {
+      friend class Tracer;
+      friend class SWPointer;
+      friend class SuperWord;
+      Depth()  { ++_depth; }
+      Depth(int x)  { _depth = 0; }
+      ~Depth() { if (_depth > 0) --_depth;}
+    };
+    Tracer (SuperWord* slp) : _slp(slp) {}
+
+    // tracing functions
+    void ctor_1(Node* mem);
+    void ctor_2(Node* adr);
+    void ctor_3(Node* adr, int i);
+    void ctor_4(Node* adr, int i);
+    void ctor_5(Node* adr, Node* base,  int i);
+    void ctor_6(Node* mem);
+
+    void invariant_1(Node *n, Node *n_c);
+
+    void scaled_iv_plus_offset_1(Node* n);
+    void scaled_iv_plus_offset_2(Node* n);
+    void scaled_iv_plus_offset_3(Node* n);
+    void scaled_iv_plus_offset_4(Node* n);
+    void scaled_iv_plus_offset_5(Node* n);
+    void scaled_iv_plus_offset_6(Node* n);
+    void scaled_iv_plus_offset_7(Node* n);
+    void scaled_iv_plus_offset_8(Node* n);
+
+    void scaled_iv_1(Node* n);
+    void scaled_iv_2(Node* n, int scale);
+    void scaled_iv_3(Node* n, int scale);
+    void scaled_iv_4(Node* n, int scale);
+    void scaled_iv_5(Node* n, int scale);
+    void scaled_iv_6(Node* n, int scale);
+    void scaled_iv_7(Node* n);
+    void scaled_iv_8(Node* n, SWPointer* tmp);
+    void scaled_iv_9(Node* n, int _scale, int _offset, int mult);
+    void scaled_iv_10(Node* n);
+
+    void offset_plus_k_1(Node* n);
+    void offset_plus_k_2(Node* n, int _offset);
+    void offset_plus_k_3(Node* n, int _offset);
+    void offset_plus_k_4(Node* n);
+    void offset_plus_k_5(Node* n, Node* _invar);
+    void offset_plus_k_6(Node* n, Node* _invar, bool _negate_invar, int _offset);
+    void offset_plus_k_7(Node* n, Node* _invar, bool _negate_invar, int _offset);
+    void offset_plus_k_8(Node* n, Node* _invar, bool _negate_invar, int _offset);
+    void offset_plus_k_9(Node* n, Node* _invar, bool _negate_invar, int _offset);
+    void offset_plus_k_10(Node* n, Node* _invar, bool _negate_invar, int _offset);
+    void offset_plus_k_11(Node* n);
+
+  } _tracer;//TRacer;
+#endif
 };