changeset 57362:63004af6fc57

8233033: C2 produces wrong result while unswitching a loop due to lost control dependencies Summary: Adding missing control dependencies when cloning loop predicates at loop unswitching. Reviewed-by: roland, vlivanov, thartmann
author chagedorn
date Wed, 11 Dec 2019 14:33:32 +0100
parents 9b513dbd57a3
children 99c493a347a1
files src/hotspot/share/opto/loopUnswitch.cpp test/hotspot/jtreg/compiler/loopopts/PartialPeelingUnswitch.java
diffstat 2 files changed, 258 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/src/hotspot/share/opto/loopUnswitch.cpp	Wed Dec 11 14:08:20 2019 +0100
+++ b/src/hotspot/share/opto/loopUnswitch.cpp	Wed Dec 11 14:33:32 2019 +0100
@@ -262,7 +262,7 @@
   ProjNode* ifslow = new IfFalseNode(iff);
   register_node(ifslow, outer_loop, iff, dom_depth(iff));
 
-  // Clone the loop body.  The clone becomes the fast loop.  The
+  // Clone the loop body.  The clone becomes the slow loop.  The
   // original pre-header will (illegally) have 3 control users
   // (old & new loops & new if).
   clone_loop(loop, old_new, dom_depth(head->skip_strip_mined()), mode, iff);
@@ -281,6 +281,123 @@
   _igvn.replace_input_of(slow_l, LoopNode::EntryControl, ifslow_pred);
   set_idom(slow_l, ifslow_pred, dom_depth(l));
 
+  if (iffast != iffast_pred && entry->outcnt() > 1) {
+    // This situation occurs when only non-CFG nodes (i.e. no control dependencies between them) with a control
+    // input from the loop header were partially peeled before (now control dependent on loop entry control).
+    // If additional CFG nodes were peeled, then the insertion point of the loop predicates from the parsing stage
+    // would not be found anymore and the predicates not cloned at all (i.e. iffast == iffast_pred) as it happens
+    // for normal peeling. Those partially peeled statements have a control input from the old loop entry control
+    // and need to be executed after the predicates. These control dependencies need to be removed from the old
+    // entry control and added to the new entry control nodes 'iffast_pred' and 'ifslow_pred'. Since each node can
+    // only have one control input, we need to create clones for all statements (2) that can be reached over a path
+    // from the old entry control 'entry' (1) to a loop phi (8, 9). The old nodes (2) will be moved to the fast loop and the
+    // new cloned nodes (10) to the slow loop.
+    //
+    // The result of the following algorithm is visualized below. The cloned loop predicates for the fast loop
+    // are between the loop selection node (3) and the entry control for the fast loop (4) and for the slow loop
+    // between the loop selection node (3) and the entry control for the slow loop (5), respectively.
+    //
+    //      1 entry                                    1 entry
+    //      /     \                                       |
+    //  2 stmt    3 iff                                 3 iff
+    //   |        /  \                                 /     \
+    //   |      ..    ..                             ..       ..
+    //   |      /      \                             /         \
+    //   | 4 iffast_p  5 ifslow_p          4 iffast_p          5 ifslow_p
+    //   |     |          |                /    \               /       \
+    //   |   6 head  7 slow_head   ==>  6 head  2 stmt   7 slow_head  10 cloned_stmt
+    //   |     |          |                \    /               \       /
+    //   +--\  |    +--\  |                8 phi                  9 phi
+    //   |   8 phi  |  9 phi
+    //   |          |
+    //   +----------+
+    //
+    assert(ifslow != ifslow_pred, "sanity - must also be different");
+
+    ResourceMark rm;
+    Unique_Node_List worklist;
+    Unique_Node_List phis;
+    Node_List old_clone;
+    LoopNode* slow_head = old_new[head->_idx]->as_Loop();
+
+    // 1) Do a BFS starting from the outputs of the original entry control node 'entry' to all (loop) phis
+    // and add the non-phi nodes to the worklist.
+    // First get all outputs of 'entry' which are not the new "loop selection check" 'iff'.
+    for (DUIterator_Fast imax, i = entry->fast_outs(imax); i < imax; i++) {
+      Node* stmt = entry->fast_out(i);
+      if (stmt != iff) {
+        assert(!stmt->is_CFG(), "cannot be a CFG node");
+        worklist.push(stmt);
+      }
+    }
+
+    // Then do a BFS from all collected nodes so far and stop if a phi node is hit.
+    // Keep track of them on a separate 'phis' list to adjust their inputs later.
+    for (uint i = 0; i < worklist.size(); i++) {
+      Node* stmt = worklist.at(i);
+      for (DUIterator_Fast jmax, j = stmt->fast_outs(jmax); j < jmax; j++) {
+        Node* out = stmt->fast_out(j);
+        assert(!out->is_CFG(), "cannot be a CFG node");
+        if (out->is_Phi()) {
+          assert(out->in(PhiNode::Region) == head || out->in(PhiNode::Region) == slow_head,
+                 "phi must be either part of the slow or the fast loop");
+          phis.push(out);
+        } else {
+          worklist.push(out);
+        }
+      }
+    }
+
+    // 2) All nodes of interest are in 'worklist' and are now cloned. This could not be done simultaneously
+    // in step 1 in an easy way because we could have cloned a node which has an input that is added to the
+    // worklist later. As a result, the BFS would hit a clone which does not need to be cloned again.
+    // While cloning a node, the control inputs to 'entry' are updated such that the old node points to
+    // 'iffast_pred' and the clone to 'ifslow_pred', respectively.
+    for (uint i = 0; i < worklist.size(); i++) {
+      Node* stmt = worklist.at(i);
+      assert(!stmt->is_CFG(), "cannot be a CFG node");
+      Node* cloned_stmt = stmt->clone();
+      old_clone.map(stmt->_idx, cloned_stmt);
+      _igvn.register_new_node_with_optimizer(cloned_stmt);
+
+      if (stmt->in(0) == entry) {
+        _igvn.replace_input_of(stmt, 0, iffast_pred);
+        set_ctrl(stmt, iffast_pred);
+        _igvn.replace_input_of(cloned_stmt, 0, ifslow_pred);
+        set_ctrl(cloned_stmt, ifslow_pred);
+      }
+    }
+
+    // 3) Update the entry control of all collected phi nodes of the slow loop to use the cloned nodes
+    // instead of the old ones from the worklist
+    for (uint i = 0; i < phis.size(); i++) {
+      assert(phis.at(i)->is_Phi(), "must be a phi");
+      PhiNode* phi = phis.at(i)->as_Phi();
+      if (phi->in(PhiNode::Region) == slow_head) {
+        // Slow loop: Update phi entry control to use the cloned version instead of the old one from the worklist
+        Node* entry_control = phi->in(LoopNode::EntryControl);
+        _igvn.replace_input_of(phi, LoopNode::EntryControl, old_clone[phi->in(LoopNode::EntryControl)->_idx]);
+      }
+
+    }
+
+    // 4) Replace all input edges of cloned nodes from old nodes on the worklist by an input edge from their
+    // corresponding cloned version.
+    for (uint i = 0; i < worklist.size(); i++) {
+      Node* stmt = worklist.at(i);
+      for (uint j = 0; j < stmt->req(); j++) {
+        Node* in = stmt->in(j);
+        if (in == NULL) {
+          continue;
+        }
+
+        if (worklist.contains(in)) {
+          // Replace the edge old1->clone_of_old2 with an edge clone_of_old1->clone_of_old2 (old1 is 'in', old2 is 'stmt')
+          old_clone[stmt->_idx]->set_req(j, old_clone[in->_idx]);
+        }
+      }
+    }
+  }
   recompute_dom_depth();
 
   return iffast;
@@ -303,7 +420,7 @@
   ProjNode* ifslow = new IfFalseNode(iff);
   register_node(ifslow, outer_loop, iff, dom_depth(iff));
 
-  // Clone the loop body.  The clone becomes the fast loop.  The
+  // Clone the loop body.  The clone becomes the slow loop.  The
   // original pre-header will (illegally) have 3 control users
   // (old & new loops & new if).
   clone_loop(loop, old_new, dom_depth(head), CloneIncludesStripMined, iff);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/hotspot/jtreg/compiler/loopopts/PartialPeelingUnswitch.java	Wed Dec 11 14:33:32 2019 +0100
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @bug 8233033
+ * @summary Tests if partially peeled statements are not executed before the loop predicates of the unswitched fast loop.
+ *
+ * @run main/othervm -Xbatch -XX:-TieredCompilation
+ *      -XX:CompileCommand=compileonly,compiler.loopopts.PartialPeelingUnswitch::test*
+ *      compiler.loopopts.PartialPeelingUnswitch
+ */
+
+package compiler.loopopts;
+
+public class PartialPeelingUnswitch {
+
+    public static int iFld; // set to 13 at the start of test() and test2(); its final value is checked in main()
+    public static int x = 42; // written by test2(); restored to 42 in main() after each test2() run
+    public static int y = 31; // read and written by test2(); restored to 31 in main() after each test2() run
+    public static int[] iArr = new int[10]; // read and written by test2(); zeroed in main() after each test2() run
+
+    public int test() {
+        /*
+         * The inner loop of this test is first partially peeled and then unswitched. An uncommon trap is hit in one
+         * of the cloned loop predicates for the fast loop (set up at unswitching stage). The only partially peeled
+         * statement "iFld += -7" was wrongly executed before the predicates (and before the loop itself).
+         * When hitting the uncommon trap, "iFld >>= 1" was not yet executed. As a result, the interpreter
+         * re-executed "iFld += -7". This produced a wrong result for "iFld". The fix makes peeled statements
+         * control dependent on the cloned loop predicates such that they are executed after them.
+         */
+        iFld = 13;
+        for (int i = 0; i < 8; i++) {
+            int j = 10;
+            while (--j > 0) { // 9 iterations per outer iteration (j = 9..1)
+                iFld += -7;
+                switch ((i * 5) + 102) { // 102, 107, ..., 137 for i in [0, 8): never 120, 103 or 116, so 'default' always runs
+                case 120:
+                    break;
+                case 103:
+                    break;
+                case 116:
+                    break;
+                default:
+                    iFld >>= 1;
+                }
+            }
+        }
+        return iFld;
+    }
+
+    public int test2() {
+        /*
+         * Same nested loop structure as in test() but with more statements that are partially peeled from the inner loop.
+         * Afterwards the inner loop is unswitched.
+         */
+        iFld = 13;
+        int k = 0;
+        for (int i = 0; i < 8; i++) {
+            int j = 10;
+            while (--j > 0) { // 9 iterations per outer iteration (j = 9..1)
+                // All statements before the switch expression are partially peeled
+                iFld += -7;
+                x = y + iFld;
+                y = iArr[5];
+                k = 6;
+                iArr[5] = 5;
+                iArr[6] += 23;
+                iArr[7] = iArr[8] + iArr[6];
+                iArr[j] = 34; // j runs 9..1, so this also overwrites iArr[5]..iArr[8] once per outer iteration
+                switch ((i * 5) + 102) { // 102, 107, ..., 137 for i in [0, 8): never 120, 103 or 116, so 'default' always runs
+                case 120:
+                    break;
+                case 103:
+                    break;
+                case 116:
+                    break;
+                default:
+                    iFld >>= 1;
+                }
+            }
+        }
+        return iFld + k;
+    }
+
+    public static void main(String[] strArr) {
+        PartialPeelingUnswitch _instance = new PartialPeelingUnswitch();
+        for (int i = 0; i < 200; i++) { // call test() 200 times and verify the result of every call
+            int result = _instance.test();
+            if (result != -7) {
+                throw new RuntimeException("Result should always be -7 but was " + result);
+            }
+        }
+
+        for (int i = 0; i < 200; i++) { // same for test2(), additionally verifying the static fields it writes
+            int result = _instance.test2();
+            check(-1, result); // test2() returns iFld + k, i.e. -7 + 6
+            check(-7, iFld);
+            check(-9, x);
+            check(5, y);
+            check(5, iArr[5]);
+            check(149, iArr[6]);
+            check(183, iArr[7]);
+
+            // Reset the static state mutated by test2() so that each iteration starts from the same values
+            for (int j = 0; j < 10; j++) {
+                iArr[j] = 0;
+            }
+            x = 42;
+            y = 31;
+        }
+    }
+
+    public static void check(int expected, int actual) { // throws a RuntimeException if 'actual' differs from 'expected'
+        if (expected != actual) {
+            throw new RuntimeException("Wrong result, expected: " + expected + ", actual: " + actual);
+        }
+    }
+}