changeset 58647:29d878d3af35

8241040: Support for AVX-512 Ternary Logic Instruction. Summary: A new pass has been added which folds expression tree involving vector boolean logic operations into a MacroLogic node. Reviewed-by: vlivanov, neliasso
author jbhateja
date Thu, 02 Apr 2020 22:38:23 +0530
parents 117b5f22ad28
children 9be4780d3c58
files src/hotspot/cpu/x86/assembler_x86.cpp src/hotspot/cpu/x86/assembler_x86.hpp src/hotspot/cpu/x86/macroAssembler_x86.cpp src/hotspot/cpu/x86/macroAssembler_x86.hpp src/hotspot/cpu/x86/x86.ad src/hotspot/share/adlc/formssel.cpp src/hotspot/share/opto/c2_globals.hpp src/hotspot/share/opto/classes.hpp src/hotspot/share/opto/compile.cpp src/hotspot/share/opto/compile.hpp src/hotspot/share/opto/matcher.cpp src/hotspot/share/opto/vectornode.cpp src/hotspot/share/opto/vectornode.hpp test/hotspot/jtreg/compiler/vectorization/TestMacroLogicVector.java test/micro/org/openjdk/bench/vm/compiler/MacroLogicOpt.java
diffstat 15 files changed, 842 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/src/hotspot/cpu/x86/assembler_x86.cpp	Thu Apr 02 18:22:27 2020 +0200
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp	Thu Apr 02 22:38:23 2020 +0530
@@ -6216,6 +6216,30 @@
   emit_operand(dst, src);
 }
 
+void Assembler::vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len) {
+  assert(VM_Version::supports_evex(), "requires EVEX support");
+  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src3->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x25);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(imm8);
+}
+
+void Assembler::vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len) {
+  assert(VM_Version::supports_evex(), "requires EVEX support");
+  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support");
+  assert(dst != xnoreg, "sanity");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  vex_prefix(src3, src2->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x25);
+  emit_operand(dst, src3);
+  emit_int8(imm8);
+}
 
 // vinserti forms
 
--- a/src/hotspot/cpu/x86/assembler_x86.hpp	Thu Apr 02 18:22:27 2020 +0200
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp	Thu Apr 02 22:38:23 2020 +0530
@@ -2198,6 +2198,9 @@
   void evpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpxorq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
+  // Ternary logic instruction.
+  void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
+  void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address     src3, int vector_len);
 
   // vinserti forms
   void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Thu Apr 02 18:22:27 2020 +0200
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Thu Apr 02 22:38:23 2020 +0530
@@ -3757,6 +3757,14 @@
   BLOCK_COMMENT("} verify_oop");
 }
 
+void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
+  if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
+    vpternlogd(dst, 0xFF, dst, dst, vector_len);
+  } else {
+    assert(UseAVX > 0, "");
+    vpcmpeqb(dst, dst, dst, vector_len);
+  }
+}
 
 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                       Register tmp,
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp	Thu Apr 02 18:22:27 2020 +0200
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp	Thu Apr 02 22:38:23 2020 +0530
@@ -1721,6 +1721,8 @@
   void cache_wb(Address line);
   void cache_wbsync(bool is_pre);
 #endif // _LP64
+
+  void vallones(XMMRegister dst, int vector_len);
 };
 
 /**
--- a/src/hotspot/cpu/x86/x86.ad	Thu Apr 02 18:22:27 2020 +0200
+++ b/src/hotspot/cpu/x86/x86.ad	Thu Apr 02 22:38:23 2020 +0530
@@ -1363,6 +1363,11 @@
         return false; // 128bit vroundpd is not available
       }
       break;
+    case Op_MacroLogicV:
+      if (UseAVX < 3 || !UseVectorMacroLogic) {
+        return false;
+      }
+      break;
 #ifndef _LP64
     case Op_AddReductionVF:
     case Op_AddReductionVD:
@@ -1408,6 +1413,7 @@
   //   * implementation limitations
   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
   //   * 128bit vroundpd instruction is present only in AVX1
+  int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
   switch (opcode) {
     case Op_AbsVF:
     case Op_NegVF:
@@ -1426,6 +1432,12 @@
         return false; // implementation limitation (only vcmov8F_reg is present)
       }
       break;
+    case Op_MacroLogicV:
+      if (!VM_Version::supports_evex() ||
+          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
+        return false;
+      }
+      break;
     case Op_CMoveVD:
       if (vlen != 4) {
         return false; // implementation limitation (only vcmov4D_reg is present)
@@ -3356,6 +3368,20 @@
   ins_pipe( fpu_reg_reg );
 %}
 
+instruct ReplI_M1(vec dst, immI_M1 con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (ReplicateB con));
+  match(Set dst (ReplicateS con));
+  match(Set dst (ReplicateI con));
+  effect(TEMP dst);
+  format %{ "vallones $dst" %}
+  ins_encode %{
+    int vector_len = vector_length_encoding(this);
+    __ vallones($dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // ====================ReplicateL=======================================
 
 #ifdef _LP64
@@ -3488,6 +3514,18 @@
   ins_pipe( fpu_reg_reg );
 %}
 
+instruct ReplL_M1(vec dst, immL_M1 con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (ReplicateL con));
+  effect(TEMP dst);
+  format %{ "vallones $dst" %}
+  ins_encode %{
+    int vector_len = vector_length_encoding(this);
+    __ vallones($dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // ====================ReplicateF=======================================
 
 instruct ReplF_reg(vec dst, vlRegF src) %{
@@ -5154,3 +5192,27 @@
   %}
   ins_pipe( pipe_slow );
 %}
+
+// --------------------------------- Bitwise Ternary Logic ----------------------------------
+
+instruct vpternlogdB(vec dst, vec src2, vec src3, immU8 func) %{
+  match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
+  effect(TEMP dst);
+  format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
+  ins_encode %{
+    int vector_len = vector_length_encoding(this);
+    __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vpternlogdB_mem(vec dst, vec src2, memory src3, immU8 func) %{
+  match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
+  effect(TEMP dst);
+  format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
+  ins_encode %{
+    int vector_len = vector_length_encoding(this);
+    __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
--- a/src/hotspot/share/adlc/formssel.cpp	Thu Apr 02 18:22:27 2020 +0200
+++ b/src/hotspot/share/adlc/formssel.cpp	Thu Apr 02 22:38:23 2020 +0530
@@ -4168,7 +4168,7 @@
     "MulReductionVF", "MulReductionVD",
     "MaxReductionV", "MinReductionV",
     "AndReductionV", "OrReductionV", "XorReductionV",
-    "MulAddVS2VI",
+    "MulAddVS2VI", "MacroLogicV",
     "LShiftCntV","RShiftCntV",
     "LShiftVB","LShiftVS","LShiftVI","LShiftVL",
     "RShiftVB","RShiftVS","RShiftVI","RShiftVL",
--- a/src/hotspot/share/opto/c2_globals.hpp	Thu Apr 02 18:22:27 2020 +0200
+++ b/src/hotspot/share/opto/c2_globals.hpp	Thu Apr 02 22:38:23 2020 +0530
@@ -186,6 +186,9 @@
   notproduct(bool, TraceSuperWordLoopUnrollAnalysis, false,                 \
           "Trace what Superword Level Parallelism analysis applies")        \
                                                                             \
+  diagnostic(bool, UseVectorMacroLogic, true,                               \
+          "Use ternary macro logic instructions")                           \
+                                                                            \
   product(intx,  LoopUnrollMin, 4,                                          \
           "Minimum number of unroll loop bodies before checking progress"   \
           "of rounds of unroll,optimize,..")                                \
--- a/src/hotspot/share/opto/classes.hpp	Thu Apr 02 18:22:27 2020 +0200
+++ b/src/hotspot/share/opto/classes.hpp	Thu Apr 02 22:38:23 2020 +0530
@@ -312,6 +312,7 @@
 macro(SubL)
 macro(TailCall)
 macro(TailJump)
+macro(MacroLogicV)
 macro(ThreadLocal)
 macro(Unlock)
 macro(URShiftI)
--- a/src/hotspot/share/opto/compile.cpp	Thu Apr 02 18:22:27 2020 +0200
+++ b/src/hotspot/share/opto/compile.cpp	Thu Apr 02 22:38:23 2020 +0530
@@ -76,6 +76,7 @@
 #include "utilities/align.hpp"
 #include "utilities/copy.hpp"
 #include "utilities/macros.hpp"
+#include "utilities/resourceHash.hpp"
 
 
 // -------------------- Compile::mach_constant_base_node -----------------------
@@ -2228,6 +2229,11 @@
     igvn.optimize();
   }
 
+  if (C->max_vector_size() > 0) {
+    C->optimize_logic_cones(igvn);
+    igvn.optimize();
+  }
+
   DEBUG_ONLY( _modified_nodes = NULL; )
  } // (End scope of igvn; run destructor if necessary for asserts.)
 
@@ -2244,6 +2250,317 @@
  print_method(PHASE_OPTIMIZE_FINISHED, 2);
 }
 
+//---------------------------- Bitwise operation packing optimization ---------------------------
+
+static bool is_vector_unary_bitwise_op(Node* n) {
+  return n->Opcode() == Op_XorV &&
+         VectorNode::is_vector_bitwise_not_pattern(n);
+}
+
+static bool is_vector_binary_bitwise_op(Node* n) {
+  switch (n->Opcode()) {
+    case Op_AndV:
+    case Op_OrV:
+      return true;
+
+    case Op_XorV:
+      return !is_vector_unary_bitwise_op(n);
+
+    default:
+      return false;
+  }
+}
+
+static bool is_vector_ternary_bitwise_op(Node* n) {
+  return n->Opcode() == Op_MacroLogicV;
+}
+
+static bool is_vector_bitwise_op(Node* n) {
+  return is_vector_unary_bitwise_op(n)  ||
+         is_vector_binary_bitwise_op(n) ||
+         is_vector_ternary_bitwise_op(n);
+}
+
+static bool is_vector_bitwise_cone_root(Node* n) {
+  if (!is_vector_bitwise_op(n)) {
+    return false;
+  }
+  for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
+    if (is_vector_bitwise_op(n->fast_out(i))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+static uint collect_unique_inputs(Node* n, Unique_Node_List& partition, Unique_Node_List& inputs) {
+  uint cnt = 0;
+  if (is_vector_bitwise_op(n)) {
+    if (VectorNode::is_vector_bitwise_not_pattern(n)) {
+      for (uint i = 1; i < n->req(); i++) {
+        Node* in = n->in(i);
+        bool skip = VectorNode::is_all_ones_vector(in);
+        if (!skip && !inputs.member(in)) {
+          inputs.push(in);
+          cnt++;
+        }
+      }
+      assert(cnt <= 1, "not unary");
+    } else {
+      uint last_req = n->req();
+      if (is_vector_ternary_bitwise_op(n)) {
+        last_req = n->req() - 1; // skip last input
+      }
+      for (uint i = 1; i < last_req; i++) {
+        Node* def = n->in(i);
+        if (!inputs.member(def)) {
+          inputs.push(def);
+          cnt++;
+        }
+      }
+    }
+    partition.push(n);
+  } else { // not a bitwise operations
+    if (!inputs.member(n)) {
+      inputs.push(n);
+      cnt++;
+    }
+  }
+  return cnt;
+}
+
+void Compile::collect_logic_cone_roots(Unique_Node_List& list) {
+  Unique_Node_List useful_nodes;
+  C->identify_useful_nodes(useful_nodes);
+
+  for (uint i = 0; i < useful_nodes.size(); i++) {
+    Node* n = useful_nodes.at(i);
+    if (is_vector_bitwise_cone_root(n)) {
+      list.push(n);
+    }
+  }
+}
+
+Node* Compile::xform_to_MacroLogicV(PhaseIterGVN& igvn,
+                                    const TypeVect* vt,
+                                    Unique_Node_List& partition,
+                                    Unique_Node_List& inputs) {
+  assert(partition.size() == 2 || partition.size() == 3, "not supported");
+  assert(inputs.size()    == 2 || inputs.size()    == 3, "not supported");
+  assert(Matcher::match_rule_supported_vector(Op_MacroLogicV, vt->length(), vt->element_basic_type()), "not supported");
+
+  Node* in1 = inputs.at(0);
+  Node* in2 = inputs.at(1);
+  Node* in3 = (inputs.size() == 3 ? inputs.at(2) : in2);
+
+  uint func = compute_truth_table(partition, inputs);
+  return igvn.transform(MacroLogicVNode::make(igvn, in3, in2, in1, func, vt));
+}
+
+static uint extract_bit(uint func, uint pos) {
+  return (func & (1 << pos)) >> pos;
+}
+
+//
+//  A macro logic node represents a truth table. It has 4 inputs,
+//  First three inputs corresponds to 3 columns of a truth table
+//  and fourth input captures the logic function.
+//
+//  eg.  fn = (in1 AND in2) OR in3;
+//
+//      MacroNode(in1,in2,in3,fn)
+//
+//  -----------------
+//  in1 in2 in3  fn
+//  -----------------
+//  0    0   0    0
+//  0    0   1    1
+//  0    1   0    0
+//  0    1   1    1
+//  1    0   0    0
+//  1    0   1    1
+//  1    1   0    1
+//  1    1   1    1
+//
+
+uint Compile::eval_macro_logic_op(uint func, uint in1 , uint in2, uint in3) {
+  int res = 0;
+  for (int i = 0; i < 8; i++) {
+    int bit1 = extract_bit(in1, i);
+    int bit2 = extract_bit(in2, i);
+    int bit3 = extract_bit(in3, i);
+
+    int func_bit_pos = (bit1 << 2 | bit2 << 1 | bit3);
+    int func_bit = extract_bit(func, func_bit_pos);
+
+    res |= func_bit << i;
+  }
+  return res;
+}
+
+static uint eval_operand(Node* n, ResourceHashtable<Node*,uint>& eval_map) {
+  assert(n != NULL, "");
+  assert(eval_map.contains(n), "absent");
+  return *(eval_map.get(n));
+}
+
+static void eval_operands(Node* n,
+                          uint& func1, uint& func2, uint& func3,
+                          ResourceHashtable<Node*,uint>& eval_map) {
+  assert(is_vector_bitwise_op(n), "");
+  func1 = eval_operand(n->in(1), eval_map);
+
+  if (is_vector_binary_bitwise_op(n)) {
+    func2 = eval_operand(n->in(2), eval_map);
+  } else if (is_vector_ternary_bitwise_op(n)) {
+    func2 = eval_operand(n->in(2), eval_map);
+    func3 = eval_operand(n->in(3), eval_map);
+  } else {
+    assert(is_vector_unary_bitwise_op(n), "not unary");
+  }
+}
+
+uint Compile::compute_truth_table(Unique_Node_List& partition, Unique_Node_List& inputs) {
+  assert(inputs.size() <= 3, "sanity");
+  ResourceMark rm;
+  uint res = 0;
+  ResourceHashtable<Node*,uint> eval_map;
+
+  // Populate precomputed functions for inputs.
+  // Each input corresponds to one column of 3 input truth-table.
+  uint input_funcs[] = { 0xAA,   // (_, _, a) -> a
+                         0xCC,   // (_, b, _) -> b
+                         0xF0 }; // (c, _, _) -> c
+  for (uint i = 0; i < inputs.size(); i++) {
+    eval_map.put(inputs.at(i), input_funcs[i]);
+  }
+
+  for (uint i = 0; i < partition.size(); i++) {
+    Node* n = partition.at(i);
+
+    uint func1 = 0, func2 = 0, func3 = 0;
+    eval_operands(n, func1, func2, func3, eval_map);
+
+    switch (n->Opcode()) {
+      case Op_OrV:
+        assert(func3 == 0, "not binary");
+        res = func1 | func2;
+        break;
+      case Op_AndV:
+        assert(func3 == 0, "not binary");
+        res = func1 & func2;
+        break;
+      case Op_XorV:
+        if (VectorNode::is_vector_bitwise_not_pattern(n)) {
+          assert(func2 == 0 && func3 == 0, "not unary");
+          res = (~func1) & 0xFF;
+        } else {
+          assert(func3 == 0, "not binary");
+          res = func1 ^ func2;
+        }
+        break;
+      case Op_MacroLogicV:
+        // Ordering of inputs may change during evaluation of sub-tree
+        // containing MacroLogic node as a child node, thus a re-evaluation
+        // makes sure that function is evaluated in context of current
+        // inputs.
+        res = eval_macro_logic_op(n->in(4)->get_int(), func1, func2, func3);
+        break;
+
+      default: assert(false, "not supported: %s", n->Name());
+    }
+    assert(res <= 0xFF, "invalid");
+    eval_map.put(n, res);
+  }
+  return res;
+}
+
+bool Compile::compute_logic_cone(Node* n, Unique_Node_List& partition, Unique_Node_List& inputs) {
+  assert(partition.size() == 0, "not empty");
+  assert(inputs.size() == 0, "not empty");
+  assert(!is_vector_ternary_bitwise_op(n), "not supported");
+
+  bool is_unary_op = is_vector_unary_bitwise_op(n);
+  if (is_unary_op) {
+    assert(collect_unique_inputs(n, partition, inputs) == 1, "not unary");
+    return false; // too few inputs
+  }
+
+  assert(is_vector_binary_bitwise_op(n), "not binary");
+  Node* in1 = n->in(1);
+  Node* in2 = n->in(2);
+
+  int in1_unique_inputs_cnt = collect_unique_inputs(in1, partition, inputs);
+  int in2_unique_inputs_cnt = collect_unique_inputs(in2, partition, inputs);
+  partition.push(n);
+
+  // Too many inputs?
+  if (inputs.size() > 3) {
+    partition.clear();
+    inputs.clear();
+    { // Recompute in2 inputs
+      Unique_Node_List not_used;
+      in2_unique_inputs_cnt = collect_unique_inputs(in2, not_used, not_used);
+    }
+    // Pick the node with minimum number of inputs.
+    if (in1_unique_inputs_cnt >= 3 && in2_unique_inputs_cnt >= 3) {
+      return false; // still too many inputs
+    }
+    // Recompute partition & inputs.
+    Node* child       = (in1_unique_inputs_cnt < in2_unique_inputs_cnt ? in1 : in2);
+    collect_unique_inputs(child, partition, inputs);
+
+    Node* other_input = (in1_unique_inputs_cnt < in2_unique_inputs_cnt ? in2 : in1);
+    inputs.push(other_input);
+
+    partition.push(n);
+  }
+
+  return (partition.size() == 2 || partition.size() == 3) &&
+         (inputs.size()    == 2 || inputs.size()    == 3);
+}
+
+void Compile::process_logic_cone_root(PhaseIterGVN &igvn, Node *n, VectorSet &visited) {
+  assert(is_vector_bitwise_op(n), "not a root");
+
+  visited.set(n->_idx);
+
+  // 1) Do a DFS walk over the logic cone.
+  for (uint i = 1; i < n->req(); i++) {
+    Node* in = n->in(i);
+    if (!visited.test(in->_idx) && is_vector_bitwise_op(in)) {
+      process_logic_cone_root(igvn, in, visited);
+    }
+  }
+
+  // 2) Bottom up traversal: Merge node[s] with
+  // the parent to form macro logic node.
+  Unique_Node_List partition;
+  Unique_Node_List inputs;
+  if (compute_logic_cone(n, partition, inputs)) {
+    const TypeVect* vt = n->bottom_type()->is_vect();
+    Node* macro_logic = xform_to_MacroLogicV(igvn, vt, partition, inputs);
+    igvn.replace_node(n, macro_logic);
+  }
+}
+
+void Compile::optimize_logic_cones(PhaseIterGVN &igvn) {
+  ResourceMark rm;
+  if (Matcher::match_rule_supported(Op_MacroLogicV)) {
+    Unique_Node_List list;
+    collect_logic_cone_roots(list);
+
+    while (list.size() > 0) {
+      Node* n = list.pop();
+      const TypeVect* vt = n->bottom_type()->is_vect();
+      bool supported = Matcher::match_rule_supported_vector(Op_MacroLogicV, vt->length(), vt->element_basic_type());
+      if (supported) {
+        VectorSet visited(comp_arena());
+        process_logic_cone_root(igvn, n, visited);
+      }
+    }
+  }
+}
 
 //------------------------------Code_Gen---------------------------------------
 // Given a graph, generate code for it
@@ -4225,6 +4542,7 @@
   }
 }
 
+
 // Move Allocate nodes to the start of the list
 void Compile::sort_macro_nodes() {
   int count = macro_count();
--- a/src/hotspot/share/opto/compile.hpp	Thu Apr 02 18:22:27 2020 +0200
+++ b/src/hotspot/share/opto/compile.hpp	Thu Apr 02 22:38:23 2020 +0530
@@ -83,6 +83,7 @@
 class TypePtr;
 class TypeOopPtr;
 class TypeFunc;
+class TypeVect;
 class Unique_Node_List;
 class nmethod;
 class WarmCallInfo;
@@ -1106,6 +1107,15 @@
   void final_graph_reshaping_walk( Node_Stack &nstack, Node *root, Final_Reshape_Counts &frc );
   void eliminate_redundant_card_marks(Node* n);
 
+  // Logic cone optimization.
+  void optimize_logic_cones(PhaseIterGVN &igvn);
+  void collect_logic_cone_roots(Unique_Node_List& list);
+  void process_logic_cone_root(PhaseIterGVN &igvn, Node* n, VectorSet& visited);
+  bool compute_logic_cone(Node* n, Unique_Node_List& partition, Unique_Node_List& inputs);
+  uint compute_truth_table(Unique_Node_List& partition, Unique_Node_List& inputs);
+  uint eval_macro_logic_op(uint func, uint op1, uint op2, uint op3);
+  Node* xform_to_MacroLogicV(PhaseIterGVN &igvn, const TypeVect* vt, Unique_Node_List& partitions, Unique_Node_List& inputs);
+
  public:
 
   // Note:  Histogram array size is about 1 Kb.
--- a/src/hotspot/share/opto/matcher.cpp	Thu Apr 02 18:22:27 2020 +0200
+++ b/src/hotspot/share/opto/matcher.cpp	Thu Apr 02 22:38:23 2020 +0530
@@ -2220,6 +2220,7 @@
     case Op_FmaF:
     case Op_FmaVD:
     case Op_FmaVF:
+    case Op_MacroLogicV:
       set_shared(n); // Force result into register (it will be anyways)
       break;
     case Op_ConP: {  // Convert pointers above the centerline to NUL
@@ -2313,6 +2314,15 @@
       n->del_req(3);
       break;
     }
+    case Op_MacroLogicV: {
+      Node* pair1 = new BinaryNode(n->in(1), n->in(2));
+      Node* pair2 = new BinaryNode(n->in(3), n->in(4));
+      n->set_req(1, pair1);
+      n->set_req(2, pair2);
+      n->del_req(4);
+      n->del_req(3);
+      break;
+    }
     case Op_LoopLimit: {
       Node* pair1 = new BinaryNode(n->in(1), n->in(2));
       n->set_req(1, pair1);
--- a/src/hotspot/share/opto/vectornode.cpp	Thu Apr 02 18:22:27 2020 +0200
+++ b/src/hotspot/share/opto/vectornode.cpp	Thu Apr 02 22:38:23 2020 +0530
@@ -518,6 +518,39 @@
   }
 }
 
+static bool is_con_M1(Node* n) {
+  if (n->is_Con()) {
+    const Type* t = n->bottom_type();
+    if (t->isa_int() && t->is_int()->get_con() == -1) {
+      return true;
+    }
+    if (t->isa_long() && t->is_long()->get_con() == -1) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool VectorNode::is_all_ones_vector(Node* n) {
+  switch (n->Opcode()) {
+  case Op_ReplicateB:
+  case Op_ReplicateS:
+  case Op_ReplicateI:
+  case Op_ReplicateL:
+    return is_con_M1(n->in(1));
+  default:
+    return false;
+  }
+}
+
+bool VectorNode::is_vector_bitwise_not_pattern(Node* n) {
+  if (n->Opcode() == Op_XorV) {
+    return is_all_ones_vector(n->in(1)) ||
+           is_all_ones_vector(n->in(2));
+  }
+  return false;
+}
+
 // Return initial Pack node. Additional operands added with add_opd() calls.
 PackNode* PackNode::make(Node* s, uint vlen, BasicType bt) {
   const TypeVect* vt = TypeVect::make(bt, vlen);
@@ -740,3 +773,13 @@
   }
   return false;
 }
+
+MacroLogicVNode* MacroLogicVNode::make(PhaseGVN& gvn, Node* v1, Node* v2, Node* v3,
+                                      uint truth_table, const TypeVect* vt) {
+  assert(truth_table <= 0xFF, "invalid");
+  assert(v1->bottom_type() == vt, "mismatch");
+  assert(v2->bottom_type() == vt, "mismatch");
+  assert(v3->bottom_type() == vt, "mismatch");
+  Node* fn = gvn.intcon(truth_table);
+  return new MacroLogicVNode(v1, v2, v3, fn, vt);
+}
--- a/src/hotspot/share/opto/vectornode.hpp	Thu Apr 02 18:22:27 2020 +0200
+++ b/src/hotspot/share/opto/vectornode.hpp	Thu Apr 02 22:38:23 2020 +0530
@@ -51,6 +51,14 @@
     init_req(3, n3);
   }
 
+  VectorNode(Node *n0, Node* n1, Node* n2, Node* n3, const TypeVect* vt) : TypeNode(vt, 5) {
+    init_class_id(Class_Vector);
+    init_req(1, n0);
+    init_req(2, n1);
+    init_req(3, n2);
+    init_req(4, n3);
+  }
+
   const TypeVect* vect_type() const { return type()->is_vect(); }
   uint length() const { return vect_type()->length(); } // Vector length
   uint length_in_bytes() const { return vect_type()->length_in_bytes(); }
@@ -72,6 +80,9 @@
   static bool is_muladds2i(Node* n);
   static bool is_roundopD(Node * n);
   static bool is_invariant_vector(Node* n);
+  static bool is_all_ones_vector(Node* n);
+  static bool is_vector_bitwise_not_pattern(Node* n);
+
   // [Start, end) half-open range defining which operands are vectors
   static void vector_operands(Node* n, uint* start, uint* end);
 
@@ -982,4 +993,17 @@
   virtual const Type *Value(PhaseGVN *phase) const { return TypeInt::INT; }
 };
 
+//------------------------------MacroLogicVNode-------------------------------
+// Vector logical operations packing node.
+class MacroLogicVNode : public VectorNode {
+private:
+  MacroLogicVNode(Node* in1, Node* in2, Node* in3, Node* fn, const TypeVect* vt)
+  : VectorNode(in1, in2, in3, fn, vt) {}
+
+public:
+  virtual int Opcode() const;
+
+  static MacroLogicVNode* make(PhaseGVN& igvn, Node* in1, Node* in2, Node* in3, uint truth_table, const TypeVect* vt);
+};
+
 #endif // SHARE_OPTO_VECTORNODE_HPP
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/hotspot/jtreg/compiler/vectorization/TestMacroLogicVector.java	Thu Apr 02 22:38:23 2020 +0530
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8241040
+ * @library /test/lib
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions
+ *           -Xbatch -XX:-TieredCompilation -XX:CICompilerCount=1 -XX:UseAVX=3
+ *           -XX:CompileCommand=quiet -XX:CompileCommand=compileonly,*::test*
+ *           -XX:+TraceNewVectors compiler.vectorization.TestMacroLogicVector
+ */
+package compiler.vectorization;
+
+import jdk.test.lib.Utils;
+
+import java.util.Random;
+import java.util.concurrent.Callable;
+
+public class TestMacroLogicVector {
+    static int intFunc1(int a, int b, int c) {
+        int v1 = (a & b) ^ (a & c) ^ (b & c);
+        int v2 = (~a & b) | (~b & c) | (~c & a);
+        return v1 & v2;
+    }
+    static void testInt1(int[] r, int[] a, int[] b, int[] c) {
+        for (int i = 0; i < r.length; i++) {
+            r[i] = intFunc1(a[i], b[i], c[i]);
+        }
+    }
+    static void verifyInt1(int[] r, int[] a, int[] b, int[] c) {
+        for (int i = 0; i < r.length; i++) {
+            int expected = intFunc1(a[i], b[i], c[i]);
+            if (r[i] != expected) {
+                throw new AssertionError(String.format("at #%d: r=%d, expected = %d = intFunc1(%d,%d,%d)",
+                                                       i, r[i], expected, a[i], b[i], c[i]));
+            }
+        }
+    }
+    static int intFunc2(int a) {
+        int v1 = (a & a) ^ (a & a) ^ (a & a);
+        int v2 = (~a & ~a) | (~a & a) | (a & a);
+        return v1 & v2;
+    }
+    static void testInt2(int[] r, int[] a) {
+        for (int i = 0; i < r.length; i++) {
+            r[i] = intFunc2(a[i]);
+        }
+    }
+    static void verifyInt2(int[] r, int[] a) {
+        for (int i = 0; i < r.length; i++) {
+            int expected = intFunc2(a[i]);
+            if (r[i] != expected) {
+                throw new AssertionError(String.format("at #%d: r=%d, expected = %d = intFunc2(%d)",
+                                                       i, r[i], expected, a[i]));
+            }
+        }
+    }
+
+    static int intFunc3(int a, int b) {
+        return (~a & b) ^ (a | b);
+    }
+    static void testInt3(int[] r, int[] a, int[] b) {
+        for (int i = 0; i < r.length; i++) {
+            r[i] = intFunc3(a[i], b[i]);
+        }
+    }
+    static void verifyInt3(int[] r, int[] a, int[] b) {
+        for (int i = 0; i < r.length; i++) {
+            int expected = intFunc3(a[i], b[i]);
+            if (r[i] != expected) {
+                throw new AssertionError(String.format("at #%d: r=%d, expected = %d = intFunc3(%d,%d)",
+                                                       i, r[i], expected, a[i], b[i]));
+            }
+        }
+    }
+
+    static int intFunc4(int a, int b, int c, int d, int e, int f) {
+        int v1 = (~a | ~b) & ~c;
+        int v2 = ~d & (~e & f);
+        return v1 | v2;
+    }
+
+    static void testInt4(int[] r, int[] a, int[] b, int[] c, int[] d, int[] e, int[] f) {
+        for (int i = 0; i < r.length; i++) {
+            r[i] = intFunc4(a[i], b[i], c[i], d[i], e[i], f[i]);
+        }
+    }
+
+    static void verifyTest4(int[] r, int[] a, int[] b, int[] c, int[] d, int[] e, int[] f) {
+        for (int i = 0; i < r.length; i++) {
+            long expected = intFunc4(a[i], b[i], c[i], d[i], e[i], f[i]);
+            if (r[i] != expected) {
+                throw new AssertionError(
+                        String.format("at #%d: r=%d, expected = %d = intFunc4(%d,%d,%d,%d,%d,%d)",
+                                      i, r[i], expected, a[i], b[i], c[i], d[i], e[i], f[i]));
+            }
+        }
+    }
+    // ===================================================== //
+
+    static long longFunc(long a, long b, long c) {
+        long v1 = (a & b) ^ (a & c) ^ (b & c);
+        long v2 = (~a & b) | (~b & c) | (~c & a);
+        return v1 & v2;
+    }
+    static void testLong(long[] r, long[] a, long[] b, long[] c) {
+        for (int i = 0; i < r.length; i++) {
+            r[i] = longFunc(a[i], b[i], c[i]);
+        }
+    }
+    static void verifyLong(long[] r, long[] a, long[] b, long[] c) {
+        for (int i = 0; i < r.length; i++) {
+            long expected = longFunc(a[i], b[i], c[i]);
+            if (r[i] != expected) {
+                throw new AssertionError(
+                        String.format("at #%d: r=%d, expected = %d = longFunc(%d,%d,%d)",
+                                      i, r[i], expected, a[i], b[i], c[i]));
+            }
+        }
+    }
+
+    // ===================================================== //
+
+    private static final Random R = Utils.getRandomInstance();
+
+    static int[] fillIntRandom(Callable<int[]> factory) {
+        try {
+            int[] arr = factory.call();
+            for (int i = 0; i < arr.length; i++) {
+                arr[i] = R.nextInt();
+            }
+            return arr;
+        } catch (Exception e) {
+            throw new InternalError(e);
+        }
+    }
+    static long[] fillLongRandom(Callable<long[]> factory) {
+        try {
+            long[] arr = factory.call();
+            for (int i = 0; i < arr.length; i++) {
+                arr[i] = R.nextLong();
+            }
+            return arr;
+        } catch (Exception e) {
+            throw new InternalError(e);
+        }
+    }
+
+    // ===================================================== //
+
+    static final int SIZE = 128;
+
+    public static void main(String[] args) {
+
+        int[] r = new int[SIZE];
+        int[] a = fillIntRandom(()-> new int[SIZE]);
+        int[] b = fillIntRandom(()-> new int[SIZE]);
+        int[] c = fillIntRandom(()-> new int[SIZE]);
+        int[] d = fillIntRandom(()-> new int[SIZE]);
+        int[] e = fillIntRandom(()-> new int[SIZE]);
+        int[] f = fillIntRandom(()-> new int[SIZE]);
+
+        long[] rl = new long[SIZE];
+        long[] al = fillLongRandom(() -> new long[SIZE]);
+        long[] bl = fillLongRandom(() -> new long[SIZE]);
+        long[] cl = fillLongRandom(() -> new long[SIZE]);
+
+        for (int i = 0; i < 20_000; i++) {
+            testInt1(r, a, b, c);
+            verifyInt1(r, a, b, c);
+
+            testInt2(r, a);
+            verifyInt2(r, a);
+
+            testInt3(r, a, b);
+            verifyInt3(r, a, b);
+
+            testInt4(r, a, b, c, d, e, f);
+            verifyTest4(r, a, b, c, d, e, f);
+
+            testLong(rl, al, bl, cl);
+            verifyLong(rl, al, bl, cl);
+
+        }
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/micro/org/openjdk/bench/vm/compiler/MacroLogicOpt.java	Thu Apr 02 22:38:23 2020 +0530
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.*;
+
+import java.util.concurrent.TimeUnit;
+import java.util.Random;
+
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@State(Scope.Thread)
+public class MacroLogicOpt {
+  @Param({"64","128","256","512","1024","2048","4096"}) private int VECLEN;
+
+  private  int [] ai = new int[VECLEN];
+  private  int [] bi = new int[VECLEN];
+  private  int [] ci = new int[VECLEN];
+  private  int [] ri = new int[VECLEN];
+
+  private  long [] al = new long[VECLEN];
+  private  long [] bl = new long[VECLEN];
+  private  long [] cl = new long[VECLEN];
+  private  long [] dl = new long[VECLEN];
+  private  long [] el = new long[VECLEN];
+  private  long [] fl = new long[VECLEN];
+  private  long [] rl = new long[VECLEN];
+
+  private Random r = new Random();
+
+  @Setup
+  public void init() {
+    ai = new int[VECLEN];
+    bi = new int[VECLEN];
+    ci = new int[VECLEN];
+    ri = new int[VECLEN];
+
+    al = new long[VECLEN];
+    bl = new long[VECLEN];
+    cl = new long[VECLEN];
+    dl = new long[VECLEN];
+    el = new long[VECLEN];
+    fl = new long[VECLEN];
+    rl = new long[VECLEN];
+    for (int i=0; i<VECLEN; i++) {
+      ai[i] = r.nextInt();
+      bi[i] = r.nextInt();
+      ci[i] = r.nextInt();
+
+      al[i] = r.nextLong();
+      bl[i] = r.nextLong();
+      cl[i] = r.nextLong();
+      dl[i] = r.nextLong();
+      el[i] = r.nextLong();
+      fl[i] = r.nextLong();
+    }
+  }
+
+  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+  private int run_workload1(int count, int [] a , int [] b, int [] c, int [] r) {
+      for(int i = 0 ; i < r.length ; i++)
+          r[i] = (((a[i] & b[i]) ^ (a[i] & c[i]) ^ (b[i] & c[i]))  &  ((~a[i] & b[i]) | (~b[i] & c[i])  | ~c[i] & a[i]));
+    return r[count];
+  }
+
+  @Benchmark
+  public  void workload1_caller(Blackhole bh) {
+    int r = 0;
+    for(int i = 0 ; i < 10000; i++)
+       r += run_workload1(i&(ri.length-1), ai, bi, ci, ri);
+    bh.consume(r);
+  }
+
+  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+  private long run_workload2(int count, long [] a , long [] b, long [] c, long [] r) {
+      for(int i = 0 ; i < r.length ; i++)
+          r[i] = (((a[i] & b[i]) ^ (a[i] & c[i]) ^ (b[i] & c[i]))  &  ((~a[i] & b[i]) | (~b[i] & c[i])  | ~c[i] & a[i]));
+    return r[count];
+  }
+
+  @Benchmark
+  public void workload2_caller(Blackhole bh) {
+    long r = 0;
+    for(int i = 0 ; i < 100000; i++)
+       r += run_workload2(i&(rl.length-1), al, bl, cl, rl);
+    bh.consume(r);
+  }
+
+  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+  private long run_workload3(int count, long [] a , long [] b, long [] c,
+                           long [] d, long [] e, long [] f, long [] r) {
+    for(int i = 0 ; i < r.length ; i++)
+      r[i] = (((~a[i] | ~b[i]) & (~c[i])) | (~d[i] & (~e[i] & f[i])));
+    return r[count];
+  }
+
+  @Benchmark
+  public void workload3_caller(Blackhole bh) {
+    long r = 0;
+    for(int i = 0 ; i < 10000; i++)
+       r += run_workload3(i&(ri.length-1), al, bl, cl, dl, el, fl, rl);
+    bh.consume(r);
+  }
+}