changeset 53170:4bb6e0871bf7

8214751: X86: Support for VNNI Instructions Reviewed-by: kvn Contributed-by: razvan.a.lupusoru@intel.com, vivek.r.deshpande@intel.com
author vdeshpande
date Wed, 12 Dec 2018 14:48:34 -0800
parents 9e28eff3d40f
children 2626982cf4f7
files src/hotspot/cpu/x86/assembler_x86.cpp src/hotspot/cpu/x86/assembler_x86.hpp src/hotspot/cpu/x86/vm_version_x86.cpp src/hotspot/cpu/x86/vm_version_x86.hpp src/hotspot/cpu/x86/x86.ad src/hotspot/cpu/x86/x86_32.ad src/hotspot/cpu/x86/x86_64.ad src/hotspot/share/adlc/formssel.cpp src/hotspot/share/opto/classes.hpp src/hotspot/share/opto/loopnode.hpp src/hotspot/share/opto/loopopts.cpp src/hotspot/share/opto/matcher.cpp src/hotspot/share/opto/mulnode.hpp src/hotspot/share/opto/superword.cpp src/hotspot/share/opto/superword.hpp src/hotspot/share/opto/vectornode.cpp src/hotspot/share/opto/vectornode.hpp test/hotspot/jtreg/compiler/loopopts/superword/Vec_MulAddS2I.java
diffstat 18 files changed, 491 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/src/hotspot/cpu/x86/assembler_x86.cpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp	Wed Dec 12 14:48:34 2018 -0800
@@ -3966,6 +3966,34 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::pmaddwd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0xF5);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
+    (vector_len == AVX_256bit ? VM_Version::supports_avx2() :
+    (vector_len == AVX_512bit ? VM_Version::supports_evex() : 0)), "");
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0xF5);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  assert(VM_Version::supports_vnni(), "must support vnni");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x52);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 // generic
 void Assembler::pop(Register dst) {
   int encode = prefix_and_encode(dst->encoding());
--- a/src/hotspot/cpu/x86/assembler_x86.hpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp	Wed Dec 12 14:48:34 2018 -0800
@@ -1668,6 +1668,12 @@
 
   void evpmovdb(Address dst, XMMRegister src, int vector_len);
 
+  // Multiply add
+  void pmaddwd(XMMRegister dst, XMMRegister src);
+  void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  // Multiply add accumulate
+  void evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+
 #ifndef _LP64 // no 32bit push/pop on amd64
   void popl(Address dst);
 #endif
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp	Wed Dec 12 14:48:34 2018 -0800
@@ -1289,7 +1289,7 @@
       if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
         UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
       }
-      if (supports_sse4_2() && supports_ht()) { // Newest Intel cpus
+      if ((supports_sse4_2() && supports_ht()) || supports_avx()) { // Newest Intel cpus
         if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
           UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
         }
--- a/src/hotspot/cpu/x86/vm_version_x86.hpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/cpu/x86/vm_version_x86.hpp	Wed Dec 12 14:48:34 2018 -0800
@@ -336,6 +336,7 @@
 #define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount
 #define CPU_VPCLMULQDQ ((uint64_t)UCONST64(0x4000000000)) //Vector carryless multiplication
 #define CPU_VAES ((uint64_t)UCONST64(0x8000000000))    // Vector AES instructions
+#define CPU_VNNI ((uint64_t)UCONST64(0x16000000000))   // Vector Neural Network Instructions
 
   enum Extended_Family {
     // AMD
@@ -548,6 +549,8 @@
           result |= CPU_VPCLMULQDQ;
         if (_cpuid_info.sef_cpuid7_ecx.bits.vaes != 0)
           result |= CPU_VAES;
+        if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vnni != 0)
+          result |= CPU_VNNI;
       }
     }
     if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
@@ -828,6 +831,7 @@
   static bool supports_vpopcntdq()  { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; }
   static bool supports_vpclmulqdq() { return (_features & CPU_VPCLMULQDQ) != 0; }
   static bool supports_vaes()       { return (_features & CPU_VAES) != 0; }
+  static bool supports_vnni()       { return (_features & CPU_VNNI) != 0; }
 
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
--- a/src/hotspot/cpu/x86/x86.ad	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/cpu/x86/x86.ad	Wed Dec 12 14:48:34 2018 -0800
@@ -1446,6 +1446,10 @@
       if (VM_Version::supports_on_spin_wait() == false)
         ret_value = false;
       break;
+    case Op_MulAddVS2VI:
+      if (UseSSE < 2)
+        ret_value = false;
+      break;
   }
 
   return ret_value;  // Per default match rules are supported.
@@ -9855,6 +9859,118 @@
   ins_pipe( pipe_slow );
 %}
 
+// --------------------------------- Vector Multiply Add --------------------------------------
+
+instruct smuladd4S2I_reg(vecD dst, vecD src1) %{
+  predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulAddVS2VI dst src1));
+  format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed4Sto2I" %}
+  ins_encode %{
+    __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulAddVS2VI src1 src2));
+  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed4Sto2I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct smuladd8S4I_reg(vecX dst, vecX src1) %{
+  predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulAddVS2VI dst src1));
+  format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed8Sto4I" %}
+  ins_encode %{
+    __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulAddVS2VI src1 src2));
+  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed8Sto4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (MulAddVS2VI src1 src2));
+  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed16Sto8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (MulAddVS2VI src1 src2));
+  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed32Sto16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// --------------------------------- Vector Multiply Add Add ----------------------------------
+
+instruct vmuladdadd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
+  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed4Sto2I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladdadd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
+  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed8Sto4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladdadd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
+  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed16Sto8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladdadd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
+  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed32Sto16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // --------------------------------- PopCount --------------------------------------
 
 instruct vpopcount2I(vecD dst, vecD src) %{
--- a/src/hotspot/cpu/x86/x86_32.ad	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/cpu/x86/x86_32.ad	Wed Dec 12 14:48:34 2018 -0800
@@ -7755,6 +7755,16 @@
   ins_pipe( ialu_reg_mem_alu0 );
 %}
 
+instruct mulAddS2I_rReg(rRegI dst, rRegI src1, rRegI src2, rRegI src3, eFlagsReg cr)
+%{
+  match(Set dst (MulAddS2I (Binary dst src1) (Binary src2 src3)));
+  effect(KILL cr, KILL src2);
+
+  expand %{ mulI_rReg(dst, src1, cr);
+           mulI_rReg(src2, src3, cr);
+           addI_rReg(dst, src2, cr); %}
+%}
+
 // Multiply Register Int to Long
 instruct mulI2L(eADXRegL dst, eAXRegI src, nadxRegI src1, eFlagsReg flags) %{
   // Basic Idea: long = (long)int * (long)int
--- a/src/hotspot/cpu/x86/x86_64.ad	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/cpu/x86/x86_64.ad	Wed Dec 12 14:48:34 2018 -0800
@@ -8175,6 +8175,16 @@
   ins_pipe(ialu_reg_mem_alu0);
 %}
 
+instruct mulAddS2I_rReg(rRegI dst, rRegI src1, rRegI src2, rRegI src3, rFlagsReg cr)
+%{
+  match(Set dst (MulAddS2I (Binary dst src1) (Binary src2 src3)));
+  effect(KILL cr, KILL src2);
+
+  expand %{ mulI_rReg(dst, src1, cr);
+           mulI_rReg(src2, src3, cr);
+           addI_rReg(dst, src2, cr); %}
+%}
+
 instruct mulL_rReg(rRegL dst, rRegL src, rFlagsReg cr)
 %{
   match(Set dst (MulL dst src));
--- a/src/hotspot/share/adlc/formssel.cpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/share/adlc/formssel.cpp	Wed Dec 12 14:48:34 2018 -0800
@@ -4181,6 +4181,7 @@
     "AddReductionVF", "AddReductionVD",
     "MulReductionVI", "MulReductionVL",
     "MulReductionVF", "MulReductionVD",
+    "MulAddVS2VI",
     "LShiftCntV","RShiftCntV",
     "LShiftVB","LShiftVS","LShiftVI","LShiftVL",
     "RShiftVB","RShiftVS","RShiftVI","RShiftVL",
--- a/src/hotspot/share/opto/classes.hpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/share/opto/classes.hpp	Wed Dec 12 14:48:34 2018 -0800
@@ -201,6 +201,7 @@
 macro(LoopLimit)
 macro(Mach)
 macro(MachProj)
+macro(MulAddS2I)
 macro(MaxI)
 macro(MemBarAcquire)
 macro(LoadFence)
@@ -341,6 +342,7 @@
 macro(MulReductionVF)
 macro(MulVD)
 macro(MulReductionVD)
+macro(MulAddVS2VI)
 macro(FmaVD)
 macro(FmaVF)
 macro(DivVF)
--- a/src/hotspot/share/opto/loopnode.hpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/share/opto/loopnode.hpp	Wed Dec 12 14:48:34 2018 -0800
@@ -1249,6 +1249,9 @@
   // important (common) to do address expressions.
   Node *remix_address_expressions( Node *n );
 
+  // Convert add to muladd to generate MuladdS2I under certain criteria
+  Node * convert_add_to_muladd(Node * n);
+
   // Attempt to use a conditional move instead of a phi/branch
   Node *conditional_move( Node *n );
 
--- a/src/hotspot/share/opto/loopopts.cpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/share/opto/loopopts.cpp	Wed Dec 12 14:48:34 2018 -0800
@@ -493,6 +493,54 @@
   return NULL;
 }
 
+// Optimize ((in1[2*i] * in2[2*i]) + (in1[2*i+1] * in2[2*i+1]))
+Node *PhaseIdealLoop::convert_add_to_muladd(Node* n) {
+  assert(n->Opcode() == Op_AddI, "sanity");
+  Node * nn = NULL;
+  Node * in1 = n->in(1);
+  Node * in2 = n->in(2);
+  if (in1->Opcode() == Op_MulI && in2->Opcode() == Op_MulI) {
+    IdealLoopTree* loop_n = get_loop(get_ctrl(n));
+    if (loop_n->_head->as_Loop()->is_valid_counted_loop() &&
+        Matcher::match_rule_supported(Op_MulAddS2I) &&
+        Matcher::match_rule_supported(Op_MulAddVS2VI)) {
+      Node* mul_in1 = in1->in(1);
+      Node* mul_in2 = in1->in(2);
+      Node* mul_in3 = in2->in(1);
+      Node* mul_in4 = in2->in(2);
+      if (mul_in1->Opcode() == Op_LoadS &&
+          mul_in2->Opcode() == Op_LoadS &&
+          mul_in3->Opcode() == Op_LoadS &&
+          mul_in4->Opcode() == Op_LoadS) {
+        IdealLoopTree* loop1 = get_loop(get_ctrl(mul_in1));
+        IdealLoopTree* loop2 = get_loop(get_ctrl(mul_in2));
+        IdealLoopTree* loop3 = get_loop(get_ctrl(mul_in3));
+        IdealLoopTree* loop4 = get_loop(get_ctrl(mul_in4));
+        IdealLoopTree* loop5 = get_loop(get_ctrl(in1));
+        IdealLoopTree* loop6 = get_loop(get_ctrl(in2));
+        // All nodes should be in the same counted loop.
+        if (loop_n == loop1 && loop_n == loop2 && loop_n == loop3 &&
+            loop_n == loop4 && loop_n == loop5 && loop_n == loop6) {
+          Node* adr1 = mul_in1->in(MemNode::Address);
+          Node* adr2 = mul_in2->in(MemNode::Address);
+          Node* adr3 = mul_in3->in(MemNode::Address);
+          Node* adr4 = mul_in4->in(MemNode::Address);
+          if (adr1->is_AddP() && adr2->is_AddP() && adr3->is_AddP() && adr4->is_AddP()) {
+            if ((adr1->in(AddPNode::Base) == adr3->in(AddPNode::Base)) &&
+                (adr2->in(AddPNode::Base) == adr4->in(AddPNode::Base))) {
+              nn = new MulAddS2INode(mul_in1, mul_in2, mul_in3, mul_in4);
+              register_new_node(nn, get_ctrl(n));
+              _igvn.replace_node(n, nn);
+              return nn;
+            }
+          }
+        }
+      }
+    }
+  }
+  return nn;
+}
+
 //------------------------------conditional_move-------------------------------
 // Attempt to replace a Phi with a conditional move.  We have some pretty
 // strict profitability requirements.  All Phis at the merge point must
@@ -927,6 +975,11 @@
   Node *m = remix_address_expressions( n );
   if( m ) return m;
 
+  if (n_op == Op_AddI) {
+    Node *nn = convert_add_to_muladd( n );
+    if ( nn ) return nn;
+  }
+
   if (n->is_ConstraintCast()) {
     Node* dom_cast = n->as_ConstraintCast()->dominating_cast(&_igvn, this);
     // ConstraintCastNode::dominating_cast() uses node control input to determine domination.
--- a/src/hotspot/share/opto/matcher.cpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/share/opto/matcher.cpp	Wed Dec 12 14:48:34 2018 -0800
@@ -2352,6 +2352,15 @@
       n->del_req(3);
       break;
     }
+    case Op_MulAddS2I: {
+      Node* pair1 = new BinaryNode(n->in(1), n->in(2));
+      Node* pair2 = new BinaryNode(n->in(3), n->in(4));
+      n->set_req(1, pair1);
+      n->set_req(2, pair2);
+      n->del_req(4);
+      n->del_req(3);
+      break;
+    }
     default:
       break;
   }
--- a/src/hotspot/share/opto/mulnode.hpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/share/opto/mulnode.hpp	Wed Dec 12 14:48:34 2018 -0800
@@ -285,4 +285,15 @@
   virtual const Type* Value(PhaseGVN* phase) const;
 };
 
+//------------------------------MulAddS2INode----------------------------------
+// Multiply shorts into integers and add them.
+// Semantics: I_OUT = S1 * S2 + S3 * S4
+class MulAddS2INode : public Node {
+public:
+  MulAddS2INode(Node* in1, Node *in2, Node *in3, Node* in4) : Node(0, in1, in2, in3, in4) {}
+  virtual int Opcode() const;
+  const Type *bottom_type() const { return TypeInt::INT; }
+  virtual uint ideal_reg() const { return Op_RegI; }
+};
+
 #endif // SHARE_VM_OPTO_MULNODE_HPP
--- a/src/hotspot/share/opto/superword.cpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/share/opto/superword.cpp	Wed Dec 12 14:48:34 2018 -0800
@@ -645,6 +645,10 @@
           // with a different alignment were created before.
           for (uint i = 0; i < align_to_refs.size(); i++) {
             MemNode* mr = align_to_refs.at(i)->as_Mem();
+            if (mr == mem_ref) {
+              // Skip when we are looking at same memory operation.
+              continue;
+            }
             if (same_velt_type(mr, mem_ref) &&
                 memory_alignment(mr, iv_adjustment) != 0)
               create_pack = false;
@@ -846,6 +850,27 @@
   return NULL;
 }
 
+//------------------span_works_for_memory_size-----------------------------
+static bool span_works_for_memory_size(MemNode* mem, int span, int mem_size, int offset) {
+  bool span_matches_memory = false;
+  if ((mem_size == type2aelembytes(T_BYTE) || mem_size == type2aelembytes(T_SHORT))
+    && ABS(span) == type2aelembytes(T_INT)) {
+    // There is a mismatch on span size compared to memory.
+    for (DUIterator_Fast jmax, j = mem->fast_outs(jmax); j < jmax; j++) {
+      Node* use = mem->fast_out(j);
+      if (!VectorNode::is_type_transition_to_int(use)) {
+        return false;
+      }
+    }
+    // If all uses transition to integer, it means that we can successfully align even on mismatch.
+    return true;
+  }
+  else {
+    span_matches_memory = ABS(span) == mem_size;
+  }
+  return span_matches_memory && (ABS(offset) % mem_size) == 0;
+}
+
 //------------------------------ref_is_alignable---------------------------
 // Can the preloop align the reference to position zero in the vector?
 bool SuperWord::ref_is_alignable(SWPointer& p) {
@@ -862,7 +887,7 @@
   int offset   = p.offset_in_bytes();
   // Stride one accesses are alignable if offset is aligned to memory operation size.
   // Offset can be unaligned when UseUnalignedAccesses is used.
-  if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
+  if (span_works_for_memory_size(p.mem(), span, mem_size, offset)) {
     return true;
   }
   // If the initial offset from start of the object is computable,
@@ -915,6 +940,28 @@
   }
   return false;
 }
+//---------------------------get_vw_bytes_special------------------------
+int SuperWord::get_vw_bytes_special(MemNode* s) {
+  // Get the vector width in bytes.
+  int vw = vector_width_in_bytes(s);
+
+  // Check for special case where there is an MulAddS2I usage where short vectors are going to need combined.
+  BasicType btype = velt_basic_type(s);
+  if (type2aelembytes(btype) == 2) {
+    bool should_combine_adjacent = true;
+    for (DUIterator_Fast imax, i = s->fast_outs(imax); i < imax; i++) {
+      Node* user = s->fast_out(i);
+      if (!VectorNode::is_muladds2i(user)) {
+        should_combine_adjacent = false;
+      }
+    }
+    if (should_combine_adjacent) {
+      vw = MIN2(Matcher::max_vector_size(btype)*type2aelembytes(btype), vw * 2);
+    }
+  }
+
+  return vw;
+}
 
 //---------------------------get_iv_adjustment---------------------------
 // Calculate loop's iv adjustment for this memory ops.
@@ -923,7 +970,7 @@
   int offset = align_to_ref_p.offset_in_bytes();
   int scale  = align_to_ref_p.scale_in_bytes();
   int elt_size = align_to_ref_p.memory_size();
-  int vw       = vector_width_in_bytes(mem_ref);
+  int vw       = get_vw_bytes_special(mem_ref);
   assert(vw > 1, "sanity");
   int iv_adjustment;
   if (scale != 0) {
@@ -2303,6 +2350,12 @@
         const TypePtr* atyp = n->adr_type();
         vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
         vlen_in_bytes = vn->as_StoreVector()->memory_size();
+      } else if (VectorNode::is_muladds2i(n)) {
+        assert(n->req() == 5u, "MulAddS2I should have 4 operands.");
+        Node* in1 = vector_opd(p, 1);
+        Node* in2 = vector_opd(p, 2);
+        vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
+        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
       } else if (n->req() == 3 && !is_cmov_pack(p)) {
         // Promote operands to vector
         Node* in1 = NULL;
@@ -2615,6 +2668,16 @@
     }
     assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
     pk->add_opd(in);
+    if (VectorNode::is_muladds2i(pi)) {
+      Node* in2 = pi->in(opd_idx + 2);
+      assert(my_pack(in2) == NULL, "Should already have been unpacked");
+      if (my_pack(in2) != NULL) {
+        NOT_PRODUCT(if (is_trace_loop_reverse() || TraceLoopOpts) { tty->print_cr("Should already have been unpacked"); })
+          return NULL;
+      }
+      assert(opd_bt == in2->bottom_type()->basic_type(), "all same type");
+      pk->add_opd(in2);
+    }
   }
   _igvn.register_new_node_with_optimizer(pk);
   _phase->set_ctrl(pk, _phase->get_ctrl(opd));
@@ -2692,6 +2755,21 @@
     }
     return true;
   }
+  if (VectorNode::is_muladds2i(use)) {
+    // MulAddS2I takes shorts and produces ints - hence the special checks
+    // on alignment and size.
+    if (u_pk->size() * 2 != d_pk->size()) {
+      return false;
+    }
+    for (uint i = 0; i < MIN2(d_pk->size(), u_pk->size()); i++) {
+      Node* ui = u_pk->at(i);
+      Node* di = d_pk->at(i);
+      if (alignment(ui) != alignment(di) * 2) {
+        return false;
+      }
+    }
+    return true;
+  }
   if (u_pk->size() != d_pk->size())
     return false;
   for (uint i = 0; i < u_pk->size(); i++) {
@@ -3017,7 +3095,7 @@
     NOT_PRODUCT(if(is_trace_alignment()) tty->print("SWPointer::memory_alignment: SWPointer p invalid, return bottom_align");)
     return bottom_align;
   }
-  int vw = vector_width_in_bytes(s);
+  int vw = get_vw_bytes_special(s);
   if (vw < 2) {
     NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SWPointer::memory_alignment: vector_width_in_bytes < 2, return bottom_align");)
     return bottom_align; // No vectors for this type
--- a/src/hotspot/share/opto/superword.hpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/share/opto/superword.hpp	Wed Dec 12 14:48:34 2018 -0800
@@ -347,6 +347,7 @@
     BasicType bt = velt_basic_type(n);
     return vector_width(n)*type2aelembytes(bt);
   }
+  int get_vw_bytes_special(MemNode* s);
   MemNode* align_to_ref()            { return _align_to_ref; }
   void  set_align_to_ref(MemNode* m) { _align_to_ref = m; }
 
--- a/src/hotspot/share/opto/vectornode.cpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/share/opto/vectornode.cpp	Wed Dec 12 14:48:34 2018 -0800
@@ -196,6 +196,8 @@
   case Op_StoreF:
   case Op_StoreD:
     return Op_StoreVector;
+  case Op_MulAddS2I:
+    return Op_MulAddVS2VI;
 
   default:
     return 0; // Unimplemented
@@ -214,6 +216,25 @@
   return false;
 }
 
+bool VectorNode::is_type_transition_short_to_int(Node* n) {
+  switch (n->Opcode()) {
+  case Op_MulAddS2I:
+    return true;
+  }
+  return false;
+}
+
+bool VectorNode::is_type_transition_to_int(Node* n) {
+  return is_type_transition_short_to_int(n);
+}
+
+bool VectorNode::is_muladds2i(Node* n) {
+  if (n->Opcode() == Op_MulAddS2I) {
+    return true;
+  }
+  return false;
+}
+
 bool VectorNode::is_shift(Node* n) {
   switch (n->Opcode()) {
   case Op_LShiftI:
@@ -277,6 +298,7 @@
   case Op_AndI: case Op_AndL:
   case Op_OrI:  case Op_OrL:
   case Op_XorI: case Op_XorL:
+  case Op_MulAddS2I:
     *start = 1;
     *end   = 3; // 2 vector operands
     break;
@@ -354,6 +376,8 @@
   case Op_AndV: return new AndVNode(n1, n2, vt);
   case Op_OrV:  return new OrVNode (n1, n2, vt);
   case Op_XorV: return new XorVNode(n1, n2, vt);
+
+  case Op_MulAddVS2VI: return new MulAddVS2VINode(n1, n2, vt);
   default:
     fatal("Missed vector creation for '%s'", NodeClassNames[vopc]);
     return NULL;
--- a/src/hotspot/share/opto/vectornode.hpp	Wed Dec 12 15:35:20 2018 -0500
+++ b/src/hotspot/share/opto/vectornode.hpp	Wed Dec 12 14:48:34 2018 -0800
@@ -67,6 +67,9 @@
   static int  opcode(int opc, BasicType bt);
   static bool implemented(int opc, uint vlen, BasicType bt);
   static bool is_shift(Node* n);
+  static bool is_type_transition_short_to_int(Node* n);
+  static bool is_type_transition_to_int(Node* n);
+  static bool is_muladds2i(Node* n);
   static bool is_invariant_vector(Node* n);
   // [Start, end) half-open range defining which operands are vectors
   static void vector_operands(Node* n, uint* start, uint* end);
@@ -261,6 +264,14 @@
   virtual int Opcode() const;
 };
 
+//------------------------------MulAddVS2VINode--------------------------------
+// Vector multiply shorts to int and add adjacent ints.
+class MulAddVS2VINode : public VectorNode {
+  public:
+    MulAddVS2VINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
+    virtual int Opcode() const;
+};
+
 //------------------------------FmaVDNode--------------------------------------
 // Vector multiply double
 class FmaVDNode : public VectorNode {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/Vec_MulAddS2I.java	Wed Dec 12 14:48:34 2018 -0800
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8214751
+ * @summary Add C2 x86 Superword support for VNNI VPDPWSSD Instruction
+ * @requires os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64"
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:+SuperWord
+ *      -XX:LoopMaxUnroll=2
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:-SuperWord
+ *      -XX:LoopMaxUnroll=2
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:+SuperWord
+ *      -XX:LoopMaxUnroll=4
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:-SuperWord
+ *      -XX:LoopMaxUnroll=4
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:+SuperWord
+ *      -XX:LoopMaxUnroll=8
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:-SuperWord
+ *      -XX:LoopMaxUnroll=8
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:+SuperWord
+ *      -XX:LoopMaxUnroll=16
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:-SuperWord
+ *      -XX:LoopMaxUnroll=16
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ */
+
+package compiler.loopopts.superword;
+import java.util.Random;
+
+public class Vec_MulAddS2I {
+        static final int NUM = 1024;
+        static int[] out =  new int[NUM];
+        static short[] in1 = new short[2*NUM];
+        static short[] in2 = new short[2*NUM];
+    public static void main(String[] args) throws Exception {
+        Vec_MulAddS2IInit(in1, in2);
+        int result = 0;
+        int valid = 204800000;
+        for (int j = 0; j < 10000*512; j++) {
+            result = Vec_MulAddS2IImplement(in1, in2, out);
+        }
+        if (result == valid) {
+            System.out.println("Success");
+        } else {
+            System.out.println("Invalid calculation of element variables in the out array: " + result);
+            System.out.println("Expected value for each element of out array = " + valid);
+            throw new Exception("Failed");
+        }
+    }
+
+    public static void Vec_MulAddS2IInit(
+            short[] in1,
+            short[] in2) {
+        for (int i=0; i<2*NUM; i++) {
+            in1[i] = (short)4;
+            in2[i] = (short)5;
+        }
+    }
+
+    public static int Vec_MulAddS2IImplement(
+            short[] in1,
+            short[] in2,
+            int[] out) {
+        for (int i = 0; i < NUM; i++) {
+            out[i] += ((in1[2*i] * in2[2*i]) + (in1[2*i+1] * in2[2*i+1]));
+        }
+        Random rand = new Random();
+        int n = rand.nextInt(NUM-1);
+        return out[n];
+    }
+
+}