changeset 58850:d2995967041f vectorIntrinsics

Summary: Float/Double min/max implementation for x86.
Contributed-by: jatin.bhateja@intel.com
author sviswanathan
date Wed, 06 Mar 2019 13:45:32 -0800
parents 9b51e3ed675d
children d31cac6904e1 7be65c01c29e
files src/hotspot/cpu/x86/assembler_x86.cpp src/hotspot/cpu/x86/assembler_x86.hpp src/hotspot/cpu/x86/x86.ad
diffstat 3 files changed, 371 insertions(+), 345 deletions(-)
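
This changeset replaces the straight vminps/vmaxps matchers with blend-based
sequences so that vector Min/Max matches Math.min/Math.max: a NaN in either
operand must propagate, and -0.0 must order below +0.0. A scalar sketch of the
required semantics (illustrative only, not code from this changeset):

    #include <cmath>
    // Java's Math.min semantics that the vector rules below emulate:
    // NaN wins over any ordered value, and -0.0f is treated as < +0.0f.
    static float java_min(float a, float b) {
      if (std::isnan(a)) return a;          // NaN propagates
      if (std::isnan(b)) return b;
      if (a == 0.0f && b == 0.0f)           // tie of signed zeros
        return std::signbit(a) ? a : b;     // prefer -0.0f
      return a < b ? a : b;
    }

Plain (v)minps gets both cases wrong: it returns its second source operand
whenever the inputs are unordered or compare equal.
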
--- a/src/hotspot/cpu/x86/assembler_x86.cpp	Fri Mar 01 18:29:15 2019 -0800
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp	Wed Mar 06 13:45:32 2019 -0800
@@ -4074,6 +4074,26 @@
   emit_operand(as_Register(dst_enc), src);
 }
 
+void Assembler::evpmovd2m(KRegister kdst, XMMRegister src, int vector_len) {
+  assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "");
+  assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(kdst->encoding(), 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes);
+  emit_int8((unsigned char)0x39);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpmovq2m(KRegister kdst, XMMRegister src, int vector_len) {
+  assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "");
+  assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(kdst->encoding(), 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes);
+  emit_int8((unsigned char)0x39);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::pcmpgtq(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
--- a/src/hotspot/cpu/x86/assembler_x86.hpp	Fri Mar 01 18:29:15 2019 -0800
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp	Wed Mar 06 13:45:32 2019 -0800
@@ -2067,6 +2067,8 @@
   void shlxq(Register dst, Register src1, Register src2);
 
   //====================VECTOR ARITHMETIC=====================================
+  void evpmovd2m(KRegister kdst, XMMRegister src, int vector_len);
+  void evpmovq2m(KRegister kdst, XMMRegister src, int vector_len);
 
   // Add Packed Floating-Point Values
   void addpd(XMMRegister dst, XMMRegister src);
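
The two new assembler entries emit the AVX512DQ instructions VPMOVD2M and
VPMOVQ2M, which copy the sign bit of each dword/qword element into a mask
register. In intrinsics form (an illustration, not part of the changeset):

    #include <immintrin.h>
    // Mask bit i = sign bit of element i; this is the "is lane i negative?"
    // predicate the EVEX min/max rules below key their blends on.
    __mmask16 sign_mask_ps(__m512 a) {
      return _mm512_movepi32_mask(_mm512_castps_si512(a));   // vpmovd2m
    }
    __mmask8 sign_mask_pd(__m512d a) {
      return _mm512_movepi64_mask(_mm512_castpd_si512(a));   // vpmovq2m
    }
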
--- a/src/hotspot/cpu/x86/x86.ad	Fri Mar 01 18:29:15 2019 -0800
+++ b/src/hotspot/cpu/x86/x86.ad	Wed Mar 06 13:45:32 2019 -0800
@@ -1505,9 +1505,14 @@
           if (UseSSE < 4 && (bt == T_BYTE || bt == T_INT || bt == T_LONG))
             ret_value = false;
 
-          // Float/Double intrinsics disabled till we fix the implementation to match Math.max/Math.min
-          if (bt == T_FLOAT || bt == T_DOUBLE)
-            ret_value = false;
+          if (bt == T_FLOAT || bt == T_DOUBLE) {
+            // Float/Double min/max intrinsics are implemented only for AVX targets.
+            if (UseAVX == 0)
+              ret_value = false;
+            // 512-bit Float/Double intrinsics additionally need AVX512DQ.
+            if (UseAVX > 2 && size_in_bits == 512 && !VM_Version::supports_avx512dq())
+              ret_value = false;
+          }
           break;
         case Op_MulVB:
         case Op_LShiftVB:
@@ -14320,189 +14325,187 @@
 %}
 
 // Float vector Min
-instruct min2F_reg(vecD dst, vecD src1, vecD src2) %{
-  predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MinV src1 src2));
-  effect(TEMP dst);
-  format %{ "movsd  $dst,$src1\n\t"
-            "minps  $dst,$src2\t! " %}
-  ins_encode %{
-    __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
-    __ minps($dst$$XMMRegister, $src2$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min2F_reg_avx(vecD dst, vecD src1, vecD src2) %{
+instruct min2F_reg_avx(legVecD dst, legVecD a, legVecD b, legVecD tmp, legVecD atmp, legVecD btmp) %{
   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MinV src1 src2));
-  format %{ "vminps  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vminps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min4F_reg(vecX dst, vecX src1, vecX src2) %{
-  predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MinV src1 src2));
-  effect(TEMP dst);
-  format %{ "movdqu  $dst,$src1\n\t"
-            "minps   $dst,$src2\t! " %}
-  ins_encode %{
-    __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
-    __ minps($dst$$XMMRegister, $src2$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min4F_reg_avx(vecX dst, vecX src1, vecX src2) %{
+  match(Set dst (MinV a b));
+  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
+  format %{
+     "blendvps         $atmp,$a,$b,$a             \n\t"
+     "blendvps         $btmp,$b,$a,$a             \n\t"
+     "vminps           $tmp,$atmp,$btmp           \n\t"
+     "cmpps.unordered  $btmp, $atmp, $atmp        \n\t"
+     "blendvps         $dst,$tmp,$atmp,$btmp      \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
+    __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
+    __ vminps($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
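
Every AVX rule in this section repeats the same five-instruction pattern at a
different width; a 2-float intrinsics sketch of it (an assumption for
illustration, not the changeset's code):

    #include <immintrin.h>   // SSE4.1 for _mm_blendv_ps
    static __m128 vec_min2f(__m128 a, __m128 b) {
      // blendv selects by the sign bit of its mask operand, so keying on
      // a's sign routes the negative half of a signed-zero tie into btmp.
      __m128 atmp = _mm_blendv_ps(a, b, a);       // a negative ? b : a
      __m128 btmp = _mm_blendv_ps(b, a, a);       // a negative ? a : b
      // minps returns its second source (btmp) on ties, yielding -0.0f.
      __m128 tmp  = _mm_min_ps(atmp, btmp);
      // Lanes where atmp is NaN take atmp, so NaN propagates as required.
      __m128 nan  = _mm_cmpunord_ps(atmp, atmp);
      return _mm_blendv_ps(tmp, atmp, nan);
    }
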
+
+instruct min4F_reg_avx(legVecX dst, legVecX a, legVecX b, legVecX tmp, legVecX atmp, legVecX btmp) %{
   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MinV src1 src2));
-  format %{ "vminps  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vminps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min4F_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MinV src1 src2));
-  format %{ "vminps  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vminps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min8F_reg_avx(vecY dst, vecY src1, vecY src2) %{
+  match(Set dst (MinV a b));
+  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
+  format %{
+     "blendvps         $atmp,$a,$b,$a             \n\t"
+     "blendvps         $btmp,$b,$a,$a             \n\t"
+     "vminps           $tmp,$atmp,$btmp           \n\t"
+     "cmpps.unordered  $btmp, $atmp, $atmp        \n\t"
+     "blendvps         $dst,$tmp,$atmp,$btmp      \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
+    __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
+    __ vminps($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct min8F_reg_avx(legVecY dst, legVecY a, legVecY b, legVecY tmp, legVecY atmp, legVecY btmp) %{
   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MinV src1 src2));
-  format %{ "vminps  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vminps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min8F_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MinV src1 src2));
-  format %{ "vminps  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vminps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min16F_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MinV src1 src2));
-  format %{ "vminps  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 2;
-    __ vminps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  match(Set dst (MinV a b));
+  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
+  format %{
+     "blendvps         $atmp,$a,$b,$a             \n\t"
+     "blendvps         $btmp,$b,$a,$a             \n\t"
+     "vminps           $tmp,$atmp,$btmp           \n\t"
+     "cmpps.unordered  $btmp, $atmp, $atmp        \n\t"
+     "blendvps         $dst,$tmp,$atmp,$btmp      \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
+    __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
+    __ vminps($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct min16F_reg_evex(vecZ dst, vecZ a, vecZ b, vecZ atmp, vecZ btmp) %{
+  predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
+  match(Set dst (MinV a b));
+  effect(USE a, USE b, TEMP atmp, TEMP btmp);
+  format %{ 
+     "vpmovd2m         k1,$a                    \n\t"
+     "vblendmps        $atmp,k1,$a,$b           \n\t"
+     "vblendmps        $btmp,k1,$b,$a           \n\t"
+     "vminps           $dst,$atmp,$btmp         \n\t"
+     "vcmpps.unordered      k1,$atmp,$atmp           \n\t"
+     "vmovaps          $dst,k1,$atmp            \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 2;
+    KRegister ktmp = k1;
+    KRegister mask = k0;
+    __ evpmovd2m(ktmp, $a$$XMMRegister, vector_len); 
+    __ evblendmps($atmp$$XMMRegister, ktmp, $a$$XMMRegister, $b$$XMMRegister, true, vector_len); 
+    __ evblendmps($btmp$$XMMRegister, ktmp, $b$$XMMRegister, $a$$XMMRegister, true, vector_len); 
+    __ vminps($dst$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ evcmpps(ktmp, mask, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ evmovdqul($dst$$XMMRegister, ktmp, $atmp$$XMMRegister, true, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
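
The EVEX form trades the blendv mask operand for a real mask register: the
sign bits land in k1 via the new vpmovd2m, evblendmps does the operand
ordering, and a masked move merges the NaN lanes, which is why these rules
(and the Matcher change above) require AVX512DQ. An intrinsics sketch under
the same illustrative caveat as before:

    #include <immintrin.h>
    static __m512 vec_min16f(__m512 a, __m512 b) {
      __mmask16 sign = _mm512_movepi32_mask(_mm512_castps_si512(a));
      __m512 atmp = _mm512_mask_blend_ps(sign, a, b);   // sign ? b : a
      __m512 btmp = _mm512_mask_blend_ps(sign, b, a);   // sign ? a : b
      __m512 dst  = _mm512_min_ps(atmp, btmp);
      __mmask16 nan = _mm512_cmp_ps_mask(atmp, atmp, _CMP_UNORD_Q);
      return _mm512_mask_mov_ps(dst, nan, atmp);        // NaN lanes take atmp
    }
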
 
 // Double vector Min
-instruct minD_reg(vecD dst, vecD src1, vecD src2) %{
-  predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MinV src1 src2));
-  effect(TEMP dst);
-  format %{ "movsd  $dst,$src1\n\t"
-            "minpd  $dst,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
-    __ minpd($dst$$XMMRegister, $src2$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min1D_reg_avx(vecD dst, vecD src1, vecD src2) %{
+instruct min1D_reg_avx(legVecD dst, legVecD a, legVecD b, legVecD tmp, legVecD atmp, legVecD btmp) %{
   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MinV src1 src2));
-  format %{ "vminpd  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vminpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min2D_reg(vecX dst, vecX src1, vecX src2) %{
-  predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MinV src1 src2));
-  effect(TEMP dst);
-  format %{ "movdqu  $dst,$src1\n\t"
-            "minpd   $dst,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
-    __ minpd($dst$$XMMRegister, $src2$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min2D_reg_avx(vecX dst, vecX src1, vecX src2) %{
+  match(Set dst (MinV a b));
+  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
+  format %{ 
+     "blendvpd         $atmp,$a,$b,$a           \n\t"
+     "blendvpd         $btmp,$b,$a,$a           \n\t"
+     "vminpd           $tmp,$atmp,$btmp         \n\t"
+     "cmppd.unordered  $btmp, $atmp, $atmp      \n\t"
+     "blendvpd         $dst,$tmp,$atmp,$btmp    \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
+    __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
+    __ vminpd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct min2D_reg_avx(legVecX dst, legVecX a, legVecX b, legVecX tmp, legVecX atmp, legVecX btmp) %{
   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MinV src1 src2));
-  format %{ "vminpd  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vminpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min2D_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MinV src1 src2));
-  format %{ "vminpd  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vminpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min4D_reg_avx(vecY dst, vecY src1, vecY src2) %{
+  match(Set dst (MinV a b));
+  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
+  format %{ 
+     "blendvpd         $atmp,$a,$b,$a           \n\t"
+     "blendvpd         $btmp,$b,$a,$a           \n\t"
+     "vminpd           $tmp,$atmp,$btmp         \n\t"
+     "cmppd.unordered  $btmp, $atmp, $atmp      \n\t"
+     "blendvpd         $dst,$tmp,$atmp,$btmp    \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
+    __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
+    __ vminpd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct min4D_reg_avx(legVecY dst, legVecY a, legVecY b, legVecY tmp, legVecY atmp, legVecY btmp) %{
   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MinV src1 src2));
-  format %{ "vminpd  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vminpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min4D_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MinV src1 src2));
-  format %{ "vminpd  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vminpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct min8D_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MinV src1 src2));
-  format %{ "vminpd  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 2;
-    __ vminpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  match(Set dst (MinV a b));
+  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
+  format %{ 
+     "blendvpd         $atmp,$a,$b,$a           \n\t"
+     "blendvpd         $btmp,$b,$a,$a           \n\t"
+     "vminpd           $tmp,$atmp,$btmp         \n\t"
+     "cmppd.unordered  $btmp, $atmp, $atmp      \n\t"
+     "blendvpd         $dst,$tmp,$atmp,$btmp    \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
+    __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
+    __ vminpd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct min8D_reg_evex(vecZ dst, vecZ a, vecZ b, vecZ atmp, vecZ btmp) %{
+  predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
+  match(Set dst (MinV a b));
+  effect(USE a, USE b, TEMP atmp, TEMP btmp);
+  format %{ 
+     "vpmovq2m         k1,$a                    \n\t"
+     "vblendmpd        $atmp,k1,$a,$b           \n\t"
+     "vblendmpd        $btmp,k1,$b,$a           \n\t"
+     "vminpd           $dst,$atmp,$btmp         \n\t"
+     "vcmppd.unordered      k1,$atmp,$atmp           \n\t"
+     "vmovapd          $dst,k1,$atmp            \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 2;
+    KRegister ktmp = k1;
+    KRegister mask = k0;
+    __ evpmovq2m(ktmp, $a$$XMMRegister, vector_len); 
+    __ evblendmpd($atmp$$XMMRegister, ktmp, $a$$XMMRegister, $b$$XMMRegister, true, vector_len); 
+    __ evblendmpd($btmp$$XMMRegister, ktmp, $b$$XMMRegister, $a$$XMMRegister, true, vector_len); 
+    __ vminpd($dst$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ evcmppd(ktmp, mask, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ evmovdquq($dst$$XMMRegister, ktmp, $atmp$$XMMRegister, true, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
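
The double-precision EVEX rules are the quadword mirror of the float ones:
vpmovq2m extracts the eight sign bits, evblendmpd and vminpd do the ordered
compare, and the final NaN merge uses evmovdquq in place of evmovdqul.
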
@@ -14851,187 +14854,188 @@
 %}
 
 // Float Vector Max
-instruct max2F_reg(vecD dst, vecD src1, vecD src2) %{
-  predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MaxV src1 src2));
-  effect(TEMP dst);
-  format %{ "movsd  $dst,$src1\n\t"
-            "maxps  $dst,$src2\t! " %}
-  ins_encode %{
-    __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
-    __ maxps($dst$$XMMRegister, $src2$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max2F_reg_avx(vecD dst, vecD src1, vecD src2) %{
+instruct max2F_reg_avx(legVecD dst, legVecD a, legVecD b, legVecD tmp, legVecD atmp, legVecD btmp) %{
   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MaxV src1 src2));
-  format %{ "vmaxps  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vmaxps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max4F_reg(vecX dst, vecX src1, vecX src2) %{
-  predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MaxV src1 src2));
-  effect(TEMP dst);
-  format %{ "movdqu  $dst,$src1\n\t"
-            "maxps   $dst,$src2\t! " %}
-  ins_encode %{
-    __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
-    __ maxps($dst$$XMMRegister, $src2$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max4F_reg_avx(vecX dst, vecX src1, vecX src2) %{
+  match(Set dst (MaxV a b));
+  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
+  format %{ 
+     "blendvps         $btmp,$b,$a,$b           \n\t"
+     "blendvps         $atmp,$a,$b,$b           \n\t"
+     "vmaxps           $tmp,$atmp,$btmp         \n\t"
+     "cmpps.unordered  $btmp, $atmp, $atmp      \n\t"
+     "blendvps         $dst,$tmp,$atmp,$btmp    \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
+    __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
+    __ vmaxps($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
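
The max rules mirror the min pattern with the roles reversed: the blend mask
comes from $b's sign, which steers the +0.0 operand into btmp so that
(v)maxps's tie behavior (second source wins) returns it. A sketch under the
same illustrative assumption:

    #include <immintrin.h>
    static __m128 vec_max2f(__m128 a, __m128 b) {
      __m128 btmp = _mm_blendv_ps(b, a, b);       // b negative ? a : b
      __m128 atmp = _mm_blendv_ps(a, b, b);       // b negative ? b : a
      __m128 tmp  = _mm_max_ps(atmp, btmp);       // ties return btmp (+0.0f)
      __m128 nan  = _mm_cmpunord_ps(atmp, atmp);  // NaN lanes take atmp
      return _mm_blendv_ps(tmp, atmp, nan);
    }
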
+
+instruct max4F_reg_avx(legVecX dst, legVecX a, legVecX b, legVecX tmp, legVecX atmp, legVecX btmp) %{
   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MaxV src1 src2));
-  format %{ "vmaxps  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vmaxps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max4F_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MaxV src1 src2));
-  format %{ "vmaxps  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vmaxps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max8F_reg_avx(vecY dst, vecY src1, vecY src2) %{
+  match(Set dst (MaxV a b));
+  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
+  format %{ 
+     "blendvps         $btmp,$b,$a,$b           \n\t"
+     "blendvps         $atmp,$a,$b,$b           \n\t"
+     "vmaxps           $tmp,$atmp,$btmp         \n\t"
+     "cmpps.unordered  $btmp, $atmp, $atmp      \n\t"
+     "blendvps         $dst,$tmp,$atmp,$btmp    \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
+    __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
+    __ vmaxps($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct max8F_reg_avx(legVecY dst, legVecY a, legVecY b, legVecY tmp, legVecY atmp, legVecY btmp) %{
   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MaxV src1 src2));
-  format %{ "vmaxps  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vmaxps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max8F_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MaxV src1 src2));
-  format %{ "vmaxps  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vmaxps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max16F_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
-  match(Set dst (MaxV src1 src2));
-  format %{ "vmaxps  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 2;
-    __ vmaxps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  match(Set dst (MaxV a b));
+  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
+  format %{ 
+     "blendvps         $btmp,$b,$a,$b           \n\t"
+     "blendvps         $atmp,$a,$b,$b           \n\t"
+     "vmaxps           $tmp,$atmp,$btmp         \n\t"
+     "cmpps.unordered  $btmp, $atmp, $atmp      \n\t"
+     "blendvps         $dst,$tmp,$atmp,$btmp    \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
+    __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
+    __ vmaxps($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct max16F_reg_evex(vecZ dst, vecZ a, vecZ b, vecZ atmp, vecZ btmp) %{
+  predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
+  match(Set dst (MaxV a b));
+  effect(USE a, USE b, TEMP atmp, TEMP btmp);
+  format %{ 
+     "vpmovd2m         k1,$b              \n\t"
+     "vblendmps        $atmp,k1,$a,$b     \n\t"
+     "vblendmps        $btmp,k1,$b,$a     \n\t"
+     "vmaxps           $dst,$atmp,$btmp   \n\t"
+     "vcmpps.unordered      k1,$atmp,$atmp     \n\t"
+     "vmovaps          $dst,k1,$atmp      \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 2;
+    KRegister ktmp = k1; 
+    KRegister mask = k0;
+    __ evpmovd2m(ktmp, $b$$XMMRegister, vector_len); 
+    __ evblendmps($atmp$$XMMRegister, ktmp, $a$$XMMRegister, $b$$XMMRegister, true, vector_len); 
+    __ evblendmps($btmp$$XMMRegister, ktmp, $b$$XMMRegister, $a$$XMMRegister, true, vector_len); 
+    __ vmaxps($dst$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ evcmpps(ktmp, mask, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ evmovdqul($dst$$XMMRegister, ktmp, $atmp$$XMMRegister, true, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 // Double Vector Max
-instruct maxD_reg(vecD dst, vecD src1, vecD src2) %{
-  predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MaxV src1 src2));
-  effect(TEMP dst);
-  format %{ "movsd  $dst,$src1\n\t"
-            "maxpd  $dst,$src2\t! " %}
-  ins_encode %{
-    __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
-    __ maxpd($dst$$XMMRegister, $src2$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max1D_reg_avx(vecD dst, vecD src1, vecD src2) %{
+instruct max1D_reg_avx(legVecD dst, legVecD a, legVecD b, legVecD tmp, legVecD atmp, legVecD btmp) %{
   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MaxV src1 src2));
-  format %{ "vmaxpd  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vmaxpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max2D_reg(vecX dst, vecX src1, vecX src2) %{
-  predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MaxV src1 src2));
-  effect(TEMP dst);
-  format %{ "movdqu  $dst,$src1\n\t"
-            "maxpd   $dst,$src2\t! " %}
-  ins_encode %{
-    __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
-    __ maxpd($dst$$XMMRegister, $src2$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max2D_reg_avx(vecX dst, vecX src1, vecX src2) %{
+  match(Set dst (MaxV a b));
+  effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
+  format %{ 
+     "blendvpd         $btmp,$b,$a,$b            \n\t"
+     "blendvpd         $atmp,$a,$b,$b            \n\t"
+     "vmaxpd           $tmp,$atmp,$btmp          \n\t"
+     "cmppd.unordered  $btmp, $atmp, $atmp       \n\t"
+     "blendvpd         $dst,$tmp,$atmp,$btmp     \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
+    __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
+    __ vmaxpd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct max2D_reg_avx(legVecX dst, legVecX a, legVecX b, legVecX tmp, legVecX atmp, legVecX btmp) %{
   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MaxV src1 src2));
-  format %{ "vmaxpd  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vmaxpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max2D_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MaxV src1 src2));
-  format %{ "vmaxpd  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vmaxpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max4D_reg_avx(vecY dst, vecY src1, vecY src2) %{
+  match(Set dst (MaxV a b));
+  effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
+  format %{ 
+     "blendvpd         $btmp,$b,$a,$b            \n\t"
+     "blendvpd         $atmp,$a,$b,$b            \n\t"
+     "vmaxpd           $tmp,$atmp,$btmp          \n\t"
+     "cmppd.unordered  $btmp, $atmp, $atmp       \n\t"
+     "blendvpd         $dst,$tmp,$atmp,$btmp     \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
+    __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
+    __ vmaxpd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct max4D_reg_avx(legVecY dst, legVecY a, legVecY b, legVecY tmp, legVecY atmp, legVecY btmp) %{
   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MaxV src1 src2));
-  format %{ "vmaxpd  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vmaxpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max4D_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MaxV src1 src2));
-  format %{ "vmaxpd  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vmaxpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct max8D_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
-  match(Set dst (MaxV src1 src2));
-  format %{ "vmaxpd  $dst,$src1,$src2\t! " %}
-  ins_encode %{
-    int vector_len = 2;
-    __ vmaxpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  match(Set dst (MaxV a b));
+  effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
+  format %{ 
+     "blendvpd         $btmp,$b,$a,$b            \n\t"
+     "blendvpd         $atmp,$a,$b,$b            \n\t"
+     "vmaxpd           $tmp,$atmp,$btmp          \n\t"
+     "cmppd.unordered  $btmp, $atmp, $atmp       \n\t"
+     "blendvpd         $dst,$tmp,$atmp,$btmp     \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
+    __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
+    __ vmaxpd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+
+instruct max8D_reg_evex(vecZ dst, vecZ a, vecZ b, vecZ atmp, vecZ btmp) %{
+  predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
+  match(Set dst (MaxV a b));
+  effect(USE a, USE b, TEMP atmp, TEMP btmp);
+  format %{ 
+     "vpmovq2m         k1,$b              \n\t"
+     "vblendmpd        $atmp,k1,$a,$b     \n\t"
+     "vblendmpd        $btmp,k1,$b,$a     \n\t"
+     "vmaxpd           $dst,$atmp,$btmp   \n\t"
+     "vcmppd.unordered      k1,$atmp,$atmp     \n\t"
+     "vmovapd          $dst,k1,$atmp      \n\t"
+  %}
+  ins_encode %{
+    int vector_len = 2;
+    KRegister ktmp = k1; 
+    KRegister mask = k0;
+    __ evpmovq2m(ktmp, $b$$XMMRegister, vector_len); 
+    __ evblendmpd($atmp$$XMMRegister, ktmp, $a$$XMMRegister, $b$$XMMRegister, true, vector_len); 
+    __ evblendmpd($btmp$$XMMRegister, ktmp, $b$$XMMRegister, $a$$XMMRegister, true, vector_len); 
+    __ vmaxpd($dst$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ evcmppd(ktmp, mask, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
+    __ evmovdquq($dst$$XMMRegister, ktmp, $atmp$$XMMRegister, true, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}