changeset 12224:138e5abe35a9

8165381: Update for x86 SHA512 using AVX2 Summary: Add intrinsics for x86 AVX2 architecture with no SHA instructions. Reviewed-by: kvn Contributed-by: smita.kamath@intel.com
author kvn
date Fri, 21 Oct 2016 10:16:09 -0700
parents 272fd21a0917
children aa7e3876ea74
files src/cpu/x86/vm/assembler_x86.cpp src/cpu/x86/vm/assembler_x86.hpp src/cpu/x86/vm/macroAssembler_x86.cpp src/cpu/x86/vm/macroAssembler_x86.hpp src/cpu/x86/vm/macroAssembler_x86_sha.cpp src/cpu/x86/vm/stubGenerator_x86_64.cpp src/cpu/x86/vm/stubRoutines_x86.cpp src/cpu/x86/vm/stubRoutines_x86.hpp src/cpu/x86/vm/vm_version_x86.cpp src/cpu/x86/vm/x86.ad test/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java
diffstat 11 files changed, 668 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/x86/vm/assembler_x86.cpp	Fri Oct 21 17:55:02 2016 +0200
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Fri Oct 21 10:16:09 2016 -0700
@@ -3298,6 +3298,15 @@
   emit_int8(imm8);
 }
 
+void Assembler::vperm2f128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionAttr attributes(AVX_256bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x06);
+  emit_int8(0xC0 | encode);
+  emit_int8(imm8);
+}
+
 
 void Assembler::pause() {
   emit_int8((unsigned char)0xF3);
@@ -7359,7 +7368,7 @@
   emit_int8((unsigned char)(0xF & cop));
 }
 
-void Assembler::vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
+void Assembler::blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
@@ -7370,6 +7379,15 @@
   emit_int8((unsigned char)(0xF0 & src2_enc<<4));
 }
 
+void Assembler::vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
+  assert(VM_Version::supports_avx2(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8((unsigned char)0x02);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8((unsigned char)imm8);
+}
+
 void Assembler::shlxl(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi2(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
--- a/src/cpu/x86/vm/assembler_x86.hpp	Fri Oct 21 17:55:02 2016 +0200
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Fri Oct 21 10:16:09 2016 -0700
@@ -1550,6 +1550,7 @@
   void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
   void vpermq(XMMRegister dst, XMMRegister src, int imm8);
   void vperm2i128(XMMRegister dst,  XMMRegister nds, XMMRegister src, int imm8);
+  void vperm2f128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
 
   void pause();
 
@@ -2105,7 +2106,8 @@
 
   // AVX support for vectorized conditional move (double). The following two instructions used only coupled.
   void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
-  void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
+  void blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
+  void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);
 
  protected:
   // Next instructions require address alignment 16 bytes SSE mode.
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Fri Oct 21 17:55:02 2016 +0200
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Fri Oct 21 10:16:09 2016 -0700
@@ -4309,6 +4309,15 @@
   }
 }
 
+void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
+  if (reachable(src)) {
+    Assembler::vpand(dst, nds, as_Address(src), vector_len);
+  } else {
+    lea(rscratch1, src);
+    Assembler::vpand(dst, nds, Address(rscratch1, 0), vector_len);
+  }
+}
+
 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
   int dst_enc = dst->encoding();
   int src_enc = src->encoding();
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Fri Oct 21 17:55:02 2016 +0200
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Fri Oct 21 10:16:09 2016 -0700
@@ -943,6 +943,23 @@
                    bool multi_block, XMMRegister shuf_mask);
 #endif
 
+#ifdef _LP64
+ private:
+  void sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, Register d,
+                                     Register e, Register f, Register g, Register h, int iteration);
+
+  void sha512_AVX2_one_round_and_schedule(XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
+                                          Register a, Register b, Register c, Register d, Register e, Register f,
+                                          Register g, Register h, int iteration);
+
+  void addmq(int disp, Register r1, Register r2);
+ public:
+  void sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
+                   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
+                   Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
+                   XMMRegister shuf_mask);
+#endif
+
   void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
                  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
                  Register buf, Register state, Register ofs, Register limit, Register rsp,
@@ -1177,6 +1194,10 @@
   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
+  void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
+  void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
+  void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
+
   void vpbroadcastw(XMMRegister dst, XMMRegister src);
 
   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
--- a/src/cpu/x86/vm/macroAssembler_x86_sha.cpp	Fri Oct 21 17:55:02 2016 +0200
+++ b/src/cpu/x86/vm/macroAssembler_x86_sha.cpp	Fri Oct 21 10:16:09 2016 -0700
@@ -674,6 +674,11 @@
   movl(Address(r1, disp), r2);
 }
 
+void MacroAssembler::addmq(int disp, Register r1, Register r2) {
+  addq(r2, Address(r1, disp));
+  movq(Address(r1, disp), r2);
+}
+
 void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
   Register buf, Register state, Register ofs, Register limit, Register rsp,
@@ -1026,4 +1031,488 @@
 bind(compute_size_end1);
   }
 }
+
+void MacroAssembler::sha512_AVX2_one_round_compute(Register  old_h, Register a, Register b, Register c,
+                                                   Register d, Register e, Register f, Register g, Register h,
+                                                   int iteration)
+{
+
+    const Register& y0 = r13;
+    const Register& y1 = r14;
+    const Register& y2 = r15;
+#ifdef _WIN64
+    const Register& y3 = rcx;
+#else
+    const Register& y3 = rdi;
+#endif
+    const Register& T1 = r12;
+
+    if (iteration % 4 > 0) {
+      addq(old_h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0;
+    }
+    movq(y2, f); //y2 = f; CH
+    rorxq(y0, e, 41); //y0 = e >> 41; S1A
+    rorxq(y1, e, 18); //y1 = e >> 18; S1B
+    xorq(y2, g); //y2 = f^g; CH
+
+    xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1
+    rorxq(y1, e, 14); //y1 = (e >> 14); S1
+    andq(y2, e); //y2 = (f^g)&e; CH
+
+    if (iteration % 4 > 0 ) {
+      addq(old_h, y3); //h = t1 + S0 + MAJ
+    }
+    xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
+    rorxq(T1, a, 34); //T1 = a >> 34; S0B
+    xorq(y2, g); //y2 = CH = ((f^g)&e) ^g; CH
+    rorxq(y1, a, 39); //y1 = a >> 39; S0A
+    movq(y3, a); //y3 = a; MAJA
+
+    xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0
+    rorxq(T1, a, 28); //T1 = (a >> 28); S0
+    addq(h, Address(rsp, (8 * iteration))); //h = k + w + h; --
+    orq(y3, c); //y3 = a | c; MAJA
+
+    xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
+    movq(T1, a); //T1 = a; MAJB
+    andq(y3, b); //y3 = (a | c)&b; MAJA
+    andq(T1, c); //T1 = a&c; MAJB
+    addq(y2, y0); //y2 = S1 + CH; --
+
+    addq(d, h); //d = k + w + h + d; --
+    orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ
+    addq(h, y1); //h = k + w + h + S0; --
+
+    addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
+
+    if (iteration % 4 == 3) {
+      addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
+      addq(h, y3); //h = t1 + S0 + MAJ; --
+    }
+}
+
+void MacroAssembler::sha512_AVX2_one_round_and_schedule(
+    XMMRegister xmm4, // ymm4
+    XMMRegister xmm5, // ymm5
+    XMMRegister xmm6, // ymm6
+    XMMRegister xmm7, // ymm7
+    Register a, //rax
+    Register b, //rbx
+    Register c, //rdi
+    Register d, //rsi
+    Register e, //r8
+    Register f, //r9
+    Register g, //r10
+    Register h, //r11
+    int iteration)
+{
+
+    const Register& y0 = r13;
+    const Register& y1 = r14;
+    const Register& y2 = r15;
+#ifdef _WIN64
+    const Register& y3 = rcx;
+#else
+    const Register& y3 = rdi;
+#endif
+    const Register& T1 = r12;
+
+    if (iteration % 4 == 0) {
+      // Extract w[t - 7]
+      // xmm0 = W[-7]
+      vperm2f128(xmm0, xmm7, xmm6, 3);
+      vpalignr(xmm0, xmm0, xmm6, 8, AVX_256bit);
+
+      // Calculate w[t - 16] + w[t - 7]
+      vpaddq(xmm0, xmm0, xmm4, AVX_256bit); //xmm0 = W[-7] + W[-16]
+      // Extract w[t - 15]
+      //xmm1 = W[-15]
+      vperm2f128(xmm1, xmm5, xmm4, 3);
+      vpalignr(xmm1, xmm1, xmm4, 8, AVX_256bit);
+
+      // Calculate sigma0
+      // Calculate w[t - 15] ror 1
+      vpsrlq(xmm2, xmm1, 1, AVX_256bit);
+      vpsllq(xmm3, xmm1, (64 - 1), AVX_256bit);
+      vpor(xmm3, xmm3, xmm2, AVX_256bit); //xmm3 = W[-15] ror 1
+      // Calculate w[t - 15] shr 7
+      vpsrlq(xmm8, xmm1, 7, AVX_256bit); //xmm8 = W[-15] >> 7
+
+    } else if (iteration % 4 == 1) {
+      //Calculate w[t - 15] ror 8
+      vpsrlq(xmm2, xmm1, 8, AVX_256bit);
+      vpsllq(xmm1, xmm1, (64 - 8), AVX_256bit);
+      vpor(xmm1, xmm1, xmm2, AVX_256bit); //xmm1 = W[-15] ror 8
+
+      //XOR the three components
+      vpxor(xmm3, xmm3, xmm8, AVX_256bit); //xmm3 = W[-15] ror 1 ^ W[-15] >> 7
+      vpxor(xmm1, xmm3, xmm1, AVX_256bit); //xmm1 = s0
+
+      //Add three components, w[t - 16], w[t - 7] and sigma0
+      vpaddq(xmm0, xmm0, xmm1, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0
+
+      // Move to appropriate lanes for calculating w[16] and w[17]
+      vperm2f128(xmm4, xmm0, xmm0, 0); //xmm4 = W[-16] + W[-7] + s0{ BABA }
+
+      address MASK_YMM_LO = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512();
+      //Move to appropriate lanes for calculating w[18] and w[19]
+      vpand(xmm0, xmm0, ExternalAddress(MASK_YMM_LO + 32), AVX_256bit); //xmm0 = W[-16] + W[-7] + s0{ DC00 }
+      //Calculate w[16] and w[17] in both 128 bit lanes
+      //Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
+      vperm2f128(xmm2, xmm7, xmm7, 17); //xmm2 = W[-2] {BABA}
+      vpsrlq(xmm8, xmm2, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {BABA}
+
+    } else if (iteration % 4 == 2) {
+      vpsrlq(xmm3, xmm2, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {BABA}
+      vpsllq(xmm1, xmm2, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {BABA}
+      vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {BABA}
+      vpxor(xmm8, xmm8, xmm3, AVX_256bit);// xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
+      vpsrlq(xmm3, xmm2, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {BABA}
+      vpsllq(xmm1, xmm2, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {BABA}
+      vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {BABA}
+      vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { BABA }
+
+      //Add sigma1 to the other components to get w[16] and w[17]
+      vpaddq(xmm4, xmm4, xmm8, AVX_256bit); //xmm4 = { W[1], W[0], W[1], W[0] }
+
+      //Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
+      vpsrlq(xmm8, xmm4, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {DC--}
+
+    } else if (iteration % 4 == 3){
+      vpsrlq(xmm3, xmm4, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {DC--}
+      vpsllq(xmm1, xmm4, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {DC--}
+      vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {DC--}
+      vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
+      vpsrlq(xmm3, xmm4, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {DC--}
+      vpsllq(xmm1, xmm4, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {DC--}
+      vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {DC--}
+      vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { DC-- }
+
+      //Add the sigma0 + w[t - 7] + w[t - 16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19]
+      vpaddq(xmm2, xmm0, xmm8, AVX_256bit); //xmm2 = { W[3], W[2], --, -- }
+
+      //Form w[19, w[18], w17], w[16]
+      vpblendd(xmm4, xmm4, xmm2, 0xF0, AVX_256bit); //xmm4 = { W[3], W[2], W[1], W[0] }
+    }
+
+    movq(y3, a); //y3 = a; MAJA
+    rorxq(y0, e, 41); // y0 = e >> 41; S1A
+    rorxq(y1, e, 18); //y1 = e >> 18; S1B
+    addq(h, Address(rsp, (iteration * 8))); //h = k + w + h; --
+    orq(y3, c); //y3 = a | c; MAJA
+    movq(y2, f); //y2 = f; CH
+
+    xorq(y2, g); //y2 = f^g; CH
+
+    rorxq(T1, a, 34); //T1 = a >> 34; S0B
+    xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1
+
+    rorxq(y1, e, 14); //y1 = (e >> 14); S1
+
+    andq(y2, e); //y2 = (f^g) & e; CH
+    addq(d, h); //d = k + w + h + d; --
+
+    andq(y3, b); //y3 = (a | c)&b; MAJA
+    xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
+    rorxq(y1, a, 39); //y1 = a >> 39; S0A
+
+    xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0
+    rorxq(T1, a, 28); //T1 = (a >> 28); S0
+    xorq(y2, g); //y2 = CH = ((f^g)&e) ^ g; CH
+
+    xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
+    movq(T1, a); //T1 = a; MAJB
+
+    andq(T1, c); //T1 = a&c; MAJB
+    addq(y2, y0); //y2 = S1 + CH; --
+
+    orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ
+    addq(h, y1); //h = k + w + h + S0; --
+
+    addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
+    addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
+    addq(h, y3); //h = t1 + S0 + MAJ; --
+}
+
+void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
+                                 XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
+                                 Register buf, Register state, Register ofs, Register limit, Register rsp,
+                                 bool multi_block, XMMRegister shuf_mask)
+{
+
+    Label loop0, loop1, loop2, done_hash,
+    compute_block_size, compute_size,
+    compute_block_size_end, compute_size_end;
+
+    address K512_W = StubRoutines::x86::k512_W_addr();
+    address pshuffle_byte_flip_mask_sha512 = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512();
+    address pshuffle_byte_flip_mask_addr = 0;
+
+    const XMMRegister& XFER = xmm0; // YTMP0
+    const XMMRegister& BYTE_FLIP_MASK = xmm9; // ymm9
+#ifdef _WIN64
+    const Register& INP = rcx; //1st arg
+    const Register& CTX = rdx; //2nd arg
+    const Register& NUM_BLKS = r8; //3rd arg
+    const Register& c = rdi;
+    const Register& d = rsi;
+    const Register& e = r8;
+    const Register& y3 = rcx;
+    const Register& offset = r8;
+    const Register& input_limit = r9;
+#else
+    const Register& INP = rdi; //1st arg
+    const Register& CTX = rsi; //2nd arg
+    const Register& NUM_BLKS = rdx; //3rd arg
+    const Register& c  = rcx;
+    const Register& d  = r8;
+    const Register& e  = rdx;
+    const Register& y3 = rdi;
+    const Register& offset = rdx;
+    const Register& input_limit = rcx;
+#endif
+
+    const Register& TBL = rbp;
+
+    const Register& a = rax;
+    const Register& b = rbx;
+
+    const Register& f = r9;
+    const Register& g = r10;
+    const Register& h = r11;
+
+    //Local variables as defined in assembly file.
+    enum
+    {
+      _XFER_SIZE = 4 * 8, // resq 4 => reserve 4 quadwords. Hence 4 * 8
+      _SRND_SIZE = 8, // resq 1
+      _INP_SIZE = 8,
+      _INP_END_SIZE = 8,
+      _RSP_SAVE_SIZE = 8,  // defined as resq 1
+
+#ifdef _WIN64
+      _GPR_SAVE_SIZE = 8 * 8, // defined as resq 8
+#else
+      _GPR_SAVE_SIZE = 6 * 8 // resq 6
+#endif
+    };
+
+    enum
+    {
+      _XFER = 0,
+      _SRND = _XFER + _XFER_SIZE, // 32
+      _INP = _SRND + _SRND_SIZE, // 40
+      _INP_END = _INP + _INP_SIZE, // 48
+      _RSP = _INP_END + _INP_END_SIZE, // 56
+      _GPR = _RSP + _RSP_SAVE_SIZE, // 64
+      _STACK_SIZE = _GPR + _GPR_SAVE_SIZE // 128 for windows and 112 for linux.
+    };
+
+//Saving offset and limit as it will help with blocksize calculation for multiblock SHA512.
+#ifdef _WIN64
+    push(r8);    // win64: this is ofs
+    push(r9);    // win64: this is limit, we need them again at the very end.
+#else
+    push(rdx);   // linux : this is ofs, need at the end for multiblock calculation
+    push(rcx);   // linux: This is the limit.
+#endif
+
+    //Allocate Stack Space
+    movq(rax, rsp);
+    subq(rsp, _STACK_SIZE);
+    andq(rsp, -32);
+    movq(Address(rsp, _RSP), rax);
+
+    //Save GPRs
+    movq(Address(rsp, _GPR), rbp);
+    movq(Address(rsp, (_GPR + 8)), rbx);
+    movq(Address(rsp, (_GPR + 16)), r12);
+    movq(Address(rsp, (_GPR + 24)), r13);
+    movq(Address(rsp, (_GPR + 32)), r14);
+    movq(Address(rsp, (_GPR + 40)), r15);
+
+#ifdef _WIN64
+    movq(Address(rsp, (_GPR + 48)), rsi);
+    movq(Address(rsp, (_GPR + 56)), rdi);
+#endif
+
+    vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_128bit);
+    vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_256bit);
+
+    if (multi_block) {
+      xorq(rax, rax);
+      bind(compute_block_size);
+      cmpptr(offset, input_limit); // Assuming that offset is less than limit.
+      jccb(Assembler::aboveEqual, compute_block_size_end);
+      addq(offset, 128);
+      addq(rax, 128);
+      jmpb(compute_block_size);
+
+      bind(compute_block_size_end);
+      movq(NUM_BLKS, rax);
+
+      cmpq(NUM_BLKS, 0);
+      jcc(Assembler::equal, done_hash);
+    } else {
+      xorq(NUM_BLKS, NUM_BLKS); //If single block.
+      addq(NUM_BLKS, 128);
+    }
+
+    addq(NUM_BLKS, INP); //pointer to end of data
+    movq(Address(rsp, _INP_END), NUM_BLKS);
+
+    //load initial digest
+    movq(a, Address(CTX, 8 * 0));
+    movq(b, Address(CTX, 8 * 1));
+    movq(c, Address(CTX, 8 * 2));
+    movq(d, Address(CTX, 8 * 3));
+    movq(e, Address(CTX, 8 * 4));
+    movq(f, Address(CTX, 8 * 5));
+    movq(g, Address(CTX, 8 * 6));
+    movq(h, Address(CTX, 8 * 7));
+
+    pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask_sha512;
+    vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //PSHUFFLE_BYTE_FLIP_MASK wrt rip
+
+    bind(loop0);
+    lea(TBL, ExternalAddress(K512_W));
+
+    //byte swap first 16 qwords
+    vmovdqu(xmm4, Address(INP, 32 * 0));
+    vpshufb(xmm4, xmm4, BYTE_FLIP_MASK, AVX_256bit);
+    vmovdqu(xmm5, Address(INP, 32 * 1));
+    vpshufb(xmm5, xmm5, BYTE_FLIP_MASK, AVX_256bit);
+    vmovdqu(xmm6, Address(INP, 32 * 2));
+    vpshufb(xmm6, xmm6, BYTE_FLIP_MASK, AVX_256bit);
+    vmovdqu(xmm7, Address(INP, 32 * 3));
+    vpshufb(xmm7, xmm7, BYTE_FLIP_MASK, AVX_256bit);
+
+    movq(Address(rsp, _INP), INP);
+
+    movslq(Address(rsp, _SRND), 4);
+    align(16);
+
+    //Schedule 64 input qwords, by calling sha512_AVX2_one_round_and_schedule
+    bind(loop1);
+    vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
+    vmovdqu(Address(rsp, _XFER), xmm0);
+    //four rounds and schedule
+    sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, a, b, c, d, e, f, g, h, 0);
+    sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, h, a, b, c, d, e, f, g, 1);
+    sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, g, h, a, b, c, d, e, f, 2);
+    sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, f, g, h, a, b, c, d, e, 3);
+
+    vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
+    vmovdqu(Address(rsp, _XFER), xmm0);
+    //four rounds and schedule
+    sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, e, f, g, h, a, b, c, d, 0);
+    sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, d, e, f, g, h, a, b, c, 1);
+    sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, c, d, e, f, g, h, a, b, 2);
+    sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, b, c, d, e, f, g, h, a, 3);
+
+    vpaddq(xmm0, xmm6, Address(TBL, 2 * 32), AVX_256bit);
+    vmovdqu(Address(rsp, _XFER), xmm0);
+    //four rounds and schedule
+    sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, a, b, c, d, e, f, g, h, 0);
+    sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, h, a, b, c, d, e, f, g, 1);
+    sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, g, h, a, b, c, d, e, f, 2);
+    sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, f, g, h, a, b, c, d, e, 3);
+
+    vpaddq(xmm0, xmm7, Address(TBL, 3 * 32), AVX_256bit);
+    vmovdqu(Address(rsp, _XFER), xmm0);
+    addq(TBL, 4 * 32);
+    //four rounds and schedule
+    sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, e, f, g, h, a, b, c, d, 0);
+    sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, d, e, f, g, h, a, b, c, 1);
+    sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, c, d, e, f, g, h, a, b, 2);
+    sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, b, c, d, e, f, g, h, a, 3);
+
+    subq(Address(rsp, _SRND), 1);
+    jcc(Assembler::notEqual, loop1);
+
+    movslq(Address(rsp, _SRND), 2);
+
+    bind(loop2);
+    vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
+    vmovdqu(Address(rsp, _XFER), xmm0);
+    //four rounds and compute.
+    sha512_AVX2_one_round_compute(a, a, b, c, d, e, f, g, h, 0);
+    sha512_AVX2_one_round_compute(h, h, a, b, c, d, e, f, g, 1);
+    sha512_AVX2_one_round_compute(g, g, h, a, b, c, d, e, f, 2);
+    sha512_AVX2_one_round_compute(f, f, g, h, a, b, c, d, e, 3);
+
+    vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
+    vmovdqu(Address(rsp, _XFER), xmm0);
+    addq(TBL, 2 * 32);
+    // four rounds and compute.
+    sha512_AVX2_one_round_compute(e, e, f, g, h, a, b, c, d, 0);
+    sha512_AVX2_one_round_compute(d, d, e, f, g, h, a, b, c, 1);
+    sha512_AVX2_one_round_compute(c, c, d, e, f, g, h, a, b, 2);
+    sha512_AVX2_one_round_compute(b, b, c, d, e, f, g, h, a, 3);
+
+    vmovdqu(xmm4, xmm6);
+    vmovdqu(xmm5, xmm7);
+
+    subq(Address(rsp, _SRND), 1);
+    jcc(Assembler::notEqual, loop2);
+
+    addmq(8 * 0, CTX, a);
+    addmq(8 * 1, CTX, b);
+    addmq(8 * 2, CTX, c);
+    addmq(8 * 3, CTX, d);
+    addmq(8 * 4, CTX, e);
+    addmq(8 * 5, CTX, f);
+    addmq(8 * 6, CTX, g);
+    addmq(8 * 7, CTX, h);
+
+    movq(INP, Address(rsp, _INP));
+    addq(INP, 128);
+    cmpq(INP, Address(rsp, _INP_END));
+    jcc(Assembler::notEqual, loop0);
+
+    bind(done_hash);
+
+    //Restore GPRs
+    movq(rbp, Address(rsp, (_GPR + 0)));
+    movq(rbx, Address(rsp, (_GPR + 8)));
+    movq(r12, Address(rsp, (_GPR + 16)));
+    movq(r13, Address(rsp, (_GPR + 24)));
+    movq(r14, Address(rsp, (_GPR + 32)));
+    movq(r15, Address(rsp, (_GPR + 40)));
+
+#ifdef _WIN64
+    movq(rsi, Address(rsp, (_GPR + 48)));
+    movq(rdi, Address(rsp, (_GPR + 56)));
+#endif
+
+    //Restore Stack Pointer
+    movq(rsp, Address(rsp, _RSP));
+
+#ifdef _WIN64
+    pop(r9);
+    pop(r8);
+#else
+    pop(rcx);
+    pop(rdx);
+#endif
+
+    if (multi_block) {
+#ifdef _WIN64
+      const Register& limit_end = r9;
+      const Register& ofs_end = r8;
+#else
+      const Register& limit_end = rcx;
+      const Register& ofs_end   = rdx;
+#endif
+      movq(rax, ofs_end);
+      bind(compute_size);
+      cmpptr(rax, limit_end);
+      jccb(Assembler::aboveEqual, compute_size_end);
+      addq(rax, 128);
+      jmpb(compute_size);
+      bind(compute_size_end);
+    }
+}
+
 #endif //#ifdef _LP64
+
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Fri Oct 21 17:55:02 2016 +0200
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Fri Oct 21 10:16:09 2016 -0700
@@ -3718,6 +3718,25 @@
     return start;
   }
 
+  //Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+  address generate_pshuffle_byte_flip_mask_sha512() {
+    __ align(32);
+    StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
+    address start = __ pc();
+    if (VM_Version::supports_avx2()) {
+      __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
+      __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
+      __ emit_data64(0x1011121314151617, relocInfo::none);
+      __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
+      __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
+      __ emit_data64(0x0000000000000000, relocInfo::none);
+      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
+      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
+    }
+
+    return start;
+  }
+
 // ofs and limit are use for multi-block byte array.
 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
   address generate_sha256_implCompress(bool multi_block, const char *name) {
@@ -3761,6 +3780,39 @@
     return start;
   }
 
+  address generate_sha512_implCompress(bool multi_block, const char *name) {
+    assert(VM_Version::supports_avx2(), "");
+    assert(VM_Version::supports_bmi2(), "");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    Register buf = c_rarg0;
+    Register state = c_rarg1;
+    Register ofs = c_rarg2;
+    Register limit = c_rarg3;
+
+    const XMMRegister msg = xmm0;
+    const XMMRegister state0 = xmm1;
+    const XMMRegister state1 = xmm2;
+    const XMMRegister msgtmp0 = xmm3;
+    const XMMRegister msgtmp1 = xmm4;
+    const XMMRegister msgtmp2 = xmm5;
+    const XMMRegister msgtmp3 = xmm6;
+    const XMMRegister msgtmp4 = xmm7;
+
+    const XMMRegister shuf_mask = xmm8;
+
+    __ enter();
+
+    __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
+    buf, state, ofs, limit, rsp, multi_block, shuf_mask);
+
+    __ leave();
+    __ ret(0);
+    return start;
+  }
+
   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
   // to hide instruction latency
   //
@@ -5081,6 +5133,12 @@
       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
     }
+    if (UseSHA512Intrinsics) {
+      StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
+      StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
+      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
+      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
+    }
 
     // Generate GHASH intrinsics code
     if (UseGHASHIntrinsics) {
--- a/src/cpu/x86/vm/stubRoutines_x86.cpp	Fri Oct 21 17:55:02 2016 +0200
+++ b/src/cpu/x86/vm/stubRoutines_x86.cpp	Fri Oct 21 10:16:09 2016 -0700
@@ -48,6 +48,8 @@
 address StubRoutines::x86::_k256_adr = NULL;
 #ifdef _LP64
 address StubRoutines::x86::_k256_W_adr = NULL;
+address StubRoutines::x86::_k512_W_addr = NULL;
+address StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = NULL;
 #endif
 address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = NULL;
 
@@ -297,4 +299,49 @@
 // used in MacroAssembler::sha256_AVX2
 // dynamically built from _k256
 ALIGNED_(64) juint StubRoutines::x86::_k256_W[2*sizeof(StubRoutines::x86::_k256)];
+
+// used in MacroAssembler::sha512_AVX2
+ALIGNED_(64) julong StubRoutines::x86::_k512_W[] =
+{
+    0x428a2f98d728ae22LL, 0x7137449123ef65cdLL,
+    0xb5c0fbcfec4d3b2fLL, 0xe9b5dba58189dbbcLL,
+    0x3956c25bf348b538LL, 0x59f111f1b605d019LL,
+    0x923f82a4af194f9bLL, 0xab1c5ed5da6d8118LL,
+    0xd807aa98a3030242LL, 0x12835b0145706fbeLL,
+    0x243185be4ee4b28cLL, 0x550c7dc3d5ffb4e2LL,
+    0x72be5d74f27b896fLL, 0x80deb1fe3b1696b1LL,
+    0x9bdc06a725c71235LL, 0xc19bf174cf692694LL,
+    0xe49b69c19ef14ad2LL, 0xefbe4786384f25e3LL,
+    0x0fc19dc68b8cd5b5LL, 0x240ca1cc77ac9c65LL,
+    0x2de92c6f592b0275LL, 0x4a7484aa6ea6e483LL,
+    0x5cb0a9dcbd41fbd4LL, 0x76f988da831153b5LL,
+    0x983e5152ee66dfabLL, 0xa831c66d2db43210LL,
+    0xb00327c898fb213fLL, 0xbf597fc7beef0ee4LL,
+    0xc6e00bf33da88fc2LL, 0xd5a79147930aa725LL,
+    0x06ca6351e003826fLL, 0x142929670a0e6e70LL,
+    0x27b70a8546d22ffcLL, 0x2e1b21385c26c926LL,
+    0x4d2c6dfc5ac42aedLL, 0x53380d139d95b3dfLL,
+    0x650a73548baf63deLL, 0x766a0abb3c77b2a8LL,
+    0x81c2c92e47edaee6LL, 0x92722c851482353bLL,
+    0xa2bfe8a14cf10364LL, 0xa81a664bbc423001LL,
+    0xc24b8b70d0f89791LL, 0xc76c51a30654be30LL,
+    0xd192e819d6ef5218LL, 0xd69906245565a910LL,
+    0xf40e35855771202aLL, 0x106aa07032bbd1b8LL,
+    0x19a4c116b8d2d0c8LL, 0x1e376c085141ab53LL,
+    0x2748774cdf8eeb99LL, 0x34b0bcb5e19b48a8LL,
+    0x391c0cb3c5c95a63LL, 0x4ed8aa4ae3418acbLL,
+    0x5b9cca4f7763e373LL, 0x682e6ff3d6b2b8a3LL,
+    0x748f82ee5defb2fcLL, 0x78a5636f43172f60LL,
+    0x84c87814a1f0ab72LL, 0x8cc702081a6439ecLL,
+    0x90befffa23631e28LL, 0xa4506cebde82bde9LL,
+    0xbef9a3f7b2c67915LL, 0xc67178f2e372532bLL,
+    0xca273eceea26619cLL, 0xd186b8c721c0c207LL,
+    0xeada7dd6cde0eb1eLL, 0xf57d4f7fee6ed178LL,
+    0x06f067aa72176fbaLL, 0x0a637dc5a2c898a6LL,
+    0x113f9804bef90daeLL, 0x1b710b35131c471bLL,
+    0x28db77f523047d84LL, 0x32caab7b40c72493LL,
+    0x3c9ebe0a15c9bebcLL, 0x431d67c49c100d4cLL,
+    0x4cc5d4becb3e42b6LL, 0x597f299cfc657e2aLL,
+    0x5fcb6fab3ad6faecLL, 0x6c44198c4a475817LL,
+};
 #endif
--- a/src/cpu/x86/vm/stubRoutines_x86.hpp	Fri Oct 21 17:55:02 2016 +0200
+++ b/src/cpu/x86/vm/stubRoutines_x86.hpp	Fri Oct 21 10:16:09 2016 -0700
@@ -33,7 +33,7 @@
 
 enum platform_dependent_constants {
   code_size1 = 20000 LP64_ONLY(+10000),         // simply increase if too small (assembler will crash if too small)
-  code_size2 = 33800 LP64_ONLY(+1200)           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 33800 LP64_ONLY(+10000)           // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {
@@ -134,6 +134,10 @@
 #ifdef _LP64
   static juint _k256_W[];
   static address _k256_W_adr;
+  static julong _k512_W[];
+  static address _k512_W_addr;
+  // byte flip mask for sha512
+  static address _pshuffle_byte_flip_mask_addr_sha512;
 #endif
   // byte flip mask for sha256
   static address _pshuffle_byte_flip_mask_addr;
@@ -192,6 +196,8 @@
   static address k256_addr()      { return _k256_adr; }
 #ifdef _LP64
   static address k256_W_addr()    { return _k256_W_adr; }
+  static address k512_W_addr()    { return _k512_W_addr; }
+  static address pshuffle_byte_flip_mask_addr_sha512() { return _pshuffle_byte_flip_mask_addr_sha512; }
 #endif
   static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; }
   static void generate_CRC32C_table(bool is_pclmulqdq_supported);
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Fri Oct 21 17:55:02 2016 +0200
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Fri Oct 21 10:16:09 2016 -0700
@@ -769,7 +769,11 @@
     FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
   }
 
-  if (UseSHA512Intrinsics) {
+  if (UseSHA) {
+    if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
+      FLAG_SET_DEFAULT(UseSHA512Intrinsics, true);
+    }
+  } else if (UseSHA512Intrinsics) {
     warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU.");
     FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
   }
--- a/src/cpu/x86/vm/x86.ad	Fri Oct 21 17:55:02 2016 +0200
+++ b/src/cpu/x86/vm/x86.ad	Fri Oct 21 10:16:09 2016 -0700
@@ -8173,13 +8173,13 @@
   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
   effect(TEMP dst, USE src1, USE src2);
   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
-            "vpblendd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
+            "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
          %}
   ins_encode %{
     int vector_len = 1;
     int cond = (Assembler::Condition)($copnd$$cmpcode);
     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
-    __ vpblendd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
--- a/test/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java	Fri Oct 21 17:55:02 2016 +0200
+++ b/test/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java	Fri Oct 21 10:16:09 2016 -0700
@@ -78,9 +78,14 @@
                       new CPUSpecificPredicate("aarch64.*", new String[] { "sha256" },null)))))));
 
     public static final BooleanSupplier SHA512_INSTRUCTION_AVAILABLE
-            = new OrPredicate(
-                    new CPUSpecificPredicate("sparc.*", new String[] { "sha512" },null),
-                    new CPUSpecificPredicate("aarch64.*", new String[] { "sha512" },null));
+            = new OrPredicate(new CPUSpecificPredicate("x86.*", new String[] { "sha" },null),
+              new OrPredicate(new CPUSpecificPredicate("amd64.*", new String[] { "sha" },null),
+              new OrPredicate(new CPUSpecificPredicate("i386.*", new String[] { "sha" },null),
+              new OrPredicate(new CPUSpecificPredicate("x86_64", new String[] { "avx2", "bmi2" }, null),
+              new OrPredicate(new CPUSpecificPredicate("amd64.*", new String[] { "avx2", "bmi2" }, null),
+              new OrPredicate(
+                      new CPUSpecificPredicate("sparc.*", new String[] { "sha512" },null),
+                      new CPUSpecificPredicate("aarch64.*", new String[] { "sha512" },null)))))));
 
     public static final BooleanSupplier ANY_SHA_INSTRUCTION_AVAILABLE
             = new OrPredicate(IntrinsicPredicates.SHA1_INSTRUCTION_AVAILABLE,