changeset 8605:405cb20a06a9

Merge
author kvn
date Fri, 19 Jun 2015 15:24:07 -0700
parents cff206797bdd 8f8c4af059a9
children b125c7ae3995
files src/share/vm/runtime/globals.hpp
diffstat 33 files changed, 944 insertions(+), 69 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -190,6 +190,11 @@
     }
   }
 
+  if (UseGHASHIntrinsics) {
+    warning("GHASH intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  }
+
   if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
     UseCRC32Intrinsics = true;
   }
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -176,6 +176,11 @@
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
 
+  if (UseGHASHIntrinsics) {
+    warning("GHASH intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  }
+
   if (UseSHA) {
     warning("SHA instructions are not available on this CPU");
     FLAG_SET_DEFAULT(UseSHA, false);
--- a/src/cpu/sparc/vm/assembler_sparc.hpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/cpu/sparc/vm/assembler_sparc.hpp	Fri Jun 19 15:24:07 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -129,6 +129,7 @@
     flog3_op3    = 0x36,
     edge_op3     = 0x36,
     fsrc_op3     = 0x36,
+    xmulx_op3    = 0x36,
     impdep2_op3  = 0x37,
     stpartialf_op3 = 0x37,
     jmpl_op3     = 0x38,
@@ -220,6 +221,8 @@
     mdtox_opf          = 0x110,
     mstouw_opf         = 0x111,
     mstosw_opf         = 0x113,
+    xmulx_opf          = 0x115,
+    xmulxhi_opf        = 0x116,
     mxtod_opf          = 0x118,
     mwtos_opf          = 0x119,
 
@@ -1212,6 +1215,9 @@
   void movwtos( Register s, FloatRegister d ) { vis3_only();  emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(mftoi_op3) | opf(mwtos_opf) | rs2(s)); }
   void movxtod( Register s, FloatRegister d ) { vis3_only();  emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(mftoi_op3) | opf(mxtod_opf) | rs2(s)); }
 
+  void xmulx(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulx_opf) | rs2(s2)); }
+  void xmulxhi(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2)); }
+
   // Crypto SHA instructions
 
   void sha1()   { sha1_only();    emit_int32( op(arith_op) | op3(sha_op3) | opf(sha1_opf)); }
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -4786,6 +4786,130 @@
     return start;
   }
 
+  /* Single and multi-block ghash operations */
+  address generate_ghash_processBlocks() {
+      __ align(CodeEntryAlignment);
+      Label L_ghash_loop, L_aligned, L_main;
+      StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+      address start = __ pc();
+
+      Register state = I0;
+      Register subkeyH = I1;
+      Register data = I2;
+      Register len = I3;
+
+      __ save_frame(0);
+
+      __ ldx(state, 0, O0);
+      __ ldx(state, 8, O1);
+
+      // Loop label for multiblock operations
+      __ BIND(L_ghash_loop);
+
+      // Check if 'data' is unaligned
+      __ andcc(data, 7, G1);
+      __ br(Assembler::zero, false, Assembler::pt, L_aligned);
+      __ delayed()->nop();
+
+      Register left_shift = L1;
+      Register right_shift = L2;
+      Register data_ptr = L3;
+
+      // Get left and right shift values in bits
+      __ sll(G1, LogBitsPerByte, left_shift);
+      __ mov(64, right_shift);
+      __ sub(right_shift, left_shift, right_shift);
+
+      // Align to read 'data'
+      __ sub(data, G1, data_ptr);
+
+      // Load first 8 bytes of 'data'
+      __ ldx(data_ptr, 0, O4);
+      __ sllx(O4, left_shift, O4);
+      __ ldx(data_ptr, 8, O5);
+      __ srlx(O5, right_shift, G4);
+      __ bset(G4, O4);
+
+      // Load second 8 bytes of 'data'
+      __ sllx(O5, left_shift, O5);
+      __ ldx(data_ptr, 16, G4);
+      __ srlx(G4, right_shift, G4);
+      __ ba(L_main);
+      __ delayed()->bset(G4, O5);
+
+      // If 'data' is aligned, load normally
+      __ BIND(L_aligned);
+      __ ldx(data, 0, O4);
+      __ ldx(data, 8, O5);
+
+      __ BIND(L_main);
+      __ ldx(subkeyH, 0, O2);
+      __ ldx(subkeyH, 8, O3);
+
+      __ xor3(O0, O4, O0);
+      __ xor3(O1, O5, O1);
+
+      __ xmulxhi(O0, O3, G3);
+      __ xmulx(O0, O2, O5);
+      __ xmulxhi(O1, O2, G4);
+      __ xmulxhi(O1, O3, G5);
+      __ xmulx(O0, O3, G1);
+      __ xmulx(O1, O3, G2);
+      __ xmulx(O1, O2, O3);
+      __ xmulxhi(O0, O2, O4);
+
+      __ mov(0xE1, O0);
+      __ sllx(O0, 56, O0);
+
+      __ xor3(O5, G3, O5);
+      __ xor3(O5, G4, O5);
+      __ xor3(G5, G1, G1);
+      __ xor3(G1, O3, G1);
+      __ srlx(G2, 63, O1);
+      __ srlx(G1, 63, G3);
+      __ sllx(G2, 63, O3);
+      __ sllx(G2, 58, O2);
+      __ xor3(O3, O2, O2);
+
+      __ sllx(G1, 1, G1);
+      __ or3(G1, O1, G1);
+
+      __ xor3(G1, O2, G1);
+
+      __ sllx(G2, 1, G2);
+
+      __ xmulxhi(G1, O0, O1);
+      __ xmulx(G1, O0, O2);
+      __ xmulxhi(G2, O0, O3);
+      __ xmulx(G2, O0, G1);
+
+      __ xor3(O4, O1, O4);
+      __ xor3(O5, O2, O5);
+      __ xor3(O5, O3, O5);
+
+      __ sllx(O4, 1, O2);
+      __ srlx(O5, 63, O3);
+
+      __ or3(O2, O3, O0);
+
+      __ sllx(O5, 1, O1);
+      __ srlx(G1, 63, O2);
+      __ or3(O1, O2, O1);
+      __ xor3(O1, G3, O1);
+
+      __ deccc(len);
+      __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
+      __ delayed()->add(data, 16, data);
+
+      __ stx(O0, I0, 0);
+      __ stx(O1, I0, 8);
+
+      __ ret();
+      __ delayed()->restore();
+
+      return start;
+  }
+
   void generate_initial() {
     // Generates all stubs and initializes the entry points
 
@@ -4859,6 +4983,10 @@
       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
     }
+    // generate GHASH intrinsics code
+    if (UseGHASHIntrinsics) {
+      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+    }
 
     // generate SHA1/SHA256/SHA512 intrinsics code
     if (UseSHA1Intrinsics) {
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -300,6 +300,17 @@
     }
   }
 
+  // GHASH/GCM intrinsics
+  if (has_vis3() && (UseVIS > 2)) {
+    if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+      UseGHASHIntrinsics = true;
+    }
+  } else if (UseGHASHIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+      warning("GHASH intrinsics require VIS3 insructions support. Intriniscs will be disabled");
+    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  }
+
   // SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times
   if (has_sha1() || has_sha256() || has_sha512()) {
     if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions
--- a/src/cpu/x86/vm/assembler_x86.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -3095,8 +3095,16 @@
 void Assembler::psrldq(XMMRegister dst, int shift) {
   // Shift 128 bit value in xmm register by number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F,
-                                      false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  emit_int8(0x73);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(shift);
+}
+
+void Assembler::pslldq(XMMRegister dst, int shift) {
+  // Shift left 128 bit value in xmm register by number of bytes.
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift);
--- a/src/cpu/x86/vm/assembler_x86.hpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Fri Jun 19 15:24:07 2015 -0700
@@ -1666,6 +1666,8 @@
 
   // Shift Right by bytes Logical DoubleQuadword Immediate
   void psrldq(XMMRegister dst, int shift);
+  // Shift Left by bytes Logical DoubleQuadword Immediate
+  void pslldq(XMMRegister dst, int shift);
 
   // Logical Compare 128bit
   void ptest(XMMRegister dst, XMMRegister src);
--- a/src/cpu/x86/vm/c2_init_x86.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/cpu/x86/vm/c2_init_x86.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -58,4 +58,6 @@
       OptoReg::invalidate(i);
     }
   }
+
+  SuperWordLoopUnrollAnalysis = true;
 }
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -2727,6 +2727,167 @@
     return start;
   }
 
+  // byte swap x86 long
+  address generate_ghash_long_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+    address start = __ pc();
+    __ emit_data(0x0b0a0908, relocInfo::none, 0);
+    __ emit_data(0x0f0e0d0c, relocInfo::none, 0);
+    __ emit_data(0x03020100, relocInfo::none, 0);
+    __ emit_data(0x07060504, relocInfo::none, 0);
+
+  return start;
+  }
+
+  // byte swap x86 byte array
+  address generate_ghash_byte_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+    address start = __ pc();
+    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
+    __ emit_data(0x08090a0b, relocInfo::none, 0);
+    __ emit_data(0x04050607, relocInfo::none, 0);
+    __ emit_data(0x00010203, relocInfo::none, 0);
+  return start;
+  }
+
+  /* Single and multi-block ghash operations */
+  address generate_ghash_processBlocks() {
+    assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
+    __ align(CodeEntryAlignment);
+    Label L_ghash_loop, L_exit;
+    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+    address start = __ pc();
+
+    const Register state        = rdi;
+    const Register subkeyH      = rsi;
+    const Register data         = rdx;
+    const Register blocks       = rcx;
+
+    const Address  state_param(rbp, 8+0);
+    const Address  subkeyH_param(rbp, 8+4);
+    const Address  data_param(rbp, 8+8);
+    const Address  blocks_param(rbp, 8+12);
+
+    const XMMRegister xmm_temp0 = xmm0;
+    const XMMRegister xmm_temp1 = xmm1;
+    const XMMRegister xmm_temp2 = xmm2;
+    const XMMRegister xmm_temp3 = xmm3;
+    const XMMRegister xmm_temp4 = xmm4;
+    const XMMRegister xmm_temp5 = xmm5;
+    const XMMRegister xmm_temp6 = xmm6;
+    const XMMRegister xmm_temp7 = xmm7;
+
+    __ enter();
+
+    __ movptr(state, state_param);
+    __ movptr(subkeyH, subkeyH_param);
+    __ movptr(data, data_param);
+    __ movptr(blocks, blocks_param);
+
+    __ movdqu(xmm_temp0, Address(state, 0));
+    __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+    __ movdqu(xmm_temp1, Address(subkeyH, 0));
+    __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+    __ BIND(L_ghash_loop);
+    __ movdqu(xmm_temp2, Address(data, 0));
+    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+    __ pxor(xmm_temp0, xmm_temp2);
+
+    //
+    // Multiply with the hash key
+    //
+    __ movdqu(xmm_temp3, xmm_temp0);
+    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
+    __ movdqu(xmm_temp4, xmm_temp0);
+    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
+
+    __ movdqu(xmm_temp5, xmm_temp0);
+    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
+    __ movdqu(xmm_temp6, xmm_temp0);
+    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
+
+    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
+
+    __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
+    __ psrldq(xmm_temp4, 8);    // shift by xmm4 64 bits to the right
+    __ pslldq(xmm_temp5, 8);    // shift by xmm5 64 bits to the left
+    __ pxor(xmm_temp3, xmm_temp5);
+    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
+                                        // of the carry-less multiplication of
+                                        // xmm0 by xmm1.
+
+    // We shift the result of the multiplication by one bit position
+    // to the left to cope for the fact that the bits are reversed.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp4, xmm_temp6);
+    __ pslld (xmm_temp3, 1);
+    __ pslld(xmm_temp6, 1);
+    __ psrld(xmm_temp7, 31);
+    __ psrld(xmm_temp4, 31);
+    __ movdqu(xmm_temp5, xmm_temp7);
+    __ pslldq(xmm_temp4, 4);
+    __ pslldq(xmm_temp7, 4);
+    __ psrldq(xmm_temp5, 12);
+    __ por(xmm_temp3, xmm_temp7);
+    __ por(xmm_temp6, xmm_temp4);
+    __ por(xmm_temp6, xmm_temp5);
+
+    //
+    // First phase of the reduction
+    //
+    // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
+    // independently.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp4, xmm_temp3);
+    __ movdqu(xmm_temp5, xmm_temp3);
+    __ pslld(xmm_temp7, 31);    // packed right shift shifting << 31
+    __ pslld(xmm_temp4, 30);    // packed right shift shifting << 30
+    __ pslld(xmm_temp5, 25);    // packed right shift shifting << 25
+    __ pxor(xmm_temp7, xmm_temp4);      // xor the shifted versions
+    __ pxor(xmm_temp7, xmm_temp5);
+    __ movdqu(xmm_temp4, xmm_temp7);
+    __ pslldq(xmm_temp7, 12);
+    __ psrldq(xmm_temp4, 4);
+    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
+
+    //
+    // Second phase of the reduction
+    //
+    // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
+    // shift operations.
+    __ movdqu(xmm_temp2, xmm_temp3);
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp5, xmm_temp3);
+    __ psrld(xmm_temp2, 1);     // packed left shifting >> 1
+    __ psrld(xmm_temp7, 2);     // packed left shifting >> 2
+    __ psrld(xmm_temp5, 7);     // packed left shifting >> 7
+    __ pxor(xmm_temp2, xmm_temp7);      // xor the shifted versions
+    __ pxor(xmm_temp2, xmm_temp5);
+    __ pxor(xmm_temp2, xmm_temp4);
+    __ pxor(xmm_temp3, xmm_temp2);
+    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
+
+    __ decrement(blocks);
+    __ jcc(Assembler::zero, L_exit);
+    __ movdqu(xmm_temp0, xmm_temp6);
+    __ addptr(data, 16);
+    __ jmp(L_ghash_loop);
+
+    __ BIND(L_exit);
+       // Byte swap 16-byte result
+    __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+    __ movdqu(Address(state, 0), xmm_temp6);   // store the result
+
+    __ leave();
+    __ ret(0);
+    return start;
+  }
+
   /**
    *  Arguments:
    *
@@ -3026,6 +3187,13 @@
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
     }
 
+    // Generate GHASH intrinsics code
+    if (UseGHASHIntrinsics) {
+      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+    }
+
     // Safefetch stubs.
     generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                                                    &StubRoutines::_safefetch32_fault_pc,
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -3681,6 +3681,175 @@
     return start;
   }
 
+
+  // byte swap x86 long
+  address generate_ghash_long_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+    address start = __ pc();
+    __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
+    __ emit_data64(0x0706050403020100, relocInfo::none );
+  return start;
+  }
+
+  // byte swap x86 byte array
+  address generate_ghash_byte_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+    address start = __ pc();
+    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
+    __ emit_data64(0x0001020304050607, relocInfo::none );
+  return start;
+  }
+
+  /* Single and multi-block ghash operations */
+  address generate_ghash_processBlocks() {
+    __ align(CodeEntryAlignment);
+    Label L_ghash_loop, L_exit;
+    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+    address start = __ pc();
+
+    const Register state        = c_rarg0;
+    const Register subkeyH      = c_rarg1;
+    const Register data         = c_rarg2;
+    const Register blocks       = c_rarg3;
+
+#ifdef _WIN64
+    const int XMM_REG_LAST  = 10;
+#endif
+
+    const XMMRegister xmm_temp0 = xmm0;
+    const XMMRegister xmm_temp1 = xmm1;
+    const XMMRegister xmm_temp2 = xmm2;
+    const XMMRegister xmm_temp3 = xmm3;
+    const XMMRegister xmm_temp4 = xmm4;
+    const XMMRegister xmm_temp5 = xmm5;
+    const XMMRegister xmm_temp6 = xmm6;
+    const XMMRegister xmm_temp7 = xmm7;
+    const XMMRegister xmm_temp8 = xmm8;
+    const XMMRegister xmm_temp9 = xmm9;
+    const XMMRegister xmm_temp10 = xmm10;
+
+    __ enter();
+
+#ifdef _WIN64
+    // save the xmm registers which must be preserved 6-10
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
+    for (int i = 6; i <= XMM_REG_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+#endif
+
+    __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+    __ movdqu(xmm_temp0, Address(state, 0));
+    __ pshufb(xmm_temp0, xmm_temp10);
+
+
+    __ BIND(L_ghash_loop);
+    __ movdqu(xmm_temp2, Address(data, 0));
+    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+    __ movdqu(xmm_temp1, Address(subkeyH, 0));
+    __ pshufb(xmm_temp1, xmm_temp10);
+
+    __ pxor(xmm_temp0, xmm_temp2);
+
+    //
+    // Multiply with the hash key
+    //
+    __ movdqu(xmm_temp3, xmm_temp0);
+    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
+    __ movdqu(xmm_temp4, xmm_temp0);
+    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
+
+    __ movdqu(xmm_temp5, xmm_temp0);
+    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
+    __ movdqu(xmm_temp6, xmm_temp0);
+    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
+
+    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
+
+    __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
+    __ psrldq(xmm_temp4, 8);    // shift by xmm4 64 bits to the right
+    __ pslldq(xmm_temp5, 8);    // shift by xmm5 64 bits to the left
+    __ pxor(xmm_temp3, xmm_temp5);
+    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
+                                        // of the carry-less multiplication of
+                                        // xmm0 by xmm1.
+
+    // We shift the result of the multiplication by one bit position
+    // to the left to cope for the fact that the bits are reversed.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp8, xmm_temp6);
+    __ pslld(xmm_temp3, 1);
+    __ pslld(xmm_temp6, 1);
+    __ psrld(xmm_temp7, 31);
+    __ psrld(xmm_temp8, 31);
+    __ movdqu(xmm_temp9, xmm_temp7);
+    __ pslldq(xmm_temp8, 4);
+    __ pslldq(xmm_temp7, 4);
+    __ psrldq(xmm_temp9, 12);
+    __ por(xmm_temp3, xmm_temp7);
+    __ por(xmm_temp6, xmm_temp8);
+    __ por(xmm_temp6, xmm_temp9);
+
+    //
+    // First phase of the reduction
+    //
+    // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
+    // independently.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp8, xmm_temp3);
+    __ movdqu(xmm_temp9, xmm_temp3);
+    __ pslld(xmm_temp7, 31);    // packed right shift shifting << 31
+    __ pslld(xmm_temp8, 30);    // packed right shift shifting << 30
+    __ pslld(xmm_temp9, 25);    // packed right shift shifting << 25
+    __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
+    __ pxor(xmm_temp7, xmm_temp9);
+    __ movdqu(xmm_temp8, xmm_temp7);
+    __ pslldq(xmm_temp7, 12);
+    __ psrldq(xmm_temp8, 4);
+    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
+
+    //
+    // Second phase of the reduction
+    //
+    // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
+    // shift operations.
+    __ movdqu(xmm_temp2, xmm_temp3);
+    __ movdqu(xmm_temp4, xmm_temp3);
+    __ movdqu(xmm_temp5, xmm_temp3);
+    __ psrld(xmm_temp2, 1);     // packed left shifting >> 1
+    __ psrld(xmm_temp4, 2);     // packed left shifting >> 2
+    __ psrld(xmm_temp5, 7);     // packed left shifting >> 7
+    __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
+    __ pxor(xmm_temp2, xmm_temp5);
+    __ pxor(xmm_temp2, xmm_temp8);
+    __ pxor(xmm_temp3, xmm_temp2);
+    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
+
+    __ decrement(blocks);
+    __ jcc(Assembler::zero, L_exit);
+    __ movdqu(xmm_temp0, xmm_temp6);
+    __ addptr(data, 16);
+    __ jmp(L_ghash_loop);
+
+    __ BIND(L_exit);
+    __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
+    __ movdqu(Address(state, 0), xmm_temp6);   // store the result
+
+#ifdef _WIN64
+    // restore xmm regs belonging to calling function
+    for (int i = 6; i <= XMM_REG_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+#endif
+    __ leave();
+    __ ret(0);
+    return start;
+  }
+
   /**
    *  Arguments:
    *
@@ -4120,6 +4289,13 @@
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
     }
 
+    // Generate GHASH intrinsics code
+    if (UseGHASHIntrinsics) {
+      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+    }
+
     // Safefetch stubs.
     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                        &StubRoutines::_safefetch32_fault_pc,
--- a/src/cpu/x86/vm/stubRoutines_x86.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/cpu/x86/vm/stubRoutines_x86.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -33,6 +33,8 @@
 
 address StubRoutines::x86::_verify_mxcsr_entry = NULL;
 address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
+address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
+address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
 
 uint64_t StubRoutines::x86::_crc_by128_masks[] =
 {
--- a/src/cpu/x86/vm/stubRoutines_x86.hpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/cpu/x86/vm/stubRoutines_x86.hpp	Fri Jun 19 15:24:07 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -36,10 +36,15 @@
   // masks and table for CRC32
   static uint64_t _crc_by128_masks[];
   static juint    _crc_table[];
+  // swap mask for ghash
+  static address _ghash_long_swap_mask_addr;
+  static address _ghash_byte_swap_mask_addr;
 
  public:
   static address verify_mxcsr_entry()    { return _verify_mxcsr_entry; }
   static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
   static address crc_by128_masks_addr()  { return (address)_crc_by128_masks; }
+  static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
+  static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
 
 #endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -677,6 +677,17 @@
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
 
+  // GHASH/GCM intrinsics
+  if (UseCLMUL && (UseSSE > 2)) {
+    if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+      UseGHASHIntrinsics = true;
+    }
+  } else if (UseGHASHIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+      warning("GHASH intrinsic requires CLMUL and SSE2 instructions on this CPU");
+    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  }
+
   if (UseSHA) {
     warning("SHA instructions are not available on this CPU");
     FLAG_SET_DEFAULT(UseSHA, false);
--- a/src/share/vm/classfile/vmSymbols.hpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/classfile/vmSymbols.hpp	Fri Jun 19 15:24:07 2015 -0700
@@ -846,6 +846,12 @@
    do_name(     implCompressMB_name,                               "implCompressMultiBlock")                            \
    do_signature(implCompressMB_signature,                          "([BII)I")                                           \
                                                                                                                         \
+  /* support for com.sun.crypto.provider.GHASH */                                                                       \
+  do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH")                                              \
+  do_intrinsic(_ghash_processBlocks, com_sun_crypto_provider_ghash, processBlocks_name, ghash_processBlocks_signature, F_S) \
+   do_name(processBlocks_name, "processBlocks")                                                                         \
+   do_signature(ghash_processBlocks_signature, "([BII[J[J)V")                                                           \
+                                                                                                                        \
   /* support for java.util.zip */                                                                                       \
   do_class(java_util_zip_CRC32,           "java/util/zip/CRC32")                                                        \
   do_intrinsic(_updateCRC32,               java_util_zip_CRC32,   update_name, int2_int_signature,               F_SN)  \
--- a/src/share/vm/code/debugInfo.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/code/debugInfo.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -26,6 +26,7 @@
 #include "code/debugInfo.hpp"
 #include "code/debugInfoRec.hpp"
 #include "code/nmethod.hpp"
+#include "oops/oop.inline.hpp"
 #include "runtime/handles.inline.hpp"
 
 PRAGMA_FORMAT_MUTE_WARNINGS_FOR_GCC
@@ -47,6 +48,12 @@
   write_int(recorder()->oop_recorder()->find_index(h));
 }
 
+oop DebugInfoReadStream::read_oop() {
+  oop o = code()->oop_at(read_int());
+  assert(o->is_oop_or_null(), "oop only");
+  return o;
+}
+
 ScopeValue* DebugInfoReadStream::read_object_value() {
   int id = read_int();
 #ifdef ASSERT
--- a/src/share/vm/code/debugInfo.hpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/code/debugInfo.hpp	Fri Jun 19 15:24:07 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -266,11 +266,7 @@
 
   } ;
 
-  oop read_oop() {
-    oop o = code()->oop_at(read_int());
-    assert(o == NULL || o->is_oop(), "oop only");
-    return o;
-  }
+  oop read_oop();
   Method* read_method() {
     Method* o = (Method*)(code()->metadata_at(read_int()));
     // is_metadata() is a faster check than is_metaspace_object()
--- a/src/share/vm/opto/c2_globals.hpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/opto/c2_globals.hpp	Fri Jun 19 15:24:07 2015 -0700
@@ -191,6 +191,13 @@
   product(intx,  LoopMaxUnroll, 16,                                         \
           "Maximum number of unrolls for main loop")                        \
                                                                             \
+  product(bool,  SuperWordLoopUnrollAnalysis, false,                        \
+          "Map number of unrolls for main loop via "                        \
+          "Superword Level Parallelism analysis")                           \
+                                                                            \
+  notproduct(bool, TraceSuperWordLoopUnrollAnalysis, false,                 \
+          "Trace what Superword Level Parallelism analysis applies")        \
+                                                                            \
   product(intx,  LoopUnrollMin, 4,                                          \
           "Minimum number of unroll loop bodies before checking progress"   \
           "of rounds of unroll,optimize,..")                                \
--- a/src/share/vm/opto/escape.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/opto/escape.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -966,6 +966,7 @@
                   strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "sha1_implCompress") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "sha1_implCompressMB") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "sha256_implCompress") == 0 ||
--- a/src/share/vm/opto/library_call.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/opto/library_call.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -278,6 +278,7 @@
   Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting);
   Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
   Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object);
+  bool inline_ghash_processBlocks();
   bool inline_sha_implCompress(vmIntrinsics::ID id);
   bool inline_digestBase_implCompressMB(int predicate);
   bool inline_sha_implCompressMB(Node* digestBaseObj, ciInstanceKlass* instklass_SHA,
@@ -528,6 +529,10 @@
     predicates = 3;
     break;
 
+  case vmIntrinsics::_ghash_processBlocks:
+    if (!UseGHASHIntrinsics) return NULL;
+    break;
+
   case vmIntrinsics::_updateCRC32:
   case vmIntrinsics::_updateBytesCRC32:
   case vmIntrinsics::_updateByteBufferCRC32:
@@ -929,6 +934,9 @@
   case vmIntrinsics::_mulAdd:
     return inline_mulAdd();
 
+  case vmIntrinsics::_ghash_processBlocks:
+    return inline_ghash_processBlocks();
+
   case vmIntrinsics::_encodeISOArray:
     return inline_encodeISOArray();
 
@@ -5858,6 +5866,35 @@
   return _gvn.transform(region);
 }
 
+//------------------------------inline_ghash_processBlocks
+bool LibraryCallKit::inline_ghash_processBlocks() {
+  address stubAddr;
+  const char *stubName;
+  assert(UseGHASHIntrinsics, "need GHASH intrinsics support");
+
+  stubAddr = StubRoutines::ghash_processBlocks();
+  stubName = "ghash_processBlocks";
+
+  Node* data           = argument(0);
+  Node* offset         = argument(1);
+  Node* len            = argument(2);
+  Node* state          = argument(3);
+  Node* subkeyH        = argument(4);
+
+  Node* state_start  = array_element_address(state, intcon(0), T_LONG);
+  assert(state_start, "state is NULL");
+  Node* subkeyH_start  = array_element_address(subkeyH, intcon(0), T_LONG);
+  assert(subkeyH_start, "subkeyH is NULL");
+  Node* data_start  = array_element_address(data, offset, T_BYTE);
+  assert(data_start, "data is NULL");
+
+  Node* ghash = make_runtime_call(RC_LEAF|RC_NO_FP,
+                                  OptoRuntime::ghash_processBlocks_Type(),
+                                  stubAddr, stubName, TypePtr::BOTTOM,
+                                  state_start, subkeyH_start, data_start, len);
+  return true;
+}
+
 //------------------------------inline_sha_implCompress-----------------------
 //
 // Calculate SHA (i.e., SHA-1) for single-block byte[] array.
--- a/src/share/vm/opto/loopTransform.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/opto/loopTransform.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -38,6 +38,7 @@
 #include "opto/rootnode.hpp"
 #include "opto/runtime.hpp"
 #include "opto/subnode.hpp"
+#include "opto/superword.hpp"
 #include "opto/vectornode.hpp"
 
 //------------------------------is_loop_exit-----------------------------------
@@ -640,7 +641,7 @@
 //------------------------------policy_unroll----------------------------------
 // Return TRUE or FALSE if the loop should be unrolled or not.  Unroll if
 // the loop is a CountedLoop and the body is small enough.
-bool IdealLoopTree::policy_unroll( PhaseIdealLoop *phase ) const {
+bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
 
   CountedLoopNode *cl = _head->as_CountedLoop();
   assert(cl->is_normal_loop() || cl->is_main_loop(), "");
@@ -652,6 +653,8 @@
   // After split at least one iteration will be executed in pre-loop.
   if (cl->trip_count() <= (uint)(cl->is_normal_loop() ? 2 : 1)) return false;
 
+  _local_loop_unroll_limit = LoopUnrollLimit;
+  _local_loop_unroll_factor = 4;
   int future_unroll_ct = cl->unrolled_count() * 2;
   if (future_unroll_ct > LoopMaxUnroll) return false;
 
@@ -747,8 +750,24 @@
     } // switch
   }
 
+  if (UseSuperWord) {
+    if (!cl->is_reduction_loop()) {
+      phase->mark_reductions(this);
+    }
+
+    // Only attempt slp analysis when user controls do not prohibit it
+    if (LoopMaxUnroll > _local_loop_unroll_factor) {
+      // Once policy_slp_analysis succeeds, mark the loop with the
+      // maximal unroll factor so that we minimize analysis passes
+      if ((future_unroll_ct > _local_loop_unroll_factor) ||
+          (body_size > (uint)_local_loop_unroll_limit)) {
+        policy_unroll_slp_analysis(cl, phase, future_unroll_ct);
+      }
+    }
+  }
+
   // Check for being too big
-  if (body_size > (uint)LoopUnrollLimit) {
+  if (body_size > (uint)_local_loop_unroll_limit) {
     if (xors_in_loop >= 4 && body_size < (uint)LoopUnrollLimit*4) return true;
     // Normal case: loop too big
     return false;
@@ -758,6 +777,36 @@
   return true;
 }
 
+void IdealLoopTree::policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLoop *phase, int future_unroll_ct) {
+  // Enable this functionality target by target as needed
+  if (SuperWordLoopUnrollAnalysis) {
+    if (!cl->has_passed_slp()) {
+      SuperWord sw(phase);
+      sw.transform_loop(this, false);
+
+      // If the loop is slp canonical analyze it
+      if (sw.early_return() == false) {
+        sw.unrolling_analysis(cl, _local_loop_unroll_factor);
+      }
+    }
+
+    int slp_max_unroll_factor = cl->slp_max_unroll();
+    if ((slp_max_unroll_factor > 4) &&
+        (slp_max_unroll_factor >= future_unroll_ct)) {
+      int new_limit = cl->node_count_before_unroll() * slp_max_unroll_factor;
+      if (new_limit > LoopUnrollLimit) {
+#ifndef PRODUCT
+        if (TraceSuperWordLoopUnrollAnalysis) {
+          tty->print_cr("slp analysis is applying unroll limit  %d, the original limit was %d\n",
+            new_limit, _local_loop_unroll_limit);
+        }
+#endif
+        _local_loop_unroll_limit = new_limit;
+      }
+    }
+  }
+}
+
 //------------------------------policy_align-----------------------------------
 // Return TRUE or FALSE if the loop should be cache-line aligned.  Gather the
 // expression that does the alignment.  Note that only one array base can be
@@ -1611,6 +1660,7 @@
               // iff the uses conform
               if (ok) {
                 def_node->add_flag(Node::Flag_is_reduction);
+                loop_head->mark_has_reductions();
               }
             }
           }
@@ -2517,7 +2567,6 @@
     // and we'd rather unroll the post-RCE'd loop SO... do not unroll if
     // peeling.
     if (should_unroll && !should_peel) {
-      phase->mark_reductions(this);
       phase->do_unroll(this, old_new, true);
     }
 
--- a/src/share/vm/opto/loopnode.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/opto/loopnode.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -2408,7 +2408,7 @@
     for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
       IdealLoopTree* lpt = iter.current();
       if (lpt->is_counted()) {
-        sw.transform_loop(lpt);
+        sw.transform_loop(lpt, true);
       }
     }
   }
--- a/src/share/vm/opto/loopnode.hpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/opto/loopnode.hpp	Fri Jun 19 15:24:07 2015 -0700
@@ -62,7 +62,9 @@
          HasExactTripCount=8,
          InnerLoop=16,
          PartialPeelLoop=32,
-         PartialPeelFailed=64 };
+         PartialPeelFailed=64,
+         HasReductions=128,
+         PassedSlpAnalysis=256 };
   char _unswitch_count;
   enum { _unswitch_max=3 };
 
@@ -77,6 +79,8 @@
   void set_partial_peel_loop() { _loop_flags |= PartialPeelLoop; }
   int partial_peel_has_failed() const { return _loop_flags & PartialPeelFailed; }
   void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; }
+  void mark_has_reductions() { _loop_flags |= HasReductions; }
+  void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; }
 
   int unswitch_max() { return _unswitch_max; }
   int unswitch_count() { return _unswitch_count; }
@@ -155,11 +159,15 @@
   // unroll,optimize,unroll,optimize,... is making progress
   int _node_count_before_unroll;
 
+  // If slp analysis is performed we record the maximum
+  // vector mapped unroll factor here
+  int _slp_maximum_unroll_factor;
+
 public:
   CountedLoopNode( Node *entry, Node *backedge )
     : LoopNode(entry, backedge), _main_idx(0), _trip_count(max_juint),
       _profile_trip_cnt(COUNT_UNKNOWN), _unrolled_count_log2(0),
-      _node_count_before_unroll(0) {
+      _node_count_before_unroll(0), _slp_maximum_unroll_factor(0) {
     init_class_id(Class_CountedLoop);
     // Initialize _trip_count to the largest possible value.
     // Will be reset (lower) if the loop's trip count is known.
@@ -199,10 +207,12 @@
 
   // A 'main' loop that is ONLY unrolled or peeled, never RCE'd or
   // Aligned, may be missing it's pre-loop.
-  int is_normal_loop() const { return (_loop_flags&PreMainPostFlagsMask) == Normal; }
-  int is_pre_loop   () const { return (_loop_flags&PreMainPostFlagsMask) == Pre;    }
-  int is_main_loop  () const { return (_loop_flags&PreMainPostFlagsMask) == Main;   }
-  int is_post_loop  () const { return (_loop_flags&PreMainPostFlagsMask) == Post;   }
+  int is_normal_loop   () const { return (_loop_flags&PreMainPostFlagsMask) == Normal; }
+  int is_pre_loop      () const { return (_loop_flags&PreMainPostFlagsMask) == Pre;    }
+  int is_main_loop     () const { return (_loop_flags&PreMainPostFlagsMask) == Main;   }
+  int is_post_loop     () const { return (_loop_flags&PreMainPostFlagsMask) == Post;   }
+  int is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; }
+  int has_passed_slp   () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
   int is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
   void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }
 
@@ -232,8 +242,10 @@
   void double_unrolled_count() { _unrolled_count_log2++; }
   int  unrolled_count()        { return 1 << MIN2(_unrolled_count_log2, BitsPerInt-3); }
 
-  void set_node_count_before_unroll(int ct) { _node_count_before_unroll = ct; }
-  int  node_count_before_unroll()           { return _node_count_before_unroll; }
+  void set_node_count_before_unroll(int ct)  { _node_count_before_unroll = ct; }
+  int  node_count_before_unroll()            { return _node_count_before_unroll; }
+  void set_slp_max_unroll(int unroll_factor) { _slp_maximum_unroll_factor = unroll_factor; }
+  int  slp_max_unroll() const                { return _slp_maximum_unroll_factor; }
 
 #ifndef PRODUCT
   virtual void dump_spec(outputStream *st) const;
@@ -336,6 +348,8 @@
   Node *_tail;                  // Tail of loop
   inline Node *tail();          // Handle lazy update of _tail field
   PhaseIdealLoop* _phase;
+  int _local_loop_unroll_limit;
+  int _local_loop_unroll_factor;
 
   Node_List _body;              // Loop body for inner loops
 
@@ -356,7 +370,8 @@
       _safepts(NULL),
       _required_safept(NULL),
       _allow_optimizations(true),
-      _nest(0), _irreducible(0), _has_call(0), _has_sfpt(0), _rce_candidate(0)
+      _nest(0), _irreducible(0), _has_call(0), _has_sfpt(0), _rce_candidate(0),
+      _local_loop_unroll_limit(0), _local_loop_unroll_factor(0)
   { }
 
   // Is 'l' a member of 'this'?
@@ -444,7 +459,10 @@
 
   // Return TRUE or FALSE if the loop should be unrolled or not.  Unroll if
   // the loop is a CountedLoop and the body is small enough.
-  bool policy_unroll( PhaseIdealLoop *phase ) const;
+  bool policy_unroll(PhaseIdealLoop *phase);
+
+  // Loop analyses to map to a maximal superword unrolling for vectorization.
+  void policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLoop *phase, int future_unroll_ct);
 
   // Return TRUE or FALSE if the loop should be range-check-eliminated.
   // Gather a list of IF tests that are dominated by iteration splitting;
--- a/src/share/vm/opto/runtime.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/opto/runtime.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -987,7 +987,25 @@
   return TypeFunc::make(domain, range);
 }
 
+// GHASH block processing
+const TypeFunc* OptoRuntime::ghash_processBlocks_Type() {
+    int argcnt = 4;
 
+    const Type** fields = TypeTuple::fields(argcnt);
+    int argp = TypeFunc::Parms;
+    fields[argp++] = TypePtr::NOTNULL;    // state
+    fields[argp++] = TypePtr::NOTNULL;    // subkeyH
+    fields[argp++] = TypePtr::NOTNULL;    // data
+    fields[argp++] = TypeInt::INT;        // blocks
+    assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+    const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+    // result type needed
+    fields = TypeTuple::fields(1);
+    fields[TypeFunc::Parms+0] = NULL; // void
+    const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+    return TypeFunc::make(domain, range);
+}
 
 //------------- Interpreter state access for on stack replacement
 const TypeFunc* OptoRuntime::osr_end_Type() {
--- a/src/share/vm/opto/runtime.hpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/opto/runtime.hpp	Fri Jun 19 15:24:07 2015 -0700
@@ -316,6 +316,8 @@
 
   static const TypeFunc* mulAdd_Type();
 
+  static const TypeFunc* ghash_processBlocks_Type();
+
   static const TypeFunc* updateBytesCRC32_Type();
 
   // leaf on stack replacement interpreter accessor types
--- a/src/share/vm/opto/superword.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/opto/superword.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -68,6 +68,7 @@
   _bb(NULL),                              // basic block
   _iv(NULL),                              // induction var
   _race_possible(false),                  // cases where SDMU is true
+  _early_return(true),                    // analysis evaluations routine
   _num_work_vecs(0),                      // amount of vector work we have
   _num_reductions(0),                     // amount of reduction work we have
   _do_vector_loop(phase->C->do_vector_loop()),  // whether to do vectorization/simd style
@@ -78,7 +79,7 @@
 {}
 
 //------------------------------transform_loop---------------------------
-void SuperWord::transform_loop(IdealLoopTree* lpt) {
+void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
   assert(UseSuperWord, "should be");
   // Do vectors exist on this architecture?
   if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
@@ -113,8 +114,156 @@
   // For now, define one block which is the entire loop body
   set_bb(cl);
 
-  assert(_packset.length() == 0, "packset must be empty");
-  SLP_extract();
+  if (do_optimization) {
+    assert(_packset.length() == 0, "packset must be empty");
+    SLP_extract();
+  }
+}
+
+//------------------------------early unrolling analysis------------------------------
+void SuperWord::unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor) {
+  bool is_slp = true;
+  ResourceMark rm;
+  size_t ignored_size = lpt()->_body.size();
+  int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
+  Node_Stack nstack((int)ignored_size);
+  Node *cl_exit = cl->loopexit();
+
+  // First clear the entries
+  for (uint i = 0; i < lpt()->_body.size(); i++) {
+    ignored_loop_nodes[i] = -1;
+  }
+
+  int max_vector = Matcher::max_vector_size(T_INT);
+
+  // Process the loop, some/all of the stack entries will not be in order, ergo
+  // need to preprocess the ignored initial state before we process the loop
+  for (uint i = 0; i < lpt()->_body.size(); i++) {
+    Node* n = lpt()->_body.at(i);
+    if (n == cl->incr() ||
+      n->is_reduction() ||
+      n->is_AddP() ||
+      n->is_Cmp() ||
+      n->is_IfTrue() ||
+      n->is_CountedLoop() ||
+      (n == cl_exit)) {
+      ignored_loop_nodes[i] = n->_idx;
+      continue;
+    }
+
+    if (n->is_If()) {
+      IfNode *iff = n->as_If();
+      if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
+        if (lpt()->is_loop_exit(iff)) {
+          ignored_loop_nodes[i] = n->_idx;
+          continue;
+        }
+      }
+    }
+
+    if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
+      Node* n_tail = n->in(LoopNode::LoopBackControl);
+      if (n_tail != n->in(LoopNode::EntryControl)) {
+        if (!n_tail->is_Mem()) {
+          is_slp = false;
+          break;
+        }
+      }
+    }
+
+    // This must happen after check of phi/if
+    if (n->is_Phi() || n->is_If()) {
+      ignored_loop_nodes[i] = n->_idx;
+      continue;
+    }
+
+    if (n->is_LoadStore() || n->is_MergeMem() ||
+      (n->is_Proj() && !n->as_Proj()->is_CFG())) {
+      is_slp = false;
+      break;
+    }
+
+    if (n->is_Mem()) {
+      MemNode* current = n->as_Mem();
+      BasicType bt = current->memory_type();
+      if (is_java_primitive(bt) == false) {
+        ignored_loop_nodes[i] = n->_idx;
+        continue;
+      }
+      Node* adr = n->in(MemNode::Address);
+      Node* n_ctrl = _phase->get_ctrl(adr);
+
+      // save a queue of post process nodes
+      if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
+        // Process the memory expression
+        int stack_idx = 0;
+        bool have_side_effects = true;
+        if (adr->is_AddP() == false) {
+          nstack.push(adr, stack_idx++);
+        } else {
+          // Mark the components of the memory operation in nstack
+          SWPointer p1(current, this, &nstack, true);
+          have_side_effects = p1.node_stack()->is_nonempty();
+        }
+
+        // Process the pointer stack
+        while (have_side_effects) {
+          Node* pointer_node = nstack.node();
+          for (uint j = 0; j < lpt()->_body.size(); j++) {
+            Node* cur_node = lpt()->_body.at(j);
+            if (cur_node == pointer_node) {
+              ignored_loop_nodes[j] = cur_node->_idx;
+              break;
+            }
+          }
+          nstack.pop();
+          have_side_effects = nstack.is_nonempty();
+        }
+      }
+    }
+  }
+
+  if (is_slp) {
+    // Now we try to find the maximum supported consistent vector which the machine
+    // description can use
+    for (uint i = 0; i < lpt()->_body.size(); i++) {
+      if (ignored_loop_nodes[i] != -1) continue;
+
+      BasicType bt;
+      Node* n = lpt()->_body.at(i);
+      if (n->is_Store()) {
+        bt = n->as_Mem()->memory_type();
+      } else {
+        bt = n->bottom_type()->basic_type();
+      }
+
+      int cur_max_vector = Matcher::max_vector_size(bt);
+
+      // If a max vector exists which is not larger than _local_loop_unroll_factor
+      // stop looking, we already have the max vector to map to.
+      if (cur_max_vector <= local_loop_unroll_factor) {
+        is_slp = false;
+#ifndef PRODUCT
+        if (TraceSuperWordLoopUnrollAnalysis) {
+          tty->print_cr("slp analysis fails: unroll limit equals max vector\n");
+        }
+#endif
+        break;
+      }
+
+      // Map the maximal common vector
+      if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
+        if (cur_max_vector < max_vector) {
+          max_vector = cur_max_vector;
+        }
+      }
+    }
+    if (is_slp) {
+      local_loop_unroll_factor = max_vector;
+    }
+    cl->mark_passed_slp();
+    cl->set_slp_max_unroll(local_loop_unroll_factor);
+  }
 }
 
 //------------------------------SLP_extract---------------------------
@@ -268,12 +417,12 @@
       best_iv_adjustment = iv_adjustment;
     }
 
-    SWPointer align_to_ref_p(mem_ref, this);
+    SWPointer align_to_ref_p(mem_ref, this, NULL, false);
     // Set alignment relative to "align_to_ref" for all related memory operations.
     for (int i = memops.size() - 1; i >= 0; i--) {
       MemNode* s = memops.at(i)->as_Mem();
       if (isomorphic(s, mem_ref)) {
-        SWPointer p2(s, this);
+        SWPointer p2(s, this, NULL, false);
         if (p2.comparable(align_to_ref_p)) {
           int align = memory_alignment(s, iv_adjustment);
           set_alignment(s, align);
@@ -294,7 +443,7 @@
           // iterations in pre-loop will be not enough to align it.
           create_pack = false;
         } else {
-          SWPointer p2(best_align_to_mem_ref, this);
+          SWPointer p2(best_align_to_mem_ref, this, NULL, false);
           if (align_to_ref_p.invar() != p2.invar()) {
             // Do not vectorize memory accesses with different invariants
             // if unaligned memory accesses are not allowed.
@@ -411,7 +560,7 @@
   // Count number of comparable memory ops
   for (uint i = 0; i < memops.size(); i++) {
     MemNode* s1 = memops.at(i)->as_Mem();
-    SWPointer p1(s1, this);
+    SWPointer p1(s1, this, NULL, false);
     // Discard if pre loop can't align this reference
     if (!ref_is_alignable(p1)) {
       *cmp_ct.adr_at(i) = 0;
@@ -420,7 +569,7 @@
     for (uint j = i+1; j < memops.size(); j++) {
       MemNode* s2 = memops.at(j)->as_Mem();
       if (isomorphic(s1, s2)) {
-        SWPointer p2(s2, this);
+        SWPointer p2(s2, this, NULL, false);
         if (p1.comparable(p2)) {
           (*cmp_ct.adr_at(i))++;
           (*cmp_ct.adr_at(j))++;
@@ -441,7 +590,7 @@
     if (s->is_Store()) {
       int vw = vector_width_in_bytes(s);
       assert(vw > 1, "sanity");
-      SWPointer p(s, this);
+      SWPointer p(s, this, NULL, false);
       if (cmp_ct.at(j) >  max_ct ||
           cmp_ct.at(j) == max_ct &&
             (vw >  max_vw ||
@@ -464,7 +613,7 @@
       if (s->is_Load()) {
         int vw = vector_width_in_bytes(s);
         assert(vw > 1, "sanity");
-        SWPointer p(s, this);
+        SWPointer p(s, this, NULL, false);
         if (cmp_ct.at(j) >  max_ct ||
             cmp_ct.at(j) == max_ct &&
               (vw >  max_vw ||
@@ -575,7 +724,7 @@
 //---------------------------get_iv_adjustment---------------------------
 // Calculate loop's iv adjustment for this memory ops.
 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
-  SWPointer align_to_ref_p(mem_ref, this);
+  SWPointer align_to_ref_p(mem_ref, this, NULL, false);
   int offset = align_to_ref_p.offset_in_bytes();
   int scale  = align_to_ref_p.scale_in_bytes();
   int elt_size = align_to_ref_p.memory_size();
@@ -649,13 +798,13 @@
       if (_dg.dep(s1)->in_cnt() == 0) {
         _dg.make_edge(slice, s1);
       }
-      SWPointer p1(s1->as_Mem(), this);
+      SWPointer p1(s1->as_Mem(), this, NULL, false);
       bool sink_dependent = true;
       for (int k = j - 1; k >= 0; k--) {
         Node* s2 = _nlist.at(k);
         if (s1->is_Load() && s2->is_Load())
           continue;
-        SWPointer p2(s2->as_Mem(), this);
+        SWPointer p2(s2->as_Mem(), this, NULL, false);
 
         int cmp = p1.cmp(p2);
         if (SuperWordRTDepCheck &&
@@ -795,8 +944,8 @@
   if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
       _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
     return false;
-  SWPointer p1(s1->as_Mem(), this);
-  SWPointer p2(s2->as_Mem(), this);
+  SWPointer p1(s1->as_Mem(), this, NULL, false);
+  SWPointer p2(s2->as_Mem(), this, NULL, false);
   if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
   int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
   return diff == data_size(s1);
@@ -1615,13 +1764,13 @@
       if (n->is_Load()) {
         Node* ctl = n->in(MemNode::Control);
         Node* mem = first->in(MemNode::Memory);
-        SWPointer p1(n->as_Mem(), this);
+        SWPointer p1(n->as_Mem(), this, NULL, false);
         // Identify the memory dependency for the new loadVector node by
         // walking up through memory chain.
         // This is done to give flexibility to the new loadVector node so that
         // it can move above independent storeVector nodes.
         while (mem->is_StoreVector()) {
-          SWPointer p2(mem->as_Mem(), this);
+          SWPointer p2(mem->as_Mem(), this, NULL, false);
           int cmp = p1.cmp(p2);
           if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
             mem = mem->in(MemNode::Memory);
@@ -2138,7 +2287,7 @@
 //------------------------------memory_alignment---------------------------
 // Alignment within a vector memory reference
 int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
-  SWPointer p(s, this);
+  SWPointer p(s, this, NULL, false);
   if (!p.valid()) {
     return bottom_align;
   }
@@ -2315,7 +2464,7 @@
   Node *orig_limit = pre_opaq->original_loop_limit();
   assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
 
-  SWPointer align_to_ref_p(align_to_ref, this);
+  SWPointer align_to_ref_p(align_to_ref, this, NULL, false);
   assert(align_to_ref_p.valid(), "sanity");
 
   // Given:
@@ -2489,6 +2638,7 @@
   _bb = NULL;
   _iv = NULL;
   _race_possible = 0;
+  _early_return = false;
   _num_work_vecs = 0;
   _num_reductions = 0;
 }
@@ -2559,9 +2709,11 @@
 //==============================SWPointer===========================
 
 //----------------------------SWPointer------------------------
-SWPointer::SWPointer(MemNode* mem, SuperWord* slp) :
+SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only) :
   _mem(mem), _slp(slp),  _base(NULL),  _adr(NULL),
-  _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {
+  _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
+  _nstack(nstack), _analyze_only(analyze_only),
+  _stack_idx(0) {
 
   Node* adr = mem->in(MemNode::Address);
   if (!adr->is_AddP()) {
@@ -2599,7 +2751,9 @@
 // the pattern match of an address expression.
 SWPointer::SWPointer(SWPointer* p) :
   _mem(p->_mem), _slp(p->_slp),  _base(NULL),  _adr(NULL),
-  _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {}
+  _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
+  _nstack(p->_nstack), _analyze_only(p->_analyze_only),
+  _stack_idx(p->_stack_idx) {}
 
 //------------------------scaled_iv_plus_offset--------------------
 // Match: k*iv + offset
@@ -2642,6 +2796,9 @@
     _scale = 1;
     return true;
   }
+  if (_analyze_only && (invariant(n) == false)) {
+    _nstack->push(n, _stack_idx++);
+  }
   int opc = n->Opcode();
   if (opc == Op_MulI) {
     if (n->in(1) == iv() && n->in(2)->is_Con()) {
@@ -2699,6 +2856,9 @@
     return false;
   }
   if (_invar != NULL) return false; // already have an invariant
+  if (_analyze_only && (invariant(n) == false)) {
+    _nstack->push(n, _stack_idx++);
+  }
   if (opc == Op_AddI) {
     if (n->in(2)->is_Con() && invariant(n->in(1))) {
       _negate_invar = negate;
--- a/src/share/vm/opto/superword.hpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/opto/superword.hpp	Fri Jun 19 15:24:07 2015 -0700
@@ -239,12 +239,15 @@
  public:
   SuperWord(PhaseIdealLoop* phase);
 
-  void transform_loop(IdealLoopTree* lpt);
+  void transform_loop(IdealLoopTree* lpt, bool do_optimization);
+
+  void unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor);
 
   // Accessors for SWPointer
   PhaseIdealLoop* phase()          { return _phase; }
   IdealLoopTree* lpt()             { return _lpt; }
   PhiNode* iv()                    { return _iv; }
+  bool early_return()              { return _early_return; }
 
  private:
   IdealLoopTree* _lpt;             // Current loop tree node
@@ -252,6 +255,7 @@
   Node*          _bb;              // Current basic block
   PhiNode*       _iv;              // Induction var
   bool           _race_possible;   // In cases where SDMU is true
+  bool           _early_return;    // True if we do not initialize
   bool           _do_vector_loop;  // whether to do vectorization/simd style
   bool           _vector_loop_debug; // provide more printing in debug mode
   int            _num_work_vecs;   // Number of non memory vector operations
@@ -462,15 +466,18 @@
 // Information about an address for dependence checking and vector alignment
 class SWPointer VALUE_OBJ_CLASS_SPEC {
  protected:
-  MemNode*   _mem;     // My memory reference node
-  SuperWord* _slp;     // SuperWord class
+  MemNode*   _mem;           // My memory reference node
+  SuperWord* _slp;           // SuperWord class
 
-  Node* _base;         // NULL if unsafe nonheap reference
-  Node* _adr;          // address pointer
-  jint  _scale;        // multiplier for iv (in bytes), 0 if no loop iv
-  jint  _offset;       // constant offset (in bytes)
-  Node* _invar;        // invariant offset (in bytes), NULL if none
-  bool  _negate_invar; // if true then use: (0 - _invar)
+  Node* _base;               // NULL if unsafe nonheap reference
+  Node* _adr;                // address pointer
+  jint  _scale;              // multiplier for iv (in bytes), 0 if no loop iv
+  jint  _offset;             // constant offset (in bytes)
+  Node* _invar;              // invariant offset (in bytes), NULL if none
+  bool  _negate_invar;       // if true then use: (0 - _invar)
+  Node_Stack* _nstack;       // stack used to record a swpointer trace of variants
+  bool        _analyze_only; // Used in loop unrolling only for swpointer trace
+  uint        _stack_idx;    // Used in loop unrolling only for swpointer trace
 
   PhaseIdealLoop* phase() { return _slp->phase(); }
   IdealLoopTree*  lpt()   { return _slp->lpt(); }
@@ -497,7 +504,7 @@
     NotComparable = (Less | Greater | Equal)
   };
 
-  SWPointer(MemNode* mem, SuperWord* slp);
+  SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only);
   // Following is used to create a temporary object during
   // the pattern match of an address expression.
   SWPointer(SWPointer* p);
@@ -505,14 +512,15 @@
   bool valid()  { return _adr != NULL; }
   bool has_iv() { return _scale != 0; }
 
-  Node* base()            { return _base; }
-  Node* adr()             { return _adr; }
-  MemNode* mem()          { return _mem; }
-  int   scale_in_bytes()  { return _scale; }
-  Node* invar()           { return _invar; }
-  bool  negate_invar()    { return _negate_invar; }
-  int   offset_in_bytes() { return _offset; }
-  int   memory_size()     { return _mem->memory_size(); }
+  Node* base()             { return _base; }
+  Node* adr()              { return _adr; }
+  MemNode* mem()           { return _mem; }
+  int   scale_in_bytes()   { return _scale; }
+  Node* invar()            { return _invar; }
+  bool  negate_invar()     { return _negate_invar; }
+  int   offset_in_bytes()  { return _offset; }
+  int   memory_size()      { return _mem->memory_size(); }
+  Node_Stack* node_stack() { return _nstack; }
 
   // Comparable?
   int cmp(SWPointer& q) {
--- a/src/share/vm/runtime/globals.hpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/runtime/globals.hpp	Fri Jun 19 15:24:07 2015 -0700
@@ -674,6 +674,9 @@
   product(bool, UseSHA, false,                                              \
           "Control whether SHA instructions can be used on SPARC")          \
                                                                             \
+  product(bool, UseGHASHIntrinsics, false,                                  \
+          "Use intrinsics for GHASH versions of crypto")                    \
+                                                                            \
   product(size_t, LargePageSizeInBytes, 0,                                  \
           "Large page size (0 to let VM choose the page size)")             \
                                                                             \
--- a/src/share/vm/runtime/stubRoutines.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/runtime/stubRoutines.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -125,6 +125,7 @@
 address StubRoutines::_aescrypt_decryptBlock               = NULL;
 address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL;
 address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL;
+address StubRoutines::_ghash_processBlocks                 = NULL;
 
 address StubRoutines::_sha1_implCompress     = NULL;
 address StubRoutines::_sha1_implCompressMB   = NULL;
--- a/src/share/vm/runtime/stubRoutines.hpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/runtime/stubRoutines.hpp	Fri Jun 19 15:24:07 2015 -0700
@@ -185,6 +185,7 @@
   static address _aescrypt_decryptBlock;
   static address _cipherBlockChaining_encryptAESCrypt;
   static address _cipherBlockChaining_decryptAESCrypt;
+  static address _ghash_processBlocks;
 
   static address _sha1_implCompress;
   static address _sha1_implCompressMB;
@@ -346,6 +347,7 @@
   static address aescrypt_decryptBlock()                { return _aescrypt_decryptBlock; }
   static address cipherBlockChaining_encryptAESCrypt()  { return _cipherBlockChaining_encryptAESCrypt; }
   static address cipherBlockChaining_decryptAESCrypt()  { return _cipherBlockChaining_decryptAESCrypt; }
+  static address ghash_processBlocks() { return _ghash_processBlocks; }
 
   static address sha1_implCompress()     { return _sha1_implCompress; }
   static address sha1_implCompressMB()   { return _sha1_implCompressMB; }
--- a/src/share/vm/runtime/vmStructs.cpp	Thu Jun 18 22:38:36 2015 -0700
+++ b/src/share/vm/runtime/vmStructs.cpp	Fri Jun 19 15:24:07 2015 -0700
@@ -828,6 +828,7 @@
      static_field(StubRoutines,                _aescrypt_decryptBlock,                        address)                               \
      static_field(StubRoutines,                _cipherBlockChaining_encryptAESCrypt,          address)                               \
      static_field(StubRoutines,                _cipherBlockChaining_decryptAESCrypt,          address)                               \
+     static_field(StubRoutines,                _ghash_processBlocks,                          address)                               \
      static_field(StubRoutines,                _updateBytesCRC32,                             address)                               \
      static_field(StubRoutines,                _crc_table_adr,                                address)                               \
      static_field(StubRoutines,                _multiplyToLen,                                address)                               \
--- a/test/compiler/codegen/7184394/TestAESBase.java	Thu Jun 18 22:38:36 2015 -0700
+++ b/test/compiler/codegen/7184394/TestAESBase.java	Fri Jun 19 15:24:07 2015 -0700
@@ -31,6 +31,7 @@
 import java.util.Random;
 import javax.crypto.Cipher;
 import javax.crypto.SecretKey;
+import javax.crypto.spec.GCMParameterSpec;
 import javax.crypto.spec.IvParameterSpec;
 import javax.crypto.spec.SecretKeySpec;
 
@@ -62,6 +63,10 @@
   Cipher dCipher;
   AlgorithmParameters algParams;
   SecretKey key;
+  GCMParameterSpec gcm_spec;
+  byte[] aad;
+  int tlen = 12;
+  byte[] iv;
 
   static int numThreads = 0;
   int  threadId;
@@ -100,6 +105,12 @@
         int ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0);
         IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]);
         cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
+      } else if (mode.equals("GCM")) {
+          iv = new byte[64];
+          random.nextBytes(iv);
+          aad = new byte[5];
+          random.nextBytes(aad);
+          gcm_init();
       } else {
         algParams = cipher.getParameters();
         cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
@@ -186,4 +197,12 @@
   }
 
   abstract void childShowCipher();
+
+  void gcm_init() throws Exception {
+    tlen = 12;
+    gcm_spec = new GCMParameterSpec(tlen * 8, iv);
+    cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
+    cipher.init(Cipher.ENCRYPT_MODE, key, gcm_spec);
+    cipher.update(aad);
+  }
 }
--- a/test/compiler/codegen/7184394/TestAESEncode.java	Thu Jun 18 22:38:36 2015 -0700
+++ b/test/compiler/codegen/7184394/TestAESEncode.java	Fri Jun 19 15:24:07 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -32,7 +32,11 @@
   @Override
   public void run() {
     try {
-      if (!noReinit) cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
+      if (mode.equals("GCM")) {
+        gcm_init();
+      } else if (!noReinit) {
+        cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
+      }
       encode = new byte[encodeLength];
       if (testingMisalignment) {
         int tempSize = cipher.update(input, encInputOffset, (msgSize - lastChunkSize), encode, encOutputOffset);
--- a/test/compiler/codegen/7184394/TestAESMain.java	Thu Jun 18 22:38:36 2015 -0700
+++ b/test/compiler/codegen/7184394/TestAESMain.java	Fri Jun 19 15:24:07 2015 -0700
@@ -44,6 +44,13 @@
  * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
  * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
  * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
  *
  * @author Tom Deneau
  */