changeset 8043:0d78aecb0948

8152172: PPC64: Support AES intrinsics Summary: Add support for AES intrinsics on PPC64. Reviewed-by: kvn, mdoerr, simonis, zmajo Contributed-by: Hiroshi H Horii <horii@jp.ibm.com>
author simonis
date Wed, 10 Aug 2016 14:59:21 +0200
parents 1f6b58e0bd06
children db2cffccdb85
files src/cpu/ppc/vm/assembler_ppc.hpp src/cpu/ppc/vm/assembler_ppc.inline.hpp src/cpu/ppc/vm/stubGenerator_ppc.cpp src/cpu/ppc/vm/vm_version_ppc.cpp src/cpu/ppc/vm/vm_version_ppc.hpp src/share/vm/opto/graphKit.cpp src/share/vm/opto/library_call.cpp
diffstat 7 files changed, 484 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/ppc/vm/assembler_ppc.hpp	Fri Aug 05 10:47:35 2016 +0000
+++ b/src/cpu/ppc/vm/assembler_ppc.hpp	Wed Aug 10 14:59:21 2016 +0200
@@ -589,6 +589,7 @@
     VNOR_OPCODE    = (4u  << OPCODE_SHIFT | 1284u     ),
     VOR_OPCODE     = (4u  << OPCODE_SHIFT | 1156u     ),
     VXOR_OPCODE    = (4u  << OPCODE_SHIFT | 1220u     ),
+    VRLD_OPCODE    = (4u  << OPCODE_SHIFT |  196u     ),
     VRLB_OPCODE    = (4u  << OPCODE_SHIFT |    4u     ),
     VRLW_OPCODE    = (4u  << OPCODE_SHIFT |  132u     ),
     VRLH_OPCODE    = (4u  << OPCODE_SHIFT |   68u     ),
@@ -1918,6 +1919,7 @@
   inline void vnor(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vor(      VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vxor(     VectorRegister d, VectorRegister a, VectorRegister b);
+  inline void vrld(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vrlb(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vrlw(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vrlh(     VectorRegister d, VectorRegister a, VectorRegister b);
--- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Fri Aug 05 10:47:35 2016 +0000
+++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Wed Aug 10 14:59:21 2016 +0200
@@ -739,6 +739,7 @@
 inline void Assembler::vnor(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VNOR_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vor(     VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VOR_OPCODE      | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vxor(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VXOR_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
+inline void Assembler::vrld(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLD_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vrlb(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLB_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vrlw(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLW_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vrlh(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLH_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Fri Aug 05 10:47:35 2016 +0000
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Wed Aug 10 14:59:21 2016 +0200
@@ -1961,6 +1961,434 @@
     return start;
   }
 
+  // Arguments for generated stub (little endian only):
+  //   R3_ARG1   - source byte array address
+  //   R4_ARG2   - destination byte array address
+  //   R5_ARG3   - round key array
+  address generate_aescrypt_encryptBlock() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+
+    address start = __ function_entry();
+
+    Label L_doLast;
+
+    Register from           = R3_ARG1;  // source array address
+    Register to             = R4_ARG2;  // destination array address
+    Register key            = R5_ARG3;  // round key array
+
+    Register keylen         = R8;
+    Register temp           = R9;
+    Register keypos         = R10;
+    Register hex            = R11;
+    Register fifteen        = R12;
+
+    VectorRegister vRet     = VR0;
+
+    VectorRegister vKey1    = VR1;
+    VectorRegister vKey2    = VR2;
+    VectorRegister vKey3    = VR3;
+    VectorRegister vKey4    = VR4;
+
+    VectorRegister fromPerm = VR5;
+    VectorRegister keyPerm  = VR6;
+    VectorRegister toPerm   = VR7;
+    VectorRegister fSplt    = VR8;
+
+    VectorRegister vTmp1    = VR9;
+    VectorRegister vTmp2    = VR10;
+    VectorRegister vTmp3    = VR11;
+    VectorRegister vTmp4    = VR12;
+
+    VectorRegister vLow     = VR13;
+    VectorRegister vHigh    = VR14;
+
+    __ li              (hex, 16);
+    __ li              (fifteen, 15);
+    __ vspltisb        (fSplt, 0x0f);
+
+    // load unaligned from[0-15] to vsRet
+    __ lvx             (vRet, from);
+    __ lvx             (vTmp1, fifteen, from);
+    __ lvsl            (fromPerm, from);
+    __ vxor            (fromPerm, fromPerm, fSplt);
+    __ vperm           (vRet, vRet, vTmp1, fromPerm);
+
+    // load keylen (44 or 52 or 60)
+    __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
+
+    // to load keys
+    __ lvsr            (keyPerm, key);
+    __ vxor            (vTmp2, vTmp2, vTmp2);
+    __ vspltisb        (vTmp2, -16);
+    __ vrld            (keyPerm, keyPerm, vTmp2);
+    __ vrld            (keyPerm, keyPerm, vTmp2);
+    __ vsldoi          (keyPerm, keyPerm, keyPerm, -8);
+
+    // load the 1st round key to vKey1
+    __ li              (keypos, 0);
+    __ lvx             (vKey1, keypos, key);
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey1, vTmp1, vKey1, keyPerm);
+
+    // 1st round
+    __ vxor (vRet, vRet, vKey1);
+
+    // load the 2nd round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 3rd round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // load the 4th round key to vKey3
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
+
+    // load the 5th round key to vKey4
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
+
+    // 2nd - 5th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher (vRet, vRet, vKey3);
+    __ vcipher (vRet, vRet, vKey4);
+
+    // load the 6th round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 7th round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // load the 8th round key to vKey3
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
+
+    // load the 9th round key to vKey4
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
+
+    // 6th - 9th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher (vRet, vRet, vKey3);
+    __ vcipher (vRet, vRet, vKey4);
+
+    // load the 10th round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 11th round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // if all round keys are loaded, skip next 4 rounds
+    __ cmpwi           (CCR0, keylen, 44);
+    __ beq             (CCR0, L_doLast);
+
+    // 10th - 11th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+
+    // load the 12th round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 13th round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // if all round keys are loaded, skip next 2 rounds
+    __ cmpwi           (CCR0, keylen, 52);
+    __ beq             (CCR0, L_doLast);
+
+    // 12th - 13th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+
+    // load the 14th round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 15th round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    __ bind(L_doLast);
+
+    // last two rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipherlast (vRet, vRet, vKey2);
+
+    __ neg             (temp, to);
+    __ lvsr            (toPerm, temp);
+    __ vspltisb        (vTmp2, -1);
+    __ vxor            (vTmp1, vTmp1, vTmp1);
+    __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
+    __ vxor            (toPerm, toPerm, fSplt);
+    __ lvx             (vTmp1, to);
+    __ vperm           (vRet, vRet, vRet, toPerm);
+    __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
+    __ lvx             (vTmp4, fifteen, to);
+    __ stvx            (vTmp1, to);
+    __ vsel            (vRet, vRet, vTmp4, vTmp2);
+    __ stvx            (vRet, fifteen, to);
+
+    __ blr();
+     return start;
+  }
+
+  // Arguments for generated stub (little endian only):
+  //   R3_ARG1   - source byte array address
+  //   R4_ARG2   - destination byte array address
+  //   R5_ARG3   - K (key) in little endian int array
+  address generate_aescrypt_decryptBlock() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+
+    address start = __ function_entry();
+
+    Label L_doLast;
+    Label L_do44;
+    Label L_do52;
+    Label L_do60;
+
+    Register from           = R3_ARG1;  // source array address
+    Register to             = R4_ARG2;  // destination array address
+    Register key            = R5_ARG3;  // round key array
+
+    Register keylen         = R8;
+    Register temp           = R9;
+    Register keypos         = R10;
+    Register hex            = R11;
+    Register fifteen        = R12;
+
+    VectorRegister vRet     = VR0;
+
+    VectorRegister vKey1    = VR1;
+    VectorRegister vKey2    = VR2;
+    VectorRegister vKey3    = VR3;
+    VectorRegister vKey4    = VR4;
+    VectorRegister vKey5    = VR5;
+
+    VectorRegister fromPerm = VR6;
+    VectorRegister keyPerm  = VR7;
+    VectorRegister toPerm   = VR8;
+    VectorRegister fSplt    = VR9;
+
+    VectorRegister vTmp1    = VR10;
+    VectorRegister vTmp2    = VR11;
+    VectorRegister vTmp3    = VR12;
+    VectorRegister vTmp4    = VR13;
+
+    VectorRegister vLow     = VR14;
+    VectorRegister vHigh    = VR15;
+
+    __ li              (hex, 16);
+    __ li              (fifteen, 15);
+    __ vspltisb        (fSplt, 0x0f);
+
+    // load unaligned from[0-15] to vsRet
+    __ lvx             (vRet, from);
+    __ lvx             (vTmp1, fifteen, from);
+    __ lvsl            (fromPerm, from);
+    __ vxor            (fromPerm, fromPerm, fSplt);
+    __ vperm           (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
+
+    // load keylen (44 or 52 or 60)
+    __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
+
+    // to load keys
+    __ lvsr            (keyPerm, key);
+    __ vxor            (vTmp2, vTmp2, vTmp2);
+    __ vspltisb        (vTmp2, -16);
+    __ vrld            (keyPerm, keyPerm, vTmp2);
+    __ vrld            (keyPerm, keyPerm, vTmp2);
+    __ vsldoi          (keyPerm, keyPerm, keyPerm, -8);
+
+    __ cmpwi           (CCR0, keylen, 44);
+    __ beq             (CCR0, L_do44);
+
+    __ cmpwi           (CCR0, keylen, 52);
+    __ beq             (CCR0, L_do52);
+
+    // load the 15th round key to vKey11
+    __ li              (keypos, 240);
+    __ lvx             (vTmp1, keypos, key);
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
+
+    // load the 14th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
+
+    // load the 13th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
+
+    // load the 12th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
+
+    // load the 11th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
+
+    // 1st - 5th rounds
+    __ vxor            (vRet, vRet, vKey1);
+    __ vncipher        (vRet, vRet, vKey2);
+    __ vncipher        (vRet, vRet, vKey3);
+    __ vncipher        (vRet, vRet, vKey4);
+    __ vncipher        (vRet, vRet, vKey5);
+
+    __ b               (L_doLast);
+
+    __ bind            (L_do52);
+
+    // load the 13th round key to vKey11
+    __ li              (keypos, 208);
+    __ lvx             (vTmp1, keypos, key);
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
+
+    // load the 12th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
+
+    // load the 11th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
+
+    // 1st - 3rd rounds
+    __ vxor            (vRet, vRet, vKey1);
+    __ vncipher        (vRet, vRet, vKey2);
+    __ vncipher        (vRet, vRet, vKey3);
+
+    __ b               (L_doLast);
+
+    __ bind            (L_do44);
+
+    // load the 11th round key to vKey11
+    __ li              (keypos, 176);
+    __ lvx             (vTmp1, keypos, key);
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
+
+    // 1st round
+    __ vxor            (vRet, vRet, vKey1);
+
+    __ bind            (L_doLast);
+
+    // load the 10th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 9th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // load the 8th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
+
+    // load the 7th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
+
+    // load the 6th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey5, vTmp2, vTmp1, keyPerm);
+
+    // last 10th - 6th rounds
+    __ vncipher        (vRet, vRet, vKey1);
+    __ vncipher        (vRet, vRet, vKey2);
+    __ vncipher        (vRet, vRet, vKey3);
+    __ vncipher        (vRet, vRet, vKey4);
+    __ vncipher        (vRet, vRet, vKey5);
+
+    // load the 5th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
+
+    // load the 4th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
+
+    // load the 3rd round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
+
+    // load the 2nd round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
+
+    // load the 1st round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
+
+    // last 5th - 1th rounds
+    __ vncipher        (vRet, vRet, vKey1);
+    __ vncipher        (vRet, vRet, vKey2);
+    __ vncipher        (vRet, vRet, vKey3);
+    __ vncipher        (vRet, vRet, vKey4);
+    __ vncipherlast    (vRet, vRet, vKey5);
+
+    __ neg             (temp, to);
+    __ lvsr            (toPerm, temp);
+    __ vspltisb        (vTmp2, -1);
+    __ vxor            (vTmp1, vTmp1, vTmp1);
+    __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
+    __ vxor            (toPerm, toPerm, fSplt);
+    __ lvx             (vTmp1, to);
+    __ vperm           (vRet, vRet, vRet, toPerm);
+    __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
+    __ lvx             (vTmp4, fifteen, to);
+    __ stvx            (vTmp1, to);
+    __ vsel            (vRet, vRet, vTmp4, vTmp2);
+    __ stvx            (vRet, fifteen, to);
+
+    __ blr();
+     return start;
+  }
+
   void generate_arraycopy_stubs() {
     // Note: the disjoint stubs must be generated first, some of
     // the conjoint stubs use them.
@@ -2083,10 +2511,6 @@
     // arraycopy stubs used by compilers
     generate_arraycopy_stubs();
 
-    if (UseAESIntrinsics) {
-      guarantee(!UseAESIntrinsics, "not yet implemented.");
-    }
-
     // Safefetch stubs.
     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                        &StubRoutines::_safefetch32_fault_pc,
@@ -2094,6 +2518,12 @@
     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                        &StubRoutines::_safefetchN_fault_pc,
                                                        &StubRoutines::_safefetchN_continuation_pc);
+
+    if (UseAESIntrinsics) {
+      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+    }
+
   }
 
  public:
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Fri Aug 05 10:47:35 2016 +0000
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Wed Aug 10 14:59:21 2016 +0200
@@ -102,7 +102,7 @@
   // Create and print feature-string.
   char buf[(num_features+1) * 16]; // Max 16 chars per feature.
   jio_snprintf(buf, sizeof(buf),
-               "ppc64%s%s%s%s%s%s%s%s",
+               "ppc64%s%s%s%s%s%s%s%s%s",
                (has_fsqrt()   ? " fsqrt"   : ""),
                (has_isel()    ? " isel"    : ""),
                (has_lxarxeh() ? " lxarxeh" : ""),
@@ -111,7 +111,8 @@
                (has_popcntb() ? " popcntb" : ""),
                (has_popcntw() ? " popcntw" : ""),
                (has_fcfids()  ? " fcfids"  : ""),
-               (has_vand()    ? " vand"    : "")
+               (has_vand()    ? " vand"    : ""),
+               (has_vcipher() ? " aes"     : "")
                // Make sure number of %s matches num_features!
               );
   _features_str = strdup(buf);
@@ -156,6 +157,28 @@
   }
 
   // The AES intrinsic stubs require AES instruction support.
+#if defined(VM_LITTLE_ENDIAN)
+  if (has_vcipher()) {
+    if (FLAG_IS_DEFAULT(UseAES)) {
+      UseAES = true;
+    }
+  } else if (UseAES) {
+    if (!FLAG_IS_DEFAULT(UseAES))
+      warning("AES instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAES, false);
+  }
+
+  if (UseAES && has_vcipher()) {
+    if (FLAG_IS_DEFAULT(UseAESIntrinsics)) {
+      UseAESIntrinsics = true;
+    }
+  } else if (UseAESIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseAESIntrinsics))
+      warning("AES intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESIntrinsics, false);
+  }
+
+#else
   if (UseAES) {
     warning("AES instructions are not available on this CPU");
     FLAG_SET_DEFAULT(UseAES, false);
@@ -165,6 +188,7 @@
       warning("AES intrinsics are not available on this CPU");
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
+#endif
 
   if (UseSHA) {
     warning("SHA instructions are not available on this CPU");
@@ -452,6 +476,7 @@
   a->popcntw(R7, R5);                          // code[7] -> popcntw
   a->fcfids(F3, F4);                           // code[8] -> fcfids
   a->vand(VR0, VR0, VR0);                      // code[9] -> vand
+  a->vcipher(VR0, VR1, VR2);                   // code[10] -> vcipher
   a->blr();
 
   // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
@@ -495,6 +520,7 @@
   if (code[feature_cntr++]) features |= popcntw_m;
   if (code[feature_cntr++]) features |= fcfids_m;
   if (code[feature_cntr++]) features |= vand_m;
+  if (code[feature_cntr++]) features |= vcipher_m;
 
   // Print the detection code.
   if (PrintAssembly) {
--- a/src/cpu/ppc/vm/vm_version_ppc.hpp	Fri Aug 05 10:47:35 2016 +0000
+++ b/src/cpu/ppc/vm/vm_version_ppc.hpp	Wed Aug 10 14:59:21 2016 +0200
@@ -42,6 +42,7 @@
     fcfids,
     vand,
     dcba,
+    vcipher,
     num_features // last entry to count features
   };
   enum Feature_Flag_Set {
@@ -56,6 +57,7 @@
     fcfids_m              = (1 << fcfids ),
     vand_m                = (1 << vand   ),
     dcba_m                = (1 << dcba   ),
+    vcipher_m             = (1 << vcipher),
     all_features_m        = -1
   };
   static int  _features;
@@ -83,6 +85,7 @@
   static bool has_fcfids()  { return (_features & fcfids_m) != 0; }
   static bool has_vand()    { return (_features & vand_m) != 0; }
   static bool has_dcba()    { return (_features & dcba_m) != 0; }
+  static bool has_vcipher() { return (_features & vcipher_m) != 0; }
 
   static const char* cpu_features() { return _features_str; }
 
--- a/src/share/vm/opto/graphKit.cpp	Fri Aug 05 10:47:35 2016 +0000
+++ b/src/share/vm/opto/graphKit.cpp	Wed Aug 10 14:59:21 2016 +0200
@@ -1683,6 +1683,9 @@
   const Type* elemtype = arytype->elem();
   BasicType elembt = elemtype->array_element_basic_type();
   Node* adr = array_element_address(ary, idx, elembt, arytype->size());
+  if (elembt == T_NARROWOOP) {
+    elembt = T_OBJECT; // To satisfy switch in LoadNode::make()
+  }
   Node* ld = make_load(ctl, adr, elemtype, elembt, arytype, MemNode::unordered);
   return ld;
 }
--- a/src/share/vm/opto/library_call.cpp	Fri Aug 05 10:47:35 2016 +0000
+++ b/src/share/vm/opto/library_call.cpp	Wed Aug 10 14:59:21 2016 +0200
@@ -6459,7 +6459,20 @@
 
 //------------------------------get_key_start_from_aescrypt_object-----------------------
 Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) {
+#ifdef PPC64
+  // MixColumns for decryption can be reduced by preprocessing MixColumns with round keys.
+  // Intel's extention is based on this optimization and AESCrypt generates round keys by preprocessing MixColumns.
+  // However, ppc64 vncipher processes MixColumns and requires the same round keys with encryption.
+  // The ppc64 stubs of encryption and decryption use the same round keys (sessionK[0]).
+  Node* objSessionK = load_field_from_object(aescrypt_object, "sessionK", "[[I", /*is_exact*/ false);
+  assert (objSessionK != NULL, "wrong version of com.sun.crypto.provider.AESCrypt");
+  if (objSessionK == NULL) {
+    return (Node *) NULL;
+  }
+  Node* objAESCryptKey = load_array_element(control(), objSessionK, intcon(0), TypeAryPtr::OOPS);
+#else
   Node* objAESCryptKey = load_field_from_object(aescrypt_object, "K", "[I", /*is_exact*/ false);
+#endif // PPC64
   assert (objAESCryptKey != NULL, "wrong version of com.sun.crypto.provider.AESCrypt");
   if (objAESCryptKey == NULL) return (Node *) NULL;