changeset 13485:6c7faa516fc6

6340864: Implement vectorization optimizations in hotspot-server. Summary: Added asm encoding and mach nodes for vector arithmetic instructions on x86. Reviewed-by: roland
author kvn
date Mon, 20 Aug 2012 09:07:21 -0700
parents d3fc5d192448
children 4f0635e148c1
files hotspot/src/cpu/x86/vm/assembler_x86.cpp hotspot/src/cpu/x86/vm/assembler_x86.hpp hotspot/src/cpu/x86/vm/x86.ad hotspot/src/cpu/x86/vm/x86_32.ad hotspot/src/cpu/x86/vm/x86_64.ad hotspot/src/share/vm/opto/classes.hpp hotspot/src/share/vm/opto/loopnode.cpp hotspot/src/share/vm/opto/superword.cpp hotspot/src/share/vm/opto/vectornode.cpp hotspot/src/share/vm/opto/vectornode.hpp hotspot/test/compiler/6340864/TestByteVect.java hotspot/test/compiler/6340864/TestDoubleVect.java hotspot/test/compiler/6340864/TestFloatVect.java hotspot/test/compiler/6340864/TestIntVect.java hotspot/test/compiler/6340864/TestLongVect.java hotspot/test/compiler/6340864/TestShortVect.java
diffstat 16 files changed, 9105 insertions(+), 604 deletions(-) [+]
line wrap: on
line diff
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Wed Aug 15 16:49:38 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon Aug 20 09:07:21 2012 -0700
@@ -999,32 +999,22 @@
 
 void Assembler::addsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x58);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::addsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x58);
-  emit_operand(dst, src);
+  emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::addss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x58);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::addss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x58);
-  emit_operand(dst, src);
+  emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::andl(Address dst, int32_t imm32) {
@@ -1052,36 +1042,6 @@
   emit_arith(0x23, 0xC0, dst, src);
 }
 
-void Assembler::andpd(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x54);
-  emit_operand(dst, src);
-}
-
-void Assembler::andpd(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x54);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::andps(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
-  emit_byte(0x54);
-  emit_operand(dst, src);
-}
-
-void Assembler::andps(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
-  emit_byte(0x54);
-  emit_byte(0xC0 | encode);
-}
-
 void Assembler::bsfl(Register dst, Register src) {
   int encode = prefix_and_encode(dst->encoding(), src->encoding());
   emit_byte(0x0F);
@@ -1246,61 +1206,42 @@
   // NOTE: dbx seems to decode this as comiss even though the
   // 0x66 is there. Strangly ucomisd comes out correct
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66);
-  emit_byte(0x2F);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::comisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
-  emit_byte(0x2F);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::comiss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_NONE);
-  emit_byte(0x2F);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::comiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
-  emit_byte(0x2F);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
-  emit_byte(0xE6);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
-  emit_byte(0x5B);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x5A);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x5A);
-  emit_operand(dst, src);
+  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
@@ -1312,10 +1253,7 @@
 
 void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x2A);
-  emit_operand(dst, src);
+  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
@@ -1327,25 +1265,17 @@
 
 void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x2A);
-  emit_operand(dst, src);
+  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x5A);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::cvtss2sd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x5A);
-  emit_operand(dst, src);
+  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
 }
 
 
@@ -1373,32 +1303,22 @@
 
 void Assembler::divsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x5E);
-  emit_operand(dst, src);
+  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::divsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x5E);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::divss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x5E);
-  emit_operand(dst, src);
+  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::divss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x5E);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::emms() {
@@ -1634,16 +1554,12 @@
 
 void Assembler::movapd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
-  emit_byte(0x28);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::movaps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
-  emit_byte(0x28);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
@@ -1712,24 +1628,17 @@
 
 void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
-  emit_byte(0x6F);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::movdqu(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F3);
-  emit_byte(0x6F);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
-  emit_byte(0x6F);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::movdqu(Address dst, XMMRegister src) {
@@ -1810,10 +1719,7 @@
 // The selection is done in MacroAssembler::movdbl() and movflt().
 void Assembler::movlpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x12);
-  emit_operand(dst, src);
+  emit_simd_arith(0x12, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::movq( MMXRegister dst, Address src ) {
@@ -1870,17 +1776,12 @@
 
 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x10);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x10, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::movsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F2);
-  emit_byte(0x10);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::movsd(Address dst, XMMRegister src) {
@@ -1893,17 +1794,12 @@
 
 void Assembler::movss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x10);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x10, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::movss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F3);
-  emit_byte(0x10);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::movss(Address dst, XMMRegister src) {
@@ -2001,32 +1897,22 @@
 
 void Assembler::mulsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x59);
-  emit_operand(dst, src);
+  emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x59);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::mulss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x59);
-  emit_operand(dst, src);
+  emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::mulss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x59);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::negl(Register dst) {
@@ -2315,17 +2201,12 @@
 void Assembler::packuswb(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x67);
-  emit_operand(dst, src);
+  emit_simd_arith(0x67, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x67);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x67, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
@@ -2339,7 +2220,7 @@
 
 void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
   emit_byte(0x61);
   emit_byte(0xC0 | encode);
   emit_byte(imm8);
@@ -2355,7 +2236,7 @@
 
 void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
   emit_byte(0x30);
   emit_byte(0xC0 | encode);
 }
@@ -2456,28 +2337,10 @@
   a_byte(p);
 }
 
-void Assembler::por(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0xEB);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::por(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0xEB);
-  emit_operand(dst, src);
-}
-
 void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
-  emit_byte(0x70);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_66);
   emit_byte(mode & 0xFF);
 
 }
@@ -2496,9 +2359,7 @@
 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
-  emit_byte(0x70);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2);
   emit_byte(mode & 0xFF);
 }
 
@@ -2513,18 +2374,6 @@
   emit_byte(mode & 0xFF);
 }
 
-void Assembler::psrlq(XMMRegister dst, int shift) {
-  // Shift 64 bit value logically right by specified number of bits.
-  // HMM Table D-1 says sse2 or mmx.
-  // Do not confuse it with psrldq SSE2 instruction which
-  // shifts 128 bit value in xmm register by number of bytes.
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
-  emit_byte(0x73);
-  emit_byte(0xC0 | encode);
-  emit_byte(shift);
-}
-
 void Assembler::psrldq(XMMRegister dst, int shift) {
   // Shift 128 bit value in xmm register by number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@@ -2545,7 +2394,7 @@
 
 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
   emit_byte(0x17);
   emit_byte(0xC0 | encode);
 }
@@ -2553,40 +2402,28 @@
 void Assembler::punpcklbw(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x60);
-  emit_operand(dst, src);
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x60);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::punpckldq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x62);
-  emit_operand(dst, src);
+  emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::punpckldq(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x62);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x6C);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x6C, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::push(int32_t imm32) {
@@ -2616,22 +2453,6 @@
 }
 #endif
 
-void Assembler::pxor(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0xEF);
-  emit_operand(dst, src);
-}
-
-void Assembler::pxor(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0xEF);
-  emit_byte(0xC0 | encode);
-}
-
 void Assembler::rcll(Register dst, int imm8) {
   assert(isShiftCount(imm8), "illegal shift count");
   int encode = prefix_and_encode(dst->encoding());
@@ -2790,32 +2611,22 @@
 
 void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x51);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::sqrtsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x51);
-  emit_operand(dst, src);
+  emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x51);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::sqrtss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x51);
-  emit_operand(dst, src);
+  emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::stmxcsr( Address dst) {
@@ -2865,32 +2676,22 @@
 
 void Assembler::subsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x5C);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::subsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x5C);
-  emit_operand(dst, src);
+  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::subss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x5C);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::subss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x5C);
-  emit_operand(dst, src);
+  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::testb(Register dst, int imm8) {
@@ -2928,32 +2729,22 @@
 
 void Assembler::ucomisd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66);
-  emit_byte(0x2E);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
-  emit_byte(0x2E);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::ucomiss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_NONE);
-  emit_byte(0x2E);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
-  emit_byte(0x2E);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE);
 }
 
 
@@ -2995,211 +2786,714 @@
   emit_arith(0x33, 0xC0, dst, src);
 }
 
+
+// AVX 3-operand scalar floating-point arithmetic instructions
+
+void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+}
+
+void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+}
+
+void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+}
+
+void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+}
+
+void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+}
+
+void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+}
+
+void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+}
+
+void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+}
+
+void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+}
+
+void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+}
+
+void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+}
+
+void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+}
+
+void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+}
+
+void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+}
+
+void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+}
+
+void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+}
+
+//====================VECTOR ARITHMETIC=====================================
+
+// Floating-point vector arithmetic
+
+void Assembler::addpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x58, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::addps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x58, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::subpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x5C, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::subps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x5C, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::mulpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x59, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::mulps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::divpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x5E, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::divps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x5E, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::andpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::andps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::andps(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::andpd(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
 void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x57);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::xorpd(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x57);
-  emit_operand(dst, src);
-}
-
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_66);
+}
 
 void Assembler::xorps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
-  emit_byte(0x57);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::xorpd(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::xorps(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
-  emit_byte(0x57);
-  emit_operand(dst, src);
-}
-
-// AVX 3-operands non destructive source instructions (encoded with VEX prefix)
-
-void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x58);
-  emit_operand(dst, src);
-}
-
-void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
-  assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x58);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x58);
-  emit_operand(dst, src);
-}
-
-void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
-  assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x58);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector
-  emit_byte(0x54);
-  emit_operand(dst, src);
-}
-
-void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector
-  emit_byte(0x54);
-  emit_operand(dst, src);
-}
-
-void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x5E);
-  emit_operand(dst, src);
-}
-
-void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
-  assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x5E);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x5E);
-  emit_operand(dst, src);
-}
-
-void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
-  assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x5E);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x59);
-  emit_operand(dst, src);
-}
-
-void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
-  assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x59);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x59);
-  emit_operand(dst, src);
-}
-
-void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
-  assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x59);
-  emit_byte(0xC0 | encode);
-}
-
-
-void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x5C);
-  emit_operand(dst, src);
-}
-
-void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
-  assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x5C);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x5C);
-  emit_operand(dst, src);
-}
-
-void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
-  assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x5C);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector
-  emit_byte(0x57);
-  emit_operand(dst, src);
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
   assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256);
-  emit_byte(0x57);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector
-  emit_byte(0x57);
-  emit_operand(dst, src);
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256);
 }
 
 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
   assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, vector256);
-  emit_byte(0x57);
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+
+// Integer vector arithmetic
+void Assembler::paddb(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xFC, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::paddw(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xFD, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::paddd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xFE, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::paddq(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xD4, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::psubb(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xF8, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::psubw(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xF9, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::psubd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xFA, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::psubq(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xFB, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::pmullw(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xD5, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::pmulld(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_sse4_1(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0x40);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
+  emit_byte(0x40);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  InstructionMark im(this);
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256);
+  emit_byte(0x40);
+  emit_operand(dst, src);
+}
+
+// Shift packed integers left by specified number of bits.
+void Assembler::psllw(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM6 is for /6 encoding: 66 0F 71 /6 ib
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
+  emit_byte(0x71);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::pslld(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM6 is for /6 encoding: 66 0F 72 /6 ib
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
+  emit_byte(0x72);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psllq(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM6 is for /6 encoding: 66 0F 73 /6 ib
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
+  emit_byte(0x73);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psllw(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::pslld(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xF2, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::psllq(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xF3, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM6 is for /6 encoding: 66 0F 71 /6 ib
+  emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM6 is for /6 encoding: 66 0F 72 /6 ib
+  emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM6 is for /6 encoding: 66 0F 73 /6 ib
+  emit_vex_arith(0x73, xmm6, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF3, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+// Shift packed integers logically right by specified number of bits.
+void Assembler::psrlw(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM2 is for /2 encoding: 66 0F 71 /2 ib
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
+  emit_byte(0x71);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psrld(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM2 is for /2 encoding: 66 0F 72 /2 ib
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
+  emit_byte(0x72);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psrlq(XMMRegister dst, int shift) {
+  // Do not confuse it with psrldq SSE2 instruction which
+  // shifts 128 bit value in xmm register by number of bytes.
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
+  emit_byte(0x73);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psrlw(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::psrld(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xD2, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::psrlq(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xD3, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM2 is for /2 encoding: 66 0F 71 /2 ib
+  emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM2 is for /2 encoding: 66 0F 72 /2 ib
+  emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
+  emit_vex_arith(0x73, xmm2, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD3, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+// Shift packed integers arithmetically right by specified number of bits.
+void Assembler::psraw(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM4 is for /4 encoding: 66 0F 71 /4 ib
+  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66);
+  emit_byte(0x71);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psrad(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM4 is for /4 encoding: 66 0F 72 /4 ib
+  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66);
+  emit_byte(0x72);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psraw(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::psrad(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xE2, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM4 is for /4 encoding: 66 0F 71 /4 ib
+  emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM4 is for /4 encoding: 66 0F 72 /4 ib
+  emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+
+// AND packed integers
+void Assembler::pand(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xDB, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::por(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xEB, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::pxor(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xEF, dst, src, VEX_SIMD_66);
+}
+
 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx2() || (!vector256) && VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256);
-  emit_byte(0xEF);
-  emit_byte(0xC0 | encode);
-}
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
 
 void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
@@ -3805,6 +4099,49 @@
   }
 }
 
+void Assembler::emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) {
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, pre);
+  emit_byte(opcode);
+  emit_operand(dst, src);
+}
+
+void Assembler::emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) {
+  int encode = simd_prefix_and_encode(dst, dst, src, pre);
+  emit_byte(opcode);
+  emit_byte(0xC0 | encode);
+}
+
+// Versions with no second source register (non-destructive source).
+void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) {
+  InstructionMark im(this);
+  simd_prefix(dst, xnoreg, src, pre);
+  emit_byte(opcode);
+  emit_operand(dst, src);
+}
+
+void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) {
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre);
+  emit_byte(opcode);
+  emit_byte(0xC0 | encode);
+}
+
+// 3-operands AVX instructions
+void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
+                               Address src, VexSimdPrefix pre, bool vector256) {
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, pre, vector256);
+  emit_byte(opcode);
+  emit_operand(dst, src);
+}
+
+void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
+                               XMMRegister src, VexSimdPrefix pre, bool vector256) {
+  int encode = vex_prefix_and_encode(dst, nds, src, pre, vector256);
+  emit_byte(opcode);
+  emit_byte(0xC0 | encode);
+}
+
 #ifndef _LP64
 
 void Assembler::incl(Register dst) {
@@ -7968,21 +8305,21 @@
   }
 }
 
-void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
   if (reachable(src)) {
-    vandpd(dst, nds, as_Address(src));
+    vandpd(dst, nds, as_Address(src), vector256);
   } else {
     lea(rscratch1, src);
-    vandpd(dst, nds, Address(rscratch1, 0));
-  }
-}
-
-void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+    vandpd(dst, nds, Address(rscratch1, 0), vector256);
+  }
+}
+
+void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
   if (reachable(src)) {
-    vandps(dst, nds, as_Address(src));
+    vandps(dst, nds, as_Address(src), vector256);
   } else {
     lea(rscratch1, src);
-    vandps(dst, nds, Address(rscratch1, 0));
+    vandps(dst, nds, Address(rscratch1, 0), vector256);
   }
 }
 
@@ -8040,21 +8377,21 @@
   }
 }
 
-void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
   if (reachable(src)) {
-    vxorpd(dst, nds, as_Address(src));
+    vxorpd(dst, nds, as_Address(src), vector256);
   } else {
     lea(rscratch1, src);
-    vxorpd(dst, nds, Address(rscratch1, 0));
-  }
-}
-
-void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+    vxorpd(dst, nds, Address(rscratch1, 0), vector256);
+  }
+}
+
+void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
   if (reachable(src)) {
-    vxorps(dst, nds, as_Address(src));
+    vxorps(dst, nds, as_Address(src), vector256);
   } else {
     lea(rscratch1, src);
-    vxorps(dst, nds, Address(rscratch1, 0));
+    vxorps(dst, nds, Address(rscratch1, 0), vector256);
   }
 }
 
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Wed Aug 15 16:49:38 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon Aug 20 09:07:21 2012 -0700
@@ -617,6 +617,7 @@
                    VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
     simd_prefix(dst, xnoreg, src, pre, opc);
   }
+
   void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) {
     simd_prefix(src, dst, pre);
   }
@@ -626,16 +627,10 @@
     simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w);
   }
 
-
   int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
                              VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
                              bool rex_w = false, bool vector256 = false);
 
-  int simd_prefix_and_encode(XMMRegister dst, XMMRegister src,
-                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
-    return simd_prefix_and_encode(dst, xnoreg, src, pre, opc);
-  }
-
   // Move/convert 32-bit integer value.
   int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
                              VexSimdPrefix pre) {
@@ -677,6 +672,15 @@
   void emit_arith(int op1, int op2, Register dst, jobject obj);
   void emit_arith(int op1, int op2, Register dst, Register src);
 
+  void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
+  void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
+  void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
+  void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
+  void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
+                      Address src, VexSimdPrefix pre, bool vector256);
+  void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
+                      XMMRegister src, VexSimdPrefix pre, bool vector256);
+
   void emit_operand(Register reg,
                     Register base, Register index, Address::ScaleFactor scale,
                     int disp,
@@ -891,12 +895,6 @@
   void andq(Register dst, Address src);
   void andq(Register dst, Register src);
 
-  // Bitwise Logical AND of Packed Double-Precision Floating-Point Values
-  void andpd(XMMRegister dst, XMMRegister src);
-
-  // Bitwise Logical AND of Packed Single-Precision Floating-Point Values
-  void andps(XMMRegister dst, XMMRegister src);
-
   void bsfl(Register dst, Register src);
   void bsrl(Register dst, Register src);
 
@@ -1436,10 +1434,6 @@
   void prefetcht2(Address src);
   void prefetchw(Address src);
 
-  // POR - Bitwise logical OR
-  void por(XMMRegister dst, XMMRegister src);
-  void por(XMMRegister dst, Address src);
-
   // Shuffle Packed Doublewords
   void pshufd(XMMRegister dst, XMMRegister src, int mode);
   void pshufd(XMMRegister dst, Address src,     int mode);
@@ -1448,9 +1442,6 @@
   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
   void pshuflw(XMMRegister dst, Address src,     int mode);
 
-  // Shift Right by bits Logical Quadword Immediate
-  void psrlq(XMMRegister dst, int shift);
-
   // Shift Right by bytes Logical DoubleQuadword Immediate
   void psrldq(XMMRegister dst, int shift);
 
@@ -1475,10 +1466,6 @@
 
   void pushq(Address src);
 
-  // Xor Packed Byte Integer Values
-  void pxor(XMMRegister dst, Address src);
-  void pxor(XMMRegister dst, XMMRegister src);
-
   void rcll(Register dst, int imm8);
 
   void rclq(Register dst, int imm8);
@@ -1601,15 +1588,10 @@
   void xorq(Register dst, Address src);
   void xorq(Register dst, Register src);
 
-  // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
-  void xorpd(XMMRegister dst, XMMRegister src);
-
-  // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
-  void xorps(XMMRegister dst, XMMRegister src);
-
   void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0
 
   // AVX 3-operands scalar instructions (encoded with VEX prefix)
+
   void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
   void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
   void vaddss(XMMRegister dst, XMMRegister nds, Address src);
@@ -1627,14 +1609,147 @@
   void vsubss(XMMRegister dst, XMMRegister nds, Address src);
   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
 
-  // AVX Vector instrucitons.
-  void vandpd(XMMRegister dst, XMMRegister nds, Address src);
-  void vandps(XMMRegister dst, XMMRegister nds, Address src);
-  void vxorpd(XMMRegister dst, XMMRegister nds, Address src);
-  void vxorps(XMMRegister dst, XMMRegister nds, Address src);
+
+  //====================VECTOR ARITHMETIC=====================================
+
+  // Add Packed Floating-Point Values
+  void addpd(XMMRegister dst, XMMRegister src);
+  void addps(XMMRegister dst, XMMRegister src);
+  void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Subtract Packed Floating-Point Values
+  void subpd(XMMRegister dst, XMMRegister src);
+  void subps(XMMRegister dst, XMMRegister src);
+  void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Multiply Packed Floating-Point Values
+  void mulpd(XMMRegister dst, XMMRegister src);
+  void mulps(XMMRegister dst, XMMRegister src);
+  void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Divide Packed Floating-Point Values
+  void divpd(XMMRegister dst, XMMRegister src);
+  void divps(XMMRegister dst, XMMRegister src);
+  void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Bitwise Logical AND of Packed Floating-Point Values
+  void andpd(XMMRegister dst, XMMRegister src);
+  void andps(XMMRegister dst, XMMRegister src);
+  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Bitwise Logical XOR of Packed Floating-Point Values
+  void xorpd(XMMRegister dst, XMMRegister src);
+  void xorps(XMMRegister dst, XMMRegister src);
   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Add packed integers
+  void paddb(XMMRegister dst, XMMRegister src);
+  void paddw(XMMRegister dst, XMMRegister src);
+  void paddd(XMMRegister dst, XMMRegister src);
+  void paddq(XMMRegister dst, XMMRegister src);
+  void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Sub packed integers
+  void psubb(XMMRegister dst, XMMRegister src);
+  void psubw(XMMRegister dst, XMMRegister src);
+  void psubd(XMMRegister dst, XMMRegister src);
+  void psubq(XMMRegister dst, XMMRegister src);
+  void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Multiply packed integers (only shorts and ints)
+  void pmullw(XMMRegister dst, XMMRegister src);
+  void pmulld(XMMRegister dst, XMMRegister src);
+  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Shift left packed integers
+  void psllw(XMMRegister dst, int shift);
+  void pslld(XMMRegister dst, int shift);
+  void psllq(XMMRegister dst, int shift);
+  void psllw(XMMRegister dst, XMMRegister shift);
+  void pslld(XMMRegister dst, XMMRegister shift);
+  void psllq(XMMRegister dst, XMMRegister shift);
+  void vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+  void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+  void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+
+  // Logical shift right packed integers
+  void psrlw(XMMRegister dst, int shift);
+  void psrld(XMMRegister dst, int shift);
+  void psrlq(XMMRegister dst, int shift);
+  void psrlw(XMMRegister dst, XMMRegister shift);
+  void psrld(XMMRegister dst, XMMRegister shift);
+  void psrlq(XMMRegister dst, XMMRegister shift);
+  void vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+  void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+  void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+
+  // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
+  void psraw(XMMRegister dst, int shift);
+  void psrad(XMMRegister dst, int shift);
+  void psraw(XMMRegister dst, XMMRegister shift);
+  void psrad(XMMRegister dst, XMMRegister shift);
+  void vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+  void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+
+  // And packed integers
+  void pand(XMMRegister dst, XMMRegister src);
+  void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Or packed integers
+  void por(XMMRegister dst, XMMRegister src);
+  void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Xor packed integers
+  void pxor(XMMRegister dst, XMMRegister src);
   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Copy low 128bit into high 128bit of YMM registers.
   void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
   void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
 
@@ -2532,11 +2647,13 @@
   void vaddss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddss(dst, nds, src); }
   void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
 
-  void vandpd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vandpd(dst, nds, src); }
-  void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
-
-  void vandps(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vandps(dst, nds, src); }
-  void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandpd(dst, nds, src, vector256); }
+  void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256)     { Assembler::vandpd(dst, nds, src, vector256); }
+  void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
+
+  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandps(dst, nds, src, vector256); }
+  void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256)     { Assembler::vandps(dst, nds, src, vector256); }
+  void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
 
   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); }
   void vdivsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivsd(dst, nds, src); }
@@ -2565,12 +2682,12 @@
   // AVX Vector instructions
 
   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
-  void vxorpd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorpd(dst, nds, src); }
-  void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
+  void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
 
   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
-  void vxorps(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorps(dst, nds, src); }
-  void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+  void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
+  void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
 
   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
     if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2
@@ -2578,6 +2695,12 @@
     else
       Assembler::vxorpd(dst, nds, src, vector256);
   }
+  void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+    if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2
+      Assembler::vpxor(dst, nds, src, vector256);
+    else
+      Assembler::vxorpd(dst, nds, src, vector256);
+  }
 
   // Move packed integer values from low 128 bit to hign 128 bit in 256 bit vector.
   void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
--- a/hotspot/src/cpu/x86/vm/x86.ad	Wed Aug 15 16:49:38 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/x86.ad	Mon Aug 20 09:07:21 2012 -0700
@@ -500,6 +500,24 @@
   0  /*bottom*/
 };
 
+const bool Matcher::match_rule_supported(int opcode) {
+  if (!has_match_rule(opcode))
+    return false;
+  // A match rule exists; reject opcodes whose guarding flag check below fails.
+  switch (opcode) {
+    case Op_PopCountI:
+    case Op_PopCountL:
+      if (!UsePopCountInstruction)
+        return false;
+      break;  // PopCount depends only on UsePopCountInstruction; do not fall into the MulVI check
+    case Op_MulVI:
+      if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
+        return false;
+      break;
+  }
+  return true;  // Per default match rules are supported.
+}
+
 // Max vector size in bytes. 0 if not supported.
 const int Matcher::vector_width_in_bytes(BasicType bt) {
   assert(is_java_primitive(bt), "only primitive type vectors");
@@ -1439,8 +1457,9 @@
   ins_cost(150);
   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
   ins_encode %{
+    bool vector256 = false;
     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(float_signmask()));
+              ExternalAddress(float_signmask()), vector256);
   %}
   ins_pipe(pipe_slow);
 %}
@@ -1464,8 +1483,9 @@
   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
             "# abs double by sign masking" %}
   ins_encode %{
+    bool vector256 = false;
     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(double_signmask()));
+              ExternalAddress(double_signmask()), vector256);
   %}
   ins_pipe(pipe_slow);
 %}
@@ -1487,8 +1507,9 @@
   ins_cost(150);
   format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
   ins_encode %{
+    bool vector256 = false;
     __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(float_signflip()));
+              ExternalAddress(float_signflip()), vector256);
   %}
   ins_pipe(pipe_slow);
 %}
@@ -1512,8 +1533,9 @@
   format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
             "# neg double by sign flipping" %}
   ins_encode %{
+    bool vector256 = false;
     __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(double_signflip()));
+              ExternalAddress(double_signflip()), vector256);
   %}
   ins_pipe(pipe_slow);
 %}
@@ -2382,3 +2404,2416 @@
   ins_pipe( fpu_reg_reg );
 %}
 
+// ====================VECTOR ARITHMETIC=======================================
+
+// --------------------------------- ADD --------------------------------------
+
+// Bytes vector add
+instruct vadd4B(vecS dst, vecS src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVB dst src));
+  format %{ "paddb   $dst,$src\t! add packed4B" %}
+  ins_encode %{
+    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVB src1 src2));
+  format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8B(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (AddVB dst src));
+  format %{ "paddb   $dst,$src\t! add packed8B" %}
+  ins_encode %{
+    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVB src1 src2));
+  format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16B(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (AddVB dst src));
+  format %{ "paddb   $dst,$src\t! add packed16B" %}
+  ins_encode %{
+    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVB src1 src2));
+  format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVB src (LoadVector mem)));
+  format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
+  match(Set dst (AddVB src1 src2));
+  format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
+  match(Set dst (AddVB src (LoadVector mem)));
+  format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Shorts/Chars vector add
+instruct vadd2S(vecS dst, vecS src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVS dst src));
+  format %{ "paddw   $dst,$src\t! add packed2S" %}
+  ins_encode %{
+    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVS src1 src2));
+  format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4S(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVS dst src));
+  format %{ "paddw   $dst,$src\t! add packed4S" %}
+  ins_encode %{
+    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVS src1 src2));
+  format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8S(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (AddVS dst src));
+  format %{ "paddw   $dst,$src\t! add packed8S" %}
+  ins_encode %{
+    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVS src1 src2));
+  format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVS src (LoadVector mem)));
+  format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVS src1 src2));
+  format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVS src (LoadVector mem)));
+  format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Integers vector add
+instruct vadd2I(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVI dst src));
+  format %{ "paddd   $dst,$src\t! add packed2I" %}
+  ins_encode %{
+    __ paddd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVI src1 src2));
+  format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4I(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVI dst src));
+  format %{ "paddd   $dst,$src\t! add packed4I" %}
+  ins_encode %{
+    __ paddd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVI src1 src2));
+  format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVI src (LoadVector mem)));
+  format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVI src1 src2));
+  format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVI src (LoadVector mem)));
+  format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Longs vector add
+instruct vadd2L(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVL dst src));
+  format %{ "paddq   $dst,$src\t! add packed2L" %}
+  ins_encode %{
+    __ paddq($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVL src1 src2));
+  format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVL src (LoadVector mem)));
+  format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVL src1 src2));
+  format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVL src (LoadVector mem)));
+  format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Floats vector add
+instruct vadd2F(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVF dst src));
+  format %{ "addps   $dst,$src\t! add packed2F" %}
+  ins_encode %{
+    __ addps($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVF src1 src2));
+  format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4F(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVF dst src));
+  format %{ "addps   $dst,$src\t! add packed4F" %}
+  ins_encode %{
+    __ addps($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVF src1 src2));
+  format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVF src (LoadVector mem)));
+  format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVF src1 src2));
+  format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVF src (LoadVector mem)));
+  format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Doubles vector add
+instruct vadd2D(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVD dst src));
+  format %{ "addpd   $dst,$src\t! add packed2D" %}
+  ins_encode %{
+    __ addpd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVD src1 src2));
+  format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVD src (LoadVector mem)));
+  format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches AddVD on a 4-element vector when UseAVX > 0;
+// emits vaddpd with separate dst, src1 and src2 registers.
+instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVD src1 src2));
+  format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
+  ins_encode %{
+    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches AddVD on a 4-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vaddpd reading that input from memory.
+instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVD src (LoadVector mem)));
+  format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
+  ins_encode %{
+    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// --------------------------------- SUB --------------------------------------
+
+// Bytes vector sub
+// Matches SubVB on a 4-element vector whose first input is also dst;
+// emits psubb with dst as both source and destination.
+instruct vsub4B(vecS dst, vecS src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (SubVB dst src));
+  format %{ "psubb   $dst,$src\t! sub packed4B" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ psubb(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVB on a 4-element vector when UseAVX > 0;
+// emits vpsubb with separate dst, src1 and src2 registers.
+instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVB src1 src2));
+  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
+  ins_encode %{
+    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVB on an 8-element vector whose first input is also dst;
+// emits psubb with dst as both source and destination.
+instruct vsub8B(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (SubVB dst src));
+  format %{ "psubb   $dst,$src\t! sub packed8B" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ psubb(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVB on an 8-element vector when UseAVX > 0;
+// emits vpsubb with separate dst, src1 and src2 registers.
+instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVB src1 src2));
+  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
+  ins_encode %{
+    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVB on a 16-element vector whose first input is also dst;
+// emits psubb with dst as both source and destination.
+instruct vsub16B(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (SubVB dst src));
+  format %{ "psubb   $dst,$src\t! sub packed16B" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ psubb(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVB on a 16-element vector when UseAVX > 0;
+// emits vpsubb with separate dst, src1 and src2 registers.
+instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
+  match(Set dst (SubVB src1 src2));
+  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
+  ins_encode %{
+    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVB on a 16-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vpsubb reading that input from memory.
+instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
+  match(Set dst (SubVB src (LoadVector mem)));
+  format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
+  ins_encode %{
+    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVB on a 32-element vector when UseAVX > 1;
+// emits vpsubb with separate dst, src1 and src2 registers.
+instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
+  match(Set dst (SubVB src1 src2));
+  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
+  ins_encode %{
+    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVB on a 32-element vector whose second input is a
+// LoadVector, when UseAVX > 1; emits vpsubb reading that input from memory.
+instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
+  match(Set dst (SubVB src (LoadVector mem)));
+  format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
+  ins_encode %{
+    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Shorts/Chars vector sub
+// Matches SubVS on a 2-element vector whose first input is also dst;
+// emits psubw with dst as both source and destination.
+instruct vsub2S(vecS dst, vecS src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVS dst src));
+  format %{ "psubw   $dst,$src\t! sub packed2S" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ psubw(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVS on a 2-element vector when UseAVX > 0;
+// emits vpsubw with separate dst, src1 and src2 registers.
+instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVS src1 src2));
+  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
+  ins_encode %{
+    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVS on a 4-element vector whose first input is also dst;
+// emits psubw with dst as both source and destination.
+instruct vsub4S(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (SubVS dst src));
+  format %{ "psubw   $dst,$src\t! sub packed4S" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ psubw(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVS on a 4-element vector when UseAVX > 0;
+// emits vpsubw with separate dst, src1 and src2 registers.
+instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVS src1 src2));
+  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
+  ins_encode %{
+    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVS on an 8-element vector whose first input is also dst;
+// emits psubw with dst as both source and destination.
+instruct vsub8S(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (SubVS dst src));
+  format %{ "psubw   $dst,$src\t! sub packed8S" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ psubw(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVS on an 8-element vector when UseAVX > 0;
+// emits vpsubw with separate dst, src1 and src2 registers.
+instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVS src1 src2));
+  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
+  ins_encode %{
+    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVS on an 8-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vpsubw reading that input from memory.
+instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVS src (LoadVector mem)));
+  format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
+  ins_encode %{
+    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVS on a 16-element vector when UseAVX > 1;
+// emits vpsubw with separate dst, src1 and src2 registers.
+instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (SubVS src1 src2));
+  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
+  ins_encode %{
+    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVS on a 16-element vector whose second input is a
+// LoadVector, when UseAVX > 1; emits vpsubw reading that input from memory.
+instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (SubVS src (LoadVector mem)));
+  format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
+  ins_encode %{
+    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Integers vector sub
+// Matches SubVI on a 2-element vector whose first input is also dst;
+// emits psubd with dst as both source and destination.
+instruct vsub2I(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVI dst src));
+  format %{ "psubd   $dst,$src\t! sub packed2I" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ psubd(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVI on a 2-element vector when UseAVX > 0;
+// emits vpsubd with separate dst, src1 and src2 registers.
+instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVI src1 src2));
+  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
+  ins_encode %{
+    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVI on a 4-element vector whose first input is also dst;
+// emits psubd with dst as both source and destination.
+instruct vsub4I(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (SubVI dst src));
+  format %{ "psubd   $dst,$src\t! sub packed4I" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ psubd(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVI on a 4-element vector when UseAVX > 0;
+// emits vpsubd with separate dst, src1 and src2 registers.
+instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVI src1 src2));
+  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
+  ins_encode %{
+    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVI on a 4-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vpsubd reading that input from memory.
+instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVI src (LoadVector mem)));
+  format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
+  ins_encode %{
+    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVI on an 8-element vector when UseAVX > 1;
+// emits vpsubd with separate dst, src1 and src2 registers.
+instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVI src1 src2));
+  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
+  ins_encode %{
+    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVI on an 8-element vector whose second input is a
+// LoadVector, when UseAVX > 1; emits vpsubd reading that input from memory.
+instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVI src (LoadVector mem)));
+  format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
+  ins_encode %{
+    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Longs vector sub
+// Matches SubVL on a 2-element vector whose first input is also dst;
+// emits psubq with dst as both source and destination.
+instruct vsub2L(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVL dst src));
+  format %{ "psubq   $dst,$src\t! sub packed2L" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ psubq(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVL on a 2-element vector when UseAVX > 0;
+// emits vpsubq with separate dst, src1 and src2 registers.
+instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVL src1 src2));
+  format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
+  ins_encode %{
+    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVL on a 2-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vpsubq reading that input from memory.
+instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVL src (LoadVector mem)));
+  format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
+  ins_encode %{
+    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVL on a 4-element vector when UseAVX > 1;
+// emits vpsubq with separate dst, src1 and src2 registers.
+instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVL src1 src2));
+  format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
+  ins_encode %{
+    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVL on a 4-element vector whose second input is a
+// LoadVector, when UseAVX > 1; emits vpsubq reading that input from memory.
+instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVL src (LoadVector mem)));
+  format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
+  ins_encode %{
+    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Floats vector sub
+// Matches SubVF on a 2-element vector whose first input is also dst;
+// emits subps with dst as both source and destination.
+instruct vsub2F(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVF dst src));
+  format %{ "subps   $dst,$src\t! sub packed2F" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ subps(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVF on a 2-element vector when UseAVX > 0;
+// emits vsubps with separate dst, src1 and src2 registers.
+instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVF src1 src2));
+  format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
+  ins_encode %{
+    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVF on a 4-element vector whose first input is also dst;
+// emits subps with dst as both source and destination.
+instruct vsub4F(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (SubVF dst src));
+  format %{ "subps   $dst,$src\t! sub packed4F" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ subps(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVF on a 4-element vector when UseAVX > 0;
+// emits vsubps with separate dst, src1 and src2 registers.
+instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVF src1 src2));
+  format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
+  ins_encode %{
+    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVF on a 4-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vsubps reading that input from memory.
+instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVF src (LoadVector mem)));
+  format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
+  ins_encode %{
+    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVF on an 8-element vector when UseAVX > 0;
+// emits vsubps with separate dst, src1 and src2 registers.
+instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVF src1 src2));
+  format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
+  ins_encode %{
+    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVF on an 8-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vsubps reading that input from memory.
+instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVF src (LoadVector mem)));
+  format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
+  ins_encode %{
+    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Doubles vector sub
+// Matches SubVD on a 2-element vector whose first input is also dst;
+// emits subpd with dst as both source and destination.
+instruct vsub2D(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVD dst src));
+  format %{ "subpd   $dst,$src\t! sub packed2D" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ subpd(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVD on a 2-element vector when UseAVX > 0;
+// emits vsubpd with separate dst, src1 and src2 registers.
+instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVD src1 src2));
+  format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
+  ins_encode %{
+    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVD on a 2-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vsubpd reading that input from memory.
+instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVD src (LoadVector mem)));
+  format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
+  ins_encode %{
+    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVD on a 4-element vector when UseAVX > 0;
+// emits vsubpd with separate dst, src1 and src2 registers.
+instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVD src1 src2));
+  format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
+  ins_encode %{
+    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches SubVD on a 4-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vsubpd reading that input from memory.
+instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVD src (LoadVector mem)));
+  format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
+  ins_encode %{
+    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// --------------------------------- MUL --------------------------------------
+
+// Shorts/Chars vector mul
+// Matches MulVS on a 2-element vector whose first input is also dst;
+// emits pmullw with dst as both source and destination.
+// The format string pads the mnemonic to the same column as the
+// vmul4S and vmul8S variants so printed listings line up.
+instruct vmul2S(vecS dst, vecS src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (MulVS dst src));
+  format %{ "pmullw  $dst,$src\t! mul packed2S" %}
+  ins_encode %{
+    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVS on a 2-element vector when UseAVX > 0;
+// emits vpmullw with separate dst, src1 and src2 registers.
+instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVS src1 src2));
+  format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
+  ins_encode %{
+    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVS on a 4-element vector whose first input is also dst;
+// emits pmullw with dst as both source and destination.
+instruct vmul4S(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (MulVS dst src));
+  format %{ "pmullw  $dst,$src\t! mul packed4S" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ pmullw(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVS on a 4-element vector when UseAVX > 0;
+// emits vpmullw with separate dst, src1 and src2 registers.
+instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVS src1 src2));
+  format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
+  ins_encode %{
+    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVS on an 8-element vector whose first input is also dst;
+// emits pmullw with dst as both source and destination.
+instruct vmul8S(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (MulVS dst src));
+  format %{ "pmullw  $dst,$src\t! mul packed8S" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ pmullw(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVS on an 8-element vector when UseAVX > 0;
+// emits vpmullw with separate dst, src1 and src2 registers.
+instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVS src1 src2));
+  format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
+  ins_encode %{
+    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVS on an 8-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vpmullw reading that input from memory.
+instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVS src (LoadVector mem)));
+  format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
+  ins_encode %{
+    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVS on a 16-element vector when UseAVX > 1;
+// emits vpmullw with separate dst, src1 and src2 registers.
+instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (MulVS src1 src2));
+  format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
+  ins_encode %{
+    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVS on a 16-element vector whose second input is a
+// LoadVector, when UseAVX > 1; emits vpmullw reading that input from memory.
+instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (MulVS src (LoadVector mem)));
+  format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
+  ins_encode %{
+    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Integers vector mul (sse4_1)
+// Matches MulVI on a 2-element vector whose first input is also dst,
+// when UseSSE > 3; emits pmulld with dst as both source and destination.
+instruct vmul2I(vecD dst, vecD src) %{
+  predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVI dst src));
+  format %{ "pmulld  $dst,$src\t! mul packed2I" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ pmulld(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVI on a 2-element vector when UseAVX > 0;
+// emits vpmulld with separate dst, src1 and src2 registers.
+instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVI src1 src2));
+  format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
+  ins_encode %{
+    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVI on a 4-element vector whose first input is also dst,
+// when UseSSE > 3; emits pmulld with dst as both source and destination.
+instruct vmul4I(vecX dst, vecX src) %{
+  predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVI dst src));
+  format %{ "pmulld  $dst,$src\t! mul packed4I" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ pmulld(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVI on a 4-element vector when UseAVX > 0;
+// emits vpmulld with separate dst, src1 and src2 registers.
+instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVI src1 src2));
+  format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
+  ins_encode %{
+    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVI on a 4-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vpmulld reading that input from memory.
+instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVI src (LoadVector mem)));
+  format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
+  ins_encode %{
+    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVI on an 8-element vector when UseAVX > 1;
+// emits vpmulld with separate dst, src1 and src2 registers.
+instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVI src1 src2));
+  format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
+  ins_encode %{
+    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVI on an 8-element vector whose second input is a
+// LoadVector, when UseAVX > 1; emits vpmulld reading that input from memory.
+instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVI src (LoadVector mem)));
+  format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
+  ins_encode %{
+    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Floats vector mul
+// Matches MulVF on a 2-element vector whose first input is also dst;
+// emits mulps with dst as both source and destination.
+instruct vmul2F(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (MulVF dst src));
+  format %{ "mulps   $dst,$src\t! mul packed2F" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ mulps(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVF on a 2-element vector when UseAVX > 0;
+// emits vmulps with separate dst, src1 and src2 registers.
+instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVF src1 src2));
+  format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
+  ins_encode %{
+    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVF on a 4-element vector whose first input is also dst;
+// emits mulps with dst as both source and destination.
+instruct vmul4F(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (MulVF dst src));
+  format %{ "mulps   $dst,$src\t! mul packed4F" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ mulps(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVF on a 4-element vector when UseAVX > 0;
+// emits vmulps with separate dst, src1 and src2 registers.
+instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVF src1 src2));
+  format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
+  ins_encode %{
+    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVF on a 4-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vmulps reading that input from memory.
+instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVF src (LoadVector mem)));
+  format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
+  ins_encode %{
+    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVF on an 8-element vector when UseAVX > 0;
+// emits vmulps with separate dst, src1 and src2 registers.
+instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVF src1 src2));
+  format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
+  ins_encode %{
+    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVF on an 8-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vmulps reading that input from memory.
+instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVF src (LoadVector mem)));
+  format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
+  ins_encode %{
+    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Doubles vector mul
+// Matches MulVD on a 2-element vector whose first input is also dst;
+// emits mulpd with dst as both source and destination.
+instruct vmul2D(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (MulVD dst src));
+  format %{ "mulpd   $dst,$src\t! mul packed2D" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ mulpd(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVD on a 2-element vector when UseAVX > 0;
+// emits vmulpd with separate dst, src1 and src2 registers.
+instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVD src1 src2));
+  format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
+  ins_encode %{
+    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVD on a 2-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vmulpd reading that input from memory.
+instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVD src (LoadVector mem)));
+  format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
+  ins_encode %{
+    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVD on a 4-element vector when UseAVX > 0;
+// emits vmulpd with separate dst, src1 and src2 registers.
+instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVD src1 src2));
+  format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
+  ins_encode %{
+    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches MulVD on a 4-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vmulpd reading that input from memory.
+instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVD src (LoadVector mem)));
+  format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
+  ins_encode %{
+    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// --------------------------------- DIV --------------------------------------
+
+// Floats vector div
+// Matches DivVF on a 2-element vector whose first input is also dst;
+// emits divps with dst as both source and destination.
+instruct vdiv2F(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (DivVF dst src));
+  format %{ "divps   $dst,$src\t! div packed2F" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ divps(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches DivVF on a 2-element vector when UseAVX > 0;
+// emits vdivps with separate dst, src1 and src2 registers.
+instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (DivVF src1 src2));
+  format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
+  ins_encode %{
+    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches DivVF on a 4-element vector whose first input is also dst;
+// emits divps with dst as both source and destination.
+instruct vdiv4F(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (DivVF dst src));
+  format %{ "divps   $dst,$src\t! div packed4F" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ divps(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches DivVF on a 4-element vector when UseAVX > 0;
+// emits vdivps with separate dst, src1 and src2 registers.
+instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (DivVF src1 src2));
+  format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
+  ins_encode %{
+    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches DivVF on a 4-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vdivps reading that input from memory.
+instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (DivVF src (LoadVector mem)));
+  format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
+  ins_encode %{
+    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches DivVF on an 8-element vector when UseAVX > 0;
+// emits vdivps with separate dst, src1 and src2 registers.
+instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (DivVF src1 src2));
+  format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
+  ins_encode %{
+    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches DivVF on an 8-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vdivps reading that input from memory.
+instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (DivVF src (LoadVector mem)));
+  format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
+  ins_encode %{
+    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Doubles vector div
+// Matches DivVD on a 2-element vector whose first input is also dst;
+// emits divpd with dst as both source and destination.
+instruct vdiv2D(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (DivVD dst src));
+  format %{ "divpd   $dst,$src\t! div packed2D" %}
+  ins_encode %{
+    XMMRegister dst_reg = $dst$$XMMRegister;
+    XMMRegister src_reg = $src$$XMMRegister;
+    __ divpd(dst_reg, src_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches DivVD on a 2-element vector when UseAVX > 0;
+// emits vdivpd with separate dst, src1 and src2 registers.
+instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (DivVD src1 src2));
+  format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
+  ins_encode %{
+    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches DivVD on a 2-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vdivpd reading that input from memory.
+instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (DivVD src (LoadVector mem)));
+  format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
+  ins_encode %{
+    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches DivVD on a 4-element vector when UseAVX > 0;
+// emits vdivpd with separate dst, src1 and src2 registers.
+instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (DivVD src1 src2));
+  format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
+  ins_encode %{
+    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches DivVD on a 4-element vector whose second input is a
+// LoadVector, when UseAVX > 0; emits vdivpd reading that input from memory.
+instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (DivVD src (LoadVector mem)));
+  format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
+  ins_encode %{
+    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, /* vector256 */ true);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// ------------------------------ LeftShift -----------------------------------
+
+// Shorts/Chars vector left shift
+// Matches LShiftVS on a 2-element vector whose first input is also dst,
+// with the shift count supplied in a regF operand; emits psllw.
+instruct vsll2S(vecS dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVS dst shift));
+  format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
+  ins_encode %{
+    XMMRegister dst_reg   = $dst$$XMMRegister;
+    XMMRegister count_reg = $shift$$XMMRegister;
+    __ psllw(dst_reg, count_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches LShiftVS on a 2-element vector whose first input is also dst,
+// with an immI8 shift count; emits psllw with the count passed as an int.
+instruct vsll2S_imm(vecS dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVS dst shift));
+  format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
+  ins_encode %{
+    int count = (int)$shift$$constant;
+    __ psllw($dst$$XMMRegister, count);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches LShiftVS on a 2-element vector when UseAVX > 0, with the shift
+// count in a regF operand; emits vpsllw with separate dst and src registers.
+instruct vsll2S_reg(vecS dst, vecS src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
+  ins_encode %{
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches LShiftVS on a 2-element vector when UseAVX > 0, with an immI8
+// shift count; emits vpsllw with separate dst and src registers.
+instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
+  ins_encode %{
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, /* vector256 */ false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches LShiftVS on a 4-element vector whose first input is also dst,
+// with the shift count supplied in a regF operand; emits psllw.
+instruct vsll4S(vecD dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS dst shift));
+  format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
+  ins_encode %{
+    XMMRegister dst_reg   = $dst$$XMMRegister;
+    XMMRegister count_reg = $shift$$XMMRegister;
+    __ psllw(dst_reg, count_reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Matches LShiftVS on a 4-element vector whose first input is also dst,
+// with an immI8 shift count; emits psllw with the count passed as an int.
+instruct vsll4S_imm(vecD dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS dst shift));
+  format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
+  ins_encode %{
+    int count = (int)$shift$$constant;
+    __ psllw($dst$$XMMRegister, count);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4S_reg(vecD dst, vecD src, regF shift) %{ // AVX 3-operand left shift of 4 packed shorts; count in XMM register
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{ // AVX 3-operand left shift of 4 packed shorts; constant count
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8S(vecX dst, regF shift) %{ // In-place left shift of 8 packed shorts; count in XMM register
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVS dst shift));
+  format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
+  ins_encode %{
+    __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8S_imm(vecX dst, immI8 shift) %{ // In-place left shift of 8 packed shorts; constant count
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVS dst shift));
+  format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
+  ins_encode %{
+    __ psllw($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8S_reg(vecX dst, vecX src, regF shift) %{ // AVX 3-operand left shift of 8 packed shorts; count in XMM register
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{ // AVX 3-operand left shift of 8 packed shorts; constant count
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll16S_reg(vecY dst, vecY src, regF shift) %{ // 256-bit 3-operand left shift of 16 packed shorts (UseAVX > 1); count in XMM register
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{ // 256-bit 3-operand left shift of 16 packed shorts (UseAVX > 1); constant count
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Integers vector left shift
+instruct vsll2I(vecD dst, regF shift) %{ // In-place left shift of 2 packed ints; count in XMM register
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVI dst shift));
+  format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
+  ins_encode %{
+    __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2I_imm(vecD dst, immI8 shift) %{ // In-place left shift of 2 packed ints; constant count
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVI dst shift));
+  format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
+  ins_encode %{
+    __ pslld($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2I_reg(vecD dst, vecD src, regF shift) %{ // AVX 3-operand left shift of 2 packed ints; count in XMM register
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{ // AVX 3-operand left shift of 2 packed ints; constant count
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4I(vecX dst, regF shift) %{ // In-place left shift of 4 packed ints; count in XMM register
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVI dst shift));
+  format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
+  ins_encode %{
+    __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4I_imm(vecX dst, immI8 shift) %{ // In-place left shift of 4 packed ints; constant count
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVI dst shift));
+  format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
+  ins_encode %{
+    __ pslld($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4I_reg(vecX dst, vecX src, regF shift) %{ // AVX 3-operand left shift of 4 packed ints; count in XMM register
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{ // AVX 3-operand left shift of 4 packed ints; constant count
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8I_reg(vecY dst, vecY src, regF shift) %{ // 256-bit 3-operand left shift of 8 packed ints (UseAVX > 1); count in XMM register
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{ // 256-bit 3-operand left shift of 8 packed ints (UseAVX > 1); constant count
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Longs vector left shift
+instruct vsll2L(vecX dst, regF shift) %{ // In-place left shift of 2 packed longs; count in XMM register
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVL dst shift));
+  format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
+  ins_encode %{
+    __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2L_imm(vecX dst, immI8 shift) %{ // In-place left shift of 2 packed longs; constant count
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVL dst shift));
+  format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
+  ins_encode %{
+    __ psllq($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2L_reg(vecX dst, vecX src, regF shift) %{ // AVX 3-operand left shift of 2 packed longs; count in XMM register
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVL src shift));
+  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{ // AVX 3-operand left shift of 2 packed longs; constant count
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVL src shift));
+  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4L_reg(vecY dst, vecY src, regF shift) %{ // 256-bit 3-operand left shift of 4 packed longs (UseAVX > 1); count in XMM register
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVL src shift));
+  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{ // 256-bit 3-operand left shift of 4 packed longs (UseAVX > 1); constant count
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVL src shift));
+  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// ----------------------- LogicalRightShift -----------------------------------
+
+// Shorts/Chars vector logical right shift produces an incorrect Java result
+// for negative data because Java code converts a short value into an int with
+// sign extension before the shift.
+
+// Integers vector logical right shift
+instruct vsrl2I(vecD dst, regF shift) %{ // In-place logical right shift of 2 packed ints; count in XMM register
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVI dst shift));
+  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
+  ins_encode %{
+    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2I_imm(vecD dst, immI8 shift) %{ // In-place logical right shift of 2 packed ints; constant count
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVI dst shift));
+  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
+  ins_encode %{
+    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2I_reg(vecD dst, vecD src, regF shift) %{ // AVX 3-operand logical right shift of 2 packed ints; count in XMM register
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{ // AVX 3-operand logical right shift of 2 packed ints; constant count
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4I(vecX dst, regF shift) %{ // In-place logical right shift of 4 packed ints; count in XMM register
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVI dst shift));
+  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
+  ins_encode %{
+    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4I_imm(vecX dst, immI8 shift) %{ // In-place logical right shift of 4 packed ints; constant count
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVI dst shift));
+  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
+  ins_encode %{
+    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4I_reg(vecX dst, vecX src, regF shift) %{ // AVX 3-operand logical right shift of 4 packed ints; count in XMM register
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{ // AVX 3-operand logical right shift of 4 packed ints; constant count
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl8I_reg(vecY dst, vecY src, regF shift) %{ // 256-bit 3-operand logical right shift of 8 packed ints (UseAVX > 1); count in XMM register
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{ // 256-bit 3-operand logical right shift of 8 packed ints (UseAVX > 1); constant count
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Longs vector logical right shift
+instruct vsrl2L(vecX dst, regF shift) %{ // In-place logical right shift of 2 packed longs; count in XMM register
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVL dst shift));
+  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
+  ins_encode %{
+    __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2L_imm(vecX dst, immI8 shift) %{ // In-place logical right shift of 2 packed longs; constant count
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVL dst shift));
+  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
+  ins_encode %{
+    __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2L_reg(vecX dst, vecX src, regF shift) %{ // AVX 3-operand logical right shift of 2 packed longs; count in XMM register
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVL src shift));
+  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{ // AVX 3-operand logical right shift of 2 packed longs; constant count
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVL src shift));
+  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4L_reg(vecY dst, vecY src, regF shift) %{ // 256-bit 3-operand logical right shift of 4 packed longs (UseAVX > 1); count in XMM register
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVL src shift));
+  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{ // 256-bit 3-operand logical right shift of 4 packed longs (UseAVX > 1); constant count
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVL src shift));
+  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// ------------------- ArithmeticRightShift -----------------------------------
+
+// Shorts/Chars vector arithmetic right shift
+instruct vsra2S(vecS dst, regF shift) %{ // In-place arithmetic right shift of 2 packed shorts; count in XMM register
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVS dst shift));
+  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
+  ins_encode %{
+    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra2S_imm(vecS dst, immI8 shift) %{ // In-place arithmetic right shift of 2 packed shorts; constant count
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVS dst shift));
+  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
+  ins_encode %{
+    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra2S_reg(vecS dst, vecS src, regF shift) %{ // AVX 3-operand arithmetic right shift of 2 packed shorts; count in XMM register
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{ // AVX 3-operand arithmetic right shift of 2 packed shorts; constant count
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4S(vecD dst, regF shift) %{ // In-place arithmetic right shift of 4 packed shorts; count in XMM register
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVS dst shift));
+  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
+  ins_encode %{
+    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4S_imm(vecD dst, immI8 shift) %{ // In-place arithmetic right shift of 4 packed shorts; constant count
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVS dst shift));
+  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
+  ins_encode %{
+    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4S_reg(vecD dst, vecD src, regF shift) %{ // AVX 3-operand arithmetic right shift of 4 packed shorts; count in XMM register
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{ // AVX 3-operand arithmetic right shift of 4 packed shorts; constant count
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra8S(vecX dst, regF shift) %{ // In-place arithmetic right shift of 8 packed shorts; count in XMM register
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVS dst shift));
+  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
+  ins_encode %{
+    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra8S_imm(vecX dst, immI8 shift) %{ // In-place arithmetic right shift of 8 packed shorts; constant count
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVS dst shift));
+  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
+  ins_encode %{
+    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra8S_reg(vecX dst, vecX src, regF shift) %{ // AVX 3-operand arithmetic right shift of 8 packed shorts; count in XMM register
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{ // AVX 3-operand arithmetic right shift of 8 packed shorts; constant count
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra16S_reg(vecY dst, vecY src, regF shift) %{ // 256-bit 3-operand arithmetic right shift of 16 packed shorts (UseAVX > 1); count in XMM register
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{ // 256-bit 3-operand arithmetic right shift of 16 packed shorts (UseAVX > 1); constant count
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Integers vector arithmetic right shift
+instruct vsra2I(vecD dst, regF shift) %{ // In-place arithmetic right shift of 2 packed ints; count in XMM register
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVI dst shift));
+  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
+  ins_encode %{
+    __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra2I_imm(vecD dst, immI8 shift) %{ // In-place arithmetic right shift of 2 packed ints; constant count
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVI dst shift));
+  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
+  ins_encode %{
+    __ psrad($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra2I_reg(vecD dst, vecD src, regF shift) %{ // AVX 3-operand arithmetic right shift of 2 packed ints; count in XMM register
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{ // AVX 3-operand arithmetic right shift of 2 packed ints; constant count
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4I(vecX dst, regF shift) %{ // In-place arithmetic right shift of 4 packed ints; count in XMM register
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVI dst shift));
+  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
+  ins_encode %{
+    __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4I_imm(vecX dst, immI8 shift) %{ // In-place arithmetic right shift of 4 packed ints; constant count
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVI dst shift));
+  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
+  ins_encode %{
+    __ psrad($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4I_reg(vecX dst, vecX src, regF shift) %{ // AVX 3-operand arithmetic right shift of 4 packed ints; count in XMM register
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{ // AVX 3-operand arithmetic right shift of 4 packed ints; constant count
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra8I_reg(vecY dst, vecY src, regF shift) %{ // 256-bit 3-operand arithmetic right shift of 8 packed ints (UseAVX > 1); count in XMM register
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{ // 256-bit 3-operand arithmetic right shift of 8 packed ints (UseAVX > 1); constant count
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// There are no vector arithmetic right shift instructions for longs.
+
+
+// --------------------------------- AND --------------------------------------
+
+instruct vand4B(vecS dst, vecS src) %{ // In-place bitwise AND of two 4-byte vectors (dst &= src)
+  predicate(n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (AndV dst src));
+  format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
+  ins_encode %{
+    __ pand($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{ // AVX 3-operand bitwise AND of two 4-byte vectors
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (AndV src1 src2));
+  format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand8B(vecD dst, vecD src) %{ // In-place bitwise AND of two 8-byte vectors (dst &= src)
+  predicate(n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (AndV dst src));
+  format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
+  ins_encode %{
+    __ pand($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{ // AVX 3-operand bitwise AND of two 8-byte vectors
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (AndV src1 src2));
+  format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand16B(vecX dst, vecX src) %{ // In-place bitwise AND of two 16-byte vectors (dst &= src)
+  predicate(n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (AndV dst src));
+  format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
+  ins_encode %{
+    __ pand($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{ // AVX 3-operand bitwise AND of two 16-byte vectors
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (AndV src1 src2));
+  format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand16B_mem(vecX dst, vecX src, memory mem) %{ // AVX bitwise AND of a 16-byte vector with a vector loaded from memory
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (AndV src (LoadVector mem)));
+  format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{ // 256-bit 3-operand bitwise AND of two 32-byte vectors (UseAVX > 1)
+  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
+  match(Set dst (AndV src1 src2));
+  format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand32B_mem(vecY dst, vecY src, memory mem) %{ // 256-bit bitwise AND of a 32-byte vector with a vector loaded from memory (UseAVX > 1)
+  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
+  match(Set dst (AndV src (LoadVector mem)));
+  format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// --------------------------------- OR ---------------------------------------
+
+instruct vor4B(vecS dst, vecS src) %{ // In-place bitwise OR of two 4-byte vectors (dst |= src)
+  predicate(n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (OrV dst src));
+  format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
+  ins_encode %{
+    __ por($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{ // AVX 3-operand bitwise OR of two 4-byte vectors
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (OrV src1 src2));
+  format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor8B(vecD dst, vecD src) %{ // In-place bitwise OR of two 8-byte vectors (dst |= src)
+  predicate(n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (OrV dst src));
+  format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
+  ins_encode %{
+    __ por($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{ // AVX 3-operand bitwise OR of two 8-byte vectors
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (OrV src1 src2));
+  format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor16B(vecX dst, vecX src) %{ // In-place bitwise OR of two 16-byte vectors (dst |= src)
+  predicate(n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (OrV dst src));
+  format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
+  ins_encode %{
+    __ por($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{ // AVX 3-operand bitwise OR of two 16-byte vectors
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (OrV src1 src2));
+  format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor16B_mem(vecX dst, vecX src, memory mem) %{ // AVX bitwise OR of a 16-byte vector with a vector loaded from memory
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (OrV src (LoadVector mem)));
+  format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{ // 256-bit 3-operand bitwise OR of two 32-byte vectors (UseAVX > 1)
+  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
+  match(Set dst (OrV src1 src2));
+  format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor32B_mem(vecY dst, vecY src, memory mem) %{ // 256-bit bitwise OR of a 32-byte vector with a vector loaded from memory (UseAVX > 1)
+  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
+  match(Set dst (OrV src (LoadVector mem)));
+  format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// --------------------------------- XOR --------------------------------------
+
+instruct vxor4B(vecS dst, vecS src) %{ // In-place bitwise XOR of two 4-byte vectors (dst ^= src)
+  predicate(n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (XorV dst src));
+  format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{ // AVX 3-operand bitwise XOR of two 4-byte vectors
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (XorV src1 src2));
+  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor8B(vecD dst, vecD src) %{ // In-place bitwise XOR of two 8-byte vectors (dst ^= src)
+  predicate(n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (XorV dst src));
+  format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{ // AVX 3-operand bitwise XOR of two 8-byte vectors
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (XorV src1 src2));
+  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor16B(vecX dst, vecX src) %{ // In-place bitwise XOR of two 16-byte vectors (dst ^= src)
+  predicate(n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (XorV dst src));
+  format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{ // AVX 3-operand bitwise XOR of two 16-byte vectors
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (XorV src1 src2));
+  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{ // AVX bitwise XOR of a 16-byte vector with a vector loaded from memory
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (XorV src (LoadVector mem)));
+  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{ // 256-bit 3-operand bitwise XOR of two 32-byte vectors (UseAVX > 1)
+  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
+  match(Set dst (XorV src1 src2));
+  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{ // 256-bit bitwise XOR of a 32-byte vector with a vector loaded from memory (UseAVX > 1)
+  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
+  match(Set dst (XorV src (LoadVector mem)));
+  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Wed Aug 15 16:49:38 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Mon Aug 20 09:07:21 2012 -0700
@@ -1367,22 +1367,6 @@
   return offset;
 }
 
-
-const bool Matcher::match_rule_supported(int opcode) {
-  if (!has_match_rule(opcode))
-    return false;
-
-  switch (opcode) {
-    case Op_PopCountI:
-    case Op_PopCountL:
-      if (!UsePopCountInstruction)
-        return false;
-    break;
-  }
-  
-  return true;  // Per default match rules are supported.
-}
-
 int Matcher::regnum_to_fpu_offset(int regnum) {
   return regnum - 32; // The FP registers are in the second chunk
 }
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Wed Aug 15 16:49:38 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Mon Aug 20 09:07:21 2012 -0700
@@ -1513,22 +1513,6 @@
   return offset;
 }
 
-
-const bool Matcher::match_rule_supported(int opcode) {
-  if (!has_match_rule(opcode))
-    return false;
-
-  switch (opcode) {
-    case Op_PopCountI:
-    case Op_PopCountL:
-      if (!UsePopCountInstruction)
-        return false;
-    break;
-  }
-
-  return true;  // Per default match rules are supported.
-}
-
 int Matcher::regnum_to_fpu_offset(int regnum)
 {
   return regnum - 32; // The FP registers are in the second chunk
@@ -6427,6 +6411,31 @@
   ins_pipe(ialu_reg_reg); // XXX
 %}
 
+// Convert oop into int for vector alignment masking: the cast-and-narrow pair is matched as one 32-bit move.
+instruct convP2I(rRegI dst, rRegP src)
+%{
+  match(Set dst (ConvL2I (CastP2X src))); // pointer cast to integer (CastP2X), then narrowed to int (ConvL2I)
+
+  format %{ "movl    $dst, $src\t# ptr -> int" %}
+  ins_encode %{
+    __ movl($dst$$Register, $src$$Register); // single 32-bit register move implements the whole matched subtree
+  %}
+  ins_pipe(ialu_reg_reg); // XXX
+%}
+
+// Convert compressed oop into int for vector alignment masking
+// in case of 32bit oops (heap < 4Gb).
+instruct convN2I(rRegI dst, rRegN src)
+%{
+  predicate(Universe::narrow_oop_shift() == 0); // rule applies only when compressed oops use a zero shift
+  match(Set dst (ConvL2I (CastP2X (DecodeN src)))); // the DecodeN is part of the matched subtree, so this rule covers the decode as well
+
+  format %{ "movl    $dst, $src\t# compressed ptr -> int" %}
+  ins_encode %{
+    __ movl($dst$$Register, $src$$Register); // the compressed value is copied with a plain 32-bit move
+  %}
+  ins_pipe(ialu_reg_reg); // XXX
+%}
 
 // Convert oop pointer into compressed form
 instruct encodeHeapOop(rRegN dst, rRegP src, rFlagsReg cr) %{
@@ -10049,11 +10058,10 @@
   ins_pipe( pipe_slow );
 %}
 
-// The next instructions have long latency and use Int unit. Set high cost.
 instruct MoveI2F_reg_reg(regF dst, rRegI src) %{
   match(Set dst (MoveI2F src));
   effect(DEF dst, USE src);
-  ins_cost(300);
+  ins_cost(100);
   format %{ "movd    $dst,$src\t# MoveI2F" %}
   ins_encode %{
     __ movdl($dst$$XMMRegister, $src$$Register);
@@ -10064,7 +10072,7 @@
 instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
   match(Set dst (MoveL2D src));
   effect(DEF dst, USE src);
-  ins_cost(300);
+  ins_cost(100);
   format %{ "movd    $dst,$src\t# MoveL2D" %}
   ins_encode %{
      __ movdq($dst$$XMMRegister, $src$$Register);
--- a/hotspot/src/share/vm/opto/classes.hpp	Wed Aug 15 16:49:38 2012 -0700
+++ b/hotspot/src/share/vm/opto/classes.hpp	Mon Aug 20 09:07:21 2012 -0700
@@ -256,6 +256,8 @@
 macro(SubVL)
 macro(SubVF)
 macro(SubVD)
+macro(MulVS)
+macro(MulVI)
 macro(MulVF)
 macro(MulVD)
 macro(DivVF)
@@ -263,9 +265,15 @@
 macro(LShiftVB)
 macro(LShiftVS)
 macro(LShiftVI)
+macro(LShiftVL)
 macro(RShiftVB)
 macro(RShiftVS)
 macro(RShiftVI)
+macro(RShiftVL)
+macro(URShiftVB)
+macro(URShiftVS)
+macro(URShiftVI)
+macro(URShiftVL)
 macro(AndV)
 macro(OrV)
 macro(XorV)
--- a/hotspot/src/share/vm/opto/loopnode.cpp	Wed Aug 15 16:49:38 2012 -0700
+++ b/hotspot/src/share/vm/opto/loopnode.cpp	Mon Aug 20 09:07:21 2012 -0700
@@ -1773,6 +1773,8 @@
     if (stride_con > 0) tty->print("+");
     tty->print("%d", stride_con);
 
+    tty->print(" (%d iters) ", (int)cl->profile_trip_cnt());
+
     if (cl->is_pre_loop ()) tty->print(" pre" );
     if (cl->is_main_loop()) tty->print(" main");
     if (cl->is_post_loop()) tty->print(" post");
--- a/hotspot/src/share/vm/opto/superword.cpp	Wed Aug 15 16:49:38 2012 -0700
+++ b/hotspot/src/share/vm/opto/superword.cpp	Mon Aug 20 09:07:21 2012 -0700
@@ -1357,6 +1357,12 @@
         // Promote operands to vector
         Node* in1 = vector_opd(p, 1);
         Node* in2 = vector_opd(p, 2);
+        if (VectorNode::is_invariant_vector(in1) && (n->is_Add() || n->is_Mul())) {
+          // Move invariant vector input into second position to avoid register spilling.
+          Node* tmp = in1;
+          in1 = in2;
+          in2 = tmp;
+        }
         vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n));
       } else {
         ShouldNotReachHere();
@@ -1400,6 +1406,36 @@
     if (opd->is_Vector() || opd->is_LoadVector()) {
       return opd; // input is matching vector
     }
+    if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
+      // No vector is needed for shift count.
+      // Vector instructions do not mask shift count, do it here.
+      Compile* C = _phase->C;
+      Node* cnt = opd;
+      juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
+      const TypeInt* t = opd->find_int_type();
+      if (t != NULL && t->is_con()) {
+        juint shift = t->get_con();
+        if (shift > mask) { // Unsigned cmp
+          cnt = ConNode::make(C, TypeInt::make(shift & mask));
+        }
+      } else {
+        if (t == NULL || t->_lo < 0 || t->_hi > (int)mask) {
+          cnt = ConNode::make(C, TypeInt::make(mask));
+          _phase->_igvn.register_new_node_with_optimizer(cnt);
+          cnt = new (C, 3) AndINode(opd, cnt);
+          _phase->_igvn.register_new_node_with_optimizer(cnt);
+          _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
+        }
+        assert(opd->bottom_type()->isa_int(), "int type only");
+        // Move non constant shift count into XMM register.
+        cnt = new (_phase->C, 2) MoveI2FNode(cnt);
+      }
+      if (cnt != opd) {
+        _phase->_igvn.register_new_node_with_optimizer(cnt);
+        _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
+      }
+      return cnt;
+    }
     assert(!opd->is_StoreVector(), "such vector is not expected here");
     // Convert scalar input to vector with the same number of elements as
     // p0's vector. Use p0's type because size of operand's container in
@@ -1718,37 +1754,27 @@
   for (int i = _block.length() - 1; i >= 0; i--) {
     Node* n = _block.at(i);
     // Only integer types need be examined
-    if (n->bottom_type()->isa_int()) {
+    const Type* vt = velt_type(n);
+    if (vt->basic_type() == T_INT) {
       uint start, end;
       vector_opd_range(n, &start, &end);
       const Type* vt = velt_type(n);
 
       for (uint j = start; j < end; j++) {
         Node* in  = n->in(j);
-        // Don't propagate through a type conversion
-        if (n->bottom_type() != in->bottom_type())
-          continue;
-        switch(in->Opcode()) {
-        case Op_AddI:    case Op_AddL:
-        case Op_SubI:    case Op_SubL:
-        case Op_MulI:    case Op_MulL:
-        case Op_AndI:    case Op_AndL:
-        case Op_OrI:     case Op_OrL:
-        case Op_XorI:    case Op_XorL:
-        case Op_LShiftI: case Op_LShiftL:
-        case Op_CMoveI:  case Op_CMoveL:
-          if (in_bb(in)) {
-            bool same_type = true;
-            for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
-              Node *use = in->fast_out(k);
-              if (!in_bb(use) || !same_velt_type(use, n)) {
-                same_type = false;
-                break;
-              }
+        // Don't propagate through a memory operation
+        if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT &&
+            data_size(n) < data_size(in)) {
+          bool same_type = true;
+          for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
+            Node *use = in->fast_out(k);
+            if (!in_bb(use) || !same_velt_type(use, n)) {
+              same_type = false;
+              break;
             }
-            if (same_type) {
-              set_velt_type(in, vt);
-            }
+          }
+          if (same_type) {
+            set_velt_type(in, vt);
           }
         }
       }
@@ -1792,10 +1818,8 @@
   }
   const Type* t = _igvn.type(n);
   if (t->basic_type() == T_INT) {
-    if (t->higher_equal(TypeInt::BOOL))  return TypeInt::BOOL;
-    if (t->higher_equal(TypeInt::BYTE))  return TypeInt::BYTE;
-    if (t->higher_equal(TypeInt::CHAR))  return TypeInt::CHAR;
-    if (t->higher_equal(TypeInt::SHORT)) return TypeInt::SHORT;
+    // A narrow type of arithmetic operations will be determined by
+    // propagating the type of memory operations.
     return TypeInt::INT;
   }
   return t;
@@ -1940,7 +1964,7 @@
   //     lim0 == original pre loop limit
   //     V == v_align (power of 2)
   //     invar == extra invariant piece of the address expression
-  //     e == k [ +/- invar ]
+  //     e == offset [ +/- invar ]
   //
   // When reassociating expressions involving '%' the basic rules are:
   //     (a - b) % k == 0   =>  a % k == b % k
@@ -1993,13 +2017,12 @@
   int elt_size = align_to_ref_p.memory_size();
   int v_align  = vw / elt_size;
   assert(v_align > 1, "sanity");
-  int k        = align_to_ref_p.offset_in_bytes() / elt_size;
+  int offset   = align_to_ref_p.offset_in_bytes() / elt_size;
+  Node *offsn  = _igvn.intcon(offset);
 
-  Node *kn   = _igvn.intcon(k);
-
-  Node *e = kn;
+  Node *e = offsn;
   if (align_to_ref_p.invar() != NULL) {
-    // incorporate any extra invariant piece producing k +/- invar >>> log2(elt)
+    // incorporate any extra invariant piece producing (offset +/- invar) >>> log2(elt)
     Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
     Node* aref     = new (_phase->C, 3) URShiftINode(align_to_ref_p.invar(), log2_elt);
     _phase->_igvn.register_new_node_with_optimizer(aref);
@@ -2014,15 +2037,15 @@
   }
   if (vw > ObjectAlignmentInBytes) {
     // incorporate base e +/- base && Mask >>> log2(elt)
-    Node* mask = _igvn.MakeConX(~(-1 << exact_log2(vw)));
     Node* xbase = new(_phase->C, 2) CastP2XNode(NULL, align_to_ref_p.base());
     _phase->_igvn.register_new_node_with_optimizer(xbase);
-    Node* masked_xbase  = new (_phase->C, 3) AndXNode(xbase, mask);
+#ifdef _LP64
+    xbase  = new (_phase->C, 2) ConvL2INode(xbase);
+    _phase->_igvn.register_new_node_with_optimizer(xbase);
+#endif
+    Node* mask = _igvn.intcon(vw-1);
+    Node* masked_xbase  = new (_phase->C, 3) AndINode(xbase, mask);
     _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
-#ifdef _LP64
-    masked_xbase  = new (_phase->C, 2) ConvL2INode(masked_xbase);
-    _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
-#endif
     Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
     Node* bref     = new (_phase->C, 3) URShiftINode(masked_xbase, log2_elt);
     _phase->_igvn.register_new_node_with_optimizer(bref);
--- a/hotspot/src/share/vm/opto/vectornode.cpp	Wed Aug 15 16:49:38 2012 -0700
+++ b/hotspot/src/share/vm/opto/vectornode.cpp	Mon Aug 20 09:07:21 2012 -0700
@@ -69,6 +69,15 @@
   case Op_SubD:
     assert(bt == T_DOUBLE, "must be");
     return Op_SubVD;
+  case Op_MulI:
+    switch (bt) {
+    case T_BOOLEAN:
+    case T_BYTE:   return 0;   // Unimplemented
+    case T_CHAR:
+    case T_SHORT:  return Op_MulVS;
+    case T_INT:    return Matcher::match_rule_supported(Op_MulVI) ? Op_MulVI : 0; // SSE4_1
+    }
+    ShouldNotReachHere();
   case Op_MulF:
     assert(bt == T_FLOAT, "must be");
     return Op_MulVF;
@@ -90,6 +99,9 @@
     case T_INT:    return Op_LShiftVI;
     }
     ShouldNotReachHere();
+  case Op_LShiftL:
+    assert(bt == T_LONG, "must be");
+    return Op_LShiftVL;
   case Op_RShiftI:
     switch (bt) {
     case T_BOOLEAN:
@@ -99,6 +111,21 @@
     case T_INT:    return Op_RShiftVI;
     }
     ShouldNotReachHere();
+  case Op_RShiftL:
+    assert(bt == T_LONG, "must be");
+    return Op_RShiftVL;
+  case Op_URShiftI:
+    switch (bt) {
+    case T_BOOLEAN:
+    case T_BYTE:   return Op_URShiftVB;
+    case T_CHAR:
+    case T_SHORT:  return Op_URShiftVS;
+    case T_INT:    return Op_URShiftVI;
+    }
+    ShouldNotReachHere();
+  case Op_URShiftL:
+    assert(bt == T_LONG, "must be");
+    return Op_URShiftVL;
   case Op_AndI:
   case Op_AndL:
     return Op_AndV;
@@ -140,6 +167,34 @@
   return false;
 }
 
+bool VectorNode::is_shift(Node* n) { // Returns true when n's opcode is one of the int or long shift opcodes listed below.
+  switch (n->Opcode()) {
+  case Op_LShiftI:
+  case Op_LShiftL:
+  case Op_RShiftI:
+  case Op_RShiftL:
+  case Op_URShiftI:
+  case Op_URShiftL:
+    return true; // left, signed right and unsigned right shifts, for both int and long
+  }
+  return false; // every other opcode
+}
+
+// Check if input is a loop invariant vector.
+bool VectorNode::is_invariant_vector(Node* n) {
+  // Only Replicate vector nodes are loop invariant for now.
+  switch (n->Opcode()) {
+  case Op_ReplicateB:
+  case Op_ReplicateS:
+  case Op_ReplicateI:
+  case Op_ReplicateL:
+  case Op_ReplicateF:
+  case Op_ReplicateD:
+    return true; // any of the six Replicate opcodes counts as invariant
+  }
+  return false; // all other nodes are treated as not invariant
+}
+
 // Return the vector version of a scalar operation node.
 VectorNode* VectorNode::make(Compile* C, int opc, Node* n1, Node* n2, uint vlen, BasicType bt) {
   const TypeVect* vt = TypeVect::make(bt, vlen);
@@ -160,6 +215,8 @@
   case Op_SubVF: return new (C, 3) SubVFNode(n1, n2, vt);
   case Op_SubVD: return new (C, 3) SubVDNode(n1, n2, vt);
 
+  case Op_MulVS: return new (C, 3) MulVSNode(n1, n2, vt);
+  case Op_MulVI: return new (C, 3) MulVINode(n1, n2, vt);
   case Op_MulVF: return new (C, 3) MulVFNode(n1, n2, vt);
   case Op_MulVD: return new (C, 3) MulVDNode(n1, n2, vt);
 
@@ -169,10 +226,17 @@
   case Op_LShiftVB: return new (C, 3) LShiftVBNode(n1, n2, vt);
   case Op_LShiftVS: return new (C, 3) LShiftVSNode(n1, n2, vt);
   case Op_LShiftVI: return new (C, 3) LShiftVINode(n1, n2, vt);
+  case Op_LShiftVL: return new (C, 3) LShiftVLNode(n1, n2, vt);
 
   case Op_RShiftVB: return new (C, 3) RShiftVBNode(n1, n2, vt);
   case Op_RShiftVS: return new (C, 3) RShiftVSNode(n1, n2, vt);
   case Op_RShiftVI: return new (C, 3) RShiftVINode(n1, n2, vt);
+  case Op_RShiftVL: return new (C, 3) RShiftVLNode(n1, n2, vt);
+
+  case Op_URShiftVB: return new (C, 3) URShiftVBNode(n1, n2, vt);
+  case Op_URShiftVS: return new (C, 3) URShiftVSNode(n1, n2, vt);
+  case Op_URShiftVI: return new (C, 3) URShiftVINode(n1, n2, vt);
+  case Op_URShiftVL: return new (C, 3) URShiftVLNode(n1, n2, vt);
 
   case Op_AndV: return new (C, 3) AndVNode(n1, n2, vt);
   case Op_OrV:  return new (C, 3) OrVNode (n1, n2, vt);
--- a/hotspot/src/share/vm/opto/vectornode.hpp	Wed Aug 15 16:49:38 2012 -0700
+++ b/hotspot/src/share/vm/opto/vectornode.hpp	Mon Aug 20 09:07:21 2012 -0700
@@ -46,6 +46,7 @@
 
   const TypeVect* vect_type() const { return type()->is_vect(); }
   uint length() const { return vect_type()->length(); } // Vector length
+  uint length_in_bytes() const { return vect_type()->length_in_bytes(); }
 
   virtual int Opcode() const;
 
@@ -57,7 +58,8 @@
 
   static int  opcode(int opc, uint vlen, BasicType bt);
   static bool implemented(int opc, uint vlen, BasicType bt);
-
+  static bool is_shift(Node* n);
+  static bool is_invariant_vector(Node* n);
 };
 
 //===========================Vector=ALU=Operations====================================
@@ -158,6 +160,22 @@
   virtual int Opcode() const;
 };
 
+//------------------------------MulVSNode---------------------------------------
+// Vector multiply shorts; VectorNode::opcode selects it for Op_MulI with T_CHAR/T_SHORT elements
+class MulVSNode : public VectorNode {
+ public:
+  MulVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} // forwards both inputs and the vector type to VectorNode
+  virtual int Opcode() const;
+};
+
+//------------------------------MulVINode---------------------------------------
+// Vector multiply ints; VectorNode::opcode selects it for Op_MulI with T_INT only if the matcher supports Op_MulVI
+class MulVINode : public VectorNode {
+ public:
+  MulVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} // forwards both inputs and the vector type to VectorNode
+  virtual int Opcode() const;
+};
+
 //------------------------------MulVFNode---------------------------------------
 // Vector multiply float
 class MulVFNode : public VectorNode {
@@ -191,7 +209,7 @@
 };
 
 //------------------------------LShiftVBNode---------------------------------------
-// Vector lshift byte
+// Vector left shift bytes
 class LShiftVBNode : public VectorNode {
  public:
   LShiftVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
@@ -199,7 +217,7 @@
 };
 
 //------------------------------LShiftVSNode---------------------------------------
-// Vector lshift shorts
+// Vector left shift shorts
 class LShiftVSNode : public VectorNode {
  public:
   LShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
@@ -207,39 +225,88 @@
 };
 
 //------------------------------LShiftVINode---------------------------------------
-// Vector lshift ints
+// Vector left shift ints
 class LShiftVINode : public VectorNode {
  public:
   LShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
-//------------------------------URShiftVBNode---------------------------------------
-// Vector urshift bytes
+//------------------------------LShiftVLNode---------------------------------------
+// Vector left shift longs; VectorNode::opcode selects it for Op_LShiftL
+class LShiftVLNode : public VectorNode {
+ public:
+  LShiftVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} // forwards both inputs and the vector type to VectorNode
+  virtual int Opcode() const;
+};
+
+//------------------------------RShiftVBNode---------------------------------------
+// Vector right arithmetic (signed) shift bytes
 class RShiftVBNode : public VectorNode {
  public:
   RShiftVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
-//------------------------------URShiftVSNode---------------------------------------
-// Vector urshift shorts
+//------------------------------RShiftVSNode---------------------------------------
+// Vector right arithmetic (signed) shift shorts
 class RShiftVSNode : public VectorNode {
  public:
   RShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
-//------------------------------URShiftVINode---------------------------------------
-// Vector urshift ints
+//------------------------------RShiftVINode---------------------------------------
+// Vector right arithmetic (signed) shift ints
 class RShiftVINode : public VectorNode {
  public:
   RShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
+//------------------------------RShiftVLNode---------------------------------------
+// Vector right arithmetic (signed) shift longs; VectorNode::opcode selects it for Op_RShiftL
+class RShiftVLNode : public VectorNode {
+ public:
+  RShiftVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} // forwards both inputs and the vector type to VectorNode
+  virtual int Opcode() const;
+};
+
+//------------------------------URShiftVBNode---------------------------------------
+// Vector right logical (unsigned) shift bytes; VectorNode::opcode selects it for Op_URShiftI with T_BOOLEAN/T_BYTE elements
+class URShiftVBNode : public VectorNode {
+ public:
+  URShiftVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} // forwards both inputs and the vector type to VectorNode
+  virtual int Opcode() const;
+};
+
+//------------------------------URShiftVSNode---------------------------------------
+// Vector right logical (unsigned) shift shorts; VectorNode::opcode selects it for Op_URShiftI with T_CHAR/T_SHORT elements
+class URShiftVSNode : public VectorNode {
+ public:
+  URShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} // forwards both inputs and the vector type to VectorNode
+  virtual int Opcode() const;
+};
+
+//------------------------------URShiftVINode---------------------------------------
+// Vector right logical (unsigned) shift ints; VectorNode::opcode selects it for Op_URShiftI with T_INT elements
+class URShiftVINode : public VectorNode {
+ public:
+  URShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} // forwards both inputs and the vector type to VectorNode
+  virtual int Opcode() const;
+};
+
+//------------------------------URShiftVLNode---------------------------------------
+// Vector right logical (unsigned) shift longs; VectorNode::opcode selects it for Op_URShiftL
+class URShiftVLNode : public VectorNode {
+ public:
+  URShiftVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} // forwards both inputs and the vector type to VectorNode
+  virtual int Opcode() const;
+};
+
+
 //------------------------------AndVNode---------------------------------------
-// Vector and
+// Vector and integer
 class AndVNode : public VectorNode {
  public:
   AndVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
@@ -247,7 +314,7 @@
 };
 
 //------------------------------OrVNode---------------------------------------
-// Vector or
+// Vector or integer
 class OrVNode : public VectorNode {
  public:
   OrVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
@@ -255,7 +322,7 @@
 };
 
 //------------------------------XorVNode---------------------------------------
-// Vector xor
+// Vector xor integer
 class XorVNode : public VectorNode {
  public:
   XorVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/6340864/TestByteVect.java	Mon Aug 20 09:07:21 2012 -0700
@@ -0,0 +1,1274 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 6340864
+ * @summary Implement vectorization optimizations in hotspot-server
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestByteVect
+ */
+
+public class TestByteVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int ADD_INIT = 0;
+  private static final int BIT_MASK = 0xB7;
+  private static final int VALUE = 3;
+  private static final int SHIFT = 8;
+
+  public static void main(String args[]) { // Entry point: runs test() once and reports PASSED or FAILED; args is not used.
+    System.out.println("Testing Byte vectors");
+    int errn = test(); // error count reported by test()
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97); // any error ends the JVM with exit status 97
+    }
+    System.out.println("PASSED"); // reached only when no errors were counted
+  }
+
+  static int test() {
+    byte[] a0 = new byte[ARRLEN];
+    byte[] a1 = new byte[ARRLEN];
+    byte[] a2 = new byte[ARRLEN];
+    byte[] a3 = new byte[ARRLEN];
+    byte[] a4 = new byte[ARRLEN];
+    short[] p2 = new short[ARRLEN/2];
+      int[] p4 = new   int[ARRLEN/4];
+     long[] p8 = new  long[ARRLEN/8];
+    // Initialize
+    int gold_sum = 0;
+    for (int i=0; i<ARRLEN; i++) {
+      byte val = (byte)(ADD_INIT+i);
+      gold_sum += val;
+      a1[i] = val;
+      a2[i] = (byte)VALUE;
+      a3[i] = (byte)-VALUE;
+      a4[i] = (byte)BIT_MASK;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+      test_addc(a0, a1);
+      test_addv(a0, a1, (byte)VALUE);
+      test_adda(a0, a1, a2);
+      test_subc(a0, a1);
+      test_subv(a0, a1, (byte)VALUE);
+      test_suba(a0, a1, a2);
+      test_mulc(a0, a1);
+      test_mulv(a0, a1, (byte)VALUE);
+      test_mula(a0, a1, a2);
+      test_divc(a0, a1);
+      test_divv(a0, a1, (byte)VALUE);
+      test_diva(a0, a1, a2);
+      test_mulc_n(a0, a1);
+      test_mulv(a0, a1, (byte)-VALUE);
+      test_mula(a0, a1, a3);
+      test_divc_n(a0, a1);
+      test_divv(a0, a1, (byte)-VALUE);
+      test_diva(a0, a1, a3);
+      test_andc(a0, a1);
+      test_andv(a0, a1, (byte)BIT_MASK);
+      test_anda(a0, a1, a4);
+      test_orc(a0, a1);
+      test_orv(a0, a1, (byte)BIT_MASK);
+      test_ora(a0, a1, a4);
+      test_xorc(a0, a1);
+      test_xorv(a0, a1, (byte)BIT_MASK);
+      test_xora(a0, a1, a4);
+      test_sllc(a0, a1);
+      test_sllv(a0, a1, VALUE);
+      test_srlc(a0, a1);
+      test_srlv(a0, a1, VALUE);
+      test_srac(a0, a1);
+      test_srav(a0, a1, VALUE);
+      test_sllc_n(a0, a1);
+      test_sllv(a0, a1, -VALUE);
+      test_srlc_n(a0, a1);
+      test_srlv(a0, a1, -VALUE);
+      test_srac_n(a0, a1);
+      test_srav(a0, a1, -VALUE);
+      test_sllc_o(a0, a1);
+      test_sllv(a0, a1, SHIFT);
+      test_srlc_o(a0, a1);
+      test_srlv(a0, a1, SHIFT);
+      test_srac_o(a0, a1);
+      test_srav(a0, a1, SHIFT);
+      test_sllc_on(a0, a1);
+      test_sllv(a0, a1, -SHIFT);
+      test_srlc_on(a0, a1);
+      test_srlv(a0, a1, -SHIFT);
+      test_srac_on(a0, a1);
+      test_srav(a0, a1, -SHIFT);
+      test_pack2(p2, a1);
+      test_unpack2(a0, p2);
+      test_pack2_swap(p2, a1);
+      test_unpack2_swap(a0, p2);
+      test_pack4(p4, a1);
+      test_unpack4(a0, p4);
+      test_pack4_swap(p4, a1);
+      test_unpack4_swap(a0, p4);
+      test_pack8(p8, a1);
+      test_unpack8(a0, p8);
+      test_pack8_swap(p8, a1);
+      test_unpack8_swap(a0, p8);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      int sum = test_sum(a1);
+      if (sum != gold_sum) {
+        System.err.println("test_sum:  " + sum + " != " + gold_sum);
+        errn++;
+      }
+
+      test_addc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)+VALUE));
+      }
+      test_addv(a0, a1, (byte)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)+VALUE));
+      }
+      test_adda(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_adda: ", i, a0[i], (byte)((byte)(ADD_INIT+i)+VALUE));
+      }
+
+      test_subc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)-VALUE));
+      }
+      test_subv(a0, a1, (byte)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)-VALUE));
+      }
+      test_suba(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_suba: ", i, a0[i], (byte)((byte)(ADD_INIT+i)-VALUE));
+      }
+
+      test_mulc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)*VALUE));
+      }
+      test_mulv(a0, a1, (byte)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)*VALUE));
+      }
+      test_mula(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula: ", i, a0[i], (byte)((byte)(ADD_INIT+i)*VALUE));
+      }
+
+      test_divc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)/VALUE));
+      }
+      test_divv(a0, a1, (byte)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)/VALUE));
+      }
+      test_diva(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva: ", i, a0[i], (byte)((byte)(ADD_INIT+i)/VALUE));
+      }
+
+      test_mulc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mulv(a0, a1, (byte)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mula(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)*(-VALUE)));
+      }
+
+      test_divc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_divv(a0, a1, (byte)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_diva(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)/(-VALUE)));
+      }
+
+      test_andc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_andv(a0, a1, (byte)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_anda(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_anda: ", i, a0[i], (byte)((byte)(ADD_INIT+i)&BIT_MASK));
+      }
+
+      test_orc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_orv(a0, a1, (byte)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_ora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ora: ", i, a0[i], (byte)((byte)(ADD_INIT+i)|BIT_MASK));
+      }
+
+      test_xorc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xorv(a0, a1, (byte)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xora: ", i, a0[i], (byte)((byte)(ADD_INIT+i)^BIT_MASK));
+      }
+
+      test_sllc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<VALUE));
+      }
+      test_sllv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<VALUE));
+      }
+
+      test_srlc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>VALUE));
+      }
+      test_srlv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>VALUE));
+      }
+
+      test_srac(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>VALUE));
+      }
+      test_srav(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>VALUE));
+      }
+
+      test_sllc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<(-VALUE)));
+      }
+      test_sllv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<(-VALUE)));
+      }
+
+      test_srlc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>(-VALUE)));
+      }
+      test_srlv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>(-VALUE)));
+      }
+
+      test_srac_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>(-VALUE)));
+      }
+      test_srav(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>(-VALUE)));
+      }
+
+      test_sllc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_o: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<SHIFT));
+      }
+      test_sllv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_o: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<SHIFT));
+      }
+
+      test_srlc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_o: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>SHIFT));
+      }
+      test_srlv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_o: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>SHIFT));
+      }
+
+      test_srac_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_o: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>SHIFT));
+      }
+      test_srav(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_o: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>SHIFT));
+      }
+
+      test_sllc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<(-SHIFT)));
+      }
+      test_sllv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<(-SHIFT)));
+      }
+
+      test_srlc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+      test_srlv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+
+      test_srac_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>(-SHIFT)));
+      }
+      test_srav(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>(-SHIFT)));
+      }
+
+      test_pack2(p2, a1);
+      for (int i=0; i<ARRLEN/2; i++) {
+        errn += verify("test_pack2: ", i, p2[i], (short)(((short)(ADD_INIT+2*i) & 0xFF) | ((short)(ADD_INIT+2*i+1) << 8)));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack2(a0, p2);
+      for (int i=0; i<(ARRLEN&(-2)); i++) {
+        errn += verify("test_unpack2: ", i, a0[i], (byte)(ADD_INIT+i));
+      }
+
+      test_pack2_swap(p2, a1);
+      for (int i=0; i<ARRLEN/2; i++) {
+        errn += verify("test_pack2_swap: ", i, p2[i], (short)(((short)(ADD_INIT+2*i+1) & 0xFF) | ((short)(ADD_INIT+2*i) << 8)));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack2_swap(a0, p2);
+      for (int i=0; i<(ARRLEN&(-2)); i++) {
+        errn += verify("test_unpack2_swap: ", i, a0[i], (byte)(ADD_INIT+i));
+      }
+
+      test_pack4(p4, a1);
+      for (int i=0; i<ARRLEN/4; i++) {
+        errn += verify("test_pack4: ", i, p4[i],  ((int)(ADD_INIT+4*i+0) & 0xFF) |
+                                                 (((int)(ADD_INIT+4*i+1) & 0xFF) <<  8)  |
+                                                 (((int)(ADD_INIT+4*i+2) & 0xFF) << 16)  |
+                                                 (((int)(ADD_INIT+4*i+3) & 0xFF) << 24));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack4(a0, p4);
+      for (int i=0; i<(ARRLEN&(-4)); i++) {
+        errn += verify("test_unpack4: ", i, a0[i], (byte)(ADD_INIT+i));
+      }
+
+      test_pack4_swap(p4, a1);
+      for (int i=0; i<ARRLEN/4; i++) {
+        errn += verify("test_pack4_swap: ", i, p4[i],  ((int)(ADD_INIT+4*i+3) & 0xFF) |
+                                                      (((int)(ADD_INIT+4*i+2) & 0xFF) <<  8)  |
+                                                      (((int)(ADD_INIT+4*i+1) & 0xFF) << 16)  |
+                                                      (((int)(ADD_INIT+4*i+0) & 0xFF) << 24));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack4_swap(a0, p4);
+      for (int i=0; i<(ARRLEN&(-4)); i++) {
+        errn += verify("test_unpack4_swap: ", i, a0[i], (byte)(ADD_INIT+i));
+      }
+
+      test_pack8(p8, a1);
+      for (int i=0; i<ARRLEN/8; i++) {
+        errn += verify("test_pack8: ", i, p8[i],  ((long)(ADD_INIT+8*i+0) & 0xFFl) |
+                                                 (((long)(ADD_INIT+8*i+1) & 0xFFl) <<  8)  |
+                                                 (((long)(ADD_INIT+8*i+2) & 0xFFl) << 16)  |
+                                                 (((long)(ADD_INIT+8*i+3) & 0xFFl) << 24)  |
+                                                 (((long)(ADD_INIT+8*i+4) & 0xFFl) << 32)  |
+                                                 (((long)(ADD_INIT+8*i+5) & 0xFFl) << 40)  |
+                                                 (((long)(ADD_INIT+8*i+6) & 0xFFl) << 48)  |
+                                                 (((long)(ADD_INIT+8*i+7) & 0xFFl) << 56));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack8(a0, p8);
+      for (int i=0; i<(ARRLEN&(-8)); i++) {
+        errn += verify("test_unpack8: ", i, a0[i], (byte)(ADD_INIT+i));
+      }
+
+      test_pack8_swap(p8, a1);
+      for (int i=0; i<ARRLEN/8; i++) {
+        errn += verify("test_pack8_swap: ", i, p8[i],  ((long)(ADD_INIT+8*i+7) & 0xFFl) |
+                                                      (((long)(ADD_INIT+8*i+6) & 0xFFl) <<  8)  |
+                                                      (((long)(ADD_INIT+8*i+5) & 0xFFl) << 16)  |
+                                                      (((long)(ADD_INIT+8*i+4) & 0xFFl) << 24)  |
+                                                      (((long)(ADD_INIT+8*i+3) & 0xFFl) << 32)  |
+                                                      (((long)(ADD_INIT+8*i+2) & 0xFFl) << 40)  |
+                                                      (((long)(ADD_INIT+8*i+1) & 0xFFl) << 48)  |
+                                                      (((long)(ADD_INIT+8*i+0) & 0xFFl) << 56));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack8_swap(a0, p8);
+      for (int i=0; i<(ARRLEN&(-8)); i++) {
+        errn += verify("test_unpack8_swap: ", i, a0[i], (byte)(ADD_INIT+i));
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sum: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addv(a0, a1, (byte)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_adda(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_adda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subv(a0, a1, (byte)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_suba(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_suba: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (byte)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (byte)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (byte)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (byte)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andv(a0, a1, (byte)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_anda(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_anda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orv(a0, a1, (byte)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorv(a0, a1, (byte)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack2(p2, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack2: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack2(a0, p2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack2: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack2_swap(p2, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack2_swap: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack2_swap(a0, p2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack2_swap: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack4(p4, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack4: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack4(a0, p4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack4: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack4_swap(p4, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack4_swap: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack4_swap(a0, p4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack4_swap: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack8(p8, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack8: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack8(a0, p8);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack8: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack8_swap(p8, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack8_swap: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack8_swap(a0, p8);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack8_swap: " + (end - start));
+
+    return errn;
+  }
+
+  // Test kernel: returns the sum of all elements of a1.
+  // Each byte is widened to int before being added, so the total is an int.
+  static int test_sum(byte[] a1) {
+    int sum = 0;
+    for (int i = 0; i < a1.length; i+=1) {
+      sum += a1[i];
+    }
+    return sum;
+  }
+
+  // Test kernel: a0[i] = (byte)(a1[i] + VALUE), with the class constant VALUE
+  // as the addend. The cast narrows the promoted sum back to byte, so it wraps.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_addc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]+VALUE);
+    }
+  }
+  // Test kernel: a0[i] = (byte)(a1[i] + b), where the addend b is a runtime
+  // argument rather than a constant. The result wraps to byte.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_addv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]+b);
+    }
+  }
+  // Test kernel: element-wise a0[i] = (byte)(a1[i] + a2[i]); the result wraps
+  // to byte. Iterates over a0.length; a1 and a2 must be at least that long.
+  static void test_adda(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]+a2[i]);
+    }
+  }
+
+  // Test kernel: a0[i] = (byte)(a1[i] - VALUE), with the class constant VALUE
+  // as the subtrahend. The result wraps to byte.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_subc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]-VALUE);
+    }
+  }
+  // Test kernel: a0[i] = (byte)(a1[i] - b), where the subtrahend b is a
+  // runtime argument. The result wraps to byte.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_subv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]-b);
+    }
+  }
+  // Test kernel: element-wise a0[i] = (byte)(a1[i] - a2[i]); the result wraps
+  // to byte. Iterates over a0.length; a1 and a2 must be at least that long.
+  static void test_suba(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]-a2[i]);
+    }
+  }
+
+  // Test kernel: a0[i] = (byte)(a1[i] * VALUE), multiplying by the class
+  // constant VALUE. Only the low 8 bits of the product are kept.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_mulc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]*VALUE);
+    }
+  }
+  // Test kernel: same as the constant multiply, but with the negated constant:
+  // a0[i] = (byte)(a1[i] * (-VALUE)). Only the low 8 bits of the product are kept.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_mulc_n(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]*(-VALUE));
+    }
+  }
+  // Test kernel: a0[i] = (byte)(a1[i] * b), where the multiplier b is a
+  // runtime argument. Only the low 8 bits of the product are kept.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_mulv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]*b);
+    }
+  }
+  // Test kernel: element-wise a0[i] = (byte)(a1[i] * a2[i]); only the low
+  // 8 bits of each product are kept.
+  // Iterates over a0.length; a1 and a2 must be at least that long.
+  static void test_mula(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]*a2[i]);
+    }
+  }
+
+  // Test kernel: a0[i] = (byte)(a1[i] / VALUE), dividing by the class constant
+  // VALUE. Java integer division truncates toward zero.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_divc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]/VALUE);
+    }
+  }
+  // Test kernel: same as the constant divide, but with the negated constant:
+  // a0[i] = (byte)(a1[i] / (-VALUE)). Java integer division truncates toward zero.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_divc_n(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]/(-VALUE));
+    }
+  }
+  // Test kernel: a0[i] = (byte)(a1[i] / b), where the divisor b is a runtime
+  // argument. Throws ArithmeticException if b is 0 and a0 is non-empty.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_divv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]/b);
+    }
+  }
+  // Test kernel: element-wise a0[i] = (byte)(a1[i] / a2[i]).
+  // Throws ArithmeticException at the first index where a2[i] is 0.
+  // Iterates over a0.length; a1 and a2 must be at least that long.
+  static void test_diva(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]/a2[i]);
+    }
+  }
+
+  // Test kernel: a0[i] = (byte)(a1[i] & BIT_MASK), masking with the class
+  // constant BIT_MASK; the cast keeps the low 8 bits of the result.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_andc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]&BIT_MASK);
+    }
+  }
+  // Test kernel: a0[i] = (byte)(a1[i] & b), where the mask b is a runtime
+  // argument. Iterates over a0.length; a1 must be at least that long.
+  static void test_andv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]&b);
+    }
+  }
+  // Test kernel: element-wise bitwise AND, a0[i] = (byte)(a1[i] & a2[i]).
+  // Iterates over a0.length; a1 and a2 must be at least that long.
+  static void test_anda(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]&a2[i]);
+    }
+  }
+
+  // Test kernel: a0[i] = (byte)(a1[i] | BIT_MASK), OR-ing with the class
+  // constant BIT_MASK; the cast keeps the low 8 bits of the result.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_orc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]|BIT_MASK);
+    }
+  }
+  // Test kernel: a0[i] = (byte)(a1[i] | b), where the mask b is a runtime
+  // argument. Iterates over a0.length; a1 must be at least that long.
+  static void test_orv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]|b);
+    }
+  }
+  // Test kernel: element-wise bitwise OR, a0[i] = (byte)(a1[i] | a2[i]).
+  // Iterates over a0.length; a1 and a2 must be at least that long.
+  static void test_ora(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]|a2[i]);
+    }
+  }
+
+  // Test kernel: a0[i] = (byte)(a1[i] ^ BIT_MASK), XOR-ing with the class
+  // constant BIT_MASK; the cast keeps the low 8 bits of the result.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_xorc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]^BIT_MASK);
+    }
+  }
+  // Test kernel: a0[i] = (byte)(a1[i] ^ b), where the mask b is a runtime
+  // argument. Iterates over a0.length; a1 must be at least that long.
+  static void test_xorv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]^b);
+    }
+  }
+  // Test kernel: element-wise bitwise XOR, a0[i] = (byte)(a1[i] ^ a2[i]).
+  // Iterates over a0.length; a1 and a2 must be at least that long.
+  static void test_xora(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]^a2[i]);
+    }
+  }
+
+  // Test kernel: left shift by the class constant, a0[i] = (byte)(a1[i] << VALUE).
+  // The byte is promoted before shifting and the cast keeps the low 8 bits.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_sllc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]<<VALUE);
+    }
+  }
+  // Test kernel: left shift by the negated constant, a0[i] = (byte)(a1[i] << (-VALUE)).
+  // Java does not shift "the other way" for a negative distance: the promoted
+  // int operand is shifted by the low five bits of the distance.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_sllc_n(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]<<(-VALUE));
+    }
+  }
+  // Test kernel: left shift by the class constant SHIFT, a0[i] = (byte)(a1[i] << SHIFT).
+  // NOTE(review): SHIFT is declared outside this view; the "_o" suffix presumably
+  // marks an out-of-range shift distance - confirm against the declaration.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_sllc_o(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]<<SHIFT);
+    }
+  }
+  // Test kernel: left shift by the negated SHIFT constant,
+  // a0[i] = (byte)(a1[i] << (-SHIFT)). Only the low five bits of the distance
+  // are used when shifting the promoted int operand.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_sllc_on(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]<<(-SHIFT));
+    }
+  }
+  // Test kernel: left shift by a runtime distance, a0[i] = (byte)(a1[i] << b).
+  // b may be negative or larger than 31; only its low five bits are used
+  // because the shifted operand is an int. The cast keeps the low 8 bits.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_sllv(byte[] a0, byte[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]<<b);
+    }
+  }
+
+  // Test kernel: unsigned right shift by the class constant,
+  // a0[i] = (byte)(a1[i] >>> VALUE). The shift is applied to the sign-extended
+  // promoted value, not to an 8-bit quantity, so for negative bytes the bits
+  // that reach the low byte may still be copies of the sign bit.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_srlc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>>VALUE);
+    }
+  }
+  // Test kernel: unsigned right shift by the negated constant,
+  // a0[i] = (byte)(a1[i] >>> (-VALUE)). The promoted int operand is shifted by
+  // the low five bits of the distance; the cast keeps the low 8 bits.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_srlc_n(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>>(-VALUE));
+    }
+  }
+  // Test kernel: unsigned right shift by the class constant SHIFT,
+  // a0[i] = (byte)(a1[i] >>> SHIFT).
+  // NOTE(review): SHIFT is declared outside this view; the "_o" suffix presumably
+  // marks an out-of-range shift distance - confirm against the declaration.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_srlc_o(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>>SHIFT);
+    }
+  }
+  // Test kernel: unsigned right shift by the negated SHIFT constant,
+  // a0[i] = (byte)(a1[i] >>> (-SHIFT)). Only the low five bits of the distance
+  // are used when shifting the promoted int operand.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_srlc_on(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>>(-SHIFT));
+    }
+  }
+  // Test kernel: unsigned right shift by a runtime distance,
+  // a0[i] = (byte)(a1[i] >>> b). The byte is sign-extended to int first, and
+  // only the low five bits of b are used; the cast keeps the low 8 bits.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_srlv(byte[] a0, byte[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>>b);
+    }
+  }
+
+  // Test kernel: arithmetic (sign-propagating) right shift by the class
+  // constant, a0[i] = (byte)(a1[i] >> VALUE).
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_srac(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>VALUE);
+    }
+  }
+  // Test kernel: arithmetic right shift by the negated constant,
+  // a0[i] = (byte)(a1[i] >> (-VALUE)). The promoted int operand is shifted by
+  // the low five bits of the distance.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_srac_n(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>(-VALUE));
+    }
+  }
+  // Test kernel: arithmetic right shift by the class constant SHIFT,
+  // a0[i] = (byte)(a1[i] >> SHIFT).
+  // NOTE(review): SHIFT is declared outside this view; the "_o" suffix presumably
+  // marks an out-of-range shift distance - confirm against the declaration.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_srac_o(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>SHIFT);
+    }
+  }
+  // Test kernel: arithmetic right shift by the negated SHIFT constant,
+  // a0[i] = (byte)(a1[i] >> (-SHIFT)). Only the low five bits of the distance
+  // are used when shifting the promoted int operand.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_srac_on(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>(-SHIFT));
+    }
+  }
+  // Test kernel: arithmetic right shift by a runtime distance,
+  // a0[i] = (byte)(a1[i] >> b). Only the low five bits of b are used because
+  // the shifted operand is an int.
+  // Iterates over a0.length; a1 must be at least that long.
+  static void test_srav(byte[] a0, byte[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>b);
+    }
+  }
+
+  // Test kernel: packs consecutive byte pairs of a1 into shorts. For each i,
+  // a1[2*i] becomes the low byte and a1[2*i+1] the high byte of p2[i].
+  // Does nothing if a1 is too short to fill every element of p2.
+  static void test_pack2(short[] p2, byte[] a1) {
+    if (p2.length*2 > a1.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      short l0 = (short)a1[i*2+0];
+      short l1 = (short)a1[i*2+1];
+      // Mask the low byte so its sign extension cannot leak into the high byte.
+      p2[i] = (short)((l1 << 8) | (l0 & 0xFF));
+    }
+  }
+  // Test kernel: splits each short of p2 into two bytes of a0. The low byte
+  // of p2[i] goes to a0[2*i] and the high byte to a0[2*i+1].
+  // Does nothing if a0 is too short to hold two bytes per element of p2.
+  static void test_unpack2(byte[] a0, short[] p2) {
+    if (p2.length*2 > a0.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      short l = p2[i];
+      a0[i*2+0] = (byte)(l & 0xFF);
+      a0[i*2+1] = (byte)(l >> 8);
+    }
+  }
+  // Test kernel: packs consecutive byte pairs of a1 into shorts in swapped
+  // order. For each i, a1[2*i] becomes the high byte and a1[2*i+1] the low
+  // byte of p2[i]. Does nothing if a1 is too short to fill every element of p2.
+  static void test_pack2_swap(short[] p2, byte[] a1) {
+    if (p2.length*2 > a1.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      short l0 = (short)a1[i*2+0];
+      short l1 = (short)a1[i*2+1];
+      // Mask the low byte so its sign extension cannot leak into the high byte.
+      p2[i] = (short)((l0 << 8) | (l1 & 0xFF));
+    }
+  }
+  // Test kernel: splits each short of p2 into two bytes of a0 in swapped
+  // order. The high byte of p2[i] goes to a0[2*i] and the low byte to a0[2*i+1].
+  // Does nothing if a0 is too short to hold two bytes per element of p2.
+  static void test_unpack2_swap(byte[] a0, short[] p2) {
+    if (p2.length*2 > a0.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      short l = p2[i];
+      a0[i*2+0] = (byte)(l >> 8);
+      a0[i*2+1] = (byte)(l & 0xFF);
+    }
+  }
+
+  // Test kernel: packs each group of four bytes of a1 into one int. a1[4*i]
+  // becomes the least significant byte of p4[i] and a1[4*i+3] the most
+  // significant. Does nothing if a1 is too short to fill every element of p4.
+  static void test_pack4(int[] p4, byte[] a1) {
+    if (p4.length*4 > a1.length) return;
+    for (int i = 0; i < p4.length; i+=1) {
+      int l0 = (int)a1[i*4+0];
+      int l1 = (int)a1[i*4+1];
+      int l2 = (int)a1[i*4+2];
+      int l3 = (int)a1[i*4+3];
+      // Each byte is masked to 8 bits to drop sign extension before shifting.
+      p4[i] = (l0 & 0xFF) |
+             ((l1 & 0xFF) <<  8) |
+             ((l2 & 0xFF) << 16) |
+             ((l3 & 0xFF) << 24);
+    }
+  }
+  // Test kernel: splits each int of p4 into four bytes of a0, least
+  // significant byte first (a0[4*i]) and most significant last (a0[4*i+3]).
+  // Does nothing if a0 is too short to hold four bytes per element of p4.
+  static void test_unpack4(byte[] a0, int[] p4) {
+    if (p4.length*4 > a0.length) return;
+    for (int i = 0; i < p4.length; i+=1) {
+      int l = p4[i];
+      a0[i*4+0] = (byte)(l & 0xFF);
+      a0[i*4+1] = (byte)(l >>  8);
+      a0[i*4+2] = (byte)(l >> 16);
+      a0[i*4+3] = (byte)(l >> 24);
+    }
+  }
+  // Test kernel: packs each group of four bytes of a1 into one int in reversed
+  // byte order. a1[4*i] becomes the most significant byte of p4[i] and
+  // a1[4*i+3] the least significant.
+  // Does nothing if a1 is too short to fill every element of p4.
+  static void test_pack4_swap(int[] p4, byte[] a1) {
+    if (p4.length*4 > a1.length) return;
+    for (int i = 0; i < p4.length; i+=1) {
+      int l0 = (int)a1[i*4+0];
+      int l1 = (int)a1[i*4+1];
+      int l2 = (int)a1[i*4+2];
+      int l3 = (int)a1[i*4+3];
+      // Each byte is masked to 8 bits to drop sign extension before shifting.
+      p4[i] = (l3 & 0xFF) |
+             ((l2 & 0xFF) <<  8) |
+             ((l1 & 0xFF) << 16) |
+             ((l0 & 0xFF) << 24);
+    }
+  }
+  // Test kernel: splits each int of p4 into four bytes of a0 in reversed byte
+  // order, most significant byte first (a0[4*i]) and least significant last
+  // (a0[4*i+3]). Does nothing if a0 is too short to hold four bytes per
+  // element of p4.
+  static void test_unpack4_swap(byte[] a0, int[] p4) {
+    if (p4.length*4 > a0.length) return;
+    for (int i = 0; i < p4.length; i+=1) {
+      int l = p4[i];
+      a0[i*4+0] = (byte)(l >> 24);
+      a0[i*4+1] = (byte)(l >> 16);
+      a0[i*4+2] = (byte)(l >>  8);
+      a0[i*4+3] = (byte)(l & 0xFF);
+    }
+  }
+
+  // Test kernel: packs each group of eight bytes of a1 into one long. a1[8*i]
+  // becomes the least significant byte of p8[i] and a1[8*i+7] the most
+  // significant. Does nothing if a1 is too short to fill every element of p8.
+  static void test_pack8(long[] p8, byte[] a1) {
+    if (p8.length*8 > a1.length) return;
+    for (int i = 0; i < p8.length; i+=1) {
+      long l0 = (long)a1[i*8+0];
+      long l1 = (long)a1[i*8+1];
+      long l2 = (long)a1[i*8+2];
+      long l3 = (long)a1[i*8+3];
+      long l4 = (long)a1[i*8+4];
+      long l5 = (long)a1[i*8+5];
+      long l6 = (long)a1[i*8+6];
+      long l7 = (long)a1[i*8+7];
+      // Each byte is masked to 8 bits to drop sign extension before shifting.
+      p8[i] = (l0 & 0xFFl) |
+             ((l1 & 0xFFl) <<  8) |
+             ((l2 & 0xFFl) << 16) |
+             ((l3 & 0xFFl) << 24) |
+             ((l4 & 0xFFl) << 32) |
+             ((l5 & 0xFFl) << 40) |
+             ((l6 & 0xFFl) << 48) |
+             ((l7 & 0xFFl) << 56);
+    }
+  }
+  // Test kernel: splits each long of p8 into eight bytes of a0, least
+  // significant byte first (a0[8*i]) and most significant last (a0[8*i+7]).
+  // Does nothing if a0 is too short to hold eight bytes per element of p8.
+  static void test_unpack8(byte[] a0, long[] p8) {
+    if (p8.length*8 > a0.length) return;
+    for (int i = 0; i < p8.length; i+=1) {
+      long l = p8[i];
+      a0[i*8+0] = (byte)(l & 0xFFl);
+      a0[i*8+1] = (byte)(l >>  8);
+      a0[i*8+2] = (byte)(l >> 16);
+      a0[i*8+3] = (byte)(l >> 24);
+      a0[i*8+4] = (byte)(l >> 32);
+      a0[i*8+5] = (byte)(l >> 40);
+      a0[i*8+6] = (byte)(l >> 48);
+      a0[i*8+7] = (byte)(l >> 56);
+    }
+  }
+  // Test kernel: packs each group of eight bytes of a1 into one long in
+  // reversed byte order. a1[8*i] becomes the most significant byte of p8[i]
+  // and a1[8*i+7] the least significant.
+  // Does nothing if a1 is too short to fill every element of p8.
+  static void test_pack8_swap(long[] p8, byte[] a1) {
+    if (p8.length*8 > a1.length) return;
+    for (int i = 0; i < p8.length; i+=1) {
+      long l0 = (long)a1[i*8+0];
+      long l1 = (long)a1[i*8+1];
+      long l2 = (long)a1[i*8+2];
+      long l3 = (long)a1[i*8+3];
+      long l4 = (long)a1[i*8+4];
+      long l5 = (long)a1[i*8+5];
+      long l6 = (long)a1[i*8+6];
+      long l7 = (long)a1[i*8+7];
+      // Each byte is masked to 8 bits to drop sign extension before shifting.
+      p8[i] = (l7 & 0xFFl) |
+             ((l6 & 0xFFl) <<  8) |
+             ((l5 & 0xFFl) << 16) |
+             ((l4 & 0xFFl) << 24) |
+             ((l3 & 0xFFl) << 32) |
+             ((l2 & 0xFFl) << 40) |
+             ((l1 & 0xFFl) << 48) |
+             ((l0 & 0xFFl) << 56);
+    }
+  }
+  // Test kernel: splits each long of p8 into eight bytes of a0 in reversed
+  // byte order, most significant byte first (a0[8*i]) and least significant
+  // last (a0[8*i+7]). Does nothing if a0 is too short to hold eight bytes per
+  // element of p8.
+  static void test_unpack8_swap(byte[] a0, long[] p8) {
+    if (p8.length*8 > a0.length) return;
+    for (int i = 0; i < p8.length; i+=1) {
+      long l = p8[i];
+      a0[i*8+0] = (byte)(l >> 56);
+      a0[i*8+1] = (byte)(l >> 48);
+      a0[i*8+2] = (byte)(l >> 40);
+      a0[i*8+3] = (byte)(l >> 32);
+      a0[i*8+4] = (byte)(l >> 24);
+      a0[i*8+5] = (byte)(l >> 16);
+      a0[i*8+6] = (byte)(l >>  8);
+      a0[i*8+7] = (byte)(l & 0xFFl);
+    }
+  }
+
+  // Compares one computed byte against its expected value.
+  // Returns 0 on a match; otherwise reports "<text>[<i>] = <elem> != <val>"
+  // on stderr and returns 1, so callers can sum the results into an error count.
+  static int verify(String text, int i, byte elem, byte val) {
+    if (elem == val) {
+      return 0;
+    }
+    String report = text + "[" + i + "] = " + elem + " != " + val;
+    System.err.println(report);
+    return 1;
+  }
+
+  // Compares one computed short against its expected value.
+  // Returns 0 on a match; otherwise reports "<text>[<i>] = <elem> != <val>"
+  // on stderr and returns 1, so callers can sum the results into an error count.
+  static int verify(String text, int i, short elem, short val) {
+    if (elem == val) {
+      return 0;
+    }
+    String report = text + "[" + i + "] = " + elem + " != " + val;
+    System.err.println(report);
+    return 1;
+  }
+
+  // Compares one computed int against its expected value.
+  // Returns 0 on a match; otherwise reports both values in hexadecimal on
+  // stderr and returns 1, so callers can sum the results into an error count.
+  static int verify(String text, int i, int elem, int val) {
+    if (elem == val) {
+      return 0;
+    }
+    String report = text + "[" + i + "] = " + Integer.toHexString(elem) + " != " + Integer.toHexString(val);
+    System.err.println(report);
+    return 1;
+  }
+
+  // Compares one computed long against its expected value.
+  // Returns 0 on a match; otherwise reports both values in hexadecimal on
+  // stderr and returns 1, so callers can sum the results into an error count.
+  static int verify(String text, int i, long elem, long val) {
+    if (elem == val) {
+      return 0;
+    }
+    String report = text + "[" + i + "] = " + Long.toHexString(elem) + " != " + Long.toHexString(val);
+    System.err.println(report);
+    return 1;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/6340864/TestDoubleVect.java	Mon Aug 20 09:07:21 2012 -0700
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 6340864
+ * @summary Implement vectorization optimizations in hotspot-server
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestDoubleVect
+ */
+
+public class TestDoubleVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final double ADD_INIT = -7500.;
+  private static final double VALUE = 15.;
+
+  // Entry point: runs the whole suite via test(). Prints "PASSED" when no check
+  // failed; otherwise reports the failure count and exits with status 97.
+  public static void main(String args[]) {
+    System.out.println("Testing Double vectors");
+    final int failures = test();
+    if (failures <= 0) { System.out.println("PASSED"); return; }
+    System.err.println("FAILED: " + failures + " errors");
+    System.exit(97);
+  }
+
+  static int test() { // Runs warmup, verification and timing of all kernels; returns the number of failed checks.
+    double[] a0 = new double[ARRLEN]; // output array, overwritten by every kernel call
+    double[] a1 = new double[ARRLEN]; // first operand
+    double[] a2 = new double[ARRLEN]; // second operand of the array-array kernels
+    double[] a3 = new double[ARRLEN]; // negated counterpart of a2
+    // Initialize: a1[i] = ADD_INIT+i, a2[i] = VALUE, a3[i] = -VALUE; gold_sum is the expected sum of a1
+    double gold_sum = 0;
+    for (int i=0; i<ARRLEN; i++) {
+      double val = ADD_INIT+(double)i;
+      gold_sum += val;
+      a1[i] = val;
+      a2[i] = VALUE;
+      a3[i] = -VALUE;
+    }
+
+    System.out.println("Warmup"); // ITERS calls per kernel before anything is checked; presumably enough to trigger JIT compilation - TODO confirm
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+      test_addc(a0, a1);
+      test_addv(a0, a1, VALUE);
+      test_adda(a0, a1, a2);
+      test_subc(a0, a1);
+      test_subv(a0, a1, VALUE);
+      test_suba(a0, a1, a2);
+      test_mulc(a0, a1);
+      test_mulv(a0, a1, VALUE);
+      test_mula(a0, a1, a2);
+      test_divc(a0, a1);
+      test_divv(a0, a1, VALUE);
+      test_diva(a0, a1, a2);
+      test_mulc_n(a0, a1);
+      test_mulv(a0, a1, -VALUE);
+      test_mula(a0, a1, a3);
+      test_divc_n(a0, a1);
+      test_divv(a0, a1, -VALUE);
+      test_diva(a0, a1, a3);
+    }
+    // Test and verify results: each kernel is run once more and a0 is compared element by element
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      double sum = test_sum(a1);
+      if (sum != gold_sum) {
+        System.err.println("test_sum:  " + sum + " != " + gold_sum);
+        errn++;
+      }
+      // Overwrite a1[0..5] with special values: NaN, +/-Infinity, MAX_VALUE, MIN_VALUE, MIN_NORMAL
+      a1[0] = Double.NaN;
+      a1[1] = Double.POSITIVE_INFINITY;
+      a1[2] = Double.NEGATIVE_INFINITY;
+      a1[3] = Double.MAX_VALUE;
+      a1[4] = Double.MIN_VALUE;
+      a1[5] = Double.MIN_NORMAL;
+
+      a2[6] = a1[0]; // a2[6..11] receive the same special values, so they also occur as second operand
+      a2[7] = a1[1];
+      a2[8] = a1[2];
+      a2[9] = a1[3];
+      a2[10] = a1[4];
+      a2[11] = a1[5];
+
+      a3[6] = -a2[6]; // a3[6..11] hold the negated special values
+      a3[7] = -a2[7];
+      a3[8] = -a2[8];
+      a3[9] = -a2[9];
+      a3[10] = -a2[10];
+      a3[11] = -a2[11];
+
+      test_addc(a0, a1); // expected values below are recomputed with the same scalar expression
+      errn += verify("test_addc: ", 0, a0[0], (Double.NaN+VALUE));
+      errn += verify("test_addc: ", 1, a0[1], (Double.POSITIVE_INFINITY+VALUE));
+      errn += verify("test_addc: ", 2, a0[2], (Double.NEGATIVE_INFINITY+VALUE));
+      errn += verify("test_addc: ", 3, a0[3], (Double.MAX_VALUE+VALUE));
+      errn += verify("test_addc: ", 4, a0[4], (Double.MIN_VALUE+VALUE));
+      errn += verify("test_addc: ", 5, a0[5], (Double.MIN_NORMAL+VALUE));
+      for (int i=6; i<ARRLEN; i++) { // a1[6..] still holds ADD_INIT+i
+        errn += verify("test_addc: ", i, a0[i], ((ADD_INIT+i)+VALUE));
+      }
+      test_addv(a0, a1, VALUE);
+      errn += verify("test_addv: ", 0, a0[0], (Double.NaN+VALUE));
+      errn += verify("test_addv: ", 1, a0[1], (Double.POSITIVE_INFINITY+VALUE));
+      errn += verify("test_addv: ", 2, a0[2], (Double.NEGATIVE_INFINITY+VALUE));
+      errn += verify("test_addv: ", 3, a0[3], (Double.MAX_VALUE+VALUE));
+      errn += verify("test_addv: ", 4, a0[4], (Double.MIN_VALUE+VALUE));
+      errn += verify("test_addv: ", 5, a0[5], (Double.MIN_NORMAL+VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], ((ADD_INIT+i)+VALUE));
+      }
+      test_adda(a0, a1, a2);
+      errn += verify("test_adda: ", 0, a0[0], (Double.NaN+VALUE));
+      errn += verify("test_adda: ", 1, a0[1], (Double.POSITIVE_INFINITY+VALUE));
+      errn += verify("test_adda: ", 2, a0[2], (Double.NEGATIVE_INFINITY+VALUE));
+      errn += verify("test_adda: ", 3, a0[3], (Double.MAX_VALUE+VALUE));
+      errn += verify("test_adda: ", 4, a0[4], (Double.MIN_VALUE+VALUE));
+      errn += verify("test_adda: ", 5, a0[5], (Double.MIN_NORMAL+VALUE));
+      errn += verify("test_adda: ", 6, a0[6], ((ADD_INIT+6)+Double.NaN)); // indices 6..11: the special value is the second operand (from a2)
+      errn += verify("test_adda: ", 7, a0[7], ((ADD_INIT+7)+Double.POSITIVE_INFINITY));
+      errn += verify("test_adda: ", 8, a0[8], ((ADD_INIT+8)+Double.NEGATIVE_INFINITY));
+      errn += verify("test_adda: ", 9, a0[9], ((ADD_INIT+9)+Double.MAX_VALUE));
+      errn += verify("test_adda: ", 10, a0[10], ((ADD_INIT+10)+Double.MIN_VALUE));
+      errn += verify("test_adda: ", 11, a0[11], ((ADD_INIT+11)+Double.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_adda: ", i, a0[i], ((ADD_INIT+i)+VALUE));
+      }
+
+      test_subc(a0, a1);
+      errn += verify("test_subc: ", 0, a0[0], (Double.NaN-VALUE));
+      errn += verify("test_subc: ", 1, a0[1], (Double.POSITIVE_INFINITY-VALUE));
+      errn += verify("test_subc: ", 2, a0[2], (Double.NEGATIVE_INFINITY-VALUE));
+      errn += verify("test_subc: ", 3, a0[3], (Double.MAX_VALUE-VALUE));
+      errn += verify("test_subc: ", 4, a0[4], (Double.MIN_VALUE-VALUE));
+      errn += verify("test_subc: ", 5, a0[5], (Double.MIN_NORMAL-VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_subc: ", i, a0[i], ((ADD_INIT+i)-VALUE));
+      }
+      test_subv(a0, a1, VALUE);
+      errn += verify("test_subv: ", 0, a0[0], (Double.NaN-VALUE));
+      errn += verify("test_subv: ", 1, a0[1], (Double.POSITIVE_INFINITY-VALUE));
+      errn += verify("test_subv: ", 2, a0[2], (Double.NEGATIVE_INFINITY-VALUE));
+      errn += verify("test_subv: ", 3, a0[3], (Double.MAX_VALUE-VALUE));
+      errn += verify("test_subv: ", 4, a0[4], (Double.MIN_VALUE-VALUE));
+      errn += verify("test_subv: ", 5, a0[5], (Double.MIN_NORMAL-VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_subv: ", i, a0[i], ((ADD_INIT+i)-VALUE));
+      }
+      test_suba(a0, a1, a2);
+      errn += verify("test_suba: ", 0, a0[0], (Double.NaN-VALUE));
+      errn += verify("test_suba: ", 1, a0[1], (Double.POSITIVE_INFINITY-VALUE));
+      errn += verify("test_suba: ", 2, a0[2], (Double.NEGATIVE_INFINITY-VALUE));
+      errn += verify("test_suba: ", 3, a0[3], (Double.MAX_VALUE-VALUE));
+      errn += verify("test_suba: ", 4, a0[4], (Double.MIN_VALUE-VALUE));
+      errn += verify("test_suba: ", 5, a0[5], (Double.MIN_NORMAL-VALUE));
+      errn += verify("test_suba: ", 6, a0[6], ((ADD_INIT+6)-Double.NaN));
+      errn += verify("test_suba: ", 7, a0[7], ((ADD_INIT+7)-Double.POSITIVE_INFINITY));
+      errn += verify("test_suba: ", 8, a0[8], ((ADD_INIT+8)-Double.NEGATIVE_INFINITY));
+      errn += verify("test_suba: ", 9, a0[9], ((ADD_INIT+9)-Double.MAX_VALUE));
+      errn += verify("test_suba: ", 10, a0[10], ((ADD_INIT+10)-Double.MIN_VALUE));
+      errn += verify("test_suba: ", 11, a0[11], ((ADD_INIT+11)-Double.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_suba: ", i, a0[i], ((ADD_INIT+i)-VALUE));
+      }
+
+      test_mulc(a0, a1);
+      errn += verify("test_mulc: ", 0, a0[0], (Double.NaN*VALUE));
+      errn += verify("test_mulc: ", 1, a0[1], (Double.POSITIVE_INFINITY*VALUE));
+      errn += verify("test_mulc: ", 2, a0[2], (Double.NEGATIVE_INFINITY*VALUE));
+      errn += verify("test_mulc: ", 3, a0[3], (Double.MAX_VALUE*VALUE));
+      errn += verify("test_mulc: ", 4, a0[4], (Double.MIN_VALUE*VALUE));
+      errn += verify("test_mulc: ", 5, a0[5], (Double.MIN_NORMAL*VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulc: ", i, a0[i], ((ADD_INIT+i)*VALUE));
+      }
+      test_mulv(a0, a1, VALUE);
+      errn += verify("test_mulv: ", 0, a0[0], (Double.NaN*VALUE));
+      errn += verify("test_mulv: ", 1, a0[1], (Double.POSITIVE_INFINITY*VALUE));
+      errn += verify("test_mulv: ", 2, a0[2], (Double.NEGATIVE_INFINITY*VALUE));
+      errn += verify("test_mulv: ", 3, a0[3], (Double.MAX_VALUE*VALUE));
+      errn += verify("test_mulv: ", 4, a0[4], (Double.MIN_VALUE*VALUE));
+      errn += verify("test_mulv: ", 5, a0[5], (Double.MIN_NORMAL*VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulv: ", i, a0[i], ((ADD_INIT+i)*VALUE));
+      }
+      test_mula(a0, a1, a2);
+      errn += verify("test_mula: ", 0, a0[0], (Double.NaN*VALUE));
+      errn += verify("test_mula: ", 1, a0[1], (Double.POSITIVE_INFINITY*VALUE));
+      errn += verify("test_mula: ", 2, a0[2], (Double.NEGATIVE_INFINITY*VALUE));
+      errn += verify("test_mula: ", 3, a0[3], (Double.MAX_VALUE*VALUE));
+      errn += verify("test_mula: ", 4, a0[4], (Double.MIN_VALUE*VALUE));
+      errn += verify("test_mula: ", 5, a0[5], (Double.MIN_NORMAL*VALUE));
+      errn += verify("test_mula: ", 6, a0[6], ((ADD_INIT+6)*Double.NaN));
+      errn += verify("test_mula: ", 7, a0[7], ((ADD_INIT+7)*Double.POSITIVE_INFINITY));
+      errn += verify("test_mula: ", 8, a0[8], ((ADD_INIT+8)*Double.NEGATIVE_INFINITY));
+      errn += verify("test_mula: ", 9, a0[9], ((ADD_INIT+9)*Double.MAX_VALUE));
+      errn += verify("test_mula: ", 10, a0[10], ((ADD_INIT+10)*Double.MIN_VALUE));
+      errn += verify("test_mula: ", 11, a0[11], ((ADD_INIT+11)*Double.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_mula: ", i, a0[i], ((ADD_INIT+i)*VALUE));
+      }
+
+      test_divc(a0, a1);
+      errn += verify("test_divc: ", 0, a0[0], (Double.NaN/VALUE));
+      errn += verify("test_divc: ", 1, a0[1], (Double.POSITIVE_INFINITY/VALUE));
+      errn += verify("test_divc: ", 2, a0[2], (Double.NEGATIVE_INFINITY/VALUE));
+      errn += verify("test_divc: ", 3, a0[3], (Double.MAX_VALUE/VALUE));
+      errn += verify("test_divc: ", 4, a0[4], (Double.MIN_VALUE/VALUE));
+      errn += verify("test_divc: ", 5, a0[5], (Double.MIN_NORMAL/VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divc: ", i, a0[i], ((ADD_INIT+i)/VALUE));
+      }
+      test_divv(a0, a1, VALUE);
+      errn += verify("test_divv: ", 0, a0[0], (Double.NaN/VALUE));
+      errn += verify("test_divv: ", 1, a0[1], (Double.POSITIVE_INFINITY/VALUE));
+      errn += verify("test_divv: ", 2, a0[2], (Double.NEGATIVE_INFINITY/VALUE));
+      errn += verify("test_divv: ", 3, a0[3], (Double.MAX_VALUE/VALUE));
+      errn += verify("test_divv: ", 4, a0[4], (Double.MIN_VALUE/VALUE));
+      errn += verify("test_divv: ", 5, a0[5], (Double.MIN_NORMAL/VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divv: ", i, a0[i], ((ADD_INIT+i)/VALUE));
+      }
+      test_diva(a0, a1, a2);
+      errn += verify("test_diva: ", 0, a0[0], (Double.NaN/VALUE));
+      errn += verify("test_diva: ", 1, a0[1], (Double.POSITIVE_INFINITY/VALUE));
+      errn += verify("test_diva: ", 2, a0[2], (Double.NEGATIVE_INFINITY/VALUE));
+      errn += verify("test_diva: ", 3, a0[3], (Double.MAX_VALUE/VALUE));
+      errn += verify("test_diva: ", 4, a0[4], (Double.MIN_VALUE/VALUE));
+      errn += verify("test_diva: ", 5, a0[5], (Double.MIN_NORMAL/VALUE));
+      errn += verify("test_diva: ", 6, a0[6], ((ADD_INIT+6)/Double.NaN));
+      errn += verify("test_diva: ", 7, a0[7], ((ADD_INIT+7)/Double.POSITIVE_INFINITY));
+      errn += verify("test_diva: ", 8, a0[8], ((ADD_INIT+8)/Double.NEGATIVE_INFINITY));
+      errn += verify("test_diva: ", 9, a0[9], ((ADD_INIT+9)/Double.MAX_VALUE));
+      errn += verify("test_diva: ", 10, a0[10], ((ADD_INIT+10)/Double.MIN_VALUE));
+      errn += verify("test_diva: ", 11, a0[11], ((ADD_INIT+11)/Double.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_diva: ", i, a0[i], ((ADD_INIT+i)/VALUE));
+      }
+
+      test_mulc_n(a0, a1);
+      errn += verify("test_mulc_n: ", 0, a0[0], (Double.NaN*(-VALUE)));
+      errn += verify("test_mulc_n: ", 1, a0[1], (Double.POSITIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulc_n: ", 2, a0[2], (Double.NEGATIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulc_n: ", 3, a0[3], (Double.MAX_VALUE*(-VALUE)));
+      errn += verify("test_mulc_n: ", 4, a0[4], (Double.MIN_VALUE*(-VALUE)));
+      errn += verify("test_mulc_n: ", 5, a0[5], (Double.MIN_NORMAL*(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulc_n: ", i, a0[i], ((ADD_INIT+i)*(-VALUE)));
+      }
+      test_mulv(a0, a1, -VALUE); // same kernel as the test_mulv case, negative multiplier; reported as test_mulv_n
+      errn += verify("test_mulv_n: ", 0, a0[0], (Double.NaN*(-VALUE)));
+      errn += verify("test_mulv_n: ", 1, a0[1], (Double.POSITIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulv_n: ", 2, a0[2], (Double.NEGATIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulv_n: ", 3, a0[3], (Double.MAX_VALUE*(-VALUE)));
+      errn += verify("test_mulv_n: ", 4, a0[4], (Double.MIN_VALUE*(-VALUE)));
+      errn += verify("test_mulv_n: ", 5, a0[5], (Double.MIN_NORMAL*(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulv_n: ", i, a0[i], ((ADD_INIT+i)*(-VALUE)));
+      }
+      test_mula(a0, a1, a3); // a3 supplies the negated second operands; reported as test_mula_n
+      errn += verify("test_mula_n: ", 0, a0[0], (Double.NaN*(-VALUE)));
+      errn += verify("test_mula_n: ", 1, a0[1], (Double.POSITIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mula_n: ", 2, a0[2], (Double.NEGATIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mula_n: ", 3, a0[3], (Double.MAX_VALUE*(-VALUE)));
+      errn += verify("test_mula_n: ", 4, a0[4], (Double.MIN_VALUE*(-VALUE)));
+      errn += verify("test_mula_n: ", 5, a0[5], (Double.MIN_NORMAL*(-VALUE)));
+      errn += verify("test_mula_n: ", 6, a0[6], ((ADD_INIT+6)*(-Double.NaN)));
+      errn += verify("test_mula_n: ", 7, a0[7], ((ADD_INIT+7)*(-Double.POSITIVE_INFINITY)));
+      errn += verify("test_mula_n: ", 8, a0[8], ((ADD_INIT+8)*(-Double.NEGATIVE_INFINITY)));
+      errn += verify("test_mula_n: ", 9, a0[9], ((ADD_INIT+9)*(-Double.MAX_VALUE)));
+      errn += verify("test_mula_n: ", 10, a0[10], ((ADD_INIT+10)*(-Double.MIN_VALUE)));
+      errn += verify("test_mula_n: ", 11, a0[11], ((ADD_INIT+11)*(-Double.MIN_NORMAL)));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_mula_n: ", i, a0[i], ((ADD_INIT+i)*(-VALUE)));
+      }
+
+      test_divc_n(a0, a1);
+      errn += verify("test_divc_n: ", 0, a0[0], (Double.NaN/(-VALUE)));
+      errn += verify("test_divc_n: ", 1, a0[1], (Double.POSITIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divc_n: ", 2, a0[2], (Double.NEGATIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divc_n: ", 3, a0[3], (Double.MAX_VALUE/(-VALUE)));
+      errn += verify("test_divc_n: ", 4, a0[4], (Double.MIN_VALUE/(-VALUE)));
+      errn += verify("test_divc_n: ", 5, a0[5], (Double.MIN_NORMAL/(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divc_n: ", i, a0[i], ((ADD_INIT+i)/(-VALUE)));
+      }
+      test_divv(a0, a1, -VALUE); // same kernel as the test_divv case, negative divisor; reported as test_divv_n
+      errn += verify("test_divv_n: ", 0, a0[0], (Double.NaN/(-VALUE)));
+      errn += verify("test_divv_n: ", 1, a0[1], (Double.POSITIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divv_n: ", 2, a0[2], (Double.NEGATIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divv_n: ", 3, a0[3], (Double.MAX_VALUE/(-VALUE)));
+      errn += verify("test_divv_n: ", 4, a0[4], (Double.MIN_VALUE/(-VALUE)));
+      errn += verify("test_divv_n: ", 5, a0[5], (Double.MIN_NORMAL/(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divv_n: ", i, a0[i], ((ADD_INIT+i)/(-VALUE)));
+      }
+      test_diva(a0, a1, a3); // a3 supplies the negated divisors; reported as test_diva_n
+      errn += verify("test_diva_n: ", 0, a0[0], (Double.NaN/(-VALUE)));
+      errn += verify("test_diva_n: ", 1, a0[1], (Double.POSITIVE_INFINITY/(-VALUE)));
+      errn += verify("test_diva_n: ", 2, a0[2], (Double.NEGATIVE_INFINITY/(-VALUE)));
+      errn += verify("test_diva_n: ", 3, a0[3], (Double.MAX_VALUE/(-VALUE)));
+      errn += verify("test_diva_n: ", 4, a0[4], (Double.MIN_VALUE/(-VALUE)));
+      errn += verify("test_diva_n: ", 5, a0[5], (Double.MIN_NORMAL/(-VALUE)));
+      errn += verify("test_diva_n: ", 6, a0[6], ((ADD_INIT+6)/(-Double.NaN)));
+      errn += verify("test_diva_n: ", 7, a0[7], ((ADD_INIT+7)/(-Double.POSITIVE_INFINITY)));
+      errn += verify("test_diva_n: ", 8, a0[8], ((ADD_INIT+8)/(-Double.NEGATIVE_INFINITY)));
+      errn += verify("test_diva_n: ", 9, a0[9], ((ADD_INIT+9)/(-Double.MAX_VALUE)));
+      errn += verify("test_diva_n: ", 10, a0[10], ((ADD_INIT+10)/(-Double.MIN_VALUE)));
+      errn += verify("test_diva_n: ", 11, a0[11], ((ADD_INIT+11)/(-Double.MIN_NORMAL)));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_diva_n: ", i, a0[i], ((ADD_INIT+i)/(-VALUE)));
+      }
+
+    }
+
+    if (errn > 0) // any verification failure: skip the timing runs
+      return errn;
+
+    System.out.println("Time"); // each figure printed below is the elapsed wall-clock milliseconds for ITERS calls
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sum: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_adda(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_adda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_suba(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_suba: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva_n: " + (end - start));
+
+    return errn; // 0 here: a positive count already returned before the timing runs
+  }
+
+  static double test_sum(double[] a1) { // Returns the sum of all elements of a1, accumulated in index order.
+    double sum = 0;
+    for (int i = 0; i < a1.length; i+=1) {
+      sum += a1[i];
+    }
+    return sum;
+  }
+
+  static void test_addc(double[] a0, double[] a1) { // a0[i] = a1[i] + VALUE (class constant) for every index of a0.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 is read up to index a0.length-1
+      a0[i] = (a1[i]+VALUE);
+    }
+  }
+  static void test_addv(double[] a0, double[] a1, double b) { // a0[i] = a1[i] + b, with the addend passed in as a variable.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 is read up to index a0.length-1
+      a0[i] = (a1[i]+b);
+    }
+  }
+  static void test_adda(double[] a0, double[] a1, double[] a2) { // a0[i] = a1[i] + a2[i], element-wise sum of two arrays.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 and a2 are read up to index a0.length-1
+      a0[i] = (a1[i]+a2[i]);
+    }
+  }
+
+  static void test_subc(double[] a0, double[] a1) { // a0[i] = a1[i] - VALUE (class constant) for every index of a0.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 is read up to index a0.length-1
+      a0[i] = (a1[i]-VALUE);
+    }
+  }
+  static void test_subv(double[] a0, double[] a1, double b) { // a0[i] = a1[i] - b, with the subtrahend passed in as a variable.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 is read up to index a0.length-1
+      a0[i] = (a1[i]-b);
+    }
+  }
+  static void test_suba(double[] a0, double[] a1, double[] a2) { // a0[i] = a1[i] - a2[i], element-wise difference of two arrays.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 and a2 are read up to index a0.length-1
+      a0[i] = (a1[i]-a2[i]);
+    }
+  }
+
+  static void test_mulc(double[] a0, double[] a1) { // a0[i] = a1[i] * VALUE (class constant) for every index of a0.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 is read up to index a0.length-1
+      a0[i] = (a1[i]*VALUE);
+    }
+  }
+  static void test_mulc_n(double[] a0, double[] a1) { // a0[i] = a1[i] * (-VALUE): multiplication by the negated class constant.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 is read up to index a0.length-1
+      a0[i] = (a1[i]*(-VALUE));
+    }
+  }
+  static void test_mulv(double[] a0, double[] a1, double b) { // a0[i] = a1[i] * b, with the multiplier passed in as a variable.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 is read up to index a0.length-1
+      a0[i] = (a1[i]*b);
+    }
+  }
+  static void test_mula(double[] a0, double[] a1, double[] a2) { // a0[i] = a1[i] * a2[i], element-wise product of two arrays.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 and a2 are read up to index a0.length-1
+      a0[i] = (a1[i]*a2[i]);
+    }
+  }
+
+  static void test_divc(double[] a0, double[] a1) { // a0[i] = a1[i] / VALUE (class constant) for every index of a0.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 is read up to index a0.length-1
+      a0[i] = (a1[i]/VALUE);
+    }
+  }
+  static void test_divc_n(double[] a0, double[] a1) { // a0[i] = a1[i] / (-VALUE): division by the negated class constant.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 is read up to index a0.length-1
+      a0[i] = (a1[i]/(-VALUE));
+    }
+  }
+  static void test_divv(double[] a0, double[] a1, double b) { // a0[i] = a1[i] / b, with the divisor passed in as a variable.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 is read up to index a0.length-1
+      a0[i] = (a1[i]/b);
+    }
+  }
+  static void test_diva(double[] a0, double[] a1, double[] a2) { // a0[i] = a1[i] / a2[i], element-wise quotient of two arrays.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 and a2 are read up to index a0.length-1
+      a0[i] = (a1[i]/a2[i]);
+    }
+  }
+
+  static int verify(String text, int i, double elem, double val) {
+    if (elem != val && !(Double.isNaN(elem) && Double.isNaN(val))) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/6340864/TestFloatVect.java	Mon Aug 20 09:07:21 2012 -0700
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 6340864
+ * @summary Implement vectorization optimizations in hotspot-server
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestFloatVect
+ */
+
+public class TestFloatVect {  // Runs float add/sub/mul/div loops over arrays and checks each result element
+  private static final int ARRLEN = 997;  // length of every array allocated in test()
+  private static final int ITERS  = 11000;  // calls per kernel in the warmup and timing loops
+  private static final float ADD_INIT = -7500.f;  // a1[i] is initialized to ADD_INIT + i
+  private static final float VALUE = 15.f;  // constant operand; a2 is filled with VALUE, a3 with -VALUE
+
+  public static void main(String args[]) {  // runs test(); prints FAILED and exits with status 97 if it returns > 0
+    System.out.println("Testing Float vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {  // returns the number of mismatches counted during verification
+    float[] a0 = new float[ARRLEN];
+    float[] a1 = new float[ARRLEN];
+    float[] a2 = new float[ARRLEN];
+    float[] a3 = new float[ARRLEN];
+    // Initialize: a1[i] = ADD_INIT+i, a2[i] = VALUE, a3[i] = -VALUE; gold_sum accumulates a1 in index order
+    float gold_sum = 0;
+    for (int i=0; i<ARRLEN; i++) {
+      float val = ADD_INIT+(float)i;
+      gold_sum += val;
+      a1[i] = val;
+      a2[i] = VALUE;
+      a3[i] = -VALUE;
+    }
+
+    System.out.println("Warmup");  // NOTE(review): presumably lets the JIT compile every kernel before verification - confirm
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+      test_addc(a0, a1);
+      test_addv(a0, a1, VALUE);
+      test_adda(a0, a1, a2);
+      test_subc(a0, a1);
+      test_subv(a0, a1, VALUE);
+      test_suba(a0, a1, a2);
+      test_mulc(a0, a1);
+      test_mulv(a0, a1, VALUE);
+      test_mula(a0, a1, a2);
+      test_divc(a0, a1);
+      test_divv(a0, a1, VALUE);
+      test_diva(a0, a1, a2);
+      test_mulc_n(a0, a1);
+      test_mulv(a0, a1, -VALUE);
+      test_mula(a0, a1, a3);
+      test_divc_n(a0, a1);
+      test_divv(a0, a1, -VALUE);
+      test_diva(a0, a1, a3);
+    }
+    // Test and verify results: each kernel is run once more and a0 is compared with the same expression written inline
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      float sum = test_sum(a1);
+      if (sum != gold_sum) {
+        System.err.println("test_sum:  " + sum + " != " + gold_sum);
+        errn++;
+      }
+      // Overwrite a1[0..5] with special values: NaN, +Infinity, -Infinity, MAX_VALUE, MIN_VALUE, MIN_NORMAL
+      a1[0] = Float.NaN;
+      a1[1] = Float.POSITIVE_INFINITY;
+      a1[2] = Float.NEGATIVE_INFINITY;
+      a1[3] = Float.MAX_VALUE;
+      a1[4] = Float.MIN_VALUE;
+      a1[5] = Float.MIN_NORMAL;
+
+      a2[6] = a1[0];  // a2[6..11] receive the same six special values, so they act as second operands
+      a2[7] = a1[1];
+      a2[8] = a1[2];
+      a2[9] = a1[3];
+      a2[10] = a1[4];
+      a2[11] = a1[5];
+
+      a3[6] = -a2[6];  // a3[6..11] receive the negated special values
+      a3[7] = -a2[7];
+      a3[8] = -a2[8];
+      a3[9] = -a2[9];
+      a3[10] = -a2[10];
+      a3[11] = -a2[11];
+
+      test_addc(a0, a1);
+      errn += verify("test_addc: ", 0, a0[0], (Float.NaN+VALUE));
+      errn += verify("test_addc: ", 1, a0[1], (Float.POSITIVE_INFINITY+VALUE));
+      errn += verify("test_addc: ", 2, a0[2], (Float.NEGATIVE_INFINITY+VALUE));
+      errn += verify("test_addc: ", 3, a0[3], (Float.MAX_VALUE+VALUE));
+      errn += verify("test_addc: ", 4, a0[4], (Float.MIN_VALUE+VALUE));
+      errn += verify("test_addc: ", 5, a0[5], (Float.MIN_NORMAL+VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_addc: ", i, a0[i], ((ADD_INIT+i)+VALUE));
+      }
+      test_addv(a0, a1, VALUE);
+      errn += verify("test_addv: ", 0, a0[0], (Float.NaN+VALUE));
+      errn += verify("test_addv: ", 1, a0[1], (Float.POSITIVE_INFINITY+VALUE));
+      errn += verify("test_addv: ", 2, a0[2], (Float.NEGATIVE_INFINITY+VALUE));
+      errn += verify("test_addv: ", 3, a0[3], (Float.MAX_VALUE+VALUE));
+      errn += verify("test_addv: ", 4, a0[4], (Float.MIN_VALUE+VALUE));
+      errn += verify("test_addv: ", 5, a0[5], (Float.MIN_NORMAL+VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], ((ADD_INIT+i)+VALUE));
+      }
+      test_adda(a0, a1, a2);
+      errn += verify("test_adda: ", 0, a0[0], (Float.NaN+VALUE));
+      errn += verify("test_adda: ", 1, a0[1], (Float.POSITIVE_INFINITY+VALUE));
+      errn += verify("test_adda: ", 2, a0[2], (Float.NEGATIVE_INFINITY+VALUE));
+      errn += verify("test_adda: ", 3, a0[3], (Float.MAX_VALUE+VALUE));
+      errn += verify("test_adda: ", 4, a0[4], (Float.MIN_VALUE+VALUE));
+      errn += verify("test_adda: ", 5, a0[5], (Float.MIN_NORMAL+VALUE));
+      errn += verify("test_adda: ", 6, a0[6], ((ADD_INIT+6)+Float.NaN));
+      errn += verify("test_adda: ", 7, a0[7], ((ADD_INIT+7)+Float.POSITIVE_INFINITY));
+      errn += verify("test_adda: ", 8, a0[8], ((ADD_INIT+8)+Float.NEGATIVE_INFINITY));
+      errn += verify("test_adda: ", 9, a0[9], ((ADD_INIT+9)+Float.MAX_VALUE));
+      errn += verify("test_adda: ", 10, a0[10], ((ADD_INIT+10)+Float.MIN_VALUE));
+      errn += verify("test_adda: ", 11, a0[11], ((ADD_INIT+11)+Float.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_adda: ", i, a0[i], ((ADD_INIT+i)+VALUE));
+      }
+
+      test_subc(a0, a1);
+      errn += verify("test_subc: ", 0, a0[0], (Float.NaN-VALUE));
+      errn += verify("test_subc: ", 1, a0[1], (Float.POSITIVE_INFINITY-VALUE));
+      errn += verify("test_subc: ", 2, a0[2], (Float.NEGATIVE_INFINITY-VALUE));
+      errn += verify("test_subc: ", 3, a0[3], (Float.MAX_VALUE-VALUE));
+      errn += verify("test_subc: ", 4, a0[4], (Float.MIN_VALUE-VALUE));
+      errn += verify("test_subc: ", 5, a0[5], (Float.MIN_NORMAL-VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_subc: ", i, a0[i], ((ADD_INIT+i)-VALUE));
+      }
+      test_subv(a0, a1, VALUE);
+      errn += verify("test_subv: ", 0, a0[0], (Float.NaN-VALUE));
+      errn += verify("test_subv: ", 1, a0[1], (Float.POSITIVE_INFINITY-VALUE));
+      errn += verify("test_subv: ", 2, a0[2], (Float.NEGATIVE_INFINITY-VALUE));
+      errn += verify("test_subv: ", 3, a0[3], (Float.MAX_VALUE-VALUE));
+      errn += verify("test_subv: ", 4, a0[4], (Float.MIN_VALUE-VALUE));
+      errn += verify("test_subv: ", 5, a0[5], (Float.MIN_NORMAL-VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_subv: ", i, a0[i], ((ADD_INIT+i)-VALUE));
+      }
+      test_suba(a0, a1, a2);
+      errn += verify("test_suba: ", 0, a0[0], (Float.NaN-VALUE));
+      errn += verify("test_suba: ", 1, a0[1], (Float.POSITIVE_INFINITY-VALUE));
+      errn += verify("test_suba: ", 2, a0[2], (Float.NEGATIVE_INFINITY-VALUE));
+      errn += verify("test_suba: ", 3, a0[3], (Float.MAX_VALUE-VALUE));
+      errn += verify("test_suba: ", 4, a0[4], (Float.MIN_VALUE-VALUE));
+      errn += verify("test_suba: ", 5, a0[5], (Float.MIN_NORMAL-VALUE));
+      errn += verify("test_suba: ", 6, a0[6], ((ADD_INIT+6)-Float.NaN));
+      errn += verify("test_suba: ", 7, a0[7], ((ADD_INIT+7)-Float.POSITIVE_INFINITY));
+      errn += verify("test_suba: ", 8, a0[8], ((ADD_INIT+8)-Float.NEGATIVE_INFINITY));
+      errn += verify("test_suba: ", 9, a0[9], ((ADD_INIT+9)-Float.MAX_VALUE));
+      errn += verify("test_suba: ", 10, a0[10], ((ADD_INIT+10)-Float.MIN_VALUE));
+      errn += verify("test_suba: ", 11, a0[11], ((ADD_INIT+11)-Float.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_suba: ", i, a0[i], ((ADD_INIT+i)-VALUE));
+      }
+
+      test_mulc(a0, a1);
+      errn += verify("test_mulc: ", 0, a0[0], (Float.NaN*VALUE));
+      errn += verify("test_mulc: ", 1, a0[1], (Float.POSITIVE_INFINITY*VALUE));
+      errn += verify("test_mulc: ", 2, a0[2], (Float.NEGATIVE_INFINITY*VALUE));
+      errn += verify("test_mulc: ", 3, a0[3], (Float.MAX_VALUE*VALUE));
+      errn += verify("test_mulc: ", 4, a0[4], (Float.MIN_VALUE*VALUE));
+      errn += verify("test_mulc: ", 5, a0[5], (Float.MIN_NORMAL*VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulc: ", i, a0[i], ((ADD_INIT+i)*VALUE));
+      }
+      test_mulv(a0, a1, VALUE);
+      errn += verify("test_mulv: ", 0, a0[0], (Float.NaN*VALUE));
+      errn += verify("test_mulv: ", 1, a0[1], (Float.POSITIVE_INFINITY*VALUE));
+      errn += verify("test_mulv: ", 2, a0[2], (Float.NEGATIVE_INFINITY*VALUE));
+      errn += verify("test_mulv: ", 3, a0[3], (Float.MAX_VALUE*VALUE));
+      errn += verify("test_mulv: ", 4, a0[4], (Float.MIN_VALUE*VALUE));
+      errn += verify("test_mulv: ", 5, a0[5], (Float.MIN_NORMAL*VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulv: ", i, a0[i], ((ADD_INIT+i)*VALUE));
+      }
+      test_mula(a0, a1, a2);
+      errn += verify("test_mula: ", 0, a0[0], (Float.NaN*VALUE));
+      errn += verify("test_mula: ", 1, a0[1], (Float.POSITIVE_INFINITY*VALUE));
+      errn += verify("test_mula: ", 2, a0[2], (Float.NEGATIVE_INFINITY*VALUE));
+      errn += verify("test_mula: ", 3, a0[3], (Float.MAX_VALUE*VALUE));
+      errn += verify("test_mula: ", 4, a0[4], (Float.MIN_VALUE*VALUE));
+      errn += verify("test_mula: ", 5, a0[5], (Float.MIN_NORMAL*VALUE));
+      errn += verify("test_mula: ", 6, a0[6], ((ADD_INIT+6)*Float.NaN));
+      errn += verify("test_mula: ", 7, a0[7], ((ADD_INIT+7)*Float.POSITIVE_INFINITY));
+      errn += verify("test_mula: ", 8, a0[8], ((ADD_INIT+8)*Float.NEGATIVE_INFINITY));
+      errn += verify("test_mula: ", 9, a0[9], ((ADD_INIT+9)*Float.MAX_VALUE));
+      errn += verify("test_mula: ", 10, a0[10], ((ADD_INIT+10)*Float.MIN_VALUE));
+      errn += verify("test_mula: ", 11, a0[11], ((ADD_INIT+11)*Float.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_mula: ", i, a0[i], ((ADD_INIT+i)*VALUE));
+      }
+
+      test_divc(a0, a1);
+      errn += verify("test_divc: ", 0, a0[0], (Float.NaN/VALUE));
+      errn += verify("test_divc: ", 1, a0[1], (Float.POSITIVE_INFINITY/VALUE));
+      errn += verify("test_divc: ", 2, a0[2], (Float.NEGATIVE_INFINITY/VALUE));
+      errn += verify("test_divc: ", 3, a0[3], (Float.MAX_VALUE/VALUE));
+      errn += verify("test_divc: ", 4, a0[4], (Float.MIN_VALUE/VALUE));
+      errn += verify("test_divc: ", 5, a0[5], (Float.MIN_NORMAL/VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divc: ", i, a0[i], ((ADD_INIT+i)/VALUE));
+      }
+      test_divv(a0, a1, VALUE);
+      errn += verify("test_divv: ", 0, a0[0], (Float.NaN/VALUE));
+      errn += verify("test_divv: ", 1, a0[1], (Float.POSITIVE_INFINITY/VALUE));
+      errn += verify("test_divv: ", 2, a0[2], (Float.NEGATIVE_INFINITY/VALUE));
+      errn += verify("test_divv: ", 3, a0[3], (Float.MAX_VALUE/VALUE));
+      errn += verify("test_divv: ", 4, a0[4], (Float.MIN_VALUE/VALUE));
+      errn += verify("test_divv: ", 5, a0[5], (Float.MIN_NORMAL/VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divv: ", i, a0[i], ((ADD_INIT+i)/VALUE));
+      }
+      test_diva(a0, a1, a2);
+      errn += verify("test_diva: ", 0, a0[0], (Float.NaN/VALUE));
+      errn += verify("test_diva: ", 1, a0[1], (Float.POSITIVE_INFINITY/VALUE));
+      errn += verify("test_diva: ", 2, a0[2], (Float.NEGATIVE_INFINITY/VALUE));
+      errn += verify("test_diva: ", 3, a0[3], (Float.MAX_VALUE/VALUE));
+      errn += verify("test_diva: ", 4, a0[4], (Float.MIN_VALUE/VALUE));
+      errn += verify("test_diva: ", 5, a0[5], (Float.MIN_NORMAL/VALUE));
+      errn += verify("test_diva: ", 6, a0[6], ((ADD_INIT+6)/Float.NaN));
+      errn += verify("test_diva: ", 7, a0[7], ((ADD_INIT+7)/Float.POSITIVE_INFINITY));
+      errn += verify("test_diva: ", 8, a0[8], ((ADD_INIT+8)/Float.NEGATIVE_INFINITY));
+      errn += verify("test_diva: ", 9, a0[9], ((ADD_INIT+9)/Float.MAX_VALUE));
+      errn += verify("test_diva: ", 10, a0[10], ((ADD_INIT+10)/Float.MIN_VALUE));
+      errn += verify("test_diva: ", 11, a0[11], ((ADD_INIT+11)/Float.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_diva: ", i, a0[i], ((ADD_INIT+i)/VALUE));
+      }
+
+      test_mulc_n(a0, a1);
+      errn += verify("test_mulc_n: ", 0, a0[0], (Float.NaN*(-VALUE)));
+      errn += verify("test_mulc_n: ", 1, a0[1], (Float.POSITIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulc_n: ", 2, a0[2], (Float.NEGATIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulc_n: ", 3, a0[3], (Float.MAX_VALUE*(-VALUE)));
+      errn += verify("test_mulc_n: ", 4, a0[4], (Float.MIN_VALUE*(-VALUE)));
+      errn += verify("test_mulc_n: ", 5, a0[5], (Float.MIN_NORMAL*(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulc_n: ", i, a0[i], ((ADD_INIT+i)*(-VALUE)));
+      }
+      test_mulv(a0, a1, -VALUE);
+      errn += verify("test_mulv_n: ", 0, a0[0], (Float.NaN*(-VALUE)));
+      errn += verify("test_mulv_n: ", 1, a0[1], (Float.POSITIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulv_n: ", 2, a0[2], (Float.NEGATIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulv_n: ", 3, a0[3], (Float.MAX_VALUE*(-VALUE)));
+      errn += verify("test_mulv_n: ", 4, a0[4], (Float.MIN_VALUE*(-VALUE)));
+      errn += verify("test_mulv_n: ", 5, a0[5], (Float.MIN_NORMAL*(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulv_n: ", i, a0[i], ((ADD_INIT+i)*(-VALUE)));
+      }
+      test_mula(a0, a1, a3);
+      errn += verify("test_mula_n: ", 0, a0[0], (Float.NaN*(-VALUE)));
+      errn += verify("test_mula_n: ", 1, a0[1], (Float.POSITIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mula_n: ", 2, a0[2], (Float.NEGATIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mula_n: ", 3, a0[3], (Float.MAX_VALUE*(-VALUE)));
+      errn += verify("test_mula_n: ", 4, a0[4], (Float.MIN_VALUE*(-VALUE)));
+      errn += verify("test_mula_n: ", 5, a0[5], (Float.MIN_NORMAL*(-VALUE)));
+      errn += verify("test_mula_n: ", 6, a0[6], ((ADD_INIT+6)*(-Float.NaN)));
+      errn += verify("test_mula_n: ", 7, a0[7], ((ADD_INIT+7)*(-Float.POSITIVE_INFINITY)));
+      errn += verify("test_mula_n: ", 8, a0[8], ((ADD_INIT+8)*(-Float.NEGATIVE_INFINITY)));
+      errn += verify("test_mula_n: ", 9, a0[9], ((ADD_INIT+9)*(-Float.MAX_VALUE)));
+      errn += verify("test_mula_n: ", 10, a0[10], ((ADD_INIT+10)*(-Float.MIN_VALUE)));
+      errn += verify("test_mula_n: ", 11, a0[11], ((ADD_INIT+11)*(-Float.MIN_NORMAL)));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_mula_n: ", i, a0[i], ((ADD_INIT+i)*(-VALUE)));
+      }
+
+      test_divc_n(a0, a1);
+      errn += verify("test_divc_n: ", 0, a0[0], (Float.NaN/(-VALUE)));
+      errn += verify("test_divc_n: ", 1, a0[1], (Float.POSITIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divc_n: ", 2, a0[2], (Float.NEGATIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divc_n: ", 3, a0[3], (Float.MAX_VALUE/(-VALUE)));
+      errn += verify("test_divc_n: ", 4, a0[4], (Float.MIN_VALUE/(-VALUE)));
+      errn += verify("test_divc_n: ", 5, a0[5], (Float.MIN_NORMAL/(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divc_n: ", i, a0[i], ((ADD_INIT+i)/(-VALUE)));
+      }
+      test_divv(a0, a1, -VALUE);
+      errn += verify("test_divv_n: ", 0, a0[0], (Float.NaN/(-VALUE)));
+      errn += verify("test_divv_n: ", 1, a0[1], (Float.POSITIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divv_n: ", 2, a0[2], (Float.NEGATIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divv_n: ", 3, a0[3], (Float.MAX_VALUE/(-VALUE)));
+      errn += verify("test_divv_n: ", 4, a0[4], (Float.MIN_VALUE/(-VALUE)));
+      errn += verify("test_divv_n: ", 5, a0[5], (Float.MIN_NORMAL/(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divv_n: ", i, a0[i], ((ADD_INIT+i)/(-VALUE)));
+      }
+      test_diva(a0, a1, a3);
+      errn += verify("test_diva_n: ", 0, a0[0], (Float.NaN/(-VALUE)));
+      errn += verify("test_diva_n: ", 1, a0[1], (Float.POSITIVE_INFINITY/(-VALUE)));
+      errn += verify("test_diva_n: ", 2, a0[2], (Float.NEGATIVE_INFINITY/(-VALUE)));
+      errn += verify("test_diva_n: ", 3, a0[3], (Float.MAX_VALUE/(-VALUE)));
+      errn += verify("test_diva_n: ", 4, a0[4], (Float.MIN_VALUE/(-VALUE)));
+      errn += verify("test_diva_n: ", 5, a0[5], (Float.MIN_NORMAL/(-VALUE)));
+      errn += verify("test_diva_n: ", 6, a0[6], ((ADD_INIT+6)/(-Float.NaN)));
+      errn += verify("test_diva_n: ", 7, a0[7], ((ADD_INIT+7)/(-Float.POSITIVE_INFINITY)));
+      errn += verify("test_diva_n: ", 8, a0[8], ((ADD_INIT+8)/(-Float.NEGATIVE_INFINITY)));
+      errn += verify("test_diva_n: ", 9, a0[9], ((ADD_INIT+9)/(-Float.MAX_VALUE)));
+      errn += verify("test_diva_n: ", 10, a0[10], ((ADD_INIT+10)/(-Float.MIN_VALUE)));
+      errn += verify("test_diva_n: ", 11, a0[11], ((ADD_INIT+11)/(-Float.MIN_NORMAL)));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_diva_n: ", i, a0[i], ((ADD_INIT+i)/(-VALUE)));
+      }
+
+    }
+
+    if (errn > 0)  // skip the timing runs when verification already failed
+      return errn;
+
+    System.out.println("Time");  // elapsed currentTimeMillis() for ITERS calls of each kernel is printed, never checked
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sum: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_adda(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_adda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_suba(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_suba: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva_n: " + (end - start));
+
+    return errn;
+  }
+
+  static float test_sum(float[] a1) {  // adds the elements of a1 in index order into a float accumulator
+    float sum = 0;
+    for (int i = 0; i < a1.length; i+=1) {
+      sum += a1[i];
+    }
+    return sum;
+  }
+
+  static void test_addc(float[] a0, float[] a1) {  // a0[i] = a1[i] + VALUE (constant operand)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]+VALUE);
+    }
+  }
+  static void test_addv(float[] a0, float[] a1, float b) {  // a0[i] = a1[i] + b (operand passed as an argument)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]+b);
+    }
+  }
+  static void test_adda(float[] a0, float[] a1, float[] a2) {  // a0[i] = a1[i] + a2[i] (element-wise)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]+a2[i]);
+    }
+  }
+
+  static void test_subc(float[] a0, float[] a1) {  // a0[i] = a1[i] - VALUE (constant operand)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]-VALUE);
+    }
+  }
+  static void test_subv(float[] a0, float[] a1, float b) {  // a0[i] = a1[i] - b (operand passed as an argument)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]-b);
+    }
+  }
+  static void test_suba(float[] a0, float[] a1, float[] a2) {  // a0[i] = a1[i] - a2[i] (element-wise)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]-a2[i]);
+    }
+  }
+
+  static void test_mulc(float[] a0, float[] a1) {  // a0[i] = a1[i] * VALUE (constant operand)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]*VALUE);
+    }
+  }
+  static void test_mulc_n(float[] a0, float[] a1) {  // a0[i] = a1[i] * (-VALUE) (negated constant operand)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]*(-VALUE));
+    }
+  }
+  static void test_mulv(float[] a0, float[] a1, float b) {  // a0[i] = a1[i] * b (operand passed as an argument)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]*b);
+    }
+  }
+  static void test_mula(float[] a0, float[] a1, float[] a2) {  // a0[i] = a1[i] * a2[i] (element-wise)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]*a2[i]);
+    }
+  }
+
+  static void test_divc(float[] a0, float[] a1) {  // a0[i] = a1[i] / VALUE (constant operand)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]/VALUE);
+    }
+  }
+  static void test_divc_n(float[] a0, float[] a1) {  // a0[i] = a1[i] / (-VALUE) (negated constant operand)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]/(-VALUE));
+    }
+  }
+  static void test_divv(float[] a0, float[] a1, float b) {  // a0[i] = a1[i] / b (operand passed as an argument)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]/b);
+    }
+  }
+  static void test_diva(float[] a0, float[] a1, float[] a2) {  // a0[i] = a1[i] / a2[i] (element-wise)
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]/a2[i]);
+    }
+  }
+
+  static int verify(String text, int i, float elem, float val) {  // returns 1 and reports on stderr on mismatch, else 0
+    if (elem != val && !(Float.isNaN(elem) && Float.isNaN(val))) {  // two NaNs count as a match even though NaN != NaN
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/6340864/TestIntVect.java	Mon Aug 20 09:07:21 2012 -0700
@@ -0,0 +1,1012 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 6340864
+ * @summary Implement vectorization optimizations in hotspot-server
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestIntVect
+ */
+
+public class TestIntVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int ADD_INIT = Integer.MAX_VALUE-500;
+  private static final int BIT_MASK = 0xEC80F731;
+  private static final int VALUE = 15;
+  private static final int SHIFT = 32;
+
+  public static void main(String args[]) {
+    System.out.println("Testing Integer vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    int[] a0 = new int[ARRLEN];
+    int[] a1 = new int[ARRLEN];
+    int[] a2 = new int[ARRLEN];
+    int[] a3 = new int[ARRLEN];
+    int[] a4 = new int[ARRLEN];
+    long[] p2 = new long[ARRLEN/2];
+    // Initialize
+    int gold_sum = 0;
+    for (int i=0; i<ARRLEN; i++) {
+      int val = (int)(ADD_INIT+i);
+      gold_sum += val;
+      a1[i] = val;
+      a2[i] = (int)VALUE;
+      a3[i] = (int)-VALUE;
+      a4[i] = (int)BIT_MASK;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+      test_addc(a0, a1);
+      test_addv(a0, a1, (int)VALUE);
+      test_adda(a0, a1, a2);
+      test_subc(a0, a1);
+      test_subv(a0, a1, (int)VALUE);
+      test_suba(a0, a1, a2);
+      test_mulc(a0, a1);
+      test_mulv(a0, a1, (int)VALUE);
+      test_mula(a0, a1, a2);
+      test_divc(a0, a1);
+      test_divv(a0, a1, (int)VALUE);
+      test_diva(a0, a1, a2);
+      test_mulc_n(a0, a1);
+      test_mulv(a0, a1, (int)-VALUE);
+      test_mula(a0, a1, a3);
+      test_divc_n(a0, a1);
+      test_divv(a0, a1, (int)-VALUE);
+      test_diva(a0, a1, a3);
+      test_andc(a0, a1);
+      test_andv(a0, a1, (int)BIT_MASK);
+      test_anda(a0, a1, a4);
+      test_orc(a0, a1);
+      test_orv(a0, a1, (int)BIT_MASK);
+      test_ora(a0, a1, a4);
+      test_xorc(a0, a1);
+      test_xorv(a0, a1, (int)BIT_MASK);
+      test_xora(a0, a1, a4);
+      test_sllc(a0, a1);
+      test_sllv(a0, a1, VALUE);
+      test_srlc(a0, a1);
+      test_srlv(a0, a1, VALUE);
+      test_srac(a0, a1);
+      test_srav(a0, a1, VALUE);
+      test_sllc_n(a0, a1);
+      test_sllv(a0, a1, -VALUE);
+      test_srlc_n(a0, a1);
+      test_srlv(a0, a1, -VALUE);
+      test_srac_n(a0, a1);
+      test_srav(a0, a1, -VALUE);
+      test_sllc_o(a0, a1);
+      test_sllv(a0, a1, SHIFT);
+      test_srlc_o(a0, a1);
+      test_srlv(a0, a1, SHIFT);
+      test_srac_o(a0, a1);
+      test_srav(a0, a1, SHIFT);
+      test_sllc_on(a0, a1);
+      test_sllv(a0, a1, -SHIFT);
+      test_srlc_on(a0, a1);
+      test_srlv(a0, a1, -SHIFT);
+      test_srac_on(a0, a1);
+      test_srav(a0, a1, -SHIFT);
+      test_pack2(p2, a1);
+      test_unpack2(a0, p2);
+      test_pack2_swap(p2, a1);
+      test_unpack2_swap(a0, p2);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      int sum = test_sum(a1);
+      if (sum != gold_sum) {
+        System.err.println("test_sum:  " + sum + " != " + gold_sum);
+        errn++;
+      }
+
+      test_addc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addc: ", i, a0[i], (int)((int)(ADD_INIT+i)+VALUE));
+      }
+      test_addv(a0, a1, (int)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], (int)((int)(ADD_INIT+i)+VALUE));
+      }
+      test_adda(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_adda: ", i, a0[i], (int)((int)(ADD_INIT+i)+VALUE));
+      }
+
+      test_subc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subc: ", i, a0[i], (int)((int)(ADD_INIT+i)-VALUE));
+      }
+      test_subv(a0, a1, (int)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subv: ", i, a0[i], (int)((int)(ADD_INIT+i)-VALUE));
+      }
+      test_suba(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_suba: ", i, a0[i], (int)((int)(ADD_INIT+i)-VALUE));
+      }
+
+      test_mulc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc: ", i, a0[i], (int)((int)(ADD_INIT+i)*VALUE));
+      }
+      test_mulv(a0, a1, (int)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv: ", i, a0[i], (int)((int)(ADD_INIT+i)*VALUE));
+      }
+      test_mula(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula: ", i, a0[i], (int)((int)(ADD_INIT+i)*VALUE));
+      }
+
+      test_divc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc: ", i, a0[i], (int)((int)(ADD_INIT+i)/VALUE));
+      }
+      test_divv(a0, a1, (int)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv: ", i, a0[i], (int)((int)(ADD_INIT+i)/VALUE));
+      }
+      test_diva(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva: ", i, a0[i], (int)((int)(ADD_INIT+i)/VALUE));
+      }
+
+      test_mulc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc_n: ", i, a0[i], (int)((int)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mulv(a0, a1, (int)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv_n: ", i, a0[i], (int)((int)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mula(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula_n: ", i, a0[i], (int)((int)(ADD_INIT+i)*(-VALUE)));
+      }
+
+      test_divc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc_n: ", i, a0[i], (int)((int)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_divv(a0, a1, (int)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv_n: ", i, a0[i], (int)((int)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_diva(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva_n: ", i, a0[i], (int)((int)(ADD_INIT+i)/(-VALUE)));
+      }
+
+      test_andc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andc: ", i, a0[i], (int)((int)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_andv(a0, a1, (int)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andv: ", i, a0[i], (int)((int)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_anda(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_anda: ", i, a0[i], (int)((int)(ADD_INIT+i)&BIT_MASK));
+      }
+
+      test_orc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orc: ", i, a0[i], (int)((int)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_orv(a0, a1, (int)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orv: ", i, a0[i], (int)((int)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_ora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ora: ", i, a0[i], (int)((int)(ADD_INIT+i)|BIT_MASK));
+      }
+
+      test_xorc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorc: ", i, a0[i], (int)((int)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xorv(a0, a1, (int)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorv: ", i, a0[i], (int)((int)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xora: ", i, a0[i], (int)((int)(ADD_INIT+i)^BIT_MASK));
+      }
+
+      test_sllc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc: ", i, a0[i], (int)((int)(ADD_INIT+i)<<VALUE));
+      }
+      test_sllv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv: ", i, a0[i], (int)((int)(ADD_INIT+i)<<VALUE));
+      }
+
+      test_srlc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>VALUE));
+      }
+      test_srlv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>VALUE));
+      }
+
+      test_srac(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac: ", i, a0[i], (int)((int)(ADD_INIT+i)>>VALUE));
+      }
+      test_srav(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav: ", i, a0[i], (int)((int)(ADD_INIT+i)>>VALUE));
+      }
+
+      test_sllc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_n: ", i, a0[i], (int)((int)(ADD_INIT+i)<<(-VALUE)));
+      }
+      test_sllv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_n: ", i, a0[i], (int)((int)(ADD_INIT+i)<<(-VALUE)));
+      }
+
+      test_srlc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_n: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>(-VALUE)));
+      }
+      test_srlv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_n: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>(-VALUE)));
+      }
+
+      test_srac_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_n: ", i, a0[i], (int)((int)(ADD_INIT+i)>>(-VALUE)));
+      }
+      test_srav(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_n: ", i, a0[i], (int)((int)(ADD_INIT+i)>>(-VALUE)));
+      }
+
+      test_sllc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_o: ", i, a0[i], (int)((int)(ADD_INIT+i)<<SHIFT));
+      }
+      test_sllv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_o: ", i, a0[i], (int)((int)(ADD_INIT+i)<<SHIFT));
+      }
+
+      test_srlc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_o: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>SHIFT));
+      }
+      test_srlv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_o: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>SHIFT));
+      }
+
+      test_srac_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_o: ", i, a0[i], (int)((int)(ADD_INIT+i)>>SHIFT));
+      }
+      test_srav(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_o: ", i, a0[i], (int)((int)(ADD_INIT+i)>>SHIFT));
+      }
+
+      test_sllc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_on: ", i, a0[i], (int)((int)(ADD_INIT+i)<<(-SHIFT)));
+      }
+      test_sllv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_on: ", i, a0[i], (int)((int)(ADD_INIT+i)<<(-SHIFT)));
+      }
+
+      test_srlc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_on: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+      test_srlv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_on: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+
+      test_srac_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_on: ", i, a0[i], (int)((int)(ADD_INIT+i)>>(-SHIFT)));
+      }
+      test_srav(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_on: ", i, a0[i], (int)((int)(ADD_INIT+i)>>(-SHIFT)));
+      }
+
+      test_pack2(p2, a1);
+      for (int i=0; i<ARRLEN/2; i++) {
+        errn += verify("test_pack2: ", i, p2[i], ((long)(ADD_INIT+2*i) & 0xFFFFFFFFl) | ((long)(ADD_INIT+2*i+1) << 32));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack2(a0, p2);
+      for (int i=0; i<(ARRLEN&(-2)); i++) {
+        errn += verify("test_unpack2: ", i, a0[i], (ADD_INIT+i));
+      }
+
+      test_pack2_swap(p2, a1);
+      for (int i=0; i<ARRLEN/2; i++) {
+        errn += verify("test_pack2_swap: ", i, p2[i], ((long)(ADD_INIT+2*i+1) & 0xFFFFFFFFl) | ((long)(ADD_INIT+2*i) << 32));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack2_swap(a0, p2);
+      for (int i=0; i<(ARRLEN&(-2)); i++) {
+        errn += verify("test_unpack2_swap: ", i, a0[i], (ADD_INIT+i));
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sum: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addv(a0, a1, (int)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_adda(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_adda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subv(a0, a1, (int)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_suba(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_suba: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (int)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (int)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (int)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (int)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andv(a0, a1, (int)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_anda(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_anda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orv(a0, a1, (int)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorv(a0, a1, (int)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack2(p2, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack2: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack2(a0, p2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack2: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack2_swap(p2, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack2_swap: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack2_swap(a0, p2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack2_swap: " + (end - start));
+
+    return errn;
+  }
+
+  static int test_sum(int[] a1) { // Returns the sum of every element of a1.
+    int sum = 0; // int accumulator: overflow wraps silently under Java int arithmetic
+    for (int i = 0; i < a1.length; i+=1) {
+      sum += a1[i];
+    }
+    return sum;
+  }
+
+  static void test_addc(int[] a0, int[] a1) { // Stores a1[i] + VALUE (class constant) into a0[i] for every index of a0.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 must be at least as long as a0
+      a0[i] = (int)(a1[i]+VALUE);
+    }
+  }
+  static void test_addv(int[] a0, int[] a1, int b) { // Stores a1[i] + b into a0[i]; the addend is a scalar argument.
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 must be at least as long as a0
+      a0[i] = (int)(a1[i]+b);
+    }
+  }
+  static void test_adda(int[] a0, int[] a1, int[] a2) { // Stores the element-wise sum a1[i] + a2[i] into a0[i].
+    for (int i = 0; i < a0.length; i+=1) { // bound is a0.length, so a1 and a2 must be at least as long as a0
+      a0[i] = (int)(a1[i]+a2[i]);
+    }
+  }
+
+  static void test_subc(int[] a0, int[] a1) { // Stores a1[i] - VALUE (class constant) into a0[i] for every index of a0.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]-VALUE);
+    }
+  }
+  static void test_subv(int[] a0, int[] a1, int b) { // Stores a1[i] - b into a0[i]; the subtrahend is a scalar argument.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]-b);
+    }
+  }
+  static void test_suba(int[] a0, int[] a1, int[] a2) { // Stores the element-wise difference a1[i] - a2[i] into a0[i].
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]-a2[i]);
+    }
+  }
+
+  static void test_mulc(int[] a0, int[] a1) { // Stores a1[i] * VALUE (class constant) into a0[i] for every index of a0.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]*VALUE);
+    }
+  }
+  static void test_mulc_n(int[] a0, int[] a1) { // Same as test_mulc but multiplies by the negated constant, -VALUE.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]*(-VALUE));
+    }
+  }
+  static void test_mulv(int[] a0, int[] a1, int b) { // Stores a1[i] * b into a0[i]; the multiplier is a scalar argument.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]*b);
+    }
+  }
+  static void test_mula(int[] a0, int[] a1, int[] a2) { // Stores the element-wise product a1[i] * a2[i] into a0[i].
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]*a2[i]);
+    }
+  }
+
+  static void test_divc(int[] a0, int[] a1) { // Stores a1[i] / VALUE (class constant) into a0[i] for every index of a0.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]/VALUE);
+    }
+  }
+  static void test_divc_n(int[] a0, int[] a1) { // Same as test_divc but divides by the negated constant, -VALUE.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]/(-VALUE));
+    }
+  }
+  static void test_divv(int[] a0, int[] a1, int b) { // Stores a1[i] / b into a0[i]; the divisor is a scalar argument.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]/b); // int division: b == 0 throws ArithmeticException
+    }
+  }
+  static void test_diva(int[] a0, int[] a1, int[] a2) { // Stores the element-wise quotient a1[i] / a2[i] into a0[i].
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]/a2[i]); // int division: a zero element in a2 throws ArithmeticException
+    }
+  }
+
+  static void test_andc(int[] a0, int[] a1) { // Stores a1[i] & BIT_MASK (class constant) into a0[i] for every index of a0.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]&BIT_MASK);
+    }
+  }
+  static void test_andv(int[] a0, int[] a1, int b) { // Stores a1[i] & b into a0[i]; the mask is a scalar argument.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]&b);
+    }
+  }
+  static void test_anda(int[] a0, int[] a1, int[] a2) { // Stores the element-wise bitwise AND a1[i] & a2[i] into a0[i].
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]&a2[i]);
+    }
+  }
+
+  static void test_orc(int[] a0, int[] a1) { // Stores a1[i] | BIT_MASK (class constant) into a0[i] for every index of a0.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]|BIT_MASK);
+    }
+  }
+  static void test_orv(int[] a0, int[] a1, int b) { // Stores a1[i] | b into a0[i]; the mask is a scalar argument.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]|b);
+    }
+  }
+  static void test_ora(int[] a0, int[] a1, int[] a2) { // Stores the element-wise bitwise OR a1[i] | a2[i] into a0[i].
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]|a2[i]);
+    }
+  }
+
+  static void test_xorc(int[] a0, int[] a1) { // Stores a1[i] ^ BIT_MASK (class constant) into a0[i] for every index of a0.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]^BIT_MASK);
+    }
+  }
+  static void test_xorv(int[] a0, int[] a1, int b) { // Stores a1[i] ^ b into a0[i]; the mask is a scalar argument.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]^b);
+    }
+  }
+  static void test_xora(int[] a0, int[] a1, int[] a2) { // Stores the element-wise bitwise XOR a1[i] ^ a2[i] into a0[i].
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]^a2[i]);
+    }
+  }
+
+  static void test_sllc(int[] a0, int[] a1) { // Stores a1[i] shifted left by VALUE (class constant) into a0[i].
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]<<VALUE);
+    }
+  }
+  static void test_sllc_n(int[] a0, int[] a1) { // Left shift by the negated constant, -VALUE.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]<<(-VALUE)); // Java uses only the low 5 bits of the count for an int shift, so a negative count is legal
+    }
+  }
+  static void test_sllc_o(int[] a0, int[] a1) { // Left shift by the class constant SHIFT.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]<<SHIFT); // NOTE(review): SHIFT is presumably an oversized count (>= 32) for int - confirm against its declaration
+    }
+  }
+  static void test_sllc_on(int[] a0, int[] a1) { // Left shift by the negated class constant, -SHIFT.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]<<(-SHIFT)); // only the low 5 bits of the count take effect for an int shift
+    }
+  }
+  static void test_sllv(int[] a0, int[] a1, int b) { // Stores a1[i] << b into a0[i]; the shift count is a scalar argument.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]<<b); // callers also pass negative counts; Java masks the count to its low 5 bits for int
+    }
+  }
+
+  static void test_srlc(int[] a0, int[] a1) { // Stores a1[i] >>> VALUE (unsigned, zero-filling right shift by the class constant) into a0[i].
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>>VALUE);
+    }
+  }
+  static void test_srlc_n(int[] a0, int[] a1) { // Unsigned right shift by the negated constant, -VALUE.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>>(-VALUE)); // only the low 5 bits of the count take effect for an int shift
+    }
+  }
+  static void test_srlc_o(int[] a0, int[] a1) { // Unsigned right shift by the class constant SHIFT.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>>SHIFT); // NOTE(review): SHIFT is presumably an oversized count (>= 32) for int - confirm against its declaration
+    }
+  }
+  static void test_srlc_on(int[] a0, int[] a1) { // Unsigned right shift by the negated class constant, -SHIFT.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>>(-SHIFT)); // only the low 5 bits of the count take effect for an int shift
+    }
+  }
+  static void test_srlv(int[] a0, int[] a1, int b) { // Stores a1[i] >>> b into a0[i]; the shift count is a scalar argument.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>>b); // callers also pass negative counts; Java masks the count to its low 5 bits for int
+    }
+  }
+
+  static void test_srac(int[] a0, int[] a1) { // Stores a1[i] >> VALUE (signed, sign-extending right shift by the class constant) into a0[i].
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>VALUE);
+    }
+  }
+  static void test_srac_n(int[] a0, int[] a1) { // Signed right shift by the negated constant, -VALUE.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>(-VALUE)); // only the low 5 bits of the count take effect for an int shift
+    }
+  }
+  static void test_srac_o(int[] a0, int[] a1) { // Signed right shift by the class constant SHIFT.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>SHIFT); // NOTE(review): SHIFT is presumably an oversized count (>= 32) for int - confirm against its declaration
+    }
+  }
+  static void test_srac_on(int[] a0, int[] a1) { // Signed right shift by the negated class constant, -SHIFT.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>(-SHIFT)); // only the low 5 bits of the count take effect for an int shift
+    }
+  }
+  static void test_srav(int[] a0, int[] a1, int b) { // Stores a1[i] >> b into a0[i]; the shift count is a scalar argument.
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>b); // callers also pass negative counts; Java masks the count to its low 5 bits for int
+    }
+  }
+
+  static void test_pack2(long[] p2, int[] a1) { // Packs consecutive int pairs of a1 into p2: a1[2i] -> low 32 bits, a1[2i+1] -> high 32 bits.
+    if (p2.length*2 > a1.length) return; // silently does nothing when a1 cannot supply two ints per p2 element
+    for (int i = 0; i < p2.length; i+=1) {
+      long l0 = (long)a1[i*2+0]; // int -> long widening sign-extends, hence the mask below
+      long l1 = (long)a1[i*2+1];
+      p2[i] = (l1 << 32) | (l0 & 0xFFFFFFFFl); // mask strips l0's sign extension so it cannot overwrite the high half
+    }
+  }
+  static void test_unpack2(int[] a0, long[] p2) { // Splits each long of p2 into two ints: low 32 bits -> a0[2i], high 32 bits -> a0[2i+1].
+    if (p2.length*2 > a0.length) return; // silently does nothing when a0 cannot hold two ints per p2 element
+    for (int i = 0; i < p2.length; i+=1) { // elements of a0 at index >= 2*p2.length are left untouched
+      long l = p2[i];
+      a0[i*2+0] = (int)(l & 0xFFFFFFFFl);
+      a0[i*2+1] = (int)(l >> 32);
+    }
+  }
+  static void test_pack2_swap(long[] p2, int[] a1) { // Packs int pairs of a1 into p2 with halves swapped: a1[2i] -> high 32 bits, a1[2i+1] -> low 32 bits.
+    if (p2.length*2 > a1.length) return; // silently does nothing when a1 cannot supply two ints per p2 element
+    for (int i = 0; i < p2.length; i+=1) {
+      long l0 = (long)a1[i*2+0];
+      long l1 = (long)a1[i*2+1]; // int -> long widening sign-extends, hence the mask below
+      p2[i] = (l0 << 32) | (l1 & 0xFFFFFFFFl); // mask strips l1's sign extension so it cannot overwrite the high half
+    }
+  }
+  static void test_unpack2_swap(int[] a0, long[] p2) { // Splits each long of p2 with halves swapped: high 32 bits -> a0[2i], low 32 bits -> a0[2i+1].
+    if (p2.length*2 > a0.length) return; // silently does nothing when a0 cannot hold two ints per p2 element
+    for (int i = 0; i < p2.length; i+=1) { // elements of a0 at index >= 2*p2.length are left untouched
+      long l = p2[i];
+      a0[i*2+0] = (int)(l >> 32);
+      a0[i*2+1] = (int)(l & 0xFFFFFFFFl);
+    }
+  }
+
+  static int verify(String text, int i, int elem, int val) { // Compares elem with val; returns 1 on mismatch and 0 on match so callers can add it to an error count.
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val); // prints label, index, then elem and val in decimal
+      return 1;
+    }
+    return 0;
+  }
+
+  static int verify(String text, int i, long elem, long val) { // long overload: returns 1 on mismatch and 0 on match so callers can add it to an error count.
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + Long.toHexString(elem) + " != " + Long.toHexString(val)); // both values are printed in hexadecimal
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/6340864/TestLongVect.java	Mon Aug 20 09:07:21 2012 -0700
@@ -0,0 +1,917 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 6340864
+ * @summary Implement vectorization optimizations in hotspot-server
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestLongVect
+ */
+
+public class TestLongVect {
+  private static final int ARRLEN = 997; // length of the test arrays allocated in test()
+  private static final int ITERS  = 11000; // iteration count used by the warmup loop in test()
+  private static final long ADD_INIT = Long.MAX_VALUE-500; // a1[i] is initialised to ADD_INIT+i, so elements from i = 501 on wrap past Long.MAX_VALUE
+  private static final long BIT_MASK = 0xEC80F731EC80F731L; // operand for the and/or/xor tests
+  private static final int VALUE = 31; // operand for the arithmetic tests and shift count for the shift tests
+  private static final int SHIFT = 64; // shift count equal to the bit width of long
+
+  public static void main(String args[]) { // Entry point: runs test() and reports PASSED or FAILED; args is not read.
+    System.out.println("Testing Long vectors");
+    int errn = test(); // error count returned by test()
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97); // terminate with exit status 97 when any check failed
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    long[] a0 = new long[ARRLEN];
+    long[] a1 = new long[ARRLEN];
+    long[] a2 = new long[ARRLEN];
+    long[] a3 = new long[ARRLEN];
+    long[] a4 = new long[ARRLEN];
+    // Initialize
+    long gold_sum = 0;
+    for (int i=0; i<ARRLEN; i++) {
+      long val = (long)(ADD_INIT+i);
+      gold_sum += val;
+      a1[i] = val;
+      a2[i] = (long)VALUE;
+      a3[i] = (long)-VALUE;
+      a4[i] = (long)BIT_MASK;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+      test_addc(a0, a1);
+      test_addv(a0, a1, (long)VALUE);
+      test_adda(a0, a1, a2);
+      test_subc(a0, a1);
+      test_subv(a0, a1, (long)VALUE);
+      test_suba(a0, a1, a2);
+      test_mulc(a0, a1);
+      test_mulv(a0, a1, (long)VALUE);
+      test_mula(a0, a1, a2);
+      test_divc(a0, a1);
+      test_divv(a0, a1, (long)VALUE);
+      test_diva(a0, a1, a2);
+      test_mulc_n(a0, a1);
+      test_mulv(a0, a1, (long)-VALUE);
+      test_mula(a0, a1, a3);
+      test_divc_n(a0, a1);
+      test_divv(a0, a1, (long)-VALUE);
+      test_diva(a0, a1, a3);
+      test_andc(a0, a1);
+      test_andv(a0, a1, (long)BIT_MASK);
+      test_anda(a0, a1, a4);
+      test_orc(a0, a1);
+      test_orv(a0, a1, (long)BIT_MASK);
+      test_ora(a0, a1, a4);
+      test_xorc(a0, a1);
+      test_xorv(a0, a1, (long)BIT_MASK);
+      test_xora(a0, a1, a4);
+      test_sllc(a0, a1);
+      test_sllv(a0, a1, VALUE);
+      test_srlc(a0, a1);
+      test_srlv(a0, a1, VALUE);
+      test_srac(a0, a1);
+      test_srav(a0, a1, VALUE);
+      test_sllc_n(a0, a1);
+      test_sllv(a0, a1, -VALUE);
+      test_srlc_n(a0, a1);
+      test_srlv(a0, a1, -VALUE);
+      test_srac_n(a0, a1);
+      test_srav(a0, a1, -VALUE);
+      test_sllc_o(a0, a1);
+      test_sllv(a0, a1, SHIFT);
+      test_srlc_o(a0, a1);
+      test_srlv(a0, a1, SHIFT);
+      test_srac_o(a0, a1);
+      test_srav(a0, a1, SHIFT);
+      test_sllc_on(a0, a1);
+      test_sllv(a0, a1, -SHIFT);
+      test_srlc_on(a0, a1);
+      test_srlv(a0, a1, -SHIFT);
+      test_srac_on(a0, a1);
+      test_srav(a0, a1, -SHIFT);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      long sum = test_sum(a1);
+      if (sum != gold_sum) {
+        System.err.println("test_sum:  " + sum + " != " + gold_sum);
+        errn++;
+      }
+
+      test_addc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addc: ", i, a0[i], (long)((long)(ADD_INIT+i)+VALUE));
+      }
+      test_addv(a0, a1, (long)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], (long)((long)(ADD_INIT+i)+VALUE));
+      }
+      test_adda(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_adda: ", i, a0[i], (long)((long)(ADD_INIT+i)+VALUE));
+      }
+
+      test_subc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subc: ", i, a0[i], (long)((long)(ADD_INIT+i)-VALUE));
+      }
+      test_subv(a0, a1, (long)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subv: ", i, a0[i], (long)((long)(ADD_INIT+i)-VALUE));
+      }
+      test_suba(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_suba: ", i, a0[i], (long)((long)(ADD_INIT+i)-VALUE));
+      }
+
+      test_mulc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc: ", i, a0[i], (long)((long)(ADD_INIT+i)*VALUE));
+      }
+      test_mulv(a0, a1, (long)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv: ", i, a0[i], (long)((long)(ADD_INIT+i)*VALUE));
+      }
+      test_mula(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula: ", i, a0[i], (long)((long)(ADD_INIT+i)*VALUE));
+      }
+
+      test_divc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc: ", i, a0[i], (long)((long)(ADD_INIT+i)/VALUE));
+      }
+      test_divv(a0, a1, (long)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv: ", i, a0[i], (long)((long)(ADD_INIT+i)/VALUE));
+      }
+      test_diva(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva: ", i, a0[i], (long)((long)(ADD_INIT+i)/VALUE));
+      }
+
+      test_mulc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc_n: ", i, a0[i], (long)((long)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mulv(a0, a1, (long)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv_n: ", i, a0[i], (long)((long)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mula(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula_n: ", i, a0[i], (long)((long)(ADD_INIT+i)*(-VALUE)));
+      }
+
+      test_divc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc_n: ", i, a0[i], (long)((long)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_divv(a0, a1, (long)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv_n: ", i, a0[i], (long)((long)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_diva(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva_n: ", i, a0[i], (long)((long)(ADD_INIT+i)/(-VALUE)));
+      }
+
+      test_andc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andc: ", i, a0[i], (long)((long)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_andv(a0, a1, (long)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andv: ", i, a0[i], (long)((long)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_anda(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_anda: ", i, a0[i], (long)((long)(ADD_INIT+i)&BIT_MASK));
+      }
+
+      test_orc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orc: ", i, a0[i], (long)((long)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_orv(a0, a1, (long)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orv: ", i, a0[i], (long)((long)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_ora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ora: ", i, a0[i], (long)((long)(ADD_INIT+i)|BIT_MASK));
+      }
+
+      test_xorc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorc: ", i, a0[i], (long)((long)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xorv(a0, a1, (long)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorv: ", i, a0[i], (long)((long)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xora: ", i, a0[i], (long)((long)(ADD_INIT+i)^BIT_MASK));
+      }
+
+      test_sllc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc: ", i, a0[i], (long)((long)(ADD_INIT+i)<<VALUE));
+      }
+      test_sllv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv: ", i, a0[i], (long)((long)(ADD_INIT+i)<<VALUE));
+      }
+
+      test_srlc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>VALUE));
+      }
+      test_srlv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>VALUE));
+      }
+
+      test_srac(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac: ", i, a0[i], (long)((long)(ADD_INIT+i)>>VALUE));
+      }
+      test_srav(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav: ", i, a0[i], (long)((long)(ADD_INIT+i)>>VALUE));
+      }
+
+      test_sllc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_n: ", i, a0[i], (long)((long)(ADD_INIT+i)<<(-VALUE)));
+      }
+      test_sllv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_n: ", i, a0[i], (long)((long)(ADD_INIT+i)<<(-VALUE)));
+      }
+
+      test_srlc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_n: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>(-VALUE)));
+      }
+      test_srlv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_n: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>(-VALUE)));
+      }
+
+      test_srac_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_n: ", i, a0[i], (long)((long)(ADD_INIT+i)>>(-VALUE)));
+      }
+      test_srav(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_n: ", i, a0[i], (long)((long)(ADD_INIT+i)>>(-VALUE)));
+      }
+
+      test_sllc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_o: ", i, a0[i], (long)((long)(ADD_INIT+i)<<SHIFT));
+      }
+      test_sllv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_o: ", i, a0[i], (long)((long)(ADD_INIT+i)<<SHIFT));
+      }
+
+      test_srlc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_o: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>SHIFT));
+      }
+      test_srlv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_o: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>SHIFT));
+      }
+
+      test_srac_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_o: ", i, a0[i], (long)((long)(ADD_INIT+i)>>SHIFT));
+      }
+      test_srav(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_o: ", i, a0[i], (long)((long)(ADD_INIT+i)>>SHIFT));
+      }
+
+      test_sllc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_on: ", i, a0[i], (long)((long)(ADD_INIT+i)<<(-SHIFT)));
+      }
+      test_sllv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_on: ", i, a0[i], (long)((long)(ADD_INIT+i)<<(-SHIFT)));
+      }
+
+      test_srlc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_on: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+      test_srlv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_on: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+
+      test_srac_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_on: ", i, a0[i], (long)((long)(ADD_INIT+i)>>(-SHIFT)));
+      }
+      test_srav(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_on: ", i, a0[i], (long)((long)(ADD_INIT+i)>>(-SHIFT)));
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sum: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addv(a0, a1, (long)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_adda(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_adda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subv(a0, a1, (long)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_suba(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_suba: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (long)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (long)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (long)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (long)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andv(a0, a1, (long)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_anda(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_anda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orv(a0, a1, (long)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorv(a0, a1, (long)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_on: " + (end - start));
+
+    return errn;
+  }
+
+  // Returns the sum of every element of a1 (0 for an empty array).
+  static long test_sum(long[] a1) {
+    long total = 0;
+    for (int k = 0; k < a1.length; ++k)
+      total += a1[k];
+    return total;
+  }
+
+  // a0[k] = a1[k] + VALUE (compile-time constant addend) for every index k of a0.
+  static void test_addc(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] + VALUE;
+  }
+  // a0[k] = a1[k] + b (loop-invariant scalar addend) for every index k of a0.
+  static void test_addv(long[] a0, long[] a1, long b) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] + b;
+  }
+  // a0[k] = a1[k] + a2[k] (element-wise array addend) for every index k of a0.
+  static void test_adda(long[] a0, long[] a1, long[] a2) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] + a2[k];
+  }
+
+  // a0[k] = a1[k] - VALUE (compile-time constant subtrahend) for every index k of a0.
+  static void test_subc(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] - VALUE;
+  }
+  // a0[k] = a1[k] - b (loop-invariant scalar subtrahend) for every index k of a0.
+  static void test_subv(long[] a0, long[] a1, long b) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] - b;
+  }
+  // a0[k] = a1[k] - a2[k] (element-wise array subtrahend) for every index k of a0.
+  static void test_suba(long[] a0, long[] a1, long[] a2) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] - a2[k];
+  }
+
+  // a0[k] = a1[k] * VALUE (compile-time constant factor) for every index k of a0.
+  static void test_mulc(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] * VALUE;
+  }
+  // a0[k] = a1[k] * (-VALUE) (negated constant factor) for every index k of a0.
+  static void test_mulc_n(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] * (-VALUE);
+  }
+  // a0[k] = a1[k] * b (loop-invariant scalar factor) for every index k of a0.
+  static void test_mulv(long[] a0, long[] a1, long b) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] * b;
+  }
+  // a0[k] = a1[k] * a2[k] (element-wise array factor) for every index k of a0.
+  static void test_mula(long[] a0, long[] a1, long[] a2) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] * a2[k];
+  }
+
+  // a0[k] = a1[k] / VALUE (integer division by a constant) for every index k of a0.
+  static void test_divc(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] / VALUE;
+  }
+  // a0[k] = a1[k] / (-VALUE) (integer division by the negated constant) for every index k of a0.
+  static void test_divc_n(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] / (-VALUE);
+  }
+  // a0[k] = a1[k] / b (integer division, truncating toward zero) for every index k of a0.
+  static void test_divv(long[] a0, long[] a1, long b) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] / b;
+  }
+  // a0[k] = a1[k] / a2[k] (element-wise integer division) for every index k of a0.
+  static void test_diva(long[] a0, long[] a1, long[] a2) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] / a2[k];
+  }
+
+  // a0[k] = a1[k] & BIT_MASK (bitwise AND with the constant mask) for every index k of a0.
+  static void test_andc(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] & BIT_MASK;
+  }
+  // a0[k] = a1[k] & b (bitwise AND with a loop-invariant scalar) for every index k of a0.
+  static void test_andv(long[] a0, long[] a1, long b) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] & b;
+  }
+  // a0[k] = a1[k] & a2[k] (element-wise bitwise AND) for every index k of a0.
+  static void test_anda(long[] a0, long[] a1, long[] a2) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] & a2[k];
+  }
+
+  // a0[k] = a1[k] | BIT_MASK (bitwise OR with the constant mask) for every index k of a0.
+  static void test_orc(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] | BIT_MASK;
+  }
+  // a0[k] = a1[k] | b (bitwise OR with a loop-invariant scalar) for every index k of a0.
+  static void test_orv(long[] a0, long[] a1, long b) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] | b;
+  }
+  // a0[k] = a1[k] | a2[k] (element-wise bitwise OR) for every index k of a0.
+  static void test_ora(long[] a0, long[] a1, long[] a2) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] | a2[k];
+  }
+
+  // a0[k] = a1[k] ^ BIT_MASK (bitwise XOR with the constant mask) for every index k of a0.
+  static void test_xorc(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] ^ BIT_MASK;
+  }
+  // a0[k] = a1[k] ^ b (bitwise XOR with a loop-invariant scalar) for every index k of a0.
+  static void test_xorv(long[] a0, long[] a1, long b) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] ^ b;
+  }
+  // a0[k] = a1[k] ^ a2[k] (element-wise bitwise XOR) for every index k of a0.
+  static void test_xora(long[] a0, long[] a1, long[] a2) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] ^ a2[k];
+  }
+
+  // a0[k] = a1[k] << VALUE (left shift by the constant count) for every index k of a0.
+  static void test_sllc(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] << VALUE;
+  }
+  // a0[k] = a1[k] << (-VALUE); a long shift uses only the low six bits of the count.
+  static void test_sllc_n(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] << (-VALUE);
+  }
+  // a0[k] = a1[k] << SHIFT (left shift by the constant SHIFT) for every index k of a0.
+  static void test_sllc_o(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] << SHIFT;
+  }
+  // a0[k] = a1[k] << (-SHIFT); a long shift uses only the low six bits of the count.
+  static void test_sllc_on(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] << (-SHIFT);
+  }
+  // a0[k] = a1[k] << b for every index k of a0; only the low six bits of b take effect.
+  static void test_sllv(long[] a0, long[] a1, int b) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] << b;
+  }
+
+  // a0[k] = a1[k] >>> VALUE (unsigned right shift by the constant count) for every index k of a0.
+  static void test_srlc(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] >>> VALUE;
+  }
+  // a0[k] = a1[k] >>> (-VALUE); a long shift uses only the low six bits of the count.
+  static void test_srlc_n(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] >>> (-VALUE);
+  }
+  // a0[k] = a1[k] >>> SHIFT (unsigned right shift by the constant SHIFT) for every index k of a0.
+  static void test_srlc_o(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] >>> SHIFT;
+  }
+  // a0[k] = a1[k] >>> (-SHIFT); a long shift uses only the low six bits of the count.
+  static void test_srlc_on(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] >>> (-SHIFT);
+  }
+  // a0[k] = a1[k] >>> b (zero-filling) for every index k of a0; only the low six bits of b take effect.
+  static void test_srlv(long[] a0, long[] a1, int b) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] >>> b;
+  }
+
+  // a0[k] = a1[k] >> VALUE (sign-propagating right shift by the constant count) for every index k of a0.
+  static void test_srac(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] >> VALUE;
+  }
+  // a0[k] = a1[k] >> (-VALUE); a long shift uses only the low six bits of the count.
+  static void test_srac_n(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] >> (-VALUE);
+  }
+  // a0[k] = a1[k] >> SHIFT (sign-propagating right shift by the constant SHIFT) for every index k of a0.
+  static void test_srac_o(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] >> SHIFT;
+  }
+  // a0[k] = a1[k] >> (-SHIFT); a long shift uses only the low six bits of the count.
+  static void test_srac_on(long[] a0, long[] a1) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] >> (-SHIFT);
+  }
+  // a0[k] = a1[k] >> b (sign-propagating) for every index k of a0; only the low six bits of b take effect.
+  static void test_srav(long[] a0, long[] a1, int b) {
+    for (int k = 0; k < a0.length; ++k)
+      a0[k] = a1[k] >> b;
+  }
+
+  // Checks one result element against its expected value. Returns 0 on a match;
+  // on a mismatch reports text, index i and both values on System.err and returns 1.
+  static int verify(String text, int i, long elem, long val) {
+    if (elem == val) return 0;
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/6340864/TestShortVect.java	Mon Aug 20 09:07:21 2012 -0700
@@ -0,0 +1,1127 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 6340864
+ * @summary Implement vectorization optimizations in hotspot-server
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestShortVect
+ */
+
+public class TestShortVect {
+  private static final int ARRLEN = 997; // length of each short[] work array; the int[]/long[] pack arrays use ARRLEN/2 and ARRLEN/4
+  private static final int ITERS  = 11000; // iteration count of the warmup loop in test()
+  private static final int ADD_INIT = Short.MAX_VALUE-500; // a1[i] = (short)(ADD_INIT+i), so the value wraps negative once i > 500
+  private static final int BIT_MASK = 0xB731; // and/or/xor operand; larger than Short.MAX_VALUE, so (short)BIT_MASK is negative
+  private static final int VALUE = 7; // add/sub/mul/div operand (fills a2, negated in a3) and the small shift count
+  private static final int SHIFT = 16; // shift count passed (also negated) to test_sllv/test_srlv/test_srav; equals Short.SIZE
+
+  public static void main(String args[]) {
+    System.out.println("Testing Short vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    short[] a0 = new short[ARRLEN];
+    short[] a1 = new short[ARRLEN];
+    short[] a2 = new short[ARRLEN];
+    short[] a3 = new short[ARRLEN];
+    short[] a4 = new short[ARRLEN];
+     int[] p2 = new  int[ARRLEN/2];
+    long[] p4 = new long[ARRLEN/4];
+    // Initialize
+    int gold_sum = 0;
+    for (int i=0; i<ARRLEN; i++) {
+      short val = (short)(ADD_INIT+i);
+      gold_sum += val;
+      a1[i] = val;
+      a2[i] = (short)VALUE;
+      a3[i] = (short)-VALUE;
+      a4[i] = (short)BIT_MASK;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+      test_addc(a0, a1);
+      test_addv(a0, a1, (short)VALUE);
+      test_adda(a0, a1, a2);
+      test_subc(a0, a1);
+      test_subv(a0, a1, (short)VALUE);
+      test_suba(a0, a1, a2);
+      test_mulc(a0, a1);
+      test_mulv(a0, a1, (short)VALUE);
+      test_mula(a0, a1, a2);
+      test_divc(a0, a1);
+      test_divv(a0, a1, (short)VALUE);
+      test_diva(a0, a1, a2);
+      test_mulc_n(a0, a1);
+      test_mulv(a0, a1, (short)-VALUE);
+      test_mula(a0, a1, a3);
+      test_divc_n(a0, a1);
+      test_divv(a0, a1, (short)-VALUE);
+      test_diva(a0, a1, a3);
+      test_andc(a0, a1);
+      test_andv(a0, a1, (short)BIT_MASK);
+      test_anda(a0, a1, a4);
+      test_orc(a0, a1);
+      test_orv(a0, a1, (short)BIT_MASK);
+      test_ora(a0, a1, a4);
+      test_xorc(a0, a1);
+      test_xorv(a0, a1, (short)BIT_MASK);
+      test_xora(a0, a1, a4);
+      test_sllc(a0, a1);
+      test_sllv(a0, a1, VALUE);
+      test_srlc(a0, a1);
+      test_srlv(a0, a1, VALUE);
+      test_srac(a0, a1);
+      test_srav(a0, a1, VALUE);
+      test_sllc_n(a0, a1);
+      test_sllv(a0, a1, -VALUE);
+      test_srlc_n(a0, a1);
+      test_srlv(a0, a1, -VALUE);
+      test_srac_n(a0, a1);
+      test_srav(a0, a1, -VALUE);
+      test_sllc_o(a0, a1);
+      test_sllv(a0, a1, SHIFT);
+      test_srlc_o(a0, a1);
+      test_srlv(a0, a1, SHIFT);
+      test_srac_o(a0, a1);
+      test_srav(a0, a1, SHIFT);
+      test_sllc_on(a0, a1);
+      test_sllv(a0, a1, -SHIFT);
+      test_srlc_on(a0, a1);
+      test_srlv(a0, a1, -SHIFT);
+      test_srac_on(a0, a1);
+      test_srav(a0, a1, -SHIFT);
+      test_pack2(p2, a1);
+      test_unpack2(a0, p2);
+      test_pack2_swap(p2, a1);
+      test_unpack2_swap(a0, p2);
+      test_pack4(p4, a1);
+      test_unpack4(a0, p4);
+      test_pack4_swap(p4, a1);
+      test_unpack4_swap(a0, p4);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      int sum = test_sum(a1);
+      if (sum != gold_sum) {
+        System.err.println("test_sum:  " + sum + " != " + gold_sum);
+        errn++;
+      }
+
+      test_addc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addc: ", i, a0[i], (short)((short)(ADD_INIT+i)+VALUE));
+      }
+      test_addv(a0, a1, (short)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], (short)((short)(ADD_INIT+i)+VALUE));
+      }
+      test_adda(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_adda: ", i, a0[i], (short)((short)(ADD_INIT+i)+VALUE));
+      }
+
+      test_subc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subc: ", i, a0[i], (short)((short)(ADD_INIT+i)-VALUE));
+      }
+      test_subv(a0, a1, (short)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subv: ", i, a0[i], (short)((short)(ADD_INIT+i)-VALUE));
+      }
+      test_suba(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_suba: ", i, a0[i], (short)((short)(ADD_INIT+i)-VALUE));
+      }
+
+      test_mulc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc: ", i, a0[i], (short)((short)(ADD_INIT+i)*VALUE));
+      }
+      test_mulv(a0, a1, (short)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv: ", i, a0[i], (short)((short)(ADD_INIT+i)*VALUE));
+      }
+      test_mula(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula: ", i, a0[i], (short)((short)(ADD_INIT+i)*VALUE));
+      }
+
+      test_divc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc: ", i, a0[i], (short)((short)(ADD_INIT+i)/VALUE));
+      }
+      test_divv(a0, a1, (short)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv: ", i, a0[i], (short)((short)(ADD_INIT+i)/VALUE));
+      }
+      test_diva(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva: ", i, a0[i], (short)((short)(ADD_INIT+i)/VALUE));
+      }
+
+      test_mulc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc_n: ", i, a0[i], (short)((short)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mulv(a0, a1, (short)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv_n: ", i, a0[i], (short)((short)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mula(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula_n: ", i, a0[i], (short)((short)(ADD_INIT+i)*(-VALUE)));
+      }
+
+      test_divc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc_n: ", i, a0[i], (short)((short)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_divv(a0, a1, (short)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv_n: ", i, a0[i], (short)((short)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_diva(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva_n: ", i, a0[i], (short)((short)(ADD_INIT+i)/(-VALUE)));
+      }
+
+      test_andc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andc: ", i, a0[i], (short)((short)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_andv(a0, a1, (short)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andv: ", i, a0[i], (short)((short)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_anda(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_anda: ", i, a0[i], (short)((short)(ADD_INIT+i)&BIT_MASK));
+      }
+
+      test_orc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orc: ", i, a0[i], (short)((short)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_orv(a0, a1, (short)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orv: ", i, a0[i], (short)((short)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_ora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ora: ", i, a0[i], (short)((short)(ADD_INIT+i)|BIT_MASK));
+      }
+
+      test_xorc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorc: ", i, a0[i], (short)((short)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xorv(a0, a1, (short)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorv: ", i, a0[i], (short)((short)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xora: ", i, a0[i], (short)((short)(ADD_INIT+i)^BIT_MASK));
+      }
+
+      test_sllc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc: ", i, a0[i], (short)((short)(ADD_INIT+i)<<VALUE));
+      }
+      test_sllv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv: ", i, a0[i], (short)((short)(ADD_INIT+i)<<VALUE));
+      }
+
+      test_srlc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>VALUE));
+      }
+      test_srlv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>VALUE));
+      }
+
+      test_srac(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac: ", i, a0[i], (short)((short)(ADD_INIT+i)>>VALUE));
+      }
+      test_srav(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav: ", i, a0[i], (short)((short)(ADD_INIT+i)>>VALUE));
+      }
+
+      test_sllc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_n: ", i, a0[i], (short)((short)(ADD_INIT+i)<<(-VALUE)));
+      }
+      test_sllv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_n: ", i, a0[i], (short)((short)(ADD_INIT+i)<<(-VALUE)));
+      }
+
+      test_srlc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_n: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>(-VALUE)));
+      }
+      test_srlv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_n: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>(-VALUE)));
+      }
+
+      test_srac_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_n: ", i, a0[i], (short)((short)(ADD_INIT+i)>>(-VALUE)));
+      }
+      test_srav(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_n: ", i, a0[i], (short)((short)(ADD_INIT+i)>>(-VALUE)));
+      }
+
+      test_sllc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_o: ", i, a0[i], (short)((short)(ADD_INIT+i)<<SHIFT));
+      }
+      test_sllv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_o: ", i, a0[i], (short)((short)(ADD_INIT+i)<<SHIFT));
+      }
+
+      test_srlc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_o: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>SHIFT));
+      }
+      test_srlv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_o: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>SHIFT));
+      }
+
+      test_srac_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_o: ", i, a0[i], (short)((short)(ADD_INIT+i)>>SHIFT));
+      }
+      test_srav(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_o: ", i, a0[i], (short)((short)(ADD_INIT+i)>>SHIFT));
+      }
+
+      test_sllc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_on: ", i, a0[i], (short)((short)(ADD_INIT+i)<<(-SHIFT)));
+      }
+      test_sllv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_on: ", i, a0[i], (short)((short)(ADD_INIT+i)<<(-SHIFT)));
+      }
+
+      test_srlc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_on: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+      test_srlv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_on: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+
+      test_srac_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_on: ", i, a0[i], (short)((short)(ADD_INIT+i)>>(-SHIFT)));
+      }
+      test_srav(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_on: ", i, a0[i], (short)((short)(ADD_INIT+i)>>(-SHIFT)));
+      }
+
+      test_pack2(p2, a1);
+      for (int i=0; i<ARRLEN/2; i++) {
+        errn += verify("test_pack2: ", i, p2[i], ((int)(ADD_INIT+2*i) & 0xFFFF) | ((int)(ADD_INIT+2*i+1) << 16));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack2(a0, p2);
+      for (int i=0; i<(ARRLEN&(-2)); i++) {
+        errn += verify("test_unpack2: ", i, a0[i], (short)(ADD_INIT+i));
+      }
+
+      test_pack2_swap(p2, a1);
+      for (int i=0; i<ARRLEN/2; i++) {
+        errn += verify("test_pack2_swap: ", i, p2[i], ((int)(ADD_INIT+2*i+1) & 0xFFFF) | ((int)(ADD_INIT+2*i) << 16));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack2_swap(a0, p2);
+      for (int i=0; i<(ARRLEN&(-2)); i++) {
+        errn += verify("test_unpack2_swap: ", i, a0[i], (short)(ADD_INIT+i));
+      }
+
+      test_pack4(p4, a1);
+      for (int i=0; i<ARRLEN/4; i++) {
+        errn += verify("test_pack4: ", i, p4[i],  ((long)(ADD_INIT+4*i+0) & 0xFFFFl) |
+                                                 (((long)(ADD_INIT+4*i+1) & 0xFFFFl) << 16)  |
+                                                 (((long)(ADD_INIT+4*i+2) & 0xFFFFl) << 32)  |
+                                                 (((long)(ADD_INIT+4*i+3) & 0xFFFFl) << 48));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack4(a0, p4);
+      for (int i=0; i<(ARRLEN&(-4)); i++) {
+        errn += verify("test_unpack4: ", i, a0[i], (short)(ADD_INIT+i));
+      }
+
+      test_pack4_swap(p4, a1);
+      for (int i=0; i<ARRLEN/4; i++) {
+        errn += verify("test_pack4_swap: ", i, p4[i],  ((long)(ADD_INIT+4*i+3) & 0xFFFFl) |
+                                                      (((long)(ADD_INIT+4*i+2) & 0xFFFFl) << 16)  |
+                                                      (((long)(ADD_INIT+4*i+1) & 0xFFFFl) << 32)  |
+                                                      (((long)(ADD_INIT+4*i+0) & 0xFFFFl) << 48));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack4_swap(a0, p4);
+      for (int i=0; i<(ARRLEN&(-4)); i++) {
+        errn += verify("test_unpack4_swap: ", i, a0[i], (short)(ADD_INIT+i));
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sum: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addv(a0, a1, (short)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_adda(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_adda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subv(a0, a1, (short)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_suba(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_suba: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (short)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (short)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (short)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (short)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andv(a0, a1, (short)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_anda(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_anda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orv(a0, a1, (short)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorv(a0, a1, (short)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack2(p2, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack2: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack2(a0, p2);
+    }