changeset 3668:137868b7aa6f

7196199: java/text/Bidi/Bug6665028.java failed: Bidi run count incorrect Summary: Save whole XMM/YMM registers in safepoint interrupt handler. Reviewed-by: roland, twisti
author kvn
date Mon, 17 Sep 2012 19:39:07 -0700
parents 8d3cc6612bd1
children 9d89c76b0505
files src/cpu/sparc/vm/sharedRuntime_sparc.cpp src/cpu/x86/vm/assembler_x86.cpp src/cpu/x86/vm/assembler_x86.hpp src/cpu/x86/vm/sharedRuntime_x86_32.cpp src/cpu/x86/vm/sharedRuntime_x86_64.cpp src/cpu/x86/vm/x86.ad src/share/vm/c1/c1_Compilation.cpp src/share/vm/c1/c1_Compilation.hpp src/share/vm/ci/ciEnv.cpp src/share/vm/ci/ciEnv.hpp src/share/vm/code/nmethod.cpp src/share/vm/code/nmethod.hpp src/share/vm/opto/compile.cpp src/share/vm/opto/compile.hpp src/share/vm/opto/output.cpp src/share/vm/opto/superword.cpp src/share/vm/runtime/sharedRuntime.cpp src/share/vm/runtime/sharedRuntime.hpp test/compiler/7196199/Test7196199.java
diffstat 19 files changed, 655 insertions(+), 205 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/sparc/vm/sharedRuntime_sparc.cpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/cpu/sparc/vm/sharedRuntime_sparc.cpp	Mon Sep 17 19:39:07 2012 -0700
@@ -313,6 +313,14 @@
 
 }
 
+// Is vector's size (in bytes) bigger than a size saved by default?
+// 8 bytes FP registers are saved by default on SPARC.
+bool SharedRuntime::is_wide_vector(int size) {
+  // Note, MaxVectorSize == 8 on SPARC.
+  assert(size <= 8, err_msg_res("%d bytes vectors are not supported", size));
+  return size > 8;
+}
+
 // The java_calling_convention describes stack locations as ideal slots on
 // a frame with no abi restrictions. Since we must observe abi restrictions
 // (like the placement of the register window) the slots must be biased by
@@ -3734,7 +3742,7 @@
 // the 64-bit %o's, then do a save, then fixup the caller's SP (our FP).
 // Tricky, tricky, tricky...
 
-SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause_return) {
+SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
 
   // allocate space for the code
@@ -3752,6 +3760,7 @@
 
   int start = __ offset();
 
+  bool cause_return = (poll_type == POLL_AT_RETURN);
   // If this causes a return before the processing, then do a "restore"
   if (cause_return) {
     __ restore();
--- a/src/cpu/x86/vm/assembler_x86.cpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Mon Sep 17 19:39:07 2012 -0700
@@ -3496,6 +3496,33 @@
   emit_byte(0x01);
 }
 
+void Assembler::vinsertf128h(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  emit_byte(0x18);
+  emit_operand(dst, src);
+  // 0x01 - insert into upper 128 bits
+  emit_byte(0x01);
+}
+
+void Assembler::vextractf128h(Address dst, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  assert(src != xnoreg, "sanity");
+  int src_enc = src->encoding();
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  emit_byte(0x19);
+  emit_operand(src, dst);
+  // 0x01 - extract from upper 128 bits
+  emit_byte(0x01);
+}
+
 void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
   bool vector256 = true;
@@ -3507,6 +3534,33 @@
   emit_byte(0x01);
 }
 
+void Assembler::vinserti128h(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_avx2(), "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  emit_byte(0x38);
+  emit_operand(dst, src);
+  // 0x01 - insert into upper 128 bits
+  emit_byte(0x01);
+}
+
+void Assembler::vextracti128h(Address dst, XMMRegister src) {
+  assert(VM_Version::supports_avx2(), "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  assert(src != xnoreg, "sanity");
+  int src_enc = src->encoding();
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  emit_byte(0x39);
+  emit_operand(src, dst);
+  // 0x01 - extract from upper 128 bits
+  emit_byte(0x01);
+}
+
 void Assembler::vzeroupper() {
   assert(VM_Version::supports_avx(), "");
   (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
@@ -8907,11 +8961,9 @@
   pusha();
 
   // if we are coming from c1, xmm registers may be live
-  if (UseSSE >= 1) {
-    subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
-  }
   int off = 0;
   if (UseSSE == 1)  {
+    subptr(rsp, sizeof(jdouble)*8);
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
@@ -8921,23 +8973,50 @@
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
   } else if (UseSSE >= 2)  {
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
+#ifdef COMPILER2
+    if (MaxVectorSize > 16) {
+      assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+      // Save upper half of YMM registers
+      subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+      vextractf128h(Address(rsp,  0),xmm0);
+      vextractf128h(Address(rsp, 16),xmm1);
+      vextractf128h(Address(rsp, 32),xmm2);
+      vextractf128h(Address(rsp, 48),xmm3);
+      vextractf128h(Address(rsp, 64),xmm4);
+      vextractf128h(Address(rsp, 80),xmm5);
+      vextractf128h(Address(rsp, 96),xmm6);
+      vextractf128h(Address(rsp,112),xmm7);
 #ifdef _LP64
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
+      vextractf128h(Address(rsp,128),xmm8);
+      vextractf128h(Address(rsp,144),xmm9);
+      vextractf128h(Address(rsp,160),xmm10);
+      vextractf128h(Address(rsp,176),xmm11);
+      vextractf128h(Address(rsp,192),xmm12);
+      vextractf128h(Address(rsp,208),xmm13);
+      vextractf128h(Address(rsp,224),xmm14);
+      vextractf128h(Address(rsp,240),xmm15);
+#endif
+    }
+#endif
+    // Save whole 128bit (16 bytes) XMM registers
+    subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+    movdqu(Address(rsp,off++*16),xmm0);
+    movdqu(Address(rsp,off++*16),xmm1);
+    movdqu(Address(rsp,off++*16),xmm2);
+    movdqu(Address(rsp,off++*16),xmm3);
+    movdqu(Address(rsp,off++*16),xmm4);
+    movdqu(Address(rsp,off++*16),xmm5);
+    movdqu(Address(rsp,off++*16),xmm6);
+    movdqu(Address(rsp,off++*16),xmm7);
+#ifdef _LP64
+    movdqu(Address(rsp,off++*16),xmm8);
+    movdqu(Address(rsp,off++*16),xmm9);
+    movdqu(Address(rsp,off++*16),xmm10);
+    movdqu(Address(rsp,off++*16),xmm11);
+    movdqu(Address(rsp,off++*16),xmm12);
+    movdqu(Address(rsp,off++*16),xmm13);
+    movdqu(Address(rsp,off++*16),xmm14);
+    movdqu(Address(rsp,off++*16),xmm15);
 #endif
   }
 
@@ -9015,28 +9094,52 @@
     movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
     movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
     movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
+    addptr(rsp, sizeof(jdouble)*8);
   } else if (UseSSE >= 2)  {
-    movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
+    // Restore whole 128bit (16 bytes) XMM registers
+    movdqu(xmm0, Address(rsp,off++*16));
+    movdqu(xmm1, Address(rsp,off++*16));
+    movdqu(xmm2, Address(rsp,off++*16));
+    movdqu(xmm3, Address(rsp,off++*16));
+    movdqu(xmm4, Address(rsp,off++*16));
+    movdqu(xmm5, Address(rsp,off++*16));
+    movdqu(xmm6, Address(rsp,off++*16));
+    movdqu(xmm7, Address(rsp,off++*16));
 #ifdef _LP64
-    movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
+    movdqu(xmm8, Address(rsp,off++*16));
+    movdqu(xmm9, Address(rsp,off++*16));
+    movdqu(xmm10, Address(rsp,off++*16));
+    movdqu(xmm11, Address(rsp,off++*16));
+    movdqu(xmm12, Address(rsp,off++*16));
+    movdqu(xmm13, Address(rsp,off++*16));
+    movdqu(xmm14, Address(rsp,off++*16));
+    movdqu(xmm15, Address(rsp,off++*16));
 #endif
-  }
-  if (UseSSE >= 1) {
-    addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
+    addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+#ifdef COMPILER2
+    if (MaxVectorSize > 16) {
+      // Restore upper half of YMM registers.
+      vinsertf128h(xmm0, Address(rsp,  0));
+      vinsertf128h(xmm1, Address(rsp, 16));
+      vinsertf128h(xmm2, Address(rsp, 32));
+      vinsertf128h(xmm3, Address(rsp, 48));
+      vinsertf128h(xmm4, Address(rsp, 64));
+      vinsertf128h(xmm5, Address(rsp, 80));
+      vinsertf128h(xmm6, Address(rsp, 96));
+      vinsertf128h(xmm7, Address(rsp,112));
+#ifdef _LP64
+      vinsertf128h(xmm8, Address(rsp,128));
+      vinsertf128h(xmm9, Address(rsp,144));
+      vinsertf128h(xmm10, Address(rsp,160));
+      vinsertf128h(xmm11, Address(rsp,176));
+      vinsertf128h(xmm12, Address(rsp,192));
+      vinsertf128h(xmm13, Address(rsp,208));
+      vinsertf128h(xmm14, Address(rsp,224));
+      vinsertf128h(xmm15, Address(rsp,240));
+#endif
+      addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+    }
+#endif
   }
   popa();
 }
--- a/src/cpu/x86/vm/assembler_x86.hpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Mon Sep 17 19:39:07 2012 -0700
@@ -1743,6 +1743,12 @@
   void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
   void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
 
+  // Load/store high 128bit of YMM registers which does not destroy other half.
+  void vinsertf128h(XMMRegister dst, Address src);
+  void vinserti128h(XMMRegister dst, Address src);
+  void vextractf128h(Address dst, XMMRegister src);
+  void vextracti128h(Address dst, XMMRegister src);
+
   // AVX instruction which is used to clear upper 128 bits of YMM registers and
   // to avoid transaction penalty between AVX and SSE states. There is no
   // penalty if legacy SSE instructions are encoded using VEX prefix because
--- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Mon Sep 17 19:39:07 2012 -0700
@@ -46,11 +46,11 @@
 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
 
 class RegisterSaver {
-  enum { FPU_regs_live = 8 /*for the FPU stack*/+8/*eight more for XMM registers*/ };
   // Capture info about frame layout
+#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
   enum layout {
                 fpu_state_off = 0,
-                fpu_state_end = fpu_state_off+FPUStateSizeInWords-1,
+                fpu_state_end = fpu_state_off+FPUStateSizeInWords,
                 st0_off, st0H_off,
                 st1_off, st1H_off,
                 st2_off, st2H_off,
@@ -59,16 +59,16 @@
                 st5_off, st5H_off,
                 st6_off, st6H_off,
                 st7_off, st7H_off,
-
-                xmm0_off, xmm0H_off,
-                xmm1_off, xmm1H_off,
-                xmm2_off, xmm2H_off,
-                xmm3_off, xmm3H_off,
-                xmm4_off, xmm4H_off,
-                xmm5_off, xmm5H_off,
-                xmm6_off, xmm6H_off,
-                xmm7_off, xmm7H_off,
-                flags_off,
+                xmm_off,
+                DEF_XMM_OFFS(0),
+                DEF_XMM_OFFS(1),
+                DEF_XMM_OFFS(2),
+                DEF_XMM_OFFS(3),
+                DEF_XMM_OFFS(4),
+                DEF_XMM_OFFS(5),
+                DEF_XMM_OFFS(6),
+                DEF_XMM_OFFS(7),
+                flags_off = xmm7_off + 16/BytesPerInt + 1, // 16-byte stack alignment fill word
                 rdi_off,
                 rsi_off,
                 ignore_off,  // extra copy of rbp,
@@ -83,13 +83,13 @@
                 rbp_off,
                 return_off,      // slot for return address
                 reg_save_size };
-
+  enum { FPU_regs_live = flags_off - fpu_state_end };
 
   public:
 
   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words,
-                                     int* total_frame_words, bool verify_fpu = true);
-  static void restore_live_registers(MacroAssembler* masm);
+                                     int* total_frame_words, bool verify_fpu = true, bool save_vectors = false);
+  static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 
   static int rax_offset() { return rax_off; }
   static int rbx_offset() { return rbx_off; }
@@ -113,9 +113,20 @@
 };
 
 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words,
-                                           int* total_frame_words, bool verify_fpu) {
-
-  int frame_size_in_bytes =  (reg_save_size + additional_frame_words) * wordSize;
+                                           int* total_frame_words, bool verify_fpu, bool save_vectors) {
+  int vect_words = 0;
+#ifdef COMPILER2
+  if (save_vectors) {
+    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
+    // Save upper half of YMM registers
+    vect_words = 8 * 16 / wordSize;
+    additional_frame_words += vect_words;
+  }
+#else
+  assert(!save_vectors, "vectors are generated only by C2");
+#endif
+  int frame_size_in_bytes = (reg_save_size + additional_frame_words) * wordSize;
   int frame_words = frame_size_in_bytes / wordSize;
   *total_frame_words = frame_words;
 
@@ -129,7 +140,7 @@
   __ enter();
   __ pusha();
   __ pushf();
-  __ subptr(rsp,FPU_regs_live*sizeof(jdouble)); // Push FPU registers space
+  __ subptr(rsp,FPU_regs_live*wordSize); // Push FPU registers space
   __ push_FPU_state();          // Save FPU state & init
 
   if (verify_fpu) {
@@ -183,14 +194,28 @@
     __ movflt(Address(rsp,xmm6_off*wordSize),xmm6);
     __ movflt(Address(rsp,xmm7_off*wordSize),xmm7);
   } else if( UseSSE >= 2 ) {
-    __ movdbl(Address(rsp,xmm0_off*wordSize),xmm0);
-    __ movdbl(Address(rsp,xmm1_off*wordSize),xmm1);
-    __ movdbl(Address(rsp,xmm2_off*wordSize),xmm2);
-    __ movdbl(Address(rsp,xmm3_off*wordSize),xmm3);
-    __ movdbl(Address(rsp,xmm4_off*wordSize),xmm4);
-    __ movdbl(Address(rsp,xmm5_off*wordSize),xmm5);
-    __ movdbl(Address(rsp,xmm6_off*wordSize),xmm6);
-    __ movdbl(Address(rsp,xmm7_off*wordSize),xmm7);
+    // Save whole 128bit (16 bytes) XMM registers
+    __ movdqu(Address(rsp,xmm0_off*wordSize),xmm0);
+    __ movdqu(Address(rsp,xmm1_off*wordSize),xmm1);
+    __ movdqu(Address(rsp,xmm2_off*wordSize),xmm2);
+    __ movdqu(Address(rsp,xmm3_off*wordSize),xmm3);
+    __ movdqu(Address(rsp,xmm4_off*wordSize),xmm4);
+    __ movdqu(Address(rsp,xmm5_off*wordSize),xmm5);
+    __ movdqu(Address(rsp,xmm6_off*wordSize),xmm6);
+    __ movdqu(Address(rsp,xmm7_off*wordSize),xmm7);
+  }
+
+  if (vect_words > 0) {
+    assert(vect_words*wordSize == 128, "");
+    __ subptr(rsp, 128); // Save upper half of YMM registers
+    __ vextractf128h(Address(rsp,  0),xmm0);
+    __ vextractf128h(Address(rsp, 16),xmm1);
+    __ vextractf128h(Address(rsp, 32),xmm2);
+    __ vextractf128h(Address(rsp, 48),xmm3);
+    __ vextractf128h(Address(rsp, 64),xmm4);
+    __ vextractf128h(Address(rsp, 80),xmm5);
+    __ vextractf128h(Address(rsp, 96),xmm6);
+    __ vextractf128h(Address(rsp,112),xmm7);
   }
 
   // Set an oopmap for the call site.  This oopmap will map all
@@ -253,10 +278,20 @@
 
 }
 
-void RegisterSaver::restore_live_registers(MacroAssembler* masm) {
-
+void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
   // Recover XMM & FPU state
-  if( UseSSE == 1 ) {
+  int additional_frame_bytes = 0;
+#ifdef COMPILER2
+  if (restore_vectors) {
+    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
+    additional_frame_bytes = 128;
+  }
+#else
+  assert(!restore_vectors, "vectors are generated only by C2");
+#endif
+  if (UseSSE == 1) {
+    assert(additional_frame_bytes == 0, "");
     __ movflt(xmm0,Address(rsp,xmm0_off*wordSize));
     __ movflt(xmm1,Address(rsp,xmm1_off*wordSize));
     __ movflt(xmm2,Address(rsp,xmm2_off*wordSize));
@@ -265,18 +300,33 @@
     __ movflt(xmm5,Address(rsp,xmm5_off*wordSize));
     __ movflt(xmm6,Address(rsp,xmm6_off*wordSize));
     __ movflt(xmm7,Address(rsp,xmm7_off*wordSize));
-  } else if( UseSSE >= 2 ) {
-    __ movdbl(xmm0,Address(rsp,xmm0_off*wordSize));
-    __ movdbl(xmm1,Address(rsp,xmm1_off*wordSize));
-    __ movdbl(xmm2,Address(rsp,xmm2_off*wordSize));
-    __ movdbl(xmm3,Address(rsp,xmm3_off*wordSize));
-    __ movdbl(xmm4,Address(rsp,xmm4_off*wordSize));
-    __ movdbl(xmm5,Address(rsp,xmm5_off*wordSize));
-    __ movdbl(xmm6,Address(rsp,xmm6_off*wordSize));
-    __ movdbl(xmm7,Address(rsp,xmm7_off*wordSize));
+  } else if (UseSSE >= 2) {
+#define STACK_ADDRESS(x) Address(rsp,(x)*wordSize + additional_frame_bytes)
+    __ movdqu(xmm0,STACK_ADDRESS(xmm0_off));
+    __ movdqu(xmm1,STACK_ADDRESS(xmm1_off));
+    __ movdqu(xmm2,STACK_ADDRESS(xmm2_off));
+    __ movdqu(xmm3,STACK_ADDRESS(xmm3_off));
+    __ movdqu(xmm4,STACK_ADDRESS(xmm4_off));
+    __ movdqu(xmm5,STACK_ADDRESS(xmm5_off));
+    __ movdqu(xmm6,STACK_ADDRESS(xmm6_off));
+    __ movdqu(xmm7,STACK_ADDRESS(xmm7_off));
+#undef STACK_ADDRESS
+  }
+  if (restore_vectors) {
+    // Restore upper half of YMM registers.
+    assert(additional_frame_bytes == 128, "");
+    __ vinsertf128h(xmm0, Address(rsp,  0));
+    __ vinsertf128h(xmm1, Address(rsp, 16));
+    __ vinsertf128h(xmm2, Address(rsp, 32));
+    __ vinsertf128h(xmm3, Address(rsp, 48));
+    __ vinsertf128h(xmm4, Address(rsp, 64));
+    __ vinsertf128h(xmm5, Address(rsp, 80));
+    __ vinsertf128h(xmm6, Address(rsp, 96));
+    __ vinsertf128h(xmm7, Address(rsp,112));
+    __ addptr(rsp, additional_frame_bytes);
   }
   __ pop_FPU_state();
-  __ addptr(rsp, FPU_regs_live*sizeof(jdouble)); // Pop FPU registers
+  __ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers
 
   __ popf();
   __ popa();
@@ -308,6 +358,13 @@
   __ addptr(rsp, return_off * wordSize);
 }
 
+// Is vector's size (in bytes) bigger than a size saved by default?
+// 16 bytes XMM registers are saved by default using SSE2 movdqu instructions.
+// Note, MaxVectorSize == 0 with UseSSE < 2 and vectors are not generated.
+bool SharedRuntime::is_wide_vector(int size) {
+  return size > 16;
+}
+
 // The java_calling_convention describes stack locations as ideal slots on
 // a frame with no abi restrictions. Since we must observe abi restrictions
 // (like the placement of the register window) the slots must be biased by
@@ -2732,7 +2789,6 @@
   return 0;
 }
 
-
 //------------------------------generate_deopt_blob----------------------------
 void SharedRuntime::generate_deopt_blob() {
   // allocate space for the code
@@ -3270,7 +3326,7 @@
 // setup oopmap, and calls safepoint code to stop the compiled code for
 // a safepoint.
 //
-SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause_return) {
+SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
 
   // Account for thread arg in our frame
   const int additional_words = 1;
@@ -3290,17 +3346,18 @@
   const Register java_thread = rdi; // callee-saved for VC++
   address start   = __ pc();
   address call_pc = NULL;
-
+  bool cause_return = (poll_type == POLL_AT_RETURN);
+  bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
   // If cause_return is true we are at a poll_return and there is
   // the return address on the stack to the caller on the nmethod
   // that is safepoint. We can leave this return on the stack and
   // effectively complete the return and safepoint in the caller.
   // Otherwise we push space for a return address that the safepoint
   // handler will install later to make the stack walking sensible.
-  if( !cause_return )
-    __ push(rbx);                // Make room for return address (or push it again)
-
-  map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words, false);
+  if (!cause_return)
+    __ push(rbx);  // Make room for return address (or push it again)
+
+  map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words, false, save_vectors);
 
   // The following is basically a call_VM. However, we need the precise
   // address of the call in order to generate an oopmap. Hence, we do all the
@@ -3312,7 +3369,7 @@
   __ set_last_Java_frame(java_thread, noreg, noreg, NULL);
 
   // if this was not a poll_return then we need to correct the return address now.
-  if( !cause_return ) {
+  if (!cause_return) {
     __ movptr(rax, Address(java_thread, JavaThread::saved_exception_pc_offset()));
     __ movptr(Address(rbp, wordSize), rax);
   }
@@ -3340,15 +3397,14 @@
   __ jcc(Assembler::equal, noException);
 
   // Exception pending
-
-  RegisterSaver::restore_live_registers(masm);
+  RegisterSaver::restore_live_registers(masm, save_vectors);
 
   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 
   __ bind(noException);
 
   // Normal exit, register restoring and exit
-  RegisterSaver::restore_live_registers(masm);
+  RegisterSaver::restore_live_registers(masm, save_vectors);
 
   __ ret(0);
 
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Sep 17 19:39:07 2012 -0700
@@ -116,8 +116,8 @@
   };
 
  public:
-  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words);
-  static void restore_live_registers(MacroAssembler* masm);
+  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
+  static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 
   // Offsets into the register save area
   // Used by deoptimization when it is managing result register
@@ -134,7 +134,19 @@
   static void restore_result_registers(MacroAssembler* masm);
 };
 
-OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words) {
+OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
+  int vect_words = 0;
+#ifdef COMPILER2
+  if (save_vectors) {
+    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
+    // Save upper half of YMM registers
+    vect_words = 16 * 16 / wordSize;
+    additional_frame_words += vect_words;
+  }
+#else
+  assert(!save_vectors, "vectors are generated only by C2");
+#endif
 
   // Always make the frame size 16-byte aligned
   int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
@@ -155,6 +167,27 @@
 
   __ enter();          // rsp becomes 16-byte aligned here
   __ push_CPU_state(); // Push a multiple of 16 bytes
+
+  if (vect_words > 0) {
+    assert(vect_words*wordSize == 256, "");
+    __ subptr(rsp, 256); // Save upper half of YMM registers
+    __ vextractf128h(Address(rsp,  0),xmm0);
+    __ vextractf128h(Address(rsp, 16),xmm1);
+    __ vextractf128h(Address(rsp, 32),xmm2);
+    __ vextractf128h(Address(rsp, 48),xmm3);
+    __ vextractf128h(Address(rsp, 64),xmm4);
+    __ vextractf128h(Address(rsp, 80),xmm5);
+    __ vextractf128h(Address(rsp, 96),xmm6);
+    __ vextractf128h(Address(rsp,112),xmm7);
+    __ vextractf128h(Address(rsp,128),xmm8);
+    __ vextractf128h(Address(rsp,144),xmm9);
+    __ vextractf128h(Address(rsp,160),xmm10);
+    __ vextractf128h(Address(rsp,176),xmm11);
+    __ vextractf128h(Address(rsp,192),xmm12);
+    __ vextractf128h(Address(rsp,208),xmm13);
+    __ vextractf128h(Address(rsp,224),xmm14);
+    __ vextractf128h(Address(rsp,240),xmm15);
+  }
   if (frame::arg_reg_save_area_bytes != 0) {
     // Allocate argument register save area
     __ subptr(rsp, frame::arg_reg_save_area_bytes);
@@ -167,112 +200,111 @@
 
   OopMapSet *oop_maps = new OopMapSet();
   OopMap* map = new OopMap(frame_size_in_slots, 0);
-  map->set_callee_saved(VMRegImpl::stack2reg( rax_off  + additional_frame_slots), rax->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( rcx_off  + additional_frame_slots), rcx->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( rdx_off  + additional_frame_slots), rdx->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( rbx_off  + additional_frame_slots), rbx->as_VMReg());
+
+#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_slots)
+
+  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
   // rbp location is known implicitly by the frame sender code, needs no oopmap
   // and the location where rbp was saved by is ignored
-  map->set_callee_saved(VMRegImpl::stack2reg( rsi_off  + additional_frame_slots), rsi->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( rdi_off  + additional_frame_slots), rdi->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r8_off   + additional_frame_slots), r8->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r9_off   + additional_frame_slots), r9->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r10_off  + additional_frame_slots), r10->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r11_off  + additional_frame_slots), r11->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r12_off  + additional_frame_slots), r12->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r13_off  + additional_frame_slots), r13->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r14_off  + additional_frame_slots), r14->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r15_off  + additional_frame_slots), r15->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm0_off  + additional_frame_slots), xmm0->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm1_off  + additional_frame_slots), xmm1->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm2_off  + additional_frame_slots), xmm2->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm3_off  + additional_frame_slots), xmm3->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm4_off  + additional_frame_slots), xmm4->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm5_off  + additional_frame_slots), xmm5->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm6_off  + additional_frame_slots), xmm6->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm7_off  + additional_frame_slots), xmm7->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm8_off  + additional_frame_slots), xmm8->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm9_off  + additional_frame_slots), xmm9->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm10_off + additional_frame_slots), xmm10->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm11_off + additional_frame_slots), xmm11->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm12_off + additional_frame_slots), xmm12->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm13_off + additional_frame_slots), xmm13->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm14_off + additional_frame_slots), xmm14->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm15_off + additional_frame_slots), xmm15->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm0_off ), xmm0->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm1_off ), xmm1->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm2_off ), xmm2->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm3_off ), xmm3->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm4_off ), xmm4->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm5_off ), xmm5->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm6_off ), xmm6->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm7_off ), xmm7->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm8_off ), xmm8->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm9_off ), xmm9->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm10_off), xmm10->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm11_off), xmm11->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm12_off), xmm12->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg());
 
   // %%% These should all be a waste but we'll keep things as they were for now
   if (true) {
-    map->set_callee_saved(VMRegImpl::stack2reg( raxH_off  + additional_frame_slots),
-                          rax->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( rcxH_off  + additional_frame_slots),
-                          rcx->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( rdxH_off  + additional_frame_slots),
-                          rdx->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( rbxH_off  + additional_frame_slots),
-                          rbx->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
     // rbp location is known implicitly by the frame sender code, needs no oopmap
-    map->set_callee_saved(VMRegImpl::stack2reg( rsiH_off  + additional_frame_slots),
-                          rsi->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( rdiH_off  + additional_frame_slots),
-                          rdi->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r8H_off   + additional_frame_slots),
-                          r8->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r9H_off   + additional_frame_slots),
-                          r9->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r10H_off  + additional_frame_slots),
-                          r10->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r11H_off  + additional_frame_slots),
-                          r11->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r12H_off  + additional_frame_slots),
-                          r12->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r13H_off  + additional_frame_slots),
-                          r13->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r14H_off  + additional_frame_slots),
-                          r14->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r15H_off  + additional_frame_slots),
-                          r15->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm0H_off  + additional_frame_slots),
-                          xmm0->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm1H_off  + additional_frame_slots),
-                          xmm1->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm2H_off  + additional_frame_slots),
-                          xmm2->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm3H_off  + additional_frame_slots),
-                          xmm3->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm4H_off  + additional_frame_slots),
-                          xmm4->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm5H_off  + additional_frame_slots),
-                          xmm5->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm6H_off  + additional_frame_slots),
-                          xmm6->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm7H_off  + additional_frame_slots),
-                          xmm7->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm8H_off  + additional_frame_slots),
-                          xmm8->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm9H_off  + additional_frame_slots),
-                          xmm9->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm10H_off + additional_frame_slots),
-                          xmm10->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm11H_off + additional_frame_slots),
-                          xmm11->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm12H_off + additional_frame_slots),
-                          xmm12->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm13H_off + additional_frame_slots),
-                          xmm13->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm14H_off + additional_frame_slots),
-                          xmm14->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm15H_off + additional_frame_slots),
-                          xmm15->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm0H_off ), xmm0->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm1H_off ), xmm1->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm2H_off ), xmm2->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm3H_off ), xmm3->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm4H_off ), xmm4->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm5H_off ), xmm5->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm6H_off ), xmm6->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm7H_off ), xmm7->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm8H_off ), xmm8->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm9H_off ), xmm9->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm10H_off), xmm10->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm11H_off), xmm11->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm12H_off), xmm12->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
   }
 
   return map;
 }
 
-void RegisterSaver::restore_live_registers(MacroAssembler* masm) {
+void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
   if (frame::arg_reg_save_area_bytes != 0) {
     // Pop arg register save area
     __ addptr(rsp, frame::arg_reg_save_area_bytes);
   }
+#ifdef COMPILER2
+  if (restore_vectors) {
+    // Restore upper half of YMM registers.
+    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
+    __ vinsertf128h(xmm0, Address(rsp,  0));
+    __ vinsertf128h(xmm1, Address(rsp, 16));
+    __ vinsertf128h(xmm2, Address(rsp, 32));
+    __ vinsertf128h(xmm3, Address(rsp, 48));
+    __ vinsertf128h(xmm4, Address(rsp, 64));
+    __ vinsertf128h(xmm5, Address(rsp, 80));
+    __ vinsertf128h(xmm6, Address(rsp, 96));
+    __ vinsertf128h(xmm7, Address(rsp,112));
+    __ vinsertf128h(xmm8, Address(rsp,128));
+    __ vinsertf128h(xmm9, Address(rsp,144));
+    __ vinsertf128h(xmm10, Address(rsp,160));
+    __ vinsertf128h(xmm11, Address(rsp,176));
+    __ vinsertf128h(xmm12, Address(rsp,192));
+    __ vinsertf128h(xmm13, Address(rsp,208));
+    __ vinsertf128h(xmm14, Address(rsp,224));
+    __ vinsertf128h(xmm15, Address(rsp,240));
+    __ addptr(rsp, 256);
+  }
+#else
+  assert(!restore_vectors, "vectors are generated only by C2");
+#endif
   // Recover CPU state
   __ pop_CPU_state();
   // Get the rbp described implicitly by the calling convention (no oopMap)
@@ -297,6 +329,12 @@
   __ addptr(rsp, return_offset_in_bytes());
 }
 
+// Is vector's size (in bytes) bigger than a size saved by default?
+// 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
+bool SharedRuntime::is_wide_vector(int size) {
+  return size > 16;
+}
+
 // The java_calling_convention describes stack locations as ideal slots on
 // a frame with no abi restrictions. Since we must observe abi restrictions
 // (like the placement of the register window) the slots must be biased by
@@ -3235,7 +3273,6 @@
   return 0;
 }
 
-
 //------------------------------generate_deopt_blob----------------------------
 void SharedRuntime::generate_deopt_blob() {
   // Allocate space for the code
@@ -3740,7 +3777,7 @@
 // Generate a special Compile2Runtime blob that saves all registers,
 // and setup oopmap.
 //
-SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause_return) {
+SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
   assert(StubRoutines::forward_exception_entry() != NULL,
          "must be generated before");
 
@@ -3755,6 +3792,8 @@
   address start   = __ pc();
   address call_pc = NULL;
   int frame_size_in_words;
+  bool cause_return = (poll_type == POLL_AT_RETURN);
+  bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
 
   // Make room for return address (or push it again)
   if (!cause_return) {
@@ -3762,7 +3801,7 @@
   }
 
   // Save registers, fpu state, and flags
-  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
+  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
 
   // The following is basically a call_VM.  However, we need the precise
   // address of the call in order to generate an oopmap. Hence, we do all the
@@ -3799,7 +3838,7 @@
 
   // Exception pending
 
-  RegisterSaver::restore_live_registers(masm);
+  RegisterSaver::restore_live_registers(masm, save_vectors);
 
   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 
@@ -3807,7 +3846,7 @@
   __ bind(noException);
 
   // Normal exit, restore registers and exit.
-  RegisterSaver::restore_live_registers(masm);
+  RegisterSaver::restore_live_registers(masm, save_vectors);
 
   __ ret(0);
 
--- a/src/cpu/x86/vm/x86.ad	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/cpu/x86/vm/x86.ad	Mon Sep 17 19:39:07 2012 -0700
@@ -498,6 +498,7 @@
     case Op_PopCountL:
       if (!UsePopCountInstruction)
         return false;
+    break;
     case Op_MulVI:
       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
         return false;
--- a/src/share/vm/c1/c1_Compilation.cpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/share/vm/c1/c1_Compilation.cpp	Mon Sep 17 19:39:07 2012 -0700
@@ -346,7 +346,8 @@
     implicit_exception_table(),
     compiler(),
     _env->comp_level(),
-    has_unsafe_access()
+    has_unsafe_access(),
+    SharedRuntime::is_wide_vector(max_vector_size())
   );
 }
 
--- a/src/share/vm/c1/c1_Compilation.hpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/share/vm/c1/c1_Compilation.hpp	Mon Sep 17 19:39:07 2012 -0700
@@ -127,6 +127,7 @@
   bool has_exception_handlers() const            { return _has_exception_handlers; }
   bool has_fpu_code() const                      { return _has_fpu_code; }
   bool has_unsafe_access() const                 { return _has_unsafe_access; }
+  int max_vector_size() const                    { return 0; }
   ciMethod* method() const                       { return _method; }
   int osr_bci() const                            { return _osr_bci; }
   bool is_osr_compile() const                    { return osr_bci() >= 0; }
--- a/src/share/vm/ci/ciEnv.cpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/share/vm/ci/ciEnv.cpp	Mon Sep 17 19:39:07 2012 -0700
@@ -921,7 +921,8 @@
                             ImplicitExceptionTable* inc_table,
                             AbstractCompiler* compiler,
                             int comp_level,
-                            bool has_unsafe_access) {
+                            bool has_unsafe_access,
+                            bool has_wide_vectors) {
   VM_ENTRY_MARK;
   nmethod* nm = NULL;
   {
@@ -1016,6 +1017,7 @@
       }
     } else {
       nm->set_has_unsafe_access(has_unsafe_access);
+      nm->set_has_wide_vectors(has_wide_vectors);
 
       // Record successful registration.
       // (Put nm into the task handle *before* publishing to the Java heap.)
--- a/src/share/vm/ci/ciEnv.hpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/share/vm/ci/ciEnv.hpp	Mon Sep 17 19:39:07 2012 -0700
@@ -362,7 +362,8 @@
                        ImplicitExceptionTable*   inc_table,
                        AbstractCompiler*         compiler,
                        int                       comp_level,
-                       bool                      has_unsafe_access);
+                       bool                      has_unsafe_access,
+                       bool                      has_wide_vectors);
 
 
   // Access to certain well known ciObjects.
--- a/src/share/vm/code/nmethod.cpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/share/vm/code/nmethod.cpp	Mon Sep 17 19:39:07 2012 -0700
@@ -463,6 +463,7 @@
   _has_unsafe_access          = 0;
   _has_method_handle_invokes  = 0;
   _lazy_critical_native       = 0;
+  _has_wide_vectors           = 0;
   _marked_for_deoptimization  = 0;
   _lock_count                 = 0;
   _stack_traversal_mark       = 0;
--- a/src/share/vm/code/nmethod.hpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/share/vm/code/nmethod.hpp	Mon Sep 17 19:39:07 2012 -0700
@@ -177,6 +177,7 @@
   unsigned int _has_unsafe_access:1;         // May fault due to unsafe access.
   unsigned int _has_method_handle_invokes:1; // Has this method MethodHandle invokes?
   unsigned int _lazy_critical_native:1;      // Lazy JNI critical native
+  unsigned int _has_wide_vectors:1;          // Preserve wide vectors at safepoints
 
   // Protected by Patching_lock
   unsigned char _state;                      // {alive, not_entrant, zombie, unloaded}
@@ -442,6 +443,9 @@
   bool  is_lazy_critical_native() const           { return _lazy_critical_native; }
   void  set_lazy_critical_native(bool z)          { _lazy_critical_native = z; }
 
+  bool  has_wide_vectors() const                  { return _has_wide_vectors; }
+  void  set_has_wide_vectors(bool z)              { _has_wide_vectors = z; }
+
   int   comp_level() const                        { return _comp_level; }
 
   // Support for oops in scopes and relocs:
--- a/src/share/vm/opto/compile.cpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/share/vm/opto/compile.cpp	Mon Sep 17 19:39:07 2012 -0700
@@ -825,7 +825,8 @@
                            &_handler_table, &_inc_table,
                            compiler,
                            env()->comp_level(),
-                           has_unsafe_access()
+                           has_unsafe_access(),
+                           SharedRuntime::is_wide_vector(max_vector_size())
                            );
   }
 }
@@ -963,6 +964,7 @@
   _trap_can_recompile = false;  // no traps emitted yet
   _major_progress = true; // start out assuming good things will happen
   set_has_unsafe_access(false);
+  set_max_vector_size(0);
   Copy::zero_to_bytes(_trap_hist, sizeof(_trap_hist));
   set_decompile_count(0);
 
--- a/src/share/vm/opto/compile.hpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/share/vm/opto/compile.hpp	Mon Sep 17 19:39:07 2012 -0700
@@ -279,6 +279,7 @@
   bool                  _has_split_ifs;         // True if the method _may_ have some split-if
   bool                  _has_unsafe_access;     // True if the method _may_ produce faults in unsafe loads or stores.
   bool                  _has_stringbuilder;     // True StringBuffers or StringBuilders are allocated
+  int                   _max_vector_size;       // Maximum size of generated vectors
   uint                  _trap_hist[trapHistLength];  // Cumulative traps
   bool                  _trap_can_recompile;    // Have we emitted a recompiling trap?
   uint                  _decompile_count;       // Cumulative decompilation counts.
@@ -443,6 +444,8 @@
   void          set_has_unsafe_access(bool z)   { _has_unsafe_access = z; }
   bool              has_stringbuilder() const   { return _has_stringbuilder; }
   void          set_has_stringbuilder(bool z)   { _has_stringbuilder = z; }
+  int               max_vector_size() const     { return _max_vector_size; }
+  void          set_max_vector_size(int s)      { _max_vector_size = s; }
   void          set_trap_count(uint r, uint c)  { assert(r < trapHistLength, "oob");        _trap_hist[r] = c; }
   uint              trap_count(uint r) const    { assert(r < trapHistLength, "oob"); return _trap_hist[r]; }
   bool              trap_can_recompile() const  { return _trap_can_recompile; }
--- a/src/share/vm/opto/output.cpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/share/vm/opto/output.cpp	Mon Sep 17 19:39:07 2012 -0700
@@ -1869,7 +1869,9 @@
   if (!do_scheduling())
     return;
 
-  assert(MaxVectorSize <= 8, "scheduling code works only with pairs");
+  // Scheduling code works only with pairs (8 bytes) maximum.
+  if (max_vector_size() > 8)
+    return;
 
   NOT_PRODUCT( TracePhase t2("isched", &_t_instrSched, TimeCompiler); )
 
--- a/src/share/vm/opto/superword.cpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/share/vm/opto/superword.cpp	Mon Sep 17 19:39:07 2012 -0700
@@ -1350,11 +1350,14 @@
     insert_extracts(_packset.at(i));
   }
 
+  Compile* C = _phase->C;
+  uint max_vlen_in_bytes = 0;
   for (int i = 0; i < _block.length(); i++) {
     Node* n = _block.at(i);
     Node_List* p = my_pack(n);
     if (p && n == executed_last(p)) {
       uint vlen = p->size();
+      uint vlen_in_bytes = 0;
       Node* vn = NULL;
       Node* low_adr = p->at(0);
       Node* first   = executed_first(p);
@@ -1364,7 +1367,8 @@
         Node* mem = first->in(MemNode::Memory);
         Node* adr = low_adr->in(MemNode::Address);
         const TypePtr* atyp = n->adr_type();
-        vn = LoadVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
+        vn = LoadVectorNode::make(C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
+        vlen_in_bytes = vn->as_LoadVector()->memory_size();
       } else if (n->is_Store()) {
         // Promote value to be stored to vector
         Node* val = vector_opd(p, MemNode::ValueIn);
@@ -1372,7 +1376,8 @@
         Node* mem = first->in(MemNode::Memory);
         Node* adr = low_adr->in(MemNode::Address);
         const TypePtr* atyp = n->adr_type();
-        vn = StoreVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
+        vn = StoreVectorNode::make(C, opc, ctl, mem, adr, atyp, val, vlen);
+        vlen_in_bytes = vn->as_StoreVector()->memory_size();
       } else if (n->req() == 3) {
         // Promote operands to vector
         Node* in1 = vector_opd(p, 1);
@@ -1383,7 +1388,8 @@
           in1 = in2;
           in2 = tmp;
         }
-        vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n));
+        vn = VectorNode::make(C, opc, in1, in2, vlen, velt_basic_type(n));
+        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
       } else {
         ShouldNotReachHere();
       }
@@ -1395,6 +1401,10 @@
         _igvn.replace_node(pm, vn);
       }
       _igvn._worklist.push(vn);
+
+      if (vlen_in_bytes > max_vlen_in_bytes) {
+        max_vlen_in_bytes = vlen_in_bytes;
+      }
 #ifdef ASSERT
       if (TraceNewVectors) {
         tty->print("new Vector node: ");
@@ -1403,6 +1413,7 @@
 #endif
     }
   }
+  C->set_max_vector_size(max_vlen_in_bytes);
 }
 
 //------------------------------vector_opd---------------------------
@@ -1439,7 +1450,7 @@
         }
         assert(opd->bottom_type()->isa_int(), "int type only");
         // Move non constant shift count into XMM register.
-        cnt = new (_phase->C, 2) MoveI2FNode(cnt);
+        cnt = new (C, 2) MoveI2FNode(cnt);
       }
       if (cnt != opd) {
         _phase->_igvn.register_new_node_with_optimizer(cnt);
@@ -1480,10 +1491,10 @@
   _phase->_igvn.register_new_node_with_optimizer(pk);
   _phase->set_ctrl(pk, _phase->get_ctrl(opd));
 #ifdef ASSERT
-    if (TraceNewVectors) {
-      tty->print("new Vector node: ");
-      pk->dump();
-    }
+  if (TraceNewVectors) {
+    tty->print("new Vector node: ");
+    pk->dump();
+  }
 #endif
   return pk;
 }
--- a/src/share/vm/runtime/sharedRuntime.cpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/share/vm/runtime/sharedRuntime.cpp	Mon Sep 17 19:39:07 2012 -0700
@@ -88,6 +88,7 @@
 RuntimeStub*        SharedRuntime::_resolve_static_call_blob;
 
 DeoptimizationBlob* SharedRuntime::_deopt_blob;
+SafepointBlob*      SharedRuntime::_polling_page_vectors_safepoint_handler_blob;
 SafepointBlob*      SharedRuntime::_polling_page_safepoint_handler_blob;
 SafepointBlob*      SharedRuntime::_polling_page_return_handler_blob;
 
@@ -104,8 +105,14 @@
   _resolve_virtual_call_blob           = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_virtual_call_C),      "resolve_virtual_call");
   _resolve_static_call_blob            = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_static_call_C),       "resolve_static_call");
 
-  _polling_page_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), false);
-  _polling_page_return_handler_blob    = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), true);
+#ifdef COMPILER2
+  // Vectors are generated only by C2.
+  if (is_wide_vector(MaxVectorSize)) {
+    _polling_page_vectors_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_VECTOR_LOOP);
+  }
+#endif // COMPILER2
+  _polling_page_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_LOOP);
+  _polling_page_return_handler_blob    = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_RETURN);
 
   generate_deopt_blob();
 
@@ -535,10 +542,15 @@
     "Only polling locations are used for safepoint");
 
   bool at_poll_return = ((nmethod*)cb)->is_at_poll_return(pc);
+  bool has_wide_vectors = ((nmethod*)cb)->has_wide_vectors();
   if (at_poll_return) {
     assert(SharedRuntime::polling_page_return_handler_blob() != NULL,
            "polling page return stub not created yet");
     stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
+  } else if (has_wide_vectors) {
+    assert(SharedRuntime::polling_page_vectors_safepoint_handler_blob() != NULL,
+           "polling page vectors safepoint stub not created yet");
+    stub = SharedRuntime::polling_page_vectors_safepoint_handler_blob()->entry_point();
   } else {
     assert(SharedRuntime::polling_page_safepoint_handler_blob() != NULL,
            "polling page safepoint stub not created yet");
--- a/src/share/vm/runtime/sharedRuntime.hpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/share/vm/runtime/sharedRuntime.hpp	Mon Sep 17 19:39:07 2012 -0700
@@ -62,6 +62,7 @@
 
   static DeoptimizationBlob* _deopt_blob;
 
+  static SafepointBlob*      _polling_page_vectors_safepoint_handler_blob;
   static SafepointBlob*      _polling_page_safepoint_handler_blob;
   static SafepointBlob*      _polling_page_return_handler_blob;
 
@@ -75,7 +76,8 @@
 #endif // !PRODUCT
 
  private:
-  static SafepointBlob* generate_handler_blob(address call_ptr, bool cause_return);
+  enum { POLL_AT_RETURN,  POLL_AT_LOOP, POLL_AT_VECTOR_LOOP };
+  static SafepointBlob* generate_handler_blob(address call_ptr, int poll_type);
   static RuntimeStub*   generate_resolve_blob(address destination, const char* name);
 
  public:
@@ -223,6 +225,7 @@
 
   static SafepointBlob* polling_page_return_handler_blob()     { return _polling_page_return_handler_blob; }
   static SafepointBlob* polling_page_safepoint_handler_blob()  { return _polling_page_safepoint_handler_blob; }
+  static SafepointBlob* polling_page_vectors_safepoint_handler_blob()  { return _polling_page_vectors_safepoint_handler_blob; }
 
   // Counters
 #ifndef PRODUCT
@@ -416,6 +419,10 @@
   // when an interrupt occurs.
   static uint out_preserve_stack_slots();
 
+  // Is vector's size (in bytes) bigger than a size saved by default?
+  // For example, on x86 16 bytes XMM registers are saved by default.
+  static bool is_wide_vector(int size);
+
   // Save and restore a native result
   static void    save_native_result(MacroAssembler *_masm, BasicType ret_type, int frame_slots );
   static void restore_native_result(MacroAssembler *_masm, BasicType ret_type, int frame_slots );
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7196199/Test7196199.java	Mon Sep 17 19:39:07 2012 -0700
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7196199
+ * @summary java/text/Bidi/Bug6665028.java failed: Bidi run count incorrect
+ *
+ * @run main/othervm/timeout=400 -Xmx32m -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:CompileCommand=exclude,Test7196199.test -XX:+SafepointALot -XX:GuaranteedSafepointInterval=100 Test7196199
+ */
+
+
+public class Test7196199 {
+  private static final int ARRLEN = 97;
+  private static final int ITERS  = 5000;
+  private static final int INI_ITERS  = 1000;
+  private static final int SFP_ITERS  = 10000;
+  private static final float SFP_ITERS_F  = 10000.f;
+  private static final float VALUE = 15.f;
+  public static void main(String args[]) {
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    float[] a0 = new float[ARRLEN];
+    float[] a1 = new float[ARRLEN];
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a0[i] = 0.f;
+      a1[i] = (float)i;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<INI_ITERS; i++) {
+      test_incrc(a0);
+      test_incrv(a0, VALUE);
+      test_addc(a0, a1);
+      test_addv(a0, a1, VALUE);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    for (int i=0; i<ARRLEN; i++)
+      a0[i] = 0.f;
+
+    System.out.println("  test_incrc");
+    for (int j=0; j<ITERS; j++) {
+      test_incrc(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_incrc: ", i, a0[i], VALUE*SFP_ITERS_F);
+        a0[i] = 0.f; // Reset
+      }
+    }
+
+    System.out.println("  test_incrv");
+    for (int j=0; j<ITERS; j++) {
+      test_incrv(a0, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_incrv: ", i, a0[i], VALUE*SFP_ITERS_F);
+        a0[i] = 0.f; // Reset
+      }
+    }
+
+    System.out.println("  test_addc");
+    for (int j=0; j<ITERS; j++) {
+      test_addc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addc: ", i, a0[i], ((float)i + VALUE)*SFP_ITERS_F);
+        a0[i] = 0.f; // Reset
+      }
+    }
+
+    System.out.println("  test_addv");
+    for (int j=0; j<ITERS; j++) {
+      test_addv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], ((float)i + VALUE)*SFP_ITERS_F);
+        a0[i] = 0.f; // Reset
+      }
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<INI_ITERS; i++) {
+      test_incrc(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_incrc: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<INI_ITERS; i++) {
+      test_incrv(a0, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_incrv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<INI_ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<INI_ITERS; i++) {
+      test_addv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+
+    return errn;
+  }
+
+  static void test_incrc(float[] a0) {
+    // Non-counted loop with safepoint.
+    for (long l = 0; l < SFP_ITERS; l++) {
+      // Counted and vectorized loop.
+      for (int i = 0; i < a0.length; i+=1) {
+        a0[i] += VALUE;
+      }
+    }
+  }
+  static void test_incrv(float[] a0, float b) {
+    // Non-counted loop with safepoint.
+    for (long l = 0; l < SFP_ITERS; l++) {
+      // Counted and vectorized loop.
+      for (int i = 0; i < a0.length; i+=1) {
+        a0[i] += b;
+      }
+    }
+  }
+  static void test_addc(float[] a0, float[] a1) {
+    // Non-counted loop with safepoint.
+    for (long l = 0; l < SFP_ITERS; l++) {
+      // Counted and vectorized loop.
+      for (int i = 0; i < a0.length; i+=1) {
+        a0[i] += a1[i]+VALUE;
+      }
+    }
+  }
+  static void test_addv(float[] a0, float[] a1, float b) {
+    // Non-counted loop with safepoint.
+    for (long l = 0; l < SFP_ITERS; l++) {
+      // Counted and vectorized loop.
+      for (int i = 0; i < a0.length; i+=1) {
+        a0[i] += a1[i]+b;
+      }
+    }
+  }
+
+  static int verify(String text, int i, float elem, float val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
+