changeset 3455:de2f17add1fb

Merge
author kvn
date Thu, 28 Jun 2012 10:35:28 -0700
parents 7d5f65916db0 751bd303aa45
children 22de825d6faf
diffstat 87 files changed, 21344 insertions(+), 3297 deletions(-)
--- a/src/cpu/sparc/vm/sparc.ad	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/cpu/sparc/vm/sparc.ad	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
+// Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -678,18 +678,26 @@
 
 static inline jdouble replicate_immI(int con, int count, int width) {
   // Load a constant replicated "count" times with width "width"
+  assert(count*width == 8 && width <= 4, "sanity");
   int bit_width = width * 8;
-  jlong elt_val = con;
-  elt_val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
-  jlong val = elt_val;
+  jlong val = con;
+  val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
   for (int i = 0; i < count - 1; i++) {
-    val <<= bit_width;
-    val |= elt_val;
+    val |= (val << bit_width);
   }
   jdouble dval = *((jdouble*) &val);  // coerce to double type
   return dval;
 }
 
+static inline jdouble replicate_immF(float con) {
+  // Replicate the float con twice and pack the two copies into a vector.
+  int val = *((int*)&con);
+  jlong lval = val;
+  lval = (lval << 32) | (lval & 0xFFFFFFFFl);
+  jdouble dval = *((jdouble*) &lval);  // coerce to double type
+  return dval;
+}
+
 // Standard Sparc opcode form2 field breakdown
 static inline void emit2_19(CodeBuffer &cbuf, int f30, int f29, int f25, int f22, int f20, int f19, int f0 ) {
   f0 &= (1<<19)-1;     // Mask displacement to 19 bits
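
For reference, a standalone model of what the simplified replicate_immI loop above computes (illustration only, not part of the patch; plain C++ types stand in for jlong/jdouble):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Model of replicate_immI: pack "count" copies of the low "width" bytes of
// "con" into one 64-bit value (count*width must equal 8, width <= 4).
static uint64_t replicate_model(int con, int count, int width) {
  assert(count * width == 8 && width <= 4);
  int bit_width = width * 8;
  uint64_t val = (uint64_t)con & ((1ULL << bit_width) - 1);  // mask off sign bits
  for (int i = 0; i < count - 1; i++) {
    val |= (val << bit_width);  // each pass extends the pattern by one element
  }
  return val;
}

int main() {
  printf("%016llx\n", (unsigned long long)replicate_model(0x1F, 8, 1));  // 1f1f1f1f1f1f1f1f
  printf("%016llx\n", (unsigned long long)replicate_model(-2, 2, 4));    // fffffffefffffffe
  return 0;
}
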
@@ -791,6 +799,7 @@
     case Assembler::stdf_op3: st_op = Op_StoreD; break;
 
     case Assembler::ldsb_op3: ld_op = Op_LoadB; break;
+    case Assembler::ldub_op3: ld_op = Op_LoadUB; break;
     case Assembler::lduh_op3: ld_op = Op_LoadUS; break;
     case Assembler::ldsh_op3: ld_op = Op_LoadS; break;
     case Assembler::ldx_op3:  // may become LoadP or stay LoadI
@@ -799,7 +808,6 @@
     case Assembler::ldd_op3:  ld_op = Op_LoadL; break;
     case Assembler::ldf_op3:  ld_op = Op_LoadF; break;
     case Assembler::lddf_op3: ld_op = Op_LoadD; break;
-    case Assembler::ldub_op3: ld_op = Op_LoadB; break;
     case Assembler::prefetch_op3: ld_op = Op_LoadI; break;
 
     default: ShouldNotReachHere();
@@ -840,10 +848,7 @@
           !(n->ideal_Opcode()==Op_PrefetchRead  && ld_op==Op_LoadI) &&
           !(n->ideal_Opcode()==Op_PrefetchWrite && ld_op==Op_LoadI) &&
           !(n->ideal_Opcode()==Op_PrefetchAllocation && ld_op==Op_LoadI) &&
-          !(n->ideal_Opcode()==Op_Load2I    && ld_op==Op_LoadD) &&
-          !(n->ideal_Opcode()==Op_Load4C    && ld_op==Op_LoadD) &&
-          !(n->ideal_Opcode()==Op_Load4S    && ld_op==Op_LoadD) &&
-          !(n->ideal_Opcode()==Op_Load8B    && ld_op==Op_LoadD) &&
+          !(n->ideal_Opcode()==Op_LoadVector && ld_op==Op_LoadD) &&
           !(n->rule() == loadUB_rule)) {
         verify_oops_warning(n, n->ideal_Opcode(), ld_op);
       }
@@ -855,9 +860,7 @@
           !(n->ideal_Opcode()==Op_StoreI && st_op==Op_StoreF) &&
           !(n->ideal_Opcode()==Op_StoreF && st_op==Op_StoreI) &&
           !(n->ideal_Opcode()==Op_StoreL && st_op==Op_StoreI) &&
-          !(n->ideal_Opcode()==Op_Store2I && st_op==Op_StoreD) &&
-          !(n->ideal_Opcode()==Op_Store4C && st_op==Op_StoreD) &&
-          !(n->ideal_Opcode()==Op_Store8B && st_op==Op_StoreD) &&
+          !(n->ideal_Opcode()==Op_StoreVector && st_op==Op_StoreD) &&
           !(n->ideal_Opcode()==Op_StoreD && st_op==Op_StoreI && n->rule() == storeD0_rule)) {
         verify_oops_warning(n, n->ideal_Opcode(), st_op);
       }
@@ -1849,16 +1852,45 @@
 address last_rethrow = NULL;  // debugging aid for Rethrow encoding
 #endif
 
+// Map Types to machine register types
+const int Matcher::base2reg[Type::lastype] = {
+  Node::NotAMachineReg,0,0, Op_RegI, Op_RegL, 0, Op_RegN,
+  Node::NotAMachineReg, Node::NotAMachineReg, /* tuple, array */
+  0, Op_RegD, 0, 0, /* Vectors */
+  Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, /* the pointers */
+  0, 0/*abio*/,
+  Op_RegP /* Return address */, 0, /* the memories */
+  Op_RegF, Op_RegF, Op_RegF, Op_RegD, Op_RegD, Op_RegD,
+  0  /*bottom*/
+};
+
 // Vector width in bytes
-const uint Matcher::vector_width_in_bytes(void) {
+const int Matcher::vector_width_in_bytes(BasicType bt) {
+  assert(MaxVectorSize == 8, "");
   return 8;
 }
 
 // Vector ideal reg
-const uint Matcher::vector_ideal_reg(void) {
+const int Matcher::vector_ideal_reg(int size) {
+  assert(MaxVectorSize == 8, "");
   return Op_RegD;
 }
 
+// Limits on the vector size (number of elements) loaded into a vector.
+const int Matcher::max_vector_size(const BasicType bt) {
+  assert(is_java_primitive(bt), "only primitive type vectors");
+  return vector_width_in_bytes(bt)/type2aelembytes(bt);
+}
+
+const int Matcher::min_vector_size(const BasicType bt) {
+  return max_vector_size(bt); // Same as max.
+}
+
+// SPARC doesn't support misaligned vector loads and stores.
+const bool Matcher::misaligned_vectors_ok() {
+  return false;
+}
+
 // USII supports fxtof through the whole range of numbers, USIII doesn't
 const bool Matcher::convL2FSupported(void) {
   return VM_Version::has_fast_fxtof();
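
With MaxVectorSize pinned to 8 on SPARC, max_vector_size reduces to 8 / type2aelembytes(bt). A quick illustration of the element counts this yields (standalone sketch using the usual Java primitive sizes; not patch code):

#include <cstdio>

int main() {
  const int width = 8;                // vector_width_in_bytes on SPARC
  printf("bytes:  %d\n", width / 1);  // 8 elements -> Repl8B rules
  printf("shorts: %d\n", width / 2);  // 4 elements -> Repl4S rules
  printf("ints:   %d\n", width / 4);  // 2 elements -> Repl2I rules
  printf("floats: %d\n", width / 4);  // 2 elements -> Repl2F rules
  return 0;
}
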
@@ -3125,50 +3157,6 @@
     __ membar( Assembler::Membar_mask_bits(Assembler::StoreLoad) );
   %}
 
-  enc_class enc_repl8b( iRegI src, iRegL dst ) %{
-    MacroAssembler _masm(&cbuf);
-    Register src_reg = reg_to_register_object($src$$reg);
-    Register dst_reg = reg_to_register_object($dst$$reg);
-    __ sllx(src_reg, 56, dst_reg);
-    __ srlx(dst_reg,  8, O7);
-    __ or3 (dst_reg, O7, dst_reg);
-    __ srlx(dst_reg, 16, O7);
-    __ or3 (dst_reg, O7, dst_reg);
-    __ srlx(dst_reg, 32, O7);
-    __ or3 (dst_reg, O7, dst_reg);
-  %}
-
-  enc_class enc_repl4b( iRegI src, iRegL dst ) %{
-    MacroAssembler _masm(&cbuf);
-    Register src_reg = reg_to_register_object($src$$reg);
-    Register dst_reg = reg_to_register_object($dst$$reg);
-    __ sll(src_reg, 24, dst_reg);
-    __ srl(dst_reg,  8, O7);
-    __ or3(dst_reg, O7, dst_reg);
-    __ srl(dst_reg, 16, O7);
-    __ or3(dst_reg, O7, dst_reg);
-  %}
-
-  enc_class enc_repl4s( iRegI src, iRegL dst ) %{
-    MacroAssembler _masm(&cbuf);
-    Register src_reg = reg_to_register_object($src$$reg);
-    Register dst_reg = reg_to_register_object($dst$$reg);
-    __ sllx(src_reg, 48, dst_reg);
-    __ srlx(dst_reg, 16, O7);
-    __ or3 (dst_reg, O7, dst_reg);
-    __ srlx(dst_reg, 32, O7);
-    __ or3 (dst_reg, O7, dst_reg);
-  %}
-
-  enc_class enc_repl2i( iRegI src, iRegL dst ) %{
-    MacroAssembler _masm(&cbuf);
-    Register src_reg = reg_to_register_object($src$$reg);
-    Register dst_reg = reg_to_register_object($dst$$reg);
-    __ sllx(src_reg, 32, dst_reg);
-    __ srlx(dst_reg, 32, O7);
-    __ or3 (dst_reg, O7, dst_reg);
-  %}
-
 %}
 
 //----------FRAME--------------------------------------------------------------
@@ -5932,50 +5920,6 @@
   ins_pipe(iload_mem);
 %}
 
-// Load Aligned Packed Byte into a Double Register
-instruct loadA8B(regD dst, memory mem) %{
-  match(Set dst (Load8B mem));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "LDDF   $mem,$dst\t! packed8B" %}
-  opcode(Assembler::lddf_op3);
-  ins_encode(simple_form3_mem_reg( mem, dst ) );
-  ins_pipe(floadD_mem);
-%}
-
-// Load Aligned Packed Char into a Double Register
-instruct loadA4C(regD dst, memory mem) %{
-  match(Set dst (Load4C mem));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "LDDF   $mem,$dst\t! packed4C" %}
-  opcode(Assembler::lddf_op3);
-  ins_encode(simple_form3_mem_reg( mem, dst ) );
-  ins_pipe(floadD_mem);
-%}
-
-// Load Aligned Packed Short into a Double Register
-instruct loadA4S(regD dst, memory mem) %{
-  match(Set dst (Load4S mem));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "LDDF   $mem,$dst\t! packed4S" %}
-  opcode(Assembler::lddf_op3);
-  ins_encode(simple_form3_mem_reg( mem, dst ) );
-  ins_pipe(floadD_mem);
-%}
-
-// Load Aligned Packed Int into a Double Register
-instruct loadA2I(regD dst, memory mem) %{
-  match(Set dst (Load2I mem));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "LDDF   $mem,$dst\t! packed2I" %}
-  opcode(Assembler::lddf_op3);
-  ins_encode(simple_form3_mem_reg( mem, dst ) );
-  ins_pipe(floadD_mem);
-%}
-
 // Load Range
 instruct loadRange(iRegI dst, memory mem) %{
   match(Set dst (LoadRange mem));
@@ -6599,17 +6543,6 @@
   ins_pipe(fstoreF_mem_zero);
 %}
 
-// Store Aligned Packed Bytes in Double register to memory
-instruct storeA8B(memory mem, regD src) %{
-  match(Set mem (Store8B mem src));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "STDF   $src,$mem\t! packed8B" %}
-  opcode(Assembler::stdf_op3);
-  ins_encode(simple_form3_mem_reg( mem, src ) );
-  ins_pipe(fstoreD_mem_reg);
-%}
-
 // Convert oop pointer into compressed form
 instruct encodeHeapOop(iRegN dst, iRegP src) %{
   predicate(n->bottom_type()->make_ptr()->ptr() != TypePtr::NotNull);
@@ -6654,62 +6587,6 @@
 %}
 
 
-// Store Zero into Aligned Packed Bytes
-instruct storeA8B0(memory mem, immI0 zero) %{
-  match(Set mem (Store8B mem zero));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "STX    $zero,$mem\t! packed8B" %}
-  opcode(Assembler::stx_op3);
-  ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
-  ins_pipe(fstoreD_mem_zero);
-%}
-
-// Store Aligned Packed Chars/Shorts in Double register to memory
-instruct storeA4C(memory mem, regD src) %{
-  match(Set mem (Store4C mem src));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "STDF   $src,$mem\t! packed4C" %}
-  opcode(Assembler::stdf_op3);
-  ins_encode(simple_form3_mem_reg( mem, src ) );
-  ins_pipe(fstoreD_mem_reg);
-%}
-
-// Store Zero into Aligned Packed Chars/Shorts
-instruct storeA4C0(memory mem, immI0 zero) %{
-  match(Set mem (Store4C mem (Replicate4C zero)));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "STX    $zero,$mem\t! packed4C" %}
-  opcode(Assembler::stx_op3);
-  ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
-  ins_pipe(fstoreD_mem_zero);
-%}
-
-// Store Aligned Packed Ints in Double register to memory
-instruct storeA2I(memory mem, regD src) %{
-  match(Set mem (Store2I mem src));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "STDF   $src,$mem\t! packed2I" %}
-  opcode(Assembler::stdf_op3);
-  ins_encode(simple_form3_mem_reg( mem, src ) );
-  ins_pipe(fstoreD_mem_reg);
-%}
-
-// Store Zero into Aligned Packed Ints
-instruct storeA2I0(memory mem, immI0 zero) %{
-  match(Set mem (Store2I mem zero));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "STX    $zero,$mem\t! packed2I" %}
-  opcode(Assembler::stx_op3);
-  ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
-  ins_pipe(fstoreD_mem_zero);
-%}
-
-
 //----------MemBar Instructions-----------------------------------------------
 // Memory barrier flavors
 
@@ -8880,150 +8757,6 @@
   ins_pipe(ialu_reg_imm);
 %}
 
-// Replicate scalar to packed byte values in Double register
-instruct Repl8B_reg_helper(iRegL dst, iRegI src) %{
-  effect(DEF dst, USE src);
-  format %{ "SLLX  $src,56,$dst\n\t"
-            "SRLX  $dst, 8,O7\n\t"
-            "OR    $dst,O7,$dst\n\t"
-            "SRLX  $dst,16,O7\n\t"
-            "OR    $dst,O7,$dst\n\t"
-            "SRLX  $dst,32,O7\n\t"
-            "OR    $dst,O7,$dst\t! replicate8B" %}
-  ins_encode( enc_repl8b(src, dst));
-  ins_pipe(ialu_reg);
-%}
-
-// Replicate scalar to packed byte values in Double register
-instruct Repl8B_reg(stackSlotD dst, iRegI src) %{
-  match(Set dst (Replicate8B src));
-  expand %{
-    iRegL tmp;
-    Repl8B_reg_helper(tmp, src);
-    regL_to_stkD(dst, tmp);
-  %}
-%}
-
-// Replicate scalar constant to packed byte values in Double register
-instruct Repl8B_immI(regD dst, immI13 con, o7RegI tmp) %{
-  match(Set dst (Replicate8B con));
-  effect(KILL tmp);
-  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl8B($con)" %}
-  ins_encode %{
-    // XXX This is a quick fix for 6833573.
-    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 8, 1)), $dst$$FloatRegister);
-    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 8, 1)), $tmp$$Register);
-    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
-  %}
-  ins_pipe(loadConFD);
-%}
-
-// Replicate scalar to packed char values into stack slot
-instruct Repl4C_reg_helper(iRegL dst, iRegI src) %{
-  effect(DEF dst, USE src);
-  format %{ "SLLX  $src,48,$dst\n\t"
-            "SRLX  $dst,16,O7\n\t"
-            "OR    $dst,O7,$dst\n\t"
-            "SRLX  $dst,32,O7\n\t"
-            "OR    $dst,O7,$dst\t! replicate4C" %}
-  ins_encode( enc_repl4s(src, dst) );
-  ins_pipe(ialu_reg);
-%}
-
-// Replicate scalar to packed char values into stack slot
-instruct Repl4C_reg(stackSlotD dst, iRegI src) %{
-  match(Set dst (Replicate4C src));
-  expand %{
-    iRegL tmp;
-    Repl4C_reg_helper(tmp, src);
-    regL_to_stkD(dst, tmp);
-  %}
-%}
-
-// Replicate scalar constant to packed char values in Double register
-instruct Repl4C_immI(regD dst, immI con, o7RegI tmp) %{
-  match(Set dst (Replicate4C con));
-  effect(KILL tmp);
-  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl4C($con)" %}
-  ins_encode %{
-    // XXX This is a quick fix for 6833573.
-    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 4, 2)), $dst$$FloatRegister);
-    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 4, 2)), $tmp$$Register);
-    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
-  %}
-  ins_pipe(loadConFD);
-%}
-
-// Replicate scalar to packed short values into stack slot
-instruct Repl4S_reg_helper(iRegL dst, iRegI src) %{
-  effect(DEF dst, USE src);
-  format %{ "SLLX  $src,48,$dst\n\t"
-            "SRLX  $dst,16,O7\n\t"
-            "OR    $dst,O7,$dst\n\t"
-            "SRLX  $dst,32,O7\n\t"
-            "OR    $dst,O7,$dst\t! replicate4S" %}
-  ins_encode( enc_repl4s(src, dst) );
-  ins_pipe(ialu_reg);
-%}
-
-// Replicate scalar to packed short values into stack slot
-instruct Repl4S_reg(stackSlotD dst, iRegI src) %{
-  match(Set dst (Replicate4S src));
-  expand %{
-    iRegL tmp;
-    Repl4S_reg_helper(tmp, src);
-    regL_to_stkD(dst, tmp);
-  %}
-%}
-
-// Replicate scalar constant to packed short values in Double register
-instruct Repl4S_immI(regD dst, immI con, o7RegI tmp) %{
-  match(Set dst (Replicate4S con));
-  effect(KILL tmp);
-  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl4S($con)" %}
-  ins_encode %{
-    // XXX This is a quick fix for 6833573.
-    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 4, 2)), $dst$$FloatRegister);
-    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 4, 2)), $tmp$$Register);
-    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
-  %}
-  ins_pipe(loadConFD);
-%}
-
-// Replicate scalar to packed int values in Double register
-instruct Repl2I_reg_helper(iRegL dst, iRegI src) %{
-  effect(DEF dst, USE src);
-  format %{ "SLLX  $src,32,$dst\n\t"
-            "SRLX  $dst,32,O7\n\t"
-            "OR    $dst,O7,$dst\t! replicate2I" %}
-  ins_encode( enc_repl2i(src, dst));
-  ins_pipe(ialu_reg);
-%}
-
-// Replicate scalar to packed int values in Double register
-instruct Repl2I_reg(stackSlotD dst, iRegI src) %{
-  match(Set dst (Replicate2I src));
-  expand %{
-    iRegL tmp;
-    Repl2I_reg_helper(tmp, src);
-    regL_to_stkD(dst, tmp);
-  %}
-%}
-
-// Replicate scalar zero constant to packed int values in Double register
-instruct Repl2I_immI(regD dst, immI con, o7RegI tmp) %{
-  match(Set dst (Replicate2I con));
-  effect(KILL tmp);
-  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl2I($con)" %}
-  ins_encode %{
-    // XXX This is a quick fix for 6833573.
-    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 2, 4)), $dst$$FloatRegister);
-    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 2, 4)), $tmp$$Register);
-    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
-  %}
-  ins_pipe(loadConFD);
-%}
-
 //----------Control Flow Instructions------------------------------------------
 // Compare Instructions
 // Compare Integers
@@ -10742,6 +10475,308 @@
   ins_pipe(istore_mem_reg);
 %}
 
+// ====================VECTOR INSTRUCTIONS=====================================
+
+// Load Aligned Packed values into a Double Register
+instruct loadV8(regD dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 8);
+  match(Set dst (LoadVector mem));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "LDDF   $mem,$dst\t! load vector (8 bytes)" %}
+  ins_encode %{
+    __ ldf(FloatRegisterImpl::D, $mem$$Address, as_DoubleFloatRegister($dst$$reg));
+  %}
+  ins_pipe(floadD_mem);
+%}
+
+// Store Vector in Double register to memory
+instruct storeV8(memory mem, regD src) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem src));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "STDF   $src,$mem\t! store vector (8 bytes)" %}
+  ins_encode %{
+    __ stf(FloatRegisterImpl::D, as_DoubleFloatRegister($src$$reg), $mem$$Address);
+  %}
+  ins_pipe(fstoreD_mem_reg);
+%}
+
+// Store Zero into vector in memory
+instruct storeV8B_zero(memory mem, immI0 zero) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem (ReplicateB zero)));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "STX    $zero,$mem\t! store zero vector (8 bytes)" %}
+  ins_encode %{
+    __ stx(G0, $mem$$Address);
+  %}
+  ins_pipe(fstoreD_mem_zero);
+%}
+
+instruct storeV4S_zero(memory mem, immI0 zero) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem (ReplicateS zero)));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "STX    $zero,$mem\t! store zero vector (4 shorts)" %}
+  ins_encode %{
+    __ stx(G0, $mem$$Address);
+  %}
+  ins_pipe(fstoreD_mem_zero);
+%}
+
+instruct storeV2I_zero(memory mem, immI0 zero) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem (ReplicateI zero)));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "STX    $zero,$mem\t! store zero vector (2 ints)" %}
+  ins_encode %{
+    __ stx(G0, $mem$$Address);
+  %}
+  ins_pipe(fstoreD_mem_zero);
+%}
+
+instruct storeV2F_zero(memory mem, immF0 zero) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem (ReplicateF zero)));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "STX    $zero,$mem\t! store zero vector (2 floats)" %}
+  ins_encode %{
+    __ stx(G0, $mem$$Address);
+  %}
+  ins_pipe(fstoreD_mem_zero);
+%}
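
All four zero-store rules above emit the same single STX of G0: 64 zero bits are a correct zero vector for every element type, including floats, because +0.0f is all-zero bits in IEEE-754. A minimal check of that assumption (illustrative C++, not patch code):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  float z = 0.0f;
  uint32_t bits;
  memcpy(&bits, &z, sizeof(bits));  // type-pun safely via memcpy
  assert(bits == 0);                // +0.0f is all-zero bits
  return 0;
}
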
+
+// Replicate scalar to packed byte values into Double register
+instruct Repl8B_reg(regD dst, iRegI src, iRegL tmp, o7RegL tmp2) %{
+  predicate(n->as_Vector()->length() == 8 && UseVIS >= 3);
+  match(Set dst (ReplicateB src));
+  effect(DEF dst, USE src, TEMP tmp, KILL tmp2);
+  format %{ "SLLX  $src,56,$tmp\n\t"
+            "SRLX  $tmp, 8,$tmp2\n\t"
+            "OR    $tmp,$tmp2,$tmp\n\t"
+            "SRLX  $tmp,16,$tmp2\n\t"
+            "OR    $tmp,$tmp2,$tmp\n\t"
+            "SRLX  $tmp,32,$tmp2\n\t"
+            "OR    $tmp,$tmp2,$tmp\t! replicate8B\n\t"
+            "MOVXTOD $tmp,$dst\t! MoveL2D" %}
+  ins_encode %{
+    Register Rsrc = $src$$Register;
+    Register Rtmp = $tmp$$Register;
+    Register Rtmp2 = $tmp2$$Register;
+    __ sllx(Rsrc,    56, Rtmp);
+    __ srlx(Rtmp,     8, Rtmp2);
+    __ or3 (Rtmp, Rtmp2, Rtmp);
+    __ srlx(Rtmp,    16, Rtmp2);
+    __ or3 (Rtmp, Rtmp2, Rtmp);
+    __ srlx(Rtmp,    32, Rtmp2);
+    __ or3 (Rtmp, Rtmp2, Rtmp);
+    __ movxtod(Rtmp, as_DoubleFloatRegister($dst$$reg));
+  %}
+  ins_pipe(ialu_reg);
+%}
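
The SLLX/SRLX/OR3 sequence above is a fold: the byte is planted in bits 63..56 and then OR-ed back in at offsets of 8, 16 and 32 bits. A standalone model of the computation (illustration only, not patch code):

#include <cstdint>
#include <cstdio>

static uint64_t repl8b_model(uint32_t src) {
  uint64_t v = (uint64_t)src << 56;  // sllx src,56: low byte into bits 63..56
  v |= v >> 8;                       // top 2 bytes populated
  v |= v >> 16;                      // top 4 bytes populated
  v |= v >> 32;                      // all 8 bytes populated
  return v;
}

int main() {
  printf("%016llx\n", (unsigned long long)repl8b_model(0xAB));  // abababababababab
  return 0;
}
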
+
+// Replicate scalar to packed byte values into Double stack
+instruct Repl8B_stk(stackSlotD dst, iRegI src, iRegL tmp, o7RegL tmp2) %{
+  predicate(n->as_Vector()->length() == 8 && UseVIS < 3);
+  match(Set dst (ReplicateB src));
+  effect(DEF dst, USE src, TEMP tmp, KILL tmp2);
+  format %{ "SLLX  $src,56,$tmp\n\t"
+            "SRLX  $tmp, 8,$tmp2\n\t"
+            "OR    $tmp,$tmp2,$tmp\n\t"
+            "SRLX  $tmp,16,$tmp2\n\t"
+            "OR    $tmp,$tmp2,$tmp\n\t"
+            "SRLX  $tmp,32,$tmp2\n\t"
+            "OR    $tmp,$tmp2,$tmp\t! replicate8B\n\t"
+            "STX   $tmp,$dst\t! regL to stkD" %}
+  ins_encode %{
+    Register Rsrc = $src$$Register;
+    Register Rtmp = $tmp$$Register;
+    Register Rtmp2 = $tmp2$$Register;
+    __ sllx(Rsrc,    56, Rtmp);
+    __ srlx(Rtmp,     8, Rtmp2);
+    __ or3 (Rtmp, Rtmp2, Rtmp);
+    __ srlx(Rtmp,    16, Rtmp2);
+    __ or3 (Rtmp, Rtmp2, Rtmp);
+    __ srlx(Rtmp,    32, Rtmp2);
+    __ or3 (Rtmp, Rtmp2, Rtmp);
+    __ set ($dst$$disp + STACK_BIAS, Rtmp2);
+    __ stx (Rtmp, Rtmp2, $dst$$base$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+// Replicate scalar constant to packed byte values in Double register
+instruct Repl8B_immI(regD dst, immI13 con, o7RegI tmp) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB con));
+  effect(KILL tmp);
+  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl8B($con)" %}
+  ins_encode %{
+    // XXX This is a quick fix for 6833573.
+    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 8, 1)), $dst$$FloatRegister);
+    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 8, 1)), $tmp$$Register);
+    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
+  %}
+  ins_pipe(loadConFD);
+%}
+
+// Replicate scalar to packed char/short values into Double register
+instruct Repl4S_reg(regD dst, iRegI src, iRegL tmp, o7RegL tmp2) %{
+  predicate(n->as_Vector()->length() == 4 && UseVIS >= 3);
+  match(Set dst (ReplicateS src));
+  effect(DEF dst, USE src, TEMP tmp, KILL tmp2);
+  format %{ "SLLX  $src,48,$tmp\n\t"
+            "SRLX  $tmp,16,$tmp2\n\t"
+            "OR    $tmp,$tmp2,$tmp\n\t"
+            "SRLX  $tmp,32,$tmp2\n\t"
+            "OR    $tmp,$tmp2,$tmp\t! replicate4S\n\t"
+            "MOVXTOD $tmp,$dst\t! MoveL2D" %}
+  ins_encode %{
+    Register Rsrc = $src$$Register;
+    Register Rtmp = $tmp$$Register;
+    Register Rtmp2 = $tmp2$$Register;
+    __ sllx(Rsrc,    48, Rtmp);
+    __ srlx(Rtmp,    16, Rtmp2);
+    __ or3 (Rtmp, Rtmp2, Rtmp);
+    __ srlx(Rtmp,    32, Rtmp2);
+    __ or3 (Rtmp, Rtmp2, Rtmp);
+    __ movxtod(Rtmp, as_DoubleFloatRegister($dst$$reg));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+// Replicate scalar to packed char/short values into Double stack
+instruct Repl4S_stk(stackSlotD dst, iRegI src, iRegL tmp, o7RegL tmp2) %{
+  predicate(n->as_Vector()->length() == 4 && UseVIS < 3);
+  match(Set dst (ReplicateS src));
+  effect(DEF dst, USE src, TEMP tmp, KILL tmp2);
+  format %{ "SLLX  $src,48,$tmp\n\t"
+            "SRLX  $tmp,16,$tmp2\n\t"
+            "OR    $tmp,$tmp2,$tmp\n\t"
+            "SRLX  $tmp,32,$tmp2\n\t"
+            "OR    $tmp,$tmp2,$tmp\t! replicate4S\n\t"
+            "STX   $tmp,$dst\t! regL to stkD" %}
+  ins_encode %{
+    Register Rsrc = $src$$Register;
+    Register Rtmp = $tmp$$Register;
+    Register Rtmp2 = $tmp2$$Register;
+    __ sllx(Rsrc,    48, Rtmp);
+    __ srlx(Rtmp,    16, Rtmp2);
+    __ or3 (Rtmp, Rtmp2, Rtmp);
+    __ srlx(Rtmp,    32, Rtmp2);
+    __ or3 (Rtmp, Rtmp2, Rtmp);
+    __ set ($dst$$disp + STACK_BIAS, Rtmp2);
+    __ stx (Rtmp, Rtmp2, $dst$$base$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+// Replicate scalar constant to packed char/short values in Double register
+instruct Repl4S_immI(regD dst, immI con, o7RegI tmp) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS con));
+  effect(KILL tmp);
+  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl4S($con)" %}
+  ins_encode %{
+    // XXX This is a quick fix for 6833573.
+    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 4, 2)), $dst$$FloatRegister);
+    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 4, 2)), $tmp$$Register);
+    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
+  %}
+  ins_pipe(loadConFD);
+%}
+
+// Replicate scalar to packed int values into Double register
+instruct Repl2I_reg(regD dst, iRegI src, iRegL tmp, o7RegL tmp2) %{
+  predicate(n->as_Vector()->length() == 2 && UseVIS >= 3);
+  match(Set dst (ReplicateI src));
+  effect(DEF dst, USE src, TEMP tmp, KILL tmp2);
+  format %{ "SLLX  $src,32,$tmp\n\t"
+            "SRLX  $tmp,32,$tmp2\n\t"
+            "OR    $tmp,$tmp2,$tmp\t! replicate2I\n\t"
+            "MOVXTOD $tmp,$dst\t! MoveL2D" %}
+  ins_encode %{
+    Register Rsrc = $src$$Register;
+    Register Rtmp = $tmp$$Register;
+    Register Rtmp2 = $tmp2$$Register;
+    __ sllx(Rsrc,    32, Rtmp);
+    __ srlx(Rtmp,    32, Rtmp2);
+    __ or3 (Rtmp, Rtmp2, Rtmp);
+    __ movxtod(Rtmp, as_DoubleFloatRegister($dst$$reg));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+// Replicate scalar to packed int values into Double stack
+instruct Repl2I_stk(stackSlotD dst, iRegI src, iRegL tmp, o7RegL tmp2) %{
+  predicate(n->as_Vector()->length() == 2 && UseVIS < 3);
+  match(Set dst (ReplicateI src));
+  effect(DEF dst, USE src, TEMP tmp, KILL tmp2);
+  format %{ "SLLX  $src,32,$tmp\n\t"
+            "SRLX  $tmp,32,$tmp2\n\t"
+            "OR    $tmp,$tmp2,$tmp\t! replicate2I\n\t"
+            "STX   $tmp,$dst\t! regL to stkD" %}
+  ins_encode %{
+    Register Rsrc = $src$$Register;
+    Register Rtmp = $tmp$$Register;
+    Register Rtmp2 = $tmp2$$Register;
+    __ sllx(Rsrc,    32, Rtmp);
+    __ srlx(Rtmp,    32, Rtmp2);
+    __ or3 (Rtmp, Rtmp2, Rtmp);
+    __ set ($dst$$disp + STACK_BIAS, Rtmp2);
+    __ stx (Rtmp, Rtmp2, $dst$$base$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+// Replicate scalar constant to packed int values in Double register
+instruct Repl2I_immI(regD dst, immI con, o7RegI tmp) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI con));
+  effect(KILL tmp);
+  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl2I($con)" %}
+  ins_encode %{
+    // XXX This is a quick fix for 6833573.
+    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 2, 4)), $dst$$FloatRegister);
+    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 2, 4)), $tmp$$Register);
+    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
+  %}
+  ins_pipe(loadConFD);
+%}
+
+// Replicate scalar to packed float values into Double stack
+instruct Repl2F_stk(stackSlotD dst, regF src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF src));
+  ins_cost(MEMORY_REF_COST*2);
+  format %{ "STF    $src,$dst.hi\t! packed2F\n\t"
+            "STF    $src,$dst.lo" %}
+  opcode(Assembler::stf_op3);
+  ins_encode(simple_form3_mem_reg(dst, src), form3_mem_plus_4_reg(dst, src));
+  ins_pipe(fstoreF_stk_reg);
+%}
+
+// Replicate scalar constant to packed float values in Double register
+instruct Repl2F_immF(regD dst, immF con, o7RegI tmp) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF con));
+  effect(KILL tmp);
+  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl2F($con)" %}
+  ins_encode %{
+    // XXX This is a quick fix for 6833573.
+    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immF($con$$constant)), $dst$$FloatRegister);
+    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immF($con$$constant)), $tmp$$Register);
+    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
+  %}
+  ins_pipe(loadConFD);
+%}
+
 //----------PEEPHOLE RULES-----------------------------------------------------
 // These must follow all instruction definitions as they use the names
// defined in the instruction definitions.
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -217,6 +217,8 @@
   // Currently not supported anywhere.
   FLAG_SET_DEFAULT(UseFPUForSpilling, false);
 
+  MaxVectorSize = 8;
+
   assert((InteriorEntryAlignment % relocInfo::addr_unit()) == 0, "alignment is not a multiple of NOP size");
 #endif
 
--- a/src/cpu/x86/vm/assembler_x86.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1637,6 +1637,13 @@
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE);
+  emit_byte(0x16);
+  emit_byte(0xC0 | encode);
+}
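
movlhps copies the low 64 bits of src into the high 64 bits of dst and leaves dst's low half intact. A data-movement sketch (illustration only, lanes numbered from bit 0; not patch code):

#include <cstdint>

// Model of MOVLHPS dst, src on a 128-bit register viewed as two 64-bit lanes.
static void movlhps_model(uint64_t dst[2], const uint64_t src[2]) {
  dst[1] = src[0];  // low 64 bits of src -> high 64 bits of dst
  // dst[0] is left unchanged
}
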
+
 void Assembler::movb(Register dst, Address src) {
   NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
   InstructionMark im(this);
@@ -1686,6 +1693,14 @@
   emit_operand(dst, src);
 }
 
+void Assembler::movdl(Address dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66);
+  emit_byte(0x7E);
+  emit_operand(src, dst);
+}
+
 void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
@@ -1716,6 +1731,35 @@
   emit_operand(src, dst);
 }
 
+// Move Unaligned 256-bit Vector
+void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
+  assert(UseAVX, "");
+  bool vector256 = true;
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256);
+  emit_byte(0x6F);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vmovdqu(XMMRegister dst, Address src) {
+  assert(UseAVX, "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256);
+  emit_byte(0x6F);
+  emit_operand(dst, src);
+}
+
+void Assembler::vmovdqu(Address dst, XMMRegister src) {
+  assert(UseAVX, "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  // swap src<->dst for encoding
+  assert(src != xnoreg, "sanity");
+  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector256);
+  emit_byte(0x7F);
+  emit_operand(src, dst);
+}
+
 // Uses zero extension on 64bit
 
 void Assembler::movl(Register dst, int32_t imm32) {
@@ -3112,6 +3156,13 @@
   emit_operand(dst, src);
 }
 
+void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256);
+  emit_byte(0x57);
+  emit_byte(0xC0 | encode);
+}
+
 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
@@ -3120,6 +3171,30 @@
   emit_operand(dst, src);
 }
 
+void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, vector256);
+  emit_byte(0x57);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  bool vector256 = true;
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
+  emit_byte(0x18);
+  emit_byte(0xC0 | encode);
+  // 0x00 - insert into lower 128 bits
+  // 0x01 - insert into upper 128 bits
+  emit_byte(0x01);
+}
+
+void Assembler::vzeroupper() {
+  assert(VM_Version::supports_avx(), "");
+  (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
+  emit_byte(0x77);
+}
+
 
 #ifndef _LP64
 // 32bit only pieces of the assembler
--- a/src/cpu/x86/vm/assembler_x86.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -591,8 +591,9 @@
 
   void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
                   VexSimdPrefix pre, bool vector256 = false) {
-     vex_prefix(src, nds->encoding(), dst->encoding(),
-                pre, VEX_OPCODE_0F, false, vector256);
+    int dst_enc = dst->encoding();
+    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+    vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector256);
   }
 
   int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
@@ -600,9 +601,12 @@
                              bool vex_w, bool vector256);
 
   int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
-                             VexSimdPrefix pre, bool vector256 = false) {
-     return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
-                                  pre, VEX_OPCODE_0F, false, vector256);
+                             VexSimdPrefix pre, bool vector256 = false,
+                             VexOpcode opc = VEX_OPCODE_0F) {
+    int src_enc = src->encoding();
+    int dst_enc = dst->encoding();
+    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector256);
   }
 
   void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
@@ -1261,6 +1265,7 @@
   void movdl(XMMRegister dst, Register src);
   void movdl(Register dst, XMMRegister src);
   void movdl(XMMRegister dst, Address src);
+  void movdl(Address dst, XMMRegister src);
 
   // Move Double Quadword
   void movdq(XMMRegister dst, Register src);
@@ -1274,6 +1279,14 @@
   void movdqu(XMMRegister dst, Address src);
   void movdqu(XMMRegister dst, XMMRegister src);
 
+  // Move Unaligned 256-bit Vector
+  void vmovdqu(Address dst, XMMRegister src);
+  void vmovdqu(XMMRegister dst, Address src);
+  void vmovdqu(XMMRegister dst, XMMRegister src);
+
+  // Move lower 64 bits to high 64 bits in a 128-bit register
+  void movlhps(XMMRegister dst, XMMRegister src);
+
   void movl(Register dst, int32_t imm32);
   void movl(Address dst, int32_t imm32);
   void movl(Register dst, Register src);
@@ -1615,6 +1628,17 @@
   void vxorpd(XMMRegister dst, XMMRegister nds, Address src);
   void vxorps(XMMRegister dst, XMMRegister nds, Address src);
 
+  // AVX Vector instructions.
+  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
+
+  // AVX instruction which is used to clear upper 128 bits of YMM registers and
+  // to avoid the transition penalty between AVX and SSE states. There is no
+  // penalty if legacy SSE instructions are encoded using VEX prefix because
+  // they always clear upper 128 bits. It should be used before calling
+  // runtime code and native libraries.
+  void vzeroupper();
 
  protected:
   // Next instructions require address alignment 16 bytes SSE mode.
@@ -2529,9 +2553,13 @@
   void vsubss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubss(dst, nds, src); }
   void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
 
+  // AVX Vector instructions
+
+  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
   void vxorpd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorpd(dst, nds, src); }
   void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
 
+  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
   void vxorps(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorps(dst, nds, src); }
   void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
 
--- a/src/cpu/x86/vm/register_x86.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/cpu/x86/vm/register_x86.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -35,7 +35,7 @@
 const int ConcreteRegisterImpl::max_fpr = ConcreteRegisterImpl::max_gpr +
                                                                  2 * FloatRegisterImpl::number_of_registers;
 const int ConcreteRegisterImpl::max_xmm = ConcreteRegisterImpl::max_fpr +
-                                                                 2 * XMMRegisterImpl::number_of_registers;
+                                                                 8 * XMMRegisterImpl::number_of_registers;
 const char* RegisterImpl::name() const {
   const char* names[number_of_registers] = {
 #ifndef AMD64
--- a/src/cpu/x86/vm/register_x86.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/cpu/x86/vm/register_x86.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -158,7 +158,7 @@
   XMMRegister successor() const                          { return as_XMMRegister(encoding() + 1); }
 
   // accessors
-  int   encoding() const                          { assert(is_valid(), "invalid register"); return (intptr_t)this; }
+  int   encoding() const                          { assert(is_valid(), err_msg("invalid register (%d)", (int)(intptr_t)this )); return (intptr_t)this; }
   bool  is_valid() const                          { return 0 <= (intptr_t)this && (intptr_t)this < number_of_registers; }
   const char* name() const;
 };
@@ -216,7 +216,7 @@
                                RegisterImpl::number_of_registers +  // "H" half of a 64bit register
 #endif // AMD64
                            2 * FloatRegisterImpl::number_of_registers +
-                           2 * XMMRegisterImpl::number_of_registers +
+                           8 * XMMRegisterImpl::number_of_registers +
                            1 // eflags
   };
 
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -467,6 +467,32 @@
   if (!supports_avx ()) // Drop to 0 if no AVX  support
     UseAVX = 0;
 
+#ifdef COMPILER2
+  if (UseFPUForSpilling) {
+    if (UseSSE < 2) {
+      // Only supported with SSE2+
+      FLAG_SET_DEFAULT(UseFPUForSpilling, false);
+    }
+  }
+  if (MaxVectorSize > 0) {
+    if (!is_power_of_2(MaxVectorSize)) {
+      warning("MaxVectorSize must be a power of 2");
+      FLAG_SET_DEFAULT(MaxVectorSize, 32);
+    }
+    if (MaxVectorSize > 32) {
+      FLAG_SET_DEFAULT(MaxVectorSize, 32);
+    }
+    if (MaxVectorSize > 16 && UseAVX == 0) {
+      // Only supported with AVX+
+      FLAG_SET_DEFAULT(MaxVectorSize, 16);
+    }
+    if (UseSSE < 2) {
+      // Only supported with SSE2+
+      FLAG_SET_DEFAULT(MaxVectorSize, 0);
+    }
+  }
+#endif
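
Taken together, the new checks clamp the flag in a fixed order; a hedged sketch of the net effect (hypothetical helper mirroring the flag logic above; the real code updates the MaxVectorSize flag in place):

// Illustration only.
static int effective_max_vector_size(int requested, int use_avx, int use_sse) {
  int v = requested;
  if (v <= 0) return v;                // guarded by the MaxVectorSize > 0 test
  if ((v & (v - 1)) != 0) v = 32;      // not a power of 2: reset to the default 32
  if (v > 32) v = 32;                  // hard upper bound
  if (v > 16 && use_avx == 0) v = 16;  // 32-byte vectors need AVX
  if (use_sse < 2) v = 0;              // any vectors need SSE2+
  return v;
}
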
+
  // On new CPUs, instructions which update the whole XMM register should be used
  // to prevent partial register stalls due to dependencies on the high half.
   //
@@ -544,6 +570,12 @@
       }
     }
 
+#ifdef COMPILER2
+    if (MaxVectorSize > 16) {
+      // Limit vector size to 16 bytes on current AMD cpus.
+      FLAG_SET_DEFAULT(MaxVectorSize, 16);
+    }
+#endif // COMPILER2
   }
 
   if( is_intel() ) { // Intel cpus specific settings
@@ -606,15 +638,6 @@
     FLAG_SET_DEFAULT(UsePopCountInstruction, false);
   }
 
-#ifdef COMPILER2
-  if (UseFPUForSpilling) {
-    if (UseSSE < 2) {
-      // Only supported with SSE2+
-      FLAG_SET_DEFAULT(UseFPUForSpilling, false);
-    }
-  }
-#endif
-
   assert(0 <= ReadPrefetchInstr && ReadPrefetchInstr <= 3, "invalid value");
   assert(0 <= AllocatePrefetchInstr && AllocatePrefetchInstr <= 3, "invalid value");
 
--- a/src/cpu/x86/vm/vmreg_x86.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/cpu/x86/vm/vmreg_x86.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -48,8 +48,9 @@
 
   XMMRegister xreg = ::as_XMMRegister(0);
   for ( ; i < ConcreteRegisterImpl::max_xmm ; ) {
-    regName[i++] = xreg->name();
-    regName[i++] = xreg->name();
+    for (int j = 0 ; j < 8 ; j++) {
+      regName[i++] = xreg->name();
+    }
     xreg = xreg->successor();
   }
   for ( ; i < ConcreteRegisterImpl::number_of_registers ; i ++ ) {
--- a/src/cpu/x86/vm/vmreg_x86.inline.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/cpu/x86/vm/vmreg_x86.inline.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -39,7 +39,7 @@
 }
 
 inline VMReg XMMRegisterImpl::as_VMReg() {
-  return VMRegImpl::as_VMReg((encoding() << 1) + ConcreteRegisterImpl::max_fpr);
+  return VMRegImpl::as_VMReg((encoding() << 3) + ConcreteRegisterImpl::max_fpr);
 }
 
 
@@ -75,7 +75,7 @@
 inline XMMRegister VMRegImpl::as_XMMRegister() {
   assert( is_XMMRegister() && is_even(value()), "must be" );
   // Yuk
-  return ::as_XMMRegister((value() - ConcreteRegisterImpl::max_fpr) >> 1);
+  return ::as_XMMRegister((value() - ConcreteRegisterImpl::max_fpr) >> 3);
 }
 
 inline   bool VMRegImpl::is_concrete() {
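
With each XMM register now occupying eight VMReg slots, the two conversions above stay inverses of each other by scaling with a shift of 3 instead of 1. A worked sketch (illustration only; the max_fpr value is made up):

#include <cassert>

int main() {
  const int max_fpr = 64;                    // hypothetical ConcreteRegisterImpl::max_fpr
  int enc = 3;                               // xmm3
  int vmreg = (enc << 3) + max_fpr;          // XMMRegisterImpl::as_VMReg
  assert(((vmreg - max_fpr) >> 3) == enc);   // VMRegImpl::as_XMMRegister
  return 0;
}
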
--- a/src/cpu/x86/vm/x86.ad	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/cpu/x86/vm/x86.ad	Thu Jun 28 10:35:28 2012 -0700
@@ -24,6 +24,456 @@
 
 // X86 Common Architecture Description File
 
+//----------REGISTER DEFINITION BLOCK------------------------------------------
+// This information is used by the matcher and the register allocator to
+// describe individual registers and classes of registers within the target
+// architecture.
+
+register %{
+//----------Architecture Description Register Definitions----------------------
+// General Registers
+// "reg_def"  name ( register save type, C convention save type,
+//                   ideal register type, encoding );
+// Register Save Types:
+//
+// NS  = No-Save:       The register allocator assumes that these registers
+//                      can be used without saving upon entry to the method, &
+//                      that they do not need to be saved at call sites.
+//
+// SOC = Save-On-Call:  The register allocator assumes that these registers
+//                      can be used without saving upon entry to the method,
+//                      but that they must be saved at call sites.
+//
+// SOE = Save-On-Entry: The register allocator assumes that these registers
+//                      must be saved before using them upon entry to the
+//                      method, but they do not need to be saved at call
+//                      sites.
+//
+// AS  = Always-Save:   The register allocator assumes that these registers
+//                      must be saved before using them upon entry to the
+//                      method, & that they must be saved at call sites.
+//
+// Ideal Register Type is used to determine how to save & restore a
+// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
+// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
+//
+// The encoding number is the actual bit-pattern placed into the opcodes.
+
+// XMM registers.  256-bit registers, i.e. 8 words each, labeled (a)-h.
+// Word a in each register holds a Float, words a-b hold a Double.
+// The whole registers are used in SSE4.2 intrinsics, array copy stubs
+// and superword operations (see UseSSE42Intrinsics, UseXMMForArrayCopy
+// and UseSuperWord flags).
+// XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
+// Linux ABI:   No register preserved across function calls
+//              XMM0-XMM7 might hold parameters
+// Windows ABI: XMM6-XMM15 preserved across function calls
+//              XMM0-XMM3 might hold parameters
+
+reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
+reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next());
+reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next());
+reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next());
+reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next());
+reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
+reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next());
+reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next());
+reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next());
+reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next());
+reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
+reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next());
+reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next());
+reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next());
+reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next());
+reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
+reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next());
+reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next());
+reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next());
+reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next());
+reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
+reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next());
+reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next());
+reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next());
+reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next());
+reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
+reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next());
+reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next());
+reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next());
+reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next());
+reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+#ifdef _WIN64
+
+reg_def XMM6 ( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg());
+reg_def XMM6b( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next());
+reg_def XMM6c( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next());
+reg_def XMM6d( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next());
+reg_def XMM6e( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next());
+reg_def XMM6f( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM6g( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM6h( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM7 ( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg());
+reg_def XMM7b( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next());
+reg_def XMM7c( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next());
+reg_def XMM7d( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next());
+reg_def XMM7e( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next());
+reg_def XMM7f( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM7g( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM7h( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM8 ( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg());
+reg_def XMM8b( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next());
+reg_def XMM8c( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next());
+reg_def XMM8d( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next());
+reg_def XMM8e( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next());
+reg_def XMM8f( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM8g( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM8h( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM9 ( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg());
+reg_def XMM9b( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next());
+reg_def XMM9c( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next());
+reg_def XMM9d( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next());
+reg_def XMM9e( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next());
+reg_def XMM9f( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM9g( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM9h( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM10 ( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg());
+reg_def XMM10b( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next());
+reg_def XMM10c( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next());
+reg_def XMM10d( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next());
+reg_def XMM10e( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next());
+reg_def XMM10f( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM10g( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM10h( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM11 ( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg());
+reg_def XMM11b( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next());
+reg_def XMM11c( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next());
+reg_def XMM11d( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next());
+reg_def XMM11e( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next());
+reg_def XMM11f( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM11g( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM11h( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM12 ( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg());
+reg_def XMM12b( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next());
+reg_def XMM12c( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next());
+reg_def XMM12d( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next());
+reg_def XMM12e( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next());
+reg_def XMM12f( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM12g( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM12h( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM13 ( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg());
+reg_def XMM13b( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next());
+reg_def XMM13c( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next());
+reg_def XMM13d( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next());
+reg_def XMM13e( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next());
+reg_def XMM13f( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM13g( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM13h( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM14 ( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg());
+reg_def XMM14b( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next());
+reg_def XMM14c( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next());
+reg_def XMM14d( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next());
+reg_def XMM14e( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next());
+reg_def XMM14f( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM14g( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM14h( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM15 ( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg());
+reg_def XMM15b( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next());
+reg_def XMM15c( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next());
+reg_def XMM15d( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next());
+reg_def XMM15e( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next());
+reg_def XMM15f( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM15g( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM15h( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+#else // _WIN64
+
+reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
+reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next());
+reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next());
+reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next());
+reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next());
+reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
+reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next());
+reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next());
+reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next());
+reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next());
+reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+#ifdef _LP64
+
+reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
+reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next());
+reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next());
+reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next());
+reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next());
+reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
+reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next());
+reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next());
+reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next());
+reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next());
+reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
+reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next());
+reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next());
+reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next());
+reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next());
+reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
+reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next());
+reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next());
+reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next());
+reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next());
+reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
+reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next());
+reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next());
+reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next());
+reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next());
+reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
+reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next());
+reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next());
+reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next());
+reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next());
+reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
+reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next());
+reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next());
+reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next());
+reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next());
+reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
+reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next());
+reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next());
+reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next());
+reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next());
+reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+#endif // _LP64
+
+#endif // _WIN64
+
+#ifdef _LP64
+reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
+#else
+reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
+#endif // _LP64
+
+alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
+                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
+                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
+                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
+                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
+                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
+                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
+                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
+#ifdef _LP64
+                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
+                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
+                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
+                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
+                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
+                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
+                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
+                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
+#endif
+                   );
+
+// flags allocation class should be last.
+alloc_class chunk2(RFLAGS);
+
+// Singleton class for condition codes
+reg_class int_flags(RFLAGS);
+
+// Class for all float registers
+reg_class float_reg(XMM0,
+                    XMM1,
+                    XMM2,
+                    XMM3,
+                    XMM4,
+                    XMM5,
+                    XMM6,
+                    XMM7
+#ifdef _LP64
+                   ,XMM8,
+                    XMM9,
+                    XMM10,
+                    XMM11,
+                    XMM12,
+                    XMM13,
+                    XMM14,
+                    XMM15
+#endif
+                    );
+
+// Class for all double registers
+reg_class double_reg(XMM0,  XMM0b,
+                     XMM1,  XMM1b,
+                     XMM2,  XMM2b,
+                     XMM3,  XMM3b,
+                     XMM4,  XMM4b,
+                     XMM5,  XMM5b,
+                     XMM6,  XMM6b,
+                     XMM7,  XMM7b
+#ifdef _LP64
+                    ,XMM8,  XMM8b,
+                     XMM9,  XMM9b,
+                     XMM10, XMM10b,
+                     XMM11, XMM11b,
+                     XMM12, XMM12b,
+                     XMM13, XMM13b,
+                     XMM14, XMM14b,
+                     XMM15, XMM15b
+#endif
+                     );
+
+// Class for all 32bit vector registers
+reg_class vectors_reg(XMM0,
+                      XMM1,
+                      XMM2,
+                      XMM3,
+                      XMM4,
+                      XMM5,
+                      XMM6,
+                      XMM7
+#ifdef _LP64
+                     ,XMM8,
+                      XMM9,
+                      XMM10,
+                      XMM11,
+                      XMM12,
+                      XMM13,
+                      XMM14,
+                      XMM15
+#endif
+                      );
+
+// Class for all 64bit vector registers
+reg_class vectord_reg(XMM0,  XMM0b,
+                      XMM1,  XMM1b,
+                      XMM2,  XMM2b,
+                      XMM3,  XMM3b,
+                      XMM4,  XMM4b,
+                      XMM5,  XMM5b,
+                      XMM6,  XMM6b,
+                      XMM7,  XMM7b
+#ifdef _LP64
+                     ,XMM8,  XMM8b,
+                      XMM9,  XMM9b,
+                      XMM10, XMM10b,
+                      XMM11, XMM11b,
+                      XMM12, XMM12b,
+                      XMM13, XMM13b,
+                      XMM14, XMM14b,
+                      XMM15, XMM15b
+#endif
+                      );
+
+// Class for all 128bit vector registers
+reg_class vectorx_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,
+                      XMM1,  XMM1b,  XMM1c,  XMM1d,
+                      XMM2,  XMM2b,  XMM2c,  XMM2d,
+                      XMM3,  XMM3b,  XMM3c,  XMM3d,
+                      XMM4,  XMM4b,  XMM4c,  XMM4d,
+                      XMM5,  XMM5b,  XMM5c,  XMM5d,
+                      XMM6,  XMM6b,  XMM6c,  XMM6d,
+                      XMM7,  XMM7b,  XMM7c,  XMM7d
+#ifdef _LP64
+                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
+                      XMM9,  XMM9b,  XMM9c,  XMM9d,
+                      XMM10, XMM10b, XMM10c, XMM10d,
+                      XMM11, XMM11b, XMM11c, XMM11d,
+                      XMM12, XMM12b, XMM12c, XMM12d,
+                      XMM13, XMM13b, XMM13c, XMM13d,
+                      XMM14, XMM14b, XMM14c, XMM14d,
+                      XMM15, XMM15b, XMM15c, XMM15d
+#endif
+                      );
+
+// Class for all 256bit vector registers
+reg_class vectory_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
+                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
+                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
+                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
+                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
+                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
+                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
+                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
+#ifdef _LP64
+                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
+                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
+                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
+                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
+                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
+                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
+                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
+                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
+#endif
+                      );
+
+%}
+
 source %{
   // Float masks come from different places depending on platform.
 #ifdef _LP64
@@ -38,6 +488,252 @@
   static address double_signflip() { return (address)double_signflip_pool; }
 #endif
 
+// Map Types to machine register types
+const int Matcher::base2reg[Type::lastype] = {
+  Node::NotAMachineReg,0,0, Op_RegI, Op_RegL, 0, Op_RegN,
+  Node::NotAMachineReg, Node::NotAMachineReg, /* tuple, array */
+  Op_VecS, Op_VecD, Op_VecX, Op_VecY, /* Vectors */
+  Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, /* the pointers */
+  0, 0/*abio*/,
+  Op_RegP /* Return address */, 0, /* the memories */
+  Op_RegF, Op_RegF, Op_RegF, Op_RegD, Op_RegD, Op_RegD,
+  0  /*bottom*/
+};
+
+// Max vector size in bytes. 0 if not supported.
+const int Matcher::vector_width_in_bytes(BasicType bt) {
+  assert(is_java_primitive(bt), "only primitive type vectors");
+  if (UseSSE < 2) return 0;
+  // SSE2 supports 128bit vectors for all types.
+  // AVX2 supports 256bit vectors for all types.
+  int size = (UseAVX > 1) ? 32 : 16;
+  // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
+  if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
+    size = 32;
+  // Use flag to limit vector size.
+  size = MIN2(size,(int)MaxVectorSize);
+  // Minimum 2 values in vector (or 4 for bytes).
+  switch (bt) {
+  case T_DOUBLE:
+  case T_LONG:
+    if (size < 16) return 0;
+    // fall through: the wider minimum subsumes the checks below
+  case T_FLOAT:
+  case T_INT:
+    if (size < 8) return 0;
+    // fall through
+  case T_BOOLEAN:
+  case T_BYTE:
+  case T_CHAR:
+  case T_SHORT:
+    if (size < 4) return 0;
+    break;
+  default:
+    ShouldNotReachHere();
+  }
+  return size;
+}
+
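+// Worked example, illustrative only: assuming UseAVX == 2 and
+// MaxVectorSize == 32 (assumed flag values, chosen just for this sketch),
+// the function above yields
+//   vector_width_in_bytes(T_BYTE)   == 32   // 32 x  8-bit lanes
+//   vector_width_in_bytes(T_INT)    == 32   //  8 x 32-bit lanes
+//   vector_width_in_bytes(T_DOUBLE) == 32   //  4 x 64-bit lanes
+// while with SSE2 only (UseAVX == 0) each of these would be 16.
+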
+// Limits on vector size (number of elements) loaded into vector.
+const int Matcher::max_vector_size(const BasicType bt) {
+  return vector_width_in_bytes(bt)/type2aelembytes(bt);
+}
+const int Matcher::min_vector_size(const BasicType bt) {
+  int max_size = max_vector_size(bt);
+  // The smallest vector that can be loaded is 4 bytes, so single-byte
+  // types need at least 4 elements; all other types need at least 2.
+  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
+  return MIN2(size,max_size);
+}
+
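+// Illustrative example, not part of this changeset: under the AVX2
+// assumption above, max_vector_size(T_BYTE) == 32/1 == 32 elements and
+// min_vector_size(T_BYTE) == 4, while for T_INT the limits are 8 and 2.
+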
+// Vector ideal reg corresponding to specified size in bytes
+const int Matcher::vector_ideal_reg(int size) {
+  assert(MaxVectorSize >= size, "");
+  switch(size) {
+    case  4: return Op_VecS;
+    case  8: return Op_VecD;
+    case 16: return Op_VecX;
+    case 32: return Op_VecY;
+  }
+  ShouldNotReachHere();
+  return 0;
+}
+
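+// For example, a 2-element int vector occupies 8 bytes and thus gets
+// Op_VecD, while a 4-element double vector (32 bytes) requires Op_VecY.
+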
+// x86 supports misaligned vector loads and stores.
+const bool Matcher::misaligned_vectors_ok() {
+  return !AlignVector; // can be changed by flag
+}
+
+// Helper methods for MachSpillCopyNode::implementation().
+static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
+                          int src_hi, int dst_hi, uint ireg, outputStream* st) {
+  // In the 64-bit VM size calculation is complex, so the size is obtained
+  // by emitting the instructions into a scratch buffer.
+  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
+  assert(ireg == Op_VecS || // 32bit vector
+         (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
+         (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
+         "no non-adjacent vector moves" );
+  if (cbuf) {
+    MacroAssembler _masm(cbuf);
+    int offset = __ offset();
+    switch (ireg) {
+    case Op_VecS: // copy whole register
+    case Op_VecD:
+    case Op_VecX:
+      __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+      break;
+    case Op_VecY:
+      __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+    int size = __ offset() - offset;
+#ifdef ASSERT
+    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
+    assert(!do_size || size == 4, "incorrect size calculation");
+#endif
+    return size;
+#ifndef PRODUCT
+  } else if (!do_size) {
+    switch (ireg) {
+    case Op_VecS:
+    case Op_VecD:
+    case Op_VecX:
+      st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
+      break;
+    case Op_VecY:
+      st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+#endif
+  }
+  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
+  return 4;
+}
+
+static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
+                            int stack_offset, int reg, uint ireg, outputStream* st) {
+  // In the 64-bit VM size calculation is complex, so the size is obtained
+  // by emitting the instructions into a scratch buffer.
+  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
+  if (cbuf) {
+    MacroAssembler _masm(cbuf);
+    int offset = __ offset();
+    if (is_load) {
+      switch (ireg) {
+      case Op_VecS:
+        __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        break;
+      case Op_VecD:
+        __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        break;
+      case Op_VecX:
+        __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        break;
+      case Op_VecY:
+        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        break;
+      default:
+        ShouldNotReachHere();
+      }
+    } else { // store
+      switch (ireg) {
+      case Op_VecS:
+        __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        break;
+      case Op_VecD:
+        __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        break;
+      case Op_VecX:
+        __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        break;
+      case Op_VecY:
+        __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        break;
+      default:
+        ShouldNotReachHere();
+      }
+    }
+    int size = __ offset() - offset;
+#ifdef ASSERT
+    int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
+    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
+    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
+#endif
+    return size;
+#ifndef PRODUCT
+  } else if (!do_size) {
+    if (is_load) {
+      switch (ireg) {
+      case Op_VecS:
+        st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
+        break;
+      case Op_VecD:
+        st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
+        break;
+      case Op_VecX:
+        st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
+        break;
+      case Op_VecY:
+        st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
+        break;
+      default:
+        ShouldNotReachHere();
+      }
+    } else { // store
+      switch (ireg) {
+      case Op_VecS:
+        st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
+        break;
+      case Op_VecD:
+        st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
+        break;
+      case Op_VecX:
+        st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
+        break;
+      case Op_VecY:
+        st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
+        break;
+      default:
+        ShouldNotReachHere();
+      }
+    }
+#endif
+  }
+  int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
+  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
+  return 5+offset_size;
+}
+
+static inline jfloat replicate4_imm(int con, int width) {
+  // Load a constant "width" bytes wide and replicate it to fill 32 bits.
+  assert(width == 1 || width == 2, "only byte or short types here");
+  int bit_width = width * 8;
+  jint val = con;
+  val &= (1 << bit_width) - 1;  // mask off sign bits
+  while(bit_width < 32) {
+    val |= (val << bit_width);
+    bit_width <<= 1;
+  }
+  jfloat fval = *((jfloat*) &val);  // coerce to float type
+  return fval;
+}
+
+static inline jdouble replicate8_imm(int con, int width) {
+  // Load a constant "width" bytes wide and replicate it to fill 64 bits.
+  assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
+  int bit_width = width * 8;
+  jlong val = con;
+  val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
+  while(bit_width < 64) {
+    val |= (val << bit_width);
+    bit_width <<= 1;
+  }
+  jdouble dval = *((jdouble*) &val);  // coerce to double type
+  return dval;
+}
+
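+// Illustrative sketch, not part of this changeset: a standalone scalar model
+// of the bit-doubling loop shared by replicate4_imm and replicate8_imm above
+// (the helper name is invented for the example).
+static inline jlong model_replicate_imm(int con, int width, int total_bits) {
+  int bit_width = width * 8;
+  jlong val = con & ((((jlong) 1) << bit_width) - 1);  // mask off sign bits
+  while (bit_width < total_bits) {
+    val |= (val << bit_width);  // each pass doubles the filled portion
+    bit_width <<= 1;
+  }
+  return val;  // e.g. model_replicate_imm(0xAB, 1, 64) == 0xABABABABABABABAB
+}
+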
 #ifndef PRODUCT
   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
     st->print("nop \t# %d bytes pad for loops and calls", _count);
@@ -103,6 +799,46 @@
 
 %}
 
+
+//----------OPERANDS-----------------------------------------------------------
+// Operand definitions must precede instruction definitions for correct parsing
+// in the ADLC because operands constitute user-defined types which are used in
+// instruction definitions.
+
+// Vectors
+operand vecS() %{
+  constraint(ALLOC_IN_RC(vectors_reg));
+  match(VecS);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecD() %{
+  constraint(ALLOC_IN_RC(vectord_reg));
+  match(VecD);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecX() %{
+  constraint(ALLOC_IN_RC(vectorx_reg));
+  match(VecX);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecY() %{
+  constraint(ALLOC_IN_RC(vectory_reg));
+  match(VecY);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+
 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 
 // ============================================================================
@@ -852,3 +1588,797 @@
   ins_pipe(pipe_slow);
 %}
 
+
+// ====================VECTOR INSTRUCTIONS=====================================
+
+// Load vectors (4 bytes long)
+instruct loadV4(vecS dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 4);
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Load vectors (8 bytes long)
+instruct loadV8(vecD dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 8);
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Load vectors (16 bytes long)
+instruct loadV16(vecX dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 16);
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
+  ins_encode %{
+    __ movdqu($dst$$XMMRegister, $mem$$Address);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Load vectors (32 bytes long)
+instruct loadV32(vecY dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 32);
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
+  ins_encode %{
+    __ vmovdqu($dst$$XMMRegister, $mem$$Address);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Store vectors
+instruct storeV4(memory mem, vecS src) %{
+  predicate(n->as_StoreVector()->memory_size() == 4);
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
+  ins_encode %{
+    __ movdl($mem$$Address, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct storeV8(memory mem, vecD src) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct storeV16(memory mem, vecX src) %{
+  predicate(n->as_StoreVector()->memory_size() == 16);
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
+  ins_encode %{
+    __ movdqu($mem$$Address, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct storeV32(memory mem, vecY src) %{
+  predicate(n->as_StoreVector()->memory_size() == 32);
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
+  ins_encode %{
+    __ vmovdqu($mem$$Address, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
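+// Illustrative note, not from this changeset: the matcher selects among the
+// rules above purely by memory_size(), so a LoadVector of four ints
+// (16 bytes) matches loadV16 and emits movdqu, while a 32-byte AVX vector
+// matches loadV32/storeV32 and emits vmovdqu.
+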
+// Replicate byte scalar to be vector
+instruct Repl4B(vecS dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate4B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate8B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\t! replicate16B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 32);
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate32B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
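+// Illustrative sketch, not part of this changeset: a scalar model of the
+// broadcast sequence used by the ReplicateB rules above, showing why every
+// byte lane ends up equal (the helper name is invented for the example).
+static inline void model_replicate16B(unsigned char b, unsigned char lane[16]) {
+  lane[0] = b;                                      // movd: byte into lane 0
+  lane[1] = lane[0];                                // punpcklbw dst,dst: word 0 = {b,b}
+  for (int w = 1; w < 4; w++) {                     // pshuflw 0x00: broadcast word 0
+    lane[2*w] = lane[0]; lane[2*w+1] = lane[1];     //   into words 1..3
+  }
+  for (int i = 0; i < 8; i++) lane[8+i] = lane[i];  // movlhps: low half -> high half
+}
+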
+// Replicate byte scalar immediate to be vector by loading from const table.
+instruct Repl4B_imm(vecS dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateB con));
+  format %{ "movss   $dst,[$constantaddress]\t! replicate4B($con)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate8B($con)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateB con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate16B($con)\n\t"
+            "movlhps $dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 32);
+  match(Set dst (ReplicateB con));
+  format %{ "movsd   $dst,[$constantaddress]\t! lreplicate32B($con)\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate byte scalar zero to be vector
+instruct Repl4B_zero(vecS dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateB zero));
+  format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8B_zero(vecD dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB zero));
+  format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl16B_zero(vecX dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateB zero));
+  format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl32B_zero(vecY dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 32);
+  match(Set dst (ReplicateB zero));
+  format %{ "vxorpd  $dst,$dst,$dst\t! replicate32B zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    bool vector256 = true;
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate char/short (2 byte) scalar to be vector
+instruct Repl2S(vecS dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate2S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate4S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8S(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\t! replicate8S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate16S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2S_imm(vecS dst, immI con) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateS con));
+  format %{ "movss   $dst,[$constantaddress]\t! replicate2S($con)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate4S($con)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8S_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateS con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate8S($con)\n\t"
+            "movlhps $dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateS con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate16S($con)\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate char/short (2 byte) scalar zero to be vector
+instruct Repl2S_zero(vecS dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateS zero));
+  format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S_zero(vecD dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS zero));
+  format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8S_zero(vecX dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateS zero));
+  format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl16S_zero(vecY dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateS zero));
+  format %{ "vxorpd  $dst,$dst,$dst\t! replicate16S zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    bool vector256 = true;
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate integer (4 byte) scalar to be vector
+instruct Repl2I(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
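+// Illustrative note, not from this changeset: pshufd's immediate byte holds
+// four 2-bit selectors, one per destination dword, so 0x00 broadcasts source
+// dword 0 into all four lanes. A scalar model (names invented):
+static inline void model_pshufd(const jint src[4], jint dst[4], int imm8) {
+  for (int i = 0; i < 4; i++) {
+    dst[i] = src[(imm8 >> (2 * i)) & 3];  // 2-bit selector per dword
+  }
+}
+// model_pshufd(src, dst, 0x00) yields {src[0], src[0], src[0], src[0]};
+// model_pshufd(src, dst, 0x44) yields {src[0], src[1], src[0], src[1]}, the
+// 64-bit duplication used by the Repl2D/Repl4D rules further below.
+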
+// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2I_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate2I($con)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateI con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate4I($con)\n\t"
+            "movlhps $dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateI con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Integer could be loaded into xmm register directly from memory.
+instruct Repl2I_mem(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI (LoadVector mem)));
+  format %{ "movd    $dst,$mem\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateI (LoadVector mem)));
+  format %{ "movd    $dst,$mem\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateI (LoadVector mem)));
+  format %{ "movd    $dst,$mem\n\t"
+            "pshufd  $dst,$dst,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate integer (4 byte) scalar zero to be vector
+instruct Repl2I_zero(vecD dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI zero));
+  format %{ "pxor    $dst,$dst\t! replicate2I" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I_zero(vecX dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateI zero));
+  format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8I_zero(vecY dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateI zero));
+  format %{ "vxorpd  $dst,$dst,$dst\t! replicate8I zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    bool vector256 = true;
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate long (8 byte) scalar to be vector
+#ifdef _LP64
+instruct Repl2L(vecX dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL src));
+  format %{ "movdq   $dst,$src\n\t"
+            "movlhps $dst,$dst\t! replicate2L" %}
+  ins_encode %{
+    __ movdq($dst$$XMMRegister, $src$$Register);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L(vecY dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL src));
+  format %{ "movdq   $dst,$src\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movdq($dst$$XMMRegister, $src$$Register);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#else // _LP64
+instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "movlhps $dst,$dst\t! replicate2L"%}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#endif // _LP64
+
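+// Illustrative note, not from this changeset: on the 32-bit VM a long lives
+// in a register pair, so the rules above assemble the 64-bit lane by hand:
+// movdl puts src.lo into dword 0, a second movdl puts src.hi into $tmp,
+// punpckldq interleaves them into the full long in lane 0, and movlhps
+// (plus vinsertf128h for the 256-bit case) then duplicates that lane.
+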
+// Replicate long (8 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2L_imm(vecX dst, immL con) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate2L($con)\n\t"
+            "movlhps $dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress($con));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_imm(vecY dst, immL con) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate4L($con)\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress($con));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Long could be loaded into xmm register directly from memory.
+instruct Repl2L_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL (LoadVector mem)));
+  format %{ "movq    $dst,$mem\n\t"
+            "movlhps $dst,$dst\t! replicate2L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL (LoadVector mem)));
+  format %{ "movq    $dst,$mem\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate long (8 byte) scalar zero to be vector
+instruct Repl2L_zero(vecX dst, immL0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL zero));
+  format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4L_zero(vecY dst, immL0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL zero));
+  format %{ "vxorpd  $dst,$dst,$dst\t! replicate4L zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    bool vector256 = true;
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate float (4 byte) scalar to be vector
+instruct Repl2F(vecD dst, regF src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4F(vecX dst, regF src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F(vecY dst, regF src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd  $dst,$src,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate float (4 byte) scalar zero to be vector
+instruct Repl2F_zero(vecD dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4F_zero(vecX dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateF zero));
+  format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8F_zero(vecY dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateF zero));
+  format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate double (8 bytes) scalar to be vector
+instruct Repl2D(vecX dst, regD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateD src));
+  format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D(vecY dst, regD src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateD src));
+  format %{ "pshufd  $dst,$src,0x44\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate double (8 byte) scalar zero to be vector
+instruct Repl2D_zero(vecX dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateD zero));
+  format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
+  ins_encode %{
+    __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4D_zero(vecY dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateD zero));
+  format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
--- a/src/cpu/x86/vm/x86_32.ad	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/cpu/x86/vm/x86_32.ad	Thu Jun 28 10:35:28 2012 -0700
@@ -74,9 +74,6 @@
 reg_def EAX(SOC, SOC, Op_RegI, 0, rax->as_VMReg());
 reg_def ESP( NS,  NS, Op_RegI, 4, rsp->as_VMReg());
 
-// Special Registers
-reg_def EFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
-
 // Float registers.  We treat TOS/FPR0 special.  It is invisible to the
 // allocator, and only shows up in the encodings.
 reg_def FPR0L( SOC, SOC, Op_RegF, 0, VMRegImpl::Bad());
@@ -105,27 +102,6 @@
 reg_def FPR7L( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg());
 reg_def FPR7H( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next());
 
-// XMM registers.  128-bit registers or 4 words each, labeled a-d.
-// Word a in each register holds a Float, words ab hold a Double.
-// We currently do not use the SIMD capabilities, so registers cd
-// are unused at the moment.
-reg_def XMM0a( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
-reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next());
-reg_def XMM1a( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
-reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next());
-reg_def XMM2a( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
-reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next());
-reg_def XMM3a( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
-reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next());
-reg_def XMM4a( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
-reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next());
-reg_def XMM5a( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
-reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next());
-reg_def XMM6a( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
-reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next());
-reg_def XMM7a( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
-reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next());
-
 // Specify priority of register selection within phases of register
 // allocation.  Highest priority is first.  A useful heuristic is to
 // give registers a low priority when they are required by machine
@@ -138,15 +114,6 @@
                     FPR3L, FPR3H, FPR4L, FPR4H, FPR5L, FPR5H,
                     FPR6L, FPR6H, FPR7L, FPR7H );
 
-alloc_class chunk1( XMM0a, XMM0b,
-                    XMM1a, XMM1b,
-                    XMM2a, XMM2b,
-                    XMM3a, XMM3b,
-                    XMM4a, XMM4b,
-                    XMM5a, XMM5b,
-                    XMM6a, XMM6b,
-                    XMM7a, XMM7b, EFLAGS);
-
 
 //----------Architecture Description Register Classes--------------------------
 // Several register classes are automatically defined based upon information in
@@ -159,12 +126,12 @@
 // Class for all registers
 reg_class any_reg(EAX, EDX, EBP, EDI, ESI, ECX, EBX, ESP);
 // Class for general registers
-reg_class e_reg(EAX, EDX, EBP, EDI, ESI, ECX, EBX);
+reg_class int_reg(EAX, EDX, EBP, EDI, ESI, ECX, EBX);
 // Class for general registers which may be used for implicit null checks on win95
 // Also safe for use by tailjump. We don't want to allocate in rbp,
-reg_class e_reg_no_rbp(EAX, EDX, EDI, ESI, ECX, EBX);
+reg_class int_reg_no_rbp(EAX, EDX, EDI, ESI, ECX, EBX);
 // Class of "X" registers
-reg_class x_reg(EBX, ECX, EDX, EAX);
+reg_class int_x_reg(EBX, ECX, EDX, EAX);
 // Class of registers that can appear in an address with no offset.
 // EBP and ESP require an extra instruction byte for zero offset.
 // Used in fast-unlock
@@ -193,8 +160,6 @@
 reg_class sp_reg(ESP);
 // Singleton class for instruction pointer
 // reg_class ip_reg(EIP);
-// Singleton class for condition codes
-reg_class int_flags(EFLAGS);
 // Class of integer register pairs
 reg_class long_reg( EAX,EDX, ECX,EBX, EBP,EDI );
 // Class of integer register pairs that aligns with calling convention
@@ -206,29 +171,18 @@
 // Floating point registers.  Notice FPR0 is not a choice.
 // FPR0 is not ever allocated; we use clever encodings to fake
 // a 2-address instructions out of Intels FP stack.
-reg_class flt_reg( FPR1L,FPR2L,FPR3L,FPR4L,FPR5L,FPR6L,FPR7L );
-
-// make a register class for SSE registers
-reg_class xmm_reg(XMM0a, XMM1a, XMM2a, XMM3a, XMM4a, XMM5a, XMM6a, XMM7a);
-
-// make a double register class for SSE2 registers
-reg_class xdb_reg(XMM0a,XMM0b, XMM1a,XMM1b, XMM2a,XMM2b, XMM3a,XMM3b,
-                  XMM4a,XMM4b, XMM5a,XMM5b, XMM6a,XMM6b, XMM7a,XMM7b );
-
-reg_class dbl_reg( FPR1L,FPR1H, FPR2L,FPR2H, FPR3L,FPR3H,
-                   FPR4L,FPR4H, FPR5L,FPR5H, FPR6L,FPR6H,
-                   FPR7L,FPR7H );
-
-reg_class flt_reg0( FPR1L );
-reg_class dbl_reg0( FPR1L,FPR1H );
-reg_class dbl_reg1( FPR2L,FPR2H );
-reg_class dbl_notreg0( FPR2L,FPR2H, FPR3L,FPR3H, FPR4L,FPR4H,
-                       FPR5L,FPR5H, FPR6L,FPR6H, FPR7L,FPR7H );
-
-// XMM6 and XMM7 could be used as temporary registers for long, float and
-// double values for SSE2.
-reg_class xdb_reg6( XMM6a,XMM6b );
-reg_class xdb_reg7( XMM7a,XMM7b );
+reg_class fp_flt_reg( FPR1L,FPR2L,FPR3L,FPR4L,FPR5L,FPR6L,FPR7L );
+
+reg_class fp_dbl_reg( FPR1L,FPR1H, FPR2L,FPR2H, FPR3L,FPR3H,
+                      FPR4L,FPR4H, FPR5L,FPR5H, FPR6L,FPR6H,
+                      FPR7L,FPR7H );
+
+reg_class fp_flt_reg0( FPR1L );
+reg_class fp_dbl_reg0( FPR1L,FPR1H );
+reg_class fp_dbl_reg1( FPR2L,FPR2H );
+reg_class fp_dbl_notreg0( FPR2L,FPR2H, FPR3L,FPR3H, FPR4L,FPR4H,
+                          FPR5L,FPR5H, FPR6L,FPR6H, FPR7L,FPR7H );
+
 %}
 
 
@@ -412,7 +366,7 @@
   }
 }
 
-   // eRegI ereg, memory mem) %{    // emit_reg_mem
+   // rRegI ereg, memory mem) %{    // emit_reg_mem
 void encode_RegMem( CodeBuffer &cbuf, int reg_encoding, int base, int index, int scale, int displace, bool displace_is_oop ) {
   // There is no index & no scale, use form without SIB byte
   if ((index == 0x4) &&
@@ -787,7 +741,7 @@
 #endif
   }
   int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
-  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes.
+  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as the SIMD prefix.
   return size+5+offset_size;
 }
 
@@ -821,7 +775,7 @@
     }
 #endif
   }
-  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes.
+  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as the SIMD prefix.
   // Only MOVAPS SSE prefix uses 1 byte.
   int sz = 4;
   if (!(src_lo+1 == src_hi && dst_lo+1 == dst_hi) &&
@@ -903,6 +857,108 @@
   return impl_helper(cbuf,do_size,false,offset,st_op,op,op_str,size, st);
 }
 
+// The next two methods are shared by the 32- and 64-bit VMs. They are defined in x86.ad.
+static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
+                          int src_hi, int dst_hi, uint ireg, outputStream* st);
+
+static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
+                            int stack_offset, int reg, uint ireg, outputStream* st);
+
+static int vec_stack_to_stack_helper(CodeBuffer *cbuf, bool do_size, int src_offset,
+                                     int dst_offset, uint ireg, outputStream* st) {
+  int calc_size = 0;
+  int src_offset_size = (src_offset == 0) ? 0 : ((src_offset < 0x80) ? 1 : 4);
+  int dst_offset_size = (dst_offset == 0) ? 0 : ((dst_offset < 0x80) ? 1 : 4);
+  switch (ireg) {
+  case Op_VecS:
+    calc_size = 3+src_offset_size + 3+dst_offset_size;
+    break;
+  case Op_VecD:
+    calc_size = 3+src_offset_size + 3+dst_offset_size;
+    src_offset += 4;
+    dst_offset += 4;
+    src_offset_size = (src_offset == 0) ? 0 : ((src_offset < 0x80) ? 1 : 4);
+    dst_offset_size = (dst_offset == 0) ? 0 : ((dst_offset < 0x80) ? 1 : 4);
+    calc_size += 3+src_offset_size + 3+dst_offset_size;
+    break;
+  case Op_VecX:
+    calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size;
+    break;
+  case Op_VecY:
+    calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size;
+    break;
+  default:
+    ShouldNotReachHere();
+  }
+  if (cbuf) {
+    MacroAssembler _masm(cbuf);
+    int offset = __ offset();
+    switch (ireg) {
+    case Op_VecS:
+      __ pushl(Address(rsp, src_offset));
+      __ popl (Address(rsp, dst_offset));
+      break;
+    case Op_VecD:
+      __ pushl(Address(rsp, src_offset));
+      __ popl (Address(rsp, dst_offset));
+      __ pushl(Address(rsp, src_offset+4));
+      __ popl (Address(rsp, dst_offset+4));
+      break;
+    case Op_VecX:
+      __ movdqu(Address(rsp, -16), xmm0);
+      __ movdqu(xmm0, Address(rsp, src_offset));
+      __ movdqu(Address(rsp, dst_offset), xmm0);
+      __ movdqu(xmm0, Address(rsp, -16));
+      break;
+    case Op_VecY:
+      __ vmovdqu(Address(rsp, -32), xmm0);
+      __ vmovdqu(xmm0, Address(rsp, src_offset));
+      __ vmovdqu(Address(rsp, dst_offset), xmm0);
+      __ vmovdqu(xmm0, Address(rsp, -32));
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+    int size = __ offset() - offset;
+    assert(size == calc_size, "incorrect size calculation");
+    return size;
+#ifndef PRODUCT
+  } else if (!do_size) {
+    switch (ireg) {
+    case Op_VecS:
+      st->print("pushl   [rsp + #%d]\t# 32-bit mem-mem spill\n\t"
+                "popl    [rsp + #%d]",
+                src_offset, dst_offset);
+      break;
+    case Op_VecD:
+      st->print("pushl   [rsp + #%d]\t# 64-bit mem-mem spill\n\t"
+                "popl    [rsp + #%d]\n\t"
+                "pushl   [rsp + #%d]\n\t"
+                "popl    [rsp + #%d]",
+                src_offset, dst_offset, src_offset+4, dst_offset+4);
+      break;
+    case Op_VecX:
+      st->print("movdqu  [rsp - #16], xmm0\t# 128-bit mem-mem spill\n\t"
+                "movdqu  xmm0, [rsp + #%d]\n\t"
+                "movdqu  [rsp + #%d], xmm0\n\t"
+                "movdqu  xmm0, [rsp - #16]",
+                src_offset, dst_offset);
+      break;
+    case Op_VecY:
+      st->print("vmovdqu [rsp - #32], xmm0\t# 256-bit mem-mem spill\n\t"
+                "vmovdqu xmm0, [rsp + #%d]\n\t"
+                "vmovdqu [rsp + #%d], xmm0\n\t"
+                "vmovdqu xmm0, [rsp - #32]",
+                src_offset, dst_offset);
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+#endif
+  }
+  return calc_size;
+}
+
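The calc_size arithmetic above mirrors x86 addressing-mode encoding: a [rsp + disp]
operand adds no extra bytes when disp is 0, one byte for a disp8 (< 0x80), and four
bytes for a disp32. A minimal sketch of the same rule (disp_bytes is an illustrative
name, not part of the sources):

    // Sketch only: the displacement-size rule used by vec_stack_to_stack_helper.
    static int disp_bytes(int offset) {
      if (offset == 0)   return 0;  // no displacement byte
      if (offset < 0x80) return 1;  // disp8
      return 4;                     // disp32
    }
    // e.g. an Op_VecS copy is pushl + popl:
    //   (3 + disp_bytes(src_offset)) + (3 + disp_bytes(dst_offset))

For Op_VecD the offset sizes are recomputed after adding 4 because the second
pushl/popl pair can cross the disp8/disp32 boundary even when the first pair does not.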
 uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, PhaseRegAlloc *ra_, bool do_size, outputStream* st ) const {
   // Get registers to move
   OptoReg::Name src_second = ra_->get_reg_second(in(1));
@@ -923,6 +979,29 @@
   if( src_first == dst_first && src_second == dst_second )
     return size;            // Self copy, no move
 
+  if (bottom_type()->isa_vect() != NULL) {
+    uint ireg = ideal_reg();
+    assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity");
+    assert((src_first_rc != rc_float && dst_first_rc != rc_float), "sanity");
+    assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity");
+    if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
+      // mem -> mem
+      int src_offset = ra_->reg2offset(src_first);
+      int dst_offset = ra_->reg2offset(dst_first);
+      return vec_stack_to_stack_helper(cbuf, do_size, src_offset, dst_offset, ireg, st);
+    } else if (src_first_rc == rc_xmm && dst_first_rc == rc_xmm ) {
+      return vec_mov_helper(cbuf, do_size, src_first, dst_first, src_second, dst_second, ireg, st);
+    } else if (src_first_rc == rc_xmm && dst_first_rc == rc_stack ) {
+      int stack_offset = ra_->reg2offset(dst_first);
+      return vec_spill_helper(cbuf, do_size, false, stack_offset, src_first, ireg, st);
+    } else if (src_first_rc == rc_stack && dst_first_rc == rc_xmm ) {
+      int stack_offset = ra_->reg2offset(src_first);
+      return vec_spill_helper(cbuf, do_size, true,  stack_offset, dst_first, ireg, st);
+    } else {
+      ShouldNotReachHere();
+    }
+  }
+
   // --------------------------------------
   // Check for mem-mem move.  push/pop to move.
   if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
@@ -1313,16 +1392,6 @@
   return true;
 }
 
-// Vector width in bytes
-const uint Matcher::vector_width_in_bytes(void) {
-  return UseSSE >= 2 ? 8 : 0;
-}
-
-// Vector ideal reg
-const uint Matcher::vector_ideal_reg(void) {
-  return Op_RegD;
-}
-
 // Is this branch offset short enough that a short branch can be used?
 //
 // NOTE: If the platform does not provide any short branch variants, then
@@ -1452,7 +1521,7 @@
 // arguments in those registers not be available to the callee.
 bool Matcher::can_be_java_arg( int reg ) {
   if(  reg == ECX_num   || reg == EDX_num   ) return true;
-  if( (reg == XMM0a_num || reg == XMM1a_num) && UseSSE>=1 ) return true;
+  if( (reg == XMM0_num  || reg == XMM1_num ) && UseSSE>=1 ) return true;
   if( (reg == XMM0b_num || reg == XMM1b_num) && UseSSE>=2 ) return true;
   return false;
 }
@@ -1565,16 +1634,16 @@
     emit_opcode(cbuf,0x66);
   %}
 
-  enc_class RegReg (eRegI dst, eRegI src) %{    // RegReg(Many)
+  enc_class RegReg (rRegI dst, rRegI src) %{    // RegReg(Many)
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
   %}
 
-  enc_class OpcRegReg (immI opcode, eRegI dst, eRegI src) %{    // OpcRegReg(Many)
+  enc_class OpcRegReg (immI opcode, rRegI dst, rRegI src) %{    // OpcRegReg(Many)
     emit_opcode(cbuf,$opcode$$constant);
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
   %}
 
-  enc_class mov_r32_imm0( eRegI dst ) %{
+  enc_class mov_r32_imm0( rRegI dst ) %{
     emit_opcode( cbuf, 0xB8 + $dst$$reg ); // 0xB8+ rd   -- MOV r32  ,imm32
     emit_d32   ( cbuf, 0x0  );             //                         imm32==0x0
   %}
@@ -1621,7 +1690,7 @@
   %}
 
   // Dense encoding for older common ops
-  enc_class Opc_plus(immI opcode, eRegI reg) %{
+  enc_class Opc_plus(immI opcode, rRegI reg) %{
     emit_opcode(cbuf, $opcode$$constant + $reg$$reg);
   %}
 
@@ -1637,7 +1706,7 @@
     }
   %}
 
-  enc_class OpcSErm (eRegI dst, immI imm) %{    // OpcSEr/m
+  enc_class OpcSErm (rRegI dst, immI imm) %{    // OpcSEr/m
     // Emit primary opcode and set sign-extend bit
     // Check for 8-bit immediate, and set sign extend bit in opcode
     if (($imm$$constant >= -128) && ($imm$$constant <= 127)) {
@@ -1682,7 +1751,7 @@
     else                               emit_d32(cbuf,con);
   %}
 
-  enc_class OpcSReg (eRegI dst) %{    // BSWAP
+  enc_class OpcSReg (rRegI dst) %{    // BSWAP
     emit_cc(cbuf, $secondary, $dst$$reg );
   %}
 
@@ -1700,7 +1769,7 @@
     emit_rm(cbuf, 0x3, destlo, desthi);
   %}
 
-  enc_class RegOpc (eRegI div) %{    // IDIV, IMOD, JMP indirect, ...
+  enc_class RegOpc (rRegI div) %{    // IDIV, IMOD, JMP indirect, ...
     emit_rm(cbuf, 0x3, $secondary, $div$$reg );
   %}
 
@@ -1891,20 +1960,20 @@
 //                 runtime_call_Relocation::spec(), RELOC_IMM32 );
 //   %}
 
-  enc_class RegOpcImm (eRegI dst, immI8 shift) %{    // SHL, SAR, SHR
+  enc_class RegOpcImm (rRegI dst, immI8 shift) %{    // SHL, SAR, SHR
     $$$emit8$primary;
     emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
     $$$emit8$shift$$constant;
   %}
 
-  enc_class LdImmI (eRegI dst, immI src) %{    // Load Immediate
+  enc_class LdImmI (rRegI dst, immI src) %{    // Load Immediate
     // Load immediate does not have a zero or sign extended version
     // for 8-bit immediates
     emit_opcode(cbuf, 0xB8 + $dst$$reg);
     $$$emit32$src$$constant;
   %}
 
-  enc_class LdImmP (eRegI dst, immI src) %{    // Load Immediate
+  enc_class LdImmP (rRegI dst, immI src) %{    // Load Immediate
     // Load immediate does not have a zero or sign extended version
     // for 8-bit immediates
     emit_opcode(cbuf, $primary + $dst$$reg);
@@ -1943,15 +2012,15 @@
 
 
   // Encode a reg-reg copy.  If it is useless, then empty encoding.
-  enc_class enc_Copy( eRegI dst, eRegI src ) %{
+  enc_class enc_Copy( rRegI dst, rRegI src ) %{
     encode_Copy( cbuf, $dst$$reg, $src$$reg );
   %}
 
-  enc_class enc_CopyL_Lo( eRegI dst, eRegL src ) %{
+  enc_class enc_CopyL_Lo( rRegI dst, eRegL src ) %{
     encode_Copy( cbuf, $dst$$reg, $src$$reg );
   %}
 
-  enc_class RegReg (eRegI dst, eRegI src) %{    // RegReg(Many)
+  enc_class RegReg (rRegI dst, rRegI src) %{    // RegReg(Many)
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
   %}
 
@@ -1973,7 +2042,7 @@
     emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($src$$reg));
   %}
 
-  enc_class RegReg_HiLo( eRegL src, eRegI dst ) %{
+  enc_class RegReg_HiLo( eRegL src, rRegI dst ) %{
     emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($src$$reg));
   %}
 
@@ -2068,7 +2137,7 @@
     cbuf.set_insts_mark();            // Mark start of opcode for reloc info in mem operand
   %}
 
-  enc_class RegMem (eRegI ereg, memory mem) %{    // emit_reg_mem
+  enc_class RegMem (rRegI ereg, memory mem) %{    // emit_reg_mem
     int reg_encoding = $ereg$$reg;
     int base  = $mem$$base;
     int index = $mem$$index;
@@ -2132,7 +2201,7 @@
 
   // Clone of RegMem but accepts an extra parameter to access each
   // half of a double in memory; it never needs relocation info.
-  enc_class Mov_MemD_half_to_Reg (immI opcode, memory mem, immI disp_for_half, eRegI rm_reg) %{
+  enc_class Mov_MemD_half_to_Reg (immI opcode, memory mem, immI disp_for_half, rRegI rm_reg) %{
     emit_opcode(cbuf,$opcode$$constant);
     int reg_encoding = $rm_reg$$reg;
     int base     = $mem$$base;
@@ -2168,7 +2237,7 @@
     encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, disp_is_oop);
   %}
 
-  enc_class RegLea (eRegI dst, eRegI src0, immI src1 ) %{    // emit_reg_lea
+  enc_class RegLea (rRegI dst, rRegI src0, immI src1 ) %{    // emit_reg_lea
     int reg_encoding = $dst$$reg;
     int base         = $src0$$reg;      // 0xFFFFFFFF indicates no base
     int index        = 0x04;            // 0x04 indicates no index
@@ -2178,7 +2247,7 @@
     encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
   %}
 
-  enc_class min_enc (eRegI dst, eRegI src) %{    // MIN
+  enc_class min_enc (rRegI dst, rRegI src) %{    // MIN
     // Compare dst,src
     emit_opcode(cbuf,0x3B);
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
@@ -2190,7 +2259,7 @@
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
   %}
 
-  enc_class max_enc (eRegI dst, eRegI src) %{    // MAX
+  enc_class max_enc (rRegI dst, rRegI src) %{    // MAX
     // Compare dst,src
     emit_opcode(cbuf,0x3B);
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
@@ -2221,7 +2290,7 @@
     encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
   %}
 
-  enc_class neg_reg(eRegI dst) %{
+  enc_class neg_reg(rRegI dst) %{
     // NEG $dst
     emit_opcode(cbuf,0xF7);
     emit_rm(cbuf, 0x3, 0x03, $dst$$reg );
@@ -2251,7 +2320,7 @@
     emit_rm(cbuf, 0x3, $p$$reg, tmpReg);
   %}
 
-  enc_class enc_cmpLTP_mem(eRegI p, eRegI q, memory mem, eCXRegI tmp) %{    // cadd_cmpLT
+  enc_class enc_cmpLTP_mem(rRegI p, rRegI q, memory mem, eCXRegI tmp) %{    // cadd_cmpLT
     int tmpReg = $tmp$$reg;
 
     // SUB $p,$q
@@ -2390,12 +2459,12 @@
   %}
 
   // Special case for moving an integer register to a stack slot.
-  enc_class OpcPRegSS( stackSlotI dst, eRegI src ) %{ // RegSS
+  enc_class OpcPRegSS( stackSlotI dst, rRegI src ) %{ // RegSS
     store_to_stackslot( cbuf, $primary, $src$$reg, $dst$$disp );
   %}
 
   // Special case for moving a register to a stack slot.
-  enc_class RegSS( stackSlotI dst, eRegI src ) %{ // RegSS
+  enc_class RegSS( stackSlotI dst, rRegI src ) %{ // RegSS
     // Opcode already emitted
     emit_rm( cbuf, 0x02, $src$$reg, ESP_enc );   // R/M byte
     emit_rm( cbuf, 0x00, ESP_enc, ESP_enc);          // SIB byte
@@ -2640,7 +2709,7 @@
 // equal_result    = 0;
 // nan_result      = -1;
 
-  enc_class CmpF_Result(eRegI dst) %{
+  enc_class CmpF_Result(rRegI dst) %{
     // fnstsw_ax();
     emit_opcode( cbuf, 0xDF);
     emit_opcode( cbuf, 0xE0);
@@ -2685,7 +2754,7 @@
 // done:
   %}
 
-  enc_class convert_int_long( regL dst, eRegI src ) %{
+  enc_class convert_int_long( regL dst, rRegI src ) %{
     // mov $dst.lo,$src
     int dst_encoding = $dst$$reg;
     int src_encoding = $src$$reg;
@@ -2754,7 +2823,7 @@
     emit_rm( cbuf, 0x3, 0x4, $src$$reg);
   %}
 
-  enc_class long_multiply( eADXRegL dst, eRegL src, eRegI tmp ) %{
+  enc_class long_multiply( eADXRegL dst, eRegL src, rRegI tmp ) %{
     // Basic idea: lo(result) = lo(x_lo * y_lo)
     //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
     // MOV    $tmp,$src.lo
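The comment above is the schoolbook decomposition of a 64x64 -> 64-bit multiply into
32-bit halves; the x_hi * y_hi term is omitted because it only contributes to bits 64
and above. A self-contained C++ sketch of the identity (names are illustrative):

    #include <cstdint>
    uint64_t mul64(uint32_t x_lo, uint32_t x_hi, uint32_t y_lo, uint32_t y_hi) {
      uint64_t lo_prod = (uint64_t)x_lo * y_lo;          // full product of low halves
      uint32_t hi = (uint32_t)(lo_prod >> 32)            // hi(x_lo * y_lo)
                  + (uint32_t)((uint64_t)x_hi * y_lo)    // lo(x_hi * y_lo)
                  + (uint32_t)((uint64_t)x_lo * y_hi);   // lo(x_lo * y_hi)
      return ((uint64_t)hi << 32) | (uint32_t)lo_prod;
    }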
@@ -2780,7 +2849,7 @@
     emit_rm( cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $tmp$$reg );
   %}
 
-  enc_class long_multiply_con( eADXRegL dst, immL_127 src, eRegI tmp ) %{
+  enc_class long_multiply_con( eADXRegL dst, immL_127 src, rRegI tmp ) %{
     // Basic idea: lo(result) = lo(src * y_lo)
     //             hi(result) = hi(src * y_lo) + lo(src * y_hi)
     // IMUL   $tmp,EDX,$src
@@ -2836,7 +2905,7 @@
     emit_d8(cbuf, 4*4);
   %}
 
-  enc_class long_cmp_flags0( eRegL src, eRegI tmp ) %{
+  enc_class long_cmp_flags0( eRegL src, rRegI tmp ) %{
     // MOV   $tmp,$src.lo
     emit_opcode(cbuf, 0x8B);
     emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg);
@@ -2857,7 +2926,7 @@
     emit_rm(cbuf, 0x3, HIGH_FROM_LOW($src1$$reg), HIGH_FROM_LOW($src2$$reg) );
   %}
 
-  enc_class long_cmp_flags2( eRegL src1, eRegL src2, eRegI tmp ) %{
+  enc_class long_cmp_flags2( eRegL src1, eRegL src2, rRegI tmp ) %{
     // CMP    $src1.lo,$src2.lo\t! Long compare; set flags for low bits
     emit_opcode( cbuf, 0x3B );
     emit_rm(cbuf, 0x3, $src1$$reg, $src2$$reg );
@@ -2869,7 +2938,7 @@
     emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src2$$reg) );
   %}
 
-  enc_class long_cmp_flags3( eRegL src, eRegI tmp ) %{
+  enc_class long_cmp_flags3( eRegL src, rRegI tmp ) %{
     // XOR    $tmp,$tmp
     emit_opcode(cbuf,0x33);  // XOR
     emit_rm(cbuf,0x3, $tmp$$reg, $tmp$$reg);
@@ -3762,9 +3831,9 @@
     // in SSE2+ mode we want to keep the FPU stack clean so pretend
     // that C functions return float and double results in XMM0.
     if( ideal_reg == Op_RegD && UseSSE>=2 )
-      return OptoRegPair(XMM0b_num,XMM0a_num);
+      return OptoRegPair(XMM0b_num,XMM0_num);
     if( ideal_reg == Op_RegF && UseSSE>=2 )
-      return OptoRegPair(OptoReg::Bad,XMM0a_num);
+      return OptoRegPair(OptoReg::Bad,XMM0_num);
 
     return OptoRegPair(hi[ideal_reg],lo[ideal_reg]);
   %}
@@ -3775,9 +3844,9 @@
     static int lo[Op_RegL+1] = { 0, 0, OptoReg::Bad, EAX_num,      EAX_num,      FPR1L_num,    FPR1L_num, EAX_num };
     static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, FPR1H_num, EDX_num };
     if( ideal_reg == Op_RegD && UseSSE>=2 )
-      return OptoRegPair(XMM0b_num,XMM0a_num);
+      return OptoRegPair(XMM0b_num,XMM0_num);
     if( ideal_reg == Op_RegF && UseSSE>=1 )
-      return OptoRegPair(OptoReg::Bad,XMM0a_num);
+      return OptoRegPair(OptoReg::Bad,XMM0_num);
     return OptoRegPair(hi[ideal_reg],lo[ideal_reg]);
   %}
 
@@ -4147,8 +4216,8 @@
 
 // Register Operands
 // Integer Register
-operand eRegI() %{
-  constraint(ALLOC_IN_RC(e_reg));
+operand rRegI() %{
+  constraint(ALLOC_IN_RC(int_reg));
   match(RegI);
   match(xRegI);
   match(eAXRegI);
@@ -4163,8 +4232,8 @@
 %}
 
 // Subset of Integer Register
-operand xRegI(eRegI reg) %{
-  constraint(ALLOC_IN_RC(x_reg));
+operand xRegI(rRegI reg) %{
+  constraint(ALLOC_IN_RC(int_x_reg));
   match(reg);
   match(eAXRegI);
   match(eBXRegI);
@@ -4179,7 +4248,7 @@
 operand eAXRegI(xRegI reg) %{
   constraint(ALLOC_IN_RC(eax_reg));
   match(reg);
-  match(eRegI);
+  match(rRegI);
 
   format %{ "EAX" %}
   interface(REG_INTER);
@@ -4189,7 +4258,7 @@
 operand eBXRegI(xRegI reg) %{
   constraint(ALLOC_IN_RC(ebx_reg));
   match(reg);
-  match(eRegI);
+  match(rRegI);
 
   format %{ "EBX" %}
   interface(REG_INTER);
@@ -4198,7 +4267,7 @@
 operand eCXRegI(xRegI reg) %{
   constraint(ALLOC_IN_RC(ecx_reg));
   match(reg);
-  match(eRegI);
+  match(rRegI);
 
   format %{ "ECX" %}
   interface(REG_INTER);
@@ -4207,7 +4276,7 @@
 operand eDXRegI(xRegI reg) %{
   constraint(ALLOC_IN_RC(edx_reg));
   match(reg);
-  match(eRegI);
+  match(rRegI);
 
   format %{ "EDX" %}
   interface(REG_INTER);
@@ -4216,7 +4285,7 @@
 operand eDIRegI(xRegI reg) %{
   constraint(ALLOC_IN_RC(edi_reg));
   match(reg);
-  match(eRegI);
+  match(rRegI);
 
   format %{ "EDI" %}
   interface(REG_INTER);
@@ -4263,7 +4332,7 @@
 operand eSIRegI(xRegI reg) %{
    constraint(ALLOC_IN_RC(esi_reg));
    match(reg);
-   match(eRegI);
+   match(rRegI);
 
    format %{ "ESI" %}
    interface(REG_INTER);
@@ -4284,7 +4353,7 @@
 %}
 
 operand eRegP() %{
-  constraint(ALLOC_IN_RC(e_reg));
+  constraint(ALLOC_IN_RC(int_reg));
   match(RegP);
   match(eAXRegP);
   match(eBXRegP);
@@ -4297,7 +4366,7 @@
 
 // On windows95, EBP is not safe to use for implicit null tests.
 operand eRegP_no_EBP() %{
-  constraint(ALLOC_IN_RC(e_reg_no_rbp));
+  constraint(ALLOC_IN_RC(int_reg_no_rbp));
   match(RegP);
   match(eAXRegP);
   match(eBXRegP);
@@ -4477,7 +4546,7 @@
 // Float register operands
 operand regDPR() %{
   predicate( UseSSE < 2 );
-  constraint(ALLOC_IN_RC(dbl_reg));
+  constraint(ALLOC_IN_RC(fp_dbl_reg));
   match(RegD);
   match(regDPR1);
   match(regDPR2);
@@ -4487,7 +4556,7 @@
 
 operand regDPR1(regDPR reg) %{
   predicate( UseSSE < 2 );
-  constraint(ALLOC_IN_RC(dbl_reg0));
+  constraint(ALLOC_IN_RC(fp_dbl_reg0));
   match(reg);
   format %{ "FPR1" %}
   interface(REG_INTER);
@@ -4495,7 +4564,7 @@
 
 operand regDPR2(regDPR reg) %{
   predicate( UseSSE < 2 );
-  constraint(ALLOC_IN_RC(dbl_reg1));
+  constraint(ALLOC_IN_RC(fp_dbl_reg1));
   match(reg);
   format %{ "FPR2" %}
   interface(REG_INTER);
@@ -4503,45 +4572,16 @@
 
 operand regnotDPR1(regDPR reg) %{
   predicate( UseSSE < 2 );
-  constraint(ALLOC_IN_RC(dbl_notreg0));
+  constraint(ALLOC_IN_RC(fp_dbl_notreg0));
   match(reg);
   format %{ %}
   interface(REG_INTER);
 %}
 
-// XMM Double register operands
-operand regD() %{
-  predicate( UseSSE>=2 );
-  constraint(ALLOC_IN_RC(xdb_reg));
-  match(RegD);
-  match(regD6);
-  match(regD7);
-  format %{ %}
-  interface(REG_INTER);
-%}
-
-// XMM6 double register operands
-operand regD6(regD reg) %{
-  predicate( UseSSE>=2 );
-  constraint(ALLOC_IN_RC(xdb_reg6));
-  match(reg);
-  format %{ "XMM6" %}
-  interface(REG_INTER);
-%}
-
-// XMM7 double register operands
-operand regD7(regD reg) %{
-  predicate( UseSSE>=2 );
-  constraint(ALLOC_IN_RC(xdb_reg7));
-  match(reg);
-  format %{ "XMM7" %}
-  interface(REG_INTER);
-%}
-
 // Float register operands
 operand regFPR() %{
   predicate( UseSSE < 2 );
-  constraint(ALLOC_IN_RC(flt_reg));
+  constraint(ALLOC_IN_RC(fp_flt_reg));
   match(RegF);
   match(regFPR1);
   format %{ %}
@@ -4551,21 +4591,30 @@
 // Float register operands
 operand regFPR1(regFPR reg) %{
   predicate( UseSSE < 2 );
-  constraint(ALLOC_IN_RC(flt_reg0));
+  constraint(ALLOC_IN_RC(fp_flt_reg0));
   match(reg);
   format %{ "FPR1" %}
   interface(REG_INTER);
 %}
 
-// XMM register operands
+// XMM Float register operands
 operand regF() %{
   predicate( UseSSE>=1 );
-  constraint(ALLOC_IN_RC(xmm_reg));
+  constraint(ALLOC_IN_RC(float_reg));
   match(RegF);
   format %{ %}
   interface(REG_INTER);
 %}
 
+// XMM Double register operands
+operand regD() %{
+  predicate( UseSSE>=2 );
+  constraint(ALLOC_IN_RC(double_reg));
+  match(RegD);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 
 //----------Memory Operands----------------------------------------------------
 // Direct Memory Operand
@@ -4583,7 +4632,7 @@
 
 // Indirect Memory Operand
 operand indirect(eRegP reg) %{
-  constraint(ALLOC_IN_RC(e_reg));
+  constraint(ALLOC_IN_RC(int_reg));
   match(reg);
 
   format %{ "[$reg]" %}
@@ -4622,7 +4671,7 @@
 %}
 
 // Indirect Memory Plus Long Offset Operand
-operand indOffset32X(eRegI reg, immP off) %{
+operand indOffset32X(rRegI reg, immP off) %{
   match(AddP off reg);
 
   format %{ "[$reg + $off]" %}
@@ -4635,7 +4684,7 @@
 %}
 
 // Indirect Memory Plus Index Register Plus Offset Operand
-operand indIndexOffset(eRegP reg, eRegI ireg, immI off) %{
+operand indIndexOffset(eRegP reg, rRegI ireg, immI off) %{
   match(AddP (AddP reg ireg) off);
 
   op_cost(10);
@@ -4649,7 +4698,7 @@
 %}
 
 // Indirect Memory Plus Index Register Plus Offset Operand
-operand indIndex(eRegP reg, eRegI ireg) %{
+operand indIndex(eRegP reg, rRegI ireg) %{
   match(AddP reg ireg);
 
   op_cost(10);
@@ -4667,7 +4716,7 @@
 // // -------------------------------------------------------------------------
 // // Scaled Memory Operands
 // // Indirect Memory Times Scale Plus Offset Operand
-// operand indScaleOffset(immP off, eRegI ireg, immI2 scale) %{
+// operand indScaleOffset(immP off, rRegI ireg, immI2 scale) %{
 //   match(AddP off (LShiftI ireg scale));
 //
 //   op_cost(10);
@@ -4681,7 +4730,7 @@
 // %}
 
 // Indirect Memory Times Scale Plus Index Register
-operand indIndexScale(eRegP reg, eRegI ireg, immI2 scale) %{
+operand indIndexScale(eRegP reg, rRegI ireg, immI2 scale) %{
   match(AddP reg (LShiftI ireg scale));
 
   op_cost(10);
@@ -4695,7 +4744,7 @@
 %}
 
 // Indirect Memory Times Scale Plus Index Register Plus Offset Operand
-operand indIndexScaleOffset(eRegP reg, immI off, eRegI ireg, immI2 scale) %{
+operand indIndexScaleOffset(eRegP reg, immI off, rRegI ireg, immI2 scale) %{
   match(AddP (AddP reg (LShiftI ireg scale)) off);
 
   op_cost(10);
@@ -4823,7 +4872,7 @@
 // Indirect Memory Operand
 operand indirect_win95_safe(eRegP_no_EBP reg)
 %{
-  constraint(ALLOC_IN_RC(e_reg));
+  constraint(ALLOC_IN_RC(int_reg));
   match(reg);
 
   op_cost(100);
@@ -4867,7 +4916,7 @@
 %}
 
 // Indirect Memory Plus Index Register Plus Offset Operand
-operand indIndexOffset_win95_safe(eRegP_no_EBP reg, eRegI ireg, immI off)
+operand indIndexOffset_win95_safe(eRegP_no_EBP reg, rRegI ireg, immI off)
 %{
   match(AddP (AddP reg ireg) off);
 
@@ -4882,7 +4931,7 @@
 %}
 
 // Indirect Memory Times Scale Plus Index Register
-operand indIndexScale_win95_safe(eRegP_no_EBP reg, eRegI ireg, immI2 scale)
+operand indIndexScale_win95_safe(eRegP_no_EBP reg, rRegI ireg, immI2 scale)
 %{
   match(AddP reg (LShiftI ireg scale));
 
@@ -4897,7 +4946,7 @@
 %}
 
 // Indirect Memory Times Scale Plus Index Register Plus Offset Operand
-operand indIndexScaleOffset_win95_safe(eRegP_no_EBP reg, immI off, eRegI ireg, immI2 scale)
+operand indIndexScaleOffset_win95_safe(eRegP_no_EBP reg, immI off, rRegI ireg, immI2 scale)
 %{
   match(AddP (AddP reg (LShiftI ireg scale)) off);
 
@@ -5086,7 +5135,7 @@
 //   Or: _mem if it requires the big decoder and a memory unit.
 
 // Integer ALU reg operation
-pipe_class ialu_reg(eRegI dst) %{
+pipe_class ialu_reg(rRegI dst) %{
     single_instruction;
     dst    : S4(write);
     dst    : S3(read);
@@ -5104,7 +5153,7 @@
 %}
 
 // Integer ALU reg operation using big decoder
-pipe_class ialu_reg_fat(eRegI dst) %{
+pipe_class ialu_reg_fat(rRegI dst) %{
     single_instruction;
     dst    : S4(write);
     dst    : S3(read);
@@ -5122,7 +5171,7 @@
 %}
 
 // Integer ALU reg-reg operation
-pipe_class ialu_reg_reg(eRegI dst, eRegI src) %{
+pipe_class ialu_reg_reg(rRegI dst, rRegI src) %{
     single_instruction;
     dst    : S4(write);
     src    : S3(read);
@@ -5140,7 +5189,7 @@
 %}
 
 // Integer ALU reg-reg operation
-pipe_class ialu_reg_reg_fat(eRegI dst, memory src) %{
+pipe_class ialu_reg_reg_fat(rRegI dst, memory src) %{
     single_instruction;
     dst    : S4(write);
     src    : S3(read);
@@ -5158,7 +5207,7 @@
 %}
 
 // Integer ALU reg-mem operation
-pipe_class ialu_reg_mem(eRegI dst, memory mem) %{
+pipe_class ialu_reg_mem(rRegI dst, memory mem) %{
     single_instruction;
     dst    : S5(write);
     mem    : S3(read);
@@ -5187,7 +5236,7 @@
 %}
 
 // Integer Store to Memory
-pipe_class ialu_mem_reg(memory mem, eRegI src) %{
+pipe_class ialu_mem_reg(memory mem, rRegI src) %{
     single_instruction;
     mem    : S3(read);
     src    : S5(read);
@@ -5216,7 +5265,7 @@
 %}
 
 // Integer ALU0 reg-reg operation
-pipe_class ialu_reg_reg_alu0(eRegI dst, eRegI src) %{
+pipe_class ialu_reg_reg_alu0(rRegI dst, rRegI src) %{
     single_instruction;
     dst    : S4(write);
     src    : S3(read);
@@ -5225,7 +5274,7 @@
 %}
 
 // Integer ALU0 reg-mem operation
-pipe_class ialu_reg_mem_alu0(eRegI dst, memory mem) %{
+pipe_class ialu_reg_mem_alu0(rRegI dst, memory mem) %{
     single_instruction;
     dst    : S5(write);
     mem    : S3(read);
@@ -5235,7 +5284,7 @@
 %}
 
 // Integer ALU reg-reg operation
-pipe_class ialu_cr_reg_reg(eFlagsReg cr, eRegI src1, eRegI src2) %{
+pipe_class ialu_cr_reg_reg(eFlagsReg cr, rRegI src1, rRegI src2) %{
     single_instruction;
     cr     : S4(write);
     src1   : S3(read);
@@ -5245,7 +5294,7 @@
 %}
 
 // Integer ALU reg-imm operation
-pipe_class ialu_cr_reg_imm(eFlagsReg cr, eRegI src1) %{
+pipe_class ialu_cr_reg_imm(eFlagsReg cr, rRegI src1) %{
     single_instruction;
     cr     : S4(write);
     src1   : S3(read);
@@ -5254,7 +5303,7 @@
 %}
 
 // Integer ALU reg-mem operation
-pipe_class ialu_cr_reg_mem(eFlagsReg cr, eRegI src1, memory src2) %{
+pipe_class ialu_cr_reg_mem(eFlagsReg cr, rRegI src1, memory src2) %{
     single_instruction;
     cr     : S4(write);
     src1   : S3(read);
@@ -5265,7 +5314,7 @@
 %}
 
 // Conditional move reg-reg
-pipe_class pipe_cmplt( eRegI p, eRegI q, eRegI y ) %{
+pipe_class pipe_cmplt( rRegI p, rRegI q, rRegI y ) %{
     instruction_count(4);
     y      : S4(read);
     q      : S3(read);
@@ -5274,7 +5323,7 @@
 %}
 
 // Conditional move reg-reg
-pipe_class pipe_cmov_reg( eRegI dst, eRegI src, eFlagsReg cr ) %{
+pipe_class pipe_cmov_reg( rRegI dst, rRegI src, eFlagsReg cr ) %{
     single_instruction;
     dst    : S4(write);
     src    : S3(read);
@@ -5283,7 +5332,7 @@
 %}
 
 // Conditional move reg-mem
-pipe_class pipe_cmov_mem( eFlagsReg cr, eRegI dst, memory src) %{
+pipe_class pipe_cmov_mem( eFlagsReg cr, rRegI dst, memory src) %{
     single_instruction;
     dst    : S4(write);
     src    : S3(read);
@@ -5534,7 +5583,7 @@
 //               in the encode section of the architecture description.
 
 //----------BSWAP-Instruction--------------------------------------------------
-instruct bytes_reverse_int(eRegI dst) %{
+instruct bytes_reverse_int(rRegI dst) %{
   match(Set dst (ReverseBytesI dst));
 
   format %{ "BSWAP  $dst" %}
@@ -5555,7 +5604,7 @@
   ins_pipe( ialu_reg_reg);
 %}
 
-instruct bytes_reverse_unsigned_short(eRegI dst, eFlagsReg cr) %{
+instruct bytes_reverse_unsigned_short(rRegI dst, eFlagsReg cr) %{
   match(Set dst (ReverseBytesUS dst));
   effect(KILL cr);
 
@@ -5568,7 +5617,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct bytes_reverse_short(eRegI dst, eFlagsReg cr) %{
+instruct bytes_reverse_short(rRegI dst, eFlagsReg cr) %{
   match(Set dst (ReverseBytesS dst));
   effect(KILL cr);
 
@@ -5584,7 +5633,7 @@
 
 //---------- Zeros Count Instructions ------------------------------------------
 
-instruct countLeadingZerosI(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct countLeadingZerosI(rRegI dst, rRegI src, eFlagsReg cr) %{
   predicate(UseCountLeadingZerosInstruction);
   match(Set dst (CountLeadingZerosI src));
   effect(KILL cr);
@@ -5596,7 +5645,7 @@
   ins_pipe(ialu_reg);
 %}
 
-instruct countLeadingZerosI_bsr(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct countLeadingZerosI_bsr(rRegI dst, rRegI src, eFlagsReg cr) %{
   predicate(!UseCountLeadingZerosInstruction);
   match(Set dst (CountLeadingZerosI src));
   effect(KILL cr);
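Without a dedicated leading-zero-count instruction, the count is derived from BSR,
which yields the index of the highest set bit, so clz(x) = 31 - bsr(x) for nonzero x;
x == 0 needs separate handling because BSR leaves its destination undefined. A sketch
of the arithmetic (the loop stands in for the BSR instruction):

    static int clz32(uint32_t x) {
      if (x == 0) return 32;               // BSR output is undefined for 0
      int msb = 31;
      while (!(x & (1u << msb))) msb--;    // index of the highest set bit, i.e. BSR
      return 31 - msb;
    }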
@@ -5621,7 +5670,7 @@
   ins_pipe(ialu_reg);
 %}
 
-instruct countLeadingZerosL(eRegI dst, eRegL src, eFlagsReg cr) %{
+instruct countLeadingZerosL(rRegI dst, eRegL src, eFlagsReg cr) %{
   predicate(UseCountLeadingZerosInstruction);
   match(Set dst (CountLeadingZerosL src));
   effect(TEMP dst, KILL cr);
@@ -5644,7 +5693,7 @@
   ins_pipe(ialu_reg);
 %}
 
-instruct countLeadingZerosL_bsr(eRegI dst, eRegL src, eFlagsReg cr) %{
+instruct countLeadingZerosL_bsr(rRegI dst, eRegL src, eFlagsReg cr) %{
   predicate(!UseCountLeadingZerosInstruction);
   match(Set dst (CountLeadingZerosL src));
   effect(TEMP dst, KILL cr);
@@ -5680,7 +5729,7 @@
   ins_pipe(ialu_reg);
 %}
 
-instruct countTrailingZerosI(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct countTrailingZerosI(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (CountTrailingZerosI src));
   effect(KILL cr);
 
@@ -5699,7 +5748,7 @@
   ins_pipe(ialu_reg);
 %}
 
-instruct countTrailingZerosL(eRegI dst, eRegL src, eFlagsReg cr) %{
+instruct countTrailingZerosL(rRegI dst, eRegL src, eFlagsReg cr) %{
   match(Set dst (CountTrailingZerosL src));
   effect(TEMP dst, KILL cr);
 
@@ -5731,7 +5780,7 @@
 
 //---------- Population Count Instructions -------------------------------------
 
-instruct popCountI(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct popCountI(rRegI dst, rRegI src, eFlagsReg cr) %{
   predicate(UsePopCountInstruction);
   match(Set dst (PopCountI src));
   effect(KILL cr);
@@ -5743,7 +5792,7 @@
   ins_pipe(ialu_reg);
 %}
 
-instruct popCountI_mem(eRegI dst, memory mem, eFlagsReg cr) %{
+instruct popCountI_mem(rRegI dst, memory mem, eFlagsReg cr) %{
   predicate(UsePopCountInstruction);
   match(Set dst (PopCountI (LoadI mem)));
   effect(KILL cr);
@@ -5756,7 +5805,7 @@
 %}
 
 // Note: Long.bitCount(long) returns an int.
-instruct popCountL(eRegI dst, eRegL src, eRegI tmp, eFlagsReg cr) %{
+instruct popCountL(rRegI dst, eRegL src, rRegI tmp, eFlagsReg cr) %{
   predicate(UsePopCountInstruction);
   match(Set dst (PopCountL src));
   effect(KILL cr, TEMP tmp, TEMP dst);
@@ -5773,7 +5822,7 @@
 %}
 
 // Note: Long.bitCount(long) returns an int.
-instruct popCountL_mem(eRegI dst, memory mem, eRegI tmp, eFlagsReg cr) %{
+instruct popCountL_mem(rRegI dst, memory mem, rRegI tmp, eFlagsReg cr) %{
   predicate(UsePopCountInstruction);
   match(Set dst (PopCountL (LoadL mem)));
   effect(KILL cr, TEMP tmp, TEMP dst);
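On 32-bit x86 a long population count decomposes into two 32-bit POPCNTs whose
results are added, which is why these rules need a TEMP register for the second
count. A one-line sketch using a GCC/Clang builtin for illustration:

    // bitCount(hi:lo) == popcnt(lo) + popcnt(hi); the sum always fits in an int.
    static int popcount64(uint32_t lo, uint32_t hi) {
      return __builtin_popcount(lo) + __builtin_popcount(hi);
    }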
@@ -5877,7 +5926,7 @@
 %}
 
 // Load Short (16bit signed)
-instruct loadS(eRegI dst, memory mem) %{
+instruct loadS(rRegI dst, memory mem) %{
   match(Set dst (LoadS mem));
 
   ins_cost(125);
@@ -5891,7 +5940,7 @@
 %}
 
 // Load Short (16 bit signed) to Byte (8 bit signed)
-instruct loadS2B(eRegI dst, memory mem, immI_24 twentyfour) %{
+instruct loadS2B(rRegI dst, memory mem, immI_24 twentyfour) %{
   match(Set dst (RShiftI (LShiftI (LoadS mem) twentyfour) twentyfour));
 
   ins_cost(125);
@@ -5922,7 +5971,7 @@
 %}
 
 // Load Unsigned Short/Char (16bit unsigned)
-instruct loadUS(eRegI dst, memory mem) %{
+instruct loadUS(rRegI dst, memory mem) %{
   match(Set dst (LoadUS mem));
 
   ins_cost(125);
@@ -5936,7 +5985,7 @@
 %}
 
 // Load Unsigned Short/Char (16 bit UNsigned) to Byte (8 bit signed)
-instruct loadUS2B(eRegI dst, memory mem, immI_24 twentyfour) %{
+instruct loadUS2B(rRegI dst, memory mem, immI_24 twentyfour) %{
   match(Set dst (RShiftI (LShiftI (LoadUS mem) twentyfour) twentyfour));
 
   ins_cost(125);
@@ -5997,7 +6046,7 @@
 %}
 
 // Load Integer
-instruct loadI(eRegI dst, memory mem) %{
+instruct loadI(rRegI dst, memory mem) %{
   match(Set dst (LoadI mem));
 
   ins_cost(125);
@@ -6011,7 +6060,7 @@
 %}
 
 // Load Integer (32 bit signed) to Byte (8 bit signed)
-instruct loadI2B(eRegI dst, memory mem, immI_24 twentyfour) %{
+instruct loadI2B(rRegI dst, memory mem, immI_24 twentyfour) %{
   match(Set dst (RShiftI (LShiftI (LoadI mem) twentyfour) twentyfour));
 
   ins_cost(125);
@@ -6023,7 +6072,7 @@
 %}
 
 // Load Integer (32 bit signed) to Unsigned Byte (8 bit UNsigned)
-instruct loadI2UB(eRegI dst, memory mem, immI_255 mask) %{
+instruct loadI2UB(rRegI dst, memory mem, immI_255 mask) %{
   match(Set dst (AndI (LoadI mem) mask));
 
   ins_cost(125);
@@ -6035,7 +6084,7 @@
 %}
 
 // Load Integer (32 bit signed) to Short (16 bit signed)
-instruct loadI2S(eRegI dst, memory mem, immI_16 sixteen) %{
+instruct loadI2S(rRegI dst, memory mem, immI_16 sixteen) %{
   match(Set dst (RShiftI (LShiftI (LoadI mem) sixteen) sixteen));
 
   ins_cost(125);
@@ -6047,7 +6096,7 @@
 %}
 
 // Load Integer (32 bit signed) to Unsigned Short/Char (16 bit UNsigned)
-instruct loadI2US(eRegI dst, memory mem, immI_65535 mask) %{
+instruct loadI2US(rRegI dst, memory mem, immI_65535 mask) %{
   match(Set dst (AndI (LoadI mem) mask));
 
   ins_cost(125);
@@ -6208,7 +6257,7 @@
 %}
 
 // Load Range
-instruct loadRange(eRegI dst, memory mem) %{
+instruct loadRange(rRegI dst, memory mem) %{
   match(Set dst (LoadRange mem));
 
   ins_cost(125);
@@ -6305,66 +6354,6 @@
   ins_pipe( fpu_reg_mem );
 %}
 
-// Load Aligned Packed Byte to XMM register
-instruct loadA8B(regD dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (Load8B mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed8B" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Short to XMM register
-instruct loadA4S(regD dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (Load4S mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed4S" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Char to XMM register
-instruct loadA4C(regD dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (Load4C mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed4C" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Integer to XMM register
-instruct load2IU(regD dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (Load2I mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed2I" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Single to XMM
-instruct loadA2F(regD dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (Load2F mem));
-  ins_cost(145);
-  format %{ "MOVQ  $dst,$mem\t! packed2F" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Load Effective Address
 instruct leaP8(eRegP dst, indOffset8 mem) %{
   match(Set dst mem);
@@ -6417,7 +6406,7 @@
 %}
 
 // Load Constant
-instruct loadConI(eRegI dst, immI src) %{
+instruct loadConI(rRegI dst, immI src) %{
   match(Set dst src);
 
   format %{ "MOV    $dst,$src" %}
@@ -6426,7 +6415,7 @@
 %}
 
 // Load Constant zero
-instruct loadConI0(eRegI dst, immI0 src, eFlagsReg cr) %{
+instruct loadConI0(rRegI dst, immI0 src, eFlagsReg cr) %{
   match(Set dst src);
   effect(KILL cr);
 
@@ -6594,7 +6583,7 @@
 %}
 
 // Load Stack Slot
-instruct loadSSI(eRegI dst, stackSlotI src) %{
+instruct loadSSI(rRegI dst, stackSlotI src) %{
   match(Set dst src);
   ins_cost(125);
 
@@ -6821,7 +6810,7 @@
 %}
 
 // Store Char/Short
-instruct storeC(memory mem, eRegI src) %{
+instruct storeC(memory mem, rRegI src) %{
   match(Set mem (StoreC mem src));
 
   ins_cost(125);
@@ -6832,7 +6821,7 @@
 %}
 
 // Store Integer
-instruct storeI(memory mem, eRegI src) %{
+instruct storeI(memory mem, rRegI src) %{
   match(Set mem (StoreI mem src));
 
   ins_cost(125);
@@ -6976,42 +6965,6 @@
   ins_pipe( ialu_mem_imm );
 %}
 
-// Store Aligned Packed Byte XMM register to memory
-instruct storeA8B(memory mem, regD src) %{
-  predicate(UseSSE>=1);
-  match(Set mem (Store8B mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed8B" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Store Aligned Packed Char/Short XMM register to memory
-instruct storeA4C(memory mem, regD src) %{
-  predicate(UseSSE>=1);
-  match(Set mem (Store4C mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed4C" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Store Aligned Packed Integer XMM register to memory
-instruct storeA2I(memory mem, regD src) %{
-  predicate(UseSSE>=1);
-  match(Set mem (Store2I mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed2I" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Store CMS card-mark Immediate
 instruct storeImmCM(memory mem, immI8 src) %{
   match(Set mem (StoreCM mem src));
@@ -7073,18 +7026,6 @@
   ins_pipe( pipe_slow );
 %}
 
-// Store Aligned Packed Single Float XMM register to memory
-instruct storeA2F(memory mem, regD src) %{
-  predicate(UseSSE>=1);
-  match(Set mem (Store2F mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed2F" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Store Float
 instruct storeFPR( memory mem, regFPR1 src) %{
   predicate(UseSSE==0);
@@ -7146,7 +7087,7 @@
 %}
 
 // Store Integer to stack slot
-instruct storeSSI(stackSlotI dst, eRegI src) %{
+instruct storeSSI(stackSlotI dst, rRegI src) %{
   match(Set dst src);
 
   ins_cost(100);
@@ -7271,7 +7212,7 @@
   ins_pipe(empty);
 %}
 
-instruct castP2X(eRegI dst, eRegP src ) %{
+instruct castP2X(rRegI dst, eRegP src ) %{
   match(Set dst (CastP2X src));
   ins_cost(50);
   format %{ "MOV    $dst, $src\t# CastP2X" %}
@@ -7281,7 +7222,7 @@
 
 //----------Conditional Move---------------------------------------------------
 // Conditional move
-instruct jmovI_reg(cmpOp cop, eFlagsReg cr, eRegI dst, eRegI src) %{
+instruct jmovI_reg(cmpOp cop, eFlagsReg cr, rRegI dst, rRegI src) %{
   predicate(!VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -7298,7 +7239,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct jmovI_regU(cmpOpU cop, eFlagsRegU cr, eRegI dst, eRegI src) %{
+instruct jmovI_regU(cmpOpU cop, eFlagsRegU cr, rRegI dst, rRegI src) %{
   predicate(!VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -7315,7 +7256,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct cmovI_reg(eRegI dst, eRegI src, eFlagsReg cr, cmpOp cop ) %{
+instruct cmovI_reg(rRegI dst, rRegI src, eFlagsReg cr, cmpOp cop ) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -7325,7 +7266,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct cmovI_regU( cmpOpU cop, eFlagsRegU cr, eRegI dst, eRegI src ) %{
+instruct cmovI_regU( cmpOpU cop, eFlagsRegU cr, rRegI dst, rRegI src ) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -7335,7 +7276,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct cmovI_regUCF( cmpOpUCF cop, eFlagsRegUCF cr, eRegI dst, eRegI src ) %{
+instruct cmovI_regUCF( cmpOpUCF cop, eFlagsRegUCF cr, rRegI dst, rRegI src ) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -7345,7 +7286,7 @@
 %}
 
 // Conditional move
-instruct cmovI_mem(cmpOp cop, eFlagsReg cr, eRegI dst, memory src) %{
+instruct cmovI_mem(cmpOp cop, eFlagsReg cr, rRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
   ins_cost(250);
@@ -7356,7 +7297,7 @@
 %}
 
 // Conditional move
-instruct cmovI_memU(cmpOpU cop, eFlagsRegU cr, eRegI dst, memory src) %{
+instruct cmovI_memU(cmpOpU cop, eFlagsRegU cr, rRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
   ins_cost(250);
@@ -7366,7 +7307,7 @@
   ins_pipe( pipe_cmov_mem );
 %}
 
-instruct cmovI_memUCF(cmpOpUCF cop, eFlagsRegUCF cr, eRegI dst, memory src) %{
+instruct cmovI_memUCF(cmpOpUCF cop, eFlagsRegUCF cr, rRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
   ins_cost(250);
@@ -7620,7 +7561,7 @@
 //----------Arithmetic Instructions--------------------------------------------
 //----------Addition Instructions----------------------------------------------
 // Integer Addition Instructions
-instruct addI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct addI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (AddI dst src));
   effect(KILL cr);
 
@@ -7631,7 +7572,7 @@
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct addI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
+instruct addI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (AddI dst src));
   effect(KILL cr);
 
@@ -7641,7 +7582,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct incI_eReg(eRegI dst, immI1 src, eFlagsReg cr) %{
+instruct incI_eReg(rRegI dst, immI1 src, eFlagsReg cr) %{
   predicate(UseIncDec);
   match(Set dst (AddI dst src));
   effect(KILL cr);
@@ -7653,7 +7594,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct leaI_eReg_immI(eRegI dst, eRegI src0, immI src1) %{
+instruct leaI_eReg_immI(rRegI dst, rRegI src0, immI src1) %{
   match(Set dst (AddI src0 src1));
   ins_cost(110);
 
@@ -7673,7 +7614,7 @@
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct decI_eReg(eRegI dst, immI_M1 src, eFlagsReg cr) %{
+instruct decI_eReg(rRegI dst, immI_M1 src, eFlagsReg cr) %{
   predicate(UseIncDec);
   match(Set dst (AddI dst src));
   effect(KILL cr);
@@ -7685,7 +7626,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct addP_eReg(eRegP dst, eRegI src, eFlagsReg cr) %{
+instruct addP_eReg(eRegP dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (AddP dst src));
   effect(KILL cr);
 
@@ -7707,7 +7648,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct addI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
+instruct addI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
   match(Set dst (AddI dst (LoadI src)));
   effect(KILL cr);
 
@@ -7718,7 +7659,7 @@
   ins_pipe( ialu_reg_mem );
 %}
 
-instruct addI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
+instruct addI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (StoreI dst (AddI (LoadI dst) src)));
   effect(KILL cr);
 
@@ -7780,7 +7721,7 @@
   ins_pipe( empty );
 %}
 
-instruct castII( eRegI dst ) %{
+instruct castII( rRegI dst ) %{
   match(Set dst (CastII dst));
   format %{ "#castII of $dst" %}
   ins_encode( /*empty encoding*/ );
@@ -7814,7 +7755,7 @@
 
 // Conditional-store of an int value.
 // ZF flag is set on success, reset otherwise.  Implemented with a CMPXCHG on Intel.
-instruct storeIConditional( memory mem, eAXRegI oldval, eRegI newval, eFlagsReg cr ) %{
+instruct storeIConditional( memory mem, eAXRegI oldval, rRegI newval, eFlagsReg cr ) %{
   match(Set cr (StoreIConditional mem (Binary oldval newval)));
   effect(KILL oldval);
   format %{ "CMPXCHG $mem,$newval\t# If EAX==$mem Then store $newval into $mem" %}
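CMPXCHG compares EAX with the memory operand: on a match it stores newval and sets
ZF, otherwise it clears ZF and loads the current memory value into EAX, which is why
the rule declares KILL oldval. Non-atomic C++ pseudocode of the semantics (names are
illustrative):

    static bool cmpxchg32(int32_t* mem, int32_t* eax, int32_t newval) {
      if (*mem == *eax) { *mem = newval; return true; }  // ZF = 1
      *eax = *mem;                                       // ZF = 0, EAX clobbered
      return false;
    }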
@@ -7847,7 +7788,7 @@
 
 // No flag versions for CompareAndSwap{P,I,L} because matcher can't match them
 
-instruct compareAndSwapL( eRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
+instruct compareAndSwapL( rRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
   match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
   format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
@@ -7860,7 +7801,7 @@
   ins_pipe( pipe_cmpxchg );
 %}
 
-instruct compareAndSwapP( eRegI res,  pRegP mem_ptr, eAXRegP oldval, eCXRegP newval, eFlagsReg cr) %{
+instruct compareAndSwapP( rRegI res,  pRegP mem_ptr, eAXRegP oldval, eCXRegP newval, eFlagsReg cr) %{
   match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
   format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
@@ -7872,7 +7813,7 @@
   ins_pipe( pipe_cmpxchg );
 %}
 
-instruct compareAndSwapI( eRegI res, pRegP mem_ptr, eAXRegI oldval, eCXRegI newval, eFlagsReg cr) %{
+instruct compareAndSwapI( rRegI res, pRegP mem_ptr, eAXRegI oldval, eCXRegI newval, eFlagsReg cr) %{
   match(Set res (CompareAndSwapI mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
   format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
@@ -7886,7 +7827,7 @@
 
 //----------Subtraction Instructions-------------------------------------------
 // Integer Subtraction Instructions
-instruct subI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct subI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (SubI dst src));
   effect(KILL cr);
 
@@ -7897,7 +7838,7 @@
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct subI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
+instruct subI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (SubI dst src));
   effect(KILL cr);
 
@@ -7908,7 +7849,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct subI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
+instruct subI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
   match(Set dst (SubI dst (LoadI src)));
   effect(KILL cr);
 
@@ -7919,7 +7860,7 @@
   ins_pipe( ialu_reg_mem );
 %}
 
-instruct subI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
+instruct subI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (StoreI dst (SubI (LoadI dst) src)));
   effect(KILL cr);
 
@@ -7931,7 +7872,7 @@
 %}
 
 // Subtract from a pointer
-instruct subP_eReg(eRegP dst, eRegI src, immI0 zero, eFlagsReg cr) %{
+instruct subP_eReg(eRegP dst, rRegI src, immI0 zero, eFlagsReg cr) %{
   match(Set dst (AddP dst (SubI zero src)));
   effect(KILL cr);
 
@@ -7942,7 +7883,7 @@
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct negI_eReg(eRegI dst, immI0 zero, eFlagsReg cr) %{
+instruct negI_eReg(rRegI dst, immI0 zero, eFlagsReg cr) %{
   match(Set dst (SubI zero dst));
   effect(KILL cr);
 
@@ -7957,7 +7898,7 @@
 //----------Multiplication/Division Instructions-------------------------------
 // Integer Multiplication Instructions
 // Multiply Register
-instruct mulI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct mulI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (MulI dst src));
   effect(KILL cr);
 
@@ -7970,7 +7911,7 @@
 %}
 
 // Multiply 32-bit Immediate
-instruct mulI_eReg_imm(eRegI dst, eRegI src, immI imm, eFlagsReg cr) %{
+instruct mulI_eReg_imm(rRegI dst, rRegI src, immI imm, eFlagsReg cr) %{
   match(Set dst (MulI src imm));
   effect(KILL cr);
 
@@ -8026,7 +7967,7 @@
 %}
 
 // Multiply Memory 32-bit Immediate
-instruct mulI_mem_imm(eRegI dst, memory src, immI imm, eFlagsReg cr) %{
+instruct mulI_mem_imm(rRegI dst, memory src, immI imm, eFlagsReg cr) %{
   match(Set dst (MulI (LoadI src) imm));
   effect(KILL cr);
 
@@ -8038,7 +7979,7 @@
 %}
 
 // Multiply Memory
-instruct mulI(eRegI dst, memory src, eFlagsReg cr) %{
+instruct mulI(rRegI dst, memory src, eFlagsReg cr) %{
   match(Set dst (MulI dst (LoadI src)));
   effect(KILL cr);
 
@@ -8075,7 +8016,7 @@
 %}
 
 // Multiply Register Long
-instruct mulL_eReg(eADXRegL dst, eRegL src, eRegI tmp, eFlagsReg cr) %{
+instruct mulL_eReg(eADXRegL dst, eRegL src, rRegI tmp, eFlagsReg cr) %{
   match(Set dst (MulL dst src));
   effect(KILL cr, TEMP tmp);
   ins_cost(4*100+3*400);
@@ -8093,7 +8034,7 @@
 %}
 
 // Multiply Register Long where the left operand's high 32 bits are zero
-instruct mulL_eReg_lhi0(eADXRegL dst, eRegL src, eRegI tmp, eFlagsReg cr) %{
+instruct mulL_eReg_lhi0(eADXRegL dst, eRegL src, rRegI tmp, eFlagsReg cr) %{
   predicate(is_operand_hi32_zero(n->in(1)));
   match(Set dst (MulL dst src));
   effect(KILL cr, TEMP tmp);
@@ -8114,7 +8055,7 @@
 %}
 
 // Multiply Register Long where the right operand's high 32 bits are zero
-instruct mulL_eReg_rhi0(eADXRegL dst, eRegL src, eRegI tmp, eFlagsReg cr) %{
+instruct mulL_eReg_rhi0(eADXRegL dst, eRegL src, rRegI tmp, eFlagsReg cr) %{
   predicate(is_operand_hi32_zero(n->in(2)));
   match(Set dst (MulL dst src));
   effect(KILL cr, TEMP tmp);
@@ -8150,7 +8091,7 @@
 %}
 
 // Multiply Register Long by small constant
-instruct mulL_eReg_con(eADXRegL dst, immL_127 src, eRegI tmp, eFlagsReg cr) %{
+instruct mulL_eReg_con(eADXRegL dst, immL_127 src, rRegI tmp, eFlagsReg cr) %{
   match(Set dst (MulL dst src));
   effect(KILL cr, TEMP tmp);
   ins_cost(2*100+2*400);
@@ -8248,7 +8189,7 @@
 %}
 
 // Divide Register Long (no special case since divisor != -1)
-instruct divL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{
+instruct divL_eReg_imm32( eADXRegL dst, immL32 imm, rRegI tmp, rRegI tmp2, eFlagsReg cr ) %{
   match(Set dst (DivL dst imm));
   effect( TEMP tmp, TEMP tmp2, KILL cr );
   ins_cost(1000);
@@ -8319,7 +8260,7 @@
 %}
 
 // Remainder Register Long (remainder fit into 32 bits)
-instruct modL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{
+instruct modL_eReg_imm32( eADXRegL dst, immL32 imm, rRegI tmp, rRegI tmp2, eFlagsReg cr ) %{
   match(Set dst (ModL dst imm));
   effect( TEMP tmp, TEMP tmp2, KILL cr );
   ins_cost(1000);
@@ -8387,7 +8328,7 @@
 
 // Integer Shift Instructions
 // Shift Left by one
-instruct shlI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
+instruct shlI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{
   match(Set dst (LShiftI dst shift));
   effect(KILL cr);
 
@@ -8399,7 +8340,7 @@
 %}
 
 // Shift Left by 8-bit immediate
-instruct salI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
+instruct salI_eReg_imm(rRegI dst, immI8 shift, eFlagsReg cr) %{
   match(Set dst (LShiftI dst shift));
   effect(KILL cr);
 
@@ -8411,7 +8352,7 @@
 %}
 
 // Shift Left by variable
-instruct salI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
+instruct salI_eReg_CL(rRegI dst, eCXRegI shift, eFlagsReg cr) %{
   match(Set dst (LShiftI dst shift));
   effect(KILL cr);
 
@@ -8423,7 +8364,7 @@
 %}
 
 // Arithmetic shift right by one
-instruct sarI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
+instruct sarI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{
   match(Set dst (RShiftI dst shift));
   effect(KILL cr);
 
@@ -8445,7 +8386,7 @@
 %}
 
 // Arithmetic Shift Right by 8-bit immediate
-instruct sarI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
+instruct sarI_eReg_imm(rRegI dst, immI8 shift, eFlagsReg cr) %{
   match(Set dst (RShiftI dst shift));
   effect(KILL cr);
 
@@ -8468,7 +8409,7 @@
 %}
 
 // Arithmetic Shift Right by variable
-instruct sarI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
+instruct sarI_eReg_CL(rRegI dst, eCXRegI shift, eFlagsReg cr) %{
   match(Set dst (RShiftI dst shift));
   effect(KILL cr);
 
@@ -8480,7 +8421,7 @@
 %}
 
 // Logical shift right by one
-instruct shrI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
+instruct shrI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{
   match(Set dst (URShiftI dst shift));
   effect(KILL cr);
 
@@ -8492,7 +8433,7 @@
 %}
 
 // Logical Shift Right by 8-bit immediate
-instruct shrI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
+instruct shrI_eReg_imm(rRegI dst, immI8 shift, eFlagsReg cr) %{
   match(Set dst (URShiftI dst shift));
   effect(KILL cr);
 
@@ -8506,7 +8447,7 @@
 
 // Shift Left by 24, followed by Arithmetic Shift Right by 24.
 // This idiom is used by the compiler for the i2b bytecode.
-instruct i2b(eRegI dst, xRegI src, immI_24 twentyfour) %{
+instruct i2b(rRegI dst, xRegI src, immI_24 twentyfour) %{
   match(Set dst (RShiftI (LShiftI src twentyfour) twentyfour));
 
   size(3);
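The left shift discards the upper 24 bits and the arithmetic right shift replicates
bit 7 back across them, which is exactly the sign extension i2b requires (i2s below
is the same idiom with 16). A worked one-liner, assuming the usual arithmetic
behavior of signed right shifts:

    static int32_t i2b(int32_t x) {
      return (int32_t)((uint32_t)x << 24) >> 24;   // e.g. i2b(0x1FF) == -1
    }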
@@ -8519,7 +8460,7 @@
 
 // Shift Left by 16, followed by Arithmetic Shift Right by 16.
 // This idiom is used by the compiler for the i2s bytecode.
-instruct i2s(eRegI dst, xRegI src, immI_16 sixteen) %{
+instruct i2s(rRegI dst, xRegI src, immI_16 sixteen) %{
   match(Set dst (RShiftI (LShiftI src sixteen) sixteen));
 
   size(3);
@@ -8532,7 +8473,7 @@
 
 
 // Logical Shift Right by variable
-instruct shrI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
+instruct shrI_eReg_CL(rRegI dst, eCXRegI shift, eFlagsReg cr) %{
   match(Set dst (URShiftI dst shift));
   effect(KILL cr);
 
@@ -8548,7 +8489,7 @@
 //----------Integer Logical Instructions---------------------------------------
 // And Instructions
 // And Register with Register
-instruct andI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct andI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (AndI dst src));
   effect(KILL cr);
 
@@ -8560,7 +8501,7 @@
 %}
 
 // And Register with Immediate
-instruct andI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
+instruct andI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (AndI dst src));
   effect(KILL cr);
 
@@ -8572,7 +8513,7 @@
 %}
 
 // And Register with Memory
-instruct andI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
+instruct andI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
   match(Set dst (AndI dst (LoadI src)));
   effect(KILL cr);
 
@@ -8584,7 +8525,7 @@
 %}
 
 // And Memory with Register
-instruct andI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
+instruct andI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (StoreI dst (AndI (LoadI dst) src)));
   effect(KILL cr);
 
@@ -8610,7 +8551,7 @@
 
 // Or Instructions
 // Or Register with Register
-instruct orI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct orI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (OrI dst src));
   effect(KILL cr);
 
@@ -8621,7 +8562,7 @@
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct orI_eReg_castP2X(eRegI dst, eRegP src, eFlagsReg cr) %{
+instruct orI_eReg_castP2X(rRegI dst, eRegP src, eFlagsReg cr) %{
   match(Set dst (OrI dst (CastP2X src)));
   effect(KILL cr);
 
@@ -8634,7 +8575,7 @@
 
 
 // Or Register with Immediate
-instruct orI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
+instruct orI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (OrI dst src));
   effect(KILL cr);
 
@@ -8646,7 +8587,7 @@
 %}
 
 // Or Register with Memory
-instruct orI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
+instruct orI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
   match(Set dst (OrI dst (LoadI src)));
   effect(KILL cr);
 
@@ -8658,7 +8599,7 @@
 %}
 
 // Or Memory with Register
-instruct orI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
+instruct orI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (StoreI dst (OrI (LoadI dst) src)));
   effect(KILL cr);
 
@@ -8684,7 +8625,7 @@
 
 // ROL/ROR
 // ROL expand
-instruct rolI_eReg_imm1(eRegI dst, immI1 shift, eFlagsReg cr) %{
+instruct rolI_eReg_imm1(rRegI dst, immI1 shift, eFlagsReg cr) %{
   effect(USE_DEF dst, USE shift, KILL cr);
 
   format %{ "ROL    $dst, $shift" %}
@@ -8693,7 +8634,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct rolI_eReg_imm8(eRegI dst, immI8 shift, eFlagsReg cr) %{
+instruct rolI_eReg_imm8(rRegI dst, immI8 shift, eFlagsReg cr) %{
   effect(USE_DEF dst, USE shift, KILL cr);
 
   format %{ "ROL    $dst, $shift" %}
@@ -8713,7 +8654,7 @@
 // end of ROL expand
 
 // ROL 32bit by one once
-instruct rolI_eReg_i1(eRegI dst, immI1 lshift, immI_M1 rshift, eFlagsReg cr) %{
+instruct rolI_eReg_i1(rRegI dst, immI1 lshift, immI_M1 rshift, eFlagsReg cr) %{
   match(Set dst ( OrI (LShiftI dst lshift) (URShiftI dst rshift)));
 
   expand %{
@@ -8722,7 +8663,7 @@
 %}
 
 // ROL 32bit var by imm8 once
-instruct rolI_eReg_i8(eRegI dst, immI8 lshift, immI8 rshift, eFlagsReg cr) %{
+instruct rolI_eReg_i8(rRegI dst, immI8 lshift, immI8 rshift, eFlagsReg cr) %{
   predicate(  0 == ((n->in(1)->in(2)->get_int() + n->in(2)->in(2)->get_int()) & 0x1f));
   match(Set dst ( OrI (LShiftI dst lshift) (URShiftI dst rshift)));
 
@@ -8750,7 +8691,7 @@
 %}
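
The ROL rules above fold an OR of two opposing shifts into a single rotate;
the predicate on the imm8 variant checks that the two shift counts sum to a
multiple of 32, which is what makes the OR a true rotate. A hedged C++
sketch of the matched shape (unsigned, so the right shift is the logical
one that URShiftI denotes; s must stay in 1..31 to avoid undefined shifts):

  static inline unsigned rol32(unsigned x, int s) {
    return (x << s) | (x >> (32 - s));  // matched as OrI(LShiftI, URShiftI), emitted as ROL
  }
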
 
 // ROR expand
-instruct rorI_eReg_imm1(eRegI dst, immI1 shift, eFlagsReg cr) %{
+instruct rorI_eReg_imm1(rRegI dst, immI1 shift, eFlagsReg cr) %{
   effect(USE_DEF dst, USE shift, KILL cr);
 
   format %{ "ROR    $dst, $shift" %}
@@ -8759,7 +8700,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct rorI_eReg_imm8(eRegI dst, immI8 shift, eFlagsReg cr) %{
+instruct rorI_eReg_imm8(rRegI dst, immI8 shift, eFlagsReg cr) %{
   effect (USE_DEF dst, USE shift, KILL cr);
 
   format %{ "ROR    $dst, $shift" %}
@@ -8779,7 +8720,7 @@
 // end of ROR expand
 
 // ROR right once
-instruct rorI_eReg_i1(eRegI dst, immI1 rshift, immI_M1 lshift, eFlagsReg cr) %{
+instruct rorI_eReg_i1(rRegI dst, immI1 rshift, immI_M1 lshift, eFlagsReg cr) %{
   match(Set dst ( OrI (URShiftI dst rshift) (LShiftI dst lshift)));
 
   expand %{
@@ -8788,7 +8729,7 @@
 %}
 
 // ROR 32bit by immI8 once
-instruct rorI_eReg_i8(eRegI dst, immI8 rshift, immI8 lshift, eFlagsReg cr) %{
+instruct rorI_eReg_i8(rRegI dst, immI8 rshift, immI8 lshift, eFlagsReg cr) %{
   predicate(  0 == ((n->in(1)->in(2)->get_int() + n->in(2)->in(2)->get_int()) & 0x1f));
   match(Set dst ( OrI (URShiftI dst rshift) (LShiftI dst lshift)));
 
@@ -8817,7 +8758,7 @@
 
 // Xor Instructions
 // Xor Register with Register
-instruct xorI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct xorI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (XorI dst src));
   effect(KILL cr);
 
@@ -8829,7 +8770,7 @@
 %}
 
 // Xor Register with Immediate -1
-instruct xorI_eReg_im1(eRegI dst, immI_M1 imm) %{
+instruct xorI_eReg_im1(rRegI dst, immI_M1 imm) %{
   match(Set dst (XorI dst imm));  
 
   size(2);
@@ -8841,7 +8782,7 @@
 %}
 
 // Xor Register with Immediate
-instruct xorI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
+instruct xorI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (XorI dst src));
   effect(KILL cr);
 
@@ -8853,7 +8794,7 @@
 %}
 
 // Xor Register with Memory
-instruct xorI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
+instruct xorI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
   match(Set dst (XorI dst (LoadI src)));
   effect(KILL cr);
 
@@ -8865,7 +8806,7 @@
 %}
 
 // Xor Memory with Register
-instruct xorI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
+instruct xorI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (StoreI dst (XorI (LoadI dst) src)));
   effect(KILL cr);
 
@@ -8890,14 +8831,14 @@
 
 //----------Convert Int to Boolean---------------------------------------------
 
-instruct movI_nocopy(eRegI dst, eRegI src) %{
+instruct movI_nocopy(rRegI dst, rRegI src) %{
   effect( DEF dst, USE src );
   format %{ "MOV    $dst,$src" %}
   ins_encode( enc_Copy( dst, src) );
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct ci2b( eRegI dst, eRegI src, eFlagsReg cr ) %{
+instruct ci2b( rRegI dst, rRegI src, eFlagsReg cr ) %{
   effect( USE_DEF dst, USE src, KILL cr );
 
   size(4);
@@ -8908,7 +8849,7 @@
   ins_pipe( ialu_reg_reg_long );
 %}
 
-instruct convI2B( eRegI dst, eRegI src, eFlagsReg cr ) %{
+instruct convI2B( rRegI dst, rRegI src, eFlagsReg cr ) %{
   match(Set dst (Conv2B src));
 
   expand %{
@@ -8917,14 +8858,14 @@
   %}
 %}
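
The NEG/ADC pair behind ci2b (and cp2b below) is a branchless Conv2B: with
dst holding a copy of src, NEG sets dst = -src and the carry flag to
(src != 0); ADC dst,src then leaves -src + src + CF, i.e. exactly the
carry. A one-line C++ equivalent (illustration only):

  static inline int conv2b(int src) { return src != 0 ? 1 : 0; }  // what NEG + ADC computes
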
 
-instruct movP_nocopy(eRegI dst, eRegP src) %{
+instruct movP_nocopy(rRegI dst, eRegP src) %{
   effect( DEF dst, USE src );
   format %{ "MOV    $dst,$src" %}
   ins_encode( enc_Copy( dst, src) );
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct cp2b( eRegI dst, eRegP src, eFlagsReg cr ) %{
+instruct cp2b( rRegI dst, eRegP src, eFlagsReg cr ) %{
   effect( USE_DEF dst, USE src, KILL cr );
   format %{ "NEG    $dst\n\t"
             "ADC    $dst,$src" %}
@@ -8933,7 +8874,7 @@
   ins_pipe( ialu_reg_reg_long );
 %}
 
-instruct convP2B( eRegI dst, eRegP src, eFlagsReg cr ) %{
+instruct convP2B( rRegI dst, eRegP src, eFlagsReg cr ) %{
   match(Set dst (Conv2B src));
 
   expand %{
@@ -8958,7 +8899,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct cmpLTMask0( eRegI dst, immI0 zero, eFlagsReg cr ) %{
+instruct cmpLTMask0( rRegI dst, immI0 zero, eFlagsReg cr ) %{
   match(Set dst (CmpLTMask dst zero));
   effect( DEF dst, KILL cr );
   ins_cost(100);
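
CmpLTMask against zero only needs the sign bit smeared across the word, so
a single arithmetic right shift by 31 suffices (assumed encoding; the
rule's format is not shown in this hunk). C++ sketch (right-shifting a
negative int is implementation-defined in C++, but arithmetic on x86,
which is what the rule relies on):

  static inline int cmpLTMask0(int x) { return x >> 31; }  // -1 if x < 0, else 0
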
@@ -9430,7 +9371,7 @@
 %}
 
 // Compare vs zero into -1,0,1
-instruct cmpDPR_0(eRegI dst, regDPR src1, immDPR0 zero, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpDPR_0(rRegI dst, regDPR src1, immDPR0 zero, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE<=1);
   match(Set dst (CmpD3 src1 zero));
   effect(KILL cr, KILL rax);
@@ -9444,7 +9385,7 @@
 %}
 
 // Compare into -1,0,1
-instruct cmpDPR_reg(eRegI dst, regDPR src1, regDPR src2, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpDPR_reg(rRegI dst, regDPR src1, regDPR src2, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE<=1);
   match(Set dst (CmpD3 src1 src2));
   effect(KILL cr, KILL rax);
@@ -10222,7 +10163,7 @@
 %}
 
 // Compare vs zero into -1,0,1
-instruct cmpFPR_0(eRegI dst, regFPR src1, immFPR0 zero, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpFPR_0(rRegI dst, regFPR src1, immFPR0 zero, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE == 0);
   match(Set dst (CmpF3 src1 zero));
   effect(KILL cr, KILL rax);
@@ -10236,7 +10177,7 @@
 %}
 
 // Compare into -1,0,1
-instruct cmpFPR_reg(eRegI dst, regFPR src1, regFPR src2, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpFPR_reg(rRegI dst, regFPR src1, regFPR src2, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE == 0);
   match(Set dst (CmpF3 src1 src2));
   effect(KILL cr, KILL rax);
@@ -11156,7 +11097,7 @@
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct convI2D_reg(regD dst, eRegI src) %{
+instruct convI2D_reg(regD dst, rRegI src) %{
   predicate( UseSSE>=2 && !UseXmmI2D );
   match(Set dst (ConvI2D src));
   format %{ "CVTSI2SD $dst,$src" %}
@@ -11176,7 +11117,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct convXI2D_reg(regD dst, eRegI src)
+instruct convXI2D_reg(regD dst, rRegI src)
 %{
   predicate( UseSSE>=2 && UseXmmI2D );
   match(Set dst (ConvI2D src));
@@ -11264,7 +11205,7 @@
 %}
 
 // Convert an int to a float in xmm; no rounding step needed.
-instruct convI2F_reg(regF dst, eRegI src) %{
+instruct convI2F_reg(regF dst, rRegI src) %{
   predicate( UseSSE==1 || UseSSE>=2 && !UseXmmI2F );
   match(Set dst (ConvI2F src));
   format %{ "CVTSI2SS $dst, $src" %}
@@ -11274,7 +11215,7 @@
   ins_pipe( pipe_slow );
 %}
 
- instruct convXI2F_reg(regF dst, eRegI src)
+instruct convXI2F_reg(regF dst, rRegI src)
 %{
   predicate( UseSSE>=2 && UseXmmI2F );
   match(Set dst (ConvI2F src));
@@ -11288,7 +11229,7 @@
   ins_pipe(pipe_slow); // XXX
 %}
 
-instruct convI2L_reg( eRegL dst, eRegI src, eFlagsReg cr) %{
+instruct convI2L_reg( eRegL dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (ConvI2L src));
   effect(KILL cr);
   ins_cost(375);
@@ -11300,7 +11241,7 @@
 %}
 
 // Zero-extend convert int to long
-instruct convI2L_reg_zex(eRegL dst, eRegI src, immL_32bits mask, eFlagsReg flags ) %{
+instruct convI2L_reg_zex(eRegL dst, rRegI src, immL_32bits mask, eFlagsReg flags ) %{
   match(Set dst (AndL (ConvI2L src) mask) );
   effect( KILL flags );
   ins_cost(250);
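
The AndL-of-ConvI2L shape matched here is how the ideal graph spells a zero
extension. C++ sketch (illustration only):

  static inline long long zext32(int x) {
    return (long long)x & 0xFFFFFFFFLL;  // sign-extend, then mask: same value as (unsigned)x
  }
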
@@ -11380,7 +11321,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct convL2I_reg( eRegI dst, eRegL src ) %{
+instruct convL2I_reg( rRegI dst, eRegL src ) %{
   match(Set dst (ConvL2I src));
   effect( DEF dst, USE src );
   format %{ "MOV    $dst,$src.lo" %}
@@ -11389,7 +11330,7 @@
 %}
 
 
-instruct MoveF2I_stack_reg(eRegI dst, stackSlotF src) %{
+instruct MoveF2I_stack_reg(rRegI dst, stackSlotF src) %{
   match(Set dst (MoveF2I src));
   effect( DEF dst, USE src );
   ins_cost(100);
@@ -11424,7 +11365,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct MoveF2I_reg_reg_sse(eRegI dst, regF src) %{
+instruct MoveF2I_reg_reg_sse(rRegI dst, regF src) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveF2I src));
   effect( DEF dst, USE src );
@@ -11436,7 +11377,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct MoveI2F_reg_stack(stackSlotF dst, eRegI src) %{
+instruct MoveI2F_reg_stack(stackSlotF dst, rRegI src) %{
   match(Set dst (MoveI2F src));
   effect( DEF dst, USE src );
 
@@ -11476,7 +11417,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct MoveI2F_reg_reg_sse(regF dst, eRegI src) %{
+instruct MoveI2F_reg_reg_sse(regF dst, rRegI src) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveI2F src));
   effect( DEF dst, USE src );
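
The MoveF2I/MoveI2F rules are bit-preserving reinterprets, not numeric
conversions; with SSE2 they become a single MOVD between a general register
and an XMM register. C++ sketch of the semantics (illustration only):

  #include <cstring>
  static inline float move_i2f(int x) {
    float f;
    std::memcpy(&f, &x, sizeof f);  // same 32 bits, new type; MOVD does this between registers
    return f;
  }
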
@@ -11610,186 +11551,6 @@
   ins_pipe( pipe_slow );
 %}
 
-// Replicate scalar to packed byte (1 byte) values in xmm
-instruct Repl8B_reg(regD dst, regD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate8B src));
-  format %{ "MOVDQA  $dst,$src\n\t"
-            "PUNPCKLBW $dst,$dst\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode %{
-    if ($dst$$reg != $src$$reg) {
-      __ movdqa($dst$$XMMRegister, $src$$XMMRegister);
-    }
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Replicate scalar to packed byte (1 byte) values in xmm
-instruct Repl8B_eRegI(regD dst, eRegI src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate8B src));
-  format %{ "MOVD    $dst,$src\n\t"
-            "PUNPCKLBW $dst,$dst\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Replicate scalar zero to packed byte (1 byte) values in xmm
-instruct Repl8B_immI0(regD dst, immI0 zero) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate8B zero));
-  format %{ "PXOR  $dst,$dst\t! replicate8B" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed short (2 byte) values in xmm
-instruct Repl4S_reg(regD dst, regD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate4S src));
-  format %{ "PSHUFLW $dst,$src,0x00\t! replicate4S" %}
-  ins_encode %{
-    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed short (2 byte) values in xmm
-instruct Repl4S_eRegI(regD dst, eRegI src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate4S src));
-  format %{ "MOVD    $dst,$src\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate4S" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar zero to packed short (2 byte) values in xmm
-instruct Repl4S_immI0(regD dst, immI0 zero) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate4S zero));
-  format %{ "PXOR  $dst,$dst\t! replicate4S" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed char (2 byte) values in xmm
-instruct Repl4C_reg(regD dst, regD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate4C src));
-  format %{ "PSHUFLW $dst,$src,0x00\t! replicate4C" %}
-  ins_encode %{
-    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed char (2 byte) values in xmm
-instruct Repl4C_eRegI(regD dst, eRegI src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate4C src));
-  format %{ "MOVD    $dst,$src\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate4C" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar zero to packed char (2 byte) values in xmm
-instruct Repl4C_immI0(regD dst, immI0 zero) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate4C zero));
-  format %{ "PXOR  $dst,$dst\t! replicate4C" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed integer (4 byte) values in xmm
-instruct Repl2I_reg(regD dst, regD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate2I src));
-  format %{ "PSHUFD $dst,$src,0x00\t! replicate2I" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed integer (4 byte) values in xmm
-instruct Repl2I_eRegI(regD dst, eRegI src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate2I src));
-  format %{ "MOVD   $dst,$src\n\t"
-            "PSHUFD $dst,$dst,0x00\t! replicate2I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar zero to packed integer (4 byte) values in xmm
-instruct Repl2I_immI0(regD dst, immI0 zero) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate2I zero));
-  format %{ "PXOR  $dst,$dst\t! replicate2I" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_reg(regD dst, regD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate2F src));
-  format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0xe0);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_regF(regD dst, regF src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate2F src));
-  format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0xe0);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar zero to packed single precision floating point values in xmm
-instruct Repl2F_immF0(regD dst, immF0 zero) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate2F zero));
-  format %{ "PXOR  $dst,$dst\t! replicate2F" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
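
For reference, the ReplicateN rules removed above all follow the same
broadcast pattern, whose work is taken over by shared vector rules
elsewhere in this change. A hedged SSE2-intrinsics sketch of what
Repl2I_eRegI emitted (hypothetical helper name):

  #include <emmintrin.h>  // SSE2
  static inline __m128i repl2I(int x) {
    __m128i v = _mm_cvtsi32_si128(x);   // MOVD   xmm, r32: scalar into lane 0
    return _mm_shuffle_epi32(v, 0x00);  // PSHUFD xmm, xmm, 0x00: broadcast lane 0
  }
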
 
 // =======================================================================
 // fast clearing of an array
@@ -11898,7 +11659,7 @@
 
 //----------Control Flow Instructions------------------------------------------
 // Signed compare Instructions
-instruct compI_eReg(eFlagsReg cr, eRegI op1, eRegI op2) %{
+instruct compI_eReg(eFlagsReg cr, rRegI op1, rRegI op2) %{
   match(Set cr (CmpI op1 op2));
   effect( DEF cr, USE op1, USE op2 );
   format %{ "CMP    $op1,$op2" %}
@@ -11907,7 +11668,7 @@
   ins_pipe( ialu_cr_reg_reg );
 %}
 
-instruct compI_eReg_imm(eFlagsReg cr, eRegI op1, immI op2) %{
+instruct compI_eReg_imm(eFlagsReg cr, rRegI op1, immI op2) %{
   match(Set cr (CmpI op1 op2));
   effect( DEF cr, USE op1 );
   format %{ "CMP    $op1,$op2" %}
@@ -11918,7 +11679,7 @@
 %}
 
 // Cisc-spilled version of cmpI_eReg
-instruct compI_eReg_mem(eFlagsReg cr, eRegI op1, memory op2) %{
+instruct compI_eReg_mem(eFlagsReg cr, rRegI op1, memory op2) %{
   match(Set cr (CmpI op1 (LoadI op2)));
 
   format %{ "CMP    $op1,$op2" %}
@@ -11928,7 +11689,7 @@
   ins_pipe( ialu_cr_reg_mem );
 %}
 
-instruct testI_reg( eFlagsReg cr, eRegI src, immI0 zero ) %{
+instruct testI_reg( eFlagsReg cr, rRegI src, immI0 zero ) %{
   match(Set cr (CmpI src zero));
   effect( DEF cr, USE src );
 
@@ -11938,7 +11699,7 @@
   ins_pipe( ialu_cr_reg_imm );
 %}
 
-instruct testI_reg_imm( eFlagsReg cr, eRegI src, immI con, immI0 zero ) %{
+instruct testI_reg_imm( eFlagsReg cr, rRegI src, immI con, immI0 zero ) %{
   match(Set cr (CmpI (AndI src con) zero));
 
   format %{ "TEST   $src,$con" %}
@@ -11947,7 +11708,7 @@
   ins_pipe( ialu_cr_reg_imm );
 %}
 
-instruct testI_reg_mem( eFlagsReg cr, eRegI src, memory mem, immI0 zero ) %{
+instruct testI_reg_mem( eFlagsReg cr, rRegI src, memory mem, immI0 zero ) %{
   match(Set cr (CmpI (AndI src mem) zero));
 
   format %{ "TEST   $src,$mem" %}
@@ -11958,7 +11719,7 @@
 
 // Unsigned compare instructions; really the same as the signed compares,
 // except that they produce an eFlagsRegU instead of an eFlagsReg.
-instruct compU_eReg(eFlagsRegU cr, eRegI op1, eRegI op2) %{
+instruct compU_eReg(eFlagsRegU cr, rRegI op1, rRegI op2) %{
   match(Set cr (CmpU op1 op2));
 
   format %{ "CMPu   $op1,$op2" %}
@@ -11967,7 +11728,7 @@
   ins_pipe( ialu_cr_reg_reg );
 %}
 
-instruct compU_eReg_imm(eFlagsRegU cr, eRegI op1, immI op2) %{
+instruct compU_eReg_imm(eFlagsRegU cr, rRegI op1, immI op2) %{
   match(Set cr (CmpU op1 op2));
 
   format %{ "CMPu   $op1,$op2" %}
@@ -11977,7 +11738,7 @@
 %}
 
 // Cisc-spilled version of cmpU_eReg
-instruct compU_eReg_mem(eFlagsRegU cr, eRegI op1, memory op2) %{
+instruct compU_eReg_mem(eFlagsRegU cr, rRegI op1, memory op2) %{
   match(Set cr (CmpU op1 (LoadI op2)));
 
   format %{ "CMPu   $op1,$op2" %}
@@ -11988,7 +11749,7 @@
 %}
 
 // // Cisc-spilled version of cmpU_eReg
-//instruct compU_mem_eReg(eFlagsRegU cr, memory op1, eRegI op2) %{
+//instruct compU_mem_eReg(eFlagsRegU cr, memory op1, rRegI op2) %{
 //  match(Set cr (CmpU (LoadI op1) op2));
 //
 //  format %{ "CMPu   $op1,$op2" %}
@@ -11997,7 +11758,7 @@
 //  ins_encode( OpcP, RegMem( op1, op2) );
 //%}
 
-instruct testU_reg( eFlagsRegU cr, eRegI src, immI0 zero ) %{
+instruct testU_reg( eFlagsRegU cr, rRegI src, immI0 zero ) %{
   match(Set cr (CmpU src zero));
 
   format %{ "TESTu  $src,$src" %}
@@ -12093,7 +11854,7 @@
 //   *** Min and Max using the conditional move are slower than the
 //   *** branch version on a Pentium III.
 // // Conditional move for min
-//instruct cmovI_reg_lt( eRegI op2, eRegI op1, eFlagsReg cr ) %{
+//instruct cmovI_reg_lt( rRegI op2, rRegI op1, eFlagsReg cr ) %{
 //  effect( USE_DEF op2, USE op1, USE cr );
 //  format %{ "CMOVlt $op2,$op1\t! min" %}
 //  opcode(0x4C,0x0F);
@@ -12102,7 +11863,7 @@
 //%}
 //
 //// Min Register with Register (P6 version)
-//instruct minI_eReg_p6( eRegI op1, eRegI op2 ) %{
+//instruct minI_eReg_p6( rRegI op1, rRegI op2 ) %{
 //  predicate(VM_Version::supports_cmov() );
 //  match(Set op2 (MinI op1 op2));
 //  ins_cost(200);
@@ -12114,7 +11875,7 @@
 //%}
 
 // Min Register with Register (generic version)
-instruct minI_eReg(eRegI dst, eRegI src, eFlagsReg flags) %{
+instruct minI_eReg(rRegI dst, rRegI src, eFlagsReg flags) %{
   match(Set dst (MinI dst src));
   effect(KILL flags);
   ins_cost(300);
@@ -12129,7 +11890,7 @@
 //   *** Min and Max using the conditional move are slower than the
 //   *** branch version on a Pentium III.
 // // Conditional move for max
-//instruct cmovI_reg_gt( eRegI op2, eRegI op1, eFlagsReg cr ) %{
+//instruct cmovI_reg_gt( rRegI op2, rRegI op1, eFlagsReg cr ) %{
 //  effect( USE_DEF op2, USE op1, USE cr );
 //  format %{ "CMOVgt $op2,$op1\t! max" %}
 //  opcode(0x4F,0x0F);
@@ -12138,7 +11899,7 @@
 //%}
 //
 // // Max Register with Register (P6 version)
-//instruct maxI_eReg_p6( eRegI op1, eRegI op2 ) %{
+//instruct maxI_eReg_p6( rRegI op1, rRegI op2 ) %{
 //  predicate(VM_Version::supports_cmov() );
 //  match(Set op2 (MaxI op1 op2));
 //  ins_cost(200);
@@ -12150,7 +11911,7 @@
 //%}
 
 // Max Register with Register (generic version)
-instruct maxI_eReg(eRegI dst, eRegI src, eFlagsReg flags) %{
+instruct maxI_eReg(rRegI dst, rRegI src, eFlagsReg flags) %{
   match(Set dst (MaxI dst src));
   effect(KILL flags);
   ins_cost(300);
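
Functionally, both generic rules reduce to a compare followed by a
conditional pick of one operand (a conditional move where CMOV is
available, as the commented P6 variants suggest). C++ sketch (illustration
only):

  static inline int minI(int dst, int src) { return src < dst ? src : dst; }
  static inline int maxI(int dst, int src) { return src > dst ? src : dst; }
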
@@ -12211,7 +11972,7 @@
 // ============================================================================
 // Branch Instructions
 // Jump Table
-instruct jumpXtnd(eRegI switch_val) %{
+instruct jumpXtnd(rRegI switch_val) %{
   match(Jump switch_val);
   ins_cost(350);
   format %{  "JMP    [$constantaddress](,$switch_val,1)\n\t" %}
@@ -12629,7 +12390,7 @@
 // Manifest a CmpL result in the normal flags.  Only good for LT or GE
 // compares.  Can be used for LE or GT compares by reversing arguments.
 // NOT GOOD FOR EQ/NE tests.
-instruct cmpL_reg_flags_LTGE( flagsReg_long_LTGE flags, eRegL src1, eRegL src2, eRegI tmp ) %{
+instruct cmpL_reg_flags_LTGE( flagsReg_long_LTGE flags, eRegL src1, eRegL src2, rRegI tmp ) %{
   match( Set flags (CmpL src1 src2 ));
   effect( TEMP tmp );
   ins_cost(300);
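
A hedged sketch of the LT/GE trick (assumed encoding, not quoted from this
patch): subtract the low halves to establish the borrow, then subtract the
high halves with borrow into tmp; the sign of that exact final subtraction
answers the signed 64-bit "src1 < src2", which is all LT/GE tests need.
Illustration with explicit 32-bit halves (hypothetical helper, not patch
code):

  static inline bool cmpL_lt(long long a, long long b) {
    unsigned alo = (unsigned)a, blo = (unsigned)b;
    int ahi = (int)(a >> 32), bhi = (int)(b >> 32);
    int borrow = alo < blo;                        // CMP on the low halves sets the carry
    long long hi = (long long)ahi - bhi - borrow;  // SBB on the high halves
    return hi < 0;                                 // sign of the exact SBB result
  }
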
@@ -12675,7 +12436,7 @@
 %}
 
 // Compare 2 longs and CMOVE ints.
-instruct cmovII_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegI dst, eRegI src) %{
+instruct cmovII_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, rRegI dst, rRegI src) %{
   predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
   match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
@@ -12685,7 +12446,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct cmovII_mem_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegI dst, memory src) %{
+instruct cmovII_mem_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, rRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
   match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
   ins_cost(250);
@@ -12746,7 +12507,7 @@
 
 //======
 // Manifest a CmpL result in the normal flags.  Only good for EQ/NE compares.
-instruct cmpL_zero_flags_EQNE( flagsReg_long_EQNE flags, eRegL src, immL0 zero, eRegI tmp ) %{
+instruct cmpL_zero_flags_EQNE( flagsReg_long_EQNE flags, eRegL src, immL0 zero, rRegI tmp ) %{
   match( Set flags (CmpL src zero ));
   effect(TEMP tmp);
   ins_cost(200);
@@ -12803,7 +12564,7 @@
 %}
 
 // Compare 2 longs and CMOVE ints.
-instruct cmovII_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegI dst, eRegI src) %{
+instruct cmovII_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, rRegI dst, rRegI src) %{
   predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
   match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
@@ -12813,7 +12574,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct cmovII_mem_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegI dst, memory src) %{
+instruct cmovII_mem_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, rRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
   match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
   ins_cost(250);
@@ -12875,7 +12636,7 @@
 //======
 // Manifest a CmpL result in the normal flags.  Only good for LE or GT compares.
 // Same as cmpL_reg_flags_LEGT except must negate src
-instruct cmpL_zero_flags_LEGT( flagsReg_long_LEGT flags, eRegL src, immL0 zero, eRegI tmp ) %{
+instruct cmpL_zero_flags_LEGT( flagsReg_long_LEGT flags, eRegL src, immL0 zero, rRegI tmp ) %{
   match( Set flags (CmpL src zero ));
   effect( TEMP tmp );
   ins_cost(300);
@@ -12889,7 +12650,7 @@
 // Manifest a CmpL result in the normal flags.  Only good for LE or GT compares.
 // Same as cmpL_reg_flags_LTGE except operands swapped.  Swapping operands
 // requires a commuted test to get the same result.
-instruct cmpL_reg_flags_LEGT( flagsReg_long_LEGT flags, eRegL src1, eRegL src2, eRegI tmp ) %{
+instruct cmpL_reg_flags_LEGT( flagsReg_long_LEGT flags, eRegL src1, eRegL src2, rRegI tmp ) %{
   match( Set flags (CmpL src1 src2 ));
   effect( TEMP tmp );
   ins_cost(300);
@@ -12936,7 +12697,7 @@
 %}
 
 // Compare 2 longs and CMOVE ints.
-instruct cmovII_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegI dst, eRegI src) %{
+instruct cmovII_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, rRegI dst, rRegI src) %{
   predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
   match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
@@ -12946,7 +12707,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct cmovII_mem_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegI dst, memory src) %{
+instruct cmovII_mem_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, rRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
   match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
   ins_cost(250);
@@ -13275,11 +13036,11 @@
 // ---------EXAMPLE----------------------------------------------------------
 //
 // // pertinent parts of existing instructions in architecture description
-// instruct movI(eRegI dst, eRegI src) %{
+// instruct movI(rRegI dst, rRegI src) %{
 //   match(Set dst (CopyI src));
 // %}
 //
-// instruct incI_eReg(eRegI dst, immI1 src, eFlagsReg cr) %{
+// instruct incI_eReg(rRegI dst, immI1 src, eFlagsReg cr) %{
 //   match(Set dst (AddI dst src));
 //   effect(KILL cr);
 // %}
@@ -13324,11 +13085,11 @@
 // %}
 
 // // Change load of spilled value to only a spill
-// instruct storeI(memory mem, eRegI src) %{
+// instruct storeI(memory mem, rRegI src) %{
 //   match(Set mem (StoreI mem src));
 // %}
 //
-// instruct loadI(eRegI dst, memory mem) %{
+// instruct loadI(rRegI dst, memory mem) %{
 //   match(Set dst (LoadI mem));
 // %}
 //
--- a/src/cpu/x86/vm/x86_64.ad	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/cpu/x86/vm/x86_64.ad	Thu Jun 28 10:35:28 2012 -0700
@@ -131,102 +131,6 @@
 
 // Floating Point Registers
 
-// XMM registers.  128-bit registers of 4 words each, labeled (a)-(d).
-// Word a in each register holds a Float, words a-b hold a Double.  We
-// currently do not use the SIMD capabilities, so words c-d are
-// unused at the moment.
-// XMM8-XMM15 must be encoded with REX.
-// Linux ABI:   No register preserved across function calls
-//              XMM0-XMM7 might hold parameters
-// Windows ABI: XMM6-XMM15 preserved across function calls
-//              XMM0-XMM3 might hold parameters
-
-reg_def XMM0   (SOC, SOC, Op_RegF,  0, xmm0->as_VMReg());
-reg_def XMM0_H (SOC, SOC, Op_RegF,  0, xmm0->as_VMReg()->next());
-
-reg_def XMM1   (SOC, SOC, Op_RegF,  1, xmm1->as_VMReg());
-reg_def XMM1_H (SOC, SOC, Op_RegF,  1, xmm1->as_VMReg()->next());
-
-reg_def XMM2   (SOC, SOC, Op_RegF,  2, xmm2->as_VMReg());
-reg_def XMM2_H (SOC, SOC, Op_RegF,  2, xmm2->as_VMReg()->next());
-
-reg_def XMM3   (SOC, SOC, Op_RegF,  3, xmm3->as_VMReg());
-reg_def XMM3_H (SOC, SOC, Op_RegF,  3, xmm3->as_VMReg()->next());
-
-reg_def XMM4   (SOC, SOC, Op_RegF,  4, xmm4->as_VMReg());
-reg_def XMM4_H (SOC, SOC, Op_RegF,  4, xmm4->as_VMReg()->next());
-
-reg_def XMM5   (SOC, SOC, Op_RegF,  5, xmm5->as_VMReg());
-reg_def XMM5_H (SOC, SOC, Op_RegF,  5, xmm5->as_VMReg()->next());
-
-#ifdef _WIN64
-
-reg_def XMM6   (SOC, SOE, Op_RegF,  6, xmm6->as_VMReg());
-reg_def XMM6_H (SOC, SOE, Op_RegF,  6, xmm6->as_VMReg()->next());
-
-reg_def XMM7   (SOC, SOE, Op_RegF,  7, xmm7->as_VMReg());
-reg_def XMM7_H (SOC, SOE, Op_RegF,  7, xmm7->as_VMReg()->next());
-
-reg_def XMM8   (SOC, SOE, Op_RegF,  8, xmm8->as_VMReg());
-reg_def XMM8_H (SOC, SOE, Op_RegF,  8, xmm8->as_VMReg()->next());
-
-reg_def XMM9   (SOC, SOE, Op_RegF,  9, xmm9->as_VMReg());
-reg_def XMM9_H (SOC, SOE, Op_RegF,  9, xmm9->as_VMReg()->next());
-
-reg_def XMM10  (SOC, SOE, Op_RegF, 10, xmm10->as_VMReg());
-reg_def XMM10_H(SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next());
-
-reg_def XMM11  (SOC, SOE, Op_RegF, 11, xmm11->as_VMReg());
-reg_def XMM11_H(SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next());
-
-reg_def XMM12  (SOC, SOE, Op_RegF, 12, xmm12->as_VMReg());
-reg_def XMM12_H(SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next());
-
-reg_def XMM13  (SOC, SOE, Op_RegF, 13, xmm13->as_VMReg());
-reg_def XMM13_H(SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next());
-
-reg_def XMM14  (SOC, SOE, Op_RegF, 14, xmm14->as_VMReg());
-reg_def XMM14_H(SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next());
-
-reg_def XMM15  (SOC, SOE, Op_RegF, 15, xmm15->as_VMReg());
-reg_def XMM15_H(SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next());
-
-#else
-
-reg_def XMM6   (SOC, SOC, Op_RegF,  6, xmm6->as_VMReg());
-reg_def XMM6_H (SOC, SOC, Op_RegF,  6, xmm6->as_VMReg()->next());
-
-reg_def XMM7   (SOC, SOC, Op_RegF,  7, xmm7->as_VMReg());
-reg_def XMM7_H (SOC, SOC, Op_RegF,  7, xmm7->as_VMReg()->next());
-
-reg_def XMM8   (SOC, SOC, Op_RegF,  8, xmm8->as_VMReg());
-reg_def XMM8_H (SOC, SOC, Op_RegF,  8, xmm8->as_VMReg()->next());
-
-reg_def XMM9   (SOC, SOC, Op_RegF,  9, xmm9->as_VMReg());
-reg_def XMM9_H (SOC, SOC, Op_RegF,  9, xmm9->as_VMReg()->next());
-
-reg_def XMM10  (SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
-reg_def XMM10_H(SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next());
-
-reg_def XMM11  (SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
-reg_def XMM11_H(SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next());
-
-reg_def XMM12  (SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
-reg_def XMM12_H(SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next());
-
-reg_def XMM13  (SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
-reg_def XMM13_H(SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next());
-
-reg_def XMM14  (SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
-reg_def XMM14_H(SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next());
-
-reg_def XMM15  (SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
-reg_def XMM15_H(SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next());
-
-#endif // _WIN64
-
-reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
-
 // Specify priority of register selection within phases of register
 // allocation.  Highest priority is first.  A useful heuristic is to
 // give registers a low priority when they are required by machine
@@ -252,26 +156,6 @@
                    R15,         R15_H,
                    RSP,         RSP_H);
 
-// XXX probably use 8-15 first on Linux
-alloc_class chunk1(XMM0,  XMM0_H,
-                   XMM1,  XMM1_H,
-                   XMM2,  XMM2_H,
-                   XMM3,  XMM3_H,
-                   XMM4,  XMM4_H,
-                   XMM5,  XMM5_H,
-                   XMM6,  XMM6_H,
-                   XMM7,  XMM7_H,
-                   XMM8,  XMM8_H,
-                   XMM9,  XMM9_H,
-                   XMM10, XMM10_H,
-                   XMM11, XMM11_H,
-                   XMM12, XMM12_H,
-                   XMM13, XMM13_H,
-                   XMM14, XMM14_H,
-                   XMM15, XMM15_H);
-
-alloc_class chunk2(RFLAGS);
-
 
 //----------Architecture Description Register Classes--------------------------
 // Several register classes are automatically defined based upon information in
@@ -501,46 +385,7 @@
 // Singleton class for instruction pointer
 // reg_class ip_reg(RIP);
 
-// Singleton class for condition codes
-reg_class int_flags(RFLAGS);
-
-// Class for all float registers
-reg_class float_reg(XMM0,
-                    XMM1,
-                    XMM2,
-                    XMM3,
-                    XMM4,
-                    XMM5,
-                    XMM6,
-                    XMM7,
-                    XMM8,
-                    XMM9,
-                    XMM10,
-                    XMM11,
-                    XMM12,
-                    XMM13,
-                    XMM14,
-                    XMM15);
-
-// Class for all double registers
-reg_class double_reg(XMM0,  XMM0_H,
-                     XMM1,  XMM1_H,
-                     XMM2,  XMM2_H,
-                     XMM3,  XMM3_H,
-                     XMM4,  XMM4_H,
-                     XMM5,  XMM5_H,
-                     XMM6,  XMM6_H,
-                     XMM7,  XMM7_H,
-                     XMM8,  XMM8_H,
-                     XMM9,  XMM9_H,
-                     XMM10, XMM10_H,
-                     XMM11, XMM11_H,
-                     XMM12, XMM12_H,
-                     XMM13, XMM13_H,
-                     XMM14, XMM14_H,
-                     XMM15, XMM15_H);
-%}
-
+%}
 
 //----------SOURCE BLOCK-------------------------------------------------------
 // This is a block of C++ code which provides values, functions, and
@@ -1027,12 +872,84 @@
   return rc_float;
 }
 
+// The next two methods are shared by the 32- and 64-bit VMs; they are defined in x86.ad.
+static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
+                          int src_hi, int dst_hi, uint ireg, outputStream* st);
+
+static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
+                            int stack_offset, int reg, uint ireg, outputStream* st);
+
+static void vec_stack_to_stack_helper(CodeBuffer *cbuf, int src_offset,
+                                      int dst_offset, uint ireg, outputStream* st) {
+  if (cbuf) {
+    MacroAssembler _masm(cbuf);
+    switch (ireg) {
+    case Op_VecS:
+      __ movq(Address(rsp, -8), rax);
+      __ movl(rax, Address(rsp, src_offset));
+      __ movl(Address(rsp, dst_offset), rax);
+      __ movq(rax, Address(rsp, -8));
+      break;
+    case Op_VecD:
+      __ pushq(Address(rsp, src_offset));
+      __ popq (Address(rsp, dst_offset));
+      break;
+    case Op_VecX:
+      __ pushq(Address(rsp, src_offset));
+      __ popq (Address(rsp, dst_offset));
+      __ pushq(Address(rsp, src_offset+8));
+      __ popq (Address(rsp, dst_offset+8));
+      break;
+    case Op_VecY:
+      __ vmovdqu(Address(rsp, -32), xmm0);
+      __ vmovdqu(xmm0, Address(rsp, src_offset));
+      __ vmovdqu(Address(rsp, dst_offset), xmm0);
+      __ vmovdqu(xmm0, Address(rsp, -32));
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+#ifndef PRODUCT
+  } else {
+    switch (ireg) {
+    case Op_VecS:
+      st->print("movq    [rsp - #8], rax\t# 32-bit mem-mem spill\n\t"
+                "movl    rax, [rsp + #%d]\n\t"
+                "movl    [rsp + #%d], rax\n\t"
+                "movq    rax, [rsp - #8]",
+                src_offset, dst_offset);
+      break;
+    case Op_VecD:
+      st->print("pushq   [rsp + #%d]\t# 64-bit mem-mem spill\n\t"
+                "popq    [rsp + #%d]",
+                src_offset, dst_offset);
+      break;
+    case Op_VecX:
+      st->print("pushq   [rsp + #%d]\t# 128-bit mem-mem spill\n\t"
+                "popq    [rsp + #%d]\n\t"
+                "pushq   [rsp + #%d]\n\t"
+                "popq    [rsp + #%d]",
+                src_offset, dst_offset, src_offset+8, dst_offset+8);
+      break;
+    case Op_VecY:
+      st->print("vmovdqu [rsp - #32], xmm0\t# 256-bit mem-mem spill\n\t"
+                "vmovdqu xmm0, [rsp + #%d]\n\t"
+                "vmovdqu [rsp + #%d], xmm0\n\t"
+                "vmovdqu xmm0, [rsp - #32]",
+                src_offset, dst_offset);
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+#endif
+  }
+}
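
One strategy per vector width in the helper above: Op_VecS (4 bytes)
borrows rax through a scratch slot just below rsp, Op_VecD (8 bytes) is a
single pushq/popq pair, Op_VecX (16 bytes) is two such pairs, and Op_VecY
(32 bytes) borrows xmm0 via vmovdqu. The push/pop variants need no scratch
register at all. For example (hypothetical offsets), an Op_VecX copy from
[rsp + 16] to [rsp + 48] expands to:

  pushq [rsp + 16]    ; popq [rsp + 48]
  pushq [rsp + 24]    ; popq [rsp + 56]
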
+
 uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
                                        PhaseRegAlloc* ra_,
                                        bool do_size,
-                                       outputStream* st) const
-{
-
+                                       outputStream* st) const {
+  assert(cbuf != NULL || st  != NULL, "sanity");
   // Get registers to move
   OptoReg::Name src_second = ra_->get_reg_second(in(1));
   OptoReg::Name src_first = ra_->get_reg_first(in(1));
@@ -1050,7 +967,30 @@
   if (src_first == dst_first && src_second == dst_second) {
     // Self copy, no move
     return 0;
-  } else if (src_first_rc == rc_stack) {
+  }
+  if (bottom_type()->isa_vect() != NULL) {
+    uint ireg = ideal_reg();
+    assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity");
+    assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity");
+    if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
+      // mem -> mem
+      int src_offset = ra_->reg2offset(src_first);
+      int dst_offset = ra_->reg2offset(dst_first);
+      vec_stack_to_stack_helper(cbuf, src_offset, dst_offset, ireg, st);
+    } else if (src_first_rc == rc_float && dst_first_rc == rc_float ) {
+      vec_mov_helper(cbuf, false, src_first, dst_first, src_second, dst_second, ireg, st);
+    } else if (src_first_rc == rc_float && dst_first_rc == rc_stack ) {
+      int stack_offset = ra_->reg2offset(dst_first);
+      vec_spill_helper(cbuf, false, false, stack_offset, src_first, ireg, st);
+    } else if (src_first_rc == rc_stack && dst_first_rc == rc_float ) {
+      int stack_offset = ra_->reg2offset(src_first);
+      vec_spill_helper(cbuf, false, true,  stack_offset, dst_first, ireg, st);
+    } else {
+      ShouldNotReachHere();
+    }
+    return 0;
+  }
+  if (src_first_rc == rc_stack) {
     // mem ->
     if (dst_first_rc == rc_stack) {
       // mem -> mem
@@ -1061,23 +1001,16 @@
         int src_offset = ra_->reg2offset(src_first);
         int dst_offset = ra_->reg2offset(dst_first);
         if (cbuf) {
-          emit_opcode(*cbuf, 0xFF);
-          encode_RegMem(*cbuf, RSI_enc, RSP_enc, 0x4, 0, src_offset, false);
-
-          emit_opcode(*cbuf, 0x8F);
-          encode_RegMem(*cbuf, RAX_enc, RSP_enc, 0x4, 0, dst_offset, false);
-
+          MacroAssembler _masm(cbuf);
+          __ pushq(Address(rsp, src_offset));
+          __ popq (Address(rsp, dst_offset));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("pushq   [rsp + #%d]\t# 64-bit mem-mem spill\n\t"
-                     "popq    [rsp + #%d]",
-                     src_offset,
-                     dst_offset);
+                    "popq    [rsp + #%d]",
+                     src_offset, dst_offset);
 #endif
         }
-        return
-          3 + ((src_offset == 0) ? 0 : (src_offset < 0x80 ? 1 : 4)) +
-          3 + ((dst_offset == 0) ? 0 : (dst_offset < 0x80 ? 1 : 4));
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1086,46 +1019,22 @@
         int src_offset = ra_->reg2offset(src_first);
         int dst_offset = ra_->reg2offset(dst_first);
         if (cbuf) {
-          emit_opcode(*cbuf, Assembler::REX_W);
-          emit_opcode(*cbuf, 0x89);
-          emit_opcode(*cbuf, 0x44);
-          emit_opcode(*cbuf, 0x24);
-          emit_opcode(*cbuf, 0xF8);
-
-          emit_opcode(*cbuf, 0x8B);
-          encode_RegMem(*cbuf,
-                        RAX_enc,
-                        RSP_enc, 0x4, 0, src_offset,
-                        false);
-
-          emit_opcode(*cbuf, 0x89);
-          encode_RegMem(*cbuf,
-                        RAX_enc,
-                        RSP_enc, 0x4, 0, dst_offset,
-                        false);
-
-          emit_opcode(*cbuf, Assembler::REX_W);
-          emit_opcode(*cbuf, 0x8B);
-          emit_opcode(*cbuf, 0x44);
-          emit_opcode(*cbuf, 0x24);
-          emit_opcode(*cbuf, 0xF8);
-
+          MacroAssembler _masm(cbuf);
+          __ movq(Address(rsp, -8), rax);
+          __ movl(rax, Address(rsp, src_offset));
+          __ movl(Address(rsp, dst_offset), rax);
+          __ movq(rax, Address(rsp, -8));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movq    [rsp - #8], rax\t# 32-bit mem-mem spill\n\t"
-                     "movl    rax, [rsp + #%d]\n\t"
-                     "movl    [rsp + #%d], rax\n\t"
-                     "movq    rax, [rsp - #8]",
-                     src_offset,
-                     dst_offset);
+                    "movl    rax, [rsp + #%d]\n\t"
+                    "movl    [rsp + #%d], rax\n\t"
+                    "movq    rax, [rsp - #8]",
+                     src_offset, dst_offset);
 #endif
         }
-        return
-          5 + // movq
-          3 + ((src_offset == 0) ? 0 : (src_offset < 0x80 ? 1 : 4)) + // movl
-          3 + ((dst_offset == 0) ? 0 : (dst_offset < 0x80 ? 1 : 4)) + // movl
-          5; // movq
       }
+      return 0;
     } else if (dst_first_rc == rc_int) {
       // mem -> gpr
       if ((src_first & 1) == 0 && src_first + 1 == src_second &&
@@ -1133,52 +1042,32 @@
         // 64-bit
         int offset = ra_->reg2offset(src_first);
         if (cbuf) {
-          if (Matcher::_regEncode[dst_first] < 8) {
-            emit_opcode(*cbuf, Assembler::REX_W);
-          } else {
-            emit_opcode(*cbuf, Assembler::REX_WR);
-          }
-          emit_opcode(*cbuf, 0x8B);
-          encode_RegMem(*cbuf,
-                        Matcher::_regEncode[dst_first],
-                        RSP_enc, 0x4, 0, offset,
-                        false);
+          MacroAssembler _masm(cbuf);
+          __ movq(as_Register(Matcher::_regEncode[dst_first]), Address(rsp, offset));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movq    %s, [rsp + #%d]\t# spill",
                      Matcher::regName[dst_first],
                      offset);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) + 4; // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
         assert(!((dst_first & 1) == 0 && dst_first + 1 == dst_second), "no transform");
         int offset = ra_->reg2offset(src_first);
         if (cbuf) {
-          if (Matcher::_regEncode[dst_first] >= 8) {
-            emit_opcode(*cbuf, Assembler::REX_R);
-          }
-          emit_opcode(*cbuf, 0x8B);
-          encode_RegMem(*cbuf,
-                        Matcher::_regEncode[dst_first],
-                        RSP_enc, 0x4, 0, offset,
-                        false);
+          MacroAssembler _masm(cbuf);
+          __ movl(as_Register(Matcher::_regEncode[dst_first]), Address(rsp, offset));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movl    %s, [rsp + #%d]\t# spill",
                      Matcher::regName[dst_first],
                      offset);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[dst_first] < 8)
-           ? 3
-           : 4); // REX
       }
+      return 0;
     } else if (dst_first_rc == rc_float) {
       // mem-> xmm
       if ((src_first & 1) == 0 && src_first + 1 == src_second &&
@@ -1189,18 +1078,13 @@
           MacroAssembler _masm(cbuf);
           __ movdbl( as_XMMRegister(Matcher::_regEncode[dst_first]), Address(rsp, offset));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("%s  %s, [rsp + #%d]\t# spill",
                      UseXmmLoadAndClearUpper ? "movsd " : "movlpd",
                      Matcher::regName[dst_first],
                      offset);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[dst_first] >= 8)
-           ? 6
-           : (5 + ((UseAVX>0)?1:0))); // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1210,18 +1094,14 @@
           MacroAssembler _masm(cbuf);
           __ movflt( as_XMMRegister(Matcher::_regEncode[dst_first]), Address(rsp, offset));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movss   %s, [rsp + #%d]\t# spill",
                      Matcher::regName[dst_first],
                      offset);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[dst_first] >= 8)
-           ? 6
-           : (5 + ((UseAVX>0)?1:0))); // REX
       }
+      return 0;
     }
   } else if (src_first_rc == rc_int) {
     // gpr ->
@@ -1232,113 +1112,65 @@
         // 64-bit
         int offset = ra_->reg2offset(dst_first);
         if (cbuf) {
-          if (Matcher::_regEncode[src_first] < 8) {
-            emit_opcode(*cbuf, Assembler::REX_W);
-          } else {
-            emit_opcode(*cbuf, Assembler::REX_WR);
-          }
-          emit_opcode(*cbuf, 0x89);
-          encode_RegMem(*cbuf,
-                        Matcher::_regEncode[src_first],
-                        RSP_enc, 0x4, 0, offset,
-                        false);
+          MacroAssembler _masm(cbuf);
+          __ movq(Address(rsp, offset), as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movq    [rsp + #%d], %s\t# spill",
                      offset,
                      Matcher::regName[src_first]);
 #endif
         }
-        return ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) + 4; // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
         assert(!((dst_first & 1) == 0 && dst_first + 1 == dst_second), "no transform");
         int offset = ra_->reg2offset(dst_first);
         if (cbuf) {
-          if (Matcher::_regEncode[src_first] >= 8) {
-            emit_opcode(*cbuf, Assembler::REX_R);
-          }
-          emit_opcode(*cbuf, 0x89);
-          encode_RegMem(*cbuf,
-                        Matcher::_regEncode[src_first],
-                        RSP_enc, 0x4, 0, offset,
-                        false);
+          MacroAssembler _masm(cbuf);
+          __ movl(Address(rsp, offset), as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movl    [rsp + #%d], %s\t# spill",
                      offset,
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[src_first] < 8)
-           ? 3
-           : 4); // REX
       }
+      return 0;
     } else if (dst_first_rc == rc_int) {
       // gpr -> gpr
       if ((src_first & 1) == 0 && src_first + 1 == src_second &&
           (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
         // 64-bit
         if (cbuf) {
-          if (Matcher::_regEncode[dst_first] < 8) {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_W);
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_WB);
-            }
-          } else {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_WR);
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_WRB);
-            }
-          }
-          emit_opcode(*cbuf, 0x8B);
-          emit_rm(*cbuf, 0x3,
-                  Matcher::_regEncode[dst_first] & 7,
-                  Matcher::_regEncode[src_first] & 7);
+          MacroAssembler _masm(cbuf);
+          __ movq(as_Register(Matcher::_regEncode[dst_first]),
+                  as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movq    %s, %s\t# spill",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return 3; // REX
+        return 0;
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
         assert(!((dst_first & 1) == 0 && dst_first + 1 == dst_second), "no transform");
         if (cbuf) {
-          if (Matcher::_regEncode[dst_first] < 8) {
-            if (Matcher::_regEncode[src_first] >= 8) {
-              emit_opcode(*cbuf, Assembler::REX_B);
-            }
-          } else {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_R);
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_RB);
-            }
-          }
-          emit_opcode(*cbuf, 0x8B);
-          emit_rm(*cbuf, 0x3,
-                  Matcher::_regEncode[dst_first] & 7,
-                  Matcher::_regEncode[src_first] & 7);
+          MacroAssembler _masm(cbuf);
+          __ movl(as_Register(Matcher::_regEncode[dst_first]),
+                  as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movl    %s, %s\t# spill",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          (Matcher::_regEncode[src_first] < 8 && Matcher::_regEncode[dst_first] < 8)
-          ? 2
-          : 3; // REX
+        return 0;
       }
     } else if (dst_first_rc == rc_float) {
       // gpr -> xmm
@@ -1349,13 +1181,12 @@
           MacroAssembler _masm(cbuf);
           __ movdq( as_XMMRegister(Matcher::_regEncode[dst_first]), as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movdq   %s, %s\t# spill",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return 5; // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1364,17 +1195,14 @@
           MacroAssembler _masm(cbuf);
           __ movdl( as_XMMRegister(Matcher::_regEncode[dst_first]), as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movdl   %s, %s\t# spill",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          (Matcher::_regEncode[src_first] >= 8 || Matcher::_regEncode[dst_first] >= 8)
-          ? 5
-          : (4 + ((UseAVX>0)?1:0)); // REX
       }
+      return 0;
     }
   } else if (src_first_rc == rc_float) {
     // xmm ->
@@ -1388,17 +1216,12 @@
           MacroAssembler _masm(cbuf);
           __ movdbl( Address(rsp, offset), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movsd   [rsp + #%d], %s\t# spill",
                      offset,
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[src_first] >= 8)
-           ? 6
-           : (5 + ((UseAVX>0)?1:0))); // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1408,18 +1231,14 @@
           MacroAssembler _masm(cbuf);
           __ movflt(Address(rsp, offset), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movss   [rsp + #%d], %s\t# spill",
                      offset,
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[src_first] >=8)
-           ? 6
-           : (5 + ((UseAVX>0)?1:0))); // REX
       }
+      return 0;
     } else if (dst_first_rc == rc_int) {
       // xmm -> gpr
       if ((src_first & 1) == 0 && src_first + 1 == src_second &&
@@ -1429,13 +1248,12 @@
           MacroAssembler _masm(cbuf);
           __ movdq( as_Register(Matcher::_regEncode[dst_first]), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movdq   %s, %s\t# spill",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return 5; // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1444,17 +1262,14 @@
           MacroAssembler _masm(cbuf);
           __ movdl( as_Register(Matcher::_regEncode[dst_first]), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movdl   %s, %s\t# spill",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          (Matcher::_regEncode[src_first] >= 8 || Matcher::_regEncode[dst_first] >= 8)
-          ? 5
-          : (4 + ((UseAVX>0)?1:0)); // REX
       }
+      return 0;
     } else if (dst_first_rc == rc_float) {
       // xmm -> xmm
       if ((src_first & 1) == 0 && src_first + 1 == src_second &&
@@ -1464,17 +1279,13 @@
           MacroAssembler _masm(cbuf);
           __ movdbl( as_XMMRegister(Matcher::_regEncode[dst_first]), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("%s  %s, %s\t# spill",
                      UseXmmRegToRegMoveAll ? "movapd" : "movsd ",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          (Matcher::_regEncode[src_first] >= 8 || Matcher::_regEncode[dst_first] >= 8)
-          ? 5
-          : (4 + ((UseAVX>0)?1:0)); // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1483,42 +1294,35 @@
           MacroAssembler _masm(cbuf);
           __ movflt( as_XMMRegister(Matcher::_regEncode[dst_first]), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("%s  %s, %s\t# spill",
                      UseXmmRegToRegMoveAll ? "movaps" : "movss ",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return ((UseAVX>0) ? 5:
-          ((Matcher::_regEncode[src_first] >= 8 || Matcher::_regEncode[dst_first] >= 8)
-           ? (UseXmmRegToRegMoveAll ? 4 : 5)
-           : (UseXmmRegToRegMoveAll ? 3 : 4))); // REX
       }
+      return 0;
     }
   }
 
   assert(0," foo ");
   Unimplemented();
-
   return 0;
 }
 
 #ifndef PRODUCT
-void MachSpillCopyNode::format(PhaseRegAlloc *ra_, outputStream* st) const
-{
+void MachSpillCopyNode::format(PhaseRegAlloc *ra_, outputStream* st) const {
   implementation(NULL, ra_, false, st);
 }
 #endif
 
-void MachSpillCopyNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const
-{
+void MachSpillCopyNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
   implementation(&cbuf, ra_, false, NULL);
 }
 
-uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const
-{
-  return implementation(NULL, ra_, true, NULL);
+uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const {
+  return MachNode::size(ra_);
 }
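
With the hand-counted byte sizes gone from implementation(), size() now
defers to MachNode::size(ra_), which measures the node by emitting it into
a scratch buffer; that is why every "return <n>; // REX" arm above was
dropped and the encoding paths simply return 0.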
 
 //=============================================================================
@@ -1735,16 +1539,6 @@
   return true;
 }
 
-// Vector width in bytes
-const uint Matcher::vector_width_in_bytes(void) {
-  return 8;
-}
-
-// Vector ideal reg
-const uint Matcher::vector_ideal_reg(void) {
-  return Op_RegD;
-}
-
 // Is this branch offset short enough that a short branch can be used?
 //
 // NOTE: If the platform does not provide any short branch variants, then
@@ -1831,21 +1625,21 @@
 bool Matcher::can_be_java_arg(int reg)
 {
   return
-    reg ==  RDI_num || reg ==  RDI_H_num ||
-    reg ==  RSI_num || reg ==  RSI_H_num ||
-    reg ==  RDX_num || reg ==  RDX_H_num ||
-    reg ==  RCX_num || reg ==  RCX_H_num ||
-    reg ==   R8_num || reg ==   R8_H_num ||
-    reg ==   R9_num || reg ==   R9_H_num ||
-    reg ==  R12_num || reg ==  R12_H_num ||
-    reg == XMM0_num || reg == XMM0_H_num ||
-    reg == XMM1_num || reg == XMM1_H_num ||
-    reg == XMM2_num || reg == XMM2_H_num ||
-    reg == XMM3_num || reg == XMM3_H_num ||
-    reg == XMM4_num || reg == XMM4_H_num ||
-    reg == XMM5_num || reg == XMM5_H_num ||
-    reg == XMM6_num || reg == XMM6_H_num ||
-    reg == XMM7_num || reg == XMM7_H_num;
+    reg ==  RDI_num || reg == RDI_H_num ||
+    reg ==  RSI_num || reg == RSI_H_num ||
+    reg ==  RDX_num || reg == RDX_H_num ||
+    reg ==  RCX_num || reg == RCX_H_num ||
+    reg ==   R8_num || reg ==  R8_H_num ||
+    reg ==   R9_num || reg ==  R9_H_num ||
+    reg ==  R12_num || reg == R12_H_num ||
+    reg == XMM0_num || reg == XMM0b_num ||
+    reg == XMM1_num || reg == XMM1b_num ||
+    reg == XMM2_num || reg == XMM2b_num ||
+    reg == XMM3_num || reg == XMM3b_num ||
+    reg == XMM4_num || reg == XMM4b_num ||
+    reg == XMM5_num || reg == XMM5b_num ||
+    reg == XMM6_num || reg == XMM6b_num ||
+    reg == XMM7_num || reg == XMM7b_num;
 }
 
 bool Matcher::is_spillable_arg(int reg)
@@ -3220,10 +3014,11 @@
       OptoReg::Bad, // Op_RegI
       RAX_H_num,    // Op_RegP
       OptoReg::Bad, // Op_RegF
-      XMM0_H_num,   // Op_RegD
+      XMM0b_num,    // Op_RegD
       RAX_H_num     // Op_RegL
     };
-    assert(ARRAY_SIZE(hi) == _last_machine_leaf - 1, "missing type");
+    // Flags and vector registers are excluded from this table.
+    assert(ARRAY_SIZE(hi) == _last_machine_leaf - 5, "missing type");
     return OptoRegPair(hi[ideal_reg], lo[ideal_reg]);
   %}
 %}
@@ -3985,7 +3780,6 @@
   interface(REG_INTER);
 %}
 
-
 //----------Memory Operands----------------------------------------------------
 // Direct Memory Operand
 // operand direct(immP addr)
@@ -5416,61 +5210,6 @@
   ins_pipe(pipe_slow); // XXX
 %}
 
-// Load Aligned Packed Byte to XMM register
-instruct loadA8B(regD dst, memory mem) %{
-  match(Set dst (Load8B mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed8B" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Short to XMM register
-instruct loadA4S(regD dst, memory mem) %{
-  match(Set dst (Load4S mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed4S" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Char to XMM register
-instruct loadA4C(regD dst, memory mem) %{
-  match(Set dst (Load4C mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed4C" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Integer to XMM register
-instruct load2IU(regD dst, memory mem) %{
-  match(Set dst (Load2I mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed2I" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Single to XMM
-instruct loadA2F(regD dst, memory mem) %{
-  match(Set dst (Load2F mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed2F" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Load Effective Address
 instruct leaP8(rRegP dst, indOffset8 mem)
 %{
@@ -6200,39 +5939,6 @@
   ins_pipe(ialu_mem_imm);
 %}
 
-// Store Aligned Packed Byte XMM register to memory
-instruct storeA8B(memory mem, regD src) %{
-  match(Set mem (Store8B mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed8B" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Store Aligned Packed Char/Short XMM register to memory
-instruct storeA4C(memory mem, regD src) %{
-  match(Set mem (Store4C mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed4C" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Store Aligned Packed Integer XMM register to memory
-instruct storeA2I(memory mem, regD src) %{
-  match(Set mem (Store2I mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed2I" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Store CMS card-mark Immediate
 instruct storeImmCM0_reg(memory mem, immI0 zero)
 %{
@@ -6258,17 +5964,6 @@
   ins_pipe(ialu_mem_imm);
 %}
 
-// Store Aligned Packed Single Float XMM register to memory
-instruct storeA2F(memory mem, regD src) %{
-  match(Set mem (Store2F mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed2F" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Store Float
 instruct storeF(memory mem, regF src)
 %{
@@ -10377,172 +10072,6 @@
   ins_pipe( pipe_slow );
 %}
 
-// Replicate scalar to packed byte (1 byte) values in xmm
-instruct Repl8B_reg(regD dst, regD src) %{
-  match(Set dst (Replicate8B src));
-  format %{ "MOVDQA  $dst,$src\n\t"
-            "PUNPCKLBW $dst,$dst\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode %{
-    if ($dst$$reg != $src$$reg) {
-      __ movdqa($dst$$XMMRegister, $src$$XMMRegister);
-    }
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Replicate scalar to packed byte (1 byte) values in xmm
-instruct Repl8B_rRegI(regD dst, rRegI src) %{
-  match(Set dst (Replicate8B src));
-  format %{ "MOVD    $dst,$src\n\t"
-            "PUNPCKLBW $dst,$dst\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Replicate scalar zero to packed byte (1 byte) values in xmm
-instruct Repl8B_immI0(regD dst, immI0 zero) %{
-  match(Set dst (Replicate8B zero));
-  format %{ "PXOR  $dst,$dst\t! replicate8B" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed shore (2 byte) values in xmm
-instruct Repl4S_reg(regD dst, regD src) %{
-  match(Set dst (Replicate4S src));
-  format %{ "PSHUFLW $dst,$src,0x00\t! replicate4S" %}
-  ins_encode %{
-    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed shore (2 byte) values in xmm
-instruct Repl4S_rRegI(regD dst, rRegI src) %{
-  match(Set dst (Replicate4S src));
-  format %{ "MOVD    $dst,$src\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate4S" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar zero to packed short (2 byte) values in xmm
-instruct Repl4S_immI0(regD dst, immI0 zero) %{
-  match(Set dst (Replicate4S zero));
-  format %{ "PXOR  $dst,$dst\t! replicate4S" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed char (2 byte) values in xmm
-instruct Repl4C_reg(regD dst, regD src) %{
-  match(Set dst (Replicate4C src));
-  format %{ "PSHUFLW $dst,$src,0x00\t! replicate4C" %}
-  ins_encode %{
-    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed char (2 byte) values in xmm
-instruct Repl4C_rRegI(regD dst, rRegI src) %{
-  match(Set dst (Replicate4C src));
-  format %{ "MOVD    $dst,$src\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate4C" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar zero to packed char (2 byte) values in xmm
-instruct Repl4C_immI0(regD dst, immI0 zero) %{
-  match(Set dst (Replicate4C zero));
-  format %{ "PXOR  $dst,$dst\t! replicate4C" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed integer (4 byte) values in xmm
-instruct Repl2I_reg(regD dst, regD src) %{
-  match(Set dst (Replicate2I src));
-  format %{ "PSHUFD $dst,$src,0x00\t! replicate2I" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed integer (4 byte) values in xmm
-instruct Repl2I_rRegI(regD dst, rRegI src) %{
-  match(Set dst (Replicate2I src));
-  format %{ "MOVD   $dst,$src\n\t"
-            "PSHUFD $dst,$dst,0x00\t! replicate2I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar zero to packed integer (2 byte) values in xmm
-instruct Repl2I_immI0(regD dst, immI0 zero) %{
-  match(Set dst (Replicate2I zero));
-  format %{ "PXOR  $dst,$dst\t! replicate2I" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_reg(regD dst, regD src) %{
-  match(Set dst (Replicate2F src));
-  format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0xe0);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_regF(regD dst, regF src) %{
-  match(Set dst (Replicate2F src));
-  format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0xe0);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_immF0(regD dst, immF0 zero) %{
-  match(Set dst (Replicate2F zero));
-  format %{ "PXOR  $dst,$dst\t! replicate2F" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 
 // =======================================================================
 // fast clearing of an array
--- a/src/os_cpu/bsd_x86/vm/os_bsd_x86.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/os_cpu/bsd_x86/vm/os_bsd_x86.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -516,7 +516,12 @@
       }
     }
 
-    if (thread->thread_state() == _thread_in_Java) {
+    // We test if stub is already set (by the stack overflow code
+    // above) so it is not overwritten by the code that follows. This
+    // check is not required on other platforms, because on other
+    // platforms we check for SIGSEGV only or SIGBUS only, whereas here
+    // we have to check for both SIGSEGV and SIGBUS.
+    if (thread->thread_state() == _thread_in_Java && stub == NULL) {
       // Java thread running in Java code => find exception handler if any
       // a fault inside compiled code, the interpreter, or a stub
 
--- a/src/share/vm/adlc/adlparse.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/adlc/adlparse.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -115,6 +115,12 @@
       parse_err(SYNERR, "expected one of - instruct, operand, ins_attrib, op_attrib, source, register, pipeline, encode\n     Found %s",ident);
     }
   }
+  // Add reg_class spill_regs after parsing.
+  RegisterForm *regBlock = _AD.get_registers();
+  if (regBlock == NULL) {
+    parse_err(SEMERR, "Did not declare 'register' definitions");
+  }
+  regBlock->addSpillRegClass();
 
   // Done with parsing, check consistency.
 
@@ -768,11 +774,12 @@
 
 //------------------------------reg_parse--------------------------------------
 void ADLParser::reg_parse(void) {
-
-  // Create the RegisterForm for the architecture description.
-  RegisterForm *regBlock = new RegisterForm();    // Build new Source object
-  regBlock->_linenum = linenum();
-  _AD.addForm(regBlock);
+  RegisterForm *regBlock = _AD.get_registers(); // Information about register encoding
+  if (regBlock == NULL) {
+    // Create the RegisterForm for the architecture description.
+    regBlock = new RegisterForm();    // Build new Source object
+    _AD.addForm(regBlock);
+  }
 
   skipws();                       // Skip leading whitespace
   if (_curchar == '%' && *(_ptr+1) == '{') {
@@ -796,15 +803,11 @@
     parse_err(SYNERR, "Missing %c{ ... %c} block after register keyword.\n",'%','%');
     return;
   }
-
-  // Add reg_class spill_regs
-  regBlock->addSpillRegClass();
 }
 
 //------------------------------encode_parse-----------------------------------
 void ADLParser::encode_parse(void) {
   EncodeForm *encBlock;         // Information about instruction/operand encoding
-  char       *desc = NULL;      // String representation of encode rule
 
   _AD.getForm(&encBlock);
   if ( encBlock == NULL) {
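Taken together, the two adlparse.cpp hunks above move spill-register setup from the end of each register block to a single point after the whole parse, and let repeated register %{ ... %} blocks accumulate into one RegisterForm through a get-or-create lookup. A condensed sketch of that flow, using stand-in types rather than the real adlc classes:

// Stand-in types for illustration; the real classes live in adlc.
struct RegisterForm { void addSpillRegClass() { /* append "stack_slots" */ } };
struct ArchDesc {
  RegisterForm* _regs = nullptr;
  RegisterForm* get_registers() { return _regs; }
  void addForm(RegisterForm* f) { _regs = f; }
};

// Get-or-create: repeated register blocks share one RegisterForm.
RegisterForm* reg_parse_target(ArchDesc& AD) {
  RegisterForm* regBlock = AD.get_registers();
  if (regBlock == nullptr) {
    regBlock = new RegisterForm();
    AD.addForm(regBlock);
  }
  return regBlock;
}
// After all parsing, exactly once: AD.get_registers()->addSpillRegClass();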
--- a/src/share/vm/adlc/archDesc.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/adlc/archDesc.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+// Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -911,12 +911,24 @@
   // Find last character in idealOp, it specifies the type
   char  last_char = 0;
   const char *ptr = idealOp;
-  for( ; *ptr != '\0'; ++ptr) {
+  for (; *ptr != '\0'; ++ptr) {
     last_char = *ptr;
   }
 
+  // Match Vector types.
+  if (strncmp(idealOp, "Vec",3)==0) {
+    switch(last_char) {
+    case 'S':  return "TypeVect::VECTS";
+    case 'D':  return "TypeVect::VECTD";
+    case 'X':  return "TypeVect::VECTX";
+    case 'Y':  return "TypeVect::VECTY";
+    default:
+      internal_err("Vector type %s has an unrecognized size suffix\n",idealOp);
+    }
+  }
+
   // !!!!!
-  switch( last_char ) {
+  switch(last_char) {
   case 'I':    return "TypeInt::INT";
   case 'P':    return "TypePtr::BOTTOM";
   case 'N':    return "TypeNarrowOop::BOTTOM";
--- a/src/share/vm/adlc/forms.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/adlc/forms.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -265,47 +265,22 @@
   if( strcmp(opType,"LoadN")==0 )  return Form::idealN;
   if( strcmp(opType,"LoadRange")==0 )  return Form::idealI;
   if( strcmp(opType,"LoadS")==0 )  return Form::idealS;
-  if( strcmp(opType,"Load16B")==0 )  return Form::idealB;
-  if( strcmp(opType,"Load8B")==0 )  return Form::idealB;
-  if( strcmp(opType,"Load4B")==0 )  return Form::idealB;
-  if( strcmp(opType,"Load8C")==0 )  return Form::idealC;
-  if( strcmp(opType,"Load4C")==0 )  return Form::idealC;
-  if( strcmp(opType,"Load2C")==0 )  return Form::idealC;
-  if( strcmp(opType,"Load8S")==0 )  return Form::idealS;
-  if( strcmp(opType,"Load4S")==0 )  return Form::idealS;
-  if( strcmp(opType,"Load2S")==0 )  return Form::idealS;
-  if( strcmp(opType,"Load2D")==0 )  return Form::idealD;
-  if( strcmp(opType,"Load4F")==0 )  return Form::idealF;
-  if( strcmp(opType,"Load2F")==0 )  return Form::idealF;
-  if( strcmp(opType,"Load4I")==0 )  return Form::idealI;
-  if( strcmp(opType,"Load2I")==0 )  return Form::idealI;
-  if( strcmp(opType,"Load2L")==0 )  return Form::idealL;
+  if( strcmp(opType,"LoadVector")==0 )  return Form::idealV;
   assert( strcmp(opType,"Load") != 0, "Must type Loads" );
   return Form::none;
 }
 
 Form::DataType Form::is_store_to_memory(const char *opType) const {
   if( strcmp(opType,"StoreB")==0)  return Form::idealB;
-  if( strcmp(opType,"StoreCM")==0)  return Form::idealB;
+  if( strcmp(opType,"StoreCM")==0) return Form::idealB;
   if( strcmp(opType,"StoreC")==0)  return Form::idealC;
   if( strcmp(opType,"StoreD")==0)  return Form::idealD;
   if( strcmp(opType,"StoreF")==0)  return Form::idealF;
   if( strcmp(opType,"StoreI")==0)  return Form::idealI;
   if( strcmp(opType,"StoreL")==0)  return Form::idealL;
   if( strcmp(opType,"StoreP")==0)  return Form::idealP;
-  if( strcmp(opType,"StoreN")==0) return Form::idealN;
-  if( strcmp(opType,"Store16B")==0)  return Form::idealB;
-  if( strcmp(opType,"Store8B")==0)  return Form::idealB;
-  if( strcmp(opType,"Store4B")==0)  return Form::idealB;
-  if( strcmp(opType,"Store8C")==0)  return Form::idealC;
-  if( strcmp(opType,"Store4C")==0)  return Form::idealC;
-  if( strcmp(opType,"Store2C")==0)  return Form::idealC;
-  if( strcmp(opType,"Store2D")==0)  return Form::idealD;
-  if( strcmp(opType,"Store4F")==0)  return Form::idealF;
-  if( strcmp(opType,"Store2F")==0)  return Form::idealF;
-  if( strcmp(opType,"Store4I")==0)  return Form::idealI;
-  if( strcmp(opType,"Store2I")==0)  return Form::idealI;
-  if( strcmp(opType,"Store2L")==0)  return Form::idealL;
+  if( strcmp(opType,"StoreN")==0)  return Form::idealN;
+  if( strcmp(opType,"StoreVector")==0 )  return Form::idealV;
   assert( strcmp(opType,"Store") != 0, "Must type Stores" );
   return Form::none;
 }
--- a/src/share/vm/adlc/forms.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/adlc/forms.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -172,7 +172,8 @@
     idealB      =  6,  // Byte    type
     idealC      =  7,  // Char    type
     idealS      =  8,  // String  type
-    idealN      =  9   // Narrow oop types
+    idealN      =  9,  // Narrow oop types
+    idealV      = 10   // Vector  type
   };
   // Convert ideal name to a DataType, return DataType::none if not a 'ConX'
   Form::DataType  ideal_to_const_type(const char *ideal_type_name) const;
--- a/src/share/vm/adlc/formsopt.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/adlc/formsopt.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -66,7 +66,7 @@
 // for spill-slots/regs.
 void RegisterForm::addSpillRegClass() {
   // Stack slots start at the next available even register number.
-  _reg_ctr = (_reg_ctr+1) & ~1;
+  _reg_ctr = (_reg_ctr+7) & ~7;
   const char *rc_name   = "stack_slots";
   RegClass   *reg_class = new RegClass(rc_name);
   reg_class->_stack_or_reg = true;
@@ -150,9 +150,14 @@
 int RegisterForm::RegMask_Size() {
   // Need at least this many words
   int words_for_regs = (_reg_ctr + 31)>>5;
-  // Add a few for incoming & outgoing arguments to calls.
+  // The array of Register Mask bits should be large enough to cover
+  // all the machine registers and all parameters that need to be passed
+  // on the stack (stack registers) up to some interesting limit.  Methods
+  // that need more parameters will NOT be compiled.  On Intel, the limit
+  // is something like 90+ parameters.
+  // Add a few (3 words == 96 bits) for incoming & outgoing arguments to calls.
   // Round up to the next doubleword size.
-  return (words_for_regs + 2 + 1) & ~1;
+  return (words_for_regs + 3 + 1) & ~1;
 }
 
 void RegisterForm::dump() {                  // Debug printer
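A worked instance of the sizing arithmetic above, assuming an illustrative 96-slot register file:

#include <cassert>

// Mirrors the arithmetic in RegisterForm::RegMask_Size() for illustration.
int regmask_size_words(int reg_ctr) {
  int words_for_regs = (reg_ctr + 31) >> 5;  // 32 register bits per word
  return (words_for_regs + 3 + 1) & ~1;      // +3 words for args, round to even
}

int main() {
  // 96 registers -> 3 words of register bits, plus 3 words (96 bits) for
  // call arguments, rounded up to a doubleword -> 6 words (192 mask bits).
  assert(regmask_size_words(96) == 6);
  return 0;
}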
--- a/src/share/vm/adlc/formssel.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/adlc/formssel.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -432,6 +432,14 @@
   return  _matrule->is_ideal_store();
 }
 
+// Return 'true' if this instruction matches an ideal vector node
+bool InstructForm::is_vector() const {
+  if( _matrule == NULL ) return false;
+
+  return _matrule->is_vector();
+}
+
+
 // Return the input register that must match the output register
 // If this is not required, return 0
 uint InstructForm::two_address(FormDict &globals) {
@@ -751,6 +759,9 @@
 
   if (needs_base_oop_edge(globals)) return true;
 
+  if (is_vector()) return true;
+  if (is_mach_constant()) return true;
+
   return  false;
 }
 
@@ -3381,11 +3392,8 @@
     "StoreI","StoreL","StoreP","StoreN","StoreD","StoreF" ,
     "StoreB","StoreC","Store" ,"StoreFP",
     "LoadI", "LoadUI2L", "LoadL", "LoadP" ,"LoadN", "LoadD" ,"LoadF"  ,
-    "LoadB" , "LoadUB", "LoadUS" ,"LoadS" ,"Load"   ,
-    "Store4I","Store2I","Store2L","Store2D","Store4F","Store2F","Store16B",
-    "Store8B","Store4B","Store8C","Store4C","Store2C",
-    "Load4I" ,"Load2I" ,"Load2L" ,"Load2D" ,"Load4F" ,"Load2F" ,"Load16B" ,
-    "Load8B" ,"Load4B" ,"Load8C" ,"Load4C" ,"Load2C" ,"Load8S", "Load4S","Load2S",
+    "LoadB" , "LoadUB", "LoadUS" ,"LoadS" ,"Load" ,
+    "StoreVector", "LoadVector",
     "LoadRange", "LoadKlass", "LoadNKlass", "LoadL_unaligned", "LoadD_unaligned",
     "LoadPLocked",
     "StorePConditional", "StoreIConditional", "StoreLConditional",
@@ -3822,6 +3830,10 @@
          strcmp(opType,"RegL")==0 ||
          strcmp(opType,"RegF")==0 ||
          strcmp(opType,"RegD")==0 ||
+         strcmp(opType,"VecS")==0 ||
+         strcmp(opType,"VecD")==0 ||
+         strcmp(opType,"VecX")==0 ||
+         strcmp(opType,"VecY")==0 ||
          strcmp(opType,"Reg" )==0) ) {
       return 1;
     }
@@ -3938,19 +3950,12 @@
         strcmp(opType,"ReverseBytesL")==0 ||
         strcmp(opType,"ReverseBytesUS")==0 ||
         strcmp(opType,"ReverseBytesS")==0 ||
-        strcmp(opType,"Replicate16B")==0 ||
-        strcmp(opType,"Replicate8B")==0 ||
-        strcmp(opType,"Replicate4B")==0 ||
-        strcmp(opType,"Replicate8C")==0 ||
-        strcmp(opType,"Replicate4C")==0 ||
-        strcmp(opType,"Replicate8S")==0 ||
-        strcmp(opType,"Replicate4S")==0 ||
-        strcmp(opType,"Replicate4I")==0 ||
-        strcmp(opType,"Replicate2I")==0 ||
-        strcmp(opType,"Replicate2L")==0 ||
-        strcmp(opType,"Replicate4F")==0 ||
-        strcmp(opType,"Replicate2F")==0 ||
-        strcmp(opType,"Replicate2D")==0 ||
+        strcmp(opType,"ReplicateB")==0 ||
+        strcmp(opType,"ReplicateS")==0 ||
+        strcmp(opType,"ReplicateI")==0 ||
+        strcmp(opType,"ReplicateL")==0 ||
+        strcmp(opType,"ReplicateF")==0 ||
+        strcmp(opType,"ReplicateD")==0 ||
         0 /* 0 to line up columns nicely */ )
       return 1;
   }
@@ -4034,6 +4039,23 @@
   return ideal_load;
 }
 
+bool MatchRule::is_vector() const {
+  if( _rChild ) {
+    const char  *opType = _rChild->_opType;
+    if( strcmp(opType,"ReplicateB")==0 ||
+        strcmp(opType,"ReplicateS")==0 ||
+        strcmp(opType,"ReplicateI")==0 ||
+        strcmp(opType,"ReplicateL")==0 ||
+        strcmp(opType,"ReplicateF")==0 ||
+        strcmp(opType,"ReplicateD")==0 ||
+        strcmp(opType,"LoadVector")==0 ||
+        strcmp(opType,"StoreVector")==0 ||
+        0 /* 0 to line up columns nicely */ )
+      return true;
+  }
+  return false;
+}
+
 
 bool MatchRule::skip_antidep_check() const {
   // Some loads operate on what is effectively immutable memory so we
--- a/src/share/vm/adlc/formssel.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/adlc/formssel.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -160,6 +160,7 @@
   virtual bool        is_ideal_safepoint() const; // node matches 'SafePoint'
   virtual bool        is_ideal_nop() const;     // node matches 'Nop'
   virtual bool        is_ideal_control() const; // control node
+  virtual bool        is_vector() const;        // vector instruction
 
   virtual Form::CallType is_ideal_call() const; // matches ideal 'Call'
   virtual Form::DataType is_ideal_load() const; // node matches ideal 'LoadXNode'
@@ -1011,6 +1012,7 @@
   bool       is_ideal_goto() const;    // node matches ideal 'Goto'
   bool       is_ideal_loopEnd() const; // node matches ideal 'LoopEnd'
   bool       is_ideal_bool() const;    // node matches ideal 'Bool'
+  bool       is_vector() const;        // vector instruction
   Form::DataType is_ideal_load() const;// node matches ideal 'LoadXNode'
   // Should antidep checks be disabled for this rule
   // See definition of MatchRule::skip_antidep_check
--- a/src/share/vm/adlc/main.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/adlc/main.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -250,6 +250,7 @@
   AD.addInclude(AD._HPP_file, "opto/node.hpp");
   AD.addInclude(AD._HPP_file, "opto/regalloc.hpp");
   AD.addInclude(AD._HPP_file, "opto/subnode.hpp");
+  AD.addInclude(AD._HPP_file, "opto/vectornode.hpp");
   AD.addInclude(AD._CPP_CLONE_file, "precompiled.hpp");
   AD.addInclude(AD._CPP_CLONE_file, "adfiles", get_basename(AD._HPP_file._name));
   AD.addInclude(AD._CPP_EXPAND_file, "precompiled.hpp");
--- a/src/share/vm/classfile/vmSymbols.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/classfile/vmSymbols.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -111,6 +111,10 @@
   template(getBootClassPathEntryForClass_name,        "getBootClassPathEntryForClass")            \
   template(sun_misc_PostVMInitHook,                   "sun/misc/PostVMInitHook")                  \
                                                                                                   \
+  /* Java runtime version access */                                                               \
+  template(sun_misc_Version,                          "sun/misc/Version")                         \
+  template(java_runtime_name_name,                    "java_runtime_name")                        \
+                                                                                                  \
   /* class file format tags */                                                                    \
   template(tag_source_file,                           "SourceFile")                               \
   template(tag_inner_classes,                         "InnerClasses")                             \
--- a/src/share/vm/code/vmreg.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/code/vmreg.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -27,7 +27,7 @@
 #include "code/vmreg.hpp"
 
 // First VMReg value that could refer to a stack slot
-VMReg VMRegImpl::stack0 = (VMReg)(intptr_t)((ConcreteRegisterImpl::number_of_registers + 1) & ~1);
+VMReg VMRegImpl::stack0 = (VMReg)(intptr_t)((ConcreteRegisterImpl::number_of_registers + 7) & ~7);
 
 // VMRegs are 4 bytes wide on all platforms
 const int VMRegImpl::stack_slot_size = 4;
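Here, as in addSpillRegClass() above, pair alignment ((n + 1) & ~1) widens to 8-slot alignment ((n + 7) & ~7), so stack slots begin on a boundary wide enough for the largest vector. A minimal sketch of the idiom, with illustrative numbers:

#include <cassert>

// (n + 7) & ~7 rounds n up to the next multiple of 8;
// the old (n + 1) & ~1 only rounded up to the next even number.
int round_up_to_8(int n) { return (n + 7) & ~7; }

int main() {
  assert(round_up_to_8(53) == 56);  // e.g. 53 concrete registers -> stack0 at 56
  assert(round_up_to_8(56) == 56);  // already-aligned values are unchanged
  return 0;
}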
--- a/src/share/vm/memory/universe.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/memory/universe.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -273,7 +273,7 @@
   }
 
   static klassOop typeArrayKlassObj(BasicType t) {
-    assert((uint)t < T_VOID+1, "range check");
+    assert((uint)t < T_VOID+1, err_msg("range check for type: %s", type2name(t)));
     assert(_typeArrayKlassObjs[t] != NULL, "domain check");
     return _typeArrayKlassObjs[t];
   }
--- a/src/share/vm/opto/c2_globals.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/c2_globals.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -81,6 +81,13 @@
   product(intx, MaxLoopPad, (OptoLoopAlignment-1),                          \
           "Align a loop if padding size in bytes is less or equal to this value") \
                                                                             \
+  product(intx, MaxVectorSize, 32,                                          \
+          "Max vector size in bytes, "                                      \
+          "actual size could be less depending on elements type")           \
+                                                                            \
+  product(bool, AlignVector, false,                                         \
+          "Perform vector store/load alignment in loop")                    \
+                                                                            \
   product(intx, NumberOfLoopInstrToAlign, 4,                                \
           "Number of first instructions in a loop to align")                \
                                                                             \
@@ -292,9 +299,12 @@
   develop(bool, SuperWordRTDepCheck, false,                                 \
           "Enable runtime dependency checks.")                              \
                                                                             \
-  product(bool, TraceSuperWord, false,                                      \
+  notproduct(bool, TraceSuperWord, false,                                   \
           "Trace superword transforms")                                     \
                                                                             \
+  notproduct(bool, TraceNewVectors, false,                                  \
+          "Trace creation of Vector nodes")                                 \
+                                                                            \
   product_pd(bool, OptoBundling,                                            \
           "Generate nops to fill i-cache lines")                            \
                                                                             \
--- a/src/share/vm/opto/callGenerator.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/callGenerator.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -172,9 +172,11 @@
 
 JVMState* DynamicCallGenerator::generate(JVMState* jvms) {
   GraphKit kit(jvms);
+  Compile* C = kit.C;
+  PhaseGVN& gvn = kit.gvn();
 
-  if (kit.C->log() != NULL) {
-    kit.C->log()->elem("dynamic_call bci='%d'", jvms->bci());
+  if (C->log() != NULL) {
+    C->log()->elem("dynamic_call bci='%d'", jvms->bci());
   }
 
   // Get the constant pool cache from the caller class.
@@ -190,18 +192,21 @@
   size_t call_site_offset = cpcache->get_f1_offset(index);
 
   // Load the CallSite object from the constant pool cache.
-  const TypeOopPtr* cpcache_ptr = TypeOopPtr::make_from_constant(cpcache);
-  Node* cpcache_adr = kit.makecon(cpcache_ptr);
-  Node* call_site_adr = kit.basic_plus_adr(cpcache_adr, cpcache_adr, call_site_offset);
-  Node* call_site = kit.make_load(kit.control(), call_site_adr, TypeInstPtr::BOTTOM, T_OBJECT, Compile::AliasIdxRaw);
+  const TypeOopPtr* cpcache_type   = TypeOopPtr::make_from_constant(cpcache);  // returns TypeAryPtr of type T_OBJECT
+  const TypeOopPtr* call_site_type = TypeOopPtr::make_from_klass(C->env()->CallSite_klass());
+  Node* cpcache_adr   = kit.makecon(cpcache_type);
+  Node* call_site_adr = kit.basic_plus_adr(cpcache_adr, call_site_offset);
+  // The oops in the constant pool cache are not compressed; load them as raw pointers.
+  Node* call_site     = kit.make_load(kit.control(), call_site_adr, call_site_type, T_ADDRESS, Compile::AliasIdxRaw);
 
   // Load the target MethodHandle from the CallSite object.
-  Node* target_mh_adr = kit.basic_plus_adr(call_site, call_site, java_lang_invoke_CallSite::target_offset_in_bytes());
-  Node* target_mh = kit.make_load(kit.control(), target_mh_adr, TypeInstPtr::BOTTOM, T_OBJECT);
+  const TypeOopPtr* target_type = TypeOopPtr::make_from_klass(C->env()->MethodHandle_klass());
+  Node* target_mh_adr = kit.basic_plus_adr(call_site, java_lang_invoke_CallSite::target_offset_in_bytes());
+  Node* target_mh     = kit.make_load(kit.control(), target_mh_adr, target_type, T_OBJECT);
 
   address resolve_stub = SharedRuntime::get_resolve_opt_virtual_call_stub();
 
-  CallStaticJavaNode *call = new (kit.C, tf()->domain()->cnt()) CallStaticJavaNode(tf(), resolve_stub, method(), kit.bci());
+  CallStaticJavaNode* call = new (C, tf()->domain()->cnt()) CallStaticJavaNode(tf(), resolve_stub, method(), kit.bci());
   // invokedynamic is treated as an optimized invokevirtual.
   call->set_optimized_virtual(true);
   // Take extra care (in the presence of argument motion) not to trash the SP:
@@ -785,9 +790,10 @@
 
 JVMState* PredictedDynamicCallGenerator::generate(JVMState* jvms) {
   GraphKit kit(jvms);
+  Compile* C = kit.C;
   PhaseGVN& gvn = kit.gvn();
 
-  CompileLog* log = kit.C->log();
+  CompileLog* log = C->log();
   if (log != NULL) {
     log->elem("predicted_dynamic_call bci='%d'", jvms->bci());
   }
@@ -803,8 +809,8 @@
     Node* receiver = kit.argument(0);
 
     // Check if the MethodHandle is the expected one
-    Node* cmp = gvn.transform(new(kit.C, 3) CmpPNode(receiver, predicted_mh));
-    bol = gvn.transform(new(kit.C, 2) BoolNode(cmp, BoolTest::eq) );
+    Node* cmp = gvn.transform(new (C, 3) CmpPNode(receiver, predicted_mh));
+    bol = gvn.transform(new (C, 2) BoolNode(cmp, BoolTest::eq) );
   } else {
     // Get the constant pool cache from the caller class.
     ciMethod* caller_method = jvms->method();
@@ -818,22 +824,25 @@
     size_t call_site_offset = cpcache->get_f1_offset(index);
 
     // Load the CallSite object from the constant pool cache.
-    const TypeOopPtr* cpcache_ptr = TypeOopPtr::make_from_constant(cpcache);
-    Node* cpcache_adr   = kit.makecon(cpcache_ptr);
-    Node* call_site_adr = kit.basic_plus_adr(cpcache_adr, cpcache_adr, call_site_offset);
-    Node* call_site     = kit.make_load(kit.control(), call_site_adr, TypeInstPtr::BOTTOM, T_OBJECT, Compile::AliasIdxRaw);
+    const TypeOopPtr* cpcache_type   = TypeOopPtr::make_from_constant(cpcache);  // returns TypeAryPtr of type T_OBJECT
+    const TypeOopPtr* call_site_type = TypeOopPtr::make_from_klass(C->env()->CallSite_klass());
+    Node* cpcache_adr   = kit.makecon(cpcache_type);
+    Node* call_site_adr = kit.basic_plus_adr(cpcache_adr, call_site_offset);
+    // The oops in the constant pool cache are not compressed; load them as raw pointers.
+    Node* call_site     = kit.make_load(kit.control(), call_site_adr, call_site_type, T_ADDRESS, Compile::AliasIdxRaw);
 
     // Load the target MethodHandle from the CallSite object.
+    const TypeOopPtr* target_type = TypeOopPtr::make_from_klass(C->env()->MethodHandle_klass());
     Node* target_adr = kit.basic_plus_adr(call_site, call_site, java_lang_invoke_CallSite::target_offset_in_bytes());
-    Node* target_mh  = kit.make_load(kit.control(), target_adr, TypeInstPtr::BOTTOM, T_OBJECT);
+    Node* target_mh  = kit.make_load(kit.control(), target_adr, target_type, T_OBJECT);
 
     // Check if the MethodHandle is still the same.
-    Node* cmp = gvn.transform(new(kit.C, 3) CmpPNode(target_mh, predicted_mh));
-    bol = gvn.transform(new(kit.C, 2) BoolNode(cmp, BoolTest::eq) );
+    Node* cmp = gvn.transform(new (C, 3) CmpPNode(target_mh, predicted_mh));
+    bol = gvn.transform(new (C, 2) BoolNode(cmp, BoolTest::eq) );
   }
   IfNode* iff = kit.create_and_xform_if(kit.control(), bol, _hit_prob, COUNT_UNKNOWN);
-  kit.set_control( gvn.transform(new(kit.C, 1) IfTrueNode (iff)));
-  Node* slow_ctl = gvn.transform(new(kit.C, 1) IfFalseNode(iff));
+  kit.set_control( gvn.transform(new (C, 1) IfTrueNode (iff)));
+  Node* slow_ctl = gvn.transform(new (C, 1) IfFalseNode(iff));
 
   SafePointNode* slow_map = NULL;
   JVMState* slow_jvms;
@@ -882,7 +891,7 @@
 
   // Finish the diamond.
   kit.C->set_has_split_ifs(true); // Has chance for split-if optimization
-  RegionNode* region = new (kit.C, 3) RegionNode(3);
+  RegionNode* region = new (C, 3) RegionNode(3);
   region->init_req(1, kit.control());
   region->init_req(2, slow_map->control());
   kit.set_control(gvn.transform(region));
--- a/src/share/vm/opto/chaitin.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/chaitin.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -75,6 +75,7 @@
   // Flags
   if( _is_oop ) tty->print("Oop ");
   if( _is_float ) tty->print("Float ");
+  if( _is_vector ) tty->print("Vector ");
   if( _was_spilled1 ) tty->print("Spilled ");
   if( _was_spilled2 ) tty->print("Spilled2 ");
   if( _direct_conflict ) tty->print("Direct_conflict ");
@@ -479,16 +480,18 @@
 
   // Move important info out of the live_arena to longer lasting storage.
   alloc_node_regs(_names.Size());
-  for( uint i=0; i < _names.Size(); i++ ) {
-    if( _names[i] ) {           // Live range associated with Node?
-      LRG &lrg = lrgs( _names[i] );
-      if( lrg.num_regs() == 1 ) {
-        _node_regs[i].set1( lrg.reg() );
+  for (uint i=0; i < _names.Size(); i++) {
+    if (_names[i]) {           // Live range associated with Node?
+      LRG &lrg = lrgs(_names[i]);
+      if (!lrg.alive()) {
+        _node_regs[i].set_bad();
+      } else if (lrg.num_regs() == 1) {
+        _node_regs[i].set1(lrg.reg());
       } else {                  // Must be a register-pair
-        if( !lrg._fat_proj ) {  // Must be aligned adjacent register pair
+        if (!lrg._fat_proj) {   // Must be aligned adjacent register pair
           // Live ranges record the highest register in their mask.
           // We want the low register for the AD file writer's convenience.
-          _node_regs[i].set2( OptoReg::add(lrg.reg(),-1) );
+          _node_regs[i].set2( OptoReg::add(lrg.reg(),(1-lrg.num_regs())) );
         } else {                // Misaligned; extract 2 bits
           OptoReg::Name hi = lrg.reg(); // Get hi register
           lrg.Remove(hi);       // Yank from mask
@@ -568,7 +571,7 @@
         // Check for float-vs-int live range (used in register-pressure
         // calculations)
         const Type *n_type = n->bottom_type();
-        if( n_type->is_floatingpoint() )
+        if (n_type->is_floatingpoint())
           lrg._is_float = 1;
 
         // Check for twice prior spilling.  Once prior spilling might have
@@ -599,18 +602,28 @@
         // Limit result register mask to acceptable registers
         const RegMask &rm = n->out_RegMask();
         lrg.AND( rm );
-        // Check for bound register masks
-        const RegMask &lrgmask = lrg.mask();
-        if( lrgmask.is_bound1() || lrgmask.is_bound2() )
-          lrg._is_bound = 1;
-
-        // Check for maximum frequency value
-        if( lrg._maxfreq < b->_freq )
-          lrg._maxfreq = b->_freq;
 
         int ireg = n->ideal_reg();
         assert( !n->bottom_type()->isa_oop_ptr() || ireg == Op_RegP,
                 "oops must be in Op_RegP's" );
+
+        // Check for vector live range (only if vector register is used).
+        // On SPARC a vector uses RegD, which could be misaligned, so it is
+        // not processed as a vector in the register allocator.
+        if (RegMask::is_vector(ireg))
+          lrg._is_vector = 1;
+        assert(n_type->isa_vect() == NULL || lrg._is_vector || ireg == Op_RegD,
+               "vector must be in vector registers");
+
+        // Check for bound register masks
+        const RegMask &lrgmask = lrg.mask();
+        if (lrgmask.is_bound(ireg))
+          lrg._is_bound = 1;
+
+        // Check for maximum frequency value
+        if (lrg._maxfreq < b->_freq)
+          lrg._maxfreq = b->_freq;
+
         // Check for oop-iness, or long/double
         // Check for multi-kill projection
         switch( ireg ) {
@@ -689,7 +702,7 @@
           // AND changes how we count interferences.  A mis-aligned
           // double can interfere with TWO aligned pairs, or effectively
           // FOUR registers!
-          if( rm.is_misaligned_Pair() ) {
+          if (rm.is_misaligned_pair()) {
             lrg._fat_proj = 1;
             lrg._is_bound = 1;
           }
@@ -706,6 +719,33 @@
           lrg.set_reg_pressure(1);
 #endif
           break;
+        case Op_VecS:
+          assert(Matcher::vector_size_supported(T_BYTE,4), "sanity");
+          assert(RegMask::num_registers(Op_VecS) == RegMask::SlotsPerVecS, "sanity");
+          lrg.set_num_regs(RegMask::SlotsPerVecS);
+          lrg.set_reg_pressure(1);
+          break;
+        case Op_VecD:
+          assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecD), "sanity");
+          assert(RegMask::num_registers(Op_VecD) == RegMask::SlotsPerVecD, "sanity");
+          assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecD), "vector should be aligned");
+          lrg.set_num_regs(RegMask::SlotsPerVecD);
+          lrg.set_reg_pressure(1);
+          break;
+        case Op_VecX:
+          assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecX), "sanity");
+          assert(RegMask::num_registers(Op_VecX) == RegMask::SlotsPerVecX, "sanity");
+          assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecX), "vector should be aligned");
+          lrg.set_num_regs(RegMask::SlotsPerVecX);
+          lrg.set_reg_pressure(1);
+          break;
+        case Op_VecY:
+          assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecY), "sanity");
+          assert(RegMask::num_registers(Op_VecY) == RegMask::SlotsPerVecY, "sanity");
+          assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecY), "vector should be aligned");
+          lrg.set_num_regs(RegMask::SlotsPerVecY);
+          lrg.set_reg_pressure(1);
+          break;
         default:
           ShouldNotReachHere();
         }
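Each new case sets a register pressure of one while sizing the live range by slot count. A sketch of the correspondence; the SlotsPerVec values are assumptions inferred from the asserts above (element count times element size, at 4 bytes per 32-bit mask slot):

// Assumed slot counts, 4 bytes per mask slot -- illustration only.
enum VecSlots {
  SlotsPerVecS = 1,  //  4-byte vector
  SlotsPerVecD = 2,  //  8-byte vector
  SlotsPerVecX = 4,  // 16-byte vector (XMM)
  SlotsPerVecY = 8   // 32-byte vector (YMM)
};

int slots_for_vector_bytes(int bytes) { return bytes / 4; }
// e.g. slots_for_vector_bytes(16) == SlotsPerVecX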
@@ -763,24 +803,38 @@
         } else {
           lrg.AND( rm );
         }
+
         // Check for bound register masks
         const RegMask &lrgmask = lrg.mask();
-        if( lrgmask.is_bound1() || lrgmask.is_bound2() )
+        int kreg = n->in(k)->ideal_reg();
+        bool is_vect = RegMask::is_vector(kreg);
+        assert(n->in(k)->bottom_type()->isa_vect() == NULL ||
+               is_vect || kreg == Op_RegD,
+               "vector must be in vector registers");
+        if (lrgmask.is_bound(kreg))
           lrg._is_bound = 1;
+
         // If this use of a double forces a mis-aligned double,
         // flag as '_fat_proj' - really flag as allowing misalignment
         // AND changes how we count interferences.  A mis-aligned
         // double can interfere with TWO aligned pairs, or effectively
         // FOUR registers!
-        if( lrg.num_regs() == 2 && !lrg._fat_proj && rm.is_misaligned_Pair() ) {
+#ifdef ASSERT
+        if (is_vect) {
+          assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
+          assert(!lrg._fat_proj, "sanity");
+          assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity");
+        }
+#endif
+        if (!is_vect && lrg.num_regs() == 2 && !lrg._fat_proj && rm.is_misaligned_pair()) {
           lrg._fat_proj = 1;
           lrg._is_bound = 1;
         }
         // if the LRG is an unaligned pair, we will have to spill
         // so clear the LRG's register mask if it is not already spilled
-        if ( !n->is_SpillCopy() &&
-               (lrg._def == NULL || lrg.is_multidef() || !lrg._def->is_SpillCopy()) &&
-               lrgmask.is_misaligned_Pair()) {
+        if (!is_vect && !n->is_SpillCopy() &&
+            (lrg._def == NULL || lrg.is_multidef() || !lrg._def->is_SpillCopy()) &&
+            lrgmask.is_misaligned_pair()) {
           lrg.Clear();
         }
 
@@ -793,12 +847,14 @@
   } // end for all blocks
 
   // Final per-liverange setup
-  for( uint i2=0; i2<_maxlrg; i2++ ) {
+  for (uint i2=0; i2<_maxlrg; i2++) {
     LRG &lrg = lrgs(i2);
-    if( lrg.num_regs() == 2 && !lrg._fat_proj )
-      lrg.ClearToPairs();
+    assert(!lrg._is_vector || !lrg._fat_proj, "sanity");
+    if (lrg.num_regs() > 1 && !lrg._fat_proj) {
+      lrg.clear_to_sets();
+    }
     lrg.compute_set_mask_size();
-    if( lrg.not_free() ) {      // Handle case where we lose from the start
+    if (lrg.not_free()) {      // Handle case where we lose from the start
       lrg.set_reg(OptoReg::Name(LRG::SPILL_REG));
       lrg._direct_conflict = 1;
     }
@@ -1104,22 +1160,17 @@
       // Choose a color which is legal for him
       RegMask tempmask = lrg.mask();
       tempmask.AND(lrgs(copy_lrg).mask());
-      OptoReg::Name reg;
-      if( lrg.num_regs() == 1 ) {
-        reg = tempmask.find_first_elem();
-      } else {
-        tempmask.ClearToPairs();
-        reg = tempmask.find_first_pair();
-      }
-      if( OptoReg::is_valid(reg) )
+      tempmask.clear_to_sets(lrg.num_regs());
+      OptoReg::Name reg = tempmask.find_first_set(lrg.num_regs());
+      if (OptoReg::is_valid(reg))
         return reg;
     }
   }
 
   // If no bias info exists, just go with the register selection ordering
-  if( lrg.num_regs() == 2 ) {
-    // Find an aligned pair
-    return OptoReg::add(lrg.mask().find_first_pair(),chunk);
+  if (lrg._is_vector || lrg.num_regs() == 2) {
+    // Find an aligned set
+    return OptoReg::add(lrg.mask().find_first_set(lrg.num_regs()),chunk);
   }
 
   // CNC - Fun hack.  Alternate 1st and 2nd selection.  Enables post-allocate
@@ -1149,6 +1200,7 @@
     // Use a heuristic to "bias" the color choice
     return bias_color(lrg, chunk);
 
+  assert(!lrg._is_vector, "should not be a vector here");
   assert( lrg.num_regs() >= 2, "dead live ranges do not color" );
 
   // Fat-proj case or misaligned double argument.
@@ -1238,14 +1290,16 @@
     }
     //assert(is_allstack == lrg->mask().is_AllStack(), "nbrs must not change AllStackedness");
     // Aligned pairs need aligned masks
-    if( lrg->num_regs() == 2 && !lrg->_fat_proj )
-      lrg->ClearToPairs();
+    assert(!lrg->_is_vector || !lrg->_fat_proj, "sanity");
+    if (lrg->num_regs() > 1 && !lrg->_fat_proj) {
+      lrg->clear_to_sets();
+    }
 
     // Check if a color is available and if so pick the color
     OptoReg::Name reg = choose_color( *lrg, chunk );
 #ifdef SPARC
     debug_only(lrg->compute_set_mask_size());
-    assert(lrg->num_regs() != 2 || lrg->is_bound() || is_even(reg-1), "allocate all doubles aligned");
+    assert(lrg->num_regs() < 2 || lrg->is_bound() || is_even(reg-1), "allocate all doubles aligned");
 #endif
 
     //---------------
@@ -1277,17 +1331,16 @@
       // If the live range is not bound, then we actually had some choices
       // to make.  In this case, the mask has more bits in it than the colors
       // chosen.  Restrict the mask to just what was picked.
-      if( lrg->num_regs() == 1 ) { // Size 1 live range
+      int n_regs = lrg->num_regs();
+      assert(!lrg->_is_vector || !lrg->_fat_proj, "sanity");
+      if (n_regs == 1 || !lrg->_fat_proj) {
+        assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecY, "sanity");
         lrg->Clear();           // Clear the mask
         lrg->Insert(reg);       // Set regmask to match selected reg
-        lrg->set_mask_size(1);
-      } else if( !lrg->_fat_proj ) {
-        // For pairs, also insert the low bit of the pair
-        assert( lrg->num_regs() == 2, "unbound fatproj???" );
-        lrg->Clear();           // Clear the mask
-        lrg->Insert(reg);       // Set regmask to match selected reg
-        lrg->Insert(OptoReg::add(reg,-1));
-        lrg->set_mask_size(2);
+        // For vectors and pairs, also insert the low bit of the pair
+        for (int i = 1; i < n_regs; i++)
+          lrg->Insert(OptoReg::add(reg,-i));
+        lrg->set_mask_size(n_regs);
       } else {                  // Else fatproj
         // mask must be equal to fatproj bits, by definition
       }
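A live range records the highest register of its set, so restricting the mask after a color choice means inserting all n_regs consecutive slots downward from the selection; the same convention explains the earlier set2() change, where OptoReg::add(reg, 1 - num_regs) recovers the low register (e.g. 11 + 1 - 4 = 8). A small illustration with a hypothetical helper:

#include <set>

// Rebuild a register mask from the chosen (highest) slot downward.
std::set<int> mask_for_selection(int reg, int n_regs) {
  std::set<int> mask;
  for (int i = 0; i < n_regs; i++) {
    mask.insert(reg - i);
  }
  return mask;  // e.g. reg = 11, n_regs = 4 (VecX) -> {8, 9, 10, 11}
}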
@@ -1483,7 +1536,7 @@
 
   // Check for AddP-related opcodes
   if( !derived->is_Phi() ) {
-    assert( derived->as_Mach()->ideal_Opcode() == Op_AddP, "" );
+    assert(derived->as_Mach()->ideal_Opcode() == Op_AddP, err_msg("but is: %s", derived->Name()));
     Node *base = derived->in(AddPNode::Base);
     derived_base_map[derived->_idx] = base;
     return base;
@@ -1860,12 +1913,20 @@
       sprintf(buf,"L%d",lidx);  // No register binding yet
     } else if( !lidx ) {        // Special, not allocated value
       strcpy(buf,"Special");
-    } else if( (lrgs(lidx).num_regs() == 1)
-                ? !lrgs(lidx).mask().is_bound1()
-                : !lrgs(lidx).mask().is_bound2() ) {
-      sprintf(buf,"L%d",lidx); // No register binding yet
-    } else {                    // Hah!  We have a bound machine register
-      print_reg( lrgs(lidx).reg(), this, buf );
+    } else {
+      if (lrgs(lidx)._is_vector) {
+        if (lrgs(lidx).mask().is_bound_set(lrgs(lidx).num_regs()))
+          print_reg( lrgs(lidx).reg(), this, buf ); // a bound machine register
+        else
+          sprintf(buf,"L%d",lidx); // No register binding yet
+      } else if( (lrgs(lidx).num_regs() == 1)
+                 ? lrgs(lidx).mask().is_bound1()
+                 : lrgs(lidx).mask().is_bound_pair() ) {
+        // Hah!  We have a bound machine register
+        print_reg( lrgs(lidx).reg(), this, buf );
+      } else {
+        sprintf(buf,"L%d",lidx); // No register binding yet
+      }
     }
   }
   return buf+strlen(buf);
--- a/src/share/vm/opto/chaitin.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/chaitin.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -99,8 +99,15 @@
   void set_mask_size( int size ) {
     assert((size == 65535) || (size == (int)_mask.Size()), "");
     _mask_size = size;
-    debug_only(_msize_valid=1;)
-    debug_only( if( _num_regs == 2 && !_fat_proj ) _mask.VerifyPairs(); )
+#ifdef ASSERT
+    _msize_valid=1;
+    if (_is_vector) {
+      assert(!_fat_proj, "sanity");
+      _mask.verify_sets(_num_regs);
+    } else if (_num_regs == 2 && !_fat_proj) {
+      _mask.verify_pairs();
+    }
+#endif
   }
   void compute_set_mask_size() { set_mask_size(compute_mask_size()); }
   int mask_size() const { assert( _msize_valid, "mask size not valid" );
@@ -116,7 +123,8 @@
   void Set_All() { _mask.Set_All(); debug_only(_msize_valid=1); _mask_size = RegMask::CHUNK_SIZE; }
   void Insert( OptoReg::Name reg ) { _mask.Insert(reg);  debug_only(_msize_valid=0;) }
   void Remove( OptoReg::Name reg ) { _mask.Remove(reg);  debug_only(_msize_valid=0;) }
-  void ClearToPairs() { _mask.ClearToPairs(); debug_only(_msize_valid=0;) }
+  void clear_to_pairs() { _mask.clear_to_pairs(); debug_only(_msize_valid=0;) }
+  void clear_to_sets()  { _mask.clear_to_sets(_num_regs); debug_only(_msize_valid=0;) }
 
   // Number of registers this live range uses when it colors
 private:
@@ -150,6 +158,7 @@
 
   uint   _is_oop:1,             // Live-range holds an oop
          _is_float:1,           // True if in float registers
+         _is_vector:1,          // True if in vector registers
          _was_spilled1:1,       // True if prior spilling on def
          _was_spilled2:1,       // True if twice prior spilling on def
          _is_bound:1,           // live range starts life with no
--- a/src/share/vm/opto/classes.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/classes.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -245,14 +245,12 @@
 macro(XorL)
 macro(Vector)
 macro(AddVB)
-macro(AddVC)
 macro(AddVS)
 macro(AddVI)
 macro(AddVL)
 macro(AddVF)
 macro(AddVD)
 macro(SubVB)
-macro(SubVC)
 macro(SubVS)
 macro(SubVI)
 macro(SubVL)
@@ -263,74 +261,36 @@
 macro(DivVF)
 macro(DivVD)
 macro(LShiftVB)
-macro(LShiftVC)
 macro(LShiftVS)
 macro(LShiftVI)
-macro(URShiftVB)
-macro(URShiftVC)
-macro(URShiftVS)
-macro(URShiftVI)
+macro(RShiftVB)
+macro(RShiftVS)
+macro(RShiftVI)
 macro(AndV)
 macro(OrV)
 macro(XorV)
-macro(VectorLoad)
-macro(Load16B)
-macro(Load8B)
-macro(Load4B)
-macro(Load8C)
-macro(Load4C)
-macro(Load2C)
-macro(Load8S)
-macro(Load4S)
-macro(Load2S)
-macro(Load4I)
-macro(Load2I)
-macro(Load2L)
-macro(Load4F)
-macro(Load2F)
-macro(Load2D)
-macro(VectorStore)
-macro(Store16B)
-macro(Store8B)
-macro(Store4B)
-macro(Store8C)
-macro(Store4C)
-macro(Store2C)
-macro(Store4I)
-macro(Store2I)
-macro(Store2L)
-macro(Store4F)
-macro(Store2F)
-macro(Store2D)
+macro(LoadVector)
+macro(StoreVector)
 macro(Pack)
 macro(PackB)
 macro(PackS)
-macro(PackC)
 macro(PackI)
 macro(PackL)
 macro(PackF)
 macro(PackD)
-macro(Pack2x1B)
-macro(Pack2x2B)
-macro(Replicate16B)
-macro(Replicate8B)
-macro(Replicate4B)
-macro(Replicate8S)
-macro(Replicate4S)
-macro(Replicate2S)
-macro(Replicate8C)
-macro(Replicate4C)
-macro(Replicate2C)
-macro(Replicate4I)
-macro(Replicate2I)
-macro(Replicate2L)
-macro(Replicate4F)
-macro(Replicate2F)
-macro(Replicate2D)
+macro(Pack2L)
+macro(Pack2D)
+macro(ReplicateB)
+macro(ReplicateS)
+macro(ReplicateI)
+macro(ReplicateL)
+macro(ReplicateF)
+macro(ReplicateD)
 macro(Extract)
 macro(ExtractB)
+macro(ExtractUB)
+macro(ExtractC)
 macro(ExtractS)
-macro(ExtractC)
 macro(ExtractI)
 macro(ExtractL)
 macro(ExtractF)
--- a/src/share/vm/opto/compile.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/compile.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -2591,38 +2591,12 @@
     }
     break;
 
-  case Op_Load16B:
-  case Op_Load8B:
-  case Op_Load4B:
-  case Op_Load8S:
-  case Op_Load4S:
-  case Op_Load2S:
-  case Op_Load8C:
-  case Op_Load4C:
-  case Op_Load2C:
-  case Op_Load4I:
-  case Op_Load2I:
-  case Op_Load2L:
-  case Op_Load4F:
-  case Op_Load2F:
-  case Op_Load2D:
-  case Op_Store16B:
-  case Op_Store8B:
-  case Op_Store4B:
-  case Op_Store8C:
-  case Op_Store4C:
-  case Op_Store2C:
-  case Op_Store4I:
-  case Op_Store2I:
-  case Op_Store2L:
-  case Op_Store4F:
-  case Op_Store2F:
-  case Op_Store2D:
+  case Op_LoadVector:
+  case Op_StoreVector:
     break;
 
   case Op_PackB:
   case Op_PackS:
-  case Op_PackC:
   case Op_PackI:
   case Op_PackF:
   case Op_PackL:
--- a/src/share/vm/opto/ifg.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/ifg.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -416,6 +416,7 @@
     if( lrgs(lidx).mask().is_UP() &&
         lrgs(lidx).mask_size() &&
         !lrgs(lidx)._is_float &&
+        !lrgs(lidx)._is_vector &&
         lrgs(lidx).mask().overlap(*Matcher::idealreg2regmask[Op_RegI]) )
       cnt += lrgs(lidx).reg_pressure();
   }
@@ -430,7 +431,7 @@
   while ((lidx = elements.next()) != 0) {
     if( lrgs(lidx).mask().is_UP() &&
         lrgs(lidx).mask_size() &&
-        lrgs(lidx)._is_float )
+        (lrgs(lidx)._is_float || lrgs(lidx)._is_vector))
       cnt += lrgs(lidx).reg_pressure();
   }
   return cnt;
@@ -439,8 +440,8 @@
 //------------------------------lower_pressure---------------------------------
 // Adjust register pressure down by 1.  Capture last hi-to-low transition,
 static void lower_pressure( LRG *lrg, uint where, Block *b, uint *pressure, uint *hrp_index ) {
-  if( lrg->mask().is_UP() && lrg->mask_size() ) {
-    if( lrg->_is_float ) {
+  if (lrg->mask().is_UP() && lrg->mask_size()) {
+    if (lrg->_is_float || lrg->_is_vector) {
       pressure[1] -= lrg->reg_pressure();
       if( pressure[1] == (uint)FLOATPRESSURE ) {
         hrp_index[1] = where;
@@ -522,8 +523,8 @@
       LRG &lrg = lrgs(lidx);
       lrg._area += cost;
       // Compute initial register pressure
-      if( lrg.mask().is_UP() && lrg.mask_size() ) {
-        if( lrg._is_float ) {   // Count float pressure
+      if (lrg.mask().is_UP() && lrg.mask_size()) {
+        if (lrg._is_float || lrg._is_vector) {   // Count float pressure
           pressure[1] += lrg.reg_pressure();
 #ifdef EXACT_PRESSURE
           if( pressure[1] > b->_freg_pressure )
@@ -681,13 +682,10 @@
         // according to its bindings.
         const RegMask &rmask = lrgs(r).mask();
         if( lrgs(r).is_bound() && !(n->rematerialize()) && rmask.is_NotEmpty() ) {
-          // Smear odd bits; leave only aligned pairs of bits.
-          RegMask r2mask = rmask;
-          r2mask.SmearToPairs();
           // Check for common case
           int r_size = lrgs(r).num_regs();
           OptoReg::Name r_reg = (r_size == 1) ? rmask.find_first_elem() : OptoReg::Physical;
-
+          // Smear odd bits
           IndexSetIterator elements(&liveout);
           uint l;
           while ((l = elements.next()) != 0) {
@@ -701,10 +699,15 @@
             // Remove the bits from LRG 'r' from LRG 'l' so 'l' no
             // longer interferes with 'r'.  If 'l' requires aligned
             // adjacent pairs, subtract out bit pairs.
-            if( lrg.num_regs() == 2 && !lrg._fat_proj ) {
+            assert(!lrg._is_vector || !lrg._fat_proj, "sanity");
+            if (lrg.num_regs() > 1 && !lrg._fat_proj) {
+              RegMask r2mask = rmask;
+              // Leave only aligned set of bits.
+              r2mask.smear_to_sets(lrg.num_regs());
+              // This covers the vector case as well.
               lrg.SUBTRACT( r2mask );
               lrg.compute_set_mask_size();
-            } else if( r_size != 1 ) {
+            } else if( r_size != 1 ) { // fat proj
               lrg.SUBTRACT( rmask );
               lrg.compute_set_mask_size();
             } else {            // Common case: size 1 bound removal
@@ -763,8 +766,8 @@
             // Newly live things assumed live from here to top of block
             lrg._area += cost;
             // Adjust register pressure
-            if( lrg.mask().is_UP() && lrg.mask_size() ) {
-              if( lrg._is_float ) {
+            if (lrg.mask().is_UP() && lrg.mask_size()) {
+              if (lrg._is_float || lrg._is_vector) {
                 pressure[1] += lrg.reg_pressure();
 #ifdef EXACT_PRESSURE
                 if( pressure[1] > b->_freg_pressure )
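
The recurring `_is_float || _is_vector` edits in this file fold vector live ranges into the float register-pressure counters, since vectors occupy the FP/SIMD register file. A self-contained sketch of the accounting, with stand-in names and the UP/mask-size checks of the real code elided:

    #include <cassert>

    struct LRG { bool is_float; bool is_vector; int reg_pressure; };

    // Vectors count against float pressure; integer pressure is unchanged.
    static int float_pressure(const LRG* lrgs, int n) {
      int cnt = 0;
      for (int i = 0; i < n; i++)
        if (lrgs[i].is_float || lrgs[i].is_vector)
          cnt += lrgs[i].reg_pressure;
      return cnt;
    }

    int main() {
      LRG lrgs[3] = { {true, false, 1}, {false, true, 4}, {false, false, 1} };
      assert(float_pressure(lrgs, 3) == 5);  // float(1) + vector(4), int not counted
      return 0;
    }
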
--- a/src/share/vm/opto/lcm.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/lcm.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -139,6 +139,7 @@
     int iop = mach->ideal_Opcode();
     switch( iop ) {
     case Op_LoadB:
+    case Op_LoadUB:
     case Op_LoadUS:
     case Op_LoadD:
     case Op_LoadF:
@@ -445,6 +446,11 @@
     if( e->is_MachNullCheck() && e->in(1) == n )
       continue;
 
+    // Schedule IV increment last.
+    if (e->is_Mach() && e->as_Mach()->ideal_Opcode() == Op_CountedLoopEnd &&
+        e->in(1)->in(1) == n && n->is_iteratively_computed())
+      continue;
+
     uint n_choice  = 2;
 
     // See if this instruction is consumed by a branch. If so, then (as the
--- a/src/share/vm/opto/library_call.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/library_call.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -3592,8 +3592,10 @@
     }
 
     // Bail out if length is negative.
-    // ...Not needed, since the new_array will throw the right exception.
-    //generate_negative_guard(length, bailout, &length);
+    // Without this check, new_array would throw
+    // NegativeArraySizeException, but IllegalArgumentException is what
+    // should be thrown.
+    generate_negative_guard(length, bailout, &length);
 
     if (bailout->req() > 1) {
       PreserveJVMState pjvms(this);
@@ -3617,7 +3619,9 @@
       // Extreme case:  Arrays.copyOf((Integer[])x, 10, String[].class).
       // This will fail a store-check if x contains any non-nulls.
       bool disjoint_bases = true;
-      bool length_never_negative = true;
+      // If start > orig_length, then the length of the copy may be
+      // negative.
+      bool length_never_negative = !is_copyOfRange;
       generate_arraycopy(TypeAryPtr::OOPS, T_OBJECT,
                          original, start, newcopy, intcon(0), moved,
                          disjoint_bases, length_never_negative);
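
A worked instance of the re-enabled negative guard above, assuming the usual Arrays.copyOfRange(original, from, to) semantics: when from > to the copy length is negative, and the method must throw IllegalArgumentException rather than the NegativeArraySizeException that new_array would raise:

    #include <cstdio>

    int main() {
      int from = 9, to = 3;            // e.g. Arrays.copyOfRange(a, 9, 3)
      int length = to - from;          // -6
      if (length < 0)
        printf("bailout path: IllegalArgumentException\n");
      else
        printf("fast path: copy %d elements\n", length);
      return 0;
    }
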
--- a/src/share/vm/opto/loopnode.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/loopnode.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -2751,7 +2751,8 @@
         // Do not count uncommon calls
         if( !n->is_CallStaticJava() || !n->as_CallStaticJava()->_name ) {
           Node *iff = n->in(0)->in(0);
-          if( !iff->is_If() ||
+          // Count all calls in loops that may be vectorized.
+          if( UseSuperWord || !iff->is_If() ||
               (n->in(0)->Opcode() == Op_IfFalse &&
                (1.0 - iff->as_If()->_prob) >= 0.01) ||
               (iff->as_If()->_prob >= 0.01) )
@@ -3216,7 +3217,8 @@
     case Op_ModF:
     case Op_ModD:
     case Op_LoadB:              // Same with Loads; they can sink
-    case Op_LoadUS:             // during loop optimizations.
+    case Op_LoadUB:             // during loop optimizations.
+    case Op_LoadUS:
     case Op_LoadD:
     case Op_LoadF:
     case Op_LoadI:
--- a/src/share/vm/opto/machnode.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/machnode.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -439,9 +439,9 @@
   // Don't remateralize somebody with bound inputs - it stretches a
   // fixed register lifetime.
   uint idx = oper_input_base();
-  if( req() > idx ) {
+  if (req() > idx) {
     const RegMask &rm = in_RegMask(idx);
-    if( rm.is_bound1() || rm.is_bound2() )
+    if (rm.is_bound(ideal_reg()))
       return false;
   }
 
--- a/src/share/vm/opto/machnode.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/machnode.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -319,6 +319,7 @@
 class MachTypeNode : public MachNode {
   virtual uint size_of() const { return sizeof(*this); } // Size is bigger
 public:
+  MachTypeNode( ) {}
   const Type *_bottom_type;
 
   virtual const class Type *bottom_type() const { return _bottom_type; }
@@ -370,12 +371,12 @@
 
 //------------------------------MachConstantNode-------------------------------
 // Machine node that holds a constant which is stored in the constant table.
-class MachConstantNode : public MachNode {
+class MachConstantNode : public MachTypeNode {
 protected:
   Compile::Constant _constant;  // This node's constant.
 
 public:
-  MachConstantNode() : MachNode() {
+  MachConstantNode() : MachTypeNode() {
     init_class_id(Class_MachConstant);
   }
 
--- a/src/share/vm/opto/matcher.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/matcher.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -35,6 +35,7 @@
 #include "opto/rootnode.hpp"
 #include "opto/runtime.hpp"
 #include "opto/type.hpp"
+#include "opto/vectornode.hpp"
 #include "runtime/atomic.hpp"
 #include "runtime/os.hpp"
 #ifdef TARGET_ARCH_MODEL_x86_32
@@ -58,18 +59,6 @@
 
 OptoReg::Name OptoReg::c_frame_pointer;
 
-
-
-const int Matcher::base2reg[Type::lastype] = {
-  Node::NotAMachineReg,0,0, Op_RegI, Op_RegL, 0, Op_RegN,
-  Node::NotAMachineReg, Node::NotAMachineReg, /* tuple, array */
-  Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, /* the pointers */
-  0, 0/*abio*/,
-  Op_RegP /* Return address */, 0, /* the memories */
-  Op_RegF, Op_RegF, Op_RegF, Op_RegD, Op_RegD, Op_RegD,
-  0  /*bottom*/
-};
-
 const RegMask *Matcher::idealreg2regmask[_last_machine_leaf];
 RegMask Matcher::mreg2regmask[_last_Mach_Reg];
 RegMask Matcher::STACK_ONLY_mask;
@@ -107,6 +96,10 @@
   idealreg2spillmask  [Op_RegF] = NULL;
   idealreg2spillmask  [Op_RegD] = NULL;
   idealreg2spillmask  [Op_RegP] = NULL;
+  idealreg2spillmask  [Op_VecS] = NULL;
+  idealreg2spillmask  [Op_VecD] = NULL;
+  idealreg2spillmask  [Op_VecX] = NULL;
+  idealreg2spillmask  [Op_VecY] = NULL;
 
   idealreg2debugmask  [Op_RegI] = NULL;
   idealreg2debugmask  [Op_RegN] = NULL;
@@ -114,6 +107,10 @@
   idealreg2debugmask  [Op_RegF] = NULL;
   idealreg2debugmask  [Op_RegD] = NULL;
   idealreg2debugmask  [Op_RegP] = NULL;
+  idealreg2debugmask  [Op_VecS] = NULL;
+  idealreg2debugmask  [Op_VecD] = NULL;
+  idealreg2debugmask  [Op_VecX] = NULL;
+  idealreg2debugmask  [Op_VecY] = NULL;
 
   idealreg2mhdebugmask[Op_RegI] = NULL;
   idealreg2mhdebugmask[Op_RegN] = NULL;
@@ -121,6 +118,10 @@
   idealreg2mhdebugmask[Op_RegF] = NULL;
   idealreg2mhdebugmask[Op_RegD] = NULL;
   idealreg2mhdebugmask[Op_RegP] = NULL;
+  idealreg2mhdebugmask[Op_VecS] = NULL;
+  idealreg2mhdebugmask[Op_VecD] = NULL;
+  idealreg2mhdebugmask[Op_VecX] = NULL;
+  idealreg2mhdebugmask[Op_VecY] = NULL;
 
   debug_only(_mem_node = NULL;)   // Ideal memory node consumed by mach node
 }
@@ -134,7 +135,7 @@
     warped = OptoReg::add(warped, C->out_preserve_stack_slots());
     if( warped >= _in_arg_limit )
       _in_arg_limit = OptoReg::add(warped, 1); // Bump max stack slot seen
-    if (!RegMask::can_represent(warped)) {
+    if (!RegMask::can_represent_arg(warped)) {
       // the compiler cannot represent this method's calling sequence
       C->record_method_not_compilable_all_tiers("unsupported incoming calling sequence");
       return OptoReg::Bad;
@@ -302,7 +303,7 @@
   _out_arg_limit = OptoReg::add(_new_SP, C->out_preserve_stack_slots());
   assert( is_even(_out_arg_limit), "out_preserve must be even" );
 
-  if (!RegMask::can_represent(OptoReg::add(_out_arg_limit,-1))) {
+  if (!RegMask::can_represent_arg(OptoReg::add(_out_arg_limit,-1))) {
     // the compiler cannot represent this method's calling sequence
     C->record_method_not_compilable("must be able to represent all call arguments in reg mask");
   }
@@ -428,7 +429,7 @@
 void Matcher::init_first_stack_mask() {
 
   // Allocate storage for spill masks as masks for the appropriate load type.
-  RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * 3*6);
+  RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*6+4));
 
   idealreg2spillmask  [Op_RegN] = &rms[0];
   idealreg2spillmask  [Op_RegI] = &rms[1];
@@ -451,6 +452,11 @@
   idealreg2mhdebugmask[Op_RegD] = &rms[16];
   idealreg2mhdebugmask[Op_RegP] = &rms[17];
 
+  idealreg2spillmask  [Op_VecS] = &rms[18];
+  idealreg2spillmask  [Op_VecD] = &rms[19];
+  idealreg2spillmask  [Op_VecX] = &rms[20];
+  idealreg2spillmask  [Op_VecY] = &rms[21];
+
   OptoReg::Name i;
 
   // At first, start with the empty mask
@@ -462,7 +468,7 @@
     C->FIRST_STACK_mask().Insert(i);
 
   // Add in all bits past the outgoing argument area
-  guarantee(RegMask::can_represent(OptoReg::add(_out_arg_limit,-1)),
+  guarantee(RegMask::can_represent_arg(OptoReg::add(_out_arg_limit,-1)),
             "must be able to represent all call arguments in reg mask");
   init = _out_arg_limit;
   for (i = init; RegMask::can_represent(i); i = OptoReg::add(i,1))
@@ -472,21 +478,48 @@
   C->FIRST_STACK_mask().set_AllStack();
 
   // Make spill masks.  Registers for their class, plus FIRST_STACK_mask.
+  RegMask aligned_stack_mask = C->FIRST_STACK_mask();
+  // Keep spill masks aligned.
+  aligned_stack_mask.clear_to_pairs();
+  assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
+
+  *idealreg2spillmask[Op_RegP] = *idealreg2regmask[Op_RegP];
 #ifdef _LP64
   *idealreg2spillmask[Op_RegN] = *idealreg2regmask[Op_RegN];
    idealreg2spillmask[Op_RegN]->OR(C->FIRST_STACK_mask());
+   idealreg2spillmask[Op_RegP]->OR(aligned_stack_mask);
+#else
+   idealreg2spillmask[Op_RegP]->OR(C->FIRST_STACK_mask());
 #endif
   *idealreg2spillmask[Op_RegI] = *idealreg2regmask[Op_RegI];
    idealreg2spillmask[Op_RegI]->OR(C->FIRST_STACK_mask());
   *idealreg2spillmask[Op_RegL] = *idealreg2regmask[Op_RegL];
-   idealreg2spillmask[Op_RegL]->OR(C->FIRST_STACK_mask());
+   idealreg2spillmask[Op_RegL]->OR(aligned_stack_mask);
   *idealreg2spillmask[Op_RegF] = *idealreg2regmask[Op_RegF];
    idealreg2spillmask[Op_RegF]->OR(C->FIRST_STACK_mask());
   *idealreg2spillmask[Op_RegD] = *idealreg2regmask[Op_RegD];
-   idealreg2spillmask[Op_RegD]->OR(C->FIRST_STACK_mask());
-  *idealreg2spillmask[Op_RegP] = *idealreg2regmask[Op_RegP];
-   idealreg2spillmask[Op_RegP]->OR(C->FIRST_STACK_mask());
+   idealreg2spillmask[Op_RegD]->OR(aligned_stack_mask);
 
+  if (Matcher::vector_size_supported(T_BYTE,4)) {
+    *idealreg2spillmask[Op_VecS] = *idealreg2regmask[Op_VecS];
+     idealreg2spillmask[Op_VecS]->OR(C->FIRST_STACK_mask());
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,2)) {
+    *idealreg2spillmask[Op_VecD] = *idealreg2regmask[Op_VecD];
+     idealreg2spillmask[Op_VecD]->OR(aligned_stack_mask);
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,4)) {
+     aligned_stack_mask.clear_to_sets(RegMask::SlotsPerVecX);
+     assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
+    *idealreg2spillmask[Op_VecX] = *idealreg2regmask[Op_VecX];
+     idealreg2spillmask[Op_VecX]->OR(aligned_stack_mask);
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,8)) {
+     aligned_stack_mask.clear_to_sets(RegMask::SlotsPerVecY);
+     assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
+    *idealreg2spillmask[Op_VecY] = *idealreg2regmask[Op_VecY];
+     idealreg2spillmask[Op_VecY]->OR(aligned_stack_mask);
+  }
    if (UseFPUForSpilling) {
      // This mask logic assumes that the spill operations are
      // symmetric and that the registers involved are the same size.
@@ -807,6 +840,25 @@
   idealreg2regmask[Op_RegF] = &spillF->out_RegMask();
   idealreg2regmask[Op_RegD] = &spillD->out_RegMask();
   idealreg2regmask[Op_RegP] = &spillP->out_RegMask();
+
+  // Vector regmasks.
+  if (Matcher::vector_size_supported(T_BYTE,4)) {
+    TypeVect::VECTS = TypeVect::make(T_BYTE, 4);
+    MachNode *spillVectS = match_tree(new (C, 3) LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTS));
+    idealreg2regmask[Op_VecS] = &spillVectS->out_RegMask();
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,2)) {
+    MachNode *spillVectD = match_tree(new (C, 3) LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTD));
+    idealreg2regmask[Op_VecD] = &spillVectD->out_RegMask();
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,4)) {
+    MachNode *spillVectX = match_tree(new (C, 3) LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTX));
+    idealreg2regmask[Op_VecX] = &spillVectX->out_RegMask();
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,8)) {
+    MachNode *spillVectY = match_tree(new (C, 3) LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTY));
+    idealreg2regmask[Op_VecY] = &spillVectY->out_RegMask();
+  }
 }
 
 #ifdef ASSERT
@@ -1063,7 +1115,7 @@
     // that is killed by the call.
     if( warped >= out_arg_limit_per_call )
       out_arg_limit_per_call = OptoReg::add(warped,1);
-    if (!RegMask::can_represent(warped)) {
+    if (!RegMask::can_represent_arg(warped)) {
       C->record_method_not_compilable_all_tiers("unsupported calling sequence");
       return OptoReg::Bad;
     }
@@ -1251,7 +1303,7 @@
     // this killed area.
     uint r_cnt = mcall->tf()->range()->cnt();
     MachProjNode *proj = new (C, 1) MachProjNode( mcall, r_cnt+10000, RegMask::Empty, MachProjNode::fat_proj );
-    if (!RegMask::can_represent(OptoReg::Name(out_arg_limit_per_call-1))) {
+    if (!RegMask::can_represent_arg(OptoReg::Name(out_arg_limit_per_call-1))) {
       C->record_method_not_compilable_all_tiers("unsupported outgoing calling sequence");
     } else {
       for (int i = begin_out_arg_area; i < out_arg_limit_per_call; i++)
--- a/src/share/vm/opto/matcher.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/matcher.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -250,10 +250,21 @@
   static const bool convL2FSupported(void);
 
   // Vector width in bytes
-  static const uint vector_width_in_bytes(void);
+  static const int vector_width_in_bytes(BasicType bt);
+
+  // Limits on vector size (number of elements).
+  static const int max_vector_size(const BasicType bt);
+  static const int min_vector_size(const BasicType bt);
+  static const bool vector_size_supported(const BasicType bt, int size) {
+    return (Matcher::max_vector_size(bt) >= size &&
+            Matcher::min_vector_size(bt) <= size);
+  }
 
   // Vector ideal reg
-  static const uint vector_ideal_reg(void);
+  static const int vector_ideal_reg(int len);
+
+  // CPU supports misaligned vectors store/load.
+  static const bool misaligned_vectors_ok();
 
   // Used to determine a "low complexity" 64-bit constant.  (Zero is simple.)
   // The standard of comparison is one (StoreL ConL) vs. two (StoreI ConI).
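
A standalone sketch of the vector_size_supported() predicate declared above; only the max/min comparison shape is taken from the header, while the 16-byte (SSE2-style) limits and element sizes are assumptions for illustration:

    #include <cstdio>

    enum BasicType { T_BYTE, T_SHORT, T_INT, T_LONG, T_FLOAT, T_DOUBLE };

    static int elem_bytes(BasicType bt) {
      switch (bt) {
        case T_BYTE:              return 1;
        case T_SHORT:             return 2;
        case T_INT: case T_FLOAT: return 4;
        default:                  return 8;   // T_LONG, T_DOUBLE
      }
    }

    // Assumed platform limits: 16-byte vectors, at least 2 elements.
    static int max_vector_size(BasicType bt) { return 16 / elem_bytes(bt); }
    static int min_vector_size(BasicType)    { return 2; }

    static bool vector_size_supported(BasicType bt, int size) {
      return max_vector_size(bt) >= size && min_vector_size(bt) <= size;
    }

    int main() {
      printf("%d\n", vector_size_supported(T_FLOAT, 4));   // 1: fits in 16 bytes
      printf("%d\n", vector_size_supported(T_DOUBLE, 4));  // 0: needs 32 bytes
      return 0;
    }
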
--- a/src/share/vm/opto/memnode.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/memnode.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1543,6 +1543,7 @@
     // had an original form like p1:(AddP x x (LShiftL quux 3)), where the
     // expression (LShiftL quux 3) independently optimized to the constant 8.
     if ((t->isa_int() == NULL) && (t->isa_long() == NULL)
+        && (_type->isa_vect() == NULL)
         && Opcode() != Op_LoadKlass && Opcode() != Op_LoadNKlass) {
       // t might actually be lower than _type, if _type is a unique
       // concrete subclass of abstract class t.
--- a/src/share/vm/opto/mulnode.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/mulnode.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -41,7 +41,9 @@
 class MulNode : public Node {
   virtual uint hash() const;
 public:
-  MulNode( Node *in1, Node *in2 ): Node(0,in1,in2) {}
+  MulNode( Node *in1, Node *in2 ): Node(0,in1,in2) {
+    init_class_id(Class_Mul);
+  }
 
   // Handle algebraic identities here.  If we have an identity, return the Node
   // we are equivalent to.  We look for "add of zero" as an identity.
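
The init_class_id(Class_Mul) call above, together with the node.hpp hunk below, lets n->is_Mul() recognize all multiply nodes without virtual dispatch. A rough, simplified model of the mechanism (the real scheme nests class ids hierarchically and tests with masks; the ids here are illustrative):

    #include <cassert>

    enum { Class_Node = 0, Class_Mul = 12 };   // illustrative ids

    struct Node {
      int _class_id = Class_Node;
      void init_class_id(int id) { _class_id = id; }
      bool is_Mul() const { return _class_id == Class_Mul; }
    };

    struct MulNode : Node {
      MulNode() { init_class_id(Class_Mul); }  // mirrors the constructor above
    };

    int main() {
      MulNode m; Node n;
      assert(m.is_Mul() && !n.is_Mul());
      return 0;
    }
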
--- a/src/share/vm/opto/node.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/node.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1576,6 +1576,9 @@
     } else {
       tty->print("no type");
     }
+  } else if (t->isa_vect() && this->is_MachSpillCopy()) {
+    // Dump the MachSpillCopy's vector type.
+    t->dump();
   }
   if (is_new) {
     debug_only(dump_orig(debug_orig()));
--- a/src/share/vm/opto/node.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/node.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -100,6 +100,7 @@
 class MemBarStoreStoreNode;
 class MemNode;
 class MergeMemNode;
+class MulNode;
 class MultiNode;
 class MultiBranchNode;
 class NeverBranchNode;
@@ -133,8 +134,8 @@
 class TypeNode;
 class UnlockNode;
 class VectorNode;
-class VectorLoadNode;
-class VectorStoreNode;
+class LoadVectorNode;
+class StoreVectorNode;
 class VectorSet;
 typedef void (*NFunc)(Node&,void*);
 extern "C" {
@@ -609,9 +610,9 @@
 
     DEFINE_CLASS_ID(Mem,   Node, 4)
       DEFINE_CLASS_ID(Load,  Mem, 0)
-        DEFINE_CLASS_ID(VectorLoad,  Load, 0)
+        DEFINE_CLASS_ID(LoadVector,  Load, 0)
       DEFINE_CLASS_ID(Store, Mem, 1)
-        DEFINE_CLASS_ID(VectorStore, Store, 0)
+        DEFINE_CLASS_ID(StoreVector, Store, 0)
       DEFINE_CLASS_ID(LoadStore, Mem, 2)
 
     DEFINE_CLASS_ID(Region, Node, 5)
@@ -629,8 +630,9 @@
     DEFINE_CLASS_ID(AddP,     Node, 9)
     DEFINE_CLASS_ID(BoxLock,  Node, 10)
     DEFINE_CLASS_ID(Add,      Node, 11)
-    DEFINE_CLASS_ID(Vector,   Node, 12)
-    DEFINE_CLASS_ID(ClearArray, Node, 13)
+    DEFINE_CLASS_ID(Mul,      Node, 12)
+    DEFINE_CLASS_ID(Vector,   Node, 13)
+    DEFINE_CLASS_ID(ClearArray, Node, 14)
 
     _max_classes  = ClassMask_ClearArray
   };
@@ -752,6 +754,7 @@
   DEFINE_CLASS_QUERY(MemBar)
   DEFINE_CLASS_QUERY(MemBarStoreStore)
   DEFINE_CLASS_QUERY(MergeMem)
+  DEFINE_CLASS_QUERY(Mul)
   DEFINE_CLASS_QUERY(Multi)
   DEFINE_CLASS_QUERY(MultiBranch)
   DEFINE_CLASS_QUERY(Parm)
@@ -767,8 +770,8 @@
   DEFINE_CLASS_QUERY(Sub)
   DEFINE_CLASS_QUERY(Type)
   DEFINE_CLASS_QUERY(Vector)
-  DEFINE_CLASS_QUERY(VectorLoad)
-  DEFINE_CLASS_QUERY(VectorStore)
+  DEFINE_CLASS_QUERY(LoadVector)
+  DEFINE_CLASS_QUERY(StoreVector)
   DEFINE_CLASS_QUERY(Unlock)
 
   #undef DEFINE_CLASS_QUERY
--- a/src/share/vm/opto/opcodes.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/opcodes.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -38,6 +38,10 @@
   "RegD",
   "RegL",
   "RegFlags",
+  "VecS",
+  "VecD",
+  "VecX",
+  "VecY",
   "_last_machine_leaf",
 #include "classes.hpp"
   "_last_class_name",
--- a/src/share/vm/opto/opcodes.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/opcodes.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -36,6 +36,10 @@
   macro(RegF)                   // Machine float   register
   macro(RegD)                   // Machine double  register
   macro(RegL)                   // Machine long    register
+  macro(VecS)                   // Machine vectors register
+  macro(VecD)                   // Machine vectord register
+  macro(VecX)                   // Machine vectorx register
+  macro(VecY)                   // Machine vectory register
   macro(RegFlags)               // Machine flags   register
   _last_machine_leaf,           // Split between regular opcodes and machine
 #include "classes.hpp"
--- a/src/share/vm/opto/postaloc.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/postaloc.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -27,13 +27,15 @@
 #include "opto/chaitin.hpp"
 #include "opto/machnode.hpp"
 
-// see if this register kind does not requires two registers
-static bool is_single_register(uint x) {
-#ifdef _LP64
-  return (x != Op_RegD && x != Op_RegL && x != Op_RegP);
-#else
-  return (x != Op_RegD && x != Op_RegL);
-#endif
+// See if this register (or register pair, or vector) already contains the value.
+static bool register_contains_value(Node* val, OptoReg::Name reg, int n_regs,
+                                    Node_List& value) {
+  for (int i = 0; i < n_regs; i++) {
+    OptoReg::Name nreg = OptoReg::add(reg,-i);
+    if (value[nreg] != val)
+      return false;
+  }
+  return true;
 }
 
 //---------------------------may_be_copy_of_callee-----------------------------
@@ -167,9 +169,11 @@
   const RegMask &use_mask = n->in_RegMask(idx);
   bool can_use = ( RegMask::can_represent(def_reg) ? (use_mask.Member(def_reg) != 0)
                                                    : (use_mask.is_AllStack() != 0));
-  // Check for a copy to or from a misaligned pair.
-  can_use = can_use && !use_mask.is_misaligned_Pair() && !def_lrg.mask().is_misaligned_Pair();
-
+  if (!RegMask::is_vector(def->ideal_reg())) {
+    // Check for a copy to or from a misaligned pair.
+    // It is workaround for a sparc with misaligned pairs.
+    can_use = can_use && !use_mask.is_misaligned_pair() && !def_lrg.mask().is_misaligned_pair();
+  }
   if (!can_use)
     return 0;
 
@@ -263,18 +267,16 @@
     val = skip_copies(n->in(k));
   }
 
-  if( val == x ) return blk_adjust; // No progress?
+  if (val == x) return blk_adjust; // No progress?
 
-  bool single = is_single_register(val->ideal_reg());
+  int n_regs = RegMask::num_registers(val->ideal_reg());
   uint val_idx = n2lidx(val);
   OptoReg::Name val_reg = lrgs(val_idx).reg();
 
   // See if it happens to already be in the correct register!
   // (either Phi's direct register, or the common case of the name
   // never-clobbered original-def register)
-  if( value[val_reg] == val &&
-      // Doubles check both halves
-      ( single || value[val_reg-1] == val ) ) {
+  if (register_contains_value(val, val_reg, n_regs, value)) {
     blk_adjust += use_prior_register(n,k,regnd[val_reg],current_block,value,regnd);
     if( n->in(k) == regnd[val_reg] ) // Success!  Quit trying
       return blk_adjust;
@@ -306,9 +308,10 @@
     }
 
     Node *vv = value[reg];
-    if( !single ) {             // Doubles check for aligned-adjacent pair
-      if( (reg&1)==0 ) continue;  // Wrong half of a pair
-      if( vv != value[reg-1] ) continue; // Not a complete pair
+    if (n_regs > 1) { // Doubles and vectors check for aligned-adjacent set
+      uint last = (n_regs-1); // Looking for the last part of a set
+      if ((reg&last) != last) continue; // Wrong part of a set
+      if (!register_contains_value(vv, reg, n_regs, value)) continue; // Different value
     }
     if( vv == val ||            // Got a direct hit?
         (t && vv && vv->bottom_type() == t && vv->is_Mach() &&
@@ -526,8 +529,9 @@
       if( pidx ) {
         value.map(preg,phi);
         regnd.map(preg,phi);
-        OptoReg::Name preg_lo = OptoReg::add(preg,-1);
-        if( !is_single_register(phi->ideal_reg()) ) {
+        int n_regs = RegMask::num_registers(phi->ideal_reg());
+        for (int l = 1; l < n_regs; l++) {
+          OptoReg::Name preg_lo = OptoReg::add(preg,-l);
           value.map(preg_lo,phi);
           regnd.map(preg_lo,phi);
         }
@@ -568,13 +572,16 @@
             value.map(ureg,valdef); // record improved reaching-def info
             regnd.map(ureg,   def);
             // Record other half of doubles
-            OptoReg::Name ureg_lo = OptoReg::add(ureg,-1);
-            if( !is_single_register(def->ideal_reg()) &&
-                ( !RegMask::can_represent(ureg_lo) ||
-                  lrgs(useidx).mask().Member(ureg_lo) ) && // Nearly always adjacent
-                !value[ureg_lo] ) {
-              value.map(ureg_lo,valdef); // record improved reaching-def info
-              regnd.map(ureg_lo,   def);
+            uint def_ideal_reg = def->ideal_reg();
+            int n_regs = RegMask::num_registers(def_ideal_reg);
+            for (int l = 1; l < n_regs; l++) {
+              OptoReg::Name ureg_lo = OptoReg::add(ureg,-l);
+              if (!value[ureg_lo] &&
+                  (!RegMask::can_represent(ureg_lo) ||
+                   lrgs(useidx).mask().Member(ureg_lo))) { // Nearly always adjacent
+                value.map(ureg_lo,valdef); // record improved reaching-def info
+                regnd.map(ureg_lo,   def);
+              }
             }
           }
         }
@@ -607,7 +614,8 @@
       }
 
       uint n_ideal_reg = n->ideal_reg();
-      if( is_single_register(n_ideal_reg) ) {
+      int n_regs = RegMask::num_registers(n_ideal_reg);
+      if (n_regs == 1) {
         // If Node 'n' does not change the value mapped by the register,
         // then 'n' is a useless copy.  Do not update the register->node
         // mapping so 'n' will go dead.
@@ -625,6 +633,25 @@
           assert( n->is_Copy(), "" );
           j -= replace_and_yank_if_dead(n, nreg, b, value, regnd);
         }
+      } else if (RegMask::is_vector(n_ideal_reg)) {
+        // If Node 'n' does not change the value mapped by the register,
+        // then 'n' is a useless copy.  Do not update the register->node
+        // mapping so 'n' will go dead.
+        if (!register_contains_value(val, nreg, n_regs, value)) {
+          // Update the mapping: record new Node defined by the register
+          regnd.map(nreg,n);
+          // Update mapping for defined *value*, which is the defined
+          // Node after skipping all copies.
+          value.map(nreg,val);
+          for (int l = 1; l < n_regs; l++) {
+            OptoReg::Name nreg_lo = OptoReg::add(nreg,-l);
+            regnd.map(nreg_lo, n );
+            value.map(nreg_lo,val);
+          }
+        } else if (n->is_Copy()) {
+          // Note: a vector can't be a constant and can't be a copy of the callee.
+          j -= replace_and_yank_if_dead(n, nreg, b, value, regnd);
+        }
       } else {
         // If the value occupies a register pair, record same info
         // in both registers.
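
register_contains_value() above generalizes the old one-or-two-register check to sets of any width. A minimal standalone model, keeping the convention that `reg` names the highest slot of the set and slots are scanned downward, as in OptoReg::add(reg, -i):

    #include <cassert>

    struct Node {};

    // The value[] table maps each 32-bit register slot to the Node whose
    // value it currently holds; a vector spanning n_regs slots is present
    // only if every slot of the set maps to the same node.
    static bool register_contains_value(Node* val, int reg, int n_regs,
                                        Node* value[]) {
      for (int i = 0; i < n_regs; i++)
        if (value[reg - i] != val)
          return false;
      return true;
    }

    int main() {
      Node a, b;
      Node* value[8] = { nullptr };
      value[4] = value[5] = value[6] = value[7] = &a;  // 4-slot vector in 4..7
      assert( register_contains_value(&a, 7, 4, value));
      value[5] = &b;                                   // clobber one slot
      assert(!register_contains_value(&a, 7, 4, value));
      return 0;
    }
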
--- a/src/share/vm/opto/reg_split.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/reg_split.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -74,12 +74,13 @@
   const RegMask *w_i_mask = w_mask->overlap( *i_mask ) ? w_mask : i_mask;
   const RegMask *w_o_mask;
 
+  int num_regs = RegMask::num_registers(ireg);
+  bool is_vect = RegMask::is_vector(ireg);
   if( w_mask->overlap( *o_mask ) && // Overlap AND
-      ((ireg != Op_RegL && ireg != Op_RegD // Single use or aligned
-#ifdef _LP64
-        && ireg != Op_RegP
-#endif
-         ) || o_mask->is_aligned_Pairs()) ) {
+      ((num_regs == 1) // Single use or aligned
+        ||  is_vect    // or vector
+        || (!is_vect && o_mask->is_aligned_pairs())) ) {
+    assert(!is_vect || o_mask->is_aligned_sets(num_regs), "vectors are aligned");
     // Don't come here for mis-aligned doubles
     w_o_mask = w_mask;
   } else {                      // wide ideal mask does not overlap with o_mask
@@ -400,15 +401,17 @@
   // CNC - Turned off 7/8/99, causes too much spilling
   // if( lrg->_is_bound ) return false;
 
+  // Use float pressure numbers for vectors.
+  bool is_float_or_vector = lrg->_is_float || lrg->_is_vector;
   // Not yet reached the high-pressure cutoff point, so low pressure
-  uint hrp_idx = lrg->_is_float ? b->_fhrp_index : b->_ihrp_index;
+  uint hrp_idx = is_float_or_vector ? b->_fhrp_index : b->_ihrp_index;
   if( insidx < hrp_idx ) return false;
   // Register pressure for the block as a whole depends on reg class
-  int block_pres = lrg->_is_float ? b->_freg_pressure : b->_reg_pressure;
+  int block_pres = is_float_or_vector ? b->_freg_pressure : b->_reg_pressure;
   // Bound live ranges will split at the binding points first;
   // Intermediate splits should assume the live range's register set
   // got "freed up" and that num_regs will become INT_PRESSURE.
-  int bound_pres = lrg->_is_float ? FLOATPRESSURE : INTPRESSURE;
+  int bound_pres = is_float_or_vector ? FLOATPRESSURE : INTPRESSURE;
   // Effective register pressure limit.
   int lrg_pres = (lrg->get_invalid_mask_size() > lrg->num_regs())
     ? (lrg->get_invalid_mask_size() >> (lrg->num_regs()-1)) : bound_pres;
@@ -794,12 +797,15 @@
                   if( i < n->req() ) break;
                   insert_point--;
                 }
+                uint orig_eidx = b->end_idx();
                 maxlrg = split_DEF( n1, b, insert_point, maxlrg, Reachblock, debug_defs, splits, slidx);
                 // If it wasn't split bail
                 if (!maxlrg) {
                   return 0;
                 }
-                insidx++;
+                // Spill of NULL check mem op goes into the following block.
+                if (b->end_idx() > orig_eidx)
+                  insidx++;
               }
               // This is a new DEF, so update UP
               UPblock[slidx] = false;
@@ -960,7 +966,7 @@
             // Grab register mask info
             const RegMask &dmask = def->out_RegMask();
             const RegMask &umask = n->in_RegMask(inpidx);
-
+            bool is_vect = RegMask::is_vector(def->ideal_reg());
             assert(inpidx < oopoff, "cannot use-split oop map info");
 
             bool dup = UPblock[slidx];
@@ -972,7 +978,7 @@
             if( !umask.is_AllStack() &&
                 (int)umask.Size() <= lrgs(useidx).num_regs() &&
                 (!def->rematerialize() ||
-                 umask.is_misaligned_Pair())) {
+                 (!is_vect && umask.is_misaligned_pair()))) {
               // These need a Split regardless of overlap or pressure
               // SPLIT - NO DEF - NO CISC SPILL
               maxlrg = split_USE(def,b,n,inpidx,maxlrg,dup,false, splits,slidx);
@@ -1123,10 +1129,12 @@
         // Grab UP info for DEF
         const RegMask &dmask = n->out_RegMask();
         bool defup = dmask.is_UP();
+        int ireg = n->ideal_reg();
+        bool is_vect = RegMask::is_vector(ireg);
         // Only split at Def if this is a HRP block or bound (and spilled once)
         if( !n->rematerialize() &&
-            (((dmask.is_bound1() || dmask.is_bound2() || dmask.is_misaligned_Pair()) &&
-             (deflrg._direct_conflict || deflrg._must_spill)) ||
+            (((dmask.is_bound(ireg) || (!is_vect && dmask.is_misaligned_pair())) &&
+              (deflrg._direct_conflict || deflrg._must_spill)) ||
              // Check for LRG being up in a register and we are inside a high
              // pressure area.  Spill it down immediately.
              (defup && is_high_pressure(b,&deflrg,insidx))) ) {
--- a/src/share/vm/opto/regmask.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/regmask.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -129,11 +129,34 @@
   0
 );
 
+//=============================================================================
+bool RegMask::is_vector(uint ireg) {
+  return (ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY);
+}
+
+int RegMask::num_registers(uint ireg) {
+    switch(ireg) {
+      case Op_VecY:
+        return 8;
+      case Op_VecX:
+        return 4;
+      case Op_VecD:
+      case Op_RegD:
+      case Op_RegL:
+#ifdef _LP64
+      case Op_RegP:
+#endif
+        return 2;
+    }
+    // Op_VecS and the rest of the ideal registers.
+    return 1;
+}
+
 //------------------------------find_first_pair--------------------------------
 // Find the lowest-numbered register pair in the mask.  Return the
 // HIGHEST register number in the pair, or BAD if no pairs.
 OptoReg::Name RegMask::find_first_pair() const {
-  VerifyPairs();
+  verify_pairs();
   for( int i = 0; i < RM_SIZE; i++ ) {
     if( _A[i] ) {               // Found some bits
       int bit = _A[i] & -_A[i]; // Extract low bit
@@ -146,30 +169,30 @@
 
 //------------------------------ClearToPairs-----------------------------------
 // Clear out partial bits; leave only bit pairs
-void RegMask::ClearToPairs() {
+void RegMask::clear_to_pairs() {
   for( int i = 0; i < RM_SIZE; i++ ) {
     int bits = _A[i];
     bits &= ((bits & 0x55555555)<<1); // 1 hi-bit set for each pair
     bits |= (bits>>1);          // Smear 1 hi-bit into a pair
     _A[i] = bits;
   }
-  VerifyPairs();
+  verify_pairs();
 }
 
 //------------------------------SmearToPairs-----------------------------------
 // Smear out partial bits; leave only bit pairs
-void RegMask::SmearToPairs() {
+void RegMask::smear_to_pairs() {
   for( int i = 0; i < RM_SIZE; i++ ) {
     int bits = _A[i];
     bits |= ((bits & 0x55555555)<<1); // Smear lo bit hi per pair
     bits |= ((bits & 0xAAAAAAAA)>>1); // Smear hi bit lo per pair
     _A[i] = bits;
   }
-  VerifyPairs();
+  verify_pairs();
 }
 
 //------------------------------is_aligned_pairs-------------------------------
-bool RegMask::is_aligned_Pairs() const {
+bool RegMask::is_aligned_pairs() const {
   // Assert that the register mask contains only bit pairs.
   for( int i = 0; i < RM_SIZE; i++ ) {
     int bits = _A[i];
@@ -204,7 +227,7 @@
 
 //------------------------------is_bound2--------------------------------------
 // Return TRUE if the mask contains an adjacent pair of bits and no other bits.
-int RegMask::is_bound2() const {
+int RegMask::is_bound_pair() const {
   if( is_AllStack() ) return false;
 
   int bit = -1;                 // Set to hold the one bit allowed
@@ -226,6 +249,132 @@
   return true;
 }
 
+static int low_bits[3] = { 0x55555555, 0x11111111, 0x01010101 };
+//------------------------------find_first_set---------------------------------
+// Find the lowest-numbered register set in the mask.  Return the
+// HIGHEST register number in the set, or BAD if no sets.
+// Also works for size 1.
+OptoReg::Name RegMask::find_first_set(int size) const {
+  verify_sets(size);
+  for (int i = 0; i < RM_SIZE; i++) {
+    if (_A[i]) {                // Found some bits
+      int bit = _A[i] & -_A[i]; // Extract low bit
+      // Convert to bit number, return the hi bit of the set
+      return OptoReg::Name((i<<_LogWordBits)+find_lowest_bit(bit)+(size-1));
+    }
+  }
+  return OptoReg::Bad;
+}
+
+//------------------------------clear_to_sets----------------------------------
+// Clear out partial bits; leave only aligned adjacent bit sets
+void RegMask::clear_to_sets(int size) {
+  if (size == 1) return;
+  assert(2 <= size && size <= 8, "update low bits table");
+  assert(is_power_of_2(size), "sanity");
+  int low_bits_mask = low_bits[size>>2];
+  for (int i = 0; i < RM_SIZE; i++) {
+    int bits = _A[i];
+    int sets = (bits & low_bits_mask);
+    for (int j = 1; j < size; j++) {
+      sets = (bits & (sets<<1)); // filter bits which produce whole sets
+    }
+    sets |= (sets>>1);           // Smear 1 hi-bit into a set
+    if (size > 2) {
+      sets |= (sets>>2);         // Smear 2 hi-bits into a set
+      if (size > 4) {
+        sets |= (sets>>4);       // Smear 4 hi-bits into a set
+      }
+    }
+    _A[i] = sets;
+  }
+  verify_sets(size);
+}
+
+//------------------------------smear_to_sets----------------------------------
+// Smear out partial bits to aligned adjacent bit sets
+void RegMask::smear_to_sets(int size) {
+  if (size == 1) return;
+  assert(2 <= size && size <= 8, "update low bits table");
+  assert(is_power_of_2(size), "sanity");
+  int low_bits_mask = low_bits[size>>2];
+  for (int i = 0; i < RM_SIZE; i++) {
+    int bits = _A[i];
+    int sets = 0;
+    for (int j = 0; j < size; j++) {
+      sets |= (bits & low_bits_mask);  // collect partial bits
+      bits  = bits>>1;
+    }
+    sets |= (sets<<1);           // Smear 1 lo-bit  into a set
+    if (size > 2) {
+      sets |= (sets<<2);         // Smear 2 lo-bits into a set
+      if (size > 4) {
+        sets |= (sets<<4);       // Smear 4 lo-bits into a set
+      }
+    }
+    _A[i] = sets;
+  }
+  verify_sets(size);
+}
+
+//------------------------------is_aligned_sets-------------------------------
+bool RegMask::is_aligned_sets(int size) const {
+  if (size == 1) return true;
+  assert(2 <= size && size <= 8, "update low bits table");
+  assert(is_power_of_2(size), "sanity");
+  int low_bits_mask = low_bits[size>>2];
+  // Assert that the register mask contains only bit sets.
+  for (int i = 0; i < RM_SIZE; i++) {
+    int bits = _A[i];
+    while (bits) {              // Check bits for set alignment
+      int bit = bits & -bits;   // Extract low bit
+      // A low bit outside the low-bits mask means a mis-aligned set.
+      if ((bit & low_bits_mask) == 0) return false;
+      // Do extra work since (bit << size) may overflow.
+      int hi_bit = bit << (size-1); // high bit
+      int set = hi_bit + ((hi_bit-1) & ~(bit-1));
+      // Check for aligned adjacent bits in this set
+      if ((bits & set) != set) return false;
+      bits -= set;  // Remove this set
+    }
+  }
+  return true;
+}
+
+//------------------------------is_bound_set-----------------------------------
+// Return TRUE if the mask contains one adjacent set of bits and no other bits.
+// Also works for size 1.
+int RegMask::is_bound_set(int size) const {
+  if( is_AllStack() ) return false;
+  assert(1 <= size && size <= 8, "update low bits table");
+  int bit = -1;                 // Set to hold the one bit allowed
+  for (int i = 0; i < RM_SIZE; i++) {
+    if (_A[i]) {                // Found some bits
+      if (bit != -1)
+        return false;           // Already had bits, so fail
+      bit = _A[i] & -_A[i];     // Extract 1 bit from mask
+      int hi_bit = bit << (size-1); // high bit
+      if (hi_bit != 0) {        // Bit set stays in same word?
+        int set = hi_bit + ((hi_bit-1) & ~(bit-1));
+        if (set != _A[i])
+          return false;         // Require adjacent bit set and no more bits
+      } else {                  // Else it's a split-set case
+        if (((-1) & ~(bit-1)) != _A[i])
+          return false;         // Found many bits, so fail
+        i++;                    // Skip iteration forward and check high part
+        assert(size <= 8, "update next code");
+        // The lower 24 bits should be 0 since this is the split case and size <= 8.
+        int set = bit>>24;
+        set = set & -set; // Remove sign extension.
+        set = (((set << size) - 1) >> 8);
+        if (_A[i] != set) return false; // Require 1 lo bit in next word
+      }
+    }
+  }
+  // True for both the empty mask and for a bit set
+  return true;
+}
+
 //------------------------------is_UP------------------------------------------
 // UP means register only, Register plus stack, or stack only is DOWN
 bool RegMask::is_UP() const {
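
The clear_to_sets()/smear_to_sets()/is_aligned_sets() trio above generalizes the old pair tricks to power-of-two set sizes. A standalone demo of the two mask transforms on a single 32-bit word, with the logic transcribed from the hunk above (the per-word RM_SIZE loop and verify asserts are dropped):

    #include <cstdio>

    static const int low_bits[3] = { 0x55555555, 0x11111111, 0x01010101 };

    // Keep only whole aligned sets of 'size' adjacent bits (size = 2, 4 or 8).
    static int clear_to_sets(int bits, int size) {
      int sets = bits & low_bits[size >> 2];  // candidate low bit of each set
      for (int j = 1; j < size; j++)
        sets = bits & (sets << 1);            // survives only if every bit is present
      sets |= sets >> 1;                      // smear the surviving hi bit back down
      if (size > 2) { sets |= sets >> 2; if (size > 4) sets |= sets >> 4; }
      return sets;
    }

    // Widen any partial bits into whole aligned sets of 'size' bits.
    static int smear_to_sets(int bits, int size) {
      int mask = low_bits[size >> 2];
      int sets = 0;
      for (int j = 0; j < size; j++) { sets |= (bits & mask); bits >>= 1; }
      sets |= sets << 1;
      if (size > 2) { sets |= sets << 2; if (size > 4) sets |= sets << 4; }
      return sets;
    }

    int main() {
      printf("0x%03X\n", clear_to_sets(0x0F0, 4)); // 0x0F0: aligned 4-bit set kept
      printf("0x%03X\n", clear_to_sets(0x07E, 4)); // 0x000: two partial sets dropped
      printf("0x%03X\n", smear_to_sets(0x020, 4)); // 0x0F0: bit 5 grows to slots 4..7
      return 0;
    }
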
--- a/src/share/vm/opto/regmask.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/regmask.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -113,7 +113,11 @@
   // the controlling alignment constraint.  Note that this alignment
   // requirement is internal to the allocator, and independent of any
   // particular platform.
-  enum { SlotsPerLong = 2 };
+  enum { SlotsPerLong = 2,
+         SlotsPerVecS = 1,
+         SlotsPerVecD = 2,
+         SlotsPerVecX = 4,
+         SlotsPerVecY = 8 };
 
   // A constructor only used by the ADLC output.  All mask fields are filled
   // in directly.  Calls to this look something like RM(1,2,3,4);
@@ -193,20 +197,53 @@
   OptoReg::Name find_first_pair() const;
 
   // Clear out partial bits; leave only aligned adjacent bit pairs.
-  void ClearToPairs();
+  void clear_to_pairs();
   // Smear out partial bits; leave only aligned adjacent bit pairs.
-  void SmearToPairs();
+  void smear_to_pairs();
   // Verify that the mask contains only aligned adjacent bit pairs
-  void VerifyPairs() const { assert( is_aligned_Pairs(), "mask is not aligned, adjacent pairs" ); }
+  void verify_pairs() const { assert( is_aligned_pairs(), "mask is not aligned, adjacent pairs" ); }
   // Test that the mask contains only aligned adjacent bit pairs
-  bool is_aligned_Pairs() const;
+  bool is_aligned_pairs() const;
 
   // mask is a pair of misaligned registers
-  bool is_misaligned_Pair() const { return Size()==2 && !is_aligned_Pairs();}
+  bool is_misaligned_pair() const { return Size()==2 && !is_aligned_pairs(); }
   // Test for single register
   int is_bound1() const;
   // Test for a single adjacent pair
-  int is_bound2() const;
+  int is_bound_pair() const;
+  // Test for a single adjacent set of ideal register's size.
+  int is_bound(uint ireg) const {
+    if (is_vector(ireg)) {
+      if (is_bound_set(num_registers(ireg)))
+        return true;
+    } else if (is_bound1() || is_bound_pair()) {
+      return true;
+    }
+    return false;
+  }
+
+  // Find the lowest-numbered register set in the mask.  Return the
+  // HIGHEST register number in the set, or BAD if no sets.
+  // Assert that the mask contains only bit sets.
+  OptoReg::Name find_first_set(int size) const;
+
+  // Clear out partial bits; leave only aligned adjacent bit sets of size.
+  void clear_to_sets(int size);
+  // Smear out partial bits to aligned adjacent bit sets.
+  void smear_to_sets(int size);
+  // Verify that the mask contains only aligned adjacent bit sets
+  void verify_sets(int size) const { assert(is_aligned_sets(size), "mask is not aligned, adjacent sets"); }
+  // Test that the mask contains only aligned adjacent bit sets
+  bool is_aligned_sets(int size) const;
+
+  // mask is a set of misaligned registers
+  bool is_misaligned_set(int size) const { return (int)Size()==size && !is_aligned_sets(size);}
+
+  // Test for a single adjacent set
+  int is_bound_set(int size) const;
+
+  static bool is_vector(uint ireg);
+  static int num_registers(uint ireg);
 
   // Fast overlap test.  Non-zero if any registers in common.
   int overlap( const RegMask &rm ) const {
@@ -280,9 +317,15 @@
 
   static bool can_represent(OptoReg::Name reg) {
     // NOTE: -1 in computation reflects the usage of the last
-    //       bit of the regmask as an infinite stack flag.
+    //       bit of the regmask as an infinite stack flag; the extra
+    //       -7 in can_represent_arg() keeps masks aligned for VecY.
     return (int)reg < (int)(CHUNK_SIZE-1);
   }
+  static bool can_represent_arg(OptoReg::Name reg) {
+    // NOTE: -SlotsPerVecY in computation reflects the need
+    //       to keep mask aligned for largest value (VecY).
+    return (int)reg < (int)(CHUNK_SIZE-SlotsPerVecY);
+  }
 };
 
 // Do not use this constant directly in client code!
--- a/src/share/vm/opto/stringopts.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/stringopts.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -112,6 +112,7 @@
     _arguments->ins_req(0, value);
     _mode.insert_before(0, mode);
   }
+
   void push_string(Node* value) {
     push(value, StringMode);
   }
@@ -125,9 +126,56 @@
     push(value, CharMode);
   }
 
+  static bool is_SB_toString(Node* call) {
+    if (call->is_CallStaticJava()) {
+      CallStaticJavaNode* csj = call->as_CallStaticJava();
+      ciMethod* m = csj->method();
+      if (m != NULL &&
+          (m->intrinsic_id() == vmIntrinsics::_StringBuilder_toString ||
+           m->intrinsic_id() == vmIntrinsics::_StringBuffer_toString)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  static Node* skip_string_null_check(Node* value) {
+    // Look for a diamond shaped Null check of toString() result
+    // (could be code from String.valueOf()):
+    // (Proj == NULL) ? "null" : CastPP(Proj)#NotNULL
+    if (value->is_Phi()) {
+      int true_path = value->as_Phi()->is_diamond_phi();
+      if (true_path != 0) {
+        // phi->region->if_proj->ifnode->bool
+        BoolNode* b = value->in(0)->in(1)->in(0)->in(1)->as_Bool();
+        Node* cmp = b->in(1);
+        Node* v1 = cmp->in(1);
+        Node* v2 = cmp->in(2);
+        // Null check of the return of toString which can simply be skipped.
+        if (b->_test._test == BoolTest::ne &&
+            v2->bottom_type() == TypePtr::NULL_PTR &&
+            value->in(true_path)->Opcode() == Op_CastPP &&
+            value->in(true_path)->in(1) == v1 &&
+            v1->is_Proj() && is_SB_toString(v1->in(0))) {
+          return v1;
+        }
+      }
+    }
+    return value;
+  }
+
   Node* argument(int i) {
     return _arguments->in(i);
   }
+  Node* argument_uncast(int i) {
+    Node* arg = argument(i);
+    int amode = mode(i);
+    if (amode == StringConcat::StringMode ||
+        amode == StringConcat::StringNullCheckMode) {
+      arg = skip_string_null_check(arg);
+    }
+    return arg;
+  }
   void set_argument(int i, Node* value) {
     _arguments->set_req(i, value);
   }
@@ -206,9 +254,11 @@
 
 
 void StringConcat::eliminate_unneeded_control() {
-  eliminate_initialize(begin()->initialization());
   for (uint i = 0; i < _control.size(); i++) {
     Node* n = _control.at(i);
+    if (n->is_Allocate()) {
+      eliminate_initialize(n->as_Allocate()->initialization());
+    }
     if (n->is_Call()) {
       if (n != _end) {
         eliminate_call(n->as_Call());
@@ -239,14 +289,15 @@
   assert(result->_control.contains(other->_end), "what?");
   assert(result->_control.contains(_begin), "what?");
   for (int x = 0; x < num_arguments(); x++) {
-    if (argument(x) == arg) {
+    Node* argx = argument_uncast(x);
+    if (argx == arg) {
       // replace the toString result with the all the arguments that
       // made up the other StringConcat
       for (int y = 0; y < other->num_arguments(); y++) {
         result->append(other->argument(y), other->mode(y));
       }
     } else {
-      result->append(argument(x), mode(x));
+      result->append(argx, mode(x));
     }
   }
   result->set_allocation(other->_begin);
@@ -327,14 +378,9 @@
 
   while (worklist.size() > 0) {
     Node* ctrl = worklist.pop();
-    if (ctrl->is_CallStaticJava()) {
+    if (StringConcat::is_SB_toString(ctrl)) {
       CallStaticJavaNode* csj = ctrl->as_CallStaticJava();
-      ciMethod* m = csj->method();
-      if (m != NULL &&
-          (m->intrinsic_id() == vmIntrinsics::_StringBuffer_toString ||
-           m->intrinsic_id() == vmIntrinsics::_StringBuilder_toString)) {
-        string_calls.push(csj);
-      }
+      string_calls.push(csj);
     }
     if (ctrl->in(0) != NULL && !_visited.test_set(ctrl->in(0)->_idx)) {
       worklist.push(ctrl->in(0));
@@ -550,44 +596,40 @@
   for (int c = 0; c < concats.length(); c++) {
     StringConcat* sc = concats.at(c);
     for (int i = 0; i < sc->num_arguments(); i++) {
-      Node* arg = sc->argument(i);
-      if (arg->is_Proj() && arg->in(0)->is_CallStaticJava()) {
+      Node* arg = sc->argument_uncast(i);
+      if (arg->is_Proj() && StringConcat::is_SB_toString(arg->in(0))) {
         CallStaticJavaNode* csj = arg->in(0)->as_CallStaticJava();
-        if (csj->method() != NULL &&
-            (csj->method()->intrinsic_id() == vmIntrinsics::_StringBuilder_toString ||
-             csj->method()->intrinsic_id() == vmIntrinsics::_StringBuffer_toString)) {
-          for (int o = 0; o < concats.length(); o++) {
-            if (c == o) continue;
-            StringConcat* other = concats.at(o);
-            if (other->end() == csj) {
+        for (int o = 0; o < concats.length(); o++) {
+          if (c == o) continue;
+          StringConcat* other = concats.at(o);
+          if (other->end() == csj) {
+#ifndef PRODUCT
+            if (PrintOptimizeStringConcat) {
+              tty->print_cr("considering stacked concats");
+            }
+#endif
+
+            StringConcat* merged = sc->merge(other, arg);
+            if (merged->validate_control_flow()) {
 #ifndef PRODUCT
               if (PrintOptimizeStringConcat) {
-                tty->print_cr("considering stacked concats");
+                tty->print_cr("stacking would succeed");
               }
 #endif
-
-              StringConcat* merged = sc->merge(other, arg);
-              if (merged->validate_control_flow()) {
+              if (c < o) {
+                concats.remove_at(o);
+                concats.at_put(c, merged);
+              } else {
+                concats.remove_at(c);
+                concats.at_put(o, merged);
+              }
+              goto restart;
+            } else {
 #ifndef PRODUCT
-                if (PrintOptimizeStringConcat) {
-                  tty->print_cr("stacking would succeed");
-                }
+              if (PrintOptimizeStringConcat) {
+                tty->print_cr("stacking would fail");
+              }
 #endif
-                if (c < o) {
-                  concats.remove_at(o);
-                  concats.at_put(c, merged);
-                } else {
-                  concats.remove_at(c);
-                  concats.at_put(o, merged);
-                }
-                goto restart;
-              } else {
-#ifndef PRODUCT
-                if (PrintOptimizeStringConcat) {
-                  tty->print_cr("stacking would fail");
-                }
-#endif
-              }
             }
           }
         }
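
The stacked-concat handling above splices the arguments of an inner StringBuilder into the outer one at the position where the inner toString() result was appended. A toy model of that merge step, with plain strings standing in for IR nodes:

    #include <cstdio>
    #include <string>
    #include <vector>

    using Concat = std::vector<std::string>;

    // Splice 'inner' into 'outer' at position 'at', mirroring the loop in
    // StringConcat::merge() above (argument modes are ignored in this toy).
    static Concat merge(const Concat& outer, const Concat& inner, size_t at) {
      Concat result;
      for (size_t i = 0; i < outer.size(); i++) {
        if (i == at)
          result.insert(result.end(), inner.begin(), inner.end());
        else
          result.push_back(outer[i]);
      }
      return result;
    }

    int main() {
      Concat inner = { "a", "b" };                  // new SB().append("a").append("b")
      Concat outer = { "<inner.toString()>", "c" };
      Concat merged = merge(outer, inner, 0);       // -> { "a", "b", "c" }
      for (const std::string& s : merged)
        printf("%s ", s.c_str());
      printf("\n");
      return 0;
    }
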
--- a/src/share/vm/opto/superword.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/superword.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -67,6 +67,10 @@
 
 //------------------------------transform_loop---------------------------
 void SuperWord::transform_loop(IdealLoopTree* lpt) {
+  assert(UseSuperWord, "should be");
+  // Do vectors exist on this architecture?
+  if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
+
   assert(lpt->_head->is_CountedLoop(), "must be");
   CountedLoopNode *cl = lpt->_head->as_CountedLoop();
 
@@ -89,15 +93,12 @@
   Node *pre_opaq1 = pre_end->limit();
   if (pre_opaq1->Opcode() != Op_Opaque1) return;
 
-  // Do vectors exist on this architecture?
-  if (vector_width_in_bytes() == 0) return;
-
   init(); // initialize data structures
 
   set_lpt(lpt);
   set_lp(cl);
 
- // For now, define one block which is the entire loop body
+  // For now, define one block which is the entire loop body
   set_bb(cl);
 
   assert(_packset.length() == 0, "packset must be empty");
@@ -177,7 +178,7 @@
   Node_List memops;
   for (int i = 0; i < _block.length(); i++) {
     Node* n = _block.at(i);
-    if (n->is_Mem() && in_bb(n) &&
+    if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
         is_java_primitive(n->as_Mem()->memory_type())) {
       int align = memory_alignment(n->as_Mem(), 0);
       if (align != bottom_align) {
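
The large hunk below reworks this loop to consider several alignment references instead of one; the IV-adjustment formula it relocates is visible in the removed lines. A worked instance of that formula, assuming 4-byte elements and a 16-byte vector width:

    #include <cstdio>

    int main() {
      int offset    = 4;   // byte offset of the reference from an aligned base
      int scale     = 4;   // 4-byte elements
      int iv_stride = 1;   // i++
      int vw        = 16;  // assumed vector width in bytes
      int stride_sign   = (scale * iv_stride) > 0 ? 1 : -1;
      int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw;
      // 12: advancing the reference by 12 bytes lands it on a 16-byte boundary.
      printf("iv_adjustment = %d\n", iv_adjustment);
      return 0;
    }
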
@@ -185,54 +186,141 @@
       }
     }
   }
-  if (memops.size() == 0) return;
 
-  // Find a memory reference to align to.  The pre-loop trip count
-  // is modified to align this reference to a vector-aligned address
-  find_align_to_ref(memops);
-  if (align_to_ref() == NULL) return;
+  Node_List align_to_refs;
+  int best_iv_adjustment = 0;
+  MemNode* best_align_to_mem_ref = NULL;
 
-  SWPointer align_to_ref_p(align_to_ref(), this);
-  int offset = align_to_ref_p.offset_in_bytes();
-  int scale  = align_to_ref_p.scale_in_bytes();
-  int vw              = vector_width_in_bytes();
-  int stride_sign     = (scale * iv_stride()) > 0 ? 1 : -1;
-  int iv_adjustment   = (stride_sign * vw - (offset % vw)) % vw;
+  while (memops.size() != 0) {
+    // Find a memory reference to align to.
+    MemNode* mem_ref = find_align_to_ref(memops);
+    if (mem_ref == NULL) break;
+    align_to_refs.push(mem_ref);
+    int iv_adjustment = get_iv_adjustment(mem_ref);
 
-#ifndef PRODUCT
-  if (TraceSuperWord)
-    tty->print_cr("\noffset = %d iv_adjustment = %d  elt_align = %d scale = %d iv_stride = %d",
-                  offset, iv_adjustment, align_to_ref_p.memory_size(), align_to_ref_p.scale_in_bytes(), iv_stride());
-#endif
+    if (best_align_to_mem_ref == NULL) {
+      // Set memory reference which is the best from all memory operations
+      // to be used for alignment. The pre-loop trip count is modified to align
+      // this reference to a vector-aligned address.
+      best_align_to_mem_ref = mem_ref;
+      best_iv_adjustment = iv_adjustment;
+    }
 
-  // Set alignment relative to "align_to_ref"
-  for (int i = memops.size() - 1; i >= 0; i--) {
-    MemNode* s = memops.at(i)->as_Mem();
-    SWPointer p2(s, this);
-    if (p2.comparable(align_to_ref_p)) {
-      int align = memory_alignment(s, iv_adjustment);
-      set_alignment(s, align);
-    } else {
-      memops.remove(i);
-    }
-  }
-
-  // Create initial pack pairs of memory operations
-  for (uint i = 0; i < memops.size(); i++) {
-    Node* s1 = memops.at(i);
-    for (uint j = 0; j < memops.size(); j++) {
-      Node* s2 = memops.at(j);
-      if (s1 != s2 && are_adjacent_refs(s1, s2)) {
-        int align = alignment(s1);
-        if (stmts_can_pack(s1, s2, align)) {
-          Node_List* pair = new Node_List();
-          pair->push(s1);
-          pair->push(s2);
-          _packset.append(pair);
+    SWPointer align_to_ref_p(mem_ref, this);
+    // Set alignment relative to "align_to_ref" for all related memory operations.
+    for (int i = memops.size() - 1; i >= 0; i--) {
+      MemNode* s = memops.at(i)->as_Mem();
+      if (isomorphic(s, mem_ref)) {
+        SWPointer p2(s, this);
+        if (p2.comparable(align_to_ref_p)) {
+          int align = memory_alignment(s, iv_adjustment);
+          set_alignment(s, align);
         }
       }
     }
-  }
+
+    // Create initial pack pairs of memory operations for which
+    // alignment is set and vectors will be aligned.
+    bool create_pack = true;
+    if (memory_alignment(mem_ref, best_iv_adjustment) == 0) {
+      if (!Matcher::misaligned_vectors_ok()) {
+        int vw = vector_width(mem_ref);
+        int vw_best = vector_width(best_align_to_mem_ref);
+        if (vw > vw_best) {
+          // Do not vectorize a memory access with more elements per vector
+          // if unaligned memory accesses are not allowed, because the number
+          // of iterations in the pre-loop will not be enough to align it.
+          create_pack = false;
+        }
+      }
+    } else {
+      if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
+        // Can't allow vectorization of unaligned memory accesses with the
+        // same type since they could be overlapping accesses to the same array.
+        create_pack = false;
+      } else {
+        // Allow independent (different type) unaligned memory operations
+        // if HW supports them.
+        if (!Matcher::misaligned_vectors_ok()) {
+          create_pack = false;
+        } else {
+          // Check if packs of the same memory type but
+          // with a different alignment were created before.
+          for (uint i = 0; i < align_to_refs.size(); i++) {
+            MemNode* mr = align_to_refs.at(i)->as_Mem();
+            if (same_velt_type(mr, mem_ref) &&
+                memory_alignment(mr, iv_adjustment) != 0)
+              create_pack = false;
+          }
+        }
+      }
+    }
+    if (create_pack) {
+      for (uint i = 0; i < memops.size(); i++) {
+        Node* s1 = memops.at(i);
+        int align = alignment(s1);
+        if (align == top_align) continue;
+        for (uint j = 0; j < memops.size(); j++) {
+          Node* s2 = memops.at(j);
+          if (alignment(s2) == top_align) continue;
+          if (s1 != s2 && are_adjacent_refs(s1, s2)) {
+            if (stmts_can_pack(s1, s2, align)) {
+              Node_List* pair = new Node_List();
+              pair->push(s1);
+              pair->push(s2);
+              _packset.append(pair);
+            }
+          }
+        }
+      }
+    } else { // Don't create unaligned pack
+      // First, remove remaining memory ops of the same type from the list.
+      for (int i = memops.size() - 1; i >= 0; i--) {
+        MemNode* s = memops.at(i)->as_Mem();
+        if (same_velt_type(s, mem_ref)) {
+          memops.remove(i);
+        }
+      }
+
+      // Second, remove already constructed packs of the same type.
+      for (int i = _packset.length() - 1; i >= 0; i--) {
+        Node_List* p = _packset.at(i);
+        MemNode* s = p->at(0)->as_Mem();
+        if (same_velt_type(s, mem_ref)) {
+          remove_pack_at(i);
+        }
+      }
+
+      // If needed, find the best memory reference for loop alignment again.
+      if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
+        // Put memory ops from remaining packs back on memops list for
+        // the best alignment search.
+        uint orig_msize = memops.size();
+        for (int i = 0; i < _packset.length(); i++) {
+          Node_List* p = _packset.at(i);
+          MemNode* s = p->at(0)->as_Mem();
+          assert(!same_velt_type(s, mem_ref), "sanity");
+          memops.push(s);
+        }
+        best_align_to_mem_ref = find_align_to_ref(memops);
+        if (best_align_to_mem_ref == NULL) break;
+        best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
+        // Restore list.
+        while (memops.size() > orig_msize)
+          (void)memops.pop();
+      }
+    } // unaligned memory accesses
+
+    // Remove used mem nodes.
+    for (int i = memops.size() - 1; i >= 0; i--) {
+      MemNode* m = memops.at(i)->as_Mem();
+      if (alignment(m) != top_align) {
+        memops.remove(i);
+      }
+    }
+
+  } // while (memops.size() != 0)
+  set_align_to_ref(best_align_to_mem_ref);
 
 #ifndef PRODUCT
   if (TraceSuperWord) {
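
For reference, the shape of loop the reworked search handles, written as a C++ stand-in for the Java original: memory ops of two element sizes now form separate alignment groups instead of competing for one global alignment choice (names assumed):

    // a/b form a 4-byte group, c/d a 1-byte group; each group is aligned
    // against its own best reference when the rules above permit packing.
    void copy2(int* a, const int* b, char* c, const char* d, int n) {
      for (int i = 0; i < n; i++) {
        a[i] = b[i];
        c[i] = d[i];
      }
    }
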
@@ -246,7 +334,7 @@
 // Find a memory reference to align the loop induction variable to.
 // Looks first at stores then at loads, looking for a memory reference
 // with the largest number of references similar to it.
-void SuperWord::find_align_to_ref(Node_List &memops) {
+MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
   GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
 
   // Count number of comparable memory ops
@@ -270,20 +358,28 @@
     }
   }
 
-  // Find Store (or Load) with the greatest number of "comparable" references
+  // Find Store (or Load) with the greatest number of "comparable" references,
+  // biggest vector size, smallest data size and smallest iv offset.
   int max_ct        = 0;
+  int max_vw        = 0;
   int max_idx       = -1;
   int min_size      = max_jint;
   int min_iv_offset = max_jint;
   for (uint j = 0; j < memops.size(); j++) {
     MemNode* s = memops.at(j)->as_Mem();
     if (s->is_Store()) {
+      int vw = vector_width_in_bytes(s);
+      assert(vw > 1, "sanity");
       SWPointer p(s, this);
-      if (cmp_ct.at(j) > max_ct ||
-          cmp_ct.at(j) == max_ct && (data_size(s) < min_size ||
-                                     data_size(s) == min_size &&
-                                        p.offset_in_bytes() < min_iv_offset)) {
+      if (cmp_ct.at(j) >  max_ct ||
+          cmp_ct.at(j) == max_ct &&
+            (vw >  max_vw ||
+             vw == max_vw &&
+              (data_size(s) <  min_size ||
+               data_size(s) == min_size &&
+                 (p.offset_in_bytes() < min_iv_offset)))) {
         max_ct = cmp_ct.at(j);
+        max_vw = vw;
         max_idx = j;
         min_size = data_size(s);
         min_iv_offset = p.offset_in_bytes();
@@ -295,12 +391,18 @@
     for (uint j = 0; j < memops.size(); j++) {
       MemNode* s = memops.at(j)->as_Mem();
       if (s->is_Load()) {
+        int vw = vector_width_in_bytes(s);
+        assert(vw > 1, "sanity");
         SWPointer p(s, this);
-        if (cmp_ct.at(j) > max_ct ||
-            cmp_ct.at(j) == max_ct && (data_size(s) < min_size ||
-                                       data_size(s) == min_size &&
-                                          p.offset_in_bytes() < min_iv_offset)) {
+        if (cmp_ct.at(j) >  max_ct ||
+            cmp_ct.at(j) == max_ct &&
+              (vw >  max_vw ||
+               vw == max_vw &&
+                (data_size(s) <  min_size ||
+                 data_size(s) == min_size &&
+                   (p.offset_in_bytes() < min_iv_offset)))) {
           max_ct = cmp_ct.at(j);
+          max_vw = vw;
           max_idx = j;
           min_size = data_size(s);
           min_iv_offset = p.offset_in_bytes();
@@ -309,10 +411,7 @@
     }
   }
 
-  if (max_ct > 0)
-    set_align_to_ref(memops.at(max_idx)->as_Mem());
-
-#ifndef PRODUCT
+#ifdef ASSERT
   if (TraceSuperWord && Verbose) {
     tty->print_cr("\nVector memops after find_align_to_refs");
     for (uint i = 0; i < memops.size(); i++) {
@@ -321,6 +420,17 @@
     }
   }
 #endif
+
+  if (max_ct > 0) {
+#ifdef ASSERT
+    if (TraceSuperWord) {
+      tty->print("\nVector align to node: ");
+      memops.at(max_idx)->as_Mem()->dump();
+    }
+#endif
+    return memops.at(max_idx)->as_Mem();
+  }
+  return NULL;
 }
 
 //------------------------------ref_is_alignable---------------------------
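
The selection above is lexicographic: comparable-reference count, then vector width, then data size, then iv offset. A standalone mirror of that predicate with two assumed candidates (all names hypothetical):

    #include <cstdio>

    struct Cand { int cmp_ct, vw, data_size, iv_offset; };

    static bool better(const Cand& a, const Cand& b) {   // prefer a over b?
      if (a.cmp_ct    != b.cmp_ct)    return a.cmp_ct > b.cmp_ct;       // more refs
      if (a.vw        != b.vw)        return a.vw > b.vw;               // wider vector
      if (a.data_size != b.data_size) return a.data_size < b.data_size; // smaller elt
      return a.iv_offset < b.iv_offset;                                 // smaller offset
    }

    int main() {
      Cand s1 = {3, 16, 4, 0}, s2 = {3, 16, 2, 8};
      printf("%s\n", better(s2, s1) ? "s2" : "s1"); // s2: same count/width, smaller size
      return 0;
    }
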
@@ -341,7 +451,8 @@
 
   // If initial offset from start of object is computable,
   // compute alignment within the vector.
-  int vw = vector_width_in_bytes();
+  int vw = vector_width_in_bytes(p.mem());
+  assert(vw > 1, "sanity");
   if (vw % span == 0) {
     Node* init_nd = pre_end->init_trip();
     if (init_nd->is_Con() && p.invar() == NULL) {
@@ -361,6 +472,25 @@
   return false;
 }
 
+//---------------------------get_iv_adjustment---------------------------
+// Calculate the loop's iv adjustment for this memory op.
+int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
+  SWPointer align_to_ref_p(mem_ref, this);
+  int offset = align_to_ref_p.offset_in_bytes();
+  int scale  = align_to_ref_p.scale_in_bytes();
+  int vw       = vector_width_in_bytes(mem_ref);
+  assert(vw > 1, "sanity");
+  int stride_sign   = (scale * iv_stride()) > 0 ? 1 : -1;
+  int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw;
+
+#ifndef PRODUCT
+  if (TraceSuperWord)
+    tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
+                  offset, iv_adjustment, align_to_ref_p.memory_size(), scale, iv_stride(), vw);
+#endif
+  return iv_adjustment;
+}
+
 //---------------------------dependence_graph---------------------------
 // Construct dependency graph.
 // Add dependence edges to load/store nodes for memory dependence
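
A worked instance of the get_iv_adjustment formula above, runnable on its own; the concrete numbers (16-byte vectors over an int array, offset 4) are assumed:

    #include <cstdio>

    int main() {
      int vw = 16, offset = 4, scale = 4, iv_stride = 1;  // assumed example values
      int stride_sign   = (scale * iv_stride) > 0 ? 1 : -1;
      int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw;
      // (1 * 16 - (4 % 16)) % 16 == 12 bytes, i.e. 3 int elements of pre-loop work
      printf("iv_adjustment = %d bytes\n", iv_adjustment);
      return 0;
    }
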
@@ -488,9 +618,13 @@
 bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
 
   // Do not use superword for non-primitives
-  if((s1->is_Mem() && !is_java_primitive(s1->as_Mem()->memory_type())) ||
-     (s2->is_Mem() && !is_java_primitive(s2->as_Mem()->memory_type())))
+  BasicType bt1 = velt_basic_type(s1);
+  BasicType bt2 = velt_basic_type(s2);
+  if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
     return false;
+  if (Matcher::max_vector_size(bt1) < 2) {
+    return false; // No vectors for this type
+  }
 
   if (isomorphic(s1, s2)) {
     if (independent(s1, s2)) {
@@ -552,7 +686,7 @@
   if (s1->Opcode() != s2->Opcode()) return false;
   if (s1->req() != s2->req()) return false;
   if (s1->in(0) != s2->in(0)) return false;
-  if (velt_type(s1) != velt_type(s2)) return false;
+  if (!same_velt_type(s1, s2)) return false;
   return true;
 }
 
@@ -595,14 +729,16 @@
 //------------------------------set_alignment---------------------------
 void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
   set_alignment(s1, align);
-  set_alignment(s2, align + data_size(s1));
+  if (align == top_align || align == bottom_align) {
+    set_alignment(s2, align);
+  } else {
+    set_alignment(s2, align + data_size(s1));
+  }
 }
 
 //------------------------------data_size---------------------------
 int SuperWord::data_size(Node* s) {
-  const Type* t = velt_type(s);
-  BasicType  bt = t->array_element_basic_type();
-  int bsize = type2aelembytes(bt);
+  int bsize = type2aelembytes(velt_basic_type(s));
   assert(bsize != 0, "valid size");
   return bsize;
 }
@@ -631,9 +767,9 @@
 //------------------------------follow_use_defs---------------------------
 // Extend the packset by visiting operand definitions of nodes in pack p
 bool SuperWord::follow_use_defs(Node_List* p) {
+  assert(p->size() == 2, "just checking");
   Node* s1 = p->at(0);
   Node* s2 = p->at(1);
-  assert(p->size() == 2, "just checking");
   assert(s1->req() == s2->req(), "just checking");
   assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
 
@@ -718,7 +854,12 @@
     for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break;
     for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break;
     if (i1 != i2) {
-      return false;
+      if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) {
+        // Further analysis relies on operands position matching.
+        u2->swap_edges(i1, i2);
+      } else {
+        return false;
+      }
     }
   } while (i1 < ct);
   return true;
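
The i1 == (3 - i2) test above recognizes a pure operand swap between two-input nodes, which is harmless for commutative Add/Mul. A small sketch of the case it repairs, with stand-in edge arrays:

    #include <cstdio>

    int main() {
      // u1 = d1 + x and u2 = x + d2: d1 is u1's input 1, d2 is u2's input 2.
      const char* u2_in[3] = {0, "x", "d2"};
      int i1 = 1, i2 = 2;
      bool commutative = true;                  // assumed: u2 is an Add or Mul
      if (i1 != i2 && i1 == (3 - i2) && commutative) {
        const char* t = u2_in[1]; u2_in[1] = u2_in[2]; u2_in[2] = t;  // swap_edges
      }
      printf("u2: in(1)=%s in(2)=%s\n", u2_in[1], u2_in[2]);  // d2, x: positions match
      return 0;
    }
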
@@ -727,7 +868,7 @@
 //------------------------------est_savings---------------------------
 // Estimate the savings from executing s1 and s2 as a pack
 int SuperWord::est_savings(Node* s1, Node* s2) {
-  int save = 2 - 1; // 2 operations per instruction in packed form
+  int save_in = 2 - 1; // 2 operations per instruction in packed form
 
   // inputs
   for (uint i = 1; i < s1->req(); i++) {
@@ -735,17 +876,18 @@
     Node* x2 = s2->in(i);
     if (x1 != x2) {
       if (are_adjacent_refs(x1, x2)) {
-        save += adjacent_profit(x1, x2);
+        save_in += adjacent_profit(x1, x2);
       } else if (!in_packset(x1, x2)) {
-        save -= pack_cost(2);
+        save_in -= pack_cost(2);
       } else {
-        save += unpack_cost(2);
+        save_in += unpack_cost(2);
       }
     }
   }
 
   // uses of result
   uint ct = 0;
+  int save_use = 0;
   for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
     Node* s1_use = s1->fast_out(i);
     for (int j = 0; j < _packset.length(); j++) {
@@ -756,7 +898,7 @@
           if (p->at(p->size()-1) == s2_use) {
             ct++;
             if (are_adjacent_refs(s1_use, s2_use)) {
-              save += adjacent_profit(s1_use, s2_use);
+              save_use += adjacent_profit(s1_use, s2_use);
             }
           }
         }
@@ -764,10 +906,10 @@
     }
   }
 
-  if (ct < s1->outcnt()) save += unpack_cost(1);
-  if (ct < s2->outcnt()) save += unpack_cost(1);
+  if (ct < s1->outcnt()) save_use += unpack_cost(1);
+  if (ct < s2->outcnt()) save_use += unpack_cost(1);
 
-  return save;
+  return MAX2(save_in, save_use);
 }
 
 //------------------------------costs---------------------------
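
With the change above, a costly input side can no longer cancel out a profitable use side: the two sub-estimates are kept separate and the larger one is reported. A numeric sketch, assuming pack_cost(2) == 2 and unpack_cost(1) == 1:

    #include <algorithm>

    int main() {
      int save_in  = 1 - 2;  // base profit of 1, minus one pack_cost(2) on the inputs
      int save_use = 1 + 1;  // two unpack_cost(1) terms on the uses
      // the old combined sum gave -1 + 2 == 1; the new estimate is max(-1, 2) == 2
      return std::max(save_in, save_use);
    }
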
@@ -778,8 +920,9 @@
 //------------------------------combine_packs---------------------------
 // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last
 void SuperWord::combine_packs() {
-  bool changed;
-  do {
+  bool changed = true;
+  // Combine packs regardless of max vector size.
+  while (changed) {
     changed = false;
     for (int i = 0; i < _packset.length(); i++) {
       Node_List* p1 = _packset.at(i);
@@ -787,6 +930,7 @@
       for (int j = 0; j < _packset.length(); j++) {
         Node_List* p2 = _packset.at(j);
         if (p2 == NULL) continue;
+        if (i == j) continue;
         if (p1->at(p1->size()-1) == p2->at(0)) {
           for (uint k = 1; k < p2->size(); k++) {
             p1->push(p2->at(k));
@@ -796,8 +940,39 @@
         }
       }
     }
-  } while (changed);
+  }
 
+  // Split packs whose size is greater than the max vector size.
+  for (int i = 0; i < _packset.length(); i++) {
+    Node_List* p1 = _packset.at(i);
+    if (p1 != NULL) {
+      BasicType bt = velt_basic_type(p1->at(0));
+      uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector
+      assert(is_power_of_2(max_vlen), "sanity");
+      uint psize = p1->size();
+      if (!is_power_of_2(psize)) {
+        // Skip pack which can't be vector.
+        // case1: for(...) { a[i] = i; }      element values are different (i+x)
+        // case2: for(...) { a[i] = b[i+1]; } can't align both load and store
+        _packset.at_put(i, NULL);
+        continue;
+      }
+      if (psize > max_vlen) {
+        Node_List* pack = new Node_List();
+        for (uint j = 0; j < psize; j++) {
+          pack->push(p1->at(j));
+          if (pack->size() >= max_vlen) {
+            assert(is_power_of_2(pack->size()), "sanity");
+            _packset.append(pack);
+            pack = new Node_List();
+          }
+        }
+        _packset.at_put(i, NULL);
+      }
+    }
+  }
+
+  // Compress list.
   for (int i = _packset.length() - 1; i >= 0; i--) {
     Node_List* p1 = _packset.at(i);
     if (p1 == NULL) {
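
The splitting step above cuts an oversized pack into max_vlen-sized chunks before the list is compressed. A self-contained sketch, assuming max_vlen == 4:

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> p1 = {0, 1, 2, 3, 4, 5, 6, 7};  // stand-ins for 8 packed nodes
      const size_t max_vlen = 4;                       // assumed Matcher limit
      std::vector<std::vector<int>> packset;
      std::vector<int> pack;
      for (size_t j = 0; j < p1.size(); j++) {
        pack.push_back(p1[j]);
        if (pack.size() >= max_vlen) {                 // emit a full chunk
          packset.push_back(pack);
          pack.clear();
        }
      }
      printf("%d packs of %d\n", (int)packset.size(), (int)max_vlen);  // 2 packs of 4
      return 0;
    }
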
@@ -880,8 +1055,7 @@
 // Can code be generated for pack p?
 bool SuperWord::implemented(Node_List* p) {
   Node* p0 = p->at(0);
-  int vopc = VectorNode::opcode(p0->Opcode(), p->size(), velt_type(p0));
-  return vopc > 0 && Matcher::has_match_rule(vopc);
+  return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0));
 }
 
 //------------------------------profitable---------------------------
@@ -939,36 +1113,36 @@
 }
 
 //-------------------------------remove_and_insert-------------------
-//remove "current" from its current position in the memory graph and insert
-//it after the appropriate insertion point (lip or uip)
+// Remove "current" from its current position in the memory graph and insert
+// it after the appropriate insertion point (lip or uip).
 void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
                                   Node *uip, Unique_Node_List &sched_before) {
   Node* my_mem = current->in(MemNode::Memory);
-  _igvn.rehash_node_delayed(current);
-  _igvn.hash_delete(my_mem);
+  bool sched_up = sched_before.member(current);
 
-  //remove current_store from its current position in the memmory graph
+  // Remove current_store from its current position in the memory graph.
   for (DUIterator i = current->outs(); current->has_out(i); i++) {
     Node* use = current->out(i);
     if (use->is_Mem()) {
       assert(use->in(MemNode::Memory) == current, "must be");
-      _igvn.rehash_node_delayed(use);
       if (use == prev) { // connect prev to my_mem
-        use->set_req(MemNode::Memory, my_mem);
+        _igvn.replace_input_of(use, MemNode::Memory, my_mem);
+        --i; // deleted this edge; rescan position
       } else if (sched_before.member(use)) {
-        _igvn.hash_delete(uip);
-        use->set_req(MemNode::Memory, uip);
+        if (!sched_up) { // Will be moved together with current
+          _igvn.replace_input_of(use, MemNode::Memory, uip);
+          --i; // deleted this edge; rescan position
+        }
       } else {
-        _igvn.hash_delete(lip);
-        use->set_req(MemNode::Memory, lip);
+        if (sched_up) { // Will be moved together with current
+          _igvn.replace_input_of(use, MemNode::Memory, lip);
+          --i; // deleted this edge; rescan position
+        }
       }
-      --i; //deleted this edge; rescan position
     }
   }
 
-  bool sched_up = sched_before.member(current);
   Node *insert_pt =  sched_up ?  uip : lip;
-  _igvn.hash_delete(insert_pt);
 
   // all uses of insert_pt's memory state should use current's instead
   for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
@@ -988,7 +1162,7 @@
   }
 
   //connect current to insert_pt
-  current->set_req(MemNode::Memory, insert_pt);
+  _igvn.replace_input_of(current, MemNode::Memory, insert_pt);
 }
 
 //------------------------------co_locate_pack----------------------------------
@@ -1025,7 +1199,7 @@
         if (use->is_Mem() && use != previous)
           memops.push(use);
       }
-      if(current == first) break;
+      if (current == first) break;
       previous = current;
       current  = current->in(MemNode::Memory)->as_Mem();
     }
@@ -1038,27 +1212,37 @@
           Node *s2 = memops.at(j);
           if (!independent(s1, s2)) {
             if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
-              schedule_before_pack.push(s1); //s1 must be scheduled before
+              schedule_before_pack.push(s1); // s1 must be scheduled before
               Node_List* mem_pk = my_pack(s1);
               if (mem_pk != NULL) {
                 for (uint ii = 0; ii < mem_pk->size(); ii++) {
-                  Node* s = mem_pk->at(ii); // follow partner
+                  Node* s = mem_pk->at(ii);  // follow partner
                   if (memops.member(s) && !schedule_before_pack.member(s))
                     schedule_before_pack.push(s);
                 }
               }
+              break;
             }
           }
         }
       }
     }
 
+    Node*    upper_insert_pt = first->in(MemNode::Memory);
+    // Following code moves loads connected to upper_insert_pt below aliased stores.
+    // Collect such loads here and reconnect them back to upper_insert_pt later.
+    memops.clear();
+    for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
+      Node* use = upper_insert_pt->out(i);
+      if (!use->is_Store())
+        memops.push(use);
+    }
+
     MemNode* lower_insert_pt = last;
-    Node*    upper_insert_pt = first->in(MemNode::Memory);
     previous                 = last; //previous store in pk
     current                  = last->in(MemNode::Memory)->as_Mem();
 
-    //start scheduling from "last" to "first"
+    // start scheduling from "last" to "first"
     while (true) {
       assert(in_bb(current), "stay in block");
       assert(in_pack(previous, pk), "previous stays in pack");
@@ -1066,16 +1250,13 @@
 
       if (in_pack(current, pk)) {
        // Forward users of my memory state (except "previous") to my input memory state
-        _igvn.hash_delete(current);
         for (DUIterator i = current->outs(); current->has_out(i); i++) {
           Node* use = current->out(i);
           if (use->is_Mem() && use != previous) {
             assert(use->in(MemNode::Memory) == current, "must be");
             if (schedule_before_pack.member(use)) {
-              _igvn.hash_delete(upper_insert_pt);
               _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
             } else {
-              _igvn.hash_delete(lower_insert_pt);
               _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
             }
             --i; // deleted this edge; rescan position
@@ -1089,6 +1270,14 @@
       if (current == first) break;
       current = my_mem->as_Mem();
     } // end while
+
+    // Reconnect loads back to upper_insert_pt.
+    for (uint i = 0; i < memops.size(); i++) {
+      Node *ld = memops.at(i);
+      if (ld->in(MemNode::Memory) != upper_insert_pt) {
+        _igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
+      }
+    }
   } else if (pk->at(0)->is_Load()) { //load
     // all loads in the pack should have the same memory state. By default,
     // we use the memory state of the last load. However, if any load could
@@ -1149,35 +1338,30 @@
       Node* vn = NULL;
       Node* low_adr = p->at(0);
       Node* first   = executed_first(p);
+      int   opc = n->Opcode();
       if (n->is_Load()) {
-        int   opc = n->Opcode();
         Node* ctl = n->in(MemNode::Control);
         Node* mem = first->in(MemNode::Memory);
         Node* adr = low_adr->in(MemNode::Address);
         const TypePtr* atyp = n->adr_type();
-        vn = VectorLoadNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen);
-
+        vn = LoadVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
       } else if (n->is_Store()) {
         // Promote value to be stored to vector
         Node* val = vector_opd(p, MemNode::ValueIn);
-
-        int   opc = n->Opcode();
         Node* ctl = n->in(MemNode::Control);
         Node* mem = first->in(MemNode::Memory);
         Node* adr = low_adr->in(MemNode::Address);
         const TypePtr* atyp = n->adr_type();
-        vn = VectorStoreNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
-
+        vn = StoreVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
       } else if (n->req() == 3) {
         // Promote operands to vector
         Node* in1 = vector_opd(p, 1);
         Node* in2 = vector_opd(p, 2);
-        vn = VectorNode::make(_phase->C, n->Opcode(), in1, in2, vlen, velt_type(n));
-
+        vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n));
       } else {
         ShouldNotReachHere();
       }
-
+      assert(vn != NULL, "sanity");
       _phase->_igvn.register_new_node_with_optimizer(vn);
       _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
       for (uint j = 0; j < p->size(); j++) {
@@ -1185,6 +1369,12 @@
         _igvn.replace_node(pm, vn);
       }
       _igvn._worklist.push(vn);
+#ifdef ASSERT
+      if (TraceNewVectors) {
+        tty->print("new Vector node: ");
+        vn->dump();
+      }
+#endif
     }
   }
 }
@@ -1207,10 +1397,10 @@
   }
 
   if (same_opd) {
-    if (opd->is_Vector() || opd->is_VectorLoad()) {
+    if (opd->is_Vector() || opd->is_LoadVector()) {
       return opd; // input is matching vector
     }
-    assert(!opd->is_VectorStore(), "such vector is not expected here");
+    assert(!opd->is_StoreVector(), "such vector is not expected here");
     // Convert scalar input to vector with the same number of elements as
     // p0's vector. Use p0's type because size of operand's container in
    // vector should match p0's size regardless of operand's size.
@@ -1219,12 +1409,18 @@
 
     _phase->_igvn.register_new_node_with_optimizer(vn);
     _phase->set_ctrl(vn, _phase->get_ctrl(opd));
+#ifdef ASSERT
+    if (TraceNewVectors) {
+      tty->print("new Vector node: ");
+      vn->dump();
+    }
+#endif
     return vn;
   }
 
   // Insert pack operation
-  const Type* p0_t = velt_type(p0);
-  PackNode* pk = PackNode::make(_phase->C, opd, p0_t);
+  BasicType bt = velt_basic_type(p0);
+  PackNode* pk = PackNode::make(_phase->C, opd, vlen, bt);
   DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )
 
   for (uint i = 1; i < vlen; i++) {
@@ -1232,10 +1428,16 @@
     Node* in = pi->in(opd_idx);
     assert(my_pack(in) == NULL, "Should already have been unpacked");
     assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
-    pk->add_opd(in);
+    pk->add_opd(i, in);
   }
   _phase->_igvn.register_new_node_with_optimizer(pk);
   _phase->set_ctrl(pk, _phase->get_ctrl(opd));
+#ifdef ASSERT
+  if (TraceNewVectors) {
+    tty->print("new Vector node: ");
+    pk->dump();
+  }
+#endif
   return pk;
 }
 
@@ -1273,16 +1475,15 @@
     // Insert extract operation
     _igvn.hash_delete(def);
     int def_pos = alignment(def) / data_size(def);
-    const Type* def_t = velt_type(def);
 
-    Node* ex = ExtractNode::make(_phase->C, def, def_pos, def_t);
+    Node* ex = ExtractNode::make(_phase->C, def, def_pos, velt_basic_type(def));
     _phase->_igvn.register_new_node_with_optimizer(ex);
     _phase->set_ctrl(ex, _phase->get_ctrl(def));
     _igvn.replace_input_of(use, idx, ex);
     _igvn._worklist.push(def);
 
     bb_insert_after(ex, bb_idx(def));
-    set_velt_type(ex, def_t);
+    set_velt_type(ex, velt_type(def));
   }
 }
 
@@ -1509,10 +1710,7 @@
   // Initial type
   for (int i = 0; i < _block.length(); i++) {
     Node* n = _block.at(i);
-    const Type* t  = n->is_Mem() ? Type::get_const_basic_type(n->as_Mem()->memory_type())
-                                 : _igvn.type(n);
-    const Type* vt = container_type(t);
-    set_velt_type(n, vt);
+    set_velt_type(n, container_type(n));
   }
 
   // Propagate narrowed type backwards through operations
@@ -1543,7 +1741,7 @@
             bool same_type = true;
             for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
               Node *use = in->fast_out(k);
-              if (!in_bb(use) || velt_type(use) != vt) {
+              if (!in_bb(use) || !same_velt_type(use, n)) {
                 same_type = false;
                 break;
               }
@@ -1575,20 +1773,24 @@
   if (!p.valid()) {
     return bottom_align;
   }
+  int vw = vector_width_in_bytes(s);
+  if (vw < 2) {
+    return bottom_align; // No vectors for this type
+  }
   int offset  = p.offset_in_bytes();
   offset     += iv_adjust_in_bytes;
-  int off_rem = offset % vector_width_in_bytes();
-  int off_mod = off_rem >= 0 ? off_rem : off_rem + vector_width_in_bytes();
+  int off_rem = offset % vw;
+  int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
   return off_mod;
 }
 
 //---------------------------container_type---------------------------
 // Smallest type containing range of values
-const Type* SuperWord::container_type(const Type* t) {
-  const Type* tp = t->make_ptr();
-  if (tp && tp->isa_aryptr()) {
-    t = tp->is_aryptr()->elem();
+const Type* SuperWord::container_type(Node* n) {
+  if (n->is_Mem()) {
+    return Type::get_const_basic_type(n->as_Mem()->memory_type());
   }
+  const Type* t = _igvn.type(n);
   if (t->basic_type() == T_INT) {
     if (t->higher_equal(TypeInt::BOOL))  return TypeInt::BOOL;
     if (t->higher_equal(TypeInt::BYTE))  return TypeInt::BYTE;
@@ -1599,11 +1801,22 @@
   return t;
 }
 
+bool SuperWord::same_velt_type(Node* n1, Node* n2) {
+  const Type* vt1 = velt_type(n1);
+  const Type* vt2 = velt_type(n2);
+  if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
+    // Compare vectors element sizes for integer types.
+    return data_size(n1) == data_size(n2);
+  }
+  return vt1 == vt2;
+}
+
 //-------------------------vector_opd_range-----------------------
 // (Start, end] half-open range defining which operands are vector
 void SuperWord::vector_opd_range(Node* n, uint* start, uint* end) {
   switch (n->Opcode()) {
-  case Op_LoadB:   case Op_LoadUS:
+  case Op_LoadB:   case Op_LoadUB:
+  case Op_LoadS:   case Op_LoadUS:
   case Op_LoadI:   case Op_LoadL:
   case Op_LoadF:   case Op_LoadD:
   case Op_LoadP:
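
The integer branch of same_velt_type above compares element sizes rather than exact types, so subword values that were promoted to int still pair up. In effect:

    #include <cstdio>

    int main() {
      int size_bool = 1, size_byte = 1, size_short = 2, size_int = 4;
      printf("BOOL vs BYTE:  %d\n", size_bool == size_byte);  // 1: same velt type
      printf("SHORT vs INT:  %d\n", size_short == size_int);  // 0: kept apart
      return 0;
    }
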
@@ -1721,6 +1934,7 @@
   assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
 
   SWPointer align_to_ref_p(align_to_ref, this);
+  assert(align_to_ref_p.valid(), "sanity");
 
   // Given:
   //     lim0 == original pre loop limit
@@ -1773,10 +1987,12 @@
   //     N = (V - (e - lim0)) % V
   //     lim = lim0 - (V - (e - lim0)) % V
 
+  int vw = vector_width_in_bytes(align_to_ref);
   int stride   = iv_stride();
   int scale    = align_to_ref_p.scale_in_bytes();
   int elt_size = align_to_ref_p.memory_size();
-  int v_align  = vector_width_in_bytes() / elt_size;
+  int v_align  = vw / elt_size;
+  assert(v_align > 1, "sanity");
   int k        = align_to_ref_p.offset_in_bytes() / elt_size;
 
   Node *kn   = _igvn.intcon(k);
@@ -1796,6 +2012,25 @@
     _phase->_igvn.register_new_node_with_optimizer(e);
     _phase->set_ctrl(e, pre_ctrl);
   }
+  if (vw > ObjectAlignmentInBytes) {
+    // incorporate base e +/- base && Mask >>> log2(elt)
+    Node* mask = _igvn.MakeConX(~(-1 << exact_log2(vw)));
+    Node* xbase = new(_phase->C, 2) CastP2XNode(NULL, align_to_ref_p.base());
+    _phase->_igvn.register_new_node_with_optimizer(xbase);
+    Node* masked_xbase  = new (_phase->C, 3) AndXNode(xbase, mask);
+    _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
+#ifdef _LP64
+    masked_xbase  = new (_phase->C, 2) ConvL2INode(masked_xbase);
+    _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
+#endif
+    Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
+    Node* bref     = new (_phase->C, 3) URShiftINode(masked_xbase, log2_elt);
+    _phase->_igvn.register_new_node_with_optimizer(bref);
+    _phase->set_ctrl(bref, pre_ctrl);
+    e = new (_phase->C, 3) AddINode(e, bref);
+    _phase->_igvn.register_new_node_with_optimizer(e);
+    _phase->set_ctrl(e, pre_ctrl);
+  }
 
   // compute e +/- lim0
   if (scale < 0) {
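
When the vector width exceeds ObjectAlignmentInBytes, the pre-loop limit must also absorb the base address's own misalignment, which is what the CastP2X/AndX/URShiftI chain above computes. The same arithmetic in isolation, with an assumed example address:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uintptr_t base = 0x1018;                 // assumed: guaranteed only 8-byte aligned
      int vw = 32, elt_size = 4;               // assumed 256-bit vectors of 4-byte ints
      uintptr_t mask = ~(uintptr_t)(-1 << 5);  // exact_log2(32) == 5 -> keep low 5 bits
      int bref = (int)((base & mask) >> 2);    // exact_log2(4) == 2 -> count in elements
      printf("base adds %d elements to e\n", bref);  // (0x18 >> 2) == 6
      return 0;
    }
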
--- a/src/share/vm/opto/superword.hpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/superword.hpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -264,8 +264,14 @@
                                      _iv = lp->as_CountedLoop()->phi()->as_Phi(); }
   int      iv_stride()             { return lp()->as_CountedLoop()->stride_con(); }
 
-  int vector_width_in_bytes()      { return Matcher::vector_width_in_bytes(); }
-
+  int vector_width(Node* n) {
+    BasicType bt = velt_basic_type(n);
+    return MIN2(ABS(iv_stride()), Matcher::max_vector_size(bt));
+  }
+  int vector_width_in_bytes(Node* n) {
+    BasicType bt = velt_basic_type(n);
+    return vector_width(n)*type2aelembytes(bt);
+  }
   MemNode* align_to_ref()            { return _align_to_ref; }
   void  set_align_to_ref(MemNode* m) { _align_to_ref = m; }
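
The two accessors above bound the vector length by both the loop stride and the matcher's per-type limit; for instance, assuming iv_stride() == 4 and Matcher::max_vector_size(T_INT) == 4 on a 128-bit target:

    #include <algorithm>
    #include <cstdlib>

    int main() {
      int iv_stride = 4;          // assumed loop stride in elements
      int max_vector_size = 4;    // assumed: four ints per 128-bit register
      int vector_width = std::min(std::abs(iv_stride), max_vector_size);  // 4 elements
      int vector_width_in_bytes = vector_width * 4;                       // 16 bytes
      return vector_width_in_bytes;
    }
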
 
@@ -298,7 +304,9 @@
 
   // vector element type
   const Type* velt_type(Node* n)             { return _node_info.adr_at(bb_idx(n))->_velt_type; }
+  BasicType velt_basic_type(Node* n)         { return velt_type(n)->array_element_basic_type(); }
   void set_velt_type(Node* n, const Type* t) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_velt_type = t; }
+  bool same_velt_type(Node* n1, Node* n2);
 
   // my_pack
   Node_List* my_pack(Node* n)                { return !in_bb(n) ? NULL : _node_info.adr_at(bb_idx(n))->_my_pack; }
@@ -311,7 +319,9 @@
   // Find the adjacent memory references and create pack pairs for them.
   void find_adjacent_refs();
   // Find a memory reference to align the loop induction variable to.
-  void find_align_to_ref(Node_List &memops);
+  MemNode* find_align_to_ref(Node_List &memops);
+  // Calculate the loop's iv adjustment for this memory op.
+  int get_iv_adjustment(MemNode* mem);
   // Can the preloop align the reference to position zero in the vector?
   bool ref_is_alignable(SWPointer& p);
   // Construct dependency graph.
@@ -394,7 +404,7 @@
   // (Start, end] half-open range defining which operands are vector
   void vector_opd_range(Node* n, uint* start, uint* end);
   // Smallest type containing range of values
-  static const Type* container_type(const Type* t);
+  const Type* container_type(Node* n);
   // Adjust pre-loop limit so that in main loop, a load/store reference
   // to align_to_ref will be a position zero in the vector.
   void align_initial_loop_index(MemNode* align_to_ref);
@@ -462,6 +472,7 @@
 
   Node* base()            { return _base; }
   Node* adr()             { return _adr; }
+  MemNode* mem()          { return _mem; }
   int   scale_in_bytes()  { return _scale; }
   Node* invar()           { return _invar; }
   bool  negate_invar()    { return _negate_invar; }
--- a/src/share/vm/opto/type.cpp	Thu Jun 28 04:21:07 2012 -0400
+++ b/src/share/vm/opto/type.cpp	Thu Jun 28 10:35:28 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -60,6 +60,10 @@
 
   T_ILLEGAL,    // Tuple
   T_ARRAY,      // Array
+  T_ILLEGAL,    // VectorS
+  T_ILLEGAL,    // VectorD
+  T_ILLEGAL,    // VectorX
+  T_ILLEGAL,    // VectorY
 
   T_ADDRESS,    // AnyPtr   // shows up in factory methods for NULL_PTR
   T_ADDRESS,    // RawPtr
@@ -414,6 +418,24 @@
   // get_zero_type() should not happen for T_CONFLICT
   _zero_type[T_CONFLICT]= NULL;
 
+  // Vector predefined types; this requires _const_basic_type[] to be initialized.
+  if (Matcher::vector_size_supported(T_BYTE,4)) {
+    TypeVect::VECTS = TypeVect::make(T_BYTE,4);
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,2)) {
+    TypeVect::VECTD = TypeVect::make(T_FLOAT,2);
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,4)) {
+    TypeVect::VECTX = TypeVect::make(T_FLOAT,4);
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,8)) {
+    TypeVect::VECTY = TypeVect::make(T_FLOAT,8);
+  }
+  mreg2type[Op_VecS] = TypeVect::VECTS;
+  mreg2type[Op_VecD] = TypeVect::VECTD;
+  mreg2type[Op_VecX] = TypeVect::VECTX;
+  mreg2type[Op_VecY] = TypeVect::VECTY;
+
   // Restore working type arena.
   current->set_type_arena(save);
   current->set_type_dict(NULL);
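
The four predefined types correspond to total vector sizes of 4, 8, 16 and 32 bytes; the element type passed at initialization is just a canonical representative. A sketch of how VM code would request one, assuming the matcher supports 4-float vectors:

    // const TypeVect* vt = TypeVect::make(T_FLOAT, 4);  // 16 bytes -> VECTX / Op_VecX
    // assert(vt->length() == 4, "four elements");
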
@@ -668,6 +690,10 @@
 
   Bad,          // Tuple - handled in v-call
   Bad,          // Array - handled in v-call
+  Bad,          // VectorS - handled in v-call
+  Bad,          // VectorD - handled in v-call
+  Bad,          // VectorX - handled in v-call
+  Bad,          // VectorY - handled in v-call
 
   Bad,          // AnyPtr - handled in v-call
   Bad,          // RawPtr - handled in v-call
@@ -728,8 +754,8 @@
 //------------------------------data-------------------------------------------
 const char * const Type::msg[Type::lastype] = {
   "bad","control","top","int:","long:","half", "narrowoop:",
-  "tuple:", "aryptr",
-  "anyptr:", "rawptr:", "java:", "inst:", "ary:", "klass:",
+  "tuple:", "array:", "vectors:", "vectord:", "vectorx:", "vectory:",
+  "anyptr:", "rawptr:", "java:", "inst:", "aryptr:", "klass:",
   "func", "abIO", "return_address", "memory",
   "float_top", "ftcon:", "float",
   "double_top", "dblcon:", "double",
@@ -790,7 +816,7 @@
 //------------------------------isa_oop_ptr------------------------------------
 // Return true if type is an oop pointer type.  False for raw pointers.
 static char isa_oop_ptr_tbl[Type::lastype] = {
-  0,0,0,0,0,0,0/*narrowoop*/,0/*tuple*/, 0/*ary*/,
+  0,0,0,0,0,0,0/*narrowoop*/,0/*tuple*/, 0/*array*/, 0, 0, 0, 0/*vector*/,
   0/*anyptr*/,0/*rawptr*/,1/*OopPtr*/,1/*InstPtr*/,1/*AryPtr*/,1/*KlassPtr*/,
   0/*func*/,0,0/*return_address*/,0,
   /*floats*/0,0,0, /*doubles*/0,0,0,
@@ -1926,6 +1952,121 @@
   return false;
 }
 
+//==============================TypeVect=======================================
+// Convenience common pre-built types.
+const TypeVect *TypeVect::VECTS = NULL; //  32-bit vectors
+const TypeVect *TypeVect::VECTD = NULL; //  64-bit vectors
+const TypeVect *TypeVect::VECTX = NULL; // 128-bit vectors
+const TypeVect *TypeVect::VECTY = NULL; // 256-bit vectors
+
+//------------------------------make-------------------------------------------
+const TypeVect* TypeVect::make(const Type *elem, uint length) {
+  BasicType elem_bt = elem->array_element_basic_type();
+  assert(is_java_primitive(elem_bt), "only primitive types in vector");
+  assert(length > 1 && is_power_of_2(length), "vector length is power of 2");
+  assert(Matcher::vector_size_supported(elem_bt, length), "length in range");
+  int size = length * type2aelembytes(elem_bt);
+  switch (Matcher::vector_ideal_reg(size)) {
+  case Op_VecS:
+    return (TypeVect*)(new TypeVectS(elem, length))->hashcons();
+  case Op_VecD:
+  case Op_RegD:
+    return (TypeVect*)(new TypeVectD(elem, length))->hashcons();
+  case Op_VecX:
+    return (TypeVect*)(new TypeVectX(elem, length))->hashcons();
+  case Op_VecY:
+    return (TypeVect*)(new TypeVectY(elem, length))->hashcons();
+  }
+  ShouldNotReachHere();
+  return NULL;
+}
+
+//------------------------------meet-------------------------------------------
+// Compute the MEET of two types.  It returns a new Type object.
+const Type *TypeVect::xmeet( const Type *t ) const {
+  // Perform a fast test for common case; meeting the same types together.
+  if( this == t ) return this;  // Meeting same type-rep?
+
+  // Current "this->_base" is Vector
+  switch (t->base()) {          // switch on original type
+
+  case Bottom:                  // Ye Olde Default
+    return t;
+
+  default:                      // All else is a mistake
+    typerr(t);
+
+  case VectorS:
+  case VectorD:
+  case VectorX:
+  case VectorY: {                // Meeting 2 vectors?
+    const TypeVect* v = t->is_vect();
+    assert(  base() == v->base(), "");
+    assert(length() == v->length(), "");
+    assert(element_basic_type() == v->element_basic_type(), "");
+    return TypeVect::make(_elem->xmeet(v->_elem), _length);