changeset 3074:513351373923 hs23-b10

Merge
author amurillo
date Sat, 14 Jan 2012 00:47:46 -0800
parents ed621d125d02 e504fd26c073
children 24727fb37561
files
diffstat 170 files changed, 8971 insertions(+), 5287 deletions(-) [+]
line wrap: on
line diff
--- a/make/bsd/makefiles/adlc.make	Fri Jan 13 10:05:33 2012 -0800
+++ b/make/bsd/makefiles/adlc.make	Sat Jan 14 00:47:46 2012 -0800
@@ -39,9 +39,16 @@
 
 SOURCE.AD = $(OUTDIR)/$(OS)_$(Platform_arch_model).ad 
 
-SOURCES.AD = \
+ifeq ("${Platform_arch_model}", "${Platform_arch}")
+  SOURCES.AD = \
   $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
   $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+else
+  SOURCES.AD = \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+endif
 
 EXEC	= $(OUTDIR)/adlc
 
--- a/make/hotspot_version	Fri Jan 13 10:05:33 2012 -0800
+++ b/make/hotspot_version	Sat Jan 14 00:47:46 2012 -0800
@@ -35,7 +35,7 @@
 
 HS_MAJOR_VER=23
 HS_MINOR_VER=0
-HS_BUILD_NUMBER=09
+HS_BUILD_NUMBER=10
 
 JDK_MAJOR_VER=1
 JDK_MINOR_VER=8
--- a/make/linux/makefiles/adlc.make	Fri Jan 13 10:05:33 2012 -0800
+++ b/make/linux/makefiles/adlc.make	Sat Jan 14 00:47:46 2012 -0800
@@ -39,9 +39,16 @@
 
 SOURCE.AD = $(OUTDIR)/$(OS)_$(Platform_arch_model).ad 
 
-SOURCES.AD = \
+ifeq ("${Platform_arch_model}", "${Platform_arch}")
+  SOURCES.AD = \
   $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
   $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+else
+  SOURCES.AD = \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+endif
 
 EXEC	= $(OUTDIR)/adlc
 
--- a/make/solaris/makefiles/adlc.make	Fri Jan 13 10:05:33 2012 -0800
+++ b/make/solaris/makefiles/adlc.make	Sat Jan 14 00:47:46 2012 -0800
@@ -40,9 +40,16 @@
 
 SOURCE.AD = $(OUTDIR)/$(OS)_$(Platform_arch_model).ad 
 
-SOURCES.AD = \
+ifeq ("${Platform_arch_model}", "${Platform_arch}")
+  SOURCES.AD = \
   $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
   $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+else
+  SOURCES.AD = \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+endif
 
 EXEC	= $(OUTDIR)/adlc
 
--- a/make/windows/makefiles/adlc.make	Fri Jan 13 10:05:33 2012 -0800
+++ b/make/windows/makefiles/adlc.make	Sat Jan 14 00:47:46 2012 -0800
@@ -53,6 +53,17 @@
   /I "$(WorkSpace)\src\os\windows\vm" \
   /I "$(WorkSpace)\src\cpu\$(Platform_arch)\vm"
 
+!if "$(Platform_arch_model)" == "$(Platform_arch)"
+SOURCES_AD=\
+  $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch_model).ad \
+  $(WorkSpace)/src/os_cpu/windows_$(Platform_arch)/vm/windows_$(Platform_arch_model).ad
+!else
+SOURCES_AD=\
+  $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch_model).ad \
+  $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch).ad \
+  $(WorkSpace)/src/os_cpu/windows_$(Platform_arch)/vm/windows_$(Platform_arch_model).ad
+!endif
+
 # NOTE! If you add any files here, you must also update GENERATED_NAMES_IN_DIR
 # and ProjectCreatorIDEOptions in projectcreator.make. 
 GENERATED_NAMES=\
@@ -105,7 +116,6 @@
 	$(ADLC) $(ADLCFLAGS) $(Platform_arch_model).ad
 	mv $(GENERATED_NAMES) $(AdlcOutDir)/
 
-$(Platform_arch_model).ad: $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch_model).ad $(WorkSpace)/src/os_cpu/windows_$(Platform_arch)/vm/windows_$(Platform_arch_model).ad
+$(Platform_arch_model).ad: $(SOURCES_AD)
 	rm -f $(Platform_arch_model).ad
-	cat $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch_model).ad  \
-	    $(WorkSpace)/src/os_cpu/windows_$(Platform_arch)/vm/windows_$(Platform_arch_model).ad >$(Platform_arch_model).ad
+	cat $(SOURCES_AD) >$(Platform_arch_model).ad
--- a/src/cpu/sparc/vm/assembler_sparc.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/sparc/vm/assembler_sparc.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -3036,10 +3036,8 @@
                                                    Label* L_failure,
                                                    Label* L_slow_path,
                                         RegisterOrConstant super_check_offset) {
-  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_super_cache_offset_in_bytes());
-  int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                    Klass::super_check_offset_offset_in_bytes());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+  int sco_offset = in_bytes(Klass::super_check_offset_offset());
 
   bool must_load_sco  = (super_check_offset.constant_or_zero() == -1);
   bool need_slow_path = (must_load_sco ||
@@ -3159,10 +3157,8 @@
   assert(label_nulls <= 1, "at most one NULL in the batch");
 
   // a couple of useful fields in sub_klass:
-  int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_supers_offset_in_bytes());
-  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_super_cache_offset_in_bytes());
+  int ss_offset = in_bytes(Klass::secondary_supers_offset());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
 
   // Do a linear scan of the secondary super-klass chain.
   // This code is rarely used, so simplicity is a virtue here.
@@ -3336,7 +3332,7 @@
   cmp_and_brx_short(temp_reg, markOopDesc::biased_lock_pattern, Assembler::notEqual, Assembler::pn, cas_label);
 
   load_klass(obj_reg, temp_reg);
-  ld_ptr(Address(temp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
+  ld_ptr(Address(temp_reg, Klass::prototype_header_offset()), temp_reg);
   or3(G2_thread, temp_reg, temp_reg);
   xor3(mark_reg, temp_reg, temp_reg);
   andcc(temp_reg, ~((int) markOopDesc::age_mask_in_place), temp_reg);
@@ -3413,7 +3409,7 @@
   // FIXME: due to a lack of registers we currently blow away the age
   // bits in this situation. Should attempt to preserve them.
   load_klass(obj_reg, temp_reg);
-  ld_ptr(Address(temp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
+  ld_ptr(Address(temp_reg, Klass::prototype_header_offset()), temp_reg);
   or3(G2_thread, temp_reg, temp_reg);
   casn(mark_addr.base(), mark_reg, temp_reg);
   // If the biasing toward our thread failed, this means that
@@ -3443,7 +3439,7 @@
   // FIXME: due to a lack of registers we currently blow away the age
   // bits in this situation. Should attempt to preserve them.
   load_klass(obj_reg, temp_reg);
-  ld_ptr(Address(temp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
+  ld_ptr(Address(temp_reg, Klass::prototype_header_offset()), temp_reg);
   casn(mark_addr.base(), mark_reg, temp_reg);
   // Fall through to the normal CAS-based lock, because no matter what
   // the result of the above CAS, some thread must have succeeded in
--- a/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -302,7 +302,7 @@
     assert(_obj != noreg, "must be a valid register");
     assert(_oop_index >= 0, "must have oop index");
     __ load_heap_oop(_obj, java_lang_Class::klass_offset_in_bytes(), G3);
-    __ ld_ptr(G3, instanceKlass::init_thread_offset_in_bytes() + sizeof(klassOopDesc), G3);
+    __ ld_ptr(G3, in_bytes(instanceKlass::init_thread_offset()), G3);
     __ cmp_and_brx_short(G2_thread, G3, Assembler::notEqual, Assembler::pn, call_patch);
 
     // load_klass patches may execute the patched code before it's
@@ -471,7 +471,7 @@
 
   __ load_klass(src_reg, tmp_reg);
 
-  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset_in_bytes() + sizeof(oopDesc));
+  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset());
   __ ld(ref_type_adr, tmp_reg);
 
   // _reference_type field is of type ReferenceType (enum)
--- a/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -2202,8 +2202,7 @@
           } else if (!(flags & LIR_OpArrayCopy::dst_objarray)) {
             __ load_klass(dst, tmp);
           }
-          int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-            Klass::layout_helper_offset_in_bytes();
+          int lh_offset = in_bytes(Klass::layout_helper_offset());
 
           __ lduw(tmp, lh_offset, tmp2);
 
@@ -2238,12 +2237,10 @@
         __ mov(length, len);
         __ load_klass(dst, tmp);
 
-        int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
-                         objArrayKlass::element_klass_offset_in_bytes());
+        int ek_offset = in_bytes(objArrayKlass::element_klass_offset());
         __ ld_ptr(tmp, ek_offset, super_k);
 
-        int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                          Klass::super_check_offset_offset_in_bytes());
+        int sco_offset = in_bytes(Klass::super_check_offset_offset());
         __ lduw(super_k, sco_offset, chk_off);
 
         __ call_VM_leaf(tmp, copyfunc_addr);
@@ -2455,8 +2452,8 @@
          op->obj()->as_register()   == O0 &&
          op->klass()->as_register() == G5, "must be");
   if (op->init_check()) {
-    __ ld(op->klass()->as_register(),
-          instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc),
+    __ ldub(op->klass()->as_register(),
+          in_bytes(instanceKlass::init_state_offset()),
           op->tmp1()->as_register());
     add_debug_info_for_null_check_here(op->stub()->info());
     __ cmp(op->tmp1()->as_register(), instanceKlass::fully_initialized);
@@ -2627,7 +2624,7 @@
   } else {
     bool need_slow_path = true;
     if (k->is_loaded()) {
-      if (k->super_check_offset() != sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes())
+      if ((int) k->super_check_offset() != in_bytes(Klass::secondary_super_cache_offset()))
         need_slow_path = false;
       // perform the fast part of the checking logic
       __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, noreg,
@@ -2731,7 +2728,7 @@
     __ load_klass(value, klass_RInfo);
 
     // get instance klass
-    __ ld_ptr(Address(k_RInfo, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)), k_RInfo);
+    __ ld_ptr(Address(k_RInfo, objArrayKlass::element_klass_offset()), k_RInfo);
     // perform the fast part of the checking logic
     __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, O7, success_target, failure_target, NULL);
 
--- a/src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -181,7 +181,7 @@
 void C1_MacroAssembler::initialize_header(Register obj, Register klass, Register len, Register t1, Register t2) {
   assert_different_registers(obj, klass, len, t1, t2);
   if (UseBiasedLocking && !len->is_valid()) {
-    ld_ptr(klass, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes(), t1);
+    ld_ptr(klass, in_bytes(Klass::prototype_header_offset()), t1);
   } else {
     set((intx)markOopDesc::prototype(), t1);
   }
@@ -252,7 +252,7 @@
 #ifdef ASSERT
   {
     Label ok;
-    ld(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes(), t1);
+    ld(klass, in_bytes(Klass::layout_helper_offset()), t1);
     if (var_size_in_bytes != noreg) {
       cmp_and_brx_short(t1, var_size_in_bytes, Assembler::equal, Assembler::pt, ok);
     } else {
--- a/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -398,14 +398,14 @@
 
           if (id == fast_new_instance_init_check_id) {
             // make sure the klass is initialized
-            __ ld(G5_klass, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc), G3_t1);
+            __ ldub(G5_klass, in_bytes(instanceKlass::init_state_offset()), G3_t1);
             __ cmp_and_br_short(G3_t1, instanceKlass::fully_initialized, Assembler::notEqual, Assembler::pn, slow_path);
           }
 #ifdef ASSERT
           // assert object can be fast path allocated
           {
             Label ok, not_ok;
-          __ ld(G5_klass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc), G1_obj_size);
+          __ ld(G5_klass, in_bytes(Klass::layout_helper_offset()), G1_obj_size);
           // make sure it's an instance (LH > 0)
           __ cmp_and_br_short(G1_obj_size, 0, Assembler::lessEqual, Assembler::pn, not_ok);
           __ btst(Klass::_lh_instance_slow_path_bit, G1_obj_size);
@@ -425,7 +425,7 @@
           __ bind(retry_tlab);
 
           // get the instance size
-          __ ld(G5_klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes(), G1_obj_size);
+          __ ld(G5_klass, in_bytes(Klass::layout_helper_offset()), G1_obj_size);
 
           __ tlab_allocate(O0_obj, G1_obj_size, 0, G3_t1, slow_path);
 
@@ -437,7 +437,7 @@
 
           __ bind(try_eden);
           // get the instance size
-          __ ld(G5_klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes(), G1_obj_size);
+          __ ld(G5_klass, in_bytes(Klass::layout_helper_offset()), G1_obj_size);
           __ eden_allocate(O0_obj, G1_obj_size, 0, G3_t1, G4_t2, slow_path);
           __ incr_allocated_bytes(G1_obj_size, G3_t1, G4_t2);
 
@@ -471,8 +471,7 @@
         Register G4_length = G4; // Incoming
         Register O0_obj   = O0; // Outgoing
 
-        Address klass_lh(G5_klass, ((klassOopDesc::header_size() * HeapWordSize)
-                                    + Klass::layout_helper_offset_in_bytes()));
+        Address klass_lh(G5_klass, Klass::layout_helper_offset());
         assert(Klass::_lh_header_size_shift % BitsPerByte == 0, "bytewise");
         assert(Klass::_lh_header_size_mask == 0xFF, "bytewise");
         // Use this offset to pick out an individual byte of the layout_helper:
@@ -592,7 +591,7 @@
         Label register_finalizer;
         Register t = O1;
         __ load_klass(O0, t);
-        __ ld(t, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc), t);
+        __ ld(t, in_bytes(Klass::access_flags_offset()), t);
         __ set(JVM_ACC_HAS_FINALIZER, G3);
         __ andcc(G3, t, G0);
         __ br(Assembler::notZero, false, Assembler::pt, register_finalizer);
--- a/src/cpu/sparc/vm/cppInterpreter_sparc.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/sparc/vm/cppInterpreter_sparc.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -766,7 +766,7 @@
       // get native function entry point(O0 is a good temp until the very end)
        ld_ptr(Address(G5_method, 0, in_bytes(methodOopDesc::native_function_offset())), O0);
     // for static methods insert the mirror argument
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
 
     __ ld_ptr(Address(G5_method, 0, in_bytes(methodOopDesc:: constants_offset())), O1);
     __ ld_ptr(Address(O1, 0, constantPoolOopDesc::pool_holder_offset_in_bytes()), O1);
@@ -1173,7 +1173,7 @@
     __ btst(JVM_ACC_SYNCHRONIZED, O1);
     __ br( Assembler::zero, false, Assembler::pt, done);
 
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ delayed()->btst(JVM_ACC_STATIC, O1);
     __ ld_ptr(XXX_STATE(_locals), O1);
     __ br( Assembler::zero, true, Assembler::pt, got_obj);
--- a/src/cpu/sparc/vm/methodHandles_sparc.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/sparc/vm/methodHandles_sparc.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -1098,7 +1098,7 @@
   Address G3_amh_argument ( G3_method_handle, java_lang_invoke_AdapterMethodHandle::argument_offset_in_bytes());
   Address G3_amh_conversion(G3_method_handle, java_lang_invoke_AdapterMethodHandle::conversion_offset_in_bytes());
 
-  const int java_mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+  const int java_mirror_offset = in_bytes(Klass::java_mirror_offset());
 
   if (have_entry(ek)) {
     __ nop();  // empty stubs make SG sick
--- a/src/cpu/sparc/vm/sparc.ad	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/sparc/vm/sparc.ad	Sat Jan 14 00:47:46 2012 -0800
@@ -6773,6 +6773,16 @@
   ins_pipe(empty);
 %}
 
+instruct membar_storestore() %{
+  match(MemBarStoreStore);
+  ins_cost(0);
+
+  size(0);
+  format %{ "!MEMBAR-storestore (empty encoding)" %}
+  ins_encode( );
+  ins_pipe(empty);
+%}
+
 //----------Register Move Instructions-----------------------------------------
 instruct roundDouble_nop(regD dst) %{
   match(Set dst (RoundDouble dst));
@@ -9273,6 +9283,7 @@
 // (compare 'operand indIndex' and 'instruct addP_reg_reg' above)
 instruct jumpXtnd(iRegX switch_val, o7RegI table) %{
   match(Jump switch_val);
+  effect(TEMP table);
 
   ins_cost(350);
 
@@ -10263,24 +10274,24 @@
 // ============================================================================
 // inlined locking and unlocking
 
-instruct cmpFastLock(flagsRegP pcc, iRegP object, iRegP box, iRegP scratch2, o7RegP scratch ) %{
+instruct cmpFastLock(flagsRegP pcc, iRegP object, o1RegP box, iRegP scratch2, o7RegP scratch ) %{
   match(Set pcc (FastLock object box));
 
-  effect(KILL scratch, TEMP scratch2);
+  effect(TEMP scratch2, USE_KILL box, KILL scratch);
   ins_cost(100);
 
-  format %{ "FASTLOCK  $object, $box; KILL $scratch, $scratch2, $box" %}
+  format %{ "FASTLOCK  $object,$box\t! kills $box,$scratch,$scratch2" %}
   ins_encode( Fast_Lock(object, box, scratch, scratch2) );
   ins_pipe(long_memory_op);
 %}
 
 
-instruct cmpFastUnlock(flagsRegP pcc, iRegP object, iRegP box, iRegP scratch2, o7RegP scratch ) %{
+instruct cmpFastUnlock(flagsRegP pcc, iRegP object, o1RegP box, iRegP scratch2, o7RegP scratch ) %{
   match(Set pcc (FastUnlock object box));
-  effect(KILL scratch, TEMP scratch2);
+  effect(TEMP scratch2, USE_KILL box, KILL scratch);
   ins_cost(100);
 
-  format %{ "FASTUNLOCK  $object, $box; KILL $scratch, $scratch2, $box" %}
+  format %{ "FASTUNLOCK  $object,$box\t! kills $box,$scratch,$scratch2" %}
   ins_encode( Fast_Unlock(object, box, scratch, scratch2) );
   ins_pipe(long_memory_op);
 %}
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -3046,8 +3046,7 @@
     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
     //
 
-    int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-                    Klass::layout_helper_offset_in_bytes();
+    int lh_offset = in_bytes(Klass::layout_helper_offset());
 
     // Load 32-bits signed value. Use br() instruction with it to check icc.
     __ lduw(G3_src_klass, lh_offset, G5_lh);
@@ -3194,15 +3193,13 @@
                                  G4_dst_klass, G3_src_klass);
 
       // Generate the type check.
-      int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                        Klass::super_check_offset_offset_in_bytes());
+      int sco_offset = in_bytes(Klass::super_check_offset_offset());
       __ lduw(G4_dst_klass, sco_offset, sco_temp);
       generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
                           O5_temp, L_plain_copy);
 
       // Fetch destination element klass from the objArrayKlass header.
-      int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
-                       objArrayKlass::element_klass_offset_in_bytes());
+      int ek_offset = in_bytes(objArrayKlass::element_klass_offset());
 
       // the checkcast_copy loop needs two extra arguments:
       __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
@@ -3414,6 +3411,9 @@
       generate_throw_exception("WrongMethodTypeException throw_exception",
                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_WrongMethodTypeException),
                                G5_method_type, G3_method_handle);
+
+    // Build this early so it's available for the interpreter.
+    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
   }
 
 
@@ -3427,7 +3427,6 @@
     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
-    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
 
     StubRoutines::_handler_for_unsafe_access_entry =
       generate_handler_for_unsafe_access();
--- a/src/cpu/sparc/vm/templateInterpreter_sparc.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/sparc/vm/templateInterpreter_sparc.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -366,7 +366,7 @@
 
   // get synchronization object to O0
   { Label done;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ btst(JVM_ACC_STATIC, O0);
     __ br( Assembler::zero, true, Assembler::pt, done);
     __ delayed()->ld_ptr(Llocals, Interpreter::local_offset_in_bytes(0), O0); // get receiver for not-static case
@@ -396,7 +396,6 @@
                                                          Register Rscratch,
                                                          Register Rscratch2) {
   const int page_size = os::vm_page_size();
-  Address saved_exception_pc(G2_thread, JavaThread::saved_exception_pc_offset());
   Label after_frame_check;
 
   assert_different_registers(Rframe_size, Rscratch, Rscratch2);
@@ -436,11 +435,19 @@
   // the bottom of the stack
   __ cmp_and_brx_short(SP, Rscratch, Assembler::greater, Assembler::pt, after_frame_check);
 
-  // Save the return address as the exception pc
-  __ st_ptr(O7, saved_exception_pc);
+  // the stack will overflow, throw an exception
 
-  // the stack will overflow, throw an exception
-  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_StackOverflowError));
+  // Note that SP is restored to sender's sp (in the delay slot). This
+  // is necessary if the sender's frame is an extended compiled frame
+  // (see gen_c2i_adapter()) and safer anyway in case of JSR292
+  // adaptations.
+
+  // Note also that the restored frame is not necessarily interpreted.
+  // Use the shared runtime version of the StackOverflowError.
+  assert(StubRoutines::throw_StackOverflowError_entry() != NULL, "stub not yet generated");
+  AddressLiteral stub(StubRoutines::throw_StackOverflowError_entry());
+  __ jump_to(stub, Rscratch);
+  __ delayed()->mov(O5_savedSP, SP);
 
   // if you get to here, then there is enough stack space
   __ bind( after_frame_check );
@@ -984,7 +991,7 @@
     // get native function entry point(O0 is a good temp until the very end)
     __ delayed()->ld_ptr(Lmethod, in_bytes(methodOopDesc::native_function_offset()), O0);
     // for static methods insert the mirror argument
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
 
     __ ld_ptr(Lmethod, methodOopDesc:: constants_offset(), O1);
     __ ld_ptr(O1, constantPoolOopDesc::pool_holder_offset_in_bytes(), O1);
--- a/src/cpu/sparc/vm/templateTable_sparc.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/sparc/vm/templateTable_sparc.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -888,7 +888,7 @@
 
   // do fast instanceof cache test
 
-  __ ld_ptr(O4,     sizeof(oopDesc) + objArrayKlass::element_klass_offset_in_bytes(),  O4);
+  __ ld_ptr(O4,     in_bytes(objArrayKlass::element_klass_offset()),  O4);
 
   assert(Otos_i == O0, "just checking");
 
@@ -2031,7 +2031,7 @@
     __ access_local_ptr(G3_scratch, Otos_i);
     __ load_klass(Otos_i, O2);
     __ set(JVM_ACC_HAS_FINALIZER, G3);
-    __ ld(O2, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc), O2);
+    __ ld(O2, in_bytes(Klass::access_flags_offset()), O2);
     __ andcc(G3, O2, G0);
     Label skip_register_finalizer;
     __ br(Assembler::zero, false, Assembler::pn, skip_register_finalizer);
@@ -3350,13 +3350,13 @@
   __ ld_ptr(Rscratch, Roffset, RinstanceKlass);
 
   // make sure klass is fully initialized:
-  __ ld(RinstanceKlass, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc), G3_scratch);
+  __ ldub(RinstanceKlass, in_bytes(instanceKlass::init_state_offset()), G3_scratch);
   __ cmp(G3_scratch, instanceKlass::fully_initialized);
   __ br(Assembler::notEqual, false, Assembler::pn, slow_case);
-  __ delayed()->ld(RinstanceKlass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc), Roffset);
+  __ delayed()->ld(RinstanceKlass, in_bytes(Klass::layout_helper_offset()), Roffset);
 
   // get instance_size in instanceKlass (already aligned)
-  //__ ld(RinstanceKlass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc), Roffset);
+  //__ ld(RinstanceKlass, in_bytes(Klass::layout_helper_offset()), Roffset);
 
   // make sure klass does not have has_finalizer, or is abstract, or interface or java/lang/Class
   __ btst(Klass::_lh_instance_slow_path_bit, Roffset);
@@ -3483,7 +3483,7 @@
   __ bind(initialize_header);
 
   if (UseBiasedLocking) {
-    __ ld_ptr(RinstanceKlass, Klass::prototype_header_offset_in_bytes() + sizeof(oopDesc), G4_scratch);
+    __ ld_ptr(RinstanceKlass, in_bytes(Klass::prototype_header_offset()), G4_scratch);
   } else {
     __ set((intptr_t)markOopDesc::prototype(), G4_scratch);
   }
--- a/src/cpu/x86/vm/assembler_x86.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -533,6 +533,19 @@
 
   case 0x0F: // movx..., etc.
     switch (0xFF & *ip++) {
+    case 0x3A: // pcmpestri
+      tail_size = 1;
+    case 0x38: // ptest, pmovzxbw
+      ip++; // skip opcode
+      debug_only(has_disp32 = true); // has both kinds of operands!
+      break;
+
+    case 0x70: // pshufd r, r/a, #8
+      debug_only(has_disp32 = true); // has both kinds of operands!
+    case 0x73: // psrldq r, #8
+      tail_size = 1;
+      break;
+
     case 0x12: // movlps
     case 0x28: // movaps
     case 0x2E: // ucomiss
@@ -543,9 +556,7 @@
     case 0x57: // xorps
     case 0x6E: // movd
     case 0x7E: // movd
-    case 0xAE: // ldmxcsr   a
-      // 64bit side says it these have both operands but that doesn't
-      // appear to be true
+    case 0xAE: // ldmxcsr, stmxcsr, fxrstor, fxsave, clflush
       debug_only(has_disp32 = true);
       break;
 
@@ -565,6 +576,12 @@
       // fall out of the switch to decode the address
       break;
 
+    case 0xC4: // pinsrw r, a, #8
+      debug_only(has_disp32 = true);
+    case 0xC5: // pextrw r, r, #8
+      tail_size = 1;  // the imm8
+      break;
+
     case 0xAC: // shrd r, a, #8
       debug_only(has_disp32 = true);
       tail_size = 1;  // the imm8
@@ -625,11 +642,44 @@
     tail_size = 1; // the imm8
     break;
 
-  case 0xE8: // call rdisp32
-  case 0xE9: // jmp  rdisp32
-    if (which == end_pc_operand)  return ip + 4;
-    assert(which == call32_operand, "call has no disp32 or imm");
-    return ip;
+  case 0xC4: // VEX_3bytes
+  case 0xC5: // VEX_2bytes
+    assert((UseAVX > 0), "shouldn't have VEX prefix");
+    assert(ip == inst+1, "no prefixes allowed");
+    // C4 and C5 are also used as opcodes for PINSRW and PEXTRW instructions
+    // but they have prefix 0x0F and processed when 0x0F processed above.
+    //
+    // In 32-bit mode the VEX first byte C4 and C5 alias onto LDS and LES
+    // instructions (these instructions are not supported in 64-bit mode).
+    // To distinguish them bits [7:6] are set in the VEX second byte since
+    // ModRM byte can not be of the form 11xxxxxx in 32-bit mode. To set
+    // those VEX bits REX and vvvv bits are inverted.
+    //
+    // Fortunately C2 doesn't generate these instructions so we don't need
+    // to check for them in product version.
+
+    // Check second byte
+    NOT_LP64(assert((0xC0 & *ip) == 0xC0, "shouldn't have LDS and LES instructions"));
+
+    // First byte
+    if ((0xFF & *inst) == VEX_3bytes) {
+      ip++; // third byte
+      is_64bit = ((VEX_W & *ip) == VEX_W);
+    }
+    ip++; // opcode
+    // To find the end of instruction (which == end_pc_operand).
+    switch (0xFF & *ip) {
+    case 0x61: // pcmpestri r, r/a, #8
+    case 0x70: // pshufd r, r/a, #8
+    case 0x73: // psrldq r, #8
+      tail_size = 1;  // the imm8
+      break;
+    default:
+      break;
+    }
+    ip++; // skip opcode
+    debug_only(has_disp32 = true); // has both kinds of operands!
+    break;
 
   case 0xD1: // sal a, 1; sar a, 1; shl a, 1; shr a, 1
   case 0xD3: // sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl
@@ -643,6 +693,12 @@
     debug_only(has_disp32 = true);
     break;
 
+  case 0xE8: // call rdisp32
+  case 0xE9: // jmp  rdisp32
+    if (which == end_pc_operand)  return ip + 4;
+    assert(which == call32_operand, "call has no disp32 or imm");
+    return ip;
+
   case 0xF0:                    // Lock
     assert(os::is_MP(), "only on MP");
     goto again_after_prefix;
@@ -918,9 +974,7 @@
 
 void Assembler::addsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x58);
   emit_byte(0xC0 | encode);
 }
@@ -928,18 +982,14 @@
 void Assembler::addsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x58);
   emit_operand(dst, src);
 }
 
 void Assembler::addss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x58);
   emit_byte(0xC0 | encode);
 }
@@ -947,13 +997,19 @@
 void Assembler::addss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x58);
   emit_operand(dst, src);
 }
 
+void Assembler::andl(Address dst, int32_t imm32) {
+  InstructionMark im(this);
+  prefix(dst);
+  emit_byte(0x81);
+  emit_operand(rsp, dst, 4);
+  emit_long(imm32);
+}
+
 void Assembler::andl(Register dst, int32_t imm32) {
   prefix(dst);
   emit_arith(0x81, 0xE0, dst, imm32);
@@ -974,13 +1030,33 @@
 void Assembler::andpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
   emit_byte(0x54);
   emit_operand(dst, src);
 }
 
+void Assembler::andpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x54);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::andps(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
+  emit_byte(0x54);
+  emit_operand(dst, src);
+}
+
+void Assembler::andps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
+  emit_byte(0x54);
+  emit_byte(0xC0 | encode);
+}
+
 void Assembler::bsfl(Register dst, Register src) {
   int encode = prefix_and_encode(dst->encoding(), src->encoding());
   emit_byte(0x0F);
@@ -1025,19 +1101,7 @@
 }
 
 void Assembler::call(Register dst) {
-  // This was originally using a 32bit register encoding
-  // and surely we want 64bit!
-  // this is a 32bit encoding but in 64bit mode the default
-  // operand size is 64bit so there is no need for the
-  // wide prefix. So prefix only happens if we use the
-  // new registers. Much like push/pop.
-  int x = offset();
-  // this may be true but dbx disassembles it as if it
-  // were 32bits...
-  // int encode = prefix_and_encode(dst->encoding());
-  // if (offset() != x) assert(dst->encoding() >= 8, "what?");
-  int encode = prefixq_and_encode(dst->encoding());
-
+  int encode = prefix_and_encode(dst->encoding());
   emit_byte(0xFF);
   emit_byte(0xD0 | encode);
 }
@@ -1157,87 +1221,119 @@
   // NOTE: dbx seems to decode this as comiss even though the
   // 0x66 is there. Strangly ucomisd comes out correct
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  comiss(dst, src);
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66);
+  emit_byte(0x2F);
+  emit_operand(dst, src);
+}
+
+void Assembler::comisd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
+  emit_byte(0x2F);
+  emit_byte(0xC0 | encode);
 }
 
 void Assembler::comiss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-
-  InstructionMark im(this);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_NONE);
   emit_byte(0x2F);
   emit_operand(dst, src);
 }
 
+void Assembler::comiss(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
+  emit_byte(0x2F);
+  emit_byte(0xC0 | encode);
+}
+
 void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
   emit_byte(0xE6);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
   emit_byte(0x5B);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x5A);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
+  emit_byte(0x5A);
+  emit_operand(dst, src);
+}
+
 void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x2A);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
+  emit_byte(0x2A);
+  emit_operand(dst, src);
+}
+
 void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x2A);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
+  emit_byte(0x2A);
+  emit_operand(dst, src);
+}
+
 void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x5A);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::cvtss2sd(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
+  emit_byte(0x5A);
+  emit_operand(dst, src);
+}
+
+
 void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
   emit_byte(0x2C);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::cvttss2sil(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
   emit_byte(0x2C);
   emit_byte(0xC0 | encode);
 }
@@ -1253,18 +1349,14 @@
 void Assembler::divsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x5E);
   emit_operand(dst, src);
 }
 
 void Assembler::divsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x5E);
   emit_byte(0xC0 | encode);
 }
@@ -1272,18 +1364,14 @@
 void Assembler::divss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x5E);
   emit_operand(dst, src);
 }
 
 void Assembler::divss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x5E);
   emit_byte(0xC0 | encode);
 }
@@ -1377,8 +1465,14 @@
   if (L.is_bound()) {
     const int short_size = 2;
     address entry = target(L);
-    assert(is8bit((intptr_t)entry - ((intptr_t)_code_pos + short_size)),
-           "Dispacement too large for a short jmp");
+#ifdef ASSERT
+    intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
+    intptr_t delta = short_branch_delta();
+    if (delta != 0) {
+      dist += (dist < 0 ? (-delta) :delta);
+    }
+    assert(is8bit(dist), "Dispacement too large for a short jmp");
+#endif
     intptr_t offs = (intptr_t)entry - (intptr_t)_code_pos;
     // 0111 tttn #8-bit disp
     emit_byte(0x70 | cc);
@@ -1444,9 +1538,15 @@
   if (L.is_bound()) {
     const int short_size = 2;
     address entry = target(L);
-    assert(is8bit((entry - _code_pos) + short_size),
-           "Dispacement too large for a short jmp");
     assert(entry != NULL, "jmp most probably wrong");
+#ifdef ASSERT
+    intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
+    intptr_t delta = short_branch_delta();
+    if (delta != 0) {
+      dist += (dist < 0 ? (-delta) :delta);
+    }
+    assert(is8bit(dist), "Dispacement too large for a short jmp");
+#endif
     intptr_t offs = entry - _code_pos;
     emit_byte(0xEB);
     emit_byte((offs - short_size) & 0xFF);
@@ -1509,49 +1609,16 @@
 
 void Assembler::movapd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int dstenc = dst->encoding();
-  int srcenc = src->encoding();
-  emit_byte(0x66);
-  if (dstenc < 8) {
-    if (srcenc >= 8) {
-      prefix(REX_B);
-      srcenc -= 8;
-    }
-  } else {
-    if (srcenc < 8) {
-      prefix(REX_R);
-    } else {
-      prefix(REX_RB);
-      srcenc -= 8;
-    }
-    dstenc -= 8;
-  }
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
   emit_byte(0x28);
-  emit_byte(0xC0 | dstenc << 3 | srcenc);
+  emit_byte(0xC0 | encode);
 }
 
 void Assembler::movaps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int dstenc = dst->encoding();
-  int srcenc = src->encoding();
-  if (dstenc < 8) {
-    if (srcenc >= 8) {
-      prefix(REX_B);
-      srcenc -= 8;
-    }
-  } else {
-    if (srcenc < 8) {
-      prefix(REX_R);
-    } else {
-      prefix(REX_RB);
-      srcenc -= 8;
-    }
-    dstenc -= 8;
-  }
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
   emit_byte(0x28);
-  emit_byte(0xC0 | dstenc << 3 | srcenc);
+  emit_byte(0xC0 | encode);
 }
 
 void Assembler::movb(Register dst, Address src) {
@@ -1582,19 +1649,15 @@
 
 void Assembler::movdl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
   emit_byte(0x6E);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::movdl(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
   // swap src/dst to get correct prefix
-  int encode = prefix_and_encode(src->encoding(), dst->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66);
   emit_byte(0x7E);
   emit_byte(0xC0 | encode);
 }
@@ -1602,68 +1665,37 @@
 void Assembler::movdl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_66);
   emit_byte(0x6E);
   emit_operand(dst, src);
 }
 
-
-void Assembler::movdqa(XMMRegister dst, Address src) {
+void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
+  emit_byte(0x6F);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::movdqu(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_F3);
   emit_byte(0x6F);
   emit_operand(dst, src);
 }
 
-void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
+void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
   emit_byte(0x6F);
   emit_byte(0xC0 | encode);
 }
 
-void Assembler::movdqa(Address dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(dst, src);
-  emit_byte(0x0F);
-  emit_byte(0x7F);
-  emit_operand(src, dst);
-}
-
-void Assembler::movdqu(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
-  emit_byte(0x6F);
-  emit_operand(dst, src);
-}
-
-void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF3);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
-  emit_byte(0x6F);
-  emit_byte(0xC0 | encode);
-}
-
 void Assembler::movdqu(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(dst, src);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F3);
   emit_byte(0x7F);
   emit_operand(src, dst);
 }
@@ -1710,9 +1742,7 @@
 void Assembler::movlpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
   emit_byte(0x12);
   emit_operand(dst, src);
 }
@@ -1740,9 +1770,7 @@
 void Assembler::movq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F3);
   emit_byte(0x7E);
   emit_operand(dst, src);
 }
@@ -1750,9 +1778,7 @@
 void Assembler::movq(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(dst, src);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_66);
   emit_byte(0xD6);
   emit_operand(src, dst);
 }
@@ -1775,9 +1801,7 @@
 
 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x10);
   emit_byte(0xC0 | encode);
 }
@@ -1785,9 +1809,7 @@
 void Assembler::movsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F2);
   emit_byte(0x10);
   emit_operand(dst, src);
 }
@@ -1795,18 +1817,14 @@
 void Assembler::movsd(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(dst, src);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F2);
   emit_byte(0x11);
   emit_operand(src, dst);
 }
 
 void Assembler::movss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x10);
   emit_byte(0xC0 | encode);
 }
@@ -1814,9 +1832,7 @@
 void Assembler::movss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F3);
   emit_byte(0x10);
   emit_operand(dst, src);
 }
@@ -1824,9 +1840,7 @@
 void Assembler::movss(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(dst, src);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F3);
   emit_byte(0x11);
   emit_operand(src, dst);
 }
@@ -1919,18 +1933,14 @@
 void Assembler::mulsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x59);
   emit_operand(dst, src);
 }
 
 void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x59);
   emit_byte(0xC0 | encode);
 }
@@ -1938,18 +1948,14 @@
 void Assembler::mulss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x59);
   emit_operand(dst, src);
 }
 
 void Assembler::mulss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x59);
   emit_byte(0xC0 | encode);
 }
@@ -2237,14 +2243,26 @@
   emit_arith(0x0B, 0xC0, dst, src);
 }
 
+void Assembler::packuswb(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x67);
+  emit_operand(dst, src);
+}
+
+void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x67);
+  emit_byte(0xC0 | encode);
+}
+
 void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
-
-  InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
-  emit_byte(0x3A);
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
   emit_byte(0x61);
   emit_operand(dst, src);
   emit_byte(imm8);
@@ -2252,16 +2270,27 @@
 
 void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
-
-  emit_byte(0x66);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
-  emit_byte(0x3A);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
   emit_byte(0x61);
   emit_byte(0xC0 | encode);
   emit_byte(imm8);
 }
 
+void Assembler::pmovzxbw(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_sse4_1(), "");
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0x30);
+  emit_operand(dst, src);
+}
+
+void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_sse4_1(), "");
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0x30);
+  emit_byte(0xC0 | encode);
+}
+
 // generic
 void Assembler::pop(Register dst) {
   int encode = prefix_and_encode(dst->encoding());
@@ -2360,22 +2389,24 @@
 
 void Assembler::por(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-
-  emit_byte(0x66);
-  int  encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
-
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
   emit_byte(0xEB);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::por(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0xEB);
+  emit_operand(dst, src);
+}
+
 void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-
-  emit_byte(0x66);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
   emit_byte(0x70);
   emit_byte(0xC0 | encode);
   emit_byte(mode & 0xFF);
@@ -2385,11 +2416,9 @@
 void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-
-  InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66);
   emit_byte(0x70);
   emit_operand(dst, src);
   emit_byte(mode & 0xFF);
@@ -2398,10 +2427,7 @@
 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
   emit_byte(0x70);
   emit_byte(0xC0 | encode);
   emit_byte(mode & 0xFF);
@@ -2410,11 +2436,9 @@
 void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-
-  InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst); // QQ new
-  emit_byte(0x0F);
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_F2);
   emit_byte(0x70);
   emit_operand(dst, src);
   emit_byte(mode & 0xFF);
@@ -2425,11 +2449,8 @@
   // HMM Table D-1 says sse2 or mmx.
   // Do not confuse it with psrldq SSE2 instruction which
   // shifts 128 bit value in xmm register by number of bytes.
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-
-  int encode = prefixq_and_encode(xmm2->encoding(), dst->encoding());
-  emit_byte(0x66);
-  emit_byte(0x0F);
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
   emit_byte(0x73);
   emit_byte(0xC0 | encode);
   emit_byte(shift);
@@ -2438,10 +2459,7 @@
 void Assembler::psrldq(XMMRegister dst, int shift) {
   // Shift 128 bit value in xmm register by number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-
-  int encode = prefixq_and_encode(xmm3->encoding(), dst->encoding());
-  emit_byte(0x66);
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66);
   emit_byte(0x73);
   emit_byte(0xC0 | encode);
   emit_byte(shift);
@@ -2449,36 +2467,52 @@
 
 void Assembler::ptest(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
-
-  InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
-  emit_byte(0x38);
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
   emit_byte(0x17);
   emit_operand(dst, src);
 }
 
 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-
-  emit_byte(0x66);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
-  emit_byte(0x38);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
   emit_byte(0x17);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::punpcklbw(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x60);
+  emit_operand(dst, src);
+}
+
 void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
   emit_byte(0x60);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::punpckldq(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x62);
+  emit_operand(dst, src);
+}
+
+void Assembler::punpckldq(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x62);
+  emit_byte(0xC0 | encode);
+}
+
 void Assembler::push(int32_t imm32) {
   // in 64bits we push 64bits onto the stack but only
   // take a 32bit immediate
@@ -2508,20 +2542,16 @@
 
 void Assembler::pxor(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
   emit_byte(0xEF);
   emit_operand(dst, src);
 }
 
 void Assembler::pxor(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  emit_byte(0x66);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
   emit_byte(0xEF);
   emit_byte(0xC0 | encode);
 }
@@ -2683,12 +2713,8 @@
 }
 
 void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
-  // HMM Table D-1 says sse2
-  // NOT_LP64(assert(VM_Version::supports_sse(), ""));
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x51);
   emit_byte(0xC0 | encode);
 }
@@ -2696,30 +2722,22 @@
 void Assembler::sqrtsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x51);
   emit_operand(dst, src);
 }
 
 void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
-  // HMM Table D-1 says sse2
-  // NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x51);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::sqrtss(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x51);
   emit_operand(dst, src);
 }
@@ -2765,9 +2783,7 @@
 
 void Assembler::subsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x5C);
   emit_byte(0xC0 | encode);
 }
@@ -2775,18 +2791,14 @@
 void Assembler::subsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x5C);
   emit_operand(dst, src);
 }
 
 void Assembler::subss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x5C);
   emit_byte(0xC0 | encode);
 }
@@ -2794,9 +2806,7 @@
 void Assembler::subss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x5C);
   emit_operand(dst, src);
 }
@@ -2836,30 +2846,30 @@
 
 void Assembler::ucomisd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  ucomiss(dst, src);
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66);
+  emit_byte(0x2E);
+  emit_operand(dst, src);
 }
 
 void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  ucomiss(dst, src);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
+  emit_byte(0x2E);
+  emit_byte(0xC0 | encode);
 }
 
 void Assembler::ucomiss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-
-  InstructionMark im(this);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_NONE);
   emit_byte(0x2E);
   emit_operand(dst, src);
 }
 
 void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
   emit_byte(0x2E);
   emit_byte(0xC0 | encode);
 }
@@ -2905,16 +2915,15 @@
 
 void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  xorps(dst, src);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x57);
+  emit_byte(0xC0 | encode);
 }
 
 void Assembler::xorpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
   emit_byte(0x57);
   emit_operand(dst, src);
 }
@@ -2922,8 +2931,7 @@
 
 void Assembler::xorps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
   emit_byte(0x57);
   emit_byte(0xC0 | encode);
 }
@@ -2931,12 +2939,166 @@
 void Assembler::xorps(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
   emit_byte(0x57);
   emit_operand(dst, src);
 }
 
+// AVX 3-operands non destructive source instructions (encoded with VEX prefix)
+
+void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x58);
+  emit_operand(dst, src);
+}
+
+void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x58);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x58);
+  emit_operand(dst, src);
+}
+
+void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x58);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector
+  emit_byte(0x54);
+  emit_operand(dst, src);
+}
+
+void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector
+  emit_byte(0x54);
+  emit_operand(dst, src);
+}
+
+void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x5E);
+  emit_operand(dst, src);
+}
+
+void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x5E);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x5E);
+  emit_operand(dst, src);
+}
+
+void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x5E);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x59);
+  emit_operand(dst, src);
+}
+
+void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x59);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x59);
+  emit_operand(dst, src);
+}
+
+void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x59);
+  emit_byte(0xC0 | encode);
+}
+
+
+void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x5C);
+  emit_operand(dst, src);
+}
+
+void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x5C);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x5C);
+  emit_operand(dst, src);
+}
+
+void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x5C);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector
+  emit_byte(0x57);
+  emit_operand(dst, src);
+}
+
+void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector
+  emit_byte(0x57);
+  emit_operand(dst, src);
+}
+
+
 #ifndef _LP64
 // 32bit only pieces of the assembler
 
@@ -3394,12 +3556,114 @@
   emit_byte(0xF1);
 }
 
+// SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
+static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
+// SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
+static int simd_opc[4] = { 0,    0, 0x38, 0x3A };
+
+// Generate SSE legacy REX prefix and SIMD opcode based on VEX encoding.
+void Assembler::rex_prefix(Address adr, XMMRegister xreg, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
+  if (pre > 0) {
+    emit_byte(simd_pre[pre]);
+  }
+  if (rex_w) {
+    prefixq(adr, xreg);
+  } else {
+    prefix(adr, xreg);
+  }
+  if (opc > 0) {
+    emit_byte(0x0F);
+    int opc2 = simd_opc[opc];
+    if (opc2 > 0) {
+      emit_byte(opc2);
+    }
+  }
+}
+
+int Assembler::rex_prefix_and_encode(int dst_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
+  if (pre > 0) {
+    emit_byte(simd_pre[pre]);
+  }
+  int encode = (rex_w) ? prefixq_and_encode(dst_enc, src_enc) :
+                          prefix_and_encode(dst_enc, src_enc);
+  if (opc > 0) {
+    emit_byte(0x0F);
+    int opc2 = simd_opc[opc];
+    if (opc2 > 0) {
+      emit_byte(opc2);
+    }
+  }
+  return encode;
+}
+
+
+void Assembler::vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, bool vector256) {
+  if (vex_b || vex_x || vex_w || (opc == VEX_OPCODE_0F_38) || (opc == VEX_OPCODE_0F_3A)) {
+    prefix(VEX_3bytes);
+
+    int byte1 = (vex_r ? VEX_R : 0) | (vex_x ? VEX_X : 0) | (vex_b ? VEX_B : 0);
+    byte1 = (~byte1) & 0xE0;
+    byte1 |= opc;
+    a_byte(byte1);
+
+    int byte2 = ((~nds_enc) & 0xf) << 3;
+    byte2 |= (vex_w ? VEX_W : 0) | (vector256 ? 4 : 0) | pre;
+    emit_byte(byte2);
+  } else {
+    prefix(VEX_2bytes);
+
+    int byte1 = vex_r ? VEX_R : 0;
+    byte1 = (~byte1) & 0x80;
+    byte1 |= ((~nds_enc) & 0xf) << 3;
+    byte1 |= (vector256 ? 4 : 0) | pre;
+    emit_byte(byte1);
+  }
+}
+
+void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256){
+  bool vex_r = (xreg_enc >= 8);
+  bool vex_b = adr.base_needs_rex();
+  bool vex_x = adr.index_needs_rex();
+  vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
+}
+
+int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256) {
+  bool vex_r = (dst_enc >= 8);
+  bool vex_b = (src_enc >= 8);
+  bool vex_x = false;
+  vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
+  return (((dst_enc & 7) << 3) | (src_enc & 7));
+}
+
+
+void Assembler::simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
+  if (UseAVX > 0) {
+    int xreg_enc = xreg->encoding();
+    int  nds_enc = nds->is_valid() ? nds->encoding() : 0;
+    vex_prefix(adr, nds_enc, xreg_enc, pre, opc, rex_w, vector256);
+  } else {
+    assert((nds == xreg) || (nds == xnoreg), "wrong sse encoding");
+    rex_prefix(adr, xreg, pre, opc, rex_w);
+  }
+}
+
+int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
+  int dst_enc = dst->encoding();
+  int src_enc = src->encoding();
+  if (UseAVX > 0) {
+    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector256);
+  } else {
+    assert((nds == dst) || (nds == src) || (nds == xnoreg), "wrong sse encoding");
+    return rex_prefix_and_encode(dst_enc, src_enc, pre, opc, rex_w);
+  }
+}
 
 #ifndef _LP64
 
 void Assembler::incl(Register dst) {
   // Don't use it directly. Use MacroAssembler::incrementl() instead.
- emit_byte(0x40 | dst->encoding());
+  emit_byte(0x40 | dst->encoding());
 }
 
 void Assembler::lea(Register dst, Address src) {
@@ -3756,6 +4020,38 @@
   }
 }
 
+void Assembler::prefixq(Address adr, XMMRegister src) {
+  if (src->encoding() < 8) {
+    if (adr.base_needs_rex()) {
+      if (adr.index_needs_rex()) {
+        prefix(REX_WXB);
+      } else {
+        prefix(REX_WB);
+      }
+    } else {
+      if (adr.index_needs_rex()) {
+        prefix(REX_WX);
+      } else {
+        prefix(REX_W);
+      }
+    }
+  } else {
+    if (adr.base_needs_rex()) {
+      if (adr.index_needs_rex()) {
+        prefix(REX_WRXB);
+      } else {
+        prefix(REX_WRB);
+      }
+    } else {
+      if (adr.index_needs_rex()) {
+        prefix(REX_WRX);
+      } else {
+        prefix(REX_WR);
+      }
+    }
+  }
+}
+
 void Assembler::adcq(Register dst, int32_t imm32) {
   (void) prefixq_and_encode(dst->encoding());
   emit_arith(0x81, 0xD0, dst, imm32);
@@ -3918,36 +4214,44 @@
 
 void Assembler::cvtsi2sdq(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x2A);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::cvtsi2sdq(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  simd_prefix_q(dst, dst, src, VEX_SIMD_F2);
+  emit_byte(0x2A);
+  emit_operand(dst, src);
+}
+
 void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x2A);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  InstructionMark im(this);
+  simd_prefix_q(dst, dst, src, VEX_SIMD_F3);
+  emit_byte(0x2A);
+  emit_operand(dst, src);
+}
+
 void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2);
   emit_byte(0x2C);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::cvttss2siq(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3);
   emit_byte(0x2C);
   emit_byte(0xC0 | encode);
 }
@@ -4107,21 +4411,17 @@
 
 void Assembler::movdq(XMMRegister dst, Register src) {
   // table D-1 says MMX/SSE2
-  NOT_LP64(assert(VM_Version::supports_sse2() || VM_Version::supports_mmx(), ""));
-  emit_byte(0x66);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66);
   emit_byte(0x6E);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::movdq(Register dst, XMMRegister src) {
   // table D-1 says MMX/SSE2
-  NOT_LP64(assert(VM_Version::supports_sse2() || VM_Version::supports_mmx(), ""));
-  emit_byte(0x66);
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // swap src/dst to get correct prefix
-  int encode = prefixq_and_encode(src->encoding(), dst->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66);
   emit_byte(0x7E);
   emit_byte(0xC0 | encode);
 }
@@ -4632,7 +4932,7 @@
     null_check_offset = offset();
   }
   movl(tmp_reg, klass_addr);
-  xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+  xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset()));
   andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
   if (need_tmp_reg) {
     pop(tmp_reg);
@@ -4719,7 +5019,7 @@
   }
   get_thread(tmp_reg);
   movl(swap_reg, klass_addr);
-  orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+  orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset()));
   movl(swap_reg, saved_mark_addr);
   if (os::is_MP()) {
     lock();
@@ -4757,7 +5057,7 @@
     push(tmp_reg);
   }
   movl(tmp_reg, klass_addr);
-  movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+  movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
   if (os::is_MP()) {
     lock();
   }
@@ -5680,6 +5980,24 @@
   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 }
 
+void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::addsd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::addsd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    addss(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    addss(dst, Address(rscratch1, 0));
+  }
+}
+
 void MacroAssembler::align(int modulus) {
   if (offset() % modulus != 0) {
     nop(modulus - (offset() % modulus));
@@ -5687,11 +6005,24 @@
 }
 
 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
+  // Used in sign-masking with aligned address.
+  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
   if (reachable(src)) {
-    andpd(dst, as_Address(src));
+    Assembler::andpd(dst, as_Address(src));
   } else {
     lea(rscratch1, src);
-    andpd(dst, Address(rscratch1, 0));
+    Assembler::andpd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
+  // Used in sign-masking with aligned address.
+  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
+  if (reachable(src)) {
+    Assembler::andps(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::andps(dst, Address(rscratch1, 0));
   }
 }
 
@@ -6270,19 +6601,19 @@
 
 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
   if (reachable(src)) {
-    comisd(dst, as_Address(src));
+    Assembler::comisd(dst, as_Address(src));
   } else {
     lea(rscratch1, src);
-    comisd(dst, Address(rscratch1, 0));
+    Assembler::comisd(dst, Address(rscratch1, 0));
   }
 }
 
 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
   if (reachable(src)) {
-    comiss(dst, as_Address(src));
+    Assembler::comiss(dst, as_Address(src));
   } else {
     lea(rscratch1, src);
-    comiss(dst, Address(rscratch1, 0));
+    Assembler::comiss(dst, Address(rscratch1, 0));
   }
 }
 
@@ -6366,6 +6697,24 @@
   sarl(reg, shift_value);
 }
 
+void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::divsd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::divsd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::divss(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::divss(dst, Address(rscratch1, 0));
+  }
+}
+
 // !defined(COMPILER2) is because of stupid core builds
 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
 void MacroAssembler::empty_FPU_stack() {
@@ -6805,12 +7154,39 @@
   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
 }
 
+void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::movsd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::movsd(dst, Address(rscratch1, 0));
+  }
+}
+
 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
   if (reachable(src)) {
-    movss(dst, as_Address(src));
+    Assembler::movss(dst, as_Address(src));
   } else {
     lea(rscratch1, src);
-    movss(dst, Address(rscratch1, 0));
+    Assembler::movss(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::mulsd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::mulsd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::mulss(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::mulss(dst, Address(rscratch1, 0));
   }
 }
 
@@ -6992,6 +7368,193 @@
   testl(dst, as_Address(src));
 }
 
+void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::sqrtsd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::sqrtsd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::sqrtss(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::sqrtss(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::subsd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::subsd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::subss(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::subss(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::ucomisd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::ucomisd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::ucomiss(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::ucomiss(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
+  // Used in sign-bit flipping with aligned address.
+  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
+  if (reachable(src)) {
+    Assembler::xorpd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::xorpd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
+  // Used in sign-bit flipping with aligned address.
+  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
+  if (reachable(src)) {
+    Assembler::xorps(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::xorps(dst, Address(rscratch1, 0));
+  }
+}
+
+// AVX 3-operands instructions
+
+void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vaddsd(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vaddsd(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vaddss(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vaddss(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vandpd(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vandpd(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vandps(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vandps(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vdivsd(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vdivsd(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vdivss(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vdivss(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vmulsd(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vmulsd(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vmulss(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vmulss(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vsubsd(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vsubsd(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vsubss(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vsubss(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vxorpd(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vxorpd(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vxorps(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vxorps(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+
 //////////////////////////////////////////////////////////////////////////////////
 #ifndef SERIALGC
 
@@ -7430,6 +7993,16 @@
                                           Register var_size_in_bytes,
                                           int con_size_in_bytes,
                                           Register t1) {
+  if (!thread->is_valid()) {
+#ifdef _LP64
+    thread = r15_thread;
+#else
+    assert(t1->is_valid(), "need temp reg");
+    thread = t1;
+    get_thread(thread);
+#endif
+  }
+
 #ifdef _LP64
   if (var_size_in_bytes->is_valid()) {
     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
@@ -7437,12 +8010,6 @@
     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
   }
 #else
-  if (!thread->is_valid()) {
-    assert(t1->is_valid(), "need temp reg");
-    thread = t1;
-    get_thread(thread);
-  }
-
   if (var_size_in_bytes->is_valid()) {
     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
   } else {
@@ -7685,10 +8252,8 @@
   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
   assert(label_nulls <= 1, "at most one NULL in the batch");
 
-  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_super_cache_offset_in_bytes());
-  int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                    Klass::super_check_offset_offset_in_bytes());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+  int sco_offset = in_bytes(Klass::super_check_offset_offset());
   Address super_check_offset_addr(super_klass, sco_offset);
 
   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
@@ -7786,10 +8351,8 @@
   assert(label_nulls <= 1, "at most one NULL in the batch");
 
   // a couple of useful fields in sub_klass:
-  int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_supers_offset_in_bytes());
-  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_super_cache_offset_in_bytes());
+  int ss_offset = in_bytes(Klass::secondary_supers_offset());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
   Address secondary_supers_addr(sub_klass, ss_offset);
   Address super_cache_addr(     sub_klass, sc_offset);
 
@@ -7876,32 +8439,6 @@
 }
 
 
-void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
-  ucomisd(dst, as_Address(src));
-}
-
-void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
-  ucomiss(dst, as_Address(src));
-}
-
-void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
-  if (reachable(src)) {
-    xorpd(dst, as_Address(src));
-  } else {
-    lea(rscratch1, src);
-    xorpd(dst, Address(rscratch1, 0));
-  }
-}
-
-void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
-  if (reachable(src)) {
-    xorps(dst, as_Address(src));
-  } else {
-    lea(rscratch1, src);
-    xorps(dst, Address(rscratch1, 0));
-  }
-}
-
 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
   if (VM_Version::supports_cmov()) {
     cmovl(cc, dst, src);
@@ -8487,20 +9024,20 @@
     if (Universe::narrow_oop_shift() != 0) {
       assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
       if (LogMinObjAlignmentInBytes == Address::times_8) {
-        movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+        movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset()));
       } else {
         // OK to use shift since we don't need to preserve flags.
         shlq(dst, LogMinObjAlignmentInBytes);
-        movq(dst, Address(r12_heapbase, dst, Address::times_1, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+        movq(dst, Address(r12_heapbase, dst, Address::times_1, Klass::prototype_header_offset()));
       }
     } else {
-      movq(dst, Address(dst, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+      movq(dst, Address(dst, Klass::prototype_header_offset()));
     }
   } else
 #endif
   {
     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
-    movptr(dst, Address(dst, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+    movptr(dst, Address(dst, Klass::prototype_header_offset()));
   }
 }
 
@@ -8761,6 +9298,7 @@
                                       Register cnt1, Register cnt2,
                                       int int_cnt2,  Register result,
                                       XMMRegister vec, Register tmp) {
+  ShortBranchVerifier sbv(this);
   assert(UseSSE42Intrinsics, "SSE4.2 is required");
 
   // This method uses pcmpestri inxtruction with bound registers
@@ -8890,9 +9428,9 @@
       pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
     }
     // Need to reload strings pointers if not matched whole vector
-    jccb(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
+    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
     addptr(cnt2, 8);
-    jccb(Assembler::negative, SCAN_SUBSTR);
+    jcc(Assembler::negative, SCAN_SUBSTR);
     // Fall through if found full substring
 
   } // (int_cnt2 > 8)
@@ -8911,6 +9449,7 @@
                                     Register cnt1, Register cnt2,
                                     int int_cnt2,  Register result,
                                     XMMRegister vec, Register tmp) {
+  ShortBranchVerifier sbv(this);
   assert(UseSSE42Intrinsics, "SSE4.2 is required");
   //
   // int_cnt2 is length of small (< 8 chars) constant substring
@@ -9172,6 +9711,7 @@
 void MacroAssembler::string_compare(Register str1, Register str2,
                                     Register cnt1, Register cnt2, Register result,
                                     XMMRegister vec1) {
+  ShortBranchVerifier sbv(this);
   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
 
   // Compute the minimum of the string lengths and the
@@ -9308,6 +9848,7 @@
 void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                         Register limit, Register result, Register chr,
                                         XMMRegister vec1, XMMRegister vec2) {
+  ShortBranchVerifier sbv(this);
   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;
 
   int length_offset  = arrayOopDesc::length_offset_in_bytes();
@@ -9427,6 +9968,7 @@
 void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                    Register to, Register value, Register count,
                                    Register rtmp, XMMRegister xtmp) {
+  ShortBranchVerifier sbv(this);
   assert_different_registers(to, value, count, rtmp);
   Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
   Label L_fill_2_bytes, L_fill_4_bytes;
--- a/src/cpu/x86/vm/assembler_x86.hpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Sat Jan 14 00:47:46 2012 -0800
@@ -503,7 +503,31 @@
     REX_WR     = 0x4C,
     REX_WRB    = 0x4D,
     REX_WRX    = 0x4E,
-    REX_WRXB   = 0x4F
+    REX_WRXB   = 0x4F,
+
+    VEX_3bytes = 0xC4,
+    VEX_2bytes = 0xC5
+  };
+
+  enum VexPrefix {
+    VEX_B = 0x20,
+    VEX_X = 0x40,
+    VEX_R = 0x80,
+    VEX_W = 0x80
+  };
+
+  enum VexSimdPrefix {
+    VEX_SIMD_NONE = 0x0,
+    VEX_SIMD_66   = 0x1,
+    VEX_SIMD_F3   = 0x2,
+    VEX_SIMD_F2   = 0x3
+  };
+
+  enum VexOpcode {
+    VEX_OPCODE_NONE  = 0x0,
+    VEX_OPCODE_0F    = 0x1,
+    VEX_OPCODE_0F_38 = 0x2,
+    VEX_OPCODE_0F_3A = 0x3
   };
 
   enum WhichOperand {
@@ -546,12 +570,99 @@
   void prefixq(Address adr);
 
   void prefix(Address adr, Register reg,  bool byteinst = false);
+  void prefix(Address adr, XMMRegister reg);
   void prefixq(Address adr, Register reg);
-
-  void prefix(Address adr, XMMRegister reg);
+  void prefixq(Address adr, XMMRegister reg);
 
   void prefetch_prefix(Address src);
 
+  void rex_prefix(Address adr, XMMRegister xreg,
+                  VexSimdPrefix pre, VexOpcode opc, bool rex_w);
+  int  rex_prefix_and_encode(int dst_enc, int src_enc,
+                             VexSimdPrefix pre, VexOpcode opc, bool rex_w);
+
+  void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w,
+                  int nds_enc, VexSimdPrefix pre, VexOpcode opc,
+                  bool vector256);
+
+  void vex_prefix(Address adr, int nds_enc, int xreg_enc,
+                  VexSimdPrefix pre, VexOpcode opc,
+                  bool vex_w, bool vector256);
+
+  void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
+                  VexSimdPrefix pre, bool vector256 = false) {
+     vex_prefix(src, nds->encoding(), dst->encoding(),
+                pre, VEX_OPCODE_0F, false, vector256);
+  }
+
+  int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
+                             VexSimdPrefix pre, VexOpcode opc,
+                             bool vex_w, bool vector256);
+
+  int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
+                             VexSimdPrefix pre, bool vector256 = false) {
+     return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
+                                  pre, VEX_OPCODE_0F, false, vector256);
+  }
+
+  void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
+                   VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
+                   bool rex_w = false, bool vector256 = false);
+
+  void simd_prefix(XMMRegister dst, Address src,
+                   VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
+    simd_prefix(dst, xnoreg, src, pre, opc);
+  }
+  void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) {
+    simd_prefix(src, dst, pre);
+  }
+  void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
+                     VexSimdPrefix pre) {
+    bool rex_w = true;
+    simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w);
+  }
+
+
+  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
+                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
+                             bool rex_w = false, bool vector256 = false);
+
+  int simd_prefix_and_encode(XMMRegister dst, XMMRegister src,
+                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
+    return simd_prefix_and_encode(dst, xnoreg, src, pre, opc);
+  }
+
+  // Move/convert 32-bit integer value.
+  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
+                             VexSimdPrefix pre) {
+    // It is OK to cast from Register to XMMRegister to pass argument here
+    // since only encoding is used in simd_prefix_and_encode() and number of
+    // Gen and Xmm registers are the same.
+    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre);
+  }
+  int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre) {
+    return simd_prefix_and_encode(dst, xnoreg, src, pre);
+  }
+  int simd_prefix_and_encode(Register dst, XMMRegister src,
+                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
+    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc);
+  }
+
+  // Move/convert 64-bit integer value.
+  int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src,
+                               VexSimdPrefix pre) {
+    bool rex_w = true;
+    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, VEX_OPCODE_0F, rex_w);
+  }
+  int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre) {
+    return simd_prefix_and_encode_q(dst, xnoreg, src, pre);
+  }
+  int simd_prefix_and_encode_q(Register dst, XMMRegister src,
+                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
+    bool rex_w = true;
+    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc, rex_w);
+  }
+
   // Helper functions for groups of instructions
   void emit_arith_b(int op1, int op2, Register dst, int imm8);
 
@@ -764,6 +875,7 @@
   void addss(XMMRegister dst, Address src);
   void addss(XMMRegister dst, XMMRegister src);
 
+  void andl(Address  dst, int32_t imm32);
   void andl(Register dst, int32_t imm32);
   void andl(Register dst, Address src);
   void andl(Register dst, Register src);
@@ -774,9 +886,11 @@
   void andq(Register dst, Register src);
 
   // Bitwise Logical AND of Packed Double-Precision Floating-Point Values
-  void andpd(XMMRegister dst, Address src);
   void andpd(XMMRegister dst, XMMRegister src);
 
+  // Bitwise Logical AND of Packed Single-Precision Floating-Point Values
+  void andps(XMMRegister dst, XMMRegister src);
+
   void bsfl(Register dst, Register src);
   void bsrl(Register dst, Register src);
 
@@ -837,9 +951,11 @@
 
   // Ordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
   void comisd(XMMRegister dst, Address src);
+  void comisd(XMMRegister dst, XMMRegister src);
 
   // Ordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
   void comiss(XMMRegister dst, Address src);
+  void comiss(XMMRegister dst, XMMRegister src);
 
   // Identify processor type and features
   void cpuid() {
@@ -849,14 +965,19 @@
 
   // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
   void cvtsd2ss(XMMRegister dst, XMMRegister src);
+  void cvtsd2ss(XMMRegister dst, Address src);
 
   // Convert Doubleword Integer to Scalar Double-Precision Floating-Point Value
   void cvtsi2sdl(XMMRegister dst, Register src);
+  void cvtsi2sdl(XMMRegister dst, Address src);
   void cvtsi2sdq(XMMRegister dst, Register src);
+  void cvtsi2sdq(XMMRegister dst, Address src);
 
   // Convert Doubleword Integer to Scalar Single-Precision Floating-Point Value
   void cvtsi2ssl(XMMRegister dst, Register src);
+  void cvtsi2ssl(XMMRegister dst, Address src);
   void cvtsi2ssq(XMMRegister dst, Register src);
+  void cvtsi2ssq(XMMRegister dst, Address src);
 
   // Convert Packed Signed Doubleword Integers to Packed Double-Precision Floating-Point Value
   void cvtdq2pd(XMMRegister dst, XMMRegister src);
@@ -866,6 +987,7 @@
 
   // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value
   void cvtss2sd(XMMRegister dst, XMMRegister src);
+  void cvtss2sd(XMMRegister dst, Address src);
 
   // Convert with Truncation Scalar Double-Precision Floating-Point Value to Doubleword Integer
   void cvttsd2sil(Register dst, Address src);
@@ -1140,8 +1262,6 @@
   void movdq(Register dst, XMMRegister src);
 
   // Move Aligned Double Quadword
-  void movdqa(Address     dst, XMMRegister src);
-  void movdqa(XMMRegister dst, Address src);
   void movdqa(XMMRegister dst, XMMRegister src);
 
   // Move Unaligned Double Quadword
@@ -1261,10 +1381,18 @@
   void orq(Register dst, Address src);
   void orq(Register dst, Register src);
 
+  // Pack with unsigned saturation
+  void packuswb(XMMRegister dst, XMMRegister src);
+  void packuswb(XMMRegister dst, Address src);
+
   // SSE4.2 string instructions
   void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
   void pcmpestri(XMMRegister xmm1, Address src, int imm8);
 
+  // SSE4.1 packed move
+  void pmovzxbw(XMMRegister dst, XMMRegister src);
+  void pmovzxbw(XMMRegister dst, Address src);
+
 #ifndef _LP64 // no 32bit push/pop on amd64
   void popl(Address dst);
 #endif
@@ -1292,6 +1420,7 @@
 
   // POR - Bitwise logical OR
   void por(XMMRegister dst, XMMRegister src);
+  void por(XMMRegister dst, Address src);
 
   // Shuffle Packed Doublewords
   void pshufd(XMMRegister dst, XMMRegister src, int mode);
@@ -1313,6 +1442,11 @@
 
   // Interleave Low Bytes
   void punpcklbw(XMMRegister dst, XMMRegister src);
+  void punpcklbw(XMMRegister dst, Address src);
+
+  // Interleave Low Doublewords
+  void punpckldq(XMMRegister dst, XMMRegister src);
+  void punpckldq(XMMRegister dst, Address src);
 
 #ifndef _LP64 // no 32bit push/pop on amd64
   void pushl(Address src);
@@ -1429,6 +1563,13 @@
   void xchgq(Register reg, Address adr);
   void xchgq(Register dst, Register src);
 
+  // Get Value of Extended Control Register
+  void xgetbv() {
+    emit_byte(0x0F);
+    emit_byte(0x01);
+    emit_byte(0xD0);
+  }
+
   void xorl(Register dst, int32_t imm32);
   void xorl(Register dst, Address src);
   void xorl(Register dst, Register src);
@@ -1437,14 +1578,44 @@
   void xorq(Register dst, Register src);
 
   // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
+  void xorpd(XMMRegister dst, XMMRegister src);
+
+  // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
+  void xorps(XMMRegister dst, XMMRegister src);
+
+  void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0
+
+  // AVX 3-operands instructions (encoded with VEX prefix)
+  void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
+  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vaddss(XMMRegister dst, XMMRegister nds, Address src);
+  void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vandpd(XMMRegister dst, XMMRegister nds, Address src);
+  void vandps(XMMRegister dst, XMMRegister nds, Address src);
+  void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
+  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vdivss(XMMRegister dst, XMMRegister nds, Address src);
+  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
+  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vmulss(XMMRegister dst, XMMRegister nds, Address src);
+  void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vsubsd(XMMRegister dst, XMMRegister nds, Address src);
+  void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vsubss(XMMRegister dst, XMMRegister nds, Address src);
+  void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vxorpd(XMMRegister dst, XMMRegister nds, Address src);
+  void vxorps(XMMRegister dst, XMMRegister nds, Address src);
+
+
+ protected:
+  // Next instructions require address alignment 16 bytes SSE mode.
+  // They should be called only from corresponding MacroAssembler instructions.
+  void andpd(XMMRegister dst, Address src);
+  void andps(XMMRegister dst, Address src);
   void xorpd(XMMRegister dst, Address src);
-  void xorpd(XMMRegister dst, XMMRegister src);
-
-  // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
   void xorps(XMMRegister dst, Address src);
-  void xorps(XMMRegister dst, XMMRegister src);
-
-  void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0
+
 };
 
 
@@ -2175,9 +2346,15 @@
   void andpd(XMMRegister dst, Address src) { Assembler::andpd(dst, src); }
   void andpd(XMMRegister dst, AddressLiteral src);
 
+  void andps(XMMRegister dst, XMMRegister src) { Assembler::andps(dst, src); }
+  void andps(XMMRegister dst, Address src) { Assembler::andps(dst, src); }
+  void andps(XMMRegister dst, AddressLiteral src);
+
+  void comiss(XMMRegister dst, XMMRegister src) { Assembler::comiss(dst, src); }
   void comiss(XMMRegister dst, Address src) { Assembler::comiss(dst, src); }
   void comiss(XMMRegister dst, AddressLiteral src);
 
+  void comisd(XMMRegister dst, XMMRegister src) { Assembler::comisd(dst, src); }
   void comisd(XMMRegister dst, Address src) { Assembler::comisd(dst, src); }
   void comisd(XMMRegister dst, AddressLiteral src);
 
@@ -2211,62 +2388,62 @@
   void movss(XMMRegister dst, Address src)     { Assembler::movss(dst, src); }
   void movss(XMMRegister dst, AddressLiteral src);
 
-  void movlpd(XMMRegister dst, Address src)      {Assembler::movlpd(dst, src); }
+  void movlpd(XMMRegister dst, Address src)    {Assembler::movlpd(dst, src); }
   void movlpd(XMMRegister dst, AddressLiteral src);
 
 public:
 
   void addsd(XMMRegister dst, XMMRegister src)    { Assembler::addsd(dst, src); }
   void addsd(XMMRegister dst, Address src)        { Assembler::addsd(dst, src); }
-  void addsd(XMMRegister dst, AddressLiteral src) { Assembler::addsd(dst, as_Address(src)); }
+  void addsd(XMMRegister dst, AddressLiteral src);
 
   void addss(XMMRegister dst, XMMRegister src)    { Assembler::addss(dst, src); }
   void addss(XMMRegister dst, Address src)        { Assembler::addss(dst, src); }
-  void addss(XMMRegister dst, AddressLiteral src) { Assembler::addss(dst, as_Address(src)); }
+  void addss(XMMRegister dst, AddressLiteral src);
 
   void divsd(XMMRegister dst, XMMRegister src)    { Assembler::divsd(dst, src); }
   void divsd(XMMRegister dst, Address src)        { Assembler::divsd(dst, src); }
-  void divsd(XMMRegister dst, AddressLiteral src) { Assembler::divsd(dst, as_Address(src)); }
+  void divsd(XMMRegister dst, AddressLiteral src);
 
   void divss(XMMRegister dst, XMMRegister src)    { Assembler::divss(dst, src); }
   void divss(XMMRegister dst, Address src)        { Assembler::divss(dst, src); }
-  void divss(XMMRegister dst, AddressLiteral src) { Assembler::divss(dst, as_Address(src)); }
+  void divss(XMMRegister dst, AddressLiteral src);
 
   void movsd(XMMRegister dst, XMMRegister src) { Assembler::movsd(dst, src); }
   void movsd(Address dst, XMMRegister src)     { Assembler::movsd(dst, src); }
   void movsd(XMMRegister dst, Address src)     { Assembler::movsd(dst, src); }
-  void movsd(XMMRegister dst, AddressLiteral src) { Assembler::movsd(dst, as_Address(src)); }
+  void movsd(XMMRegister dst, AddressLiteral src);
 
   void mulsd(XMMRegister dst, XMMRegister src)    { Assembler::mulsd(dst, src); }
   void mulsd(XMMRegister dst, Address src)        { Assembler::mulsd(dst, src); }
-  void mulsd(XMMRegister dst, AddressLiteral src) { Assembler::mulsd(dst, as_Address(src)); }
+  void mulsd(XMMRegister dst, AddressLiteral src);
 
   void mulss(XMMRegister dst, XMMRegister src)    { Assembler::mulss(dst, src); }
   void mulss(XMMRegister dst, Address src)        { Assembler::mulss(dst, src); }
-  void mulss(XMMRegister dst, AddressLiteral src) { Assembler::mulss(dst, as_Address(src)); }
+  void mulss(XMMRegister dst, AddressLiteral src);
 
   void sqrtsd(XMMRegister dst, XMMRegister src)    { Assembler::sqrtsd(dst, src); }
   void sqrtsd(XMMRegister dst, Address src)        { Assembler::sqrtsd(dst, src); }
-  void sqrtsd(XMMRegister dst, AddressLiteral src) { Assembler::sqrtsd(dst, as_Address(src)); }
+  void sqrtsd(XMMRegister dst, AddressLiteral src);
 
   void sqrtss(XMMRegister dst, XMMRegister src)    { Assembler::sqrtss(dst, src); }
   void sqrtss(XMMRegister dst, Address src)        { Assembler::sqrtss(dst, src); }
-  void sqrtss(XMMRegister dst, AddressLiteral src) { Assembler::sqrtss(dst, as_Address(src)); }
+  void sqrtss(XMMRegister dst, AddressLiteral src);
 
   void subsd(XMMRegister dst, XMMRegister src)    { Assembler::subsd(dst, src); }
   void subsd(XMMRegister dst, Address src)        { Assembler::subsd(dst, src); }
-  void subsd(XMMRegister dst, AddressLiteral src) { Assembler::subsd(dst, as_Address(src)); }
+  void subsd(XMMRegister dst, AddressLiteral src);
 
   void subss(XMMRegister dst, XMMRegister src)    { Assembler::subss(dst, src); }
   void subss(XMMRegister dst, Address src)        { Assembler::subss(dst, src); }
-  void subss(XMMRegister dst, AddressLiteral src) { Assembler::subss(dst, as_Address(src)); }
+  void subss(XMMRegister dst, AddressLiteral src);
 
   void ucomiss(XMMRegister dst, XMMRegister src) { Assembler::ucomiss(dst, src); }
-  void ucomiss(XMMRegister dst, Address src) { Assembler::ucomiss(dst, src); }
+  void ucomiss(XMMRegister dst, Address src)     { Assembler::ucomiss(dst, src); }
   void ucomiss(XMMRegister dst, AddressLiteral src);
 
   void ucomisd(XMMRegister dst, XMMRegister src) { Assembler::ucomisd(dst, src); }
-  void ucomisd(XMMRegister dst, Address src) { Assembler::ucomisd(dst, src); }
+  void ucomisd(XMMRegister dst, Address src)     { Assembler::ucomisd(dst, src); }
   void ucomisd(XMMRegister dst, AddressLiteral src);
 
   // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
@@ -2279,6 +2456,53 @@
   void xorps(XMMRegister dst, Address src)     { Assembler::xorps(dst, src); }
   void xorps(XMMRegister dst, AddressLiteral src);
 
+  // AVX 3-operands instructions
+
+  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddsd(dst, nds, src); }
+  void vaddsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddsd(dst, nds, src); }
+  void vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddss(dst, nds, src); }
+  void vaddss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddss(dst, nds, src); }
+  void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vandpd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vandpd(dst, nds, src); }
+  void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vandps(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vandps(dst, nds, src); }
+  void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); }
+  void vdivsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivsd(dst, nds, src); }
+  void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivss(dst, nds, src); }
+  void vdivss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivss(dst, nds, src); }
+  void vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vmulsd(dst, nds, src); }
+  void vmulsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vmulsd(dst, nds, src); }
+  void vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vmulss(dst, nds, src); }
+  void vmulss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vmulss(dst, nds, src); }
+  void vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vsubsd(dst, nds, src); }
+  void vsubsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubsd(dst, nds, src); }
+  void vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vsubss(dst, nds, src); }
+  void vsubss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubss(dst, nds, src); }
+  void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vxorpd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorpd(dst, nds, src); }
+  void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vxorps(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorps(dst, nds, src); }
+  void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+
   // Data
 
   void cmov32( Condition cc, Register dst, Address  src);
--- a/src/cpu/x86/vm/assembler_x86.inline.hpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/assembler_x86.inline.hpp	Sat Jan 14 00:47:46 2012 -0800
@@ -86,6 +86,7 @@
 inline void Assembler::prefixq(Address adr, Register reg) {}
 
 inline void Assembler::prefix(Address adr, XMMRegister reg) {}
+inline void Assembler::prefixq(Address adr, XMMRegister reg) {}
 #else
 inline void Assembler::emit_long64(jlong x) {
   *(jlong*) _code_pos = x;
--- a/src/cpu/x86/vm/c1_CodeStubs_x86.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/c1_CodeStubs_x86.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -320,7 +320,7 @@
     // begin_initialized_entry_offset has to fit in a byte. Also, we know it's not null.
     __ load_heap_oop_not_null(tmp2, Address(_obj, java_lang_Class::klass_offset_in_bytes()));
     __ get_thread(tmp);
-    __ cmpptr(tmp, Address(tmp2, instanceKlass::init_thread_offset_in_bytes() + sizeof(klassOopDesc)));
+    __ cmpptr(tmp, Address(tmp2, instanceKlass::init_thread_offset()));
     __ pop(tmp2);
     __ pop(tmp);
     __ jcc(Assembler::notEqual, call_patch);
@@ -519,7 +519,7 @@
 
   __ load_klass(tmp_reg, src_reg);
 
-  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset_in_bytes() + sizeof(oopDesc));
+  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset());
   __ cmpl(ref_type_adr, REF_NONE);
   __ jcc(Assembler::equal, _continuation);
 
--- a/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -1557,8 +1557,8 @@
 
 void LIR_Assembler::emit_alloc_obj(LIR_OpAllocObj* op) {
   if (op->init_check()) {
-    __ cmpl(Address(op->klass()->as_register(),
-                    instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc)),
+    __ cmpb(Address(op->klass()->as_register(),
+                    instanceKlass::init_state_offset()),
             instanceKlass::fully_initialized);
     add_debug_info_for_null_check_here(op->stub()->info());
     __ jcc(Assembler::notEqual, *op->stub()->entry());
@@ -1730,7 +1730,7 @@
 #else
       __ cmpoop(Address(klass_RInfo, k->super_check_offset()), k->constant_encoding());
 #endif // _LP64
-      if (sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes() != k->super_check_offset()) {
+      if ((juint)in_bytes(Klass::secondary_super_cache_offset()) != k->super_check_offset()) {
         __ jcc(Assembler::notEqual, *failure_target);
         // successful cast, fall through to profile or jump
       } else {
@@ -1842,7 +1842,7 @@
     __ load_klass(klass_RInfo, value);
 
     // get instance klass (it's already uncompressed)
-    __ movptr(k_RInfo, Address(k_RInfo, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
+    __ movptr(k_RInfo, Address(k_RInfo, objArrayKlass::element_klass_offset()));
     // perform the fast part of the checking logic
     __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, success_target, failure_target, NULL);
     // call out-of-line instance of __ check_klass_subtype_slow_path(...):
@@ -3289,8 +3289,7 @@
           } else if (!(flags & LIR_OpArrayCopy::dst_objarray)) {
             __ load_klass(tmp, dst);
           }
-          int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-            Klass::layout_helper_offset_in_bytes();
+          int lh_offset = in_bytes(Klass::layout_helper_offset());
           Address klass_lh_addr(tmp, lh_offset);
           jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
           __ cmpl(klass_lh_addr, objArray_lh);
@@ -3307,9 +3306,9 @@
 
 #ifndef _LP64
         __ movptr(tmp, dst_klass_addr);
-        __ movptr(tmp, Address(tmp, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
+        __ movptr(tmp, Address(tmp, objArrayKlass::element_klass_offset()));
         __ push(tmp);
-        __ movl(tmp, Address(tmp, Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc)));
+        __ movl(tmp, Address(tmp, Klass::super_check_offset_offset()));
         __ push(tmp);
         __ push(length);
         __ lea(tmp, Address(dst, dst_pos, scale, arrayOopDesc::base_offset_in_bytes(basic_type)));
@@ -3333,15 +3332,15 @@
         // Allocate abi space for args but be sure to keep stack aligned
         __ subptr(rsp, 6*wordSize);
         __ load_klass(c_rarg3, dst);
-        __ movptr(c_rarg3, Address(c_rarg3, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
+        __ movptr(c_rarg3, Address(c_rarg3, objArrayKlass::element_klass_offset()));
         store_parameter(c_rarg3, 4);
-        __ movl(c_rarg3, Address(c_rarg3, Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc)));
+        __ movl(c_rarg3, Address(c_rarg3, Klass::super_check_offset_offset()));
         __ call(RuntimeAddress(copyfunc_addr));
         __ addptr(rsp, 6*wordSize);
 #else
         __ load_klass(c_rarg4, dst);
-        __ movptr(c_rarg4, Address(c_rarg4, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
-        __ movl(c_rarg3, Address(c_rarg4, Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc)));
+        __ movptr(c_rarg4, Address(c_rarg4, objArrayKlass::element_klass_offset()));
+        __ movl(c_rarg3, Address(c_rarg4, Klass::super_check_offset_offset()));
         __ call(RuntimeAddress(copyfunc_addr));
 #endif
 
--- a/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -150,7 +150,7 @@
   assert_different_registers(obj, klass, len);
   if (UseBiasedLocking && !len->is_valid()) {
     assert_different_registers(obj, klass, len, t1, t2);
-    movptr(t1, Address(klass, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+    movptr(t1, Address(klass, Klass::prototype_header_offset()));
     movptr(Address(obj, oopDesc::mark_offset_in_bytes()), t1);
   } else {
     // This assumes that all prototype bits fit in an int32_t
--- a/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -1011,7 +1011,7 @@
 
           if (id == fast_new_instance_init_check_id) {
             // make sure the klass is initialized
-            __ cmpl(Address(klass, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc)), instanceKlass::fully_initialized);
+            __ cmpb(Address(klass, instanceKlass::init_state_offset()), instanceKlass::fully_initialized);
             __ jcc(Assembler::notEqual, slow_path);
           }
 
@@ -1019,7 +1019,7 @@
           // assert object can be fast path allocated
           {
             Label ok, not_ok;
-            __ movl(obj_size, Address(klass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc)));
+            __ movl(obj_size, Address(klass, Klass::layout_helper_offset()));
             __ cmpl(obj_size, 0);  // make sure it's an instance (LH > 0)
             __ jcc(Assembler::lessEqual, not_ok);
             __ testl(obj_size, Klass::_lh_instance_slow_path_bit);
@@ -1040,7 +1040,7 @@
           __ bind(retry_tlab);
 
           // get the instance size (size is postive so movl is fine for 64bit)
-          __ movl(obj_size, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes()));
+          __ movl(obj_size, Address(klass, Klass::layout_helper_offset()));
 
           __ tlab_allocate(obj, obj_size, 0, t1, t2, slow_path);
 
@@ -1052,7 +1052,7 @@
 
           __ bind(try_eden);
           // get the instance size (size is postive so movl is fine for 64bit)
-          __ movl(obj_size, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes()));
+          __ movl(obj_size, Address(klass, Klass::layout_helper_offset()));
 
           __ eden_allocate(obj, obj_size, 0, t1, slow_path);
           __ incr_allocated_bytes(thread, obj_size, 0);
@@ -1119,7 +1119,7 @@
         {
           Label ok;
           Register t0 = obj;
-          __ movl(t0, Address(klass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc)));
+          __ movl(t0, Address(klass, Klass::layout_helper_offset()));
           __ sarl(t0, Klass::_lh_array_tag_shift);
           int tag = ((id == new_type_array_id)
                      ? Klass::_lh_array_tag_type_value
@@ -1153,7 +1153,7 @@
 
           // get the allocation size: round_up(hdr + length << (layout_helper & 0x1F))
           // since size is positive movl does right thing on 64bit
-          __ movl(t1, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes()));
+          __ movl(t1, Address(klass, Klass::layout_helper_offset()));
           // since size is postive movl does right thing on 64bit
           __ movl(arr_size, length);
           assert(t1 == rcx, "fixed register usage");
@@ -1167,7 +1167,7 @@
           __ tlab_allocate(obj, arr_size, 0, t1, t2, slow_path);  // preserves arr_size
 
           __ initialize_header(obj, klass, length, t1, t2);
-          __ movb(t1, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes() + (Klass::_lh_header_size_shift / BitsPerByte)));
+          __ movb(t1, Address(klass, in_bytes(Klass::layout_helper_offset()) + (Klass::_lh_header_size_shift / BitsPerByte)));
           assert(Klass::_lh_header_size_shift % BitsPerByte == 0, "bytewise");
           assert(Klass::_lh_header_size_mask <= 0xFF, "bytewise");
           __ andptr(t1, Klass::_lh_header_size_mask);
@@ -1180,7 +1180,7 @@
           __ bind(try_eden);
           // get the allocation size: round_up(hdr + length << (layout_helper & 0x1F))
           // since size is positive movl does right thing on 64bit
-          __ movl(t1, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes()));
+          __ movl(t1, Address(klass, Klass::layout_helper_offset()));
           // since size is postive movl does right thing on 64bit
           __ movl(arr_size, length);
           assert(t1 == rcx, "fixed register usage");
@@ -1195,7 +1195,7 @@
           __ incr_allocated_bytes(thread, arr_size, 0);
 
           __ initialize_header(obj, klass, length, t1, t2);
-          __ movb(t1, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes() + (Klass::_lh_header_size_shift / BitsPerByte)));
+          __ movb(t1, Address(klass, in_bytes(Klass::layout_helper_offset()) + (Klass::_lh_header_size_shift / BitsPerByte)));
           assert(Klass::_lh_header_size_shift % BitsPerByte == 0, "bytewise");
           assert(Klass::_lh_header_size_mask <= 0xFF, "bytewise");
           __ andptr(t1, Klass::_lh_header_size_mask);
@@ -1267,7 +1267,7 @@
         Label register_finalizer;
         Register t = rsi;
         __ load_klass(t, rax);
-        __ movl(t, Address(t, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc)));
+        __ movl(t, Address(t, Klass::access_flags_offset()));
         __ testl(t, JVM_ACC_HAS_FINALIZER);
         __ jcc(Assembler::notZero, register_finalizer);
         __ ret(0);
--- a/src/cpu/x86/vm/cppInterpreter_x86.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/cppInterpreter_x86.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -511,7 +511,7 @@
     // get synchronization object
 
     Label done;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ movl(rax, access_flags);
     __ testl(rax, JVM_ACC_STATIC);
     __ movptr(rax, Address(locals, 0));                   // get receiver (assume this is frequent case)
@@ -763,7 +763,7 @@
 #endif // ASSERT
   // get synchronization object
   { Label done;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ movl(rax, access_flags);
     __ movptr(rdi, STATE(_locals));                                     // prepare to get receiver (assume common case)
     __ testl(rax, JVM_ACC_STATIC);
@@ -1180,7 +1180,7 @@
 
   // pass mirror handle if static call
   { Label L;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ movl(t, Address(method, methodOopDesc::access_flags_offset()));
     __ testl(t, JVM_ACC_STATIC);
     __ jcc(Assembler::zero, L);
--- a/src/cpu/x86/vm/methodHandles_x86.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/methodHandles_x86.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -1160,7 +1160,7 @@
   Address rcx_amh_conversion( rcx_recv, java_lang_invoke_AdapterMethodHandle::conversion_offset_in_bytes() );
   Address vmarg;                // __ argument_address(vmargslot)
 
-  const int java_mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+  const int java_mirror_offset = in_bytes(Klass::java_mirror_offset());
 
   if (have_entry(ek)) {
     __ nop();                   // empty stubs make SG sick
--- a/src/cpu/x86/vm/nativeInst_x86.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/nativeInst_x86.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -237,9 +237,21 @@
   int off = 0;
   u_char instr_0 = ubyte_at(off);
 
+  // See comment in Assembler::locate_operand() about VEX prefixes.
+  if (instr_0 == instruction_VEX_prefix_2bytes) {
+    assert((UseAVX > 0), "shouldn't have VEX prefix");
+    NOT_LP64(assert((0xC0 & ubyte_at(1)) == 0xC0, "shouldn't have LDS and LES instructions"));
+    return 2;
+  }
+  if (instr_0 == instruction_VEX_prefix_3bytes) {
+    assert((UseAVX > 0), "shouldn't have VEX prefix");
+    NOT_LP64(assert((0xC0 & ubyte_at(1)) == 0xC0, "shouldn't have LDS and LES instructions"));
+    return 3;
+  }
+
   // First check to see if we have a (prefixed or not) xor
-  if ( instr_0 >= instruction_prefix_wide_lo &&      // 0x40
-       instr_0 <= instruction_prefix_wide_hi) { // 0x4f
+  if (instr_0 >= instruction_prefix_wide_lo && // 0x40
+      instr_0 <= instruction_prefix_wide_hi) { // 0x4f
     off++;
     instr_0 = ubyte_at(off);
   }
@@ -256,13 +268,13 @@
     instr_0 = ubyte_at(off);
   }
 
-  if ( instr_0 == instruction_code_xmm_ss_prefix ||      // 0xf3
+  if ( instr_0 == instruction_code_xmm_ss_prefix || // 0xf3
        instr_0 == instruction_code_xmm_sd_prefix) { // 0xf2
     off++;
     instr_0 = ubyte_at(off);
   }
 
-  if ( instr_0 >= instruction_prefix_wide_lo &&      // 0x40
+  if ( instr_0 >= instruction_prefix_wide_lo && // 0x40
        instr_0 <= instruction_prefix_wide_hi) { // 0x4f
     off++;
     instr_0 = ubyte_at(off);
--- a/src/cpu/x86/vm/nativeInst_x86.hpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/nativeInst_x86.hpp	Sat Jan 14 00:47:46 2012 -0800
@@ -287,6 +287,9 @@
     instruction_code_xmm_store          = 0x11,
     instruction_code_xmm_lpd            = 0x12,
 
+    instruction_VEX_prefix_2bytes       = Assembler::VEX_2bytes,
+    instruction_VEX_prefix_3bytes       = Assembler::VEX_3bytes,
+
     instruction_size                    = 4,
     instruction_offset                  = 0,
     data_offset                         = 2,
--- a/src/cpu/x86/vm/register_definitions_x86.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/register_definitions_x86.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -53,6 +53,7 @@
 REGISTER_DEFINITION(Register, r15);
 #endif // AMD64
 
+REGISTER_DEFINITION(XMMRegister, xnoreg);
 REGISTER_DEFINITION(XMMRegister, xmm0 );
 REGISTER_DEFINITION(XMMRegister, xmm1 );
 REGISTER_DEFINITION(XMMRegister, xmm2 );
@@ -115,6 +116,7 @@
 REGISTER_DEFINITION(Register, r15_thread);
 #endif // AMD64
 
+REGISTER_DEFINITION(MMXRegister, mnoreg );
 REGISTER_DEFINITION(MMXRegister, mmx0 );
 REGISTER_DEFINITION(MMXRegister, mmx1 );
 REGISTER_DEFINITION(MMXRegister, mmx2 );
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -1374,8 +1374,7 @@
     //                                  L_success, L_failure, NULL);
     assert_different_registers(sub_klass, temp);
 
-    int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                     Klass::secondary_super_cache_offset_in_bytes());
+    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
 
     // if the pointers are equal, we are done (e.g., String[] elements)
     __ cmpptr(sub_klass, super_klass_addr);
@@ -1787,8 +1786,7 @@
     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
     //
 
-    int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-                    Klass::layout_helper_offset_in_bytes();
+    int lh_offset = in_bytes(Klass::layout_helper_offset());
     Address src_klass_lh_addr(rcx_src_klass, lh_offset);
 
     // Handle objArrays completely differently...
@@ -1914,10 +1912,8 @@
     // live at this point:  rcx_src_klass, dst[_pos], src[_pos]
     {
       // Handy offsets:
-      int  ek_offset = (klassOopDesc::header_size() * HeapWordSize +
-                        objArrayKlass::element_klass_offset_in_bytes());
-      int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                        Klass::super_check_offset_offset_in_bytes());
+      int  ek_offset = in_bytes(objArrayKlass::element_klass_offset());
+      int sco_offset = in_bytes(Klass::super_check_offset_offset());
 
       Register rsi_dst_klass = rsi;
       Register rdi_temp      = rdi;
@@ -2323,6 +2319,9 @@
       generate_throw_exception("WrongMethodTypeException throw_exception",
                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_WrongMethodTypeException),
                                rax, rcx);
+
+    // Build this early so it's available for the interpreter
+    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
   }
 
 
@@ -2334,7 +2333,6 @@
     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
-    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
 
     //------------------------------------------------------------------------------------------------------------------------
     // entry points that are platform specific
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -2261,8 +2261,7 @@
     // The ckoff and ckval must be mutually consistent,
     // even though caller generates both.
     { Label L;
-      int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                        Klass::super_check_offset_offset_in_bytes());
+      int sco_offset = in_bytes(Klass::super_check_offset_offset());
       __ cmpl(ckoff, Address(ckval, sco_offset));
       __ jcc(Assembler::equal, L);
       __ stop("super_check_offset inconsistent");
@@ -2572,8 +2571,7 @@
     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
     //
 
-    const int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-                          Klass::layout_helper_offset_in_bytes();
+    const int lh_offset = in_bytes(Klass::layout_helper_offset());
 
     // Handle objArrays completely differently...
     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
@@ -2722,15 +2720,13 @@
       assert_clean_int(count, sco_temp);
 
       // Generate the type check.
-      const int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                              Klass::super_check_offset_offset_in_bytes());
+      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
       assert_clean_int(sco_temp, rax);
       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
 
       // Fetch destination element klass from the objArrayKlass header.
-      int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
-                       objArrayKlass::element_klass_offset_in_bytes());
+      int ek_offset = in_bytes(objArrayKlass::element_klass_offset());
       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
       assert_clean_int(sco_temp, rax);
@@ -3072,6 +3068,13 @@
       generate_throw_exception("WrongMethodTypeException throw_exception",
                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_WrongMethodTypeException),
                                rax, rcx);
+
+    // Build this early so it's available for the interpreter.
+    StubRoutines::_throw_StackOverflowError_entry =
+      generate_throw_exception("StackOverflowError throw_exception",
+                               CAST_FROM_FN_PTR(address,
+                                                SharedRuntime::
+                                                throw_StackOverflowError));
   }
 
   void generate_all() {
@@ -3098,12 +3101,6 @@
                                                 SharedRuntime::
                                                 throw_NullPointerException_at_call));
 
-    StubRoutines::_throw_StackOverflowError_entry =
-      generate_throw_exception("StackOverflowError throw_exception",
-                               CAST_FROM_FN_PTR(address,
-                                                SharedRuntime::
-                                                throw_StackOverflowError));
-
     // entry points that are platform specific
     StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
     StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
--- a/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -522,9 +522,18 @@
 
   __ pop(rsi);  // get saved bcp / (c++ prev state ).
 
-  __ pop(rax);  // get return address
-  __ jump(ExternalAddress(Interpreter::throw_StackOverflowError_entry()));
+  // Restore sender's sp as SP. This is necessary if the sender's
+  // frame is an extended compiled frame (see gen_c2i_adapter())
+  // and safer anyway in case of JSR292 adaptations.
 
+  __ pop(rax); // return address must be moved if SP is changed
+  __ mov(rsp, rsi);
+  __ push(rax);
+
+  // Note: the restored frame is not necessarily interpreted.
+  // Use the shared runtime version of the StackOverflowError.
+  assert(StubRoutines::throw_StackOverflowError_entry() != NULL, "stub not yet generated");
+  __ jump(ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
   // all done with frame size check
   __ bind(after_frame_check_pop);
   __ pop(rsi);
@@ -552,7 +561,7 @@
   #endif // ASSERT
   // get synchronization object
   { Label done;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ movl(rax, access_flags);
     __ testl(rax, JVM_ACC_STATIC);
     __ movptr(rax, Address(rdi, Interpreter::local_offset_in_bytes(0)));  // get receiver (assume this is frequent case)
@@ -1012,7 +1021,7 @@
 
   // pass mirror handle if static call
   { Label L;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ movl(t, Address(method, methodOopDesc::access_flags_offset()));
     __ testl(t, JVM_ACC_STATIC);
     __ jcc(Assembler::zero, L);
--- a/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -467,8 +467,18 @@
   __ cmpptr(rsp, rax);
   __ jcc(Assembler::above, after_frame_check);
 
-  __ pop(rax); // get return address
-  __ jump(ExternalAddress(Interpreter::throw_StackOverflowError_entry()));
+  // Restore sender's sp as SP. This is necessary if the sender's
+  // frame is an extended compiled frame (see gen_c2i_adapter())
+  // and safer anyway in case of JSR292 adaptations.
+
+  __ pop(rax); // return address must be moved if SP is changed
+  __ mov(rsp, r13);
+  __ push(rax);
+
+  // Note: the restored frame is not necessarily interpreted.
+  // Use the shared runtime version of the StackOverflowError.
+  assert(StubRoutines::throw_StackOverflowError_entry() != NULL, "stub not yet generated");
+  __ jump(ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
 
   // all done with frame size check
   __ bind(after_frame_check);
@@ -505,8 +515,7 @@
 
   // get synchronization object
   {
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() +
-                              Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     Label done;
     __ movl(rax, access_flags);
     __ testl(rax, JVM_ACC_STATIC);
@@ -1006,8 +1015,7 @@
   // pass mirror handle if static call
   {
     Label L;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() +
-                              Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ movl(t, Address(method, methodOopDesc::access_flags_offset()));
     __ testl(t, JVM_ACC_STATIC);
     __ jcc(Assembler::zero, L);
--- a/src/cpu/x86/vm/templateTable_x86_32.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/templateTable_x86_32.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -980,7 +980,7 @@
   __ load_klass(rbx, rax);
   // Move superklass into EAX
   __ load_klass(rax, rdx);
-  __ movptr(rax, Address(rax, sizeof(oopDesc) + objArrayKlass::element_klass_offset_in_bytes()));
+  __ movptr(rax, Address(rax, objArrayKlass::element_klass_offset()));
   // Compress array+index*wordSize+12 into a single register.  Frees ECX.
   __ lea(rdx, element_address);
 
@@ -2033,7 +2033,7 @@
     assert(state == vtos, "only valid state");
     __ movptr(rax, aaddress(0));
     __ load_klass(rdi, rax);
-    __ movl(rdi, Address(rdi, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc)));
+    __ movl(rdi, Address(rdi, Klass::access_flags_offset()));
     __ testl(rdi, JVM_ACC_HAS_FINALIZER);
     Label skip_register_finalizer;
     __ jcc(Assembler::zero, skip_register_finalizer);
@@ -3188,11 +3188,11 @@
 
   // make sure klass is initialized & doesn't have finalizer
   // make sure klass is fully initialized
-  __ cmpl(Address(rcx, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc)), instanceKlass::fully_initialized);
+  __ cmpb(Address(rcx, instanceKlass::init_state_offset()), instanceKlass::fully_initialized);
   __ jcc(Assembler::notEqual, slow_case);
 
   // get instance_size in instanceKlass (scaled to a count of bytes)
-  __ movl(rdx, Address(rcx, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc)));
+  __ movl(rdx, Address(rcx, Klass::layout_helper_offset()));
   // test to see if it has a finalizer or is malformed in some way
   __ testl(rdx, Klass::_lh_instance_slow_path_bit);
   __ jcc(Assembler::notZero, slow_case);
@@ -3293,7 +3293,7 @@
     __ bind(initialize_header);
     if (UseBiasedLocking) {
       __ pop(rcx);   // get saved klass back in the register.
-      __ movptr(rbx, Address(rcx, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+      __ movptr(rbx, Address(rcx, Klass::prototype_header_offset()));
       __ movptr(Address(rax, oopDesc::mark_offset_in_bytes ()), rbx);
     } else {
       __ movptr(Address(rax, oopDesc::mark_offset_in_bytes ()),
--- a/src/cpu/x86/vm/templateTable_x86_64.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/templateTable_x86_64.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -1004,8 +1004,7 @@
   // Move superklass into rax
   __ load_klass(rax, rdx);
   __ movptr(rax, Address(rax,
-                         sizeof(oopDesc) +
-                         objArrayKlass::element_klass_offset_in_bytes()));
+                         objArrayKlass::element_klass_offset()));
   // Compress array + index*oopSize + 12 into a single register.  Frees rcx.
   __ lea(rdx, element_address);
 
@@ -2067,7 +2066,7 @@
     assert(state == vtos, "only valid state");
     __ movptr(c_rarg1, aaddress(0));
     __ load_klass(rdi, c_rarg1);
-    __ movl(rdi, Address(rdi, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc)));
+    __ movl(rdi, Address(rdi, Klass::access_flags_offset()));
     __ testl(rdi, JVM_ACC_HAS_FINALIZER);
     Label skip_register_finalizer;
     __ jcc(Assembler::zero, skip_register_finalizer);
@@ -3235,16 +3234,15 @@
 
   // make sure klass is initialized & doesn't have finalizer
   // make sure klass is fully initialized
-  __ cmpl(Address(rsi,
-                  instanceKlass::init_state_offset_in_bytes() +
-                  sizeof(oopDesc)),
+  __ cmpb(Address(rsi,
+                  instanceKlass::init_state_offset()),
           instanceKlass::fully_initialized);
   __ jcc(Assembler::notEqual, slow_case);
 
   // get instance_size in instanceKlass (scaled to a count of bytes)
   __ movl(rdx,
           Address(rsi,
-                  Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc)));
+                  Klass::layout_helper_offset()));
   // test to see if it has a finalizer or is malformed in some way
   __ testl(rdx, Klass::_lh_instance_slow_path_bit);
   __ jcc(Assembler::notZero, slow_case);
@@ -3337,7 +3335,7 @@
     // initialize object header only.
     __ bind(initialize_header);
     if (UseBiasedLocking) {
-      __ movptr(rscratch1, Address(rsi, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+      __ movptr(rscratch1, Address(rsi, Klass::prototype_header_offset()));
       __ movptr(Address(rax, oopDesc::mark_offset_in_bytes()), rscratch1);
     } else {
       __ movptr(Address(rax, oopDesc::mark_offset_in_bytes()),
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Sat Jan 14 00:47:46 2012 -0800
@@ -50,7 +50,7 @@
 VM_Version::CpuidInfo VM_Version::_cpuid_info   = { 0, };
 
 static BufferBlob* stub_blob;
-static const int stub_size = 400;
+static const int stub_size = 550;
 
 extern "C" {
   typedef void (*getPsrInfo_stub_t)(void*);
@@ -73,7 +73,7 @@
     const uint32_t CPU_FAMILY_486   = (4 << CPU_FAMILY_SHIFT);
 
     Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4;
-    Label ext_cpuid1, ext_cpuid5, done;
+    Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7, done;
 
     StubCodeMark mark(this, "VM_Version", "getPsrInfo_stub");
 #   define __ _masm->
@@ -229,14 +229,51 @@
     __ movl(Address(rsi, 8), rcx);
     __ movl(Address(rsi,12), rdx);
 
+    //
+    // Check if OS has enabled XGETBV instruction to access XCR0
+    // (OSXSAVE feature flag) and CPU supports AVX
+    //
+    __ andl(rcx, 0x18000000);
+    __ cmpl(rcx, 0x18000000);
+    __ jccb(Assembler::notEqual, sef_cpuid);
+
+    //
+    // XCR0, XFEATURE_ENABLED_MASK register
+    //
+    __ xorl(rcx, rcx);   // zero for XCR0 register
+    __ xgetbv();
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset())));
+    __ movl(Address(rsi, 0), rax);
+    __ movl(Address(rsi, 4), rdx);
+
+    //
+    // cpuid(0x7) Structured Extended Features
+    //
+    __ bind(sef_cpuid);
+    __ movl(rax, 7);
+    __ cmpl(rax, Address(rbp, in_bytes(VM_Version::std_cpuid0_offset()))); // Is cpuid(0x7) supported?
+    __ jccb(Assembler::greater, ext_cpuid);
+
+    __ xorl(rcx, rcx);
+    __ cpuid();
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
+    __ movl(Address(rsi, 0), rax);
+    __ movl(Address(rsi, 4), rbx);
+
+    //
+    // Extended cpuid(0x80000000)
+    //
+    __ bind(ext_cpuid);
     __ movl(rax, 0x80000000);
     __ cpuid();
     __ cmpl(rax, 0x80000000);     // Is cpuid(0x80000001) supported?
     __ jcc(Assembler::belowEqual, done);
     __ cmpl(rax, 0x80000004);     // Is cpuid(0x80000005) supported?
     __ jccb(Assembler::belowEqual, ext_cpuid1);
+    __ cmpl(rax, 0x80000006);     // Is cpuid(0x80000007) supported?
+    __ jccb(Assembler::belowEqual, ext_cpuid5);
     __ cmpl(rax, 0x80000007);     // Is cpuid(0x80000008) supported?
-    __ jccb(Assembler::belowEqual, ext_cpuid5);
+    __ jccb(Assembler::belowEqual, ext_cpuid7);
     //
     // Extended cpuid(0x80000008)
     //
@@ -249,6 +286,18 @@
     __ movl(Address(rsi,12), rdx);
 
     //
+    // Extended cpuid(0x80000007)
+    //
+    __ bind(ext_cpuid7);
+    __ movl(rax, 0x80000007);
+    __ cpuid();
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::ext_cpuid7_offset())));
+    __ movl(Address(rsi, 0), rax);
+    __ movl(Address(rsi, 4), rbx);
+    __ movl(Address(rsi, 8), rcx);
+    __ movl(Address(rsi,12), rdx);
+
+    //
     // Extended cpuid(0x80000005)
     //
     __ bind(ext_cpuid5);
@@ -359,13 +408,19 @@
   if (UseSSE < 1)
     _cpuFeatures &= ~CPU_SSE;
 
+  if (UseAVX < 2)
+    _cpuFeatures &= ~CPU_AVX2;
+
+  if (UseAVX < 1)
+    _cpuFeatures &= ~CPU_AVX;
+
   if (logical_processors_per_package() == 1) {
     // HT processor could be installed on a system which doesn't support HT.
     _cpuFeatures &= ~CPU_HT;
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -379,27 +434,39 @@
                (supports_sse4_1() ? ", sse4.1" : ""),
                (supports_sse4_2() ? ", sse4.2" : ""),
                (supports_popcnt() ? ", popcnt" : ""),
+               (supports_avx()    ? ", avx" : ""),
+               (supports_avx2()   ? ", avx2" : ""),
                (supports_mmx_ext() ? ", mmxext" : ""),
                (supports_3dnow_prefetch() ? ", 3dnowpref" : ""),
                (supports_lzcnt()   ? ", lzcnt": ""),
                (supports_sse4a()   ? ", sse4a": ""),
-               (supports_ht() ? ", ht": ""));
+               (supports_ht() ? ", ht": ""),
+               (supports_tsc() ? ", tsc": ""),
+               (supports_tscinv_bit() ? ", tscinvbit": ""),
+               (supports_tscinv() ? ", tscinv": ""));
   _features_str = strdup(buf);
 
   // UseSSE is set to the smaller of what hardware supports and what
   // the command line requires.  I.e., you cannot set UseSSE to 2 on
   // older Pentiums which do not support it.
-  if( UseSSE > 4 ) UseSSE=4;
-  if( UseSSE < 0 ) UseSSE=0;
-  if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support
+  if (UseSSE > 4) UseSSE=4;
+  if (UseSSE < 0) UseSSE=0;
+  if (!supports_sse4_1()) // Drop to 3 if no SSE4 support
     UseSSE = MIN2((intx)3,UseSSE);
-  if( !supports_sse3() ) // Drop to 2 if no SSE3 support
+  if (!supports_sse3()) // Drop to 2 if no SSE3 support
     UseSSE = MIN2((intx)2,UseSSE);
-  if( !supports_sse2() ) // Drop to 1 if no SSE2 support
+  if (!supports_sse2()) // Drop to 1 if no SSE2 support
     UseSSE = MIN2((intx)1,UseSSE);
-  if( !supports_sse () ) // Drop to 0 if no SSE  support
+  if (!supports_sse ()) // Drop to 0 if no SSE  support
     UseSSE = 0;
 
+  if (UseAVX > 2) UseAVX=2;
+  if (UseAVX < 0) UseAVX=0;
+  if (!supports_avx2()) // Drop to 1 if no AVX2 support
+    UseAVX = MIN2((intx)1,UseAVX);
+  if (!supports_avx ()) // Drop to 0 if no AVX  support
+    UseAVX = 0;
+
   // On new cpus instructions which update whole XMM register should be used
   // to prevent partial register stall due to dependencies on high half.
   //
@@ -534,6 +601,9 @@
     if (FLAG_IS_DEFAULT(UsePopCountInstruction)) {
       UsePopCountInstruction = true;
     }
+  } else if (UsePopCountInstruction) {
+    warning("POPCNT instruction is not available on this CPU");
+    FLAG_SET_DEFAULT(UsePopCountInstruction, false);
   }
 
 #ifdef COMPILER2
@@ -605,7 +675,11 @@
   if (PrintMiscellaneous && Verbose) {
     tty->print_cr("Logical CPUs per core: %u",
                   logical_processors_per_package());
-    tty->print_cr("UseSSE=%d",UseSSE);
+    tty->print("UseSSE=%d",UseSSE);
+    if (UseAVX > 0) {
+      tty->print("  UseAVX=%d",UseAVX);
+    }
+    tty->cr();
     tty->print("Allocation");
     if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) {
       tty->print_cr(": no prefetching");
--- a/src/cpu/x86/vm/vm_version_x86.hpp	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/vm_version_x86.hpp	Sat Jan 14 00:47:46 2012 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -78,7 +78,10 @@
                sse4_2   : 1,
                         : 2,
                popcnt   : 1,
-                        : 8;
+                        : 3,
+               osxsave  : 1,
+               avx      : 1,
+                        : 3;
     } bits;
   };
 
@@ -168,6 +171,15 @@
     } bits;
   };
 
+  union ExtCpuid7Edx {
+    uint32_t value;
+    struct {
+      uint32_t               : 8,
+              tsc_invariance : 1,
+                             : 23;
+    } bits;
+  };
+
   union ExtCpuid8Ecx {
     uint32_t value;
     struct {
@@ -176,32 +188,75 @@
     } bits;
   };
 
+  union SefCpuid7Eax {
+    uint32_t value;
+  };
+
+  union SefCpuid7Ebx {
+    uint32_t value;
+    struct {
+      uint32_t fsgsbase : 1,
+                        : 2,
+                   bmi1 : 1,
+                        : 1,
+                   avx2 : 1,
+                        : 2,
+                   bmi2 : 1,
+                        : 23;
+    } bits;
+  };
+
+  union XemXcr0Eax {
+    uint32_t value;
+    struct {
+      uint32_t x87 : 1,
+               sse : 1,
+               ymm : 1,
+                   : 29;
+    } bits;
+  };
+
 protected:
-   static int _cpu;
-   static int _model;
-   static int _stepping;
-   static int _cpuFeatures;     // features returned by the "cpuid" instruction
-                                // 0 if this instruction is not available
-   static const char* _features_str;
+  static int _cpu;
+  static int _model;
+  static int _stepping;
+  static int _cpuFeatures;     // features returned by the "cpuid" instruction
+                               // 0 if this instruction is not available
+  static const char* _features_str;
 
-   enum {
-     CPU_CX8    = (1 << 0), // next bits are from cpuid 1 (EDX)
-     CPU_CMOV   = (1 << 1),
-     CPU_FXSR   = (1 << 2),
-     CPU_HT     = (1 << 3),
-     CPU_MMX    = (1 << 4),
-     CPU_3DNOW_PREFETCH  = (1 << 5), // Processor supports 3dnow prefetch and prefetchw instructions
-                                     // may not necessarily support other 3dnow instructions
-     CPU_SSE    = (1 << 6),
-     CPU_SSE2   = (1 << 7),
-     CPU_SSE3   = (1 << 8), // SSE3 comes from cpuid 1 (ECX)
-     CPU_SSSE3  = (1 << 9),
-     CPU_SSE4A  = (1 << 10),
-     CPU_SSE4_1 = (1 << 11),
-     CPU_SSE4_2 = (1 << 12),
-     CPU_POPCNT = (1 << 13),
-     CPU_LZCNT  = (1 << 14)
-   } cpuFeatureFlags;
+  enum {
+    CPU_CX8    = (1 << 0), // next bits are from cpuid 1 (EDX)
+    CPU_CMOV   = (1 << 1),
+    CPU_FXSR   = (1 << 2),
+    CPU_HT     = (1 << 3),
+    CPU_MMX    = (1 << 4),
+    CPU_3DNOW_PREFETCH  = (1 << 5), // Processor supports 3dnow prefetch and prefetchw instructions
+                                    // may not necessarily support other 3dnow instructions
+    CPU_SSE    = (1 << 6),
+    CPU_SSE2   = (1 << 7),
+    CPU_SSE3   = (1 << 8), // SSE3 comes from cpuid 1 (ECX)
+    CPU_SSSE3  = (1 << 9),
+    CPU_SSE4A  = (1 << 10),
+    CPU_SSE4_1 = (1 << 11),
+    CPU_SSE4_2 = (1 << 12),
+    CPU_POPCNT = (1 << 13),
+    CPU_LZCNT  = (1 << 14),
+    CPU_TSC    = (1 << 15),
+    CPU_TSCINV = (1 << 16),
+    CPU_AVX    = (1 << 17),
+    CPU_AVX2   = (1 << 18)
+  } cpuFeatureFlags;
+
+  enum {
+    // AMD
+    CPU_FAMILY_AMD_11H       = 17,
+    // Intel
+    CPU_FAMILY_INTEL_CORE    = 6,
+    CPU_MODEL_NEHALEM_EP     = 26,
+    CPU_MODEL_WESTMERE_EP    = 44,
+//  CPU_MODEL_IVYBRIDGE_EP   = ??, TODO - get real value
+    CPU_MODEL_SANDYBRIDGE_EP = 45
+  } cpuExtendedFamily;
 
   // cpuid information block.  All info derived from executing cpuid with
   // various function numbers is stored here.  Intel and AMD info is
@@ -228,6 +283,12 @@
     uint32_t     dcp_cpuid4_ecx; // unused currently
     uint32_t     dcp_cpuid4_edx; // unused currently
 
+    // cpuid function 7 (structured extended features)
+    SefCpuid7Eax sef_cpuid7_eax;
+    SefCpuid7Ebx sef_cpuid7_ebx;
+    uint32_t     sef_cpuid7_ecx; // unused currently
+    uint32_t     sef_cpuid7_edx; // unused currently
+
     // cpuid function 0xB (processor topology)
     // ecx = 0
     uint32_t     tpl_cpuidB0_eax;
@@ -270,11 +331,21 @@
     ExtCpuid5Ex  ext_cpuid5_ecx; // L1 data cache info (AMD)
     ExtCpuid5Ex  ext_cpuid5_edx; // L1 instruction cache info (AMD)
 
+    // cpuid function 0x80000007
+    uint32_t     ext_cpuid7_eax; // reserved
+    uint32_t     ext_cpuid7_ebx; // reserved
+    uint32_t     ext_cpuid7_ecx; // reserved
+    ExtCpuid7Edx ext_cpuid7_edx; // tscinv
+
     // cpuid function 0x80000008
     uint32_t     ext_cpuid8_eax; // unused currently
     uint32_t     ext_cpuid8_ebx; // reserved
     ExtCpuid8Ecx ext_cpuid8_ecx;
     uint32_t     ext_cpuid8_edx; // reserved
+
+    // extended control register XCR0 (the XFEATURE_ENABLED_MASK register)
+    XemXcr0Eax   xem_xcr0_eax;
+    uint32_t     xem_xcr0_edx; // reserved
   };
 
   // The actual cpuid info block
@@ -286,19 +357,23 @@
     result += _cpuid_info.std_cpuid1_eax.bits.ext_family;
     return result;
   }
+
   static uint32_t extended_cpu_model() {
     uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model;
     result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4;
     return result;
   }
+
   static uint32_t cpu_stepping() {
     uint32_t result = _cpuid_info.std_cpuid1_eax.bits.stepping;
     return result;
   }
+
   static uint logical_processor_count() {
     uint result = threads_per_core();
     return result;
   }
+
   static uint32_t feature_flags() {
     uint32_t result = 0;
     if (_cpuid_info.std_cpuid1_edx.bits.cmpxchg8 != 0)
@@ -328,6 +403,18 @@
       result |= CPU_SSE4_2;
     if (_cpuid_info.std_cpuid1_ecx.bits.popcnt != 0)
       result |= CPU_POPCNT;
+    if (_cpuid_info.std_cpuid1_ecx.bits.avx != 0 &&
+        _cpuid_info.std_cpuid1_ecx.bits.osxsave != 0 &&
+        _cpuid_info.xem_xcr0_eax.bits.sse != 0 &&
+        _cpuid_info.xem_xcr0_eax.bits.ymm != 0) {
+      result |= CPU_AVX;
+      if (_cpuid_info.sef_cpuid7_ebx.bits.avx2 != 0)
+        result |= CPU_AVX2;
+    }
+    if (_cpuid_info.std_cpuid1_edx.bits.tsc != 0)
+      result |= CPU_TSC;
+    if (_cpuid_info.ext_cpuid7_edx.bits.tsc_invariance != 0)
+      result |= CPU_TSCINV;
 
     // AMD features.
     if (is_amd()) {
@@ -350,12 +437,15 @@
   static ByteSize std_cpuid0_offset() { return byte_offset_of(CpuidInfo, std_max_function); }
   static ByteSize std_cpuid1_offset() { return byte_offset_of(CpuidInfo, std_cpuid1_eax); }
   static ByteSize dcp_cpuid4_offset() { return byte_offset_of(CpuidInfo, dcp_cpuid4_eax); }
+  static ByteSize sef_cpuid7_offset() { return byte_offset_of(CpuidInfo, sef_cpuid7_eax); }
   static ByteSize ext_cpuid1_offset() { return byte_offset_of(CpuidInfo, ext_cpuid1_eax); }
   static ByteSize ext_cpuid5_offset() { return byte_offset_of(CpuidInfo, ext_cpuid5_eax); }
+  static ByteSize ext_cpuid7_offset() { return byte_offset_of(CpuidInfo, ext_cpuid7_eax); }
   static ByteSize ext_cpuid8_offset() { return byte_offset_of(CpuidInfo, ext_cpuid8_eax); }
   static ByteSize tpl_cpuidB0_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB0_eax); }
   static ByteSize tpl_cpuidB1_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB1_eax); }
   static ByteSize tpl_cpuidB2_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB2_eax); }
+  static ByteSize xem_xcr0_offset() { return byte_offset_of(CpuidInfo, xem_xcr0_eax); }
 
   // Initialization
   static void initialize();
@@ -382,7 +472,6 @@
   //
   static int  cpu_family()        { return _cpu;}
   static bool is_P6()             { return cpu_family() >= 6; }
-
   static bool is_amd()            { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA'
   static bool is_intel()          { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG'
 
@@ -447,14 +536,51 @@
   static bool supports_sse4_1()   { return (_cpuFeatures & CPU_SSE4_1) != 0; }
   static bool supports_sse4_2()   { return (_cpuFeatures & CPU_SSE4_2) != 0; }
   static bool supports_popcnt()   { return (_cpuFeatures & CPU_POPCNT) != 0; }
-  //
+  static bool supports_avx()      { return (_cpuFeatures & CPU_AVX) != 0; }
+  static bool supports_avx2()     { return (_cpuFeatures & CPU_AVX2) != 0; }
+  static bool supports_tsc()      { return (_cpuFeatures & CPU_TSC)    != 0; }
+
+  // Intel features
+  static bool is_intel_family_core() { return is_intel() &&
+                                       extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
+
+  static bool is_intel_tsc_synched_at_init()  {
+    if (is_intel_family_core()) {
+      uint32_t ext_model = extended_cpu_model();
+      if (ext_model == CPU_MODEL_NEHALEM_EP   ||
+          ext_model == CPU_MODEL_WESTMERE_EP  ||
+// TODO   ext_model == CPU_MODEL_IVYBRIDGE_EP ||
+          ext_model == CPU_MODEL_SANDYBRIDGE_EP) {
+        // 2-socket invtsc support. EX versions with 4 sockets are not
+        // guaranteed to synchronize tscs at initialization via a double
+        // handshake. The tscs can be explicitly set in software.  Code
+        // that uses tsc values must be prepared for them to arbitrarily
+        // jump backward or forward.
+        return true;
+      }
+    }
+    return false;
+  }
+
   // AMD features
-  //
   static bool supports_3dnow_prefetch()    { return (_cpuFeatures & CPU_3DNOW_PREFETCH) != 0; }
   static bool supports_mmx_ext()  { return is_amd() && _cpuid_info.ext_cpuid1_edx.bits.mmx_amd != 0; }
   static bool supports_lzcnt()    { return (_cpuFeatures & CPU_LZCNT) != 0; }
   static bool supports_sse4a()    { return (_cpuFeatures & CPU_SSE4A) != 0; }
 
+  static bool is_amd_Barcelona()  { return is_amd() &&
+                                           extended_cpu_family() == CPU_FAMILY_AMD_11H; }
+
+  // Intel and AMD newer cores support fast timestamps well
+  static bool supports_tscinv_bit() {
+    return (_cpuFeatures & CPU_TSCINV) != 0;
+  }
+  static bool supports_tscinv() {
+    return supports_tscinv_bit() &&
+           ( (is_amd() && !is_amd_Barcelona()) ||
+             is_intel_tsc_synched_at_init() );
+  }
+
   // Intel Core and newer cpus have fast IDIV instruction (excluding Atom).
   static bool has_fast_idiv()     { return is_intel() && cpu_family() == 6 &&
                                            supports_sse3() && _model != 0x1C; }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cpu/x86/vm/x86.ad	Sat Jan 14 00:47:46 2012 -0800
@@ -0,0 +1,777 @@
+//
+// Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+//
+// This code is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License version 2 only, as
+// published by the Free Software Foundation.
+//
+// This code is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// version 2 for more details (a copy is included in the LICENSE file that
+// accompanied this code).
+//
+// You should have received a copy of the GNU General Public License version
+// 2 along with this work; if not, write to the Free Software Foundation,
+// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+//
+// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+// or visit www.oracle.com if you need additional information or have any
+// questions.
+//
+//
+
+// X86 Common Architecture Description File
+
+source %{
+  // Float masks come from different places depending on platform.
+#ifdef _LP64
+  static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
+  static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
+  static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
+  static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
+#else
+  static address float_signmask()  { return (address)float_signmask_pool; }
+  static address float_signflip()  { return (address)float_signflip_pool; }
+  static address double_signmask() { return (address)double_signmask_pool; }
+  static address double_signflip() { return (address)double_signflip_pool; }
+#endif
+%}
+
+// INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
+
+instruct addF_reg(regF dst, regF src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (AddF dst src));
+
+  format %{ "addss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addF_mem(regF dst, memory src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (AddF dst (LoadF src)));
+
+  format %{ "addss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addF_imm(regF dst, immF con) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (AddF dst con));
+  format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddF_reg(regF dst, regF src1, regF src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddF src1 src2));
+
+  format %{ "vaddss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddF_mem(regF dst, regF src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddF src1 (LoadF src2)));
+
+  format %{ "vaddss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddF_imm(regF dst, regF src, immF con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddF src con));
+
+  format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addD_reg(regD dst, regD src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (AddD dst src));
+
+  format %{ "addsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addD_mem(regD dst, memory src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (AddD dst (LoadD src)));
+
+  format %{ "addsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addD_imm(regD dst, immD con) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (AddD dst con));
+  format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddD_reg(regD dst, regD src1, regD src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddD src1 src2));
+
+  format %{ "vaddsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddD_mem(regD dst, regD src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddD src1 (LoadD src2)));
+
+  format %{ "vaddsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddD_imm(regD dst, regD src, immD con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddD src con));
+
+  format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subF_reg(regF dst, regF src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (SubF dst src));
+
+  format %{ "subss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subF_mem(regF dst, memory src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (SubF dst (LoadF src)));
+
+  format %{ "subss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subF_imm(regF dst, immF con) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (SubF dst con));
+  format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubF_reg(regF dst, regF src1, regF src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubF src1 src2));
+
+  format %{ "vsubss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubF_mem(regF dst, regF src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubF src1 (LoadF src2)));
+
+  format %{ "vsubss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubF_imm(regF dst, regF src, immF con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubF src con));
+
+  format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subD_reg(regD dst, regD src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (SubD dst src));
+
+  format %{ "subsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subD_mem(regD dst, memory src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (SubD dst (LoadD src)));
+
+  format %{ "subsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subD_imm(regD dst, immD con) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (SubD dst con));
+  format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubD_reg(regD dst, regD src1, regD src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubD src1 src2));
+
+  format %{ "vsubsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubD_mem(regD dst, regD src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubD src1 (LoadD src2)));
+
+  format %{ "vsubsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubD_imm(regD dst, regD src, immD con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubD src con));
+
+  format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulF_reg(regF dst, regF src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (MulF dst src));
+
+  format %{ "mulss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulF_mem(regF dst, memory src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (MulF dst (LoadF src)));
+
+  format %{ "mulss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulF_imm(regF dst, immF con) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (MulF dst con));
+  format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulF_reg(regF dst, regF src1, regF src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulF src1 src2));
+
+  format %{ "vmulss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulF_mem(regF dst, regF src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulF src1 (LoadF src2)));
+
+  format %{ "vmulss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulF_imm(regF dst, regF src, immF con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulF src con));
+
+  format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulD_reg(regD dst, regD src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (MulD dst src));
+
+  format %{ "mulsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulD_mem(regD dst, memory src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (MulD dst (LoadD src)));
+
+  format %{ "mulsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulD_imm(regD dst, immD con) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (MulD dst con));
+  format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulD_reg(regD dst, regD src1, regD src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulD src1 src2));
+
+  format %{ "vmulsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulD_mem(regD dst, regD src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulD src1 (LoadD src2)));
+
+  format %{ "vmulsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulD_imm(regD dst, regD src, immD con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulD src con));
+
+  format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divF_reg(regF dst, regF src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (DivF dst src));
+
+  format %{ "divss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divF_mem(regF dst, memory src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (DivF dst (LoadF src)));
+
+  format %{ "divss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divF_imm(regF dst, immF con) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (DivF dst con));
+  format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivF_reg(regF dst, regF src1, regF src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivF src1 src2));
+
+  format %{ "vdivss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivF_mem(regF dst, regF src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivF src1 (LoadF src2)));
+
+  format %{ "vdivss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivF_imm(regF dst, regF src, immF con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivF src con));
+
+  format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divD_reg(regD dst, regD src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (DivD dst src));
+
+  format %{ "divsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divD_mem(regD dst, memory src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (DivD dst (LoadD src)));
+
+  format %{ "divsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divD_imm(regD dst, immD con) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (DivD dst con));
+  format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivD_reg(regD dst, regD src1, regD src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivD src1 src2));
+
+  format %{ "vdivsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivD_mem(regD dst, regD src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivD src1 (LoadD src2)));
+
+  format %{ "vdivsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivD_imm(regD dst, regD src, immD con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivD src con));
+
+  format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct absF_reg(regF dst) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (AbsF dst));
+  ins_cost(150);
+  format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
+  ins_encode %{
+    __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vabsF_reg(regF dst, regF src) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AbsF src));
+  ins_cost(150);
+  format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
+  ins_encode %{
+    __ vandps($dst$$XMMRegister, $src$$XMMRegister,
+              ExternalAddress(float_signmask()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct absD_reg(regD dst) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (AbsD dst));
+  ins_cost(150);
+  format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
+            "# abs double by sign masking" %}
+  ins_encode %{
+    __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vabsD_reg(regD dst, regD src) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AbsD src));
+  ins_cost(150);
+  format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
+            "# abs double by sign masking" %}
+  ins_encode %{
+    __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
+              ExternalAddress(double_signmask()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct negF_reg(regF dst) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (NegF dst));
+  ins_cost(150);
+  format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vnegF_reg(regF dst, regF src) %{
+  predicate(UseAVX > 0);
+  match(Set dst (NegF src));
+  ins_cost(150);
+  format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
+  ins_encode %{
+    __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
+              ExternalAddress(float_signflip()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct negD_reg(regD dst) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (NegD dst));
+  ins_cost(150);
+  format %{ "xorpd   $dst, [0x8000000000000000]\t"
+            "# neg double by sign flipping" %}
+  ins_encode %{
+    __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vnegD_reg(regD dst, regD src) %{
+  predicate(UseAVX > 0);
+  match(Set dst (NegD src));
+  ins_cost(150);
+  format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
+            "# neg double by sign flipping" %}
+  ins_encode %{
+    __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
+              ExternalAddress(double_signflip()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtF_reg(regF dst, regF src) %{
+  predicate(UseSSE>=1);
+  match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
+
+  format %{ "sqrtss  $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtF_mem(regF dst, memory src) %{
+  predicate(UseSSE>=1);
+  match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF src)))));
+
+  format %{ "sqrtss  $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtF_imm(regF dst, immF con) %{
+  predicate(UseSSE>=1);
+  match(Set dst (ConvD2F (SqrtD (ConvF2D con))));
+  format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtD_reg(regD dst, regD src) %{
+  predicate(UseSSE>=2);
+  match(Set dst (SqrtD src));
+
+  format %{ "sqrtsd  $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtD_mem(regD dst, memory src) %{
+  predicate(UseSSE>=2);
+  match(Set dst (SqrtD (LoadD src)));
+
+  format %{ "sqrtsd  $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtD_imm(regD dst, immD con) %{
+  predicate(UseSSE>=2);
+  match(Set dst (SqrtD con));
+  format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
--- a/src/cpu/x86/vm/x86_32.ad	Fri Jan 13 10:05:33 2012 -0800
+++ b/src/cpu/x86/vm/x86_32.ad	Sat Jan 14 00:47:46 2012 -0800
@@ -281,7 +281,7 @@
 }
 
 static int preserve_SP_size() {
-  return LP64_ONLY(1 +) 2;  // [rex,] op, rm(reg/reg)
+  return 2;  // op, rm(reg/reg)
 }
 
 // !!!!! Special hack to get all type of calls to specify the byte offset
@@ -495,14 +495,34 @@
   }
 }
 
-void encode_CopyXD( CodeBuffer &cbuf, int dst_encoding, int src_encoding ) {
-  if( dst_encoding == src_encoding ) {
-    // reg-reg copy, use an empty encoding
-  } else {
-    MacroAssembler _masm(&cbuf);
-
-    __ movdqa(as_XMMRegister(dst_encoding), as_XMMRegister(src_encoding));
-  }
+void emit_cmpfp_fixup(MacroAssembler& _masm) {
+  Label exit;
+  __ jccb(Assembler::noParity, exit);
+  __ pushf();
+  //
+  // comiss/ucomiss instructions set ZF,PF,CF flags and
+  // zero OF,AF,SF for NaN values.
+  // Fixup flags by zeroing ZF,PF so that compare of NaN
+  // values returns 'less than' result (CF is set).
+  // Leave the rest of flags unchanged.
+  //
+  //    7 6 5 4 3 2 1 0
+  //   |S|Z|r|A|r|P|r|C|  (r - reserved bit)
+  //    0 0 1 0 1 0 1 1   (0x2B)
+  //
+  __ andl(Address(rsp, 0), 0xffffff2b);
+  __ popf();
+  __ bind(exit);
+}
+
+void emit_cmpfp3(MacroAssembler& _masm, Register dst) {
+  Label done;
+  __ movl(dst, -1);
+  __ jcc(Assembler::parity, done);
+  __ jcc(Assembler::below, done);
+  __ setb(Assembler::notEqual, dst);
+  __ movzbl(dst, dst);
+  __ bind(done);
 }
 
 
@@ -792,92 +812,88 @@
 // Helper for XMM registers.  Extra opcode bits, limited syntax.
 static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load,
                          int offset, int reg_lo, int reg_hi, int size, outputStream* st ) {
-  if( cbuf ) {
-    if( reg_lo+1 == reg_hi ) { // double move?
-      if( is_load && !UseXmmLoadAndClearUpper )
-        emit_opcode(*cbuf, 0x66 ); // use 'movlpd' for load
-      else
-        emit_opcode(*cbuf, 0xF2 ); // use 'movsd' otherwise
+  if (cbuf) {
+    MacroAssembler _masm(cbuf);
+    if (reg_lo+1 == reg_hi) { // double move?
+      if (is_load) {
+        __ movdbl(as_XMMRegister(Matcher::_regEncode[reg_lo]), Address(rsp, offset));
+      } else {
+        __ movdbl(Address(rsp, offset), as_XMMRegister(Matcher::_regEncode[reg_lo]));
+      }
     } else {
-      emit_opcode(*cbuf, 0xF3 );
+      if (is_load) {
+        __ movflt(as_XMMRegister(Matcher::_regEncode[reg_lo]), Address(rsp, offset));
+      } else {
+        __ movflt(Address(rsp, offset), as_XMMRegister(Matcher::_regEncode[reg_lo]));
+      }
     }
-    emit_opcode(*cbuf, 0x0F );
-    if( reg_lo+1 == reg_hi && is_load && !UseXmmLoadAndClearUpper )
-      emit_opcode(*cbuf, 0x12 );   // use 'movlpd' for load
-    else
-      emit_opcode(*cbuf, is_load ? 0x10 : 0x11 );
-    encode_RegMem(*cbuf, Matcher::_regEncode[reg_lo], ESP_enc, 0x4, 0, offset, false);
 #ifndef PRODUCT
-  } else if( !do_size ) {
-    if( size != 0 ) st->print("\n\t");
-    if( reg_lo+1 == reg_hi ) { // double move?
-      if( is_load ) st->print("%s %s,[ESP + #%d]",
-                               UseXmmLoadAndClearUpper ? "MOVSD " : "MOVLPD",
-                               Matcher::regName[reg_lo], offset);
-      else          st->print("MOVSD  [ESP + #%d],%s",
-                               offset, Matcher::regName[reg_lo]);
+  } else if (!do_size) {
+    if (size != 0) st->print("\n\t");
+    if (reg_lo+1 == reg_hi) { // double move?
+      if (is_load) st->print("%s %s,[ESP + #%d]",
+                              UseXmmLoadAndClearUpper ? "MOVSD " : "MOVLPD",
+                              Matcher::regName[reg_lo], offset);
+      else         st->print("MOVSD  [ESP + #%d],%s",
+                              offset, Matcher::regName[reg_lo]);
     } else {
-      if( is_load ) st->print("MOVSS  %s,[ESP + #%d]",
-                               Matcher::regName[reg_lo], offset);
-      else          st->print("MOVSS  [ESP + #%d],%s",
-                               offset, Matcher::regName[reg_lo]);
+      if (is_load) st->print("MOVSS  %s,[ESP + #%d]",
+                              Matcher::regName[reg_lo], offset);
+      else         st->print("MOVSS  [ESP + #%d],%s",
+                              offset, Matcher::regName[reg_lo]);
     }
 #endif
   }
   int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
+  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes.
   return size+5+offset_size;
 }
 
 
 static int impl_movx_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
                             int src_hi, int dst_hi, int size, outputStream* st ) {
-  if( UseXmmRegToRegMoveAll ) {//Use movaps,movapd to move between xmm registers
-    if( cbuf ) {
-      if( (src_lo+1 == src_hi && dst_lo+1 == dst_hi) ) {
-        emit_opcode(*cbuf, 0x66 );
-      }
-      emit_opcode(*cbuf, 0x0F );
-      emit_opcode(*cbuf, 0x28 );
-      emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst_lo], Matcher::_regEncode[src_lo] );
+  if (cbuf) {
+    MacroAssembler _masm(cbuf);
+    if (src_lo+1 == src_hi && dst_lo+1 == dst_hi) { // double move?
+      __ movdbl(as_XMMRegister(Matcher::_regEncode[dst_lo]),
+                as_XMMRegister(Matcher::_regEncode[src_lo]));
+    } else {
+      __ movflt(as_XMMRegister(Matcher::_regEncode[dst_lo]),
+                as_XMMRegister(Matcher::_regEncode[src_lo]));
+    }
 #ifndef PRODUCT
-    } else if( !do_size ) {
-      if( size != 0 ) st->print("\n\t");
-      if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double move?
+  } else if (!do_size) {
+    if (size != 0) st->print("\n\t");
+    if (UseXmmRegToRegMoveAll) {//Use movaps,movapd to move between xmm registers
+      if (src_lo+1 == src_hi && dst_lo+1 == dst_hi) { // double move?
         st->print("MOVAPD %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       } else {
         st->print("MOVAPS %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       }
-#endif
-    }
-    return size + ((src_lo+1 == src_hi && dst_lo+1 == dst_hi) ? 4 : 3);
-  } else {
-    if( cbuf ) {
-      emit_opcode(*cbuf, (src_lo+1 == src_hi && dst_lo+1 == dst_hi) ? 0xF2 : 0xF3 );
-      emit_opcode(*cbuf, 0x0F );
-      emit_opcode(*cbuf, 0x10 );
-      emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst_lo], Matcher::_regEncode[src_lo] );
-#ifndef PRODUCT
-    } else if( !do_size ) {
-      if( size != 0 ) st->print("\n\t");
+    } else {
       if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double move?
         st->print("MOVSD  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       } else {
         st->print("MOVSS  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       }
+    }
 #endif
-    }
-    return size+4;
   }
+  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes.
+  // Only MOVAPS SSE prefix uses 1 byte.
+  int sz = 4;
+  if (!(src_lo+1 == src_hi && dst_lo+1 == dst_hi) &&
+      UseXmmRegToRegMoveAll && (UseAVX == 0)) sz = 3;
+  return size + sz;
 }
 
 static int impl_movgpr2x_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
                             int src_hi, int dst_hi, int size, outputStream* st ) {
   // 32-bit
   if (cbuf) {
-    emit_opcode(*cbuf, 0x66);
-    emit_opcode(*cbuf, 0x0F);
-    emit_opcode(*cbuf, 0x6E);
-    emit_rm(*cbuf, 0x3, Matcher::_regEncode[dst_lo] & 7, Matcher::_regEncode[src_lo] & 7);
+    MacroAssembler _masm(cbuf);
+    __ movdl(as_XMMRegister(Matcher::_regEncode[dst_lo]),
+             as_Register(Matcher::_regEncode[src_lo]));
 #ifndef PRODUCT
   } else if (!do_size) {
     st->print("movdl   %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
@@ -891,10 +907,9 @@
                                  int src_hi, int dst_hi, int size, outputStream* st ) {
   // 32-bit
   if (cbuf) {
-    emit_opcode(*cbuf, 0x66);
-    emit_opcode(*cbuf, 0x0F);
-    emit_opcode(*cbuf, 0x7E);
-    emit_rm(*cbuf, 0x3, Matcher::_regEncode[src_lo] & 7, Matcher::_regEncode[dst_lo] & 7);
+    MacroAssembler _masm(cbuf);
+    __ movdl(as_Register(Matcher::_regEncode[dst_lo]),
+             as_XMMRegister(Matcher::_regEncode[src_lo]));
 #ifndef PRODUCT
   } else if (!do_size) {
     st->print("movdl   %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
@@ -1760,7 +1775,7 @@
     emit_cc(cbuf, $secondary, $cop$$cmpcode);
   %}
 
-  enc_class enc_cmov_d(cmpOp cop, regD src ) %{ // CMOV
+  enc_class enc_cmov_dpr(cmpOp cop, regDPR src ) %{ // CMOV
     int op = 0xDA00 + $cop$$cmpcode + ($src$$reg-1);
     emit_d8(cbuf, op >> 8 );
     emit_d8(cbuf, op & 255);
@@ -1931,11 +1946,6 @@
 
   %}
 
-  enc_class Xor_Reg (eRegI dst) %{
-    emit_opcode(cbuf, 0x33);
-    emit_rm(cbuf, 0x3, $dst$$reg, $dst$$reg);
-  %}
-
 //   Following encoding is no longer used, but may be restored if calling
 //   convention changes significantly.
 //   Became: Xor_Reg(EBP), Java_To_Runtime( labl )
@@ -2013,64 +2023,6 @@
   %}
 
 
-  enc_class MovI2X_reg(regX dst, eRegI src) %{
-    emit_opcode(cbuf, 0x66 );     // MOVD dst,src
-    emit_opcode(cbuf, 0x0F );
-    emit_opcode(cbuf, 0x6E );
-    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
-  %}
-
-  enc_class MovX2I_reg(eRegI dst, regX src) %{
-    emit_opcode(cbuf, 0x66 );     // MOVD dst,src
-    emit_opcode(cbuf, 0x0F );
-    emit_opcode(cbuf, 0x7E );
-    emit_rm(cbuf, 0x3, $src$$reg, $dst$$reg);
-  %}
-
-  enc_class MovL2XD_reg(regXD dst, eRegL src, regXD tmp) %{
-    { // MOVD $dst,$src.lo
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x6E);
-      emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
-    }
-    { // MOVD $tmp,$src.hi
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x6E);
-      emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg));
-    }
-    { // PUNPCKLDQ $dst,$tmp
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x62);
-      emit_rm(cbuf, 0x3, $dst$$reg, $tmp$$reg);
-     }
-  %}
-
-  enc_class MovXD2L_reg(eRegL dst, regXD src, regXD tmp) %{
-    { // MOVD $dst.lo,$src
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x7E);
-      emit_rm(cbuf, 0x3, $src$$reg, $dst$$reg);
-    }
-    { // PSHUFLW $tmp,$src,0x4E  (01001110b)
-      emit_opcode(cbuf,0xF2);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x70);
-      emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg);
-      emit_d8(cbuf, 0x4E);
-    }
-    { // MOVD $dst.hi,$tmp
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x7E);
-      emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg));
-    }
-  %}
-
-
   // Encode a reg-reg copy.  If it is useless, then empty encoding.
   enc_class enc_Copy( eRegI dst, eRegI src ) %{
     encode_Copy( cbuf, $dst$$reg, $src$$reg );
@@ -2080,11 +2032,6 @@
     encode_Copy( cbuf, $dst$$reg, $src$$reg );
   %}
 
-  // Encode xmm reg-reg copy.  If it is useless, then empty encoding.
-  enc_class enc_CopyXD( RegXD dst, RegXD src ) %{
-    encode_CopyXD( cbuf, $dst$$reg, $src$$reg );
-  %}
-
   enc_class RegReg (eRegI dst, eRegI src) %{    // RegReg(Many)
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
   %}
@@ -2116,14 +2063,14 @@
     $$$emit32$src$$constant;
   %}
 
-  enc_class Con32F_as_bits(immF src) %{        // storeF_imm
+  enc_class Con32FPR_as_bits(immFPR src) %{        // storeF_imm
     // Output Float immediate bits
     jfloat jf = $src$$constant;
     int    jf_as_bits = jint_cast( jf );
     emit_d32(cbuf, jf_as_bits);
   %}
 
-  enc_class Con32XF_as_bits(immXF src) %{      // storeX_imm
+  enc_class Con32F_as_bits(immF src) %{      // storeX_imm
     // Output Float immediate bits
     jfloat jf = $src$$constant;
     int    jf_as_bits = jint_cast( jf );
@@ -2336,7 +2283,7 @@
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
   %}
 
-  enc_class enc_FP_store(memory mem, regD src) %{
+  enc_class enc_FPR_store(memory mem, regDPR src) %{
     // If src is FPR1, we can just FST to store it.
     // Else we need to FLD it to FPR1, then FSTP to store/pop it.
     int reg_encoding = 0x2; // Just store
@@ -2485,7 +2432,7 @@
 
   // ----------------- Encodings for floating point unit -----------------
   // May leave result in FPU-TOS or FPU reg depending on opcodes
-  enc_class OpcReg_F (regF src) %{    // FMUL, FDIV
+  enc_class OpcReg_FPR(regFPR src) %{    // FMUL, FDIV
     $$$emit8$primary;
     emit_rm(cbuf, 0x3, $secondary, $src$$reg );
   %}
@@ -2497,17 +2444,17 @@
   %}
 
   // !!!!! equivalent to Pop_Reg_F
-  enc_class Pop_Reg_D( regD dst ) %{
+  enc_class Pop_Reg_DPR( regDPR dst ) %{
     emit_opcode( cbuf, 0xDD );           // FSTP   ST(i)
     emit_d8( cbuf, 0xD8+$dst$$reg );
   %}
 
-  enc_class Push_Reg_D( regD dst ) %{
+  enc_class Push_Reg_DPR( regDPR dst ) %{
     emit_opcode( cbuf, 0xD9 );
     emit_d8( cbuf, 0xC0-1+$dst$$reg );   // FLD ST(i-1)
   %}
 
-  enc_class strictfp_bias1( regD dst ) %{
+  enc_class strictfp_bias1( regDPR dst ) %{
     emit_opcode( cbuf, 0xDB );           // FLD m80real
     emit_opcode( cbuf, 0x2D );
     emit_d32( cbuf, (int)StubRoutines::addr_fpu_subnormal_bias1() );
@@ -2515,7 +2462,7 @@
     emit_opcode( cbuf, 0xC8+$dst$$reg );
   %}
 
-  enc_class strictfp_bias2( regD dst ) %{
+  enc_class strictfp_bias2( regDPR dst ) %{
     emit_opcode( cbuf, 0xDB );           // FLD m80real
     emit_opcode( cbuf, 0x2D );
     emit_d32( cbuf, (int)StubRoutines::addr_fpu_subnormal_bias2() );
@@ -2541,39 +2488,29 @@
     store_to_stackslot( cbuf, $primary, $secondary, $src$$disp );
   %}
 
-  // Push the float in stackSlot 'src' onto FP-stack
-  enc_class Push_Mem_F( memory src ) %{    // FLD_S   [ESP+src]
-    store_to_stackslot( cbuf, 0xD9, 0x00, $src$$disp );
-  %}
-
-  // Push the double in stackSlot 'src' onto FP-stack
-  enc_class Push_Mem_D( memory src ) %{    // FLD_D   [ESP+src]
-    store_to_stackslot( cbuf, 0xDD, 0x00, $src$$disp );
-  %}
-
   // Push FPU's TOS float to a stack-slot, and pop FPU-stack
-  enc_class Pop_Mem_F( stackSlotF dst ) %{ // FSTP_S [ESP+dst]
+  enc_class Pop_Mem_FPR( stackSlotF dst ) %{ // FSTP_S [ESP+dst]
     store_to_stackslot( cbuf, 0xD9, 0x03, $dst$$disp );
   %}
 
   // Same as Pop_Mem_F except for opcode
   // Push FPU's TOS double to a stack-slot, and pop FPU-stack
-  enc_class Pop_Mem_D( stackSlotD dst ) %{ // FSTP_D [ESP+dst]
+  enc_class Pop_Mem_DPR( stackSlotD dst ) %{ // FSTP_D [ESP+dst]
     store_to_stackslot( cbuf, 0xDD, 0x03, $dst$$disp );
   %}
 
-  enc_class Pop_Reg_F( regF dst ) %{
+  enc_class Pop_Reg_FPR( regFPR dst ) %{
     emit_opcode( cbuf, 0xDD );           // FSTP   ST(i)
     emit_d8( cbuf, 0xD8+$dst$$reg );
   %}
 
-  enc_class Push_Reg_F( regF dst ) %{
+  enc_class Push_Reg_FPR( regFPR dst ) %{
     emit_opcode( cbuf, 0xD9 );           // FLD    ST(i-1)
     emit_d8( cbuf, 0xC0-1+$dst$$reg );
   %}
 
   // Push FPU's float to a stack-slot, and pop FPU-stack
-  enc_class Pop_Mem_Reg_F( stackSlotF dst, regF src ) %{
+  enc_class Pop_Mem_Reg_FPR( stackSlotF dst, regFPR src ) %{
     int pop = 0x02;
     if ($src$$reg != FPR1L_enc) {
       emit_opcode( cbuf, 0xD9 );         // FLD    ST(i-1)
@@ -2584,7 +2521,7 @@
   %}
 
   // Push FPU's double to a stack-slot, and pop FPU-stack
-  enc_class Pop_Mem_Reg_D( stackSlotD dst, regD src ) %{
+  enc_class Pop_Mem_Reg_DPR( stackSlotD dst, regDPR src ) %{
     int pop = 0x02;
     if ($src$$reg != FPR1L_enc) {
       emit_opcode( cbuf, 0xD9 );         // FLD    ST(i-1)
@@ -2595,7 +2532,7 @@
   %}
 
   // Push FPU's double to a FPU-stack-slot, and pop FPU-stack
-  enc_class Pop_Reg_Reg_D( regD dst, regF src ) %{
+  enc_class Pop_Reg_Reg_DPR( regDPR dst, regFPR src ) %{
     int pop = 0xD0 - 1; // -1 since we skip FLD
     if ($src$$reg != FPR1L_enc) {
       emit_opcode( cbuf, 0xD9 );         // FLD    ST(src-1)
@@ -2607,16 +2544,7 @@
   %}
 
 
-  enc_class Mul_Add_F( regF dst, regF src, regF src1, regF src2 ) %{
-    MacroAssembler masm(&cbuf);
-    masm.fld_s(  $src1$$reg-1);   // nothing at TOS, load TOS from src1.reg
-    masm.fmul(   $src2$$reg+0);   // value at TOS
-    masm.fadd(   $src$$reg+0);    // value at TOS
-    masm.fstp_d( $dst$$reg+0);    // value at TOS, popped off after store
-  %}
-
-
-  enc_class Push_Reg_Mod_D( regD dst, regD src) %{
+  enc_class Push_Reg_Mod_DPR( regDPR dst, regDPR src) %{
     // load dst in FPR0
     emit_opcode( cbuf, 0xD9 );
     emit_d8( cbuf, 0xC0-1+$dst$$reg );
@@ -2634,116 +2562,59 @@
     }
   %}
 
-  enc_class Push_ModD_encoding( regXD src0, regXD src1) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);            // SUB ESP,8
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x08);
-
-    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], src1
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src1$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], src0
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src0$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-  %}
-
-  enc_class Push_ModX_encoding( regX src0, regX src1) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);            // SUB ESP,4
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x04);
-
-    emit_opcode  (cbuf, 0xF3 );     // MOVSS [ESP], src1
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src1$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9 );      // FLD [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode  (cbuf, 0xF3 );     // MOVSS [ESP], src0
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src0$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9 );      // FLD [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-  %}
-
-  enc_class Push_ResultXD(regXD dst) %{
-    store_to_stackslot( cbuf, 0xDD, 0x03, 0 ); //FSTP [ESP]
-
-    // UseXmmLoadAndClearUpper ? movsd dst,[esp] : movlpd dst,[esp]
-    emit_opcode  (cbuf, UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, UseXmmLoadAndClearUpper ? 0x10 : 0x12);
-    encode_RegMem(cbuf, $dst$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);    // ADD ESP,8
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf,0x08);
-  %}
-
-  enc_class Push_ResultX(regX dst, immI d8) %{
-    store_to_stackslot( cbuf, 0xD9, 0x03, 0 ); //FSTP_S [ESP]
-
-    emit_opcode  (cbuf, 0xF3 );     // MOVSS dst(xmm), [ESP]
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x10 );
-    encode_RegMem(cbuf, $dst$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);    // ADD ESP,d8 (4 or 8)
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf,$d8$$constant);
-  %}
-
-  enc_class Push_SrcXD(regXD src) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);            // SUB ESP,8
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x08);
-
-    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], src
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
+  enc_class Push_ModD_encoding(regD src0, regD src1) %{
+    MacroAssembler _masm(&cbuf);
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src1$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ movdbl(Address(rsp, 0), $src0$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+  %}
+
+  enc_class Push_ModF_encoding(regF src0, regF src1) %{
+    MacroAssembler _masm(&cbuf);
+    __ subptr(rsp, 4);
+    __ movflt(Address(rsp, 0), $src1$$XMMRegister);
+    __ fld_s(Address(rsp, 0));
+    __ movflt(Address(rsp, 0), $src0$$XMMRegister);
+    __ fld_s(Address(rsp, 0));
+  %}
+
+  enc_class Push_ResultD(regD dst) %{
+    MacroAssembler _masm(&cbuf);
+    __ fstp_d(Address(rsp, 0));
+    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
+    __ addptr(rsp, 8);
+  %}
+
+  enc_class Push_ResultF(regF dst, immI d8) %{
+    MacroAssembler _masm(&cbuf);
+    __ fstp_s(Address(rsp, 0));
+    __ movflt($dst$$XMMRegister, Address(rsp, 0));
+    __ addptr(rsp, $d8$$constant);
+  %}
+
+  enc_class Push_SrcD(regD src) %{
+    MacroAssembler _masm(&cbuf);
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
   %}
 
   enc_class push_stack_temp_qword() %{
-    emit_opcode(cbuf,0x83);     // SUB ESP,8
-    emit_opcode(cbuf,0xEC);
-    emit_d8    (cbuf,0x08);
+    MacroAssembler _masm(&cbuf);
+    __ subptr(rsp, 8);
   %}
 
   enc_class pop_stack_temp_qword() %{
-    emit_opcode(cbuf,0x83);     // ADD ESP,8
-    emit_opcode(cbuf,0xC4);
-    emit_d8    (cbuf,0x08);
-  %}
-
-  enc_class push_xmm_to_fpr1( regXD xmm_src ) %{
-    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], xmm_src
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $xmm_src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
+    MacroAssembler _masm(&cbuf);
+    __ addptr(rsp, 8);
+  %}
+
+  enc_class push_xmm_to_fpr1(regD src) %{
+    MacroAssembler _masm(&cbuf);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
   %}
 
   // Compute X^Y using Intel's fast hardware instructions, if possible.
@@ -2785,10 +2656,7 @@
     encode_RegMem(cbuf, 0x1, ESP_enc, 0x4, 0, 0, false);
   %}
 
-//   enc_class Pop_Reg_Mod_D( regD dst, regD src)
-//   was replaced by Push_Result_Mod_D followed by Pop_Reg_X() or Pop_Mem_X()
-
-  enc_class Push_Result_Mod_D( regD src) %{
+  enc_class Push_Result_Mod_DPR( regDPR src) %{
     if ($src$$reg != FPR1L_enc) {
       // fincstp
       emit_opcode (cbuf, 0xD9);
@@ -2817,7 +2685,7 @@
     emit_opcode( cbuf, 0x05 );
   %}
 
-  enc_class emitModD() %{
+  enc_class emitModDPR() %{
     // fprem must be iterative
     // :: loop
     // fprem
@@ -2922,24 +2790,6 @@
   %}
 
 
-  // XMM version of CmpF_Result. Because the XMM compare
-  // instructions set the EFLAGS directly. It becomes simpler than
-  // the float version above.
-  enc_class CmpX_Result(eRegI dst) %{
-    MacroAssembler _masm(&cbuf);
-    Label nan, inc, done;
-
-    __ jccb(Assembler::parity, nan);
-    __ jccb(Assembler::equal,  done);
-    __ jccb(Assembler::above,  inc);
-    __ bind(nan);
-    __ decrement(as_Register($dst$$reg)); // NO L qqq
-    __ jmpb(done);
-    __ bind(inc);
-    __ increment(as_Register($dst$$reg)); // NO L qqq
-    __ bind(done);
-  %}
-
   // Compare the longs and set flags
   // BROKEN!  Do Not use as-is
   enc_class cmpl_test( eRegL src1, eRegL src2 ) %{
@@ -3162,48 +3012,6 @@
     emit_d8    (cbuf,0 );
   %}
 
-  enc_class movq_ld(regXD dst, memory mem) %{
-    MacroAssembler _masm(&cbuf);
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-
-  enc_class movq_st(memory mem, regXD src) %{
-    MacroAssembler _masm(&cbuf);
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-
-  enc_class pshufd_8x8(regX dst, regX src) %{
-    MacroAssembler _masm(&cbuf);
-
-    encode_CopyXD(cbuf, $dst$$reg, $src$$reg);
-    __ punpcklbw(as_XMMRegister($dst$$reg), as_XMMRegister($dst$$reg));
-    __ pshuflw(as_XMMRegister($dst$$reg), as_XMMRegister($dst$$reg), 0x00);
-  %}
-
-  enc_class pshufd_4x16(regX dst, regX src) %{
-    MacroAssembler _masm(&cbuf);
-
-    __ pshuflw(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg), 0x00);
-  %}
-
-  enc_class pshufd(regXD dst, regXD src, int mode) %{
-    MacroAssembler _masm(&cbuf);
-
-    __ pshufd(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg), $mode);
-  %}
-
-  enc_class pxor(regXD dst, regXD src) %{
-    MacroAssembler _masm(&cbuf);
-
-    __ pxor(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg));
-  %}
-
-  enc_class mov_i2x(regXD dst, eRegI src) %{
-    MacroAssembler _masm(&cbuf);
-
-    __ movdl(as_XMMRegister($dst$$reg), as_Register($src$$reg));
-  %}
-
 
   // Because the transitions from emitted code to the runtime
   // monitorenter/exit helper stubs are so slow it's critical that
@@ -3757,7 +3565,7 @@
   // 'zero', store the darned double down as an int, and reset the
   // rounding mode to 'nearest'.  The hardware throws an exception which
   // patches up the correct value directly to the stack.
-  enc_class D2I_encoding( regD src ) %{
+  enc_class DPR2I_encoding( regDPR src ) %{
     // Flip to round-to-zero mode.  We attempted to allow invalid-op
     // exceptions here, so that a NAN or other corner-case value will
     // thrown an exception (but normal values get converted at full speed).
@@ -3800,7 +3608,7 @@
     // Carry on here...
   %}
 
-  enc_class D2L_encoding( regD src ) %{
+  enc_class DPR2L_encoding( regDPR src ) %{
     emit_opcode(cbuf,0xD9);            // FLDCW  trunc
     emit_opcode(cbuf,0x2D);
     emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
@@ -3842,294 +3650,27 @@
     // Carry on here...
   %}
 
-  enc_class X2L_encoding( regX src ) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);      // SUB ESP,8
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x08);
-
-    emit_opcode  (cbuf, 0xF3 );  // MOVSS [ESP], src
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9 );     // FLD_S [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9);      // FLDCW  trunc
-    emit_opcode(cbuf,0x2D);
-    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
-
-    // Encoding assumes a double has been pushed into FPR0.
-    // Store down the double as a long, popping the FPU stack
-    emit_opcode(cbuf,0xDF);      // FISTP [ESP]
-    emit_opcode(cbuf,0x3C);
-    emit_d8(cbuf,0x24);
-
-    // Restore the rounding mode; mask the exception
-    emit_opcode(cbuf,0xD9);      // FLDCW   std/24-bit mode
-    emit_opcode(cbuf,0x2D);
-    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
-      ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
-      : (int)StubRoutines::addr_fpu_cntrl_wrd_std());
-
-    // Load the converted int; adjust CPU stack
-    emit_opcode(cbuf,0x58);      // POP EAX
-
-    emit_opcode(cbuf,0x5A);      // POP EDX
-
-    emit_opcode(cbuf,0x81);      // CMP EDX,imm
-    emit_d8    (cbuf,0xFA);      // rdx
-    emit_d32   (cbuf,0x80000000);//         0x80000000
-
-    emit_opcode(cbuf,0x75);      // JNE around_slow_call
-    emit_d8    (cbuf,0x13+4);    // Size of slow_call
-
-    emit_opcode(cbuf,0x85);      // TEST EAX,EAX
-    emit_opcode(cbuf,0xC0);      // 2/rax,/rax,
-
-    emit_opcode(cbuf,0x75);      // JNE around_slow_call
-    emit_d8    (cbuf,0x13);      // Size of slow_call
-
-    // Allocate a word
-    emit_opcode(cbuf,0x83);      // SUB ESP,4
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x04);
-
-    emit_opcode  (cbuf, 0xF3 );  // MOVSS [ESP], src
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9 );     // FLD_S [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);      // ADD ESP,4
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf,0x04);
-
-    // CALL directly to the runtime
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf,0xE8);       // Call into runtime
-    emit_d32_reloc(cbuf, (StubRoutines::d2l_wrapper() - cbuf.insts_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
-    // Carry on here...
-  %}
-
-  enc_class XD2L_encoding( regXD src ) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);      // SUB ESP,8
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x08);
-
-    emit_opcode  (cbuf, 0xF2 );  // MOVSD [ESP], src
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xDD );     // FLD_D [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9);      // FLDCW  trunc
-    emit_opcode(cbuf,0x2D);
-    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
-
-    // Encoding assumes a double has been pushed into FPR0.
-    // Store down the double as a long, popping the FPU stack
-    emit_opcode(cbuf,0xDF);      // FISTP [ESP]
-    emit_opcode(cbuf,0x3C);
-    emit_d8(cbuf,0x24);
-
-    // Restore the rounding mode; mask the exception
-    emit_opcode(cbuf,0xD9);      // FLDCW   std/24-bit mode
-    emit_opcode(cbuf,0x2D);
-    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
-      ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
-      : (int)StubRoutines::addr_fpu_cntrl_wrd_std());
-
-    // Load the converted int; adjust CPU stack
-    emit_opcode(cbuf,0x58);      // POP EAX
-
-    emit_opcode(cbuf,0x5A);      // POP EDX
-
-    emit_opcode(cbuf,0x81);      // CMP EDX,imm
-    emit_d8    (cbuf,0xFA);      // rdx
-    emit_d32   (cbuf,0x80000000); //         0x80000000
-
-    emit_opcode(cbuf,0x75);      // JNE around_slow_call
-    emit_d8    (cbuf,0x13+4);    // Size of slow_call
-
-    emit_opcode(cbuf,0x85);      // TEST EAX,EAX
-    emit_opcode(cbuf,0xC0);      // 2/rax,/rax,
-
-    emit_opcode(cbuf,0x75);      // JNE around_slow_call
-    emit_d8    (cbuf,0x13);      // Size of slow_call
-
-    // Push src onto stack slow-path
-    // Allocate a word
-    emit_opcode(cbuf,0x83);      // SUB ESP,8
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x08);
-
-    emit_opcode  (cbuf, 0xF2 );  // MOVSD [ESP], src
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xDD );     // FLD_D [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);      // ADD ESP,8
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf,0x08);
-
-    // CALL directly to the runtime
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf,0xE8);      // Call into runtime
-    emit_d32_reloc(cbuf, (StubRoutines::d2l_wrapper() - cbuf.insts_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
-    // Carry on here...
-  %}
-
-  enc_class D2X_encoding( regX dst, regD src ) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);            // SUB ESP,4
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x04);
-    int pop = 0x02;
-    if ($src$$reg != FPR1L_enc) {
-      emit_opcode( cbuf, 0xD9 );       // FLD    ST(i-1)
-      emit_d8( cbuf, 0xC0-1+$src$$reg );
-      pop = 0x03;
-    }
-    store_to_stackslot( cbuf, 0xD9, pop, 0 ); // FST<P>_S  [ESP]
-
-    emit_opcode  (cbuf, 0xF3 );        // MOVSS dst(xmm), [ESP]
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x10 );
-    encode_RegMem(cbuf, $dst$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);            // ADD ESP,4
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf,0x04);
-    // Carry on here...
-  %}
-
-  enc_class FX2I_encoding( regX src, eRegI dst ) %{
-    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
-
-    // Compare the result to see if we need to go to the slow path
-    emit_opcode(cbuf,0x81);       // CMP dst,imm
-    emit_rm    (cbuf,0x3,0x7,$dst$$reg);
-    emit_d32   (cbuf,0x80000000); //         0x80000000
-
-    emit_opcode(cbuf,0x75);       // JNE around_slow_call
-    emit_d8    (cbuf,0x13);       // Size of slow_call
-    // Store xmm to a temp memory
-    // location and push it onto stack.
-
-    emit_opcode(cbuf,0x83);  // SUB ESP,4
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf, $primary ? 0x8 : 0x4);
-
-    emit_opcode  (cbuf, $primary ? 0xF2 : 0xF3 );   // MOVSS [ESP], xmm
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf, $primary ? 0xDD : 0xD9 );      // FLD [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);    // ADD ESP,4
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf, $primary ? 0x8 : 0x4);
-
-    // CALL directly to the runtime
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf,0xE8);       // Call into runtime
-    emit_d32_reloc(cbuf, (StubRoutines::d2i_wrapper() - cbuf.insts_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
-
-    // Carry on here...
-  %}
-
-  enc_class X2D_encoding( regD dst, regX src ) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);     // SUB ESP,4
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x04);
-
-    emit_opcode  (cbuf, 0xF3 ); // MOVSS [ESP], xmm
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9 );    // FLD_S [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);     // ADD ESP,4
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf,0x04);
-
-    // Carry on here...
-  %}
-
-  enc_class AbsXF_encoding(regX dst) %{
-    address signmask_address=(address)float_signmask_pool;
-    // andpd:\tANDPS  $dst,[signconst]
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x54);
-    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
-    emit_d32(cbuf, (int)signmask_address);
-  %}
-
-  enc_class AbsXD_encoding(regXD dst) %{
-    address signmask_address=(address)double_signmask_pool;
-    // andpd:\tANDPD  $dst,[signconst]
-    emit_opcode(cbuf, 0x66);
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x54);
-    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
-    emit_d32(cbuf, (int)signmask_address);
-  %}
-
-  enc_class NegXF_encoding(regX dst) %{
-    address signmask_address=(address)float_signflip_pool;
-    // andpd:\tXORPS  $dst,[signconst]
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x57);
-    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
-    emit_d32(cbuf, (int)signmask_address);
-  %}
-
-  enc_class NegXD_encoding(regXD dst) %{
-    address signmask_address=(address)double_signflip_pool;
-    // andpd:\tXORPD  $dst,[signconst]
-    emit_opcode(cbuf, 0x66);
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x57);
-    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
-    emit_d32(cbuf, (int)signmask_address);
-  %}
-
-  enc_class FMul_ST_reg( eRegF src1 ) %{
+  enc_class FMul_ST_reg( eRegFPR src1 ) %{
     // Operand was loaded from memory into fp ST (stack top)
     // FMUL   ST,$src  /* D8 C8+i */
     emit_opcode(cbuf, 0xD8);
     emit_opcode(cbuf, 0xC8 + $src1$$reg);
   %}
 
-  enc_class FAdd_ST_reg( eRegF src2 ) %{
+  enc_class FAdd_ST_reg( eRegFPR src2 ) %{
     // FADDP  ST,src2  /* D8 C0+i */
     emit_opcode(cbuf, 0xD8);
     emit_opcode(cbuf, 0xC0 + $src2$$reg);
     //could use FADDP  src2,fpST  /* DE C0+i */
   %}
 
-  enc_class FAddP_reg_ST( eRegF src2 ) %{
+  enc_class FAddP_reg_ST( eRegFPR src2 ) %{
     // FADDP  src2,ST  /* DE C0+i */
     emit_opcode(cbuf, 0xDE);
     emit_opcode(cbuf, 0xC0 + $src2$$reg);
   %}
 
-  enc_class subF_divF_encode( eRegF src1, eRegF src2) %{
+  enc_class subFPR_divFPR_encode( eRegFPR src1, eRegFPR src2) %{
     // Operand has been loaded into fp ST (stack top)
       // FSUB   ST,$src1
       emit_opcode(cbuf, 0xD8);
@@ -4140,7 +3681,7 @@
       emit_opcode(cbuf, 0xF0 + $src2$$reg);
   %}
 
-  enc_class MulFAddF (eRegF src1, eRegF src2) %{
+  enc_class MulFAddF (eRegFPR src1, eRegFPR src2) %{
     // Operand was loaded from memory into fp ST (stack top)
     // FADD   ST,$src  /* D8 C0+i */
     emit_opcode(cbuf, 0xD8);
@@ -4152,7 +3693,7 @@
   %}
 
 
-  enc_class MulFAddFreverse (eRegF src1, eRegF src2) %{
+  enc_class MulFAddFreverse (eRegFPR src1, eRegFPR src2) %{
     // Operand was loaded from memory into fp ST (stack top)
     // FADD   ST,$src  /* D8 C0+i */
     emit_opcode(cbuf, 0xD8);
@@ -4176,66 +3717,6 @@
     store_to_stackslot( cbuf, 0x0DF, 0x07, $dst$$disp );
   %}
 
-  enc_class enc_loadLX_volatile( memory mem, stackSlotL dst, regXD tmp ) %{
-    { // Atomic long load
-      // UseXmmLoadAndClearUpper ? movsd $tmp,$mem : movlpd $tmp,$mem
-      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0x10 : 0x12);
-      int base     = $mem$$base;
-      int index    = $mem$$index;
-      int scale    = $mem$$scale;
-      int displace = $mem$$disp;
-      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
-      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
-    }
-    { // MOVSD $dst,$tmp ! atomic long store
-      emit_opcode(cbuf,0xF2);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x11);
-      int base     = $dst$$base;
-      int index    = $dst$$index;
-      int scale    = $dst$$scale;
-      int displace = $dst$$disp;
-      bool disp_is_oop = $dst->disp_is_oop(); // disp-as-oop when working with static globals
-      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
-    }
-  %}
-
-  enc_class enc_loadLX_reg_volatile( memory mem, eRegL dst, regXD tmp ) %{
-    { // Atomic long load
-      // UseXmmLoadAndClearUpper ? movsd $tmp,$mem : movlpd $tmp,$mem
-      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0x10 : 0x12);
-      int base     = $mem$$base;
-      int index    = $mem$$index;
-      int scale    = $mem$$scale;
-      int displace = $mem$$disp;
-      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
-      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
-    }
-    { // MOVD $dst.lo,$tmp
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x7E);
-      emit_rm(cbuf, 0x3, $tmp$$reg, $dst$$reg);
-    }
-    { // PSRLQ $tmp,32
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x73);
-      emit_rm(cbuf, 0x3, 0x02, $tmp$$reg);
-      emit_d8(cbuf, 0x20);
-    }
-    { // MOVD $dst.hi,$tmp
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x7E);
-      emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg));
-    }
-  %}
-
   // Volatile Store Long.  Must be atomic, so move it into
   // the FP TOS and then do a 64-bit FIST.  Has to probe the
   // target address before the store (for null-ptr checks)
@@ -4253,66 +3734,6 @@
     encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, disp_is_oop);
   %}
 
-  enc_class enc_storeLX_volatile( memory mem, stackSlotL src, regXD tmp) %{
-    { // Atomic long load
-      // UseXmmLoadAndClearUpper ? movsd $tmp,[$src] : movlpd $tmp,[$src]
-      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0x10 : 0x12);
-      int base     = $src$$base;
-      int index    = $src$$index;
-      int scale    = $src$$scale;
-      int displace = $src$$disp;
-      bool disp_is_oop = $src->disp_is_oop(); // disp-as-oop when working with static globals
-      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
-    }
-    cbuf.set_insts_mark();            // Mark start of MOVSD in case $mem has an oop
-    { // MOVSD $mem,$tmp ! atomic long store
-      emit_opcode(cbuf,0xF2);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x11);
-      int base     = $mem$$base;
-      int index    = $mem$$index;
-      int scale    = $mem$$scale;
-      int displace = $mem$$disp;
-      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
-      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
-    }
-  %}
-
-  enc_class enc_storeLX_reg_volatile( memory mem, eRegL src, regXD tmp, regXD tmp2) %{
-    { // MOVD $tmp,$src.lo
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x6E);
-      emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg);
-    }
-    { // MOVD $tmp2,$src.hi
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x6E);
-      emit_rm(cbuf, 0x3, $tmp2$$reg, HIGH_FROM_LOW($src$$reg));
-    }
-    { // PUNPCKLDQ $tmp,$tmp2
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x62);
-      emit_rm(cbuf, 0x3, $tmp$$reg, $tmp2$$reg);
-    }
-    cbuf.set_insts_mark();            // Mark start of MOVSD in case $mem has an oop
-    { // MOVSD $mem,$tmp ! atomic long store
-      emit_opcode(cbuf,0xF2);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x11);
-      int base     = $mem$$base;
-      int index    = $mem$$index;
-      int scale    = $mem$$scale;
-      int displace = $mem$$disp;
-      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
-      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
-    }
-  %}
-
   // Safepoint Poll.  This polls the safepoint page, and causes an
   // exception if it is not readable. Unfortunately, it kills the condition code
   // in the process
@@ -4705,7 +4126,7 @@
 %}
 
 //Double Immediate zero
-operand immD0() %{
+operand immDPR0() %{
   // Do additional (and counter-intuitive) test against NaN to work around VC++
   // bug that generates code such that NaNs compare equal to 0.0
   predicate( UseSSE<=1 && n->getd() == 0.0 && !g_isnan(n->getd()) );
@@ -4717,7 +4138,7 @@
 %}
 
 // Double Immediate one
-operand immD1() %{
+operand immDPR1() %{
   predicate( UseSSE<=1 && n->getd() == 1.0 );
   match(ConD);
 
@@ -4727,7 +4148,7 @@
 %}
 
 // Double Immediate
-operand immD() %{
+operand immDPR() %{
   predicate(UseSSE<=1);
   match(ConD);
 
@@ -4736,7 +4157,7 @@
   interface(CONST_INTER);
 %}
 
-operand immXD() %{
+operand immD() %{
   predicate(UseSSE>=2);
   match(ConD);
 
@@ -4746,7 +4167,7 @@
 %}
 
 // Double Immediate zero
-operand immXD0() %{
+operand immD0() %{
   // Do additional (and counter-intuitive) test against NaN to work around VC++
   // bug that generates code such that NaNs compare equal to 0.0 AND do not
   // compare equal to -0.0.
@@ -4758,7 +4179,7 @@
 %}
 
 // Float Immediate zero
-operand immF0() %{
+operand immFPR0() %{
   predicate(UseSSE == 0 && n->getf() == 0.0F);
   match(ConF);
 
@@ -4768,7 +4189,7 @@
 %}
 
 // Float Immediate one
-operand immF1() %{
+operand immFPR1() %{
   predicate(UseSSE == 0 && n->getf() == 1.0F);
   match(ConF);
 
@@ -4778,7 +4199,7 @@
 %}
 
 // Float Immediate
-operand immF() %{
+operand immFPR() %{
   predicate( UseSSE == 0 );
   match(ConF);
 
@@ -4788,7 +4209,7 @@
 %}
 
 // Float Immediate
-operand immXF() %{
+operand immF() %{
   predicate(UseSSE >= 1);
   match(ConF);
 
@@ -4798,7 +4219,7 @@
 %}
 
 // Float Immediate zero.  Zero and not -0.0
-operand immXF0() %{
+operand immF0() %{
   predicate( UseSSE >= 1 && jint_cast(n->getf()) == 0 );
   match(ConF);
 
@@ -5174,7 +4595,7 @@
 %}
 
 // Float register operands
-operand regD() %{
+operand regDPR() %{
   predicate( UseSSE < 2 );
   constraint(ALLOC_IN_RC(dbl_reg));
   match(RegD);
@@ -5184,7 +4605,7 @@
   interface(REG_INTER);
 %}
 
-operand regDPR1(regD reg) %{
+operand regDPR1(regDPR reg) %{
   predicate( UseSSE < 2 );
   constraint(ALLOC_IN_RC(dbl_reg0));
   match(reg);
@@ -5192,7 +4613,7 @@
   interface(REG_INTER);
 %}
 
-operand regDPR2(regD reg) %{
+operand regDPR2(regDPR reg) %{
   predicate( UseSSE < 2 );
   constraint(ALLOC_IN_RC(dbl_reg1));
   match(reg);
@@ -5200,7 +4621,7 @@
   interface(REG_INTER);
 %}
 
-operand regnotDPR1(regD reg) %{
+operand regnotDPR1(regDPR reg) %{
   predicate( UseSSE < 2 );
   constraint(ALLOC_IN_RC(dbl_notreg0));
   match(reg);
@@ -5209,18 +4630,18 @@
 %}
 
 // XMM Double register operands
-operand regXD() %{
+operand regD() %{
   predicate( UseSSE>=2 );
   constraint(ALLOC_IN_RC(xdb_reg));
   match(RegD);
-  match(regXD6);
-  match(regXD7);
+  match(regD6);
+  match(regD7);
   format %{ %}
   interface(REG_INTER);
 %}
 
 // XMM6 double register operands
-operand regXD6(regXD reg) %{
+operand regD6(regD reg) %{
   predicate( UseSSE>=2 );
   constraint(ALLOC_IN_RC(xdb_reg6));
   match(reg);
@@ -5229,7 +4650,7 @@
 %}
 
 // XMM7 double register operands
-operand regXD7(regXD reg) %{
+operand regD7(regD reg) %{
   predicate( UseSSE>=2 );
   constraint(ALLOC_IN_RC(xdb_reg7));
   match(reg);
@@ -5238,7 +4659,7 @@
 %}
 
 // Float register operands
-operand regF() %{
+operand regFPR() %{
   predicate( UseSSE < 2 );
   constraint(ALLOC_IN_RC(flt_reg));
   match(RegF);
@@ -5248,7 +4669,7 @@
 %}
 
 // Float register operands
-operand regFPR1(regF reg) %{
+operand regFPR1(regFPR reg) %{
   predicate( UseSSE < 2 );
   constraint(ALLOC_IN_RC(flt_reg0));
   match(reg);
@@ -5257,7 +4678,7 @@
 %}
 
 // XMM register operands
-operand regX() %{
+operand regF() %{
   predicate( UseSSE>=1 );
   constraint(ALLOC_IN_RC(xmm_reg));
   match(RegF);
@@ -6001,7 +5422,7 @@
 %}
 
 // Conditional move double reg-reg
-pipe_class pipe_cmovD_reg( eFlagsReg cr, regDPR1 dst, regD src) %{
+pipe_class pipe_cmovDPR_reg( eFlagsReg cr, regDPR1 dst, regDPR src) %{
     single_instruction;
     dst    : S4(write);
     src    : S3(read);
@@ -6010,7 +5431,7 @@
 %}
 
 // Float reg-reg operation
-pipe_class fpu_reg(regD dst) %{
+pipe_class fpu_reg(regDPR dst) %{
     instruction_count(2);
     dst    : S3(read);
     DECODE : S0(2);     // any 2 decoders
@@ -6018,7 +5439,7 @@
 %}
 
 // Float reg-reg operation
-pipe_class fpu_reg_reg(regD dst, regD src) %{
+pipe_class fpu_reg_reg(regDPR dst, regDPR src) %{
     instruction_count(2);
     dst    : S4(write);
     src    : S3(read);
@@ -6027,7 +5448,7 @@
 %}
 
 // Float reg-reg operation
-pipe_class fpu_reg_reg_reg(regD dst, regD src1, regD src2) %{
+pipe_class fpu_reg_reg_reg(regDPR dst, regDPR src1, regDPR src2) %{
     instruction_count(3);
     dst    : S4(write);
     src1   : S3(read);
@@ -6037,7 +5458,7 @@
 %}
 
 // Float reg-reg operation
-pipe_class fpu_reg_reg_reg_reg(regD dst, regD src1, regD src2, regD src3) %{
+pipe_class fpu_reg_reg_reg_reg(regDPR dst, regDPR src1, regDPR src2, regDPR src3) %{
     instruction_count(4);
     dst    : S4(write);
     src1   : S3(read);
@@ -6048,7 +5469,7 @@
 %}
 
 // Float reg-reg operation
-pipe_class fpu_reg_mem_reg_reg(regD dst, memory src1, regD src2, regD src3) %{
+pipe_class fpu_reg_mem_reg_reg(regDPR dst, memory src1, regDPR src2, regDPR src3) %{
     instruction_count(4);
     dst    : S4(write);
     src1   : S3(read);
@@ -6061,7 +5482,7 @@
 %}
 
 // Float reg-mem operation
-pipe_class fpu_reg_mem(regD dst, memory mem) %{
+pipe_class fpu_reg_mem(regDPR dst, memory mem) %{
     instruction_count(2);
     dst    : S5(write);
     mem    : S3(read);
@@ -6072,7 +5493,7 @@
 %}
 
 // Float reg-mem operation
-pipe_class fpu_reg_reg_mem(regD dst, regD src1, memory mem) %{
+pipe_class fpu_reg_reg_mem(regDPR dst, regDPR src1, memory mem) %{
     instruction_count(3);
     dst    : S5(write);
     src1   : S3(read);
@@ -6084,7 +5505,7 @@
 %}
 
 // Float mem-reg operation
-pipe_class fpu_mem_reg(memory mem, regD src) %{
+pipe_class fpu_mem_reg(memory mem, regDPR src) %{
     instruction_count(2);
     src    : S5(read);
     mem    : S3(read);
@@ -6094,7 +5515,7 @@
     MEM    : S3;        // any mem
 %}
 
-pipe_class fpu_mem_reg_reg(memory mem, regD src1, regD src2) %{
+pipe_class fpu_mem_reg_reg(memory mem, regDPR src1, regDPR src2) %{
     instruction_count(3);
     src1   : S3(read);
     src2   : S3(read);
@@ -6105,7 +5526,7 @@
     MEM    : S3;        // any mem
 %}
 
-pipe_class fpu_mem_reg_mem(memory mem, regD src1, memory src2) %{
+pipe_class fpu_mem_reg_mem(memory mem, regDPR src1, memory src2) %{
     instruction_count(3);
     src1   : S3(read);
     src2   : S3(read);
@@ -6134,7 +5555,7 @@
     MEM    : S3(3);     // any mem
 %}
 
-pipe_class fpu_mem_reg_con(memory mem, regD src1) %{
+pipe_class fpu_mem_reg_con(memory mem, regDPR src1) %{
     instruction_count(3);
     src1   : S4(read);
     mem    : S4(read);
@@ -6145,7 +5566,7 @@
 %}
 
 // Float load constant
-pipe_class fpu_reg_con(regD dst) %{
+pipe_class fpu_reg_con(regDPR dst) %{
     instruction_count(2);
     dst    : S5(write);
     D0     : S0;        // big decoder only for the load
@@ -6155,7 +5576,7 @@
 %}
 
 // Float load constant
-pipe_class fpu_reg_reg_con(regD dst, regD src) %{
+pipe_class fpu_reg_reg_con(regDPR dst, regDPR src) %{
     instruction_count(3);
     dst    : S5(write);
     src    : S3(read);
@@ -6870,18 +6291,21 @@
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct loadLX_volatile(stackSlotL dst, memory mem, regXD tmp) %{
+instruct loadLX_volatile(stackSlotL dst, memory mem, regD tmp) %{
   predicate(UseSSE>=2 && ((LoadLNode*)n)->require_atomic_access());
   match(Set dst (LoadL mem));
   effect(TEMP tmp);
   ins_cost(180);
   format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
             "MOVSD  $dst,$tmp" %}
-  ins_encode(enc_loadLX_volatile(mem, dst, tmp));
-  ins_pipe( pipe_slow );
-%}
-
-instruct loadLX_reg_volatile(eRegL dst, memory mem, regXD tmp) %{
+  ins_encode %{
+    __ movdbl($tmp$$XMMRegister, $mem$$Address);
+    __ movdbl(Address(rsp, $dst$$disp), $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadLX_reg_volatile(eRegL dst, memory mem, regD tmp) %{
   predicate(UseSSE>=2 && ((LoadLNode*)n)->require_atomic_access());
   match(Set dst (LoadL mem));
   effect(TEMP tmp);
@@ -6890,7 +6314,12 @@
             "MOVD   $dst.lo,$tmp\n\t"
             "PSRLQ  $tmp,32\n\t"
             "MOVD   $dst.hi,$tmp" %}
-  ins_encode(enc_loadLX_reg_volatile(mem, dst, tmp));
+  ins_encode %{
+    __ movdbl($tmp$$XMMRegister, $mem$$Address);
+    __ movdl($dst$$Register, $tmp$$XMMRegister);
+    __ psrlq($tmp$$XMMRegister, 32);
+    __ movdl(HIGH_FROM_LOW($dst$$Register), $tmp$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -6929,7 +6358,7 @@
 %}
 
 // Load Double
-instruct loadD(regD dst, memory mem) %{
+instruct loadDPR(regDPR dst, memory mem) %{
   predicate(UseSSE<=1);
   match(Set dst (LoadD mem));
 
@@ -6938,42 +6367,48 @@
             "FSTP   $dst" %}
   opcode(0xDD);               /* DD /0 */
   ins_encode( OpcP, RMopc_Mem(0x00,mem),
-              Pop_Reg_D(dst) );
+              Pop_Reg_DPR(dst) );
   ins_pipe( fpu_reg_mem );
 %}
 
 // Load Double to XMM
-instruct loadXD(regXD dst, memory mem) %{
+instruct loadD(regD dst, memory mem) %{
   predicate(UseSSE>=2 && UseXmmLoadAndClearUpper);
   match(Set dst (LoadD mem));
   ins_cost(145);
   format %{ "MOVSD  $dst,$mem" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x10), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-instruct loadXD_partial(regXD dst, memory mem) %{
+  ins_encode %{
+    __ movdbl ($dst$$XMMRegister, $mem$$Address);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadD_partial(regD dst, memory mem) %{
   predicate(UseSSE>=2 && !UseXmmLoadAndClearUpper);
   match(Set dst (LoadD mem));
   ins_cost(145);
   format %{ "MOVLPD $dst,$mem" %}
-  ins_encode( Opcode(0x66), Opcode(0x0F), Opcode(0x12), RegMem(dst,mem));
+  ins_encode %{
+    __ movdbl ($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Load to XMM register (single-precision floating point)
 // MOVSS instruction
-instruct loadX(regX dst, memory mem) %{
+instruct loadF(regF dst, memory mem) %{
   predicate(UseSSE>=1);
   match(Set dst (LoadF mem));
   ins_cost(145);
   format %{ "MOVSS  $dst,$mem" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x10), RegMem(dst,mem));
+  ins_encode %{
+    __ movflt ($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Load Float
-instruct loadF(regF dst, memory mem) %{
+instruct loadFPR(regFPR dst, memory mem) %{
   predicate(UseSSE==0);
   match(Set dst (LoadF mem));
 
@@ -6982,57 +6417,67 @@
             "FSTP   $dst" %}
   opcode(0xD9);               /* D9 /0 */
   ins_encode( OpcP, RMopc_Mem(0x00,mem),
-              Pop_Reg_F(dst) );
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_mem );
 %}
 
 // Load Aligned Packed Byte to XMM register
-instruct loadA8B(regXD dst, memory mem) %{
+instruct loadA8B(regD dst, memory mem) %{
   predicate(UseSSE>=1);
   match(Set dst (Load8B mem));
   ins_cost(125);
   format %{ "MOVQ  $dst,$mem\t! packed8B" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Load Aligned Packed Short to XMM register
-instruct loadA4S(regXD dst, memory mem) %{
+instruct loadA4S(regD dst, memory mem) %{
   predicate(UseSSE>=1);
   match(Set dst (Load4S mem));
   ins_cost(125);
   format %{ "MOVQ  $dst,$mem\t! packed4S" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Load Aligned Packed Char to XMM register
-instruct loadA4C(regXD dst, memory mem) %{
+instruct loadA4C(regD dst, memory mem) %{
   predicate(UseSSE>=1);
   match(Set dst (Load4C mem));
   ins_cost(125);
   format %{ "MOVQ  $dst,$mem\t! packed4C" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Load Aligned Packed Integer to XMM register
-instruct load2IU(regXD dst, memory mem) %{
+instruct load2IU(regD dst, memory mem) %{
   predicate(UseSSE>=1);
   match(Set dst (Load2I mem));
   ins_cost(125);
   format %{ "MOVQ  $dst,$mem\t! packed2I" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Load Aligned Packed Single to XMM
-instruct loadA2F(regXD dst, memory mem) %{
+instruct loadA2F(regD dst, memory mem) %{
   predicate(UseSSE>=1);
   match(Set dst (Load2F mem));
   ins_cost(145);
   format %{ "MOVQ  $dst,$mem\t! packed2F" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -7139,58 +6584,58 @@
   ins_pipe( ialu_reg_long );
 %}
 
+// The instruction usage is guarded by predicate in operand immFPR().
+instruct loadConFPR(regFPR dst, immFPR con) %{
+  match(Set dst con);
+  ins_cost(125);
+  format %{ "FLD_S  ST,[$constantaddress]\t# load from constant table: float=$con\n\t"
+            "FSTP   $dst" %}
+  ins_encode %{
+    __ fld_s($constantaddress($con));
+    __ fstp_d($dst$$reg);
+  %}
+  ins_pipe(fpu_reg_con);
+%}
+
+// The instruction usage is guarded by predicate in operand immFPR0().
+instruct loadConFPR0(regFPR dst, immFPR0 con) %{
+  match(Set dst con);
+  ins_cost(125);
+  format %{ "FLDZ   ST\n\t"
+            "FSTP   $dst" %}
+  ins_encode %{
+    __ fldz();
+    __ fstp_d($dst$$reg);
+  %}
+  ins_pipe(fpu_reg_con);
+%}
+
+// The instruction usage is guarded by predicate in operand immFPR1().
+instruct loadConFPR1(regFPR dst, immFPR1 con) %{
+  match(Set dst con);
+  ins_cost(125);
+  format %{ "FLD1   ST\n\t"
+            "FSTP   $dst" %}
+  ins_encode %{
+    __ fld1();
+    __ fstp_d($dst$$reg);
+  %}
+  ins_pipe(fpu_reg_con);
+%}
+
 // The instruction usage is guarded by predicate in operand immF().
 instruct loadConF(regF dst, immF con) %{
   match(Set dst con);
   ins_cost(125);
-  format %{ "FLD_S  ST,[$constantaddress]\t# load from constant table: float=$con\n\t"
-            "FSTP   $dst" %}
-  ins_encode %{
-    __ fld_s($constantaddress($con));
-    __ fstp_d($dst$$reg);
-  %}
-  ins_pipe(fpu_reg_con);
+  format %{ "MOVSS  $dst,[$constantaddress]\t# load from constant table: float=$con" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
 %}
 
 // The instruction usage is guarded by predicate in operand immF0().
-instruct loadConF0(regF dst, immF0 con) %{
-  match(Set dst con);
-  ins_cost(125);
-  format %{ "FLDZ   ST\n\t"
-            "FSTP   $dst" %}
-  ins_encode %{
-    __ fldz();
-    __ fstp_d($dst$$reg);
-  %}
-  ins_pipe(fpu_reg_con);
-%}
-
-// The instruction usage is guarded by predicate in operand immF1().
-instruct loadConF1(regF dst, immF1 con) %{
-  match(Set dst con);
-  ins_cost(125);
-  format %{ "FLD1   ST\n\t"
-            "FSTP   $dst" %}
-  ins_encode %{
-    __ fld1();
-    __ fstp_d($dst$$reg);
-  %}
-  ins_pipe(fpu_reg_con);
-%}
-
-// The instruction usage is guarded by predicate in operand immXF().
-instruct loadConX(regX dst, immXF con) %{
-  match(Set dst con);
-  ins_cost(125);
-  format %{ "MOVSS  $dst,[$constantaddress]\t# load from constant table: float=$con" %}
-  ins_encode %{
-    __ movflt($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-// The instruction usage is guarded by predicate in operand immXF0().
-instruct loadConX0(regX dst, immXF0 src) %{
+instruct loadConF0(regF dst, immF0 src) %{
   match(Set dst src);
   ins_cost(100);
   format %{ "XORPS  $dst,$dst\t# float 0.0" %}
@@ -7200,65 +6645,67 @@
   ins_pipe(pipe_slow);
 %}
 
+// The instruction usage is guarded by predicate in operand immDPR().
+instruct loadConDPR(regDPR dst, immDPR con) %{
+  match(Set dst con);
+  ins_cost(125);
+
+  format %{ "FLD_D  ST,[$constantaddress]\t# load from constant table: double=$con\n\t"
+            "FSTP   $dst" %}
+  ins_encode %{
+    __ fld_d($constantaddress($con));
+    __ fstp_d($dst$$reg);
+  %}
+  ins_pipe(fpu_reg_con);
+%}
+
+// The instruction usage is guarded by predicate in operand immDPR0().
+instruct loadConDPR0(regDPR dst, immDPR0 con) %{
+  match(Set dst con);
+  ins_cost(125);
+
+  format %{ "FLDZ   ST\n\t"
+            "FSTP   $dst" %}
+  ins_encode %{
+    __ fldz();
+    __ fstp_d($dst$$reg);
+  %}
+  ins_pipe(fpu_reg_con);
+%}
+
+// The instruction usage is guarded by predicate in operand immDPR1().
+instruct loadConDPR1(regDPR dst, immDPR1 con) %{
+  match(Set dst con);
+  ins_cost(125);
+
+  format %{ "FLD1   ST\n\t"
+            "FSTP   $dst" %}
+  ins_encode %{
+    __ fld1();
+    __ fstp_d($dst$$reg);
+  %}
+  ins_pipe(fpu_reg_con);
+%}
+
 // The instruction usage is guarded by predicate in operand immD().
 instruct loadConD(regD dst, immD con) %{
   match(Set dst con);
   ins_cost(125);
-
-  format %{ "FLD_D  ST,[$constantaddress]\t# load from constant table: double=$con\n\t"
-            "FSTP   $dst" %}
-  ins_encode %{
-    __ fld_d($constantaddress($con));
-    __ fstp_d($dst$$reg);
-  %}
-  ins_pipe(fpu_reg_con);
+  format %{ "MOVSD  $dst,[$constantaddress]\t# load from constant table: double=$con" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
 %}
 
 // The instruction usage is guarded by predicate in operand immD0().
-instruct loadConD0(regD dst, immD0 con) %{
-  match(Set dst con);
-  ins_cost(125);
-
-  format %{ "FLDZ   ST\n\t"
-            "FSTP   $dst" %}
-  ins_encode %{
-    __ fldz();
-    __ fstp_d($dst$$reg);
-  %}
-  ins_pipe(fpu_reg_con);
-%}
-
-// The instruction usage is guarded by predicate in operand immD1().
-instruct loadConD1(regD dst, immD1 con) %{
-  match(Set dst con);
-  ins_cost(125);
-
-  format %{ "FLD1   ST\n\t"
-            "FSTP   $dst" %}
-  ins_encode %{
-    __ fld1();
-    __ fstp_d($dst$$reg);
-  %}
-  ins_pipe(fpu_reg_con);
-%}
-
-// The instruction usage is guarded by predicate in operand immXD().
-instruct loadConXD(regXD dst, immXD con) %{
-  match(Set dst con);
-  ins_cost(125);
-  format %{ "MOVSD  $dst,[$constantaddress]\t# load from constant table: double=$con" %}
-  ins_encode %{
-    __ movdbl($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-// The instruction usage is guarded by predicate in operand immXD0().
-instruct loadConXD0(regXD dst, immXD0 src) %{
+instruct loadConD0(regD dst, immD0 src) %{
   match(Set dst src);
   ins_cost(100);
   format %{ "XORPD  $dst,$dst\t# double 0.0" %}
-  ins_encode( Opcode(0x66), Opcode(0x0F), Opcode(0x57), RegReg(dst,dst));
+  ins_encode %{
+    __ xorpd ($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -7296,7 +6743,7 @@
 %}
 
 // Load Stack Slot
-instruct loadSSF(regF dst, stackSlotF src) %{
+instruct loadSSF(regFPR dst, stackSlotF src) %{
   match(Set dst src);
   ins_cost(125);
 
@@ -7304,12 +6751,12 @@
             "FSTP   $dst" %}
   opcode(0xD9);               /* D9 /0, FLD m32real */
   ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
-              Pop_Reg_F(dst) );
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_mem );
 %}
 
 // Load Stack Slot
-instruct loadSSD(regD dst, stackSlotD src) %{
+instruct loadSSD(regDPR dst, stackSlotD src) %{
   match(Set dst src);
   ins_cost(125);
 
@@ -7317,7 +6764,7 @@
             "FSTP   $dst" %}
   opcode(0xDD);               /* DD /0, FLD m64real */
   ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
-              Pop_Reg_D(dst) );
+              Pop_Reg_DPR(dst) );
   ins_pipe( fpu_reg_mem );
 %}
 
@@ -7552,7 +6999,7 @@
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct storeLX_volatile(memory mem, stackSlotL src, regXD tmp, eFlagsReg cr) %{
+instruct storeLX_volatile(memory mem, stackSlotL src, regD tmp, eFlagsReg cr) %{
   predicate(UseSSE>=2 && ((StoreLNode*)n)->require_atomic_access());
   match(Set mem (StoreL mem src));
   effect( TEMP tmp, KILL cr );
@@ -7560,12 +7007,15 @@
   format %{ "CMP    $mem,EAX\t# Probe address for implicit null check\n\t"
             "MOVSD  $tmp,$src\n\t"
             "MOVSD  $mem,$tmp\t # 64-bit atomic volatile long store" %}
-  opcode(0x3B);
-  ins_encode( OpcP, RegMem( EAX, mem ), enc_storeLX_volatile(mem, src, tmp));
-  ins_pipe( pipe_slow );
-%}
-
-instruct storeLX_reg_volatile(memory mem, eRegL src, regXD tmp2, regXD tmp, eFlagsReg cr) %{
+  ins_encode %{
+    __ cmpl(rax, $mem$$Address);
+    __ movdbl($tmp$$XMMRegister, Address(rsp, $src$$disp));
+    __ movdbl($mem$$Address, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct storeLX_reg_volatile(memory mem, eRegL src, regD tmp2, regD tmp, eFlagsReg cr) %{
   predicate(UseSSE>=2 && ((StoreLNode*)n)->require_atomic_access());
   match(Set mem (StoreL mem src));
   effect( TEMP tmp2 , TEMP tmp, KILL cr );
@@ -7575,8 +7025,13 @@
             "MOVD   $tmp2,$src.hi\n\t"
             "PUNPCKLDQ $tmp,$tmp2\n\t"
             "MOVSD  $mem,$tmp\t # 64-bit atomic volatile long store" %}
-  opcode(0x3B);
-  ins_encode( OpcP, RegMem( EAX, mem ), enc_storeLX_reg_volatile(mem, src, tmp, tmp2));
+  ins_encode %{
+    __ cmpl(rax, $mem$$Address);
+    __ movdl($tmp$$XMMRegister, $src$$Register);
+    __ movdl($tmp2$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ movdbl($mem$$Address, $tmp$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -7638,32 +7093,38 @@
 %}
 
 // Store Aligned Packed Byte XMM register to memory
-instruct storeA8B(memory mem, regXD src) %{
+instruct storeA8B(memory mem, regD src) %{
   predicate(UseSSE>=1);
   match(Set mem (Store8B mem src));
   ins_cost(145);
   format %{ "MOVQ  $mem,$src\t! packed8B" %}
-  ins_encode( movq_st(mem, src));
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Store Aligned Packed Char/Short XMM register to memory
-instruct storeA4C(memory mem, regXD src) %{
+instruct storeA4C(memory mem, regD src) %{
   predicate(UseSSE>=1);
   match(Set mem (Store4C mem src));
   ins_cost(145);
   format %{ "MOVQ  $mem,$src\t! packed4C" %}
-  ins_encode( movq_st(mem, src));
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Store Aligned Packed Integer XMM register to memory
-instruct storeA2I(memory mem, regXD src) %{
+instruct storeA2I(memory mem, regD src) %{
   predicate(UseSSE>=1);
   match(Set mem (Store2I mem src));
   ins_cost(145);
   format %{ "MOVQ  $mem,$src\t! packed2I" %}
-  ins_encode( movq_st(mem, src));
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -7679,98 +7140,116 @@
 %}
 
 // Store Double
-instruct storeD( memory mem, regDPR1 src) %{
+instruct storeDPR( memory mem, regDPR1 src) %{
   predicate(UseSSE<=1);
   match(Set mem (StoreD mem src));
 
   ins_cost(100);
   format %{ "FST_D  $mem,$src" %}
   opcode(0xDD);       /* DD /2 */
-  ins_encode( enc_FP_store(mem,src) );
+  ins_encode( enc_FPR_store(mem,src) );
   ins_pipe( fpu_mem_reg );
 %}
 
 // Store double does rounding on x86
-instruct storeD_rounded( memory mem, regDPR1 src) %{
+instruct storeDPR_rounded( memory mem, regDPR1 src) %{
   predicate(UseSSE<=1);
   match(Set mem (StoreD mem (RoundDouble src)));
 
   ins_cost(100);
   format %{ "FST_D  $mem,$src\t# round" %}
   opcode(0xDD);       /* DD /2 */
-  ins_encode( enc_FP_store(mem,src) );
+  ins_encode( enc_FPR_store(mem,src) );
   ins_pipe( fpu_mem_reg );
 %}
 
 // Store XMM register to memory (double-precision floating points)
 // MOVSD instruction
-instruct storeXD(memory mem, regXD src) %{
+instruct storeD(memory mem, regD src) %{
   predicate(UseSSE>=2);
   match(Set mem (StoreD mem src));
   ins_cost(95);
   format %{ "MOVSD  $mem,$src" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x11), RegMem(src, mem));
+  ins_encode %{
+    __ movdbl($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Store XMM register to memory (single-precision floating point)
 // MOVSS instruction
-instruct storeX(memory mem, regX src) %{
+instruct storeF(memory mem, regF src) %{
   predicate(UseSSE>=1);
   match(Set mem (StoreF mem src));
   ins_cost(95);
   format %{ "MOVSS  $mem,$src" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x11), RegMem(src, mem));
+  ins_encode %{
+    __ movflt($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Store Aligned Packed Single Float XMM register to memory
-instruct storeA2F(memory mem, regXD src) %{
+instruct storeA2F(memory mem, regD src) %{
   predicate(UseSSE>=1);
   match(Set mem (Store2F mem src));
   ins_cost(145);
   format %{ "MOVQ  $mem,$src\t! packed2F" %}
-  ins_encode( movq_st(mem, src));
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Store Float
-instruct storeF( memory mem, regFPR1 src) %{
+instruct storeFPR( memory mem, regFPR1 src) %{
   predicate(UseSSE==0);
   match(Set mem (StoreF mem src));
 
   ins_cost(100);
   format %{ "FST_S  $mem,$src" %}
   opcode(0xD9);       /* D9 /2 */
-  ins_encode( enc_FP_store(mem,src) );
+  ins_encode( enc_FPR_store(mem,src) );
   ins_pipe( fpu_mem_reg );
 %}
 
 // Store Float does rounding on x86
-instruct storeF_rounded( memory mem, regFPR1 src) %{
+instruct storeFPR_rounded( memory mem, regFPR1 src) %{
   predicate(UseSSE==0);
   match(Set mem (StoreF mem (RoundFloat src)));
 
   ins_cost(100);
   format %{ "FST_S  $mem,$src\t# round" %}
   opcode(0xD9);       /* D9 /2 */
-  ins_encode( enc_FP_store(mem,src) );
+  ins_encode( enc_FPR_store(mem,src) );
   ins_pipe( fpu_mem_reg );
 %}
 
 // Store Float does rounding on x86
-instruct storeF_Drounded( memory mem, regDPR1 src) %{
+instruct storeFPR_Drounded( memory mem, regDPR1 src) %{
   predicate(UseSSE<=1);
   match(Set mem (StoreF mem (ConvD2F src)));
 
   ins_cost(100);
   format %{ "FST_S  $mem,$src\t# D-round" %}
   opcode(0xD9);       /* D9 /2 */
-  ins_encode( enc_FP_store(mem,src) );
+  ins_encode( enc_FPR_store(mem,src) );
   ins_pipe( fpu_mem_reg );
 %}
 
 // Store immediate Float value (it is faster than store from FPU register)
+// The instruction usage is guarded by predicate in operand immFPR().
+instruct storeFPR_imm( memory mem, immFPR src) %{
+  match(Set mem (StoreF mem src));
+
+  ins_cost(50);
+  format %{ "MOV    $mem,$src\t# store float" %}
+  opcode(0xC7);               /* C7 /0 */
+  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32FPR_as_bits( src ));
+  ins_pipe( ialu_mem_imm );
+%}
+
+// Store immediate Float value (it is faster than store from XMM register)
 // The instruction usage is guarded by predicate in operand immF().
 instruct storeF_imm( memory mem, immF src) %{
   match(Set mem (StoreF mem src));
@@ -7782,18 +7261,6 @@
   ins_pipe( ialu_mem_imm );
 %}
 
-// Store immediate Float value (it is faster than store from XMM register)
-// The instruction usage is guarded by predicate in operand immXF().
-instruct storeX_imm( memory mem, immXF src) %{
-  match(Set mem (StoreF mem src));
-
-  ins_cost(50);
-  format %{ "MOV    $mem,$src\t# store float" %}
-  opcode(0xC7);               /* C7 /0 */
-  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32XF_as_bits( src ));
-  ins_pipe( ialu_mem_imm );
-%}
-
 // Store Integer to stack slot
 instruct storeSSI(stackSlotI dst, eRegI src) %{
   match(Set dst src);
@@ -7901,6 +7368,16 @@
   ins_pipe(empty);
 %}
 
+instruct membar_storestore() %{
+  match(MemBarStoreStore);
+  ins_cost(0);
+
+  size(0);
+  format %{ "MEMBAR-storestore (empty encoding)" %}
+  ins_encode( );
+  ins_pipe(empty);
+%}
+
 //----------Move Instructions--------------------------------------------------
 instruct castX2P(eAXRegP dst, eAXRegI src) %{
   match(Set dst (CastX2P src));
@@ -8088,29 +7565,29 @@
 //%}
 
 // Conditional move
-instruct fcmovD_regU(cmpOp_fcmov cop, eFlagsRegU cr, regDPR1 dst, regD src) %{
+instruct fcmovDPR_regU(cmpOp_fcmov cop, eFlagsRegU cr, regDPR1 dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
   ins_cost(200);
   format %{ "FCMOV$cop $dst,$src\t# double" %}
   opcode(0xDA);
-  ins_encode( enc_cmov_d(cop,src) );
-  ins_pipe( pipe_cmovD_reg );
+  ins_encode( enc_cmov_dpr(cop,src) );
+  ins_pipe( pipe_cmovDPR_reg );
 %}
 
 // Conditional move
-instruct fcmovF_regU(cmpOp_fcmov cop, eFlagsRegU cr, regFPR1 dst, regF src) %{
+instruct fcmovFPR_regU(cmpOp_fcmov cop, eFlagsRegU cr, regFPR1 dst, regFPR src) %{
   predicate(UseSSE==0);
   match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
   ins_cost(200);
   format %{ "FCMOV$cop $dst,$src\t# float" %}
   opcode(0xDA);
-  ins_encode( enc_cmov_d(cop,src) );
-  ins_pipe( pipe_cmovD_reg );
+  ins_encode( enc_cmov_dpr(cop,src) );
+  ins_pipe( pipe_cmovDPR_reg );
 %}
 
 // Float CMOV on Intel doesn't handle *signed* compares, only unsigned.
-instruct fcmovD_regS(cmpOp cop, eFlagsReg cr, regD dst, regD src) %{
+instruct fcmovDPR_regS(cmpOp cop, eFlagsReg cr, regDPR dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -8118,12 +7595,12 @@
             "MOV    $dst,$src\t# double\n"
       "skip:" %}
   opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
-  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_D(src), OpcP, RegOpc(dst) );
-  ins_pipe( pipe_cmovD_reg );
+  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_DPR(src), OpcP, RegOpc(dst) );
+  ins_pipe( pipe_cmovDPR_reg );
 %}
 
 // Float CMOV on Intel doesn't handle *signed* compares, only unsigned.
-instruct fcmovF_regS(cmpOp cop, eFlagsReg cr, regF dst, regF src) %{
+instruct fcmovFPR_regS(cmpOp cop, eFlagsReg cr, regFPR dst, regFPR src) %{
   predicate(UseSSE==0);
   match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -8131,12 +7608,12 @@
             "MOV    $dst,$src\t# float\n"
       "skip:" %}
   opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
-  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_F(src), OpcP, RegOpc(dst) );
-  ins_pipe( pipe_cmovD_reg );
+  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_FPR(src), OpcP, RegOpc(dst) );
+  ins_pipe( pipe_cmovDPR_reg );
 %}
 
 // No CMOVE with SSE/SSE2
-instruct fcmovX_regS(cmpOp cop, eFlagsReg cr, regX dst, regX src) %{
+instruct fcmovF_regS(cmpOp cop, eFlagsReg cr, regF dst, regF src) %{
   predicate (UseSSE>=1);
   match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -8154,7 +7631,7 @@
 %}
 
 // No CMOVE with SSE/SSE2
-instruct fcmovXD_regS(cmpOp cop, eFlagsReg cr, regXD dst, regXD src) %{
+instruct fcmovD_regS(cmpOp cop, eFlagsReg cr, regD dst, regD src) %{
   predicate (UseSSE>=2);
   match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -8172,7 +7649,7 @@
 %}
 
 // unsigned version
-instruct fcmovX_regU(cmpOpU cop, eFlagsRegU cr, regX dst, regX src) %{
+instruct fcmovF_regU(cmpOpU cop, eFlagsRegU cr, regF dst, regF src) %{
   predicate (UseSSE>=1);
   match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -8189,17 +7666,17 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct fcmovX_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regX dst, regX src) %{
+instruct fcmovF_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regF dst, regF src) %{
   predicate (UseSSE>=1);
   match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
   ins_cost(200);
   expand %{
-    fcmovX_regU(cop, cr, dst, src);
+    fcmovF_regU(cop, cr, dst, src);
   %}
 %}
 
 // unsigned version
-instruct fcmovXD_regU(cmpOpU cop, eFlagsRegU cr, regXD dst, regXD src) %{
+instruct fcmovD_regU(cmpOpU cop, eFlagsRegU cr, regD dst, regD src) %{
   predicate (UseSSE>=2);
   match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -8216,12 +7693,12 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct fcmovXD_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regXD dst, regXD src) %{
+instruct fcmovD_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regD dst, regD src) %{
   predicate (UseSSE>=2);
   match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
   ins_cost(200);
   expand %{
-    fcmovXD_regU(cop, cr, dst, src);
+    fcmovD_regU(cop, cr, dst, src);
   %}
 %}
 
@@ -8440,7 +7917,7 @@
 %}
 
 // LoadLong-locked - same as a volatile long load when used with compare-swap
-instruct loadLLocked(stackSlotL dst, load_long_memory mem) %{
+instruct loadLLocked(stackSlotL dst, memory mem) %{
   predicate(UseSSE<=1);
   match(Set dst (LoadLLocked mem));
 
@@ -8451,18 +7928,21 @@
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct loadLX_Locked(stackSlotL dst, load_long_memory mem, regXD tmp) %{
+instruct loadLX_Locked(stackSlotL dst, memory mem, regD tmp) %{
   predicate(UseSSE>=2);
   match(Set dst (LoadLLocked mem));
   effect(TEMP tmp);
   ins_cost(180);
   format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
             "MOVSD  $dst,$tmp" %}
-  ins_encode(enc_loadLX_volatile(mem, dst, tmp));
-  ins_pipe( pipe_slow );
-%}
-
-instruct loadLX_reg_Locked(eRegL dst, load_long_memory mem, regXD tmp) %{
+  ins_encode %{
+    __ movdbl($tmp$$XMMRegister, $mem$$Address);
+    __ movdbl(Address(rsp, $dst$$disp), $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadLX_reg_Locked(eRegL dst, memory mem, regD tmp) %{
   predicate(UseSSE>=2);
   match(Set dst (LoadLLocked mem));
   effect(TEMP tmp);
@@ -8471,7 +7951,12 @@
             "MOVD   $dst.lo,$tmp\n\t"
             "PSRLQ  $tmp,32\n\t"
             "MOVD   $dst.hi,$tmp" %}
-  ins_encode(enc_loadLX_reg_volatile(mem, dst, tmp));
+  ins_encode %{
+    __ movdbl($tmp$$XMMRegister, $mem$$Address);
+    __ movdl($dst$$Register, $tmp$$XMMRegister);
+    __ psrlq($tmp$$XMMRegister, 32);
+    __ movdl(HIGH_FROM_LOW($dst$$Register), $tmp$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -10054,7 +9539,7 @@
 // Compare & branch
 
 // P6 version of float compare, sets condition codes in EFLAGS
-instruct cmpD_cc_P6(eFlagsRegU cr, regD src1, regD src2, eAXRegI rax) %{
+instruct cmpDPR_cc_P6(eFlagsRegU cr, regDPR src1, regDPR src2, eAXRegI rax) %{
   predicate(VM_Version::supports_cmov() && UseSSE <=1);
   match(Set cr (CmpD src1 src2));
   effect(KILL rax);
@@ -10066,26 +9551,26 @@
             "SAHF\n"
      "exit:\tNOP               // avoid branch to branch" %}
   opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2),
               cmpF_P6_fixup );
   ins_pipe( pipe_slow );
 %}
 
-instruct cmpD_cc_P6CF(eFlagsRegUCF cr, regD src1, regD src2) %{
+instruct cmpDPR_cc_P6CF(eFlagsRegUCF cr, regDPR src1, regDPR src2) %{
   predicate(VM_Version::supports_cmov() && UseSSE <=1);
   match(Set cr (CmpD src1 src2));
   ins_cost(150);
   format %{ "FLD    $src1\n\t"
             "FUCOMIP ST,$src2  // P6 instruction" %}
   opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2));
   ins_pipe( pipe_slow );
 %}
 
 // Compare & branch
-instruct cmpD_cc(eFlagsRegU cr, regD src1, regD src2, eAXRegI rax) %{
+instruct cmpDPR_cc(eFlagsRegU cr, regDPR src1, regDPR src2, eAXRegI rax) %{
   predicate(UseSSE<=1);
   match(Set cr (CmpD src1 src2));
   effect(KILL rax);
@@ -10098,138 +9583,140 @@
             "MOV    AH,1\t# unordered treat as LT\n"
     "flags:\tSAHF" %}
   opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2),
               fpu_flags);
   ins_pipe( pipe_slow );
 %}
 
 // Compare vs zero into -1,0,1
-instruct cmpD_0(eRegI dst, regD src1, immD0 zero, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpDPR_0(eRegI dst, regDPR src1, immDPR0 zero, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE<=1);
   match(Set dst (CmpD3 src1 zero));
   effect(KILL cr, KILL rax);
   ins_cost(280);
   format %{ "FTSTD  $dst,$src1" %}
   opcode(0xE4, 0xD9);
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcS, OpcP, PopFPU,
               CmpF_Result(dst));
   ins_pipe( pipe_slow );
 %}
 
 // Compare into -1,0,1
-instruct cmpD_reg(eRegI dst, regD src1, regD src2, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpDPR_reg(eRegI dst, regDPR src1, regDPR src2, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE<=1);
   match(Set dst (CmpD3 src1 src2));
   effect(KILL cr, KILL rax);
   ins_cost(300);
   format %{ "FCMPD  $dst,$src1,$src2" %}
   opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2),
               CmpF_Result(dst));
   ins_pipe( pipe_slow );
 %}
 
 // float compare and set condition codes in EFLAGS by XMM regs
-instruct cmpXD_cc(eFlagsRegU cr, regXD dst, regXD src, eAXRegI rax) %{
+instruct cmpD_cc(eFlagsRegU cr, regD src1, regD src2) %{
   predicate(UseSSE>=2);
-  match(Set cr (CmpD dst src));
-  effect(KILL rax);
-  ins_cost(125);
-  format %{ "COMISD $dst,$src\n"
-          "\tJNP    exit\n"
-          "\tMOV    ah,1       // saw a NaN, set CF\n"
-          "\tSAHF\n"
-     "exit:\tNOP               // avoid branch to branch" %}
-  opcode(0x66, 0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, Opcode(tertiary), RegReg(dst, src), cmpF_P6_fixup);
-  ins_pipe( pipe_slow );
-%}
-
-instruct cmpXD_ccCF(eFlagsRegUCF cr, regXD dst, regXD src) %{
+  match(Set cr (CmpD src1 src2));
+  ins_cost(145);
+  format %{ "UCOMISD $src1,$src2\n\t"
+            "JNP,s   exit\n\t"
+            "PUSHF\t# saw NaN, set CF\n\t"
+            "AND     [rsp], #0xffffff2b\n\t"
+            "POPF\n"
+    "exit:" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$XMMRegister);
+    emit_cmpfp_fixup(_masm);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct cmpD_ccCF(eFlagsRegUCF cr, regD src1, regD src2) %{
   predicate(UseSSE>=2);
-  match(Set cr (CmpD dst src));
+  match(Set cr (CmpD src1 src2));
   ins_cost(100);
-  format %{ "COMISD $dst,$src" %}
-  opcode(0x66, 0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
+  format %{ "UCOMISD $src1,$src2" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // float compare and set condition codes in EFLAGS by XMM regs
-instruct cmpXD_ccmem(eFlagsRegU cr, regXD dst, memory src, eAXRegI rax) %{
+instruct cmpD_ccmem(eFlagsRegU cr, regD src1, memory src2) %{
   predicate(UseSSE>=2);
-  match(Set cr (CmpD dst (LoadD src)));
-  effect(KILL rax);
+  match(Set cr (CmpD src1 (LoadD src2)));
   ins_cost(145);
-  format %{ "COMISD $dst,$src\n"
-          "\tJNP    exit\n"
-          "\tMOV    ah,1       // saw a NaN, set CF\n"
-          "\tSAHF\n"
-     "exit:\tNOP               // avoid branch to branch" %}
-  opcode(0x66, 0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(dst, src), cmpF_P6_fixup);
-  ins_pipe( pipe_slow );
-%}
-
-instruct cmpXD_ccmemCF(eFlagsRegUCF cr, regXD dst, memory src) %{
+  format %{ "UCOMISD $src1,$src2\n\t"
+            "JNP,s   exit\n\t"
+            "PUSHF\t# saw NaN, set CF\n\t"
+            "AND     [rsp], #0xffffff2b\n\t"
+            "POPF\n"
+    "exit:" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$Address);
+    emit_cmpfp_fixup(_masm);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct cmpD_ccmemCF(eFlagsRegUCF cr, regD src1, memory src2) %{
   predicate(UseSSE>=2);
-  match(Set cr (CmpD dst (LoadD src)));
+  match(Set cr (CmpD src1 (LoadD src2)));
   ins_cost(100);
-  format %{ "COMISD $dst,$src" %}
-  opcode(0x66, 0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(dst, src));
+  format %{ "UCOMISD $src1,$src2" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Compare into -1,0,1 in XMM
-instruct cmpXD_reg(eRegI dst, regXD src1, regXD src2, eFlagsReg cr) %{
+instruct cmpD_reg(xRegI dst, regD src1, regD src2, eFlagsReg cr) %{
   predicate(UseSSE>=2);
   match(Set dst (CmpD3 src1 src2));
   effect(KILL cr);
   ins_cost(255);
-  format %{ "XOR    $dst,$dst\n"
-          "\tCOMISD $src1,$src2\n"
-          "\tJP,s   nan\n"
-          "\tJEQ,s  exit\n"
-          "\tJA,s   inc\n"
-      "nan:\tDEC    $dst\n"
-          "\tJMP,s  exit\n"
-      "inc:\tINC    $dst\n"
-      "exit:"
-                %}
-  opcode(0x66, 0x0F, 0x2F);
-  ins_encode(Xor_Reg(dst), OpcP, OpcS, Opcode(tertiary), RegReg(src1, src2),
-             CmpX_Result(dst));
+  format %{ "UCOMISD $src1, $src2\n\t"
+            "MOV     $dst, #-1\n\t"
+            "JP,s    done\n\t"
+            "JB,s    done\n\t"
+            "SETNE   $dst\n\t"
+            "MOVZB   $dst, $dst\n"
+    "done:" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$XMMRegister);
+    emit_cmpfp3(_masm, $dst$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Compare into -1,0,1 in XMM and memory
-instruct cmpXD_regmem(eRegI dst, regXD src1, memory mem, eFlagsReg cr) %{
+instruct cmpD_regmem(xRegI dst, regD src1, memory src2, eFlagsReg cr) %{
   predicate(UseSSE>=2);
-  match(Set dst (CmpD3 src1 (LoadD mem)));
+  match(Set dst (CmpD3 src1 (LoadD src2)));
   effect(KILL cr);
   ins_cost(275);
-  format %{ "COMISD $src1,$mem\n"
-          "\tMOV    $dst,0\t\t# do not blow flags\n"
-          "\tJP,s   nan\n"
-          "\tJEQ,s  exit\n"
-          "\tJA,s   inc\n"
-      "nan:\tDEC    $dst\n"
-          "\tJMP,s  exit\n"
-      "inc:\tINC    $dst\n"
-      "exit:"
-                %}
-  opcode(0x66, 0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(src1, mem),
-             LdImmI(dst,0x0), CmpX_Result(dst));
-  ins_pipe( pipe_slow );
-%}
-
-
-instruct subD_reg(regD dst, regD src) %{
+  format %{ "UCOMISD $src1, $src2\n\t"
+            "MOV     $dst, #-1\n\t"
+            "JP,s    done\n\t"
+            "JB,s    done\n\t"
+            "SETNE   $dst\n\t"
+            "MOVZB   $dst, $dst\n"
+    "done:" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$Address);
+    emit_cmpfp3(_masm, $dst$$Register);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+
+instruct subDPR_reg(regDPR dst, regDPR src) %{
   predicate (UseSSE <=1);
   match(Set dst (SubD dst src));
 
@@ -10237,12 +9724,12 @@
             "DSUBp  $dst,ST" %}
   opcode(0xDE, 0x5); /* DE E8+i  or DE /5 */
   ins_cost(150);
-  ins_encode( Push_Reg_D(src),
+  ins_encode( Push_Reg_DPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct subD_reg_round(stackSlotD dst, regD src1, regD src2) %{
+instruct subDPR_reg_round(stackSlotD dst, regDPR src1, regDPR src2) %{
   predicate (UseSSE <=1);
   match(Set dst (RoundDouble (SubD src1 src2)));
   ins_cost(250);
@@ -10251,13 +9738,13 @@
             "DSUB   ST,$src1\n\t"
             "FSTP_D $dst\t# D-round" %}
   opcode(0xD8, 0x5);
-  ins_encode( Push_Reg_D(src2),
-              OpcP, RegOpc(src1), Pop_Mem_D(dst) );
+  ins_encode( Push_Reg_DPR(src2),
+              OpcP, RegOpc(src1), Pop_Mem_DPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 
 
-instruct subD_reg_mem(regD dst, memory src) %{
+instruct subDPR_reg_mem(regDPR dst, memory src) %{
   predicate (UseSSE <=1);
   match(Set dst (SubD dst (LoadD src)));
   ins_cost(150);
@@ -10270,7 +9757,7 @@
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct absD_reg(regDPR1 dst, regDPR1 src) %{
+instruct absDPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   match(Set dst (AbsD src));
   ins_cost(100);
@@ -10280,15 +9767,7 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct absXD_reg( regXD dst ) %{
-  predicate(UseSSE>=2);
-  match(Set dst (AbsD dst));
-  format %{ "ANDPD  $dst,[0x7FFFFFFFFFFFFFFF]\t# ABS D by sign masking" %}
-  ins_encode( AbsXD_encoding(dst));
-  ins_pipe( pipe_slow );
-%}
-
-instruct negD_reg(regDPR1 dst, regDPR1 src) %{
+instruct negDPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate(UseSSE<=1);
   match(Set dst (NegD src));
   ins_cost(100);
@@ -10298,18 +9777,7 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct negXD_reg( regXD dst ) %{
-  predicate(UseSSE>=2);
-  match(Set dst (NegD dst));
-  format %{ "XORPD  $dst,[0x8000000000000000]\t# CHS D by sign flipping" %}
-  ins_encode %{
-     __ xorpd($dst$$XMMRegister,
-              ExternalAddress((address)double_signflip_pool));
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct addD_reg(regD dst, regD src) %{
+instruct addDPR_reg(regDPR dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (AddD dst src));
   format %{ "FLD    $src\n\t"
@@ -10317,13 +9785,13 @@
   size(4);
   ins_cost(150);
   opcode(0xDE, 0x0); /* DE C0+i or DE /0*/
-  ins_encode( Push_Reg_D(src),
+  ins_encode( Push_Reg_DPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
 
-instruct addD_reg_round(stackSlotD dst, regD src1, regD src2) %{
+instruct addDPR_reg_round(stackSlotD dst, regDPR src1, regDPR src2) %{
   predicate(UseSSE<=1);
   match(Set dst (RoundDouble (AddD src1 src2)));
   ins_cost(250);
@@ -10332,13 +9800,13 @@
             "DADD   ST,$src1\n\t"
             "FSTP_D $dst\t# D-round" %}
   opcode(0xD8, 0x0); /* D8 C0+i or D8 /0*/
-  ins_encode( Push_Reg_D(src2),
-              OpcP, RegOpc(src1), Pop_Mem_D(dst) );
+  ins_encode( Push_Reg_DPR(src2),
+              OpcP, RegOpc(src1), Pop_Mem_DPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 
 
-instruct addD_reg_mem(regD dst, memory src) %{
+instruct addDPR_reg_mem(regDPR dst, memory src) %{
   predicate(UseSSE<=1);
   match(Set dst (AddD dst (LoadD src)));
   ins_cost(150);
@@ -10352,7 +9820,7 @@
 %}
 
 // add-to-memory
-instruct addD_mem_reg(memory dst, regD src) %{
+instruct addDPR_mem_reg(memory dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (StoreD dst (RoundDouble (AddD (LoadD dst) src))));
   ins_cost(150);
@@ -10368,7 +9836,7 @@
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct addD_reg_imm1(regD dst, immD1 con) %{
+instruct addDPR_reg_imm1(regDPR dst, immDPR1 con) %{
   predicate(UseSSE<=1);
   match(Set dst (AddD dst con));
   ins_cost(125);
@@ -10381,7 +9849,7 @@
   ins_pipe(fpu_reg);
 %}
 
-instruct addD_reg_imm(regD dst, immD con) %{
+instruct addDPR_reg_imm(regDPR dst, immDPR con) %{
   predicate(UseSSE<=1 && _kids[1]->_leaf->getd() != 0.0 && _kids[1]->_leaf->getd() != 1.0 );
   match(Set dst (AddD dst con));
   ins_cost(200);
@@ -10394,7 +9862,7 @@
   ins_pipe(fpu_reg_mem);
 %}
 
-instruct addD_reg_imm_round(stackSlotD dst, regD src, immD con) %{
+instruct addDPR_reg_imm_round(stackSlotD dst, regDPR src, immDPR con) %{
   predicate(UseSSE<=1 && _kids[0]->_kids[1]->_leaf->getd() != 0.0 && _kids[0]->_kids[1]->_leaf->getd() != 1.0 );
   match(Set dst (RoundDouble (AddD src con)));
   ins_cost(200);
@@ -10409,124 +9877,14 @@
   ins_pipe(fpu_mem_reg_con);
 %}
 
-// Add two double precision floating point values in xmm
-instruct addXD_reg(regXD dst, regXD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (AddD dst src));
-  format %{ "ADDSD  $dst,$src" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x58), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct addXD_imm(regXD dst, immXD con) %{
-  predicate(UseSSE>=2);
-  match(Set dst (AddD dst con));
-  format %{ "ADDSD  $dst,[$constantaddress]\t# load from constant table: double=$con" %}
-  ins_encode %{
-    __ addsd($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct addXD_mem(regXD dst, memory mem) %{
-  predicate(UseSSE>=2);
-  match(Set dst (AddD dst (LoadD mem)));
-  format %{ "ADDSD  $dst,$mem" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x58), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Sub two double precision floating point values in xmm
-instruct subXD_reg(regXD dst, regXD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (SubD dst src));
-  format %{ "SUBSD  $dst,$src" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5C), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct subXD_imm(regXD dst, immXD con) %{
-  predicate(UseSSE>=2);
-  match(Set dst (SubD dst con));
-  format %{ "SUBSD  $dst,[$constantaddress]\t# load from constant table: double=$con" %}
-  ins_encode %{
-    __ subsd($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct subXD_mem(regXD dst, memory mem) %{
-  predicate(UseSSE>=2);
-  match(Set dst (SubD dst (LoadD mem)));
-  format %{ "SUBSD  $dst,$mem" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5C), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Mul two double precision floating point values in xmm
-instruct mulXD_reg(regXD dst, regXD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (MulD dst src));
-  format %{ "MULSD  $dst,$src" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x59), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct mulXD_imm(regXD dst, immXD con) %{
-  predicate(UseSSE>=2);
-  match(Set dst (MulD dst con));
-  format %{ "MULSD  $dst,[$constantaddress]\t# load from constant table: double=$con" %}
-  ins_encode %{
-    __ mulsd($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct mulXD_mem(regXD dst, memory mem) %{
-  predicate(UseSSE>=2);
-  match(Set dst (MulD dst (LoadD mem)));
-  format %{ "MULSD  $dst,$mem" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x59), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Div two double precision floating point values in xmm
-instruct divXD_reg(regXD dst, regXD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (DivD dst src));
-  format %{ "DIVSD  $dst,$src" %}
-  opcode(0xF2, 0x0F, 0x5E);
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5E), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct divXD_imm(regXD dst, immXD con) %{
-  predicate(UseSSE>=2);
-  match(Set dst (DivD dst con));
-  format %{ "DIVSD  $dst,[$constantaddress]\t# load from constant table: double=$con" %}
-  ins_encode %{
-    __ divsd($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct divXD_mem(regXD dst, memory mem) %{
-  predicate(UseSSE>=2);
-  match(Set dst (DivD dst (LoadD mem)));
-  format %{ "DIVSD  $dst,$mem" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5E), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-
-instruct mulD_reg(regD dst, regD src) %{
+instruct mulDPR_reg(regDPR dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (MulD dst src));
   format %{ "FLD    $src\n\t"
             "DMULp  $dst,ST" %}
   opcode(0xDE, 0x1); /* DE C8+i or DE /1*/
   ins_cost(150);
-  ins_encode( Push_Reg_D(src),
+  ins_encode( Push_Reg_DPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
@@ -10539,7 +9897,7 @@
 // multiply scaled arg1 by arg2
 // rescale product by 2^(15360)
 //
-instruct strictfp_mulD_reg(regDPR1 dst, regnotDPR1 src) %{
+instruct strictfp_mulDPR_reg(regDPR1 dst, regnotDPR1 src) %{
   predicate( UseSSE<=1 && Compile::current()->has_method() && Compile::current()->method()->is_strict() );
   match(Set dst (MulD dst src));
   ins_cost(1);   // Select this instruction for all strict FP double multiplies
@@ -10552,13 +9910,13 @@
             "DMULp  $dst,ST\n\t" %}
   opcode(0xDE, 0x1); /* DE C8+i or DE /1*/
   ins_encode( strictfp_bias1(dst),
-              Push_Reg_D(src),
+              Push_Reg_DPR(src),
               OpcP, RegOpc(dst),
               strictfp_bias2(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct mulD_reg_imm(regD dst, immD con) %{
+instruct mulDPR_reg_imm(regDPR dst, immDPR con) %{
   predicate( UseSSE<=1 && _kids[1]->_leaf->getd() != 0.0 && _kids[1]->_leaf->getd() != 1.0 );
   match(Set dst (MulD dst con));
   ins_cost(200);
@@ -10572,7 +9930,7 @@
 %}
 
 
-instruct mulD_reg_mem(regD dst, memory src) %{
+instruct mulDPR_reg_mem(regDPR dst, memory src) %{
   predicate( UseSSE<=1 );
   match(Set dst (MulD dst (LoadD src)));
   ins_cost(200);
@@ -10586,7 +9944,7 @@
 
 //
 // Cisc-alternate to reg-reg multiply
-instruct mulD_reg_mem_cisc(regD dst, regD src, memory mem) %{
+instruct mulDPR_reg_mem_cisc(regDPR dst, regDPR src, memory mem) %{
   predicate( UseSSE<=1 );
   match(Set dst (MulD src (LoadD mem)));
   ins_cost(250);
@@ -10595,17 +9953,17 @@
             "FSTP_D $dst" %}
   opcode(0xD8, 0x1, 0xD9); /* D8 C8+i */  /* LoadD D9 /0 */
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,mem),
-              OpcReg_F(src),
-              Pop_Reg_D(dst) );
+              OpcReg_FPR(src),
+              Pop_Reg_DPR(dst) );
   ins_pipe( fpu_reg_reg_mem );
 %}
 
 
-// MACRO3 -- addD a mulD
+// MACRO3 -- addDPR a mulDPR
 // This instruction is a '2-address' instruction in that the result goes
 // back to src2.  This eliminates a move from the macro; possibly the
 // register allocator will have to add it back (and maybe not).
-instruct addD_mulD_reg(regD src2, regD src1, regD src0) %{
+instruct addDPR_mulDPR_reg(regDPR src2, regDPR src1, regDPR src0) %{
   predicate( UseSSE<=1 );
   match(Set src2 (AddD (MulD src0 src1) src2));
   format %{ "FLD    $src0\t# ===MACRO3d===\n\t"
@@ -10613,29 +9971,29 @@
             "DADDp  $src2,ST" %}
   ins_cost(250);
   opcode(0xDD); /* LoadD DD /0 */
-  ins_encode( Push_Reg_F(src0),
+  ins_encode( Push_Reg_FPR(src0),
               FMul_ST_reg(src1),
               FAddP_reg_ST(src2) );
   ins_pipe( fpu_reg_reg_reg );
 %}
 
 
-// MACRO3 -- subD a mulD
-instruct subD_mulD_reg(regD src2, regD src1, regD src0) %{
+// MACRO3 -- subDPR a mulDPR
+instruct subDPR_mulDPR_reg(regDPR src2, regDPR src1, regDPR src0) %{
   predicate( UseSSE<=1 );
   match(Set src2 (SubD (MulD src0 src1) src2));
   format %{ "FLD    $src0\t# ===MACRO3d===\n\t"
             "DMUL   ST,$src1\n\t"
             "DSUBRp $src2,ST" %}
   ins_cost(250);
-  ins_encode( Push_Reg_F(src0),
+  ins_encode( Push_Reg_FPR(src0),
               FMul_ST_reg(src1),
               Opcode(0xDE), Opc_plus(0xE0,src2));
   ins_pipe( fpu_reg_reg_reg );
 %}
 
 
-instruct divD_reg(regD dst, regD src) %{
+instruct divDPR_reg(regDPR dst, regDPR src) %{
   predicate( UseSSE<=1 );
   match(Set dst (DivD dst src));
 
@@ -10643,7 +10001,7 @@
             "FDIVp  $dst,ST" %}
   opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
   ins_cost(150);
-  ins_encode( Push_Reg_D(src),
+  ins_encode( Push_Reg_DPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
@@ -10656,7 +10014,7 @@
 // divide scaled dividend by divisor
 // rescale quotient by 2^(15360)
 //
-instruct strictfp_divD_reg(regDPR1 dst, regnotDPR1 src) %{
+instruct strictfp_divDPR_reg(regDPR1 dst, regnotDPR1 src) %{
   predicate (UseSSE<=1);
   match(Set dst (DivD dst src));
   predicate( UseSSE<=1 && Compile::current()->has_method() && Compile::current()->method()->is_strict() );
@@ -10670,13 +10028,13 @@
             "DMULp  $dst,ST\n\t" %}
   opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
   ins_encode( strictfp_bias1(dst),
-              Push_Reg_D(src),
+              Push_Reg_DPR(src),
               OpcP, RegOpc(dst),
               strictfp_bias2(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct divD_reg_round(stackSlotD dst, regD src1, regD src2) %{
+instruct divDPR_reg_round(stackSlotD dst, regDPR src1, regDPR src2) %{
   predicate( UseSSE<=1 && !(Compile::current()->has_method() && Compile::current()->method()->is_strict()) );
   match(Set dst (RoundDouble (DivD src1 src2)));
 
@@ -10684,27 +10042,27 @@
             "FDIV   ST,$src2\n\t"
             "FSTP_D $dst\t# D-round" %}
   opcode(0xD8, 0x6); /* D8 F0+i or D8 /6 */
-  ins_encode( Push_Reg_D(src1),
-              OpcP, RegOpc(src2), Pop_Mem_D(dst) );
+  ins_encode( Push_Reg_DPR(src1),
+              OpcP, RegOpc(src2), Pop_Mem_DPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 
 
-instruct modD_reg(regD dst, regD src, eAXRegI rax, eFlagsReg cr) %{
+instruct modDPR_reg(regDPR dst, regDPR src, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE<=1);
   match(Set dst (ModD dst src));
-  effect(KILL rax, KILL cr); // emitModD() uses EAX and EFLAGS
+  effect(KILL rax, KILL cr); // emitModDPR() uses EAX and EFLAGS
 
   format %{ "DMOD   $dst,$src" %}
   ins_cost(250);
-  ins_encode(Push_Reg_Mod_D(dst, src),
-              emitModD(),
-              Push_Result_Mod_D(src),
-              Pop_Reg_D(dst));
-  ins_pipe( pipe_slow );
-%}
-
-instruct modXD_reg(regXD dst, regXD src0, regXD src1, eAXRegI rax, eFlagsReg cr) %{
+  ins_encode(Push_Reg_Mod_DPR(dst, src),
+              emitModDPR(),
+              Push_Result_Mod_DPR(src),
+              Pop_Reg_DPR(dst));
+  ins_pipe( pipe_slow );
+%}
+
+instruct modD_reg(regD dst, regD src0, regD src1, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE>=2);
   match(Set dst (ModD src0 src1));
   effect(KILL rax, KILL cr);
@@ -10725,11 +10083,11 @@
           "\tFSTP   ST0\t # Restore FPU Stack"
     %}
   ins_cost(250);
-  ins_encode( Push_ModD_encoding(src0, src1), emitModD(), Push_ResultXD(dst), PopFPU);
-  ins_pipe( pipe_slow );
-%}
-
-instruct sinD_reg(regDPR1 dst, regDPR1 src) %{
+  ins_encode( Push_ModD_encoding(src0, src1), emitModDPR(), Push_ResultD(dst), PopFPU);
+  ins_pipe( pipe_slow );
+%}
+
+instruct sinDPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   match(Set dst (SinD src));
   ins_cost(1800);
@@ -10739,18 +10097,18 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct sinXD_reg(regXD dst, eFlagsReg cr) %{
+instruct sinD_reg(regD dst, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   match(Set dst (SinD dst));
-  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
+  effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
   ins_cost(1800);
   format %{ "DSIN   $dst" %}
   opcode(0xD9, 0xFE);
-  ins_encode( Push_SrcXD(dst), OpcP, OpcS, Push_ResultXD(dst) );
-  ins_pipe( pipe_slow );
-%}
-
-instruct cosD_reg(regDPR1 dst, regDPR1 src) %{
+  ins_encode( Push_SrcD(dst), OpcP, OpcS, Push_ResultD(dst) );
+  ins_pipe( pipe_slow );
+%}
+
+instruct cosDPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   match(Set dst (CosD src));
   ins_cost(1800);
@@ -10760,18 +10118,18 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct cosXD_reg(regXD dst, eFlagsReg cr) %{
+instruct cosD_reg(regD dst, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   match(Set dst (CosD dst));
-  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
+  effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
   ins_cost(1800);
   format %{ "DCOS   $dst" %}
   opcode(0xD9, 0xFF);
-  ins_encode( Push_SrcXD(dst), OpcP, OpcS, Push_ResultXD(dst) );
-  ins_pipe( pipe_slow );
-%}
-
-instruct tanD_reg(regDPR1 dst, regDPR1 src) %{
+  ins_encode( Push_SrcD(dst), OpcP, OpcS, Push_ResultD(dst) );
+  ins_pipe( pipe_slow );
+%}
+
+instruct tanDPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   match(Set dst(TanD src));
   format %{ "DTAN   $dst" %}
@@ -10780,50 +10138,50 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct tanXD_reg(regXD dst, eFlagsReg cr) %{
+instruct tanD_reg(regD dst, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   match(Set dst(TanD dst));
-  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
+  effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
   format %{ "DTAN   $dst" %}
-  ins_encode( Push_SrcXD(dst),
+  ins_encode( Push_SrcD(dst),
               Opcode(0xD9), Opcode(0xF2),    // fptan
               Opcode(0xDD), Opcode(0xD8),   // fstp st
-              Push_ResultXD(dst) );
-  ins_pipe( pipe_slow );
-%}
-
-instruct atanD_reg(regD dst, regD src) %{
+              Push_ResultD(dst) );
+  ins_pipe( pipe_slow );
+%}
+
+instruct atanDPR_reg(regDPR dst, regDPR src) %{
   predicate (UseSSE<=1);
   match(Set dst(AtanD dst src));
   format %{ "DATA   $dst,$src" %}
   opcode(0xD9, 0xF3);
-  ins_encode( Push_Reg_D(src),
+  ins_encode( Push_Reg_DPR(src),
               OpcP, OpcS, RegOpc(dst) );
   ins_pipe( pipe_slow );
 %}
 
-instruct atanXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
+instruct atanD_reg(regD dst, regD src, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   match(Set dst(AtanD dst src));
-  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
+  effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
   format %{ "DATA   $dst,$src" %}
   opcode(0xD9, 0xF3);
-  ins_encode( Push_SrcXD(src),
-              OpcP, OpcS, Push_ResultXD(dst) );
-  ins_pipe( pipe_slow );
-%}
-
-instruct sqrtD_reg(regD dst, regD src) %{
+  ins_encode( Push_SrcD(src),
+              OpcP, OpcS, Push_ResultD(dst) );
+  ins_pipe( pipe_slow );
+%}
+
+instruct sqrtDPR_reg(regDPR dst, regDPR src) %{
   predicate (UseSSE<=1);
   match(Set dst (SqrtD src));
   format %{ "DSQRT  $dst,$src" %}
   opcode(0xFA, 0xD9);
-  ins_encode( Push_Reg_D(src),
-              OpcS, OpcP, Pop_Reg_D(dst) );
-  ins_pipe( pipe_slow );
-%}
-
-instruct powD_reg(regD X, regDPR1 Y, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
+  ins_encode( Push_Reg_DPR(src),
+              OpcS, OpcP, Pop_Reg_DPR(dst) );
+  ins_pipe( pipe_slow );
+%}
+
+instruct powDPR_reg(regDPR X, regDPR1 Y, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
   predicate (UseSSE<=1);
   match(Set Y (PowD X Y));  // Raise X to the Yth power
   effect(KILL rax, KILL rbx, KILL rcx);
@@ -10852,14 +10210,14 @@
             "ADD    ESP,8"
              %}
   ins_encode( push_stack_temp_qword,
-              Push_Reg_D(X),
+              Push_Reg_DPR(X),
               Opcode(0xD9), Opcode(0xF1),   // fyl2x
               pow_exp_core_encoding,
               pop_stack_temp_qword);
   ins_pipe( pipe_slow );
 %}
 
-instruct powXD_reg(regXD dst, regXD src0, regXD src1, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx ) %{
+instruct powD_reg(regD dst, regD src0, regD src1, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx ) %{
   predicate (UseSSE>=2);
   match(Set dst (PowD src0 src1));  // Raise src0 to the src1'th power
   effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx );
@@ -10897,12 +10255,12 @@
               push_xmm_to_fpr1(src0),
               Opcode(0xD9), Opcode(0xF1),   // fyl2x
               pow_exp_core_encoding,
-              Push_ResultXD(dst) );
-  ins_pipe( pipe_slow );
-%}
-
-
-instruct expD_reg(regDPR1 dpr1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
+              Push_ResultD(dst) );
+  ins_pipe( pipe_slow );
+%}
+
+
+instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
   predicate (UseSSE<=1);
   match(Set dpr1 (ExpD dpr1));
   effect(KILL rax, KILL rbx, KILL rcx);
@@ -10938,7 +10296,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct expXD_reg(regXD dst, regXD src, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
+instruct expD_reg(regD dst, regD src, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
   predicate (UseSSE>=2);
   match(Set dst (ExpD src));
   effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx);
@@ -10969,17 +10327,17 @@
             "MOVSD  $dst,[ESP]\n\t"
             "ADD    ESP,8"
              %}
-  ins_encode( Push_SrcXD(src),
+  ins_encode( Push_SrcD(src),
               Opcode(0xD9), Opcode(0xEA),   // fldl2e
               Opcode(0xDE), Opcode(0xC9),   // fmulp
               pow_exp_core_encoding,
-              Push_ResultXD(dst) );
-  ins_pipe( pipe_slow );
-%}
-
-
-
-instruct log10D_reg(regDPR1 dst, regDPR1 src) %{
+              Push_ResultD(dst) );
+  ins_pipe( pipe_slow );
+%}
+
+
+
+instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   // The source Double operand on FPU stack
   match(Set dst (Log10D src));
@@ -10997,7 +10355,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct log10XD_reg(regXD dst, regXD src, eFlagsReg cr) %{
+instruct log10D_reg(regD dst, regD src, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   effect(KILL cr);
   match(Set dst (Log10D src));
@@ -11007,14 +10365,14 @@
             "FYL2X  \t\t\t# Q=Log10*Log_2(x)"
          %}
   ins_encode( Opcode(0xD9), Opcode(0xEC),   // fldlg2
-              Push_SrcXD(src),
+              Push_SrcD(src),
               Opcode(0xD9), Opcode(0xF1),   // fyl2x
-              Push_ResultXD(dst));
-
-  ins_pipe( pipe_slow );
-%}
-
-instruct logD_reg(regDPR1 dst, regDPR1 src) %{
+              Push_ResultD(dst));
+
+  ins_pipe( pipe_slow );
+%}
+
+instruct logDPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   // The source Double operand on FPU stack
   match(Set dst (LogD src));
@@ -11032,7 +10390,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct logXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
+instruct logD_reg(regD dst, regD src, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   effect(KILL cr);
   // The source and result Double operands in XMM registers
@@ -11043,9 +10401,9 @@
             "FYL2X  \t\t\t# Q=Log_e*Log_2(x)"
          %}
   ins_encode( Opcode(0xD9), Opcode(0xED),   // fldln2
-              Push_SrcXD(src),
+              Push_SrcD(src),
               Opcode(0xD9), Opcode(0xF1),   // fyl2x
-              Push_ResultXD(dst));
+              Push_ResultD(dst));
   ins_pipe( pipe_slow );
 %}
 
@@ -11066,7 +10424,7 @@
 //   exit:
 
 // P6 version of float compare, sets condition codes in EFLAGS
-instruct cmpF_cc_P6(eFlagsRegU cr, regF src1, regF src2, eAXRegI rax) %{
+instruct cmpFPR_cc_P6(eFlagsRegU cr, regFPR src1, regFPR src2, eAXRegI rax) %{
   predicate(VM_Version::supports_cmov() && UseSSE == 0);
   match(Set cr (CmpF src1 src2));
   effect(KILL rax);
@@ -11078,27 +10436,27 @@
             "SAHF\n"
      "exit:\tNOP               // avoid branch to branch" %}
   opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2),
               cmpF_P6_fixup );
   ins_pipe( pipe_slow );
 %}
 
-instruct cmpF_cc_P6CF(eFlagsRegUCF cr, regF src1, regF src2) %{
+instruct cmpFPR_cc_P6CF(eFlagsRegUCF cr, regFPR src1, regFPR src2) %{
   predicate(VM_Version::supports_cmov() && UseSSE == 0);
   match(Set cr (CmpF src1 src2));
   ins_cost(100);
   format %{ "FLD    $src1\n\t"
             "FUCOMIP ST,$src2  // P6 instruction" %}
   opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2));
   ins_pipe( pipe_slow );
 %}
 
 
 // Compare & branch
-instruct cmpF_cc(eFlagsRegU cr, regF src1, regF src2, eAXRegI rax) %{
+instruct cmpFPR_cc(eFlagsRegU cr, regFPR src1, regFPR src2, eAXRegI rax) %{
   predicate(UseSSE == 0);
   match(Set cr (CmpF src1 src2));
   effect(KILL rax);
@@ -11111,328 +10469,190 @@
             "MOV    AH,1\t# unordered treat as LT\n"
     "flags:\tSAHF" %}
   opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2),
               fpu_flags);
   ins_pipe( pipe_slow );
 %}
 
 // Compare vs zero into -1,0,1
-instruct cmpF_0(eRegI dst, regF src1, immF0 zero, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpFPR_0(eRegI dst, regFPR src1, immFPR0 zero, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE == 0);
   match(Set dst (CmpF3 src1 zero));
   effect(KILL cr, KILL rax);
   ins_cost(280);
   format %{ "FTSTF  $dst,$src1" %}
   opcode(0xE4, 0xD9);
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcS, OpcP, PopFPU,
               CmpF_Result(dst));
   ins_pipe( pipe_slow );
 %}
 
 // Compare into -1,0,1
-instruct cmpF_reg(eRegI dst, regF src1, regF src2, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpFPR_reg(eRegI dst, regFPR src1, regFPR src2, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE == 0);
   match(Set dst (CmpF3 src1 src2));
   effect(KILL cr, KILL rax);
   ins_cost(300);
   format %{ "FCMPF  $dst,$src1,$src2" %}
   opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2),
               CmpF_Result(dst));
   ins_pipe( pipe_slow );
 %}
 
 // float compare and set condition codes in EFLAGS by XMM regs
-instruct cmpX_cc(eFlagsRegU cr, regX dst, regX src, eAXRegI rax) %{
+instruct cmpF_cc(eFlagsRegU cr, regF src1, regF src2) %{
   predicate(UseSSE>=1);
-  match(Set cr (CmpF dst src));
-  effect(KILL rax);
+  match(Set cr (CmpF src1 src2));
   ins_cost(145);
-  format %{ "COMISS $dst,$src\n"
-          "\tJNP    exit\n"
-          "\tMOV    ah,1       // saw a NaN, set CF\n"
-          "\tSAHF\n"
-     "exit:\tNOP               // avoid branch to branch" %}
-  opcode(0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, RegReg(dst, src), cmpF_P6_fixup);
-  ins_pipe( pipe_slow );
-%}
-
-instruct cmpX_ccCF(eFlagsRegUCF cr, regX dst, regX src) %{
+  format %{ "UCOMISS $src1,$src2\n\t"
+            "JNP,s   exit\n\t"
+            "PUSHF\t# saw NaN, set CF\n\t"
+            "AND     [rsp], #0xffffff2b\n\t"
+            "POPF\n"
+    "exit:" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$XMMRegister);
+    emit_cmpfp_fixup(_masm);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct cmpF_ccCF(eFlagsRegUCF cr, regF src1, regF src2) %{
   predicate(UseSSE>=1);
-  match(Set cr (CmpF dst src));
+  match(Set cr (CmpF src1 src2));
   ins_cost(100);
-  format %{ "COMISS $dst,$src" %}
-  opcode(0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, RegReg(dst, src));
+  format %{ "UCOMISS $src1,$src2" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // float compare and set condition codes in EFLAGS by XMM regs
-instruct cmpX_ccmem(eFlagsRegU cr, regX dst, memory src, eAXRegI rax) %{
+instruct cmpF_ccmem(eFlagsRegU cr, regF src1, memory src2) %{
   predicate(UseSSE>=1);
-  match(Set cr (CmpF dst (LoadF src)));
-  effect(KILL rax);
+  match(Set cr (CmpF src1 (LoadF src2)));
   ins_cost(165);
-  format %{ "COMISS $dst,$src\n"
-          "\tJNP    exit\n"
-          "\tMOV    ah,1       // saw a NaN, set CF\n"
-          "\tSAHF\n"
-     "exit:\tNOP               // avoid branch to branch" %}
-  opcode(0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, RegMem(dst, src), cmpF_P6_fixup);
-  ins_pipe( pipe_slow );
-%}
-
-instruct cmpX_ccmemCF(eFlagsRegUCF cr, regX dst, memory src) %{
+  format %{ "UCOMISS $src1,$src2\n\t"
+            "JNP,s   exit\n\t"
+            "PUSHF\t# saw NaN, set CF\n\t"
+            "AND     [rsp], #0xffffff2b\n\t"
+            "POPF\n"
+    "exit:" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$Address);
+    emit_cmpfp_fixup(_masm);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct cmpF_ccmemCF(eFlagsRegUCF cr, regF src1, memory src2) %{
   predicate(UseSSE>=1);
-  match(Set cr (CmpF dst (LoadF src)));
+  match(Set cr (CmpF src1 (LoadF src2)));
   ins_cost(100);
-  format %{ "COMISS $dst,$src" %}
-  opcode(0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, RegMem(dst, src));
+  format %{ "UCOMISS $src1,$src2" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Compare into -1,0,1 in XMM
-instruct cmpX_reg(eRegI dst, regX src1, regX src2, eFlagsReg cr) %{
+instruct cmpF_reg(xRegI dst, regF src1, regF src2, eFlagsReg cr) %{
   predicate(UseSSE>=1);
   match(Set dst (CmpF3 src1 src2));
   effect(KILL cr);
   ins_cost(255);
-  format %{ "XOR    $dst,$dst\n"
-          "\tCOMISS $src1,$src2\n"
-          "\tJP,s   nan\n"
-          "\tJEQ,s  exit\n"
-          "\tJA,s   inc\n"
-      "nan:\tDEC    $dst\n"
-          "\tJMP,s  exit\n"
-      "inc:\tINC    $dst\n"
-      "exit:"
-                %}
-  opcode(0x0F, 0x2F);
-  ins_encode(Xor_Reg(dst), OpcP, OpcS, RegReg(src1, src2), CmpX_Result(dst));
+  format %{ "UCOMISS $src1, $src2\n\t"
+            "MOV     $dst, #-1\n\t"
+            "JP,s    done\n\t"
+            "JB,s    done\n\t"
+            "SETNE   $dst\n\t"
+            "MOVZB   $dst, $dst\n"
+    "done:" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$XMMRegister);
+    emit_cmpfp3(_masm, $dst$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Compare into -1,0,1 in XMM and memory
-instruct cmpX_regmem(eRegI dst, regX src1, memory mem, eFlagsReg cr) %{
+instruct cmpF_regmem(xRegI dst, regF src1, memory src2, eFlagsReg cr) %{
   predicate(UseSSE>=1);
-  match(Set dst (CmpF3 src1 (LoadF mem)));
+  match(Set dst (CmpF3 src1 (LoadF src2)));
   effect(KILL cr);
   ins_cost(275);
-  format %{ "COMISS $src1,$mem\n"
-          "\tMOV    $dst,0\t\t# do not blow flags\n"
-          "\tJP,s   nan\n"
-          "\tJEQ,s  exit\n"
-          "\tJA,s   inc\n"
-      "nan:\tDEC    $dst\n"
-          "\tJMP,s  exit\n"
-      "inc:\tINC    $dst\n"
-      "exit:"
-                %}
-  opcode(0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, RegMem(src1, mem), LdImmI(dst,0x0), CmpX_Result(dst));
+  format %{ "UCOMISS $src1, $src2\n\t"
+            "MOV     $dst, #-1\n\t"
+            "JP,s    done\n\t"
+            "JB,s    done\n\t"
+            "SETNE   $dst\n\t"
+            "MOVZB   $dst, $dst\n"
+    "done:" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$Address);
+    emit_cmpfp3(_masm, $dst$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Spill to obtain 24-bit precision
-instruct subF24_reg(stackSlotF dst, regF src1, regF src2) %{
+instruct subFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (SubF src1 src2));
 
   format %{ "FSUB   $dst,$src1 - $src2" %}
   opcode(0xD8, 0x4); /* D8 E0+i or D8 /4 mod==0x3 ;; result in TOS */
-  ins_encode( Push_Reg_F(src1),
-              OpcReg_F(src2),
-              Pop_Mem_F(dst) );
+  ins_encode( Push_Reg_FPR(src1),
+              OpcReg_FPR(src2),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 //
 // This instruction does not round to 24-bits
-instruct subF_reg(regF dst, regF src) %{
+instruct subFPR_reg(regFPR dst, regFPR src) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (SubF dst src));
 
   format %{ "FSUB   $dst,$src" %}
   opcode(0xDE, 0x5); /* DE E8+i  or DE /5 */
-  ins_encode( Push_Reg_F(src),
+  ins_encode( Push_Reg_FPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
 // Spill to obtain 24-bit precision
-instruct addF24_reg(stackSlotF dst, regF src1, regF src2) %{
+instruct addFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src1 src2));
 
   format %{ "FADD   $dst,$src1,$src2" %}
   opcode(0xD8, 0x0); /* D8 C0+i */
-  ins_encode( Push_Reg_F(src2),
-              OpcReg_F(src1),
-              Pop_Mem_F(dst) );
+  ins_encode( Push_Reg_FPR(src2),
+              OpcReg_FPR(src1),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 //
 // This instruction does not round to 24-bits
-instruct addF_reg(regF dst, regF src) %{
+instruct addFPR_reg(regFPR dst, regFPR src) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (AddF dst src));
 
   format %{ "FLD    $src\n\t"
             "FADDp  $dst,ST" %}
   opcode(0xDE, 0x0); /* DE C0+i or DE /0*/
-  ins_encode( Push_Reg_F(src),
+  ins_encode( Push_Reg_FPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
-// Add two single precision floating point values in xmm
-instruct addX_reg(regX dst, regX src) %{
-  predicate(UseSSE>=1);
-  match(Set dst (AddF dst src));
-  format %{ "ADDSS  $dst,$src" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x58), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct addX_imm(regX dst, immXF con) %{
-  predicate(UseSSE>=1);
-  match(Set dst (AddF dst con));
-  format %{ "ADDSS  $dst,[$constantaddress]\t# load from constant table: float=$con" %}
-  ins_encode %{
-    __ addss($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct addX_mem(regX dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (AddF dst (LoadF mem)));
-  format %{ "ADDSS  $dst,$mem" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x58), RegMem(dst, mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Subtract two single precision floating point values in xmm
-instruct subX_reg(regX dst, regX src) %{
-  predicate(UseSSE>=1);
-  match(Set dst (SubF dst src));
-  format %{ "SUBSS  $dst,$src" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5C), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct subX_imm(regX dst, immXF con) %{
-  predicate(UseSSE>=1);
-  match(Set dst (SubF dst con));
-  format %{ "SUBSS  $dst,[$constantaddress]\t# load from constant table: float=$con" %}
-  ins_encode %{
-    __ subss($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct subX_mem(regX dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (SubF dst (LoadF mem)));
-  format %{ "SUBSS  $dst,$mem" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5C), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Multiply two single precision floating point values in xmm
-instruct mulX_reg(regX dst, regX src) %{
-  predicate(UseSSE>=1);
-  match(Set dst (MulF dst src));
-  format %{ "MULSS  $dst,$src" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x59), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct mulX_imm(regX dst, immXF con) %{
-  predicate(UseSSE>=1);
-  match(Set dst (MulF dst con));
-  format %{ "MULSS  $dst,[$constantaddress]\t# load from constant table: float=$con" %}
-  ins_encode %{
-    __ mulss($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct mulX_mem(regX dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (MulF dst (LoadF mem)));
-  format %{ "MULSS  $dst,$mem" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x59), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Divide two single precision floating point values in xmm
-instruct divX_reg(regX dst, regX src) %{
-  predicate(UseSSE>=1);
-  match(Set dst (DivF dst src));
-  format %{ "DIVSS  $dst,$src" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5E), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct divX_imm(regX dst, immXF con) %{
-  predicate(UseSSE>=1);
-  match(Set dst (DivF dst con));
-  format %{ "DIVSS  $dst,[$constantaddress]\t# load from constant table: float=$con" %}
-  ins_encode %{
-    __ divss($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct divX_mem(regX dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (DivF dst (LoadF mem)));
-  format %{ "DIVSS  $dst,$mem" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5E), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Get the square root of a single precision floating point values in xmm
-instruct sqrtX_reg(regX dst, regX src) %{
-  predicate(UseSSE>=1);
-  match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
-  format %{ "SQRTSS $dst,$src" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x51), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct sqrtX_mem(regX dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF mem)))));
-  format %{ "SQRTSS $dst,$mem" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x51), RegMem(dst, mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Get the square root of a double precision floating point values in xmm
-instruct sqrtXD_reg(regXD dst, regXD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (SqrtD src));
-  format %{ "SQRTSD $dst,$src" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x51), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct sqrtXD_mem(regXD dst, memory mem) %{
-  predicate(UseSSE>=2);
-  match(Set dst (SqrtD (LoadD mem)));
-  format %{ "SQRTSD $dst,$mem" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x51), RegMem(dst, mem));
-  ins_pipe( pipe_slow );
-%}
-
-instruct absF_reg(regFPR1 dst, regFPR1 src) %{
+instruct absFPR_reg(regFPR1 dst, regFPR1 src) %{
   predicate(UseSSE==0);
   match(Set dst (AbsF src));
   ins_cost(100);
@@ -11442,15 +10662,7 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct absX_reg(regX dst ) %{
-  predicate(UseSSE>=1);
-  match(Set dst (AbsF dst));
-  format %{ "ANDPS  $dst,[0x7FFFFFFF]\t# ABS F by sign masking" %}
-  ins_encode( AbsXF_encoding(dst));
-  ins_pipe( pipe_slow );
-%}
-
-instruct negF_reg(regFPR1 dst, regFPR1 src) %{
+instruct negFPR_reg(regFPR1 dst, regFPR1 src) %{
   predicate(UseSSE==0);
   match(Set dst (NegF src));
   ins_cost(100);
@@ -11460,17 +10672,9 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct negX_reg( regX dst ) %{
-  predicate(UseSSE>=1);
-  match(Set dst (NegF dst));
-  format %{ "XORPS  $dst,[0x80000000]\t# CHS F by sign flipping" %}
-  ins_encode( NegXF_encoding(dst));
-  ins_pipe( pipe_slow );
-%}
-
-// Cisc-alternate to addF_reg
+// Cisc-alternate to addFPR_reg
 // Spill to obtain 24-bit precision
-instruct addF24_reg_mem(stackSlotF dst, regF src1, memory src2) %{
+instruct addFPR24_reg_mem(stackSlotF dst, regFPR src1, memory src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src1 (LoadF src2)));
 
@@ -11479,14 +10683,14 @@
             "FSTP_S $dst" %}
   opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
-              OpcReg_F(src1),
-              Pop_Mem_F(dst) );
+              OpcReg_FPR(src1),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_mem );
 %}
 //
-// Cisc-alternate to addF_reg
+// Cisc-alternate to addFPR_reg
 // This instruction does not round to 24-bits
-instruct addF_reg_mem(regF dst, memory src) %{
+instruct addFPR_reg_mem(regFPR dst, memory src) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (AddF dst (LoadF src)));
 
@@ -11499,21 +10703,21 @@
 
 // // Following two instructions for _222_mpegaudio
 // Spill to obtain 24-bit precision
-instruct addF24_mem_reg(stackSlotF dst, regF src2, memory src1 ) %{
+instruct addFPR24_mem_reg(stackSlotF dst, regFPR src2, memory src1 ) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src1 src2));
 
   format %{ "FADD   $dst,$src1,$src2" %}
   opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src1),
-              OpcReg_F(src2),
-              Pop_Mem_F(dst) );
+              OpcReg_FPR(src2),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_mem );
 %}
 
 // Cisc-spill variant
 // Spill to obtain 24-bit precision
-instruct addF24_mem_cisc(stackSlotF dst, memory src1, memory src2) %{
+instruct addFPR24_mem_cisc(stackSlotF dst, memory src1, memory src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src1 (LoadF src2)));
 
@@ -11522,12 +10726,12 @@
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
               set_instruction_start,
               OpcP, RMopc_Mem(secondary,src1),
-              Pop_Mem_F(dst) );
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_mem_mem );
 %}
 
 // Spill to obtain 24-bit precision
-instruct addF24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
+instruct addFPR24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src1 src2));
 
@@ -11536,13 +10740,13 @@
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
               set_instruction_start,
               OpcP, RMopc_Mem(secondary,src1),
-              Pop_Mem_F(dst) );
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_mem_mem );
 %}
 
 
 // Spill to obtain 24-bit precision
-instruct addF24_reg_imm(stackSlotF dst, regF src, immF con) %{
+instruct addFPR24_reg_imm(stackSlotF dst, regFPR src, immFPR con) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src con));
   format %{ "FLD    $src\n\t"
@@ -11557,7 +10761,7 @@
 %}
 //
 // This instruction does not round to 24-bits
-instruct addF_reg_imm(regF dst, regF src, immF con) %{
+instruct addFPR_reg_imm(regFPR dst, regFPR src, immFPR con) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src con));
   format %{ "FLD    $src\n\t"
@@ -11572,7 +10776,7 @@
 %}
 
 // Spill to obtain 24-bit precision
-instruct mulF24_reg(stackSlotF dst, regF src1, regF src2) %{
+instruct mulFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src1 src2));
 
@@ -11580,14 +10784,14 @@
             "FMUL   $src2\n\t"
             "FSTP_S $dst"  %}
   opcode(0xD8, 0x1); /* D8 C8+i or D8 /1 ;; result in TOS */
-  ins_encode( Push_Reg_F(src1),
-              OpcReg_F(src2),
-              Pop_Mem_F(dst) );
+  ins_encode( Push_Reg_FPR(src1),
+              OpcReg_FPR(src2),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 //
 // This instruction does not round to 24-bits
-instruct mulF_reg(regF dst, regF src1, regF src2) %{
+instruct mulFPR_reg(regFPR dst, regFPR src1, regFPR src2) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src1 src2));
 
@@ -11595,16 +10799,16 @@
             "FMUL   $src2\n\t"
             "FSTP_S $dst"  %}
   opcode(0xD8, 0x1); /* D8 C8+i */
-  ins_encode( Push_Reg_F(src2),
-              OpcReg_F(src1),
-              Pop_Reg_F(dst) );
+  ins_encode( Push_Reg_FPR(src2),
+              OpcReg_FPR(src1),
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_reg_reg );
 %}
 
 
 // Spill to obtain 24-bit precision
 // Cisc-alternate to reg-reg multiply
-instruct mulF24_reg_mem(stackSlotF dst, regF src1, memory src2) %{
+instruct mulFPR24_reg_mem(stackSlotF dst, regFPR src1, memory src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src1 (LoadF src2)));
 
@@ -11613,27 +10817,27 @@
             "FSTP_S $dst"  %}
   opcode(0xD8, 0x1, 0xD9); /* D8 C8+i or DE /1*/  /* LoadF D9 /0 */
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
-              OpcReg_F(src1),
-              Pop_Mem_F(dst) );
+              OpcReg_FPR(src1),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_mem );
 %}
 //
 // This instruction does not round to 24-bits
 // Cisc-alternate to reg-reg multiply
-instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
+instruct mulFPR_reg_mem(regFPR dst, regFPR src1, memory src2) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src1 (LoadF src2)));
 
   format %{ "FMUL   $dst,$src1,$src2" %}
   opcode(0xD8, 0x1, 0xD9); /* D8 C8+i */  /* LoadF D9 /0 */
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
-              OpcReg_F(src1),
-              Pop_Reg_F(dst) );
+              OpcReg_FPR(src1),
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_reg_mem );
 %}
 
 // Spill to obtain 24-bit precision
-instruct mulF24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
+instruct mulFPR24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src1 src2));
 
@@ -11642,12 +10846,12 @@
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
               set_instruction_start,
               OpcP, RMopc_Mem(secondary,src1),
-              Pop_Mem_F(dst) );
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_mem_mem );
 %}
 
 // Spill to obtain 24-bit precision
-instruct mulF24_reg_imm(stackSlotF dst, regF src, immF con) %{
+instruct mulFPR24_reg_imm(stackSlotF dst, regFPR src, immFPR con) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src con));
 
@@ -11663,7 +10867,7 @@
 %}
 //
 // This instruction does not round to 24-bits
-instruct mulF_reg_imm(regF dst, regF src, immF con) %{
+instruct mulFPR_reg_imm(regFPR dst, regFPR src, immFPR con) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src con));
 
@@ -11680,9 +10884,9 @@
 
 
 //
-// MACRO1 -- subsume unshared load into mulF
+// MACRO1 -- subsume unshared load into mulFPR
 // This instruction does not round to 24-bits
-instruct mulF_reg_load1(regF dst, regF src, memory mem1 ) %{
+instruct mulFPR_reg_load1(regFPR dst, regFPR src, memory mem1 ) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (MulF (LoadF mem1) src));
 
@@ -11691,36 +10895,36 @@
             "FSTP   $dst" %}
   opcode(0xD8, 0x1, 0xD9); /* D8 C8+i or D8 /1 */  /* LoadF D9 /0 */
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,mem1),
-              OpcReg_F(src),
-              Pop_Reg_F(dst) );
+              OpcReg_FPR(src),
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_reg_mem );
 %}
 //
-// MACRO2 -- addF a mulF which subsumed an unshared load
+// MACRO2 -- addFPR a mulFPR which subsumed an unshared load
 // This instruction does not round to 24-bits
-instruct addF_mulF_reg_load1(regF dst, memory mem1, regF src1, regF src2) %{
+instruct addFPR_mulFPR_reg_load1(regFPR dst, memory mem1, regFPR src1, regFPR src2) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (AddF (MulF (LoadF mem1) src1) src2));
   ins_cost(95);
 
   format %{ "FLD    $mem1     ===MACRO2===\n\t"
-            "FMUL   ST,$src1  subsume mulF left load\n\t"
+            "FMUL   ST,$src1  subsume mulFPR left load\n\t"
             "FADD   ST,$src2\n\t"
             "FSTP   $dst" %}
   opcode(0xD9); /* LoadF D9 /0 */
   ins_encode( OpcP, RMopc_Mem(0x00,mem1),
               FMul_ST_reg(src1),
               FAdd_ST_reg(src2),
-              Pop_Reg_F(dst) );
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_mem_reg_reg );
 %}
 
-// MACRO3 -- addF a mulF
+// MACRO3 -- addFPR a mulFPR
 // This instruction does not round to 24-bits.  It is a '2-address'
 // instruction in that the result goes back to src2.  This eliminates
 // a move from the macro; possibly the register allocator will have
 // to add it back (and maybe not).
-instruct addF_mulF_reg(regF src2, regF src1, regF src0) %{
+instruct addFPR_mulFPR_reg(regFPR src2, regFPR src1, regFPR src0) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set src2 (AddF (MulF src0 src1) src2));
 
@@ -11728,15 +10932,15 @@
             "FMUL   ST,$src1\n\t"
             "FADDP  $src2,ST" %}
   opcode(0xD9); /* LoadF D9 /0 */
-  ins_encode( Push_Reg_F(src0),
+  ins_encode( Push_Reg_FPR(src0),
               FMul_ST_reg(src1),
               FAddP_reg_ST(src2) );
   ins_pipe( fpu_reg_reg_reg );
 %}
 
-// MACRO4 -- divF subF
+// MACRO4 -- divFPR subFPR
 // This instruction does not round to 24-bits
-instruct subF_divF_reg(regF dst, regF src1, regF src2, regF src3) %{
+instruct subFPR_divFPR_reg(regFPR dst, regFPR src1, regFPR src2, regFPR src3) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (DivF (SubF src2 src1) src3));
 
@@ -11745,67 +10949,67 @@
             "FDIV   ST,$src3\n\t"
             "FSTP  $dst" %}
   opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
-  ins_encode( Push_Reg_F(src2),
-              subF_divF_encode(src1,src3),
-              Pop_Reg_F(dst) );
+  ins_encode( Push_Reg_FPR(src2),
+              subFPR_divFPR_encode(src1,src3),
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_reg_reg_reg );
 %}
 
 // Spill to obtain 24-bit precision
-instruct divF24_reg(stackSlotF dst, regF src1, regF src2) %{
+instruct divFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (DivF src1 src2));
 
   format %{ "FDIV   $dst,$src1,$src2" %}
   opcode(0xD8, 0x6); /* D8 F0+i or DE /6*/
-  ins_encode( Push_Reg_F(src1),
-              OpcReg_F(src2),
-              Pop_Mem_F(dst) );
+  ins_encode( Push_Reg_FPR(src1),
+              OpcReg_FPR(src2),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 //
 // This instruction does not round to 24-bits
-instruct divF_reg(regF dst, regF src) %{
+instruct divFPR_reg(regFPR dst, regFPR src) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (DivF dst src));
 
   format %{ "FDIV   $dst,$src" %}
   opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
-  ins_encode( Push_Reg_F(src),
+  ins_encode( Push_Reg_FPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
 
 // Spill to obtain 24-bit precision
-instruct modF24_reg(stackSlotF dst, regF src1, regF src2, eAXRegI rax, eFlagsReg cr) %{
+instruct modFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2, eAXRegI rax, eFlagsReg cr) %{
   predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (ModF src1 src2));
-  effect(KILL rax, KILL cr); // emitModD() uses EAX and EFLAGS
+  effect(KILL rax, KILL cr); // emitModDPR() uses EAX and EFLAGS
 
   format %{ "FMOD   $dst,$src1,$src2" %}
-  ins_encode( Push_Reg_Mod_D(src1, src2),
-              emitModD(),
-              Push_Result_Mod_D(src2),
-              Pop_Mem_F(dst));
+  ins_encode( Push_Reg_Mod_DPR(src1, src2),
+              emitModDPR(),
+              Push_Result_Mod_DPR(src2),
+              Pop_Mem_FPR(dst));
   ins_pipe( pipe_slow );
 %}
 //
 // This instruction does not round to 24-bits
-instruct modF_reg(regF dst, regF src, eAXRegI rax, eFlagsReg cr) %{
+instruct modFPR_reg(regFPR dst, regFPR src, eAXRegI rax, eFlagsReg cr) %{
   predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (ModF dst src));
-  effect(KILL rax, KILL cr); // emitModD() uses EAX and EFLAGS
+  effect(KILL rax, KILL cr); // emitModDPR() uses EAX and EFLAGS
 
   format %{ "FMOD   $dst,$src" %}
-  ins_encode(Push_Reg_Mod_D(dst, src),
-              emitModD(),
-              Push_Result_Mod_D(src),
-              Pop_Reg_F(dst));
-  ins_pipe( pipe_slow );
-%}
-
-instruct modX_reg(regX dst, regX src0, regX src1, eAXRegI rax, eFlagsReg cr) %{
+  ins_encode(Push_Reg_Mod_DPR(dst, src),
+              emitModDPR(),
+              Push_Result_Mod_DPR(src),
+              Pop_Reg_FPR(dst));
+  ins_pipe( pipe_slow );
+%}
+
+instruct modF_reg(regF dst, regF src0, regF src1, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE>=1);
   match(Set dst (ModF src0 src1));
   effect(KILL rax, KILL cr);
@@ -11825,7 +11029,7 @@
           "\tFSTP   ST0\t # Restore FPU Stack"
     %}
   ins_cost(250);
-  ins_encode( Push_ModX_encoding(src0, src1), emitModD(), Push_ResultX(dst,0x4), PopFPU);
+  ins_encode( Push_ModF_encoding(src0, src1), emitModDPR(), Push_ResultF(dst,0x4), PopFPU);
   ins_pipe( pipe_slow );
 %}
 
@@ -11833,26 +11037,26 @@
 //----------Arithmetic Conversion Instructions---------------------------------
 // The conversions operations are all Alpha sorted.  Please keep it that way!
 
-instruct roundFloat_mem_reg(stackSlotF dst, regF src) %{
+instruct roundFloat_mem_reg(stackSlotF dst, regFPR src) %{
   predicate(UseSSE==0);
   match(Set dst (RoundFloat src));
   ins_cost(125);
   format %{ "FST_S  $dst,$src\t# F-round" %}
-  ins_encode( Pop_Mem_Reg_F(dst, src) );
+  ins_encode( Pop_Mem_Reg_FPR(dst, src) );
   ins_pipe( fpu_mem_reg );
 %}
 
-instruct roundDouble_mem_reg(stackSlotD dst, regD src) %{
+instruct roundDouble_mem_reg(stackSlotD dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (RoundDouble src));
   ins_cost(125);
   format %{ "FST_D  $dst,$src\t# D-round" %}
-  ins_encode( Pop_Mem_Reg_D(dst, src) );
+  ins_encode( Pop_Mem_Reg_DPR(dst, src) );
   ins_pipe( fpu_mem_reg );
 %}
 
 // Force rounding to 24-bit precision and 6-bit exponent
-instruct convD2F_reg(stackSlotF dst, regD src) %{
+instruct convDPR2FPR_reg(stackSlotF dst, regDPR src) %{
   predicate(UseSSE==0);
   match(Set dst (ConvD2F src));
   format %{ "FST_S  $dst,$src\t# F-round" %}
@@ -11862,7 +11066,7 @@
 %}
 
 // Force rounding to 24-bit precision and 6-bit exponent
-instruct convD2X_reg(regX dst, regD src, eFlagsReg cr) %{
+instruct convDPR2F_reg(regF dst, regDPR src, eFlagsReg cr) %{
   predicate(UseSSE==1);
   match(Set dst (ConvD2F src));
   effect( KILL cr );
@@ -11870,29 +11074,40 @@
             "FST_S  [ESP],$src\t# F-round\n\t"
             "MOVSS  $dst,[ESP]\n\t"
             "ADD ESP,4" %}
-  ins_encode( D2X_encoding(dst, src) );
+  ins_encode %{
+    __ subptr(rsp, 4);
+    if ($src$$reg != FPR1L_enc) {
+      __ fld_s($src$$reg-1);
+      __ fstp_s(Address(rsp, 0));
+    } else {
+      __ fst_s(Address(rsp, 0));
+    }
+    __ movflt($dst$$XMMRegister, Address(rsp, 0));
+    __ addptr(rsp, 4);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Force rounding double precision to single precision
-instruct convXD2X_reg(regX dst, regXD src) %{
+instruct convD2F_reg(regF dst, regD src) %{
   predicate(UseSSE>=2);
   match(Set dst (ConvD2F src));
   format %{ "CVTSD2SS $dst,$src\t# F-round" %}
-  opcode(0xF2, 0x0F, 0x5A);
-  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct convF2D_reg_reg(regD dst, regF src) %{
+  ins_encode %{
+    __ cvtsd2ss ($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct convFPR2DPR_reg_reg(regDPR dst, regFPR src) %{
   predicate(UseSSE==0);
   match(Set dst (ConvF2D src));
   format %{ "FST_S  $dst,$src\t# D-round" %}
-  ins_encode( Pop_Reg_Reg_D(dst, src));
+  ins_encode( Pop_Reg_Reg_DPR(dst, src));
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct convF2D_reg(stackSlotD dst, regF src) %{
+instruct convFPR2D_reg(stackSlotD dst, regFPR src) %{
   predicate(UseSSE==1);
   match(Set dst (ConvF2D src));
   format %{ "FST_D  $dst,$src\t# D-round" %}
@@ -11901,7 +11116,7 @@
   %}
 %}
 
-instruct convX2D_reg(regD dst, regX src, eFlagsReg cr) %{
+instruct convF2DPR_reg(regDPR dst, regF src, eFlagsReg cr) %{
   predicate(UseSSE==1);
   match(Set dst (ConvF2D src));
   effect( KILL cr );
@@ -11910,21 +11125,28 @@
             "FLD_S  [ESP]\n\t"
             "ADD    ESP,4\n\t"
             "FSTP   $dst\t# D-round" %}
-  ins_encode( X2D_encoding(dst, src), Pop_Reg_D(dst));
-  ins_pipe( pipe_slow );
-%}
-
-instruct convX2XD_reg(regXD dst, regX src) %{
+  ins_encode %{
+    __ subptr(rsp, 4);
+    __ movflt(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_s(Address(rsp, 0));
+    __ addptr(rsp, 4);
+    __ fstp_d($dst$$reg);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct convF2D_reg(regD dst, regF src) %{
   predicate(UseSSE>=2);
   match(Set dst (ConvF2D src));
   format %{ "CVTSS2SD $dst,$src\t# D-round" %}
-  opcode(0xF3, 0x0F, 0x5A);
-  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
+  ins_encode %{
+    __ cvtss2sd ($dst$$XMMRegister, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Convert a double to an int.  If the double is a NAN, stuff a zero in instead.
-instruct convD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regD src, eFlagsReg cr ) %{
+instruct convDPR2I_reg_reg( eAXRegI dst, eDXRegI tmp, regDPR src, eFlagsReg cr ) %{
   predicate(UseSSE<=1);
   match(Set dst (ConvD2I src));
   effect( KILL tmp, KILL cr );
@@ -11939,12 +11161,12 @@
             "FLD_D  $src\n\t"
             "CALL   d2i_wrapper\n"
       "fast:" %}
-  ins_encode( Push_Reg_D(src), D2I_encoding(src) );
+  ins_encode( Push_Reg_DPR(src), DPR2I_encoding(src) );
   ins_pipe( pipe_slow );
 %}
 
 // Convert a double to an int.  If the double is a NAN, stuff a zero in instead.
-instruct convXD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regXD src, eFlagsReg cr ) %{
+instruct convD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regD src, eFlagsReg cr ) %{
   predicate(UseSSE>=2);
   match(Set dst (ConvD2I src));
   effect( KILL tmp, KILL cr );
@@ -11957,12 +11179,22 @@
             "ADD    ESP, 8\n\t"
             "CALL   d2i_wrapper\n"
       "fast:" %}
-  opcode(0x1); // double-precision conversion
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x2C), FX2I_encoding(src,dst));
-  ins_pipe( pipe_slow );
-%}
-
-instruct convD2L_reg_reg( eADXRegL dst, regD src, eFlagsReg cr ) %{
+  ins_encode %{
+    Label fast;
+    __ cvttsd2sil($dst$$Register, $src$$XMMRegister);
+    __ cmpl($dst$$Register, 0x80000000);
+    __ jccb(Assembler::notEqual, fast);
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ addptr(rsp, 8);
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::d2i_wrapper())));
+    __ bind(fast);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct convDPR2L_reg_reg( eADXRegL dst, regDPR src, eFlagsReg cr ) %{
   predicate(UseSSE<=1);
   match(Set dst (ConvD2L src));
   effect( KILL cr );
@@ -11980,12 +11212,12 @@
             "FLD    $src\n\t"
             "CALL   d2l_wrapper\n"
       "fast:" %}
-  ins_encode( Push_Reg_D(src),  D2L_encoding(src) );
+  ins_encode( Push_Reg_DPR(src),  DPR2L_encoding(src) );
   ins_pipe( pipe_slow );
 %}
 
 // XMM lacks a float/double->long conversion, so use the old FPU stack.
-instruct convXD2L_reg_reg( eADXRegL dst, regXD src, eFlagsReg cr ) %{
+instruct convD2L_reg_reg( eADXRegL dst, regD src, eFlagsReg cr ) %{
   predicate (UseSSE>=2);
   match(Set dst (ConvD2L src));
   effect( KILL cr );
@@ -12004,9 +11236,36 @@
             "SUB    ESP,8\n\t"
             "MOVSD  [ESP],$src\n\t"
             "FLD_D  [ESP]\n\t"
+            "ADD    ESP,8\n\t"
             "CALL   d2l_wrapper\n"
       "fast:" %}
-  ins_encode( XD2L_encoding(src) );
+  ins_encode %{
+    Label fast;
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_trunc()));
+    __ fistp_d(Address(rsp, 0));
+    // Restore the rounding mode, mask the exception
+    if (Compile::current()->in_24_bit_fp_mode()) {
+      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
+    } else {
+      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
+    }
+    // Load the converted long, adjust CPU stack
+    __ pop(rax);
+    __ pop(rdx);
+    __ cmpl(rdx, 0x80000000);
+    __ jccb(Assembler::notEqual, fast);
+    __ testl(rax, rax);
+    __ jccb(Assembler::notEqual, fast);
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ addptr(rsp, 8);
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::d2l_wrapper())));
+    __ bind(fast);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -12016,7 +11275,7 @@
 // rounding mode to 'nearest'.  The hardware stores a flag value down
 // if we would overflow or converted a NAN; we check for this and
 // and go the slow path if needed.
-instruct convF2I_reg_reg(eAXRegI dst, eDXRegI tmp, regF src, eFlagsReg cr ) %{
+instruct convFPR2I_reg_reg(eAXRegI dst, eDXRegI tmp, regFPR src, eFlagsReg cr ) %{
   predicate(UseSSE==0);
   match(Set dst (ConvF2I src));
   effect( KILL tmp, KILL cr );
@@ -12031,13 +11290,13 @@
             "FLD    $src\n\t"
             "CALL   d2i_wrapper\n"
       "fast:" %}
-  // D2I_encoding works for F2I
-  ins_encode( Push_Reg_F(src), D2I_encoding(src) );
+  // DPR2I_encoding works for FPR2I
+  ins_encode( Push_Reg_FPR(src), DPR2I_encoding(src) );
   ins_pipe( pipe_slow );
 %}
 
 // Convert a float in xmm to an int reg.
-instruct convX2I_reg(eAXRegI dst, eDXRegI tmp, regX src, eFlagsReg cr ) %{
+instruct convF2I_reg(eAXRegI dst, eDXRegI tmp, regF src, eFlagsReg cr ) %{
   predicate(UseSSE>=1);
   match(Set dst (ConvF2I src));
   effect( KILL tmp, KILL cr );
@@ -12050,12 +11309,22 @@
             "ADD    ESP, 4\n\t"
             "CALL   d2i_wrapper\n"
       "fast:" %}
-  opcode(0x0); // single-precision conversion
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x2C), FX2I_encoding(src,dst));
-  ins_pipe( pipe_slow );
-%}
-
-instruct convF2L_reg_reg( eADXRegL dst, regF src, eFlagsReg cr ) %{
+  ins_encode %{
+    Label fast;
+    __ cvttss2sil($dst$$Register, $src$$XMMRegister);
+    __ cmpl($dst$$Register, 0x80000000);
+    __ jccb(Assembler::notEqual, fast);
+    __ subptr(rsp, 4);
+    __ movflt(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_s(Address(rsp, 0));
+    __ addptr(rsp, 4);
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::d2i_wrapper())));
+    __ bind(fast);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct convFPR2L_reg_reg( eADXRegL dst, regFPR src, eFlagsReg cr ) %{
   predicate(UseSSE==0);
   match(Set dst (ConvF2L src));
   effect( KILL cr );
@@ -12073,13 +11342,13 @@
             "FLD    $src\n\t"
             "CALL   d2l_wrapper\n"
       "fast:" %}
-  // D2L_encoding works for F2L
-  ins_encode( Push_Reg_F(src), D2L_encoding(src) );
+  // DPR2L_encoding works for FPR2L
+  ins_encode( Push_Reg_FPR(src), DPR2L_encoding(src) );
   ins_pipe( pipe_slow );
 %}
 
 // XMM lacks a float/double->long conversion, so use the old FPU stack.
-instruct convX2L_reg_reg( eADXRegL dst, regX src, eFlagsReg cr ) %{
+instruct convF2L_reg_reg( eADXRegL dst, regF src, eFlagsReg cr ) %{
   predicate (UseSSE>=1);
   match(Set dst (ConvF2L src));
   effect( KILL cr );
@@ -12101,39 +11370,67 @@
             "ADD    ESP,4\n\t"
             "CALL   d2l_wrapper\n"
       "fast:" %}
-  ins_encode( X2L_encoding(src) );
-  ins_pipe( pipe_slow );
-%}
-
-instruct convI2D_reg(regD dst, stackSlotI src) %{
+  ins_encode %{
+    Label fast;
+    __ subptr(rsp, 8);
+    __ movflt(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_s(Address(rsp, 0));
+    __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_trunc()));
+    __ fistp_d(Address(rsp, 0));
+    // Restore the rounding mode, mask the exception
+    if (Compile::current()->in_24_bit_fp_mode()) {
+      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
+    } else {
+      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
+    }
+    // Load the converted long, adjust CPU stack
+    __ pop(rax);
+    __ pop(rdx);
+    __ cmpl(rdx, 0x80000000);
+    __ jccb(Assembler::notEqual, fast);
+    __ testl(rax, rax);
+    __ jccb(Assembler::notEqual, fast);
+    __ subptr(rsp, 4);
+    __ movflt(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_s(Address(rsp, 0));
+    __ addptr(rsp, 4);
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::d2l_wrapper())));
+    __ bind(fast);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct convI2DPR_reg(regDPR dst, stackSlotI src) %{
   predicate( UseSSE<=1 );
   match(Set dst (ConvI2D src));
   format %{ "FILD   $src\n\t"
             "FSTP   $dst" %}
   opcode(0xDB, 0x0);  /* DB /0 */
-  ins_encode(Push_Mem_I(src), Pop_Reg_D(dst));
+  ins_encode(Push_Mem_I(src), Pop_Reg_DPR(dst));
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct convI2XD_reg(regXD dst, eRegI src) %{
+instruct convI2D_reg(regD dst, eRegI src) %{
   predicate( UseSSE>=2 && !UseXmmI2D );
   match(Set dst (ConvI2D src));
   format %{ "CVTSI2SD $dst,$src" %}
-  opcode(0xF2, 0x0F, 0x2A);
-  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct convI2XD_mem(regXD dst, memory mem) %{
+  ins_encode %{
+    __ cvtsi2sdl ($dst$$XMMRegister, $src$$Register);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct convI2D_mem(regD dst, memory mem) %{
   predicate( UseSSE>=2 );
   match(Set dst (ConvI2D (LoadI mem)));
   format %{ "CVTSI2SD $dst,$mem" %}
-  opcode(0xF2, 0x0F, 0x2A);
-  ins_encode( OpcP, OpcS, Opcode(tertiary), RegMem(dst, mem));
-  ins_pipe( pipe_slow );
-%}
-
-instruct convXI2XD_reg(regXD dst, eRegI src)
+  ins_encode %{
+    __ cvtsi2sdl ($dst$$XMMRegister, $mem$$Address);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct convXI2D_reg(regD dst, eRegI src)
 %{
   predicate( UseSSE>=2 && UseXmmI2D );
   match(Set dst (ConvI2D src));
@@ -12147,31 +11444,31 @@
   ins_pipe(pipe_slow); // XXX
 %}
 
-instruct convI2D_mem(regD dst, memory mem) %{
+instruct convI2DPR_mem(regDPR dst, memory mem) %{
   predicate( UseSSE<=1 && !Compile::current()->select_24_bit_instr());
   match(Set dst (ConvI2D (LoadI mem)));
   format %{ "FILD   $mem\n\t"
             "FSTP   $dst" %}
   opcode(0xDB);      /* DB /0 */
   ins_encode( OpcP, RMopc_Mem(0x00,mem),
-              Pop_Reg_D(dst));
+              Pop_Reg_DPR(dst));
   ins_pipe( fpu_reg_mem );
 %}
 
 // Convert a byte to a float; no rounding step needed.
-instruct conv24I2F_reg(regF dst, stackSlotI src) %{
+instruct conv24I2FPR_reg(regFPR dst, stackSlotI src) %{
   predicate( UseSSE==0 && n->in(1)->Opcode() == Op_AndI && n->in(1)->in(2)->is_Con() && n->in(1)->in(2)->get_int() == 255 );
   match(Set dst (ConvI2F src));
   format %{ "FILD   $src\n\t"
             "FSTP   $dst" %}
 
   opcode(0xDB, 0x0);  /* DB /0 */
-  ins_encode(Push_Mem_I(src), Pop_Reg_F(dst));
+  ins_encode(Push_Mem_I(src), Pop_Reg_FPR(dst));
   ins_pipe( fpu_reg_mem );
 %}
 
 // In 24-bit mode, force exponent rounding by storing back out
-instruct convI2F_SSF(stackSlotF dst, stackSlotI src) %{
+instruct convI2FPR_SSF(stackSlotF dst, stackSlotI src) %{
   predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (ConvI2F src));
   ins_cost(200);
@@ -12179,12 +11476,12 @@
             "FSTP_S $dst" %}
   opcode(0xDB, 0x0);  /* DB /0 */
   ins_encode( Push_Mem_I(src),
-              Pop_Mem_F(dst));
+              Pop_Mem_FPR(dst));
   ins_pipe( fpu_mem_mem );
 %}
 
 // In 24-bit mode, force exponent rounding by storing back out
-instruct convI2F_SSF_mem(stackSlotF dst, memory mem) %{
+instruct convI2FPR_SSF_mem(stackSlotF dst, memory mem) %{
   predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (ConvI2F (LoadI mem)));
   ins_cost(200);
@@ -12192,46 +11489,46 @@
             "FSTP_S $dst" %}
   opcode(0xDB);  /* DB /0 */
   ins_encode( OpcP, RMopc_Mem(0x00,mem),
-              Pop_Mem_F(dst));
+              Pop_Mem_FPR(dst));
   ins_pipe( fpu_mem_mem );
 %}
 
 // This instruction does not round to 24-bits
-instruct convI2F_reg(regF dst, stackSlotI src) %{
+instruct convI2FPR_reg(regFPR dst, stackSlotI src) %{
   predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (ConvI2F src));
   format %{ "FILD   $src\n\t"
             "FSTP   $dst" %}
   opcode(0xDB, 0x0);  /* DB /0 */
   ins_encode( Push_Mem_I(src),
-              Pop_Reg_F(dst));
+              Pop_Reg_FPR(dst));
   ins_pipe( fpu_reg_mem );
 %}
 
 // This instruction does not round to 24-bits
-instruct convI2F_mem(regF dst, memory mem) %{
+instruct convI2FPR_mem(regFPR dst, memory mem) %{
   predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (ConvI2F (LoadI mem)));
   format %{ "FILD   $mem\n\t"
             "FSTP   $dst" %}
   opcode(0xDB);      /* DB /0 */
   ins_encode( OpcP, RMopc_Mem(0x00,mem),
-              Pop_Reg_F(dst));
+              Pop_Reg_FPR(dst));
   ins_pipe( fpu_reg_mem );
 %}
 
 // Convert an int to a float in xmm; no rounding step needed.
-instruct convI2X_reg(regX dst, eRegI src) %{
+instruct convI2F_reg(regF dst, eRegI src) %{
   predicate( UseSSE==1 || UseSSE>=2 && !UseXmmI2F );
   match(Set dst (ConvI2F src));
   format %{ "CVTSI2SS $dst, $src" %}
-
-  opcode(0xF3, 0x0F, 0x2A);  /* F3 0F 2A /r */
-  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
- instruct convXI2X_reg(regX dst, eRegI src)
+  ins_encode %{
+    __ cvtsi2ssl ($dst$$XMMRegister, $src$$Register);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+ instruct convXI2F_reg(regF dst, eRegI src)
 %{
   predicate( UseSSE>=2 && UseXmmI2F );
   match(Set dst (ConvI2F src));
@@ -12280,7 +11577,7 @@
   ins_pipe( ialu_reg_reg_long );
 %}
 
-instruct convL2D_reg( stackSlotD dst, eRegL src, eFlagsReg cr) %{
+instruct convL2DPR_reg( stackSlotD dst, eRegL src, eFlagsReg cr) %{
   predicate (UseSSE<=1);
   match(Set dst (ConvL2D src));
   effect( KILL cr );
@@ -12290,11 +11587,11 @@
             "ADD    ESP,8\n\t"
             "FSTP_D $dst\t# D-round" %}
   opcode(0xDF, 0x5);  /* DF /5 */
-  ins_encode(convert_long_double(src), Pop_Mem_D(dst));
-  ins_pipe( pipe_slow );
-%}
-
-instruct convL2XD_reg( regXD dst, eRegL src, eFlagsReg cr) %{
+  ins_encode(convert_long_double(src), Pop_Mem_DPR(dst));
+  ins_pipe( pipe_slow );
+%}
+
+instruct convL2D_reg( regD dst, eRegL src, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   match(Set dst (ConvL2D src));
   effect( KILL cr );
@@ -12305,11 +11602,11 @@
             "MOVSD  $dst,[ESP]\n\t"
             "ADD    ESP,8" %}
   opcode(0xDF, 0x5);  /* DF /5 */
-  ins_encode(convert_long_double2(src), Push_ResultXD(dst));
-  ins_pipe( pipe_slow );
-%}
-
-instruct convL2X_reg( regX dst, eRegL src, eFlagsReg cr) %{
+  ins_encode(convert_long_double2(src), Push_ResultD(dst));
+  ins_pipe( pipe_slow );
+%}
+
+instruct convL2F_reg( regF dst, eRegL src, eFlagsReg cr) %{
   predicate (UseSSE>=1);
   match(Set dst (ConvL2F src));
   effect( KILL cr );
@@ -12320,11 +11617,11 @@
             "MOVSS  $dst,[ESP]\n\t"
             "ADD    ESP,8" %}
   opcode(0xDF, 0x5);  /* DF /5 */
-  ins_encode(convert_long_double2(src), Push_ResultX(dst,0x8));
-  ins_pipe( pipe_slow );
-%}
-
-instruct convL2F_reg( stackSlotF dst, eRegL src, eFlagsReg cr) %{
+  ins_encode(convert_long_double2(src), Push_ResultF(dst,0x8));
+  ins_pipe( pipe_slow );
+%}
+
+instruct convL2FPR_reg( stackSlotF dst, eRegL src, eFlagsReg cr) %{
   match(Set dst (ConvL2F src));
   effect( KILL cr );
   format %{ "PUSH   $src.hi\t# Convert long to single float\n\t"
@@ -12333,7 +11630,7 @@
             "ADD    ESP,8\n\t"
             "FSTP_S $dst\t# F-round" %}
   opcode(0xDF, 0x5);  /* DF /5 */
-  ins_encode(convert_long_double(src), Pop_Mem_F(dst));
+  ins_encode(convert_long_double(src), Pop_Mem_FPR(dst));
   ins_pipe( pipe_slow );
 %}
 
@@ -12351,40 +11648,45 @@
   effect( DEF dst, USE src );
   ins_cost(100);
   format %{ "MOV    $dst,$src\t# MoveF2I_stack_reg" %}
-  opcode(0x8B);
-  ins_encode( OpcP, RegMem(dst,src));
+  ins_encode %{
+    __ movl($dst$$Register, Address(rsp, $src$$disp));
+  %}
   ins_pipe( ialu_reg_mem );
 %}
 
-instruct MoveF2I_reg_stack(stackSlotI dst, regF src) %{
+instruct MoveFPR2I_reg_stack(stackSlotI dst, regFPR src) %{
   predicate(UseSSE==0);
   match(Set dst (MoveF2I src));
   effect( DEF dst, USE src );
 
   ins_cost(125);
   format %{ "FST_S  $dst,$src\t# MoveF2I_reg_stack" %}
-  ins_encode( Pop_Mem_Reg_F(dst, src) );
+  ins_encode( Pop_Mem_Reg_FPR(dst, src) );
   ins_pipe( fpu_mem_reg );
 %}
 
-instruct MoveF2I_reg_stack_sse(stackSlotI dst, regX src) %{
+instruct MoveF2I_reg_stack_sse(stackSlotI dst, regF src) %{
   predicate(UseSSE>=1);
   match(Set dst (MoveF2I src));
   effect( DEF dst, USE src );
 
   ins_cost(95);
   format %{ "MOVSS  $dst,$src\t# MoveF2I_reg_stack_sse" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x11), RegMem(src, dst));
-  ins_pipe( pipe_slow );
-%}
-
-instruct MoveF2I_reg_reg_sse(eRegI dst, regX src) %{
+  ins_encode %{
+    __ movflt(Address(rsp, $dst$$disp), $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct MoveF2I_reg_reg_sse(eRegI dst, regF src) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveF2I src));
   effect( DEF dst, USE src );
   ins_cost(85);
   format %{ "MOVD   $dst,$src\t# MoveF2I_reg_reg_sse" %}
-  ins_encode( MovX2I_reg(dst, src));
+  ins_encode %{
+    __ movdl($dst$$Register, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -12394,13 +11696,14 @@
 
   ins_cost(100);
   format %{ "MOV    $dst,$src\t# MoveI2F_reg_stack" %}
-  opcode(0x89);
-  ins_encode( OpcPRegSS( dst, src ) );
+  ins_encode %{
+    __ movl(Address(rsp, $dst$$disp), $src$$Register);
+  %}
   ins_pipe( ialu_mem_reg );
 %}
 
 
-instruct MoveI2F_stack_reg(regF dst, stackSlotI src) %{
+instruct MoveI2FPR_stack_reg(regFPR dst, stackSlotI src) %{
   predicate(UseSSE==0);
   match(Set dst (MoveI2F src));
   effect(DEF dst, USE src);
@@ -12410,29 +11713,33 @@
             "FSTP   $dst\t# MoveI2F_stack_reg" %}
   opcode(0xD9);               /* D9 /0, FLD m32real */
   ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
-              Pop_Reg_F(dst) );
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct MoveI2F_stack_reg_sse(regX dst, stackSlotI src) %{
+instruct MoveI2F_stack_reg_sse(regF dst, stackSlotI src) %{
   predicate(UseSSE>=1);
   match(Set dst (MoveI2F src));
   effect( DEF dst, USE src );
 
   ins_cost(95);
   format %{ "MOVSS  $dst,$src\t# MoveI2F_stack_reg_sse" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x10), RegMem(dst,src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct MoveI2F_reg_reg_sse(regX dst, eRegI src) %{
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, Address(rsp, $src$$disp));
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct MoveI2F_reg_reg_sse(regF dst, eRegI src) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveI2F src));
   effect( DEF dst, USE src );
 
   ins_cost(85);
   format %{ "MOVD   $dst,$src\t# MoveI2F_reg_reg_sse" %}
-  ins_encode( MovI2X_reg(dst, src) );
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -12448,29 +11755,30 @@
   ins_pipe( ialu_mem_long_reg );
 %}
 
-instruct MoveD2L_reg_stack(stackSlotL dst, regD src) %{
+instruct MoveDPR2L_reg_stack(stackSlotL dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (MoveD2L src));
   effect(DEF dst, USE src);
 
   ins_cost(125);
   format %{ "FST_D  $dst,$src\t# MoveD2L_reg_stack" %}
-  ins_encode( Pop_Mem_Reg_D(dst, src) );
+  ins_encode( Pop_Mem_Reg_DPR(dst, src) );
   ins_pipe( fpu_mem_reg );
 %}
 
-instruct MoveD2L_reg_stack_sse(stackSlotL dst, regXD src) %{
+instruct MoveD2L_reg_stack_sse(stackSlotL dst, regD src) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveD2L src));
   effect(DEF dst, USE src);
   ins_cost(95);
-
   format %{ "MOVSD  $dst,$src\t# MoveD2L_reg_stack_sse" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x11), RegMem(src,dst));
-  ins_pipe( pipe_slow );
-%}
-
-instruct MoveD2L_reg_reg_sse(eRegL dst, regXD src, regXD tmp) %{
+  ins_encode %{
+    __ movdbl(Address(rsp, $dst$$disp), $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct MoveD2L_reg_reg_sse(eRegL dst, regD src, regD tmp) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveD2L src));
   effect(DEF dst, USE src, TEMP tmp);
@@ -12478,7 +11786,11 @@
   format %{ "MOVD   $dst.lo,$src\n\t"
             "PSHUFLW $tmp,$src,0x4E\n\t"
             "MOVD   $dst.hi,$tmp\t# MoveD2L_reg_reg_sse" %}
-  ins_encode( MovXD2L_reg(dst, src, tmp) );
+  ins_encode %{
+    __ movdl($dst$$Register, $src$$XMMRegister);
+    __ pshuflw($tmp$$XMMRegister, $src$$XMMRegister, 0x4e);
+    __ movdl(HIGH_FROM_LOW($dst$$Register), $tmp$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -12495,7 +11807,7 @@
 %}
 
 
-instruct MoveL2D_stack_reg(regD dst, stackSlotL src) %{
+instruct MoveL2DPR_stack_reg(regDPR dst, stackSlotL src) %{
   predicate(UseSSE<=1);
   match(Set dst (MoveL2D src));
   effect(DEF dst, USE src);
@@ -12505,34 +11817,38 @@
             "FSTP   $dst\t# MoveL2D_stack_reg" %}
   opcode(0xDD);               /* DD /0, FLD m64real */
   ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
-              Pop_Reg_D(dst) );
+              Pop_Reg_DPR(dst) );
   ins_pipe( fpu_reg_mem );
 %}
 
 
-instruct MoveL2D_stack_reg_sse(regXD dst, stackSlotL src) %{
+instruct MoveL2D_stack_reg_sse(regD dst, stackSlotL src) %{
   predicate(UseSSE>=2 && UseXmmLoadAndClearUpper);
   match(Set dst (MoveL2D src));
   effect(DEF dst, USE src);
 
   ins_cost(95);
   format %{ "MOVSD  $dst,$src\t# MoveL2D_stack_reg_sse" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x10), RegMem(dst,src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct MoveL2D_stack_reg_sse_partial(regXD dst, stackSlotL src) %{
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, Address(rsp, $src$$disp));
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct MoveL2D_stack_reg_sse_partial(regD dst, stackSlotL src) %{
   predicate(UseSSE>=2 && !UseXmmLoadAndClearUpper);
   match(Set dst (MoveL2D src));
   effect(DEF dst, USE src);
 
   ins_cost(95);
   format %{ "MOVLPD $dst,$src\t# MoveL2D_stack_reg_sse" %}
-  ins_encode( Opcode(0x66), Opcode(0x0F), Opcode(0x12), RegMem(dst,src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct MoveL2D_reg_reg_sse(regXD dst, eRegL src, regXD tmp) %{
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, Address(rsp, $src$$disp));
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct MoveL2D_reg_reg_sse(regD dst, eRegL src, regD tmp) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveL2D src));
   effect(TEMP dst, USE src, TEMP tmp);
@@ -12540,149 +11856,192 @@
   format %{ "MOVD   $dst,$src.lo\n\t"
             "MOVD   $tmp,$src.hi\n\t"
             "PUNPCKLDQ $dst,$tmp\t# MoveL2D_reg_reg_sse" %}
-  ins_encode( MovL2XD_reg(dst, src, tmp) );
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Replicate scalar to packed byte (1 byte) values in xmm
-instruct Repl8B_reg(regXD dst, regXD src) %{
+instruct Repl8B_reg(regD dst, regD src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate8B src));
   format %{ "MOVDQA  $dst,$src\n\t"
             "PUNPCKLBW $dst,$dst\n\t"
             "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode( pshufd_8x8(dst, src));
+  ins_encode %{
+    if ($dst$$reg != $src$$reg) {
+      __ movdqa($dst$$XMMRegister, $src$$XMMRegister);
+    }
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Replicate scalar to packed byte (1 byte) values in xmm
-instruct Repl8B_eRegI(regXD dst, eRegI src) %{
+instruct Repl8B_eRegI(regD dst, eRegI src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate8B src));
   format %{ "MOVD    $dst,$src\n\t"
             "PUNPCKLBW $dst,$dst\n\t"
             "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode( mov_i2x(dst, src), pshufd_8x8(dst, dst));
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Replicate scalar zero to packed byte (1 byte) values in xmm
-instruct Repl8B_immI0(regXD dst, immI0 zero) %{
+instruct Repl8B_immI0(regD dst, immI0 zero) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate8B zero));
   format %{ "PXOR  $dst,$dst\t! replicate8B" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed shore (2 byte) values in xmm
-instruct Repl4S_reg(regXD dst, regXD src) %{
+instruct Repl4S_reg(regD dst, regD src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate4S src));
   format %{ "PSHUFLW $dst,$src,0x00\t! replicate4S" %}
-  ins_encode( pshufd_4x16(dst, src));
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed shore (2 byte) values in xmm
-instruct Repl4S_eRegI(regXD dst, eRegI src) %{
+instruct Repl4S_eRegI(regD dst, eRegI src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate4S src));
   format %{ "MOVD    $dst,$src\n\t"
             "PSHUFLW $dst,$dst,0x00\t! replicate4S" %}
-  ins_encode( mov_i2x(dst, src), pshufd_4x16(dst, dst));
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar zero to packed short (2 byte) values in xmm
-instruct Repl4S_immI0(regXD dst, immI0 zero) %{
+instruct Repl4S_immI0(regD dst, immI0 zero) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate4S zero));
   format %{ "PXOR  $dst,$dst\t! replicate4S" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed char (2 byte) values in xmm
-instruct Repl4C_reg(regXD dst, regXD src) %{
+instruct Repl4C_reg(regD dst, regD src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate4C src));
   format %{ "PSHUFLW $dst,$src,0x00\t! replicate4C" %}
-  ins_encode( pshufd_4x16(dst, src));
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed char (2 byte) values in xmm
-instruct Repl4C_eRegI(regXD dst, eRegI src) %{
+instruct Repl4C_eRegI(regD dst, eRegI src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate4C src));
   format %{ "MOVD    $dst,$src\n\t"
             "PSHUFLW $dst,$dst,0x00\t! replicate4C" %}
-  ins_encode( mov_i2x(dst, src), pshufd_4x16(dst, dst));
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar zero to packed char (2 byte) values in xmm
-instruct Repl4C_immI0(regXD dst, immI0 zero) %{
+instruct Repl4C_immI0(regD dst, immI0 zero) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate4C zero));
   format %{ "PXOR  $dst,$dst\t! replicate4C" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed integer (4 byte) values in xmm
-instruct Repl2I_reg(regXD dst, regXD src) %{
+instruct Repl2I_reg(regD dst, regD src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate2I src));
   format %{ "PSHUFD $dst,$src,0x00\t! replicate2I" %}
-  ins_encode( pshufd(dst, src, 0x00));
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed integer (4 byte) values in xmm
-instruct Repl2I_eRegI(regXD dst, eRegI src) %{
+instruct Repl2I_eRegI(regD dst, eRegI src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate2I src));
   format %{ "MOVD   $dst,$src\n\t"
             "PSHUFD $dst,$dst,0x00\t! replicate2I" %}
-  ins_encode( mov_i2x(dst, src), pshufd(dst, dst, 0x00));
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar zero to packed integer (2 byte) values in xmm
-instruct Repl2I_immI0(regXD dst, immI0 zero) %{
+instruct Repl2I_immI0(regD dst, immI0 zero) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate2I zero));
   format %{ "PXOR  $dst,$dst\t! replicate2I" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_reg(regXD dst, regXD src) %{