changeset 6083:62c54fcc0a35

Merge
author kvn
date Tue, 25 Mar 2014 17:07:36 -0700
parents a433eb716ce1 eb6b3ac64f0e
children 941427282eae
files .hgtags make/hotspot_version src/cpu/sparc/vm/sparc.ad src/cpu/x86/vm/globals_x86.hpp src/cpu/x86/vm/sharedRuntime_x86_32.cpp src/cpu/x86/vm/sharedRuntime_x86_64.cpp src/cpu/x86/vm/x86_32.ad src/cpu/x86/vm/x86_64.ad src/os/bsd/vm/os_bsd.cpp src/os/linux/vm/os_linux.cpp src/os/solaris/vm/os_solaris.cpp src/share/vm/adlc/archDesc.cpp src/share/vm/adlc/formssel.cpp src/share/vm/adlc/output_c.cpp src/share/vm/code/nmethod.cpp src/share/vm/gc_implementation/g1/heapRegionSets.cpp src/share/vm/gc_implementation/g1/heapRegionSets.hpp src/share/vm/memory/metaspace.hpp src/share/vm/memory/universe.cpp src/share/vm/oops/instanceKlass.cpp src/share/vm/oops/methodData.cpp src/share/vm/oops/methodData.hpp src/share/vm/opto/c2_globals.hpp src/share/vm/opto/classes.hpp src/share/vm/opto/compile.cpp src/share/vm/opto/compile.hpp src/share/vm/opto/graphKit.cpp src/share/vm/opto/graphKit.hpp src/share/vm/opto/lcm.cpp src/share/vm/opto/library_call.cpp src/share/vm/opto/locknode.hpp src/share/vm/opto/loopTransform.cpp src/share/vm/opto/machnode.hpp src/share/vm/opto/macro.cpp src/share/vm/opto/matcher.cpp src/share/vm/opto/matcher.hpp src/share/vm/opto/node.hpp src/share/vm/opto/parse.hpp src/share/vm/opto/parse1.cpp src/share/vm/opto/runtime.cpp src/share/vm/opto/type.cpp src/share/vm/opto/type.hpp src/share/vm/prims/unsafe.cpp src/share/vm/runtime/arguments.cpp src/share/vm/runtime/deoptimization.cpp src/share/vm/runtime/globals.hpp src/share/vm/runtime/mutexLocker.hpp src/share/vm/runtime/os.hpp src/share/vm/runtime/thread.cpp src/share/vm/runtime/vmStructs.cpp src/share/vm/utilities/globalDefinitions.hpp
diffstat 281 files changed, 13250 insertions(+), 4478 deletions(-)
--- a/.hgtags	Tue Mar 25 12:54:21 2014 -0700
+++ b/.hgtags	Tue Mar 25 17:07:36 2014 -0700
@@ -432,3 +432,8 @@
 ecf3678d5736a645aea893b525a9eb5fa1a8e072 hs25.20-b04
 51e1bb81df8680bd237630323de5e0704fb25607 jdk8u20-b03
 54436d3b2a915ff50a8d6b34f61d5afb45be7bb6 hs25.20-b05
+d4e18f0633c662588cc0875be7759721c7d85af4 jdk8u20-b04
+57eb3e69397e9d5818c5fdaef65b47d9b03f7f88 jdk8u20-b05
+804f89b6ff46728d60a69e9a338e63f362f7ac68 hs25.20-b06
+c3d92e04873788275eeebec6bcd2948cdbd143a7 jdk8u20-b06
+39eae002499704438142e78f5e0e24d46d0b266f hs25.20-b07
--- a/agent/src/share/classes/sun/jvm/hotspot/gc_implementation/g1/G1CollectedHeap.java	Tue Mar 25 12:54:21 2014 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/gc_implementation/g1/G1CollectedHeap.java	Tue Mar 25 17:07:36 2014 -0700
@@ -51,9 +51,9 @@
     static private CIntegerField summaryBytesUsedField;
     // G1MonitoringSupport* _g1mm;
     static private AddressField g1mmField;
-    // MasterOldRegionSet _old_set;
+    // HeapRegionSet _old_set;
     static private long oldSetFieldOffset;
-    // MasterHumongousRegionSet _humongous_set;
+    // HeapRegionSet _humongous_set;
     static private long humongousSetFieldOffset;
 
     static {
--- a/agent/src/share/classes/sun/jvm/hotspot/gc_implementation/g1/HeapRegionSetBase.java	Tue Mar 25 12:54:21 2014 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/gc_implementation/g1/HeapRegionSetBase.java	Tue Mar 25 17:07:36 2014 -0700
@@ -40,12 +40,8 @@
 // Mirror class for HeapRegionSetBase. Represents a group of regions.
 
 public class HeapRegionSetBase extends VMObject {
-    // uint _length;
-    static private CIntegerField lengthField;
-    // uint _region_num;
-    static private CIntegerField regionNumField;
-    // size_t _total_used_bytes;
-    static private CIntegerField totalUsedBytesField;
+
+    static private long countField;
 
     static {
         VM.registerVMInitializedObserver(new Observer() {
@@ -58,21 +54,13 @@
     static private synchronized void initialize(TypeDataBase db) {
         Type type = db.lookupType("HeapRegionSetBase");
 
-        lengthField         = type.getCIntegerField("_length");
-        regionNumField      = type.getCIntegerField("_region_num");
-        totalUsedBytesField = type.getCIntegerField("_total_used_bytes");
+        countField = type.getField("_count").getOffset();
     }
 
-    public long length() {
-        return lengthField.getValue(addr);
-    }
 
-    public long regionNum() {
-        return regionNumField.getValue(addr);
-    }
-
-    public long totalUsedBytes() {
-        return totalUsedBytesField.getValue(addr);
+    public HeapRegionSetCount count() {
+        Address countFieldAddr = addr.addOffsetTo(countField);
+        return (HeapRegionSetCount) VMObjectFactory.newObject(HeapRegionSetCount.class, countFieldAddr);
     }
 
     public HeapRegionSetBase(Address addr) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/gc_implementation/g1/HeapRegionSetCount.java	Tue Mar 25 17:07:36 2014 -0700
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.gc_implementation.g1;
+
+import java.util.Iterator;
+import java.util.Observable;
+import java.util.Observer;
+
+import sun.jvm.hotspot.debugger.Address;
+import sun.jvm.hotspot.runtime.VM;
+import sun.jvm.hotspot.runtime.VMObject;
+import sun.jvm.hotspot.runtime.VMObjectFactory;
+import sun.jvm.hotspot.types.AddressField;
+import sun.jvm.hotspot.types.CIntegerField;
+import sun.jvm.hotspot.types.Type;
+import sun.jvm.hotspot.types.TypeDataBase;
+
+// Mirror class for HeapRegionSetCount. Represents a group of regions.
+
+public class HeapRegionSetCount extends VMObject {
+
+    static private CIntegerField lengthField;
+    static private CIntegerField capacityField;
+
+    static {
+        VM.registerVMInitializedObserver(new Observer() {
+                public void update(Observable o, Object data) {
+                    initialize(VM.getVM().getTypeDataBase());
+                }
+            });
+    }
+
+    static private synchronized void initialize(TypeDataBase db) {
+        Type type = db.lookupType("HeapRegionSetCount");
+
+        lengthField   = type.getCIntegerField("_length");
+        capacityField = type.getCIntegerField("_capacity");
+    }
+
+    public long length() {
+        return lengthField.getValue(addr);
+    }
+
+    public long capacity() {
+        return capacityField.getValue(addr);
+    }
+
+    public HeapRegionSetCount(Address addr) {
+        super(addr);
+    }
+}
--- a/agent/src/share/classes/sun/jvm/hotspot/tools/HeapSummary.java	Tue Mar 25 12:54:21 2014 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/tools/HeapSummary.java	Tue Mar 25 17:07:36 2014 -0700
@@ -114,7 +114,8 @@
              long survivorRegionNum = g1mm.survivorRegionNum();
              HeapRegionSetBase oldSet = g1h.oldSet();
              HeapRegionSetBase humongousSet = g1h.humongousSet();
-             long oldRegionNum = oldSet.regionNum() + humongousSet.regionNum();
+             long oldRegionNum = oldSet.count().length()
+                          + humongousSet.count().capacity() / HeapRegion.grainBytes();
              printG1Space("G1 Heap:", g1h.n_regions(),
                           g1h.used(), g1h.capacity());
              System.out.println("G1 Young Generation:");
--- a/make/excludeSrc.make	Tue Mar 25 12:54:21 2014 -0700
+++ b/make/excludeSrc.make	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -87,9 +87,10 @@
 	g1BlockOffsetTable.cpp g1CardCounts.cpp g1CollectedHeap.cpp g1CollectorPolicy.cpp \
 	g1ErgoVerbose.cpp g1GCPhaseTimes.cpp g1HRPrinter.cpp g1HotCardCache.cpp g1Log.cpp \
 	g1MMUTracker.cpp g1MarkSweep.cpp g1MemoryPool.cpp g1MonitoringSupport.cpp g1OopClosures.cpp \
-	g1RemSet.cpp g1RemSetSummary.cpp g1SATBCardTableModRefBS.cpp g1_globals.cpp heapRegion.cpp \
+	g1RemSet.cpp g1RemSetSummary.cpp g1SATBCardTableModRefBS.cpp g1StringDedup.cpp g1StringDedupStat.cpp \
+	g1StringDedupTable.cpp g1StringDedupThread.cpp g1StringDedupQueue.cpp g1_globals.cpp heapRegion.cpp \
 	g1BiasedArray.cpp heapRegionRemSet.cpp heapRegionSeq.cpp heapRegionSet.cpp heapRegionSets.cpp \
-	ptrQueue.cpp satbQueue.cpp sparsePRT.cpp survRateGroup.cpp vm_operations_g1.cpp \
+	ptrQueue.cpp satbQueue.cpp sparsePRT.cpp survRateGroup.cpp vm_operations_g1.cpp g1CodeCacheRemSet.cpp \
 	adjoiningGenerations.cpp adjoiningVirtualSpaces.cpp asPSOldGen.cpp asPSYoungGen.cpp \
 	cardTableExtension.cpp gcTaskManager.cpp gcTaskThread.cpp objectStartArray.cpp \
 	parallelScavengeHeap.cpp parMarkBitMap.cpp pcTasks.cpp psAdaptiveSizePolicy.cpp \
--- a/make/hotspot_version	Tue Mar 25 12:54:21 2014 -0700
+++ b/make/hotspot_version	Tue Mar 25 17:07:36 2014 -0700
@@ -35,7 +35,7 @@
 
 HS_MAJOR_VER=25
 HS_MINOR_VER=20
-HS_BUILD_NUMBER=05
+HS_BUILD_NUMBER=08
 
 JDK_MAJOR_VER=1
 JDK_MINOR_VER=8
--- a/src/cpu/sparc/vm/sparc.ad	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/cpu/sparc/vm/sparc.ad	Tue Mar 25 17:07:36 2014 -0700
@@ -2071,19 +2071,6 @@
   return L7_REGP_mask();
 }
 
-const RegMask Matcher::mathExactI_result_proj_mask() {
-  return G1_REGI_mask();
-}
-
-const RegMask Matcher::mathExactL_result_proj_mask() {
-  return G1_REGL_mask();
-}
-
-const RegMask Matcher::mathExactI_flags_proj_mask() {
-  return INT_FLAGS_mask();
-}
-
-
 %}
 
 
--- a/src/cpu/x86/vm/assembler_x86.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1089,6 +1089,21 @@
   emit_arith(0x23, 0xC0, dst, src);
 }
 
+void Assembler::andnl(Register dst, Register src1, Register src2) {
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode(dst, src1, src2);
+  emit_int8((unsigned char)0xF2);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::andnl(Register dst, Register src1, Address src2) {
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38(dst, src1, src2);
+  emit_int8((unsigned char)0xF2);
+  emit_operand(dst, src2);
+}
+
 void Assembler::bsfl(Register dst, Register src) {
   int encode = prefix_and_encode(dst->encoding(), src->encoding());
   emit_int8(0x0F);
@@ -1097,7 +1112,6 @@
 }
 
 void Assembler::bsrl(Register dst, Register src) {
-  assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
   int encode = prefix_and_encode(dst->encoding(), src->encoding());
   emit_int8(0x0F);
   emit_int8((unsigned char)0xBD);
@@ -1110,6 +1124,51 @@
   emit_int8((unsigned char)(0xC8 | encode));
 }
 
+void Assembler::blsil(Register dst, Register src) {
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode(rbx, dst, src);
+  emit_int8((unsigned char)0xF3);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::blsil(Register dst, Address src) {
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38(rbx, dst, src);
+  emit_int8((unsigned char)0xF3);
+  emit_operand(rbx, src);
+}
+
+void Assembler::blsmskl(Register dst, Register src) {
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode(rdx, dst, src);
+  emit_int8((unsigned char)0xF3);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::blsmskl(Register dst, Address src) {
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38(rdx, dst, src);
+  emit_int8((unsigned char)0xF3);
+  emit_operand(rdx, src);
+}
+
+void Assembler::blsrl(Register dst, Register src) {
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode(rcx, dst, src);
+  emit_int8((unsigned char)0xF3);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::blsrl(Register dst, Address src) {
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38(rcx, dst, src);
+  emit_int8((unsigned char)0xF3);
+  emit_operand(rcx, src);
+}
+
 void Assembler::call(Label& L, relocInfo::relocType rtype) {
   // suspect disp32 is always good
   int operand = LP64_ONLY(disp32_operand) NOT_LP64(imm_operand);
@@ -2283,6 +2342,11 @@
   emit_int8(imm8);
 }
 
+void Assembler::pause() {
+  emit_int8((unsigned char)0xF3);
+  emit_int8((unsigned char)0x90);
+}
+
 void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
   InstructionMark im(this);
@@ -2607,6 +2671,11 @@
   }
 }
 
+void Assembler::rdtsc() {
+  emit_int8((unsigned char)0x0F);
+  emit_int8((unsigned char)0x31);
+}
+
 // copies data from [esi] to [edi] using rcx pointer sized words
 // generic
 void Assembler::rep_mov() {
@@ -2878,6 +2947,24 @@
   emit_operand(dst, src);
 }
 
+void Assembler::tzcntl(Register dst, Register src) {
+  assert(VM_Version::supports_bmi1(), "tzcnt instruction not supported");
+  emit_int8((unsigned char)0xF3);
+  int encode = prefix_and_encode(dst->encoding(), src->encoding());
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xBC);
+  emit_int8((unsigned char)0xC0 | encode);
+}
+
+void Assembler::tzcntq(Register dst, Register src) {
+  assert(VM_Version::supports_bmi1(), "tzcnt instruction not supported");
+  emit_int8((unsigned char)0xF3);
+  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xBC);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::ucomisd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
@@ -2898,6 +2985,11 @@
   emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE);
 }
 
+void Assembler::xabort(int8_t imm8) {
+  emit_int8((unsigned char)0xC6);
+  emit_int8((unsigned char)0xF8);
+  emit_int8((unsigned char)(imm8 & 0xFF));
+}
 
 void Assembler::xaddl(Address dst, Register src) {
   InstructionMark im(this);
@@ -2907,6 +2999,24 @@
   emit_operand(src, dst);
 }
 
+void Assembler::xbegin(Label& abort, relocInfo::relocType rtype) {
+  InstructionMark im(this);
+  relocate(rtype);
+  if (abort.is_bound()) {
+    address entry = target(abort);
+    assert(entry != NULL, "abort entry NULL");
+    intptr_t offset = entry - pc();
+    emit_int8((unsigned char)0xC7);
+    emit_int8((unsigned char)0xF8);
+    emit_int32(offset - 6); // 2 opcode + 4 address
+  } else {
+    abort.add_patch_at(code(), locator());
+    emit_int8((unsigned char)0xC7);
+    emit_int8((unsigned char)0xF8);
+    emit_int32(0);
+  }
+}
+
 void Assembler::xchgl(Register dst, Address src) { // xchg
   InstructionMark im(this);
   prefix(src, dst);
@@ -2920,6 +3030,12 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::xend() {
+  emit_int8((unsigned char)0x0F);
+  emit_int8((unsigned char)0x01);
+  emit_int8((unsigned char)0xD5);
+}
+
 void Assembler::xgetbv() {
   emit_int8(0x0F);
   emit_int8(0x01);
@@ -4837,6 +4953,21 @@
   emit_arith(0x23, 0xC0, dst, src);
 }
 
+void Assembler::andnq(Register dst, Register src1, Register src2) {
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode_q(dst, src1, src2);
+  emit_int8((unsigned char)0xF2);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::andnq(Register dst, Register src1, Address src2) {
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38_q(dst, src1, src2);
+  emit_int8((unsigned char)0xF2);
+  emit_operand(dst, src2);
+}
+
 void Assembler::bsfq(Register dst, Register src) {
   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
   emit_int8(0x0F);
@@ -4845,7 +4976,6 @@
 }
 
 void Assembler::bsrq(Register dst, Register src) {
-  assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
   emit_int8(0x0F);
   emit_int8((unsigned char)0xBD);
@@ -4858,6 +4988,51 @@
   emit_int8((unsigned char)(0xC8 | encode));
 }
 
+void Assembler::blsiq(Register dst, Register src) {
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode_q(rbx, dst, src);
+  emit_int8((unsigned char)0xF3);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::blsiq(Register dst, Address src) {
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38_q(rbx, dst, src);
+  emit_int8((unsigned char)0xF3);
+  emit_operand(rbx, src);
+}
+
+void Assembler::blsmskq(Register dst, Register src) {
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode_q(rdx, dst, src);
+  emit_int8((unsigned char)0xF3);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::blsmskq(Register dst, Address src) {
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38_q(rdx, dst, src);
+  emit_int8((unsigned char)0xF3);
+  emit_operand(rdx, src);
+}
+
+void Assembler::blsrq(Register dst, Register src) {
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode_q(rcx, dst, src);
+  emit_int8((unsigned char)0xF3);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::blsrq(Register dst, Address src) {
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38_q(rcx, dst, src);
+  emit_int8((unsigned char)0xF3);
+  emit_operand(rcx, src);
+}
+
 void Assembler::cdqq() {
   prefix(REX_W);
   emit_int8((unsigned char)0x99);
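The emitters added above for andn, blsi, blsmsk, blsr and tzcnt only produce the instruction encodings; for reference, here is a hedged, self-contained C++ sketch of the standard BMI1 bit-level semantics they encode (illustrative code, not part of this changeset or of HotSpot):

#include <cstdio>

// Reference semantics of the BMI1 operations whose encodings are added above.
// These mirror the architectural definitions; they are illustrative only.
static unsigned andn_ref  (unsigned a, unsigned b) { return ~a & b; }        // ANDN: b with a's set bits cleared
static unsigned blsi_ref  (unsigned x)             { return x & (0u - x); }  // BLSI: isolate lowest set bit
static unsigned blsmsk_ref(unsigned x)             { return x ^ (x - 1u); }  // BLSMSK: mask up to and including lowest set bit
static unsigned blsr_ref  (unsigned x)             { return x & (x - 1u); }  // BLSR: clear lowest set bit
static unsigned tzcnt_ref (unsigned x) {                                     // TZCNT: trailing zero count (32 when x == 0)
  if (x == 0) return 32;
  unsigned n = 0;
  while ((x & 1u) == 0) { x >>= 1; ++n; }
  return n;
}

int main() {
  unsigned x = 0xB0;  // 1011 0000
  printf("blsi=0x%x blsmsk=0x%x blsr=0x%x tzcnt=%u andn=0x%x\n",
         blsi_ref(x), blsmsk_ref(x), blsr_ref(x), tzcnt_ref(x), andn_ref(0xF0u, x));
  return 0;
}

The UseBMI1Instructions and UseCountTrailingZerosInstruction flags introduced in globals_x86.hpp presumably gate whether the matcher is allowed to select these encodings for the equivalent bit-twiddling patterns.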
--- a/src/cpu/x86/vm/assembler_x86.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -590,10 +590,35 @@
     vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector256);
   }
 
+  void vex_prefix_0F38(Register dst, Register nds, Address src) {
+    bool vex_w = false;
+    bool vector256 = false;
+    vex_prefix(src, nds->encoding(), dst->encoding(),
+               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
+  }
+
+  void vex_prefix_0F38_q(Register dst, Register nds, Address src) {
+    bool vex_w = true;
+    bool vector256 = false;
+    vex_prefix(src, nds->encoding(), dst->encoding(),
+               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
+  }
   int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
                              VexSimdPrefix pre, VexOpcode opc,
                              bool vex_w, bool vector256);
 
+  int  vex_prefix_0F38_and_encode(Register dst, Register nds, Register src) {
+    bool vex_w = false;
+    bool vector256 = false;
+    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
+                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
+  }
+  int  vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src) {
+    bool vex_w = true;
+    bool vector256 = false;
+    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
+                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
+  }
   int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
                              VexSimdPrefix pre, bool vector256 = false,
                              VexOpcode opc = VEX_OPCODE_0F) {
@@ -897,6 +922,27 @@
   void andq(Register dst, Address src);
   void andq(Register dst, Register src);
 
+  // BMI instructions
+  void andnl(Register dst, Register src1, Register src2);
+  void andnl(Register dst, Register src1, Address src2);
+  void andnq(Register dst, Register src1, Register src2);
+  void andnq(Register dst, Register src1, Address src2);
+
+  void blsil(Register dst, Register src);
+  void blsil(Register dst, Address src);
+  void blsiq(Register dst, Register src);
+  void blsiq(Register dst, Address src);
+
+  void blsmskl(Register dst, Register src);
+  void blsmskl(Register dst, Address src);
+  void blsmskq(Register dst, Register src);
+  void blsmskq(Register dst, Address src);
+
+  void blsrl(Register dst, Register src);
+  void blsrl(Register dst, Address src);
+  void blsrq(Register dst, Register src);
+  void blsrq(Register dst, Address src);
+
   void bsfl(Register dst, Register src);
   void bsrl(Register dst, Register src);
 
@@ -1405,6 +1451,8 @@
   // Pemutation of 64bit words
   void vpermq(XMMRegister dst, XMMRegister src, int imm8, bool vector256);
 
+  void pause();
+
   // SSE4.2 string instructions
   void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
   void pcmpestri(XMMRegister xmm1, Address src, int imm8);
@@ -1489,6 +1537,8 @@
 
   void rclq(Register dst, int imm8);
 
+  void rdtsc();
+
   void ret(int imm16);
 
   void sahf();
@@ -1574,6 +1624,9 @@
   void testq(Register dst, int32_t imm32);
   void testq(Register dst, Register src);
 
+  // BMI - count trailing zeros
+  void tzcntl(Register dst, Register src);
+  void tzcntq(Register dst, Register src);
 
   // Unordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
   void ucomisd(XMMRegister dst, Address src);
@@ -1583,16 +1636,22 @@
   void ucomiss(XMMRegister dst, Address src);
   void ucomiss(XMMRegister dst, XMMRegister src);
 
+  void xabort(int8_t imm8);
+
   void xaddl(Address dst, Register src);
 
   void xaddq(Address dst, Register src);
 
+  void xbegin(Label& abort, relocInfo::relocType rtype = relocInfo::none);
+
   void xchgl(Register reg, Address adr);
   void xchgl(Register dst, Register src);
 
   void xchgq(Register reg, Address adr);
   void xchgq(Register dst, Register src);
 
+  void xend();
+
   // Get Value of Extended Control Register
   void xgetbv();
 
--- a/src/cpu/x86/vm/globals_x86.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/cpu/x86/vm/globals_x86.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -129,11 +129,53 @@
   product(bool, UseFastStosb, false,                                        \
           "Use fast-string operation for zeroing: rep stosb")               \
                                                                             \
+  /* Use Restricted Transactional Memory for lock eliding */                \
+  experimental(bool, UseRTMLocking, false,                                  \
+          "Enable RTM lock eliding for inflated locks in compiled code")    \
+                                                                            \
+  experimental(bool, UseRTMForStackLocks, false,                            \
+          "Enable RTM lock eliding for stack locks in compiled code")       \
+                                                                            \
+  experimental(bool, UseRTMDeopt, false,                                    \
+          "Perform deopt and recompilation based on RTM abort ratio")       \
+                                                                            \
+  experimental(uintx, RTMRetryCount, 5,                                     \
+          "Number of RTM retries on lock abort or busy")                    \
+                                                                            \
+  experimental(intx, RTMSpinLoopCount, 100,                                 \
+          "Spin count for lock to become free before RTM retry")            \
+                                                                            \
+  experimental(intx, RTMAbortThreshold, 1000,                               \
+          "Calculate abort ratio after this number of aborts")              \
+                                                                            \
+  experimental(intx, RTMLockingThreshold, 10000,                            \
+          "Lock count at which to do RTM lock eliding without "             \
+          "abort ratio calculation")                                        \
+                                                                            \
+  experimental(intx, RTMAbortRatio, 50,                                     \
+          "Lock abort ratio at which to stop use RTM lock eliding")         \
+                                                                            \
+  experimental(intx, RTMTotalCountIncrRate, 64,                             \
+          "Increment total RTM attempted lock count once every n times")    \
+                                                                            \
+  experimental(intx, RTMLockingCalculationDelay, 0,                         \
+          "Number of milliseconds to wait before start calculating aborts " \
+          "for RTM locking")                                                \
+                                                                            \
+  experimental(bool, UseRTMXendForLockBusy, false,                          \
+          "Use RTM Xend instead of Xabort when lock busy")                  \
+                                                                            \
   /* assembler */                                                           \
   product(bool, Use486InstrsOnly, false,                                    \
           "Use 80486 Compliant instruction subset")                         \
                                                                             \
   product(bool, UseCountLeadingZerosInstruction, false,                     \
           "Use count leading zeros instruction")                            \
+                                                                            \
+  product(bool, UseCountTrailingZerosInstruction, false,                    \
+          "Use count trailing zeros instruction")                           \
+                                                                            \
+  product(bool, UseBMI1Instructions, false,                                 \
+          "Use BMI instructions")
 
 #endif // CPU_X86_VM_GLOBALS_X86_HPP
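Since the RTM options above are declared with experimental(...), a product VM only accepts them once experimental options are unlocked; an invocation along the lines of java -XX:+UnlockExperimentalVMOptions -XX:+UseRTMLocking ... (an illustrative command line, not taken from this changeset) is the expected way to enable the feature, with the remaining RTM* flags tuning retry, spin and abort-ratio behaviour.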
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -98,217 +98,6 @@
   return Address::make_array(adr);
 }
 
-int MacroAssembler::biased_locking_enter(Register lock_reg,
-                                         Register obj_reg,
-                                         Register swap_reg,
-                                         Register tmp_reg,
-                                         bool swap_reg_contains_mark,
-                                         Label& done,
-                                         Label* slow_case,
-                                         BiasedLockingCounters* counters) {
-  assert(UseBiasedLocking, "why call this otherwise?");
-  assert(swap_reg == rax, "swap_reg must be rax, for cmpxchg");
-  assert_different_registers(lock_reg, obj_reg, swap_reg);
-
-  if (PrintBiasedLockingStatistics && counters == NULL)
-    counters = BiasedLocking::counters();
-
-  bool need_tmp_reg = false;
-  if (tmp_reg == noreg) {
-    need_tmp_reg = true;
-    tmp_reg = lock_reg;
-  } else {
-    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
-  }
-  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
-  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
-  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
-  Address saved_mark_addr(lock_reg, 0);
-
-  // Biased locking
-  // See whether the lock is currently biased toward our thread and
-  // whether the epoch is still valid
-  // Note that the runtime guarantees sufficient alignment of JavaThread
-  // pointers to allow age to be placed into low bits
-  // First check to see whether biasing is even enabled for this object
-  Label cas_label;
-  int null_check_offset = -1;
-  if (!swap_reg_contains_mark) {
-    null_check_offset = offset();
-    movl(swap_reg, mark_addr);
-  }
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  movl(tmp_reg, swap_reg);
-  andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
-  cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  jcc(Assembler::notEqual, cas_label);
-  // The bias pattern is present in the object's header. Need to check
-  // whether the bias owner and the epoch are both still current.
-  // Note that because there is no current thread register on x86 we
-  // need to store off the mark word we read out of the object to
-  // avoid reloading it and needing to recheck invariants below. This
-  // store is unfortunate but it makes the overall code shorter and
-  // simpler.
-  movl(saved_mark_addr, swap_reg);
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  get_thread(tmp_reg);
-  xorl(swap_reg, tmp_reg);
-  if (swap_reg_contains_mark) {
-    null_check_offset = offset();
-  }
-  movl(tmp_reg, klass_addr);
-  xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset()));
-  andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address)counters->biased_lock_entry_count_addr()));
-  }
-  jcc(Assembler::equal, done);
-
-  Label try_revoke_bias;
-  Label try_rebias;
-
-  // At this point we know that the header has the bias pattern and
-  // that we are not the bias owner in the current epoch. We need to
-  // figure out more details about the state of the header in order to
-  // know what operations can be legally performed on the object's
-  // header.
-
-  // If the low three bits in the xor result aren't clear, that means
-  // the prototype header is no longer biased and we have to revoke
-  // the bias on this object.
-  testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
-  jcc(Assembler::notZero, try_revoke_bias);
-
-  // Biasing is still enabled for this data type. See whether the
-  // epoch of the current bias is still valid, meaning that the epoch
-  // bits of the mark word are equal to the epoch bits of the
-  // prototype header. (Note that the prototype header's epoch bits
-  // only change at a safepoint.) If not, attempt to rebias the object
-  // toward the current thread. Note that we must be absolutely sure
-  // that the current epoch is invalid in order to do this because
-  // otherwise the manipulations it performs on the mark word are
-  // illegal.
-  testl(swap_reg, markOopDesc::epoch_mask_in_place);
-  jcc(Assembler::notZero, try_rebias);
-
-  // The epoch of the current bias is still valid but we know nothing
-  // about the owner; it might be set or it might be clear. Try to
-  // acquire the bias of the object using an atomic operation. If this
-  // fails we will go in to the runtime to revoke the object's bias.
-  // Note that we first construct the presumed unbiased header so we
-  // don't accidentally blow away another thread's valid bias.
-  movl(swap_reg, saved_mark_addr);
-  andl(swap_reg,
-       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  get_thread(tmp_reg);
-  orl(tmp_reg, swap_reg);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  // If the biasing toward our thread failed, this means that
-  // another thread succeeded in biasing it toward itself and we
-  // need to revoke that bias. The revocation will occur in the
-  // interpreter runtime in the slow case.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr()));
-  }
-  if (slow_case != NULL) {
-    jcc(Assembler::notZero, *slow_case);
-  }
-  jmp(done);
-
-  bind(try_rebias);
-  // At this point we know the epoch has expired, meaning that the
-  // current "bias owner", if any, is actually invalid. Under these
-  // circumstances _only_, we are allowed to use the current header's
-  // value as the comparison value when doing the cas to acquire the
-  // bias in the current epoch. In other words, we allow transfer of
-  // the bias from one thread to another directly in this situation.
-  //
-  // FIXME: due to a lack of registers we currently blow away the age
-  // bits in this situation. Should attempt to preserve them.
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  get_thread(tmp_reg);
-  movl(swap_reg, klass_addr);
-  orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset()));
-  movl(swap_reg, saved_mark_addr);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  // If the biasing toward our thread failed, then another thread
-  // succeeded in biasing it toward itself and we need to revoke that
-  // bias. The revocation will occur in the runtime in the slow case.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address)counters->rebiased_lock_entry_count_addr()));
-  }
-  if (slow_case != NULL) {
-    jcc(Assembler::notZero, *slow_case);
-  }
-  jmp(done);
-
-  bind(try_revoke_bias);
-  // The prototype mark in the klass doesn't have the bias bit set any
-  // more, indicating that objects of this data type are not supposed
-  // to be biased any more. We are going to try to reset the mark of
-  // this object to the prototype value and fall through to the
-  // CAS-based locking scheme. Note that if our CAS fails, it means
-  // that another thread raced us for the privilege of revoking the
-  // bias of this particular object, so it's okay to continue in the
-  // normal locking code.
-  //
-  // FIXME: due to a lack of registers we currently blow away the age
-  // bits in this situation. Should attempt to preserve them.
-  movl(swap_reg, saved_mark_addr);
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  movl(tmp_reg, klass_addr);
-  movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  // Fall through to the normal CAS-based lock, because no matter what
-  // the result of the above CAS, some thread must have succeeded in
-  // removing the bias bit from the object's header.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address)counters->revoked_lock_entry_count_addr()));
-  }
-
-  bind(cas_label);
-
-  return null_check_offset;
-}
 void MacroAssembler::call_VM_leaf_base(address entry_point,
                                        int number_of_arguments) {
   call(RuntimeAddress(entry_point));
@@ -512,7 +301,9 @@
   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 }
 
-void MacroAssembler::movptr(Register dst, AddressLiteral src) {
+void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
+  // scratch register is not used,
+  // it is defined to match parameters of 64-bit version of this method.
   if (src.is_lval()) {
     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
   } else {
@@ -726,165 +517,6 @@
   return array;
 }
 
-int MacroAssembler::biased_locking_enter(Register lock_reg,
-                                         Register obj_reg,
-                                         Register swap_reg,
-                                         Register tmp_reg,
-                                         bool swap_reg_contains_mark,
-                                         Label& done,
-                                         Label* slow_case,
-                                         BiasedLockingCounters* counters) {
-  assert(UseBiasedLocking, "why call this otherwise?");
-  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
-  assert(tmp_reg != noreg, "tmp_reg must be supplied");
-  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
-  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
-  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
-  Address saved_mark_addr(lock_reg, 0);
-
-  if (PrintBiasedLockingStatistics && counters == NULL)
-    counters = BiasedLocking::counters();
-
-  // Biased locking
-  // See whether the lock is currently biased toward our thread and
-  // whether the epoch is still valid
-  // Note that the runtime guarantees sufficient alignment of JavaThread
-  // pointers to allow age to be placed into low bits
-  // First check to see whether biasing is even enabled for this object
-  Label cas_label;
-  int null_check_offset = -1;
-  if (!swap_reg_contains_mark) {
-    null_check_offset = offset();
-    movq(swap_reg, mark_addr);
-  }
-  movq(tmp_reg, swap_reg);
-  andq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
-  cmpq(tmp_reg, markOopDesc::biased_lock_pattern);
-  jcc(Assembler::notEqual, cas_label);
-  // The bias pattern is present in the object's header. Need to check
-  // whether the bias owner and the epoch are both still current.
-  load_prototype_header(tmp_reg, obj_reg);
-  orq(tmp_reg, r15_thread);
-  xorq(tmp_reg, swap_reg);
-  andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place));
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
-  }
-  jcc(Assembler::equal, done);
-
-  Label try_revoke_bias;
-  Label try_rebias;
-
-  // At this point we know that the header has the bias pattern and
-  // that we are not the bias owner in the current epoch. We need to
-  // figure out more details about the state of the header in order to
-  // know what operations can be legally performed on the object's
-  // header.
-
-  // If the low three bits in the xor result aren't clear, that means
-  // the prototype header is no longer biased and we have to revoke
-  // the bias on this object.
-  testq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
-  jcc(Assembler::notZero, try_revoke_bias);
-
-  // Biasing is still enabled for this data type. See whether the
-  // epoch of the current bias is still valid, meaning that the epoch
-  // bits of the mark word are equal to the epoch bits of the
-  // prototype header. (Note that the prototype header's epoch bits
-  // only change at a safepoint.) If not, attempt to rebias the object
-  // toward the current thread. Note that we must be absolutely sure
-  // that the current epoch is invalid in order to do this because
-  // otherwise the manipulations it performs on the mark word are
-  // illegal.
-  testq(tmp_reg, markOopDesc::epoch_mask_in_place);
-  jcc(Assembler::notZero, try_rebias);
-
-  // The epoch of the current bias is still valid but we know nothing
-  // about the owner; it might be set or it might be clear. Try to
-  // acquire the bias of the object using an atomic operation. If this
-  // fails we will go in to the runtime to revoke the object's bias.
-  // Note that we first construct the presumed unbiased header so we
-  // don't accidentally blow away another thread's valid bias.
-  andq(swap_reg,
-       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
-  movq(tmp_reg, swap_reg);
-  orq(tmp_reg, r15_thread);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgq(tmp_reg, Address(obj_reg, 0));
-  // If the biasing toward our thread failed, this means that
-  // another thread succeeded in biasing it toward itself and we
-  // need to revoke that bias. The revocation will occur in the
-  // interpreter runtime in the slow case.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
-  }
-  if (slow_case != NULL) {
-    jcc(Assembler::notZero, *slow_case);
-  }
-  jmp(done);
-
-  bind(try_rebias);
-  // At this point we know the epoch has expired, meaning that the
-  // current "bias owner", if any, is actually invalid. Under these
-  // circumstances _only_, we are allowed to use the current header's
-  // value as the comparison value when doing the cas to acquire the
-  // bias in the current epoch. In other words, we allow transfer of
-  // the bias from one thread to another directly in this situation.
-  //
-  // FIXME: due to a lack of registers we currently blow away the age
-  // bits in this situation. Should attempt to preserve them.
-  load_prototype_header(tmp_reg, obj_reg);
-  orq(tmp_reg, r15_thread);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgq(tmp_reg, Address(obj_reg, 0));
-  // If the biasing toward our thread failed, then another thread
-  // succeeded in biasing it toward itself and we need to revoke that
-  // bias. The revocation will occur in the runtime in the slow case.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
-  }
-  if (slow_case != NULL) {
-    jcc(Assembler::notZero, *slow_case);
-  }
-  jmp(done);
-
-  bind(try_revoke_bias);
-  // The prototype mark in the klass doesn't have the bias bit set any
-  // more, indicating that objects of this data type are not supposed
-  // to be biased any more. We are going to try to reset the mark of
-  // this object to the prototype value and fall through to the
-  // CAS-based locking scheme. Note that if our CAS fails, it means
-  // that another thread raced us for the privilege of revoking the
-  // bias of this particular object, so it's okay to continue in the
-  // normal locking code.
-  //
-  // FIXME: due to a lack of registers we currently blow away the age
-  // bits in this situation. Should attempt to preserve them.
-  load_prototype_header(tmp_reg, obj_reg);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgq(tmp_reg, Address(obj_reg, 0));
-  // Fall through to the normal CAS-based lock, because no matter what
-  // the result of the above CAS, some thread must have succeeded in
-  // removing the bias bit from the object's header.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
-  }
-
-  bind(cas_label);
-
-  return null_check_offset;
-}
-
 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
   Label L, E;
 
@@ -983,6 +615,15 @@
   /* else */      { subq(dst, value)       ; return; }
 }
 
+void MacroAssembler::incrementq(AddressLiteral dst) {
+  if (reachable(dst)) {
+    incrementq(as_Address(dst));
+  } else {
+    lea(rscratch1, dst);
+    incrementq(Address(rscratch1, 0));
+  }
+}
+
 void MacroAssembler::incrementq(Register reg, int value) {
   if (value == min_jint) { addq(reg, value); return; }
   if (value <  0) { decrementq(reg, -value); return; }
@@ -1051,15 +692,15 @@
   movq(dst, rscratch1);
 }
 
-void MacroAssembler::movptr(Register dst, AddressLiteral src) {
+void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
   if (src.is_lval()) {
     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
   } else {
     if (reachable(src)) {
       movq(dst, as_Address(src));
     } else {
-      lea(rscratch1, src);
-      movq(dst, Address(rscratch1,0));
+      lea(scratch, src);
+      movq(dst, Address(scratch, 0));
     }
   }
 }
@@ -1358,13 +999,37 @@
   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
 }
 
-void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
-  pushf();
+void MacroAssembler::atomic_incl(Address counter_addr) {
   if (os::is_MP())
     lock();
   incrementl(counter_addr);
-  popf();
-}
+}
+
+void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
+  if (reachable(counter_addr)) {
+    atomic_incl(as_Address(counter_addr));
+  } else {
+    lea(scr, counter_addr);
+    atomic_incl(Address(scr, 0));
+  }
+}
+
+#ifdef _LP64
+void MacroAssembler::atomic_incq(Address counter_addr) {
+  if (os::is_MP())
+    lock();
+  incrementq(counter_addr);
+}
+
+void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
+  if (reachable(counter_addr)) {
+    atomic_incq(as_Address(counter_addr));
+  } else {
+    lea(scr, counter_addr);
+    atomic_incq(Address(scr, 0));
+  }
+}
+#endif
 
 // Writes to stack successive pages until offset reached to check for
 // stack overflow + shadow pages.  This clobbers tmp.
@@ -1393,6 +1058,234 @@
   }
 }
 
+int MacroAssembler::biased_locking_enter(Register lock_reg,
+                                         Register obj_reg,
+                                         Register swap_reg,
+                                         Register tmp_reg,
+                                         bool swap_reg_contains_mark,
+                                         Label& done,
+                                         Label* slow_case,
+                                         BiasedLockingCounters* counters) {
+  assert(UseBiasedLocking, "why call this otherwise?");
+  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
+  LP64_ONLY( assert(tmp_reg != noreg, "tmp_reg must be supplied"); )
+  bool need_tmp_reg = false;
+  if (tmp_reg == noreg) {
+    need_tmp_reg = true;
+    tmp_reg = lock_reg;
+    assert_different_registers(lock_reg, obj_reg, swap_reg);
+  } else {
+    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
+  }
+  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
+  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
+  Address saved_mark_addr(lock_reg, 0);
+
+  if (PrintBiasedLockingStatistics && counters == NULL) {
+    counters = BiasedLocking::counters();
+  }
+  // Biased locking
+  // See whether the lock is currently biased toward our thread and
+  // whether the epoch is still valid
+  // Note that the runtime guarantees sufficient alignment of JavaThread
+  // pointers to allow age to be placed into low bits
+  // First check to see whether biasing is even enabled for this object
+  Label cas_label;
+  int null_check_offset = -1;
+  if (!swap_reg_contains_mark) {
+    null_check_offset = offset();
+    movptr(swap_reg, mark_addr);
+  }
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  movptr(tmp_reg, swap_reg);
+  andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
+  cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  jcc(Assembler::notEqual, cas_label);
+  // The bias pattern is present in the object's header. Need to check
+  // whether the bias owner and the epoch are both still current.
+#ifndef _LP64
+  // Note that because there is no current thread register on x86_32 we
+  // need to store off the mark word we read out of the object to
+  // avoid reloading it and needing to recheck invariants below. This
+  // store is unfortunate but it makes the overall code shorter and
+  // simpler.
+  movptr(saved_mark_addr, swap_reg);
+#endif
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  if (swap_reg_contains_mark) {
+    null_check_offset = offset();
+  }
+  load_prototype_header(tmp_reg, obj_reg);
+#ifdef _LP64
+  orptr(tmp_reg, r15_thread);
+  xorptr(tmp_reg, swap_reg);
+  Register header_reg = tmp_reg;
+#else
+  xorptr(tmp_reg, swap_reg);
+  get_thread(swap_reg);
+  xorptr(swap_reg, tmp_reg);
+  Register header_reg = swap_reg;
+#endif
+  andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  if (counters != NULL) {
+    cond_inc32(Assembler::zero,
+               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
+  }
+  jcc(Assembler::equal, done);
+
+  Label try_revoke_bias;
+  Label try_rebias;
+
+  // At this point we know that the header has the bias pattern and
+  // that we are not the bias owner in the current epoch. We need to
+  // figure out more details about the state of the header in order to
+  // know what operations can be legally performed on the object's
+  // header.
+
+  // If the low three bits in the xor result aren't clear, that means
+  // the prototype header is no longer biased and we have to revoke
+  // the bias on this object.
+  testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
+  jccb(Assembler::notZero, try_revoke_bias);
+
+  // Biasing is still enabled for this data type. See whether the
+  // epoch of the current bias is still valid, meaning that the epoch
+  // bits of the mark word are equal to the epoch bits of the
+  // prototype header. (Note that the prototype header's epoch bits
+  // only change at a safepoint.) If not, attempt to rebias the object
+  // toward the current thread. Note that we must be absolutely sure
+  // that the current epoch is invalid in order to do this because
+  // otherwise the manipulations it performs on the mark word are
+  // illegal.
+  testptr(header_reg, markOopDesc::epoch_mask_in_place);
+  jccb(Assembler::notZero, try_rebias);
+
+  // The epoch of the current bias is still valid but we know nothing
+  // about the owner; it might be set or it might be clear. Try to
+  // acquire the bias of the object using an atomic operation. If this
+  // fails we will go in to the runtime to revoke the object's bias.
+  // Note that we first construct the presumed unbiased header so we
+  // don't accidentally blow away another thread's valid bias.
+  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
+  andptr(swap_reg,
+         markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+#ifdef _LP64
+  movptr(tmp_reg, swap_reg);
+  orptr(tmp_reg, r15_thread);
+#else
+  get_thread(tmp_reg);
+  orptr(tmp_reg, swap_reg);
+#endif
+  if (os::is_MP()) {
+    lock();
+  }
+  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  // If the biasing toward our thread failed, this means that
+  // another thread succeeded in biasing it toward itself and we
+  // need to revoke that bias. The revocation will occur in the
+  // interpreter runtime in the slow case.
+  if (counters != NULL) {
+    cond_inc32(Assembler::zero,
+               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
+  }
+  if (slow_case != NULL) {
+    jcc(Assembler::notZero, *slow_case);
+  }
+  jmp(done);
+
+  bind(try_rebias);
+  // At this point we know the epoch has expired, meaning that the
+  // current "bias owner", if any, is actually invalid. Under these
+  // circumstances _only_, we are allowed to use the current header's
+  // value as the comparison value when doing the cas to acquire the
+  // bias in the current epoch. In other words, we allow transfer of
+  // the bias from one thread to another directly in this situation.
+  //
+  // FIXME: due to a lack of registers we currently blow away the age
+  // bits in this situation. Should attempt to preserve them.
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  load_prototype_header(tmp_reg, obj_reg);
+#ifdef _LP64
+  orptr(tmp_reg, r15_thread);
+#else
+  get_thread(swap_reg);
+  orptr(tmp_reg, swap_reg);
+  movptr(swap_reg, saved_mark_addr);
+#endif
+  if (os::is_MP()) {
+    lock();
+  }
+  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  // If the biasing toward our thread failed, then another thread
+  // succeeded in biasing it toward itself and we need to revoke that
+  // bias. The revocation will occur in the runtime in the slow case.
+  if (counters != NULL) {
+    cond_inc32(Assembler::zero,
+               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
+  }
+  if (slow_case != NULL) {
+    jcc(Assembler::notZero, *slow_case);
+  }
+  jmp(done);
+
+  bind(try_revoke_bias);
+  // The prototype mark in the klass doesn't have the bias bit set any
+  // more, indicating that objects of this data type are not supposed
+  // to be biased any more. We are going to try to reset the mark of
+  // this object to the prototype value and fall through to the
+  // CAS-based locking scheme. Note that if our CAS fails, it means
+  // that another thread raced us for the privilege of revoking the
+  // bias of this particular object, so it's okay to continue in the
+  // normal locking code.
+  //
+  // FIXME: due to a lack of registers we currently blow away the age
+  // bits in this situation. Should attempt to preserve them.
+  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  load_prototype_header(tmp_reg, obj_reg);
+  if (os::is_MP()) {
+    lock();
+  }
+  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  // Fall through to the normal CAS-based lock, because no matter what
+  // the result of the above CAS, some thread must have succeeded in
+  // removing the bias bit from the object's header.
+  if (counters != NULL) {
+    cond_inc32(Assembler::zero,
+               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
+  }
+
+  bind(cas_label);
+
+  return null_check_offset;
+}
+
 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
   assert(UseBiasedLocking, "why call this otherwise?");
 
@@ -1408,6 +1301,996 @@
   jcc(Assembler::equal, done);
 }
 
+#ifdef COMPILER2
+
+#if INCLUDE_RTM_OPT
+
+// Update rtm_counters based on abort status
+// input: abort_status
+//        rtm_counters (RTMLockingCounters*)
+// flags are killed
+void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
+
+  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
+  if (PrintPreciseRTMLockingStatistics) {
+    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
+      Label check_abort;
+      testl(abort_status, (1<<i));
+      jccb(Assembler::equal, check_abort);
+      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
+      bind(check_abort);
+    }
+  }
+}
+
+// Branch if (random & (count-1) != 0), count is 2^n
+// tmp, scr and flags are killed
+void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
+  assert(tmp == rax, "");
+  assert(scr == rdx, "");
+  rdtsc(); // modifies EDX:EAX
+  andptr(tmp, count-1);
+  jccb(Assembler::notZero, brLabel);
+}
+
+// Perform abort ratio calculation, set no_rtm bit if high ratio
+// input:  rtm_counters_Reg (RTMLockingCounters* address)
+// tmpReg, rtm_counters_Reg and flags are killed
+void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
+                                                 Register rtm_counters_Reg,
+                                                 RTMLockingCounters* rtm_counters,
+                                                 Metadata* method_data) {
+  Label L_done, L_check_always_rtm1, L_check_always_rtm2;
+
+  if (RTMLockingCalculationDelay > 0) {
+    // Delay calculation
+    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
+    testptr(tmpReg, tmpReg);
+    jccb(Assembler::equal, L_done);
+  }
+  // Abort ratio calculation only if abort_count > RTMAbortThreshold
+  //   Aborted transactions = abort_count * 100
+  //   All transactions = total_count *  RTMTotalCountIncrRate
+  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
+
+  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
+  cmpptr(tmpReg, RTMAbortThreshold);
+  jccb(Assembler::below, L_check_always_rtm2);
+  imulptr(tmpReg, tmpReg, 100);
+
+  Register scrReg = rtm_counters_Reg;
+  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
+  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
+  imulptr(scrReg, scrReg, RTMAbortRatio);
+  cmpptr(tmpReg, scrReg);
+  jccb(Assembler::below, L_check_always_rtm1);
+  if (method_data != NULL) {
+    // set rtm_state to "no rtm" in MDO
+    mov_metadata(tmpReg, method_data);
+    if (os::is_MP()) {
+      lock();
+    }
+    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
+  }
+  jmpb(L_done);
+  bind(L_check_always_rtm1);
+  // Reload RTMLockingCounters* address
+  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
+  bind(L_check_always_rtm2);
+  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
+  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
+  jccb(Assembler::below, L_done);
+  if (method_data != NULL) {
+    // set rtm_state to "always rtm" in MDO
+    mov_metadata(tmpReg, method_data);
+    if (os::is_MP()) {
+      lock();
+    }
+    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
+  }
+  bind(L_done);
+}
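
For illustration, the decision implemented above can be restated as two plain C++ predicates. The flag names (RTMAbortThreshold, RTMAbortRatio, RTMTotalCountIncrRate, RTMLockingThreshold) are the ones used by the emitted code; the helper names and the uintx stand-in are descriptive only, so treat this as a sketch of the arithmetic rather than the VM implementation.

#include <stdint.h>
typedef uintptr_t uintx;   // stand-in for HotSpot's uintx

// Only evaluated once abort_count has reached RTMAbortThreshold.
// Aborted transactions are counted as abort_count * 100; all transactions are
// estimated as total_count * RTMTotalCountIncrRate, because only a sampled
// subset of transactions increments total_count.
static bool abort_ratio_too_high(uintx abort_count, uintx total_count,
                                 uintx total_incr_rate, uintx abort_ratio) {
  return abort_count * 100 >= total_count * total_incr_rate * abort_ratio;
}

// If the ratio is acceptable, the site is promoted to "always RTM" once the
// sampled total passes the (scaled) RTMLockingThreshold.
static bool promote_to_always_rtm(uintx total_count,
                                  uintx locking_threshold, uintx total_incr_rate) {
  return total_count >= locking_threshold / total_incr_rate;
}
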
+
+// Update counters and perform abort ratio calculation
+// input:  abort_status_Reg
+// rtm_counters_Reg, flags are killed
+void MacroAssembler::rtm_profiling(Register abort_status_Reg,
+                                   Register rtm_counters_Reg,
+                                   RTMLockingCounters* rtm_counters,
+                                   Metadata* method_data,
+                                   bool profile_rtm) {
+
+  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
+  // update rtm counters based on rax value at abort
+  // reads abort_status_Reg, updates flags
+  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
+  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
+  if (profile_rtm) {
+    // Save abort status because abort_status_Reg is used by following code.
+    if (RTMRetryCount > 0) {
+      push(abort_status_Reg);
+    }
+    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
+    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
+    // restore abort status
+    if (RTMRetryCount > 0) {
+      pop(abort_status_Reg);
+    }
+  }
+}
+
+// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
+// inputs: retry_count_Reg
+//       : abort_status_Reg
+// output: retry_count_Reg decremented by 1
+// flags are killed
+void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
+  Label doneRetry;
+  assert(abort_status_Reg == rax, "");
+  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
+  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
+  // if reason is in 0x6 and retry count != 0 then retry
+  andptr(abort_status_Reg, 0x6);
+  jccb(Assembler::zero, doneRetry);
+  testl(retry_count_Reg, retry_count_Reg);
+  jccb(Assembler::zero, doneRetry);
+  pause();
+  decrementl(retry_count_Reg);
+  jmp(retryLabel);
+  bind(doneRetry);
+}
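
Written out as plain C++, the retry condition above is a two-term predicate; the bit values are the ones documented in the comment, everything else is illustrative.

// Retry only while the abort status carries the "can retry" (0x2) or
// "memory conflict" (0x4) bits and the retry budget is not exhausted.
inline bool rtm_should_retry(int abort_status, int retries_left) {
  return (abort_status & 0x6) != 0 && retries_left != 0;
}
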
+
+// Spin and retry if lock is busy,
+// inputs: box_Reg (monitor address)
+//       : retry_count_Reg
+// output: retry_count_Reg decremented by 1
+//       : clear z flag if retry count exceeded
+// tmp_Reg, scr_Reg, flags are killed
+void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
+                                            Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
+  Label SpinLoop, SpinExit, doneRetry;
+  // Clean monitor_value bit to get valid pointer
+  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
+
+  testl(retry_count_Reg, retry_count_Reg);
+  jccb(Assembler::zero, doneRetry);
+  decrementl(retry_count_Reg);
+  movptr(scr_Reg, RTMSpinLoopCount);
+
+  bind(SpinLoop);
+  pause();
+  decrementl(scr_Reg);
+  jccb(Assembler::lessEqual, SpinExit);
+  movptr(tmp_Reg, Address(box_Reg, owner_offset));
+  testptr(tmp_Reg, tmp_Reg);
+  jccb(Assembler::notZero, SpinLoop);
+
+  bind(SpinExit);
+  jmp(retryLabel);
+  bind(doneRetry);
+  incrementl(retry_count_Reg); // clear z flag
+}
+
+// Use RTM for normal stack locks
+// Input: objReg (object to lock)
+void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
+                                       Register retry_on_abort_count_Reg,
+                                       RTMLockingCounters* stack_rtm_counters,
+                                       Metadata* method_data, bool profile_rtm,
+                                       Label& DONE_LABEL, Label& IsInflated) {
+  assert(UseRTMForStackLocks, "why call this otherwise?");
+  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
+  assert(tmpReg == rax, "");
+  assert(scrReg == rdx, "");
+  Label L_rtm_retry, L_decrement_retry, L_on_abort;
+
+  if (RTMRetryCount > 0) {
+    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
+    bind(L_rtm_retry);
+  }
+  if (!UseRTMXendForLockBusy) {
+    movptr(tmpReg, Address(objReg, 0));
+    testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
+    jcc(Assembler::notZero, IsInflated);
+  }
+  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
+    Label L_noincrement;
+    if (RTMTotalCountIncrRate > 1) {
+      // tmpReg, scrReg and flags are killed
+      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
+    }
+    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
+    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
+    bind(L_noincrement);
+  }
+  xbegin(L_on_abort);
+  movptr(tmpReg, Address(objReg, 0));       // fetch markword
+  andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
+  cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
+  jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
+
+  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
+  if (UseRTMXendForLockBusy) {
+    xend();
+    movptr(tmpReg, Address(objReg, 0));
+    testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
+    jcc(Assembler::notZero, IsInflated);
+    movptr(abort_status_Reg, 0x1);                // Set the abort status to 1 (as xabort does)
+    jmp(L_decrement_retry);
+  }
+  else {
+    xabort(0);
+  }
+  bind(L_on_abort);
+  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
+    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
+  }
+  bind(L_decrement_retry);
+  if (RTMRetryCount > 0) {
+    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
+    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
+  }
+}
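
Outside the VM, the same elision pattern can be sketched with the RTM compiler intrinsics from <immintrin.h>. This is only meant to make the control flow above easier to follow: the lock word is simplified to "zero means unlocked", the retry loop stands in for RTMRetryCount, and none of this is the generated code.

#include <immintrin.h>   // _xbegin/_xend/_xabort; needs RTM hardware and -mrtm

// Try to run the critical section transactionally.  On success the caller is
// inside an open transaction and must finish with elided_unlock().
bool try_elided_lock(volatile long* lock_word, int retries) {
  for (int i = 0; i <= retries; i++) {
    unsigned status = _xbegin();
    if (status == _XBEGIN_STARTED) {
      if (*lock_word == 0) {       // looks unlocked: elide the lock
        return true;               // critical section runs transactionally
      }
      _xabort(0xff);               // already locked: abort the transaction
    }
    // Keep retrying only if the hardware reports the abort as transient.
    if ((status & (_XABORT_RETRY | _XABORT_CONFLICT)) == 0) {
      break;
    }
  }
  // Caller falls back to a real lock and stores a non-zero *lock_word.
  return false;
}

void elided_unlock(volatile long* lock_word) {
  if (*lock_word == 0) {
    _xend();                       // still elided: commit the transaction
  } else {
    *lock_word = 0;                // fallback path took the lock for real
  }
}
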
+
+// Use RTM for inflated locks
+// inputs: objReg (object to lock)
+//         boxReg (on-stack box address (displaced header location) - KILLED)
+//         tmpReg (ObjectMonitor address + 2(monitor_value))
+void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
+                                          Register scrReg, Register retry_on_busy_count_Reg,
+                                          Register retry_on_abort_count_Reg,
+                                          RTMLockingCounters* rtm_counters,
+                                          Metadata* method_data, bool profile_rtm,
+                                          Label& DONE_LABEL) {
+  assert(UseRTMLocking, "why call this otherwise?");
+  assert(tmpReg == rax, "");
+  assert(scrReg == rdx, "");
+  Label L_rtm_retry, L_decrement_retry, L_on_abort;
+  // Clean monitor_value bit to get valid pointer
+  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
+
+  // Without cast to int32_t a movptr will destroy r10 which is typically obj
+  movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
+  movptr(boxReg, tmpReg); // Save ObjectMonitor address
+
+  if (RTMRetryCount > 0) {
+    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
+    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
+    bind(L_rtm_retry);
+  }
+  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
+    Label L_noincrement;
+    if (RTMTotalCountIncrRate > 1) {
+      // tmpReg, scrReg and flags are killed
+      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
+    }
+    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
+    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
+    bind(L_noincrement);
+  }
+  xbegin(L_on_abort);
+  movptr(tmpReg, Address(objReg, 0));
+  movptr(tmpReg, Address(tmpReg, owner_offset));
+  testptr(tmpReg, tmpReg);
+  jcc(Assembler::zero, DONE_LABEL);
+  if (UseRTMXendForLockBusy) {
+    xend();
+    jmp(L_decrement_retry);
+  }
+  else {
+    xabort(0);
+  }
+  bind(L_on_abort);
+  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
+  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
+    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
+  }
+  if (RTMRetryCount > 0) {
+    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
+    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
+  }
+
+  movptr(tmpReg, Address(boxReg, owner_offset)) ;
+  testptr(tmpReg, tmpReg) ;
+  jccb(Assembler::notZero, L_decrement_retry) ;
+
+  // Appears unlocked - try to swing _owner from null to non-null.
+  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
+#ifdef _LP64
+  Register threadReg = r15_thread;
+#else
+  get_thread(scrReg);
+  Register threadReg = scrReg;
+#endif
+  if (os::is_MP()) {
+    lock();
+  }
+  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
+
+  if (RTMRetryCount > 0) {
+    // success done else retry
+    jccb(Assembler::equal, DONE_LABEL) ;
+    bind(L_decrement_retry);
+    // Spin and retry if lock is busy.
+    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
+  }
+  else {
+    bind(L_decrement_retry);
+  }
+}
+
+#endif //  INCLUDE_RTM_OPT
+
+// Fast_Lock and Fast_Unlock used by C2
+
+// Because the transitions from emitted code to the runtime
+// monitorenter/exit helper stubs are so slow it's critical that
+// we inline both the stack-locking fast-path and the inflated fast path.
+//
+// See also: cmpFastLock and cmpFastUnlock.
+//
+// What follows is a specialized inline transliteration of the code
+// in slow_enter() and slow_exit().  If we're concerned about I$ bloat
+// another option would be to emit TrySlowEnter and TrySlowExit methods
+// at startup-time.  These methods would accept arguments as
+// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
+// indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
+// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
+// In practice, however, the # of lock sites is bounded and is usually small.
+// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
+// if the processor uses simple bimodal branch predictors keyed by EIP,
+// since the helper routines would be called from multiple synchronization
+// sites.
+//
+// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
+// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
+// to those specialized methods.  That'd give us a mostly platform-independent
+// implementation that the JITs could optimize and inline at their pleasure.
+// Done correctly, the only time we'd need to cross to native code would be
+// to park() or unpark() threads.  We'd also need a few more unsafe operators
+// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
+// (b) explicit barriers or fence operations.
+//
+// TODO:
+//
+// *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
+//    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
+//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
+//    the lock operators would typically be faster than reifying Self.
+//
+// *  Ideally I'd define the primitives as:
+//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
+//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
+//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
+//    Instead, we're stuck with the rather awkward and brittle register assignments below.
+//    Furthermore the register assignments are overconstrained, possibly resulting in
+//    sub-optimal code near the synchronization site.
+//
+// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
+//    Alternately, use a better sp-proximity test.
+//
+// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
+//    Either one is sufficient to uniquely identify a thread.
+//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
+//
+// *  Intrinsify notify() and notifyAll() for the common cases where the
+//    object is locked by the calling thread but the waitlist is empty.
+//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
+//
+// *  use jccb and jmpb instead of jcc and jmp to improve code density.
+//    But beware of excessive branch density on AMD Opterons.
+//
+// *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
+//    or failure of the fast-path.  If the fast-path fails then we pass
+//    control to the slow-path, typically in C.  In Fast_Lock and
+//    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
+//    will emit a conditional branch immediately after the node.
+//    So we have branches to branches and lots of ICC.ZF games.
+//    Instead, it might be better to have C2 pass a "FailureLabel"
+//    into Fast_Lock and Fast_Unlock.  In the case of success, control
+//    will drop through the node.  ICC.ZF is undefined at exit.
+//    In the case of failure, the node will branch directly to the
+//    FailureLabel
+
+
+// obj: object to lock
+// box: on-stack box address (displaced header location) - KILLED
+// rax,: tmp -- KILLED
+// scr: tmp -- KILLED
+void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
+                               Register scrReg, Register cx1Reg, Register cx2Reg,
+                               BiasedLockingCounters* counters,
+                               RTMLockingCounters* rtm_counters,
+                               RTMLockingCounters* stack_rtm_counters,
+                               Metadata* method_data,
+                               bool use_rtm, bool profile_rtm) {
+  // Ensure the register assignments are disjoint
+  assert(tmpReg == rax, "");
+
+  if (use_rtm) {
+    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
+  } else {
+    assert(cx1Reg == noreg, "");
+    assert(cx2Reg == noreg, "");
+    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
+  }
+
+  if (counters != NULL) {
+    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
+  }
+  if (EmitSync & 1) {
+      // set box->dhw = unused_mark (3)
+      // Force all sync thru slow-path: slow_enter() and slow_exit()
+      movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
+      cmpptr (rsp, (int32_t)NULL_WORD);
+  } else
+  if (EmitSync & 2) {
+      Label DONE_LABEL ;
+      if (UseBiasedLocking) {
+         // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
+         biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
+      }
+
+      movptr(tmpReg, Address(objReg, 0));           // fetch markword
+      orptr (tmpReg, 0x1);
+      movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS
+      if (os::is_MP()) {
+        lock();
+      }
+      cmpxchgptr(boxReg, Address(objReg, 0));       // Updates tmpReg
+      jccb(Assembler::equal, DONE_LABEL);
+      // Recursive locking
+      subptr(tmpReg, rsp);
+      andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
+      movptr(Address(boxReg, 0), tmpReg);
+      bind(DONE_LABEL);
+  } else {
+    // Possible cases that we'll encounter in fast_lock
+    // ------------------------------------------------
+    // * Inflated
+    //    -- unlocked
+    //    -- Locked
+    //       = by self
+    //       = by other
+    // * biased
+    //    -- by Self
+    //    -- by other
+    // * neutral
+    // * stack-locked
+    //    -- by self
+    //       = sp-proximity test hits
+    //       = sp-proximity test generates false-negative
+    //    -- by other
+    //
+
+    Label IsInflated, DONE_LABEL;
+
+    // it's stack-locked, biased or neutral
+    // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
+    // order to reduce the number of conditional branches in the most common cases.
+    // Beware -- there's a subtle invariant that fetch of the markword
+    // at [FETCH], below, will never observe a biased encoding (*101b).
+    // If this invariant is not held we risk exclusion (safety) failure.
+    if (UseBiasedLocking && !UseOptoBiasInlining) {
+      biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters);
+    }
+
+#if INCLUDE_RTM_OPT
+    if (UseRTMForStackLocks && use_rtm) {
+      rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
+                        stack_rtm_counters, method_data, profile_rtm,
+                        DONE_LABEL, IsInflated);
+    }
+#endif // INCLUDE_RTM_OPT
+
+    movptr(tmpReg, Address(objReg, 0));          // [FETCH]
+    testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
+    jccb(Assembler::notZero, IsInflated);
+
+    // Attempt stack-locking ...
+    orptr (tmpReg, markOopDesc::unlocked_value);
+    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
+    if (os::is_MP()) {
+      lock();
+    }
+    cmpxchgptr(boxReg, Address(objReg, 0));      // Updates tmpReg
+    if (counters != NULL) {
+      cond_inc32(Assembler::equal,
+                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
+    }
+    jcc(Assembler::equal, DONE_LABEL);           // Success
+
+    // Recursive locking.
+    // The object is stack-locked: markword contains stack pointer to BasicLock.
+    // Locked by current thread if difference with current SP is less than one page.
+    subptr(tmpReg, rsp);
+    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
+    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
+    movptr(Address(boxReg, 0), tmpReg);
+    if (counters != NULL) {
+      cond_inc32(Assembler::equal,
+                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
+    }
+    jmp(DONE_LABEL);
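
As a reading aid, the recursive-lock test above can be spelled out in C++. The mask constants (0xFFFFF003 for 32-bit, 7 - os::vm_page_size() for LP64) are the ones in the emitted code; the 4 KB page and the helper name are assumptions made for the sketch.

#include <stdint.h>

// After the failed CAS, 'mark' is the object's mark word, i.e. the address of
// the BasicLock in the frame that owns the stack lock.  The AND leaves zero
// (ZF set, meaning "recursively locked by this thread") exactly when that
// address is less than one page above the current stack pointer and is
// suitably aligned; 0 is then stored as the displaced header.
inline bool looks_recursively_locked(intptr_t mark, intptr_t sp) {
  intptr_t delta = mark - sp;      // subptr(tmpReg, rsp)
  intptr_t mask  = 7 - 4096;       // LP64 form for 4 KB pages; 32-bit uses 0xFFFFF003
  return (delta & mask) == 0;
}
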
+
+    bind(IsInflated);
+    // The object is inflated. tmpReg contains pointer to ObjectMonitor* + 2(monitor_value)
+
+#if INCLUDE_RTM_OPT
+    // Use the same RTM locking code in 32- and 64-bit VM.
+    if (use_rtm) {
+      rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
+                           rtm_counters, method_data, profile_rtm, DONE_LABEL);
+    } else {
+#endif // INCLUDE_RTM_OPT
+
+#ifndef _LP64
+    // The object is inflated.
+    //
+    // TODO-FIXME: eliminate the ugly use of manifest constants:
+    //   Use markOopDesc::monitor_value instead of "2".
+    //   use markOop::unused_mark() instead of "3".
+    // The tmpReg value is an objectMonitor reference ORed with
+    // markOopDesc::monitor_value (2).   We can either convert tmpReg to an
+    // objectmonitor pointer by masking off the "2" bit or we can just
+    // use tmpReg as an objectmonitor pointer but bias the objectmonitor
+    // field offsets with "-2" to compensate for and annul the low-order tag bit.
+    //
+    // I use the latter as it avoids AGI stalls.
+    // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
+    // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
+    //
+    #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
+
+    // boxReg refers to the on-stack BasicLock in the current frame.
+    // We'd like to write:
+    //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
+    // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
+    // additional latency as we have another ST in the store buffer that must drain.
+
+    if (EmitSync & 8192) {
+       movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
+       get_thread (scrReg);
+       movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
+       movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
+       if (os::is_MP()) {
+         lock();
+       }
+       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    } else
+    if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
+       movptr(scrReg, boxReg);
+       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
+
+       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
+       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
+          // prefetchw [eax + Offset(_owner)-2]
+          prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       }
+
+       if ((EmitSync & 64) == 0) {
+         // Optimistic form: consider XORL tmpReg,tmpReg
+         movptr(tmpReg, NULL_WORD);
+       } else {
+         // Can suffer RTS->RTO upgrades on shared or cold $ lines
+         // Test-And-CAS instead of CAS
+         movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax, = m->_owner
+         testptr(tmpReg, tmpReg);                   // Locked ?
+         jccb  (Assembler::notZero, DONE_LABEL);
+       }
+
+       // Appears unlocked - try to swing _owner from null to non-null.
+       // Ideally, I'd manifest "Self" with get_thread and then attempt
+       // to CAS the register containing Self into m->Owner.
+       // But we don't have enough registers, so instead we can either try to CAS
+       // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
+       // we later store "Self" into m->Owner.  Transiently storing a stack address
+       // (rsp or the address of the box) into  m->owner is harmless.
+       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
+       if (os::is_MP()) {
+         lock();
+       }
+       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
+       jccb  (Assembler::notZero, DONE_LABEL);
+       get_thread (scrReg);                    // beware: clobbers ICCs
+       movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg);
+       xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
+
+       // If the CAS fails we can either retry or pass control to the slow-path.
+       // We use the latter tactic.
+       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
+       // If the CAS was successful ...
+       //   Self has acquired the lock
+       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
+       // Intentional fall-through into DONE_LABEL ...
+    } else {
+       movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
+       movptr(boxReg, tmpReg);
+
+       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
+       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
+          // prefetchw [eax + Offset(_owner)-2]
+          prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       }
+
+       if ((EmitSync & 64) == 0) {
+         // Optimistic form
+         xorptr  (tmpReg, tmpReg);
+       } else {
+         // Can suffer RTS->RTO upgrades on shared or cold $ lines
+         movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax, = m->_owner
+         testptr(tmpReg, tmpReg);                   // Locked ?
+         jccb  (Assembler::notZero, DONE_LABEL);
+       }
+
+       // Appears unlocked - try to swing _owner from null to non-null.
+       // Use either "Self" (in scr) or rsp as thread identity in _owner.
+       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
+       get_thread (scrReg);
+       if (os::is_MP()) {
+         lock();
+       }
+       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+
+       // If the CAS fails we can either retry or pass control to the slow-path.
+       // We use the latter tactic.
+       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
+       // If the CAS was successful ...
+       //   Self has acquired the lock
+       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
+       // Intentional fall-through into DONE_LABEL ...
+    }
+#else // _LP64
+    // It's inflated
+
+    // TODO: someday avoid the ST-before-CAS penalty by
+    // relocating (deferring) the following ST.
+    // We should also think about trying a CAS without having
+    // fetched _owner.  If the CAS is successful we may
+    // avoid an RTO->RTS upgrade on the $line.
+
+    // Without cast to int32_t a movptr will destroy r10 which is typically obj
+    movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
+
+    movptr (boxReg, tmpReg);
+    movptr (tmpReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    testptr(tmpReg, tmpReg);
+    jccb   (Assembler::notZero, DONE_LABEL);
+
+    // It's inflated and appears unlocked
+    if (os::is_MP()) {
+      lock();
+    }
+    cmpxchgptr(r15_thread, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    // Intentional fall-through into DONE_LABEL ...
+#endif // _LP64
+
+#if INCLUDE_RTM_OPT
+    } // use_rtm()
+#endif
+    // DONE_LABEL is a hot target - we'd really like to place it at the
+    // start of a cache line by padding with NOPs.
+    // See the AMD and Intel software optimization manuals for the
+    // most efficient "long" NOP encodings.
+    // Unfortunately none of our alignment mechanisms suffice.
+    bind(DONE_LABEL);
+
+    // At DONE_LABEL the icc ZFlag is set as follows ...
+    // Fast_Unlock uses the same protocol.
+    // ZFlag == 1 -> Success
+    // ZFlag == 0 -> Failure - force control through the slow-path
+  }
+}
+
+// obj: object to unlock
+// box: box address (displaced header location), killed.  Must be EAX.
+// tmp: killed, cannot be obj nor box.
+//
+// Some commentary on balanced locking:
+//
+// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
+// Methods that don't have provably balanced locking are forced to run in the
+// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
+// The interpreter provides two properties:
+// I1:  At return-time the interpreter automatically and quietly unlocks any
+//      objects acquired by the current activation (frame).  Recall that the
+//      interpreter maintains an on-stack list of locks currently held by
+//      a frame.
+// I2:  If a method attempts to unlock an object that is not held by
+//      the frame, the interpreter throws IMSX.
+//
+// Let's say A(), which has provably balanced locking, acquires O and then calls B().
+// B() doesn't have provably balanced locking so it runs in the interpreter.
+// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
+// is still locked by A().
+//
+// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
+// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
+// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
+// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
+
+void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
+  assert(boxReg == rax, "");
+  assert_different_registers(objReg, boxReg, tmpReg);
+
+  if (EmitSync & 4) {
+    // Disable - inhibit all inlining.  Force control through the slow-path
+    cmpptr (rsp, 0);
+  } else
+  if (EmitSync & 8) {
+    Label DONE_LABEL;
+    if (UseBiasedLocking) {
+       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
+    }
+    // Classic stack-locking code ...
+    // Check whether the displaced header is 0
+    // (=> recursive unlock)
+    movptr(tmpReg, Address(boxReg, 0));
+    testptr(tmpReg, tmpReg);
+    jccb(Assembler::zero, DONE_LABEL);
+    // If not recursive lock, reset the header to displaced header
+    if (os::is_MP()) {
+      lock();
+    }
+    cmpxchgptr(tmpReg, Address(objReg, 0));   // Uses RAX which is box
+    bind(DONE_LABEL);
+  } else {
+    Label DONE_LABEL, Stacked, CheckSucc;
+
+    // Critically, the biased locking test must have precedence over
+    // and appear before the (box->dhw == 0) recursive stack-lock test.
+    if (UseBiasedLocking && !UseOptoBiasInlining) {
+       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
+    }
+
+#if INCLUDE_RTM_OPT
+    if (UseRTMForStackLocks && use_rtm) {
+      assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
+      Label L_regular_unlock;
+      movptr(tmpReg, Address(objReg, 0));           // fetch markword
+      andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
+      cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
+      jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
+      xend();                                       // otherwise end...
+      jmp(DONE_LABEL);                              // ... and we're done
+      bind(L_regular_unlock);
+    }
+#endif
+
+    cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
+    jcc   (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
+    movptr(tmpReg, Address(objReg, 0));             // Examine the object's markword
+    testptr(tmpReg, markOopDesc::monitor_value);    // Inflated?
+    jccb  (Assembler::zero, Stacked);
+
+    // It's inflated.
+#if INCLUDE_RTM_OPT
+    if (use_rtm) {
+      Label L_regular_inflated_unlock;
+      // Clean monitor_value bit to get valid pointer
+      int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
+      movptr(boxReg, Address(tmpReg, owner_offset));
+      testptr(boxReg, boxReg);
+      jccb(Assembler::notZero, L_regular_inflated_unlock);
+      xend();
+      jmpb(DONE_LABEL);
+      bind(L_regular_inflated_unlock);
+    }
+#endif
+
+    // Despite our balanced locking property we still check that m->_owner == Self
+    // as java routines or native JNI code called by this thread might
+    // have released the lock.
+    // Refer to the comments in synchronizer.cpp for how we might encode extra
+    // state in _succ so we can avoid fetching EntryList|cxq.
+    //
+    // I'd like to add more cases in fast_lock() and fast_unlock() --
+    // such as recursive enter and exit -- but we have to be wary of
+    // I$ bloat, T$ effects and BP$ effects.
+    //
+    // If there's no contention try a 1-0 exit.  That is, exit without
+    // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
+    // we detect and recover from the race that the 1-0 exit admits.
+    //
+    // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
+    // before it STs null into _owner, releasing the lock.  Updates
+    // to data protected by the critical section must be visible before
+    // we drop the lock (and thus before any other thread could acquire
+    // the lock and observe the fields protected by the lock).
+    // IA32's memory-model is SPO, so STs are ordered with respect to
+    // each other and there's no need for an explicit barrier (fence).
+    // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
+#ifndef _LP64
+    get_thread (boxReg);
+    if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
+      // prefetchw [ebx + Offset(_owner)-2]
+      prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    }
+
+    // Note that we could employ various encoding schemes to reduce
+    // the number of loads below (currently 4) to just 2 or 3.
+    // Refer to the comments in synchronizer.cpp.
+    // In practice the chain of fetches doesn't seem to impact performance, however.
+    if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
+       // Attempt to reduce branch density - AMD's branch predictor.
+       xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
+       jccb  (Assembler::notZero, DONE_LABEL);
+       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
+       jmpb  (DONE_LABEL);
+    } else {
+       xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
+       jccb  (Assembler::notZero, DONE_LABEL);
+       movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
+       jccb  (Assembler::notZero, CheckSucc);
+       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
+       jmpb  (DONE_LABEL);
+    }
+
+    // The following code fragment (EmitSync & 65536) improves the performance of
+    // contended applications and contended synchronization microbenchmarks.
+    // Unfortunately the emission of the code - even though not executed - causes regressions
+    // in scimark and jetstream, evidently because of $ effects.  Replacing the code
+    // with an equal number of never-executed NOPs results in the same regression.
+    // We leave it off by default.
+
+    if ((EmitSync & 65536) != 0) {
+       Label LSuccess, LGoSlowPath ;
+
+       bind  (CheckSucc);
+
+       // Optional pre-test ... it's safe to elide this
+       if ((EmitSync & 16) == 0) {
+          cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
+          jccb  (Assembler::zero, LGoSlowPath);
+       }
+
+       // We have a classic Dekker-style idiom:
+       //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
+       // There are a number of ways to implement the barrier:
+       // (1) lock:andl &m->_owner, 0
+       //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
+       //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
+       //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
+       // (2) If supported, an explicit MFENCE is appealing.
+       //     In older IA32 processors MFENCE is slower than lock:add or xchg
+       //     particularly if the write-buffer is full, as might be the case
+       //     if stores closely precede the fence or fence-equivalent instruction.
+       //     In more modern implementations MFENCE appears faster, however.
+       // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
+       //     The $lines underlying the top-of-stack should be in M-state.
+       //     The locked add instruction is serializing, of course.
+       // (4) Use xchg, which is serializing
+       //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
+       // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
+       //     The integer condition codes will tell us if succ was 0.
+       //     Since _succ and _owner should reside in the same $line and
+       //     we just stored into _owner, it's likely that the $line
+       //     remains in M-state for the lock:orl.
+       //
+       // We currently use (3), although it's likely that switching to (2)
+       // is correct for the future.
+
+       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
+       if (os::is_MP()) {
+          if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
+            mfence();
+          } else {
+            lock (); addptr(Address(rsp, 0), 0);
+          }
+       }
+       // Ratify _succ remains non-null
+       cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0);
+       jccb  (Assembler::notZero, LSuccess);
+
+       xorptr(boxReg, boxReg);                  // box is really EAX
+       if (os::is_MP()) { lock(); }
+       cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       jccb  (Assembler::notEqual, LSuccess);
+       // Since we're low on registers we installed rsp as a placeholder in _owner.
+       // Now install Self over rsp.  This is safe as we're transitioning from
+       // non-null to non-null
+       get_thread (boxReg);
+       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
+       // Intentional fall-through into LGoSlowPath ...
+
+       bind  (LGoSlowPath);
+       orptr(boxReg, 1);                      // set ICC.ZF=0 to indicate failure
+       jmpb  (DONE_LABEL);
+
+       bind  (LSuccess);
+       xorptr(boxReg, boxReg);                 // set ICC.ZF=1 to indicate success
+       jmpb  (DONE_LABEL);
+    }
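
The store/fence/load sequence that the comments above call the 1-0 exit (option (3)) can be sketched with C++11 atomics. This standalone fragment shows only the ordering requirement being discussed; the struct and field names are simplified stand-ins, not the VM's ObjectMonitor.

#include <atomic>

struct monitor_sketch {
  std::atomic<void*> owner;   // stands in for ObjectMonitor::_owner
  std::atomic<void*> succ;    // stands in for ObjectMonitor::_succ
};

// Returns true (fast-path success) when a successor is still visible after the
// release; otherwise the caller must re-acquire _owner via CAS or take the
// slow path, as the code above does, so a parked waiter is not stranded.
bool one_zero_exit(monitor_sketch* m) {
  m->owner.store(nullptr, std::memory_order_release);        // ST m->_owner = 0
  std::atomic_thread_fence(std::memory_order_seq_cst);       // MEMBAR (lock:add / mfence)
  return m->succ.load(std::memory_order_relaxed) != nullptr; // LD m->_succ
}
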
+
+    bind (Stacked);
+    // It's not inflated and it's not recursively stack-locked and it's not biased.
+    // It must be stack-locked.
+    // Try to reset the header to displaced header.
+    // The "box" value on the stack is stable, so we can reload
+    // and be assured we observe the same value as above.
+    movptr(tmpReg, Address(boxReg, 0));
+    if (os::is_MP()) {
+      lock();
+    }
+    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
+    // Intentional fall-through into DONE_LABEL
+
+    // DONE_LABEL is a hot target - we'd really like to place it at the
+    // start of a cache line by padding with NOPs.
+    // See the AMD and Intel software optimization manuals for the
+    // most efficient "long" NOP encodings.
+    // Unfortunately none of our alignment mechanisms suffice.
+    if ((EmitSync & 65536) == 0) {
+       bind (CheckSucc);
+    }
+#else // _LP64
+    // It's inflated
+    movptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    xorptr(boxReg, r15_thread);
+    orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
+    jccb  (Assembler::notZero, DONE_LABEL);
+    movptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
+    orptr (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
+    jccb  (Assembler::notZero, CheckSucc);
+    movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
+    jmpb  (DONE_LABEL);
+
+    if ((EmitSync & 65536) == 0) {
+      Label LSuccess, LGoSlowPath ;
+      bind  (CheckSucc);
+      cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
+      jccb  (Assembler::zero, LGoSlowPath);
+
+      // I'd much rather use lock:andl m->_owner, 0 as it's faster than
+      // the explicit ST;MEMBAR combination, but masm doesn't currently support
+      // "ANDQ M,IMM".  Don't use MFENCE here.  lock:add to TOS, xchg, etc.
+      // are all faster when the write buffer is populated.
+      movptr (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
+      if (os::is_MP()) {
+         lock (); addl (Address(rsp, 0), 0);
+      }
+      cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
+      jccb  (Assembler::notZero, LSuccess);
+
+      movptr (boxReg, (int32_t)NULL_WORD);                   // box is really EAX
+      if (os::is_MP()) { lock(); }
+      cmpxchgptr(r15_thread, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+      jccb  (Assembler::notEqual, LSuccess);
+      // Intentional fall-through into slow-path
+
+      bind  (LGoSlowPath);
+      orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
+      jmpb  (DONE_LABEL);
+
+      bind  (LSuccess);
+      testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
+      jmpb  (DONE_LABEL);
+    }
+
+    bind  (Stacked);
+    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
+    if (os::is_MP()) { lock(); }
+    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
+
+    if (EmitSync & 65536) {
+       bind (CheckSucc);
+    }
+#endif
+    bind(DONE_LABEL);
+    // Avoid branch to branch on AMD processors
+    if (EmitSync & 32768) {
+       nop();
+    }
+  }
+}
+#endif // COMPILER2
+
 void MacroAssembler::c2bool(Register x) {
   // implements x == 0 ? 0 : 1
   // note: must only look at least-significant byte of x
@@ -1969,7 +2852,9 @@
   Condition negated_cond = negate_condition(cond);
   Label L;
   jcc(negated_cond, L);
+  pushf(); // Preserve flags
   atomic_incl(counter_addr);
+  popf();
   bind(L);
 }
 
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -27,6 +27,7 @@
 
 #include "asm/assembler.hpp"
 #include "utilities/macros.hpp"
+#include "runtime/rtmLocking.hpp"
 
 
 // MacroAssembler extends Assembler by frequently used macros.
@@ -111,7 +112,8 @@
         op == 0xE9 /* jmp */ ||
         op == 0xEB /* short jmp */ ||
         (op & 0xF0) == 0x70 /* short jcc */ ||
-        op == 0x0F && (branch[1] & 0xF0) == 0x80 /* jcc */,
+        op == 0x0F && (branch[1] & 0xF0) == 0x80 /* jcc */ ||
+        op == 0xC7 && branch[1] == 0xF8 /* xbegin */,
         "Invalid opcode at patch point");
 
     if (op == 0xEB || (op & 0xF0) == 0x70) {
@@ -121,7 +123,7 @@
       guarantee(this->is8bit(imm8), "Short forward jump exceeds 8-bit offset");
       *disp = imm8;
     } else {
-      int* disp = (int*) &branch[(op == 0x0F)? 2: 1];
+      int* disp = (int*) &branch[(op == 0x0F || op == 0xC7)? 2: 1];
       int imm32 = target - (address) &disp[1];
       *disp = imm32;
     }
@@ -161,7 +163,6 @@
   void incrementq(Register reg, int value = 1);
   void incrementq(Address dst, int value = 1);
 
-
   // Support optimal SSE move instructions.
   void movflt(XMMRegister dst, XMMRegister src) {
     if (UseXmmRegToRegMoveAll) { movaps(dst, src); return; }
@@ -187,6 +188,8 @@
   void incrementl(AddressLiteral dst);
   void incrementl(ArrayAddress dst);
 
+  void incrementq(AddressLiteral dst);
+
   // Alignment
   void align(int modulus);
 
@@ -651,7 +654,40 @@
                            Label& done, Label* slow_case = NULL,
                            BiasedLockingCounters* counters = NULL);
   void biased_locking_exit (Register obj_reg, Register temp_reg, Label& done);
-
+#ifdef COMPILER2
+  // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
+  // See full description in macroAssembler_x86.cpp.
+  void fast_lock(Register obj, Register box, Register tmp,
+                 Register scr, Register cx1, Register cx2,
+                 BiasedLockingCounters* counters,
+                 RTMLockingCounters* rtm_counters,
+                 RTMLockingCounters* stack_rtm_counters,
+                 Metadata* method_data,
+                 bool use_rtm, bool profile_rtm);
+  void fast_unlock(Register obj, Register box, Register tmp, bool use_rtm);
+#if INCLUDE_RTM_OPT
+  void rtm_counters_update(Register abort_status, Register rtm_counters);
+  void branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel);
+  void rtm_abort_ratio_calculation(Register tmp, Register rtm_counters_reg,
+                                   RTMLockingCounters* rtm_counters,
+                                   Metadata* method_data);
+  void rtm_profiling(Register abort_status_Reg, Register rtm_counters_Reg,
+                     RTMLockingCounters* rtm_counters, Metadata* method_data, bool profile_rtm);
+  void rtm_retry_lock_on_abort(Register retry_count, Register abort_status, Label& retryLabel);
+  void rtm_retry_lock_on_busy(Register retry_count, Register box, Register tmp, Register scr, Label& retryLabel);
+  void rtm_stack_locking(Register obj, Register tmp, Register scr,
+                         Register retry_on_abort_count,
+                         RTMLockingCounters* stack_rtm_counters,
+                         Metadata* method_data, bool profile_rtm,
+                         Label& DONE_LABEL, Label& IsInflated);
+  void rtm_inflated_locking(Register obj, Register box, Register tmp,
+                            Register scr, Register retry_on_busy_count,
+                            Register retry_on_abort_count,
+                            RTMLockingCounters* rtm_counters,
+                            Metadata* method_data, bool profile_rtm,
+                            Label& DONE_LABEL);
+#endif
+#endif
 
   Condition negate_condition(Condition cond);
 
@@ -716,6 +752,7 @@
 
 
   void imulptr(Register dst, Register src) { LP64_ONLY(imulq(dst, src)) NOT_LP64(imull(dst, src)); }
+  void imulptr(Register dst, Register src, int imm32) { LP64_ONLY(imulq(dst, src, imm32)) NOT_LP64(imull(dst, src, imm32)); }
 
 
   void negptr(Register dst) { LP64_ONLY(negq(dst)) NOT_LP64(negl(dst)); }
@@ -757,7 +794,14 @@
   // Conditionally (atomically, on MPs) increments passed counter address, preserving condition codes.
   void cond_inc32(Condition cond, AddressLiteral counter_addr);
   // Unconditional atomic increment.
-  void atomic_incl(AddressLiteral counter_addr);
+  void atomic_incl(Address counter_addr);
+  void atomic_incl(AddressLiteral counter_addr, Register scr = rscratch1);
+#ifdef _LP64
+  void atomic_incq(Address counter_addr);
+  void atomic_incq(AddressLiteral counter_addr, Register scr = rscratch1);
+#endif
+  void atomic_incptr(AddressLiteral counter_addr, Register scr = rscratch1) { LP64_ONLY(atomic_incq(counter_addr, scr)) NOT_LP64(atomic_incl(counter_addr, scr)) ; }
+  void atomic_incptr(Address counter_addr) { LP64_ONLY(atomic_incq(counter_addr)) NOT_LP64(atomic_incl(counter_addr)) ; }
 
   void lea(Register dst, AddressLiteral adr);
   void lea(Address dst, AddressLiteral adr);
@@ -1069,7 +1113,11 @@
 
   void movptr(Register dst, Address src);
 
-  void movptr(Register dst, AddressLiteral src);
+#ifdef _LP64
+  void movptr(Register dst, AddressLiteral src, Register scratch=rscratch1);
+#else
+  void movptr(Register dst, AddressLiteral src, Register scratch=noreg); // Scratch reg is ignored in 32-bit
+#endif
 
   void movptr(Register dst, intptr_t src);
   void movptr(Register dst, Register src);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cpu/x86/vm/rtmLocking.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "memory/allocation.inline.hpp"
+#include "runtime/task.hpp"
+#include "runtime/rtmLocking.hpp"
+
+// One-shot PeriodicTask subclass for enabling RTM locking
+uintx RTMLockingCounters::_calculation_flag = 0;
+
+class RTMLockingCalculationTask : public PeriodicTask {
+ public:
+  RTMLockingCalculationTask(size_t interval_time) : PeriodicTask(interval_time){  }
+
+  virtual void task() {
+    RTMLockingCounters::_calculation_flag = 1;
+    // Reclaim our storage and disenroll ourself
+    delete this;
+  }
+};
+
+void RTMLockingCounters::init() {
+  if (UseRTMLocking && RTMLockingCalculationDelay > 0) {
+    RTMLockingCalculationTask* task = new RTMLockingCalculationTask(RTMLockingCalculationDelay);
+    task->enroll();
+  } else {
+    _calculation_flag = 1;
+  }
+}
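
The delay mechanism above amounts to "set a global flag once, roughly RTMLockingCalculationDelay milliseconds after startup"; rtm_abort_ratio_calculation (earlier in this change) polls that flag before doing any work. A self-contained restatement of the idea, using a plain detached thread instead of HotSpot's PeriodicTask, might look like this; the names are illustrative only.

#include <atomic>
#include <chrono>
#include <thread>

std::atomic<unsigned> calculation_flag(0);

// One-shot: after 'delay_ms' the flag flips to 1 and stays there, enabling
// the otherwise-skipped abort ratio calculation.
void arm_calculation_flag(long delay_ms) {
  if (delay_ms > 0) {
    std::thread([delay_ms] {
      std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms));
      calculation_flag.store(1, std::memory_order_relaxed);
    }).detach();
  } else {
    calculation_flag.store(1, std::memory_order_relaxed);
  }
}
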
+
+//------------------------------print_on-------------------------------
+void RTMLockingCounters::print_on(outputStream* st) {
+  st->print_cr("# rtm locks total (estimated): " UINTX_FORMAT, _total_count * RTMTotalCountIncrRate);
+  st->print_cr("# rtm lock aborts  : " UINTX_FORMAT, _abort_count);
+  for (int i = 0; i < ABORT_STATUS_LIMIT; i++) {
+    st->print_cr("# rtm lock aborts %d: " UINTX_FORMAT, i, _abortX_count[i]);
+  }
+}
--- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1817,6 +1817,13 @@
   // Frame is now completed as far as size and linkage.
   int frame_complete = ((intptr_t)__ pc()) - start;
 
+  if (UseRTMLocking) {
+    // Abort RTM transaction before calling JNI
+    // because critical section will be large and will be
+    // aborted anyway. Also nmethod could be deoptimized.
+    __ xabort(0);
+  }
+
   // Calculate the difference between rsp and rbp,. We need to know it
   // after the native call because on windows Java Natives will pop
   // the arguments and it is painful to do rsp relative addressing
@@ -3170,6 +3177,12 @@
   };
 
   address start = __ pc();
+
+  if (UseRTMLocking) {
+    // Abort RTM transaction before possible nmethod deoptimization.
+    __ xabort(0);
+  }
+
   // Push self-frame.
   __ subptr(rsp, return_off*wordSize);     // Epilog!
 
@@ -3355,6 +3368,14 @@
   address call_pc = NULL;
   bool cause_return = (poll_type == POLL_AT_RETURN);
   bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
+
+  if (UseRTMLocking) {
+    // Abort RTM transaction before calling runtime
+    // because critical section will be large and will be
+    // aborted anyway. Also nmethod could be deoptimized.
+    __ xabort(0);
+  }
+
   // If cause_return is true we are at a poll_return and there is
   // the return address on the stack to the caller on the nmethod
   // that is safepoint. We can leave this return on the stack and
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -2012,6 +2012,13 @@
   // Frame is now completed as far as size and linkage.
   int frame_complete = ((intptr_t)__ pc()) - start;
 
+    if (UseRTMLocking) {
+      // Abort RTM transaction before calling JNI
+      // because critical section will be large and will be
+      // aborted anyway. Also nmethod could be deoptimized.
+      __ xabort(0);
+    }
+
 #ifdef ASSERT
     {
       Label L;
@@ -3612,6 +3619,11 @@
 
   address start = __ pc();
 
+  if (UseRTMLocking) {
+    // Abort RTM transaction before possible nmethod deoptimization.
+    __ xabort(0);
+  }
+
   // Push self-frame.  We get here with a return address on the
   // stack, so rsp is 8-byte aligned until we allocate our frame.
   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog!
@@ -3792,6 +3804,13 @@
   bool cause_return = (poll_type == POLL_AT_RETURN);
   bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
 
+  if (UseRTMLocking) {
+    // Abort RTM transaction before calling runtime
+    // because critical section will be large and will be
+    // aborted anyway. Also nmethod could be deoptimized.
+    __ xabort(0);
+  }
+
   // Make room for return address (or push it again)
   if (!cause_return) {
     __ push(rbx);
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -50,8 +50,13 @@
 const char*           VM_Version::_features_str = "";
 VM_Version::CpuidInfo VM_Version::_cpuid_info   = { 0, };
 
+// Address of instruction which causes SEGV
+address VM_Version::_cpuinfo_segv_addr = 0;
+// Address of instruction after the one which causes SEGV
+address VM_Version::_cpuinfo_cont_addr = 0;
+
 static BufferBlob* stub_blob;
-static const int stub_size = 550;
+static const int stub_size = 600;
 
 extern "C" {
   typedef void (*getPsrInfo_stub_t)(void*);
@@ -234,9 +239,9 @@
     // Check if OS has enabled XGETBV instruction to access XCR0
     // (OSXSAVE feature flag) and CPU supports AVX
     //
-    __ andl(rcx, 0x18000000);
+    __ andl(rcx, 0x18000000); // cpuid1 bits osxsave | avx
     __ cmpl(rcx, 0x18000000);
-    __ jccb(Assembler::notEqual, sef_cpuid);
+    __ jccb(Assembler::notEqual, sef_cpuid); // jump if AVX is not supported
 
     //
     // XCR0, XFEATURE_ENABLED_MASK register
@@ -247,6 +252,47 @@
     __ movl(Address(rsi, 0), rax);
     __ movl(Address(rsi, 4), rdx);
 
+    __ andl(rax, 0x6); // xcr0 bits sse | ymm
+    __ cmpl(rax, 0x6);
+    __ jccb(Assembler::notEqual, sef_cpuid); // jump if AVX is not supported
+
+    //
+    // Some OSes have a bug where the upper 128 bits of YMM
+    // registers are not restored after signal processing.
+    // Generate SEGV here (reference through NULL)
+    // and check upper YMM bits after it.
+    //
+    VM_Version::set_avx_cpuFeatures(); // Enable temporarily to pass asserts
+
+    // load value into all 32 bytes of ymm7 register
+    __ movl(rcx, VM_Version::ymm_test_value());
+
+    __ movdl(xmm0, rcx);
+    __ pshufd(xmm0, xmm0, 0x00);
+    __ vinsertf128h(xmm0, xmm0, xmm0);
+    __ vmovdqu(xmm7, xmm0);
+#ifdef _LP64
+    __ vmovdqu(xmm8,  xmm0);
+    __ vmovdqu(xmm15, xmm0);
+#endif
+
+    __ xorl(rsi, rsi);
+    VM_Version::set_cpuinfo_segv_addr( __ pc() );
+    // Generate SEGV
+    __ movl(rax, Address(rsi, 0));
+
+    VM_Version::set_cpuinfo_cont_addr( __ pc() );
+    // Returns here after signal. Save xmm0 to check it later.
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::ymm_save_offset())));
+    __ vmovdqu(Address(rsi,  0), xmm0);
+    __ vmovdqu(Address(rsi, 32), xmm7);
+#ifdef _LP64
+    __ vmovdqu(Address(rsi, 64), xmm8);
+    __ vmovdqu(Address(rsi, 96), xmm15);
+#endif
+
+    VM_Version::clean_cpuFeatures();
+
     //
     // cpuid(0x7) Structured Extended Features
     //
@@ -429,7 +475,7 @@
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -446,8 +492,9 @@
                (supports_avx()    ? ", avx" : ""),
                (supports_avx2()   ? ", avx2" : ""),
                (supports_aes()    ? ", aes" : ""),
-               (supports_clmul()    ? ", clmul" : ""),
+               (supports_clmul()  ? ", clmul" : ""),
                (supports_erms()   ? ", erms" : ""),
+               (supports_rtm()    ? ", rtm" : ""),
                (supports_mmx_ext() ? ", mmxext" : ""),
                (supports_3dnow_prefetch() ? ", 3dnowpref" : ""),
                (supports_lzcnt()   ? ", lzcnt": ""),
@@ -455,7 +502,9 @@
                (supports_ht() ? ", ht": ""),
                (supports_tsc() ? ", tsc": ""),
                (supports_tscinv_bit() ? ", tscinvbit": ""),
-               (supports_tscinv() ? ", tscinv": ""));
+               (supports_tscinv() ? ", tscinv": ""),
+               (supports_bmi1() ? ", bmi1" : ""),
+               (supports_bmi2() ? ", bmi2" : ""));
   _features_str = strdup(buf);
 
   // UseSSE is set to the smaller of what hardware supports and what
@@ -486,7 +535,7 @@
     }
   } else if (UseAES) {
     if (!FLAG_IS_DEFAULT(UseAES))
-      warning("AES instructions not available on this CPU");
+      warning("AES instructions are not available on this CPU");
     FLAG_SET_DEFAULT(UseAES, false);
   }
 
@@ -519,10 +568,57 @@
     }
   } else if (UseAESIntrinsics) {
     if (!FLAG_IS_DEFAULT(UseAESIntrinsics))
-      warning("AES intrinsics not available on this CPU");
+      warning("AES intrinsics are not available on this CPU");
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
 
+  // Adjust RTM (Restricted Transactional Memory) flags
+  if (!supports_rtm() && UseRTMLocking) {
+    // Can't continue because UseRTMLocking affects UseBiasedLocking flag
+    // setting during arguments processing. See use_biased_locking().
+    // VM_Version_init() is executed after UseBiasedLocking is used
+    // in Thread::allocate().
+    vm_exit_during_initialization("RTM instructions are not available on this CPU");
+  }
+
+#if INCLUDE_RTM_OPT
+  if (UseRTMLocking) {
+    if (!FLAG_IS_CMDLINE(UseRTMLocking)) {
+      // RTM locking should be used only for applications with
+      // high lock contention. For now we do not use it by default.
+      vm_exit_during_initialization("UseRTMLocking flag should be only set on command line");
+    }
+    if (!is_power_of_2(RTMTotalCountIncrRate)) {
+      warning("RTMTotalCountIncrRate must be a power of 2, resetting it to 64");
+      FLAG_SET_DEFAULT(RTMTotalCountIncrRate, 64);
+    }
+    if (RTMAbortRatio < 0 || RTMAbortRatio > 100) {
+      warning("RTMAbortRatio must be in the range 0 to 100, resetting it to 50");
+      FLAG_SET_DEFAULT(RTMAbortRatio, 50);
+    }
+  } else { // !UseRTMLocking
+    if (UseRTMForStackLocks) {
+      if (!FLAG_IS_DEFAULT(UseRTMForStackLocks)) {
+        warning("UseRTMForStackLocks flag should be off when UseRTMLocking flag is off");
+      }
+      FLAG_SET_DEFAULT(UseRTMForStackLocks, false);
+    }
+    if (UseRTMDeopt) {
+      FLAG_SET_DEFAULT(UseRTMDeopt, false);
+    }
+    if (PrintPreciseRTMLockingStatistics) {
+      FLAG_SET_DEFAULT(PrintPreciseRTMLockingStatistics, false);
+    }
+  }
+#else
+  if (UseRTMLocking) {
+    // Only C2 does RTM locking optimization.
+    // Can't continue because UseRTMLocking affects UseBiasedLocking flag
+    // setting during arguments processing. See use_biased_locking().
+    vm_exit_during_initialization("RTM locking optimization is not supported in this VM");
+  }
+#endif
+
 #ifdef COMPILER2
   if (UseFPUForSpilling) {
     if (UseSSE < 2) {
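
The RTM flag block above only accepts UseRTMLocking when it was given explicitly on the command line, forces RTMTotalCountIncrRate to a power of two, and clamps RTMAbortRatio to the range 0..100. A minimal sketch of the validation it performs (illustrative; the _sketch names are placeholders, and the power-of-two test is the usual x & (x - 1) trick):

#include <cstdint>

// A positive value is a power of two iff it has exactly one bit set.
inline bool is_power_of_2_sketch(int64_t x) {
  return x > 0 && (x & (x - 1)) == 0;
}

// Shape of the RTM tuning-flag validation: reset bad values to the
// defaults mentioned in the warnings (64 and 50).
void validate_rtm_flags_sketch(int64_t& total_count_incr_rate, int64_t& abort_ratio) {
  if (!is_power_of_2_sketch(total_count_incr_rate)) total_count_incr_rate = 64;
  if (abort_ratio < 0 || abort_ratio > 100)         abort_ratio = 50;
}
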
@@ -538,14 +634,28 @@
     if (MaxVectorSize > 32) {
       FLAG_SET_DEFAULT(MaxVectorSize, 32);
     }
-    if (MaxVectorSize > 16 && UseAVX == 0) {
-      // Only supported with AVX+
+    if (MaxVectorSize > 16 && (UseAVX == 0 || !os_supports_avx_vectors())) {
+      // 32-byte vectors (in YMM) are only supported with AVX+
       FLAG_SET_DEFAULT(MaxVectorSize, 16);
     }
     if (UseSSE < 2) {
-      // Only supported with SSE2+
+      // Vectors (in XMM) are only supported with SSE2+
       FLAG_SET_DEFAULT(MaxVectorSize, 0);
     }
+#ifdef ASSERT
+    if (supports_avx() && PrintMiscellaneous && Verbose && TraceNewVectors) {
+      tty->print_cr("State of YMM registers after signal handling:");
+      int nreg = 2 LP64_ONLY(+2);
+      const char* ymm_name[4] = {"0", "7", "8", "15"};
+      for (int i = 0; i < nreg; i++) {
+        tty->print("YMM%s:", ymm_name[i]);
+        for (int j = 7; j >= 0; j--) {
+          tty->print(" %x", _cpuid_info.ymm_save[i*8 + j]);
+        }
+        tty->cr();
+      }
+    }
+#endif
   }
 #endif
 
@@ -600,13 +710,6 @@
       }
     }
 
-    // Use count leading zeros count instruction if available.
-    if (supports_lzcnt()) {
-      if (FLAG_IS_DEFAULT(UseCountLeadingZerosInstruction)) {
-        UseCountLeadingZerosInstruction = true;
-      }
-    }
-
     // some defaults for AMD family 15h
     if ( cpu_family() == 0x15 ) {
       // On family 15h processors default is no sw prefetch
@@ -683,14 +786,35 @@
       }
     }
   }
-#if defined(COMPILER2) && defined(_ALLBSD_SOURCE)
-    if (MaxVectorSize > 16) {
-      // Limit vectors size to 16 bytes on BSD until it fixes
-      // restoring upper 128bit of YMM registers on return
-      // from signal handler.
-      FLAG_SET_DEFAULT(MaxVectorSize, 16);
+
+  // Use the count leading zeros instruction if available.
+  if (supports_lzcnt()) {
+    if (FLAG_IS_DEFAULT(UseCountLeadingZerosInstruction)) {
+      UseCountLeadingZerosInstruction = true;
     }
-#endif // COMPILER2
+  } else if (UseCountLeadingZerosInstruction) {
+    warning("lzcnt instruction is not available on this CPU");
+    FLAG_SET_DEFAULT(UseCountLeadingZerosInstruction, false);
+  }
+
+  if (supports_bmi1()) {
+    if (FLAG_IS_DEFAULT(UseBMI1Instructions)) {
+      UseBMI1Instructions = true;
+    }
+  } else if (UseBMI1Instructions) {
+    warning("BMI1 instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseBMI1Instructions, false);
+  }
+
+  // Use the count trailing zeros instruction if available.
+  if (supports_bmi1()) {
+    if (FLAG_IS_DEFAULT(UseCountTrailingZerosInstruction)) {
+      UseCountTrailingZerosInstruction = UseBMI1Instructions;
+    }
+  } else if (UseCountTrailingZerosInstruction) {
+    warning("tzcnt instruction is not available on this CPU");
+    FLAG_SET_DEFAULT(UseCountTrailingZerosInstruction, false);
+  }
 
   // Use population count instruction if available.
   if (supports_popcnt()) {
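
UseCountLeadingZerosInstruction and UseCountTrailingZerosInstruction select LZCNT/TZCNT over BSR/BSF. The observable difference is the zero input: LZCNT/TZCNT return the operand width (and set CF), while BSR/BSF leave the destination undefined. A portable sketch of the values the new instructions produce (assumes the GCC/Clang builtins; names are placeholders):

#include <cstdint>

// TZCNT semantics for a 32-bit operand: trailing zero count, 32 for input 0.
inline uint32_t tzcnt32_sketch(uint32_t x) {
  return x ? (uint32_t)__builtin_ctz(x) : 32;  // builtin is undefined for 0, hence the guard
}

// LZCNT is the mirror image: leading zero count, 32 for input 0.
inline uint32_t lzcnt32_sketch(uint32_t x) {
  return x ? (uint32_t)__builtin_clz(x) : 32;
}
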
@@ -790,6 +914,11 @@
     if (UseAES) {
       tty->print("  UseAES=1");
     }
+#ifdef COMPILER2
+    if (MaxVectorSize > 0) {
+      tty->print("  MaxVectorSize=%d", MaxVectorSize);
+    }
+#endif
     tty->cr();
     tty->print("Allocation");
     if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) {
@@ -832,6 +961,27 @@
 #endif // !PRODUCT
 }
 
+bool VM_Version::use_biased_locking() {
+#if INCLUDE_RTM_OPT
+  // RTM locking is most useful when there is high lock contention and
+  // low data contention.  With high lock contention the lock is usually
+  // inflated and biased locking is not suitable for that case.
+  // RTM locking code requires that biased locking is off.
+  // Note: we can't switch off UseBiasedLocking in get_processor_features()
+  // because it is used by Thread::allocate() which is called before
+  // VM_Version::initialize().
+  if (UseRTMLocking && UseBiasedLocking) {
+    if (FLAG_IS_DEFAULT(UseBiasedLocking)) {
+      FLAG_SET_DEFAULT(UseBiasedLocking, false);
+    } else {
+      warning("Biased locking is not supported with RTM locking; ignoring UseBiasedLocking flag.");
+      UseBiasedLocking = false;
+    }
+  }
+#endif
+  return UseBiasedLocking;
+}
+
 void VM_Version::initialize() {
   ResourceMark rm;
   // Making this stub must be FIRST use of assembler
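
The new use_biased_locking() above "overrides" Abstract_VM_Version by static-function hiding, not virtual dispatch: shared code names VM_Version::use_biased_locking() and so resolves the RTM/biased-locking conflict before Thread::allocate() consumes the flag. A minimal sketch of that mechanism (the _Sketch names and rtm_requested flag are placeholders, not HotSpot identifiers):

#include <cstdio>

struct Abstract_VM_Version_Sketch {
  static bool use_biased_locking() { return true; }   // generic default
};

struct VM_Version_Sketch : Abstract_VM_Version_Sketch {
  static bool rtm_requested;                          // stands in for UseRTMLocking
  static bool use_biased_locking() {                  // hides the base-class version
    return rtm_requested ? false
                         : Abstract_VM_Version_Sketch::use_biased_locking();
  }
};

bool VM_Version_Sketch::rtm_requested = true;

int main() {
  std::printf("%d\n", VM_Version_Sketch::use_biased_locking());  // prints 0
  return 0;
}
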
--- a/src/cpu/x86/vm/vm_version_x86.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/cpu/x86/vm/vm_version_x86.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -141,7 +141,8 @@
     struct {
       uint32_t LahfSahf     : 1,
                CmpLegacy    : 1,
-                            : 4,
+                            : 3,
+               lzcnt_intel  : 1,
                lzcnt        : 1,
                sse4a        : 1,
                misalignsse  : 1,
@@ -206,7 +207,9 @@
                         : 2,
                    bmi2 : 1,
                    erms : 1,
-                        : 22;
+                        : 1,
+                   rtm  : 1,
+                        : 20;
     } bits;
   };
 
@@ -228,6 +231,9 @@
                                // 0 if this instruction is not available
   static const char* _features_str;
 
+  static address   _cpuinfo_segv_addr; // address of instruction which causes SEGV
+  static address   _cpuinfo_cont_addr; // address of instruction after the one which causes SEGV
+
   enum {
     CPU_CX8    = (1 << 0), // next bits are from cpuid 1 (EDX)
     CPU_CMOV   = (1 << 1),
@@ -251,7 +257,10 @@
     CPU_AVX2   = (1 << 18),
     CPU_AES    = (1 << 19),
     CPU_ERMS   = (1 << 20), // enhanced 'rep movsb/stosb' instructions
-    CPU_CLMUL  = (1 << 21) // carryless multiply for CRC
+    CPU_CLMUL  = (1 << 21), // carryless multiply for CRC
+    CPU_BMI1   = (1 << 22),
+    CPU_BMI2   = (1 << 23),
+    CPU_RTM    = (1 << 24)  // Restricted Transactional Memory instructions
   } cpuFeatureFlags;
 
   enum {
@@ -358,6 +367,9 @@
     // extended control register XCR0 (the XFEATURE_ENABLED_MASK register)
     XemXcr0Eax   xem_xcr0_eax;
     uint32_t     xem_xcr0_edx; // reserved
+
+    // Space to save ymm registers after signal handling
+    int          ymm_save[8*4]; // Save ymm0, ymm7, ymm8, ymm15
   };
 
   // The actual cpuid info block
@@ -423,6 +435,8 @@
       if (_cpuid_info.sef_cpuid7_ebx.bits.avx2 != 0)
         result |= CPU_AVX2;
     }
+    if (_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
+      result |= CPU_BMI1;
     if (_cpuid_info.std_cpuid1_edx.bits.tsc != 0)
       result |= CPU_TSC;
     if (_cpuid_info.ext_cpuid7_edx.bits.tsc_invariance != 0)
@@ -433,6 +447,8 @@
       result |= CPU_ERMS;
     if (_cpuid_info.std_cpuid1_ecx.bits.clmul != 0)
       result |= CPU_CLMUL;
+    if (_cpuid_info.sef_cpuid7_ebx.bits.rtm != 0)
+      result |= CPU_RTM;
 
     // AMD features.
     if (is_amd()) {
@@ -444,10 +460,32 @@
       if (_cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0)
         result |= CPU_SSE4A;
     }
+    // Intel features.
+    if (is_intel()) {
+      if (_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
+        result |= CPU_BMI2;
+      if (_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
+        result |= CPU_LZCNT;
+    }
 
     return result;
   }
 
+  static bool os_supports_avx_vectors() {
+    if (!supports_avx()) {
+      return false;
+    }
+    // Verify that the OS saves/restores all bits of the AVX registers
+    // during signal processing.
+    int nreg = 2 LP64_ONLY(+2);
+    for (int i = 0; i < 8 * nreg; i++) { // 32 bytes per ymm register
+      if (_cpuid_info.ymm_save[i] != ymm_test_value()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
   static void get_processor_features();
 
 public:
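
The new CPU_BMI1, CPU_BMI2 and CPU_RTM bits above are derived from CPUID leaf 7, subleaf 0: EBX bit 3 (BMI1), bit 8 (BMI2) and bit 11 (RTM). A standalone probe of the same bits using the GCC/Clang <cpuid.h> helper, available in recent compilers (illustrative; bit positions per the Intel SDM):

#include <cpuid.h>
#include <cstdio>

int main() {
  unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
  // Leaf 7, subleaf 0: structured extended feature flags.
  if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
    std::printf("bmi1=%u bmi2=%u rtm=%u\n",
                (ebx >> 3) & 1u, (ebx >> 8) & 1u, (ebx >> 11) & 1u);
  } else {
    std::printf("CPUID leaf 7 not supported\n");
  }
  return 0;
}
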
@@ -464,10 +502,26 @@
   static ByteSize tpl_cpuidB1_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB1_eax); }
   static ByteSize tpl_cpuidB2_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB2_eax); }
   static ByteSize xem_xcr0_offset() { return byte_offset_of(CpuidInfo, xem_xcr0_eax); }
+  static ByteSize ymm_save_offset() { return byte_offset_of(CpuidInfo, ymm_save); }
+
+  // The value used to check the ymm registers after signal handling
+  static int ymm_test_value()    { return 0xCAFEBABE; }
+
+  static void set_cpuinfo_segv_addr(address pc) { _cpuinfo_segv_addr = pc; }
+  static bool  is_cpuinfo_segv_addr(address pc) { return _cpuinfo_segv_addr == pc; }
+  static void set_cpuinfo_cont_addr(address pc) { _cpuinfo_cont_addr = pc; }
+  static address  cpuinfo_cont_addr()           { return _cpuinfo_cont_addr; }
+
+  static void clean_cpuFeatures()   { _cpuFeatures = 0; }
+  static void set_avx_cpuFeatures() { _cpuFeatures = (CPU_SSE | CPU_SSE2 | CPU_AVX); }
+
 
   // Initialization
   static void initialize();
 
+  // Override Abstract_VM_Version implementation
+  static bool use_biased_locking();
+
   // Asserts
   static void assert_is_initialized() {
     assert(_cpuid_info.std_cpuid1_eax.bits.family != 0, "VM_Version not initialized");
@@ -560,7 +614,9 @@
   static bool supports_aes()      { return (_cpuFeatures & CPU_AES) != 0; }
   static bool supports_erms()     { return (_cpuFeatures & CPU_ERMS) != 0; }
   static bool supports_clmul()    { return (_cpuFeatures & CPU_CLMUL) != 0; }
-
+  static bool supports_rtm()      { return (_cpuFeatures & CPU_RTM) != 0; }
+  static bool supports_bmi1()     { return (_cpuFeatures & CPU_BMI1) != 0; }
+  static bool supports_bmi2()     { return (_cpuFeatures & CPU_BMI2) != 0; }
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
                                        extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/src/cpu/x86/vm/x86_32.ad	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/cpu/x86/vm/x86_32.ad	Tue Mar 25 17:07:36 2014 -0700
@@ -1489,19 +1489,6 @@
   return EBP_REG_mask();
 }
 
-const RegMask Matcher::mathExactI_result_proj_mask() {
-  return EAX_REG_mask();
-}
-
-const RegMask Matcher::mathExactL_result_proj_mask() {
-  ShouldNotReachHere();
-  return RegMask();
-}
-
-const RegMask Matcher::mathExactI_flags_proj_mask() {
-  return INT_FLAGS_mask();
-}
-
 // Returns true if the high 32 bits of the value is known to be zero.
 bool is_operand_hi32_zero(Node* n) {
   int opc = n->Opcode();
@@ -2865,542 +2852,6 @@
     emit_d8    (cbuf,0 );
   %}
 
-
-  // Because the transitions from emitted code to the runtime
-  // monitorenter/exit helper stubs are so slow it's critical that
-  // we inline both the stack-locking fast-path and the inflated fast path.
-  //
-  // See also: cmpFastLock and cmpFastUnlock.
-  //
-  // What follows is a specialized inline transliteration of the code
-  // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
-  // another option would be to emit TrySlowEnter and TrySlowExit methods
-  // at startup-time.  These methods would accept arguments as
-  // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
-  // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
-  // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
-  // In practice, however, the # of lock sites is bounded and is usually small.
-  // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
-  // if the processor uses simple bimodal branch predictors keyed by EIP
-  // Since the helper routines would be called from multiple synchronization
-  // sites.
-  //
-  // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
-  // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
-  // to those specialized methods.  That'd give us a mostly platform-independent
-  // implementation that the JITs could optimize and inline at their pleasure.
-  // Done correctly, the only time we'd need to cross to native could would be
-  // to park() or unpark() threads.  We'd also need a few more unsafe operators
-  // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
-  // (b) explicit barriers or fence operations.
-  //
-  // TODO:
-  //
-  // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
-  //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
-  //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
-  //    the lock operators would typically be faster than reifying Self.
-  //
-  // *  Ideally I'd define the primitives as:
-  //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
-  //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
-  //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
-  //    Instead, we're stuck with a rather awkward and brittle register assignments below.
-  //    Furthermore the register assignments are overconstrained, possibly resulting in
-  //    sub-optimal code near the synchronization site.
-  //
-  // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
-  //    Alternately, use a better sp-proximity test.
-  //
-  // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
-  //    Either one is sufficient to uniquely identify a thread.
-  //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
-  //
-  // *  Intrinsify notify() and notifyAll() for the common cases where the
-  //    object is locked by the calling thread but the waitlist is empty.
-  //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
-  //
-  // *  use jccb and jmpb instead of jcc and jmp to improve code density.
-  //    But beware of excessive branch density on AMD Opterons.
-  //
-  // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
-  //    or failure of the fast-path.  If the fast-path fails then we pass
-  //    control to the slow-path, typically in C.  In Fast_Lock and
-  //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
-  //    will emit a conditional branch immediately after the node.
-  //    So we have branches to branches and lots of ICC.ZF games.
-  //    Instead, it might be better to have C2 pass a "FailureLabel"
-  //    into Fast_Lock and Fast_Unlock.  In the case of success, control
-  //    will drop through the node.  ICC.ZF is undefined at exit.
-  //    In the case of failure, the node will branch directly to the
-  //    FailureLabel
-
-
-  // obj: object to lock
-  // box: on-stack box address (displaced header location) - KILLED
-  // rax,: tmp -- KILLED
-  // scr: tmp -- KILLED
-  enc_class Fast_Lock( eRegP obj, eRegP box, eAXRegI tmp, eRegP scr ) %{
-
-    Register objReg = as_Register($obj$$reg);
-    Register boxReg = as_Register($box$$reg);
-    Register tmpReg = as_Register($tmp$$reg);
-    Register scrReg = as_Register($scr$$reg);
-
-    // Ensure the register assignents are disjoint
-    guarantee (objReg != boxReg, "") ;
-    guarantee (objReg != tmpReg, "") ;
-    guarantee (objReg != scrReg, "") ;
-    guarantee (boxReg != tmpReg, "") ;
-    guarantee (boxReg != scrReg, "") ;
-    guarantee (tmpReg == as_Register(EAX_enc), "") ;
-
-    MacroAssembler masm(&cbuf);
-
-    if (_counters != NULL) {
-      masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
-    }
-    if (EmitSync & 1) {
-        // set box->dhw = unused_mark (3)
-        // Force all sync thru slow-path: slow_enter() and slow_exit() 
-        masm.movptr (Address(boxReg, 0), int32_t(markOopDesc::unused_mark())) ;             
-        masm.cmpptr (rsp, (int32_t)0) ;                        
-    } else 
-    if (EmitSync & 2) { 
-        Label DONE_LABEL ;           
-        if (UseBiasedLocking) {
-           // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
-           masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
-        }
-
-        masm.movptr(tmpReg, Address(objReg, 0)) ;          // fetch markword 
-        masm.orptr (tmpReg, 0x1);
-        masm.movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS 
-        if (os::is_MP()) { masm.lock();  }
-        masm.cmpxchgptr(boxReg, Address(objReg, 0));          // Updates tmpReg
-        masm.jcc(Assembler::equal, DONE_LABEL);
-        // Recursive locking
-        masm.subptr(tmpReg, rsp);
-        masm.andptr(tmpReg, (int32_t) 0xFFFFF003 );
-        masm.movptr(Address(boxReg, 0), tmpReg);
-        masm.bind(DONE_LABEL) ; 
-    } else {  
-      // Possible cases that we'll encounter in fast_lock 
-      // ------------------------------------------------
-      // * Inflated
-      //    -- unlocked
-      //    -- Locked
-      //       = by self
-      //       = by other
-      // * biased
-      //    -- by Self
-      //    -- by other
-      // * neutral
-      // * stack-locked
-      //    -- by self
-      //       = sp-proximity test hits
-      //       = sp-proximity test generates false-negative
-      //    -- by other
-      //
-
-      Label IsInflated, DONE_LABEL, PopDone ;
-
-      // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
-      // order to reduce the number of conditional branches in the most common cases.
-      // Beware -- there's a subtle invariant that fetch of the markword
-      // at [FETCH], below, will never observe a biased encoding (*101b).
-      // If this invariant is not held we risk exclusion (safety) failure.
-      if (UseBiasedLocking && !UseOptoBiasInlining) {
-        masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
-      }
-
-      masm.movptr(tmpReg, Address(objReg, 0)) ;         // [FETCH]
-      masm.testptr(tmpReg, 0x02) ;                      // Inflated v (Stack-locked or neutral)
-      masm.jccb  (Assembler::notZero, IsInflated) ;
-
-      // Attempt stack-locking ...
-      masm.orptr (tmpReg, 0x1);
-      masm.movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
-      if (os::is_MP()) { masm.lock();  }
-      masm.cmpxchgptr(boxReg, Address(objReg, 0));           // Updates tmpReg
-      if (_counters != NULL) {
-        masm.cond_inc32(Assembler::equal,
-                        ExternalAddress((address)_counters->fast_path_entry_count_addr()));
-      }
-      masm.jccb (Assembler::equal, DONE_LABEL);
-
-      // Recursive locking
-      masm.subptr(tmpReg, rsp);
-      masm.andptr(tmpReg, 0xFFFFF003 );
-      masm.movptr(Address(boxReg, 0), tmpReg);
-      if (_counters != NULL) {
-        masm.cond_inc32(Assembler::equal,
-                        ExternalAddress((address)_counters->fast_path_entry_count_addr()));
-      }
-      masm.jmp  (DONE_LABEL) ;
-
-      masm.bind (IsInflated) ;
-
-      // The object is inflated.
-      //
-      // TODO-FIXME: eliminate the ugly use of manifest constants:
-      //   Use markOopDesc::monitor_value instead of "2".
-      //   use markOop::unused_mark() instead of "3".
-      // The tmpReg value is an objectMonitor reference ORed with
-      // markOopDesc::monitor_value (2).   We can either convert tmpReg to an
-      // objectmonitor pointer by masking off the "2" bit or we can just
-      // use tmpReg as an objectmonitor pointer but bias the objectmonitor
-      // field offsets with "-2" to compensate for and annul the low-order tag bit.
-      //
-      // I use the latter as it avoids AGI stalls.
-      // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
-      // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
-      //
-      #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
-
-      // boxReg refers to the on-stack BasicLock in the current frame.
-      // We'd like to write:
-      //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
-      // This is convenient but results a ST-before-CAS penalty.  The following CAS suffers
-      // additional latency as we have another ST in the store buffer that must drain.
-
-      if (EmitSync & 8192) { 
-         masm.movptr(Address(boxReg, 0), 3) ;            // results in ST-before-CAS penalty
-         masm.get_thread (scrReg) ; 
-         masm.movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2] 
-         masm.movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
-         if (os::is_MP()) { masm.lock(); } 
-         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
-      } else 
-      if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
-         masm.movptr(scrReg, boxReg) ; 
-         masm.movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2] 
-
-         // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
-         if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
-            // prefetchw [eax + Offset(_owner)-2]
-            masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
-         }
-
-         if ((EmitSync & 64) == 0) {
-           // Optimistic form: consider XORL tmpReg,tmpReg
-           masm.movptr(tmpReg, NULL_WORD) ; 
-         } else { 
-           // Can suffer RTS->RTO upgrades on shared or cold $ lines
-           // Test-And-CAS instead of CAS
-           masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
-           masm.testptr(tmpReg, tmpReg) ;                   // Locked ? 
-           masm.jccb  (Assembler::notZero, DONE_LABEL) ;                   
-         }
-
-         // Appears unlocked - try to swing _owner from null to non-null.
-         // Ideally, I'd manifest "Self" with get_thread and then attempt
-         // to CAS the register containing Self into m->Owner.
-         // But we don't have enough registers, so instead we can either try to CAS
-         // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
-         // we later store "Self" into m->Owner.  Transiently storing a stack address
-         // (rsp or the address of the box) into  m->owner is harmless.
-         // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
-         if (os::is_MP()) { masm.lock();  }
-         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
-         masm.movptr(Address(scrReg, 0), 3) ;          // box->_displaced_header = 3
-         masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
-         masm.get_thread (scrReg) ;                    // beware: clobbers ICCs
-         masm.movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ; 
-         masm.xorptr(boxReg, boxReg) ;                 // set icc.ZFlag = 1 to indicate success
-                       
-         // If the CAS fails we can either retry or pass control to the slow-path.  
-         // We use the latter tactic.  
-         // Pass the CAS result in the icc.ZFlag into DONE_LABEL
-         // If the CAS was successful ...
-         //   Self has acquired the lock
-         //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
-         // Intentional fall-through into DONE_LABEL ...
-      } else {
-         masm.movptr(Address(boxReg, 0), 3) ;       // results in ST-before-CAS penalty
-         masm.movptr(boxReg, tmpReg) ; 
-
-         // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
-         if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
-            // prefetchw [eax + Offset(_owner)-2]
-            masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
-         }
-
-         if ((EmitSync & 64) == 0) {
-           // Optimistic form
-           masm.xorptr  (tmpReg, tmpReg) ; 
-         } else { 
-           // Can suffer RTS->RTO upgrades on shared or cold $ lines
-           masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
-           masm.testptr(tmpReg, tmpReg) ;                   // Locked ? 
-           masm.jccb  (Assembler::notZero, DONE_LABEL) ;                   
-         }
-
-         // Appears unlocked - try to swing _owner from null to non-null.
-         // Use either "Self" (in scr) or rsp as thread identity in _owner.
-         // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
-         masm.get_thread (scrReg) ;
-         if (os::is_MP()) { masm.lock(); }
-         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
-
-         // If the CAS fails we can either retry or pass control to the slow-path.
-         // We use the latter tactic.
-         // Pass the CAS result in the icc.ZFlag into DONE_LABEL
-         // If the CAS was successful ...
-         //   Self has acquired the lock
-         //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
-         // Intentional fall-through into DONE_LABEL ...
-      }
-
-      // DONE_LABEL is a hot target - we'd really like to place it at the
-      // start of cache line by padding with NOPs.
-      // See the AMD and Intel software optimization manuals for the
-      // most efficient "long" NOP encodings.
-      // Unfortunately none of our alignment mechanisms suffice.
-      masm.bind(DONE_LABEL);
-
-      // Avoid branch-to-branch on AMD processors
-      // This appears to be superstition.
-      if (EmitSync & 32) masm.nop() ;
-
-
-      // At DONE_LABEL the icc ZFlag is set as follows ...
-      // Fast_Unlock uses the same protocol.
-      // ZFlag == 1 -> Success
-      // ZFlag == 0 -> Failure - force control through the slow-path
-    }
-  %}
-
-  // obj: object to unlock
-  // box: box address (displaced header location), killed.  Must be EAX.
-  // rbx,: killed tmp; cannot be obj nor box.
-  //
-  // Some commentary on balanced locking:
-  //
-  // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
-  // Methods that don't have provably balanced locking are forced to run in the
-  // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
-  // The interpreter provides two properties:
-  // I1:  At return-time the interpreter automatically and quietly unlocks any
-  //      objects acquired the current activation (frame).  Recall that the
-  //      interpreter maintains an on-stack list of locks currently held by
-  //      a frame.
-  // I2:  If a method attempts to unlock an object that is not held by the
-  //      the frame the interpreter throws IMSX.
-  //
-  // Lets say A(), which has provably balanced locking, acquires O and then calls B().
-  // B() doesn't have provably balanced locking so it runs in the interpreter.
-  // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
-  // is still locked by A().
-  //
-  // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
-  // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
-  // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
-  // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
-
-  enc_class Fast_Unlock( nabxRegP obj, eAXRegP box, eRegP tmp) %{
-
-    Register objReg = as_Register($obj$$reg);
-    Register boxReg = as_Register($box$$reg);
-    Register tmpReg = as_Register($tmp$$reg);
-
-    guarantee (objReg != boxReg, "") ;
-    guarantee (objReg != tmpReg, "") ;
-    guarantee (boxReg != tmpReg, "") ;
-    guarantee (boxReg == as_Register(EAX_enc), "") ;
-    MacroAssembler masm(&cbuf);
-
-    if (EmitSync & 4) {
-      // Disable - inhibit all inlining.  Force control through the slow-path
-      masm.cmpptr (rsp, 0) ; 
-    } else 
-    if (EmitSync & 8) {
-      Label DONE_LABEL ;
-      if (UseBiasedLocking) {
-         masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
-      }
-      // classic stack-locking code ...
-      masm.movptr(tmpReg, Address(boxReg, 0)) ;
-      masm.testptr(tmpReg, tmpReg) ;
-      masm.jcc   (Assembler::zero, DONE_LABEL) ;
-      if (os::is_MP()) { masm.lock(); }
-      masm.cmpxchgptr(tmpReg, Address(objReg, 0));          // Uses EAX which is box
-      masm.bind(DONE_LABEL);
-    } else {
-      Label DONE_LABEL, Stacked, CheckSucc, Inflated ;
-
-      // Critically, the biased locking test must have precedence over
-      // and appear before the (box->dhw == 0) recursive stack-lock test.
-      if (UseBiasedLocking && !UseOptoBiasInlining) {
-         masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
-      }
-      
-      masm.cmpptr(Address(boxReg, 0), 0) ;            // Examine the displaced header
-      masm.movptr(tmpReg, Address(objReg, 0)) ;       // Examine the object's markword
-      masm.jccb  (Assembler::zero, DONE_LABEL) ;      // 0 indicates recursive stack-lock
-
-      masm.testptr(tmpReg, 0x02) ;                     // Inflated? 
-      masm.jccb  (Assembler::zero, Stacked) ;
-
-      masm.bind  (Inflated) ;
-      // It's inflated.
-      // Despite our balanced locking property we still check that m->_owner == Self
-      // as java routines or native JNI code called by this thread might
-      // have released the lock.
-      // Refer to the comments in synchronizer.cpp for how we might encode extra
-      // state in _succ so we can avoid fetching EntryList|cxq.
-      //
-      // I'd like to add more cases in fast_lock() and fast_unlock() --
-      // such as recursive enter and exit -- but we have to be wary of
-      // I$ bloat, T$ effects and BP$ effects.
-      //
-      // If there's no contention try a 1-0 exit.  That is, exit without
-      // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
-      // we detect and recover from the race that the 1-0 exit admits.
-      //
-      // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
-      // before it STs null into _owner, releasing the lock.  Updates
-      // to data protected by the critical section must be visible before
-      // we drop the lock (and thus before any other thread could acquire
-      // the lock and observe the fields protected by the lock).
-      // IA32's memory-model is SPO, so STs are ordered with respect to
-      // each other and there's no need for an explicit barrier (fence).
-      // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
-
-      masm.get_thread (boxReg) ;
-      if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
-        // prefetchw [ebx + Offset(_owner)-2]
-        masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
-      }
-
-      // Note that we could employ various encoding schemes to reduce
-      // the number of loads below (currently 4) to just 2 or 3.
-      // Refer to the comments in synchronizer.cpp.
-      // In practice the chain of fetches doesn't seem to impact performance, however.
-      if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
-         // Attempt to reduce branch density - AMD's branch predictor.
-         masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
-         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
-         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
-         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
-         masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
-         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
-         masm.jmpb  (DONE_LABEL) ; 
-      } else { 
-         masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
-         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
-         masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
-         masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
-         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
-         masm.jccb  (Assembler::notZero, CheckSucc) ; 
-         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
-         masm.jmpb  (DONE_LABEL) ; 
-      }
-
-      // The Following code fragment (EmitSync & 65536) improves the performance of
-      // contended applications and contended synchronization microbenchmarks.
-      // Unfortunately the emission of the code - even though not executed - causes regressions
-      // in scimark and jetstream, evidently because of $ effects.  Replacing the code
-      // with an equal number of never-executed NOPs results in the same regression.
-      // We leave it off by default.
-
-      if ((EmitSync & 65536) != 0) {
-         Label LSuccess, LGoSlowPath ;
-
-         masm.bind  (CheckSucc) ;
-
-         // Optional pre-test ... it's safe to elide this
-         if ((EmitSync & 16) == 0) { 
-            masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
-            masm.jccb  (Assembler::zero, LGoSlowPath) ; 
-         }
-
-         // We have a classic Dekker-style idiom:
-         //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
-         // There are a number of ways to implement the barrier:
-         // (1) lock:andl &m->_owner, 0
-         //     is fast, but mask doesn't currently support the "ANDL M,IMM32" form.
-         //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
-         //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
-         // (2) If supported, an explicit MFENCE is appealing.
-         //     In older IA32 processors MFENCE is slower than lock:add or xchg
-         //     particularly if the write-buffer is full as might be the case if
-         //     if stores closely precede the fence or fence-equivalent instruction.
-         //     In more modern implementations MFENCE appears faster, however.
-         // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
-         //     The $lines underlying the top-of-stack should be in M-state.
-         //     The locked add instruction is serializing, of course.
-         // (4) Use xchg, which is serializing
-         //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
-         // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
-         //     The integer condition codes will tell us if succ was 0.
-         //     Since _succ and _owner should reside in the same $line and
-         //     we just stored into _owner, it's likely that the $line
-         //     remains in M-state for the lock:orl.
-         //
-         // We currently use (3), although it's likely that switching to (2)
-         // is correct for the future.
-            
-         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
-         if (os::is_MP()) { 
-            if (VM_Version::supports_sse2() && 1 == FenceInstruction) { 
-              masm.mfence();
-            } else { 
-              masm.lock () ; masm.addptr(Address(rsp, 0), 0) ; 
-            }
-         }
-         // Ratify _succ remains non-null
-         masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
-         masm.jccb  (Assembler::notZero, LSuccess) ; 
-
-         masm.xorptr(boxReg, boxReg) ;                  // box is really EAX
-         if (os::is_MP()) { masm.lock(); }
-         masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
-         masm.jccb  (Assembler::notEqual, LSuccess) ;
-         // Since we're low on registers we installed rsp as a placeholding in _owner.
-         // Now install Self over rsp.  This is safe as we're transitioning from
-         // non-null to non=null
-         masm.get_thread (boxReg) ;
-         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
-         // Intentional fall-through into LGoSlowPath ...
-
-         masm.bind  (LGoSlowPath) ; 
-         masm.orptr(boxReg, 1) ;                      // set ICC.ZF=0 to indicate failure
-         masm.jmpb  (DONE_LABEL) ; 
-
-         masm.bind  (LSuccess) ; 
-         masm.xorptr(boxReg, boxReg) ;                 // set ICC.ZF=1 to indicate success
-         masm.jmpb  (DONE_LABEL) ; 
-      }
-
-      masm.bind (Stacked) ;
-      // It's not inflated and it's not recursively stack-locked and it's not biased.
-      // It must be stack-locked.
-      // Try to reset the header to displaced header.
-      // The "box" value on the stack is stable, so we can reload
-      // and be assured we observe the same value as above.
-      masm.movptr(tmpReg, Address(boxReg, 0)) ;
-      if (os::is_MP()) {   masm.lock();    }
-      masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box
-      // Intention fall-thru into DONE_LABEL
-
-
-      // DONE_LABEL is a hot target - we'd really like to place it at the
-      // start of cache line by padding with NOPs.
-      // See the AMD and Intel software optimization manuals for the
-      // most efficient "long" NOP encodings.
-      // Unfortunately none of our alignment mechanisms suffice.
-      if ((EmitSync & 65536) == 0) {
-         masm.bind (CheckSucc) ;
-      }
-      masm.bind(DONE_LABEL);
-
-      // Avoid branch to branch on AMD processors
-      if (EmitSync & 32768) { masm.nop() ; }
-    }
-  %}
-
-
   enc_class enc_pop_rdx() %{
     emit_opcode(cbuf,0x5A);
   %}
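
The Fast_Lock/Fast_Unlock enc_class blocks removed above (their logic now lives in MacroAssembler::fast_lock/fast_unlock, which the rewritten cmpFastLock/cmpFastUnlock instructs call later in this change) implement stack locking by CAS-ing the object's mark word to point at an on-stack BasicLock that holds the displaced header. A minimal sketch of that fast path with std::atomic (simplified layout, not HotSpot's mark word encoding or its biased/recursive/inflated cases; the _Sketch names are placeholders):

#include <atomic>
#include <cstdint>

struct ObjSketch { std::atomic<intptr_t> mark; };   // stands in for the object header
struct BoxSketch { intptr_t displaced_mark; };      // on-stack BasicLock

bool fast_lock_sketch(ObjSketch* obj, BoxSketch* box) {
  // Expect an unlocked mark (low bit set), remember it in the box,
  // then try to install a pointer to the box as the new mark.
  intptr_t unlocked = obj->mark.load(std::memory_order_relaxed) | 0x1;
  box->displaced_mark = unlocked;                    // anticipate a successful CAS
  intptr_t expected = unlocked;
  return obj->mark.compare_exchange_strong(expected,
                                           reinterpret_cast<intptr_t>(box),
                                           std::memory_order_acquire);
}

bool fast_unlock_sketch(ObjSketch* obj, BoxSketch* box) {
  // Restore the displaced header if the object is still stack-locked by this box.
  intptr_t expected = reinterpret_cast<intptr_t>(box);
  return obj->mark.compare_exchange_strong(expected, box->displaced_mark,
                                           std::memory_order_release);
}
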
@@ -5659,6 +5110,19 @@
 %}
 
 instruct countTrailingZerosI(rRegI dst, rRegI src, eFlagsReg cr) %{
+  predicate(UseCountTrailingZerosInstruction);
+  match(Set dst (CountTrailingZerosI src));
+  effect(KILL cr);
+
+  format %{ "TZCNT    $dst, $src\t# count trailing zeros (int)" %}
+  ins_encode %{
+    __ tzcntl($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct countTrailingZerosI_bsf(rRegI dst, rRegI src, eFlagsReg cr) %{
+  predicate(!UseCountTrailingZerosInstruction);
   match(Set dst (CountTrailingZerosI src));
   effect(KILL cr);
 
@@ -5678,6 +5142,30 @@
 %}
 
 instruct countTrailingZerosL(rRegI dst, eRegL src, eFlagsReg cr) %{
+  predicate(UseCountTrailingZerosInstruction);
+  match(Set dst (CountTrailingZerosL src));
+  effect(TEMP dst, KILL cr);
+
+  format %{ "TZCNT  $dst, $src.lo\t# count trailing zeros (long) \n\t"
+            "JNC    done\n\t"
+            "TZCNT  $dst, $src.hi\n\t"
+            "ADD    $dst, 32\n"
+            "done:" %}
+  ins_encode %{
+    Register Rdst = $dst$$Register;
+    Register Rsrc = $src$$Register;
+    Label done;
+    __ tzcntl(Rdst, Rsrc);
+    __ jccb(Assembler::carryClear, done);
+    __ tzcntl(Rdst, HIGH_FROM_LOW(Rsrc));
+    __ addl(Rdst, BitsPerInt);
+    __ bind(done);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct countTrailingZerosL_bsf(rRegI dst, eRegL src, eFlagsReg cr) %{
+  predicate(!UseCountTrailingZerosInstruction);
   match(Set dst (CountTrailingZerosL src));
   effect(TEMP dst, KILL cr);
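
On 32-bit x86 the TZCNT-based countTrailingZerosL form above runs TZCNT on the low half first; TZCNT sets CF exactly when its input is zero, so the JNC skips the high half unless the low half contributed no set bit, in which case the high half is counted and 32 is added. The same arithmetic in portable form (assumes the GCC/Clang builtin; names are placeholders):

#include <cstdint>

static inline uint32_t tzcnt32_sketch(uint32_t x) {
  return x ? (uint32_t)__builtin_ctz(x) : 32;   // TZCNT semantics for zero input
}

uint32_t count_trailing_zeros_long_sketch(uint64_t x) {
  uint32_t lo = (uint32_t)x;
  uint32_t hi = (uint32_t)(x >> 32);
  uint32_t n  = tzcnt32_sketch(lo);
  if (n == 32) {                      // CF would be set: the low half was zero
    n = tzcnt32_sketch(hi) + 32;      // 64 when x == 0, matching two zero-input TZCNTs
  }
  return n;
}
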
 
@@ -7492,44 +6980,6 @@
 //----------Arithmetic Instructions--------------------------------------------
 //----------Addition Instructions----------------------------------------------
 
-instruct addExactI_eReg(eAXRegI dst, rRegI src, eFlagsReg cr)
-%{
-  match(AddExactI dst src);
-  effect(DEF cr);
-
-  format %{ "ADD    $dst, $src\t# addExact int" %}
-  ins_encode %{
-    __ addl($dst$$Register, $src$$Register);
-  %}
-  ins_pipe(ialu_reg_reg);
-%}
-
-instruct addExactI_eReg_imm(eAXRegI dst, immI src, eFlagsReg cr)
-%{
-  match(AddExactI dst src);
-  effect(DEF cr);
-
-  format %{ "ADD    $dst, $src\t# addExact int" %}
-  ins_encode %{
-    __ addl($dst$$Register, $src$$constant);
-  %}
-  ins_pipe(ialu_reg_reg);
-%}
-
-instruct addExactI_eReg_mem(eAXRegI dst, memory src, eFlagsReg cr)
-%{
-  match(AddExactI dst (LoadI src));
-  effect(DEF cr);
-
-  ins_cost(125);
-  format %{ "ADD    $dst,$src\t# addExact int" %}
-  ins_encode %{
-    __ addl($dst$$Register, $src$$Address);
-  %}
-  ins_pipe( ialu_reg_mem );
-%}
-
-
 // Integer Addition Instructions
 instruct addI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (AddI dst src));
@@ -7839,43 +7289,6 @@
 
 //----------Subtraction Instructions-------------------------------------------
 
-instruct subExactI_eReg(eAXRegI dst, rRegI src, eFlagsReg cr)
-%{
-  match(SubExactI dst src);
-  effect(DEF cr);
-
-  format %{ "SUB    $dst, $src\t# subExact int" %}
-  ins_encode %{
-    __ subl($dst$$Register, $src$$Register);
-  %}
-  ins_pipe(ialu_reg_reg);
-%}
-
-instruct subExactI_eReg_imm(eAXRegI dst, immI src, eFlagsReg cr)
-%{
-  match(SubExactI dst src);
-  effect(DEF cr);
-
-  format %{ "SUB    $dst, $src\t# subExact int" %}
-  ins_encode %{
-    __ subl($dst$$Register, $src$$constant);
-  %}
-  ins_pipe(ialu_reg_reg);
-%}
-
-instruct subExactI_eReg_mem(eAXRegI dst, memory src, eFlagsReg cr)
-%{
-  match(SubExactI dst (LoadI src));
-  effect(DEF cr);
-
-  ins_cost(125);
-  format %{ "SUB    $dst,$src\t# subExact int" %}
-  ins_encode %{
-    __ subl($dst$$Register, $src$$Address);
-  %}
-  ins_pipe( ialu_reg_mem );
-%}
-
 // Integer Subtraction Instructions
 instruct subI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (SubI dst src));
@@ -7944,17 +7357,6 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct negExactI_eReg(eAXRegI dst, eFlagsReg cr) %{
-  match(NegExactI dst);
-  effect(DEF cr);
-
-  format %{ "NEG    $dst\t# negExact int"%}
-  ins_encode %{
-    __ negl($dst$$Register);
-  %}
-  ins_pipe(ialu_reg);
-%}
-
 //----------Multiplication/Division Instructions-------------------------------
 // Integer Multiplication Instructions
 // Multiply Register
@@ -8166,46 +7568,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct mulExactI_eReg(eAXRegI dst, rRegI src, eFlagsReg cr)
-%{
-  match(MulExactI dst src);
-  effect(DEF cr);
-
-  ins_cost(300);
-  format %{ "IMUL   $dst, $src\t# mulExact int" %}
-  ins_encode %{
-    __ imull($dst$$Register, $src$$Register);
-  %}
-  ins_pipe(ialu_reg_reg_alu0);
-%}
-
-instruct mulExactI_eReg_imm(eAXRegI dst, rRegI src, immI imm, eFlagsReg cr)
-%{
-  match(MulExactI src imm);
-  effect(DEF cr);
-
-  ins_cost(300);
-  format %{ "IMUL   $dst, $src, $imm\t# mulExact int" %}
-  ins_encode %{
-    __ imull($dst$$Register, $src$$Register, $imm$$constant);
-  %}
-  ins_pipe(ialu_reg_reg_alu0);
-%}
-
-instruct mulExactI_eReg_mem(eAXRegI dst, memory src, eFlagsReg cr)
-%{
-  match(MulExactI dst (LoadI src));
-  effect(DEF cr);
-
-  ins_cost(350);
-  format %{ "IMUL   $dst, $src\t# mulExact int" %}
-  ins_encode %{
-    __ imull($dst$$Register, $src$$Address);
-  %}
-  ins_pipe(ialu_reg_mem_alu0);
-%}
-
-
 // Integer DIV with Register
 instruct divI_eReg(eAXRegI rax, eDXRegI rdx, eCXRegI div, eFlagsReg cr) %{
   match(Set rax (DivI rax div));
@@ -8649,6 +8011,123 @@
   ins_pipe( ialu_mem_imm );
 %}
 
+// BMI1 instructions
+instruct andnI_rReg_rReg_rReg(rRegI dst, rRegI src1, rRegI src2, immI_M1 minus_1, eFlagsReg cr) %{
+  match(Set dst (AndI (XorI src1 minus_1) src2));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  format %{ "ANDNL  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ andnl($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct andnI_rReg_rReg_mem(rRegI dst, rRegI src1, memory src2, immI_M1 minus_1, eFlagsReg cr) %{
+  match(Set dst (AndI (XorI src1 minus_1) (LoadI src2) ));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "ANDNL  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ andnl($dst$$Register, $src1$$Register, $src2$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI0 imm_zero, eFlagsReg cr) %{
+  match(Set dst (AndI (SubI imm_zero src) src));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  format %{ "BLSIL  $dst, $src" %}
+
+  ins_encode %{
+    __ blsil($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsiI_rReg_mem(rRegI dst, memory src, immI0 imm_zero, eFlagsReg cr) %{
+  match(Set dst (AndI (SubI imm_zero (LoadI src) ) (LoadI src) ));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "BLSIL  $dst, $src" %}
+
+  ins_encode %{
+    __ blsil($dst$$Register, $src$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsmskI_rReg_rReg(rRegI dst, rRegI src, immI_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (XorI (AddI src minus_1) src));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  format %{ "BLSMSKL $dst, $src" %}
+
+  ins_encode %{
+    __ blsmskl($dst$$Register, $src$$Register);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsmskI_rReg_mem(rRegI dst, memory src, immI_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (XorI (AddI (LoadI src) minus_1) (LoadI src) ));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "BLSMSKL $dst, $src" %}
+
+  ins_encode %{
+    __ blsmskl($dst$$Register, $src$$Address);
+  %}
+
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsrI_rReg_rReg(rRegI dst, rRegI src, immI_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (AndI (AddI src minus_1) src) );
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  format %{ "BLSRL  $dst, $src" %}
+
+  ins_encode %{
+    __ blsrl($dst$$Register, $src$$Register);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsrI_rReg_mem(rRegI dst, memory src, immI_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (AndI (AddI (LoadI src) minus_1) (LoadI src) ));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "BLSRL  $dst, $src" %}
+
+  ins_encode %{
+    __ blsrl($dst$$Register, $src$$Address);
+  %}
+
+  ins_pipe(ialu_reg_mem);
+%}
+
 // Or Instructions
 // Or Register with Register
 instruct orI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
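
The BMI1 instructs added above pattern-match the bit-trick shapes C2 sees in the ideal graph onto single instructions: ANDN computes ~src1 & src2 (matched as AndI of an XorI with -1), BLSI isolates the lowest set bit (x & -x), BLSMSK builds a mask up to and including it (x ^ (x - 1)), and BLSR clears it (x & (x - 1)). A small check of those identities (illustrative; the _sketch names are placeholders):

#include <cassert>
#include <cstdint>

uint32_t andn_sketch  (uint32_t a, uint32_t b) { return ~a & b; }        // ANDN
uint32_t blsi_sketch  (uint32_t x)             { return x & (0u - x); }  // BLSI
uint32_t blsmsk_sketch(uint32_t x)             { return x ^ (x - 1); }   // BLSMSK
uint32_t blsr_sketch  (uint32_t x)             { return x & (x - 1); }   // BLSR

int main() {
  uint32_t x = 0x28;                                 // binary 101000, lowest set bit 1000
  assert(blsi_sketch(x)   == 0x08);
  assert(blsmsk_sketch(x) == 0x0F);
  assert(blsr_sketch(x)   == 0x20);
  assert(andn_sketch(0xF0F0F0F0u, 0xFFFFFFFFu) == 0x0F0F0F0Fu);
  return 0;
}
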
@@ -9071,6 +8550,91 @@
 instruct cadd_cmpLTMask_mem(ncxRegI p, ncxRegI q, memory y, eCXRegI tmp, eFlagsReg cr) %{
   match(Set p (AddI (AndI (CmpLTMask p q) (LoadI y)) (SubI p q)));
 */
+//----------Overflow Math Instructions-----------------------------------------
+
+instruct overflowAddI_eReg(eFlagsReg cr, eAXRegI op1, rRegI op2)
+%{
+  match(Set cr (OverflowAddI op1 op2));
+  effect(DEF cr, USE_KILL op1, USE op2);
+
+  format %{ "ADD    $op1, $op2\t# overflow check int" %}
+
+  ins_encode %{
+    __ addl($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowAddI_rReg_imm(eFlagsReg cr, eAXRegI op1, immI op2)
+%{
+  match(Set cr (OverflowAddI op1 op2));
+  effect(DEF cr, USE_KILL op1, USE op2);
+
+  format %{ "ADD    $op1, $op2\t# overflow check int" %}
+
+  ins_encode %{
+    __ addl($op1$$Register, $op2$$constant);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowSubI_rReg(eFlagsReg cr, rRegI op1, rRegI op2)
+%{
+  match(Set cr (OverflowSubI op1 op2));
+
+  format %{ "CMP    $op1, $op2\t# overflow check int" %}
+  ins_encode %{
+    __ cmpl($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowSubI_rReg_imm(eFlagsReg cr, rRegI op1, immI op2)
+%{
+  match(Set cr (OverflowSubI op1 op2));
+
+  format %{ "CMP    $op1, $op2\t# overflow check int" %}
+  ins_encode %{
+    __ cmpl($op1$$Register, $op2$$constant);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowNegI_rReg(eFlagsReg cr, immI0 zero, eAXRegI op2)
+%{
+  match(Set cr (OverflowSubI zero op2));
+  effect(DEF cr, USE_KILL op2);
+
+  format %{ "NEG    $op2\t# overflow check int" %}
+  ins_encode %{
+    __ negl($op2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowMulI_rReg(eFlagsReg cr, eAXRegI op1, rRegI op2)
+%{
+  match(Set cr (OverflowMulI op1 op2));
+  effect(DEF cr, USE_KILL op1, USE op2);
+
+  format %{ "IMUL    $op1, $op2\t# overflow check int" %}
+  ins_encode %{
+    __ imull($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg_alu0);
+%}
+
+instruct overflowMulI_rReg_imm(eFlagsReg cr, rRegI op1, immI op2, rRegI tmp)
+%{
+  match(Set cr (OverflowMulI op1 op2));
+  effect(DEF cr, TEMP tmp, USE op1, USE op2);
+
+  format %{ "IMUL    $tmp, $op1, $op2\t# overflow check int" %}
+  ins_encode %{
+    __ imull($tmp$$Register, $op1$$Register, $op2$$constant);
+  %}
+  ins_pipe(ialu_reg_reg_alu0);
+%}
 
 //----------Long Instructions------------------------------------------------
 // Add Long Register with Register
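
The Overflow* instructs added above replace the removed addExactI/subExactI/negExactI/mulExactI forms: instead of a node that produces both a result and flags, the operation only sets the condition codes and C2 branches (and deoptimizes) on them, which is how the Math.addExact family of intrinsics is now matched. The equivalent flag-based check in portable C++ (assumes the GCC/Clang overflow builtins; names are placeholders):

#include <cstdint>
#include <stdexcept>

// What OverflowAddI/OverflowMulI compute for intrinsified
// Math.addExact/multiplyExact: do the operation, branch on the overflow flag.
int32_t add_exact_sketch(int32_t a, int32_t b) {
  int32_t r;
  if (__builtin_add_overflow(a, b, &r))   // ADD + jump-on-overflow on x86
    throw std::overflow_error("integer overflow");
  return r;
}

int32_t multiply_exact_sketch(int32_t a, int32_t b) {
  int32_t r;
  if (__builtin_mul_overflow(a, b, &r))   // IMUL + jump-on-overflow on x86
    throw std::overflow_error("integer overflow");
  return r;
}
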
@@ -9186,6 +8750,210 @@
   ins_pipe( ialu_reg_long_mem );
 %}
 
+// BMI1 instructions
+instruct andnL_eReg_eReg_eReg(eRegL dst, eRegL src1, eRegL src2, immL_M1 minus_1, eFlagsReg cr) %{
+  match(Set dst (AndL (XorL src1 minus_1) src2));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst);
+
+  format %{ "ANDNL  $dst.lo, $src1.lo, $src2.lo\n\t"
+            "ANDNL  $dst.hi, $src1.hi, $src2.hi"
+         %}
+
+  ins_encode %{
+    Register Rdst = $dst$$Register;
+    Register Rsrc1 = $src1$$Register;
+    Register Rsrc2 = $src2$$Register;
+    __ andnl(Rdst, Rsrc1, Rsrc2);
+    __ andnl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc1), HIGH_FROM_LOW(Rsrc2));
+  %}
+  ins_pipe(ialu_reg_reg_long);
+%}
+
+instruct andnL_eReg_eReg_mem(eRegL dst, eRegL src1, memory src2, immL_M1 minus_1, eFlagsReg cr) %{
+  match(Set dst (AndL (XorL src1 minus_1) (LoadL src2) ));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst);
+
+  ins_cost(125);
+  format %{ "ANDNL  $dst.lo, $src1.lo, $src2\n\t"
+            "ANDNL  $dst.hi, $src1.hi, $src2+4"
+         %}
+
+  ins_encode %{
+    Register Rdst = $dst$$Register;
+    Register Rsrc1 = $src1$$Register;
+    Address src2_hi = Address::make_raw($src2$$base, $src2$$index, $src2$$scale, $src2$$disp + 4, relocInfo::none);
+
+    __ andnl(Rdst, Rsrc1, $src2$$Address);
+    __ andnl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc1), src2_hi);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsiL_eReg_eReg(eRegL dst, eRegL src, immL0 imm_zero, eFlagsReg cr) %{
+  match(Set dst (AndL (SubL imm_zero src) src));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst);
+
+  format %{ "MOVL   $dst.hi, 0\n\t"
+            "BLSIL  $dst.lo, $src.lo\n\t"
+            "JNZ    done\n\t"
+            "BLSIL  $dst.hi, $src.hi\n"
+            "done:"
+         %}
+
+  ins_encode %{
+    Label done;
+    Register Rdst = $dst$$Register;
+    Register Rsrc = $src$$Register;
+    __ movl(HIGH_FROM_LOW(Rdst), 0);
+    __ blsil(Rdst, Rsrc);
+    __ jccb(Assembler::notZero, done);
+    __ blsil(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc));
+    __ bind(done);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsiL_eReg_mem(eRegL dst, memory src, immL0 imm_zero, eFlagsReg cr) %{
+  match(Set dst (AndL (SubL imm_zero (LoadL src) ) (LoadL src) ));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst);
+
+  ins_cost(125);
+  format %{ "MOVL   $dst.hi, 0\n\t"
+            "BLSIL  $dst.lo, $src\n\t"
+            "JNZ    done\n\t"
+            "BLSIL  $dst.hi, $src+4\n"
+            "done:"
+         %}
+
+  ins_encode %{
+    Label done;
+    Register Rdst = $dst$$Register;
+    Address src_hi = Address::make_raw($src$$base, $src$$index, $src$$scale, $src$$disp + 4, relocInfo::none);
+
+    __ movl(HIGH_FROM_LOW(Rdst), 0);
+    __ blsil(Rdst, $src$$Address);
+    __ jccb(Assembler::notZero, done);
+    __ blsil(HIGH_FROM_LOW(Rdst), src_hi);
+    __ bind(done);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsmskL_eReg_eReg(eRegL dst, eRegL src, immL_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (XorL (AddL src minus_1) src));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst);
+
+  format %{ "MOVL    $dst.hi, 0\n\t"
+            "BLSMSKL $dst.lo, $src.lo\n\t"
+            "JNC     done\n\t"
+            "BLSMSKL $dst.hi, $src.hi\n"
+            "done:"
+         %}
+
+  ins_encode %{
+    Label done;
+    Register Rdst = $dst$$Register;
+    Register Rsrc = $src$$Register;
+    __ movl(HIGH_FROM_LOW(Rdst), 0);
+    __ blsmskl(Rdst, Rsrc);
+    __ jccb(Assembler::carryClear, done);
+    __ blsmskl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc));
+    __ bind(done);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsmskL_eReg_mem(eRegL dst, memory src, immL_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (XorL (AddL (LoadL src) minus_1) (LoadL src) ));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst);
+
+  ins_cost(125);
+  format %{ "MOVL    $dst.hi, 0\n\t"
+            "BLSMSKL $dst.lo, $src\n\t"
+            "JNC     done\n\t"
+            "BLSMSKL $dst.hi, $src+4\n"
+            "done:"
+         %}
+
+  ins_encode %{
+    Label done;
+    Register Rdst = $dst$$Register;
+    Address src_hi = Address::make_raw($src$$base, $src$$index, $src$$scale, $src$$disp + 4, relocInfo::none);
+
+    __ movl(HIGH_FROM_LOW(Rdst), 0);
+    __ blsmskl(Rdst, $src$$Address);
+    __ jccb(Assembler::carryClear, done);
+    __ blsmskl(HIGH_FROM_LOW(Rdst), src_hi);
+    __ bind(done);
+  %}
+
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsrL_eReg_eReg(eRegL dst, eRegL src, immL_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (AndL (AddL src minus_1) src) );
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst);
+
+  format %{ "MOVL   $dst.hi, $src.hi\n\t"
+            "BLSRL  $dst.lo, $src.lo\n\t"
+            "JNC    done\n\t"
+            "BLSRL  $dst.hi, $src.hi\n"
+            "done:"
+  %}
+
+  ins_encode %{
+    Label done;
+    Register Rdst = $dst$$Register;
+    Register Rsrc = $src$$Register;
+    __ movl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc));
+    __ blsrl(Rdst, Rsrc);
+    __ jccb(Assembler::carryClear, done);
+    __ blsrl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc));
+    __ bind(done);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsrL_eReg_mem(eRegL dst, memory src, immL_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (AndL (AddL (LoadL src) minus_1) (LoadL src) ));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst);
+
+  ins_cost(125);
+  format %{ "MOVL   $dst.hi, $src+4\n\t"
+            "BLSRL  $dst.lo, $src\n\t"
+            "JNC    done\n\t"
+            "BLSRL  $dst.hi, $src+4\n"
+            "done:"
+  %}
+
+  ins_encode %{
+    Label done;
+    Register Rdst = $dst$$Register;
+    Address src_hi = Address::make_raw($src$$base, $src$$index, $src$$scale, $src$$disp + 4, relocInfo::none);
+    __ movl(HIGH_FROM_LOW(Rdst), src_hi);
+    __ blsrl(Rdst, $src$$Address);
+    __ jccb(Assembler::carryClear, done);
+    __ blsrl(HIGH_FROM_LOW(Rdst), src_hi);
+    __ bind(done);
+  %}
+
+  ins_pipe(ialu_reg_mem);
+%}
+
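The blsiL/blsmskL/blsrL rules above synthesize each 64-bit BMI1 operation on 32-bit x86 from two 32-bit instructions: the low word is processed first, and the flag it produces (ZF for BLSI, CF for BLSMSK and BLSR) decides whether the operation also has to touch the high word; the memory forms reach that high word through Address::make_raw(... disp + 4 ...). A minimal standalone C++ sketch of the BLSR case (illustrative only, not VM code):

    #include <cassert>
    #include <cstdint>

    // 64-bit "clear lowest set bit" built from 32-bit halves, mirroring the
    // blsrL_eReg_eReg rule: the high word only changes when the low word is
    // zero, which is exactly when BLSRL sets CF and the JNC falls through.
    uint64_t blsr64_via_halves(uint32_t lo, uint32_t hi) {
      uint32_t new_lo = lo & (lo - 1);   // BLSRL dst.lo, src.lo
      uint32_t new_hi = hi;              // MOVL  dst.hi, src.hi
      if (lo == 0) {                     // JNC done is taken when lo != 0
        new_hi = hi & (hi - 1);          // BLSRL dst.hi, src.hi
      }
      return ((uint64_t)new_hi << 32) | new_lo;
    }

    int main() {
      uint64_t x = 0x500000000ULL;       // lowest set bit sits in the high word
      assert(blsr64_via_halves((uint32_t)x, (uint32_t)(x >> 32)) == (x & (x - 1)));
      return 0;
    }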
 // Or Long Register with Register
 instruct orl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
   match(Set dst (OrL dst src));
@@ -13104,23 +12872,44 @@
 
 // inlined locking and unlocking
 
-
-instruct cmpFastLock( eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
-  match( Set cr (FastLock object box) );
-  effect( TEMP tmp, TEMP scr, USE_KILL box );
+instruct cmpFastLockRTM(eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eDXRegI scr, rRegI cx1, rRegI cx2) %{
+  predicate(Compile::current()->use_rtm());
+  match(Set cr (FastLock object box));
+  effect(TEMP tmp, TEMP scr, TEMP cx1, TEMP cx2, USE_KILL box);
+  ins_cost(300);
+  format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr,$cx1,$cx2" %}
+  ins_encode %{
+    __ fast_lock($object$$Register, $box$$Register, $tmp$$Register,
+                 $scr$$Register, $cx1$$Register, $cx2$$Register,
+                 _counters, _rtm_counters, _stack_rtm_counters,
+                 ((Method*)(ra_->C->method()->constant_encoding()))->method_data(),
+                 true, ra_->C->profile_rtm());
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct cmpFastLock(eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
+  predicate(!Compile::current()->use_rtm());
+  match(Set cr (FastLock object box));
+  effect(TEMP tmp, TEMP scr, USE_KILL box);
   ins_cost(300);
   format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %}
-  ins_encode( Fast_Lock(object,box,tmp,scr) );
-  ins_pipe( pipe_slow );
-%}
-
-instruct cmpFastUnlock( eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
-  match( Set cr (FastUnlock object box) );
-  effect( TEMP tmp, USE_KILL box );
+  ins_encode %{
+    __ fast_lock($object$$Register, $box$$Register, $tmp$$Register,
+                 $scr$$Register, noreg, noreg, _counters, NULL, NULL, NULL, false, false);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct cmpFastUnlock(eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp) %{
+  match(Set cr (FastUnlock object box));
+  effect(TEMP tmp, USE_KILL box);
   ins_cost(300);
   format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %}
-  ins_encode( Fast_Unlock(object,box,tmp) );
-  ins_pipe( pipe_slow );
+  ins_encode %{
+    __ fast_unlock($object$$Register, $box$$Register, $tmp$$Register, ra_->C->use_rtm());
+  %}
+  ins_pipe(pipe_slow);
 %}
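The cmpFastLockRTM rule above is selected when Compile::current()->use_rtm() is true; it reserves two extra scratch registers and passes the RTM profiling counters and the MethodData down to MacroAssembler::fast_lock(), while the non-RTM rule keeps the old register shape and passes noreg/NULL instead. The general idea the RTM path supports is lock elision: run the synchronized region as a hardware transaction and fall back to CAS-based locking only when the transaction aborts. A rough, self-contained sketch of that idea using the RTM intrinsics (assumes an RTM-capable toolchain, e.g. gcc/clang with -mrtm; this is not HotSpot's fast_lock code):

    #include <immintrin.h>

    // Hypothetical lock word: 0 means "free". Illustrative only.
    static volatile int lock_word = 0;

    // Try to enter the critical section speculatively; returns true if we are
    // now inside a transaction, false if the caller must take the real lock.
    bool try_transactional_enter() {
      unsigned status = _xbegin();
      if (status == _XBEGIN_STARTED) {
        if (lock_word != 0) {
          _xabort(0xff);                 // lock is held: abort, use the real lock
        }
        return true;                     // speculating; lock_word is in the read set
      }
      return false;                      // transaction aborted: take the slow path
    }

    void transactional_exit() {
      _xend();                           // commit the speculative region
    }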
 
 
--- a/src/cpu/x86/vm/x86_64.ad	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/cpu/x86/vm/x86_64.ad	Tue Mar 25 17:07:36 2014 -0700
@@ -1600,18 +1600,6 @@
   return PTR_RBP_REG_mask();
 }
 
-const RegMask Matcher::mathExactI_result_proj_mask() {
-  return INT_RAX_REG_mask();
-}
-
-const RegMask Matcher::mathExactL_result_proj_mask() {
-  return LONG_RAX_REG_mask();
-}
-
-const RegMask Matcher::mathExactI_flags_proj_mask() {
-  return INT_FLAGS_mask();
-}
-
 %}
 
 //----------ENCODING BLOCK-----------------------------------------------------
@@ -2542,231 +2530,6 @@
   %}
 
 
-  // obj: object to lock
-  // box: box address (header location) -- killed
-  // tmp: rax -- killed
-  // scr: rbx -- killed
-  //
-  // What follows is a direct transliteration of fast_lock() and fast_unlock()
-  // from i486.ad.  See that file for comments.
-  // TODO: where possible switch from movq (r, 0) to movl(r,0) and
-  // use the shorter encoding.  (Movl clears the high-order 32-bits).
-
-
-  enc_class Fast_Lock(rRegP obj, rRegP box, rax_RegI tmp, rRegP scr)
-  %{
-    Register objReg = as_Register((int)$obj$$reg);
-    Register boxReg = as_Register((int)$box$$reg);
-    Register tmpReg = as_Register($tmp$$reg);
-    Register scrReg = as_Register($scr$$reg);
-    MacroAssembler masm(&cbuf);
-
-    // Verify uniqueness of register assignments -- necessary but not sufficient
-    assert (objReg != boxReg && objReg != tmpReg &&
-            objReg != scrReg && tmpReg != scrReg, "invariant") ;
-
-    if (_counters != NULL) {
-      masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
-    }
-    if (EmitSync & 1) {
-        // Without cast to int32_t a movptr will destroy r10 which is typically obj
-        masm.movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark())) ;
-        masm.cmpptr(rsp, (int32_t)NULL_WORD) ;
-    } else
-    if (EmitSync & 2) {
-        Label DONE_LABEL;
-        if (UseBiasedLocking) {
-           // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
-          masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
-        }
-        // QQQ was movl...
-        masm.movptr(tmpReg, 0x1);
-        masm.orptr(tmpReg, Address(objReg, 0));
-        masm.movptr(Address(boxReg, 0), tmpReg);
-        if (os::is_MP()) {
-          masm.lock();
-        }
-        masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
-        masm.jcc(Assembler::equal, DONE_LABEL);
-
-        // Recursive locking
-        masm.subptr(tmpReg, rsp);
-        masm.andptr(tmpReg, 7 - os::vm_page_size());
-        masm.movptr(Address(boxReg, 0), tmpReg);
-
-        masm.bind(DONE_LABEL);
-        masm.nop(); // avoid branch to branch
-    } else {
-        Label DONE_LABEL, IsInflated, Egress;
-
-        masm.movptr(tmpReg, Address(objReg, 0)) ;
-        masm.testl (tmpReg, 0x02) ;         // inflated vs stack-locked|neutral|biased
-        masm.jcc   (Assembler::notZero, IsInflated) ;
-
-        // it's stack-locked, biased or neutral
-        // TODO: optimize markword triage order to reduce the number of
-        // conditional branches in the most common cases.
-        // Beware -- there's a subtle invariant that fetch of the markword
-        // at [FETCH], below, will never observe a biased encoding (*101b).
-        // If this invariant is not held we'll suffer exclusion (safety) failure.
-
-        if (UseBiasedLocking && !UseOptoBiasInlining) {
-          masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, _counters);
-          masm.movptr(tmpReg, Address(objReg, 0)) ;        // [FETCH]
-        }
-
-        // was q will it destroy high?
-        masm.orl   (tmpReg, 1) ;
-        masm.movptr(Address(boxReg, 0), tmpReg) ;
-        if (os::is_MP()) { masm.lock(); }
-        masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
-        if (_counters != NULL) {
-           masm.cond_inc32(Assembler::equal,
-                           ExternalAddress((address) _counters->fast_path_entry_count_addr()));
-        }
-        masm.jcc   (Assembler::equal, DONE_LABEL);
-
-        // Recursive locking
-        masm.subptr(tmpReg, rsp);
-        masm.andptr(tmpReg, 7 - os::vm_page_size());
-        masm.movptr(Address(boxReg, 0), tmpReg);
-        if (_counters != NULL) {
-           masm.cond_inc32(Assembler::equal,
-                           ExternalAddress((address) _counters->fast_path_entry_count_addr()));
-        }
-        masm.jmp   (DONE_LABEL) ;
-
-        masm.bind  (IsInflated) ;
-        // It's inflated
-
-        // TODO: someday avoid the ST-before-CAS penalty by
-        // relocating (deferring) the following ST.
-        // We should also think about trying a CAS without having
-        // fetched _owner.  If the CAS is successful we may
-        // avoid an RTO->RTS upgrade on the $line.
-        // Without cast to int32_t a movptr will destroy r10 which is typically obj
-        masm.movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark())) ;
-
-        masm.mov    (boxReg, tmpReg) ;
-        masm.movptr (tmpReg, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
-        masm.testptr(tmpReg, tmpReg) ;
-        masm.jcc    (Assembler::notZero, DONE_LABEL) ;
-
-        // It's inflated and appears unlocked
-        if (os::is_MP()) { masm.lock(); }
-        masm.cmpxchgptr(r15_thread, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
-        // Intentional fall-through into DONE_LABEL ...
-
-        masm.bind  (DONE_LABEL) ;
-        masm.nop   () ;                 // avoid jmp to jmp
-    }
-  %}
-
-  // obj: object to unlock
-  // box: box address (displaced header location), killed
-  // RBX: killed tmp; cannot be obj nor box
-  enc_class Fast_Unlock(rRegP obj, rax_RegP box, rRegP tmp)
-  %{
-
-    Register objReg = as_Register($obj$$reg);
-    Register boxReg = as_Register($box$$reg);
-    Register tmpReg = as_Register($tmp$$reg);
-    MacroAssembler masm(&cbuf);
-
-    if (EmitSync & 4) {
-       masm.cmpptr(rsp, 0) ;
-    } else
-    if (EmitSync & 8) {
-       Label DONE_LABEL;
-       if (UseBiasedLocking) {
-         masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
-       }
-
-       // Check whether the displaced header is 0
-       //(=> recursive unlock)
-       masm.movptr(tmpReg, Address(boxReg, 0));
-       masm.testptr(tmpReg, tmpReg);
-       masm.jcc(Assembler::zero, DONE_LABEL);
-
-       // If not recursive lock, reset the header to displaced header
-       if (os::is_MP()) {
-         masm.lock();
-       }
-       masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
-       masm.bind(DONE_LABEL);
-       masm.nop(); // avoid branch to branch
-    } else {
-       Label DONE_LABEL, Stacked, CheckSucc ;
-
-       if (UseBiasedLocking && !UseOptoBiasInlining) {
-         masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
-       }
-
-       masm.movptr(tmpReg, Address(objReg, 0)) ;
-       masm.cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD) ;
-       masm.jcc   (Assembler::zero, DONE_LABEL) ;
-       masm.testl (tmpReg, 0x02) ;
-       masm.jcc   (Assembler::zero, Stacked) ;
-
-       // It's inflated
-       masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
-       masm.xorptr(boxReg, r15_thread) ;
-       masm.orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
-       masm.jcc   (Assembler::notZero, DONE_LABEL) ;
-       masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
-       masm.orptr (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
-       masm.jcc   (Assembler::notZero, CheckSucc) ;
-       masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD) ;
-       masm.jmp   (DONE_LABEL) ;
-
-       if ((EmitSync & 65536) == 0) {
-         Label LSuccess, LGoSlowPath ;
-         masm.bind  (CheckSucc) ;
-         masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD) ;
-         masm.jcc   (Assembler::zero, LGoSlowPath) ;
-
-         // I'd much rather use lock:andl m->_owner, 0 as it's faster than the
-         // the explicit ST;MEMBAR combination, but masm doesn't currently support
-         // "ANDQ M,IMM".  Don't use MFENCE here.  lock:add to TOS, xchg, etc
-         // are all faster when the write buffer is populated.
-         masm.movptr (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD) ;
-         if (os::is_MP()) {
-            masm.lock () ; masm.addl (Address(rsp, 0), 0) ;
-         }
-         masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD) ;
-         masm.jcc   (Assembler::notZero, LSuccess) ;
-
-         masm.movptr (boxReg, (int32_t)NULL_WORD) ;                   // box is really EAX
-         if (os::is_MP()) { masm.lock(); }
-         masm.cmpxchgptr(r15_thread, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
-         masm.jcc   (Assembler::notEqual, LSuccess) ;
-         // Intentional fall-through into slow-path
-
-         masm.bind  (LGoSlowPath) ;
-         masm.orl   (boxReg, 1) ;                      // set ICC.ZF=0 to indicate failure
-         masm.jmp   (DONE_LABEL) ;
-
-         masm.bind  (LSuccess) ;
-         masm.testl (boxReg, 0) ;                      // set ICC.ZF=1 to indicate success
-         masm.jmp   (DONE_LABEL) ;
-       }
-
-       masm.bind  (Stacked) ;
-       masm.movptr(tmpReg, Address (boxReg, 0)) ;      // re-fetch
-       if (os::is_MP()) { masm.lock(); }
-       masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
-
-       if (EmitSync & 65536) {
-          masm.bind (CheckSucc) ;
-       }
-       masm.bind(DONE_LABEL);
-       if (EmitSync & 32768) {
-          masm.nop();                      // avoid branch to branch
-       }
-    }
-  %}
-
-
   enc_class enc_rethrow()
   %{
     cbuf.set_insts_mark();
@@ -6202,6 +5965,19 @@
 %}
 
 instruct countTrailingZerosI(rRegI dst, rRegI src, rFlagsReg cr) %{
+  predicate(UseCountTrailingZerosInstruction);
+  match(Set dst (CountTrailingZerosI src));
+  effect(KILL cr);
+
+  format %{ "tzcntl    $dst, $src\t# count trailing zeros (int)" %}
+  ins_encode %{
+    __ tzcntl($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct countTrailingZerosI_bsf(rRegI dst, rRegI src, rFlagsReg cr) %{
+  predicate(!UseCountTrailingZerosInstruction);
   match(Set dst (CountTrailingZerosI src));
   effect(KILL cr);
 
@@ -6221,6 +5997,19 @@
 %}
 
 instruct countTrailingZerosL(rRegI dst, rRegL src, rFlagsReg cr) %{
+  predicate(UseCountTrailingZerosInstruction);
+  match(Set dst (CountTrailingZerosL src));
+  effect(KILL cr);
+
+  format %{ "tzcntq    $dst, $src\t# count trailing zeros (long)" %}
+  ins_encode %{
+    __ tzcntq($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct countTrailingZerosL_bsf(rRegI dst, rRegL src, rFlagsReg cr) %{
+  predicate(!UseCountTrailingZerosInstruction);
   match(Set dst (CountTrailingZerosL src));
   effect(KILL cr);
 
@@ -6906,82 +6695,6 @@
 //----------Arithmetic Instructions--------------------------------------------
 //----------Addition Instructions----------------------------------------------
 
-instruct addExactI_rReg(rax_RegI dst, rRegI src, rFlagsReg cr)
-%{
-  match(AddExactI dst src);
-  effect(DEF cr);
-
-  format %{ "addl    $dst, $src\t# addExact int" %}
-  ins_encode %{
-    __ addl($dst$$Register, $src$$Register);
-  %}
-  ins_pipe(ialu_reg_reg);
-%}
-
-instruct addExactI_rReg_imm(rax_RegI dst, immI src, rFlagsReg cr)
-%{
-  match(AddExactI dst src);
-  effect(DEF cr);
-
-  format %{ "addl    $dst, $src\t# addExact int" %}
-  ins_encode %{
-    __ addl($dst$$Register, $src$$constant);
-  %}
-  ins_pipe(ialu_reg_reg);
-%}
-
-instruct addExactI_rReg_mem(rax_RegI dst, memory src, rFlagsReg cr)
-%{
-  match(AddExactI dst (LoadI src));
-  effect(DEF cr);
-
-  ins_cost(125); // XXX
-  format %{ "addl    $dst, $src\t# addExact int" %}
-  ins_encode %{
-    __ addl($dst$$Register, $src$$Address);
-  %}
-
-  ins_pipe(ialu_reg_mem);
-%}
-
-instruct addExactL_rReg(rax_RegL dst, rRegL src, rFlagsReg cr)
-%{
-  match(AddExactL dst src);
-  effect(DEF cr);
-
-  format %{ "addq    $dst, $src\t# addExact long" %}
-  ins_encode %{
-    __ addq($dst$$Register, $src$$Register);
-  %}
-  ins_pipe(ialu_reg_reg);
-%}
-
-instruct addExactL_rReg_imm(rax_RegL dst, immL32 src, rFlagsReg cr)
-%{
-  match(AddExactL dst src);
-  effect(DEF cr);
-
-  format %{ "addq    $dst, $src\t# addExact long" %}
-  ins_encode %{
-    __ addq($dst$$Register, $src$$constant);
-  %}
-  ins_pipe(ialu_reg_reg);
-%}
-
-instruct addExactL_rReg_mem(rax_RegL dst, memory src, rFlagsReg cr)
-%{
-  match(AddExactL dst (LoadL src));
-  effect(DEF cr);
-
-  ins_cost(125); // XXX
-  format %{ "addq    $dst, $src\t# addExact long" %}
-  ins_encode %{
-    __ addq($dst$$Register, $src$$Address);
-  %}
-
-  ins_pipe(ialu_reg_mem);
-%}
-
 instruct addI_rReg(rRegI dst, rRegI src, rFlagsReg cr)
 %{
   match(Set dst (AddI dst src));
@@ -7594,80 +7307,6 @@
   ins_pipe(ialu_mem_imm);
 %}
 
-instruct subExactI_rReg(rax_RegI dst, rRegI src, rFlagsReg cr)
-%{
-  match(SubExactI dst src);
-  effect(DEF cr);
-
-  format %{ "subl    $dst, $src\t# subExact int" %}
-  ins_encode %{
-    __ subl($dst$$Register, $src$$Register);
-  %}
-  ins_pipe(ialu_reg_reg);
-%}
-
-instruct subExactI_rReg_imm(rax_RegI dst, immI src, rFlagsReg cr)
-%{
-  match(SubExactI dst src);
-  effect(DEF cr);
-
-  format %{ "subl    $dst, $src\t# subExact int" %}
-  ins_encode %{
-    __ subl($dst$$Register, $src$$constant);
-  %}
-  ins_pipe(ialu_reg_reg);
-%}
-
-instruct subExactI_rReg_mem(rax_RegI dst, memory src, rFlagsReg cr)
-%{
-  match(SubExactI dst (LoadI src));
-  effect(DEF cr);
-
-  ins_cost(125);
-  format %{ "subl    $dst, $src\t# subExact int" %}
-  ins_encode %{
-    __ subl($dst$$Register, $src$$Address);
-  %}
-  ins_pipe(ialu_reg_mem);
-%}
-
-instruct subExactL_rReg(rax_RegL dst, rRegL src, rFlagsReg cr)
-%{
-  match(SubExactL dst src);
-  effect(DEF cr);
-
-  format %{ "subq    $dst, $src\t# subExact long" %}
-  ins_encode %{
-    __ subq($dst$$Register, $src$$Register);
-  %}
-  ins_pipe(ialu_reg_reg);
-%}
-
-instruct subExactL_rReg_imm(rax_RegL dst, immL32 src, rFlagsReg cr)
-%{
-  match(SubExactL dst (LoadL src));
-  effect(DEF cr);
-
-  format %{ "subq    $dst, $src\t# subExact long" %}
-  ins_encode %{
-    __ subq($dst$$Register, $src$$constant);
-  %}
-  ins_pipe(ialu_reg_reg);
-%}
-
-instruct subExactL_rReg_mem(rax_RegI dst, memory src, rFlagsReg cr)
-%{
-  match(SubExactI dst src);
-  effect(DEF cr);
-
-  ins_cost(125);
-  format %{ "subq    $dst, $src\t# subExact long" %}
-  ins_encode %{
-    __ subq($dst$$Register, $src$$Address);
-  %}
-  ins_pipe(ialu_reg_mem);
-%}
-
 instruct subL_rReg(rRegL dst, rRegL src, rFlagsReg cr)
 %{
   match(Set dst (SubL dst src));
@@ -7784,31 +7423,6 @@
   ins_pipe(ialu_reg);
 %}
 
-instruct negExactI_rReg(rax_RegI dst, rFlagsReg cr)
-%{
-  match(NegExactI dst);
-  effect(KILL cr);
-
-  format %{ "negl    $dst\t# negExact int" %}
-  ins_encode %{
-    __ negl($dst$$Register);
-  %}
-  ins_pipe(ialu_reg);
-%}
-
-instruct negExactL_rReg(rax_RegL dst, rFlagsReg cr)
-%{
-  match(NegExactL dst);
-  effect(KILL cr);
-
-  format %{ "negq    $dst\t# negExact long" %}
-  ins_encode %{
-    __ negq($dst$$Register);
-  %}
-  ins_pipe(ialu_reg);
-%}
-
-
 //----------Multiplication/Division Instructions-------------------------------
 // Integer Multiplication Instructions
 // Multiply Register
@@ -7925,86 +7539,6 @@
   ins_pipe(ialu_reg_reg_alu0);
 %}
 
-
-instruct mulExactI_rReg(rax_RegI dst, rRegI src, rFlagsReg cr)
-%{
-  match(MulExactI dst src);
-  effect(DEF cr);
-
-  ins_cost(300);
-  format %{ "imull   $dst, $src\t# mulExact int" %}
-  ins_encode %{
-    __ imull($dst$$Register, $src$$Register);
-  %}
-  ins_pipe(ialu_reg_reg_alu0);
-%}
-
-
-instruct mulExactI_rReg_imm(rax_RegI dst, rRegI src, immI imm, rFlagsReg cr)
-%{
-  match(MulExactI src imm);
-  effect(DEF cr);
-
-  ins_cost(300);
-  format %{ "imull   $dst, $src, $imm\t# mulExact int" %}
-  ins_encode %{
-    __ imull($dst$$Register, $src$$Register, $imm$$constant);
-  %}
-  ins_pipe(ialu_reg_reg_alu0);
-%}
-
-instruct mulExactI_rReg_mem(rax_RegI dst, memory src, rFlagsReg cr)
-%{
-  match(MulExactI dst (LoadI src));
-  effect(DEF cr);
-
-  ins_cost(350);
-  format %{ "imull   $dst, $src\t# mulExact int" %}
-  ins_encode %{
-    __ imull($dst$$Register, $src$$Address);
-  %}
-  ins_pipe(ialu_reg_mem_alu0);
-%}
-
-instruct mulExactL_rReg(rax_RegL dst, rRegL src, rFlagsReg cr)
-%{
-  match(MulExactL dst src);
-  effect(DEF cr);
-
-  ins_cost(300);
-  format %{ "imulq   $dst, $src\t# mulExact long" %}
-  ins_encode %{
-    __ imulq($dst$$Register, $src$$Register);
-  %}
-  ins_pipe(ialu_reg_reg_alu0);
-%}
-
-instruct mulExactL_rReg_imm(rax_RegL dst, rRegL src, immL32 imm, rFlagsReg cr)
-%{
-  match(MulExactL src imm);
-  effect(DEF cr);
-
-  ins_cost(300);
-  format %{ "imulq   $dst, $src, $imm\t# mulExact long" %}
-  ins_encode %{
-    __ imulq($dst$$Register, $src$$Register, $imm$$constant);
-  %}
-  ins_pipe(ialu_reg_reg_alu0);
-%}
-
-instruct mulExactL_rReg_mem(rax_RegL dst, memory src, rFlagsReg cr)
-%{
-  match(MulExactL dst (LoadL src));
-  effect(DEF cr);
-
-  ins_cost(350);
-  format %{ "imulq   $dst, $src\t# mulExact long" %}
-  ins_encode %{
-    __ imulq($dst$$Register, $src$$Address);
-  %}
-  ins_pipe(ialu_reg_mem_alu0);
-%}
-
 instruct divI_rReg(rax_RegI rax, rdx_RegI rdx, no_rax_rdx_RegI div,
                    rFlagsReg cr)
 %{
@@ -9057,6 +8591,122 @@
   ins_pipe(ialu_mem_imm);
 %}
 
+// BMI1 instructions
+instruct andnI_rReg_rReg_mem(rRegI dst, rRegI src1, memory src2, immI_M1 minus_1, rFlagsReg cr) %{
+  match(Set dst (AndI (XorI src1 minus_1) (LoadI src2)));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "andnl  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ andnl($dst$$Register, $src1$$Register, $src2$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct andnI_rReg_rReg_rReg(rRegI dst, rRegI src1, rRegI src2, immI_M1 minus_1, rFlagsReg cr) %{
+  match(Set dst (AndI (XorI src1 minus_1) src2));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  format %{ "andnl  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ andnl($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI0 imm_zero, rFlagsReg cr) %{
+  match(Set dst (AndI (SubI imm_zero src) src));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  format %{ "blsil  $dst, $src" %}
+
+  ins_encode %{
+    __ blsil($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsiI_rReg_mem(rRegI dst, memory src, immI0 imm_zero, rFlagsReg cr) %{
+  match(Set dst (AndI (SubI imm_zero (LoadI src) ) (LoadI src) ));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "blsil  $dst, $src" %}
+
+  ins_encode %{
+    __ blsil($dst$$Register, $src$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsmskI_rReg_mem(rRegI dst, memory src, immI_M1 minus_1, rFlagsReg cr)
+%{
+  match(Set dst (XorI (AddI (LoadI src) minus_1) (LoadI src) ) );
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "blsmskl $dst, $src" %}
+
+  ins_encode %{
+    __ blsmskl($dst$$Register, $src$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsmskI_rReg_rReg(rRegI dst, rRegI src, immI_M1 minus_1, rFlagsReg cr)
+%{
+  match(Set dst (XorI (AddI src minus_1) src));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  format %{ "blsmskl $dst, $src" %}
+
+  ins_encode %{
+    __ blsmskl($dst$$Register, $src$$Register);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsrI_rReg_rReg(rRegI dst, rRegI src, immI_M1 minus_1, rFlagsReg cr)
+%{
+  match(Set dst (AndI (AddI src minus_1) src) );
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  format %{ "blsrl  $dst, $src" %}
+
+  ins_encode %{
+    __ blsrl($dst$$Register, $src$$Register);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsrI_rReg_mem(rRegI dst, memory src, immI_M1 minus_1, rFlagsReg cr)
+%{
+  match(Set dst (AndI (AddI (LoadI src) minus_1) (LoadI src) ) );
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "blsrl  $dst, $src" %}
+
+  ins_encode %{
+    __ blsrl($dst$$Register, $src$$Address);
+  %}
+
+  ins_pipe(ialu_reg_mem);
+%}
+
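The match() expressions of these BMI1 rules are the canonical bit tricks as they appear in the ideal graph: ANDN matches (src1 ^ -1) & src2, BLSI matches (0 - src) & src, BLSMSK matches (src + -1) ^ src, and BLSR matches (src + -1) & src. On plain 32-bit values the four operations compute the following (illustrative sketch, unrelated to the matcher itself):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t a = 0x00B4u;                          // bits ...1011 0100
      uint32_t b = 0x00FFu;
      printf("andn   = 0x%08X\n", ~a & b);           // 0x0000004B
      printf("blsi   = 0x%08X\n", a & (0u - a));     // 0x00000004 (isolate lowest set bit)
      printf("blsmsk = 0x%08X\n", a ^ (a - 1u));     // 0x00000007 (mask up to lowest set bit)
      printf("blsr   = 0x%08X\n", a & (a - 1u));     // 0x000000B0 (clear lowest set bit)
      return 0;
    }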
 // Or Instructions
 // Or Register with Register
 instruct orI_rReg(rRegI dst, rRegI src, rFlagsReg cr)
@@ -9288,6 +8938,122 @@
   ins_pipe(ialu_mem_imm);
 %}
 
+// BMI1 instructions
+instruct andnL_rReg_rReg_mem(rRegL dst, rRegL src1, memory src2, immL_M1 minus_1, rFlagsReg cr) %{
+  match(Set dst (AndL (XorL src1 minus_1) (LoadL src2)));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "andnq  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ andnq($dst$$Register, $src1$$Register, $src2$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct andnL_rReg_rReg_rReg(rRegL dst, rRegL src1, rRegL src2, immL_M1 minus_1, rFlagsReg cr) %{
+  match(Set dst (AndL (XorL src1 minus_1) src2));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  format %{ "andnq  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ andnq($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsiL_rReg_rReg(rRegL dst, rRegL src, immL0 imm_zero, rFlagsReg cr) %{
+  match(Set dst (AndL (SubL imm_zero src) src));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  format %{ "blsiq  $dst, $src" %}
+
+  ins_encode %{
+    __ blsiq($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsiL_rReg_mem(rRegL dst, memory src, immL0 imm_zero, rFlagsReg cr) %{
+  match(Set dst (AndL (SubL imm_zero (LoadL src) ) (LoadL src) ));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "blsiq  $dst, $src" %}
+
+  ins_encode %{
+    __ blsiq($dst$$Register, $src$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsmskL_rReg_mem(rRegL dst, memory src, immL_M1 minus_1, rFlagsReg cr)
+%{
+  match(Set dst (XorL (AddL (LoadL src) minus_1) (LoadL src) ) );
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "blsmskq $dst, $src" %}
+
+  ins_encode %{
+    __ blsmskq($dst$$Register, $src$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsmskL_rReg_rReg(rRegL dst, rRegL src, immL_M1 minus_1, rFlagsReg cr)
+%{
+  match(Set dst (XorL (AddL src minus_1) src));
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  format %{ "blsmskq $dst, $src" %}
+
+  ins_encode %{
+    __ blsmskq($dst$$Register, $src$$Register);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsrL_rReg_rReg(rRegL dst, rRegL src, immL_M1 minus_1, rFlagsReg cr)
+%{
+  match(Set dst (AndL (AddL src minus_1) src) );
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  format %{ "blsrq  $dst, $src" %}
+
+  ins_encode %{
+    __ blsrq($dst$$Register, $src$$Register);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsrL_rReg_mem(rRegL dst, memory src, immL_M1 minus_1, rFlagsReg cr)
+%{
+  match(Set dst (AndL (AddL (LoadL src) minus_1) (LoadL src)) );
+  predicate(UseBMI1Instructions);
+  effect(KILL cr);
+
+  ins_cost(125);
+  format %{ "blsrq  $dst, $src" %}
+
+  ins_encode %{
+    __ blsrq($dst$$Register, $src$$Address);
+  %}
+
+  ins_pipe(ialu_reg_mem);
+%}
+
 // Or Instructions
 // Or Register with Register
 instruct orL_rReg(rRegL dst, rRegL src, rFlagsReg cr)
@@ -10613,6 +10379,174 @@
   ins_pipe( pipe_slow );
 %}
 
+//----------Overflow Math Instructions-----------------------------------------
+
+instruct overflowAddI_rReg(rFlagsReg cr, rax_RegI op1, rRegI op2)
+%{
+  match(Set cr (OverflowAddI op1 op2));
+  effect(DEF cr, USE_KILL op1, USE op2);
+
+  format %{ "addl    $op1, $op2\t# overflow check int" %}
+
+  ins_encode %{
+    __ addl($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowAddI_rReg_imm(rFlagsReg cr, rax_RegI op1, immI op2)
+%{
+  match(Set cr (OverflowAddI op1 op2));
+  effect(DEF cr, USE_KILL op1, USE op2);
+
+  format %{ "addl    $op1, $op2\t# overflow check int" %}
+
+  ins_encode %{
+    __ addl($op1$$Register, $op2$$constant);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowAddL_rReg(rFlagsReg cr, rax_RegL op1, rRegL op2)
+%{
+  match(Set cr (OverflowAddL op1 op2));
+  effect(DEF cr, USE_KILL op1, USE op2);
+
+  format %{ "addq    $op1, $op2\t# overflow check long" %}
+  ins_encode %{
+    __ addq($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowAddL_rReg_imm(rFlagsReg cr, rax_RegL op1, immL32 op2)
+%{
+  match(Set cr (OverflowAddL op1 op2));
+  effect(DEF cr, USE_KILL op1, USE op2);
+
+  format %{ "addq    $op1, $op2\t# overflow check long" %}
+  ins_encode %{
+    __ addq($op1$$Register, $op2$$constant);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowSubI_rReg(rFlagsReg cr, rRegI op1, rRegI op2)
+%{
+  match(Set cr (OverflowSubI op1 op2));
+
+  format %{ "cmpl    $op1, $op2\t# overflow check int" %}
+  ins_encode %{
+    __ cmpl($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowSubI_rReg_imm(rFlagsReg cr, rRegI op1, immI op2)
+%{
+  match(Set cr (OverflowSubI op1 op2));
+
+  format %{ "cmpl    $op1, $op2\t# overflow check int" %}
+  ins_encode %{
+    __ cmpl($op1$$Register, $op2$$constant);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowSubL_rReg(rFlagsReg cr, rRegL op1, rRegL op2)
+%{
+  match(Set cr (OverflowSubL op1 op2));
+
+  format %{ "cmpq    $op1, $op2\t# overflow check long" %}
+  ins_encode %{
+    __ cmpq($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowSubL_rReg_imm(rFlagsReg cr, rRegL op1, immL32 op2)
+%{
+  match(Set cr (OverflowSubL op1 op2));
+
+  format %{ "cmpq    $op1, $op2\t# overflow check long" %}
+  ins_encode %{
+    __ cmpq($op1$$Register, $op2$$constant);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowNegI_rReg(rFlagsReg cr, immI0 zero, rax_RegI op2)
+%{
+  match(Set cr (OverflowSubI zero op2));
+  effect(DEF cr, USE_KILL op2);
+
+  format %{ "negl    $op2\t# overflow check int" %}
+  ins_encode %{
+    __ negl($op2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowNegL_rReg(rFlagsReg cr, immL0 zero, rax_RegL op2)
+%{
+  match(Set cr (OverflowSubL zero op2));
+  effect(DEF cr, USE_KILL op2);
+
+  format %{ "negq    $op2\t# overflow check long" %}
+  ins_encode %{
+    __ negq($op2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct overflowMulI_rReg(rFlagsReg cr, rax_RegI op1, rRegI op2)
+%{
+  match(Set cr (OverflowMulI op1 op2));
+  effect(DEF cr, USE_KILL op1, USE op2);
+
+  format %{ "imull    $op1, $op2\t# overflow check int" %}
+  ins_encode %{
+    __ imull($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg_alu0);
+%}
+
+instruct overflowMulI_rReg_imm(rFlagsReg cr, rRegI op1, immI op2, rRegI tmp)
+%{
+  match(Set cr (OverflowMulI op1 op2));
+  effect(DEF cr, TEMP tmp, USE op1, USE op2);
+
+  format %{ "imull    $tmp, $op1, $op2\t# overflow check int" %}
+  ins_encode %{
+    __ imull($tmp$$Register, $op1$$Register, $op2$$constant);
+  %}
+  ins_pipe(ialu_reg_reg_alu0);
+%}
+
+instruct overflowMulL_rReg(rFlagsReg cr, rax_RegL op1, rRegL op2)
+%{
+  match(Set cr (OverflowMulL op1 op2));
+  effect(DEF cr, USE_KILL op1, USE op2);
+
+  format %{ "imulq    $op1, $op2\t# overflow check long" %}
+  ins_encode %{
+    __ imulq($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg_alu0);
+%}
+
+instruct overflowMulL_rReg_imm(rFlagsReg cr, rRegL op1, immL32 op2, rRegL tmp)
+%{
+  match(Set cr (OverflowMulL op1 op2));
+  effect(DEF cr, TEMP tmp, USE op1, USE op2);
+
+  format %{ "imulq    $tmp, $op1, $op2\t# overflow check long" %}
+  ins_encode %{
+    __ imulq($tmp$$Register, $op1$$Register, $op2$$constant);
+  %}
+  ins_pipe(ialu_reg_reg_alu0);
+%}
+
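These rules replace the removed AddExact*/SubExact*/MulExact*/NegExact* patterns: the math*Exact intrinsics now produce Overflow* nodes whose only output is the condition-code register, which a branch then consumes, while the arithmetic result itself is produced by the ordinary Add/Sub/Mul rules. The overall shape is "do the arithmetic, branch on the overflow flag", which at the C++ level corresponds roughly to the following (the GCC/Clang builtin stands in for the flags register; sketch only):

    #include <cstdint>
    #include <cstdio>

    // Add with an explicit overflow check, the same shape the OverflowAddI
    // rule supports: one add produces both the value and the overflow signal.
    int32_t add_exact(int32_t a, int32_t b, bool* overflowed) {
      int32_t sum;
      *overflowed = __builtin_add_overflow(a, b, &sum);   // roughly ADDL + JO
      return sum;
    }

    int main() {
      bool ovf;
      int32_t s = add_exact(INT32_MAX, 1, &ovf);
      printf("sum=%d overflow=%d\n", s, ovf);   // on overflow the VM would deoptimize
      return 0;
    }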
 
 //----------Control Flow Instructions------------------------------------------
 // Signed compare Instructions
@@ -11396,27 +11330,43 @@
 // ============================================================================
 // inlined locking and unlocking
 
-instruct cmpFastLock(rFlagsReg cr,
-                     rRegP object, rbx_RegP box, rax_RegI tmp, rRegP scr)
-%{
+instruct cmpFastLockRTM(rFlagsReg cr, rRegP object, rbx_RegP box, rax_RegI tmp, rdx_RegI scr, rRegI cx1, rRegI cx2) %{
+  predicate(Compile::current()->use_rtm());
+  match(Set cr (FastLock object box));
+  effect(TEMP tmp, TEMP scr, TEMP cx1, TEMP cx2, USE_KILL box);
+  ins_cost(300);
+  format %{ "fastlock $object,$box\t! kills $box,$tmp,$scr,$cx1,$cx2" %}
+  ins_encode %{
+    __ fast_lock($object$$Register, $box$$Register, $tmp$$Register,
+                 $scr$$Register, $cx1$$Register, $cx2$$Register,
+                 _counters, _rtm_counters, _stack_rtm_counters,
+                 ((Method*)(ra_->C->method()->constant_encoding()))->method_data(),
+                 true, ra_->C->profile_rtm());
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct cmpFastLock(rFlagsReg cr, rRegP object, rbx_RegP box, rax_RegI tmp, rRegP scr) %{
+  predicate(!Compile::current()->use_rtm());
   match(Set cr (FastLock object box));
   effect(TEMP tmp, TEMP scr, USE_KILL box);
-
   ins_cost(300);
   format %{ "fastlock $object,$box\t! kills $box,$tmp,$scr" %}
-  ins_encode(Fast_Lock(object, box, tmp, scr));
+  ins_encode %{
+    __ fast_lock($object$$Register, $box$$Register, $tmp$$Register,
+                 $scr$$Register, noreg, noreg, _counters, NULL, NULL, NULL, false, false);
+  %}
   ins_pipe(pipe_slow);
 %}
 
-instruct cmpFastUnlock(rFlagsReg cr,
-                       rRegP object, rax_RegP box, rRegP tmp)
-%{
+instruct cmpFastUnlock(rFlagsReg cr, rRegP object, rax_RegP box, rRegP tmp) %{
   match(Set cr (FastUnlock object box));
   effect(TEMP tmp, USE_KILL box);
-
   ins_cost(300);
   format %{ "fastunlock $object,$box\t! kills $box,$tmp" %}
-  ins_encode(Fast_Unlock(object, box, tmp));
+  ins_encode %{
+    __ fast_unlock($object$$Register, $box$$Register, $tmp$$Register, ra_->C->use_rtm());
+  %}
   ins_pipe(pipe_slow);
 %}
 
--- a/src/os/bsd/vm/os_bsd.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/os/bsd/vm/os_bsd.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -2590,9 +2590,21 @@
   }
 }
 
-int os::naked_sleep() {
-  // %% make the sleep time an integer flag. for now use 1 millisec.
-  return os::sleep(Thread::current(), 1, false);
+void os::naked_short_sleep(jlong ms) {
+  struct timespec req;
+
+  assert(ms < 1000, "Uninterruptible sleep, short time use only");
+  req.tv_sec = 0;
+  if (ms > 0) {
+    req.tv_nsec = (ms % 1000) * 1000000;
+  }
+  else {
+    req.tv_nsec = 1;
+  }
+
+  nanosleep(&req, NULL);
+
+  return;
 }
 
 // Sleep forever; naked call to OS-specific sleep; use with CAUTION
--- a/src/os/linux/vm/os_linux.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/os/linux/vm/os_linux.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -109,6 +109,8 @@
 
 #define MAX_PATH    (2 * K)
 
+#define MAX_SECS 100000000
+
 // for timer info max values which include all bits
 #define ALL_64_BITS CONST64(0xFFFFFFFFFFFFFFFF)
 
@@ -2433,7 +2435,6 @@
     sem_t _semaphore;
 };
 
-
 Semaphore::Semaphore() {
   sem_init(&_semaphore, 0, 0);
 }
@@ -2455,8 +2456,22 @@
 }
 
 bool Semaphore::timedwait(unsigned int sec, int nsec) {
+
   struct timespec ts;
-  unpackTime(&ts, false, (sec * NANOSECS_PER_SEC) + nsec);
+  // Semaphores are always associated with CLOCK_REALTIME
+  os::Linux::clock_gettime(CLOCK_REALTIME, &ts);
+  // see unpackTime for discussion on overflow checking
+  if (sec >= MAX_SECS) {
+    ts.tv_sec += MAX_SECS;
+    ts.tv_nsec = 0;
+  } else {
+    ts.tv_sec += sec;
+    ts.tv_nsec += nsec;
+    if (ts.tv_nsec >= NANOSECS_PER_SEC) {
+      ts.tv_nsec -= NANOSECS_PER_SEC;
+      ++ts.tv_sec; // note: this must be <= max_secs
+    }
+  }
 
   while (1) {
     int result = sem_timedwait(&_semaphore, &ts);
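Semaphore::timedwait() now computes an absolute CLOCK_REALTIME deadline itself, clamping huge timeouts at MAX_SECS the same way unpackTime does, because sem_timedwait() expects an absolute time on that clock. The retry loop that consumes the deadline is cut off by this hunk; a generic POSIX version of such a loop (a sketch, not the VM's exact code) looks like:

    #include <cerrno>
    #include <ctime>
    #include <semaphore.h>

    // Wait on sem until the absolute CLOCK_REALTIME deadline, retrying when a
    // signal interrupts the wait. Returns true on acquisition, false on
    // timeout or error.
    bool timed_wait(sem_t* sem, const struct timespec* abs_deadline) {
      for (;;) {
        if (sem_timedwait(sem, abs_deadline) == 0) {
          return true;                   // acquired
        }
        if (errno == EINTR) {
          continue;                      // interrupted by a signal: retry
        }
        return false;                    // ETIMEDOUT or another error
      }
    }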
@@ -2961,7 +2976,9 @@
 
   unsigned char vec[1];
   unsigned imin = 1, imax = pages + 1, imid;
-  int mincore_return_value;
+  int mincore_return_value = 0;
+
+  assert(imin <= imax, "Unexpected page size");
 
   while (imin < imax) {
     imid = (imax + imin) / 2;
@@ -3833,9 +3850,33 @@
   }
 }
 
-int os::naked_sleep() {
-  // %% make the sleep time an integer flag. for now use 1 millisec.
-  return os::sleep(Thread::current(), 1, false);
+//
+// Short sleep, direct OS call.
+//
+// Note: certain versions of the Linux CFS scheduler (since 2.6.23) do not
+// guarantee that sched_yield(2) actually gives up the CPU:
+//
+//   * If the thread is alone on its particular CPU, it keeps running.
+//   * Before the introduction of "skip_buddy" (pre 2.6.39), the same holds
+//     whenever "compat_yield" is disabled.
+//
+// So calling this with ms = 0 is an alternative to sched_yield().
+//
+void os::naked_short_sleep(jlong ms) {
+  struct timespec req;
+
+  assert(ms < 1000, "Uninterruptible sleep, short time use only");
+  req.tv_sec = 0;
+  if (ms > 0) {
+    req.tv_nsec = (ms % 1000) * 1000000;
+  }
+  else {
+    req.tv_nsec = 1;
+  }
+
+  nanosleep(&req, NULL);
+
+  return;
 }
 
 // Sleep forever; naked call to OS-specific sleep; use with CAUTION
@@ -5752,7 +5793,6 @@
  * is no need to track notifications.
  */
 
-#define MAX_SECS 100000000
 /*
  * This code is common to linux and solaris and will be moved to a
  * common place in dolphin.
--- a/src/os/linux/vm/perfMemory_linux.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/os/linux/vm/perfMemory_linux.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -891,8 +891,16 @@
   FREE_C_HEAP_ARRAY(char, filename, mtInternal);
 
  // open the shared memory file for the given vmid
-  fd = open_sharedmem_file(rfilename, file_flags, CHECK);
-  assert(fd != OS_ERR, "unexpected value");
+  fd = open_sharedmem_file(rfilename, file_flags, THREAD);
+
+  if (fd == OS_ERR) {
+    return;
+  }
+
+  if (HAS_PENDING_EXCEPTION) {
+    ::close(fd);
+    return;
+  }
 
   if (*sizep == 0) {
     size = sharedmem_filesize(fd, CHECK);
--- a/src/os/solaris/vm/os_solaris.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/os/solaris/vm/os_solaris.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -2232,8 +2232,8 @@
         st->cr();
         status = true;
       }
-      ::close(fd);
     }
+    ::close(fd);
   }
   return status;
 }
@@ -2967,7 +2967,7 @@
 char *os::scan_pages(char *start, char* end, page_info* page_expected, page_info* page_found) {
   const uint_t info_types[] = { MEMINFO_VLGRP, MEMINFO_VPAGESIZE };
   const size_t types = sizeof(info_types) / sizeof(info_types[0]);
-  uint64_t addrs[MAX_MEMINFO_CNT], outdata[types * MAX_MEMINFO_CNT];
+  uint64_t addrs[MAX_MEMINFO_CNT], outdata[types * MAX_MEMINFO_CNT + 1];
   uint_t validity[MAX_MEMINFO_CNT];
 
   size_t page_size = MAX2((size_t)os::vm_page_size(), page_expected->size);
@@ -3006,7 +3006,7 @@
       }
     }
 
-    if (i != addrs_count) {
+    if (i < addrs_count) {
       if ((validity[i] & 2) != 0) {
         page_found->lgrp_id = outdata[types * i];
       } else {
@@ -3496,9 +3496,14 @@
   return os_sleep(millis, interruptible);
 }
 
-int os::naked_sleep() {
-  // %% make the sleep time an integer flag. for now use 1 millisec.
-  return os_sleep(1, false);
+void os::naked_short_sleep(jlong ms) {
+  assert(ms < 1000, "Uninterruptible sleep, short time use only");
+
+  // usleep is deprecated and removed from POSIX in favour of nanosleep,
+  // but nanosleep requires linking -lrt on Solaris, so usleep is used here.
+  usleep((ms * 1000));
+
+  return;
 }
 
 // Sleep forever; naked call to OS-specific sleep; use with CAUTION
--- a/src/os/solaris/vm/perfMemory_solaris.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/os/solaris/vm/perfMemory_solaris.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -431,10 +431,12 @@
 
       RESTARTABLE(::read(fd, addr, remaining), result);
       if (result == OS_ERR) {
+        ::close(fd);
         THROW_MSG_0(vmSymbols::java_io_IOException(), "Read error");
+      } else {
+        remaining-=result;
+        addr+=result;
       }
-      remaining-=result;
-      addr+=result;
     }
 
     ::close(fd);
@@ -906,8 +908,16 @@
   FREE_C_HEAP_ARRAY(char, filename, mtInternal);
 
  // open the shared memory file for the given vmid
-  fd = open_sharedmem_file(rfilename, file_flags, CHECK);
-  assert(fd != OS_ERR, "unexpected value");
+  fd = open_sharedmem_file(rfilename, file_flags, THREAD);
+
+  if (fd == OS_ERR) {
+    return;
+  }
+
+  if (HAS_PENDING_EXCEPTION) {
+    ::close(fd);
+    return;
+  }
 
   if (*sizep == 0) {
     size = sharedmem_filesize(fd, CHECK);
--- a/src/os/windows/vm/os_windows.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/os/windows/vm/os_windows.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -2437,6 +2437,12 @@
     }
   }
 
+  if ((exception_code == EXCEPTION_ACCESS_VIOLATION) &&
+      VM_Version::is_cpuinfo_segv_addr(pc)) {
+    // Verify that the OS saves/restores AVX registers.
+    return Handle_Exception(exceptionInfo, VM_Version::cpuinfo_cont_addr());
+  }
+
   if (t != NULL && t->is_Java_thread()) {
     JavaThread* thread = (JavaThread*) t;
     bool in_java = thread->thread_state() == _thread_in_Java;
@@ -3496,6 +3502,16 @@
   return result;
 }
 
+//
+// Short sleep, direct OS call.
+//
+// ms = 0 means allow other threads (if any) to run.
+//
+void os::naked_short_sleep(jlong ms) {
+  assert(ms < 1000, "Uninterruptible sleep, short time use only");
+  Sleep(ms);
+}
+
 // Sleep forever; naked call to OS-specific sleep; use with CAUTION
 void os::infinite_sleep() {
   while (true) {    // sleep forever ...
@@ -3623,13 +3639,14 @@
          "possibility of dangling Thread pointer");
 
   OSThread* osthread = thread->osthread();
-  bool interrupted = osthread->interrupted();
   // There is no synchronization between the setting of the interrupt
   // and it being cleared here. It is critical - see 6535709 - that
   // we only clear the interrupt state, and reset the interrupt event,
   // if we are going to report that we were indeed interrupted - else
   // an interrupt can be "lost", leading to spurious wakeups or lost wakeups
-  // depending on the timing
+  // depending on the timing. In addition, the interrupt event is checked so
+  // that we only report an interrupt when the thread really was interrupted,
+  // which prevents spurious wakeups.
+  bool interrupted = osthread->interrupted() && (WaitForSingleObject(osthread->interrupt_event(), 0) == WAIT_OBJECT_0);
   if (interrupted && clear_interrupted) {
     osthread->set_interrupted(false);
     ResetEvent(osthread->interrupt_event());
--- a/src/os_cpu/bsd_x86/vm/os_bsd_x86.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/os_cpu/bsd_x86/vm/os_bsd_x86.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -492,6 +492,11 @@
       }
     }
 
+    if ((sig == SIGSEGV || sig == SIGBUS) && VM_Version::is_cpuinfo_segv_addr(pc)) {
+      // Verify that the OS saves/restores AVX registers.
+      stub = VM_Version::cpuinfo_cont_addr();
+    }
+
     // We test if stub is already set (by the stack overflow code
     // above) so it is not overwritten by the code that follows. This
     // check is not required on other platforms, because on other
--- a/src/os_cpu/linux_x86/vm/os_linux_x86.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/os_cpu/linux_x86/vm/os_linux_x86.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -337,6 +337,11 @@
       }
     }
 
+    if ((sig == SIGSEGV) && VM_Version::is_cpuinfo_segv_addr(pc)) {
+      // Verify that the OS saves/restores AVX registers.
+      stub = VM_Version::cpuinfo_cont_addr();
+    }
+
     if (thread->thread_state() == _thread_in_Java) {
       // Java thread running in Java code => find exception handler if any
       // a fault inside compiled code, the interpreter, or a stub
--- a/src/os_cpu/solaris_x86/vm/os_solaris_x86.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/os_cpu/solaris_x86/vm/os_solaris_x86.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -459,6 +459,11 @@
       }
     }
 
+    if ((sig == SIGSEGV) && VM_Version::is_cpuinfo_segv_addr(pc)) {
+      // Verify that the OS saves/restores AVX registers.
+      stub = VM_Version::cpuinfo_cont_addr();
+    }
+
     if (thread->thread_state() == _thread_in_vm) {
       if (sig == SIGBUS && info->si_code == BUS_OBJERR && thread->doing_unsafe_access()) {
         stub = StubRoutines::handler_for_unsafe_access();
@@ -475,9 +480,11 @@
         // here if the underlying file has been truncated.
         // Do not crash the VM in such a case.
         CodeBlob* cb = CodeCache::find_blob_unsafe(pc);
-        nmethod* nm = cb->is_nmethod() ? (nmethod*)cb : NULL;
-        if (nm != NULL && nm->has_unsafe_access()) {
-          stub = StubRoutines::handler_for_unsafe_access();
+        if (cb != NULL) {
+          nmethod* nm = cb->is_nmethod() ? (nmethod*)cb : NULL;
+          if (nm != NULL && nm->has_unsafe_access()) {
+            stub = StubRoutines::handler_for_unsafe_access();
+          }
         }
       }
       else
@@ -724,6 +731,7 @@
   err.report_and_die();
 
   ShouldNotReachHere();
+  return false;
 }
 
 void os::print_context(outputStream *st, void *context) {
--- a/src/share/vm/adlc/archDesc.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/adlc/archDesc.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1193,15 +1193,12 @@
          || strcmp(idealName,"CmpF") == 0
          || strcmp(idealName,"FastLock") == 0
          || strcmp(idealName,"FastUnlock") == 0
-         || strcmp(idealName,"AddExactI") == 0
-         || strcmp(idealName,"AddExactL") == 0
-         || strcmp(idealName,"SubExactI") == 0
-         || strcmp(idealName,"SubExactL") == 0
-         || strcmp(idealName,"MulExactI") == 0
-         || strcmp(idealName,"MulExactL") == 0
-         || strcmp(idealName,"NegExactI") == 0
-         || strcmp(idealName,"NegExactL") == 0
-         || strcmp(idealName,"FlagsProj") == 0
+         || strcmp(idealName,"OverflowAddI") == 0
+         || strcmp(idealName,"OverflowAddL") == 0
+         || strcmp(idealName,"OverflowSubI") == 0
+         || strcmp(idealName,"OverflowSubL") == 0
+         || strcmp(idealName,"OverflowMulI") == 0
+         || strcmp(idealName,"OverflowMulL") == 0
          || strcmp(idealName,"Bool") == 0
          || strcmp(idealName,"Binary") == 0 ) {
       // Removed ConI from the must_clone list.  CPUs that cannot use
--- a/src/share/vm/adlc/formssel.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/adlc/formssel.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -660,6 +660,7 @@
   int USE_of_memory  = 0;
   int DEF_of_memory  = 0;
   const char*    last_memory_DEF = NULL; // to test DEF/USE pairing in asserts
+  const char*    last_memory_USE = NULL;
   Component     *unique          = NULL;
   Component     *comp            = NULL;
   ComponentList &components      = (ComponentList &)_components;
@@ -681,7 +682,16 @@
           assert(0 == strcmp(last_memory_DEF, comp->_name), "every memory DEF is followed by a USE of the same name");
           last_memory_DEF = NULL;
         }
-        USE_of_memory++;
+        // Handle the same memory operand being USEd more than once, as happens
+        // with the BMI1 instruction patterns.
+        if (last_memory_USE != NULL) {
+          if (strcmp(comp->_name, last_memory_USE) != 0) {
+            USE_of_memory++;
+          }
+        } else {
+          USE_of_memory++;
+        }
+        last_memory_USE = comp->_name;
+
         if (DEF_of_memory == 0)  // defs take precedence
           unique = comp;
       } else {
--- a/src/share/vm/adlc/output_c.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/adlc/output_c.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1599,6 +1599,8 @@
 
       if( node->is_ideal_fastlock() && new_inst->is_ideal_fastlock() ) {
         fprintf(fp, "  ((MachFastLockNode*)n%d)->_counters = _counters;\n",cnt);
+        fprintf(fp, "  ((MachFastLockNode*)n%d)->_rtm_counters = _rtm_counters;\n",cnt);
+        fprintf(fp, "  ((MachFastLockNode*)n%d)->_stack_rtm_counters = _stack_rtm_counters;\n",cnt);
       }
 
       // Fill in the bottom_type where requested
@@ -3980,6 +3982,8 @@
   }
   if( inst->is_ideal_fastlock() ) {
     fprintf(fp_cpp, "%s node->_counters = _leaf->as_FastLock()->counters();\n", indent);
+    fprintf(fp_cpp, "%s node->_rtm_counters = _leaf->as_FastLock()->rtm_counters();\n", indent);
+    fprintf(fp_cpp, "%s node->_stack_rtm_counters = _leaf->as_FastLock()->stack_rtm_counters();\n", indent);
   }
 
 }
--- a/src/share/vm/ci/ciClassList.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/ci/ciClassList.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -103,6 +103,7 @@
 friend class ciMethodType;             \
 friend class ciReceiverTypeData;       \
 friend class ciTypeEntries;            \
+friend class ciSpeculativeTrapData;    \
 friend class ciSymbol;                 \
 friend class ciArray;                  \
 friend class ciObjArray;               \
--- a/src/share/vm/ci/ciEnv.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/ci/ciEnv.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -926,7 +926,8 @@
                             AbstractCompiler* compiler,
                             int comp_level,
                             bool has_unsafe_access,
-                            bool has_wide_vectors) {
+                            bool has_wide_vectors,
+                            RTMState  rtm_state) {
   VM_ENTRY_MARK;
   nmethod* nm = NULL;
   {
@@ -973,6 +974,15 @@
 
     methodHandle method(THREAD, target->get_Method());
 
+#if INCLUDE_RTM_OPT
+    if (!failing() && (rtm_state != NoRTM) &&
+        (method()->method_data() != NULL) &&
+        (method()->method_data()->rtm_state() != rtm_state)) {
+      // Preemptive decompile if rtm state was changed.
+      record_failure("RTM state change invalidated rtm code");
+    }
+#endif
+
     if (failing()) {
       // While not a true deoptimization, it is a preemptive decompile.
       MethodData* mdo = method()->method_data();
@@ -999,13 +1009,15 @@
                                frame_words, oop_map_set,
                                handler_table, inc_table,
                                compiler, comp_level);
-
     // Free codeBlobs
     code_buffer->free_blob();
 
     if (nm != NULL) {
       nm->set_has_unsafe_access(has_unsafe_access);
       nm->set_has_wide_vectors(has_wide_vectors);
+#if INCLUDE_RTM_OPT
+      nm->set_rtm_state(rtm_state);
+#endif
 
       // Record successful registration.
       // (Put nm into the task handle *before* publishing to the Java heap.)
--- a/src/share/vm/ci/ciEnv.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/ci/ciEnv.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -363,7 +363,8 @@
                        AbstractCompiler*         compiler,
                        int                       comp_level,
                        bool                      has_unsafe_access,
-                       bool                      has_wide_vectors);
+                       bool                      has_wide_vectors,
+                       RTMState                  rtm_state = NoRTM);
 
 
   // Access to certain well known ciObjects.
--- a/src/share/vm/ci/ciMethodData.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/ci/ciMethodData.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -78,6 +78,36 @@
   _parameters = NULL;
 }
 
+void ciMethodData::load_extra_data() {
+  MethodData* mdo = get_MethodData();
+
+  // Speculative trap entries also hold a pointer to a Method, so they
+  // need to be translated.
+  DataLayout* dp_src  = mdo->extra_data_base();
+  DataLayout* end_src = mdo->extra_data_limit();
+  DataLayout* dp_dst  = extra_data_base();
+  for (;; dp_src = MethodData::next_extra(dp_src), dp_dst = MethodData::next_extra(dp_dst)) {
+    assert(dp_src < end_src, "moved past end of extra data");
+    // New traps in the MDO can be added as we translate the copy so
+    // look at the entries in the copy.
+    switch(dp_dst->tag()) {
+    case DataLayout::speculative_trap_data_tag: {
+      ciSpeculativeTrapData* data_dst = new ciSpeculativeTrapData(dp_dst);
+      SpeculativeTrapData* data_src = new SpeculativeTrapData(dp_src);
+      data_dst->translate_from(data_src);
+      break;
+    }
+    case DataLayout::bit_data_tag:
+      break;
+    case DataLayout::no_tag:
+    case DataLayout::arg_info_data_tag:
+      // An empty slot or ArgInfoData entry marks the end of the trap data
+      return;
+    default:
+      fatal(err_msg("bad tag = %d", dp_dst->tag()));
+    }
+  }
+}
+
 void ciMethodData::load_data() {
   MethodData* mdo = get_MethodData();
   if (mdo == NULL) {
@@ -116,6 +146,8 @@
     parameters->translate_from(mdo->parameters_type_data());
   }
 
+  load_extra_data();
+
   // Note:  Extra data are all BitData, and do not need translation.
   _current_mileage = MethodData::mileage_of(mdo->method());
   _invocation_counter = mdo->invocation_count();
@@ -156,6 +188,12 @@
   set_type(translate_klass(k));
 }
 
+void ciSpeculativeTrapData::translate_from(const ProfileData* data) {
+  Method* m = data->as_SpeculativeTrapData()->method();
+  ciMethod* ci_m = CURRENT_ENV->get_method(m);
+  set_method(ci_m);
+}
+
 // Get the data at an arbitrary (sort of) data index.
 ciProfileData* ciMethodData::data_at(int data_index) {
   if (out_of_bounds(data_index)) {
@@ -203,33 +241,65 @@
   return next;
 }
 
-// Translate a bci to its corresponding data, or NULL.
-ciProfileData* ciMethodData::bci_to_data(int bci) {
-  ciProfileData* data = data_before(bci);
-  for ( ; is_valid(data); data = next_data(data)) {
-    if (data->bci() == bci) {
-      set_hint_di(dp_to_di(data->dp()));
-      return data;
-    } else if (data->bci() > bci) {
-      break;
-    }
-  }
+ciProfileData* ciMethodData::bci_to_extra_data(int bci, ciMethod* m, bool& two_free_slots) {
   // bci_to_extra_data(bci) ...
   DataLayout* dp  = data_layout_at(data_size());
   DataLayout* end = data_layout_at(data_size() + extra_data_size());
-  for (; dp < end; dp = MethodData::next_extra(dp)) {
-    if (dp->tag() == DataLayout::no_tag) {
+  two_free_slots = false;
+  for (; dp < end; dp = MethodData::next_extra(dp)) {
+    switch(dp->tag()) {
+    case DataLayout::no_tag:
       _saw_free_extra_data = true;  // observed an empty slot (common case)
+      two_free_slots = (MethodData::next_extra(dp)->tag() == DataLayout::no_tag);
       return NULL;
+    case DataLayout::arg_info_data_tag:
+      return NULL; // ArgInfoData is at the end of extra data section.
+    case DataLayout::bit_data_tag:
+      if (m == NULL && dp->bci() == bci) {
+        return new ciBitData(dp);
+      }
+      break;
+    case DataLayout::speculative_trap_data_tag: {
+      ciSpeculativeTrapData* data = new ciSpeculativeTrapData(dp);
+      // data->method() might be null if the MDO is snapshotted
+      // concurrently with a trap
+      if (m != NULL && data->method() == m && dp->bci() == bci) {
+        return data;
+      }
+      break;
     }
-    if (dp->tag() == DataLayout::arg_info_data_tag) {
-      break; // ArgInfoData is at the end of extra data section.
+    default:
+      fatal(err_msg("bad tag = %d", dp->tag()));
     }
-    if (dp->bci() == bci) {
-      assert(dp->tag() == DataLayout::bit_data_tag, "sane");
-      return new ciBitData(dp);
+  }
+  return NULL;
+}
+
+// Translate a bci to its corresponding data, or NULL.
+ciProfileData* ciMethodData::bci_to_data(int bci, ciMethod* m) {
+  // If m is not NULL we look for a SpeculativeTrapData entry
+  if (m == NULL) {
+    ciProfileData* data = data_before(bci);
+    for ( ; is_valid(data); data = next_data(data)) {
+      if (data->bci() == bci) {
+        set_hint_di(dp_to_di(data->dp()));
+        return data;
+      } else if (data->bci() > bci) {
+        break;
+      }
     }
   }
+  bool two_free_slots = false;
+  ciProfileData* result = bci_to_extra_data(bci, m, two_free_slots);
+  if (result != NULL) {
+    return result;
+  }
+  if (m != NULL && !two_free_slots) {
+    // We were looking for a SpeculativeTrapData entry but didn't find
+    // one, and there is no room for more SpeculativeTrapData entries.
+    // Fall back to the non-SpeculativeTrapData entries.
+    return bci_to_data(bci, NULL);
+  }
   return NULL;
 }
 
@@ -525,18 +595,25 @@
   st->print_cr("--- Extra data:");
   DataLayout* dp  = data_layout_at(data_size());
   DataLayout* end = data_layout_at(data_size() + extra_data_size());
-  for (; dp < end; dp = MethodData::next_extra(dp)) {
-    if (dp->tag() == DataLayout::no_tag)  continue;
-    if (dp->tag() == DataLayout::bit_data_tag) {
+  for (;; dp = MethodData::next_extra(dp)) {
+    assert(dp < end, "moved past end of extra data");
+    switch (dp->tag()) {
+    case DataLayout::no_tag:
+      continue;
+    case DataLayout::bit_data_tag:
       data = new BitData(dp);
-    } else {
-      assert(dp->tag() == DataLayout::arg_info_data_tag, "must be BitData or ArgInfo");
+      break;
+    case DataLayout::arg_info_data_tag:
       data = new ciArgInfoData(dp);
       dp = end; // ArgInfoData is at the end of extra data section.
+      break;
+    default:
+      fatal(err_msg("unexpected tag %d", dp->tag()));
     }
     st->print("%d", dp_to_di(data->dp()));
     st->fill_to(6);
     data->print_data_on(st);
+    if (dp >= end) return;
   }
 }
 
@@ -569,8 +646,8 @@
   st->cr();
 }
 
-void ciCallTypeData::print_data_on(outputStream* st) const {
-  print_shared(st, "ciCallTypeData");
+void ciCallTypeData::print_data_on(outputStream* st, const char* extra) const {
+  print_shared(st, "ciCallTypeData", extra);
   if (has_arguments()) {
     tab(st, true);
     st->print("argument types");
@@ -599,18 +676,18 @@
   }
 }
 
-void ciReceiverTypeData::print_data_on(outputStream* st) const {
-  print_shared(st, "ciReceiverTypeData");
+void ciReceiverTypeData::print_data_on(outputStream* st, const char* extra) const {
+  print_shared(st, "ciReceiverTypeData", extra);
   print_receiver_data_on(st);
 }
 
-void ciVirtualCallData::print_data_on(outputStream* st) const {
-  print_shared(st, "ciVirtualCallData");
+void ciVirtualCallData::print_data_on(outputStream* st, const char* extra) const {
+  print_shared(st, "ciVirtualCallData", extra);
   rtd_super()->print_receiver_data_on(st);
 }
 
-void ciVirtualCallTypeData::print_data_on(outputStream* st) const {
-  print_shared(st, "ciVirtualCallTypeData");
+void ciVirtualCallTypeData::print_data_on(outputStream* st, const char* extra) const {
+  print_shared(st, "ciVirtualCallTypeData", extra);
   rtd_super()->print_receiver_data_on(st);
   if (has_arguments()) {
     tab(st, true);
@@ -624,8 +701,15 @@
   }
 }
 
-void ciParametersTypeData::print_data_on(outputStream* st) const {
-  st->print_cr("Parametertypes");
+void ciParametersTypeData::print_data_on(outputStream* st, const char* extra) const {
+  st->print_cr("ciParametersTypeData");
   parameters()->print_data_on(st);
 }
+
+void ciSpeculativeTrapData::print_data_on(outputStream* st, const char* extra) const {
+  st->print_cr("ciSpeculativeTrapData");
+  tab(st);
+  method()->print_short_name(st);
+  st->cr();
+}
 #endif
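
The new two-level lookup above (regular profile data first, then the extra-data section scanned by tag) can be illustrated with a small standalone sketch. The types below are hypothetical stand-ins, not HotSpot's ci* classes; only the tag-dispatch shape of bci_to_extra_data() carries over.

#include <cstdio>
#include <vector>

// Hypothetical stand-ins for DataLayout tags and extra-data entries.
enum Tag { kNoTag, kBitData, kArgInfo, kSpeculativeTrap };
struct Entry { Tag tag; int bci; const void* method; };

// Scan the extra-data section in order; an empty slot or the trailing
// ArgInfo entry terminates the scan, as in bci_to_extra_data() above.
const Entry* find_extra(const std::vector<Entry>& extra, int bci, const void* m) {
  for (size_t i = 0; i < extra.size(); i++) {
    const Entry& e = extra[i];
    switch (e.tag) {
    case kNoTag:
    case kArgInfo:
      return NULL;                              // end of the trap data
    case kBitData:
      if (m == NULL && e.bci == bci) return &e; // plain per-bci entry
      break;
    case kSpeculativeTrap:
      if (m != NULL && e.method == m && e.bci == bci) return &e;
      break;
    }
  }
  return NULL;
}

int main() {
  int dummy_method = 0;
  std::vector<Entry> extra;
  Entry trap = { kSpeculativeTrap, 7, &dummy_method };
  Entry bit  = { kBitData, 12, NULL };
  Entry end  = { kNoTag, 0, NULL };
  extra.push_back(trap); extra.push_back(bit); extra.push_back(end);
  // Look up the speculative trap recorded for (bci 7, dummy_method).
  std::printf("found=%d\n", find_extra(extra, 7, &dummy_method) != NULL);
  return 0;
}
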
--- a/src/share/vm/ci/ciMethodData.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/ci/ciMethodData.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -31,6 +31,7 @@
 #include "ci/ciUtilities.hpp"
 #include "oops/methodData.hpp"
 #include "oops/oop.inline.hpp"
+#include "runtime/deoptimization.hpp"
 
 class ciBitData;
 class ciCounterData;
@@ -44,6 +45,7 @@
 class ciCallTypeData;
 class ciVirtualCallTypeData;
 class ciParametersTypeData;
+class ciSpeculativeTrapData;
 
 typedef ProfileData ciProfileData;
 
@@ -173,7 +175,7 @@
   }
 
 #ifndef PRODUCT
-  void print_data_on(outputStream* st) const;
+  void print_data_on(outputStream* st, const char* extra) const;
 #endif
 };
 
@@ -200,7 +202,7 @@
   }
   void translate_receiver_data_from(const ProfileData* data);
 #ifndef PRODUCT
-  void print_data_on(outputStream* st) const;
+  void print_data_on(outputStream* st, const char* extra) const;
   void print_receiver_data_on(outputStream* st) const;
 #endif
 };
@@ -225,7 +227,7 @@
     rtd_super()->translate_receiver_data_from(data);
   }
 #ifndef PRODUCT
-  void print_data_on(outputStream* st) const;
+  void print_data_on(outputStream* st, const char* extra) const;
 #endif
 };
 
@@ -287,7 +289,7 @@
   }
 
 #ifndef PRODUCT
-  void print_data_on(outputStream* st) const;
+  void print_data_on(outputStream* st, const char* extra) const;
 #endif
 };
 
@@ -336,7 +338,26 @@
   }
 
 #ifndef PRODUCT
-  void print_data_on(outputStream* st) const;
+  void print_data_on(outputStream* st, const char* extra) const;
+#endif
+};
+
+class ciSpeculativeTrapData : public SpeculativeTrapData {
+public:
+  ciSpeculativeTrapData(DataLayout* layout) : SpeculativeTrapData(layout) {}
+
+  virtual void translate_from(const ProfileData* data);
+
+  ciMethod* method() const {
+    return (ciMethod*)intptr_at(method_offset);
+  }
+
+  void set_method(ciMethod* m) {
+    set_intptr_at(method_offset, (intptr_t)m);
+  }
+
+#ifndef PRODUCT
+  void print_data_on(outputStream* st, const char* extra) const;
 #endif
 };
 
@@ -436,6 +457,16 @@
 
   ciArgInfoData *arg_info() const;
 
+  address data_base() const {
+    return (address) _data;
+  }
+  DataLayout* limit_data_position() const {
+    return (DataLayout*)((address)data_base() + _data_size);
+  }
+
+  void load_extra_data();
+  ciProfileData* bci_to_extra_data(int bci, ciMethod* m, bool& two_free_slots);
+
 public:
   bool is_method_data() const { return true; }
 
@@ -447,6 +478,18 @@
 
   int invocation_count() { return _invocation_counter; }
   int backedge_count()   { return _backedge_counter;   }
+
+#if INCLUDE_RTM_OPT
+  // return cached value
+  int rtm_state() {
+    if (is_empty()) {
+      return NoRTM;
+    } else {
+      return get_MethodData()->rtm_state();
+    }
+  }
+#endif
+
   // Transfer information about the method to MethodData*.
   // would_profile means we would like to profile this method,
   // meaning it's not trivial.
@@ -475,9 +518,11 @@
   ciProfileData* next_data(ciProfileData* current);
   bool is_valid(ciProfileData* current) { return current != NULL; }
 
-  // Get the data at an arbitrary bci, or NULL if there is none.
-  ciProfileData* bci_to_data(int bci);
-  ciProfileData* bci_to_extra_data(int bci, bool create_if_missing);
+  DataLayout* extra_data_base() const { return limit_data_position(); }
+
+  // Get the data at an arbitrary bci, or NULL if there is none. If m
+  // is not NULL, look for a matching SpeculativeTrapData entry first.
+  ciProfileData* bci_to_data(int bci, ciMethod* m = NULL);
 
   uint overflow_trap_count() const {
     return _orig.overflow_trap_count();
@@ -496,12 +541,13 @@
 
   // Helpful query functions that decode trap_state.
   int has_trap_at(ciProfileData* data, int reason);
-  int has_trap_at(int bci, int reason) {
-    return has_trap_at(bci_to_data(bci), reason);
+  int has_trap_at(int bci, ciMethod* m, int reason) {
+    assert((m != NULL) == Deoptimization::reason_is_speculate(reason), "inconsistent method/reason");
+    return has_trap_at(bci_to_data(bci, m), reason);
   }
   int trap_recompiled_at(ciProfileData* data);
-  int trap_recompiled_at(int bci) {
-    return trap_recompiled_at(bci_to_data(bci));
+  int trap_recompiled_at(int bci, ciMethod* m) {
+    return trap_recompiled_at(bci_to_data(bci, m));
   }
 
   void clear_escape_info();
--- a/src/share/vm/classfile/altHashing.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/classfile/altHashing.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -39,18 +39,18 @@
 }
 
 // Seed value used for each alternative hash calculated.
-jint AltHashing::compute_seed() {
+juint AltHashing::compute_seed() {
   jlong nanos = os::javaTimeNanos();
   jlong now = os::javaTimeMillis();
-  jint SEED_MATERIAL[8] = {
-            (jint) object_hash(SystemDictionary::String_klass()),
-            (jint) object_hash(SystemDictionary::System_klass()),
-            (jint) os::random(),  // current thread isn't a java thread
-            (jint) (((julong)nanos) >> 32),
-            (jint) nanos,
-            (jint) (((julong)now) >> 32),
-            (jint) now,
-            (jint) (os::javaTimeNanos() >> 2)
+  int SEED_MATERIAL[8] = {
+            (int) object_hash(SystemDictionary::String_klass()),
+            (int) object_hash(SystemDictionary::System_klass()),
+            (int) os::random(),  // current thread isn't a java thread
+            (int) (((julong)nanos) >> 32),
+            (int) nanos,
+            (int) (((julong)now) >> 32),
+            (int) now,
+            (int) (os::javaTimeNanos() >> 2)
   };
 
   return murmur3_32(SEED_MATERIAL, 8);
@@ -58,14 +58,14 @@
 
 
 // Murmur3 hashing for Symbol
-jint AltHashing::murmur3_32(jint seed, const jbyte* data, int len) {
-  jint h1 = seed;
+juint AltHashing::murmur3_32(juint seed, const jbyte* data, int len) {
+  juint h1 = seed;
   int count = len;
   int offset = 0;
 
   // body
   while (count >= 4) {
-    jint k1 = (data[offset] & 0x0FF)
+    juint k1 = (data[offset] & 0x0FF)
         | (data[offset + 1] & 0x0FF) << 8
         | (data[offset + 2] & 0x0FF) << 16
         | data[offset + 3] << 24;
@@ -85,7 +85,7 @@
   // tail
 
   if (count > 0) {
-    jint k1 = 0;
+    juint k1 = 0;
 
     switch (count) {
       case 3:
@@ -109,18 +109,18 @@
   h1 ^= len;
 
   // finalization mix force all bits of a hash block to avalanche
-  h1 ^= ((unsigned int)h1) >> 16;
+  h1 ^= h1 >> 16;
   h1 *= 0x85ebca6b;
-  h1 ^= ((unsigned int)h1) >> 13;
+  h1 ^= h1 >> 13;
   h1 *= 0xc2b2ae35;
-  h1 ^= ((unsigned int)h1) >> 16;
+  h1 ^= h1 >> 16;
 
   return h1;
 }
 
 // Murmur3 hashing for Strings
-jint AltHashing::murmur3_32(jint seed, const jchar* data, int len) {
-  jint h1 = seed;
+juint AltHashing::murmur3_32(juint seed, const jchar* data, int len) {
+  juint h1 = seed;
 
   int off = 0;
   int count = len;
@@ -129,7 +129,7 @@
   while (count >= 2) {
     jchar d1 = data[off++] & 0xFFFF;
     jchar d2 = data[off++];
-    jint k1 = (d1 | d2 << 16);
+    juint k1 = (d1 | d2 << 16);
 
     count -= 2;
 
@@ -145,7 +145,7 @@
   // tail
 
   if (count > 0) {
-    int k1 = data[off];
+    juint k1 = (juint)data[off];
 
     k1 *= 0xcc9e2d51;
     k1 = Integer_rotateLeft(k1, 15);
@@ -157,25 +157,25 @@
   h1 ^= len * 2; // (Character.SIZE / Byte.SIZE);
 
   // finalization mix force all bits of a hash block to avalanche
-  h1 ^= ((unsigned int)h1) >> 16;
+  h1 ^= h1 >> 16;
   h1 *= 0x85ebca6b;
-  h1 ^= ((unsigned int)h1) >> 13;
+  h1 ^= h1 >> 13;
   h1 *= 0xc2b2ae35;
-  h1 ^= ((unsigned int)h1) >> 16;
+  h1 ^= h1 >> 16;
 
   return h1;
 }
 
 // Hash used for the seed.
-jint AltHashing::murmur3_32(jint seed, const int* data, int len) {
-  jint h1 = seed;
+juint AltHashing::murmur3_32(juint seed, const int* data, int len) {
+  juint h1 = seed;
 
   int off = 0;
   int end = len;
 
   // body
   while (off < end) {
-    jint k1 = data[off++];
+    juint k1 = (juint)data[off++];
 
     k1 *= 0xcc9e2d51;
     k1 = Integer_rotateLeft(k1, 15);
@@ -193,26 +193,26 @@
   h1 ^= len * 4; // (Integer.SIZE / Byte.SIZE);
 
   // finalization mix force all bits of a hash block to avalanche
-  h1 ^= ((juint)h1) >> 16;
+  h1 ^= h1 >> 16;
   h1 *= 0x85ebca6b;
-  h1 ^= ((juint)h1) >> 13;
+  h1 ^= h1 >> 13;
   h1 *= 0xc2b2ae35;
-  h1 ^= ((juint)h1) >> 16;
+  h1 ^= h1 >> 16;
 
   return h1;
 }
 
-jint AltHashing::murmur3_32(const int* data, int len) {
+juint AltHashing::murmur3_32(const int* data, int len) {
   return murmur3_32(0, data, len);
 }
 
 #ifndef PRODUCT
 // Overloaded versions for internal test.
-jint AltHashing::murmur3_32(const jbyte* data, int len) {
+juint AltHashing::murmur3_32(const jbyte* data, int len) {
   return murmur3_32(0, data, len);
 }
 
-jint AltHashing::murmur3_32(const jchar* data, int len) {
+juint AltHashing::murmur3_32(const jchar* data, int len) {
   return murmur3_32(0, data, len);
 }
 
@@ -251,11 +251,11 @@
 
   // Hash subranges {}, {0}, {0,1}, {0,1,2}, ..., {0,...,255}
   for (int i = 0; i < 256; i++) {
-    jint hash = murmur3_32(256 - i, vector, i);
+    juint hash = murmur3_32(256 - i, vector, i);
     hashes[i * 4] = (jbyte) hash;
-    hashes[i * 4 + 1] = (jbyte) (((juint)hash) >> 8);
-    hashes[i * 4 + 2] = (jbyte) (((juint)hash) >> 16);
-    hashes[i * 4 + 3] = (jbyte) (((juint)hash) >> 24);
+    hashes[i * 4 + 1] = (jbyte)(hash >> 8);
+    hashes[i * 4 + 2] = (jbyte)(hash >> 16);
+    hashes[i * 4 + 3] = (jbyte)(hash >> 24);
   }
 
   // hash to get const result.
@@ -269,7 +269,7 @@
 }
 
 void AltHashing::testEquivalentHashes() {
-  jint jbytes, jchars, ints;
+  juint jbytes, jchars, ints;
 
   // printf("testEquivalentHashes\n");
 
--- a/src/share/vm/classfile/altHashing.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/classfile/altHashing.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -39,24 +39,24 @@
 class AltHashing : AllStatic {
 
   // utility function copied from java/lang/Integer
-  static jint Integer_rotateLeft(jint i, int distance) {
-    return (i << distance) | (((juint)i) >> (32-distance));
+  static juint Integer_rotateLeft(juint i, int distance) {
+    return (i << distance) | (i >> (32-distance));
   }
-  static jint murmur3_32(const int* data, int len);
-  static jint murmur3_32(jint seed, const int* data, int len);
+  static juint murmur3_32(const int* data, int len);
+  static juint murmur3_32(juint seed, const int* data, int len);
 
 #ifndef PRODUCT
   // Hashing functions used for internal testing
-  static jint murmur3_32(const jbyte* data, int len);
-  static jint murmur3_32(const jchar* data, int len);
+  static juint murmur3_32(const jbyte* data, int len);
+  static juint murmur3_32(const jchar* data, int len);
   static void testMurmur3_32_ByteArray();
   static void testEquivalentHashes();
 #endif // PRODUCT
 
  public:
-  static jint compute_seed();
-  static jint murmur3_32(jint seed, const jbyte* data, int len);
-  static jint murmur3_32(jint seed, const jchar* data, int len);
+  static juint compute_seed();
+  static juint murmur3_32(juint seed, const jbyte* data, int len);
+  static juint murmur3_32(juint seed, const jchar* data, int len);
   NOT_PRODUCT(static void test_alt_hash();)
 };
 #endif // SHARE_VM_CLASSFILE_ALTHASHING_HPP
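
The jint to juint change makes every right shift in the hash a logical shift without the earlier casts. A minimal sketch of the MurmurHash3 finalization mix in plain uint32_t terms (standard C++, not HotSpot code), using the same constants that appear in the diff:

#include <cstdio>
#include <stdint.h>

// Finalization mix on an unsigned 32-bit value. With a signed type,
// h >> 16 would be an arithmetic shift and smear the sign bit; the
// unsigned type makes the logical shift explicit.
static uint32_t fmix32(uint32_t h) {
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;
  return h;
}

int main() {
  std::printf("0x%08x\n", fmix32(0xdeadbeef));
  return 0;
}
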
--- a/src/share/vm/classfile/javaClasses.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/classfile/javaClasses.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -61,10 +61,6 @@
 
   static Handle basic_create(int length, TRAPS);
 
-  static void set_value( oop string, typeArrayOop buffer) {
-    assert(initialized, "Must be initialized");
-    string->obj_field_put(value_offset,  (oop)buffer);
-  }
   static void set_offset(oop string, int offset) {
     assert(initialized, "Must be initialized");
     if (offset_offset > 0) {
@@ -122,12 +118,26 @@
     return hash_offset;
   }
 
+  static void set_value(oop string, typeArrayOop buffer) {
+    assert(initialized && (value_offset > 0), "Must be initialized");
+    string->obj_field_put(value_offset, (oop)buffer);
+  }
+  static void set_hash(oop string, unsigned int hash) {
+    assert(initialized && (hash_offset > 0), "Must be initialized");
+    string->int_field_put(hash_offset, hash);
+  }
+
   // Accessors
   static typeArrayOop value(oop java_string) {
     assert(initialized && (value_offset > 0), "Must be initialized");
     assert(is_instance(java_string), "must be java_string");
     return (typeArrayOop) java_string->obj_field(value_offset);
   }
+  static unsigned int hash(oop java_string) {
+    assert(initialized && (hash_offset > 0), "Must be initialized");
+    assert(is_instance(java_string), "must be java_string");
+    return java_string->int_field(hash_offset);
+  }
   static int offset(oop java_string) {
     assert(initialized, "Must be initialized");
     assert(is_instance(java_string), "must be java_string");
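
The new hash()/set_hash() accessors expose java.lang.String's cached hash field. Below is a small sketch of the compute-once caching pattern they support; the class is hypothetical and only the idea carries over, with the formula taken from String.hashCode().

#include <cstdio>
#include <string>

// Hypothetical cached-hash holder; the real field lives in the String oop.
class CachedString {
  std::string _value;
  unsigned int _hash;          // 0 means "not computed yet"
public:
  CachedString(const std::string& v) : _value(v), _hash(0) {}
  unsigned int hash() {
    if (_hash == 0) {
      unsigned int h = 0;
      for (size_t i = 0; i < _value.size(); i++) {
        h = 31 * h + (unsigned char) _value[i];   // String.hashCode() formula
      }
      _hash = h;               // cache for later lookups
    }
    return _hash;
  }
};

int main() {
  CachedString s("interned");
  std::printf("%u %u\n", s.hash(), s.hash());     // second call hits the cache
  return 0;
}
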
--- a/src/share/vm/classfile/symbolTable.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/classfile/symbolTable.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -35,6 +35,9 @@
 #include "oops/oop.inline2.hpp"
 #include "runtime/mutexLocker.hpp"
 #include "utilities/hashtable.inline.hpp"
+#if INCLUDE_ALL_GCS
+#include "gc_implementation/g1/g1StringDedup.hpp"
+#endif
 
 // --------------------------------------------------------------------------
 
@@ -728,6 +731,15 @@
     string = java_lang_String::create_from_unicode(name, len, CHECK_NULL);
   }
 
+#if INCLUDE_ALL_GCS
+  if (G1StringDedup::is_enabled()) {
+    // Deduplicate the string before it is interned. Note that we should never
+    // deduplicate a string after it has been interned. Doing so will counteract
+    // compiler optimizations done on e.g. interned string literals.
+    G1StringDedup::deduplicate(string());
+  }
+#endif
+
   // Grab the StringTable_lock before getting the_table() because it could
   // change at safepoint.
   MutexLocker ml(StringTable_lock, THREAD);
--- a/src/share/vm/code/nmethod.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/code/nmethod.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -481,7 +481,9 @@
   _scavenge_root_link      = NULL;
   _scavenge_root_state     = 0;
   _compiler                = NULL;
-
+#if INCLUDE_RTM_OPT
+  _rtm_state               = NoRTM;
+#endif
 #ifdef HAVE_DTRACE_H
   _trap_offset             = 0;
 #endif // def HAVE_DTRACE_H
--- a/src/share/vm/code/nmethod.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/code/nmethod.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -193,6 +193,12 @@
 
   jbyte _scavenge_root_state;
 
+#if INCLUDE_RTM_OPT
+  // RTM state at compile time. Used during deoptimization to decide
+  // whether to restart collecting RTM locking abort statistics.
+  RTMState _rtm_state;
+#endif
+
   // Nmethod Flushing lock. If non-zero, then the nmethod is not removed
   // and is not made into a zombie. However, once the nmethod is made into
   // a zombie, it will be locked one final time if CompiledMethodUnload
@@ -414,6 +420,12 @@
   bool  is_zombie() const                         { return _state == zombie; }
   bool  is_unloaded() const                       { return _state == unloaded;   }
 
+#if INCLUDE_RTM_OPT
+  // RTM state accessors and mutators
+  RTMState  rtm_state() const                     { return _rtm_state; }
+  void set_rtm_state(RTMState state)              { _rtm_state = state; }
+#endif
+
   // Make the nmethod non entrant. The nmethod will continue to be
   // alive.  It is used when an uncommon trap happens.  Returns true
   // if this thread changed the state of the nmethod or false if
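
A compact sketch of the pattern added above: the compiled method caches the RTM state chosen at compile time so deoptimization can consult it later. The enum values and class below are stand-ins; only the accessor shape matches the VM code.

#include <cstdio>

enum RTMState { NoRTM, UseRTM, ProfileRTM };    // hypothetical stand-in values

class CompiledMethodSketch {
  RTMState _rtm_state;
public:
  CompiledMethodSketch() : _rtm_state(NoRTM) {}
  RTMState rtm_state() const     { return _rtm_state; }
  void set_rtm_state(RTMState s) { _rtm_state = s; }
};

int main() {
  CompiledMethodSketch nm;
  nm.set_rtm_state(ProfileRTM);                 // recorded when the method is compiled
  std::printf("restart abort profiling? %d\n", nm.rtm_state() == ProfileRTM);
  return 0;
}
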
--- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -2496,7 +2496,8 @@
 }
 
 void CMSCollector::report_heap_summary(GCWhen::Type when) {
-  _gc_tracer_cm->report_gc_heap_summary(when, _last_heap_summary, _last_metaspace_summary);
+  _gc_tracer_cm->report_gc_heap_summary(when, _last_heap_summary);
+  _gc_tracer_cm->report_metaspace_summary(when, _last_metaspace_summary);
 }
 
 void CMSCollector::collect_in_foreground(bool clear_all_soft_refs, GCCause::Cause cause) {
--- a/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1809,8 +1809,8 @@
   uint _regions_claimed;
   size_t _freed_bytes;
   FreeRegionList* _local_cleanup_list;
-  OldRegionSet* _old_proxy_set;
-  HumongousRegionSet* _humongous_proxy_set;
+  HeapRegionSetCount _old_regions_removed;
+  HeapRegionSetCount _humongous_regions_removed;
   HRRSCleanupTask* _hrrs_cleanup_task;
   double _claimed_region_time;
   double _max_region_time;
@@ -1819,19 +1819,19 @@
   G1NoteEndOfConcMarkClosure(G1CollectedHeap* g1,
                              int worker_num,
                              FreeRegionList* local_cleanup_list,
-                             OldRegionSet* old_proxy_set,
-                             HumongousRegionSet* humongous_proxy_set,
                              HRRSCleanupTask* hrrs_cleanup_task) :
     _g1(g1), _worker_num(worker_num),
     _max_live_bytes(0), _regions_claimed(0),
     _freed_bytes(0),
     _claimed_region_time(0.0), _max_region_time(0.0),
     _local_cleanup_list(local_cleanup_list),
-    _old_proxy_set(old_proxy_set),
-    _humongous_proxy_set(humongous_proxy_set),
+    _old_regions_removed(),
+    _humongous_regions_removed(),
     _hrrs_cleanup_task(hrrs_cleanup_task) { }
 
   size_t freed_bytes() { return _freed_bytes; }
+  const HeapRegionSetCount& old_regions_removed() { return _old_regions_removed; }
+  const HeapRegionSetCount& humongous_regions_removed() { return _humongous_regions_removed; }
 
   bool doHeapRegion(HeapRegion *hr) {
     if (hr->continuesHumongous()) {
@@ -1844,13 +1844,22 @@
     _regions_claimed++;
     hr->note_end_of_marking();
     _max_live_bytes += hr->max_live_bytes();
-    _g1->free_region_if_empty(hr,
-                              &_freed_bytes,
-                              _local_cleanup_list,
-                              _old_proxy_set,
-                              _humongous_proxy_set,
-                              _hrrs_cleanup_task,
-                              true /* par */);
+
+    if (hr->used() > 0 && hr->max_live_bytes() == 0 && !hr->is_young()) {
+      _freed_bytes += hr->used();
+      hr->set_containing_set(NULL);
+      if (hr->isHumongous()) {
+        assert(hr->startsHumongous(), "we should only see starts humongous");
+        _humongous_regions_removed.increment(1u, hr->capacity());
+        _g1->free_humongous_region(hr, _local_cleanup_list, true);
+      } else {
+        _old_regions_removed.increment(1u, hr->capacity());
+        _g1->free_region(hr, _local_cleanup_list, true);
+      }
+    } else {
+      hr->rem_set()->do_cleanup_work(_hrrs_cleanup_task);
+    }
+
     double region_time = (os::elapsedTime() - start);
     _claimed_region_time += region_time;
     if (region_time > _max_region_time) {
@@ -1883,12 +1892,8 @@
   void work(uint worker_id) {
     double start = os::elapsedTime();
     FreeRegionList local_cleanup_list("Local Cleanup List");
-    OldRegionSet old_proxy_set("Local Cleanup Old Proxy Set");
-    HumongousRegionSet humongous_proxy_set("Local Cleanup Humongous Proxy Set");
     HRRSCleanupTask hrrs_cleanup_task;
     G1NoteEndOfConcMarkClosure g1_note_end(_g1h, worker_id, &local_cleanup_list,
-                                           &old_proxy_set,
-                                           &humongous_proxy_set,
                                            &hrrs_cleanup_task);
     if (G1CollectedHeap::use_parallel_gc_threads()) {
       _g1h->heap_region_par_iterate_chunked(&g1_note_end, worker_id,
@@ -1900,13 +1905,10 @@
     assert(g1_note_end.complete(), "Shouldn't have yielded!");
 
     // Now update the lists
-    _g1h->update_sets_after_freeing_regions(g1_note_end.freed_bytes(),
-                                            NULL /* free_list */,
-                                            &old_proxy_set,
-                                            &humongous_proxy_set,
-                                            true /* par */);
+    _g1h->remove_from_old_sets(g1_note_end.old_regions_removed(), g1_note_end.humongous_regions_removed());
     {
       MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag);
+      _g1h->decrement_summary_bytes(g1_note_end.freed_bytes());
       _max_live_bytes += g1_note_end.max_live_bytes();
       _freed_bytes += g1_note_end.freed_bytes();
 
@@ -1920,14 +1922,14 @@
 
       G1HRPrinter* hr_printer = _g1h->hr_printer();
       if (hr_printer->is_active()) {
-        HeapRegionLinkedListIterator iter(&local_cleanup_list);
+        FreeRegionListIterator iter(&local_cleanup_list);
         while (iter.more_available()) {
           HeapRegion* hr = iter.get_next();
           hr_printer->cleanup(hr);
         }
       }
 
-      _cleanup_list->add_as_tail(&local_cleanup_list);
+      _cleanup_list->add_ordered(&local_cleanup_list);
       assert(local_cleanup_list.is_empty(), "post-condition");
 
       HeapRegionRemSet::finish_cleanup_task(&hrrs_cleanup_task);
@@ -1971,7 +1973,6 @@
     return;
   }
 
-  HRSPhaseSetter x(HRSPhaseCleanup);
   g1h->verify_region_sets_optional();
 
   if (VerifyDuringGC) {
@@ -2144,7 +2145,7 @@
 
   G1CollectedHeap* g1h = G1CollectedHeap::heap();
 
-  _cleanup_list.verify_optional();
+  _cleanup_list.verify_list();
   FreeRegionList tmp_free_list("Tmp Free List");
 
   if (G1ConcRegionFreeingVerbose) {
@@ -2157,9 +2158,9 @@
   // so it's not necessary to take any locks
   while (!_cleanup_list.is_empty()) {
     HeapRegion* hr = _cleanup_list.remove_head();
-    assert(hr != NULL, "the list was not empty");
+    assert(hr != NULL, "Got NULL from a non-empty list");
     hr->par_clear();
-    tmp_free_list.add_as_tail(hr);
+    tmp_free_list.add_ordered(hr);
 
     // Instead of adding one region at a time to the secondary_free_list,
     // we accumulate them in the local list and move them a few at a
@@ -2179,7 +2180,7 @@
 
       {
         MutexLockerEx x(SecondaryFreeList_lock, Mutex::_no_safepoint_check_flag);
-        g1h->secondary_free_list_add_as_tail(&tmp_free_list);
+        g1h->secondary_free_list_add(&tmp_free_list);
         SecondaryFreeList_lock->notify_all();
       }
 
@@ -2528,6 +2529,11 @@
     assert(!rp->discovery_enabled(), "Post condition");
   }
 
+  if (has_overflown()) {
+    // We cannot trust g1_is_alive if the marking stack overflowed
+    return;
+  }
+
   g1h->unlink_string_and_symbol_table(&g1_is_alive,
                                       /* process_strings */ false, // currently strings are always roots
                                       /* process_symbols */ true);
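
The closure above now counts the regions each worker frees locally and merges the totals once, instead of moving regions through per-worker proxy sets. A minimal sketch of that accumulator shape (a hypothetical type that only mirrors HeapRegionSetCount's increment(length, capacity)):

#include <cstdio>
#include <cstddef>

class RegionCountSketch {
  unsigned int _length;
  size_t _capacity;
public:
  RegionCountSketch() : _length(0), _capacity(0) {}
  void increment(unsigned int regions, size_t capacity_bytes) {
    _length += regions;
    _capacity += capacity_bytes;
  }
  unsigned int length() const { return _length; }
  size_t capacity() const     { return _capacity; }
};

int main() {
  RegionCountSketch old_removed;
  old_removed.increment(1u, 1024u * 1024u);     // one freed 1MB region
  old_removed.increment(1u, 1024u * 1024u);     // another one
  // A single merge under a lock (remove_from_old_sets above) replaces
  // per-region updates of shared sets.
  std::printf("%u regions, %lu bytes\n",
              old_removed.length(), (unsigned long) old_removed.capacity());
  return 0;
}
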
--- a/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -25,7 +25,7 @@
 #ifndef SHARE_VM_GC_IMPLEMENTATION_G1_CONCURRENTMARK_HPP
 #define SHARE_VM_GC_IMPLEMENTATION_G1_CONCURRENTMARK_HPP
 
-#include "gc_implementation/g1/heapRegionSets.hpp"
+#include "gc_implementation/g1/heapRegionSet.hpp"
 #include "utilities/taskqueue.hpp"
 
 class G1CollectedHeap;
--- a/src/share/vm/gc_implementation/g1/g1BiasedArray.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/g1/g1BiasedArray.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -24,6 +24,14 @@
 
 #include "precompiled.hpp"
 #include "gc_implementation/g1/g1BiasedArray.hpp"
+#include "memory/padded.inline.hpp"
+
+// Allocate a new array, generic version.
+address G1BiasedMappedArrayBase::create_new_base_array(size_t length, size_t elem_size) {
+  assert(length > 0, "just checking");
+  assert(elem_size > 0, "just checking");
+  return PaddedPrimitiveArray<u_char, mtGC>::create_unfreeable(length * elem_size);
+}
 
 #ifndef PRODUCT
 void G1BiasedMappedArrayBase::verify_index(idx_t index) const {
--- a/src/share/vm/gc_implementation/g1/g1BiasedArray.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/g1/g1BiasedArray.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -25,8 +25,8 @@
 #ifndef SHARE_VM_GC_IMPLEMENTATION_G1_G1BIASEDARRAY_HPP
 #define SHARE_VM_GC_IMPLEMENTATION_G1_G1BIASEDARRAY_HPP
 
+#include "memory/allocation.hpp"
 #include "utilities/debug.hpp"
-#include "memory/allocation.inline.hpp"
 
 // Implements the common base functionality for arrays that contain provisions
 // for accessing its elements using a biased index.
@@ -48,11 +48,7 @@
     _bias(0), _shift_by(0) { }
 
   // Allocate a new array, generic version.
-  static address create_new_base_array(size_t length, size_t elem_size) {
-    assert(length > 0, "just checking");
-    assert(elem_size > 0, "just checking");
-    return NEW_C_HEAP_ARRAY(u_char, length * elem_size, mtGC);
-  }
+  static address create_new_base_array(size_t length, size_t elem_size);
 
   // Initialize the members of this class. The biased start address of this array
   // is the bias (in elements) multiplied by the element size.
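
A standalone sketch of the biased-index idea this class implements: an address is shifted and a bias subtracted, so heap addresses index the flat array directly. This is a hypothetical simplified version, not the VM's G1BiasedMappedArray.

#include <cstdio>
#include <vector>

class BiasedArraySketch {
  std::vector<int> _data;
  unsigned long _bias;       // base address in units of 2^_shift bytes
  unsigned int _shift;
public:
  BiasedArraySketch(unsigned long base, size_t length, unsigned int shift)
    : _data(length, 0), _bias(base >> shift), _shift(shift) {}
  int& at(unsigned long addr) { return _data[(addr >> _shift) - _bias]; }
};

int main() {
  const unsigned long base = 0x100000;          // pretend reserved heap base
  BiasedArraySketch arr(base, 16, 12);          // one slot per 4KB chunk
  arr.at(base + 5 * 4096) = 42;
  std::printf("%d\n", arr.at(base + 5 * 4096));
  return 0;
}
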
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/vm/gc_implementation/g1/g1CodeCacheRemSet.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+
+#include "precompiled.hpp"
+#include "code/nmethod.hpp"
+#include "gc_implementation/g1/g1CodeCacheRemSet.hpp"
+#include "memory/iterator.hpp"
+
+G1CodeRootChunk::G1CodeRootChunk() : _top(NULL), _next(NULL), _prev(NULL) {
+  _top = bottom();
+}
+
+void G1CodeRootChunk::reset() {
+  _next = _prev = NULL;
+  _top = bottom();
+}
+
+void G1CodeRootChunk::nmethods_do(CodeBlobClosure* cl) {
+  nmethod** cur = bottom();
+  while (cur != _top) {
+    cl->do_code_blob(*cur);
+    cur++;
+  }
+}
+
+FreeList<G1CodeRootChunk> G1CodeRootSet::_free_list;
+size_t G1CodeRootSet::_num_chunks_handed_out = 0;
+
+G1CodeRootChunk* G1CodeRootSet::new_chunk() {
+  G1CodeRootChunk* result = _free_list.get_chunk_at_head();
+  if (result == NULL) {
+    result = new G1CodeRootChunk();
+  }
+  G1CodeRootSet::_num_chunks_handed_out++;
+  result->reset();
+  return result;
+}
+
+void G1CodeRootSet::free_chunk(G1CodeRootChunk* chunk) {
+  _free_list.return_chunk_at_head(chunk);
+  G1CodeRootSet::_num_chunks_handed_out--;
+}
+
+void G1CodeRootSet::free_all_chunks(FreeList<G1CodeRootChunk>* list) {
+  G1CodeRootSet::_num_chunks_handed_out -= list->count();
+  _free_list.prepend(list);
+}
+
+void G1CodeRootSet::purge_chunks(size_t keep_ratio) {
+  size_t keep = G1CodeRootSet::_num_chunks_handed_out * keep_ratio / 100;
+
+  if (keep >= (size_t)_free_list.count()) {
+    return;
+  }
+
+  FreeList<G1CodeRootChunk> temp;
+  temp.initialize();
+  temp.set_size(G1CodeRootChunk::word_size());
+
+  _free_list.getFirstNChunksFromList((size_t)_free_list.count() - keep, &temp);
+
+  G1CodeRootChunk* cur = temp.get_chunk_at_head();
+  while (cur != NULL) {
+    delete cur;
+    cur = temp.get_chunk_at_head();
+  }
+}
+
+size_t G1CodeRootSet::static_mem_size() {
+  return sizeof(_free_list) + sizeof(_num_chunks_handed_out);
+}
+
+size_t G1CodeRootSet::fl_mem_size() {
+  return _free_list.count() * _free_list.size();
+}
+
+void G1CodeRootSet::initialize() {
+  _free_list.initialize();
+  _free_list.set_size(G1CodeRootChunk::word_size());
+}
+
+G1CodeRootSet::G1CodeRootSet() : _list(), _length(0) {
+  _list.initialize();
+  _list.set_size(G1CodeRootChunk::word_size());
+}
+
+G1CodeRootSet::~G1CodeRootSet() {
+  clear();
+}
+
+void G1CodeRootSet::add(nmethod* method) {
+  if (!contains(method)) {
+    // Try to add the nmethod. If there is not enough space, get a new chunk.
+    if (_list.head() == NULL || _list.head()->is_full()) {
+      G1CodeRootChunk* cur = new_chunk();
+      _list.return_chunk_at_head(cur);
+    }
+    bool result = _list.head()->add(method);
+    guarantee(result, err_msg("Not able to add nmethod "PTR_FORMAT" to newly allocated chunk.", method));
+    _length++;
+  }
+}
+
+void G1CodeRootSet::remove(nmethod* method) {
+  G1CodeRootChunk* found = find(method);
+  if (found != NULL) {
+    bool result = found->remove(method);
+    guarantee(result, err_msg("could not find nmethod "PTR_FORMAT" during removal although we previously found it", method));
+    // Free the chunk once it has been completely emptied
+    if (found->is_empty()) {
+      _list.remove_chunk(found);
+      free(found);
+    }
+    _length--;
+  }
+  assert(!contains(method), err_msg(PTR_FORMAT" still contains nmethod "PTR_FORMAT, this, method));
+}
+
+nmethod* G1CodeRootSet::pop() {
+  do {
+    G1CodeRootChunk* cur = _list.head();
+    if (cur == NULL) {
+      assert(_length == 0, "when there are no chunks, there should be no elements");
+      return NULL;
+    }
+    nmethod* result = cur->pop();
+    if (result != NULL) {
+      _length--;
+      return result;
+    } else {
+      free(_list.get_chunk_at_head());
+    }
+  } while (true);
+}
+
+G1CodeRootChunk* G1CodeRootSet::find(nmethod* method) {
+  G1CodeRootChunk* cur = _list.head();
+  while (cur != NULL) {
+    if (cur->contains(method)) {
+      return cur;
+    }
+    cur = (G1CodeRootChunk*)cur->next();
+  }
+  return NULL;
+}
+
+void G1CodeRootSet::free(G1CodeRootChunk* chunk) {
+  free_chunk(chunk);
+}
+
+bool G1CodeRootSet::contains(nmethod* method) {
+  return find(method) != NULL;
+}
+
+void G1CodeRootSet::clear() {
+  free_all_chunks(&_list);
+  _length = 0;
+}
+
+void G1CodeRootSet::nmethods_do(CodeBlobClosure* blk) const {
+  G1CodeRootChunk* cur = _list.head();
+  while (cur != NULL) {
+    cur->nmethods_do(blk);
+    cur = (G1CodeRootChunk*)cur->next();
+  }
+}
+
+size_t G1CodeRootSet::mem_size() {
+  return sizeof(this) + _list.count() * _list.size();
+}
+
+#ifndef PRODUCT
+
+void G1CodeRootSet::test() {
+  initialize();
+
+  assert(_free_list.count() == 0, "Free List must be empty");
+  assert(_num_chunks_handed_out == 0, "No elements must have been handed out yet");
+
+  // The number of chunks that we allocate for purge testing.
+  size_t const num_chunks = 10;
+  {
+    G1CodeRootSet set1;
+    assert(set1.is_empty(), "Code root set must be initially empty but is not.");
+
+    set1.add((nmethod*)1);
+    assert(_num_chunks_handed_out == 1,
+           err_msg("Must have allocated and handed out one chunk, but handed out "
+                   SIZE_FORMAT" chunks", _num_chunks_handed_out));
+    assert(set1.length() == 1, err_msg("Added exactly one element, but set contains "
+                                       SIZE_FORMAT" elements", set1.length()));
+
+    // G1CodeRootChunk::word_size() is larger than G1CodeRootChunk::num_entries which
+    // we cannot access.
+    for (uint i = 0; i < G1CodeRootChunk::word_size() + 1; i++) {
+      set1.add((nmethod*)1);
+    }
+    assert(_num_chunks_handed_out == 1,
+           err_msg("Duplicate detection must have prevented allocation of further "
+                   "chunks but contains "SIZE_FORMAT, _num_chunks_handed_out));
+    assert(set1.length() == 1,
+           err_msg("Duplicate detection should not have increased the set size but "
+                   "is "SIZE_FORMAT, set1.length()));
+
+    size_t num_total_after_add = G1CodeRootChunk::word_size() + 1;
+    for (size_t i = 0; i < num_total_after_add - 1; i++) {
+      set1.add((nmethod*)(2 + i));
+    }
+    assert(_num_chunks_handed_out > 1,
+           "After adding more code roots, more than one chunks should have been handed out");
+    assert(set1.length() == num_total_after_add,
+           err_msg("After adding in total "SIZE_FORMAT" distinct code roots, they "
+                   "need to be in the set, but there are only "SIZE_FORMAT,
+                   num_total_after_add, set1.length()));
+
+    size_t num_popped = 0;
+    while (set1.pop() != NULL) {
+      num_popped++;
+    }
+    assert(num_popped == num_total_after_add,
+           err_msg("Managed to pop "SIZE_FORMAT" code roots, but only "SIZE_FORMAT" "
+                   "were added", num_popped, num_total_after_add));
+    assert(_num_chunks_handed_out == 0,
+           err_msg("After popping all elements, all chunks must have been returned "
+                   "but are still "SIZE_FORMAT, _num_chunks_handed_out));
+
+    purge_chunks(0);
+    assert(_free_list.count() == 0,
+           err_msg("After purging everything, the free list must be empty but still "
+                   "contains "SIZE_FORMAT" chunks", _free_list.count()));
+
+    // Add some more handed out chunks.
+    size_t i = 0;
+    while (_num_chunks_handed_out < num_chunks) {
+      set1.add((nmethod*)i);
+      i++;
+    }
+
+    {
+      // Generate chunks on the free list.
+      G1CodeRootSet set2;
+      size_t i = 0;
+      while (_num_chunks_handed_out < num_chunks * 2) {
+        set2.add((nmethod*)i);
+        i++;
+      }
+      // Leaving the scope of the set2 object runs its destructor, which
+      // returns num_chunks chunks to the free list.
+    }
+
+    assert(_num_chunks_handed_out == num_chunks,
+           err_msg("Deletion of the second set must have resulted in giving back "
+                   "those, but there is still "SIZE_FORMAT" handed out, expecting "
+                   SIZE_FORMAT, _num_chunks_handed_out, num_chunks));
+    assert((size_t)_free_list.count() == num_chunks,
+           err_msg("After freeing "SIZE_FORMAT" chunks, they must be on the free list "
+                   "but there are only "SIZE_FORMAT, num_chunks, _free_list.count()));
+
+    size_t const test_percentage = 50;
+    purge_chunks(test_percentage);
+    assert(_num_chunks_handed_out == num_chunks,
+           err_msg("Purging must not hand out chunks but there are "SIZE_FORMAT,
+                   _num_chunks_handed_out));
+    assert((size_t)_free_list.count() == (ssize_t)(num_chunks * test_percentage / 100),
+           err_msg("Must have purged "SIZE_FORMAT" percent of "SIZE_FORMAT" chunks"
+                   "but there are "SSIZE_FORMAT, test_percentage, num_chunks,
+                   _free_list.count()));
+    // Purge the remainder of the chunks on the free list.
+    purge_chunks(0);
+    assert(_free_list.count() == 0, "Free List must be empty");
+    assert(_num_chunks_handed_out == num_chunks,
+           err_msg("Expected to be "SIZE_FORMAT" chunks handed out from the first set "
+                   "but there are "SIZE_FORMAT, num_chunks, _num_chunks_handed_out));
+
+    // Leaving the scope of the set1 object runs its destructor, which
+    // returns num_chunks additional chunks to the free list.
+  }
+
+  assert(_num_chunks_handed_out == 0,
+         err_msg("Deletion of the only set must have resulted in no chunks handed "
+                 "out, but there is still "SIZE_FORMAT" handed out", _num_chunks_handed_out));
+  assert((size_t)_free_list.count() == num_chunks,
+         err_msg("After freeing "SIZE_FORMAT" chunks, they must be on the free list "
+                 "but there are only "SSIZE_FORMAT, num_chunks, _free_list.count()));
+
+  // Restore initial state.
+  purge_chunks(0);
+  assert(_free_list.count() == 0, "Free List must be empty");
+  assert(_num_chunks_handed_out == 0, "No elements must have been handed out yet");
+}
+
+void TestCodeCacheRemSet_test() {
+  G1CodeRootSet::test();
+}
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/vm/gc_implementation/g1/g1CodeCacheRemSet.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_GC_IMPLEMENTATION_G1_G1CODECACHEREMSET_HPP
+#define SHARE_VM_GC_IMPLEMENTATION_G1_G1CODECACHEREMSET_HPP
+
+#include "memory/allocation.hpp"
+#include "memory/freeList.hpp"
+#include "runtime/globals.hpp"
+
+class CodeBlobClosure;
+
+class G1CodeRootChunk : public CHeapObj<mtGC> {
+ private:
+  static const int NUM_ENTRIES = 32;
+ public:
+  G1CodeRootChunk*     _next;
+  G1CodeRootChunk*     _prev;
+
+  nmethod** _top;
+
+  nmethod* _data[NUM_ENTRIES];
+
+  nmethod** bottom() const {
+    return (nmethod**) &(_data[0]);
+  }
+
+  nmethod** end() const {
+    return (nmethod**) &(_data[NUM_ENTRIES]);
+  }
+
+ public:
+  G1CodeRootChunk();
+  ~G1CodeRootChunk() {}
+
+  static size_t word_size() { return (size_t)(align_size_up_(sizeof(G1CodeRootChunk), HeapWordSize) / HeapWordSize); }
+
+  // FreeList "interface" methods
+
+  G1CodeRootChunk* next() const         { return _next; }
+  G1CodeRootChunk* prev() const         { return _prev; }
+  void set_next(G1CodeRootChunk* v)     { _next = v; assert(v != this, "Boom");}
+  void set_prev(G1CodeRootChunk* v)     { _prev = v; assert(v != this, "Boom");}
+  void clear_next()       { set_next(NULL); }
+  void clear_prev()       { set_prev(NULL); }
+
+  size_t size() const { return word_size(); }
+
+  void link_next(G1CodeRootChunk* ptr)  { set_next(ptr); }
+  void link_prev(G1CodeRootChunk* ptr)  { set_prev(ptr); }
+  void link_after(G1CodeRootChunk* ptr) {
+    link_next(ptr);
+    if (ptr != NULL) ptr->link_prev((G1CodeRootChunk*)this);
+  }
+
+  bool is_free()                 { return true; }
+
+  // New G1CodeRootChunk routines
+
+  void reset();
+
+  bool is_empty() const {
+    return _top == bottom();
+  }
+
+  bool is_full() const {
+    return _top == (nmethod**)end();
+  }
+
+  bool contains(nmethod* method) {
+    nmethod** cur = bottom();
+    while (cur != _top) {
+      if (*cur == method) return true;
+      cur++;
+    }
+    return false;
+  }
+
+  bool add(nmethod* method) {
+    if (is_full()) return false;
+    *_top = method;
+    _top++;
+    return true;
+  }
+
+  bool remove(nmethod* method) {
+    nmethod** cur = bottom();
+    while (cur != _top) {
+      if (*cur == method) {
+        memmove(cur, cur + 1, (_top - (cur + 1)) * sizeof(nmethod**));
+        _top--;
+        return true;
+      }
+      cur++;
+    }
+    return false;
+  }
+
+  void nmethods_do(CodeBlobClosure* blk);
+
+  nmethod* pop() {
+    if (is_empty()) {
+      return NULL;
+    }
+    _top--;
+    return *_top;
+  }
+};
+
+// Implements storage for a set of code roots.
+// All methods that modify the set are not thread-safe except if otherwise noted.
+class G1CodeRootSet VALUE_OBJ_CLASS_SPEC {
+ private:
+  // Global free chunk list management
+  static FreeList<G1CodeRootChunk> _free_list;
+  // Total number of chunks handed out
+  static size_t _num_chunks_handed_out;
+
+  static G1CodeRootChunk* new_chunk();
+  static void free_chunk(G1CodeRootChunk* chunk);
+  // Free all elements of the given list.
+  static void free_all_chunks(FreeList<G1CodeRootChunk>* list);
+
+  // Return the chunk that contains the given nmethod, NULL otherwise.
+  // Scans the list of chunks backwards, as this method is used to add new
+  // entries, which are typically added in bulk for a single nmethod.
+  G1CodeRootChunk* find(nmethod* method);
+  void free(G1CodeRootChunk* chunk);
+
+  size_t _length;
+  FreeList<G1CodeRootChunk> _list;
+
+ public:
+  G1CodeRootSet();
+  ~G1CodeRootSet();
+
+  static void initialize();
+  static void purge_chunks(size_t keep_ratio);
+
+  static size_t static_mem_size();
+  static size_t fl_mem_size();
+
+  // Search for the code blob from the recently allocated ones to find duplicates more quickly, as this
+  // method is likely to be repeatedly called with the same nmethod.
+  void add(nmethod* method);
+
+  void remove(nmethod* method);
+  nmethod* pop();
+
+  bool contains(nmethod* method);
+
+  void clear();
+
+  void nmethods_do(CodeBlobClosure* blk) const;
+
+  bool is_empty() { return length() == 0; }
+
+  // Length in elements
+  size_t length() const { return _length; }
+
+  // Memory size in bytes taken by this set.
+  size_t mem_size();
+
+  static void test() PRODUCT_RETURN;
+};
+
+#endif // SHARE_VM_GC_IMPLEMENTATION_G1_G1CODECACHEREMSET_HPP
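
The new G1CodeRootSet stores nmethod pointers in fixed-size chunks and links in a fresh chunk only when the head chunk is full. The sketch below shows that chunked-set shape in standalone form; it is hypothetical simplified code with no global free list, recycling, or removal.

#include <cstdio>

struct ChunkSketch {
  static const int NUM_ENTRIES = 32;
  ChunkSketch* next;
  void* data[NUM_ENTRIES];
  int top;
  ChunkSketch() : next(NULL), top(0) {}
  bool is_full() const { return top == NUM_ENTRIES; }
  bool add(void* p) { if (is_full()) return false; data[top++] = p; return true; }
  bool contains(void* p) const {
    for (int i = 0; i < top; i++) { if (data[i] == p) return true; }
    return false;
  }
};

class ChunkedSetSketch {
  ChunkSketch* _head;
public:
  ChunkedSetSketch() : _head(NULL) {}
  ~ChunkedSetSketch() {
    while (_head != NULL) { ChunkSketch* c = _head; _head = c->next; delete c; }
  }
  bool contains(void* p) const {
    for (ChunkSketch* c = _head; c != NULL; c = c->next) {
      if (c->contains(p)) return true;
    }
    return false;
  }
  void add(void* p) {
    if (contains(p)) return;                    // duplicate detection, as in add() above
    if (_head == NULL || _head->is_full()) {
      ChunkSketch* c = new ChunkSketch();
      c->next = _head;
      _head = c;                                // new chunks go to the head
    }
    _head->add(p);
  }
};

int main() {
  ChunkedSetSketch set;
  for (long i = 1; i <= 40; i++) {              // 40 entries spill into a second chunk
    set.add((void*) i);
  }
  std::printf("%d %d\n", set.contains((void*) 7L), set.contains((void*) 99L));
  return 0;
}
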
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -39,6 +39,7 @@
 #include "gc_implementation/g1/g1MarkSweep.hpp"
 #include "gc_implementation/g1/g1OopClosures.inline.hpp"
 #include "gc_implementation/g1/g1RemSet.inline.hpp"
+#include "gc_implementation/g1/g1StringDedup.hpp"
 #include "gc_implementation/g1/g1YCTypes.hpp"
 #include "gc_implementation/g1/heapRegion.inline.hpp"
 #include "gc_implementation/g1/heapRegionRemSet.hpp"
@@ -169,14 +170,6 @@
   int calls() { return _calls; }
 };
 
-class RedirtyLoggedCardTableEntryFastClosure : public CardTableEntryClosure {
-public:
-  bool do_card_ptr(jbyte* card_ptr, int worker_i) {
-    *card_ptr = CardTableModRefBS::dirty_card_val();
-    return true;
-  }
-};
-
 YoungList::YoungList(G1CollectedHeap* g1h) :
     _g1h(g1h), _head(NULL), _length(0), _last_sampled_rs_lengths(0),
     _survivor_head(NULL), _survivor_tail(NULL), _survivor_length(0) {
@@ -524,7 +517,7 @@
 // Private methods.
 
 HeapRegion*
-G1CollectedHeap::new_region_try_secondary_free_list() {
+G1CollectedHeap::new_region_try_secondary_free_list(bool is_old) {
   MutexLockerEx x(SecondaryFreeList_lock, Mutex::_no_safepoint_check_flag);
   while (!_secondary_free_list.is_empty() || free_regions_coming()) {
     if (!_secondary_free_list.is_empty()) {
@@ -540,7 +533,7 @@
 
       assert(!_free_list.is_empty(), "if the secondary_free_list was not "
              "empty we should have moved at least one entry to the free_list");
-      HeapRegion* res = _free_list.remove_head();
+      HeapRegion* res = _free_list.remove_region(is_old);
       if (G1ConcRegionFreeingVerbose) {
         gclog_or_tty->print_cr("G1ConcRegionFreeing [region alloc] : "
                                "allocated "HR_FORMAT" from secondary_free_list",
@@ -562,7 +555,7 @@
   return NULL;
 }
 
-HeapRegion* G1CollectedHeap::new_region(size_t word_size, bool do_expand) {
+HeapRegion* G1CollectedHeap::new_region(size_t word_size, bool is_old, bool do_expand) {
   assert(!isHumongous(word_size) || word_size <= HeapRegion::GrainWords,
          "the only time we use this to allocate a humongous region is "
          "when we are allocating a single humongous region");
@@ -574,19 +567,21 @@
         gclog_or_tty->print_cr("G1ConcRegionFreeing [region alloc] : "
                                "forced to look at the secondary_free_list");
       }
-      res = new_region_try_secondary_free_list();
+      res = new_region_try_secondary_free_list(is_old);
       if (res != NULL) {
         return res;
       }
     }
   }
-  res = _free_list.remove_head_or_null();
+
+  res = _free_list.remove_region(is_old);
+
   if (res == NULL) {
     if (G1ConcRegionFreeingVerbose) {
       gclog_or_tty->print_cr("G1ConcRegionFreeing [region alloc] : "
                              "res == NULL, trying the secondary_free_list");
     }
-    res = new_region_try_secondary_free_list();
+    res = new_region_try_secondary_free_list(is_old);
   }
   if (res == NULL && do_expand && _expand_heap_after_alloc_failure) {
     // Currently, only attempts to allocate GC alloc regions set
@@ -603,12 +598,9 @@
     if (expand(word_size * HeapWordSize)) {
       // Given that expand() succeeded in expanding the heap, and we
       // always expand the heap by an amount aligned to the heap
-      // region size, the free list should in theory not be empty. So
-      // it would probably be OK to use remove_head(). But the extra
-      // check for NULL is unlikely to be a performance issue here (we
-      // just expanded the heap!) so let's just be conservative and
-      // use remove_head_or_null().
-      res = _free_list.remove_head_or_null();
+      // region size, the free list should in theory not be empty.
+      // In either case remove_region() will check for NULL.
+      res = _free_list.remove_region(is_old);
     } else {
       _expand_heap_after_alloc_failure = false;
     }
@@ -626,7 +618,7 @@
     // Only one region to allocate, no need to go through the slower
     // path. The caller will attempt the expansion if this fails, so
     // let's not try to expand here too.
-    HeapRegion* hr = new_region(word_size, false /* do_expand */);
+    HeapRegion* hr = new_region(word_size, true /* is_old */, false /* do_expand */);
     if (hr != NULL) {
       first = hr->hrs_index();
     } else {
@@ -1298,7 +1290,6 @@
 
   size_t metadata_prev_used = MetaspaceAux::allocated_used_bytes();
 
-  HRSPhaseSetter x(HRSPhaseFullGC);
   verify_region_sets_optional();
 
   const bool do_clear_all_soft_refs = clear_all_soft_refs ||
@@ -1928,10 +1919,10 @@
   _g1mm(NULL),
   _refine_cte_cl(NULL),
   _full_collection(false),
-  _free_list("Master Free List"),
-  _secondary_free_list("Secondary Free List"),
-  _old_set("Old Set"),
-  _humongous_set("Master Humongous Set"),
+  _free_list("Master Free List", new MasterFreeRegionListMtSafeChecker()),
+  _secondary_free_list("Secondary Free List", new SecondaryFreeRegionListMtSafeChecker()),
+  _old_set("Old Set", false /* humongous */, new OldRegionSetMtSafeChecker()),
+  _humongous_set("Master Humongous Set", true /* humongous */, new HumongousRegionSetMtSafeChecker()),
   _free_regions_coming(false),
   _young_list(new YoungList(this)),
   _gc_time_stamp(0),
@@ -1963,7 +1954,7 @@
   int n_queues = MAX2((int)ParallelGCThreads, 1);
   _task_queues = new RefToScanQueueSet(n_queues);
 
-  int n_rem_sets = HeapRegionRemSet::num_par_rem_sets();
+  uint n_rem_sets = HeapRegionRemSet::num_par_rem_sets();
   assert(n_rem_sets > 0, "Invariant.");
 
   _worker_cset_start_region = NEW_C_HEAP_ARRAY(HeapRegion*, n_queues, mtGC);
@@ -2079,7 +2070,7 @@
   guarantee(HeapRegion::CardsPerRegion < max_cards_per_region,
             "too many cards per region");
 
-  HeapRegionSet::set_unrealistically_long_length(max_regions() + 1);
+  FreeRegionList::set_unrealistically_long_length(max_regions() + 1);
 
   _bot_shared = new G1BlockOffsetSharedArray(_reserved,
                                              heap_word_size(init_byte_size));
@@ -2182,6 +2173,8 @@
   // values in the heap have been properly initialized.
   _g1mm = new G1MonitoringSupport(this);
 
+  G1StringDedup::initialize();
+
   return JNI_OK;
 }
 
@@ -2266,7 +2259,7 @@
                                 // (for efficiency/performance)
                            false);
                                 // Setting next fields of discovered
-                                // lists requires a barrier.
+                                // lists does not require a barrier.
 }
 
 size_t G1CollectedHeap::capacity() const {
@@ -2369,8 +2362,12 @@
 };
 
 size_t G1CollectedHeap::recalculate_used() const {
+  double recalculate_used_start = os::elapsedTime();
+
   SumUsedClosure blk;
   heap_region_iterate(&blk);
+
+  g1_policy()->phase_times()->record_evac_fail_recalc_used_time((os::elapsedTime() - recalculate_used_start) * 1000.0);
   return blk.result();
 }
 
@@ -3013,7 +3010,17 @@
 }
 
 size_t G1CollectedHeap::tlab_capacity(Thread* ignored) const {
-  return HeapRegion::GrainBytes;
+  return (_g1_policy->young_list_target_length() - young_list()->survivor_length()) * HeapRegion::GrainBytes;
+}
+
+size_t G1CollectedHeap::tlab_used(Thread* ignored) const {
+  return young_list()->eden_used_bytes();
+}
+
+// For G1, TLABs should not contain humongous objects, so the maximum TLAB size
+// must be smaller than the humongous object limit.
+size_t G1CollectedHeap::max_tlab_size() const {
+  return align_size_down(_humongous_object_threshold_in_words - 1, MinObjAlignment);
 }
 
 size_t G1CollectedHeap::unsafe_max_tlab_alloc(Thread* ignored) const {
@@ -3025,11 +3032,11 @@
   // humongous objects.
 
   HeapRegion* hr = _mutator_alloc_region.get();
-  size_t max_tlab_size = _humongous_object_threshold_in_words * wordSize;
+  size_t max_tlab = max_tlab_size() * wordSize;
   if (hr == NULL) {
-    return max_tlab_size;
+    return max_tlab;
   } else {
-    return MIN2(MAX2(hr->free(), (size_t) MinTLABSize), max_tlab_size);
+    return MIN2(MAX2(hr->free(), (size_t) MinTLABSize), max_tlab);
   }
 }
 
@@ -3470,6 +3477,11 @@
     if (!silent) gclog_or_tty->print("RemSet ");
     rem_set()->verify();
 
+    if (G1StringDedup::is_enabled()) {
+      if (!silent) gclog_or_tty->print("StrDedup ");
+      G1StringDedup::verify();
+    }
+
     if (failures) {
       gclog_or_tty->print_cr("Heap:");
       // It helps to have the per-region information in the output to
@@ -3487,8 +3499,13 @@
     }
     guarantee(!failures, "there should not have been any failures");
   } else {
-    if (!silent)
-      gclog_or_tty->print("(SKIPPING roots, heapRegionSets, heapRegions, remset) ");
+    if (!silent) {
+      gclog_or_tty->print("(SKIPPING Roots, HeapRegionSets, HeapRegions, RemSet");
+      if (G1StringDedup::is_enabled()) {
+        gclog_or_tty->print(", StrDedup");
+      }
+      gclog_or_tty->print(") ");
+    }
   }
 }
 
@@ -3581,6 +3598,9 @@
   st->cr();
   _cm->print_worker_threads_on(st);
   _cg1r->print_worker_threads_on(st);
+  if (G1StringDedup::is_enabled()) {
+    G1StringDedup::print_worker_threads_on(st);
+  }
 }
 
 void G1CollectedHeap::gc_threads_do(ThreadClosure* tc) const {
@@ -3589,6 +3609,9 @@
   }
   tc->do_thread(_cmThread);
   _cg1r->threads_do(tc);
+  if (G1StringDedup::is_enabled()) {
+    G1StringDedup::threads_do(tc);
+  }
 }
 
 void G1CollectedHeap::print_tracing_info() const {
@@ -3668,6 +3691,7 @@
   // always_do_update_barrier = false;
   assert(InlineCacheBuffer::is_empty(), "should have cleaned up ICBuffer");
   // Fill TLAB's and such
+  accumulate_statistics_all_tlabs();
   ensure_parsability(true);
 
   if (G1SummarizeRSetStats && (G1SummarizeRSetStatsPeriod > 0) &&
@@ -3692,6 +3716,8 @@
                         "derived pointer present"));
   // always_do_update_barrier = true;
 
+  resize_all_tlabs();
+
   // We have just completed a GC. Update the soft reference
   // policy with the new heap occupancy
   Universe::update_heap_info_at_gc();
@@ -3892,7 +3918,6 @@
   print_heap_before_gc();
   trace_heap_before_gc(_gc_tracer_stw);
 
-  HRSPhaseSetter x(HRSPhaseEvacuation);
   verify_region_sets_optional();
   verify_dirty_young_regions();
 
@@ -4391,6 +4416,8 @@
 void G1CollectedHeap::remove_self_forwarding_pointers() {
   assert(check_cset_heap_region_claim_values(HeapRegion::InitialClaimValue), "sanity");
 
+  double remove_self_forwards_start = os::elapsedTime();
+
   G1ParRemoveSelfForwardPtrsTask rsfp_task(this);
 
   if (G1CollectedHeap::use_parallel_gc_threads()) {
@@ -4418,6 +4445,8 @@
   }
   _objs_with_preserved_marks.clear(true);
   _preserved_marks_of_objs.clear(true);
+
+  g1_policy()->phase_times()->record_evac_fail_remove_self_forwards((os::elapsedTime() - remove_self_forwards_start) * 1000.0);
 }
 
 void G1CollectedHeap::push_on_evac_failure_scan_stack(oop obj) {
@@ -4639,9 +4668,7 @@
 #endif // ASSERT
 
 void G1ParScanThreadState::trim_queue() {
-  assert(_evac_cl != NULL, "not set");
   assert(_evac_failure_cl != NULL, "not set");
-  assert(_partial_scan_cl != NULL, "not set");
 
   StarTask ref;
   do {
@@ -4732,6 +4759,12 @@
   oop forward_ptr = old->forward_to_atomic(obj);
   if (forward_ptr == NULL) {
     Copy::aligned_disjoint_words((HeapWord*) old, obj_ptr, word_sz);
+
+    // alloc_purpose is just a hint to allocate() above; recheck the type of the
+    // region we actually allocated from and update alloc_purpose accordingly.
+    HeapRegion* to_region = _g1h->heap_region_containing_raw(obj_ptr);
+    alloc_purpose = to_region->is_young() ? GCAllocForSurvived : GCAllocForTenured;
+
     if (g1p->track_object_age(alloc_purpose)) {
       // We could simply do obj->incr_age(). However, this causes a
       // performance issue. obj->incr_age() will first check whether
@@ -4759,6 +4792,13 @@
       obj->set_mark(m);
     }
 
+    if (G1StringDedup::is_enabled()) {
+      G1StringDedup::enqueue_from_evacuation(from_region->is_young(),
+                                             to_region->is_young(),
+                                             queue_num(),
+                                             obj);
+    }
+
     size_t* surv_young_words = surviving_young_words();
     surv_young_words[young_index] += word_sz;
 
@@ -4837,55 +4877,6 @@
 template void G1ParCopyClosure<G1BarrierEvac, false>::do_oop_work(oop* p);
 template void G1ParCopyClosure<G1BarrierEvac, false>::do_oop_work(narrowOop* p);
 
-template <class T> void G1ParScanPartialArrayClosure::do_oop_nv(T* p) {
-  assert(has_partial_array_mask(p), "invariant");
-  oop from_obj = clear_partial_array_mask(p);
-
-  assert(Universe::heap()->is_in_reserved(from_obj), "must be in heap.");
-  assert(from_obj->is_objArray(), "must be obj array");
-  objArrayOop from_obj_array = objArrayOop(from_obj);
-  // The from-space object contains the real length.
-  int length                 = from_obj_array->length();
-
-  assert(from_obj->is_forwarded(), "must be forwarded");
-  oop to_obj                 = from_obj->forwardee();
-  assert(from_obj != to_obj, "should not be chunking self-forwarded objects");
-  objArrayOop to_obj_array   = objArrayOop(to_obj);
-  // We keep track of the next start index in the length field of the
-  // to-space object.
-  int next_index             = to_obj_array->length();
-  assert(0 <= next_index && next_index < length,
-         err_msg("invariant, next index: %d, length: %d", next_index, length));
-
-  int start                  = next_index;
-  int end                    = length;
-  int remainder              = end - start;
-  // We'll try not to push a range that's smaller than ParGCArrayScanChunk.
-  if (remainder > 2 * ParGCArrayScanChunk) {
-    end = start + ParGCArrayScanChunk;
-    to_obj_array->set_length(end);
-    // Push the remainder before we process the range in case another
-    // worker has run out of things to do and can steal it.
-    oop* from_obj_p = set_partial_array_mask(from_obj);
-    _par_scan_state->push_on_queue(from_obj_p);
-  } else {
-    assert(length == end, "sanity");
-    // We'll process the final range for this object. Restore the length
-    // so that the heap remains parsable in case of evacuation failure.
-    to_obj_array->set_length(end);
-  }
-  _scanner.set_region(_g1->heap_region_containing_raw(to_obj));
-  // Process indexes [start,end). It will also process the header
-  // along with the first chunk (i.e., the chunk with start == 0).
-  // Note that at this point the length field of to_obj_array is not
-  // correct given that we are using it to keep track of the next
-  // start index. oop_iterate_range() (thankfully!) ignores the length
-  // field and only relies on the start / end parameters.  It does
-  // however return the size of the object which will be incorrect. So
-  // we have to ignore it even if we wanted to use it.
-  to_obj_array->oop_iterate_range(&_scanner, start, end);
-}
-
 class G1ParEvacuateFollowersClosure : public VoidClosure {
 protected:
   G1CollectedHeap*              _g1h;
@@ -5027,13 +5018,9 @@
       ReferenceProcessor*             rp = _g1h->ref_processor_stw();
 
       G1ParScanThreadState            pss(_g1h, worker_id, rp);
-      G1ParScanHeapEvacClosure        scan_evac_cl(_g1h, &pss, rp);
       G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, rp);
-      G1ParScanPartialArrayClosure    partial_scan_cl(_g1h, &pss, rp);
-
-      pss.set_evac_closure(&scan_evac_cl);
+
       pss.set_evac_failure_closure(&evac_failure_cl);
-      pss.set_partial_scan_closure(&partial_scan_cl);
 
       G1ParScanExtRootClosure        only_scan_root_cl(_g1h, &pss, rp);
       G1ParScanMetadataClosure       only_scan_metadata_cl(_g1h, &pss, rp);
@@ -5287,6 +5274,33 @@
                            g1_unlink_task.strings_processed(), g1_unlink_task.strings_removed(),
                            g1_unlink_task.symbols_processed(), g1_unlink_task.symbols_removed());
   }
+
+  if (G1StringDedup::is_enabled()) {
+    G1StringDedup::unlink(is_alive);
+  }
+}
+
+class RedirtyLoggedCardTableEntryFastClosure : public CardTableEntryClosure {
+public:
+  bool do_card_ptr(jbyte* card_ptr, int worker_i) {
+    *card_ptr = CardTableModRefBS::dirty_card_val();
+    return true;
+  }
+};
+
+void G1CollectedHeap::redirty_logged_cards() {
+  guarantee(G1DeferredRSUpdate, "Must only be called when using deferred RS updates.");
+  double redirty_logged_cards_start = os::elapsedTime();
+
+  RedirtyLoggedCardTableEntryFastClosure redirty;
+  dirty_card_queue_set().set_closure(&redirty);
+  dirty_card_queue_set().apply_closure_to_all_completed_buffers();
+
+  DirtyCardQueueSet& dcq = JavaThread::dirty_card_queue_set();
+  dcq.merge_bufferlists(&dirty_card_queue_set());
+  assert(dirty_card_queue_set().completed_buffers_num() == 0, "All should be consumed");
+
+  g1_policy()->phase_times()->record_redirty_logged_cards_time_ms((os::elapsedTime() - redirty_logged_cards_start) * 1000.0);
 }
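
redirty_logged_cards() applies a CardTableEntryClosure that re-marks every card in the completed update buffers as dirty, then merges those buffers back into the shared JavaThread queue set. A self-contained sketch of the closure-over-buffers shape, with toy buffer and closure types standing in for DirtyCardQueueSet and CardTableEntryClosure:

    #include <cstdio>
    #include <vector>

    typedef signed char jbyte_t;            // stand-in for HotSpot's jbyte
    static const jbyte_t kDirtyCardVal = 0; // analogue of dirty_card_val()

    // Minimal analogue of CardTableEntryClosure: return true to keep iterating.
    struct CardClosure {
      virtual bool do_card_ptr(jbyte_t* card_ptr, int worker_i) = 0;
      virtual ~CardClosure() {}
    };

    struct RedirtyClosure : public CardClosure {
      bool do_card_ptr(jbyte_t* card_ptr, int /*worker_i*/) {
        *card_ptr = kDirtyCardVal;          // re-mark the card as dirty
        return true;
      }
    };

    // Apply the closure to every entry of every completed buffer.
    static void apply_to_all_buffers(std::vector<std::vector<jbyte_t> >& buffers,
                                     CardClosure& cl) {
      for (size_t b = 0; b < buffers.size(); ++b) {
        for (size_t i = 0; i < buffers[b].size(); ++i) {
          if (!cl.do_card_ptr(&buffers[b][i], 0)) return;
        }
      }
    }

    int main() {
      std::vector<std::vector<jbyte_t> > buffers(2, std::vector<jbyte_t>(4, 1));
      RedirtyClosure redirty;
      apply_to_all_buffers(buffers, redirty);
      std::printf("first card after redirty: %d\n", (int)buffers[0][0]);  // 0
      return 0;
    }
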
 
 // Weak Reference Processing support
@@ -5470,14 +5484,9 @@
     G1STWIsAliveClosure is_alive(_g1h);
 
     G1ParScanThreadState            pss(_g1h, worker_id, NULL);
-
-    G1ParScanHeapEvacClosure        scan_evac_cl(_g1h, &pss, NULL);
     G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, NULL);
-    G1ParScanPartialArrayClosure    partial_scan_cl(_g1h, &pss, NULL);
-
-    pss.set_evac_closure(&scan_evac_cl);
+
     pss.set_evac_failure_closure(&evac_failure_cl);
-    pss.set_partial_scan_closure(&partial_scan_cl);
 
     G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, &pss, NULL);
     G1ParScanMetadataClosure       only_copy_metadata_cl(_g1h, &pss, NULL);
@@ -5582,13 +5591,9 @@
     HandleMark   hm;
 
     G1ParScanThreadState            pss(_g1h, worker_id, NULL);
-    G1ParScanHeapEvacClosure        scan_evac_cl(_g1h, &pss, NULL);
     G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, NULL);
-    G1ParScanPartialArrayClosure    partial_scan_cl(_g1h, &pss, NULL);
-
-    pss.set_evac_closure(&scan_evac_cl);
+
     pss.set_evac_failure_closure(&evac_failure_cl);
-    pss.set_partial_scan_closure(&partial_scan_cl);
 
     assert(pss.refs()->is_empty(), "both queue and overflow should be empty");
 
@@ -5712,13 +5717,9 @@
   // We do not embed a reference processor in the copying/scanning
   // closures while we're actually processing the discovered
   // reference objects.
-  G1ParScanHeapEvacClosure        scan_evac_cl(this, &pss, NULL);
   G1ParScanHeapEvacFailureClosure evac_failure_cl(this, &pss, NULL);
-  G1ParScanPartialArrayClosure    partial_scan_cl(this, &pss, NULL);
-
-  pss.set_evac_closure(&scan_evac_cl);
+
   pss.set_evac_failure_closure(&evac_failure_cl);
-  pss.set_partial_scan_closure(&partial_scan_cl);
 
   assert(pss.refs()->is_empty(), "pre-condition");
 
@@ -5900,6 +5901,9 @@
     G1STWIsAliveClosure is_alive(this);
     G1KeepAliveClosure keep_alive(this);
     JNIHandles::weak_oops_do(&is_alive, &keep_alive);
+    if (G1StringDedup::is_enabled()) {
+      G1StringDedup::unlink_or_oops_do(&is_alive, &keep_alive);
+    }
   }
 
   release_gc_alloc_regions(n_workers, evacuation_info);
@@ -5917,6 +5921,8 @@
   // strong code roots for a particular heap region.
   migrate_strong_code_roots();
 
+  purge_code_root_memory();
+
   if (g1_policy()->during_initial_mark_pause()) {
     // Reset the claim values set during marking the strong code roots
     reset_heap_region_claim_values();
@@ -5943,41 +5949,15 @@
   enqueue_discovered_references(n_workers);
 
   if (G1DeferredRSUpdate) {
-    RedirtyLoggedCardTableEntryFastClosure redirty;
-    dirty_card_queue_set().set_closure(&redirty);
-    dirty_card_queue_set().apply_closure_to_all_completed_buffers();
-
-    DirtyCardQueueSet& dcq = JavaThread::dirty_card_queue_set();
-    dcq.merge_bufferlists(&dirty_card_queue_set());
-    assert(dirty_card_queue_set().completed_buffers_num() == 0, "All should be consumed");
+    redirty_logged_cards();
   }
   COMPILER2_PRESENT(DerivedPointerTable::update_pointers());
 }
 
-void G1CollectedHeap::free_region_if_empty(HeapRegion* hr,
-                                     size_t* pre_used,
-                                     FreeRegionList* free_list,
-                                     OldRegionSet* old_proxy_set,
-                                     HumongousRegionSet* humongous_proxy_set,
-                                     HRRSCleanupTask* hrrs_cleanup_task,
-                                     bool par) {
-  if (hr->used() > 0 && hr->max_live_bytes() == 0 && !hr->is_young()) {
-    if (hr->isHumongous()) {
-      assert(hr->startsHumongous(), "we should only see starts humongous");
-      free_humongous_region(hr, pre_used, free_list, humongous_proxy_set, par);
-    } else {
-      _old_set.remove_with_proxy(hr, old_proxy_set);
-      free_region(hr, pre_used, free_list, par);
-    }
-  } else {
-    hr->rem_set()->do_cleanup_work(hrrs_cleanup_task);
-  }
-}
-
 void G1CollectedHeap::free_region(HeapRegion* hr,
-                                  size_t* pre_used,
                                   FreeRegionList* free_list,
-                                  bool par) {
+                                  bool par,
+                                  bool locked) {
   assert(!hr->isHumongous(), "this is only for non-humongous regions");
   assert(!hr->is_empty(), "the region should not be empty");
   assert(free_list != NULL, "pre-condition");
@@ -5988,70 +5968,56 @@
   if (!hr->is_young()) {
     _cg1r->hot_card_cache()->reset_card_counts(hr);
   }
-  *pre_used += hr->used();
-  hr->hr_clear(par, true /* clear_space */);
-  free_list->add_as_head(hr);
+  hr->hr_clear(par, true /* clear_space */, locked /* locked */);
+  free_list->add_ordered(hr);
 }
 
 void G1CollectedHeap::free_humongous_region(HeapRegion* hr,
-                                     size_t* pre_used,
                                      FreeRegionList* free_list,
-                                     HumongousRegionSet* humongous_proxy_set,
                                      bool par) {
   assert(hr->startsHumongous(), "this is only for starts humongous regions");
   assert(free_list != NULL, "pre-condition");
-  assert(humongous_proxy_set != NULL, "pre-condition");
-
-  size_t hr_used = hr->used();
+
   size_t hr_capacity = hr->capacity();
-  size_t hr_pre_used = 0;
-  _humongous_set.remove_with_proxy(hr, humongous_proxy_set);
   // We need to read this before we make the region non-humongous,
   // otherwise the information will be gone.
   uint last_index = hr->last_hc_index();
   hr->set_notHumongous();
-  free_region(hr, &hr_pre_used, free_list, par);
+  free_region(hr, free_list, par);
 
   uint i = hr->hrs_index() + 1;
   while (i < last_index) {
     HeapRegion* curr_hr = region_at(i);
     assert(curr_hr->continuesHumongous(), "invariant");
     curr_hr->set_notHumongous();
-    free_region(curr_hr, &hr_pre_used, free_list, par);
+    free_region(curr_hr, free_list, par);
     i += 1;
   }
-  assert(hr_pre_used == hr_used,
-         err_msg("hr_pre_used: "SIZE_FORMAT" and hr_used: "SIZE_FORMAT" "
-                 "should be the same", hr_pre_used, hr_used));
-  *pre_used += hr_pre_used;
-}
-
-void G1CollectedHeap::update_sets_after_freeing_regions(size_t pre_used,
-                                       FreeRegionList* free_list,
-                                       OldRegionSet* old_proxy_set,
-                                       HumongousRegionSet* humongous_proxy_set,
-                                       bool par) {
-  if (pre_used > 0) {
-    Mutex* lock = (par) ? ParGCRareEvent_lock : NULL;
-    MutexLockerEx x(lock, Mutex::_no_safepoint_check_flag);
-    assert(_summary_bytes_used >= pre_used,
-           err_msg("invariant: _summary_bytes_used: "SIZE_FORMAT" "
-                   "should be >= pre_used: "SIZE_FORMAT,
-                   _summary_bytes_used, pre_used));
-    _summary_bytes_used -= pre_used;
-  }
-  if (free_list != NULL && !free_list->is_empty()) {
+}
+
+void G1CollectedHeap::remove_from_old_sets(const HeapRegionSetCount& old_regions_removed,
+                                       const HeapRegionSetCount& humongous_regions_removed) {
+  if (old_regions_removed.length() > 0 || humongous_regions_removed.length() > 0) {
+    MutexLockerEx x(OldSets_lock, Mutex::_no_safepoint_check_flag);
+    _old_set.bulk_remove(old_regions_removed);
+    _humongous_set.bulk_remove(humongous_regions_removed);
+  }
+}
+
+void G1CollectedHeap::prepend_to_freelist(FreeRegionList* list) {
+  assert(list != NULL, "list can't be null");
+  if (!list->is_empty()) {
     MutexLockerEx x(FreeList_lock, Mutex::_no_safepoint_check_flag);
-    _free_list.add_as_head(free_list);
-  }
-  if (old_proxy_set != NULL && !old_proxy_set->is_empty()) {
-    MutexLockerEx x(OldSets_lock, Mutex::_no_safepoint_check_flag);
-    _old_set.update_from_proxy(old_proxy_set);
-  }
-  if (humongous_proxy_set != NULL && !humongous_proxy_set->is_empty()) {
-    MutexLockerEx x(OldSets_lock, Mutex::_no_safepoint_check_flag);
-    _humongous_set.update_from_proxy(humongous_proxy_set);
-  }
+    _free_list.add_ordered(list);
+  }
+}
+
+void G1CollectedHeap::decrement_summary_bytes(size_t bytes) {
+  assert(_summary_bytes_used >= bytes,
+         err_msg("invariant: _summary_bytes_used: "SIZE_FORMAT" should be >= bytes: "SIZE_FORMAT,
+                  _summary_bytes_used, bytes));
+  _summary_bytes_used -= bytes;
 }
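
The three functions above replace the old update_sets_after_freeing_regions() proxy-set scheme with narrower operations: callers now free regions onto a local list while keeping a running byte count, then flush once with prepend_to_freelist() and decrement_summary_bytes() under the appropriate lock (as free_collection_set() does below). A simplified sketch of that accumulate-then-flush shape, with a std::mutex and plain containers standing in for the HotSpot locks and region sets:

    #include <cstddef>
    #include <cstdio>
    #include <list>
    #include <mutex>

    struct Region { size_t used_bytes; };

    static std::list<Region*> g_free_list;                   // stand-in for _free_list
    static size_t             g_summary_bytes_used = 64 * 1024;
    static std::mutex         g_free_list_lock;               // stand-in for FreeList_lock

    // Analogue of prepend_to_freelist(): splice the local list in under the lock.
    static void prepend_to_freelist(std::list<Region*>& local) {
      if (local.empty()) return;
      std::lock_guard<std::mutex> x(g_free_list_lock);
      g_free_list.splice(g_free_list.begin(), local);
    }

    // Analogue of decrement_summary_bytes().
    static void decrement_summary_bytes(size_t bytes) {
      g_summary_bytes_used -= bytes;
    }

    int main() {
      Region r1 = { 8 * 1024 }, r2 = { 16 * 1024 };
      std::list<Region*> local_free_list;
      size_t pre_used = 0;

      // Free regions into the local list first (no global lock taken per region).
      Region* regs[] = { &r1, &r2 };
      for (size_t i = 0; i < sizeof(regs) / sizeof(regs[0]); ++i) {
        pre_used += regs[i]->used_bytes;
        local_free_list.push_back(regs[i]);
      }

      // Flush once at the end, as the rewritten caller does.
      prepend_to_freelist(local_free_list);
      decrement_summary_bytes(pre_used);
      std::printf("free regions: %zu, summary used: %zu\n",
                  g_free_list.size(), g_summary_bytes_used);
      return 0;
    }
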
 
 class G1ParCleanupCTTask : public AbstractGangTask {
@@ -6211,7 +6177,7 @@
       }
     }
 
-    rs_lengths += cur->rem_set()->occupied();
+    rs_lengths += cur->rem_set()->occupied_locked();
 
     HeapRegion* next = cur->next_in_collection_set();
     assert(cur->in_collection_set(), "bad CS");
@@ -6244,7 +6210,8 @@
 
       // And the region is empty.
       assert(!used_mr.is_empty(), "Should not have empty regions in a CS.");
-      free_region(cur, &pre_used, &local_free_list, false /* par */);
+      pre_used += cur->used();
+      free_region(cur, &local_free_list, false /* par */, true /* locked */);
     } else {
       cur->uninstall_surv_rate_group();
       if (cur->is_young()) {
@@ -6272,10 +6239,8 @@
     young_time_ms += elapsed_ms;
   }
 
-  update_sets_after_freeing_regions(pre_used, &local_free_list,
-                                    NULL /* old_proxy_set */,
-                                    NULL /* humongous_proxy_set */,
-                                    false /* par */);
+  prepend_to_freelist(&local_free_list);
+  decrement_summary_bytes(pre_used);
   policy->phase_times()->record_young_free_cset_time_ms(young_time_ms);
   policy->phase_times()->record_non_young_free_cset_time_ms(non_young_time_ms);
 }
@@ -6387,10 +6352,10 @@
 
 class TearDownRegionSetsClosure : public HeapRegionClosure {
 private:
-  OldRegionSet *_old_set;
+  HeapRegionSet *_old_set;
 
 public:
-  TearDownRegionSetsClosure(OldRegionSet* old_set) : _old_set(old_set) { }
+  TearDownRegionSetsClosure(HeapRegionSet* old_set) : _old_set(old_set) { }
 
   bool doHeapRegion(HeapRegion* r) {
     if (r->is_empty()) {
@@ -6419,9 +6384,10 @@
     TearDownRegionSetsClosure cl(&_old_set);
     heap_region_iterate(&cl);
 
-    // Need to do this after the heap iteration to be able to
-    // recognize the young regions and ignore them during the iteration.
-    _young_list->empty_list();
+    // Note that emptying the _young_list is postponed and instead done as
+    // the first step when rebuilding the region sets again. The reason for
+    // this is that during a full GC string deduplication needs to know if
+    // a collected region was young or old when the full GC was initiated.
   }
   _free_list.remove_all();
 }
@@ -6429,13 +6395,13 @@
 class RebuildRegionSetsClosure : public HeapRegionClosure {
 private:
   bool            _free_list_only;
-  OldRegionSet*   _old_set;
+  HeapRegionSet*   _old_set;
   FreeRegionList* _free_list;
   size_t          _total_used;
 
 public:
   RebuildRegionSetsClosure(bool free_list_only,
-                           OldRegionSet* old_set, FreeRegionList* free_list) :
+                           HeapRegionSet* old_set, FreeRegionList* free_list) :
     _free_list_only(free_list_only),
     _old_set(old_set), _free_list(free_list), _total_used(0) {
     assert(_free_list->is_empty(), "pre-condition");
@@ -6475,6 +6441,10 @@
 void G1CollectedHeap::rebuild_region_sets(bool free_list_only) {
   assert_at_safepoint(true /* should_be_vm_thread */);
 
+  if (!free_list_only) {
+    _young_list->empty_list();
+  }
+
   RebuildRegionSetsClosure cl(free_list_only, &_old_set, &_free_list);
   heap_region_iterate(&cl);
 
@@ -6510,6 +6480,7 @@
   bool young_list_full = g1_policy()->is_young_list_full();
   if (force || !young_list_full) {
     HeapRegion* new_alloc_region = new_region(word_size,
+                                              false /* is_old */,
                                               false /* do_expand */);
     if (new_alloc_region != NULL) {
       set_region_short_lived_locked(new_alloc_region);
@@ -6568,14 +6539,16 @@
   assert(FreeList_lock->owned_by_self(), "pre-condition");
 
   if (count < g1_policy()->max_regions(ap)) {
+    bool survivor = (ap == GCAllocForSurvived);
     HeapRegion* new_alloc_region = new_region(word_size,
+                                              !survivor,
                                               true /* do_expand */);
     if (new_alloc_region != NULL) {
       // We really only need to do this for old regions given that we
       // should never scan survivors. But it doesn't hurt to do it
       // for survivors too.
       new_alloc_region->set_saved_mark();
-      if (ap == GCAllocForSurvived) {
+      if (survivor) {
         new_alloc_region->set_survivor();
         _hr_printer.alloc(new_alloc_region, G1HRPrinter::Survivor);
       } else {
@@ -6632,23 +6605,22 @@
 
 class VerifyRegionListsClosure : public HeapRegionClosure {
 private:
-  FreeRegionList*     _free_list;
-  OldRegionSet*       _old_set;
-  HumongousRegionSet* _humongous_set;
-  uint                _region_count;
+  HeapRegionSet*   _old_set;
+  HeapRegionSet*   _humongous_set;
+  FreeRegionList*  _free_list;
 
 public:
-  VerifyRegionListsClosure(OldRegionSet* old_set,
-                           HumongousRegionSet* humongous_set,
+  HeapRegionSetCount _old_count;
+  HeapRegionSetCount _humongous_count;
+  HeapRegionSetCount _free_count;
+
+  VerifyRegionListsClosure(HeapRegionSet* old_set,
+                           HeapRegionSet* humongous_set,
                            FreeRegionList* free_list) :
-    _old_set(old_set), _humongous_set(humongous_set),
-    _free_list(free_list), _region_count(0) { }
-
-  uint region_count() { return _region_count; }
+    _old_set(old_set), _humongous_set(humongous_set), _free_list(free_list),
+    _old_count(), _humongous_count(), _free_count() { }
 
   bool doHeapRegion(HeapRegion* hr) {
-    _region_count += 1;
-
     if (hr->continuesHumongous()) {
       return false;
     }
@@ -6656,14 +6628,31 @@
     if (hr->is_young()) {
       // TODO
     } else if (hr->startsHumongous()) {
-      _humongous_set->verify_next_region(hr);
+      assert(hr->containing_set() == _humongous_set, err_msg("Heap region %u is starts humongous but not in humongous set.", hr->region_num()));
+      _humongous_count.increment(1u, hr->capacity());
     } else if (hr->is_empty()) {
-      _free_list->verify_next_region(hr);
+      assert(hr->containing_set() == _free_list, err_msg("Heap region %u is empty but not on the free list.", hr->region_num()));
+      _free_count.increment(1u, hr->capacity());
     } else {
-      _old_set->verify_next_region(hr);
+      assert(hr->containing_set() == _old_set, err_msg("Heap region %u is old but not in the old set.", hr->region_num()));
+      _old_count.increment(1u, hr->capacity());
     }
     return false;
   }
+
+  void verify_counts(HeapRegionSet* old_set, HeapRegionSet* humongous_set, FreeRegionList* free_list) {
+    guarantee(old_set->length() == _old_count.length(), err_msg("Old set count mismatch. Expected %u, actual %u.", old_set->length(), _old_count.length()));
+    guarantee(old_set->total_capacity_bytes() == _old_count.capacity(), err_msg("Old set capacity mismatch. Expected " SIZE_FORMAT ", actual " SIZE_FORMAT,
+        old_set->total_capacity_bytes(), _old_count.capacity()));
+
+    guarantee(humongous_set->length() == _humongous_count.length(), err_msg("Hum set count mismatch. Expected %u, actual %u.", humongous_set->length(), _humongous_count.length()));
+    guarantee(humongous_set->total_capacity_bytes() == _humongous_count.capacity(), err_msg("Hum set capacity mismatch. Expected " SIZE_FORMAT ", actual " SIZE_FORMAT,
+        humongous_set->total_capacity_bytes(), _humongous_count.capacity()));
+
+    guarantee(free_list->length() == _free_count.length(), err_msg("Free list count mismatch. Expected %u, actual %u.", free_list->length(), _free_count.length()));
+    guarantee(free_list->total_capacity_bytes() == _free_count.capacity(), err_msg("Free list capacity mismatch. Expected " SIZE_FORMAT ", actual " SIZE_FORMAT,
+        free_list->total_capacity_bytes(), _free_count.capacity()));
+  }
 };
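
VerifyRegionListsClosure now verifies by recounting: a single walk over the heap accumulates a HeapRegionSetCount per set, and verify_counts() compares the totals against each set's own length and capacity bookkeeping. A tiny standalone illustration of that recount-and-compare idea, with toy types in place of HeapRegion and HeapRegionSetCount:

    #include <cassert>
    #include <cstdio>
    #include <vector>

    enum RegionKind { kOld, kHumongous, kFree };
    struct RegionInfo { RegionKind kind; size_t capacity; };

    // Analogue of HeapRegionSetCount: a (length, capacity) pair.
    struct SetCount {
      unsigned length;
      size_t   capacity;
      SetCount() : length(0), capacity(0) {}
      void increment(unsigned n, size_t cap) { length += n; capacity += cap; }
    };

    int main() {
      // The "heap": what a full iteration would actually observe.
      std::vector<RegionInfo> heap = { {kOld, 1024}, {kFree, 1024}, {kOld, 1024} };

      // The sets' own bookkeeping, maintained incrementally elsewhere.
      SetCount old_set_books;  old_set_books.increment(2, 2048);
      SetCount free_set_books; free_set_books.increment(1, 1024);

      // Recount by walking the heap, as the closure does.
      SetCount old_count, free_count;
      for (size_t i = 0; i < heap.size(); ++i) {
        if (heap[i].kind == kOld)  old_count.increment(1, heap[i].capacity);
        if (heap[i].kind == kFree) free_count.increment(1, heap[i].capacity);
      }

      // Analogue of verify_counts(): the books must match the recount.
      assert(old_count.length == old_set_books.length);
      assert(free_count.length == free_set_books.length);
      std::printf("region set bookkeeping is consistent\n");
      return 0;
    }
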
 
 HeapRegion* G1CollectedHeap::new_heap_region(uint hrs_index,
@@ -6679,16 +6668,14 @@
   assert_heap_locked_or_at_safepoint(true /* should_be_vm_thread */);
 
   // First, check the explicit lists.
-  _free_list.verify();
+  _free_list.verify_list();
   {
     // Given that a concurrent operation might be adding regions to
     // the secondary free list we have to take the lock before
     // verifying it.
     MutexLockerEx x(SecondaryFreeList_lock, Mutex::_no_safepoint_check_flag);
-    _secondary_free_list.verify();
-  }
-  _old_set.verify();
-  _humongous_set.verify();
+    _secondary_free_list.verify_list();
+  }
 
   // If a concurrent region freeing operation is in progress it will
   // be difficult to correctly attribute any free regions we come
@@ -6711,16 +6698,10 @@
 
   // Finally, make sure that the region accounting in the lists is
   // consistent with what we see in the heap.
-  _old_set.verify_start();
-  _humongous_set.verify_start();
-  _free_list.verify_start();
 
   VerifyRegionListsClosure cl(&_old_set, &_humongous_set, &_free_list);
   heap_region_iterate(&cl);
-
-  _old_set.verify_end();
-  _humongous_set.verify_end();
-  _free_list.verify_end();
+  cl.verify_counts(&_old_set, &_humongous_set, &_free_list);
 }
 
 // Optimized nmethod scanning
@@ -6821,6 +6802,13 @@
   g1_policy()->phase_times()->record_strong_code_root_migration_time(migration_time_ms);
 }
 
+void G1CollectedHeap::purge_code_root_memory() {
+  double purge_start = os::elapsedTime();
+  G1CodeRootSet::purge_chunks(G1CodeRootsChunkCacheKeepPercent);
+  double purge_time_ms = (os::elapsedTime() - purge_start) * 1000.0;
+  g1_policy()->phase_times()->record_strong_code_root_purge_time(purge_time_ms);
+}
+
 // Mark all the code roots that point into regions *not* in the
 // collection set.
 //
@@ -6891,7 +6879,7 @@
       // Code roots should never be attached to a continuation of a humongous region
       assert(hrrs->strong_code_roots_list_length() == 0,
              err_msg("code roots should never be attached to continuations of humongous region "HR_FORMAT
-                     " starting at "HR_FORMAT", but has "INT32_FORMAT,
+                     " starting at "HR_FORMAT", but has "SIZE_FORMAT,
                      HR_FORMAT_PARAMS(hr), HR_FORMAT_PARAMS(hr->humongous_start_region()),
                      hrrs->strong_code_roots_list_length()));
       return false;
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -34,7 +34,7 @@
 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
 #include "gc_implementation/g1/g1YCTypes.hpp"
 #include "gc_implementation/g1/heapRegionSeq.hpp"
-#include "gc_implementation/g1/heapRegionSets.hpp"
+#include "gc_implementation/g1/heapRegionSet.hpp"
 #include "gc_implementation/shared/hSpaceCounters.hpp"
 #include "gc_implementation/shared/parGCAllocBuffer.hpp"
 #include "memory/barrierSet.hpp"
@@ -243,18 +243,18 @@
   MemRegion _g1_committed;
 
   // The master free list. It will satisfy all new region allocations.
-  MasterFreeRegionList      _free_list;
+  FreeRegionList _free_list;
 
   // The secondary free list which contains regions that have been
   // freed up during the cleanup process. This will be appended to the
   // master free list when appropriate.
-  SecondaryFreeRegionList   _secondary_free_list;
+  FreeRegionList _secondary_free_list;
 
   // It keeps track of the old regions.
-  MasterOldRegionSet        _old_set;
+  HeapRegionSet _old_set;
 
   // It keeps track of the humongous regions.
-  MasterHumongousRegionSet  _humongous_set;
+  HeapRegionSet _humongous_set;
 
   // The number of regions we could create by expansion.
   uint _expansion_regions;
@@ -497,13 +497,14 @@
   // check whether there's anything available on the
   // secondary_free_list and/or wait for more regions to appear on
   // that list, if _free_regions_coming is set.
-  HeapRegion* new_region_try_secondary_free_list();
+  HeapRegion* new_region_try_secondary_free_list(bool is_old);
 
   // Try to allocate a single non-humongous HeapRegion sufficient for
   // an allocation of the given word_size. If do_expand is true,
   // attempt to expand the heap if necessary to satisfy the allocation
-  // request.
-  HeapRegion* new_region(size_t word_size, bool do_expand);
+  // request. If the region is to be used as an old region or for a
+  // humongous object, set is_old to true; otherwise, set it to false.
+  HeapRegion* new_region(size_t word_size, bool is_old, bool do_expand);
 
   // Attempt to satisfy a humongous allocation request of the given
   // size by finding a contiguous set of free regions of num_regions
@@ -757,6 +758,29 @@
 
   G1HRPrinter* hr_printer() { return &_hr_printer; }
 
+  // Frees a non-humongous region by initializing its contents and
+  // adding it to the free list that's passed as a parameter (this is
+  // usually a local list which will be appended to the master free
+  // list later). If par is true, the region's RSet will not be freed
+  // up. The assumption is that this will be done later.
+  // The locked parameter indicates if the caller has already taken
+  // care of proper synchronization. This may allow some optimizations.
+  void free_region(HeapRegion* hr,
+                   FreeRegionList* free_list,
+                   bool par,
+                   bool locked = false);
+
+  // Frees a humongous region by collapsing it into individual regions
+  // and calling free_region() for each of them. The freed regions
+  // will be added to the free list that's passed as a parameter (this
+  // is usually a local list which will be appended to the master free
+  // list later). If par is true, the region's RSet will not be freed
+  // up. The assumption is that this will be done later.
+  void free_humongous_region(HeapRegion* hr,
+                             FreeRegionList* free_list,
+                             bool par);
 protected:
 
   // Shrink the garbage-first heap by at most the given size (in bytes!).
@@ -840,30 +864,6 @@
   // string table, and referents of reachable weak refs.
   void g1_process_weak_roots(OopClosure* root_closure);
 
-  // Frees a non-humongous region by initializing its contents and
-  // adding it to the free list that's passed as a parameter (this is
-  // usually a local list which will be appended to the master free
-  // list later). The used bytes of freed regions are accumulated in
-  // pre_used. If par is true, the region's RSet will not be freed
-  // up. The assumption is that this will be done later.
-  void free_region(HeapRegion* hr,
-                   size_t* pre_used,
-                   FreeRegionList* free_list,
-                   bool par);
-
-  // Frees a humongous region by collapsing it into individual regions
-  // and calling free_region() for each of them. The freed regions
-  // will be added to the free list that's passed as a parameter (this
-  // is usually a local list which will be appended to the master free
-  // list later). The used bytes of freed regions are accumulated in
-  // pre_used. If par is true, the region's RSet will not be freed
-  // up. The assumption is that this will be done later.
-  void free_humongous_region(HeapRegion* hr,
-                             size_t* pre_used,
-                             FreeRegionList* free_list,
-                             HumongousRegionSet* humongous_proxy_set,
-                             bool par);
-
   // Notifies all the necessary spaces that the committed space has
   // been updated (either expanded or shrunk). It should be called
   // after _g1_storage is updated.
@@ -1242,21 +1242,17 @@
   bool is_on_master_free_list(HeapRegion* hr) {
     return hr->containing_set() == &_free_list;
   }
-
-  bool is_in_humongous_set(HeapRegion* hr) {
-    return hr->containing_set() == &_humongous_set;
-  }
 #endif // ASSERT
 
   // Wrapper for the region list operations that can be called from
   // methods outside this class.
 
-  void secondary_free_list_add_as_tail(FreeRegionList* list) {
-    _secondary_free_list.add_as_tail(list);
+  void secondary_free_list_add(FreeRegionList* list) {
+    _secondary_free_list.add_ordered(list);
   }
 
   void append_secondary_free_list() {
-    _free_list.add_as_head(&_secondary_free_list);
+    _free_list.add_ordered(&_secondary_free_list);
   }
 
   void append_secondary_free_list_if_not_empty_with_lock() {
@@ -1298,27 +1294,9 @@
   // True iff an evacuation has failed in the most-recent collection.
   bool evacuation_failed() { return _evacuation_failed; }
 
-  // It will free a region if it has allocated objects in it that are
-  // all dead. It calls either free_region() or
-  // free_humongous_region() depending on the type of the region that
-  // is passed to it.
-  void free_region_if_empty(HeapRegion* hr,
-                            size_t* pre_used,
-                            FreeRegionList* free_list,
-                            OldRegionSet* old_proxy_set,
-                            HumongousRegionSet* humongous_proxy_set,
-                            HRRSCleanupTask* hrrs_cleanup_task,
-                            bool par);
-
-  // It appends the free list to the master free list and updates the
-  // master humongous list according to the contents of the proxy
-  // list. It also adjusts the total used bytes according to pre_used
-  // (if par is true, it will do so by taking the ParGCRareEvent_lock).
-  void update_sets_after_freeing_regions(size_t pre_used,
-                                       FreeRegionList* free_list,
-                                       OldRegionSet* old_proxy_set,
-                                       HumongousRegionSet* humongous_proxy_set,
-                                       bool par);
+  void remove_from_old_sets(const HeapRegionSetCount& old_regions_removed, const HeapRegionSetCount& humongous_regions_removed);
+  void prepend_to_freelist(FreeRegionList* list);
+  void decrement_summary_bytes(size_t bytes);
 
   // Returns "TRUE" iff "p" points into the committed areas of the heap.
   virtual bool is_in(const void* p) const;
@@ -1481,9 +1459,11 @@
   // Section on thread-local allocation buffers (TLABs)
   // See CollectedHeap for semantics.
 
-  virtual bool supports_tlab_allocation() const;
-  virtual size_t tlab_capacity(Thread* thr) const;
-  virtual size_t unsafe_max_tlab_alloc(Thread* thr) const;
+  bool supports_tlab_allocation() const;
+  size_t tlab_capacity(Thread* ignored) const;
+  size_t tlab_used(Thread* ignored) const;
+  size_t max_tlab_size() const;
+  size_t unsafe_max_tlab_alloc(Thread* ignored) const;
 
   // Can a compiler initialize a new object without store barriers?
   // This permission only extends from the creation of a new object
@@ -1568,7 +1548,7 @@
   void set_region_short_lived_locked(HeapRegion* hr);
   // add appropriate methods for any other surv rate groups
 
-  YoungList* young_list() { return _young_list; }
+  YoungList* young_list() const { return _young_list; }
 
   // debugging
   bool check_young_list_well_formed() {
@@ -1671,6 +1651,9 @@
   // that were not successfully evacuated are not migrated.
   void migrate_strong_code_roots();
 
+  // Free up superfluous code root memory.
+  void purge_code_root_memory();
+
   // During an initial mark pause, mark all the code roots that
   // point into regions *not* in the collection set.
   void mark_strong_code_roots(uint worker_id);
@@ -1683,6 +1666,8 @@
   // in symbol table, possibly in parallel.
   void unlink_string_and_symbol_table(BoolObjectClosure* is_alive, bool unlink_strings = true, bool unlink_symbols = true);
 
+  // Redirty logged cards in the refinement queue.
+  void redirty_logged_cards();
   // Verification
 
   // The following is just to alert the verification code
@@ -1809,8 +1794,6 @@
   size_t           _undo_waste;
 
   OopsInHeapRegionClosure*      _evac_failure_cl;
-  G1ParScanHeapEvacClosure*     _evac_cl;
-  G1ParScanPartialArrayClosure* _partial_scan_cl;
 
   int  _hash_seed;
   uint _queue_num;
@@ -1938,14 +1921,6 @@
     return _evac_failure_cl;
   }
 
-  void set_evac_closure(G1ParScanHeapEvacClosure* evac_cl) {
-    _evac_cl = evac_cl;
-  }
-
-  void set_partial_scan_closure(G1ParScanPartialArrayClosure* partial_scan_cl) {
-    _partial_scan_cl = partial_scan_cl;
-  }
-
   int* hash_seed() { return &_hash_seed; }
   uint queue_num() { return _queue_num; }
 
@@ -1993,19 +1968,121 @@
                                                  false /* retain */);
     }
   }
+private:
+  #define G1_PARTIAL_ARRAY_MASK 0x2
+
+  inline bool has_partial_array_mask(oop* ref) const {
+    return ((uintptr_t)ref & G1_PARTIAL_ARRAY_MASK) == G1_PARTIAL_ARRAY_MASK;
+  }
+
+  // We never encode partial array oops as narrowOop*, so return false immediately.
+  // This allows the compiler to create optimized code when popping references from
+  // the work queue.
+  inline bool has_partial_array_mask(narrowOop* ref) const {
+    assert(((uintptr_t)ref & G1_PARTIAL_ARRAY_MASK) != G1_PARTIAL_ARRAY_MASK, "Partial array oop reference encoded as narrowOop*");
+    return false;
+  }
+
+  // Only implement set_partial_array_mask() for regular oops, not for narrowOops.
+  // We always encode partial arrays as regular oops, to allow the
+  // specialization for has_partial_array_mask() for narrowOops above.
+  // This means that unintentional use of this method with narrowOops is caught
+  // by the compiler.
+  inline oop* set_partial_array_mask(oop obj) const {
+    assert(((uintptr_t)(void *)obj & G1_PARTIAL_ARRAY_MASK) == 0, "Information loss!");
+    return (oop*) ((uintptr_t)(void *)obj | G1_PARTIAL_ARRAY_MASK);
+  }
+
+  inline oop clear_partial_array_mask(oop* ref) const {
+    return cast_to_oop((intptr_t)ref & ~G1_PARTIAL_ARRAY_MASK);
+  }
+
+  void do_oop_partial_array(oop* p) {
+    assert(has_partial_array_mask(p), "invariant");
+    oop from_obj = clear_partial_array_mask(p);
+
+    assert(Universe::heap()->is_in_reserved(from_obj), "must be in heap.");
+    assert(from_obj->is_objArray(), "must be obj array");
+    objArrayOop from_obj_array = objArrayOop(from_obj);
+    // The from-space object contains the real length.
+    int length                 = from_obj_array->length();
+
+    assert(from_obj->is_forwarded(), "must be forwarded");
+    oop to_obj                 = from_obj->forwardee();
+    assert(from_obj != to_obj, "should not be chunking self-forwarded objects");
+    objArrayOop to_obj_array   = objArrayOop(to_obj);
+    // We keep track of the next start index in the length field of the
+    // to-space object.
+    int next_index             = to_obj_array->length();
+    assert(0 <= next_index && next_index < length,
+           err_msg("invariant, next index: %d, length: %d", next_index, length));
+
+    int start                  = next_index;
+    int end                    = length;
+    int remainder              = end - start;
+    // We'll try not to push a range that's smaller than ParGCArrayScanChunk.
+    if (remainder > 2 * ParGCArrayScanChunk) {
+      end = start + ParGCArrayScanChunk;
+      to_obj_array->set_length(end);
+      // Push the remainder before we process the range in case another
+      // worker has run out of things to do and can steal it.
+      oop* from_obj_p = set_partial_array_mask(from_obj);
+      push_on_queue(from_obj_p);
+    } else {
+      assert(length == end, "sanity");
+      // We'll process the final range for this object. Restore the length
+      // so that the heap remains parsable in case of evacuation failure.
+      to_obj_array->set_length(end);
+    }
+    _scanner.set_region(_g1h->heap_region_containing_raw(to_obj));
+    // Process indexes [start,end). It will also process the header
+    // along with the first chunk (i.e., the chunk with start == 0).
+    // Note that at this point the length field of to_obj_array is not
+    // correct given that we are using it to keep track of the next
+    // start index. oop_iterate_range() (thankfully!) ignores the length
+    // field and only relies on the start / end parameters.  It does
+    // however return the size of the object which will be incorrect. So
+    // we have to ignore it even if we wanted to use it.
+    to_obj_array->oop_iterate_range(&_scanner, start, end);
+  }
+
+  // This method is applied to the fields of the objects that have just been copied.
+  template <class T> void do_oop_evac(T* p, HeapRegion* from) {
+    assert(!oopDesc::is_null(oopDesc::load_decode_heap_oop(p)),
+           "Reference should not be NULL here as such are never pushed to the task queue.");
+    oop obj = oopDesc::load_decode_heap_oop_not_null(p);
+
+    // Although we never intentionally push references outside of the collection
+    // set, due to (benign) races in the claim mechanism during RSet scanning, more
+    // than one thread might claim the same card. The same card may therefore be
+    // processed multiple times, so we have to redo this check here.
+    if (_g1h->in_cset_fast_test(obj)) {
+      oop forwardee;
+      if (obj->is_forwarded()) {
+        forwardee = obj->forwardee();
+      } else {
+        forwardee = copy_to_survivor_space(obj);
+      }
+      assert(forwardee != NULL, "forwardee should not be NULL");
+      oopDesc::encode_store_heap_oop(p, forwardee);
+    }
+
+    assert(obj != NULL, "Must be");
+    update_rs(from, p, queue_num());
+  }
+public:
 
   oop copy_to_survivor_space(oop const obj);
 
   template <class T> void deal_with_reference(T* ref_to_scan) {
-    if (has_partial_array_mask(ref_to_scan)) {
-      _partial_scan_cl->do_oop_nv(ref_to_scan);
-    } else {
+    if (!has_partial_array_mask(ref_to_scan)) {
       // Note: we can use "raw" versions of "region_containing" because
       // "obj_to_scan" is definitely in the heap, and is not in a
       // humongous region.
       HeapRegion* r = _g1h->heap_region_containing_raw(ref_to_scan);
-      _evac_cl->set_region(r);
-      _evac_cl->do_oop_nv(ref_to_scan);
+      do_oop_evac(ref_to_scan, r);
+    } else {
+      do_oop_partial_array((oop*)ref_to_scan);
     }
   }
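
The partial-array handling moved above into G1ParScanThreadState still relies on tagging queue entries: an objArray that is being chunked is pushed as its from-space address with bit 0x2 set, and deal_with_reference() tests that bit to route the entry to do_oop_partial_array() or do_oop_evac(). A self-contained sketch of the same tag/test/clear round trip on an ordinary pointer (plain C++, no oops or work queues):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    static const std::uintptr_t kPartialArrayMask = 0x2;  // same bit the G1 code uses

    struct Obj { int payload; };

    // Analogues of set_partial_array_mask() / has_partial_array_mask() /
    // clear_partial_array_mask(), operating on a plain Obj*.
    static Obj* set_mask(Obj* p) {
      assert(((std::uintptr_t)p & kPartialArrayMask) == 0 && "information loss");
      return (Obj*)((std::uintptr_t)p | kPartialArrayMask);
    }
    static bool has_mask(Obj* p)   { return ((std::uintptr_t)p & kPartialArrayMask) != 0; }
    static Obj* clear_mask(Obj* p) { return (Obj*)((std::uintptr_t)p & ~kPartialArrayMask); }

    int main() {
      Obj o = { 42 };                  // int alignment keeps bit 0x2 free for tagging
      Obj* tagged = set_mask(&o);

      // A queue entry is inspected the way deal_with_reference() does it.
      if (has_mask(tagged)) {
        Obj* original = clear_mask(tagged);
        std::printf("partial-array entry, payload = %d\n", original->payload);
      }
      return 0;
    }
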
 
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.inline.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.inline.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -30,6 +30,7 @@
 #include "gc_implementation/g1/g1AllocRegion.inline.hpp"
 #include "gc_implementation/g1/g1CollectorPolicy.hpp"
 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
+#include "gc_implementation/g1/heapRegionSet.inline.hpp"
 #include "gc_implementation/g1/heapRegionSeq.inline.hpp"
 #include "utilities/taskqueue.hpp"
 
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -820,6 +820,8 @@
     // do that for any other surv rate groups
   }
 
+  size_t young_list_target_length() const { return _young_list_target_length; }
+
   bool is_young_list_full() {
     uint young_list_length = _g1->young_list()->length();
     uint young_list_target_length = _young_list_target_length;
--- a/src/share/vm/gc_implementation/g1/g1GCPhaseTimes.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/g1/g1GCPhaseTimes.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -27,6 +27,7 @@
 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
 #include "gc_implementation/g1/g1GCPhaseTimes.hpp"
 #include "gc_implementation/g1/g1Log.hpp"
+#include "gc_implementation/g1/g1StringDedup.hpp"
 
 // Helper class for avoiding interleaved logging
 class LineBuffer: public StackObj {
@@ -168,7 +169,9 @@
   _last_termination_attempts(_max_gc_threads, SIZE_FORMAT),
   _last_gc_worker_end_times_ms(_max_gc_threads, "%.1lf", false),
   _last_gc_worker_times_ms(_max_gc_threads, "%.1lf"),
-  _last_gc_worker_other_times_ms(_max_gc_threads, "%.1lf")
+  _last_gc_worker_other_times_ms(_max_gc_threads, "%.1lf"),
+  _cur_string_dedup_queue_fixup_worker_times_ms(_max_gc_threads, "%.1lf"),
+  _cur_string_dedup_table_fixup_worker_times_ms(_max_gc_threads, "%.1lf")
 {
   assert(max_gc_threads > 0, "Must have some GC threads");
 }
@@ -229,6 +232,16 @@
   _last_gc_worker_other_times_ms.verify();
 }
 
+void G1GCPhaseTimes::note_string_dedup_fixup_start() {
+  _cur_string_dedup_queue_fixup_worker_times_ms.reset();
+  _cur_string_dedup_table_fixup_worker_times_ms.reset();
+}
+
+void G1GCPhaseTimes::note_string_dedup_fixup_end() {
+  _cur_string_dedup_queue_fixup_worker_times_ms.verify();
+  _cur_string_dedup_table_fixup_worker_times_ms.verify();
+}
+
 void G1GCPhaseTimes::print_stats(int level, const char* str, double value) {
   LineBuffer(level).append_and_print_cr("[%s: %.1lf ms]", str, value);
 }
@@ -250,6 +263,14 @@
     // Strong code root migration time
     misc_time_ms += _cur_strong_code_root_migration_time_ms;
 
+    // Strong code root purge time
+    misc_time_ms += _cur_strong_code_root_purge_time_ms;
+
+    if (G1StringDedup::is_enabled()) {
+      // String dedup fixup time
+      misc_time_ms += _cur_string_dedup_fixup_time_ms;
+    }
+
     // Subtract the time taken to clean the card table from the
     // current value of "other time"
     misc_time_ms += _cur_clear_ct_time_ms;
@@ -299,20 +320,43 @@
   }
   print_stats(1, "Code Root Fixup", _cur_collection_code_root_fixup_time_ms);
   print_stats(1, "Code Root Migration", _cur_strong_code_root_migration_time_ms);
+  print_stats(1, "Code Root Purge", _cur_strong_code_root_purge_time_ms);
+  if (G1StringDedup::is_enabled()) {
+    print_stats(1, "String Dedup Fixup", _cur_string_dedup_fixup_time_ms, _active_gc_threads);
+    _cur_string_dedup_queue_fixup_worker_times_ms.print(2, "Queue Fixup (ms)");
+    _cur_string_dedup_table_fixup_worker_times_ms.print(2, "Table Fixup (ms)");
+  }
   print_stats(1, "Clear CT", _cur_clear_ct_time_ms);
   double misc_time_ms = pause_time_sec * MILLIUNITS - accounted_time_ms();
   print_stats(1, "Other", misc_time_ms);
   if (_cur_verify_before_time_ms > 0.0) {
     print_stats(2, "Verify Before", _cur_verify_before_time_ms);
   }
+  if (G1CollectedHeap::heap()->evacuation_failed()) {
+    double evac_fail_handling = _cur_evac_fail_recalc_used + _cur_evac_fail_remove_self_forwards +
+      _cur_evac_fail_restore_remsets;
+    print_stats(2, "Evacuation Failure", evac_fail_handling);
+    if (G1Log::finest()) {
+      print_stats(3, "Recalculate Used", _cur_evac_fail_recalc_used);
+      print_stats(3, "Remove Self Forwards", _cur_evac_fail_remove_self_forwards);
+      print_stats(3, "Restore RemSet", _cur_evac_fail_restore_remsets);
+    }
+  }
   print_stats(2, "Choose CSet",
     (_recorded_young_cset_choice_time_ms +
     _recorded_non_young_cset_choice_time_ms));
   print_stats(2, "Ref Proc", _cur_ref_proc_time_ms);
   print_stats(2, "Ref Enq", _cur_ref_enq_time_ms);
+  if (G1DeferredRSUpdate) {
+    print_stats(2, "Redirty Cards", _recorded_redirty_logged_cards_time_ms);
+  }
   print_stats(2, "Free CSet",
     (_recorded_young_free_cset_time_ms +
     _recorded_non_young_free_cset_time_ms));
+  if (G1Log::finest()) {
+    print_stats(3, "Young Free CSet", _recorded_young_free_cset_time_ms);
+    print_stats(3, "Non-Young Free CSet", _recorded_non_young_free_cset_time_ms);
+  }
   if (_cur_verify_after_time_ms > 0.0) {
     print_stats(2, "Verify After", _cur_verify_after_time_ms);
   }
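
print_summary()'s "Other" line is still derived as the pause time minus accounted_time_ms(); the hunks above add the code root purge and (when string deduplication is enabled) the dedup fixup time to the accounted total, and break the remaining slack down further with the new evacuation-failure and redirty-cards entries. A small numeric sketch of that subtraction (all values illustrative, not measured):

    #include <cstdio>

    int main() {
      const double pause_ms = 250.0;  // illustrative total pause time
      double accounted_ms   = 0.0;

      // Phases that were already accounted for.
      accounted_ms += 180.0;          // parallel work, clear CT, ref proc, ...

      // Newly accounted phases from this change (illustrative values).
      const double code_root_purge_ms    = 3.0;
      const double string_dedup_fixup_ms = 5.0;
      accounted_ms += code_root_purge_ms + string_dedup_fixup_ms;

      // "Other" is whatever the per-phase timers did not explain.
      std::printf("Other: %.1lf ms\n", pause_ms - accounted_ms);  // 62.0 ms
      return 0;
    }
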
--- a/src/share/vm/gc_implementation/g1/g1GCPhaseTimes.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/g1/g1GCPhaseTimes.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -131,6 +131,15 @@
   double _cur_collection_par_time_ms;
   double _cur_collection_code_root_fixup_time_ms;
   double _cur_strong_code_root_migration_time_ms;
+  double _cur_strong_code_root_purge_time_ms;
+
+  double _cur_evac_fail_recalc_used;
+  double _cur_evac_fail_restore_remsets;
+  double _cur_evac_fail_remove_self_forwards;
+
+  double                  _cur_string_dedup_fixup_time_ms;
+  WorkerDataArray<double> _cur_string_dedup_queue_fixup_worker_times_ms;
+  WorkerDataArray<double> _cur_string_dedup_table_fixup_worker_times_ms;
 
   double _cur_clear_ct_time_ms;
   double _cur_ref_proc_time_ms;
@@ -142,6 +151,8 @@
   double _recorded_young_cset_choice_time_ms;
   double _recorded_non_young_cset_choice_time_ms;
 
+  double _recorded_redirty_logged_cards_time_ms;
+
   double _recorded_young_free_cset_time_ms;
   double _recorded_non_young_free_cset_time_ms;
 
@@ -223,6 +234,37 @@
     _cur_strong_code_root_migration_time_ms = ms;
   }
 
+  void record_strong_code_root_purge_time(double ms) {
+    _cur_strong_code_root_purge_time_ms = ms;
+  }
+
+  void record_evac_fail_recalc_used_time(double ms) {
+    _cur_evac_fail_recalc_used = ms;
+  }
+
+  void record_evac_fail_restore_remsets(double ms) {
+    _cur_evac_fail_restore_remsets = ms;
+  }
+
+  void record_evac_fail_remove_self_forwards(double ms) {
+    _cur_evac_fail_remove_self_forwards = ms;
+  }
+
+  void note_string_dedup_fixup_start();
+  void note_string_dedup_fixup_end();
+
+  void record_string_dedup_fixup_time(double ms) {
+    _cur_string_dedup_fixup_time_ms = ms;
+  }
+
+  void record_string_dedup_queue_fixup_worker_time(uint worker_id, double ms) {
+    _cur_string_dedup_queue_fixup_worker_times_ms.set(worker_id, ms);
+  }
+
+  void record_string_dedup_table_fixup_worker_time(uint worker_id, double ms) {
+    _cur_string_dedup_table_fixup_worker_times_ms.set(worker_id, ms);
+  }
+
   void record_ref_proc_time(double ms) {
     _cur_ref_proc_time_ms = ms;
   }
@@ -251,6 +293,10 @@
     _recorded_non_young_cset_choice_time_ms = time_ms;
   }
 
+  void record_redirty_logged_cards_time_ms(double time_ms) {
+    _recorded_redirty_logged_cards_time_ms = time_ms;
+  }
+
   void record_cur_collection_start_sec(double time_ms) {
     _cur_collection_start_sec = time_ms;
   }
--- a/src/share/vm/gc_implementation/g1/g1MarkSweep.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/g1/g1MarkSweep.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -31,6 +31,7 @@
 #include "code/icBuffer.hpp"
 #include "gc_implementation/g1/g1Log.hpp"
 #include "gc_implementation/g1/g1MarkSweep.hpp"
+#include "gc_implementation/g1/g1StringDedup.hpp"
 #include "gc_implementation/shared/gcHeapSummary.hpp"
 #include "gc_implementation/shared/gcTimer.hpp"
 #include "gc_implementation/shared/gcTrace.hpp"
@@ -196,17 +197,19 @@
   G1CollectedHeap* _g1h;
   ModRefBarrierSet* _mrbs;
   CompactPoint _cp;
-  HumongousRegionSet _humongous_proxy_set;
+  HeapRegionSetCount _humongous_regions_removed;
 
   void free_humongous_region(HeapRegion* hr) {
     HeapWord* end = hr->end();
-    size_t dummy_pre_used;
     FreeRegionList dummy_free_list("Dummy Free List for G1MarkSweep");
 
     assert(hr->startsHumongous(),
            "Only the start of a humongous region should be freed.");
-    _g1h->free_humongous_region(hr, &dummy_pre_used, &dummy_free_list,
-                                &_humongous_proxy_set, false /* par */);
+
+    hr->set_containing_set(NULL);
+    _humongous_regions_removed.increment(1u, hr->capacity());
+
+    _g1h->free_humongous_region(hr, &dummy_free_list, false /* par */);
     hr->prepare_for_compaction(&_cp);
     // Also clear the part of the card table that will be unused after
     // compaction.
@@ -219,16 +222,13 @@
   : _g1h(G1CollectedHeap::heap()),
     _mrbs(_g1h->g1_barrier_set()),
     _cp(NULL, cs, cs->initialize_threshold()),
-    _humongous_proxy_set("G1MarkSweep Humongous Proxy Set") { }
+    _humongous_regions_removed() { }
 
   void update_sets() {
     // We'll recalculate total used bytes and recreate the free list
     // at the end of the GC, so no point in updating those values here.
-    _g1h->update_sets_after_freeing_regions(0, /* pre_used */
-                                            NULL, /* free_list */
-                                            NULL, /* old_proxy_set */
-                                            &_humongous_proxy_set,
-                                            false /* par */);
+    HeapRegionSetCount empty_set;
+    _g1h->remove_from_old_sets(empty_set, _humongous_regions_removed);
   }
 
   bool doHeapRegion(HeapRegion* hr) {
@@ -321,6 +321,10 @@
   // have been cleared if they pointed to non-surviving objects.)
   g1h->g1_process_weak_roots(&GenMarkSweep::adjust_pointer_closure);
 
+  if (G1StringDedup::is_enabled()) {
+    G1StringDedup::oops_do(&GenMarkSweep::adjust_pointer_closure);
+  }
+
   GenMarkSweep::adjust_marks();
 
   G1AdjustPointersClosure blk;
--- a/src/share/vm/gc_implementation/g1/g1OopClosures.hpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/g1/g1OopClosures.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -80,53 +80,6 @@
   virtual void do_oop(narrowOop* p)    { do_oop_nv(p); }
 };
 
-#define G1_PARTIAL_ARRAY_MASK 0x2
-
-inline bool has_partial_array_mask(oop* ref) {
-  return ((uintptr_t)ref & G1_PARTIAL_ARRAY_MASK) == G1_PARTIAL_ARRAY_MASK;
-}
-
-// We never encode partial array oops as narrowOop*, so return false immediately.
-// This allows the compiler to create optimized code when popping references from
-// the work queue.
-inline bool has_partial_array_mask(narrowOop* ref) {
-  assert(((uintptr_t)ref & G1_PARTIAL_ARRAY_MASK) != G1_PARTIAL_ARRAY_MASK, "Partial array oop reference encoded as narrowOop*");
-  return false;
-}
-
-// Only implement set_partial_array_mask() for regular oops, not for narrowOops.
-// We always encode partial arrays as regular oop, to allow the
-// specialization for has_partial_array_mask() for narrowOops above.
-// This means that unintentional use of this method with narrowOops are caught
-// by the compiler.
-inline oop* set_partial_array_mask(oop obj) {
-  assert(((uintptr_t)(void *)obj & G1_PARTIAL_ARRAY_MASK) == 0, "Information loss!");
-  return (oop*) ((uintptr_t)(void *)obj | G1_PARTIAL_ARRAY_MASK);
-}
-
-template <class T> inline oop clear_partial_array_mask(T* ref) {
-  return cast_to_oop((intptr_t)ref & ~G1_PARTIAL_ARRAY_MASK);
-}
-
-class G1ParScanPartialArrayClosure : public G1ParClosureSuper {
-  G1ParScanClosure _scanner;
-
-public:
-  G1ParScanPartialArrayClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state, ReferenceProcessor* rp) :
-    G1ParClosureSuper(g1, par_scan_state), _scanner(g1, par_scan_state, rp)
-  {
-    assert(_ref_processor == NULL, "sanity");
-  }
-
-  G1ParScanClosure* scanner() {
-    return &_scanner;
-  }
-
-  template <class T> void do_oop_nv(T* p);
-  virtual void do_oop(oop* p)       { do_oop_nv(p); }
-  virtual void do_oop(narrowOop* p) { do_oop_nv(p); }
-};
-
 // Add back base class for metadata
 class G1ParCopyHelper : public G1ParClosureSuper {
 protected:
@@ -173,15 +126,8 @@
 typedef G1ParCopyClosure<G1BarrierNone, true> G1ParScanAndMarkExtRootClosure;
 typedef G1ParCopyClosure<G1BarrierKlass, true> G1ParScanAndMarkMetadataClosure;
 
-// The following closure type is defined in g1_specialized_oop_closures.hpp:
-//
-// typedef G1ParCopyClosure<G1BarrierEvac, false> G1ParScanHeapEvacClosure;
-
 // We use a separate closure to handle references during evacuation
 // failure processing.
-// We could have used another instance of G1ParScanHeapEvacClosure
-// (since that closure no longer assumes that the references it
-// handles point into the collection set).
 
 typedef G1ParCopyClosure<G1BarrierEvac, false> G1ParScanHeapEvacFailureClosure;
 
--- a/src/share/vm/gc_implementation/g1/g1RemSet.cpp	Tue Mar 25 12:54:21 2014 -0700
+++ b/src/share/vm/gc_implementation/g1/g1RemSet.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -463,8 +463,9 @@
   int into_cset_n_buffers = into_cset_dcqs.completed_buffers_num();
 
   if (_g1->evacuation_failed()) {
+    double restore_remembered_set_start = os::elapsedTime();
+
     // Restore remembered sets for the regions pointing into the collection set.
-
     if (G1DeferredRSUpdate) {
       // If deferred RS updates are enabled then we just need to transfer
       // the completed buffers from (a) the DirtyCardQueueSet used to hold
@@ -483,6 +484,8 @@
       }
       assert(n_completed_buffers == into_cset_n_buffers, "missed some buffers");
     }
+
+    _g1->g1_policy()->phase_times()->record_evac_fail_restore_remsets((os::elapsedTime() - restore_remembered_set_start) * 1000.0);
   }
 
   // Free any completed buffers in the DirtyCardQueueSet used to hold cards
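 
The g1RemSet.cpp change above brackets the remembered-set restore with os::elapsedTime() and reports the delta, converted to milliseconds, through record_evac_fail_restore_remsets(). A standalone sketch of the same measure-and-report pattern, using std::chrono in place of HotSpot's os::elapsedTime() purely for illustration (the function and label names are made up):

    // Standalone sketch: record the wall-clock cost of a sub-phase in milliseconds.
    #include <chrono>
    #include <cstdio>

    static void do_restore_work() { /* placeholder for the remembered-set restore */ }

    int main() {
      auto start = std::chrono::steady_clock::now();
      do_restore_work();
      double elapsed_ms =
          std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - start).count();
      std::printf("evac-fail restore remsets: %.3f ms\n", elapsed_ms);
      return 0;
    }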
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/vm/gc_implementation/g1/g1StringDedup.cpp	Tue Mar 25 17:07:36 2014 -0700
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "classfile/javaClasses.hpp"
+#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
+#include "gc_implementation/g1/g1GCPhaseTimes.hpp"
+#include "gc_implementation/g1/g1StringDedup.hpp"
+#include "gc_implementation/g1/g1StringDedupQueue.hpp"
+#include "gc_implementation/g1/g1StringDedupStat.hpp"
+#include "gc_implementation/g1/g1StringDedupTable.hpp"
+#include "gc_implementation/g1/g1StringDedupThread.hpp"
+
+bool G1StringDedup::_enabled = false;
+
+void G1StringDedup::initialize() {
+  assert(UseG1GC, "String deduplication only available with G1");
+  if (UseStringDeduplication) {
+    _enabled = true;
+    G1StringDedupQueue::create();
+    G1StringDedupTable::create();
+    G1StringDedupThread::create();
+  }
+}
+
+bool G1StringDedup::is_candidate_from_mark(oop obj) {
+  if (java_lang_String::is_instance(obj)) {
+    bool from_young = G1CollectedHeap::heap()->heap_region_containing_raw(obj)->is_young();
+    if (from_young && obj->age() < StringDeduplicationAgeThreshold) {
+      // Candidate found. String is being evacuated from young to old but has not
+      // reached the deduplication age threshold, i.e. has not previously been a
+      // candidate during its life in the young generation.
+      return true;
+    }
+  }
+
+  // Not a candidate
+  return false;
+}
+
+void G1StringDedup::enqueue_from_mark(oop java_string) {
+  assert(is_enabled(), "String deduplication not enabled");
+  if (is_candidate_from_mark(java_string)) {
+    G1StringDedupQueue::push(0 /* worker_id */, java_string);
+  }
+}
+
+bool G1StringDedup::is_candidate_from_evacuation(bool from_young, bool to_young, oop obj) {
+  if (from_young && java_lang_String::is_instance(obj)) {
+    if (to_young && obj->age() == StringDeduplicationAgeThreshold) {
+      // Candidate found. String is being evacuated from young to young and just
+      // reached the deduplication age threshold.
+      return true;
+    }
+    if (!to_young && obj->age() < StringDeduplicationAgeThreshold) {
+      // Candidate found. String is being evacuated from young to old but has not
+      // reached the deduplication age threshold, i.e. has not previously been a
+      // candidate during its life in the young generation.
+      return true;
+    }
+  }
+
+  // Not a candidate
+  return false;
+}
+
+void G1StringDedup::enqueue_from_evacuation(bool from_young, bool to_young, uint worker_id, oop java_string) {
+  assert(is_enabled(), "String deduplication not enabled");
+  if (is_candidate_from_evacuation(from_young, to_young, java_string)) {
+    G1StringDedupQueue::push(worker_id, java_string);
+  }
+}
+
+void G1StringDedup::deduplicate(oop java_string) {
+  assert(is_enabled(), "String deduplication not enabled");
+  G1StringDedupStat dummy; // Statistics from this path are never used
+  G1StringDedupTable::deduplicate(java_string, dummy);
+}
+
+void G1StringDedup::oops_do(OopClosure* keep_alive) {
+  assert(is_enabled(), "String deduplication not enabled");
+  unlink_or_oops_do(NULL, keep_alive);
+}
+
+void G1StringDedup::unlink(BoolObjectClosure* is_alive) {
+  assert(is_enabled(), "String deduplication not enabled");
+  // Don't allow a potential resize or rehash during unlink, as the unlink
+  // operation itself might remove enough entries to invalidate such a decision.
+  unlink_or_oops_do(is_alive, NULL, false /* allow_resize_and_rehash */);
+}
+
+//
+// Task for parallel unlink_or_oops_do() operation on the deduplication queue
+// and table.
+//
+class G1StringDedupUnlinkOrOopsDoTask : public AbstractGangTask {
+private:
+  G1StringDedupUnlinkOrOopsDoClosure _cl;
+
+public:
+  G1StringDedupUnlinkOrOopsDoTask(BoolObjectClosure* is_alive,
+                                  OopClosure* keep_alive,
+                                  bool allow_resize_and_rehash) :
+    AbstractGangTask("G1StringDedupUnlinkOrOopsDoTask"),
+    _cl(is_alive, keep_alive, allow_resize_and_rehash) {
+  }
+
+  virtual void work(uint worker_id) {
+    double queue_fixup_start = os::elapsedTime();
+    G1StringDedupQueue::unlink_or_oops_do(&_cl);
+
+    double table_fixup_start = os::elapsedTime();
+    G1StringDedupTable::unlink_or_oops_do(&_cl, worker_id);
+
+    double queue_fixup_time_ms = (table_fixup_start - queue_fixup_start) * 1000.0;
+    double table_fixup_time_ms = (os::elapsedTime() - table_fixup_start) * 1000.0;
+    G1CollectorPolicy* g1p = G1CollectedHeap::heap()->g1_policy();
+    g1p->phase_times()->record_string_dedup_queue_fixup_worker_time(worker_id, queue_fixup_time_ms);
+    g1p->phase_times()->record_string_dedup_table_fixup_worker_time(worker_id, table_fixup_time_ms);
+  }
+};
+
+void G1StringDedup::unlink_or_oops_do(BoolObjectClosure* is_alive, OopClosure* keep_alive, bool allow_resize_and_rehash) {
+  assert(is_enabled(), "String deduplication not enabled");
+  G1CollectorPolicy* g1p = G1CollectedHeap::heap()->g1_policy();
+  g1p->phase_times()->note_string_dedup_fixup_start();
+  double fixup_start = os::elapsedTime();
+
+  G1StringDedupUnlinkOrOopsDoTask task(is_alive, keep_alive, allow_resize_and_rehash);
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
+    G1CollectedHeap* g1h = G1CollectedHeap::heap();
+    g1h->set_par_threads();
+    g1h->workers()->run_task(&task);
+    g1h->set_par_threads(0);
+  } else {
+    task.work(0);
+  }
+
+  double fixup_time_ms = (os::elapsedTime() - fixup_start) * 1000.0;
+  g1p->phase_times()->record_string_dedup_fixup_time(fixup_time_ms);
+  g1p->phase_times()->note_string_dedup_fixup_end();
+}
+
+void G1StringDedup::threads_do(ThreadClosure* tc) {
+  assert(is_enabled(), "String deduplication not enabled");
+  tc->do_thread(G1StringDedupThread::thread());
+}
+
+void G1StringDedup::print_worker_threads_on(outputStream* st) {
+  assert(is_enabled(), "String deduplication not enabled");
+  G1StringDedupThread::thread()->print_on(st);
+  st->cr();
+}
+
+void G1StringDedup::verify() {
+  assert(is_enabled(), "String deduplication not enabled");
+  G1StringDedupQueue::verify();
+  G1StringDedupTable::verify();
+}
+
+G1StringDedupUnlinkOrOopsDoClosure::G1StringDedupUnlinkOrOopsDoClosure(BoolObjectClosure* is_alive,
+                                                                       OopClosure* keep_alive,
+                                                                       bool allow_resize_and_rehash) :
+  _is_alive(is_alive),
+  _keep_alive(keep_alive),
+  _resized_table(NULL),
+  _rehashed_table(NULL),
+  _next_queue(0),
+  _next_bucket(0) {
+  if (allow_resize_and_rehash) {
+    // If both a resize and a rehash are needed, only do the resize. A rehash of
+    // the table will eventually happen if the situation persists.
+    _resized_table = G1StringDedupTable::prepare_resize();
+    if (!is_resizing()) {
+      _rehashed_table = G1StringDedupTable::prepare_rehash();
+    }
+  }
+}
+
+G1StringDedupUnlinkOrOopsDoClosure::~G1StringDedupUnlinkOrOopsDoClosure() {
+  assert(!is_resizing() || !is_rehashing(), "Can not both resize and rehash");
+  if (is_resizing()) {
+    G1StringDedupTable::finish_resize(_resized_table);
+  } else if (is_rehashing()) {
+    G1StringDedupTable::finish_rehash(_rehashed_table);
+  }
+}
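 
The candidate tests in the file above reduce to a small predicate: a String evacuated out of a young region is enqueued either when it stays in young and its age is exactly StringDeduplicationAgeThreshold (so it is enqueued once, at the threshold), or when it is promoted to old before reaching that threshold (so it would otherwise never get the chance). A standalone sketch of that predicate; the threshold value and the names kAgeThreshold and is_dedup_candidate are illustrative, not HotSpot API.

    // Standalone sketch of the logic in G1StringDedup::is_candidate_from_evacuation().
    #include <assert.h>

    static const unsigned kAgeThreshold = 3;  // stand-in for StringDeduplicationAgeThreshold

    bool is_dedup_candidate(bool is_string, bool from_young, bool to_young, unsigned age) {
      if (!is_string || !from_young) {
        return false;                    // only Strings leaving a young region are considered
      }
      if (to_young) {
        return age == kAgeThreshold;     // surviving in young: enqueue exactly once, at the threshold
      }
      return age < kAgeThreshold;        // promoted to old early: enqueue now, it never reached the threshold
    }

    int main() {
      assert(is_dedup_candidate(true, true, true, kAgeThreshold));       // young->young at threshold
      assert(!is_dedup_candidate(true, true, true, kAgeThreshold - 1));  // young->young below threshold
      assert(is_dedup_candidate(true, true, false, kAgeThreshold - 1));  // early promotion to old
      assert(!is_dedup_candidate(false, true, true, kAgeThreshold));     // not a String
      return 0;
    }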
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/vm/gc_implementation/g1/g1StringDedup.hpp	Tue Mar 25 17:07:36 2014 -0700
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.