changeset 7236:5fc21235d0fd

8049737: Contended Locking reorder and cache line bucket Summary: JEP-143/JDK-8046133 - optimization #1 - reorder and cache line bucket. Reviewed-by: shade, dice, dholmes, dsimms Contributed-by: dave.dice@oracle.com, karen.kinnear@oracle.com, daniel.daugherty@oracle.com
author dcubed
date Tue, 14 Oct 2014 10:32:12 -0700
parents 763abe04c848
children e859b5ba6be4
files agent/src/share/classes/sun/jvm/hotspot/runtime/ObjectMonitor.java agent/src/share/classes/sun/jvm/hotspot/runtime/ObjectSynchronizer.java src/cpu/sparc/vm/globalDefinitions_sparc.hpp src/cpu/x86/vm/globalDefinitions_x86.hpp src/share/vm/memory/padded.hpp src/share/vm/prims/jvmtiEnvBase.cpp src/share/vm/runtime/objectMonitor.cpp src/share/vm/runtime/objectMonitor.hpp src/share/vm/runtime/objectMonitor.inline.hpp src/share/vm/runtime/synchronizer.cpp src/share/vm/runtime/synchronizer.hpp src/share/vm/runtime/vmStructs.cpp src/share/vm/utilities/globalDefinitions.hpp
diffstat 13 files changed, 302 insertions(+), 247 deletions(-) [+]
line wrap: on
line diff
--- a/agent/src/share/classes/sun/jvm/hotspot/runtime/ObjectMonitor.java	Mon Oct 13 22:11:39 2014 +0200
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/ObjectMonitor.java	Tue Oct 14 10:32:12 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -50,8 +50,8 @@
     ownerFieldOffset = f.getOffset();
     f = type.getField("FreeNext");
     FreeNextFieldOffset = f.getOffset();
-    countField  = type.getCIntegerField("_count");
-    waitersField = type.getCIntegerField("_waiters");
+    countField  = type.getJIntField("_count");
+    waitersField = type.getJIntField("_waiters");
     recursionsField = type.getCIntegerField("_recursions");
   }
 
@@ -81,15 +81,15 @@
   // FIXME
   //  void      set_owner(void* owner);
 
-  public long    waiters() { return waitersField.getValue(addr); }
+  public int    waiters() { return waitersField.getValue(addr); }
 
   public Address freeNext() { return addr.getAddressAt(FreeNextFieldOffset); }
   // FIXME
   //  void      set_queue(void* owner);
 
-  public long count() { return countField.getValue(addr); }
+  public int count() { return countField.getValue(addr); }
   // FIXME
-  //  void      set_count(intptr_t count);
+  //  void      set_count(int count);
 
   public long recursions() { return recursionsField.getValue(addr); }
 
@@ -97,18 +97,9 @@
     return addr.getOopHandleAt(objectFieldOffset);
   }
 
-  public long contentions() {
-      // refer to objectMonitor_xxx.inline.hpp - contentions definition.
-      // for Solaris and Linux, contentions is same as count. for Windows
-      // it is different (objectMonitor_win32.inline.hpp)
-      long count = count();
-      if (VM.getVM().getOS().equals("win32")) {
-          // don't count the owner of the monitor
-          return count > 0? count - 1 : 0;
-      } else {
-          // Solaris and Linux
-          return count;
-      }
+  // contentions is always equal to count
+  public int contentions() {
+      return count();
   }
 
   // FIXME
@@ -123,8 +114,8 @@
   private static long          objectFieldOffset;
   private static long          ownerFieldOffset;
   private static long          FreeNextFieldOffset;
-  private static CIntegerField countField;
-  private static CIntegerField waitersField;
+  private static JIntField     countField;
+  private static JIntField     waitersField;
   private static CIntegerField recursionsField;
   // FIXME: expose platform-dependent stuff
 }
--- a/agent/src/share/classes/sun/jvm/hotspot/runtime/ObjectSynchronizer.java	Mon Oct 13 22:11:39 2014 +0200
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/ObjectSynchronizer.java	Tue Oct 14 10:32:12 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2007, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -48,9 +48,17 @@
       blockListField = type.getAddressField("gBlockList");
       gBlockListAddr = blockListField.getValue();
       blockSize = db.lookupIntConstant("ObjectSynchronizer::_BLOCKSIZE").intValue();
+      defaultCacheLineSize = db.lookupIntConstant("DEFAULT_CACHE_LINE_SIZE").intValue();
     } catch (RuntimeException e) { }
     type = db.lookupType("ObjectMonitor");
     objectMonitorTypeSize = type.getSize();
+    if ((objectMonitorTypeSize % defaultCacheLineSize) != 0) {
+      // sizeof(ObjectMonitor) is not already a multiple of a cache line.
+      // The ObjectMonitor allocation code in ObjectSynchronizer pads each
+      // ObjectMonitor in a block to the next cache line boundary.
+      int needLines = ((int)objectMonitorTypeSize / defaultCacheLineSize) + 1;
+      objectMonitorTypeSize = needLines * defaultCacheLineSize;
+    }
   }
 
   public long identityHashValueFor(Oop obj) {
@@ -122,6 +130,7 @@
 
   private static Address gBlockListAddr;
   private static int blockSize;
+  private static int defaultCacheLineSize;
   private static long objectMonitorTypeSize;
 
 }
--- a/src/cpu/sparc/vm/globalDefinitions_sparc.hpp	Mon Oct 13 22:11:39 2014 +0200
+++ b/src/cpu/sparc/vm/globalDefinitions_sparc.hpp	Tue Oct 14 10:32:12 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -38,4 +38,26 @@
 
 #define SUPPORTS_NATIVE_CX8
 
+// The expected size in bytes of a cache line, used to pad data structures.
+#if defined(TIERED)
+  #ifdef _LP64
+    // tiered, 64-bit, large machine
+    #define DEFAULT_CACHE_LINE_SIZE 128
+  #else
+    // tiered, 32-bit, medium machine
+    #define DEFAULT_CACHE_LINE_SIZE 64
+  #endif
+#elif defined(COMPILER1)
+  // pure C1, 32-bit, small machine
+  #define DEFAULT_CACHE_LINE_SIZE 16
+#elif defined(COMPILER2) || defined(SHARK)
+  #ifdef _LP64
+    // pure C2, 64-bit, large machine
+    #define DEFAULT_CACHE_LINE_SIZE 128
+  #else
+    // pure C2, 32-bit, medium machine
+    #define DEFAULT_CACHE_LINE_SIZE 64
+  #endif
+#endif
+
 #endif // CPU_SPARC_VM_GLOBALDEFINITIONS_SPARC_HPP
--- a/src/cpu/x86/vm/globalDefinitions_x86.hpp	Mon Oct 13 22:11:39 2014 +0200
+++ b/src/cpu/x86/vm/globalDefinitions_x86.hpp	Tue Oct 14 10:32:12 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -35,4 +35,27 @@
 
 #define SUPPORTS_NATIVE_CX8
 
+// The expected size in bytes of a cache line, used to pad data structures.
+#if defined(TIERED)
+  #ifdef _LP64
+    // tiered, 64-bit, large machine
+    #define DEFAULT_CACHE_LINE_SIZE 128
+  #else
+    // tiered, 32-bit, medium machine
+    #define DEFAULT_CACHE_LINE_SIZE 64
+  #endif
+#elif defined(COMPILER1)
+  // pure C1, 32-bit, small machine
+  // i486 was the last Intel chip with 16-byte cache line size
+  #define DEFAULT_CACHE_LINE_SIZE 32
+#elif defined(COMPILER2) || defined(SHARK)
+  #ifdef _LP64
+    // pure C2, 64-bit, large machine
+    #define DEFAULT_CACHE_LINE_SIZE 128
+  #else
+    // pure C2, 32-bit, medium machine
+    #define DEFAULT_CACHE_LINE_SIZE 64
+  #endif
+#endif
+
 #endif // CPU_X86_VM_GLOBALDEFINITIONS_X86_HPP
--- a/src/share/vm/memory/padded.hpp	Mon Oct 13 22:11:39 2014 +0200
+++ b/src/share/vm/memory/padded.hpp	Tue Oct 14 10:32:12 2014 -0700
@@ -76,10 +76,16 @@
 // if the start address is a multiple of alignment.
 template <class T, size_t alignment = DEFAULT_CACHE_LINE_SIZE>
 class PaddedEnd : public PaddedEndImpl<T, PADDED_END_SIZE(T, alignment)> {
-  // C++ don't allow zero-length arrays. The padding is put in a
+  // C++ doesn't allow zero-length arrays. The padding is put in a
   // super class that is specialized for the pad_size == 0 case.
 };
 
+// Similar to PaddedEnd, this macro defines a _pad_buf#id field
+// that is (alignment - size) bytes in size. This macro is used
+// to add padding in between non-class fields in a class or struct.
+#define DEFINE_PAD_MINUS_SIZE(id, alignment, size) \
+          char _pad_buf##id[(alignment) - (size)]
+
 // Helper class to create an array of PaddedEnd<T> objects. All elements will
 // start at a multiple of alignment and the size will be aligned to alignment.
 template <class T, MEMFLAGS flags, size_t alignment = DEFAULT_CACHE_LINE_SIZE>
--- a/src/share/vm/prims/jvmtiEnvBase.cpp	Mon Oct 13 22:11:39 2014 +0200
+++ b/src/share/vm/prims/jvmtiEnvBase.cpp	Tue Oct 14 10:32:12 2014 -0700
@@ -1031,7 +1031,7 @@
     // implied else: entry_count == 0
   }
 
-  int nWant,nWait;
+  jint nWant, nWait;
   if (mon != NULL) {
     // this object has a heavyweight monitor
     nWant = mon->contentions(); // # of threads contending for monitor
--- a/src/share/vm/runtime/objectMonitor.cpp	Mon Oct 13 22:11:39 2014 +0200
+++ b/src/share/vm/runtime/objectMonitor.cpp	Tue Oct 14 10:32:12 2014 -0700
@@ -257,7 +257,6 @@
       assert(_recursions == 0, "internal state error");
       _owner = THREAD;
       _recursions = 1;
-      OwnerIsThread = 1;
       return true;
     }
     if (Atomic::cmpxchg_ptr (THREAD, &_owner, NULL) != NULL) {
@@ -280,7 +279,6 @@
     // Either ASSERT _recursions == 0 or explicitly set _recursions = 0.
     assert(_recursions == 0, "invariant");
     assert(_owner == Self, "invariant");
-    // CONSIDER: set or assert OwnerIsThread == 1
     return;
   }
 
@@ -296,7 +294,6 @@
     // Commute owner from a thread-specific on-stack BasicLockObject address to
     // a full-fledged "Thread *".
     _owner = Self;
-    OwnerIsThread = 1;
     return;
   }
 
@@ -328,7 +325,7 @@
 
   // Prevent deflation at STW-time.  See deflate_idle_monitors() and is_busy().
   // Ensure the object-monitor relationship remains stable while there's contention.
-  Atomic::inc_ptr(&_count);
+  Atomic::inc(&_count);
 
   EventJavaMonitorEnter event;
 
@@ -384,7 +381,7 @@
     // acquire it.
   }
 
-  Atomic::dec_ptr(&_count);
+  Atomic::dec(&_count);
   assert(_count >= 0, "invariant");
   Self->_Stalled = 0;
 
@@ -440,7 +437,6 @@
     // Either guarantee _recursions == 0 or set _recursions = 0.
     assert(_recursions == 0, "invariant");
     assert(_owner == Self, "invariant");
-    // CONSIDER: set or assert that OwnerIsThread == 1
     return 1;
   }
   // The lock had been free momentarily, but we lost the race to the lock.
@@ -922,7 +918,6 @@
       assert(_recursions == 0, "invariant");
       _owner = THREAD;
       _recursions = 0;
-      OwnerIsThread = 1;
     } else {
       // Apparent unbalanced locking ...
       // Naively we'd like to throw IllegalMonitorStateException.
@@ -1346,7 +1341,6 @@
       assert(_recursions == 0, "internal state error");
       _owner = THREAD;   // Convert from basiclock addr to Thread addr
       _recursions = 0;
-      OwnerIsThread = 1;
     }
   }
 
@@ -1385,7 +1379,6 @@
       if (THREAD->is_lock_owned((address) _owner)) {                        \
         _owner = THREAD;  /* Convert from basiclock addr to Thread addr */  \
         _recursions = 0;                                                    \
-        OwnerIsThread = 1;                                                  \
       } else {                                                              \
         TEVENT(Throw IMSX);                                                 \
         THROW(vmSymbols::java_lang_IllegalMonitorStateException());         \
@@ -1906,8 +1899,8 @@
 // a contending thread could enqueue itself on the cxq and then spin locally
 // on a thread-specific variable such as its ParkEvent._Event flag.
 // That's left as an exercise for the reader.  Note that global spinning is
-// not problematic on Niagara, as the L2$ serves the interconnect and has both
-// low latency and massive bandwidth.
+// not problematic on Niagara, as the L2 cache serves the interconnect and
+// has both low latency and massive bandwidth.
 //
 // Broadly, we can fix the spin frequency -- that is, the % of contended lock
 // acquisition attempts where we opt to spin --  at 100% and vary the spin count
@@ -2208,7 +2201,7 @@
 // as advisory.
 //
 // Beware too, that _owner is sometimes a BasicLock address and sometimes
-// a thread pointer.  We differentiate the two cases with OwnerIsThread.
+// a thread pointer.
 // Alternately, we might tag the type (thread pointer vs basiclock pointer)
 // with the LSB of _owner.  Another option would be to probablistically probe
 // the putative _owner->TypeTag value.
@@ -2230,9 +2223,7 @@
 
 
 int ObjectMonitor::NotRunnable(Thread * Self, Thread * ox) {
-  // Check either OwnerIsThread or ox->TypeTag == 2BAD.
-  if (!OwnerIsThread) return 0;
-
+  // Check ox->TypeTag == 2BAD.
   if (ox == NULL) return 0;
 
   // Avoid transitive spinning ...
@@ -2399,20 +2390,6 @@
   }
 }
 
-
-// Compile-time asserts
-// When possible, it's better to catch errors deterministically at
-// compile-time than at runtime.  The down-side to using compile-time
-// asserts is that error message -- often something about negative array
-// indices -- is opaque.
-
-#define CTASSERT(x) { int tag[1-(2*!(x))]; printf ("Tag @" INTPTR_FORMAT "\n", (intptr_t)tag); }
-
-void ObjectMonitor::ctAsserts() {
-  CTASSERT(offset_of (ObjectMonitor, _header) == 0);
-}
-
-
 static char * kvGet(char * kvList, const char * Key) {
   if (kvList == NULL) return NULL;
   size_t n = strlen(Key);
@@ -2526,6 +2503,8 @@
   if (verbose) {
     tty->print_cr("INFO: sizeof(ObjectMonitor)=" SIZE_FORMAT,
                   sizeof(ObjectMonitor));
+    tty->print_cr("INFO: sizeof(PaddedEnd<ObjectMonitor>)=" SIZE_FORMAT,
+                  sizeof(PaddedEnd<ObjectMonitor>));
   }
 
   uint cache_line_size = VM_Version::L1_data_cache_line_size();
@@ -2559,9 +2538,9 @@
       warning_cnt++;
     }
 
-    if ((sizeof(ObjectMonitor) % cache_line_size) != 0) {
-      tty->print_cr("WARNING: ObjectMonitor size is not a multiple of "
-                    "a cache line which permits false sharing.");
+    if ((sizeof(PaddedEnd<ObjectMonitor>) % cache_line_size) != 0) {
+      tty->print_cr("WARNING: PaddedEnd<ObjectMonitor> size is not a "
+                    "multiple of a cache line which permits false sharing.");
       warning_cnt++;
     }
   }
--- a/src/share/vm/runtime/objectMonitor.hpp	Mon Oct 13 22:11:39 2014 +0200
+++ b/src/share/vm/runtime/objectMonitor.hpp	Tue Oct 14 10:32:12 2014 -0700
@@ -25,6 +25,7 @@
 #ifndef SHARE_VM_RUNTIME_OBJECTMONITOR_HPP
 #define SHARE_VM_RUNTIME_OBJECTMONITOR_HPP
 
+#include "memory/padded.hpp"
 #include "runtime/os.hpp"
 #include "runtime/park.hpp"
 #include "runtime/perfData.hpp"
@@ -58,21 +59,71 @@
 // forward declaration to avoid include tracing.hpp
 class EventJavaMonitorWait;
 
-// WARNING:
-//   This is a very sensitive and fragile class. DO NOT make any
-// change unless you are fully aware of the underlying semantics.
-
-//   This class can not inherit from any other class, because I have
-// to let the displaced header be the very first word. Otherwise I
-// have to let markOop include this file, which would export the
-// monitor data structure to everywhere.
+// The ObjectMonitor class implements the heavyweight version of a
+// JavaMonitor. The lightweight BasicLock/stack lock version has been
+// inflated into an ObjectMonitor. This inflation is typically due to
+// contention or use of Object.wait().
 //
-// The ObjectMonitor class is used to implement JavaMonitors which have
-// transformed from the lightweight structure of the thread stack to a
-// heavy weight lock due to contention
-
-// It is also used as RawMonitor by the JVMTI
-
+// WARNING: This is a very sensitive and fragile class. DO NOT make any
+// changes unless you are fully aware of the underlying semantics.
+//
+// Class JvmtiRawMonitor currently inherits from ObjectMonitor so
+// changes in this class must be careful to not break JvmtiRawMonitor.
+// These two subsystems should be separated.
+//
+// ObjectMonitor Layout Overview/Highlights/Restrictions:
+//
+// - The _header field must be at offset 0 because the displaced header
+//   from markOop is stored there. We do not want markOop.hpp to include
+//   ObjectMonitor.hpp to avoid exposing ObjectMonitor everywhere. This
+//   means that ObjectMonitor cannot inherit from any other class nor can
+//   it use any virtual member functions. This restriction is critical to
+//   the proper functioning of the VM.
+// - The _header and _owner fields should be separated by enough space
+//   to avoid false sharing due to parallel access by different threads.
+//   This is an advisory recommendation.
+// - The general layout of the fields in ObjectMonitor is:
+//     _header
+//     <lightly_used_fields>
+//     <optional padding>
+//     _owner
+//     <remaining_fields>
+// - The VM assumes write ordering and machine word alignment with
+//   respect to the _owner field and the <remaining_fields> that can
+//   be read in parallel by other threads.
+// - Generally fields that are accessed closely together in time should
+//   be placed proximally in space to promote data cache locality. That
+//   is, temporal locality should condition spatial locality.
+// - We have to balance avoiding false sharing with excessive invalidation
+//   from coherence traffic. As such, we try to cluster fields that tend
+//   to be _written_ at approximately the same time onto the same data
+//   cache line.
+// - We also have to balance the natural tension between minimizing
+//   single threaded capacity misses with excessive multi-threaded
+//   coherency misses. There is no single optimal layout for both
+//   single-threaded and multi-threaded environments.
+//
+// - See ObjectMonitor::sanity_checks() for how critical restrictions are
+//   enforced and advisory recommendations are reported.
+// - Adjacent ObjectMonitors should be separated by enough space to avoid
+//   false sharing. This is handled by the ObjectMonitor allocation code
+//   in synchronizer.cpp. Also see ObjectSynchronizer::sanity_checks().
+//
+// Futures notes:
+//   - Separating _owner from the <remaining_fields> by enough space to
+//     avoid false sharing might be profitable. Given
+//     http://blogs.oracle.com/dave/entry/cas_and_cache_trivia_invalidate
+//     we know that the CAS in monitorenter will invalidate the line
+//     underlying _owner. We want to avoid an L1 data cache miss on that
+//     same line for monitorexit. Putting these <remaining_fields>:
+//     _recursions, _EntryList, _cxq, and _succ, all of which may be
+//     fetched in the inflated unlock path, on a different cache line
+//     would make them immune to CAS-based invalidation from the _owner
+//     field.
+//
+//   - The _recursions field should be of type int, or int32_t but not
+//     intptr_t. There's no reason to use a 64-bit type for this field
+//     in a 64-bit JVM.
 
 class ObjectMonitor {
  public:
@@ -84,7 +135,84 @@
     OM_TIMED_OUT              // Object.wait() timed out
   };
 
+ private:
+  friend class ObjectSynchronizer;
+  friend class ObjectWaiter;
+  friend class VMStructs;
+
+  volatile markOop   _header;       // displaced object header word - mark
+  void*     volatile _object;       // backward object pointer - strong root
  public:
+  ObjectMonitor *    FreeNext;      // Free list linkage
+ private:
+  DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE,
+                        sizeof(volatile markOop) + sizeof(void * volatile) +
+                        sizeof(ObjectMonitor *));
+ protected:                         // protected for JvmtiRawMonitor
+  void *  volatile _owner;          // pointer to owning thread OR BasicLock
+  volatile jlong _previous_owner_tid;  // thread id of the previous owner of the monitor
+  volatile intptr_t  _recursions;   // recursion count, 0 for first entry
+  ObjectWaiter * volatile _EntryList; // Threads blocked on entry or reentry.
+                                      // The list is actually composed of WaitNodes,
+                                      // acting as proxies for Threads.
+ private:
+  ObjectWaiter * volatile _cxq;     // LL of recently-arrived threads blocked on entry.
+  Thread * volatile _succ;          // Heir presumptive thread - used for futile wakeup throttling
+  Thread * volatile _Responsible;
+
+  volatile int _Spinner;            // for exit->spinner handoff optimization
+  volatile int _SpinFreq;           // Spin 1-out-of-N attempts: success rate
+  volatile int _SpinClock;
+  volatile intptr_t _SpinState;     // MCS/CLH list of spinners
+  volatile int _SpinDuration;
+
+  volatile jint  _count;            // reference count to prevent reclamation/deflation
+                                    // at stop-the-world time.  See deflate_idle_monitors().
+                                    // _count is approximately |_WaitSet| + |_EntryList|
+ protected:
+  ObjectWaiter * volatile _WaitSet; // LL of threads wait()ing on the monitor
+  volatile jint  _waiters;          // number of waiting threads
+ private:
+  volatile int _WaitSetLock;        // protects Wait Queue - simple spinlock
+
+ public:
+  static void Initialize();
+  static PerfCounter * _sync_ContendedLockAttempts;
+  static PerfCounter * _sync_FutileWakeups;
+  static PerfCounter * _sync_Parks;
+  static PerfCounter * _sync_EmptyNotifications;
+  static PerfCounter * _sync_Notifications;
+  static PerfCounter * _sync_SlowEnter;
+  static PerfCounter * _sync_SlowExit;
+  static PerfCounter * _sync_SlowNotify;
+  static PerfCounter * _sync_SlowNotifyAll;
+  static PerfCounter * _sync_FailedSpins;
+  static PerfCounter * _sync_SuccessfulSpins;
+  static PerfCounter * _sync_PrivateA;
+  static PerfCounter * _sync_PrivateB;
+  static PerfCounter * _sync_MonInCirculation;
+  static PerfCounter * _sync_MonScavenged;
+  static PerfCounter * _sync_Inflations;
+  static PerfCounter * _sync_Deflations;
+  static PerfLongVariable * _sync_MonExtant;
+
+  static int Knob_Verbose;
+  static int Knob_VerifyInUse;
+  static int Knob_SpinLimit;
+
+  void* operator new (size_t size) throw() {
+    return AllocateHeap(size, mtInternal);
+  }
+  void* operator new[] (size_t size) throw() {
+    return operator new (size);
+  }
+  void operator delete(void* p) {
+    FreeHeap(p, mtInternal);
+  }
+  void operator delete[] (void *p) {
+    operator delete(p);
+  }
+
   // TODO-FIXME: the "offset" routines should return a type of off_t instead of int ...
   // ByteSize would also be an appropriate type.
   static int header_offset_in_bytes()      { return offset_of(ObjectMonitor, _header); }
@@ -100,14 +228,11 @@
   static int Responsible_offset_in_bytes() { return offset_of(ObjectMonitor, _Responsible); }
   static int Spinner_offset_in_bytes()     { return offset_of(ObjectMonitor, _Spinner); }
 
- public:
   // Eventually we'll make provisions for multiple callbacks, but
   // now one will suffice.
   static int (*SpinCallbackFunction)(intptr_t, int);
   static intptr_t SpinCallbackArgument;
 
-
- public:
   markOop   header() const;
   void      set_header(markOop hdr);
 
@@ -123,39 +248,22 @@
   void*     owner() const;
   void      set_owner(void* owner);
 
-  intptr_t  waiters() const;
+  jint      waiters() const;
 
-  intptr_t  count() const;
-  void      set_count(intptr_t count);
-  intptr_t  contentions() const;
+  jint      count() const;
+  void      set_count(jint count);
+  jint      contentions() const;
   intptr_t  recursions() const                                         { return _recursions; }
 
-  // JVM/DI GetMonitorInfo() needs this
+  // JVM/TI GetObjectMonitorUsage() needs this:
   ObjectWaiter* first_waiter()                                         { return _WaitSet; }
   ObjectWaiter* next_waiter(ObjectWaiter* o)                           { return o->_next; }
   Thread* thread_of_waiter(ObjectWaiter* o)                            { return o->_thread; }
 
-  // initialize the monitor, exception the semaphore, all other fields
-  // are simple integers or pointers
-  ObjectMonitor() {
-    _header       = NULL;
-    _count        = 0;
-    _waiters      = 0;
-    _recursions   = 0;
-    _object       = NULL;
-    _owner        = NULL;
-    _WaitSet      = NULL;
-    _WaitSetLock  = 0;
-    _Responsible  = NULL;
-    _succ         = NULL;
-    _cxq          = NULL;
-    FreeNext      = NULL;
-    _EntryList    = NULL;
-    _SpinFreq     = 0;
-    _SpinClock    = 0;
-    OwnerIsThread = 0;
-    _previous_owner_tid = 0;
-  }
+ protected:
+  // We don't typically expect or want the ctors or dtors to run.
+  // normal ObjectMonitors are type-stable and immortal.
+  ObjectMonitor() { ::memset((void *)this, 0, sizeof(*this)); }
 
   ~ObjectMonitor() {
     // TODO: Add asserts ...
@@ -169,7 +277,7 @@
     // _cxq == 0 _succ == NULL _owner == NULL _waiters == 0
     // _count == 0 EntryList  == NULL
     // _recursions == 0 _WaitSet == NULL
-    // TODO: assert (is_busy()|_recursions) == 0
+    assert(((is_busy()|_recursions) == 0), "freeing inuse monitor");
     _succ          = NULL;
     _EntryList     = NULL;
     _cxq           = NULL;
@@ -177,7 +285,6 @@
     _recursions    = 0;
     _SpinFreq      = 0;
     _SpinClock     = 0;
-    OwnerIsThread  = 0;
   }
 
  public:
@@ -221,7 +328,6 @@
   int       TrySpin_Fixed(Thread * Self);
   int       TrySpin_VaryFrequency(Thread * Self);
   int       TrySpin_VaryDuration(Thread * Self);
-  void      ctAsserts();
   void      ExitEpilog(Thread * Self, ObjectWaiter * Wakee);
   bool      ExitSuspendEquivalent(JavaThread * Self);
   void      post_monitor_wait_event(EventJavaMonitorWait * event,
@@ -229,102 +335,6 @@
                                     jlong timeout,
                                     bool timedout);
 
- private:
-  friend class ObjectSynchronizer;
-  friend class ObjectWaiter;
-  friend class VMStructs;
-
-  // WARNING: this must be the very first word of ObjectMonitor
-  // This means this class can't use any virtual member functions.
-
-  volatile markOop   _header;       // displaced object header word - mark
-  void*     volatile _object;       // backward object pointer - strong root
-
-  double SharingPad[1];             // temp to reduce false sharing
-
-  // All the following fields must be machine word aligned
-  // The VM assumes write ordering wrt these fields, which can be
-  // read from other threads.
-
- protected:                         // protected for jvmtiRawMonitor
-  void *  volatile _owner;          // pointer to owning thread OR BasicLock
-  volatile jlong _previous_owner_tid;  // thread id of the previous owner of the monitor
-  volatile intptr_t  _recursions;   // recursion count, 0 for first entry
- private:
-  int OwnerIsThread;                // _owner is (Thread *) vs SP/BasicLock
-  ObjectWaiter * volatile _cxq;     // LL of recently-arrived threads blocked on entry.
-                                    // The list is actually composed of WaitNodes, acting
-                                    // as proxies for Threads.
- protected:
-  ObjectWaiter * volatile _EntryList;  // Threads blocked on entry or reentry.
- private:
-  Thread * volatile _succ;          // Heir presumptive thread - used for futile wakeup throttling
-  Thread * volatile _Responsible;
-  int _PromptDrain;                 // rqst to drain cxq into EntryList ASAP
-
-  volatile int _Spinner;            // for exit->spinner handoff optimization
-  volatile int _SpinFreq;           // Spin 1-out-of-N attempts: success rate
-  volatile int _SpinClock;
-  volatile int _SpinDuration;
-  volatile intptr_t _SpinState;     // MCS/CLH list of spinners
-
-  // TODO-FIXME: _count, _waiters and _recursions should be of
-  // type int, or int32_t but not intptr_t.  There's no reason
-  // to use 64-bit fields for these variables on a 64-bit JVM.
-
-  volatile intptr_t  _count;        // reference count to prevent reclamation/deflation
-                                    // at stop-the-world time.  See deflate_idle_monitors().
-                                    // _count is approximately |_WaitSet| + |_EntryList|
- protected:
-  volatile intptr_t  _waiters;      // number of waiting threads
- private:
- protected:
-  ObjectWaiter * volatile _WaitSet; // LL of threads wait()ing on the monitor
- private:
-  volatile int _WaitSetLock;        // protects Wait Queue - simple spinlock
-
- public:
-  int _QMix;                        // Mixed prepend queue discipline
-  ObjectMonitor * FreeNext;         // Free list linkage
-  intptr_t StatA, StatsB;
-
- public:
-  static void Initialize();
-  static PerfCounter * _sync_ContendedLockAttempts;
-  static PerfCounter * _sync_FutileWakeups;
-  static PerfCounter * _sync_Parks;
-  static PerfCounter * _sync_EmptyNotifications;
-  static PerfCounter * _sync_Notifications;
-  static PerfCounter * _sync_SlowEnter;
-  static PerfCounter * _sync_SlowExit;
-  static PerfCounter * _sync_SlowNotify;
-  static PerfCounter * _sync_SlowNotifyAll;
-  static PerfCounter * _sync_FailedSpins;
-  static PerfCounter * _sync_SuccessfulSpins;
-  static PerfCounter * _sync_PrivateA;
-  static PerfCounter * _sync_PrivateB;
-  static PerfCounter * _sync_MonInCirculation;
-  static PerfCounter * _sync_MonScavenged;
-  static PerfCounter * _sync_Inflations;
-  static PerfCounter * _sync_Deflations;
-  static PerfLongVariable * _sync_MonExtant;
-
- public:
-  static int Knob_Verbose;
-  static int Knob_VerifyInUse;
-  static int Knob_SpinLimit;
-  void* operator new (size_t size) throw() {
-    return AllocateHeap(size, mtInternal);
-  }
-  void* operator new[] (size_t size) throw() {
-    return operator new (size);
-  }
-  void operator delete(void* p) {
-    FreeHeap(p, mtInternal);
-  }
-  void operator delete[] (void *p) {
-    operator delete(p);
-  }
 };
 
 #undef TEVENT
--- a/src/share/vm/runtime/objectMonitor.inline.hpp	Mon Oct 13 22:11:39 2014 +0200
+++ b/src/share/vm/runtime/objectMonitor.inline.hpp	Tue Oct 14 10:32:12 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -40,15 +40,11 @@
   _header = hdr;
 }
 
-inline intptr_t ObjectMonitor::count() const {
+inline jint ObjectMonitor::count() const {
   return _count;
 }
 
-inline void ObjectMonitor::set_count(intptr_t count) {
-  _count= count;
-}
-
-inline intptr_t ObjectMonitor::waiters() const {
+inline jint ObjectMonitor::waiters() const {
   return _waiters;
 }
 
@@ -61,7 +57,7 @@
   assert(_count == 0, "Fatal logic error in ObjectMonitor count!");
   assert(_waiters == 0, "Fatal logic error in ObjectMonitor waiters!");
   assert(_recursions == 0, "Fatal logic error in ObjectMonitor recursions!");
-  assert(_object, "Fatal logic error in ObjectMonitor object!");
+  assert(_object != NULL, "Fatal logic error in ObjectMonitor object!");
   assert(_owner == 0, "Fatal logic error in ObjectMonitor owner!");
 
   _header = NULL;
@@ -85,7 +81,6 @@
   if (THREAD != _owner) {
     if (THREAD->is_lock_owned((address) _owner)) {
       _owner = THREAD;  // regain ownership of inflated monitor
-      OwnerIsThread = 1 ;
       assert (_recursions == 0, "invariant") ;
     } else {
       check_slow(THREAD);
@@ -97,7 +92,7 @@
 
 
 // return number of threads contending for this monitor
-inline intptr_t ObjectMonitor::contentions() const {
+inline jint ObjectMonitor::contentions() const {
   return _count;
 }
 
--- a/src/share/vm/runtime/synchronizer.cpp	Mon Oct 13 22:11:39 2014 +0200
+++ b/src/share/vm/runtime/synchronizer.cpp	Tue Oct 14 10:32:12 2014 -0700
@@ -24,6 +24,7 @@
 
 #include "precompiled.hpp"
 #include "classfile/vmSymbols.hpp"
+#include "memory/padded.hpp"
 #include "memory/resourceArea.hpp"
 #include "oops/markOop.hpp"
 #include "oops/oop.inline.hpp"
@@ -110,6 +111,8 @@
 #define NINFLATIONLOCKS 256
 static volatile intptr_t InflationLocks[NINFLATIONLOCKS];
 
+// gBlockList is really PaddedEnd<ObjectMonitor> *, but we don't
+// want to expose the PaddedEnd template more than necessary.
 ObjectMonitor * ObjectSynchronizer::gBlockList = NULL;
 ObjectMonitor * volatile ObjectSynchronizer::gFreeList  = NULL;
 ObjectMonitor * volatile ObjectSynchronizer::gOmInUseList  = NULL;
@@ -410,16 +413,15 @@
 // performed by the CPU(s) or platform.
 
 struct SharedGlobals {
+  char         _pad_prefix[DEFAULT_CACHE_LINE_SIZE];
   // These are highly shared mostly-read variables.
-  // To avoid false-sharing they need to be the sole occupants of a $ line.
-  double padPrefix[8];
+  // To avoid false-sharing they need to be the sole occupants of a cache line.
   volatile int stwRandom;
   volatile int stwCycle;
-
-  // Hot RW variables -- Sequester to avoid false-sharing
-  double padSuffix[16];
+  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile int) * 2);
+  // Hot RW variable -- Sequester to avoid false-sharing
   volatile int hcSequence;
-  double padFinal[8];
+  DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile int));
 };
 
 static SharedGlobals GVars;
@@ -780,18 +782,18 @@
 // Visitors ...
 
 void ObjectSynchronizer::monitors_iterate(MonitorClosure* closure) {
-  ObjectMonitor* block = gBlockList;
+  PaddedEnd<ObjectMonitor> * block = (PaddedEnd<ObjectMonitor> *)gBlockList;
   ObjectMonitor* mid;
   while (block) {
     assert(block->object() == CHAINMARKER, "must be a block header");
     for (int i = _BLOCKSIZE - 1; i > 0; i--) {
-      mid = block + i;
+      mid = (ObjectMonitor *)(block + i);
       oop object = (oop) mid->object();
       if (object != NULL) {
         closure->do_monitor(mid);
       }
     }
-    block = (ObjectMonitor*) block->FreeNext;
+    block = (PaddedEnd<ObjectMonitor> *) block->FreeNext;
   }
 }
 
@@ -806,10 +808,12 @@
 
 void ObjectSynchronizer::oops_do(OopClosure* f) {
   assert(SafepointSynchronize::is_at_safepoint(), "must be at safepoint");
-  for (ObjectMonitor* block = gBlockList; block != NULL; block = next(block)) {
+  for (PaddedEnd<ObjectMonitor> * block =
+       (PaddedEnd<ObjectMonitor> *)gBlockList; block != NULL;
+       block = (PaddedEnd<ObjectMonitor> *)next(block)) {
     assert(block->object() == CHAINMARKER, "must be a block header");
     for (int i = 1; i < _BLOCKSIZE; i++) {
-      ObjectMonitor* mid = &block[i];
+      ObjectMonitor* mid = (ObjectMonitor *)&block[i];
       if (mid->object() != NULL) {
         f->do_oop((oop*)mid->object_addr());
       }
@@ -966,16 +970,29 @@
     // 3: allocate a block of new ObjectMonitors
     // Both the local and global free lists are empty -- resort to malloc().
     // In the current implementation objectMonitors are TSM - immortal.
+    // Ideally, we'd write "new ObjectMonitor[_BLOCKSIZE], but we want
+    // each ObjectMonitor to start at the beginning of a cache line,
+    // so we use align_size_up().
+    // A better solution would be to use C++ placement-new.
+    // BEWARE: As it stands currently, we don't run the ctors!
     assert(_BLOCKSIZE > 1, "invariant");
-    ObjectMonitor * temp = new ObjectMonitor[_BLOCKSIZE];
+    size_t neededsize = sizeof(PaddedEnd<ObjectMonitor>) * _BLOCKSIZE;
+    PaddedEnd<ObjectMonitor> * temp;
+    size_t aligned_size = neededsize + (DEFAULT_CACHE_LINE_SIZE - 1);
+    void* real_malloc_addr = (void *)NEW_C_HEAP_ARRAY(char, aligned_size,
+                                                      mtInternal);
+    temp = (PaddedEnd<ObjectMonitor> *)
+             align_size_up((intptr_t)real_malloc_addr,
+                           DEFAULT_CACHE_LINE_SIZE);
 
     // NOTE: (almost) no way to recover if allocation failed.
     // We might be able to induce a STW safepoint and scavenge enough
     // objectMonitors to permit progress.
     if (temp == NULL) {
-      vm_exit_out_of_memory(sizeof (ObjectMonitor[_BLOCKSIZE]), OOM_MALLOC_ERROR,
+      vm_exit_out_of_memory(neededsize, OOM_MALLOC_ERROR,
                             "Allocate ObjectMonitors");
     }
+    (void)memset((void *) temp, 0, neededsize);
 
     // Format the block.
     // initialize the linked list, each monitor points to its next
@@ -986,7 +1003,7 @@
     // look like: class Block { Block * next; int N; ObjectMonitor Body [N] ; }
 
     for (int i = 1; i < _BLOCKSIZE; i++) {
-      temp[i].FreeNext = &temp[i+1];
+      temp[i].FreeNext = (ObjectMonitor *)&temp[i+1];
     }
 
     // terminate the last monitor as the end of list
@@ -1141,10 +1158,6 @@
 }
 
 
-// Note that we could encounter some performance loss through false-sharing as
-// multiple locks occupy the same $ line.  Padding might be appropriate.
-
-
 ObjectMonitor * NOINLINE ObjectSynchronizer::inflate(Thread * Self,
                                                      oop object) {
   // Inflate mutates the heap ...
@@ -1210,7 +1223,6 @@
       // in which INFLATING appears in the mark.
       m->Recycle();
       m->_Responsible  = NULL;
-      m->OwnerIsThread = 0;
       m->_recursions   = 0;
       m->_SpinDuration = ObjectMonitor::Knob_SpinLimit;   // Consider: maintain by type/class
 
@@ -1257,8 +1269,8 @@
       m->set_header(dmw);
 
       // Optimization: if the mark->locker stack address is associated
-      // with this thread we could simply set m->_owner = Self and
-      // m->OwnerIsThread = 1. Note that a thread can inflate an object
+      // with this thread we could simply set m->_owner = Self.
+      // Note that a thread can inflate an object
       // that it has stack-locked -- as might happen in wait() -- directly
       // with CAS.  That is, we can avoid the xchg-NULL .... ST idiom.
       m->set_owner(mark->locker());
@@ -1302,7 +1314,6 @@
     m->set_header(mark);
     m->set_owner(NULL);
     m->set_object(object);
-    m->OwnerIsThread = 1;
     m->_recursions   = 0;
     m->_Responsible  = NULL;
     m->_SpinDuration = ObjectMonitor::Knob_SpinLimit;       // consider: keep metastats by type/class
@@ -1310,7 +1321,6 @@
     if (Atomic::cmpxchg_ptr (markOopDesc::encode(m), object->mark_addr(), mark) != mark) {
       m->set_object(NULL);
       m->set_owner(NULL);
-      m->OwnerIsThread = 0;
       m->Recycle();
       omRelease(Self, m, true);
       m = NULL;
@@ -1336,9 +1346,6 @@
   }
 }
 
-// Note that we could encounter some performance loss through false-sharing as
-// multiple locks occupy the same $ line.  Padding might be appropriate.
-
 
 // Deflate_idle_monitors() is called at all safepoints, immediately
 // after all mutators are stopped, but before any objects have moved.
@@ -1491,12 +1498,14 @@
       nInuse += gOmInUseCount;
     }
 
-  } else for (ObjectMonitor* block = gBlockList; block != NULL; block = next(block)) {
+  } else for (PaddedEnd<ObjectMonitor> * block =
+              (PaddedEnd<ObjectMonitor> *)gBlockList; block != NULL;
+              block = (PaddedEnd<ObjectMonitor> *)next(block)) {
     // Iterate over all extant monitors - Scavenge all idle monitors.
     assert(block->object() == CHAINMARKER, "must be a block header");
     nInCirculation += _BLOCKSIZE;
     for (int i = 1; i < _BLOCKSIZE; i++) {
-      ObjectMonitor* mid = &block[i];
+      ObjectMonitor* mid = (ObjectMonitor*)&block[i];
       oop obj = (oop) mid->object();
 
       if (obj == NULL) {
@@ -1648,18 +1657,18 @@
 
 // Verify all monitors in the monitor cache, the verification is weak.
 void ObjectSynchronizer::verify() {
-  ObjectMonitor* block = gBlockList;
+  PaddedEnd<ObjectMonitor> * block = (PaddedEnd<ObjectMonitor> *)gBlockList;
   ObjectMonitor* mid;
   while (block) {
     assert(block->object() == CHAINMARKER, "must be a block header");
     for (int i = 1; i < _BLOCKSIZE; i++) {
-      mid = block + i;
+      mid = (ObjectMonitor *)(block + i);
       oop object = (oop) mid->object();
       if (object != NULL) {
         mid->verify();
       }
     }
-    block = (ObjectMonitor*) block->FreeNext;
+    block = (PaddedEnd<ObjectMonitor> *) block->FreeNext;
   }
 }
 
@@ -1668,18 +1677,19 @@
 // the list of extant blocks without taking a lock.
 
 int ObjectSynchronizer::verify_objmon_isinpool(ObjectMonitor *monitor) {
-  ObjectMonitor* block = gBlockList;
+  PaddedEnd<ObjectMonitor> * block = (PaddedEnd<ObjectMonitor> *)gBlockList;
 
   while (block) {
     assert(block->object() == CHAINMARKER, "must be a block header");
-    if (monitor > &block[0] && monitor < &block[_BLOCKSIZE]) {
+    if (monitor > (ObjectMonitor *)&block[0] &&
+        monitor < (ObjectMonitor *)&block[_BLOCKSIZE]) {
       address mon = (address) monitor;
       address blk = (address) block;
       size_t diff = mon - blk;
-      assert((diff % sizeof(ObjectMonitor)) == 0, "check");
+      assert((diff % sizeof(PaddedEnd<ObjectMonitor>)) == 0, "check");
       return 1;
     }
-    block = (ObjectMonitor*) block->FreeNext;
+    block = (PaddedEnd<ObjectMonitor> *) block->FreeNext;
   }
   return 0;
 }
--- a/src/share/vm/runtime/synchronizer.hpp	Mon Oct 13 22:11:39 2014 +0200
+++ b/src/share/vm/runtime/synchronizer.hpp	Tue Oct 14 10:32:12 2014 -0700
@@ -134,6 +134,8 @@
 
  private:
   enum { _BLOCKSIZE = 128 };
+  // gBlockList is really PaddedEnd<ObjectMonitor> *, but we don't
+  // want to expose the PaddedEnd template more than necessary.
   static ObjectMonitor* gBlockList;
   static ObjectMonitor * volatile gFreeList;
   // global monitor in use list, for moribund threads,
--- a/src/share/vm/runtime/vmStructs.cpp	Mon Oct 13 22:11:39 2014 +0200
+++ b/src/share/vm/runtime/vmStructs.cpp	Tue Oct 14 10:32:12 2014 -0700
@@ -1070,8 +1070,8 @@
   volatile_nonstatic_field(ObjectMonitor,      _header,                                       markOop)                               \
   unchecked_nonstatic_field(ObjectMonitor,     _object,                                       sizeof(void *)) /* NOTE: no type */    \
   unchecked_nonstatic_field(ObjectMonitor,     _owner,                                        sizeof(void *)) /* NOTE: no type */    \
-  volatile_nonstatic_field(ObjectMonitor,      _count,                                        intptr_t)                              \
-  volatile_nonstatic_field(ObjectMonitor,      _waiters,                                      intptr_t)                              \
+  volatile_nonstatic_field(ObjectMonitor,      _count,                                        jint)                                  \
+  volatile_nonstatic_field(ObjectMonitor,      _waiters,                                      jint)                                  \
   volatile_nonstatic_field(ObjectMonitor,      _recursions,                                   intptr_t)                              \
   nonstatic_field(ObjectMonitor,               FreeNext,                                      ObjectMonitor*)                        \
   volatile_nonstatic_field(BasicLock,          _displaced_header,                             markOop)                               \
@@ -2507,6 +2507,12 @@
   declare_constant(Deoptimization::Action_make_not_compilable)            \
   declare_constant(Deoptimization::Action_LIMIT)                          \
                                                                           \
+  /***************************************************/                   \
+  /* DEFAULT_CACHE_LINE_SIZE (globalDefinitions.hpp) */                   \
+  /***************************************************/                   \
+                                                                          \
+  declare_constant(DEFAULT_CACHE_LINE_SIZE)                               \
+                                                                          \
   /*********************/                                                 \
   /* Matcher (C2 only) */                                                 \
   /*********************/                                                 \
--- a/src/share/vm/utilities/globalDefinitions.hpp	Mon Oct 13 22:11:39 2014 +0200
+++ b/src/share/vm/utilities/globalDefinitions.hpp	Tue Oct 14 10:32:12 2014 -0700
@@ -540,7 +540,9 @@
 
 
 // The expected size in bytes of a cache line, used to pad data structures.
-#define DEFAULT_CACHE_LINE_SIZE 64
+#ifndef DEFAULT_CACHE_LINE_SIZE
+  #define DEFAULT_CACHE_LINE_SIZE 64
+#endif
 
 
 //----------------------------------------------------------------------------------------------------