changeset 3231:64bf7c8270cb

7147724: G1: hang in SurrogateLockerThread::manipulatePLL Summary: Attempting to initiate a marking cycle when allocating a humongous object can, if a marking cycle is successfully initiated by another thread, result in the allocating thread spinning until the marking cycle is complete. Eliminate a deadlock between the main ConcurrentMarkThread, the SurrogateLocker thread, the VM thread, and a mutator thread waiting on the SecondaryFreeList_lock (while free regions are going to become available) by not manipulating the pending list lock during the prologue and epilogue of the cleanup pause. Reviewed-by: brutisso, jcoomes, tonyp
author johnc
date Mon, 12 Mar 2012 14:59:00 -0700
parents 8a729074feae
children 21595f05bc93
files src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp src/share/vm/gc_implementation/g1/vm_operations_g1.cpp src/share/vm/gc_implementation/g1/vm_operations_g1.hpp
diffstat 4 files changed, 74 insertions(+), 22 deletions(-) [+]
line wrap: on
line diff
--- a/src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp	Fri Mar 16 16:14:04 2012 +0100
+++ b/src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp	Mon Mar 12 14:59:00 2012 -0700
@@ -155,7 +155,7 @@
 
           CMCheckpointRootsFinalClosure final_cl(_cm);
           sprintf(verbose_str, "GC remark");
-          VM_CGC_Operation op(&final_cl, verbose_str);
+          VM_CGC_Operation op(&final_cl, verbose_str, true /* needs_pll */);
           VMThread::execute(&op);
         }
         if (cm()->restart_for_overflow() &&
@@ -189,7 +189,7 @@
 
         CMCleanUp cl_cl(_cm);
         sprintf(verbose_str, "GC cleanup");
-        VM_CGC_Operation op(&cl_cl, verbose_str);
+        VM_CGC_Operation op(&cl_cl, verbose_str, false /* needs_pll */);
         VMThread::execute(&op);
       } else {
         // We don't want to update the marking status if a GC pause
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Fri Mar 16 16:14:04 2012 +0100
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Mon Mar 12 14:59:00 2012 -0700
@@ -993,7 +993,7 @@
     // iteration (after taking the Heap_lock).
     result = _mutator_alloc_region.attempt_allocation(word_size,
                                                       false /* bot_updates */);
-    if (result != NULL ){
+    if (result != NULL) {
       return result;
     }
 
@@ -2437,20 +2437,22 @@
                                  true,  /* should_initiate_conc_mark */
                                  g1_policy()->max_pause_time_ms(),
                                  cause);
+
       VMThread::execute(&op);
       if (!op.pause_succeeded()) {
-        // Another GC got scheduled and prevented us from scheduling
-        // the initial-mark GC. It's unlikely that the GC that
-        // pre-empted us was also an initial-mark GC. So, we'll retry
-        // the initial-mark GC.
-
         if (full_gc_count_before == total_full_collections()) {
-          retry_gc = true;
+          retry_gc = op.should_retry_gc();
         } else {
           // A Full GC happened while we were trying to schedule the
           // initial-mark GC. No point in starting a new cycle given
           // that the whole heap was collected anyway.
         }
+
+        if (retry_gc) {
+          if (GC_locker::is_active_and_needs_gc()) {
+            GC_locker::stall_until_clear();
+          }
+        }
       }
     } else {
       if (cause == GCCause::_gc_locker
--- a/src/share/vm/gc_implementation/g1/vm_operations_g1.cpp	Fri Mar 16 16:14:04 2012 +0100
+++ b/src/share/vm/gc_implementation/g1/vm_operations_g1.cpp	Mon Mar 12 14:59:00 2012 -0700
@@ -34,7 +34,8 @@
 VM_G1CollectForAllocation::VM_G1CollectForAllocation(
                                                   unsigned int gc_count_before,
                                                   size_t word_size)
-  : VM_G1OperationWithAllocRequest(gc_count_before, word_size) {
+  : VM_G1OperationWithAllocRequest(gc_count_before, word_size,
+                                   GCCause::_allocation_failure) {
   guarantee(word_size > 0, "an allocation should always be requested");
 }
 
@@ -57,9 +58,10 @@
                                       bool           should_initiate_conc_mark,
                                       double         target_pause_time_ms,
                                       GCCause::Cause gc_cause)
-  : VM_G1OperationWithAllocRequest(gc_count_before, word_size),
+  : VM_G1OperationWithAllocRequest(gc_count_before, word_size, gc_cause),
     _should_initiate_conc_mark(should_initiate_conc_mark),
     _target_pause_time_ms(target_pause_time_ms),
+    _should_retry_gc(false),
     _full_collections_completed_before(0) {
   guarantee(target_pause_time_ms > 0.0,
             err_msg("target_pause_time_ms = %1.6lf should be positive",
@@ -70,6 +72,22 @@
   _gc_cause = gc_cause;
 }
 
+bool VM_G1IncCollectionPause::doit_prologue() {
+  bool res = VM_GC_Operation::doit_prologue();
+  if (!res) {
+    if (_should_initiate_conc_mark) {
+      // The prologue can fail for a couple of reasons. The first is that another GC
+      // got scheduled and prevented the scheduling of the initial mark GC. The
+      // second is that the GC locker may be active and the heap can't be expanded.
+      // In both cases we want to retry the GC so that the initial mark pause is
+      // actually scheduled. In the second case, however, we should stall until
+      // until the GC locker is no longer active and then retry the initial mark GC.
+      _should_retry_gc = true;
+    }
+  }
+  return res;
+}
+
 void VM_G1IncCollectionPause::doit() {
   G1CollectedHeap* g1h = G1CollectedHeap::heap();
   assert(!_should_initiate_conc_mark ||
@@ -106,11 +124,25 @@
     // next GC pause to be an initial mark; it returns false if a
     // marking cycle is already in progress.
     //
-    // If a marking cycle is already in progress just return and skip
-    // the pause - the requesting thread should block in doit_epilogue
-    // until the marking cycle is complete.
+    // If a marking cycle is already in progress just return and skip the
+    // pause below - if the reason for requesting this initial mark pause
+    // was due to a System.gc() then the requesting thread should block in
+    // doit_epilogue() until the marking cycle is complete.
+    //
+    // If this initial mark pause was requested as part of a humongous
+    // allocation then we know that the marking cycle must just have
+    // been started by another thread (possibly also allocating a humongous
+    // object) as there was no active marking cycle when the requesting
+    // thread checked before calling collect() in
+    // attempt_allocation_humongous(). Retrying the GC, in this case,
+    // will cause the requesting thread to spin inside collect() until the
+    // just started marking cycle is complete - which may be a while. So
+    // we do NOT retry the GC.
     if (!res) {
-      assert(_word_size == 0, "ExplicitGCInvokesConcurrent shouldn't be allocating");
+      assert(_word_size == 0, "Concurrent Full GC/Humongous Object IM shouldn't be allocating");
+      if (_gc_cause != GCCause::_g1_humongous_allocation) {
+        _should_retry_gc = true;
+      }
       return;
     }
   }
@@ -123,6 +155,13 @@
                                       true /* expect_null_cur_alloc_region */);
   } else {
     assert(_result == NULL, "invariant");
+    if (!_pause_succeeded) {
+      // Another possible reason reason for the pause to not be successful
+      // is that, again, the GC locker is active (and has become active
+      // since the prologue was executed). In this case we should retry
+      // the pause after waiting for the GC locker to become inactive.
+      _should_retry_gc = true;
+    }
   }
 }
 
@@ -168,6 +207,7 @@
 }
 
 void VM_CGC_Operation::acquire_pending_list_lock() {
+  assert(_needs_pll, "don't call this otherwise");
   // The caller may block while communicating
   // with the SLT thread in order to acquire/release the PLL.
   ConcurrentMarkThread::slt()->
@@ -175,6 +215,7 @@
 }
 
 void VM_CGC_Operation::release_and_notify_pending_list_lock() {
+  assert(_needs_pll, "don't call this otherwise");
   // The caller may block while communicating
   // with the SLT thread in order to acquire/release the PLL.
   ConcurrentMarkThread::slt()->
@@ -198,7 +239,9 @@
 bool VM_CGC_Operation::doit_prologue() {
   // Note the relative order of the locks must match that in
   // VM_GC_Operation::doit_prologue() or deadlocks can occur
-  acquire_pending_list_lock();
+  if (_needs_pll) {
+    acquire_pending_list_lock();
+  }
 
   Heap_lock->lock();
   SharedHeap::heap()->_thread_holds_heap_lock_for_gc = true;
@@ -210,5 +253,7 @@
   // VM_GC_Operation::doit_epilogue()
   SharedHeap::heap()->_thread_holds_heap_lock_for_gc = false;
   Heap_lock->unlock();
-  release_and_notify_pending_list_lock();
+  if (_needs_pll) {
+    release_and_notify_pending_list_lock();
+  }
 }
--- a/src/share/vm/gc_implementation/g1/vm_operations_g1.hpp	Fri Mar 16 16:14:04 2012 +0100
+++ b/src/share/vm/gc_implementation/g1/vm_operations_g1.hpp	Mon Mar 12 14:59:00 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -43,8 +43,9 @@
 
 public:
   VM_G1OperationWithAllocRequest(unsigned int gc_count_before,
-                                 size_t       word_size)
-    : VM_GC_Operation(gc_count_before, GCCause::_allocation_failure),
+                                 size_t       word_size,
+                                 GCCause::Cause gc_cause)
+    : VM_GC_Operation(gc_count_before, gc_cause),
       _word_size(word_size), _result(NULL), _pause_succeeded(false) { }
   HeapWord* result() { return _result; }
   bool pause_succeeded() { return _pause_succeeded; }
@@ -77,6 +78,7 @@
 class VM_G1IncCollectionPause: public VM_G1OperationWithAllocRequest {
 private:
   bool         _should_initiate_conc_mark;
+  bool         _should_retry_gc;
   double       _target_pause_time_ms;
   unsigned int _full_collections_completed_before;
 public:
@@ -86,11 +88,13 @@
                           double         target_pause_time_ms,
                           GCCause::Cause gc_cause);
   virtual VMOp_Type type() const { return VMOp_G1IncCollectionPause; }
+  virtual bool doit_prologue();
   virtual void doit();
   virtual void doit_epilogue();
   virtual const char* name() const {
     return "garbage-first incremental collection pause";
   }
+  bool should_retry_gc() const { return _should_retry_gc; }
 };
 
 // Concurrent GC stop-the-world operations such as remark and cleanup;
@@ -98,6 +102,7 @@
 class VM_CGC_Operation: public VM_Operation {
   VoidClosure* _cl;
   const char* _printGCMessage;
+  bool _needs_pll;
 
 protected:
   // java.lang.ref.Reference support
@@ -105,8 +110,8 @@
   void release_and_notify_pending_list_lock();
 
 public:
-  VM_CGC_Operation(VoidClosure* cl, const char *printGCMsg)
-    : _cl(cl), _printGCMessage(printGCMsg) { }
+  VM_CGC_Operation(VoidClosure* cl, const char *printGCMsg, bool needs_pll)
+    : _cl(cl), _printGCMessage(printGCMsg), _needs_pll(needs_pll) { }
   virtual VMOp_Type type() const { return VMOp_CGC_Operation; }
   virtual void doit();
   virtual bool doit_prologue();