changeset 57036:92c98aa0f801

8228609: G1 copy cost prediction uses used vs. actual copied byte Summary: Let the object copy cost predictor use the actual copied bytes as input for the prediction instead of the used bytes. Reviewed-by: kbarrett, sjohanss
author tschatzl
date Thu, 21 Nov 2019 11:42:54 +0100
parents bb3472aa26c8
children a096dfdcabde
files src/hotspot/share/gc/g1/g1CollectedHeap.cpp src/hotspot/share/gc/g1/g1CollectedHeap.hpp src/hotspot/share/gc/g1/g1EvacuationInfo.hpp src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp src/hotspot/share/gc/g1/g1HeterogeneousHeapPolicy.cpp src/hotspot/share/gc/g1/g1HeterogeneousHeapPolicy.hpp src/hotspot/share/gc/g1/g1ParScanThreadState.cpp src/hotspot/share/gc/g1/g1ParScanThreadState.hpp src/hotspot/share/gc/g1/g1Policy.cpp src/hotspot/share/gc/g1/g1Policy.hpp src/hotspot/share/gc/g1/g1Trace.cpp src/hotspot/share/gc/shared/workerDataArray.cpp src/hotspot/share/gc/shared/workerDataArray.hpp src/hotspot/share/gc/shared/workerDataArray.inline.hpp test/hotspot/jtreg/gc/g1/TestGCLogMessages.java
diffstat 16 files changed, 137 insertions(+), 112 deletions(-) [+]
line wrap: on
line diff
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp	Thu Nov 21 11:42:54 2019 +0100
@@ -1516,6 +1516,7 @@
   _allocator(NULL),
   _verifier(NULL),
   _summary_bytes_used(0),
+  _bytes_used_during_gc(0),
   _archive_allocator(NULL),
   _survivor_evac_stats("Young", YoungPLABSize, PLABWeight),
   _old_evac_stats("Old", OldPLABSize, PLABWeight),
@@ -3138,7 +3139,6 @@
                          collector_state()->yc_type() == Mixed /* all_memory_pools_affected */);
 
     G1HeapTransition heap_transition(this);
-    size_t heap_used_bytes_before_gc = used();
 
     {
       IsGCActiveMark x;
@@ -3211,7 +3211,7 @@
 
         double sample_end_time_sec = os::elapsedTime();
         double pause_time_ms = (sample_end_time_sec - sample_start_time_sec) * MILLIUNITS;
-        policy()->record_collection_pause_end(pause_time_ms, heap_used_bytes_before_gc);
+        policy()->record_collection_pause_end(pause_time_ms);
       }
 
       verify_after_young_collection(verify_type);
@@ -3694,12 +3694,14 @@
 }
 
 void G1CollectedHeap::merge_per_thread_state_info(G1ParScanThreadStateSet* per_thread_states) {
-  double merge_pss_time_start = os::elapsedTime();
+  Ticks start = Ticks::now();
   per_thread_states->flush();
-  phase_times()->record_merge_pss_time_ms((os::elapsedTime() - merge_pss_time_start) * 1000.0);
+  phase_times()->record_or_add_time_secs(G1GCPhaseTimes::MergePSS, 0 /* worker_id */, (Ticks::now() - start).seconds());
 }
 
 void G1CollectedHeap::pre_evacuate_collection_set(G1EvacuationInfo& evacuation_info, G1ParScanThreadStateSet* per_thread_states) {
+  _bytes_used_during_gc = 0;
+
   _expand_heap_after_alloc_failure = true;
   _evacuation_failed = false;
 
@@ -3764,9 +3766,6 @@
     Tickspan evac_time = (Ticks::now() - start);
     p->record_or_add_time_secs(objcopy_phase, worker_id, evac_time.seconds() - cl.term_time());
 
-    p->record_or_add_thread_work_item(objcopy_phase, worker_id, pss->lab_waste_words() * HeapWordSize, G1GCPhaseTimes::ObjCopyLABWaste);
-    p->record_or_add_thread_work_item(objcopy_phase, worker_id, pss->lab_undo_waste_words() * HeapWordSize, G1GCPhaseTimes::ObjCopyLABUndoWaste);
-
     if (termination_phase == G1GCPhaseTimes::Termination) {
       p->record_time_secs(termination_phase, worker_id, cl.term_time());
       p->record_thread_work_item(termination_phase, worker_id, cl.term_attempts());
@@ -3943,6 +3942,8 @@
 void G1CollectedHeap::post_evacuate_collection_set(G1EvacuationInfo& evacuation_info,
                                                    G1RedirtyCardsQueueSet* rdcqs,
                                                    G1ParScanThreadStateSet* per_thread_states) {
+  G1GCPhaseTimes* p = phase_times();
+
   rem_set()->cleanup_after_scan_heap_roots();
 
   // Process any discovered reference objects - we have
@@ -3955,16 +3956,15 @@
   G1STWIsAliveClosure is_alive(this);
   G1KeepAliveClosure keep_alive(this);
 
-  WeakProcessor::weak_oops_do(workers(), &is_alive, &keep_alive,
-                              phase_times()->weak_phase_times());
+  WeakProcessor::weak_oops_do(workers(), &is_alive, &keep_alive, p->weak_phase_times());
 
   if (G1StringDedup::is_enabled()) {
     double string_dedup_time_ms = os::elapsedTime();
 
-    string_dedup_cleaning(&is_alive, &keep_alive, phase_times());
+    string_dedup_cleaning(&is_alive, &keep_alive, p);
 
     double string_cleanup_time_ms = (os::elapsedTime() - string_dedup_time_ms) * 1000.0;
-    phase_times()->record_string_deduplication_time(string_cleanup_time_ms);
+    p->record_string_deduplication_time(string_cleanup_time_ms);
   }
 
   _allocator->release_gc_alloc_regions(evacuation_info);
@@ -3977,7 +3977,7 @@
 
     double recalculate_used_start = os::elapsedTime();
     set_used(recalculate_used());
-    phase_times()->record_evac_fail_recalc_used_time((os::elapsedTime() - recalculate_used_start) * 1000.0);
+    p->record_evac_fail_recalc_used_time((os::elapsedTime() - recalculate_used_start) * 1000.0);
 
     if (_archive_allocator != NULL) {
       _archive_allocator->clear_used();
@@ -3989,8 +3989,8 @@
     }
   } else {
     // The "used" of the the collection set have already been subtracted
-    // when they were freed.  Add in the bytes evacuated.
-    increase_used(policy()->bytes_copied_during_gc());
+    // when they were freed.  Add in the bytes used.
+    increase_used(_bytes_used_during_gc);
   }
 
   _preserved_marks_set.assert_empty();
@@ -4014,7 +4014,7 @@
   record_obj_copy_mem_stats();
 
   evacuation_info.set_collectionset_used_before(collection_set()->bytes_used_before());
-  evacuation_info.set_bytes_copied(policy()->bytes_copied_during_gc());
+  evacuation_info.set_bytes_used(_bytes_used_during_gc);
 
 #if COMPILER2_OR_JVMCI
   double start = os::elapsedTime();
@@ -4786,7 +4786,7 @@
 void G1CollectedHeap::retire_gc_alloc_region(HeapRegion* alloc_region,
                                              size_t allocated_bytes,
                                              G1HeapRegionAttr dest) {
-  policy()->record_bytes_copied_during_gc(allocated_bytes);
+  _bytes_used_during_gc += allocated_bytes;
   if (dest.is_old()) {
     old_set_add(alloc_region);
   } else {
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.hpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.hpp	Thu Nov 21 11:42:54 2019 +0100
@@ -214,6 +214,10 @@
 
   void set_used(size_t bytes);
 
+  // Number of bytes used in all regions during GC. Typically changed when
+  // retiring a GC alloc region.
+  size_t _bytes_used_during_gc;
+
   // Class that handles archive allocation ranges.
   G1ArchiveAllocator* _archive_allocator;
 
--- a/src/hotspot/share/gc/g1/g1EvacuationInfo.hpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/g1/g1EvacuationInfo.hpp	Thu Nov 21 11:42:54 2019 +0100
@@ -33,13 +33,14 @@
   size_t _collectionset_used_before;
   size_t _collectionset_used_after;
   size_t _alloc_regions_used_before;
-  size_t _bytes_copied;
+  size_t _bytes_used;
   uint   _regions_freed;
 
 public:
-  G1EvacuationInfo() : _collectionset_regions(0), _allocation_regions(0), _collectionset_used_before(0),
-                     _collectionset_used_after(0), _alloc_regions_used_before(0),
-                     _bytes_copied(0), _regions_freed(0) { }
+  G1EvacuationInfo() :
+    _collectionset_regions(0), _allocation_regions(0), _collectionset_used_before(0),
+    _collectionset_used_after(0), _alloc_regions_used_before(0),
+    _bytes_used(0), _regions_freed(0) { }
 
   void set_collectionset_regions(uint collectionset_regions) {
     _collectionset_regions = collectionset_regions;
@@ -61,8 +62,8 @@
     _alloc_regions_used_before = used;
   }
 
-  void set_bytes_copied(size_t copied) {
-    _bytes_copied = copied;
+  void set_bytes_used(size_t used) {
+    _bytes_used = used;
   }
 
   void set_regions_freed(uint freed) {
@@ -74,7 +75,7 @@
   size_t collectionset_used_before() { return _collectionset_used_before; }
   size_t collectionset_used_after()  { return _collectionset_used_after; }
   size_t alloc_regions_used_before() { return _alloc_regions_used_before; }
-  size_t bytes_copied()              { return _bytes_copied; }
+  size_t bytes_used()                { return _bytes_used; }
   uint   regions_freed()             { return _regions_freed; }
 };
 
--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp	Thu Nov 21 11:42:54 2019 +0100
@@ -128,15 +128,14 @@
   _merge_lb_skipped_cards = new WorkerDataArray<size_t>(max_gc_threads, "Skipped Cards:");
   _gc_par_phases[MergeLB]->link_thread_work_items(_merge_lb_skipped_cards, MergeLBSkippedCards);
 
-  _obj_copy_lab_waste = new WorkerDataArray<size_t>(max_gc_threads, "LAB Waste");
-  _gc_par_phases[ObjCopy]->link_thread_work_items(_obj_copy_lab_waste, ObjCopyLABWaste);
-  _obj_copy_lab_undo_waste = new WorkerDataArray<size_t>(max_gc_threads, "LAB Undo Waste");
-  _gc_par_phases[ObjCopy]->link_thread_work_items(_obj_copy_lab_undo_waste, ObjCopyLABUndoWaste);
+  _gc_par_phases[MergePSS] = new WorkerDataArray<double>(1, "Merge Per-Thread State", true /* is_serial */);
 
-  _opt_obj_copy_lab_waste = new WorkerDataArray<size_t>(max_gc_threads, "LAB Waste");
-  _gc_par_phases[OptObjCopy]->link_thread_work_items(_obj_copy_lab_waste, ObjCopyLABWaste);
-  _opt_obj_copy_lab_undo_waste  = new WorkerDataArray<size_t>(max_gc_threads, "LAB Undo Waste");
-  _gc_par_phases[OptObjCopy]->link_thread_work_items(_obj_copy_lab_undo_waste, ObjCopyLABUndoWaste);
+  _merge_pss_copied_bytes = new WorkerDataArray<size_t>(max_gc_threads, "Copied Bytes");
+  _gc_par_phases[MergePSS]->link_thread_work_items(_merge_pss_copied_bytes, MergePSSCopiedBytes);
+  _merge_pss_lab_waste_bytes = new WorkerDataArray<size_t>(max_gc_threads, "LAB Waste");
+  _gc_par_phases[MergePSS]->link_thread_work_items(_merge_pss_lab_waste_bytes, MergePSSLABWasteBytes);
+  _merge_pss_lab_undo_waste_bytes = new WorkerDataArray<size_t>(max_gc_threads, "LAB Undo Waste");
+  _gc_par_phases[MergePSS]->link_thread_work_items(_merge_pss_lab_undo_waste_bytes, MergePSSLABUndoWasteBytes);
 
   _termination_attempts = new WorkerDataArray<size_t>(max_gc_threads, "Termination Attempts:");
   _gc_par_phases[Termination]->link_thread_work_items(_termination_attempts);
@@ -189,7 +188,6 @@
   _recorded_non_young_cset_choice_time_ms = 0.0;
   _recorded_redirty_logged_cards_time_ms = 0.0;
   _recorded_preserve_cm_referents_time_ms = 0.0;
-  _recorded_merge_pss_time_ms = 0.0;
   _recorded_start_new_cset_time_ms = 0.0;
   _recorded_total_free_cset_time_ms = 0.0;
   _recorded_serial_free_cset_time_ms = 0.0;
@@ -464,13 +462,15 @@
 double G1GCPhaseTimes::print_post_evacuate_collection_set() const {
   const double evac_fail_handling = _cur_evac_fail_recalc_used +
                                     _cur_evac_fail_remove_self_forwards;
+  assert(_gc_par_phases[MergePSS]->get(0) != WorkerDataArray<double>::uninitialized(), "must be set");
+  const double merge_pss = _gc_par_phases[MergePSS]->get(0) * MILLIUNITS;
   const double sum_ms = evac_fail_handling +
                         _cur_collection_code_root_fixup_time_ms +
                         _recorded_preserve_cm_referents_time_ms +
                         _cur_ref_proc_time_ms +
                         (_weak_phase_times.total_time_sec() * MILLIUNITS) +
                         _cur_clear_ct_time_ms +
-                        _recorded_merge_pss_time_ms +
+                        merge_pss +
                         _cur_strong_code_root_purge_time_ms +
                         _recorded_redirty_logged_cards_time_ms +
                         _recorded_total_free_cset_time_ms +
@@ -500,7 +500,7 @@
     trace_time("Remove Self Forwards",_cur_evac_fail_remove_self_forwards);
   }
 
-  debug_time("Merge Per-Thread State", _recorded_merge_pss_time_ms);
+  debug_phase(_gc_par_phases[MergePSS], 0);
   debug_time("Code Roots Purge", _cur_strong_code_root_purge_time_ms);
 
   debug_time("Redirty Cards", _recorded_redirty_logged_cards_time_ms);
@@ -585,7 +585,8 @@
       "StringDedupTableFixup",
       "RedirtyCards",
       "YoungFreeCSet",
-      "NonYoungFreeCSet"
+      "NonYoungFreeCSet",
+      "MergePSS"
       //GCParPhasesSentinel only used to tell end of enum
       };
 
--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp	Thu Nov 21 11:42:54 2019 +0100
@@ -78,6 +78,7 @@
     RedirtyCards,
     YoungFreeCSet,
     NonYoungFreeCSet,
+    MergePSS,
     GCParPhasesSentinel
   };
 
@@ -108,9 +109,10 @@
     MergeLBSkippedCards
   };
 
-  enum GCObjCopyWorkItems {
-    ObjCopyLABWaste,
-    ObjCopyLABUndoWaste
+  enum GCMergePSSWorkItems {
+    MergePSSCopiedBytes,
+    MergePSSLABWasteBytes,
+    MergePSSLABUndoWasteBytes
   };
 
  private:
@@ -143,11 +145,9 @@
   WorkerDataArray<size_t>* _opt_scan_hr_scanned_opt_refs;
   WorkerDataArray<size_t>* _opt_scan_hr_used_memory;
 
-  WorkerDataArray<size_t>* _obj_copy_lab_waste;
-  WorkerDataArray<size_t>* _obj_copy_lab_undo_waste;
-
-  WorkerDataArray<size_t>* _opt_obj_copy_lab_waste;
-  WorkerDataArray<size_t>* _opt_obj_copy_lab_undo_waste;
+  WorkerDataArray<size_t>* _merge_pss_copied_bytes;
+  WorkerDataArray<size_t>* _merge_pss_lab_waste_bytes;
+  WorkerDataArray<size_t>* _merge_pss_lab_undo_waste_bytes;
 
   WorkerDataArray<size_t>* _termination_attempts;
 
@@ -224,7 +224,9 @@
   template <class T>
   void details(T* phase, const char* indent) const;
 
+  void log_work_items(WorkerDataArray<double>* phase, uint indent, outputStream* out) const;
   void log_phase(WorkerDataArray<double>* phase, uint indent, outputStream* out, bool print_sum) const;
+  void debug_serial_phase(WorkerDataArray<double>* phase, uint extra_indent = 0) const;
   void debug_phase(WorkerDataArray<double>* phase, uint extra_indent = 0) const;
   void trace_phase(WorkerDataArray<double>* phase, bool print_sum = true) const;
 
@@ -269,8 +271,6 @@
 
   size_t sum_thread_work_items(GCParPhases phase, uint index = 0);
 
- public:
-
   void record_prepare_tlab_time_ms(double ms) {
     _cur_prepare_tlab_time_ms = ms;
   }
@@ -378,10 +378,6 @@
     _recorded_preserve_cm_referents_time_ms = time_ms;
   }
 
-  void record_merge_pss_time_ms(double time_ms) {
-    _recorded_merge_pss_time_ms = time_ms;
-  }
-
   void record_start_new_cset_time_ms(double time_ms) {
     _recorded_start_new_cset_time_ms = time_ms;
   }
--- a/src/hotspot/share/gc/g1/g1HeterogeneousHeapPolicy.cpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/g1/g1HeterogeneousHeapPolicy.cpp	Thu Nov 21 11:42:54 2019 +0100
@@ -39,8 +39,8 @@
 }
 
 // After a collection pause, young list target length is updated. So we need to make sure we have enough regions in dram for young gen.
-void G1HeterogeneousHeapPolicy::record_collection_pause_end(double pause_time_ms, size_t heap_used_bytes_before_gc) {
-  G1Policy::record_collection_pause_end(pause_time_ms, heap_used_bytes_before_gc);
+void G1HeterogeneousHeapPolicy::record_collection_pause_end(double pause_time_ms) {
+  G1Policy::record_collection_pause_end(pause_time_ms);
   _manager->adjust_dram_regions((uint)young_list_target_length(), G1CollectedHeap::heap()->workers());
 }
 
--- a/src/hotspot/share/gc/g1/g1HeterogeneousHeapPolicy.hpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/g1/g1HeterogeneousHeapPolicy.hpp	Thu Nov 21 11:42:54 2019 +0100
@@ -38,7 +38,7 @@
   // initialize policy
   virtual void init(G1CollectedHeap* g1h, G1CollectionSet* collection_set);
   // Record end of an evacuation pause.
-  virtual void record_collection_pause_end(double pause_time_ms, size_t heap_used_bytes_before_gc);
+  virtual void record_collection_pause_end(double pause_time_ms);
   // Record the end of full collection.
   virtual void record_full_collection_end();
 
--- a/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp	Thu Nov 21 11:42:54 2019 +0100
@@ -56,6 +56,9 @@
     _stack_trim_upper_threshold(GCDrainStackTargetSize * 2 + 1),
     _stack_trim_lower_threshold(GCDrainStackTargetSize),
     _trim_ticks(),
+    _surviving_young_words_base(NULL),
+    _surviving_young_words(NULL),
+    _surviving_words_length(young_cset_length + 1),
     _old_gen_is_full(false),
     _num_optional_regions(optional_cset_length),
     _numa(g1h->numa()),
@@ -65,11 +68,10 @@
   // entries, since entry 0 keeps track of surviving bytes for non-young regions.
   // We also add a few elements at the beginning and at the end in
   // an attempt to eliminate cache contention
-  size_t real_length = young_cset_length + 1;
-  size_t array_length = PADDING_ELEM_NUM + real_length + PADDING_ELEM_NUM;
+  size_t array_length = PADDING_ELEM_NUM + _surviving_words_length + PADDING_ELEM_NUM;
   _surviving_young_words_base = NEW_C_HEAP_ARRAY(size_t, array_length, mtGC);
   _surviving_young_words = _surviving_young_words_base + PADDING_ELEM_NUM;
-  memset(_surviving_young_words, 0, real_length * sizeof(size_t));
+  memset(_surviving_young_words, 0, _surviving_words_length * sizeof(size_t));
 
   _plab_allocator = new G1PLABAllocator(_g1h->allocator());
 
@@ -85,18 +87,19 @@
   initialize_numa_stats();
 }
 
-// Pass locally gathered statistics to global state.
-void G1ParScanThreadState::flush(size_t* surviving_young_words) {
+size_t G1ParScanThreadState::flush(size_t* surviving_young_words) {
   _rdcq.flush();
+  flush_numa_stats();
   // Update allocation statistics.
   _plab_allocator->flush_and_retire_stats();
   _g1h->policy()->record_age_table(&_age_table);
 
-  uint length = _g1h->collection_set()->young_region_length() + 1;
-  for (uint i = 0; i < length; i++) {
+  size_t sum = 0;
+  for (uint i = 0; i < _surviving_words_length; i++) {
     surviving_young_words[i] += _surviving_young_words[i];
+    sum += _surviving_young_words[i];
   }
-  flush_numa_stats();
+  return sum;
 }
 
 G1ParScanThreadState::~G1ParScanThreadState() {
@@ -357,16 +360,27 @@
 void G1ParScanThreadStateSet::flush() {
   assert(!_flushed, "thread local state from the per thread states should be flushed once");
 
-  for (uint worker_index = 0; worker_index < _n_workers; ++worker_index) {
-    G1ParScanThreadState* pss = _states[worker_index];
+  for (uint worker_id = 0; worker_id < _n_workers; ++worker_id) {
+    G1ParScanThreadState* pss = _states[worker_id];
 
     if (pss == NULL) {
       continue;
     }
 
-    pss->flush(_surviving_young_words_total);
+    G1GCPhaseTimes* p = _g1h->phase_times();
+
+    // Need to get the following two before the call to G1ParThreadScanState::flush()
+    // because it resets the PLAB allocator where we get this info from.
+    size_t lab_waste_bytes = pss->lab_waste_words() * HeapWordSize;
+    size_t lab_undo_waste_bytes = pss->lab_undo_waste_words() * HeapWordSize;
+    size_t copied_bytes = pss->flush(_surviving_young_words_total) * HeapWordSize;
+
+    p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, copied_bytes, G1GCPhaseTimes::MergePSSCopiedBytes);
+    p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, lab_waste_bytes, G1GCPhaseTimes::MergePSSLABWasteBytes);
+    p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, lab_undo_waste_bytes, G1GCPhaseTimes::MergePSSLABUndoWasteBytes);
+
     delete pss;
-    _states[worker_index] = NULL;
+    _states[worker_id] = NULL;
   }
   _flushed = true;
 }
--- a/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp	Thu Nov 21 11:42:54 2019 +0100
@@ -74,7 +74,8 @@
   size_t* _surviving_young_words_base;
   // this points into the array, as we use the first few entries for padding
   size_t* _surviving_young_words;
-
+  // Number of elements in the array above.
+  size_t _surviving_words_length;
   // Indicates whether in the last generation (old) there is no more space
   // available for allocation.
   bool _old_gen_is_full;
@@ -152,7 +153,9 @@
   size_t lab_waste_words() const;
   size_t lab_undo_waste_words() const;
 
-  void flush(size_t* surviving_young_words);
+  // Pass locally gathered statistics to global state. Returns the total number of
+  // HeapWords copied.
+  size_t flush(size_t* surviving_young_words);
 
 private:
   #define G1_PARTIAL_ARRAY_MASK 0x2
--- a/src/hotspot/share/gc/g1/g1Policy.cpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/g1/g1Policy.cpp	Thu Nov 21 11:42:54 2019 +0100
@@ -78,7 +78,6 @@
   _bytes_allocated_in_old_since_last_gc(0),
   _initial_mark_to_mixed(),
   _collection_set(NULL),
-  _bytes_copied_during_gc(0),
   _g1h(NULL),
   _phase_times(new G1GCPhaseTimes(gc_timer, ParallelGCThreads)),
   _mark_remark_start_sec(0),
@@ -553,7 +552,6 @@
   record_concurrent_refinement_data(false /* is_full_collection */);
 
   _collection_set->reset_bytes_used_before();
-  _bytes_copied_during_gc = 0;
 
   // do that for any other surv rate groups
   _short_lived_surv_rate_group->stop_adding_regions();
@@ -647,13 +645,11 @@
 // Anything below that is considered to be zero
 #define MIN_TIMER_GRANULARITY 0.0000001
 
-void G1Policy::record_collection_pause_end(double pause_time_ms, size_t heap_used_bytes_before_gc) {
+void G1Policy::record_collection_pause_end(double pause_time_ms) {
   G1GCPhaseTimes* p = phase_times();
 
   double end_time_sec = os::elapsedTime();
 
-  assert_used_and_recalculate_used_equal(_g1h);
-  size_t cur_used_bytes = _g1h->used();
   bool this_pause_included_initial_mark = false;
   bool this_pause_was_young_only = collector_state()->in_young_only_phase();
 
@@ -778,12 +774,10 @@
     }
     _analytics->report_rs_length_diff((double) rs_length_diff);
 
-    size_t freed_bytes = heap_used_bytes_before_gc - cur_used_bytes;
-    size_t copied_bytes = _collection_set->bytes_used_before() - freed_bytes;
-    double cost_per_byte_ms = 0.0;
+    size_t copied_bytes = p->sum_thread_work_items(G1GCPhaseTimes::MergePSS, G1GCPhaseTimes::MergePSSCopiedBytes);
 
     if (copied_bytes > 0) {
-      cost_per_byte_ms = (average_time_ms(G1GCPhaseTimes::ObjCopy) + average_time_ms(G1GCPhaseTimes::OptObjCopy)) / (double) copied_bytes;
+      double cost_per_byte_ms = (average_time_ms(G1GCPhaseTimes::ObjCopy) + average_time_ms(G1GCPhaseTimes::OptObjCopy)) / copied_bytes;
       _analytics->report_cost_per_byte_ms(cost_per_byte_ms, collector_state()->mark_or_rebuild_in_progress());
     }
 
--- a/src/hotspot/share/gc/g1/g1Policy.hpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/g1/g1Policy.hpp	Thu Nov 21 11:42:54 2019 +0100
@@ -184,9 +184,6 @@
 
   G1CollectionSetChooser* cset_chooser() const;
 
-  // The number of bytes copied during the GC.
-  size_t _bytes_copied_during_gc;
-
   // Stash a pointer to the g1 heap.
   G1CollectedHeap* _g1h;
 
@@ -320,7 +317,7 @@
 
   // Record the start and end of an evacuation pause.
   void record_collection_pause_start(double start_time_sec);
-  virtual void record_collection_pause_end(double pause_time_ms, size_t heap_used_bytes_before_gc);
+  virtual void record_collection_pause_end(double pause_time_ms);
 
   // Record the start and end of a full collection.
   void record_full_collection_start();
@@ -339,17 +336,6 @@
 
   void print_phases();
 
-  // Record how much space we copied during a GC. This is typically
-  // called when a GC alloc region is being retired.
-  void record_bytes_copied_during_gc(size_t bytes) {
-    _bytes_copied_during_gc += bytes;
-  }
-
-  // The amount of space we copied during a GC.
-  size_t bytes_copied_during_gc() const {
-    return _bytes_copied_during_gc;
-  }
-
   bool next_gc_should_be_mixed(const char* true_action_str,
                                const char* false_action_str) const;
 
--- a/src/hotspot/share/gc/g1/g1Trace.cpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/g1/g1Trace.cpp	Thu Nov 21 11:42:54 2019 +0100
@@ -145,8 +145,8 @@
     e.set_cSetUsedAfter(info->collectionset_used_after());
     e.set_allocationRegions(info->allocation_regions());
     e.set_allocationRegionsUsedBefore(info->alloc_regions_used_before());
-    e.set_allocationRegionsUsedAfter(info->alloc_regions_used_before() + info->bytes_copied());
-    e.set_bytesCopied(info->bytes_copied());
+    e.set_allocationRegionsUsedAfter(info->alloc_regions_used_before() + info->bytes_used());
+    e.set_bytesCopied(info->bytes_used());
     e.set_regionsFreed(info->regions_freed());
     e.commit();
   }
--- a/src/hotspot/share/gc/shared/workerDataArray.cpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/shared/workerDataArray.cpp	Thu Nov 21 11:42:54 2019 +0100
@@ -37,6 +37,16 @@
 }
 
 template <>
+void WorkerDataArray<double>::WDAPrinter::summary(outputStream* out, double time) {
+  out->print_cr(" %.1lfms", time * MILLIUNITS);
+}
+
+template <>
+void WorkerDataArray<size_t>::WDAPrinter::summary(outputStream* out, size_t value) {
+  out->print_cr(" " SIZE_FORMAT, value);
+}
+
+template <>
 void WorkerDataArray<double>::WDAPrinter::summary(outputStream* out, double min, double avg, double max, double diff, double sum, bool print_sum) {
   out->print(" Min: %4.1lf, Avg: %4.1lf, Max: %4.1lf, Diff: %4.1lf", min * MILLIUNITS, avg * MILLIUNITS, max * MILLIUNITS, diff* MILLIUNITS);
   if (print_sum) {
--- a/src/hotspot/share/gc/shared/workerDataArray.hpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/shared/workerDataArray.hpp	Thu Nov 21 11:42:54 2019 +0100
@@ -40,10 +40,12 @@
   uint        _length;
   const char* _title;
 
+  bool _is_serial;
+
   WorkerDataArray<size_t>* _thread_work_items[MaxThreadWorkItems];
 
  public:
-  WorkerDataArray(uint length, const char* title);
+  WorkerDataArray(uint length, const char* title, bool is_serial = false);
   ~WorkerDataArray();
 
   void link_thread_work_items(WorkerDataArray<size_t>* thread_work_items, uint index = 0);
@@ -79,7 +81,9 @@
  private:
   class WDAPrinter {
   public:
+    static void summary(outputStream* out, double time);
     static void summary(outputStream* out, double min, double avg, double max, double diff, double sum, bool print_sum);
+    static void summary(outputStream* out, size_t value);
     static void summary(outputStream* out, size_t min, double avg, size_t max, size_t diff, size_t sum, bool print_sum);
 
     static void details(const WorkerDataArray<double>* phase, outputStream* out);
--- a/src/hotspot/share/gc/shared/workerDataArray.inline.hpp	Wed Nov 20 14:16:29 2019 +0100
+++ b/src/hotspot/share/gc/shared/workerDataArray.inline.hpp	Thu Nov 21 11:42:54 2019 +0100
@@ -30,11 +30,13 @@
 #include "utilities/ostream.hpp"
 
 template <typename T>
-WorkerDataArray<T>::WorkerDataArray(uint length, const char* title) :
+WorkerDataArray<T>::WorkerDataArray(uint length, const char* title, bool is_serial) :
  _data(NULL),
  _length(length),
- _title(title) {
+ _title(title),
+ _is_serial(is_serial) {
   assert(length > 0, "Must have some workers to store data for");
+  assert(!is_serial || length == 1, "Serial phase must only have a single entry.");
   _data = NEW_C_HEAP_ARRAY(T, _length, mtGC);
   for (uint i = 0; i < MaxThreadWorkItems; i++) {
     _thread_work_items[i] = NULL;
@@ -139,30 +141,39 @@
 
 template <class T>
 void WorkerDataArray<T>::print_summary_on(outputStream* out, bool print_sum) const {
-  out->print("%-25s", title());
+  if (_is_serial) {
+    out->print("%s:", title());
+  } else {
+    out->print("%-25s", title());
+  }
+
   uint start = 0;
   while (start < _length && get(start) == uninitialized()) {
     start++;
   }
   if (start < _length) {
-    T min = get(start);
-    T max = min;
-    T sum = 0;
-    uint contributing_threads = 0;
-    for (uint i = start; i < _length; ++i) {
-      T value = get(i);
-      if (value != uninitialized()) {
-        max = MAX2(max, value);
-        min = MIN2(min, value);
-        sum += value;
-        contributing_threads++;
+    if (_is_serial) {
+      WDAPrinter::summary(out, get(0));
+    } else {
+      T min = get(start);
+      T max = min;
+      T sum = 0;
+      uint contributing_threads = 0;
+      for (uint i = start; i < _length; ++i) {
+        T value = get(i);
+        if (value != uninitialized()) {
+          max = MAX2(max, value);
+          min = MIN2(min, value);
+          sum += value;
+          contributing_threads++;
+        }
       }
+      T diff = max - min;
+      assert(contributing_threads != 0, "Must be since we found a used value for the start index");
+      double avg = sum / (double) contributing_threads;
+      WDAPrinter::summary(out, min, avg, max, diff, sum, print_sum);
+      out->print_cr(", Workers: %d", contributing_threads);
     }
-    T diff = max - min;
-    assert(contributing_threads != 0, "Must be since we found a used value for the start index");
-    double avg = sum / (double) contributing_threads;
-    WDAPrinter::summary(out, min, avg, max, diff, sum, print_sum);
-    out->print_cr(", Workers: %d", contributing_threads);
   } else {
     // No data for this phase.
     out->print_cr(" skipped");
--- a/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java	Wed Nov 20 14:16:29 2019 +0100
+++ b/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java	Thu Nov 21 11:42:54 2019 +0100
@@ -118,6 +118,7 @@
         new LogMessageWithLevel("Code Root Scan", Level.DEBUG),
         // Object Copy
         new LogMessageWithLevel("Object Copy", Level.DEBUG),
+        new LogMessageWithLevel("Copied Bytes", Level.DEBUG),
         new LogMessageWithLevel("LAB Waste", Level.DEBUG),
         new LogMessageWithLevel("LAB Undo Waste", Level.DEBUG),
         // Ext Root Scan