changeset 1488:ec72429f79c4

Merge
author duke
date Wed, 05 Jul 2017 16:43:15 +0200
parents 628049ac53ed 27ac920c7b50
children 126f365cec6c
files
diffstat 325 files changed, 42376 insertions(+), 4071 deletions(-) [+]
line wrap: on
line diff
--- a/.hgtags-top-repo	Thu Oct 23 21:56:41 2008 -0700
+++ b/.hgtags-top-repo	Wed Jul 05 16:43:15 2017 +0200
@@ -12,3 +12,4 @@
 143c1abedb7d3095eff0f9ee5fec9bf48e3490fc jdk7-b35
 4b4f5fea8d7d0743f0c30d91fcd9bf9d96e5d2ad jdk7-b36
 744554f5a3290e11c71cd2ddb1aff49e431f9ed0 jdk7-b37
+cc47a76899ed33a2c513cb688348244c9b5a1288 jdk7-b38
--- a/corba/.hgtags	Thu Oct 23 21:56:41 2008 -0700
+++ b/corba/.hgtags	Wed Jul 05 16:43:15 2017 +0200
@@ -12,3 +12,4 @@
 3867c4d14a5bfdbb37c97b4874ccb0ee5343111c jdk7-b35
 0723891eb8d1c27e67c54163af0b4cea05a4e036 jdk7-b36
 59d5848bdedebe91cc2753acce78911bcb4a66db jdk7-b37
+08be802754b0296c91a7713b6d85a015dbcd5349 jdk7-b38
--- a/hotspot/.hgtags	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/.hgtags	Wed Jul 05 16:43:15 2017 +0200
@@ -12,3 +12,4 @@
 5fa96a5a7e76da7c8dad12486293a0456c2c116c jdk7-b35
 e91159f921a58af3698e6479ea1fc5818da66d09 jdk7-b36
 9ee9cf798b59e7d51f8c0a686959f313867a55d6 jdk7-b37
+d9bc824aa078573829bb66572af847e26e1bd12e jdk7-b38
--- a/hotspot/make/hotspot_distro	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/make/hotspot_distro	Wed Jul 05 16:43:15 2017 +0200
@@ -1,4 +1,4 @@
-#
+# 
 # Copyright 2006-2008 Sun Microsystems, Inc.  All Rights Reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
@@ -19,7 +19,7 @@
 # Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 # CA 95054 USA or visit www.sun.com if you need additional information or
 # have any questions.
-#
+# 
 
 #
 # This file format must remain compatible with both
--- a/hotspot/make/hotspot_version	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/make/hotspot_version	Wed Jul 05 16:43:15 2017 +0200
@@ -35,7 +35,7 @@
 
 HS_MAJOR_VER=14
 HS_MINOR_VER=0
-HS_BUILD_NUMBER=05
+HS_BUILD_NUMBER=06
 
 JDK_MAJOR_VER=1
 JDK_MINOR_VER=7
--- a/hotspot/make/linux/makefiles/top.make	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/make/linux/makefiles/top.make	Wed Jul 05 16:43:15 2017 +0200
@@ -64,6 +64,7 @@
                           $(VM)/gc_implementation/includeDB_gc_parallelScavenge \
                           $(VM)/gc_implementation/includeDB_gc_concurrentMarkSweep \
                           $(VM)/gc_implementation/includeDB_gc_parNew \
+                          $(VM)/gc_implementation/includeDB_gc_g1     \
                           $(VM)/gc_implementation/includeDB_gc_serial \
                           $(VM)/gc_implementation/includeDB_gc_shared
 
--- a/hotspot/make/solaris/makefiles/top.make	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/make/solaris/makefiles/top.make	Wed Jul 05 16:43:15 2017 +0200
@@ -54,6 +54,7 @@
                      $(VM)/gc_implementation/includeDB_gc_parallelScavenge \
                      $(VM)/gc_implementation/includeDB_gc_concurrentMarkSweep \
                      $(VM)/gc_implementation/includeDB_gc_parNew \
+                     $(VM)/gc_implementation/includeDB_gc_g1 \
                      $(VM)/gc_implementation/includeDB_gc_serial \
                      $(VM)/gc_implementation/includeDB_gc_shared
 
--- a/hotspot/make/windows/makefiles/generated.make	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/make/windows/makefiles/generated.make	Wed Jul 05 16:43:15 2017 +0200
@@ -50,7 +50,8 @@
            $(WorkSpace)/src/share/vm/gc_implementation/includeDB_gc_parallelScavenge \
            $(WorkSpace)/src/share/vm/gc_implementation/includeDB_gc_shared \
            $(WorkSpace)/src/share/vm/gc_implementation/includeDB_gc_parNew \
-           $(WorkSpace)/src/share/vm/gc_implementation/includeDB_gc_concurrentMarkSweep
+           $(WorkSpace)/src/share/vm/gc_implementation/includeDB_gc_concurrentMarkSweep \
+           $(WorkSpace)/src/share/vm/gc_implementation/includeDB_gc_g1
 
 IncludeDBs_core=$(IncludeDBs_base) $(IncludeDBs_gc) \
                 $(WorkSpace)/src/share/vm/includeDB_features
--- a/hotspot/make/windows/makefiles/makedeps.make	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/make/windows/makefiles/makedeps.make	Wed Jul 05 16:43:15 2017 +0200
@@ -64,6 +64,7 @@
         -relativeInclude src\share\vm\gc_implementation\shared \
         -relativeInclude src\share\vm\gc_implementation\parNew \
         -relativeInclude src\share\vm\gc_implementation\concurrentMarkSweep \
+        -relativeInclude src\share\vm\gc_implementation\g1 \
         -relativeInclude src\share\vm\gc_interface \
         -relativeInclude src\share\vm\asm \
         -relativeInclude src\share\vm\memory \
@@ -115,6 +116,7 @@
         -additionalFile includeDB_gc_parallel \
         -additionalFile includeDB_gc_parallelScavenge \
         -additionalFile includeDB_gc_concurrentMarkSweep \
+        -additionalFile includeDB_gc_g1 \
         -additionalFile includeDB_gc_parNew \
         -additionalFile includeDB_gc_shared \
         -additionalFile includeDB_gc_serial \
--- a/hotspot/make/windows/makefiles/vm.make	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/make/windows/makefiles/vm.make	Wed Jul 05 16:43:15 2017 +0200
@@ -117,6 +117,7 @@
   /I "$(WorkSpace)\src\share\vm\gc_implementation\shared"\
   /I "$(WorkSpace)\src\share\vm\gc_implementation\parNew"\
   /I "$(WorkSpace)\src\share\vm\gc_implementation\concurrentMarkSweep"\
+  /I "$(WorkSpace)\src\share\vm\gc_implementation\g1"\
   /I "$(WorkSpace)\src\share\vm\gc_interface"\
   /I "$(WorkSpace)\src\share\vm\asm"         \
   /I "$(WorkSpace)\src\share\vm\memory"      \
@@ -146,6 +147,7 @@
 VM_PATH=$(VM_PATH);$(WorkSpace)/src/share/vm/gc_implementation/shared
 VM_PATH=$(VM_PATH);$(WorkSpace)/src/share/vm/gc_implementation/parNew
 VM_PATH=$(VM_PATH);$(WorkSpace)/src/share/vm/gc_implementation/concurrentMarkSweep
+VM_PATH=$(VM_PATH);$(WorkSpace)/src/share/vm/gc_implementation/g1
 VM_PATH=$(VM_PATH);$(WorkSpace)/src/share/vm/gc_interface
 VM_PATH=$(VM_PATH);$(WorkSpace)/src/share/vm/asm
 VM_PATH=$(VM_PATH);$(WorkSpace)/src/share/vm/memory
@@ -222,6 +224,9 @@
 {$(WorkSpace)\src\share\vm\gc_implementation\concurrentMarkSweep}.cpp.obj::
         $(CPP) $(CPP_FLAGS) $(CPP_USE_PCH) /c $<
 
+{$(WorkSpace)\src\share\vm\gc_implementation\g1}.cpp.obj::
+        $(CPP) $(CPP_FLAGS) $(CPP_USE_PCH) /c $<
+
 {$(WorkSpace)\src\share\vm\gc_interface}.cpp.obj::
         $(CPP) $(CPP_FLAGS) $(CPP_USE_PCH) /c $<
 
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -130,6 +130,20 @@
   return 0x00;                  // illegal instruction 0x00000000
 }
 
+Assembler::Condition Assembler::reg_cond_to_cc_cond(Assembler::RCondition in) {
+  switch (in) {
+  case rc_z:   return equal;
+  case rc_lez: return lessEqual;
+  case rc_lz:  return less;
+  case rc_nz:  return notEqual;
+  case rc_gz:  return greater;
+  case rc_gez: return greaterEqual;
+  default:
+    ShouldNotReachHere();
+  }
+  return equal;
+}
+
 // Generate a bunch 'o stuff (including v9's
 #ifndef PRODUCT
 void Assembler::test_v9() {
@@ -1213,31 +1227,19 @@
 }
 
 
-void MacroAssembler::store_check(Register tmp, Register obj) {
-  // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
-
-  /* $$$ This stuff needs to go into one of the BarrierSet generator
-     functions.  (The particular barrier sets will have to be friends of
-     MacroAssembler, I guess.) */
-  BarrierSet* bs = Universe::heap()->barrier_set();
-  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
-  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
-  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+void MacroAssembler::card_table_write(jbyte* byte_map_base,
+                                      Register tmp, Register obj) {
 #ifdef _LP64
   srlx(obj, CardTableModRefBS::card_shift, obj);
 #else
   srl(obj, CardTableModRefBS::card_shift, obj);
 #endif
   assert( tmp != obj, "need separate temp reg");
-  Address rs(tmp, (address)ct->byte_map_base);
+  Address rs(tmp, (address)byte_map_base);
   load_address(rs);
   stb(G0, rs.base(), obj);
 }
 
-void MacroAssembler::store_check(Register tmp, Register obj, Register offset) {
-  store_check(tmp, obj);
-}
-
 // %%% Note:  The following six instructions have been moved,
 //            unchanged, from assembler_sparc.inline.hpp.
 //            They will be refactored at a later date.
@@ -1663,11 +1665,21 @@
 
   if (reg == G0)  return;       // always NULL, which is always an oop
 
-  char buffer[16];
+  char buffer[64];
+#ifdef COMPILER1
+  if (CommentedAssembly) {
+    snprintf(buffer, sizeof(buffer), "verify_oop at %d", offset());
+    block_comment(buffer);
+  }
+#endif
+
+  int len = strlen(file) + strlen(msg) + 1 + 4;
   sprintf(buffer, "%d", line);
-  int len = strlen(file) + strlen(msg) + 1 + 4 + strlen(buffer);
+  len += strlen(buffer);
+  sprintf(buffer, " at offset %d ", offset());
+  len += strlen(buffer);
   char * real_msg = new char[len];
-  sprintf(real_msg, "%s (%s:%d)", msg, file, line);
+  sprintf(real_msg, "%s%s(%s:%d)", msg, buffer, file, line);
 
   // Call indirectly to solve generation ordering problem
   Address a(O7, (address)StubRoutines::verify_oop_subroutine_entry_address());
@@ -2059,6 +2071,27 @@
 #endif
 }
 
+void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p,
+                                     Register s1, address d,
+                                     relocInfo::relocType rt ) {
+  if (VM_Version::v9_instructions_work()) {
+    bpr(rc, a, p, s1, d, rt);
+  } else {
+    tst(s1);
+    br(reg_cond_to_cc_cond(rc), a, p, d, rt);
+  }
+}
+
+void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p,
+                                     Register s1, Label& L ) {
+  if (VM_Version::v9_instructions_work()) {
+    bpr(rc, a, p, s1, L);
+  } else {
+    tst(s1);
+    br(reg_cond_to_cc_cond(rc), a, p, L);
+  }
+}
+
 
 // instruction sequences factored across compiler & interpreter
 
@@ -3241,68 +3274,74 @@
   assert(0 <= con_size_in_bytes && Assembler::is_simm13(con_size_in_bytes), "illegal object size");
   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
 
-  // get eden boundaries
-  // note: we need both top & top_addr!
-  const Register top_addr = t1;
-  const Register end      = t2;
-
-  CollectedHeap* ch = Universe::heap();
-  set((intx)ch->top_addr(), top_addr);
-  intx delta = (intx)ch->end_addr() - (intx)ch->top_addr();
-  ld_ptr(top_addr, delta, end);
-  ld_ptr(top_addr, 0, obj);
-
-  // try to allocate
-  Label retry;
-  bind(retry);
+  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
+    // No allocation in the shared eden.
+    br(Assembler::always, false, Assembler::pt, slow_case);
+    delayed()->nop();
+  } else {
+    // get eden boundaries
+    // note: we need both top & top_addr!
+    const Register top_addr = t1;
+    const Register end      = t2;
+
+    CollectedHeap* ch = Universe::heap();
+    set((intx)ch->top_addr(), top_addr);
+    intx delta = (intx)ch->end_addr() - (intx)ch->top_addr();
+    ld_ptr(top_addr, delta, end);
+    ld_ptr(top_addr, 0, obj);
+
+    // try to allocate
+    Label retry;
+    bind(retry);
 #ifdef ASSERT
-  // make sure eden top is properly aligned
-  {
-    Label L;
-    btst(MinObjAlignmentInBytesMask, obj);
-    br(Assembler::zero, false, Assembler::pt, L);
-    delayed()->nop();
-    stop("eden top is not properly aligned");
-    bind(L);
+    // make sure eden top is properly aligned
+    {
+      Label L;
+      btst(MinObjAlignmentInBytesMask, obj);
+      br(Assembler::zero, false, Assembler::pt, L);
+      delayed()->nop();
+      stop("eden top is not properly aligned");
+      bind(L);
+    }
+#endif // ASSERT
+    const Register free = end;
+    sub(end, obj, free);                                   // compute amount of free space
+    if (var_size_in_bytes->is_valid()) {
+      // size is unknown at compile time
+      cmp(free, var_size_in_bytes);
+      br(Assembler::lessUnsigned, false, Assembler::pn, slow_case); // if there is not enough space go the slow case
+      delayed()->add(obj, var_size_in_bytes, end);
+    } else {
+      // size is known at compile time
+      cmp(free, con_size_in_bytes);
+      br(Assembler::lessUnsigned, false, Assembler::pn, slow_case); // if there is not enough space go the slow case
+      delayed()->add(obj, con_size_in_bytes, end);
+    }
+    // Compare obj with the value at top_addr; if still equal, swap the value of
+    // end with the value at top_addr. If not equal, read the value at top_addr
+    // into end.
+    casx_under_lock(top_addr, obj, end, (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr());
+    // if someone beat us on the allocation, try again, otherwise continue
+    cmp(obj, end);
+    brx(Assembler::notEqual, false, Assembler::pn, retry);
+    delayed()->mov(end, obj);                              // nop if successfull since obj == end
+
+#ifdef ASSERT
+    // make sure eden top is properly aligned
+    {
+      Label L;
+      const Register top_addr = t1;
+
+      set((intx)ch->top_addr(), top_addr);
+      ld_ptr(top_addr, 0, top_addr);
+      btst(MinObjAlignmentInBytesMask, top_addr);
+      br(Assembler::zero, false, Assembler::pt, L);
+      delayed()->nop();
+      stop("eden top is not properly aligned");
+      bind(L);
+    }
+#endif // ASSERT
   }
-#endif // ASSERT
-  const Register free = end;
-  sub(end, obj, free);                                   // compute amount of free space
-  if (var_size_in_bytes->is_valid()) {
-    // size is unknown at compile time
-    cmp(free, var_size_in_bytes);
-    br(Assembler::lessUnsigned, false, Assembler::pn, slow_case); // if there is not enough space go the slow case
-    delayed()->add(obj, var_size_in_bytes, end);
-  } else {
-    // size is known at compile time
-    cmp(free, con_size_in_bytes);
-    br(Assembler::lessUnsigned, false, Assembler::pn, slow_case); // if there is not enough space go the slow case
-    delayed()->add(obj, con_size_in_bytes, end);
-  }
-  // Compare obj with the value at top_addr; if still equal, swap the value of
-  // end with the value at top_addr. If not equal, read the value at top_addr
-  // into end.
-  casx_under_lock(top_addr, obj, end, (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr());
-  // if someone beat us on the allocation, try again, otherwise continue
-  cmp(obj, end);
-  brx(Assembler::notEqual, false, Assembler::pn, retry);
-  delayed()->mov(end, obj);                              // nop if successfull since obj == end
-
-#ifdef ASSERT
-  // make sure eden top is properly aligned
-  {
-    Label L;
-    const Register top_addr = t1;
-
-    set((intx)ch->top_addr(), top_addr);
-    ld_ptr(top_addr, 0, top_addr);
-    btst(MinObjAlignmentInBytesMask, top_addr);
-    br(Assembler::zero, false, Assembler::pt, L);
-    delayed()->nop();
-    stop("eden top is not properly aligned");
-    bind(L);
-  }
-#endif // ASSERT
 }
 
 
@@ -3554,6 +3593,468 @@
   }
 }
 
+///////////////////////////////////////////////////////////////////////////////////
+#ifndef SERIALGC
+
+static uint num_stores = 0;
+static uint num_null_pre_stores = 0;
+
+static void count_null_pre_vals(void* pre_val) {
+  num_stores++;
+  if (pre_val == NULL) num_null_pre_stores++;
+  if ((num_stores % 1000000) == 0) {
+    tty->print_cr(UINT32_FORMAT " stores, " UINT32_FORMAT " (%5.2f%%) with null pre-vals.",
+                  num_stores, num_null_pre_stores,
+                  100.0*(float)num_null_pre_stores/(float)num_stores);
+  }
+}
+
+static address satb_log_enqueue_with_frame = 0;
+static u_char* satb_log_enqueue_with_frame_end = 0;
+
+static address satb_log_enqueue_frameless = 0;
+static u_char* satb_log_enqueue_frameless_end = 0;
+
+static int EnqueueCodeSize = 128 DEBUG_ONLY( + 256); // Instructions?
+
+// The calls to this don't work.  We'd need to do a fair amount of work to
+// make it work.
+static void check_index(int ind) {
+  assert(0 <= ind && ind <= 64*K && ((ind % oopSize) == 0),
+         "Invariants.")
+}
+
+static void generate_satb_log_enqueue(bool with_frame) {
+  BufferBlob* bb = BufferBlob::create("enqueue_with_frame", EnqueueCodeSize);
+  CodeBuffer buf(bb->instructions_begin(), bb->instructions_size());
+  MacroAssembler masm(&buf);
+  address start = masm.pc();
+  Register pre_val;
+
+  Label refill, restart;
+  if (with_frame) {
+    masm.save_frame(0);
+    pre_val = I0;  // Was O0 before the save.
+  } else {
+    pre_val = O0;
+  }
+  int satb_q_index_byte_offset =
+    in_bytes(JavaThread::satb_mark_queue_offset() +
+             PtrQueue::byte_offset_of_index());
+  int satb_q_buf_byte_offset =
+    in_bytes(JavaThread::satb_mark_queue_offset() +
+             PtrQueue::byte_offset_of_buf());
+  assert(in_bytes(PtrQueue::byte_width_of_index()) == sizeof(intptr_t) &&
+         in_bytes(PtrQueue::byte_width_of_buf()) == sizeof(intptr_t),
+         "check sizes in assembly below");
+
+  masm.bind(restart);
+  masm.ld_ptr(G2_thread, satb_q_index_byte_offset, L0);
+
+  masm.br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn, L0, refill);
+  // If the branch is taken, no harm in executing this in the delay slot.
+  masm.delayed()->ld_ptr(G2_thread, satb_q_buf_byte_offset, L1);
+  masm.sub(L0, oopSize, L0);
+
+  masm.st_ptr(pre_val, L1, L0);  // [_buf + index] := I0
+  if (!with_frame) {
+    // Use return-from-leaf
+    masm.retl();
+    masm.delayed()->st_ptr(L0, G2_thread, satb_q_index_byte_offset);
+  } else {
+    // Not delayed.
+    masm.st_ptr(L0, G2_thread, satb_q_index_byte_offset);
+  }
+  if (with_frame) {
+    masm.ret();
+    masm.delayed()->restore();
+  }
+  masm.bind(refill);
+
+  address handle_zero =
+    CAST_FROM_FN_PTR(address,
+                     &SATBMarkQueueSet::handle_zero_index_for_thread);
+  // This should be rare enough that we can afford to save all the
+  // scratch registers that the calling context might be using.
+  masm.mov(G1_scratch, L0);
+  masm.mov(G3_scratch, L1);
+  masm.mov(G4, L2);
+  // We need the value of O0 above (for the write into the buffer), so we
+  // save and restore it.
+  masm.mov(O0, L3);
+  // Since the call will overwrite O7, we save and restore that, as well.
+  masm.mov(O7, L4);
+  masm.call_VM_leaf(L5, handle_zero, G2_thread);
+  masm.mov(L0, G1_scratch);
+  masm.mov(L1, G3_scratch);
+  masm.mov(L2, G4);
+  masm.mov(L3, O0);
+  masm.br(Assembler::always, /*annul*/false, Assembler::pt, restart);
+  masm.delayed()->mov(L4, O7);
+
+  if (with_frame) {
+    satb_log_enqueue_with_frame = start;
+    satb_log_enqueue_with_frame_end = masm.pc();
+  } else {
+    satb_log_enqueue_frameless = start;
+    satb_log_enqueue_frameless_end = masm.pc();
+  }
+}
+
+static inline void generate_satb_log_enqueue_if_necessary(bool with_frame) {
+  if (with_frame) {
+    if (satb_log_enqueue_with_frame == 0) {
+      generate_satb_log_enqueue(with_frame);
+      assert(satb_log_enqueue_with_frame != 0, "postcondition.");
+      if (G1SATBPrintStubs) {
+        tty->print_cr("Generated with-frame satb enqueue:");
+        Disassembler::decode((u_char*)satb_log_enqueue_with_frame,
+                             satb_log_enqueue_with_frame_end,
+                             tty);
+      }
+    }
+  } else {
+    if (satb_log_enqueue_frameless == 0) {
+      generate_satb_log_enqueue(with_frame);
+      assert(satb_log_enqueue_frameless != 0, "postcondition.");
+      if (G1SATBPrintStubs) {
+        tty->print_cr("Generated frameless satb enqueue:");
+        Disassembler::decode((u_char*)satb_log_enqueue_frameless,
+                             satb_log_enqueue_frameless_end,
+                             tty);
+      }
+    }
+  }
+}
+
+void MacroAssembler::g1_write_barrier_pre(Register obj, Register index, int offset, Register tmp, bool preserve_o_regs) {
+  assert(offset == 0 || index == noreg, "choose one");
+
+  if (G1DisablePreBarrier) return;
+  // satb_log_barrier(tmp, obj, offset, preserve_o_regs);
+  Label filtered;
+  // satb_log_barrier_work0(tmp, filtered);
+  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
+    ld(G2,
+       in_bytes(JavaThread::satb_mark_queue_offset() +
+                PtrQueue::byte_offset_of_active()),
+       tmp);
+  } else {
+    guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1,
+              "Assumption");
+    ldsb(G2,
+         in_bytes(JavaThread::satb_mark_queue_offset() +
+                  PtrQueue::byte_offset_of_active()),
+         tmp);
+  }
+  // Check on whether to annul.
+  br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, tmp, filtered);
+  delayed() -> nop();
+
+  // satb_log_barrier_work1(tmp, offset);
+  if (index == noreg) {
+    if (Assembler::is_simm13(offset)) {
+      ld_ptr(obj, offset, tmp);
+    } else {
+      set(offset, tmp);
+      ld_ptr(obj, tmp, tmp);
+    }
+  } else {
+    ld_ptr(obj, index, tmp);
+  }
+
+  // satb_log_barrier_work2(obj, tmp, offset);
+
+  // satb_log_barrier_work3(tmp, filtered, preserve_o_regs);
+
+  const Register pre_val = tmp;
+
+  if (G1SATBBarrierPrintNullPreVals) {
+    save_frame(0);
+    mov(pre_val, O0);
+    // Save G-regs that target may use.
+    mov(G1, L1);
+    mov(G2, L2);
+    mov(G3, L3);
+    mov(G4, L4);
+    mov(G5, L5);
+    call(CAST_FROM_FN_PTR(address, &count_null_pre_vals));
+    delayed()->nop();
+    // Restore G-regs that target may have used.
+    mov(L1, G1);
+    mov(L2, G2);
+    mov(L3, G3);
+    mov(L4, G4);
+    mov(L5, G5);
+    restore(G0, G0, G0);
+  }
+
+  // Check on whether to annul.
+  br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, pre_val, filtered);
+  delayed() -> nop();
+
+  // OK, it's not filtered, so we'll need to call enqueue.  In the normal
+  // case, pre_val will be a scratch G-reg, but there's some cases in which
+  // it's an O-reg.  In the first case, do a normal call.  In the latter,
+  // do a save here and call the frameless version.
+
+  guarantee(pre_val->is_global() || pre_val->is_out(),
+            "Or we need to think harder.");
+  if (pre_val->is_global() && !preserve_o_regs) {
+    generate_satb_log_enqueue_if_necessary(true); // with frame.
+    call(satb_log_enqueue_with_frame);
+    delayed()->mov(pre_val, O0);
+  } else {
+    generate_satb_log_enqueue_if_necessary(false); // with frameless.
+    save_frame(0);
+    call(satb_log_enqueue_frameless);
+    delayed()->mov(pre_val->after_save(), O0);
+    restore();
+  }
+
+  bind(filtered);
+}
+
+static jint num_ct_writes = 0;
+static jint num_ct_writes_filtered_in_hr = 0;
+static jint num_ct_writes_filtered_null = 0;
+static jint num_ct_writes_filtered_pop = 0;
+static G1CollectedHeap* g1 = NULL;
+
+static Thread* count_ct_writes(void* filter_val, void* new_val) {
+  Atomic::inc(&num_ct_writes);
+  if (filter_val == NULL) {
+    Atomic::inc(&num_ct_writes_filtered_in_hr);
+  } else if (new_val == NULL) {
+    Atomic::inc(&num_ct_writes_filtered_null);
+  } else {
+    if (g1 == NULL) {
+      g1 = G1CollectedHeap::heap();
+    }
+    if ((HeapWord*)new_val < g1->popular_object_boundary()) {
+      Atomic::inc(&num_ct_writes_filtered_pop);
+    }
+  }
+  if ((num_ct_writes % 1000000) == 0) {
+    jint num_ct_writes_filtered =
+      num_ct_writes_filtered_in_hr +
+      num_ct_writes_filtered_null +
+      num_ct_writes_filtered_pop;
+
+    tty->print_cr("%d potential CT writes: %5.2f%% filtered\n"
+                  "   (%5.2f%% intra-HR, %5.2f%% null, %5.2f%% popular).",
+                  num_ct_writes,
+                  100.0*(float)num_ct_writes_filtered/(float)num_ct_writes,
+                  100.0*(float)num_ct_writes_filtered_in_hr/
+                  (float)num_ct_writes,
+                  100.0*(float)num_ct_writes_filtered_null/
+                  (float)num_ct_writes,
+                  100.0*(float)num_ct_writes_filtered_pop/
+                  (float)num_ct_writes);
+  }
+  return Thread::current();
+}
+
+static address dirty_card_log_enqueue = 0;
+static u_char* dirty_card_log_enqueue_end = 0;
+
+// This gets to assume that o0 contains the object address.
+static void generate_dirty_card_log_enqueue(jbyte* byte_map_base) {
+  BufferBlob* bb = BufferBlob::create("dirty_card_enqueue", EnqueueCodeSize*2);
+  CodeBuffer buf(bb->instructions_begin(), bb->instructions_size());
+  MacroAssembler masm(&buf);
+  address start = masm.pc();
+
+  Label not_already_dirty, restart, refill;
+
+#ifdef _LP64
+  masm.srlx(O0, CardTableModRefBS::card_shift, O0);
+#else
+  masm.srl(O0, CardTableModRefBS::card_shift, O0);
+#endif
+  Address rs(O1, (address)byte_map_base);
+  masm.load_address(rs); // O1 := <card table base>
+  masm.ldub(O0, O1, O2); // O2 := [O0 + O1]
+
+  masm.br_on_reg_cond(Assembler::rc_nz, /*annul*/false, Assembler::pt,
+                      O2, not_already_dirty);
+  // Get O1 + O2 into a reg by itself -- useful in the take-the-branch
+  // case, harmless if not.
+  masm.delayed()->add(O0, O1, O3);
+
+  // We didn't take the branch, so we're already dirty: return.
+  // Use return-from-leaf
+  masm.retl();
+  masm.delayed()->nop();
+
+  // Not dirty.
+  masm.bind(not_already_dirty);
+  // First, dirty it.
+  masm.stb(G0, O3, G0);  // [cardPtr] := 0  (i.e., dirty).
+  int dirty_card_q_index_byte_offset =
+    in_bytes(JavaThread::dirty_card_queue_offset() +
+             PtrQueue::byte_offset_of_index());
+  int dirty_card_q_buf_byte_offset =
+    in_bytes(JavaThread::dirty_card_queue_offset() +
+             PtrQueue::byte_offset_of_buf());
+  masm.bind(restart);
+  masm.ld_ptr(G2_thread, dirty_card_q_index_byte_offset, L0);
+
+  masm.br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn,
+                      L0, refill);
+  // If the branch is taken, no harm in executing this in the delay slot.
+  masm.delayed()->ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, L1);
+  masm.sub(L0, oopSize, L0);
+
+  masm.st_ptr(O3, L1, L0);  // [_buf + index] := I0
+  // Use return-from-leaf
+  masm.retl();
+  masm.delayed()->st_ptr(L0, G2_thread, dirty_card_q_index_byte_offset);
+
+  masm.bind(refill);
+  address handle_zero =
+    CAST_FROM_FN_PTR(address,
+                     &DirtyCardQueueSet::handle_zero_index_for_thread);
+  // This should be rare enough that we can afford to save all the
+  // scratch registers that the calling context might be using.
+  masm.mov(G1_scratch, L3);
+  masm.mov(G3_scratch, L5);
+  // We need the value of O3 above (for the write into the buffer), so we
+  // save and restore it.
+  masm.mov(O3, L6);
+  // Since the call will overwrite O7, we save and restore that, as well.
+  masm.mov(O7, L4);
+
+  masm.call_VM_leaf(L7_thread_cache, handle_zero, G2_thread);
+  masm.mov(L3, G1_scratch);
+  masm.mov(L5, G3_scratch);
+  masm.mov(L6, O3);
+  masm.br(Assembler::always, /*annul*/false, Assembler::pt, restart);
+  masm.delayed()->mov(L4, O7);
+
+  dirty_card_log_enqueue = start;
+  dirty_card_log_enqueue_end = masm.pc();
+  // XXX Should have a guarantee here about not going off the end!
+  // Does it already do so?  Do an experiment...
+}
+
+static inline void
+generate_dirty_card_log_enqueue_if_necessary(jbyte* byte_map_base) {
+  if (dirty_card_log_enqueue == 0) {
+    generate_dirty_card_log_enqueue(byte_map_base);
+    assert(dirty_card_log_enqueue != 0, "postcondition.");
+    if (G1SATBPrintStubs) {
+      tty->print_cr("Generated dirty_card enqueue:");
+      Disassembler::decode((u_char*)dirty_card_log_enqueue,
+                           dirty_card_log_enqueue_end,
+                           tty);
+    }
+  }
+}
+
+
+void MacroAssembler::g1_write_barrier_post(Register store_addr, Register new_val, Register tmp) {
+
+  Label filtered;
+  MacroAssembler* post_filter_masm = this;
+
+  if (new_val == G0) return;
+  if (G1DisablePostBarrier) return;
+
+  G1SATBCardTableModRefBS* bs = (G1SATBCardTableModRefBS*) Universe::heap()->barrier_set();
+  assert(bs->kind() == BarrierSet::G1SATBCT ||
+         bs->kind() == BarrierSet::G1SATBCTLogging, "wrong barrier");
+  if (G1RSBarrierRegionFilter) {
+    xor3(store_addr, new_val, tmp);
+#ifdef _LP64
+    srlx(tmp, HeapRegion::LogOfHRGrainBytes, tmp);
+#else
+    srl(tmp, HeapRegion::LogOfHRGrainBytes, tmp);
+#endif
+    if (G1PrintCTFilterStats) {
+      guarantee(tmp->is_global(), "Or stats won't work...");
+      // This is a sleazy hack: I'm temporarily hijacking G2, which I
+      // promise to restore.
+      mov(new_val, G2);
+      save_frame(0);
+      mov(tmp, O0);
+      mov(G2, O1);
+      // Save G-regs that target may use.
+      mov(G1, L1);
+      mov(G2, L2);
+      mov(G3, L3);
+      mov(G4, L4);
+      mov(G5, L5);
+      call(CAST_FROM_FN_PTR(address, &count_ct_writes));
+      delayed()->nop();
+      mov(O0, G2);
+      // Restore G-regs that target may have used.
+      mov(L1, G1);
+      mov(L3, G3);
+      mov(L4, G4);
+      mov(L5, G5);
+      restore(G0, G0, G0);
+    }
+    // XXX Should I predict this taken or not?  Does it mattern?
+    br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, tmp, filtered);
+    delayed()->nop();
+  }
+
+  // Now we decide how to generate the card table write.  If we're
+  // enqueueing, we call out to a generated function.  Otherwise, we do it
+  // inline here.
+
+  if (G1RSBarrierUseQueue) {
+    // If the "store_addr" register is an "in" or "local" register, move it to
+    // a scratch reg so we can pass it as an argument.
+    bool use_scr = !(store_addr->is_global() || store_addr->is_out());
+    // Pick a scratch register different from "tmp".
+    Register scr = (tmp == G1_scratch ? G3_scratch : G1_scratch);
+    // Make sure we use up the delay slot!
+    if (use_scr) {
+      post_filter_masm->mov(store_addr, scr);
+    } else {
+      post_filter_masm->nop();
+    }
+    generate_dirty_card_log_enqueue_if_necessary(bs->byte_map_base);
+    save_frame(0);
+    call(dirty_card_log_enqueue);
+    if (use_scr) {
+      delayed()->mov(scr, O0);
+    } else {
+      delayed()->mov(store_addr->after_save(), O0);
+    }
+    restore();
+
+  } else {
+
+#ifdef _LP64
+    post_filter_masm->srlx(store_addr, CardTableModRefBS::card_shift, store_addr);
+#else
+    post_filter_masm->srl(store_addr, CardTableModRefBS::card_shift, store_addr);
+#endif
+    assert( tmp != store_addr, "need separate temp reg");
+    Address rs(tmp, (address)bs->byte_map_base);
+    load_address(rs);
+    stb(G0, rs.base(), store_addr);
+  }
+
+  bind(filtered);
+
+}
+
+#endif  // SERIALGC
+///////////////////////////////////////////////////////////////////////////////////
+
+void MacroAssembler::card_write_barrier_post(Register store_addr, Register new_val, Register tmp) {
+  // If we're writing constant NULL, we can skip the write barrier.
+  if (new_val == G0) return;
+  CardTableModRefBS* bs = (CardTableModRefBS*) Universe::heap()->barrier_set();
+  assert(bs->kind() == BarrierSet::CardTableModRef ||
+         bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
+  card_table_write(bs->byte_map_base, tmp, store_addr);
+}
+
 void MacroAssembler::load_klass(Register src_oop, Register klass) {
   // The number of bytes in this code is used by
   // MachCallDynamicJavaNode::ret_addr_offset()
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -1439,7 +1439,11 @@
   // pp 214
 
   void save(    Register s1, Register s2, Register d ) { emit_long( op(arith_op) | rd(d) | op3(save_op3) | rs1(s1) | rs2(s2) ); }
-  void save(    Register s1, int simm13a, Register d ) { emit_long( op(arith_op) | rd(d) | op3(save_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
+  void save(    Register s1, int simm13a, Register d ) {
+    // make sure frame is at least large enough for the register save area
+    assert(-simm13a >= 16 * wordSize, "frame too small");
+    emit_long( op(arith_op) | rd(d) | op3(save_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) );
+  }
 
   void restore( Register s1 = G0,  Register s2 = G0, Register d = G0 ) { emit_long( op(arith_op) | rd(d) | op3(restore_op3) | rs1(s1) | rs2(s2) ); }
   void restore( Register s1,       int simm13a,      Register d      ) { emit_long( op(arith_op) | rd(d) | op3(restore_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
@@ -1594,6 +1598,11 @@
   inline void wrasi(  Register d) { v9_only(); emit_long( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(3, 29, 25)); }
   inline void wrfprs( Register d) { v9_only(); emit_long( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(6, 29, 25)); }
 
+  // For a given register condition, return the appropriate condition code
+  // Condition (the one you would use to get the same effect after "tst" on
+  // the target register.)
+  Assembler::Condition reg_cond_to_cc_cond(RCondition in);
+
 
   // Creation
   Assembler(CodeBuffer* code) : AbstractAssembler(code) {
@@ -1630,6 +1639,8 @@
 
   // restore global registers in case C code disturbed them
   static void restore_registers(MacroAssembler* a, Register r);
+
+
 };
 
 
@@ -1722,6 +1733,12 @@
   void br_null   ( Register s1, bool a, Predict p, Label& L );
   void br_notnull( Register s1, bool a, Predict p, Label& L );
 
+  // These versions will do the most efficient thing on v8 and v9.  Perhaps
+  // this is what the routine above was meant to do, but it didn't (and
+  // didn't cover both target address kinds.)
+  void br_on_reg_cond( RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt = relocInfo::none );
+  void br_on_reg_cond( RCondition c, bool a, Predict p, Register s1, Label& L);
+
   inline void bp( Condition c, bool a, CC cc, Predict p, address d, relocInfo::relocType rt = relocInfo::none );
   inline void bp( Condition c, bool a, CC cc, Predict p, Label& L );
 
@@ -2056,9 +2073,23 @@
 #endif // ASSERT
 
  public:
-  // Stores
-  void store_check(Register tmp, Register obj);                // store check for obj - register is destroyed afterwards
-  void store_check(Register tmp, Register obj, Register offset); // store check for obj - register is destroyed afterwards
+
+  // Write to card table for - register is destroyed afterwards.
+  void card_table_write(jbyte* byte_map_base, Register tmp, Register obj);
+
+  void card_write_barrier_post(Register store_addr, Register new_val, Register tmp);
+
+#ifndef SERIALGC
+  // Array store and offset
+  void g1_write_barrier_pre(Register obj, Register index, int offset, Register tmp, bool preserve_o_regs);
+
+  void g1_write_barrier_post(Register store_addr, Register new_val, Register tmp);
+
+  // May do filtering, depending on the boolean arguments.
+  void g1_card_table_write(jbyte* byte_map_base,
+                           Register tmp, Register obj, Register new_val,
+                           bool region_filter, bool null_filter);
+#endif // SERIALGC
 
   // pushes double TOS element of FPU stack on CPU stack; pops from FPU stack
   void push_fTOS();
--- a/hotspot/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -404,4 +404,55 @@
 }
 
 
+///////////////////////////////////////////////////////////////////////////////////
+#ifndef SERIALGC
+
+void G1PreBarrierStub::emit_code(LIR_Assembler* ce) {
+  __ bind(_entry);
+
+  assert(pre_val()->is_register(), "Precondition.");
+
+  Register pre_val_reg = pre_val()->as_register();
+
+  ce->mem2reg(addr(), pre_val(), T_OBJECT, patch_code(), info(), false);
+  __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
+                    pre_val_reg, _continuation);
+  __ delayed()->nop();
+
+  __ call(Runtime1::entry_for(Runtime1::Runtime1::g1_pre_barrier_slow_id));
+  __ delayed()->mov(pre_val_reg, G4);
+  __ br(Assembler::always, false, Assembler::pt, _continuation);
+  __ delayed()->nop();
+
+}
+
+jbyte* G1PostBarrierStub::_byte_map_base = NULL;
+
+jbyte* G1PostBarrierStub::byte_map_base_slow() {
+  BarrierSet* bs = Universe::heap()->barrier_set();
+  assert(bs->is_a(BarrierSet::G1SATBCTLogging),
+         "Must be if we're using this.");
+  return ((G1SATBCardTableModRefBS*)bs)->byte_map_base;
+}
+
+void G1PostBarrierStub::emit_code(LIR_Assembler* ce) {
+  __ bind(_entry);
+
+  assert(addr()->is_register(), "Precondition.");
+  assert(new_val()->is_register(), "Precondition.");
+  Register addr_reg = addr()->as_pointer_register();
+  Register new_val_reg = new_val()->as_register();
+  __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
+                    new_val_reg, _continuation);
+  __ delayed()->nop();
+
+  __ call(Runtime1::entry_for(Runtime1::Runtime1::g1_post_barrier_slow_id));
+  __ delayed()->mov(addr_reg, G4);
+  __ br(Assembler::always, false, Assembler::pt, _continuation);
+  __ delayed()->nop();
+}
+
+#endif // SERIALGC
+///////////////////////////////////////////////////////////////////////////////////
+
 #undef __
--- a/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -2093,7 +2093,11 @@
   // the known type isn't loaded since the code sanity checks
   // in debug mode and the type isn't required when we know the exact type
   // also check that the type is an array type.
-  if (op->expected_type() == NULL) {
+  // We also, for now, always call the stub if the barrier set requires a
+  // write_ref_pre barrier (which the stub does, but none of the optimized
+  // cases currently does).
+  if (op->expected_type() == NULL ||
+      Universe::heap()->barrier_set()->has_write_ref_pre_barrier()) {
     __ mov(src,     O0);
     __ mov(src_pos, O1);
     __ mov(dst,     O2);
--- a/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -365,6 +365,10 @@
     __ store_check(value.result(), array.result(), tmp1, tmp2, tmp3, store_check_info);
   }
 
+  if (obj_store) {
+    // Needs GC write barriers.
+    pre_barrier(LIR_OprFact::address(array_addr), false, NULL);
+  }
   __ move(value.result(), array_addr, null_check_info);
   if (obj_store) {
     // Is this precise?
@@ -663,6 +667,10 @@
 
   __ add(obj.result(), offset.result(), addr);
 
+  if (type == objectType) {  // Write-barrier needed for Object fields.
+    pre_barrier(obj.result(), false, NULL);
+  }
+
   if (type == objectType)
     __ cas_obj(addr, cmp.result(), val.result(), t1, t2);
   else if (type == intType)
@@ -677,7 +685,11 @@
   LIR_Opr result = rlock_result(x);
   __ cmove(lir_cond_equal, LIR_OprFact::intConst(1), LIR_OprFact::intConst(0), result);
   if (type == objectType) {  // Write-barrier needed for Object fields.
+#ifdef PRECISE_CARDMARK
+    post_barrier(addr, val.result());
+#else
     post_barrier(obj.result(), val.result());
+#endif // PRECISE_CARDMARK
   }
 }
 
@@ -1154,6 +1166,10 @@
         addr = new LIR_Address(base_op, index_op, type);
       }
 
+      if (is_obj) {
+        pre_barrier(LIR_OprFact::address(addr), false, NULL);
+        // _bs->c1_write_barrier_pre(this, LIR_OprFact::address(addr));
+      }
       __ move(data, addr);
       if (is_obj) {
         // This address is precise
--- a/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -832,6 +832,163 @@
       }
       break;
 
+#ifndef SERIALGC
+    case g1_pre_barrier_slow_id:
+      { // G4: previous value of memory
+        BarrierSet* bs = Universe::heap()->barrier_set();
+        if (bs->kind() != BarrierSet::G1SATBCTLogging) {
+          __ save_frame(0);
+          __ set((int)id, O1);
+          __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, unimplemented_entry), I0);
+          __ should_not_reach_here();
+          break;
+        }
+
+        __ set_info("g1_pre_barrier_slow_id", dont_gc_arguments);
+
+        Register pre_val = G4;
+        Register tmp  = G1_scratch;
+        Register tmp2 = G3_scratch;
+
+        Label refill, restart;
+        bool with_frame = false; // I don't know if we can do with-frame.
+        int satb_q_index_byte_offset =
+          in_bytes(JavaThread::satb_mark_queue_offset() +
+                   PtrQueue::byte_offset_of_index());
+        int satb_q_buf_byte_offset =
+          in_bytes(JavaThread::satb_mark_queue_offset() +
+                   PtrQueue::byte_offset_of_buf());
+        __ bind(restart);
+        __ ld_ptr(G2_thread, satb_q_index_byte_offset, tmp);
+
+        __ br_on_reg_cond(Assembler::rc_z, /*annul*/false,
+                          Assembler::pn, tmp, refill);
+
+        // If the branch is taken, no harm in executing this in the delay slot.
+        __ delayed()->ld_ptr(G2_thread, satb_q_buf_byte_offset, tmp2);
+        __ sub(tmp, oopSize, tmp);
+
+        __ st_ptr(pre_val, tmp2, tmp);  // [_buf + index] := <address_of_card>
+        // Use return-from-leaf
+        __ retl();
+        __ delayed()->st_ptr(tmp, G2_thread, satb_q_index_byte_offset);
+
+        __ bind(refill);
+        __ save_frame(0);
+
+        __ mov(pre_val, L0);
+        __ mov(tmp,     L1);
+        __ mov(tmp2,    L2);
+
+        __ call_VM_leaf(L7_thread_cache,
+                        CAST_FROM_FN_PTR(address,
+                                         SATBMarkQueueSet::handle_zero_index_for_thread),
+                                         G2_thread);
+
+        __ mov(L0, pre_val);
+        __ mov(L1, tmp);
+        __ mov(L2, tmp2);
+
+        __ br(Assembler::always, /*annul*/false, Assembler::pt, restart);
+        __ delayed()->restore();
+      }
+      break;
+
+    case g1_post_barrier_slow_id:
+      {
+        BarrierSet* bs = Universe::heap()->barrier_set();
+        if (bs->kind() != BarrierSet::G1SATBCTLogging) {
+          __ save_frame(0);
+          __ set((int)id, O1);
+          __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, unimplemented_entry), I0);
+          __ should_not_reach_here();
+          break;
+        }
+
+        __ set_info("g1_post_barrier_slow_id", dont_gc_arguments);
+
+        Register addr = G4;
+        Register cardtable = G5;
+        Register tmp  = G1_scratch;
+        Register tmp2 = G3_scratch;
+        jbyte* byte_map_base = ((CardTableModRefBS*)bs)->byte_map_base;
+
+        Label not_already_dirty, restart, refill;
+
+#ifdef _LP64
+        __ srlx(addr, CardTableModRefBS::card_shift, addr);
+#else
+        __ srl(addr, CardTableModRefBS::card_shift, addr);
+#endif
+
+        Address rs(cardtable, (address)byte_map_base);
+        __ load_address(rs); // cardtable := <card table base>
+        __ ldub(addr, cardtable, tmp); // tmp := [addr + cardtable]
+
+        __ br_on_reg_cond(Assembler::rc_nz, /*annul*/false, Assembler::pt,
+                          tmp, not_already_dirty);
+        // Get cardtable + tmp into a reg by itself -- useful in the take-the-branch
+        // case, harmless if not.
+        __ delayed()->add(addr, cardtable, tmp2);
+
+        // We didn't take the branch, so we're already dirty: return.
+        // Use return-from-leaf
+        __ retl();
+        __ delayed()->nop();
+
+        // Not dirty.
+        __ bind(not_already_dirty);
+        // First, dirty it.
+        __ stb(G0, tmp2, 0);  // [cardPtr] := 0  (i.e., dirty).
+
+        Register tmp3 = cardtable;
+        Register tmp4 = tmp;
+
+        // these registers are now dead
+        addr = cardtable = tmp = noreg;
+
+        int dirty_card_q_index_byte_offset =
+          in_bytes(JavaThread::dirty_card_queue_offset() +
+                   PtrQueue::byte_offset_of_index());
+        int dirty_card_q_buf_byte_offset =
+          in_bytes(JavaThread::dirty_card_queue_offset() +
+                   PtrQueue::byte_offset_of_buf());
+        __ bind(restart);
+        __ ld_ptr(G2_thread, dirty_card_q_index_byte_offset, tmp3);
+
+        __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn,
+                          tmp3, refill);
+        // If the branch is taken, no harm in executing this in the delay slot.
+        __ delayed()->ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, tmp4);
+        __ sub(tmp3, oopSize, tmp3);
+
+        __ st_ptr(tmp2, tmp4, tmp3);  // [_buf + index] := <address_of_card>
+        // Use return-from-leaf
+        __ retl();
+        __ delayed()->st_ptr(tmp3, G2_thread, dirty_card_q_index_byte_offset);
+
+        __ bind(refill);
+        __ save_frame(0);
+
+        __ mov(tmp2, L0);
+        __ mov(tmp3, L1);
+        __ mov(tmp4, L2);
+
+        __ call_VM_leaf(L7_thread_cache,
+                        CAST_FROM_FN_PTR(address,
+                                         DirtyCardQueueSet::handle_zero_index_for_thread),
+                                         G2_thread);
+
+        __ mov(L0, tmp2);
+        __ mov(L1, tmp3);
+        __ mov(L2, tmp4);
+
+        __ br(Assembler::always, /*annul*/false, Assembler::pt, restart);
+        __ delayed()->restore();
+      }
+      break;
+#endif // !SERIALGC
+
     default:
       { __ set_info("unimplemented entry", dont_gc_arguments);
         __ save_frame(0);
--- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -1110,30 +1110,31 @@
   //  The input registers are overwritten.
   //
   void gen_write_ref_array_pre_barrier(Register addr, Register count) {
-#if 0 // G1 only
     BarrierSet* bs = Universe::heap()->barrier_set();
     if (bs->has_write_ref_pre_barrier()) {
       assert(bs->has_write_ref_array_pre_opt(),
              "Else unsupported barrier set.");
 
-      assert(addr->is_global() && count->is_global(),
-             "If not, then we have to fix this code to handle more "
-             "general cases.");
-      // Get some new fresh output registers.
       __ save_frame(0);
       // Save the necessary global regs... will be used after.
-      __ mov(addr, L0);
-      __ mov(count, L1);
-
-      __ mov(addr, O0);
+      if (addr->is_global()) {
+        __ mov(addr, L0);
+      }
+      if (count->is_global()) {
+        __ mov(count, L1);
+      }
+      __ mov(addr->after_save(), O0);
       // Get the count into O1
       __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
-      __ delayed()->mov(count, O1);
-      __ mov(L0, addr);
-      __ mov(L1, count);
+      __ delayed()->mov(count->after_save(), O1);
+      if (addr->is_global()) {
+        __ mov(L0, addr);
+      }
+      if (count->is_global()) {
+        __ mov(L1, count);
+      }
       __ restore();
     }
-#endif // 0
   }
   //
   //  Generate post-write barrier for array.
@@ -1150,22 +1151,17 @@
     BarrierSet* bs = Universe::heap()->barrier_set();
 
     switch (bs->kind()) {
-#if 0 // G1 - only
       case BarrierSet::G1SATBCT:
       case BarrierSet::G1SATBCTLogging:
         {
-          assert(addr->is_global() && count->is_global(),
-                 "If not, then we have to fix this code to handle more "
-                 "general cases.");
           // Get some new fresh output registers.
           __ save_frame(0);
-          __ mov(addr, O0);
+          __ mov(addr->after_save(), O0);
           __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
-          __ delayed()->mov(count, O1);
+          __ delayed()->mov(count->after_save(), O1);
           __ restore();
         }
         break;
-#endif // 0 G1 - only
       case BarrierSet::CardTableModRef:
       case BarrierSet::CardTableExtension:
         {
@@ -2412,8 +2408,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
 
-    gen_write_ref_array_pre_barrier(G1, G5);
-
+    gen_write_ref_array_pre_barrier(O1, O2);
 
 #ifdef ASSERT
     // We sometimes save a frame (see partial_subtype_check below).
--- a/hotspot/src/cpu/sparc/vm/templateTable_sparc.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/sparc/vm/templateTable_sparc.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -28,6 +28,79 @@
 #ifndef CC_INTERP
 #define __ _masm->
 
+// Misc helpers
+
+// Do an oop store like *(base + index + offset) = val
+// index can be noreg,
+static void do_oop_store(InterpreterMacroAssembler* _masm,
+                         Register base,
+                         Register index,
+                         int offset,
+                         Register val,
+                         Register tmp,
+                         BarrierSet::Name barrier,
+                         bool precise) {
+  assert(tmp != val && tmp != base && tmp != index, "register collision");
+  assert(index == noreg || offset == 0, "only one offset");
+  switch (barrier) {
+#ifndef SERIALGC
+    case BarrierSet::G1SATBCT:
+    case BarrierSet::G1SATBCTLogging:
+      {
+        __ g1_write_barrier_pre( base, index, offset, tmp, /*preserve_o_regs*/true);
+        if (index == noreg ) {
+          assert(Assembler::is_simm13(offset), "fix this code");
+          __ store_heap_oop(val, base, offset);
+        } else {
+          __ store_heap_oop(val, base, index);
+        }
+
+        // No need for post barrier if storing NULL
+        if (val != G0) {
+          if (precise) {
+            if (index == noreg) {
+              __ add(base, offset, base);
+            } else {
+              __ add(base, index, base);
+            }
+          }
+          __ g1_write_barrier_post(base, val, tmp);
+        }
+      }
+      break;
+#endif // SERIALGC
+    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableExtension:
+      {
+        if (index == noreg ) {
+          assert(Assembler::is_simm13(offset), "fix this code");
+          __ store_heap_oop(val, base, offset);
+        } else {
+          __ store_heap_oop(val, base, index);
+        }
+        // No need for post barrier if storing NULL
+        if (val != G0) {
+          if (precise) {
+            if (index == noreg) {
+              __ add(base, offset, base);
+            } else {
+              __ add(base, index, base);
+            }
+          }
+          __ card_write_barrier_post(base, val, tmp);
+        }
+      }
+      break;
+    case BarrierSet::ModRef:
+    case BarrierSet::Other:
+      ShouldNotReachHere();
+      break;
+    default      :
+      ShouldNotReachHere();
+
+  }
+}
+
 
 //----------------------------------------------------------------------------------------------------
 // Platform-dependent initialization
@@ -758,6 +831,8 @@
   // O4:        array element klass
   // O5:        value klass
 
+  // Address element(O1, 0, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
+
   // Generate a fast subtype check.  Branch to store_ok if no
   // failure.  Throw if failure.
   __ gen_subtype_check( O5, O4, G3_scratch, G4_scratch, G1_scratch, store_ok );
@@ -767,18 +842,14 @@
 
   // Store is OK.
   __ bind(store_ok);
-  __ store_heap_oop(Otos_i, O1, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
-  // Quote from rememberedSet.hpp: For objArrays, the precise card
-  // corresponding to the pointer store is dirtied so we don't need to
-  // scavenge the entire array.
-  Address element(O1, 0, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
-  __ add(element, O1);              // address the element precisely
-  __ store_check(G3_scratch, O1);
+  do_oop_store(_masm, O1, noreg, arrayOopDesc::base_offset_in_bytes(T_OBJECT), Otos_i, G3_scratch, _bs->kind(), true);
+
   __ ba(false,done);
   __ delayed()->inc(Lesp, 3* Interpreter::stackElementSize()); // adj sp (pops array, index and value)
 
   __ bind(is_null);
-  __ store_heap_oop(Otos_i, element);
+  do_oop_store(_masm, O1, noreg, arrayOopDesc::base_offset_in_bytes(T_OBJECT), G0, G4_scratch, _bs->kind(), true);
+
   __ profile_null_seen(G3_scratch);
   __ inc(Lesp, 3* Interpreter::stackElementSize());     // adj sp (pops array, index and value)
   __ bind(done);
@@ -2449,8 +2520,9 @@
     // atos
     __ pop_ptr();
     __ verify_oop(Otos_i);
-    __ store_heap_oop(Otos_i, Rclass, Roffset);
-    __ store_check(G1_scratch, Rclass, Roffset);
+
+    do_oop_store(_masm, Rclass, Roffset, 0, Otos_i, G1_scratch, _bs->kind(), false);
+
     __ ba(false, checkVolatile);
     __ delayed()->tst(Lscratch);
 
@@ -2491,8 +2563,9 @@
     __ pop_ptr();
     pop_and_check_object(Rclass);
     __ verify_oop(Otos_i);
-    __ store_heap_oop(Otos_i, Rclass, Roffset);
-    __ store_check(G1_scratch, Rclass, Roffset);
+
+    do_oop_store(_masm, Rclass, Roffset, 0, Otos_i, G1_scratch, _bs->kind(), false);
+
     patch_bytecode(Bytecodes::_fast_aputfield, G3_scratch, G4_scratch);
     __ ba(false, checkVolatile);
     __ delayed()->tst(Lscratch);
@@ -2646,8 +2719,7 @@
       __ stf(FloatRegisterImpl::D, Ftos_d, Rclass, Roffset);
       break;
     case Bytecodes::_fast_aputfield:
-      __ store_heap_oop(Otos_i, Rclass, Roffset);
-      __ store_check(G1_scratch, Rclass, Roffset);
+      do_oop_store(_masm, Rclass, Roffset, 0, Otos_i, G1_scratch, _bs->kind(), false);
       break;
     default:
       ShouldNotReachHere();
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -1575,6 +1575,35 @@
   emit_operand(src, dst);
 }
 
+void Assembler::movdqu(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  emit_byte(0xF3);
+  prefix(src, dst);
+  emit_byte(0x0F);
+  emit_byte(0x6F);
+  emit_operand(dst, src);
+}
+
+void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_byte(0xF3);
+  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
+  emit_byte(0x0F);
+  emit_byte(0x6F);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::movdqu(Address dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  emit_byte(0xF3);
+  prefix(dst, src);
+  emit_byte(0x0F);
+  emit_byte(0x7F);
+  emit_operand(src, dst);
+}
+
 // Uses zero extension on 64bit
 
 void Assembler::movl(Register dst, int32_t imm32) {
@@ -5935,26 +5964,30 @@
                                    Label& slow_case) {
   assert(obj == rax, "obj must be in rax, for cmpxchg");
   assert_different_registers(obj, var_size_in_bytes, t1);
-  Register end = t1;
-  Label retry;
-  bind(retry);
-  ExternalAddress heap_top((address) Universe::heap()->top_addr());
-  movptr(obj, heap_top);
-  if (var_size_in_bytes == noreg) {
-    lea(end, Address(obj, con_size_in_bytes));
+  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
+    jmp(slow_case);
   } else {
-    lea(end, Address(obj, var_size_in_bytes, Address::times_1));
-  }
-  // if end < obj then we wrapped around => object too long => slow case
-  cmpptr(end, obj);
-  jcc(Assembler::below, slow_case);
-  cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
-  jcc(Assembler::above, slow_case);
-  // Compare obj with the top addr, and if still equal, store the new top addr in
-  // end at the address of the top addr pointer. Sets ZF if was equal, and clears
-  // it otherwise. Use lock prefix for atomicity on MPs.
-  locked_cmpxchgptr(end, heap_top);
-  jcc(Assembler::notEqual, retry);
+    Register end = t1;
+    Label retry;
+    bind(retry);
+    ExternalAddress heap_top((address) Universe::heap()->top_addr());
+    movptr(obj, heap_top);
+    if (var_size_in_bytes == noreg) {
+      lea(end, Address(obj, con_size_in_bytes));
+    } else {
+      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
+    }
+    // if end < obj then we wrapped around => object too long => slow case
+    cmpptr(end, obj);
+    jcc(Assembler::below, slow_case);
+    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
+    jcc(Assembler::above, slow_case);
+    // Compare obj with the top addr, and if still equal, store the new top addr in
+    // end at the address of the top addr pointer. Sets ZF if was equal, and clears
+    // it otherwise. Use lock prefix for atomicity on MPs.
+    locked_cmpxchgptr(end, heap_top);
+    jcc(Assembler::notEqual, retry);
+  }
 }
 
 void MacroAssembler::enter() {
@@ -6491,6 +6524,179 @@
   }
 }
 
+//////////////////////////////////////////////////////////////////////////////////
+#ifndef SERIALGC
+
+void MacroAssembler::g1_write_barrier_pre(Register obj,
+#ifndef _LP64
+                                          Register thread,
+#endif
+                                          Register tmp,
+                                          Register tmp2,
+                                          bool tosca_live) {
+  LP64_ONLY(Register thread = r15_thread;)
+  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                       PtrQueue::byte_offset_of_active()));
+
+  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                       PtrQueue::byte_offset_of_index()));
+  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                       PtrQueue::byte_offset_of_buf()));
+
+
+  Label done;
+  Label runtime;
+
+  // if (!marking_in_progress) goto done;
+  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
+    cmpl(in_progress, 0);
+  } else {
+    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
+    cmpb(in_progress, 0);
+  }
+  jcc(Assembler::equal, done);
+
+  // if (x.f == NULL) goto done;
+  cmpptr(Address(obj, 0), NULL_WORD);
+  jcc(Assembler::equal, done);
+
+  // Can we store original value in the thread's buffer?
+
+  LP64_ONLY(movslq(tmp, index);)
+  movptr(tmp2, Address(obj, 0));
+#ifdef _LP64
+  cmpq(tmp, 0);
+#else
+  cmpl(index, 0);
+#endif
+  jcc(Assembler::equal, runtime);
+#ifdef _LP64
+  subq(tmp, wordSize);
+  movl(index, tmp);
+  addq(tmp, buffer);
+#else
+  subl(index, wordSize);
+  movl(tmp, buffer);
+  addl(tmp, index);
+#endif
+  movptr(Address(tmp, 0), tmp2);
+  jmp(done);
+  bind(runtime);
+  // save the live input values
+  if(tosca_live) push(rax);
+  push(obj);
+#ifdef _LP64
+  movq(c_rarg0, Address(obj, 0));
+  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), c_rarg0, r15_thread);
+#else
+  push(thread);
+  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), tmp2, thread);
+  pop(thread);
+#endif
+  pop(obj);
+  if(tosca_live) pop(rax);
+  bind(done);
+
+}
+
+void MacroAssembler::g1_write_barrier_post(Register store_addr,
+                                           Register new_val,
+#ifndef _LP64
+                                           Register thread,
+#endif
+                                           Register tmp,
+                                           Register tmp2) {
+
+  LP64_ONLY(Register thread = r15_thread;)
+  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                       PtrQueue::byte_offset_of_index()));
+  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                       PtrQueue::byte_offset_of_buf()));
+  BarrierSet* bs = Universe::heap()->barrier_set();
+  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
+  Label done;
+  Label runtime;
+
+  // Does store cross heap regions?
+
+  movptr(tmp, store_addr);
+  xorptr(tmp, new_val);
+  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
+  jcc(Assembler::equal, done);
+
+  // crosses regions, storing NULL?
+
+  cmpptr(new_val, (int32_t) NULL_WORD);
+  jcc(Assembler::equal, done);
+
+  // storing region crossing non-NULL, is card already dirty?
+
+  ExternalAddress cardtable((address) ct->byte_map_base);
+  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+#ifdef _LP64
+  const Register card_addr = tmp;
+
+  movq(card_addr, store_addr);
+  shrq(card_addr, CardTableModRefBS::card_shift);
+
+  lea(tmp2, cardtable);
+
+  // get the address of the card
+  addq(card_addr, tmp2);
+#else
+  const Register card_index = tmp;
+
+  movl(card_index, store_addr);
+  shrl(card_index, CardTableModRefBS::card_shift);
+
+  Address index(noreg, card_index, Address::times_1);
+  const Register card_addr = tmp;
+  lea(card_addr, as_Address(ArrayAddress(cardtable, index)));
+#endif
+  cmpb(Address(card_addr, 0), 0);
+  jcc(Assembler::equal, done);
+
+  // storing a region crossing, non-NULL oop, card is clean.
+  // dirty card and log.
+
+  movb(Address(card_addr, 0), 0);
+
+  cmpl(queue_index, 0);
+  jcc(Assembler::equal, runtime);
+  subl(queue_index, wordSize);
+  movptr(tmp2, buffer);
+#ifdef _LP64
+  movslq(rscratch1, queue_index);
+  addq(tmp2, rscratch1);
+  movq(Address(tmp2, 0), card_addr);
+#else
+  addl(tmp2, queue_index);
+  movl(Address(tmp2, 0), card_index);
+#endif
+  jmp(done);
+
+  bind(runtime);
+  // save the live input values
+  push(store_addr);
+  push(new_val);
+#ifdef _LP64
+  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
+#else
+  push(thread);
+  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
+  pop(thread);
+#endif
+  pop(new_val);
+  pop(store_addr);
+
+  bind(done);
+
+}
+
+#endif // SERIALGC
+//////////////////////////////////////////////////////////////////////////////////
+
+
 void MacroAssembler::store_check(Register obj) {
   // Does a store check for the oop in register obj. The content of
   // register obj is destroyed afterwards.
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -227,9 +227,11 @@
 #endif // ASSERT
 
   // accessors
-  bool uses(Register reg) const {
-    return _base == reg || _index == reg;
-  }
+  bool        uses(Register reg) const { return _base == reg || _index == reg; }
+  Register    base()             const { return _base;  }
+  Register    index()            const { return _index; }
+  ScaleFactor scale()            const { return _scale; }
+  int         disp()             const { return _disp;  }
 
   // Convert the raw encoding form into the form expected by the constructor for
   // Address.  An index of 4 (rsp) corresponds to having no index, so convert
@@ -1053,6 +1055,11 @@
   void movdqa(XMMRegister dst, Address src);
   void movdqa(XMMRegister dst, XMMRegister src);
 
+  // Move Unaligned Double Quadword
+  void movdqu(Address     dst, XMMRegister src);
+  void movdqu(XMMRegister dst, Address src);
+  void movdqu(XMMRegister dst, XMMRegister src);
+
   void movl(Register dst, int32_t imm32);
   void movl(Address dst, int32_t imm32);
   void movl(Register dst, Register src);
@@ -1310,7 +1317,8 @@
 // on arguments should also go in here.
 
 class MacroAssembler: public Assembler {
- friend class LIR_Assembler;
+  friend class LIR_Assembler;
+  friend class Runtime1;      // as_Address()
  protected:
 
   Address as_Address(AddressLiteral adr);
@@ -1453,6 +1461,7 @@
   // The pointer will be loaded into the thread register.
   void get_thread(Register thread);
 
+
   // Support for VM calls
   //
   // It is imperative that all calls into the VM are handled via the call_VM macros.
@@ -1527,6 +1536,22 @@
   void store_check(Register obj);                // store check for obj - register is destroyed afterwards
   void store_check(Register obj, Address dst);   // same as above, dst is exact store location (reg. is destroyed)
 
+  void g1_write_barrier_pre(Register obj,
+#ifndef _LP64
+                            Register thread,
+#endif
+                            Register tmp,
+                            Register tmp2,
+                            bool     tosca_live);
+  void g1_write_barrier_post(Register store_addr,
+                             Register new_val,
+#ifndef _LP64
+                             Register thread,
+#endif
+                             Register tmp,
+                             Register tmp2);
+
+
   // split store_check(Register obj) to enhance instruction interleaving
   void store_check_part_1(Register obj);
   void store_check_part_2(Register obj);
--- a/hotspot/src/cpu/x86/vm/c1_CodeStubs_x86.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/c1_CodeStubs_x86.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -456,5 +456,50 @@
   __ jmp(_continuation);
 }
 
+/////////////////////////////////////////////////////////////////////////////
+#ifndef SERIALGC
+
+void G1PreBarrierStub::emit_code(LIR_Assembler* ce) {
+
+  // At this point we know that marking is in progress
+
+  __ bind(_entry);
+  assert(pre_val()->is_register(), "Precondition.");
+
+  Register pre_val_reg = pre_val()->as_register();
+
+  ce->mem2reg(addr(), pre_val(), T_OBJECT, patch_code(), info(), false);
+
+  __ cmpptr(pre_val_reg, (int32_t) NULL_WORD);
+  __ jcc(Assembler::equal, _continuation);
+  ce->store_parameter(pre_val()->as_register(), 0);
+  __ call(RuntimeAddress(Runtime1::entry_for(Runtime1::g1_pre_barrier_slow_id)));
+  __ jmp(_continuation);
+
+}
+
+jbyte* G1PostBarrierStub::_byte_map_base = NULL;
+
+jbyte* G1PostBarrierStub::byte_map_base_slow() {
+  BarrierSet* bs = Universe::heap()->barrier_set();
+  assert(bs->is_a(BarrierSet::G1SATBCTLogging),
+         "Must be if we're using this.");
+  return ((G1SATBCardTableModRefBS*)bs)->byte_map_base;
+}
+
+void G1PostBarrierStub::emit_code(LIR_Assembler* ce) {
+  __ bind(_entry);
+  assert(addr()->is_register(), "Precondition.");
+  assert(new_val()->is_register(), "Precondition.");
+  Register new_val_reg = new_val()->as_register();
+  __ cmpptr(new_val_reg, (int32_t) NULL_WORD);
+  __ jcc(Assembler::equal, _continuation);
+  ce->store_parameter(addr()->as_register(), 0);
+  __ call(RuntimeAddress(Runtime1::entry_for(Runtime1::g1_post_barrier_slow_id)));
+  __ jmp(_continuation);
+}
+
+#endif // SERIALGC
+/////////////////////////////////////////////////////////////////////////////
 
 #undef __
--- a/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -302,6 +302,8 @@
   }
 
   if (obj_store) {
+    // Needs GC write barriers.
+    pre_barrier(LIR_OprFact::address(array_addr), false, NULL);
     __ move(value.result(), array_addr, null_check_info);
     // Seems to be a precise
     post_barrier(LIR_OprFact::address(array_addr), value.result());
@@ -756,7 +758,10 @@
   __ move(obj.result(), addr);
   __ add(addr, offset.result(), addr);
 
-
+  if (type == objectType) {  // Write-barrier needed for Object fields.
+    // Do the pre-write barrier, if any.
+    pre_barrier(addr, false, NULL);
+  }
 
   LIR_Opr ill = LIR_OprFact::illegalOpr;  // for convenience
   if (type == objectType)
@@ -1286,6 +1291,8 @@
     LIR_Address* addr = new LIR_Address(src, offset, type);
     bool is_obj = (type == T_ARRAY || type == T_OBJECT);
     if (is_obj) {
+      // Do the pre-write barrier, if any.
+      pre_barrier(LIR_OprFact::address(addr), false, NULL);
       __ move(data, addr);
       assert(src->is_register(), "must be register");
       // Seems to be a precise address
--- a/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -1583,6 +1583,166 @@
       }
       break;
 
+#ifndef SERIALGC
+    case g1_pre_barrier_slow_id:
+      {
+        StubFrame f(sasm, "g1_pre_barrier", dont_gc_arguments);
+        // arg0 : previous value of memory
+
+        BarrierSet* bs = Universe::heap()->barrier_set();
+        if (bs->kind() != BarrierSet::G1SATBCTLogging) {
+          __ movptr(rax, (int)id);
+          __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, unimplemented_entry), rax);
+          __ should_not_reach_here();
+          break;
+        }
+
+        __ push(rax);
+        __ push(rdx);
+
+        const Register pre_val = rax;
+        const Register thread = NOT_LP64(rax) LP64_ONLY(r15_thread);
+        const Register tmp = rdx;
+
+        NOT_LP64(__ get_thread(thread);)
+
+        Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                             PtrQueue::byte_offset_of_active()));
+
+        Address queue_index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                             PtrQueue::byte_offset_of_index()));
+        Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                        PtrQueue::byte_offset_of_buf()));
+
+
+        Label done;
+        Label runtime;
+
+        // Can we store original value in the thread's buffer?
+
+        LP64_ONLY(__ movslq(tmp, queue_index);)
+#ifdef _LP64
+        __ cmpq(tmp, 0);
+#else
+        __ cmpl(queue_index, 0);
+#endif
+        __ jcc(Assembler::equal, runtime);
+#ifdef _LP64
+        __ subq(tmp, wordSize);
+        __ movl(queue_index, tmp);
+        __ addq(tmp, buffer);
+#else
+        __ subl(queue_index, wordSize);
+        __ movl(tmp, buffer);
+        __ addl(tmp, queue_index);
+#endif
+
+        // prev_val (rax)
+        f.load_argument(0, pre_val);
+        __ movptr(Address(tmp, 0), pre_val);
+        __ jmp(done);
+
+        __ bind(runtime);
+        // load the pre-value
+        __ push(rcx);
+        f.load_argument(0, rcx);
+        __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), rcx, thread);
+        __ pop(rcx);
+
+        __ bind(done);
+        __ pop(rdx);
+        __ pop(rax);
+      }
+      break;
+
+    case g1_post_barrier_slow_id:
+      {
+        StubFrame f(sasm, "g1_post_barrier", dont_gc_arguments);
+
+
+        // arg0: store_address
+        Address store_addr(rbp, 2*BytesPerWord);
+
+        BarrierSet* bs = Universe::heap()->barrier_set();
+        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
+        Label done;
+        Label runtime;
+
+        // At this point we know new_value is non-NULL and the new_value crosses regions.
+        // Must check to see if card is already dirty
+
+        const Register thread = NOT_LP64(rax) LP64_ONLY(r15_thread);
+
+        Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                             PtrQueue::byte_offset_of_index()));
+        Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                        PtrQueue::byte_offset_of_buf()));
+
+        __ push(rax);
+        __ push(rdx);
+
+        NOT_LP64(__ get_thread(thread);)
+        ExternalAddress cardtable((address)ct->byte_map_base);
+        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
+        const Register card_addr = rdx;
+#ifdef _LP64
+        const Register tmp = rscratch1;
+        f.load_argument(0, card_addr);
+        __ shrq(card_addr, CardTableModRefBS::card_shift);
+        __ lea(tmp, cardtable);
+        // get the address of the card
+        __ addq(card_addr, tmp);
+#else
+        const Register card_index = rdx;
+        f.load_argument(0, card_index);
+        __ shrl(card_index, CardTableModRefBS::card_shift);
+
+        Address index(noreg, card_index, Address::times_1);
+        __ leal(card_addr, __ as_Address(ArrayAddress(cardtable, index)));
+#endif
+
+        __ cmpb(Address(card_addr, 0), 0);
+        __ jcc(Assembler::equal, done);
+
+        // storing region crossing non-NULL, card is clean.
+        // dirty card and log.
+
+        __ movb(Address(card_addr, 0), 0);
+
+        __ cmpl(queue_index, 0);
+        __ jcc(Assembler::equal, runtime);
+        __ subl(queue_index, wordSize);
+
+        const Register buffer_addr = rbx;
+        __ push(rbx);
+
+        __ movptr(buffer_addr, buffer);
+
+#ifdef _LP64
+        __ movslq(rscratch1, queue_index);
+        __ addptr(buffer_addr, rscratch1);
+#else
+        __ addptr(buffer_addr, queue_index);
+#endif
+        __ movptr(Address(buffer_addr, 0), card_addr);
+
+        __ pop(rbx);
+        __ jmp(done);
+
+        __ bind(runtime);
+        NOT_LP64(__ push(rcx);)
+        __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
+        NOT_LP64(__ pop(rcx);)
+
+        __ bind(done);
+        __ pop(rdx);
+        __ pop(rax);
+
+      }
+      break;
+#endif // !SERIALGC
+
     default:
       { StubFrame f(sasm, "unimplemented entry", dont_gc_arguments);
         __ movptr(rax, (int)id);
--- a/hotspot/src/cpu/x86/vm/interp_masm_x86_64.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/interp_masm_x86_64.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -44,8 +44,13 @@
   // Note: No need to save/restore bcp & locals (r13 & r14) pointer
   //       since these are callee saved registers and no blocking/
   //       GC can happen in leaf calls.
+  // Further Note: DO NOT save/restore bcp/locals. If a caller has
+  // already saved them so that it can use esi/edi as temporaries
+  // then a save/restore here will DESTROY the copy the caller
+  // saved! There used to be a save_bcp() that only happened in
+  // the ASSERT path (no restore_bcp), which caused bizarre failures
+  // when the JVM was built with ASSERTs.
 #ifdef ASSERT
-  save_bcp();
   {
     Label L;
     cmpptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD);
@@ -58,24 +63,9 @@
   // super call
   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
   // interpreter specific
-#ifdef ASSERT
-  {
-    Label L;
-    cmpptr(r13, Address(rbp, frame::interpreter_frame_bcx_offset * wordSize));
-    jcc(Assembler::equal, L);
-    stop("InterpreterMacroAssembler::call_VM_leaf_base:"
-         " r13 not callee saved?");
-    bind(L);
-  }
-  {
-    Label L;
-    cmpptr(r14, Address(rbp, frame::interpreter_frame_locals_offset * wordSize));
-    jcc(Assembler::equal, L);
-    stop("InterpreterMacroAssembler::call_VM_leaf_base:"
-         " r14 not callee saved?");
-    bind(L);
-  }
-#endif
+  // Used to ASSERT that r13/r14 were equal to frame's bcp/locals
+  // but since they may not have been saved (and we don't want to
+  // save them here (see note above)) the assert is invalid.
 }
 
 void InterpreterMacroAssembler::call_VM_base(Register oop_result,
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -712,7 +712,6 @@
   //     end     -  element count
   void  gen_write_ref_array_pre_barrier(Register start, Register count) {
     assert_different_registers(start, count);
-#if 0 // G1 only
     BarrierSet* bs = Universe::heap()->barrier_set();
     switch (bs->kind()) {
       case BarrierSet::G1SATBCT:
@@ -721,8 +720,8 @@
           __ pusha();                      // push registers
           __ push(count);
           __ push(start);
-          __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
-          __ addl(esp, wordSize * 2);
+          __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre)));
+          __ addptr(rsp, 2*wordSize);
           __ popa();
         }
         break;
@@ -734,7 +733,6 @@
         ShouldNotReachHere();
 
     }
-#endif // 0 - G1 only
   }
 
 
@@ -750,20 +748,18 @@
     BarrierSet* bs = Universe::heap()->barrier_set();
     assert_different_registers(start, count);
     switch (bs->kind()) {
-#if 0 // G1 only
       case BarrierSet::G1SATBCT:
       case BarrierSet::G1SATBCTLogging:
         {
           __ pusha();                      // push registers
           __ push(count);
           __ push(start);
-          __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
-          __ addl(esp, wordSize * 2);
+          __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post)));
+          __ addptr(rsp, 2*wordSize);
           __ popa();
 
         }
         break;
-#endif // 0 G1 only
 
       case BarrierSet::CardTableModRef:
       case BarrierSet::CardTableExtension:
@@ -795,6 +791,69 @@
     }
   }
 
+
+  // Copy 64 bytes chunks
+  //
+  // Inputs:
+  //   from        - source array address
+  //   to_from     - destination array address - from
+  //   qword_count - 8-bytes element count, negative
+  //
+  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
+    assert( UseSSE >= 2, "supported cpu only" );
+    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
+    // Copy 64-byte chunks
+    __ jmpb(L_copy_64_bytes);
+    __ align(16);
+  __ BIND(L_copy_64_bytes_loop);
+
+    if(UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(from, 0));
+      __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movdqu(xmm1, Address(from, 16));
+      __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
+      __ movdqu(xmm2, Address(from, 32));
+      __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
+      __ movdqu(xmm3, Address(from, 48));
+      __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
+
+    } else {
+      __ movq(xmm0, Address(from, 0));
+      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movq(xmm1, Address(from, 8));
+      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
+      __ movq(xmm2, Address(from, 16));
+      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
+      __ movq(xmm3, Address(from, 24));
+      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
+      __ movq(xmm4, Address(from, 32));
+      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
+      __ movq(xmm5, Address(from, 40));
+      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
+      __ movq(xmm6, Address(from, 48));
+      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
+      __ movq(xmm7, Address(from, 56));
+      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
+    }
+
+    __ addl(from, 64);
+  __ BIND(L_copy_64_bytes);
+    __ subl(qword_count, 8);
+    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
+    __ addl(qword_count, 8);
+    __ jccb(Assembler::zero, L_exit);
+    //
+    // length is too short, just copy qwords
+    //
+  __ BIND(L_copy_8_bytes);
+    __ movq(xmm0, Address(from, 0));
+    __ movq(Address(from, to_from, Address::times_1), xmm0);
+    __ addl(from, 8);
+    __ decrement(qword_count);
+    __ jcc(Assembler::greater, L_copy_8_bytes);
+  __ BIND(L_exit);
+  }
+
   // Copy 64 bytes chunks
   //
   // Inputs:
@@ -803,6 +862,7 @@
   //   qword_count - 8-bytes element count, negative
   //
   void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
+    assert( VM_Version::supports_mmx(), "supported cpu only" );
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
@@ -880,7 +940,7 @@
     __ subptr(to, from); // to --> to_from
     __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
     __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
-    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
+    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
       // align source address at 4 bytes address boundary
       if (t == T_BYTE) {
         // One byte misalignment happens only for byte arrays
@@ -910,20 +970,26 @@
       __ mov(count, rax);      // restore 'count'
       __ jmpb(L_copy_2_bytes); // all dwords were copied
     } else {
-      // align to 8 bytes, we know we are 4 byte aligned to start
-      __ testptr(from, 4);
-      __ jccb(Assembler::zero, L_copy_64_bytes);
-      __ movl(rax, Address(from, 0));
-      __ movl(Address(from, to_from, Address::times_1, 0), rax);
-      __ addptr(from, 4);
-      __ subl(count, 1<<shift);
+      if (!UseUnalignedLoadStores) {
+        // align to 8 bytes, we know we are 4 byte aligned to start
+        __ testptr(from, 4);
+        __ jccb(Assembler::zero, L_copy_64_bytes);
+        __ movl(rax, Address(from, 0));
+        __ movl(Address(from, to_from, Address::times_1, 0), rax);
+        __ addptr(from, 4);
+        __ subl(count, 1<<shift);
+      }
     __ BIND(L_copy_64_bytes);
       __ mov(rax, count);
       __ shrl(rax, shift+1);  // 8 bytes chunk count
       //
       // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop
       //
-      mmx_copy_forward(from, to_from, rax);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, rax);
+      } else {
+        mmx_copy_forward(from, to_from, rax);
+      }
     }
     // copy tailing dword
   __ BIND(L_copy_4_bytes);
@@ -1073,13 +1139,20 @@
       __ align(16);
       // Move 8 bytes
     __ BIND(L_copy_8_bytes_loop);
-      __ movq(mmx0, Address(from, count, sf, 0));
-      __ movq(Address(to, count, sf, 0), mmx0);
+      if (UseXMMForArrayCopy) {
+        __ movq(xmm0, Address(from, count, sf, 0));
+        __ movq(Address(to, count, sf, 0), xmm0);
+      } else {
+        __ movq(mmx0, Address(from, count, sf, 0));
+        __ movq(Address(to, count, sf, 0), mmx0);
+      }
     __ BIND(L_copy_8_bytes);
       __ subl(count, 2<<shift);
       __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
       __ addl(count, 2<<shift);
-      __ emms();
+      if (!UseXMMForArrayCopy) {
+        __ emms();
+      }
     }
   __ BIND(L_copy_4_bytes);
     // copy prefix qword
@@ -1147,7 +1220,11 @@
 
     __ subptr(to, from); // to --> to_from
     if (VM_Version::supports_mmx()) {
-      mmx_copy_forward(from, to_from, count);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, count);
+      } else {
+        mmx_copy_forward(from, to_from, count);
+      }
     } else {
       __ jmpb(L_copy_8_bytes);
       __ align(16);
@@ -1200,8 +1277,13 @@
     __ align(16);
   __ BIND(L_copy_8_bytes_loop);
     if (VM_Version::supports_mmx()) {
-      __ movq(mmx0, Address(from, count, Address::times_8));
-      __ movq(Address(to, count, Address::times_8), mmx0);
+      if (UseXMMForArrayCopy) {
+        __ movq(xmm0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), xmm0);
+      } else {
+        __ movq(mmx0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), mmx0);
+      }
     } else {
       __ fild_d(Address(from, count, Address::times_8));
       __ fistp_d(Address(to, count, Address::times_8));
@@ -1210,7 +1292,7 @@
     __ decrement(count);
     __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
 
-    if (VM_Version::supports_mmx()) {
+    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
       __ emms();
     }
     inc_copy_counter_np(T_LONG);
@@ -1378,9 +1460,9 @@
     Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());
 
     // Copy from low to high addresses, indexed from the end of each array.
+    gen_write_ref_array_pre_barrier(to, count);
     __ lea(end_from, end_from_addr);
     __ lea(end_to,   end_to_addr);
-    gen_write_ref_array_pre_barrier(to, count);
     assert(length == count, "");        // else fix next line:
     __ negptr(count);                   // negate and test the length
     __ jccb(Assembler::notZero, L_load_element);
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -1153,18 +1153,26 @@
   //     Destroy no registers!
   //
   void  gen_write_ref_array_pre_barrier(Register addr, Register count) {
-#if 0 // G1 - only
-    assert_different_registers(addr, c_rarg1);
-    assert_different_registers(count, c_rarg0);
     BarrierSet* bs = Universe::heap()->barrier_set();
     switch (bs->kind()) {
       case BarrierSet::G1SATBCT:
       case BarrierSet::G1SATBCTLogging:
         {
           __ pusha();                      // push registers
-          __ movptr(c_rarg0, addr);
-          __ movptr(c_rarg1, count);
-          __ call(RuntimeAddress(BarrierSet::static_write_ref_array_pre));
+          if (count == c_rarg0) {
+            if (addr == c_rarg1) {
+              // exactly backwards!!
+              __ xchgptr(c_rarg1, c_rarg0);
+            } else {
+              __ movptr(c_rarg1, count);
+              __ movptr(c_rarg0, addr);
+            }
+
+          } else {
+            __ movptr(c_rarg0, addr);
+            __ movptr(c_rarg1, count);
+          }
+          __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre)));
           __ popa();
         }
         break;
@@ -1172,11 +1180,10 @@
       case BarrierSet::CardTableExtension:
       case BarrierSet::ModRef:
         break;
-      default      :
+      default:
         ShouldNotReachHere();
 
     }
-#endif // 0 G1 - only
   }
 
   //
@@ -1193,7 +1200,6 @@
     assert_different_registers(start, end, scratch);
     BarrierSet* bs = Universe::heap()->barrier_set();
     switch (bs->kind()) {
-#if 0 // G1 - only
       case BarrierSet::G1SATBCT:
       case BarrierSet::G1SATBCTLogging:
 
@@ -1206,11 +1212,10 @@
           __ shrptr(scratch, LogBytesPerWord);
           __ mov(c_rarg0, start);
           __ mov(c_rarg1, scratch);
-          __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
+          __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post)));
           __ popa();
         }
         break;
-#endif // 0 G1 - only
       case BarrierSet::CardTableModRef:
       case BarrierSet::CardTableExtension:
         {
@@ -1239,8 +1244,13 @@
           __ decrement(count);
           __ jcc(Assembler::greaterEqual, L_loop);
         }
-      }
-   }
+        break;
+      default:
+        ShouldNotReachHere();
+
+    }
+  }
+
 
   // Copy big chunks forward
   //
@@ -1259,14 +1269,22 @@
     Label L_loop;
     __ align(16);
   __ BIND(L_loop);
-    __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
-    __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
-    __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
-    __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
-    __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
-    __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
-    __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
-    __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
+    if(UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+      __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
+      __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
+      __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
+
+    } else {
+      __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
+      __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
+      __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
+      __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
+      __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
+      __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
+      __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
+      __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
+    }
   __ BIND(L_copy_32_bytes);
     __ addptr(qword_count, 4);
     __ jcc(Assembler::lessEqual, L_loop);
@@ -1292,14 +1310,22 @@
     Label L_loop;
     __ align(16);
   __ BIND(L_loop);
-    __ movq(to, Address(from, qword_count, Address::times_8, 24));
-    __ movq(Address(dest, qword_count, Address::times_8, 24), to);
-    __ movq(to, Address(from, qword_count, Address::times_8, 16));
-    __ movq(Address(dest, qword_count, Address::times_8, 16), to);
-    __ movq(to, Address(from, qword_count, Address::times_8,  8));
-    __ movq(Address(dest, qword_count, Address::times_8,  8), to);
-    __ movq(to, Address(from, qword_count, Address::times_8,  0));
-    __ movq(Address(dest, qword_count, Address::times_8,  0), to);
+    if(UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
+      __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
+      __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
+      __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
+
+    } else {
+      __ movq(to, Address(from, qword_count, Address::times_8, 24));
+      __ movq(Address(dest, qword_count, Address::times_8, 24), to);
+      __ movq(to, Address(from, qword_count, Address::times_8, 16));
+      __ movq(Address(dest, qword_count, Address::times_8, 16), to);
+      __ movq(to, Address(from, qword_count, Address::times_8,  8));
+      __ movq(Address(dest, qword_count, Address::times_8,  8), to);
+      __ movq(to, Address(from, qword_count, Address::times_8,  0));
+      __ movq(Address(dest, qword_count, Address::times_8,  0), to);
+    }
   __ BIND(L_copy_32_bytes);
     __ subptr(qword_count, 4);
     __ jcc(Assembler::greaterEqual, L_loop);
@@ -2282,7 +2308,7 @@
     // and report their number to the caller.
     assert_different_registers(rax, r14_length, count, to, end_to, rcx);
     __ lea(end_to, to_element_addr);
-    gen_write_ref_array_post_barrier(to, end_to, rcx);
+    gen_write_ref_array_post_barrier(to, end_to, rscratch1);
     __ movptr(rax, r14_length);           // original oops
     __ addptr(rax, count);                // K = (original - remaining) oops
     __ notptr(rax);                       // report (-1^K) to caller
@@ -2291,7 +2317,7 @@
     // Come here on success only.
     __ BIND(L_do_card_marks);
     __ addptr(end_to, -wordSize);         // make an inclusive end pointer
-    gen_write_ref_array_post_barrier(to, end_to, rcx);
+    gen_write_ref_array_post_barrier(to, end_to, rscratch1);
     __ xorptr(rax, rax);                  // return 0 on success
 
     // Common exit point (success or failure).
--- a/hotspot/src/cpu/x86/vm/templateTable_x86_32.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/templateTable_x86_32.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -107,6 +107,78 @@
 //----------------------------------------------------------------------------------------------------
 // Miscelaneous helper routines
 
+// Store an oop (or NULL) at the address described by obj.
+// If val == noreg this means store a NULL
+
+static void do_oop_store(InterpreterMacroAssembler* _masm,
+                         Address obj,
+                         Register val,
+                         BarrierSet::Name barrier,
+                         bool precise) {
+  assert(val == noreg || val == rax, "parameter is just for looks");
+  switch (barrier) {
+#ifndef SERIALGC
+    case BarrierSet::G1SATBCT:
+    case BarrierSet::G1SATBCTLogging:
+      {
+        // flatten object address if needed
+        // We do it regardless of precise because we need the registers
+        if (obj.index() == noreg && obj.disp() == 0) {
+          if (obj.base() != rdx) {
+            __ movl(rdx, obj.base());
+          }
+        } else {
+          __ leal(rdx, obj);
+        }
+        __ get_thread(rcx);
+        __ save_bcp();
+        __ g1_write_barrier_pre(rdx, rcx, rsi, rbx, val != noreg);
+
+        // Do the actual store
+        // noreg means NULL
+        if (val == noreg) {
+          __ movl(Address(rdx, 0), NULL_WORD);
+          // No post barrier for NULL
+        } else {
+          __ movl(Address(rdx, 0), val);
+          __ g1_write_barrier_post(rdx, rax, rcx, rbx, rsi);
+        }
+        __ restore_bcp();
+
+      }
+      break;
+#endif // SERIALGC
+    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableExtension:
+      {
+        if (val == noreg) {
+          __ movl(obj, NULL_WORD);
+        } else {
+          __ movl(obj, val);
+          // flatten object address if needed
+          if (!precise || (obj.index() == noreg && obj.disp() == 0)) {
+            __ store_check(obj.base());
+          } else {
+            __ leal(rdx, obj);
+            __ store_check(rdx);
+          }
+        }
+      }
+      break;
+    case BarrierSet::ModRef:
+    case BarrierSet::Other:
+      if (val == noreg) {
+        __ movl(obj, NULL_WORD);
+      } else {
+        __ movl(obj, val);
+      }
+      break;
+    default      :
+      ShouldNotReachHere();
+
+  }
+}
+
 Address TemplateTable::at_bcp(int offset) {
   assert(_desc->uses_bcp(), "inconsistent uses_bcp information");
   return Address(rsi, offset);
@@ -876,6 +948,8 @@
   __ movptr(rax, at_tos());     // Value
   __ movl(rcx, at_tos_p1());  // Index
   __ movptr(rdx, at_tos_p2());  // Array
+
+  Address element_address(rdx, rcx, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
   index_check_without_pop(rdx, rcx);      // kills rbx,
   // do array store check - check for NULL value first
   __ testptr(rax, rax);
@@ -887,7 +961,7 @@
   __ movptr(rax, Address(rdx, oopDesc::klass_offset_in_bytes()));
   __ movptr(rax, Address(rax, sizeof(oopDesc) + objArrayKlass::element_klass_offset_in_bytes()));
   // Compress array+index*wordSize+12 into a single register.  Frees ECX.
-  __ lea(rdx, Address(rdx, rcx, Address::times_ptr, arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
+  __ lea(rdx, element_address);
 
   // Generate subtype check.  Blows ECX.  Resets EDI to locals.
   // Superklass in EAX.  Subklass in EBX.
@@ -899,15 +973,20 @@
 
   // Come here on success
   __ bind(ok_is_subtype);
-  __ movptr(rax, at_rsp());     // Value
-  __ movptr(Address(rdx, 0), rax);
-  __ store_check(rdx);
-  __ jmpb(done);
+
+  // Get the value to store
+  __ movptr(rax, at_rsp());
+  // and store it with appropriate barrier
+  do_oop_store(_masm, Address(rdx, 0), rax, _bs->kind(), true);
+
+  __ jmp(done);
 
   // Have a NULL in EAX, EDX=array, ECX=index.  Store NULL at ary[idx]
   __ bind(is_null);
   __ profile_null_seen(rbx);
-  __ movptr(Address(rdx, rcx, Address::times_ptr, arrayOopDesc::base_offset_in_bytes(T_OBJECT)), rax);
+
+  // Store NULL, (noreg means NULL to do_oop_store)
+  do_oop_store(_masm, element_address, noreg, _bs->kind(), true);
 
   // Pop stack arguments
   __ bind(done);
@@ -1515,7 +1594,7 @@
     // compute return address as bci in rax,
     __ lea(rax, at_bcp((is_wide ? 5 : 3) - in_bytes(constMethodOopDesc::codes_offset())));
     __ subptr(rax, Address(rcx, methodOopDesc::const_offset()));
-    // Adjust the bcp in ESI by the displacement in EDX
+    // Adjust the bcp in RSI by the displacement in EDX
     __ addptr(rsi, rdx);
     // Push return address
     __ push_i(rax);
@@ -1526,7 +1605,7 @@
 
   // Normal (non-jsr) branch handling
 
-  // Adjust the bcp in ESI by the displacement in EDX
+  // Adjust the bcp in RSI by the displacement in EDX
   __ addptr(rsi, rdx);
 
   assert(UseLoopCounter || !UseOnStackReplacement, "on-stack-replacement requires loop counters");
@@ -2439,11 +2518,12 @@
   __ pop(atos);
   if (!is_static) pop_and_check_object(obj);
 
-  __ movptr(lo, rax );
-  __ store_check(obj, lo);  // Need to mark card
+  do_oop_store(_masm, lo, rax, _bs->kind(), false);
+
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_aputfield, rcx, rbx);
   }
+
   __ jmp(Done);
 
   __ bind(notObj);
@@ -2664,7 +2744,10 @@
       break;
     case Bytecodes::_fast_fputfield: __ fstp_s(lo); break;
     case Bytecodes::_fast_dputfield: __ fstp_d(lo); break;
-    case Bytecodes::_fast_aputfield: __ movptr(lo, rax); __ store_check(rcx, lo); break;
+    case Bytecodes::_fast_aputfield: {
+      do_oop_store(_masm, lo, rax, _bs->kind(), false);
+      break;
+    }
     default:
       ShouldNotReachHere();
   }
@@ -2672,7 +2755,8 @@
   Label done;
   volatile_barrier(Assembler::Membar_mask_bits(Assembler::StoreLoad |
                                                Assembler::StoreStore));
-  __ jmpb(done);
+  // Barriers are so large that short branch doesn't reach!
+  __ jmp(done);
 
   // Same code as above, but don't need rdx to test for volatile.
   __ bind(notVolatile);
@@ -2694,7 +2778,10 @@
       break;
     case Bytecodes::_fast_fputfield: __ fstp_s(lo); break;
     case Bytecodes::_fast_dputfield: __ fstp_d(lo); break;
-    case Bytecodes::_fast_aputfield: __ movptr(lo, rax); __ store_check(rcx, lo); break;
+    case Bytecodes::_fast_aputfield: {
+      do_oop_store(_masm, lo, rax, _bs->kind(), false);
+      break;
+    }
     default:
       ShouldNotReachHere();
   }
@@ -3054,8 +3141,6 @@
   Label initialize_object;  // including clearing the fields
   Label allocate_shared;
 
-  ExternalAddress heap_top((address)Universe::heap()->top_addr());
-
   __ get_cpool_and_tags(rcx, rax);
   // get instanceKlass
   __ movptr(rcx, Address(rcx, rdx, Address::times_ptr, sizeof(constantPoolOopDesc)));
@@ -3112,6 +3197,8 @@
   if (allow_shared_alloc) {
     __ bind(allocate_shared);
 
+    ExternalAddress heap_top((address)Universe::heap()->top_addr());
+
     Label retry;
     __ bind(retry);
     __ movptr(rax, heap_top);
--- a/hotspot/src/cpu/x86/vm/templateTable_x86_64.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/templateTable_x86_64.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -115,6 +115,69 @@
 
 
 // Miscelaneous helper routines
+// Store an oop (or NULL) at the address described by obj.
+// If val == noreg this means store a NULL
+
+static void do_oop_store(InterpreterMacroAssembler* _masm,
+                         Address obj,
+                         Register val,
+                         BarrierSet::Name barrier,
+                         bool precise) {
+  assert(val == noreg || val == rax, "parameter is just for looks");
+  switch (barrier) {
+#ifndef SERIALGC
+    case BarrierSet::G1SATBCT:
+    case BarrierSet::G1SATBCTLogging:
+      {
+        // flatten object address if needed
+        if (obj.index() == noreg && obj.disp() == 0) {
+          if (obj.base() != rdx) {
+            __ movq(rdx, obj.base());
+          }
+        } else {
+          __ leaq(rdx, obj);
+        }
+        __ g1_write_barrier_pre(rdx, r8, rbx, val != noreg);
+        if (val == noreg) {
+          __ store_heap_oop(Address(rdx, 0), NULL_WORD);
+        } else {
+          __ store_heap_oop(Address(rdx, 0), val);
+          __ g1_write_barrier_post(rdx, val, r8, rbx);
+        }
+
+      }
+      break;
+#endif // SERIALGC
+    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableExtension:
+      {
+        if (val == noreg) {
+          __ store_heap_oop(obj, NULL_WORD);
+        } else {
+          __ store_heap_oop(obj, val);
+          // flatten object address if needed
+          if (!precise || (obj.index() == noreg && obj.disp() == 0)) {
+            __ store_check(obj.base());
+          } else {
+            __ leaq(rdx, obj);
+            __ store_check(rdx);
+          }
+        }
+      }
+      break;
+    case BarrierSet::ModRef:
+    case BarrierSet::Other:
+      if (val == noreg) {
+        __ store_heap_oop(obj, NULL_WORD);
+      } else {
+        __ store_heap_oop(obj, val);
+      }
+      break;
+    default      :
+      ShouldNotReachHere();
+
+  }
+}
 
 Address TemplateTable::at_bcp(int offset) {
   assert(_desc->uses_bcp(), "inconsistent uses_bcp information");
@@ -560,8 +623,8 @@
   // rdx: array
   index_check(rdx, rax); // kills rbx
   __ load_heap_oop(rax, Address(rdx, rax,
-                       UseCompressedOops ? Address::times_4 : Address::times_8,
-                       arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
+                                UseCompressedOops ? Address::times_4 : Address::times_8,
+                                arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
 }
 
 void TemplateTable::baload() {
@@ -866,6 +929,11 @@
   __ movptr(rax, at_tos());    // value
   __ movl(rcx, at_tos_p1()); // index
   __ movptr(rdx, at_tos_p2()); // array
+
+  Address element_address(rdx, rcx,
+                          UseCompressedOops? Address::times_4 : Address::times_8,
+                          arrayOopDesc::base_offset_in_bytes(T_OBJECT));
+
   index_check(rdx, rcx);     // kills rbx
   // do array store check - check for NULL value first
   __ testptr(rax, rax);
@@ -879,9 +947,7 @@
                          sizeof(oopDesc) +
                          objArrayKlass::element_klass_offset_in_bytes()));
   // Compress array + index*oopSize + 12 into a single register.  Frees rcx.
-  __ lea(rdx, Address(rdx, rcx,
-                      UseCompressedOops ? Address::times_4 : Address::times_8,
-                      arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
+  __ lea(rdx, element_address);
 
   // Generate subtype check.  Blows rcx, rdi
   // Superklass in rax.  Subklass in rbx.
@@ -893,18 +959,19 @@
 
   // Come here on success
   __ bind(ok_is_subtype);
-  __ movptr(rax, at_tos()); // Value
-  __ store_heap_oop(Address(rdx, 0), rax);
-  __ store_check(rdx);
+
+  // Get the value we will store
+  __ movptr(rax, at_tos());
+  // Now store using the appropriate barrier
+  do_oop_store(_masm, Address(rdx, 0), rax, _bs->kind(), true);
   __ jmp(done);
 
   // Have a NULL in rax, rdx=array, ecx=index.  Store NULL at ary[idx]
   __ bind(is_null);
   __ profile_null_seen(rbx);
-  __ store_heap_oop(Address(rdx, rcx,
-                            UseCompressedOops ? Address::times_4 : Address::times_8,
-                            arrayOopDesc::base_offset_in_bytes(T_OBJECT)),
-                    rax);
+
+  // Store a NULL
+  do_oop_store(_masm, element_address, noreg, _bs->kind(), true);
 
   // Pop stack arguments
   __ bind(done);
@@ -2396,8 +2463,10 @@
   // atos
   __ pop(atos);
   if (!is_static) pop_and_check_object(obj);
-  __ store_heap_oop(field, rax);
-  __ store_check(obj, field); // Need to mark card
+
+  // Store into the field
+  do_oop_store(_masm, field, rax, _bs->kind(), false);
+
   if (!is_static) {
     patch_bytecode(Bytecodes::_fast_aputfield, bc, rbx);
   }
@@ -2584,8 +2653,7 @@
   // access field
   switch (bytecode()) {
   case Bytecodes::_fast_aputfield:
-    __ store_heap_oop(field, rax);
-    __ store_check(rcx, field);
+    do_oop_store(_masm, field, rax, _bs->kind(), false);
     break;
   case Bytecodes::_fast_lputfield:
     __ movq(field, rax);
@@ -3044,8 +3112,6 @@
   Label initialize_header;
   Label initialize_object; // including clearing the fields
   Label allocate_shared;
-  ExternalAddress top((address)Universe::heap()->top_addr());
-  ExternalAddress end((address)Universe::heap()->end_addr());
 
   __ get_cpool_and_tags(rsi, rax);
   // get instanceKlass
@@ -3106,6 +3172,9 @@
   if (allow_shared_alloc) {
     __ bind(allocate_shared);
 
+    ExternalAddress top((address)Universe::heap()->top_addr());
+    ExternalAddress end((address)Universe::heap()->end_addr());
+
     const Register RtopAddr = rscratch1;
     const Register RendAddr = rscratch2;
 
--- a/hotspot/src/cpu/x86/vm/vm_version_x86_32.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86_32.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -242,9 +242,11 @@
   _supports_cx8 = supports_cmpxchg8();
   // if the OS doesn't support SSE, we can't use this feature even if the HW does
   if( !os::supports_sse())
-    _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4|CPU_SSE4A);
-  if (UseSSE < 4)
-    _cpuFeatures &= ~CPU_SSE4;
+    _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4A|CPU_SSE4_1|CPU_SSE4_2);
+  if (UseSSE < 4) {
+    _cpuFeatures &= ~CPU_SSE4_1;
+    _cpuFeatures &= ~CPU_SSE4_2;
+  }
   if (UseSSE < 3) {
     _cpuFeatures &= ~CPU_SSE3;
     _cpuFeatures &= ~CPU_SSSE3;
@@ -261,7 +263,7 @@
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -272,7 +274,8 @@
                (supports_sse2() ? ", sse2" : ""),
                (supports_sse3() ? ", sse3" : ""),
                (supports_ssse3()? ", ssse3": ""),
-               (supports_sse4() ? ", sse4" : ""),
+               (supports_sse4_1() ? ", sse4.1" : ""),
+               (supports_sse4_2() ? ", sse4.2" : ""),
                (supports_mmx_ext() ? ", mmxext" : ""),
                (supports_3dnow()   ? ", 3dnow"  : ""),
                (supports_3dnow2()  ? ", 3dnowext" : ""),
@@ -285,7 +288,7 @@
   // older Pentiums which do not support it.
   if( UseSSE > 4 ) UseSSE=4;
   if( UseSSE < 0 ) UseSSE=0;
-  if( !supports_sse4() ) // Drop to 3 if no SSE4 support
+  if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support
     UseSSE = MIN2((intx)3,UseSSE);
   if( !supports_sse3() ) // Drop to 2 if no SSE3 support
     UseSSE = MIN2((intx)2,UseSSE);
@@ -375,6 +378,14 @@
         MaxLoopPad = 11;
       }
 #endif // COMPILER2
+      if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
+        UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
+      }
+      if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus
+        if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
+          UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
+        }
+      }
     }
   }
 
@@ -413,7 +424,7 @@
 
 #ifndef PRODUCT
   if (PrintMiscellaneous && Verbose) {
-    tty->print_cr("Logical CPUs per package: %u",
+    tty->print_cr("Logical CPUs per core: %u",
                   logical_processors_per_package());
     tty->print_cr("UseSSE=%d",UseSSE);
     tty->print("Allocation: ");
--- a/hotspot/src/cpu/x86/vm/vm_version_x86_32.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86_32.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -68,9 +68,9 @@
                cmpxchg16: 1,
                         : 4,
                dca      : 1,
-                        : 4,
-               popcnt   : 1,
-                        : 8;
+               sse4_1   : 1,
+               sse4_2   : 1,
+                        : 11;
     } bits;
   };
 
@@ -177,8 +177,9 @@
      CPU_SSE2 = (1 << 7),
      CPU_SSE3 = (1 << 8), // sse3  comes from cpuid 1 (ECX)
      CPU_SSSE3= (1 << 9),
-     CPU_SSE4 = (1 <<10),
-     CPU_SSE4A= (1 <<11)
+     CPU_SSE4A= (1 <<10),
+     CPU_SSE4_1 = (1 << 11),
+     CPU_SSE4_2 = (1 << 12)
    } cpuFeatureFlags;
 
   // cpuid information block.  All info derived from executing cpuid with
@@ -240,22 +241,14 @@
   static CpuidInfo _cpuid_info;
 
   // Extractors and predicates
-  static bool is_extended_cpu_family() {
-    const uint32_t Extended_Cpu_Family = 0xf;
-    return _cpuid_info.std_cpuid1_rax.bits.family == Extended_Cpu_Family;
-  }
   static uint32_t extended_cpu_family() {
     uint32_t result = _cpuid_info.std_cpuid1_rax.bits.family;
-    if (is_extended_cpu_family()) {
-      result += _cpuid_info.std_cpuid1_rax.bits.ext_family;
-    }
+    result += _cpuid_info.std_cpuid1_rax.bits.ext_family;
     return result;
   }
   static uint32_t extended_cpu_model() {
     uint32_t result = _cpuid_info.std_cpuid1_rax.bits.model;
-    if (is_extended_cpu_family()) {
-      result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4;
-    }
+    result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4;
     return result;
   }
   static uint32_t cpu_stepping() {
@@ -293,6 +286,10 @@
       result |= CPU_SSSE3;
     if (is_amd() && _cpuid_info.ext_cpuid1_rcx.bits.sse4a != 0)
       result |= CPU_SSE4A;
+    if (_cpuid_info.std_cpuid1_rcx.bits.sse4_1 != 0)
+      result |= CPU_SSE4_1;
+    if (_cpuid_info.std_cpuid1_rcx.bits.sse4_2 != 0)
+      result |= CPU_SSE4_2;
     return result;
   }
 
@@ -380,7 +377,8 @@
   static bool supports_sse2()     { return (_cpuFeatures & CPU_SSE2) != 0; }
   static bool supports_sse3()     { return (_cpuFeatures & CPU_SSE3) != 0; }
   static bool supports_ssse3()    { return (_cpuFeatures & CPU_SSSE3)!= 0; }
-  static bool supports_sse4()     { return (_cpuFeatures & CPU_SSE4) != 0; }
+  static bool supports_sse4_1()   { return (_cpuFeatures & CPU_SSE4_1) != 0; }
+  static bool supports_sse4_2()   { return (_cpuFeatures & CPU_SSE4_2) != 0; }
   //
   // AMD features
   //
--- a/hotspot/src/cpu/x86/vm/vm_version_x86_64.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86_64.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -186,8 +186,10 @@
   if (!VM_Version::supports_sse2()) {
     vm_exit_during_initialization("Unknown x64 processor: SSE2 not supported");
   }
-  if (UseSSE < 4)
-    _cpuFeatures &= ~CPU_SSE4;
+  if (UseSSE < 4) {
+    _cpuFeatures &= ~CPU_SSE4_1;
+    _cpuFeatures &= ~CPU_SSE4_2;
+  }
   if (UseSSE < 3) {
     _cpuFeatures &= ~CPU_SSE3;
     _cpuFeatures &= ~CPU_SSSE3;
@@ -204,7 +206,7 @@
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -215,7 +217,8 @@
                (supports_sse2() ? ", sse2" : ""),
                (supports_sse3() ? ", sse3" : ""),
                (supports_ssse3()? ", ssse3": ""),
-               (supports_sse4() ? ", sse4" : ""),
+               (supports_sse4_1() ? ", sse4.1" : ""),
+               (supports_sse4_2() ? ", sse4.2" : ""),
                (supports_mmx_ext() ? ", mmxext" : ""),
                (supports_3dnow()   ? ", 3dnow"  : ""),
                (supports_3dnow2()  ? ", 3dnowext" : ""),
@@ -228,7 +231,7 @@
   // older Pentiums which do not support it.
   if( UseSSE > 4 ) UseSSE=4;
   if( UseSSE < 0 ) UseSSE=0;
-  if( !supports_sse4() ) // Drop to 3 if no SSE4 support
+  if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support
     UseSSE = MIN2((intx)3,UseSSE);
   if( !supports_sse3() ) // Drop to 2 if no SSE3 support
     UseSSE = MIN2((intx)2,UseSSE);
@@ -314,6 +317,14 @@
         MaxLoopPad = 11;
       }
 #endif // COMPILER2
+      if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
+        UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
+      }
+      if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus
+        if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
+          UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
+        }
+      }
     }
   }
 
@@ -355,7 +366,7 @@
 
 #ifndef PRODUCT
   if (PrintMiscellaneous && Verbose) {
-    tty->print_cr("Logical CPUs per package: %u",
+    tty->print_cr("Logical CPUs per core: %u",
                   logical_processors_per_package());
     tty->print_cr("UseSSE=%d",UseSSE);
     tty->print("Allocation: ");
--- a/hotspot/src/cpu/x86/vm/vm_version_x86_64.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86_64.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -68,9 +68,9 @@
                cmpxchg16: 1,
                         : 4,
                dca      : 1,
-                        : 4,
-               popcnt   : 1,
-                        : 8;
+               sse4_1   : 1,
+               sse4_2   : 1,
+                        : 11;
     } bits;
   };
 
@@ -177,8 +177,9 @@
      CPU_SSE2 = (1 << 7),
      CPU_SSE3 = (1 << 8),
      CPU_SSSE3= (1 << 9),
-     CPU_SSE4 = (1 <<10),
-     CPU_SSE4A= (1 <<11)
+     CPU_SSE4A= (1 <<10),
+     CPU_SSE4_1 = (1 << 11),
+     CPU_SSE4_2 = (1 << 12)
    } cpuFeatureFlags;
 
   // cpuid information block.  All info derived from executing cpuid with
@@ -240,22 +241,14 @@
   static CpuidInfo _cpuid_info;
 
   // Extractors and predicates
-  static bool is_extended_cpu_family() {
-    const uint32_t Extended_Cpu_Family = 0xf;
-    return _cpuid_info.std_cpuid1_eax.bits.family == Extended_Cpu_Family;
-  }
   static uint32_t extended_cpu_family() {
     uint32_t result = _cpuid_info.std_cpuid1_eax.bits.family;
-    if (is_extended_cpu_family()) {
-      result += _cpuid_info.std_cpuid1_eax.bits.ext_family;
-    }
+    result += _cpuid_info.std_cpuid1_eax.bits.ext_family;
     return result;
   }
   static uint32_t extended_cpu_model() {
     uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model;
-    if (is_extended_cpu_family()) {
-      result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4;
-    }
+    result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4;
     return result;
   }
   static uint32_t cpu_stepping() {
@@ -293,6 +286,10 @@
       result |= CPU_SSSE3;
     if (is_amd() && _cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0)
       result |= CPU_SSE4A;
+    if (_cpuid_info.std_cpuid1_ecx.bits.sse4_1 != 0)
+      result |= CPU_SSE4_1;
+    if (_cpuid_info.std_cpuid1_ecx.bits.sse4_2 != 0)
+      result |= CPU_SSE4_2;
     return result;
   }
 
@@ -380,7 +377,8 @@
   static bool supports_sse2()     { return (_cpuFeatures & CPU_SSE2) != 0; }
   static bool supports_sse3()     { return (_cpuFeatures & CPU_SSE3) != 0; }
   static bool supports_ssse3()    { return (_cpuFeatures & CPU_SSSE3)!= 0; }
-  static bool supports_sse4()     { return (_cpuFeatures & CPU_SSE4) != 0; }
+  static bool supports_sse4_1()   { return (_cpuFeatures & CPU_SSE4_1) != 0; }
+  static bool supports_sse4_2()   { return (_cpuFeatures & CPU_SSE4_2) != 0; }
   //
   // AMD features
   //
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Wed Jul 05 16:43:15 2017 +0200
@@ -4810,6 +4810,16 @@
   interface(CONST_INTER);
 %}
 
+// Long Immediate zero
+operand immL_M1() %{
+  predicate( n->get_long() == -1L );
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 // Long immediate from 0 to 127.
 // Used for a shorter form of long mul by 10.
 operand immL_127() %{
@@ -8621,6 +8631,18 @@
   ins_pipe( ialu_reg_reg );
 %}
 
+// Xor Register with Immediate -1
+instruct xorI_eReg_im1(eRegI dst, immI_M1 imm) %{
+  match(Set dst (XorI dst imm));  
+
+  size(2);
+  format %{ "NOT    $dst" %}  
+  ins_encode %{
+     __ notl($dst$$Register);
+  %}
+  ins_pipe( ialu_reg );
+%}
+
 // Xor Register with Immediate
 instruct xorI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (XorI dst src));
@@ -8938,6 +8960,18 @@
   ins_pipe( ialu_reg_reg_long );
 %}
 
+// Xor Long Register with Immediate -1
+instruct xorl_eReg_im1(eRegL dst, immL_M1 imm) %{
+  match(Set dst (XorL dst imm));  
+  format %{ "NOT    $dst.lo\n\t"
+            "NOT    $dst.hi" %}
+  ins_encode %{
+     __ notl($dst$$Register);
+     __ notl(HIGH_FROM_LOW($dst$$Register));
+  %}
+  ins_pipe( ialu_reg_long );
+%}
+
 // Xor Long Register with Immediate
 instruct xorl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
   match(Set dst (XorL dst src));
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Wed Jul 05 16:43:15 2017 +0200
@@ -9309,6 +9309,17 @@
   ins_pipe(ialu_reg_reg);
 %}
 
+// Xor Register with Immediate -1
+instruct xorI_rReg_im1(rRegI dst, immI_M1 imm) %{
+  match(Set dst (XorI dst imm));  
+
+  format %{ "not    $dst" %}  
+  ins_encode %{
+     __ notl($dst$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
 // Xor Register with Immediate
 instruct xorI_rReg_imm(rRegI dst, immI src, rFlagsReg cr)
 %{
@@ -9529,6 +9540,17 @@
   ins_pipe(ialu_reg_reg);
 %}
 
+// Xor Register with Immediate -1
+instruct xorL_rReg_im1(rRegL dst, immL_M1 imm) %{
+  match(Set dst (XorL dst imm));  
+
+  format %{ "notq   $dst" %}  
+  ins_encode %{
+     __ notq($dst$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
 // Xor Register with Immediate
 instruct xorL_rReg_imm(rRegL dst, immL32 src, rFlagsReg cr)
 %{
--- a/hotspot/src/os/linux/launcher/java.c	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/os/linux/launcher/java.c	Wed Jul 05 16:43:15 2017 +0200
@@ -1110,7 +1110,7 @@
         if (propname) {
             jclass cls;
             jmethodID mid;
-            NULL_CHECK0 (cls = (*env)->FindClass(env, "java/lang/System"));
+            NULL_CHECK0 (cls = FindBootStrapClass(env, "java/lang/System"));
             NULL_CHECK0 (mid = (*env)->GetStaticMethodID(
                                    env, cls,
                                    "getProperty",
@@ -1125,7 +1125,7 @@
 static jboolean isEncodingSupported(JNIEnv *env, jstring enc) {
     jclass cls;
     jmethodID mid;
-    NULL_CHECK0 (cls = (*env)->FindClass(env, "java/nio/charset/Charset"));
+    NULL_CHECK0 (cls = FindBootStrapClass(env, "java/nio/charset/Charset"));
     NULL_CHECK0 (mid = (*env)->GetStaticMethodID(
                            env, cls,
                            "isSupported",
@@ -1161,7 +1161,7 @@
 #else
             if (isEncodingSupported(env, enc) == JNI_TRUE) {
 #endif
-                NULL_CHECK0(cls = (*env)->FindClass(env, "java/lang/String"));
+                NULL_CHECK0(cls = FindBootStrapClass(env, "java/lang/String"));
                 NULL_CHECK0(mid = (*env)->GetMethodID(env, cls, "<init>",
                                           "([BLjava/lang/String;)V"));
                 str = (*env)->NewObject(env, cls, mid, ary, enc);
@@ -1172,7 +1172,7 @@
                   the encoding name, in which the StringCoding class will
                   pickup the iso-8859-1 as the fallback converter for us.
                 */
-                NULL_CHECK0(cls = (*env)->FindClass(env, "java/lang/String"));
+                NULL_CHECK0(cls = FindBootStrapClass(env, "java/lang/String"));
                 NULL_CHECK0(mid = (*env)->GetMethodID(env, cls, "<init>",
                                           "([B)V"));
                 str = (*env)->NewObject(env, cls, mid, ary);
@@ -1195,7 +1195,7 @@
     jarray ary;
     int i;
 
-    NULL_CHECK0(cls = (*env)->FindClass(env, "java/lang/String"));
+    NULL_CHECK0(cls = FindBootStrapClass(env, "java/lang/String"));
     NULL_CHECK0(ary = (*env)->NewObjectArray(env, strc, cls, 0));
     for (i = 0; i < strc; i++) {
         jstring str = NewPlatformString(env, *strv++);
@@ -1224,6 +1224,7 @@
         c = *t++;
         *s++ = (c == '.') ? '/' : c;
     } while (c != '\0');
+    // use the application class loader for main-class
     cls = (*env)->FindClass(env, buf);
     free(buf);
 
@@ -1250,7 +1251,7 @@
     jobject jar, man, attr;
     jstring str, result = 0;
 
-    NULL_CHECK0(cls = (*env)->FindClass(env, "java/util/jar/JarFile"));
+    NULL_CHECK0(cls = FindBootStrapClass(env, "java/util/jar/JarFile"));
     NULL_CHECK0(mid = (*env)->GetMethodID(env, cls, "<init>",
                                           "(Ljava/lang/String;)V"));
     NULL_CHECK0(str = NewPlatformString(env, jarname));
@@ -1471,7 +1472,7 @@
     jclass ver;
     jmethodID print;
 
-    NULL_CHECK(ver = (*env)->FindClass(env, "sun/misc/Version"));
+    NULL_CHECK(ver = FindBootStrapClass(env, "sun/misc/Version"));
     NULL_CHECK(print = (*env)->GetStaticMethodID(env, ver, "print", "()V"));
 
     (*env)->CallStaticVoidMethod(env, ver, print);
--- a/hotspot/src/os/linux/launcher/java.h	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/os/linux/launcher/java.h	Wed Jul 05 16:43:15 2017 +0200
@@ -100,5 +100,15 @@
  * Make launcher spit debug output.
  */
 extern jboolean _launcher_debug;
+/*
+ * This allows for finding classes from the VM's bootstrap class loader
+ * directly, FindClass uses the application class loader internally, this will
+ * cause unnecessary searching of the classpath for the required classes.
+ */
+typedef jclass (JNICALL FindClassFromBootLoader_t(JNIEnv *env,
+                                                const char *name,
+                                                jboolean throwError));
+
+jclass FindBootStrapClass(JNIEnv *env, const char *classname);
 
 #endif /* _JAVA_H_ */
--- a/hotspot/src/os/linux/launcher/java_md.c	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/os/linux/launcher/java_md.c	Wed Jul 05 16:43:15 2017 +0200
@@ -1826,3 +1826,23 @@
 {
     return(borrowed_unsetenv(name));
 }
+/*
+ * The implementation for finding classes from the bootstrap
+ * class loader, refer to java.h
+ */
+static FindClassFromBootLoader_t *findBootClass = NULL;
+
+jclass
+FindBootStrapClass(JNIEnv *env, const char* classname)
+{
+   if (findBootClass == NULL) {
+       findBootClass = (FindClassFromBootLoader_t *)dlsym(RTLD_DEFAULT,
+          "JVM_FindClassFromBootLoader");
+       if (findBootClass == NULL) {
+           fprintf(stderr, "Error: could not load method JVM_FindClassFromBootLoader");
+           return NULL;
+       }
+   }
+   return findBootClass(env, classname, JNI_FALSE);
+}
+
--- a/hotspot/src/os/linux/vm/globals_linux.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/os/linux/vm/globals_linux.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -38,5 +38,6 @@
 // platforms, but they may have different default values on other platforms.
 //
 define_pd_global(bool, UseLargePages, false);
+define_pd_global(bool, UseLargePagesIndividualAllocation, false);
 define_pd_global(bool, UseOSErrorReporting, false);
 define_pd_global(bool, UseThreadPriorities, true) ;
--- a/hotspot/src/os/linux/vm/os_linux.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/os/linux/vm/os_linux.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -1261,6 +1261,17 @@
   return (1000 * 1000);
 }
 
+// For now, we say that linux does not support vtime.  I have no idea
+// whether it can actually be made to (DLD, 9/13/05).
+
+bool os::supports_vtime() { return false; }
+bool os::enable_vtime()   { return false; }
+bool os::vtime_enabled()  { return false; }
+double os::elapsedVTime() {
+  // better than nothing, but not much
+  return elapsedTime();
+}
+
 jlong os::javaTimeMillis() {
   timeval time;
   int status = gettimeofday(&time, NULL);
--- a/hotspot/src/os/solaris/launcher/java.c	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/os/solaris/launcher/java.c	Wed Jul 05 16:43:15 2017 +0200
@@ -1110,7 +1110,7 @@
         if (propname) {
             jclass cls;
             jmethodID mid;
-            NULL_CHECK0 (cls = (*env)->FindClass(env, "java/lang/System"));
+            NULL_CHECK0 (cls = FindBootStrapClass(env, "java/lang/System"));
             NULL_CHECK0 (mid = (*env)->GetStaticMethodID(
                                    env, cls,
                                    "getProperty",
@@ -1125,7 +1125,7 @@
 static jboolean isEncodingSupported(JNIEnv *env, jstring enc) {
     jclass cls;
     jmethodID mid;
-    NULL_CHECK0 (cls = (*env)->FindClass(env, "java/nio/charset/Charset"));
+    NULL_CHECK0 (cls = FindBootStrapClass(env, "java/nio/charset/Charset"));
     NULL_CHECK0 (mid = (*env)->GetStaticMethodID(
                            env, cls,
                            "isSupported",
@@ -1161,7 +1161,7 @@
 #else
             if (isEncodingSupported(env, enc) == JNI_TRUE) {
 #endif
-                NULL_CHECK0(cls = (*env)->FindClass(env, "java/lang/String"));
+                NULL_CHECK0(cls = FindBootStrapClass(env, "java/lang/String"));
                 NULL_CHECK0(mid = (*env)->GetMethodID(env, cls, "<init>",
                                           "([BLjava/lang/String;)V"));
                 str = (*env)->NewObject(env, cls, mid, ary, enc);
@@ -1172,7 +1172,7 @@
                   the encoding name, in which the StringCoding class will
                   pickup the iso-8859-1 as the fallback converter for us.
                 */
-                NULL_CHECK0(cls = (*env)->FindClass(env, "java/lang/String"));
+                NULL_CHECK0(cls = FindBootStrapClass(env, "java/lang/String"));
                 NULL_CHECK0(mid = (*env)->GetMethodID(env, cls, "<init>",
                                           "([B)V"));
                 str = (*env)->NewObject(env, cls, mid, ary);
@@ -1195,7 +1195,7 @@
     jarray ary;
     int i;
 
-    NULL_CHECK0(cls = (*env)->FindClass(env, "java/lang/String"));
+    NULL_CHECK0(cls = FindBootStrapClass(env, "java/lang/String"));
     NULL_CHECK0(ary = (*env)->NewObjectArray(env, strc, cls, 0));
     for (i = 0; i < strc; i++) {
         jstring str = NewPlatformString(env, *strv++);
@@ -1224,6 +1224,7 @@
         c = *t++;
         *s++ = (c == '.') ? '/' : c;
     } while (c != '\0');
+    // use the application class loader for the main-class
     cls = (*env)->FindClass(env, buf);
     free(buf);
 
@@ -1250,7 +1251,7 @@
     jobject jar, man, attr;
     jstring str, result = 0;
 
-    NULL_CHECK0(cls = (*env)->FindClass(env, "java/util/jar/JarFile"));
+    NULL_CHECK0(cls = FindBootStrapClass(env, "java/util/jar/JarFile"));
     NULL_CHECK0(mid = (*env)->GetMethodID(env, cls, "<init>",
                                           "(Ljava/lang/String;)V"));
     NULL_CHECK0(str = NewPlatformString(env, jarname));
@@ -1471,7 +1472,7 @@
     jclass ver;
     jmethodID print;
 
-    NULL_CHECK(ver = (*env)->FindClass(env, "sun/misc/Version"));
+    NULL_CHECK(ver = FindBootStrapClass(env, "sun/misc/Version"));
     NULL_CHECK(print = (*env)->GetStaticMethodID(env, ver, "print", "()V"));
 
     (*env)->CallStaticVoidMethod(env, ver, print);
--- a/hotspot/src/os/solaris/launcher/java.h	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/os/solaris/launcher/java.h	Wed Jul 05 16:43:15 2017 +0200
@@ -101,4 +101,15 @@
  */
 extern jboolean _launcher_debug;
 
+/*
+ * This allows for finding classes from the VM's bootstrap class loader
+ * directly, FindClass uses the application class loader internally, this will
+ * cause unnecessary searching of the classpath for the required classes.
+ */
+typedef jclass (JNICALL FindClassFromBootLoader_t(JNIEnv *env,
+                                                const char *name,
+                                                jboolean throwError));
+
+jclass FindBootStrapClass(JNIEnv *env, const char *classname);
+
 #endif /* _JAVA_H_ */
--- a/hotspot/src/os/solaris/launcher/java_md.c	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/os/solaris/launcher/java_md.c	Wed Jul 05 16:43:15 2017 +0200
@@ -1826,3 +1826,24 @@
 {
     return(borrowed_unsetenv(name));
 }
+
+/*
+ * The implementation for finding classes from the bootstrap
+ * class loader, refer to java.h
+ */
+static FindClassFromBootLoader_t *findBootClass = NULL;
+
+jclass
+FindBootStrapClass(JNIEnv *env, const char* classname)
+{
+   if (findBootClass == NULL) {
+       findBootClass = (FindClassFromBootLoader_t *)dlsym(RTLD_DEFAULT,
+          "JVM_FindClassFromBootLoader");
+       if (findBootClass == NULL) {
+           fprintf(stderr, "Error: could not load method JVM_FindClassFromBootLoader");
+           return NULL;
+       }
+   }
+   return findBootClass(env, classname, JNI_FALSE);
+}
+
--- a/hotspot/src/os/solaris/vm/globals_solaris.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/os/solaris/vm/globals_solaris.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -44,5 +44,6 @@
 // platforms, but they may have different default values on other platforms.
 //
 define_pd_global(bool, UseLargePages, true);
+define_pd_global(bool, UseLargePagesIndividualAllocation, false);
 define_pd_global(bool, UseOSErrorReporting, false);
 define_pd_global(bool, UseThreadPriorities, false);
--- a/hotspot/src/os/solaris/vm/os_solaris.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/os/solaris/vm/os_solaris.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -462,16 +462,14 @@
   int online_cpus = sysconf(_SC_NPROCESSORS_ONLN);
   pid_t pid = getpid();
   psetid_t pset = PS_NONE;
-  // Are we running in a processor set?
+  // Are we running in a processor set or is there any processor set around?
   if (pset_bind(PS_QUERY, P_PID, pid, &pset) == 0) {
-    if (pset != PS_NONE) {
-      uint_t pset_cpus;
-      // Query number of cpus in processor set
-      if (pset_info(pset, NULL, &pset_cpus, NULL) == 0) {
-        assert(pset_cpus > 0 && pset_cpus <= online_cpus, "sanity check");
-        _processors_online = pset_cpus;
-        return pset_cpus;
-      }
+    uint_t pset_cpus;
+    // Query the number of cpus available to us.
+    if (pset_info(pset, NULL, &pset_cpus, NULL) == 0) {
+      assert(pset_cpus > 0 && pset_cpus <= online_cpus, "sanity check");
+      _processors_online = pset_cpus;
+      return pset_cpus;
     }
   }
   // Otherwise return number of online cpus
@@ -1691,6 +1689,40 @@
   }
 }
 
+bool os::supports_vtime() { return true; }
+
+bool os::enable_vtime() {
+  int fd = open("/proc/self/ctl", O_WRONLY);
+  if (fd == -1)
+    return false;
+
+  long cmd[] = { PCSET, PR_MSACCT };
+  int res = write(fd, cmd, sizeof(long) * 2);
+  close(fd);
+  if (res != sizeof(long) * 2)
+    return false;
+
+  return true;
+}
+
+bool os::vtime_enabled() {
+  int fd = open("/proc/self/status", O_RDONLY);
+  if (fd == -1)
+    return false;
+
+  pstatus_t status;
+  int res = read(fd, (void*) &status, sizeof(pstatus_t));
+  close(fd);
+  if (res != sizeof(pstatus_t))
+    return false;
+
+  return status.pr_flags & PR_MSACCT;
+}
+
+double os::elapsedVTime() {
+  return (double)gethrvtime() / (double)hrtime_hz;
+}
+
 // Used internally for comparisons only
 // getTimeMillis guaranteed to not move backwards on Solaris
 jlong getTimeMillis() {
@@ -2688,7 +2720,7 @@
    return bottom;
 }
 
-// Detect the topology change. Typically happens during CPU pluggin-unplugging.
+// Detect the topology change. Typically happens during CPU plugging-unplugging.
 bool os::numa_topology_changed() {
   int is_stale = Solaris::lgrp_cookie_stale(Solaris::lgrp_cookie());
   if (is_stale != -1 && is_stale) {
--- a/hotspot/src/os/windows/vm/globals_windows.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/os/windows/vm/globals_windows.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -37,5 +37,6 @@
 // platforms, but they may have different default values on other platforms.
 //
 define_pd_global(bool, UseLargePages, false);
+define_pd_global(bool, UseLargePagesIndividualAllocation, true);
 define_pd_global(bool, UseOSErrorReporting, false);  // for now.
 define_pd_global(bool, UseThreadPriorities, true) ;
--- a/hotspot/src/os/windows/vm/os_windows.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/os/windows/vm/os_windows.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -737,6 +737,17 @@
   return result;
 }
 
+// For now, we say that Windows does not support vtime.  I have no idea
+// whether it can actually be made to (DLD, 9/13/05).
+
+bool os::supports_vtime() { return false; }
+bool os::enable_vtime() { return false; }
+bool os::vtime_enabled() { return false; }
+double os::elapsedVTime() {
+  // better than nothing, but not much
+  return elapsedTime();
+}
+
 jlong os::javaTimeMillis() {
   if (UseFakeTimers) {
     return fake_time++;
@@ -2582,9 +2593,104 @@
 }
 
 char* os::reserve_memory_special(size_t bytes) {
-  DWORD flag = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES;
-  char * res = (char *)VirtualAlloc(NULL, bytes, flag, PAGE_EXECUTE_READWRITE);
-  return res;
+
+  if (UseLargePagesIndividualAllocation) {
+    if (TracePageSizes && Verbose) {
+       tty->print_cr("Reserving large pages individually.");
+    }
+    char * p_buf;
+    // first reserve enough address space in advance since we want to be
+    // able to break a single contiguous virtual address range into multiple
+    // large page commits but WS2003 does not allow reserving large page space
+    // so we just use 4K pages for reserve, this gives us a legal contiguous
+    // address space. then we will deallocate that reservation, and re alloc
+    // using large pages
+    const size_t size_of_reserve = bytes + _large_page_size;
+    if (bytes > size_of_reserve) {
+      // Overflowed.
+      warning("Individually allocated large pages failed, "
+        "use -XX:-UseLargePagesIndividualAllocation to turn off");
+      return NULL;
+    }
+    p_buf = (char *) VirtualAlloc(NULL,
+                                 size_of_reserve,  // size of Reserve
+                                 MEM_RESERVE,
+                                 PAGE_EXECUTE_READWRITE);
+    // If reservation failed, return NULL
+    if (p_buf == NULL) return NULL;
+
+    release_memory(p_buf, bytes + _large_page_size);
+    // round up to page boundary.  If the size_of_reserve did not
+    // overflow and the reservation did not fail, this align up
+    // should not overflow.
+    p_buf = (char *) align_size_up((size_t)p_buf, _large_page_size);
+
+    // now go through and allocate one page at a time until all bytes are
+    // allocated
+    size_t  bytes_remaining = align_size_up(bytes, _large_page_size);
+    // An overflow of align_size_up() would have been caught above
+    // in the calculation of size_of_reserve.
+    char * next_alloc_addr = p_buf;
+
+#ifdef ASSERT
+    // Variable for the failure injection
+    long ran_num = os::random();
+    size_t fail_after = ran_num % bytes;
+#endif
+
+    while (bytes_remaining) {
+      size_t bytes_to_rq = MIN2(bytes_remaining, _large_page_size);
+      // Note allocate and commit
+      char * p_new;
+
+#ifdef ASSERT
+      bool inject_error = LargePagesIndividualAllocationInjectError &&
+          (bytes_remaining <= fail_after);
+#else
+      const bool inject_error = false;
+#endif
+
+      if (inject_error) {
+        p_new = NULL;
+      } else {
+        p_new = (char *) VirtualAlloc(next_alloc_addr,
+                                    bytes_to_rq,
+                                    MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
+                                    PAGE_EXECUTE_READWRITE);
+      }
+
+      if (p_new == NULL) {
+        // Free any allocated pages
+        if (next_alloc_addr > p_buf) {
+          // Some memory was committed so release it.
+          size_t bytes_to_release = bytes - bytes_remaining;
+          release_memory(p_buf, bytes_to_release);
+        }
+#ifdef ASSERT
+        if (UseLargePagesIndividualAllocation &&
+            LargePagesIndividualAllocationInjectError) {
+          if (TracePageSizes && Verbose) {
+             tty->print_cr("Reserving large pages individually failed.");
+          }
+        }
+#endif
+        return NULL;
+      }
+      bytes_remaining -= bytes_to_rq;
+      next_alloc_addr += bytes_to_rq;
+    }
+
+    return p_buf;
+
+  } else {
+    // normal policy just allocate it all at once
+    DWORD flag = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES;
+    char * res = (char *)VirtualAlloc(NULL,
+                                      bytes,
+                                      flag,
+                                      PAGE_EXECUTE_READWRITE);
+    return res;
+  }
 }
 
 bool os::release_memory_special(char* base, size_t bytes) {
@@ -2972,6 +3078,7 @@
 volatile intx os::win32::_os_thread_count    = 0;
 
 bool   os::win32::_is_nt              = false;
+bool   os::win32::_is_windows_2003    = false;
 
 
 void os::win32::initialize_system_info() {
@@ -2994,7 +3101,15 @@
   GetVersionEx(&oi);
   switch(oi.dwPlatformId) {
     case VER_PLATFORM_WIN32_WINDOWS: _is_nt = false; break;
-    case VER_PLATFORM_WIN32_NT:      _is_nt = true;  break;
+    case VER_PLATFORM_WIN32_NT:
+      _is_nt = true;
+      {
+        int os_vers = oi.dwMajorVersion * 1000 + oi.dwMinorVersion;
+        if (os_vers == 5002) {
+          _is_windows_2003 = true;
+        }
+      }
+      break;
     default: fatal("Unknown platform");
   }
 
@@ -3092,9 +3207,13 @@
     NoYieldsInMicrolock = true;
   }
 #endif
+  // This may be overridden later when argument processing is done.
+  FLAG_SET_ERGO(bool, UseLargePagesIndividualAllocation,
+    os::win32::is_windows_2003());
+
   // Initialize main_process and main_thread
   main_process = GetCurrentProcess();  // Remember main_process is a pseudo handle
-  if (!DuplicateHandle(main_process, GetCurrentThread(), main_process,
+ if (!DuplicateHandle(main_process, GetCurrentThread(), main_process,
                        &main_thread, THREAD_ALL_ACCESS, false, 0)) {
     fatal("DuplicateHandle failed\n");
   }
--- a/hotspot/src/os/windows/vm/os_windows.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/os/windows/vm/os_windows.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -34,6 +34,7 @@
   static julong _physical_memory;
   static size_t _default_stack_size;
   static bool   _is_nt;
+  static bool   _is_windows_2003;
 
  public:
   // Windows-specific interface:
@@ -60,6 +61,9 @@
   // Tells whether the platform is NT or Windown95
   static bool is_nt() { return _is_nt; }
 
+  // Tells whether the platform is Windows 2003
+  static bool is_windows_2003() { return _is_windows_2003; }
+
   // Returns the byte size of a virtual memory page
   static int vm_page_size() { return _vm_page_size; }
 
--- a/hotspot/src/share/vm/adlc/formssel.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/adlc/formssel.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -3768,6 +3768,10 @@
 int MatchRule::is_ideal_copy() const {
   if( _rChild ) {
     const char  *opType = _rChild->_opType;
+#if 1
+    if( strcmp(opType,"CastIP")==0 )
+      return 1;
+#else
     if( strcmp(opType,"CastII")==0 )
       return 1;
     // Do not treat *CastPP this way, because it
@@ -3787,6 +3791,7 @@
     //  return 1;
     //if( strcmp(opType,"CastP2X")==0 )
     //  return 1;
+#endif
   }
   if( is_chain_rule(_AD.globalNames()) &&
       _lChild && strncmp(_lChild->_opType,"stackSlot",9)==0 )
--- a/hotspot/src/share/vm/asm/assembler.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/asm/assembler.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -249,8 +249,6 @@
 bool MacroAssembler::needs_explicit_null_check(intptr_t offset) {
   // Exception handler checks the nmethod's implicit null checks table
   // only when this method returns false.
-#ifndef SPARC
-  // Sparc does not have based addressing
   if (UseCompressedOops) {
     // The first page after heap_base is unmapped and
     // the 'offset' is equal to [heap_base + offset] for
@@ -261,7 +259,6 @@
       offset = (intptr_t)(pointer_delta((void*)offset, (void*)heap_base, 1));
     }
   }
-#endif // SPARC
   return offset < 0 || os::vm_page_size() <= offset;
 }
 
--- a/hotspot/src/share/vm/c1/c1_CodeStubs.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/c1/c1_CodeStubs.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -482,3 +482,81 @@
   virtual void print_name(outputStream* out) const { out->print("ArrayCopyStub"); }
 #endif // PRODUCT
 };
+
+//////////////////////////////////////////////////////////////////////////////////////////
+#ifndef SERIALGC
+
+// Code stubs for Garbage-First barriers.
+class G1PreBarrierStub: public CodeStub {
+ private:
+  LIR_Opr _addr;
+  LIR_Opr _pre_val;
+  LIR_PatchCode _patch_code;
+  CodeEmitInfo* _info;
+
+ public:
+  // pre_val (a temporary register) must be a register;
+  // addr (the address of the field to be read) must be a LIR_Address
+  G1PreBarrierStub(LIR_Opr addr, LIR_Opr pre_val, LIR_PatchCode patch_code, CodeEmitInfo* info) :
+    _addr(addr), _pre_val(pre_val), _patch_code(patch_code), _info(info)
+  {
+    assert(_pre_val->is_register(), "should be temporary register");
+    assert(_addr->is_address(), "should be the address of the field");
+  }
+
+  LIR_Opr addr() const { return _addr; }
+  LIR_Opr pre_val() const { return _pre_val; }
+  LIR_PatchCode patch_code() const { return _patch_code; }
+  CodeEmitInfo* info() const { return _info; }
+
+  virtual void emit_code(LIR_Assembler* e);
+  virtual void visit(LIR_OpVisitState* visitor) {
+    // don't pass in the code emit info since it's processed in the fast
+    // path
+    if (_info != NULL)
+      visitor->do_slow_case(_info);
+    else
+      visitor->do_slow_case();
+    visitor->do_input(_addr);
+    visitor->do_temp(_pre_val);
+  }
+#ifndef PRODUCT
+  virtual void print_name(outputStream* out) const { out->print("G1PreBarrierStub"); }
+#endif // PRODUCT
+};
+
+class G1PostBarrierStub: public CodeStub {
+ private:
+  LIR_Opr _addr;
+  LIR_Opr _new_val;
+
+  static jbyte* _byte_map_base;
+  static jbyte* byte_map_base_slow();
+  static jbyte* byte_map_base() {
+    if (_byte_map_base == NULL) {
+      _byte_map_base = byte_map_base_slow();
+    }
+    return _byte_map_base;
+  }
+
+ public:
+  // addr (the address of the object head) and new_val must be registers.
+  G1PostBarrierStub(LIR_Opr addr, LIR_Opr new_val): _addr(addr), _new_val(new_val) { }
+
+  LIR_Opr addr() const { return _addr; }
+  LIR_Opr new_val() const { return _new_val; }
+
+  virtual void emit_code(LIR_Assembler* e);
+  virtual void visit(LIR_OpVisitState* visitor) {
+    // don't pass in the code emit info since it's processed in the fast path
+    visitor->do_slow_case();
+    visitor->do_input(_addr);
+    visitor->do_input(_new_val);
+  }
+#ifndef PRODUCT
+  virtual void print_name(outputStream* out) const { out->print("G1PostBarrierStub"); }
+#endif // PRODUCT
+};
+
+#endif // SERIALGC
+//////////////////////////////////////////////////////////////////////////////////////////
--- a/hotspot/src/share/vm/c1/c1_LIRAssembler.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/c1/c1_LIRAssembler.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -74,6 +74,7 @@
 LIR_Assembler::LIR_Assembler(Compilation* c):
    _compilation(c)
  , _masm(c->masm())
+ , _bs(Universe::heap()->barrier_set())
  , _frame_map(c->frame_map())
  , _current_block(NULL)
  , _pending_non_safepoint(NULL)
--- a/hotspot/src/share/vm/c1/c1_LIRAssembler.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/c1/c1_LIRAssembler.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -24,11 +24,13 @@
 
 class Compilation;
 class ScopeValue;
+class BarrierSet;
 
 class LIR_Assembler: public CompilationResourceObj {
  private:
   C1_MacroAssembler* _masm;
   CodeStubList*      _slow_case_stubs;
+  BarrierSet*        _bs;
 
   Compilation*       _compilation;
   FrameMap*          _frame_map;
--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -285,16 +285,7 @@
 
 
 void LIRGenerator::init() {
-  BarrierSet* bs = Universe::heap()->barrier_set();
-  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
-  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
-  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
-
-#ifdef _LP64
-  _card_table_base = new LIR_Const((jlong)ct->byte_map_base);
-#else
-  _card_table_base = new LIR_Const((jint)ct->byte_map_base);
-#endif
+  _bs = Universe::heap()->barrier_set();
 }
 
 
@@ -1239,8 +1230,37 @@
 
 // Various barriers
 
+void LIRGenerator::pre_barrier(LIR_Opr addr_opr, bool patch,  CodeEmitInfo* info) {
+  // Do the pre-write barrier, if any.
+  switch (_bs->kind()) {
+#ifndef SERIALGC
+    case BarrierSet::G1SATBCT:
+    case BarrierSet::G1SATBCTLogging:
+      G1SATBCardTableModRef_pre_barrier(addr_opr, patch, info);
+      break;
+#endif // SERIALGC
+    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableExtension:
+      // No pre barriers
+      break;
+    case BarrierSet::ModRef:
+    case BarrierSet::Other:
+      // No pre barriers
+      break;
+    default      :
+      ShouldNotReachHere();
+
+  }
+}
+
 void LIRGenerator::post_barrier(LIR_OprDesc* addr, LIR_OprDesc* new_val) {
-  switch (Universe::heap()->barrier_set()->kind()) {
+  switch (_bs->kind()) {
+#ifndef SERIALGC
+    case BarrierSet::G1SATBCT:
+    case BarrierSet::G1SATBCTLogging:
+      G1SATBCardTableModRef_post_barrier(addr,  new_val);
+      break;
+#endif // SERIALGC
     case BarrierSet::CardTableModRef:
     case BarrierSet::CardTableExtension:
       CardTableModRef_post_barrier(addr,  new_val);
@@ -1254,11 +1274,120 @@
     }
 }
 
+////////////////////////////////////////////////////////////////////////
+#ifndef SERIALGC
+
+void LIRGenerator::G1SATBCardTableModRef_pre_barrier(LIR_Opr addr_opr, bool patch,  CodeEmitInfo* info) {
+  if (G1DisablePreBarrier) return;
+
+  // First we test whether marking is in progress.
+  BasicType flag_type;
+  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
+    flag_type = T_INT;
+  } else {
+    guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1,
+              "Assumption");
+    flag_type = T_BYTE;
+  }
+  LIR_Opr thrd = getThreadPointer();
+  LIR_Address* mark_active_flag_addr =
+    new LIR_Address(thrd,
+                    in_bytes(JavaThread::satb_mark_queue_offset() +
+                             PtrQueue::byte_offset_of_active()),
+                    flag_type);
+  // Read the marking-in-progress flag.
+  LIR_Opr flag_val = new_register(T_INT);
+  __ load(mark_active_flag_addr, flag_val);
+
+  LabelObj* start_store = new LabelObj();
+
+  LIR_PatchCode pre_val_patch_code =
+    patch ? lir_patch_normal : lir_patch_none;
+
+  LIR_Opr pre_val = new_register(T_OBJECT);
+
+  __ cmp(lir_cond_notEqual, flag_val, LIR_OprFact::intConst(0));
+  if (!addr_opr->is_address()) {
+    assert(addr_opr->is_register(), "must be");
+    addr_opr = LIR_OprFact::address(new LIR_Address(addr_opr, 0, T_OBJECT));
+  }
+  CodeStub* slow = new G1PreBarrierStub(addr_opr, pre_val, pre_val_patch_code,
+                                        info);
+  __ branch(lir_cond_notEqual, T_INT, slow);
+  __ branch_destination(slow->continuation());
+}
+
+void LIRGenerator::G1SATBCardTableModRef_post_barrier(LIR_OprDesc* addr, LIR_OprDesc* new_val) {
+  if (G1DisablePostBarrier) return;
+
+  // If the "new_val" is a constant NULL, no barrier is necessary.
+  if (new_val->is_constant() &&
+      new_val->as_constant_ptr()->as_jobject() == NULL) return;
+
+  if (!new_val->is_register()) {
+    LIR_Opr new_val_reg = new_pointer_register();
+    if (new_val->is_constant()) {
+      __ move(new_val, new_val_reg);
+    } else {
+      __ leal(new_val, new_val_reg);
+    }
+    new_val = new_val_reg;
+  }
+  assert(new_val->is_register(), "must be a register at this point");
+
+  if (addr->is_address()) {
+    LIR_Address* address = addr->as_address_ptr();
+    LIR_Opr ptr = new_pointer_register();
+    if (!address->index()->is_valid() && address->disp() == 0) {
+      __ move(address->base(), ptr);
+    } else {
+      assert(address->disp() != max_jint, "lea doesn't support patched addresses!");
+      __ leal(addr, ptr);
+    }
+    addr = ptr;
+  }
+  assert(addr->is_register(), "must be a register at this point");
+
+  LIR_Opr xor_res = new_pointer_register();
+  LIR_Opr xor_shift_res = new_pointer_register();
+
+  if (TwoOperandLIRForm ) {
+    __ move(addr, xor_res);
+    __ logical_xor(xor_res, new_val, xor_res);
+    __ move(xor_res, xor_shift_res);
+    __ unsigned_shift_right(xor_shift_res,
+                            LIR_OprFact::intConst(HeapRegion::LogOfHRGrainBytes),
+                            xor_shift_res,
+                            LIR_OprDesc::illegalOpr());
+  } else {
+    __ logical_xor(addr, new_val, xor_res);
+    __ unsigned_shift_right(xor_res,
+                            LIR_OprFact::intConst(HeapRegion::LogOfHRGrainBytes),
+                            xor_shift_res,
+                            LIR_OprDesc::illegalOpr());
+  }
+
+  if (!new_val->is_register()) {
+    LIR_Opr new_val_reg = new_pointer_register();
+    __ leal(new_val, new_val_reg);
+    new_val = new_val_reg;
+  }
+  assert(new_val->is_register(), "must be a register at this point");
+
+  __ cmp(lir_cond_notEqual, xor_shift_res, LIR_OprFact::intptrConst(NULL_WORD));
+
+  CodeStub* slow = new G1PostBarrierStub(addr, new_val);
+  __ branch(lir_cond_notEqual, T_INT, slow);
+  __ branch_destination(slow->continuation());
+}
+
+#endif // SERIALGC
+////////////////////////////////////////////////////////////////////////
+
 void LIRGenerator::CardTableModRef_post_barrier(LIR_OprDesc* addr, LIR_OprDesc* new_val) {
 
-  BarrierSet* bs = Universe::heap()->barrier_set();
-  assert(sizeof(*((CardTableModRefBS*)bs)->byte_map_base) == sizeof(jbyte), "adjust this code");
-  LIR_Const* card_table_base = new LIR_Const(((CardTableModRefBS*)bs)->byte_map_base);
+  assert(sizeof(*((CardTableModRefBS*)_bs)->byte_map_base) == sizeof(jbyte), "adjust this code");
+  LIR_Const* card_table_base = new LIR_Const(((CardTableModRefBS*)_bs)->byte_map_base);
   if (addr->is_address()) {
     LIR_Address* address = addr->as_address_ptr();
     LIR_Opr ptr = new_register(T_OBJECT);
@@ -1388,6 +1517,13 @@
     __ membar_release();
   }
 
+  if (is_oop) {
+    // Do the pre-write barrier, if any.
+    pre_barrier(LIR_OprFact::address(address),
+                needs_patching,
+                (info ? new CodeEmitInfo(info) : NULL));
+  }
+
   if (is_volatile) {
     assert(!needs_patching && x->is_loaded(),
            "how do we know it's volatile if it's not loaded");
@@ -1398,7 +1534,12 @@
   }
 
   if (is_oop) {
+#ifdef PRECISE_CARDMARK
+    // Precise cardmarks don't work
+    post_barrier(LIR_OprFact::address(address), value.result());
+#else
     post_barrier(object.result(), value.result());
+#endif // PRECISE_CARDMARK
   }
 
   if (is_volatile && os::is_MP()) {
--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -145,6 +145,7 @@
 
 // only the classes below belong in the same file
 class LIRGenerator: public InstructionVisitor, public BlockClosure {
+
  private:
   Compilation*  _compilation;
   ciMethod*     _method;    // method that we are compiling
@@ -154,6 +155,7 @@
   Values        _instruction_for_operand;
   BitMap2D      _vreg_flags; // flags which can be set on a per-vreg basis
   LIR_List*     _lir;
+  BarrierSet*   _bs;
 
   LIRGenerator* gen() {
     return this;
@@ -174,8 +176,6 @@
   LIR_OprList                     _reg_for_constants;
   Values                          _unpinned_constants;
 
-  LIR_Const*                      _card_table_base;
-
   friend class PhiResolver;
 
   // unified bailout support
@@ -196,8 +196,6 @@
   LIR_Opr load_constant(Constant* x);
   LIR_Opr load_constant(LIR_Const* constant);
 
-  LIR_Const* card_table_base() const { return _card_table_base; }
-
   void  set_result(Value x, LIR_Opr opr)           {
     assert(opr->is_valid(), "must set to valid value");
     assert(x->operand()->is_illegal(), "operand should never change");
@@ -253,12 +251,17 @@
 
   // generic interface
 
+  void pre_barrier(LIR_Opr addr_opr, bool patch,  CodeEmitInfo* info);
   void post_barrier(LIR_OprDesc* addr, LIR_OprDesc* new_val);
 
   // specific implementations
+  // pre barriers
+
+  void G1SATBCardTableModRef_pre_barrier(LIR_Opr addr_opr, bool patch,  CodeEmitInfo* info);
 
   // post barriers
 
+  void G1SATBCardTableModRef_post_barrier(LIR_OprDesc* addr, LIR_OprDesc* new_val);
   void CardTableModRef_post_barrier(LIR_OprDesc* addr, LIR_OprDesc* new_val);
 
 
--- a/hotspot/src/share/vm/c1/c1_Runtime1.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/c1/c1_Runtime1.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -168,6 +168,8 @@
   switch (id) {
     // These stubs don't need to have an oopmap
     case dtrace_object_alloc_id:
+    case g1_pre_barrier_slow_id:
+    case g1_post_barrier_slow_id:
     case slow_subtype_check_id:
     case fpu2long_stub_id:
     case unwind_exception_id:
--- a/hotspot/src/share/vm/c1/c1_Runtime1.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/c1/c1_Runtime1.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -56,6 +56,8 @@
   stub(access_field_patching)        \
   stub(load_klass_patching)          \
   stub(jvmti_exception_throw)        \
+  stub(g1_pre_barrier_slow)          \
+  stub(g1_post_barrier_slow)         \
   stub(fpu2long_stub)                \
   stub(counter_overflow)             \
   last_entry(number_of_ids)
--- a/hotspot/src/share/vm/c1/c1_globals.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/c1/c1_globals.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -213,9 +213,6 @@
   develop(bool, UseFastLocking, true,                                       \
           "Use fast inlined locking code")                                  \
                                                                             \
-  product(bool, FastTLABRefill, true,                                       \
-          "Use fast TLAB refill code")                                      \
-                                                                            \
   develop(bool, UseSlowPath, false,                                         \
           "For debugging: test slow cases by always using them")            \
                                                                             \
--- a/hotspot/src/share/vm/ci/ciMethodBlocks.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/ci/ciMethodBlocks.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -49,7 +49,7 @@
 // first half.  Returns the range beginning at bci.
 ciBlock *ciMethodBlocks::split_block_at(int bci) {
   ciBlock *former_block = block_containing(bci);
-  ciBlock *new_block = new(_arena) ciBlock(_method, _num_blocks++, this, former_block->start_bci());
+  ciBlock *new_block = new(_arena) ciBlock(_method, _num_blocks++, former_block->start_bci());
   _blocks->append(new_block);
   assert(former_block != NULL, "must not be NULL");
   new_block->set_limit_bci(bci);
@@ -83,7 +83,7 @@
   if (cb == NULL ) {
     // This is our first time visiting this bytecode.  Create
     // a fresh block and assign it this starting point.
-    ciBlock *nb = new(_arena) ciBlock(_method, _num_blocks++, this, bci);
+    ciBlock *nb = new(_arena) ciBlock(_method, _num_blocks++, bci);
     _blocks->append(nb);
      _bci_to_block[bci] = nb;
     return nb;
@@ -98,6 +98,11 @@
   }
 }
 
+ciBlock *ciMethodBlocks::make_dummy_block() {
+  ciBlock *dum = new(_arena) ciBlock(_method, -1, 0);
+  return dum;
+}
+
 void ciMethodBlocks::do_analysis() {
   ciBytecodeStream s(_method);
   ciBlock *cur_block = block_containing(0);
@@ -253,7 +258,7 @@
   Copy::zero_to_words((HeapWord*) _bci_to_block, b2bsize / sizeof(HeapWord));
 
   // create initial block covering the entire method
-  ciBlock *b = new(arena) ciBlock(_method, _num_blocks++, this, 0);
+  ciBlock *b = new(arena) ciBlock(_method, _num_blocks++, 0);
   _blocks->append(b);
   _bci_to_block[0] = b;
 
@@ -334,7 +339,7 @@
 #endif
 
 
-ciBlock::ciBlock(ciMethod *method, int index, ciMethodBlocks *mb, int start_bci) :
+ciBlock::ciBlock(ciMethod *method, int index, int start_bci) :
 #ifndef PRODUCT
                          _method(method),
 #endif
--- a/hotspot/src/share/vm/ci/ciMethodBlocks.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/ci/ciMethodBlocks.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -48,6 +48,8 @@
   int num_blocks()  { return _num_blocks;}
   void clear_processed();
 
+  ciBlock *make_dummy_block(); // a block not associated with a bci
+
 #ifndef PRODUCT
   void dump();
 #endif
@@ -81,7 +83,7 @@
     fall_through_bci = -1
   };
 
-  ciBlock(ciMethod *method, int index, ciMethodBlocks *mb, int start_bci);
+  ciBlock(ciMethod *method, int index, int start_bci);
   int start_bci() const         { return _start_bci; }
   int limit_bci() const         { return _limit_bci; }
   int control_bci() const       { return _control_bci; }
@@ -94,7 +96,6 @@
   int ex_limit_bci() const      { return _ex_limit_bci; }
   bool contains(int bci) const { return start_bci() <= bci && bci < limit_bci(); }
 
-
   // flag handling
   bool  processed() const           { return (_flags & Processed) != 0; }
   bool  is_handler() const          { return (_flags & Handler) != 0; }
--- a/hotspot/src/share/vm/ci/ciTypeFlow.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/ci/ciTypeFlow.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -338,8 +338,10 @@
   }
   _trap_bci = -1;
   _trap_index = 0;
+  _def_locals.clear();
 }
 
+
 // ------------------------------------------------------------------
 // ciTypeFlow::get_start_state
 //
@@ -735,7 +737,7 @@
 void ciTypeFlow::StateVector::do_new(ciBytecodeStream* str) {
   bool will_link;
   ciKlass* klass = str->get_klass(will_link);
-  if (!will_link) {
+  if (!will_link || str->is_unresolved_klass()) {
     trap(str, klass, str->get_klass_index());
   } else {
     push_object(klass);
@@ -1268,7 +1270,9 @@
     }
   case Bytecodes::_iinc:
     {
-      check_int(local(str->get_index()));
+      int lnum = str->get_index();
+      check_int(local(lnum));
+      store_to_local(lnum);
       break;
     }
   case Bytecodes::_iload:   load_local_int(str->get_index()); break;
@@ -1506,6 +1510,46 @@
 }
 #endif
 
+
+// ------------------------------------------------------------------
+// ciTypeFlow::SuccIter::next
+//
+void ciTypeFlow::SuccIter::next() {
+  int succ_ct = _pred->successors()->length();
+  int next = _index + 1;
+  if (next < succ_ct) {
+    _index = next;
+    _succ = _pred->successors()->at(next);
+    return;
+  }
+  for (int i = next - succ_ct; i < _pred->exceptions()->length(); i++) {
+    // Do not compile any code for unloaded exception types.
+    // Following compiler passes are responsible for doing this also.
+    ciInstanceKlass* exception_klass = _pred->exc_klasses()->at(i);
+    if (exception_klass->is_loaded()) {
+      _index = next;
+      _succ = _pred->exceptions()->at(i);
+      return;
+    }
+    next++;
+  }
+  _index = -1;
+  _succ = NULL;
+}
+
+// ------------------------------------------------------------------
+// ciTypeFlow::SuccIter::set_succ
+//
+void ciTypeFlow::SuccIter::set_succ(Block* succ) {
+  int succ_ct = _pred->successors()->length();
+  if (_index < succ_ct) {
+    _pred->successors()->at_put(_index, succ);
+  } else {
+    int idx = _index - succ_ct;
+    _pred->exceptions()->at_put(idx, succ);
+  }
+}
+
 // ciTypeFlow::Block
 //
 // A basic block.
@@ -1526,10 +1570,11 @@
   _jsrs = new_jsrs;
   _next = NULL;
   _on_work_list = false;
-  _pre_order = -1; assert(!has_pre_order(), "");
-  _private_copy = false;
+  _backedge_copy = false;
+  _exception_entry = false;
   _trap_bci = -1;
   _trap_index = 0;
+  df_init();
 
   if (CITraceTypeFlow) {
     tty->print_cr(">> Created new block");
@@ -1541,55 +1586,13 @@
 }
 
 // ------------------------------------------------------------------
-// ciTypeFlow::Block::clone_loop_head
-//
-ciTypeFlow::Block*
-ciTypeFlow::Block::clone_loop_head(ciTypeFlow* analyzer,
-                                   int branch_bci,
-                                   ciTypeFlow::Block* target,
-                                   ciTypeFlow::JsrSet* jsrs) {
-  // Loop optimizations are not performed on Tier1 compiles. Do nothing.
-  if (analyzer->env()->comp_level() < CompLevel_full_optimization) {
-    return target;
-  }
-
-  // The current block ends with a branch.
-  //
-  // If the target block appears to be the test-clause of a for loop, and
-  // it is not too large, and it has not yet been cloned, clone it.
-  // The pre-existing copy becomes the private clone used only by
-  // the initial iteration of the loop.  (We know we are simulating
-  // the initial iteration right now, since we have never calculated
-  // successors before for this block.)
-
-  if (branch_bci <= start()
-      && (target->limit() - target->start()) <= CICloneLoopTestLimit
-      && target->private_copy_count() == 0) {
-    // Setting the private_copy bit ensures that the target block cannot be
-    // reached by any other paths, such as fall-in from the loop body.
-    // The private copy will be accessible only on successor lists
-    // created up to this point.
-    target->set_private_copy(true);
-    if (CITraceTypeFlow) {
-      tty->print(">> Cloning a test-clause block ");
-      print_value_on(tty);
-      tty->cr();
-    }
-    // If the target is the current block, then later on a new copy of the
-    // target block will be created when its bytecodes are reached by
-    // an alternate path. (This is the case for loops with the loop
-    // head at the bci-wise bottom of the loop, as with pre-1.4.2 javac.)
-    //
-    // Otherwise, duplicate the target block now and use it immediately.
-    // (The case for loops with the loop head at the bci-wise top of the
-    // loop, as with 1.4.2 javac.)
-    //
-    // In either case, the new copy of the block will remain public.
-    if (target != this) {
-      target = analyzer->block_at(branch_bci, jsrs);
-    }
-  }
-  return target;
+// ciTypeFlow::Block::df_init
+void ciTypeFlow::Block::df_init() {
+  _pre_order = -1; assert(!has_pre_order(), "");
+  _post_order = -1; assert(!has_post_order(), "");
+  _loop = NULL;
+  _irreducible_entry = false;
+  _rpo_next = NULL;
 }
 
 // ------------------------------------------------------------------
@@ -1644,7 +1647,6 @@
       case Bytecodes::_ifnull:       case Bytecodes::_ifnonnull:
         // Our successors are the branch target and the next bci.
         branch_bci = str->get_dest();
-        clone_loop_head(analyzer, branch_bci, this, jsrs);
         _successors =
           new (arena) GrowableArray<Block*>(arena, 2, 0, NULL);
         assert(_successors->length() == IF_NOT_TAKEN, "");
@@ -1658,14 +1660,7 @@
         _successors =
           new (arena) GrowableArray<Block*>(arena, 1, 0, NULL);
         assert(_successors->length() == GOTO_TARGET, "");
-        target = analyzer->block_at(branch_bci, jsrs);
-        // If the target block has not been visited yet, and looks like
-        // a two-way branch, attempt to clone it if it is a loop head.
-        if (target->_successors != NULL
-            && target->_successors->length() == (IF_TAKEN + 1)) {
-          target = clone_loop_head(analyzer, branch_bci, target, jsrs);
-        }
-        _successors->append(target);
+        _successors->append(analyzer->block_at(branch_bci, jsrs));
         break;
 
       case Bytecodes::_jsr:
@@ -1801,65 +1796,60 @@
 }
 
 // ------------------------------------------------------------------
-// ciTypeFlow::Block::is_simpler_than
-//
-// A relation used to order our work list.  We work on a block earlier
-// if it has a smaller jsr stack or it occurs earlier in the program
-// text.
-//
-// Note: maybe we should redo this functionality to make blocks
-// which correspond to exceptions lower priority.
-bool ciTypeFlow::Block::is_simpler_than(ciTypeFlow::Block* other) {
-  if (other == NULL) {
-    return true;
-  } else {
-    int size1 = _jsrs->size();
-    int size2 = other->_jsrs->size();
-    if (size1 < size2) {
-      return true;
-    } else if (size2 < size1) {
-      return false;
-    } else {
-#if 0
-      if (size1 > 0) {
-        int r1 = _jsrs->record_at(0)->return_address();
-        int r2 = _jsrs->record_at(0)->return_address();
-        if (r1 < r2) {
-          return true;
-        } else if (r2 < r1) {
-          return false;
-        } else {
-          int e1 = _jsrs->record_at(0)->return_address();
-          int e2 = _jsrs->record_at(0)->return_address();
-          if (e1 < e2) {
-            return true;
-          } else if (e2 < e1) {
-            return false;
-          }
-        }
-      }
-#endif
-      return (start() <= other->start());
-    }
-  }
+// ciTypeFlow::Block::set_backedge_copy
+// Use this only to make a pre-existing public block into a backedge copy.
+void ciTypeFlow::Block::set_backedge_copy(bool z) {
+  assert(z || (z == is_backedge_copy()), "cannot make a backedge copy public");
+  _backedge_copy = z;
 }
 
 // ------------------------------------------------------------------
-// ciTypeFlow::Block::set_private_copy
-// Use this only to make a pre-existing public block into a private copy.
-void ciTypeFlow::Block::set_private_copy(bool z) {
-  assert(z || (z == is_private_copy()), "cannot make a private copy public");
-  _private_copy = z;
+// ciTypeFlow::Block::is_clonable_exit
+//
+// At most 2 normal successors, one of which continues looping,
+// and all exceptional successors must exit.
+bool ciTypeFlow::Block::is_clonable_exit(ciTypeFlow::Loop* lp) {
+  int normal_cnt  = 0;
+  int in_loop_cnt = 0;
+  for (SuccIter iter(this); !iter.done(); iter.next()) {
+    Block* succ = iter.succ();
+    if (iter.is_normal_ctrl()) {
+      if (++normal_cnt > 2) return false;
+      if (lp->contains(succ->loop())) {
+        if (++in_loop_cnt > 1) return false;
+      }
+    } else {
+      if (lp->contains(succ->loop())) return false;
+    }
+  }
+  return in_loop_cnt == 1;
+}
+
+// ------------------------------------------------------------------
+// ciTypeFlow::Block::looping_succ
+//
+ciTypeFlow::Block* ciTypeFlow::Block::looping_succ(ciTypeFlow::Loop* lp) {
+  assert(successors()->length() <= 2, "at most 2 normal successors");
+  for (SuccIter iter(this); !iter.done(); iter.next()) {
+    Block* succ = iter.succ();
+    if (lp->contains(succ->loop())) {
+      return succ;
+    }
+  }
+  return NULL;
 }
 
 #ifndef PRODUCT
 // ------------------------------------------------------------------
 // ciTypeFlow::Block::print_value_on
 void ciTypeFlow::Block::print_value_on(outputStream* st) const {
-  if (has_pre_order())  st->print("#%-2d ", pre_order());
+  if (has_pre_order()) st->print("#%-2d ", pre_order());
+  if (has_rpo())       st->print("rpo#%-2d ", rpo());
   st->print("[%d - %d)", start(), limit());
+  if (is_loop_head()) st->print(" lphd");
+  if (is_irreducible_entry()) st->print(" irred");
   if (_jsrs->size() > 0) { st->print("/");  _jsrs->print_on(st); }
-  if (is_private_copy())  st->print("/private_copy");
+  if (is_backedge_copy())  st->print("/backedge_copy");
 }
 
 // ------------------------------------------------------------------
@@ -1871,6 +1861,16 @@
   st->print_cr("  ====================================================  ");
   st->print ("  ");
   print_value_on(st);
+  st->print(" Stored locals: "); def_locals()->print_on(st, outer()->method()->max_locals()); tty->cr();
+  if (loop() && loop()->parent() != NULL) {
+    st->print(" loops:");
+    Loop* lp = loop();
+    do {
+      st->print(" %d<-%d", lp->head()->pre_order(),lp->tail()->pre_order());
+      if (lp->is_irreducible()) st->print("(ir)");
+      lp = lp->parent();
+    } while (lp->parent() != NULL);
+  }
   st->cr();
   _state->print_on(st);
   if (_successors == NULL) {
@@ -1907,6 +1907,21 @@
 }
 #endif
 
+#ifndef PRODUCT
+// ------------------------------------------------------------------
+// ciTypeFlow::LocalSet::print_on
+void ciTypeFlow::LocalSet::print_on(outputStream* st, int limit) const {
+  st->print("{");
+  for (int i = 0; i < max; i++) {
+    if (test(i)) st->print(" %d", i);
+  }
+  if (limit > max) {
+    st->print(" %d..%d ", max, limit);
+  }
+  st->print(" }");
+}
+#endif
+
 // ciTypeFlow
 //
 // This is a pass over the bytecodes which computes the following:
@@ -1922,12 +1937,11 @@
   _max_locals = method->max_locals();
   _max_stack = method->max_stack();
   _code_size = method->code_size();
+  _has_irreducible_entry = false;
   _osr_bci = osr_bci;
   _failure_reason = NULL;
   assert(start_bci() >= 0 && start_bci() < code_size() , "correct osr_bci argument");
-
   _work_list = NULL;
-  _next_pre_order = 0;
 
   _ciblock_count = _methodBlocks->num_blocks();
   _idx_to_blocklist = NEW_ARENA_ARRAY(arena(), GrowableArray<Block*>*, _ciblock_count);
@@ -1949,12 +1963,6 @@
   _work_list = next_block->next();
   next_block->set_next(NULL);
   next_block->set_on_work_list(false);
-  if (!next_block->has_pre_order()) {
-    // Assign "pre_order" as each new block is taken from the work list.
-    // This number may be used by following phases to order block visits.
-    assert(!have_block_count(), "must not have mapped blocks yet")
-    next_block->set_pre_order(_next_pre_order++);
-  }
   return next_block;
 }
 
@@ -1962,30 +1970,37 @@
 // ciTypeFlow::add_to_work_list
 //
 // Add a basic block to our work list.
+// List is sorted by decreasing postorder sort (same as increasing RPO)
 void ciTypeFlow::add_to_work_list(ciTypeFlow::Block* block) {
   assert(!block->is_on_work_list(), "must not already be on work list");
 
   if (CITraceTypeFlow) {
-    tty->print(">> Adding block%s ", block->has_pre_order() ? " (again)" : "");
+    tty->print(">> Adding block ");
     block->print_value_on(tty);
     tty->print_cr(" to the work list : ");
   }
 
   block->set_on_work_list(true);
-  if (block->is_simpler_than(_work_list)) {
+
+  // decreasing post order sort
+
+  Block* prev = NULL;
+  Block* current = _work_list;
+  int po = block->post_order();
+  while (current != NULL) {
+    if (!current->has_post_order() || po > current->post_order())
+      break;
+    prev = current;
+    current = current->next();
+  }
+  if (prev == NULL) {
     block->set_next(_work_list);
     _work_list = block;
   } else {
-    Block *temp = _work_list;
-    while (!block->is_simpler_than(temp->next())) {
-      if (CITraceTypeFlow) {
-        tty->print(".");
-      }
-      temp = temp->next();
-    }
-    block->set_next(temp->next());
-    temp->set_next(block);
+    block->set_next(current);
+    prev->set_next(block);
   }
+
   if (CITraceTypeFlow) {
     tty->cr();
   }
@@ -2008,7 +2023,7 @@
   assert(ciblk->start_bci() == bci, "bad ciBlock boundaries");
   Block* block = get_block_for(ciblk->index(), jsrs, option);
 
-  assert(block == NULL? (option == no_create): block->is_private_copy() == (option == create_private_copy), "create option consistent with result");
+  assert(block == NULL? (option == no_create): block->is_backedge_copy() == (option == create_backedge_copy), "create option consistent with result");
 
   if (CITraceTypeFlow) {
     if (block != NULL) {
@@ -2072,8 +2087,9 @@
     }
 
     if (block->meet_exception(exception_klass, state)) {
-      // Block was modified.  Add it to the work list.
-      if (!block->is_on_work_list()) {
+      // Block was modified and has PO.  Add it to the work list.
+      if (block->has_post_order() &&
+          !block->is_on_work_list()) {
         add_to_work_list(block);
       }
     }
@@ -2091,8 +2107,9 @@
   for (int i = 0; i < len; i++) {
     Block* block = successors->at(i);
     if (block->meet(state)) {
-      // Block was modified.  Add it to the work list.
-      if (!block->is_on_work_list()) {
+      // Block was modified and has PO.  Add it to the work list.
+      if (block->has_post_order() &&
+          !block->is_on_work_list()) {
         add_to_work_list(block);
       }
     }
@@ -2133,6 +2150,111 @@
   return true;
 }
 
+// ------------------------------------------------------------------
+// ciTypeFlow::clone_loop_heads
+//
+// Clone the loop heads
+bool ciTypeFlow::clone_loop_heads(Loop* lp, StateVector* temp_vector, JsrSet* temp_set) {
+  bool rslt = false;
+  for (PreorderLoops iter(loop_tree_root()); !iter.done(); iter.next()) {
+    lp = iter.current();
+    Block* head = lp->head();
+    if (lp == loop_tree_root() ||
+        lp->is_irreducible() ||
+        !head->is_clonable_exit(lp))
+      continue;
+
+    // check not already cloned
+    if (head->backedge_copy_count() != 0)
+      continue;
+
+    // check _no_ shared head below us
+    Loop* ch;
+    for (ch = lp->child(); ch != NULL && ch->head() != head; ch = ch->sibling());
+    if (ch != NULL)
+      continue;
+
+    // Clone head
+    Block* new_head = head->looping_succ(lp);
+    Block* clone = clone_loop_head(lp, temp_vector, temp_set);
+    // Update lp's info
+    clone->set_loop(lp);
+    lp->set_head(new_head);
+    lp->set_tail(clone);
+    // And move original head into outer loop
+    head->set_loop(lp->parent());
+
+    rslt = true;
+  }
+  return rslt;
+}
+
+// ------------------------------------------------------------------
+// ciTypeFlow::clone_loop_head
+//
+// Clone lp's head and replace tail's successors with clone.
+//
+//  |
+//  v
+// head <-> body
+//  |
+//  v
+// exit
+//
+// new_head
+//
+//  |
+//  v
+// head ----------\
+//  |             |
+//  |             v
+//  |  clone <-> body
+//  |    |
+//  | /--/
+//  | |
+//  v v
+// exit
+//
+ciTypeFlow::Block* ciTypeFlow::clone_loop_head(Loop* lp, StateVector* temp_vector, JsrSet* temp_set) {
+  Block* head = lp->head();
+  Block* tail = lp->tail();
+  if (CITraceTypeFlow) {
+    tty->print(">> Requesting clone of loop head "); head->print_value_on(tty);
+    tty->print("  for predecessor ");                tail->print_value_on(tty);
+    tty->cr();
+  }
+  Block* clone = block_at(head->start(), head->jsrs(), create_backedge_copy);
+  assert(clone->backedge_copy_count() == 1, "one backedge copy for all back edges");
+
+  assert(!clone->has_pre_order(), "just created");
+  clone->set_next_pre_order();
+
+  // Insert clone after (orig) tail in reverse post order
+  clone->set_rpo_next(tail->rpo_next());
+  tail->set_rpo_next(clone);
+
+  // tail->head becomes tail->clone
+  for (SuccIter iter(tail); !iter.done(); iter.next()) {
+    if (iter.succ() == head) {
+      iter.set_succ(clone);
+      break;
+    }
+  }
+  flow_block(tail, temp_vector, temp_set);
+  if (head == tail) {
+    // For self-loops, clone->head becomes clone->clone
+    flow_block(clone, temp_vector, temp_set);
+    for (SuccIter iter(clone); !iter.done(); iter.next()) {
+      if (iter.succ() == head) {
+        iter.set_succ(clone);
+        break;
+      }
+    }
+  }
+  flow_block(clone, temp_vector, temp_set);
+
+  return clone;
+}
 
 // ------------------------------------------------------------------
 // ciTypeFlow::flow_block
@@ -2159,11 +2281,14 @@
 
   // Grab the state from the current block.
   block->copy_state_into(state);
+  state->def_locals()->clear();
 
   GrowableArray<Block*>*           exceptions = block->exceptions();
   GrowableArray<ciInstanceKlass*>* exc_klasses = block->exc_klasses();
   bool has_exceptions = exceptions->length() > 0;
 
+  bool exceptions_used = false;
+
   ciBytecodeStream str(method());
   str.reset_to_bci(start);
   Bytecodes::Code code;
@@ -2172,6 +2297,7 @@
     // Check for exceptional control flow from this point.
     if (has_exceptions && can_trap(str)) {
       flow_exceptions(exceptions, exc_klasses, state);
+      exceptions_used = true;
     }
     // Apply the effects of the current bytecode to our state.
     bool res = state->apply_one_bytecode(&str);
@@ -2189,9 +2315,14 @@
         block->print_on(tty);
       }
 
+      // Save set of locals defined in this block
+      block->def_locals()->add(state->def_locals());
+
       // Record (no) successors.
       block->successors(&str, state, jsrs);
 
+      assert(!has_exceptions || exceptions_used, "Not removing exceptions");
+
       // Discontinue interpretation of this Block.
       return;
     }
@@ -2202,6 +2333,7 @@
     // Check for exceptional control flow from this point.
     if (has_exceptions && can_trap(str)) {
       flow_exceptions(exceptions, exc_klasses, state);
+      exceptions_used = true;
     }
 
     // Fix the JsrSet to reflect effect of the bytecode.
@@ -2218,11 +2350,306 @@
     successors = block->successors(&str, NULL, NULL);
   }
 
+  // Save set of locals defined in this block
+  block->def_locals()->add(state->def_locals());
+
+  // Remove untaken exception paths
+  if (!exceptions_used)
+    exceptions->clear();
+
   // Pass our state to successors.
   flow_successors(successors, state);
 }
 
 // ------------------------------------------------------------------
+// ciTypeFlow::PostorderLoops::next
+//
+// Advance to next loop tree using a postorder, left-to-right traversal.
+void ciTypeFlow::PostorderLoops::next() {
+  assert(!done(), "must not be done.");
+  if (_current->sibling() != NULL) {
+    _current = _current->sibling();
+    while (_current->child() != NULL) {
+      _current = _current->child();
+    }
+  } else {
+    _current = _current->parent();
+  }
+}
+
+// ------------------------------------------------------------------
+// ciTypeFlow::PreorderLoops::next
+//
+// Advance to next loop tree using a preorder, left-to-right traversal.
+void ciTypeFlow::PreorderLoops::next() {
+  assert(!done(), "must not be done.");
+  if (_current->child() != NULL) {
+    _current = _current->child();
+  } else if (_current->sibling() != NULL) {
+    _current = _current->sibling();
+  } else {
+    while (_current != _root && _current->sibling() == NULL) {
+      _current = _current->parent();
+    }
+    if (_current == _root) {
+      _current = NULL;
+      assert(done(), "must be done.");
+    } else {
+      assert(_current->sibling() != NULL, "must be more to do");
+      _current = _current->sibling();
+    }
+  }
+}
+
+// ------------------------------------------------------------------
+// ciTypeFlow::Loop::sorted_merge
+//
+// Merge the branch lp into this branch, sorting on the loop head
+// pre_orders. Returns the leaf of the merged branch.
+// Child and sibling pointers will be setup later.
+// Sort is (looking from leaf towards the root)
+//  descending on primary key: loop head's pre_order, and
+//  ascending  on secondary key: loop tail's pre_order.
+ciTypeFlow::Loop* ciTypeFlow::Loop::sorted_merge(Loop* lp) {
+  Loop* leaf = this;
+  Loop* prev = NULL;
+  Loop* current = leaf;
+  while (lp != NULL) {
+    int lp_pre_order = lp->head()->pre_order();
+    // Find insertion point for "lp"
+    while (current != NULL) {
+      if (current == lp)
+        return leaf; // Already in list
+      if (current->head()->pre_order() < lp_pre_order)
+        break;
+      if (current->head()->pre_order() == lp_pre_order &&
+          current->tail()->pre_order() > lp->tail()->pre_order()) {
+        break;
+      }
+      prev = current;
+      current = current->parent();
+    }
+    Loop* next_lp = lp->parent(); // Save future list of items to insert
+    // Insert lp before current
+    lp->set_parent(current);
+    if (prev != NULL) {
+      prev->set_parent(lp);
+    } else {
+      leaf = lp;
+    }
+    prev = lp;     // Inserted item is new prev[ious]
+    lp = next_lp;  // Next item to insert
+  }
+  return leaf;
+}
+
+// ------------------------------------------------------------------
+// ciTypeFlow::build_loop_tree
+//
+// Incrementally build loop tree.
+void ciTypeFlow::build_loop_tree(Block* blk) {
+  assert(!blk->is_post_visited(), "precondition");
+  Loop* innermost = NULL; // merge of loop tree branches over all successors
+
+  for (SuccIter iter(blk); !iter.done(); iter.next()) {
+    Loop*  lp   = NULL;
+    Block* succ = iter.succ();
+    if (!succ->is_post_visited()) {
+      // Found backedge since predecessor post visited, but successor is not
+      assert(succ->pre_order() <= blk->pre_order(), "should be backedge");
+
+      // Create a LoopNode to mark this loop.
+      lp = new (arena()) Loop(succ, blk);
+      if (succ->loop() == NULL)
+        succ->set_loop(lp);
+      // succ->loop will be updated to innermost loop on a later call, when blk==succ
+
+    } else {  // Nested loop
+      lp = succ->loop();
+
+      // If succ is loop head, find outer loop.
+      while (lp != NULL && lp->head() == succ) {
+        lp = lp->parent();
+      }
+      if (lp == NULL) {
+        // Infinite loop; its parent is the root
+        lp = loop_tree_root();
+      }
+    }
+
+    // Check for irreducible loop.
+    // Successor has already been visited. If the successor's loop head
+    // has already been post-visited, then this is another entry into the loop.
+    while (lp->head()->is_post_visited() && lp != loop_tree_root()) {
+      _has_irreducible_entry = true;
+      lp->set_irreducible(succ);
+      if (!succ->is_on_work_list()) {
+        // Assume irreducible entries need more data flow
+        add_to_work_list(succ);
+      }
+      lp = lp->parent();
+      assert(lp != NULL, "nested loop must have parent by now");
+    }
+
+    // Merge loop tree branch for all successors.
+    innermost = innermost == NULL ? lp : innermost->sorted_merge(lp);
+
+  } // end loop
+
+  if (innermost == NULL) {
+    assert(blk->successors()->length() == 0, "CFG exit");
+    blk->set_loop(loop_tree_root());
+  } else if (innermost->head() == blk) {
+    // If loop header, complete the tree pointers
+    if (blk->loop() != innermost) {
+#ifdef ASSERT
+      assert(blk->loop()->head() == innermost->head(), "same head");
+      Loop* dl;
+      for (dl = innermost; dl != NULL && dl != blk->loop(); dl = dl->parent());
+      assert(dl == blk->loop(), "blk->loop() already in innermost list");
+#endif
+      blk->set_loop(innermost);
+    }
+    innermost->def_locals()->add(blk->def_locals());
+    Loop* l = innermost;
+    Loop* p = l->parent();
+    while (p && l->head() == blk) {
+      l->set_sibling(p->child());  // Put self on parents 'next child'
+      p->set_child(l);             // Make self the first child of parent
+      p->def_locals()->add(l->def_locals());
+      l = p;                       // Walk up the parent chain
+      p = l->parent();
+    }
+  } else {
+    blk->set_loop(innermost);
+    innermost->def_locals()->add(blk->def_locals());
+  }
+}
+
+// ------------------------------------------------------------------
+// ciTypeFlow::Loop::contains
+//
+// Returns true if lp is a nested loop of this loop.
+bool ciTypeFlow::Loop::contains(ciTypeFlow::Loop* lp) const {
+  assert(lp != NULL, "");
+  if (this == lp || head() == lp->head()) return true;
+  int depth1 = depth();
+  int depth2 = lp->depth();
+  if (depth1 > depth2)
+    return false;
+  while (depth1 < depth2) {
+    depth2--;
+    lp = lp->parent();
+  }
+  return this == lp;
+}
+
+// ------------------------------------------------------------------
+// ciTypeFlow::Loop::depth
+//
+// Loop depth
+int ciTypeFlow::Loop::depth() const {
+  int dp = 0;
+  for (Loop* lp = this->parent(); lp != NULL; lp = lp->parent())
+    dp++;
+  return dp;
+}
+
+#ifndef PRODUCT
+// ------------------------------------------------------------------
+// ciTypeFlow::Loop::print
+void ciTypeFlow::Loop::print(outputStream* st, int indent) const {
+  for (int i = 0; i < indent; i++) st->print(" ");
+  st->print("%d<-%d %s",
+            is_root() ? 0 : this->head()->pre_order(),
+            is_root() ? 0 : this->tail()->pre_order(),
+            is_irreducible()?" irr":"");
+  st->print(" defs: ");
+  def_locals()->print_on(st, _head->outer()->method()->max_locals());
+  st->cr();
+  for (Loop* ch = child(); ch != NULL; ch = ch->sibling())
+    ch->print(st, indent+2);
+}
+#endif
+
+// ------------------------------------------------------------------
+// ciTypeFlow::df_flow_types
+//
+// Perform the depth first type flow analysis. Helper for flow_types.
+void ciTypeFlow::df_flow_types(Block* start,
+                               bool do_flow,
+                               StateVector* temp_vector,
+                               JsrSet* temp_set) {
+  int dft_len = 100;
+  GrowableArray<Block*> stk(arena(), dft_len, 0, NULL);
+
+  ciBlock* dummy = _methodBlocks->make_dummy_block();
+  JsrSet* root_set = new JsrSet(NULL, 0);
+  Block* root_head = new (arena()) Block(this, dummy, root_set);
+  Block* root_tail = new (arena()) Block(this, dummy, root_set);
+  root_head->set_pre_order(0);
+  root_head->set_post_order(0);
+  root_tail->set_pre_order(max_jint);
+  root_tail->set_post_order(max_jint);
+  set_loop_tree_root(new (arena()) Loop(root_head, root_tail));
+
+  stk.push(start);
+
+  _next_pre_order = 0;  // initialize pre_order counter
+  _rpo_list = NULL;
+  int next_po = 0;      // initialize post_order counter
+
+  // Compute RPO and the control flow graph
+  int size;
+  while ((size = stk.length()) > 0) {
+    Block* blk = stk.top(); // Leave node on stack
+    if (!blk->is_visited()) {
+      // forward arc in graph
+      assert (!blk->has_pre_order(), "");
+      blk->set_next_pre_order();
+
+      if (_next_pre_order >= MaxNodeLimit / 2) {
+        // Too many basic blocks.  Bail out.
+        // This can happen when try/finally constructs are nested to depth N,
+        // and there is O(2**N) cloning of jsr bodies.  See bug 4697245!
+        // "MaxNodeLimit / 2" is used because probably the parser will
+        // generate at least twice that many nodes and bail out.
+        record_failure("too many basic blocks");
+        return;
+      }
+      if (do_flow) {
+        flow_block(blk, temp_vector, temp_set);
+        if (failing()) return; // Watch for bailouts.
+      }
+    } else if (!blk->is_post_visited()) {
+      // cross or back arc
+      for (SuccIter iter(blk); !iter.done(); iter.next()) {
+        Block* succ = iter.succ();
+        if (!succ->is_visited()) {
+          stk.push(succ);
+        }
+      }
+      if (stk.length() == size) {
+        // There were no additional children, post visit node now
+        stk.pop(); // Remove node from stack
+
+        build_loop_tree(blk);
+        blk->set_post_order(next_po++);   // Assign post order
+        prepend_to_rpo_list(blk);
+        assert(blk->is_post_visited(), "");
+
+        if (blk->is_loop_head() && !blk->is_on_work_list()) {
+          // Assume loop heads need more data flow
+          add_to_work_list(blk);
+        }
+      }
+    } else {
+      stk.pop(); // Remove post-visited node from stack
+    }
+  }
+}
+
+// ------------------------------------------------------------------
 // ciTypeFlow::flow_types
 //
 // Perform the type flow analysis, creating and cloning Blocks as
@@ -2233,91 +2660,93 @@
   JsrSet* temp_set = new JsrSet(NULL, 16);
 
   // Create the method entry block.
-  Block* block = block_at(start_bci(), temp_set);
-  block->set_pre_order(_next_pre_order++);
-  assert(block->is_start(), "start block must have order #0");
+  Block* start = block_at(start_bci(), temp_set);
 
   // Load the initial state into it.
   const StateVector* start_state = get_start_state();
   if (failing())  return;
-  block->meet(start_state);
-  add_to_work_list(block);
+  start->meet(start_state);
 
-  // Trickle away.
+  // Depth first visit
+  df_flow_types(start, true /*do flow*/, temp_vector, temp_set);
+
+  if (failing())  return;
+  assert(_rpo_list == start, "must be start");
+
+  // Any loops found?
+  if (loop_tree_root()->child() != NULL &&
+      env()->comp_level() >= CompLevel_full_optimization) {
+      // Loop optimizations are not performed on Tier1 compiles.
+
+    bool changed = clone_loop_heads(loop_tree_root(), temp_vector, temp_set);
+
+    // If some loop heads were cloned, recompute postorder and loop tree
+    if (changed) {
+      loop_tree_root()->set_child(NULL);
+      for (Block* blk = _rpo_list; blk != NULL;) {
+        Block* next = blk->rpo_next();
+        blk->df_init();
+        blk = next;
+      }
+      df_flow_types(start, false /*no flow*/, temp_vector, temp_set);
+    }
+  }
+
+  if (CITraceTypeFlow) {
+    tty->print_cr("\nLoop tree");
+    loop_tree_root()->print();
+  }
+
+  // Continue flow analysis until fixed point reached
+
+  debug_only(int max_block = _next_pre_order;)
+
   while (!work_list_empty()) {
-    Block* block = work_list_next();
-    flow_block(block, temp_vector, temp_set);
+    Block* blk = work_list_next();
+    assert (blk->has_post_order(), "post order assigned above");
 
+    flow_block(blk, temp_vector, temp_set);
 
-    // NodeCountCutoff is the number of nodes at which the parser
-    // will bail out.  Probably if we already have lots of BBs,
-    // the parser will generate at least twice that many nodes and bail out.
-    // Therefore, this is a conservatively large limit at which to
-    // bail out in the pre-parse typeflow pass.
-    int block_limit = MaxNodeLimit / 2;
-
-    if (_next_pre_order >= block_limit) {
-      // Too many basic blocks.  Bail out.
-      //
-      // This can happen when try/finally constructs are nested to depth N,
-      // and there is O(2**N) cloning of jsr bodies.  See bug 4697245!
-      record_failure("too many basic blocks");
-      return;
-    }
-
-    // Watch for bailouts.
-    if (failing())  return;
+    assert (max_block == _next_pre_order, "no new blocks");
+    assert (!failing(), "no more bailouts");
   }
 }
 
 // ------------------------------------------------------------------
 // ciTypeFlow::map_blocks
 //
-// Create the block map, which indexes blocks in pre_order.
+// Create the block map, which indexes blocks in reverse post-order.
 void ciTypeFlow::map_blocks() {
   assert(_block_map == NULL, "single initialization");
-  int pre_order_limit = _next_pre_order;
-  _block_map = NEW_ARENA_ARRAY(arena(), Block*, pre_order_limit);
-  assert(pre_order_limit == block_count(), "");
-  int po;
-  for (po = 0; po < pre_order_limit; po++) {
-    debug_only(_block_map[po] = NULL);
+  int block_ct = _next_pre_order;
+  _block_map = NEW_ARENA_ARRAY(arena(), Block*, block_ct);
+  assert(block_ct == block_count(), "");
+
+  Block* blk = _rpo_list;
+  for (int m = 0; m < block_ct; m++) {
+    int rpo = blk->rpo();
+    assert(rpo == m, "should be sequential");
+    _block_map[rpo] = blk;
+    blk = blk->rpo_next();
   }
-  ciMethodBlocks *mblks = _methodBlocks;
-  ciBlock* current = NULL;
-  int limit_bci = code_size();
-  for (int bci = 0; bci < limit_bci; bci++) {
-    ciBlock* ciblk = mblks->block_containing(bci);
-    if (ciblk != NULL && ciblk != current) {
-      current = ciblk;
-      int curidx = ciblk->index();
-      int block_count = (_idx_to_blocklist[curidx] == NULL) ? 0 : _idx_to_blocklist[curidx]->length();
-      for (int i = 0; i < block_count; i++) {
-        Block* block = _idx_to_blocklist[curidx]->at(i);
-        if (!block->has_pre_order())  continue;
-        int po = block->pre_order();
-        assert(_block_map[po] == NULL, "unique ref to block");
-        assert(0 <= po && po < pre_order_limit, "");
-        _block_map[po] = block;
-      }
-    }
-  }
-  for (po = 0; po < pre_order_limit; po++) {
-    assert(_block_map[po] != NULL, "must not drop any blocks");
-    Block* block = _block_map[po];
+  assert(blk == NULL, "should be done");
+
+  for (int j = 0; j < block_ct; j++) {
+    assert(_block_map[j] != NULL, "must not drop any blocks");
+    Block* block = _block_map[j];
     // Remove dead blocks from successor lists:
     for (int e = 0; e <= 1; e++) {
       GrowableArray<Block*>* l = e? block->exceptions(): block->successors();
-      for (int i = 0; i < l->length(); i++) {
-        Block* s = l->at(i);
-        if (!s->has_pre_order()) {
+      for (int k = 0; k < l->length(); k++) {
+        Block* s = l->at(k);
+        if (!s->has_post_order()) {
           if (CITraceTypeFlow) {
             tty->print("Removing dead %s successor of #%d: ", (e? "exceptional":  "normal"), block->pre_order());
             s->print_value_on(tty);
             tty->cr();
           }
           l->remove(s);
-          --i;
+          --k;
         }
       }
     }
@@ -2329,7 +2758,7 @@
 //
 // Find a block with this ciBlock which has a compatible JsrSet.
 // If no such block exists, create it, unless the option is no_create.
-// If the option is create_private_copy, always create a fresh private copy.
+// If the option is create_backedge_copy, always create a fresh backedge copy.
 ciTypeFlow::Block* ciTypeFlow::get_block_for(int ciBlockIndex, ciTypeFlow::JsrSet* jsrs, CreateOption option) {
   Arena* a = arena();
   GrowableArray<Block*>* blocks = _idx_to_blocklist[ciBlockIndex];
@@ -2342,11 +2771,11 @@
     _idx_to_blocklist[ciBlockIndex] = blocks;
   }
 
-  if (option != create_private_copy) {
+  if (option != create_backedge_copy) {
     int len = blocks->length();
     for (int i = 0; i < len; i++) {
       Block* block = blocks->at(i);
-      if (!block->is_private_copy() && block->is_compatible_with(jsrs)) {
+      if (!block->is_backedge_copy() && block->is_compatible_with(jsrs)) {
         return block;
       }
     }
@@ -2357,15 +2786,15 @@
 
   // We did not find a compatible block.  Create one.
   Block* new_block = new (a) Block(this, _methodBlocks->block(ciBlockIndex), jsrs);
-  if (option == create_private_copy)  new_block->set_private_copy(true);
+  if (option == create_backedge_copy)  new_block->set_backedge_copy(true);
   blocks->append(new_block);
   return new_block;
 }
 
 // ------------------------------------------------------------------
-// ciTypeFlow::private_copy_count
+// ciTypeFlow::backedge_copy_count
 //
-int ciTypeFlow::private_copy_count(int ciBlockIndex, ciTypeFlow::JsrSet* jsrs) const {
+int ciTypeFlow::backedge_copy_count(int ciBlockIndex, ciTypeFlow::JsrSet* jsrs) const {
   GrowableArray<Block*>* blocks = _idx_to_blocklist[ciBlockIndex];
 
   if (blocks == NULL) {
@@ -2376,7 +2805,7 @@
   int len = blocks->length();
   for (int i = 0; i < len; i++) {
     Block* block = blocks->at(i);
-    if (block->is_private_copy() && block->is_compatible_with(jsrs)) {
+    if (block->is_backedge_copy() && block->is_compatible_with(jsrs)) {
       count++;
     }
   }
@@ -2405,10 +2834,12 @@
   if (failing()) {
     return;
   }
+
+  map_blocks();
+
   if (CIPrintTypeFlow || CITraceTypeFlow) {
-    print_on(tty);
+    rpo_print_on(tty);
   }
-  map_blocks();
 }
 
 // ------------------------------------------------------------------
@@ -2466,4 +2897,19 @@
   st->print_cr("********************************************************");
   st->cr();
 }
+
+void ciTypeFlow::rpo_print_on(outputStream* st) const {
+  st->print_cr("********************************************************");
+  st->print   ("TypeFlow for ");
+  method()->name()->print_symbol_on(st);
+  int limit_bci = code_size();
+  st->print_cr("  %d bytes", limit_bci);
+  for (Block* blk = _rpo_list; blk != NULL; blk = blk->rpo_next()) {
+    blk->print_on(st);
+    st->print_cr("--------------------------------------------------------");
+    st->cr();
+  }
+  st->print_cr("********************************************************");
+  st->cr();
+}
 #endif
--- a/hotspot/src/share/vm/ci/ciTypeFlow.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/ci/ciTypeFlow.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -34,11 +34,13 @@
   int _max_locals;
   int _max_stack;
   int _code_size;
+  bool      _has_irreducible_entry;
 
   const char* _failure_reason;
 
 public:
   class StateVector;
+  class Loop;
   class Block;
 
   // Build a type flow analyzer
@@ -55,6 +57,7 @@
   int       max_stack() const  { return _max_stack; }
   int       max_cells() const  { return _max_locals + _max_stack; }
   int       code_size() const  { return _code_size; }
+  bool      has_irreducible_entry() const { return _has_irreducible_entry; }
 
   // Represents information about an "active" jsr call.  This
   // class represents a call to the routine at some entry address
@@ -125,6 +128,19 @@
     void print_on(outputStream* st) const PRODUCT_RETURN;
   };
 
+  class LocalSet VALUE_OBJ_CLASS_SPEC {
+  private:
+    enum Constants { max = 63 };
+    uint64_t _bits;
+  public:
+    LocalSet() : _bits(0) {}
+    void add(uint32_t i)        { if (i < (uint32_t)max) _bits |=  (1LL << i); }
+    void add(LocalSet* ls)      { _bits |= ls->_bits; }
+    bool test(uint32_t i) const { return i < (uint32_t)max ? (_bits>>i)&1U : true; }
+    void clear()                { _bits = 0; }
+    void print_on(outputStream* st, int limit) const  PRODUCT_RETURN;
+  };
+
   // Used as a combined index for locals and temps
   enum Cell {
     Cell_0, Cell_max = INT_MAX
@@ -142,6 +158,8 @@
     int         _trap_bci;
     int         _trap_index;
 
+    LocalSet    _def_locals;  // For entire block
+
     static ciType* type_meet_internal(ciType* t1, ciType* t2, ciTypeFlow* analyzer);
 
   public:
@@ -181,6 +199,9 @@
     int         monitor_count() const  { return _monitor_count; }
     void    set_monitor_count(int mc)  { _monitor_count = mc; }
 
+    LocalSet* def_locals() { return &_def_locals; }
+    const LocalSet* def_locals() const { return &_def_locals; }
+
     static Cell start_cell()           { return (Cell)0; }
     static Cell next_cell(Cell c)      { return (Cell)(((int)c) + 1); }
     Cell        limit_cell() const {
@@ -250,6 +271,10 @@
       return type->basic_type() == T_DOUBLE;
     }
 
+    void store_to_local(int lnum) {
+      _def_locals.add((uint) lnum);
+    }
+
     void      push_translate(ciType* type);
 
     void      push_int() {
@@ -358,6 +383,7 @@
              "must be reference type or return address");
       overwrite_local_double_long(index);
       set_type_at(local(index), type);
+      store_to_local(index);
     }
 
     void load_local_double(int index) {
@@ -376,6 +402,8 @@
       overwrite_local_double_long(index);
       set_type_at(local(index), type);
       set_type_at(local(index+1), type2);
+      store_to_local(index);
+      store_to_local(index+1);
     }
 
     void load_local_float(int index) {
@@ -388,6 +416,7 @@
       assert(is_float(type), "must be float type");
       overwrite_local_double_long(index);
       set_type_at(local(index), type);
+      store_to_local(index);
     }
 
     void load_local_int(int index) {
@@ -400,6 +429,7 @@
       assert(is_int(type), "must be int type");
       overwrite_local_double_long(index);
       set_type_at(local(index), type);
+      store_to_local(index);
     }
 
     void load_local_long(int index) {
@@ -418,6 +448,8 @@
       overwrite_local_double_long(index);
       set_type_at(local(index), type);
       set_type_at(local(index+1), type2);
+      store_to_local(index);
+      store_to_local(index+1);
     }
 
     // Stop interpretation of this path with a trap.
@@ -450,13 +482,31 @@
   };
 
   // Parameter for "find_block" calls:
-  // Describes the difference between a public and private copy.
+  // Describes the difference between a public and backedge copy.
   enum CreateOption {
     create_public_copy,
-    create_private_copy,
+    create_backedge_copy,
     no_create
   };
 
+  // Successor iterator
+  class SuccIter : public StackObj {
+  private:
+    Block* _pred;
+    int    _index;
+    Block* _succ;
+  public:
+    SuccIter()                        : _pred(NULL), _index(-1), _succ(NULL) {}
+    SuccIter(Block* pred)             : _pred(pred), _index(-1), _succ(NULL) { next(); }
+    int    index()     { return _index; }
+    Block* pred()      { return _pred; }           // Return predecessor
+    bool   done()      { return _index < 0; }      // Finished?
+    Block* succ()      { return _succ; }           // Return current successor
+    void   next();                                 // Advance
+    void   set_succ(Block* succ);                  // Update current successor
+    bool   is_normal_ctrl() { return index() < _pred->successors()->length(); }
+  };
+
   // A basic block
   class Block : public ResourceObj {
   private:
@@ -470,15 +520,24 @@
     int                              _trap_bci;
     int                              _trap_index;
 
-    // A reasonable approximation to pre-order, provided.to the client.
+    // pre_order, assigned at first visit. Used as block ID and "visited" tag
     int                              _pre_order;
 
-    // Has this block been cloned for some special purpose?
-    bool                             _private_copy;
+    // A post-order, used to compute the reverse post order (RPO) provided to the client
+    int                              _post_order;  // used to compute rpo
+
+    // Has this block been cloned for a loop backedge?
+    bool                             _backedge_copy;
 
     // A pointer used for our internal work list
-    Block*                 _next;
-    bool                   _on_work_list;
+    Block*                           _next;
+    bool                             _on_work_list;      // on the work list
+    Block*                           _rpo_next;          // Reverse post order list
+
+    // Loop info
+    Loop*                            _loop;              // nearest loop
+    bool                             _irreducible_entry; // entry to irreducible loop
+    bool                             _exception_entry;   // entry to exception handler
 
     ciBlock*     ciblock() const     { return _ciblock; }
     StateVector* state() const     { return _state; }
@@ -504,10 +563,11 @@
     int start() const         { return _ciblock->start_bci(); }
     int limit() const         { return _ciblock->limit_bci(); }
     int control() const       { return _ciblock->control_bci(); }
+    JsrSet* jsrs() const      { return _jsrs; }
 
-    bool    is_private_copy() const       { return _private_copy; }
-    void   set_private_copy(bool z);
-    int        private_copy_count() const { return outer()->private_copy_count(ciblock()->index(), _jsrs); }
+    bool    is_backedge_copy() const       { return _backedge_copy; }
+    void   set_backedge_copy(bool z);
+    int        backedge_copy_count() const { return outer()->backedge_copy_count(ciblock()->index(), _jsrs); }
 
     // access to entry state
     int     stack_size() const         { return _state->stack_size(); }
@@ -515,6 +575,20 @@
     ciType* local_type_at(int i) const { return _state->local_type_at(i); }
     ciType* stack_type_at(int i) const { return _state->stack_type_at(i); }
 
+    // Data flow on locals
+    bool is_invariant_local(uint v) const {
+      assert(is_loop_head(), "only loop heads");
+      // Find outermost loop with same loop head
+      Loop* lp = loop();
+      while (lp->parent() != NULL) {
+        if (lp->parent()->head() != lp->head()) break;
+        lp = lp->parent();
+      }
+      return !lp->def_locals()->test(v);
+    }
+    LocalSet* def_locals() { return _state->def_locals(); }
+    const LocalSet* def_locals() const { return _state->def_locals(); }
+
     // Get the successors for this Block.
     GrowableArray<Block*>* successors(ciBytecodeStream* str,
                                       StateVector* state,
@@ -524,13 +598,6 @@
       return _successors;
     }
 
-    // Helper function for "successors" when making private copies of
-    // loop heads for C2.
-    Block * clone_loop_head(ciTypeFlow* analyzer,
-                            int branch_bci,
-                            Block* target,
-                            JsrSet* jsrs);
-
     // Get the exceptional successors for this Block.
     GrowableArray<Block*>* exceptions() {
       if (_exceptions == NULL) {
@@ -584,17 +651,126 @@
     bool   is_on_work_list() const  { return _on_work_list; }
 
     bool   has_pre_order() const  { return _pre_order >= 0; }
-    void   set_pre_order(int po)  { assert(!has_pre_order() && po >= 0, ""); _pre_order = po; }
+    void   set_pre_order(int po)  { assert(!has_pre_order(), ""); _pre_order = po; }
     int    pre_order() const      { assert(has_pre_order(), ""); return _pre_order; }
+    void   set_next_pre_order()   { set_pre_order(outer()->inc_next_pre_order()); }
     bool   is_start() const       { return _pre_order == outer()->start_block_num(); }
 
-    // A ranking used in determining order within the work list.
-    bool   is_simpler_than(Block* other);
+    // Reverse post order
+    void   df_init();
+    bool   has_post_order() const { return _post_order >= 0; }
+    void   set_post_order(int po) { assert(!has_post_order() && po >= 0, ""); _post_order = po; }
+    void   reset_post_order(int o){ _post_order = o; }
+    int    post_order() const     { assert(has_post_order(), ""); return _post_order; }
+
+    bool   has_rpo() const        { return has_post_order() && outer()->have_block_count(); }
+    int    rpo() const            { assert(has_rpo(), ""); return outer()->block_count() - post_order() - 1; }
+    void   set_rpo_next(Block* b) { _rpo_next = b; }
+    Block* rpo_next()             { return _rpo_next; }
+
+    // Loops
+    Loop*  loop() const                  { return _loop; }
+    void   set_loop(Loop* lp)            { _loop = lp; }
+    bool   is_loop_head() const          { return _loop && _loop->head() == this; }
+    void   set_irreducible_entry(bool c) { _irreducible_entry = c; }
+    bool   is_irreducible_entry() const  { return _irreducible_entry; }
+    bool   is_visited() const            { return has_pre_order(); }
+    bool   is_post_visited() const       { return has_post_order(); }
+    bool   is_clonable_exit(Loop* lp);
+    Block* looping_succ(Loop* lp);       // Successor inside of loop
+    bool   is_single_entry_loop_head() const {
+      if (!is_loop_head()) return false;
+      for (Loop* lp = loop(); lp != NULL && lp->head() == this; lp = lp->parent())
+        if (lp->is_irreducible()) return false;
+      return true;
+    }
 
     void   print_value_on(outputStream* st) const PRODUCT_RETURN;
     void   print_on(outputStream* st) const       PRODUCT_RETURN;
   };
 
+  // Loop
+  class Loop : public ResourceObj {
+  private:
+    Loop* _parent;
+    Loop* _sibling;  // List of siblings, null terminated
+    Loop* _child;    // Head of child list threaded thru sibling pointer
+    Block* _head;    // Head of loop
+    Block* _tail;    // Tail of loop
+    bool   _irreducible;
+    LocalSet _def_locals;
+
+  public:
+    Loop(Block* head, Block* tail) :
+      _head(head),   _tail(tail),
+      _parent(NULL), _sibling(NULL), _child(NULL),
+      _irreducible(false), _def_locals() {}
+
+    Loop* parent()  const { return _parent; }
+    Loop* sibling() const { return _sibling; }
+    Loop* child()   const { return _child; }
+    Block* head()   const { return _head; }
+    Block* tail()   const { return _tail; }
+    void set_parent(Loop* p)  { _parent = p; }
+    void set_sibling(Loop* s) { _sibling = s; }
+    void set_child(Loop* c)   { _child = c; }
+    void set_head(Block* hd)  { _head = hd; }
+    void set_tail(Block* tl)  { _tail = tl; }
+
+    int depth() const;              // nesting depth
+
+    // Returns true if lp is a nested loop or us.
+    bool contains(Loop* lp) const;
+    bool contains(Block* blk) const { return contains(blk->loop()); }
+
+    // Data flow on locals
+    LocalSet* def_locals() { return &_def_locals; }
+    const LocalSet* def_locals() const { return &_def_locals; }
+
+    // Merge the branch lp into this branch, sorting on the loop head
+    // pre_orders. Returns the new branch.
+    Loop* sorted_merge(Loop* lp);
+
+    // Mark non-single entry to loop
+    void set_irreducible(Block* entry) {
+      _irreducible = true;
+      entry->set_irreducible_entry(true);
+    }
+    bool is_irreducible() const { return _irreducible; }
+
+    bool is_root() const { return _tail->pre_order() == max_jint; }
+
+    void print(outputStream* st = tty, int indent = 0) const PRODUCT_RETURN;
+  };
+
+  // Postorder iteration over the loop tree.
+  class PostorderLoops : public StackObj {
+  private:
+    Loop* _root;
+    Loop* _current;
+  public:
+    PostorderLoops(Loop* root) : _root(root), _current(root) {
+      while (_current->child() != NULL) {
+        _current = _current->child();
+      }
+    }
+    bool done() { return _current == NULL; }  // Finished iterating?
+    void next();                            // Advance to next loop
+    Loop* current() { return _current; }      // Return current loop.
+  };
+
+  // Preorder iteration over the loop tree.
+  class PreorderLoops : public StackObj {
+  private:
+    Loop* _root;
+    Loop* _current;
+  public:
+    PreorderLoops(Loop* root) : _root(root), _current(root) {}
+    bool done() { return _current == NULL; }  // Finished iterating?
+    void next();                            // Advance to next loop
+    Loop* current() { return _current; }      // Return current loop.
+  };
+
   // Standard indexes of successors, for various bytecodes.
   enum {
     FALL_THROUGH   = 0,  // normal control
@@ -619,6 +795,12 @@
   // Tells if a given instruction is able to generate an exception edge.
   bool can_trap(ciBytecodeStream& str);
 
+  // Clone the loop heads. Returns true if any cloning occurred.
+  bool clone_loop_heads(Loop* lp, StateVector* temp_vector, JsrSet* temp_set);
+
+  // Clone lp's head and replace tail's successors with clone.
+  Block* clone_loop_head(Loop* lp, StateVector* temp_vector, JsrSet* temp_set);
+
 public:
   // Return the block beginning at bci which has a JsrSet compatible
   // with jsrs.
@@ -627,8 +809,8 @@
   // block factory
   Block* get_block_for(int ciBlockIndex, JsrSet* jsrs, CreateOption option = create_public_copy);
 
-  // How many of the blocks have the private_copy bit set?
-  int private_copy_count(int ciBlockIndex, JsrSet* jsrs) const;
+  // How many of the blocks have the backedge_copy bit set?
+  int backedge_copy_count(int ciBlockIndex, JsrSet* jsrs) const;
 
   // Return an existing block containing bci which has a JsrSet compatible
   // with jsrs, or NULL if there is none.
@@ -651,11 +833,18 @@
                                       return _block_map[po]; }
   Block* start_block() const        { return pre_order_at(start_block_num()); }
   int start_block_num() const       { return 0; }
+  Block* rpo_at(int rpo) const      { assert(0 <= rpo && rpo < block_count(), "out of bounds");
+                                      return _block_map[rpo]; }
+  int next_pre_order()              { return _next_pre_order; }
+  int inc_next_pre_order()          { return _next_pre_order++; }
 
 private:
   // A work list used during flow analysis.
   Block* _work_list;
 
+  // List of blocks in reverse post order
+  Block* _rpo_list;
+
   // Next Block::_pre_order.  After mapping, doubles as block_count.
   int _next_pre_order;
 
@@ -668,6 +857,15 @@
   // Add a basic block to our work list.
   void add_to_work_list(Block* block);
 
+  // Prepend a basic block to rpo list.
+  void prepend_to_rpo_list(Block* blk) {
+    blk->set_rpo_next(_rpo_list);
+    _rpo_list = blk;
+  }
+
+  // Root of the loop tree
+  Loop* _loop_tree_root;
+
   // State used for make_jsr_record
   int _jsr_count;
   GrowableArray<JsrRecord*>* _jsr_records;
@@ -677,6 +875,9 @@
   // does not already exist.
   JsrRecord* make_jsr_record(int entry_address, int return_address);
 
+  void  set_loop_tree_root(Loop* ltr) { _loop_tree_root = ltr; }
+  Loop* loop_tree_root()              { return _loop_tree_root; }
+
 private:
   // Get the initial state for start_bci:
   const StateVector* get_start_state();
@@ -703,6 +904,15 @@
   // necessary.
   void flow_types();
 
+  // Perform the depth first type flow analysis. Helper for flow_types.
+  void df_flow_types(Block* start,
+                     bool do_flow,
+                     StateVector* temp_vector,
+                     JsrSet* temp_set);
+
+  // Incrementally build loop tree.
+  void build_loop_tree(Block* blk);
+
   // Create the block map, which indexes blocks in pre_order.
   void map_blocks();
 
@@ -711,4 +921,6 @@
   void do_flow();
 
   void print_on(outputStream* st) const PRODUCT_RETURN;
+
+  void rpo_print_on(outputStream* st) const PRODUCT_RETURN;
 };
--- a/hotspot/src/share/vm/code/nmethod.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/code/nmethod.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -1350,11 +1350,7 @@
       return false;
     }
   }
-  if (!UseParallelOldGC || !VerifyParallelOldWithMarkSweep) {
-    // Cannot do this test if verification of the UseParallelOldGC
-    // code using the PSMarkSweep code is being done.
-    assert(unloading_occurred, "Inconsistency in unloading");
-  }
+  assert(unloading_occurred, "Inconsistency in unloading");
   make_unloaded(is_alive, obj);
   return true;
 }
--- a/hotspot/src/share/vm/compiler/methodLiveness.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/compiler/methodLiveness.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -76,8 +76,9 @@
   BitCounter() : _count(0) {}
 
   // Callback when bit in map is set
-  virtual void do_bit(size_t offset) {
+  virtual bool do_bit(size_t offset) {
     _count++;
+    return true;
   }
 
   int count() {
@@ -467,7 +468,7 @@
     bci = 0;
   }
 
-  MethodLivenessResult answer(NULL,0);
+  MethodLivenessResult answer((uintptr_t*)NULL,0);
 
   if (_block_count > 0) {
     if (TimeLivenessAnalysis) _time_total.start();
--- a/hotspot/src/share/vm/compiler/methodLiveness.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/compiler/methodLiveness.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -29,7 +29,7 @@
   bool _is_valid;
 
  public:
-  MethodLivenessResult(uintptr_t* map, idx_t size_in_bits)
+  MethodLivenessResult(BitMap::bm_word_t* map, idx_t size_in_bits)
     : BitMap(map, size_in_bits)
     , _is_valid(false)
   {}
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -790,7 +790,7 @@
 }
 
 
-HeapWord* CompactibleFreeListSpace::block_start(const void* p) const {
+HeapWord* CompactibleFreeListSpace::block_start_const(const void* p) const {
   NOT_PRODUCT(verify_objects_initialized());
   return _bt.block_start(p);
 }
@@ -2286,9 +2286,9 @@
 }
 
 void CompactibleFreeListSpace::verifyIndexedFreeList(size_t size) const {
-  guarantee(size % 2 == 0, "Odd slots should be empty");
-  for (FreeChunk* fc = _indexedFreeList[size].head(); fc != NULL;
-    fc = fc->next()) {
+  FreeChunk* fc =  _indexedFreeList[size].head();
+  guarantee((size % 2 == 0) || fc == NULL, "Odd slots should be empty");
+  for (; fc != NULL; fc = fc->next()) {
     guarantee(fc->size() == size, "Size inconsistency");
     guarantee(fc->isFree(), "!free?");
     guarantee(fc->next() == NULL || fc->next()->prev() == fc, "Broken list");
@@ -2790,10 +2790,11 @@
   assert(n_threads > 0, "Unexpected n_threads argument");
   const size_t task_size = rescan_task_size();
   size_t n_tasks = (used_region().word_size() + task_size - 1)/task_size;
-  assert((used_region().start() + (n_tasks - 1)*task_size <
-          used_region().end()) &&
-         (used_region().start() + n_tasks*task_size >=
-          used_region().end()), "n_task calculation incorrect");
+  assert((n_tasks == 0) == used_region().is_empty(), "n_tasks incorrect");
+  assert(n_tasks == 0 ||
+         ((used_region().start() + (n_tasks - 1)*task_size < used_region().end()) &&
+          (used_region().start() + n_tasks*task_size >= used_region().end())),
+         "n_tasks calculation incorrect");
   SequentialSubTasksDone* pst = conc_par_seq_tasks();
   assert(!pst->valid(), "Clobbering existing data?");
   pst->set_par_threads(n_threads);
@@ -2833,7 +2834,7 @@
   assert(n_tasks == 0 ||
          ((span.start() + (n_tasks - 1)*task_size < span.end()) &&
           (span.start() + n_tasks*task_size >= span.end())),
-         "n_task calculation incorrect");
+         "n_tasks calculation incorrect");
   SequentialSubTasksDone* pst = conc_par_seq_tasks();
   assert(!pst->valid(), "Clobbering existing data?");
   pst->set_par_threads(n_threads);
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -502,7 +502,7 @@
 
   void blk_iterate(BlkClosure* cl);
   void blk_iterate_careful(BlkClosureCareful* cl);
-  HeapWord* block_start(const void* p) const;
+  HeapWord* block_start_const(const void* p) const;
   HeapWord* block_start_careful(const void* p) const;
   size_t block_size(const HeapWord* p) const;
   size_t block_size_no_stall(HeapWord* p, const CMSCollector* c) const;
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -2761,13 +2761,14 @@
  public:
   VerifyMarkedClosure(CMSBitMap* bm): _marks(bm), _failed(false) {}
 
-  void do_bit(size_t offset) {
+  bool do_bit(size_t offset) {
     HeapWord* addr = _marks->offsetToHeapWord(offset);
     if (!_marks->isMarked(addr)) {
       oop(addr)->print();
       gclog_or_tty->print_cr(" ("INTPTR_FORMAT" should have been marked)", addr);
       _failed = true;
     }
+    return true;
   }
 
   bool failed() { return _failed; }
@@ -3650,6 +3651,7 @@
   CompactibleFreeListSpace*  _cms_space;
   CompactibleFreeListSpace* _perm_space;
   HeapWord*     _global_finger;
+  HeapWord*     _restart_addr;
 
   //  Exposed here for yielding support
   Mutex* const _bit_map_lock;
@@ -3680,7 +3682,7 @@
     _term.set_task(this);
     assert(_cms_space->bottom() < _perm_space->bottom(),
            "Finger incorrectly initialized below");
-    _global_finger = _cms_space->bottom();
+    _restart_addr = _global_finger = _cms_space->bottom();
   }
 
 
@@ -3698,6 +3700,10 @@
   bool result() { return _result; }
 
   void reset(HeapWord* ra) {
+    assert(_global_finger >= _cms_space->end(),  "Postcondition of ::work(i)");
+    assert(_global_finger >= _perm_space->end(), "Postcondition of ::work(i)");
+    assert(ra             <  _perm_space->end(), "ra too large");
+    _restart_addr = _global_finger = ra;
     _term.reset_for_reuse();
   }
 
@@ -3842,16 +3848,24 @@
   int n_tasks = pst->n_tasks();
   // We allow that there may be no tasks to do here because
   // we are restarting after a stack overflow.
-  assert(pst->valid() || n_tasks == 0, "Uninitializd use?");
+  assert(pst->valid() || n_tasks == 0, "Uninitialized use?");
   int nth_task = 0;
 
-  HeapWord* start = sp->bottom();
+  HeapWord* aligned_start = sp->bottom();
+  if (sp->used_region().contains(_restart_addr)) {
+    // Align down to a card boundary for the start of 0th task
+    // for this space.
+    aligned_start =
+      (HeapWord*)align_size_down((uintptr_t)_restart_addr,
+                                 CardTableModRefBS::card_size);
+  }
+
   size_t chunk_size = sp->marking_task_size();
   while (!pst->is_task_claimed(/* reference */ nth_task)) {
     // Having claimed the nth task in this space,
     // compute the chunk that it corresponds to:
-    MemRegion span = MemRegion(start + nth_task*chunk_size,
-                               start + (nth_task+1)*chunk_size);
+    MemRegion span = MemRegion(aligned_start + nth_task*chunk_size,
+                               aligned_start + (nth_task+1)*chunk_size);
     // Try and bump the global finger via a CAS;
     // note that we need to do the global finger bump
     // _before_ taking the intersection below, because
@@ -3866,26 +3880,40 @@
     // beyond the "top" address of the space.
     span = span.intersection(sp->used_region());
     if (!span.is_empty()) {  // Non-null task
-      // We want to skip the first object because
-      // the protocol is to scan any object in its entirety
-      // that _starts_ in this span; a fortiori, any
-      // object starting in an earlier span is scanned
-      // as part of an earlier claimed task.
-      // Below we use the "careful" version of block_start
-      // so we do not try to navigate uninitialized objects.
-      HeapWord* prev_obj = sp->block_start_careful(span.start());
-      // Below we use a variant of block_size that uses the
-      // Printezis bits to avoid waiting for allocated
-      // objects to become initialized/parsable.
-      while (prev_obj < span.start()) {
-        size_t sz = sp->block_size_no_stall(prev_obj, _collector);
-        if (sz > 0) {
-          prev_obj += sz;
+      HeapWord* prev_obj;
+      assert(!span.contains(_restart_addr) || nth_task == 0,
+             "Inconsistency");
+      if (nth_task == 0) {
+        // For the 0th task, we'll not need to compute a block_start.
+        if (span.contains(_restart_addr)) {
+          // In the case of a restart because of stack overflow,
+          // we might additionally skip a chunk prefix.
+          prev_obj = _restart_addr;
         } else {
-          // In this case we may end up doing a bit of redundant
-          // scanning, but that appears unavoidable, short of
-          // locking the free list locks; see bug 6324141.
-          break;
+          prev_obj = span.start();
+        }
+      } else {
+        // We want to skip the first object because
+        // the protocol is to scan any object in its entirety
+        // that _starts_ in this span; a fortiori, any
+        // object starting in an earlier span is scanned
+        // as part of an earlier claimed task.
+        // Below we use the "careful" version of block_start
+        // so we do not try to navigate uninitialized objects.
+        prev_obj = sp->block_start_careful(span.start());
+        // Below we use a variant of block_size that uses the
+        // Printezis bits to avoid waiting for allocated
+        // objects to become initialized/parsable.
+        while (prev_obj < span.start()) {
+          size_t sz = sp->block_size_no_stall(prev_obj, _collector);
+          if (sz > 0) {
+            prev_obj += sz;
+          } else {
+            // In this case we may end up doing a bit of redundant
+            // scanning, but that appears unavoidable, short of
+            // locking the free list locks; see bug 6324141.
+            break;
+          }
         }
       }
       if (prev_obj < span.end()) {
@@ -3938,12 +3966,14 @@
   void handle_stack_overflow(HeapWord* lost);
 };
 
-// Grey object rescan during work stealing phase --
-// the salient assumption here is that stolen oops must
-// always be initialized, so we do not need to check for
-// uninitialized objects before scanning here.
+// Grey object scanning during work stealing phase --
+// the salient assumption here is that any references
+// that are in these stolen objects being scanned must
+// already have been initialized (else they would not have
+// been published), so we do not need to check for
+// uninitialized objects before pushing here.
 void Par_ConcMarkingClosure::do_oop(oop obj) {
-  assert(obj->is_oop_or_null(), "expected an oop or NULL");
+  assert(obj->is_oop_or_null(true), "expected an oop or NULL");
   HeapWord* addr = (HeapWord*)obj;
   // Check if oop points into the CMS generation
   // and is not marked
@@ -4001,7 +4031,7 @@
 // in CMSCollector's _restart_address.
 void Par_ConcMarkingClosure::handle_stack_overflow(HeapWord* lost) {
   // We need to do this under a mutex to prevent other
-  // workers from interfering with the expansion below.
+  // workers from interfering with the work done below.
   MutexLockerEx ml(_overflow_stack->par_lock(),
                    Mutex::_no_safepoint_check_flag);
   // Remember the least grey address discarded
@@ -4640,8 +4670,11 @@
       startTimer();
       sample_eden();
       // Get and clear dirty region from card table
-      dirtyRegion = _ct->ct_bs()->dirty_card_range_after_preclean(
-                                    MemRegion(nextAddr, endAddr));
+      dirtyRegion = _ct->ct_bs()->dirty_card_range_after_reset(
+                                    MemRegion(nextAddr, endAddr),
+                                    true,
+                                    CardTableModRefBS::precleaned_card_val());
+
       assert(dirtyRegion.start() >= nextAddr,
              "returned region inconsistent?");
     }
@@ -5409,8 +5442,8 @@
                               &mrias_cl);
   {
     TraceTime t("grey object rescan", PrintGCDetails, false, gclog_or_tty);
-    // Iterate over the dirty cards, marking them precleaned, and
-    // setting the corresponding bits in the mod union table.
+    // Iterate over the dirty cards, setting the corresponding bits in the
+    // mod union table.
     {
       ModUnionClosure modUnionClosure(&_modUnionTable);
       _ct->ct_bs()->dirty_card_iterate(
@@ -6182,7 +6215,7 @@
 // bit vector itself. That is done by a separate call CMSBitMap::allocate()
 // further below.
 CMSBitMap::CMSBitMap(int shifter, int mutex_rank, const char* mutex_name):
-  _bm(NULL,0),
+  _bm(),
   _shifter(shifter),
   _lock(mutex_rank >= 0 ? new Mutex(mutex_rank, mutex_name, true) : NULL)
 {
@@ -6207,7 +6240,7 @@
   }
   assert(_virtual_space.committed_size() == brs.size(),
          "didn't reserve backing store for all of CMS bit map?");
-  _bm.set_map((uintptr_t*)_virtual_space.low());
+  _bm.set_map((BitMap::bm_word_t*)_virtual_space.low());
   assert(_virtual_space.committed_size() << (_shifter + LogBitsPerByte) >=
          _bmWordSize, "inconsistency in bit map sizing");
   _bm.set_size(_bmWordSize >> _shifter);
@@ -6554,7 +6587,7 @@
   if (obj != NULL) {
     // Ignore mark word because this could be an already marked oop
     // that may be chained at the end of the overflow list.
-    assert(obj->is_oop(), "expected an oop");
+    assert(obj->is_oop(true), "expected an oop");
     HeapWord* addr = (HeapWord*)obj;
     if (_span.contains(addr) &&
         !_bit_map->isMarked(addr)) {
@@ -6845,10 +6878,10 @@
 
 // Should revisit to see if this should be restructured for
 // greater efficiency.
-void MarkFromRootsClosure::do_bit(size_t offset) {
+bool MarkFromRootsClosure::do_bit(size_t offset) {
   if (_skipBits > 0) {
     _skipBits--;
-    return;
+    return true;
   }
   // convert offset into a HeapWord*
   HeapWord* addr = _bitMap->startWord() + offset;
@@ -6886,10 +6919,11 @@
           } // ...else the setting of klass will dirty the card anyway.
         }
       DEBUG_ONLY(})
-      return;
+      return true;
     }
   }
   scanOopsInOop(addr);
+  return true;
 }
 
 // We take a break if we've been at this for a while,
@@ -7023,10 +7057,10 @@
 
 // Should revisit to see if this should be restructured for
 // greater efficiency.
-void Par_MarkFromRootsClosure::do_bit(size_t offset) {
+bool Par_MarkFromRootsClosure::do_bit(size_t offset) {
   if (_skip_bits > 0) {
     _skip_bits--;
-    return;
+    return true;
   }
   // convert offset into a HeapWord*
   HeapWord* addr = _bit_map->startWord() + offset;
@@ -7041,10 +7075,11 @@
     if (p->klass_or_null() == NULL || !p->is_parsable()) {
       // in the case of Clean-on-Enter optimization, redirty card
       // and avoid clearing card by increasing  the threshold.
-      return;
+      return true;
     }
   }
   scan_oops_in_oop(addr);
+  return true;
 }
 
 void Par_MarkFromRootsClosure::scan_oops_in_oop(HeapWord* ptr) {
@@ -7167,7 +7202,7 @@
 
 // Should revisit to see if this should be restructured for
 // greater efficiency.
-void MarkFromRootsVerifyClosure::do_bit(size_t offset) {
+bool MarkFromRootsVerifyClosure::do_bit(size_t offset) {
   // convert offset into a HeapWord*
   HeapWord* addr = _verification_bm->startWord() + offset;
   assert(_verification_bm->endWord() && addr < _verification_bm->endWord(),
@@ -7195,6 +7230,7 @@
     new_oop->oop_iterate(&_pam_verify_closure);
   }
   assert(_mark_stack->isEmpty(), "tautology, emphasizing post-condition");
+  return true;
 }
 
 PushAndMarkVerifyClosure::PushAndMarkVerifyClosure(
@@ -7289,6 +7325,8 @@
   _should_remember_klasses(collector->should_unload_classes())
 { }
 
+// Assumes thread-safe access by callers, who are
+// responsible for mutual exclusion.
 void CMSCollector::lower_restart_addr(HeapWord* low) {
   assert(_span.contains(low), "Out of bounds addr");
   if (_restart_addr == NULL) {
@@ -7314,7 +7352,7 @@
 // in CMSCollector's _restart_address.
 void Par_PushOrMarkClosure::handle_stack_overflow(HeapWord* lost) {
   // We need to do this under a mutex to prevent other
-  // workers from interfering with the expansion below.
+  // workers from interfering with the work done below.
   MutexLockerEx ml(_overflow_stack->par_lock(),
                    Mutex::_no_safepoint_check_flag);
   // Remember the least grey address discarded
@@ -7438,8 +7476,12 @@
 // Grey object rescan during pre-cleaning and second checkpoint phases --
 // the non-parallel version (the parallel version appears further below.)
 void PushAndMarkClosure::do_oop(oop obj) {
-  // If _concurrent_precleaning, ignore mark word verification
-  assert(obj->is_oop_or_null(_concurrent_precleaning),
+  // Ignore mark word verification. If during concurrent precleaning,
+  // the object monitor may be locked. If during the checkpoint
+  // phases, the object may already have been reached by a  different
+  // path and may be at the end of the global overflow list (so
+  // the mark word may be NULL).
+  assert(obj->is_oop_or_null(true /* ignore mark word */),
          "expected an oop or NULL");
   HeapWord* addr = (HeapWord*)obj;
   // Check if oop points into the CMS generation
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp	Thu Oct 23 21:56:41 2008 -0700
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -1327,7 +1327,7 @@
                        CMSMarkStack*  markStack,
                        CMSMarkStack*  revisitStack,
                        bool should_yield, bool verifying = false);
-  void do_bit(size_t offset);
+  bool do_bit(size_t offset);
   void reset(HeapWord* addr);
   inline void do_yield_check();
 
@@ -1363,7 +1363,7 @@
                        CMSMarkStack*  overflow_stack,
                        CMSMarkStack*  revisit_stack,
                        bool should_yield);
-  void do_bit(size_t offset);
+  bool do_bit(size_t offset);
   inline void do_yield_check();
 
  private:
@@ -1411,7 +1411,7 @@
                              CMSBitMap* verification_bm,
                              CMSBitMap* cms_bm,
                              CMSMarkStack*  mark_stack);
-  void do_bit(size_t offset);
+  bool do_bit(size_t offset);
   void reset(HeapWord* addr);
 };
 
@@ -1420,8 +1420,9 @@
 // "empty" (i.e. the bit vector doesn't have any 1-bits).
 class FalseBitMapClosure: public BitMapClosure {
  public:
-  void do_bit(size_t offset) {
+  bool do_bit(size_t offset) {
     guarantee(false, "Should not have a 1 bit");
+    return true;
   }
 };
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/share/vm/gc_implementation/g1/bufferingOopClosure.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -0,0 +1,195 @@
+/*
+ * Copyright 2001-2007 Sun Microsystems, Inc.  All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ */
+
+// A BufferingOops closure tries to separate out the cost of finding roots
+// from the cost of applying closures to them.  It maintains an array of
+// ref-containing locations.  Until the array is full, applying the closure
+// to an oop* merely records that location in the array.  Since this
+// closure app cost is small, an elapsed timer can approximately attribute
+// all of this cost to the cost of finding the roots.  When the array fills
+// up, the wrapped closure is applied to all elements, keeping track of
+// this elapsed time of this process, and leaving the array empty.
+// The caller must be sure to call "done" to process any unprocessed
+// buffered entriess.
+
+class Generation;
+class HeapRegion;
+
+class BufferingOopClosure: public OopClosure {
+protected:
+  enum PrivateConstants {
+    BufferLength = 1024
+  };
+
+  oop          *_buffer[BufferLength];
+  oop         **_buffer_top;
+  oop         **_buffer_curr;
+
+  OopClosure  *_oc;
+  double       _closure_app_seconds;
+
+  void process_buffer () {
+
+    double start = os::elapsedTime();
+    for (oop **curr = _buffer; curr < _buffer_curr; ++curr) {
+      _oc->do_oop(*curr);
+    }
+    _buffer_curr = _buffer;
+    _closure_app_seconds += (os::elapsedTime() - start);
+  }
+
+public:
+  virtual void do_oop(narrowOop* p) {
+    guarantee(false, "NYI");
+  }
+  virtual void do_oop(oop *p) {
+    if (_buffer_curr == _buffer_top) {
+      process_buffer();
+    }
+
+    *_buffer_curr = p;
+    ++_buffer_curr;
+  }
+  void done () {
+    if (_buffer_curr > _buffer) {
+      process_buffer();
+    }
+  }
+  double closure_app_seconds () {
+    return _closure_app_seconds;
+  }
+  BufferingOopClosure (OopClosure *oc) :
+    _oc(oc),
+    _buffer_curr(_buffer), _buffer_top(_buffer + BufferLength),
+    _closure_app_seconds(0.0) { }
+};
+
+class BufferingOopsInGenClosure: public OopsInGenClosure {
+  BufferingOopClosure _boc;
+  OopsInGenClosure* _oc;
+public:
+  BufferingOopsInGenClosure(OopsInGenClosure *oc) :
+    _boc(oc), _oc(oc) {}
+
+  virtual void do_oop(narrowOop* p) {
+    guarantee(false, "NYI");
+  }
+
+  virtual void do_oop(oop* p) {
+    assert(generation()->is_in_reserved(p), "Must be in!");
+    _boc.do_oop(p);
+  }
+
+  void done() {
+    _boc.done();
+  }
+
+  double closure_app_seconds () {
+    return _boc.closure_app_seconds();
+  }
+
+  void set_generation(Generation* gen) {
+    OopsInGenClosure::set_generation(gen);
+    _oc->set_generation(gen);
+  }
+
+  void reset_generation() {
+    // Make sure we finish the current work with the current generation.
+    _boc.done();
+    OopsInGenClosure::reset_generation();
+    _oc->reset_generation();
+  }
+
+};
+
+
+class BufferingOopsInHeapRegionClosure: public OopsInHeapRegionClosure {
+private:
+  enum PrivateConstants {
+    BufferLength = 1024
+  };
+
+  oop                      *_buffer[BufferLength];
+  oop                     **_buffer_top;
+  oop                     **_buffer_curr;
+
+  HeapRegion               *_hr_buffer[BufferLength];
+  HeapRegion              **_hr_curr;
+
+  OopsInHeapRegionClosure  *_oc;
+  double                    _closure_app_seconds;
+
+  void process_buffer () {
+
+    assert((_hr_curr - _hr_buffer) == (_buffer_curr - _buffer),
+           "the two lengths should be the same");
+
+    double start = os::elapsedTime();
+    HeapRegion **hr_curr = _hr_buffer;
+    HeapRegion *hr_prev = NULL;
+    for (oop **curr = _buffer; curr < _buffer_curr; ++curr) {
+      HeapRegion *region = *hr_curr;
+      if (region != hr_prev) {
+        _oc->set_region(region);
+        hr_prev = region;
+      }
+      _oc->do_oop(*curr);
+      ++hr_curr;
+    }
+    _buffer_curr = _buffer;
+    _hr_curr = _hr_buffer;
+    _closure_app_seconds += (os::elapsedTime() - start);
+  }
+
+public:
+  virtual void do_oop(narrowOop *p) {
+    guarantee(false, "NYI");
+  }
+
+  virtual void do_oop(oop *p) {
+    if (_buffer_curr == _buffer_top) {
+      assert(_hr_curr > _hr_buffer, "_hr_curr should be consistent with _buffer_curr");
+      process_buffer();
+    }
+
+    *_buffer_curr = p;
+    ++_buffer_curr;
+    *_hr_curr = _from;
+    ++_hr_curr;
+  }
+  void done () {
+    if (_buffer_curr > _buffer) {
+      assert(_hr_curr > _hr_buffer, "_hr_curr should be consistent with _buffer_curr");
+      process_buffer();
+    }
+  }
+  double closure_app_seconds () {
+    return _closure_app_seconds;
+  }
+  BufferingOopsInHeapRegionClosure (OopsInHeapRegionClosure *oc) :
+    _oc(oc),
+    _buffer_curr(_buffer), _buffer_top(_buffer + BufferLength),
+    _hr_curr(_hr_buffer),
+    _closure_app_seconds(0.0) { }
+};
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/share/vm/gc_implementation/g1/collectionSetChooser.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -0,0 +1,409 @@
+/*
+ * Copyright 2001-2007 Sun Microsystems, Inc.  All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ */
+
+# include "incls/_precompiled.incl"
+# include "incls/_collectionSetChooser.cpp.incl"
+
+CSetChooserCache::CSetChooserCache() {
+  for (int i = 0; i < CacheLength; ++i)
+    _cache[i] = NULL;
+  clear();
+}
+
+void CSetChooserCache::clear() {
+  _occupancy = 0;
+  _first = 0;
+  for (int i = 0; i < CacheLength; ++i) {
+    HeapRegion *hr = _cache[i];
+    if (hr != NULL)
+      hr->set_sort_index(-1);
+    _cache[i] = NULL;
+  }
+}
+
+#ifndef PRODUCT
+bool CSetChooserCache::verify() {
+  int index = _first;
+  HeapRegion *prev = NULL;
+  for (int i = 0; i < _occupancy; ++i) {
+    guarantee(_cache[index] != NULL, "cache entry should not be empty");
+    HeapRegion *hr = _cache[index];
+    guarantee(!hr->is_young(), "should not be young!");
+    if (prev != NULL) {
+      guarantee(prev->gc_efficiency() >= hr->gc_efficiency(),
+                "cache should be correctly ordered");
+    }
+    guarantee(hr->sort_index() == get_sort_index(index),
+              "sort index should be correct");
+    index = trim_index(index + 1);
+    prev = hr;
+  }
+
+  for (int i = 0; i < (CacheLength - _occupancy); ++i) {
+    guarantee(_cache[index] == NULL, "cache entry should be empty");
+    index = trim_index(index + 1);
+  }
+
+  guarantee(index == _first, "we should have reached where we started from");
+  return true;
+}
+#endif // PRODUCT
+
+void CSetChooserCache::insert(HeapRegion *hr) {
+  assert(!is_full(), "cache should not be empty");
+  hr->calc_gc_efficiency();
+
+  int empty_index;
+  if (_occupancy == 0) {
+    empty_index = _first;
+  } else {
+    empty_index = trim_index(_first + _occupancy);
+    assert(_cache[empty_index] == NULL, "last slot should be empty");
+    int last_index = trim_index(empty_index - 1);
+    HeapRegion *last = _cache[last_index];
+    assert(last != NULL,"as the cache is not empty, last should not be empty");
+    while (empty_index != _first &&
+           last->gc_efficiency() < hr->gc_efficiency()) {
+      _cache[empty_index] = last;
+      last->set_sort_index(get_sort_index(empty_index));
+      empty_index = last_index;
+      last_index = trim_index(last_index - 1);
+      last = _cache[last_index];
+    }
+  }
+  _cache[empty_index] = hr;
+  hr->set_sort_index(get_sort_index(empty_index));
+
+  ++_occupancy;
+  assert(verify(), "cache should be consistent");
+}
+
+HeapRegion *CSetChooserCache::remove_first() {
+  if (_occupancy > 0) {
+    assert(_cache[_first] != NULL, "cache should have at least one region");
+    HeapRegion *ret = _cache[_first];
+    _cache[_first] = NULL;
+    ret->set_sort_index(-1);
+    --_occupancy;
+    _first = trim_index(_first + 1);
+    assert(verify(), "cache should be consistent");
+    return ret;
+  } else {
+    return NULL;
+  }
+}
+
+// this is a bit expensive... but we expect that it should not be called
+// to often.
+void CSetChooserCache::remove(HeapRegion *hr) {
+  assert(_occupancy > 0, "cache should not be empty");
+  assert(hr->sort_index() < -1, "should already be in the cache");
+  int index = get_index(hr->sort_index());
+  assert(_cache[index] == hr, "index should be correct");
+  int next_index = trim_index(index + 1);
+  int last_index = trim_index(_first + _occupancy - 1);
+  while (index != last_index) {
+    assert(_cache[next_index] != NULL, "should not be null");
+    _cache[index] = _cache[next_index];
+    _cache[index]->set_sort_index(get_sort_index(index));
+
+    index = next_index;
+    next_index = trim_index(next_index+1);
+  }
+  assert(index == last_index, "should have reached the last one");
+  _cache[index] = NULL;
+  hr->set_sort_index(-1);
+  --_occupancy;
+  assert(verify(), "cache should be consistent");
+}
+
+static inline int orderRegions(HeapRegion* hr1, HeapRegion* hr2) {
+  if (hr1 == NULL) {
+    if (hr2 == NULL) return 0;
+    else return 1;
+  } else if (hr2 == NULL) {
+    return -1;
+  }
+  if (hr2->gc_efficiency() < hr1->gc_efficiency()) return -1;
+  else if (hr1->gc_efficiency() < hr2->gc_efficiency()) return 1;
+  else return 0;
+}
+
+static int orderRegions(HeapRegion** hr1p, HeapRegion** hr2p) {
+  return orderRegions(*hr1p, *hr2p);
+}
+
+CollectionSetChooser::CollectionSetChooser() :
+  // The line below is the worst bit of C++ hackery I've ever written
+  // (Detlefs, 11/23).  You should think of it as equivalent to
+  // "_regions(100, true)": initialize the growable array and inform it
+  // that it should allocate its elem array(s) on the C heap.  The first
+  // argument, however, is actually a comma expression (new-expr, 100).
+  // The purpose of the new_expr is to inform the growable array that it
+  // is *already* allocated on the C heap: it uses the placement syntax to
+  // keep it from actually doing any allocation.
+  _markedRegions((ResourceObj::operator new (sizeof(GrowableArray<HeapRegion*>),
+                                             (void*)&_markedRegions,
+                                             ResourceObj::C_HEAP),
+                  100),
+                 true),
+  _curMarkedIndex(0),
+  _numMarkedRegions(0),
+  _unmarked_age_1_returned_as_new(false),
+  _first_par_unreserved_idx(0)
+{}
+
+
+
+#ifndef PRODUCT
+bool CollectionSetChooser::verify() {
+  int index = 0;
+  guarantee(_curMarkedIndex <= _numMarkedRegions,
+            "_curMarkedIndex should be within bounds");
+  while (index < _curMarkedIndex) {
+    guarantee(_markedRegions.at(index++) == NULL,
+              "all entries before _curMarkedIndex should be NULL");
+  }
+  HeapRegion *prev = NULL;
+  while (index < _numMarkedRegions) {
+    HeapRegion *curr = _markedRegions.at(index++);
+    if (curr != NULL) {
+      int si = curr->sort_index();
+      guarantee(!curr->is_young(), "should not be young!");
+      guarantee(si > -1 && si == (index-1), "sort index invariant");
+      if (prev != NULL) {
+        guarantee(orderRegions(prev, curr) != 1, "regions should be sorted");
+      }
+      prev = curr;
+    }
+  }
+  return _cache.verify();
+}
+#endif
+
+bool
+CollectionSetChooser::addRegionToCache() {
+  assert(!_cache.is_full(), "cache should not be full");
+
+  HeapRegion *hr = NULL;
+  while (hr == NULL && _curMarkedIndex < _numMarkedRegions) {
+    hr = _markedRegions.at(_curMarkedIndex++);
+  }
+  if (hr == NULL)
+    return false;
+  assert(!hr->is_young(), "should not be young!");
+  assert(hr->sort_index() == _curMarkedIndex-1, "sort_index invariant");
+  _markedRegions.at_put(hr->sort_index(), NULL);
+  _cache.insert(hr);
+  assert(!_cache.is_empty(), "cache should not be empty");
+  assert(verify(), "cache should be consistent");
+  return false;
+}
+
+void
+CollectionSetChooser::fillCache() {
+  while (!_cache.is_full() && addRegionToCache()) {
+  }
+}
+
+void
+CollectionSetChooser::sortMarkedHeapRegions() {
+  guarantee(_cache.is_empty(), "cache should be empty");
+  // First trim any unused portion of the top in the parallel case.
+  if (_first_par_unreserved_idx > 0) {
+    if (G1PrintParCleanupStats) {
+      gclog_or_tty->print("     Truncating _markedRegions from %d to %d.\n",
+                          _markedRegions.length(), _first_par_unreserved_idx);
+    }
+    assert(_first_par_unreserved_idx <= _markedRegions.length(),
+           "Or we didn't reserved enough length");
+    _markedRegions.trunc_to(_first_par_unreserved_idx);
+  }
+  _markedRegions.sort(orderRegions);
+  assert(_numMarkedRegions <= _markedRegions.length(), "Requirement");
+  assert(_numMarkedRegions == 0
+         || _markedRegions.at(_numMarkedRegions-1) != NULL,
+         "Testing _numMarkedRegions");
+  assert(_numMarkedRegions == _markedRegions.length()
+         || _markedRegions.at(_numMarkedRegions) == NULL,
+         "Testing _numMarkedRegions");
+  if (G1PrintParCleanupStats) {
+    gclog_or_tty->print_cr("     Sorted %d marked regions.", _numMarkedRegions);
+  }
+  for (int i = 0; i < _numMarkedRegions; i++) {
+    assert(_markedRegions.at(i) != NULL, "Should be true by sorting!");
+    _markedRegions.at(i)->set_sort_index(i);
+    if (G1PrintRegionLivenessInfo > 0) {
+      if (i == 0) gclog_or_tty->print_cr("Sorted marked regions:");
+      if (i < G1PrintRegionLivenessInfo ||
+          (_numMarkedRegions-i) < G1PrintRegionLivenessInfo) {
+        HeapRegion* hr = _markedRegions.at(i);
+        size_t u = hr->used();
+        gclog_or_tty->print_cr("  Region %d: %d used, %d max live, %5.2f%%.",
+                      i, u, hr->max_live_bytes(),
+                      100.0*(float)hr->max_live_bytes()/(float)u);
+      }
+    }
+  }
+  if (G1PolicyVerbose > 1)
+    printSortedHeapRegions();
+  assert(verify(), "should now be sorted");
+}
+
+void
+printHeapRegion(HeapRegion *hr) {
+  if (hr->isHumongous())
+    gclog_or_tty->print("H: ");
+  if (hr->in_collection_set())
+    gclog_or_tty->print("CS: ");
+  if (hr->popular())
+    gclog_or_tty->print("pop: ");
+  gclog_or_tty->print_cr("Region " PTR_FORMAT " (%s%s) "
+                         "[" PTR_FORMAT ", " PTR_FORMAT"] "
+                         "Used: " SIZE_FORMAT "K, garbage: " SIZE_FORMAT "K.",
+                         hr, hr->is_young() ? "Y " : "  ",
+                         hr->is_marked()? "M1" : "M0",
+                         hr->bottom(), hr->end(),
+                         hr->used()/K, hr->garbage_bytes()/K);
+}
+
+void
+CollectionSetChooser::addMarkedHeapRegion(HeapRegion* hr) {
+  assert(!hr->isHumongous(),
+         "Humongous regions shouldn't be added to the collection set");
+  assert(!hr->is_young(), "should not be young!");
+  _markedRegions.append(hr);
+  _numMarkedRegions++;
+  hr->calc_gc_efficiency();
+}
+
+void
+CollectionSetChooser::
+prepareForAddMarkedHeapRegionsPar(size_t n_regions, size_t chunkSize) {
+  _first_par_unreserved_idx = 0;
+  size_t max_waste = ParallelGCThreads * chunkSize;
+  // it should be aligned with respect to chunkSize
+  size_t aligned_n_regions =
+                     (n_regions + (chunkSize - 1)) / chunkSize * chunkSize;
+  assert( aligned_n_regions % chunkSize == 0, "should be aligned" );
+  _markedRegions.at_put_grow((int)(aligned_n_regions + max_waste - 1), NULL);
+}
+
+jint
+CollectionSetChooser::getParMarkedHeapRegionChunk(jint n_regions) {
+  jint res = Atomic::add(n_regions, &_first_par_unreserved_idx);
+  assert(_markedRegions.length() > res + n_regions - 1,
+         "Should already have been expanded");
+  return res - n_regions;
+}
+
+void
+CollectionSetChooser::setMarkedHeapRegion(jint index, HeapRegion* hr) {
+  assert(_markedRegions.at(index) == NULL, "precondition");
+  assert(!hr->is_young(), "should not be young!");
+  _markedRegions.at_put(index, hr);
+  hr->calc_gc_efficiency();
+}
+
+void
+CollectionSetChooser::incNumMarkedHeapRegions(jint inc_by) {
+  (void)Atomic::add(inc_by, &_numMarkedRegions);
+}
+
+void
+CollectionSetChooser::clearMarkedHeapRegions(){
+  for (int i = 0; i < _markedRegions.length(); i++) {
+    HeapRegion* r =   _markedRegions.at(i);
+    if (r != NULL) r->set_sort_index(-1);
+  }
+  _markedRegions.clear();
+  _curMarkedIndex = 0;
+  _numMarkedRegions = 0;
+  _cache.clear();
+};
+
+void
+CollectionSetChooser::updateAfterFullCollection() {
+  G1CollectedHeap* g1h = G1CollectedHeap::heap();
+  clearMarkedHeapRegions();
+}
+
+void
+CollectionSetChooser::printSortedHeapRegions() {
+  gclog_or_tty->print_cr("Printing %d Heap Regions sorted by amount of known garbage",
+                _numMarkedRegions);
+  for (int i = 0; i < _markedRegions.length(); i++) {
+    printHeapRegion(_markedRegions.at(i));
+  }
+  gclog_or_tty->print_cr("Done sorted heap region print");
+}
+
+void CollectionSetChooser::removeRegion(HeapRegion *hr) {
+  int si = hr->sort_index();
+  assert(si == -1 || hr->is_marked(), "Sort index not valid.");
+  if (si > -1) {
+    assert(_markedRegions.at(si) == hr, "Sort index not valid." );
+    _markedRegions.at_put(si, NULL);
+  } else if (si < -1) {
+    assert(_cache.region_in_cache(hr), "should be in the cache");
+    _cache.remove(hr);
+    assert(hr->sort_index() == -1, "sort index invariant");
+  }
+  hr->set_sort_index(-1);
+}
+
+// if time_remaining < 0.0, then this method should try to return
+// a region, whether it fits within the remaining time or not
+HeapRegion*
+CollectionSetChooser::getNextMarkedRegion(double time_remaining,
+                                          double avg_prediction) {
+  G1CollectedHeap* g1h = G1CollectedHeap::heap();
+  G1CollectorPolicy* g1p = g1h->g1_policy();
+  fillCache();
+  if (_cache.is_empty()) {
+    assert(_curMarkedIndex == _numMarkedRegions,
+           "if cache is empty, list should also be empty");
+    return NULL;
+  }
+
+  HeapRegion *hr = _cache.get_first();
+  assert(hr != NULL, "if cache not empty, first entry should be non-null");
+  double predicted_time = g1h->predict_region_elapsed_time_ms(hr, false);
+
+  if (g1p->adaptive_young_list_length()) {
+    if (time_remaining - predicted_time < 0.0) {
+      g1h->check_if_region_is_too_expensive(predicted_time);
+      return NULL;
+    }
+  } else {
+    if (predicted_time > 2.0 * avg_prediction) {
+      return NULL;
+    }
+  }
+
+  HeapRegion *hr2 = _cache.remove_first();
+  assert(hr == hr2, "cache contents should not have changed");
+
+  return hr;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/share/vm/gc_implementation/g1/collectionSetChooser.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2001-2007 Sun Microsystems, Inc.  All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ */
+
+// We need to sort heap regions by collection desirability.
+
+class CSetChooserCache {
+private:
+  enum {
+    CacheLength = 16
+  } PrivateConstants;
+
+  HeapRegion*  _cache[CacheLength];
+  int          _occupancy; // number of region in cache
+  int          _first; // "first" region in the cache
+
+  // adding CacheLength to deal with negative values
+  inline int trim_index(int index) {
+    return (index + CacheLength) % CacheLength;
+  }
+
+  inline int get_sort_index(int index) {
+    return -index-2;
+  }
+  inline int get_index(int sort_index) {
+    return -sort_index-2;
+  }
+
+public:
+  CSetChooserCache(void);
+
+  inline int occupancy(void) { return _occupancy; }
+  inline bool is_full()      { return _occupancy == CacheLength; }
+  inline bool is_empty()     { return _occupancy == 0; }
+
+  void clear(void);
+  void insert(HeapRegion *hr);
+  HeapRegion *remove_first(void);
+  void remove (HeapRegion *hr);
+  inline HeapRegion *get_first(void) {
+    return _cache[_first];
+  }
+
+#ifndef PRODUCT
+  bool verify (void);
+  bool region_in_cache(HeapRegion *hr) {
+    int sort_index = hr->sort_index();
+    if (sort_index < -1) {
+      int index = get_index(sort_index);
+      guarantee(index < CacheLength, "should be within bounds");
+      return _cache[index] == hr;
+    } else
+      return 0;
+  }
+#endif // PRODUCT
+};
+
+class CollectionSetChooser: public CHeapObj {
+
+  GrowableArray<HeapRegion*> _markedRegions;
+  int _curMarkedIndex;
+  int _numMarkedRegions;
+  CSetChooserCache _cache;
+
+  // True iff last collection pause ran of out new "age 0" regions, and
+  // returned an "age 1" region.
+  bool _unmarked_age_1_returned_as_new;
+
+  jint _first_par_unreserved_idx;
+
+public:
+
+  HeapRegion* getNextMarkedRegion(double time_so_far, double avg_prediction);
+
+  CollectionSetChooser();
+
+  void printSortedHeapRegions();
+
+  void sortMarkedHeapRegions();
+  void fillCache();
+  bool addRegionToCache(void);
+  void addMarkedHeapRegion(HeapRegion *hr);
+
+  // Must be called before calls to getParMarkedHeapRegionChunk.
+  // "n_regions" is the number of regions, "chunkSize" the chunk size.
+  void prepareForAddMarkedHeapRegionsPar(size_t n_regions, size_t chunkSize);
+  // Returns the first index in a contiguous chunk of "n_regions" indexes
+  // that the calling thread has reserved.  These must be set by the
+  // calling thread using "setMarkedHeapRegion" (to NULL if necessary).
+  jint getParMarkedHeapRegionChunk(jint n_regions);
+  // Set the marked array entry at index to hr.  Careful to claim the index
+  // first if in parallel.
+  void setMarkedHeapRegion(jint index, HeapRegion* hr);
+  // Atomically increment the number of claimed regions by "inc_by".
+  void incNumMarkedHeapRegions(jint inc_by);
+
+  void clearMarkedHeapRegions();
+
+  void updateAfterFullCollection();
+
+  // Ensure that "hr" is not a member of the marked region array or the cache
+  void removeRegion(HeapRegion* hr);
+
+  bool unmarked_age_1_returned_as_new() { return _unmarked_age_1_returned_as_new; }
+
+  // Returns true if the used portion of "_markedRegions" is properly
+  // sorted, otherwise asserts false.
+#ifndef PRODUCT
+  bool verify(void);
+  bool regionProperlyOrdered(HeapRegion* r) {
+    int si = r->sort_index();
+    return (si == -1) ||
+      (si > -1 && _markedRegions.at(si) == r) ||
+      (si < -1 && _cache.region_in_cache(r));
+  }
+#endif
+
+};
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -0,0 +1,355 @@
+/*
+ * Copyright 2001-2007 Sun Microsystems, Inc.  All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ */
+
+#include "incls/_precompiled.incl"
+#include "incls/_concurrentG1Refine.cpp.incl"
+
+bool ConcurrentG1Refine::_enabled = false;
+
+// Construct the refiner.  The dedicated worker thread is only created when
+// traversal-based refinement (G1ConcRefine) is in use; otherwise the
+// queue-based machinery drives refinement and no thread is made here.
+ConcurrentG1Refine::ConcurrentG1Refine() :
+  _pya(PYA_continue), _last_pya(PYA_continue),
+  _last_cards_during(), _first_traversal(false),
+  _card_counts(NULL), _cur_card_count_histo(NULL), _cum_card_count_histo(NULL),
+  _hot_cache(NULL),
+  _def_use_cache(false), _use_cache(false),
+  _n_periods(0), _total_cards(0), _total_travs(0)
+{
+  if (G1ConcRefine) {
+    _cg1rThread = new ConcurrentG1RefineThread(this);
+    assert(cg1rThread() != NULL, "Conc refine should have been created");
+    assert(cg1rThread()->cg1r() == this,
+           "Conc refine thread should refer to this");
+  } else {
+    _cg1rThread = NULL;
+  }
+}
+
+// Deferred initialization: sizes the per-card count table and the hot-card
+// cache from the heap reservation, which is not available at construction
+// time.
+void ConcurrentG1Refine::init() {
+  if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
+    G1CollectedHeap* g1h = G1CollectedHeap::heap();
+    // One count byte per card in the reserved heap.
+    _n_card_counts =
+      (unsigned) (g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift);
+    _card_counts = NEW_C_HEAP_ARRAY(unsigned char, _n_card_counts);
+    for (size_t i = 0; i < _n_card_counts; i++) _card_counts[i] = 0;
+    ModRefBarrierSet* bs = g1h->mr_bs();
+    guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
+    CardTableModRefBS* ctbs = (CardTableModRefBS*)bs;
+    // Card-table entry for the bottom of the heap: used to translate a
+    // card pointer into an index into _card_counts.
+    _ct_bot = ctbs->byte_for_const(g1h->reserved_region().start());
+    if (G1ConcRSCountTraversals) {
+      // 256 buckets: one per possible (saturated) 8-bit card count.
+      _cur_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256);
+      _cum_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256);
+      for (int i = 0; i < 256; i++) {
+        _cur_card_count_histo[i] = 0;
+        _cum_card_count_histo[i] = 0;
+      }
+    }
+  }
+  if (G1ConcRSLogCacheSize > 0) {
+    _def_use_cache = true;
+    _use_cache = true;
+    _hot_cache_size = (1 << G1ConcRSLogCacheSize);
+    _hot_cache = NEW_C_HEAP_ARRAY(jbyte*, _hot_cache_size);
+    _n_hot = 0;
+    _hot_cache_idx = 0;
+  }
+}
+
+// Release the structures allocated in init(), mirroring its conditions.
+ConcurrentG1Refine::~ConcurrentG1Refine() {
+  if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
+    assert(_card_counts != NULL, "Logic");
+    FREE_C_HEAP_ARRAY(unsigned char, _card_counts);
+  }
+  // The histograms are allocated in init() only when G1ConcRSCountTraversals
+  // is set; asserting/freeing them whenever G1ConcRSLogCacheSize > 0 (as the
+  // code previously did) trips the assert and frees NULL pointers when only
+  // the hot-card cache is enabled.
+  if (G1ConcRSCountTraversals) {
+    assert(_cur_card_count_histo != NULL, "Logic");
+    FREE_C_HEAP_ARRAY(unsigned, _cur_card_count_histo);
+    assert(_cum_card_count_histo != NULL, "Logic");
+    FREE_C_HEAP_ARRAY(unsigned, _cum_card_count_histo);
+  }
+  if (G1ConcRSLogCacheSize > 0) {
+    assert(_hot_cache != NULL, "Logic");
+    FREE_C_HEAP_ARRAY(jbyte*, _hot_cache);
+  }
+}
+
+// Do one refinement pass over the card table.  Returns true if the
+// heuristic below says another pass should follow immediately; false
+// means the caller should sleep a while first.
+bool ConcurrentG1Refine::refine() {
+  G1CollectedHeap* g1h = G1CollectedHeap::heap();
+  unsigned cards_before = g1h->g1_rem_set()->conc_refine_cards();
+  clear_hot_cache();  // Any previous values in this are now invalid.
+  g1h->g1_rem_set()->concurrentRefinementPass(this);
+  _traversals++;
+  unsigned cards_after = g1h->g1_rem_set()->conc_refine_cards();
+  unsigned cards_during = cards_after-cards_before;
+  // If this is the first traversal in the current enabling
+  // and we did some cards, or if the number of cards found is decreasing
+  // sufficiently quickly (to less than 2/3 of the previous pass), then
+  // keep going.  Otherwise, sleep a while.
+  bool res =
+    (_first_traversal && cards_during > 0)
+    ||
+    (!_first_traversal && cards_during * 3 < _last_cards_during * 2);
+  _last_cards_during = cards_during;
+  _first_traversal = false;
+  return res;
+}
+
+// Enable concurrent refinement and wake any thread waiting for it.
+// Re-enabling while already enabled is a no-op.
+void ConcurrentG1Refine::enable() {
+  MutexLocker x(G1ConcRefine_mon);
+  if (_enabled) return;
+  _enabled = true;
+  // Reset the pass heuristic for this enabled episode.
+  _first_traversal = true;
+  _last_cards_during = 0;
+  G1ConcRefine_mon->notify_all();
+}
+
+// Disable concurrent refinement.  Returns the number of traversals done
+// since the matching enable(), or 0 if refinement was not enabled.
+unsigned ConcurrentG1Refine::disable() {
+  MutexLocker x(G1ConcRefine_mon);
+  if (!_enabled) return 0;
+  _enabled = false;
+  return _traversals;
+}
+
+// Block the caller until concurrent refinement has been enabled via
+// enable(), then restart the traversal count for the new episode.
+// The wait skips the safepoint check: this runs on a concurrent GC thread.
+void ConcurrentG1Refine::wait_for_ConcurrentG1Refine_enabled() {
+  G1ConcRefine_mon->lock();
+  while (!_enabled) {
+    G1ConcRefine_mon->wait(Mutex::_no_safepoint_check_flag);
+  }
+  G1ConcRefine_mon->unlock();
+  _traversals = 0;
+}
+
+// Request that an in-progress refinement traversal start over (e.g. after
+// a collection has invalidated the work done so far).
+void ConcurrentG1Refine::set_pya_restart() {
+  // If we're using the log-based RS barrier, the above will cause
+  // in-progress traversals of completed log buffers to quit early; we will
+  // also abandon all other buffers.
+  if (G1RSBarrierUseQueue) {
+    DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
+    dcqs.abandon_logs();
+    if (_cg1rThread->do_traversal()) {
+      // A full traversal is already in progress: tell it to restart.
+      _pya = PYA_restart;
+    } else {
+      // No traversal in progress: schedule a fresh one, and make sure no
+      // stale post-yield action is left behind.
+      _cg1rThread->set_do_traversal(true);
+      // Reset the post-yield actions.
+      _pya = PYA_continue;
+      _last_pya = PYA_continue;
+    }
+  } else {
+    _pya = PYA_restart;
+  }
+}
+
+// Ask an in-progress refinement traversal to quit: its work has been
+// completed (or obviated) by somebody else.
+void ConcurrentG1Refine::set_pya_cancel() {
+  _pya = PYA_cancel;
+}
+
+// Atomically fetch-and-reset the pending post-yield action, recording it
+// in _last_pya.  Only the refinement thread resets _pya to PYA_continue,
+// via the CAS below; other threads may concurrently overwrite it with a
+// new action, which is why the CAS loops.
+PostYieldAction ConcurrentG1Refine::get_pya() {
+  if (_pya != PYA_continue) {
+    jint val = _pya;
+    while (true) {
+      jint val_read = Atomic::cmpxchg(PYA_continue, &_pya, val);
+      if (val_read == val) {
+        PostYieldAction res = (PostYieldAction)val;
+        assert(res != PYA_continue, "Only the refine thread should reset.");
+        _last_pya = res;
+        return res;
+      } else {
+        // Lost a race with a writer; retry with the value we just saw.
+        val = val_read;
+      }
+    }
+  }
+  // _pya was PYA_continue when we looked: no action pending.
+  return PYA_continue;
+}
+
+// Hand back the last action observed by get_pya(), resetting it so the
+// action is reported at most once.
+PostYieldAction ConcurrentG1Refine::get_last_pya() {
+  PostYieldAction last = _last_pya;
+  _last_pya = PYA_continue;
+  return last;
+}
+
+// Delegates to the refinement thread's traversal-request flag.
+// NOTE(review): assumes _cg1rThread != NULL (i.e. G1ConcRefine is set, or
+// callers only reach this via queue-based paths that created the thread)
+// -- confirm against callers.
+bool ConcurrentG1Refine::do_traversal() {
+  return _cg1rThread->do_traversal();
+}
+
+// Bump the visit count for "card_ptr" (saturating at 255, so it fits in
+// one byte) and return the count as it was *before* the increment.
+// Callers (cache_insert) account for _total_travs themselves.
+int ConcurrentG1Refine::add_card_count(jbyte* card_ptr) {
+  // Index of the card within the counts table.
+  size_t card_num = (card_ptr - _ct_bot);
+  guarantee(0 <= card_num && card_num < _n_card_counts, "Bounds");
+  unsigned char cnt = _card_counts[card_num];
+  if (cnt < 255) _card_counts[card_num]++;
+  return cnt;
+  // (An unreachable "_total_travs++;" after the return has been removed;
+  // the traversal total is maintained by cache_insert.)
+}
+
+// Record a visit to "card_ptr".  If the card is not yet "hot" it is
+// returned for immediate refinement.  Otherwise it is stored in the hot
+// cache and NULL is returned -- unless the cache is full, in which case
+// the evicted (oldest) entry is returned for refinement instead.
+jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr) {
+  int count = add_card_count(card_ptr);
+  // Count previously unvisited cards.
+  if (count == 0) _total_cards++;
+  // We'll assume a traversal unless we store it in the cache.
+  if (count < G1ConcRSHotCardLimit) {
+    _total_travs++;
+    return card_ptr;
+  }
+  // Otherwise, it's hot.
+  jbyte* res = NULL;
+  MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag);
+  if (_n_hot == _hot_cache_size) {
+    // Cache full: evict the entry at the insertion point (the oldest),
+    // which will now be refined -- hence the traversal.
+    _total_travs++;
+    res = _hot_cache[_hot_cache_idx];
+    _n_hot--;
+  }
+  // Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx.
+  _hot_cache[_hot_cache_idx] = card_ptr;
+  _hot_cache_idx++;
+  if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0;
+  _n_hot++;
+  return res;
+}
+
+
+// Drain the hot-card cache: refine every cached entry.  Must only be
+// called after the cache has been disabled (e.g. during a pause).
+void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) {
+  assert(!use_cache(), "cache should be disabled");
+  // _hot_cache_idx is the next insertion slot, so the newest entry is the
+  // slot before it; walk backwards (wrapping) over the _n_hot entries.
+  int start_ind = _hot_cache_idx-1;
+  for (int i = 0; i < _n_hot; i++) {
+    int ind = start_ind - i;
+    if (ind < 0) ind = ind + _hot_cache_size;
+    jbyte* entry = _hot_cache[ind];
+    if (entry != NULL) {
+      g1rs->concurrentRefineOneCard(entry, worker_i);
+    }
+  }
+  // The cache is now logically empty.
+  _n_hot = 0;
+  _hot_cache_idx = 0;
+}
+
+// End-of-period bookkeeping: fold the per-card counts into the current
+// histogram (when counting traversals) and zero the count table.
+void ConcurrentG1Refine::clear_and_record_card_counts() {
+  if (G1ConcRSLogCacheSize == 0 && !G1ConcRSCountTraversals) return;
+  _n_periods++;
+  if (G1ConcRSCountTraversals) {
+    for (size_t i = 0; i < _n_card_counts; i++) {
+      unsigned char bucket = _card_counts[i];
+      _cur_card_count_histo[bucket]++;
+      _card_counts[i] = 0;
+    }
+    gclog_or_tty->print_cr("Card counts:");
+    for (int i = 0; i < 256; i++) {
+      if (_cur_card_count_histo[i] > 0) {
+        gclog_or_tty->print_cr("  %3d: %9d", i, _cur_card_count_histo[i]);
+        _cum_card_count_histo[i] += _cur_card_count_histo[i];
+        _cur_card_count_histo[i] = 0;
+      }
+    }
+  } else {
+    assert(G1ConcRSLogCacheSize > 0, "Logic");
+    // Word-sized zero-fill of the byte array.
+    // NOTE(review): this clears (_n_card_counts / HeapWordSize) words; if
+    // _n_card_counts is not a multiple of HeapWordSize the trailing bytes
+    // stay uncleared -- confirm the table size is word-aligned.
+    Copy::fill_to_words((HeapWord*)(&_card_counts[0]),
+                        _n_card_counts / HeapWordSize);
+  }
+}
+
+// Print one row of the card-count histogram covering buckets [from, to),
+// accumulating the percentage columns into the cum_* out-params.
+void
+ConcurrentG1Refine::
+print_card_count_histo_range(unsigned* histo, int from, int to,
+                             float& cum_card_pct,
+                             float& cum_travs_pct) {
+  unsigned cards = 0;
+  unsigned travs = 0;
+  guarantee(to <= 256, "Precondition");
+  for (int i = from; i < to-1; i++) {
+    cards += histo[i];
+    travs += histo[i] * i;
+  }
+  if (to == 256) {
+    // Bucket 255 is saturated: its cards may have been traversed 255 or
+    // more times, so derive its traversal total from _total_travs minus
+    // the exact totals of buckets 1..254.
+    unsigned histo_trav_sum = 0;
+    for (int i = 1; i < 255; i++) {
+      histo_trav_sum += histo[i] * i;
+    }
+    cards += histo[255];
+    // correct traversals for the last one.
+    unsigned travs_255 = (unsigned) (_total_travs - histo_trav_sum);
+    travs += travs_255;
+  } else {
+    cards += histo[to-1];
+    travs += histo[to-1] * (to-1);
+  }
+  // Report per-collection-period averages; skip empty rows.
+  float fperiods = (float)_n_periods;
+  float f_tot_cards = (float)_total_cards/fperiods;
+  float f_tot_travs = (float)_total_travs/fperiods;
+  if (cards > 0) {
+    float fcards = (float)cards/fperiods;
+    float ftravs = (float)travs/fperiods;
+    if (to == 256) {
+      gclog_or_tty->print(" %4d-       %10.2f%10.2f", from, fcards, ftravs);
+    } else {
+      gclog_or_tty->print(" %4d-%4d   %10.2f%10.2f", from, to-1, fcards, ftravs);
+    }
+    float pct_cards = fcards*100.0/f_tot_cards;
+    cum_card_pct += pct_cards;
+    float pct_travs = ftravs*100.0/f_tot_travs;
+    cum_travs_pct += pct_travs;
+    gclog_or_tty->print_cr("%10.2f%10.2f%10.2f%10.2f",
+                  pct_cards, cum_card_pct,
+                  pct_travs, cum_travs_pct);
+  }
+}
+
+// Print summary statistics and the cumulative card-count histogram
+// accumulated over all collection periods.
+void ConcurrentG1Refine::print_final_card_counts() {
+  if (!G1ConcRSCountTraversals) return;
+
+  gclog_or_tty->print_cr("Did %d total traversals of %d distinct cards.",
+                _total_travs, _total_cards);
+  float fperiods = (float)_n_periods;
+  gclog_or_tty->print_cr("  This is an average of %8.2f traversals, %8.2f cards, "
+                "per collection.", (float)_total_travs/fperiods,
+                (float)_total_cards/fperiods);
+  gclog_or_tty->print_cr("  This is an average of %8.2f traversals/distinct "
+                "dirty card.\n",
+                _total_cards > 0 ?
+                (float)_total_travs/(float)_total_cards : 0.0);
+
+
+  gclog_or_tty->print_cr("Histogram:\n\n%10s   %10s%10s%10s%10s%10s%10s",
+                "range", "# cards", "# travs", "% cards", "(cum)",
+                "% travs", "(cum)");
+  gclog_or_tty->print_cr("------------------------------------------------------------"
+                "-------------");
+  float cum_cards_pct = 0.0;
+  float cum_travs_pct = 0.0;
+  // Single-bucket rows 1..9, then decades to 99, then coarse ranges.
+  for (int i = 1; i < 10; i++) {
+    print_card_count_histo_range(_cum_card_count_histo, i, i+1,
+                                 cum_cards_pct, cum_travs_pct);
+  }
+  for (int i = 10; i < 100; i += 10) {
+    print_card_count_histo_range(_cum_card_count_histo, i, i+10,
+                                 cum_cards_pct, cum_travs_pct);
+  }
+  print_card_count_histo_range(_cum_card_count_histo, 100, 150,
+                               cum_cards_pct, cum_travs_pct);
+  print_card_count_histo_range(_cum_card_count_histo, 150, 200,
+                               cum_cards_pct, cum_travs_pct);
+  // Was [150, 255), which double-counted buckets 150-199 (already covered
+  // by the [150, 200) row above) and skewed the cumulative percentages.
+  print_card_count_histo_range(_cum_card_count_histo, 200, 255,
+                               cum_cards_pct, cum_travs_pct);
+  print_card_count_histo_range(_cum_card_count_histo, 255, 256,
+                               cum_cards_pct, cum_travs_pct);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2001-2007 Sun Microsystems, Inc.  All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ */
+
+// Forward decl
+class ConcurrentG1RefineThread;
+class G1RemSet;
+
+// What to do after a yield:
+enum PostYieldAction {
+  PYA_continue,  // Continue the traversal
+  PYA_restart,   // Restart
+  PYA_cancel     // It's been completed by somebody else: cancel.
+};
+
+// Drives concurrent refinement of remembered sets: tracks per-card visit
+// counts, caches "hot" cards to avoid refining them repeatedly, and
+// (when G1ConcRefine is set) owns the refinement worker thread.
+class ConcurrentG1Refine {
+  ConcurrentG1RefineThread* _cg1rThread;
+
+  // Pending post-yield action; written by other threads, consumed
+  // (CAS-reset) by the refinement thread in get_pya().
+  volatile jint _pya;
+  PostYieldAction _last_pya;
+
+  static bool _enabled;  // Protected by G1ConcRefine_mon.
+  // Traversals performed since refinement was last enabled.
+  unsigned _traversals;
+
+  // Heuristic state for refine(): whether the next traversal is the first
+  // since enable(), and the number of cards processed during the last one.
+  unsigned _first_traversal;
+  unsigned _last_cards_during;
+
+  // The cache for card refinement.
+  bool     _use_cache;      // Current setting (see set_use_cache()).
+  bool     _def_use_cache;  // Default, set when G1ConcRSLogCacheSize > 0.
+  size_t _n_periods;        // Collection periods recorded in the stats.
+  size_t _total_cards;      // Distinct dirty cards seen.
+  size_t _total_travs;      // Total card refinements ("traversals").
+
+  unsigned char*  _card_counts;  // Per-card visit counts, saturating at 255.
+  unsigned _n_card_counts;       // Number of entries in _card_counts.
+  const jbyte* _ct_bot;          // Card-table entry for the heap bottom.
+  unsigned* _cur_card_count_histo;  // Histogram for the current period.
+  unsigned* _cum_card_count_histo;  // Cumulative histogram over all periods.
+  jbyte**  _hot_cache;     // Circular buffer of hot cards.
+  int      _hot_cache_size;
+  int      _n_hot;         // Number of live entries in _hot_cache.
+  int      _hot_cache_idx; // Next insertion slot.
+
+  // Returns the count of this card *before* incrementing it.
+  int add_card_count(jbyte* card_ptr);
+
+  void print_card_count_histo_range(unsigned* histo, int from, int to,
+                                    float& cum_card_pct,
+                                    float& cum_travs_pct);
+ public:
+  ConcurrentG1Refine();
+  ~ConcurrentG1Refine();
+
+  void init(); // Accomplish some initialization that has to wait.
+
+  // Enable conc refinement, waking up thread if necessary.
+  void enable();
+
+  // Returns the number of traversals performed since this refiner was enabled.
+  unsigned disable();
+
+  // Requires G1ConcRefine_mon to be held.
+  bool enabled() { return _enabled; }
+
+  // Returns only when G1 concurrent refinement has been enabled.
+  void wait_for_ConcurrentG1Refine_enabled();
+
+  // Do one concurrent refinement pass over the card table.  Returns "true"
+  // if heuristics determine that another pass should be done immediately.
+  bool refine();
+
+  // Indicate that an in-progress refinement pass should start over.
+  void set_pya_restart();
+  // Indicate that an in-progress refinement pass should quit.
+  void set_pya_cancel();
+
+  // Get the appropriate post-yield action.  Also sets last_pya.
+  PostYieldAction get_pya();
+
+  // The last PYA read by "get_pya"; reset to PYA_continue by that call.
+  PostYieldAction get_last_pya();
+
+  bool do_traversal();
+
+  ConcurrentG1RefineThread* cg1rThread() { return _cg1rThread; }
+
+  // If the card is not yet hot, returns it for immediate refinement.  If
+  // storing it causes an eviction, returns the evicted pointer.
+  // Otherwise, it's cached, and returns NULL.
+  jbyte* cache_insert(jbyte* card_ptr);
+
+  // Process the cached entries.
+  void clean_up_cache(int worker_i, G1RemSet* g1rs);
+
+  // Discard entries in the hot cache.
+  void clear_hot_cache() {
+    _hot_cache_idx = 0; _n_hot = 0;
+  }
+
+  bool hot_cache_is_empty() { return _n_hot == 0; }
+
+  // Turn the hot-card cache on/off (e.g. off during a pause); turning it
+  // on only takes effect if the cache was configured at init() time.
+  bool use_cache() { return _use_cache; }
+  void set_use_cache(bool b) {
+    if (b) _use_cache = _def_use_cache;
+    else   _use_cache = false;
+  }
+
+  void clear_and_record_card_counts();
+  void print_final_card_counts();
+};
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -0,0 +1,246 @@
+/*
+ * Copyright 2001-2007 Sun Microsystems, Inc.  All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ */
+
+#include "incls/_precompiled.incl"
+#include "incls/_concurrentG1RefineThread.cpp.incl"
+
+// ======= Concurrent G1 Refinement Thread ========
+
+// This thread is created when the G1 garbage collector is used
+
+// Create and immediately start the refinement worker thread.
+ConcurrentG1RefineThread::
+ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r) :
+  ConcurrentGCThread(),
+  _cg1r(cg1r),
+  _started(false),
+  _in_progress(false),
+  _do_traversal(false),
+  _vtime_accum(0.0),
+  _co_tracker(G1CRGroup),
+  _interval_ms(5.0)
+{
+  create_and_start();
+}
+
+// How long to sleep between passes when no immediate follow-up is needed.
+const long timeout = 200; // ms.
+
+// Traversal-based refinement main loop: while refinement is enabled, do
+// card-table passes (inside the suspendible thread set), sleeping between
+// passes when refine() reports no immediate follow-up is warranted.
+void ConcurrentG1RefineThread::traversalBasedRefinement() {
+  _cg1r->wait_for_ConcurrentG1Refine_enabled();
+  MutexLocker x(G1ConcRefine_mon);
+  while (_cg1r->enabled()) {
+    // Drop the monitor while doing the pass so enable()/disable() can run.
+    MutexUnlocker ux(G1ConcRefine_mon);
+    ResourceMark rm;
+    HandleMark   hm;
+
+    if (TraceG1Refine) gclog_or_tty->print_cr("G1-Refine starting pass");
+    _sts.join();
+    bool no_sleep = _cg1r->refine();
+    _sts.leave();
+    if (!no_sleep) {
+      MutexLockerEx x(CGC_lock, Mutex::_no_safepoint_check_flag);
+      // We do this only for the timeout; we don't expect this to be signalled.
+      CGC_lock->wait(Mutex::_no_safepoint_check_flag, timeout);
+    }
+  }
+}
+
+// Queue-based refinement: wait for completed dirty-card log buffers (or a
+// requested full traversal, or termination), then process them -- when
+// G1SmoothConcRefine is set, throttling itself by adapting a sleep
+// interval to the rate at which buffers are being produced.
+void ConcurrentG1RefineThread::queueBasedRefinement() {
+  DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
+  // Wait for completed log buffers to exist.
+  {
+    MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag);
+    while (!_do_traversal && !dcqs.process_completed_buffers() &&
+           !_should_terminate) {
+      DirtyCardQ_CBL_mon->wait(Mutex::_no_safepoint_check_flag);
+    }
+  }
+
+  if (_should_terminate) {
+    return;
+  }
+
+  // Now we take them off (this doesn't hold locks while it applies
+  // closures.)  (If we did a full collection, then we'll do a full
+  // traversal.
+  _sts.join();
+  if (_do_traversal) {
+    (void)_cg1r->refine();
+    switch (_cg1r->get_last_pya()) {
+    case PYA_cancel: case PYA_continue:
+      // Continue was caught and handled inside "refine".  If it's still
+      // "continue" when we get here, we're done.
+      _do_traversal = false;
+      break;
+    case PYA_restart:
+      // Leave _do_traversal set so the next iteration re-runs the pass.
+      assert(_do_traversal, "Because of Full GC.");
+      break;
+    }
+  } else {
+    int n_logs = 0;
+    int lower_limit = 0;
+    double start_vtime_sec; // only used when G1SmoothConcRefine is on
+    int prev_buffer_num; // only used when G1SmoothConcRefine is on
+
+    if (G1SmoothConcRefine) {
+      lower_limit = 0;
+      start_vtime_sec = os::elapsedVTime();
+      prev_buffer_num = (int) dcqs.completed_buffers_num();
+    } else {
+      lower_limit = DCQBarrierProcessCompletedThreshold / 4; // For now.
+    }
+    while (dcqs.apply_closure_to_completed_buffer(0, lower_limit)) {
+      double end_vtime_sec;
+      double elapsed_vtime_sec;
+      int elapsed_vtime_ms;
+      int curr_buffer_num;
+
+      if (G1SmoothConcRefine) {
+        end_vtime_sec = os::elapsedVTime();
+        elapsed_vtime_sec = end_vtime_sec - start_vtime_sec;
+        elapsed_vtime_ms = (int) (elapsed_vtime_sec * 1000.0);
+        curr_buffer_num = (int) dcqs.completed_buffers_num();
+
+        // Falling behind (buffers accumulating, or over threshold): sleep
+        // less between buffers.  Catching up: sleep more.
+        if (curr_buffer_num > prev_buffer_num ||
+            curr_buffer_num > DCQBarrierProcessCompletedThreshold) {
+          decreaseInterval(elapsed_vtime_ms);
+        } else if (curr_buffer_num < prev_buffer_num) {
+          increaseInterval(elapsed_vtime_ms);
+        }
+      }
+
+      sample_young_list_rs_lengths();
+      _co_tracker.update(false);
+
+      if (G1SmoothConcRefine) {
+        start_vtime_sec = os::elapsedVTime();
+        prev_buffer_num = curr_buffer_num;
+
+        // Sleep outside the suspendible set so a safepoint isn't held up.
+        _sts.leave();
+        os::sleep(Thread::current(), (jlong) _interval_ms, false);
+        _sts.join();
+      }
+
+      n_logs++;
+    }
+    // Make sure we harvest the PYA, if any.
+    (void)_cg1r->get_pya();
+  }
+  _sts.leave();
+}
+
+// Sample the remembered-set lengths of the young regions so the policy
+// can adapt the young list size.  Checks for a pending yield every 10
+// regions and abandons the iteration if one is needed.
+void ConcurrentG1RefineThread::sample_young_list_rs_lengths() {
+  G1CollectedHeap* g1h = G1CollectedHeap::heap();
+  G1CollectorPolicy* g1p = g1h->g1_policy();
+  if (g1p->adaptive_young_list_length()) {
+    int regions_visited = 0;
+
+    g1h->young_list_rs_length_sampling_init();
+    while (g1h->young_list_rs_length_sampling_more()) {
+      g1h->young_list_rs_length_sampling_next();
+      ++regions_visited;
+
+      // we try to yield every time we visit 10 regions
+      if (regions_visited == 10) {
+        if (_sts.should_yield()) {
+          _sts.yield("G1 refine");
+          // we just abandon the iteration
+          break;
+        }
+        regions_visited = 0;
+      }
+    }
+
+    g1p->check_prediction_validity();
+  }
+}
+
+// Thread entry point: alternate refinement work with bookkeeping until
+// stop() sets _should_terminate, then shut down cleanly.
+void ConcurrentG1RefineThread::run() {
+  initialize_in_thread();
+  _vtime_start = os::elapsedVTime();
+  wait_for_universe_init();
+
+  _co_tracker.enable();
+  _co_tracker.start();
+
+  while (!_should_terminate) {
+    // wait until started is set.
+    if (G1RSBarrierUseQueue) {
+      queueBasedRefinement();
+    } else {
+      traversalBasedRefinement();
+    }
+    _sts.join();
+    _co_tracker.update();
+    _sts.leave();
+    // Track virtual time consumed, when the OS can report it.
+    if (os::supports_vtime()) {
+      _vtime_accum = (os::elapsedVTime() - _vtime_start);
+    } else {
+      _vtime_accum = 0.0;
+    }
+  }
+  // Final bookkeeping update before terminating.
+  _sts.join();
+  _co_tracker.update(true);
+  _sts.leave();
+  assert(_should_terminate, "just checking");
+
+  terminate();
+}
+
+
+// Yield the suspendible-thread-set claim so a safepoint can proceed.
+void ConcurrentG1RefineThread::yield() {
+  if (TraceG1Refine) gclog_or_tty->print_cr("G1-Refine-yield");
+  _sts.yield("G1 refine");
+  if (TraceG1Refine) gclog_or_tty->print_cr("G1-Refine-yield-end");
+}
+
+// Shut the thread down: set the terminate flag, wake the thread if it is
+// waiting on the completed-buffer-list monitor, then wait for it to
+// finish terminating.
+void ConcurrentG1RefineThread::stop() {
+  // it is ok to take late safepoints here, if needed
+  {
+    MutexLockerEx mu(Terminator_lock);
+    _should_terminate = true;
+  }
+
+  {
+    MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag);
+    DirtyCardQ_CBL_mon->notify_all();
+  }
+
+  {
+    MutexLockerEx mu(Terminator_lock);
+    while (!_has_terminated) {
+      Terminator_lock->wait();
+    }
+  }
+  if (TraceG1Refine) gclog_or_tty->print_cr("G1-Refine-stop");
+}
+
+// Print a one-line description of this thread to the GC log.
+void ConcurrentG1RefineThread::print() {
+  gclog_or_tty->print("\"Concurrent G1 Refinement Thread\" ");
+  Thread::print();
+  gclog_or_tty->cr();
+}
+
+// Request (or clear a request for) a full card-table traversal.
+void ConcurrentG1RefineThread::set_do_traversal(bool b) {
+  _do_traversal = b;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.hpp	Wed Jul 05 16:43:15 2017 +0200
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2001-2007 Sun Microsystems, Inc.  All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ */
+
+// Forward Decl.
+class ConcurrentG1Refine;
+
+// The G1 Concurrent Refinement Thread (could be several in the future).
+
+class ConcurrentG1RefineThread: public ConcurrentGCThread {
+  friend class VMStructs;
+  friend class G1CollectedHeap;
+
+  double _vtime_start;  // Initial virtual time.
+  double _vtime_accum;  // Virtual time accumulated since _vtime_start.
+
+ public:
+  virtual void run();
+
+ private:
+  ConcurrentG1Refine*              _cg1r;
+  bool                             _started;
+  bool                             _in_progress;
+  volatile bool                    _restart;
+
+  COTracker                        _co_tracker;
+  // Sleep interval (ms) between buffers when G1SmoothConcRefine is on.
+  double                           _interval_ms;
+
+  // True when a full card-table traversal has been requested.
+  bool                             _do_traversal;
+
+  // Shrink the smoothing interval (we are falling behind), but never
+  // below the time the last buffer took to process.
+  void decreaseInterval(int processing_time_ms) {
+    double min_interval_ms = (double) processing_time_ms;
+    _interval_ms = 0.8 * _interval_ms;
+    if (_interval_ms < min_interval_ms)
+      _interval_ms = min_interval_ms;
+  }
+  // Grow the smoothing interval (we are keeping up), capped at 9x the
+  // last processing time.
+  void increaseInterval(int processing_time_ms) {
+    double max_interval_ms = 9.0 * (double) processing_time_ms;
+    _interval_ms = 1.1 * _interval_ms;
+    if (max_interval_ms > 0 && _interval_ms > max_interval_ms)
+      _interval_ms = max_interval_ms;
+  }
+
+  void sleepBeforeNextCycle();
+
+  void traversalBasedRefinement();
+
+  void queueBasedRefinement();
+
+  // For use by G1CollectedHeap, which is a friend.
+  static SuspendibleThreadSet* sts() { return &_sts; }
+
+ public:
+  // Constructor
+  ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r);
+
+  // Printing
+  void print();
+
+  // Total virtual time so far.
+  double vtime_accum() { return _vtime_accum; }
+
+  ConcurrentG1Refine* cg1r()                     { return _cg1r;     }
+
+
+  void            set_started()                  { _started = true;   }
+  void            clear_started()                { _started = false;  }
+  bool            started()                      { return _started;   }
+
+  void            set_in_progress()              { _in_progress = true;   }
+  void            clear_in_progress()            { _in_progress = false;  }
+  bool            in_progress()                  { return _in_progress;   }
+
+  void            set_do_traversal(bool b);
+  bool            do_traversal() { return _do_traversal; }
+
+  void            sample_young_list_rs_lengths();
+
+  // Yield for GC
+  void            yield();
+
+  // shutdown
+  // NOTE(review): declared static, but the definition waits on the
+  // terminator flags -- confirm those are static in ConcurrentGCThread.
+  static void stop();
+};
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Wed Jul 05 16:43:15 2017 +0200
@@ -0,0 +1,3979 @@
+/*
+ * Copyright 2001-2007 Sun Microsystems, Inc.  All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ */
+
+#include "incls/_precompiled.incl"
+#include "incls/_concurrentMark.cpp.incl"
+
+//
+// CMS Bit Map Wrapper
+
+CMBitMapRO::CMBitMapRO(ReservedSpace rs, int shifter):
+  _bm((uintptr_t*)NULL,0),
+  _shifter(shifter) {
+  _bmStartWord = (HeapWord*)(rs.base());
+  _bmWordSize  = rs.size()/HeapWordSize;    // rs.size() is in bytes
+  ReservedSpace brs(ReservedSpace::allocation_align_size_up(
+                     (_bmWordSize >> (_shifter + LogBitsPerByte)) + 1));
+
+  guarantee(brs.is_reserved(), "couldn't allocate CMS bit map");
+  // For now we'll just commit all of the bit map up front.
+  // Later on we'll try to be more parsimonious with swap.
+  guarantee(_virtual_space.initialize(brs, brs.size()),
+            "couldn't reseve backing store for CMS bit map");
+  assert(_virtual_space.committed_size() == brs.size(),
+         "didn't reserve backing store for all of CMS bit map?");
+  _bm.set_map((uintptr_t*)_virtual_space.low());
+  assert(_virtual_space.committed_size() << (_shifter + LogBitsPerByte) >=
+         _bmWordSize, "inconsistency in bit map sizing");
+  _bm.set_size(_bmWordSize >> _shifter);
+}
+
+HeapWord* CMBitMapRO::getNextMarkedWordAddress(HeapWord* addr,
+                                               HeapWord* limit) const {
+  // First we must round addr *up* to a possible object boundary.
+  addr = (HeapWord*)align_size_up((intptr_t)addr,
+                                  HeapWordSize << _shifter);
+  size_t addrOffset = heapWordToOffset(addr);
+  if (limit == NULL) limit = _bmStartWord + _bmWordSize;
+  size_t limitOffset = heapWordToOffset(limit);
+  size_t nextOffset = _bm.get_next_one_offset(addrOffset, limitOffset);
+  HeapWord* nextAddr = offsetToHeapWord(nextOffset);
+  assert(nextAddr >= addr, "get_next_one postcondition");
+  assert(nextAddr == limit || isMarked(nextAddr),
+         "get_next_one postcondition");
+  return nextAddr;
+}
+
+HeapWord* CMBitMapRO::getNextUnmarkedWordAddress(HeapWord* addr,
+                                                 HeapWord* limit) const {
+  size_t addrOffset = heapWordToOffset(addr);
+  if (limit == NULL) limit = _bmStartWord + _bmWordSize;
+  size_t limitOffset = heapWordToOffset(limit);
+  size_t nextOffset = _bm.get_next_zero_offset(addrOffset, limitOffset);
+  HeapWord* nextAddr = offsetToHeapWord(nextOffset);
+  assert(nextAddr >= addr, "get_next_one postcondition");
+  assert(nextAddr == limit || !isMarked(nextAddr),
+         "get_next_one postcondition");
+  return nextAddr;
+}
+
+int CMBitMapRO::heapWordDiffToOffsetDiff(size_t diff) const {
+  assert((diff & ((1 << _shifter) - 1)) == 0, "argument check");
+  return (int) (diff >> _shifter);
+}
+
+bool CMBitMapRO::iterate(BitMapClosure* cl, MemRegion mr) {
+  HeapWord* left  = MAX2(_bmStartWord, mr.start());
+  HeapWord* right = MIN2(_bmStartWord + _bmWordSize, mr.end());
+  if (right > left) {
+    // Right-open interval [leftOffset, rightOffset).
+    return _bm.iterate(cl, heapWordToOffset(left), heapWordToOffset(right));
+  } else {
+    return true;
+  }
+}
+
+void CMBitMapRO::mostly_disjoint_range_union(BitMap*   from_bitmap,
+                                             size_t    from_start_index,
+                                             HeapWord* to_start_word,
+                                             size_t    word_num) {
+  _bm.mostly_disjoint_range_union(from_bitmap,
+                                  from_start_index,
+                                  heapWordToOffset(to_start_word),
+                                  word_num);
+}
+
+#ifndef PRODUCT
+bool CMBitMapRO::covers(ReservedSpace rs) const {
+  // assert(_bm.map() == _virtual_space.low(), "map inconsistency");
+  assert(((size_t)_bm.size() * (1 << _shifter)) == _bmWordSize,
+         "size inconsistency");
+  return _bmStartWord == (HeapWord*)(rs.base()) &&
+         _bmWordSize  == rs.size()>>LogHeapWordSize;
+}
+#endif
+
+void CMBitMap::clearAll() {
+  _bm.clear();
+  return;
+}
+
+void CMBitMap::markRange(MemRegion mr) {
+  mr.intersection(MemRegion(_bmStartWord, _bmWordSize));
+  assert(!mr.is_empty(), "unexpected empty region");
+  assert((offsetToHeapWord(heapWordToOffset(mr.end())) ==
+          ((HeapWord *) mr.end())),
+         "markRange memory region end is not card aligned");
+  // convert address range into offset range
+  _bm.at_put_range(heapWordToOffset(mr.start()),
+                   heapWordToOffset(mr.end()), true);
+}
+
+void CMBitMap::clearRange(MemRegion mr) {
+  mr.intersection(MemRegion(_bmStartWord, _bmWordSize));
+  assert(!mr.is_empty(), "unexpected empty region");
+  // convert address range into offset range
+  _bm.at_put_range(heapWordToOffset(mr.start()),
+                   heapWordToOffset(mr.end()), false);
+}
+
+MemRegion CMBitMap::getAndClearMarkedRegion(HeapWord* addr,
+                                            HeapWord* end_addr) {
+  HeapWord* start = getNextMarkedWordAddress(addr);
+  start = MIN2(start, end_addr);
+  HeapWord* end   = getNextUnmarkedWordAddress(start);
+  end = MIN2(end, end_addr);
+  assert(start <= end, "Consistency check");
+  MemRegion mr(start, end);
+  if (!mr.is_empty()) {
+    clearRange(mr);
+  }
+  return mr;
+}
+
+CMMarkStack::CMMarkStack(ConcurrentMark* cm) :
+  _base(NULL), _cm(cm)
+#ifdef ASSERT
+  , _drain_in_progress(false)
+  , _drain_in_progress_yields(false)
+#endif
+{}
+
+void CMMarkStack::allocate(size_t size) {
+  _base = NEW_C_HEAP_ARRAY(oop, size);
+  if (_base == NULL)
+    vm_exit_during_initialization("Failed to allocate "
+                                  "CM region mark stack");
+  _index = 0;
+  // QQQQ cast ...
+  _capacity = (jint) size;
+  _oops_do_bound = -1;
+  NOT_PRODUCT(_max_depth = 0);
+}
+
+CMMarkStack::~CMMarkStack() {
+  if (_base != NULL) FREE_C_HEAP_ARRAY(oop, _base);
+}
+
+void CMMarkStack::par_push(oop ptr) {
+  while (true) {
+    if (isFull()) {
+      _overflow = true;
+      return;
+    }
+    // Otherwise...
+    jint index = _index;
+    jint next_index = index+1;
+    jint res = Atomic::cmpxchg(next_index, &_index, index);
+    if (res == index) {
+      _base[index] = ptr;
+      // Note that we don't maintain this atomically.  We could, but it
+      // doesn't seem necessary.
+      NOT_PRODUCT(_max_depth = MAX2(_max_depth, next_index));
+      return;
+    }
+    // Otherwise, we need to try again.
+  }
+}
+
+void CMMarkStack::par_adjoin_arr(oop* ptr_arr, int n) {
+  while (true) {
+    if (isFull()) {
+      _overflow = true;
+      return;
+    }
+    // Otherwise...
+    jint index = _index;
+    jint next_index = index + n;
+    if (next_index > _capacity) {
+      _overflow = true;
+      return;
+    }
+    jint res = Atomic::cmpxchg(next_index, &_index, index);
+    if (res == index) {
+      for (int i = 0; i < n; i++) {
+        int ind = index + i;
+        assert(ind < _capacity, "By overflow test above.");
+        _base[ind] = ptr_arr[i];
+      }
+      NOT_PRODUCT(_max_depth = MAX2(_max_depth, next_index));
+      return;
+    }
+    // Otherwise, we need to try again.
+  }
+}
+
+
+void CMMarkStack::par_push_arr(oop* ptr_arr, int n) {
+  MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag);
+  jint start = _index;
+  jint next_index = start + n;
+  if (next_index > _capacity) {
+    _overflow = true;
+    return;
+  }
+  // Otherwise.
+  _index = next_index;
+  for (int i = 0; i < n; i++) {
+    int ind = start + i;
+    guarantee(ind < _capacity, "By overflow test above.");
+    _base[ind] = ptr_arr[i];
+  }
+}
+
+
+bool CMMarkStack::par_pop_arr(oop* ptr_arr, int max, int* n) {
+  MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag);
+  jint index = _index;
+  if (index == 0) {
+    *n = 0;
+    return false;
+  } else {
+    int k = MIN2(max, index);
+    jint new_ind = index - k;
+    for (int j = 0; j < k; j++) {
+      ptr_arr[j] = _base[new_ind + j];
+    }
+    _index = new_ind;
+    *n = k;
+    return true;
+  }
+}
+
+
+CMRegionStack::CMRegionStack() : _base(NULL) {}
+
+void CMRegionStack::allocate(size_t size) {
+  _base = NEW_C_HEAP_ARRAY(MemRegion, size);
+  if (_base == NULL)
+    vm_exit_during_initialization("Failed to allocate "
+                                  "CM region mark stack");
+  _index = 0;
+  // QQQQ cast ...
+  _capacity = (jint) size;
+}
+
+CMRegionStack::~CMRegionStack() {
+  if (_base != NULL) FREE_C_HEAP_ARRAY(oop, _base);
+}
+
+void CMRegionStack::push(MemRegion mr) {
+  assert(mr.word_size() > 0, "Precondition");
+  while (true) {
+    if (isFull()) {
+      _overflow = true;
+      return;
+    }
+    // Otherwise...
+    jint index = _index;
+    jint next_index = index+1;
+    jint res = Atomic::cmpxchg(next_index, &_index, index);
+    if (res == index) {
+      _base[index] = mr;
+      return;
+    }
+    // Otherwise, we need to try again.
+  }
+}
+
+MemRegion CMRegionStack::pop() {
+  while (true) {
+    // Otherwise...
+    jint index = _index;
+
+    if (index == 0) {
+      return MemRegion();
+    }
+    jint next_index = index-1;
+    jint res = Atomic::cmpxchg(next_index, &_index, index);
+    if (res == index) {
+      MemRegion mr = _base[next_index];
+      if (mr.start() != NULL) {
+        tmp_guarantee_CM( mr.end() != NULL, "invariant" );
+        tmp_guarantee_CM( mr.word_size() > 0, "invariant" );
+        return mr;
+      } else {
+        // that entry was invalidated... let's skip it
+        tmp_guarantee_CM( mr.end() == NULL, "invariant" );
+      }
+    }
+    // Otherwise, we need to try again.
+  }
+}
+
+bool CMRegionStack::invalidate_entries_into_cset() {
+  bool result = false;
+  G1CollectedHeap* g1h = G1CollectedHeap::heap();
+  for (int i = 0; i < _oops_do_bound; ++i) {
+    MemRegion mr = _base[i];
+    if (mr.start() != NULL) {
+      tmp_guarantee_CM( mr.end() != NULL, "invariant");
+      tmp_guarantee_CM( mr.word_size() > 0, "invariant" );
+      HeapRegion* hr = g1h->heap_region_containing(mr.start());
+      tmp_guarantee_CM( hr != NULL, "invariant" );
+      if (hr->in_collection_set()) {
+        // The region points into the collection set
+        _base[i] = MemRegion();
+        result = true;
+      }
+    } else {
+      // that entry was invalidated... let's skip it
+      tmp_guarantee_CM( mr.end() == NULL, "invariant" );
+    }
+  }
+  return result;
+}
+
+template<class OopClosureClass>
+bool CMMarkStack::drain(OopClosureClass* cl, CMBitMap* bm, bool yield_after) {
+  assert(!_drain_in_progress || !_drain_in_progress_yields || yield_after
+         || SafepointSynchronize::is_at_safepoint(),
+         "Drain recursion must be yield-safe.");
+  bool res = true;
+  debug_only(_drain_in_progress = true);
+  debug_only(_drain_in_progress_yields = yield_after);
+  while (!isEmpty()) {
+    oop newOop = pop();
+    assert(G1CollectedHeap::heap()->is_in_reserved(newOop), "Bad pop");
+    assert(newOop->is_oop(), "Expected an oop");
+    assert(bm == NULL || bm->isMarked((HeapWord*)newOop),
+           "only grey objects on this stack");
+    // iterate over the oops in this oop, marking and pushing
+    // the ones in CMS generation.
+    newOop->oop_iterate(cl);
+    if (yield_after && _cm->do_yield_check()) {
+      res = false; break;
+    }
+  }
+  debug_only(_drain_in_progress = false);
+  return res;
+}
+
+void CMMarkStack::oops_do(OopClosure* f) {
+  if (_index == 0) return;
+  assert(_oops_do_bound != -1 && _oops_do_bound <= _index,
+         "Bound must be set.");
+  for (int i = 0; i < _oops_do_bound; i++) {
+    f->do_oop(&_base[i]);
+  }
+  _oops_do_bound = -1;
+}
+
+bool ConcurrentMark::not_yet_marked(oop obj) const {
+  return (_g1h->is_obj_ill(obj)
+          || (_g1h->is_in_permanent(obj)
+              && !nextMarkBitMap()->isMarked((HeapWord*)obj)));
+}
+
+#ifdef _MSC_VER // the use of 'this' below gets a warning, make it go away
+#pragma warning( disable:4355 ) // 'this' : used in base member initializer list
+#endif // _MSC_VER
+
+ConcurrentMark::ConcurrentMark(ReservedSpace rs,
+                               int max_regions) :
+  _markBitMap1(rs, MinObjAlignment - 1),
+  _markBitMap2(rs, MinObjAlignment - 1),
+
+  _parallel_marking_threads(0),
+  _sleep_factor(0.0),
+  _marking_task_overhead(1.0),
+  _cleanup_sleep_factor(0.0),
+  _cleanup_task_overhead(1.0),
+  _region_bm(max_regions, false /* in_resource_area*/),
+  _card_bm((rs.size() + CardTableModRefBS::card_size - 1) >>
+           CardTableModRefBS::card_shift,
+           false /* in_resource_area*/),
+  _prevMarkBitMap(&_markBitMap1),
+  _nextMarkBitMap(&_markBitMap2),
+  _at_least_one_mark_complete(false),
+
+  _markStack(this),
+  _regionStack(),
+  // _finger set in set_non_marking_state
+
+  _max_task_num(MAX2(ParallelGCThreads, (size_t)1)),
+  // _active_tasks set in set_non_marking_state
+  // _tasks set inside the constructor
+  _task_queues(new CMTaskQueueSet((int) _max_task_num)),
+  _terminator(ParallelTaskTerminator((int) _max_task_num, _task_queues)),
+
+  _has_overflown(false),
+  _concurrent(false),
+
+  // _verbose_level set below
+
+  _init_times(),
+  _remark_times(), _remark_mark_times(), _remark_weak_ref_times(),
+  _cleanup_times(),
+  _total_counting_time(0.0),
+  _total_rs_scrub_time(0.0),
+
+  _parallel_workers(NULL),
+  _cleanup_co_tracker(G1CLGroup)
+{
+  CMVerboseLevel verbose_level =
+    (CMVerboseLevel) G1MarkingVerboseLevel;
+  if (verbose_level < no_verbose)
+    verbose_level = no_verbose;
+  if (verbose_level > high_verbose)
+    verbose_level = high_verbose;
+  _verbose_level = verbose_level;
+
+  if (verbose_low())
+    gclog_or_tty->print_cr("[global] init, heap start = "PTR_FORMAT", "
+                           "heap end = "PTR_FORMAT, _heap_start, _heap_end);
+
+  _markStack.allocate(G1CMStackSize);
+  _regionStack.allocate(G1CMRegionStackSize);
+
+  // Create & start a ConcurrentMark thread.
+  if (G1ConcMark) {
+    _cmThread = new ConcurrentMarkThread(this);
+    assert(cmThread() != NULL, "CM Thread should have been created");
+    assert(cmThread()->cm() != NULL, "CM Thread should refer to this cm");
+  } else {
+    _cmThread = NULL;
+  }
+  _g1h = G1CollectedHeap::heap();
+  assert(CGC_lock != NULL, "Where's the CGC_lock?");
+  assert(_markBitMap1.covers(rs), "_markBitMap1 inconsistency");
+  assert(_markBitMap2.covers(rs), "_markBitMap2 inconsistency");
+
+  SATBMarkQueueSet& satb_qs = JavaThread::satb_mark_queue_set();
+  satb_qs.set_buffer_size(G1SATBLogBufferSize);
+
+  int size = (int) MAX2(ParallelGCThreads, (size_t)1);
+  _par_cleanup_thread_state = NEW_C_HEAP_ARRAY(ParCleanupThreadState*, size);
+  for (int i = 0 ; i < size; i++) {
+    _par_cleanup_thread_state[i] = new ParCleanupThreadState;
+  }
+
+  _tasks = NEW_C_HEAP_ARRAY(CMTask*, _max_task_num);
+  _accum_task_vtime = NEW_C_HEAP_ARRAY(double, _max_task_num);
+
+  // so that the assertion in MarkingTaskQueue::task_queue doesn't fail
+  _active_tasks = _max_task_num;
+  for (int i = 0; i < (int) _max_task_num; ++i) {
+    CMTaskQueue* task_queue = new CMTaskQueue();
+    task_queue->initialize();
+    _task_queues->register_queue(i, task_queue);
+
+    _tasks[i] = new CMTask(i, this, task_queue, _task_queues);
+    _accum_task_vtime[i] = 0.0;
+  }
+
+  if (ParallelMarkingThreads > ParallelGCThreads) {
+    vm_exit_during_initialization("Can't have more ParallelMarkingThreads "
+                                  "than ParallelGCThreads.");
+  }
+  if (ParallelGCThreads == 0) {
+    // if we are not running with any parallel GC threads we will not
+    // spawn any marking threads either
+    _parallel_marking_threads =   0;
+    _sleep_factor             = 0.0;
+    _marking_task_overhead    = 1.0;
+  } else {
+    if (ParallelMarkingThreads > 0) {
+      // notice that ParallelMarkingThreads overwrites G1MarkingOverheadPerc
+      // if both are set
+
+      _parallel_marking_threads = ParallelMarkingThreads;
+      _sleep_factor             = 0.0;
+      _marking_task_overhead    = 1.0;
+    } else if (G1MarkingOverheadPerc > 0) {
+      // we will calculate the number of parallel marking threads
+      // based on a target overhead with respect to the soft real-time
+      // goal
+
+      double marking_overhead = (double) G1MarkingOverheadPerc / 100.0;
+      double overall_cm_overhead =
+        (double) G1MaxPauseTimeMS * marking_overhead / (double) G1TimeSliceMS;
+      double cpu_ratio = 1.0 / (double) os::processor_count();
+      double marking_thread_num = ceil(overall_cm_overhead / cpu_ratio);
+      double marking_task_overhead =
+        overall_cm_overhead / marking_thread_num *
+                                                (double) os::processor_count();
+      double sleep_factor =
+                         (1.0 - marking_task_overhead) / marking_task_overhead;
+
+      _parallel_marking_threads = (size_t) marking_thread_num;
+      _sleep_factor             = sleep_factor;
+      _marking_task_overhead    = marking_task_overhead;
+    } else {
+      _parallel_marking_threads = MAX2((ParallelGCThreads + 2) / 4, (size_t)1);
+      _sleep_factor             = 0.0;
+      _marking_task_overhead    = 1.0;
+    }
+
+    if (parallel_marking_threads() > 1)
+      _cleanup_task_overhead = 1.0;
+    else
+      _cleanup_task_overhead = marking_task_overhead();
+    _cleanup_sleep_factor =
+                     (1.0 - cleanup_task_overhead()) / cleanup_task_overhead();
+
+#if 0
+    gclog_or_tty->print_cr("Marking Threads          %d", parallel_marking_threads());
+    gclog_or_tty->print_cr("CM Marking Task Overhead %1.4lf", marking_task_overhead());
+    gclog_or_tty->print_cr("CM Sleep Factor          %1.4lf", sleep_factor());
+    gclog_or_tty->print_cr("CL Marking Task Overhead %1.4lf", cleanup_task_overhead());
+    gclog_or_tty->print_cr("CL Sleep Factor          %1.4lf", cleanup_sleep_factor());
+#endif
+
+    guarantee( parallel_marking_threads() > 0, "peace of mind" );
+    _parallel_workers = new WorkGang("Parallel Marking Threads",
+                                     (int) parallel_marking_threads(), false, true);
+    if (_parallel_workers == NULL)
+      vm_exit_during_initialization("Failed necessary allocation.");
+  }
+
+  // so that the call below can read a sensible value
+  _heap_start = (HeapWord*) rs.base();
+  set_non_marking_state();
+}
+
+void ConcurrentMark::update_g1_committed(bool force) {
+  // If concurrent marking is not in progress, then we do not need to
+  // update _heap_end. This has a subtle and important
+  // side-effect. Imagine that two evacuation pauses happen between
+  // marking completion and remark. The first one can grow the
+  // heap (hence now the finger is below the heap end). Then, the
+  // second one could unnecessarily push regions on the region
+  // stack. This causes the invariant that the region stack is empty
+  // at the beginning of remark to be false. By ensuring that we do
+  // not observe heap expansions after marking is complete, then we do
+  // not have this problem.
+  if (!concurrent_marking_in_progress() && !force)
+    return;
+
+  MemRegion committed = _g1h->g1_committed();
+  tmp_guarantee_CM( committed.start() == _heap_start,
+                    "start shouldn't change" );
+  HeapWord* new_end = committed.end();
+  if (new_end > _heap_end) {
+    // The heap has been expanded.
+
+    _heap_end = new_end;
+  }
+  // Notice that the heap can also shrink. However, this only happens
+  // during a Full GC (at least currently) and the entire marking
+  // phase will bail out and the task will not be restarted. So, let's
+  // do nothing.
+}
+
+void ConcurrentMark::reset() {
+  // Starting values for these two. This should be called in a STW
+  // phase. CM will be notified of any future g1_committed expansions;
+  // these will be at the end of evacuation pauses, when tasks are
+  // inactive.
+  MemRegion committed = _g1h->g1_committed();
+  _heap_start = committed.start();
+  _heap_end   = committed.end();
+
+  guarantee( _heap_start != NULL &&
+             _heap_end != NULL   &&
+             _heap_start < _heap_end, "heap bounds should look ok" );
+
+  // reset all the marking data structures and any necessary flags
+  clear_marking_state();
+
+  if (verbose_low())
+    gclog_or_tty->print_cr("[global] resetting");
+
+  // We do reset all of them, since different phases will use
+  // different number of active threads. So, it's easiest to have all
+  // of them ready.
+  for (int i = 0; i < (int) _max_task_num; ++i)
+    _tasks[i]->reset(_nextMarkBitMap);
+
+  // we need this to make sure that the flag is on during the evac
+  // pause with initial mark piggy-backed
+  set_concurrent_marking_in_progress();
+}
+
+void ConcurrentMark::set_phase(size_t active_tasks, bool concurrent) {
+  guarantee( active_tasks <= _max_task_num, "we should not have more" );
+
+  _active_tasks = active_tasks;
+  // Need to update the three data structures below according to the
+  // number of active threads for this phase.
+  _terminator   = ParallelTaskTerminator((int) active_tasks, _task_queues);
+  _first_overflow_barrier_sync.set_n_workers((int) active_tasks);
+  _second_overflow_barrier_sync.set_n_workers((int) active_tasks);
+
+  _concurrent = concurrent;
+  // We propagate this to all tasks, not just the active ones.
+  for (int i = 0; i < (int) _max_task_num; ++i)
+    _tasks[i]->set_concurrent(concurrent);
+
+  if (concurrent) {
+    set_concurrent_marking_in_progress();
+  } else {
+    // We currently assume that the concurrent flag has been set to
+    // false before we start remark. At this point we should also be
+    // in a STW phase.
+    guarantee( !concurrent_marking_in_progress(), "invariant" );
+    guarantee( _finger == _heap_end, "only way to get here" );
+    update_g1_committed(true);
+  }
+}
+
+void ConcurrentMark::set_non_marking_state() {
+  // We set the global marking state to some default values when we're
+  // not doing marking.
+  clear_marking_state();
+  _active_tasks = 0;
+  clear_concurrent_marking_in_progress();
+}
+
+ConcurrentMark::~ConcurrentMark() {
+  int size = (int) MAX2(ParallelGCThreads, (size_t)1);
+  for (int i = 0; i < size; i++) delete _par_cleanup_thread_state[i];
+  FREE_C_HEAP_ARRAY(ParCleanupThreadState*,
+                    _par_cleanup_thread_state);
+
+  for (int i = 0; i < (int) _max_task_num; ++i) {
+    delete _task_queues->queue(i);
+    delete _tasks[i];
+  }
+  delete _task_queues;
+  FREE_C_HEAP_ARRAY(CMTask*, _max_task_num);
+}
+
+// This closure is used to mark refs into the g1 generation
+// from external roots in the CMS bit map.
+// Called at the first checkpoint.
+//
+
+#define PRINT_REACHABLE_AT_INITIAL_MARK 0
+#if PRINT_REACHABLE_AT_INITIAL_MARK
+static FILE* reachable_file = NULL;
+
+class PrintReachableClosure: public OopsInGenClosure {
+  CMBitMap* _bm;
+  int _level;
+public:
+  PrintReachableClosure(CMBitMap* bm) :
+    _bm(bm), _level(0) {
+    guarantee(reachable_file != NULL, "pre-condition");
+  }
+  void do_oop(oop* p) {
+    oop obj = *p;
+    HeapWord* obj_addr = (HeapWord*)obj;
+    if (obj == NULL) return;
+    fprintf(reachable_file, "%d: "PTR_FORMAT" -> "PTR_FORMAT" (%d)\n",
+            _level, p, (void*) obj, _bm->isMarked(obj_addr));
+    if (!_bm->isMarked(obj_addr)) {
+      _bm->mark(obj_addr);
+      _level++;
+      obj->oop_iterate(this);
+      _level--;
+    }
+  }
+};
+#endif // PRINT_REACHABLE_AT_INITIAL_MARK
+
+#define SEND_HEAP_DUMP_TO_FILE 0
+#if SEND_HEAP_DUMP_TO_FILE
+static FILE* heap_dump_file = NULL;
+#endif // SEND_HEAP_DUMP_TO_FILE
+
+void ConcurrentMark::clearNextBitmap() {
+   guarantee(!G1CollectedHeap::heap()->mark_in_progress(), "Precondition.");
+
+   // clear the mark bitmap (no grey objects to start with).
+   // We need to do this in chunks and offer to yield in between
+   // each chunk.
+   HeapWord* start  = _nextMarkBitMap->startWord();
+   HeapWord* end    = _nextMarkBitMap->endWord();
+   HeapWord* cur    = start;
+   size_t chunkSize = M;
+   while (cur < end) {
+     HeapWord* next = cur + chunkSize;
+     if (next > end)
+       next = end;
+     MemRegion mr(cur,next);
+     _nextMarkBitMap->clearRange(mr);
+     cur = next;
+     do_yield_check();
+   }
+}
+
+class NoteStartOfMarkHRClosure: public HeapRegionClosure {
+public:
+  bool doHeapRegion(HeapRegion* r) {
+    if (!r->continuesHumongous()) {
+      r->note_start_of_marking(true);
+    }
+    return false;
+  }
+};
+
+void ConcurrentMark::checkpointRootsInitialPre() {
+  G1CollectedHeap*   g1h = G1CollectedHeap::heap();
+  G1CollectorPolicy* g1p = g1h->g1_policy();
+
+  _has_aborted = false;
+
+  // Find all the reachable objects...
+#if PRINT_REACHABLE_AT_INITIAL_MARK
+  guarantee(reachable_file == NULL, "Protocol");
+  char fn_buf[100];
+  sprintf(fn_buf, "/tmp/reachable.txt.%d", os::current_process_id());
+  reachable_file = fopen(fn_buf, "w");
+  // clear the mark bitmap (no grey objects to start with)
+  _nextMarkBitMap->clearAll();
+  PrintReachableClosure prcl(_nextMarkBitMap);
+  g1h->process_strong_roots(
+                            false,   // fake perm gen collection
+                            SharedHeap::SO_AllClasses,
+                            &prcl, // Regular roots
+                            &prcl    // Perm Gen Roots
+                            );
+  // The root iteration above "consumed" dirty cards in the perm gen.
+  // Therefore, as a shortcut, we dirty all such cards.
+  g1h->rem_set()->invalidate(g1h->perm_gen()->used_region(), false);
+  fclose(reachable_file);
+  reachable_file = NULL;
+  // clear the mark bitmap again.
+  _nextMarkBitMap->clearAll();
+  COMPILER2_PRESENT(DerivedPointerTable::update_pointers());
+  COMPILER2_PRESENT(DerivedPointerTable::clear());
+#endif // PRINT_REACHABLE_AT_INITIAL_MARK
+
+  // Initialise marking structures. This has to be done in a STW phase.
+  reset();
+}
+
+class CMMarkRootsClosure: public OopsInGenClosure {
+private:
+  ConcurrentMark*  _cm;
+  G1CollectedHeap* _g1h;
+  bool             _do_barrier;
+
+public:
+  CMMarkRootsClosure(ConcurrentMark* cm,
+                     G1CollectedHeap* g1h,
+                     bool do_barrier) : _cm(cm), _g1h(g1h),
+                                        _do_barrier(do_barrier) { }
+
+  virtual void do_oop(narrowOop* p) {
+    guarantee(false, "NYI");
+  }
+
+  virtual void do_oop(oop* p) {
+    oop thisOop = *p;
+    if (thisOop != NULL) {
+      assert(thisOop->is_oop() || thisOop->mark() == NULL,
+             "expected an oop, possibly with mark word displaced");
+      HeapWord* addr = (HeapWord*)thisOop;
+      if (_g1h->is_in_g1_reserved(addr)) {
+        _cm->grayRoot(thisOop);
+      }
+    }
+    if (_do_barrier) {
+      assert(!_g1h->is_in_g1_reserved(p),
+             "Should be called on external roots");
+      do_barrier(p);
+    }
+  }
+};
+
+void ConcurrentMark::checkpointRootsInitialPost() {
+  G1CollectedHeap*   g1h = G1CollectedHeap::heap();
+
+  // For each region note start of marking.
+  NoteStartOfMarkHRClosure startcl;
+  g1h->heap_region_iterate(&startcl);
+
+  // Start weak-reference discovery.
+  ReferenceProcessor* rp = g1h->ref_processor();
+  rp->verify_no_references_recorded();
+  rp->enable_discovery(); // enable ("weak") refs discovery
+
+  SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set();
+  satb_mq_set.set_process_completed_threshold(G1SATBProcessCompletedThreshold);
+  satb_mq_set.set_active_all_threads(true);
+
+  // update_g1_committed() will be called at the end of an evac pause
+  // when marking is on. So, it's also called at the end of the
+  // initial-mark pause to update the heap end, if the heap expands
+  // during it. No need to call it here.
+
+  guarantee( !_cleanup_co_tracker.enabled(), "invariant" );
+
+  size_t max_marking_threads =
+    MAX2((size_t) 1, parallel_marking_threads());
+  for (int i = 0; i < (int)_max_task_num; ++i) {
+    _tasks[i]->enable_co_tracker();
+    if (i < (int) max_marking_threads)
+      _tasks[i]->reset_co_tracker(marking_task_overhead());
+    else
+      _tasks[i]->reset_co_tracker(0.0);
+  }
+}
+
+// Checkpoint the roots into this generation from outside
+// this generation. [Note this initial checkpoint need only
+// be approximate -- we'll do a catch up phase subsequently.]
+void ConcurrentMark::checkpointRootsInitial() {
+  assert(SafepointSynchronize::is_at_safepoint(), "world should be stopped");
+  G1CollectedHeap* g1h = G1CollectedHeap::heap();
+
+  double start = os::elapsedTime();
+  GCOverheadReporter::recordSTWStart(start);
+
+  // If there has not been a GC[n-1] since last GC[n] cycle completed,
+  // precede our marking with a collection of all
+  // younger generations to keep floating garbage to a minimum.
+  // YSR: we won't do this for now -- it's an optimization to be
+  // done post-beta.
+
+  // YSR:    ignoring weak refs for now; will do at bug fixing stage
+  // EVM:    assert(discoveredRefsAreClear());
+
+
+  G1CollectorPolicy* g1p = G1CollectedHeap::heap()->g1_policy();
+  g1p->record_concurrent_mark_init_start();
+  checkpointRootsInitialPre();
+
+  // YSR: when concurrent precleaning is in place, we'll
+  // need to clear the cached card table here
+
+  ResourceMark rm;
+  HandleMark  hm;
+
+  g1h->ensure_parsability(false);
+  g1h->perm_gen()->save_marks();
+
+  CMMarkRootsClosure notOlder(this, g1h, false);
+  CMMarkRootsClosure older(this, g1h, true);
+
+  g1h->set_marking_started();
+  g1h->rem_set()->prepare_for_younger_refs_iterate(false);
+
+  g1h->process_strong_roots(false,   // fake perm gen collection
+                            SharedHeap::SO_AllClasses,
+                            &notOlder, // Regular roots
+                            &older    // Perm Gen Roots
+                            );
+  checkpointRootsInitialPost();
+
+  // Statistics.
+  double end = os::elapsedTime();
+  _init_times.add((end - start) * 1000.0);
+  GCOverheadReporter::recordSTWEnd(end);
+
+  g1p->record_concurrent_mark_init_end();
+}
+
+/*
+   Notice that in the next two methods, we actually leave the STS
+   during the barrier sync and join it immediately afterwards. If we
+   do not do this, then the following deadlock can occur: one
+   thread could be in the barrier sync code, waiting for the other
+   thread to also sync up, whereas another one could be trying to
+   yield, while also waiting for the other threads to sync up too.
+
+   Because the thread that does the sync barrier has left the STS, it
+   is possible to be suspended for a Full GC or an evacuation pause
+   could occur. This is actually safe, since entering the sync
+   barrier is one of the last things do_marking_step() does, and it
+   doesn't manipulate any data structures afterwards.
+*/
+
+// First overflow rendezvous: after a global marking stack overflow,
+// every marking task syncs up here before any global state is reset.
+// We leave the suspendible thread set (STS) around the barrier sync
+// to avoid the deadlock described in the block comment above.
+void ConcurrentMark::enter_first_sync_barrier(int task_num) {
+  if (verbose_low())
+    gclog_or_tty->print_cr("[%d] entering first barrier", task_num);
+
+  // Having left the STS, a safepoint (Full GC / evacuation pause) may
+  // proceed while we block in the barrier; this is safe because the
+  // barrier is one of the last things do_marking_step() does.
+  ConcurrentGCThread::stsLeave();
+  _first_overflow_barrier_sync.enter();
+  ConcurrentGCThread::stsJoin();
+  // at this point everyone should have synced up and not be doing any
+  // more work
+
+  if (verbose_low())
+    gclog_or_tty->print_cr("[%d] leaving first barrier", task_num);
+
+  // let task 0 do this
+  if (task_num == 0) {
+    // task 0 is responsible for clearing the global data structures
+    clear_marking_state();
+
+    if (PrintGC) {
+      gclog_or_tty->date_stamp(PrintGCDateStamps);
+      gclog_or_tty->stamp(PrintGCTimeStamps);
+      gclog_or_tty->print_cr("[GC concurrent-mark-reset-for-overflow]");
+    }
+  }
+
+  // after this, each task should reset its own data structures and
+  // then go into the second barrier
+}
+
+// Second overflow rendezvous: each task has re-initialised its own
+// local data structures and waits here until every other task has
+// done the same, after which marking can resume.
+void ConcurrentMark::enter_second_sync_barrier(int task_num) {
+  if (verbose_low()) {
+    gclog_or_tty->print_cr("[%d] entering second barrier", task_num);
+  }
+
+  // As in the first barrier, leave the suspendible thread set around
+  // the sync so a safepoint can proceed while we block (see the block
+  // comment above these methods).
+  ConcurrentGCThread::stsLeave();
+  _second_overflow_barrier_sync.enter();
+  ConcurrentGCThread::stsJoin();
+  // at this point everything should be re-initialised and ready to go
+
+  if (verbose_low()) {
+    gclog_or_tty->print_cr("[%d] leaving second barrier", task_num);
+  }
+}
+
+// Mark the given root object on the "next" marking bitmap.
+void ConcurrentMark::grayRoot(oop p) {
+  HeapWord* obj_addr = (HeapWord*) p;
+  // Checking against _heap_start/_heap_end would be unreliable here:
+  // an evacuation pause with a piggy-backed initial-mark may expand
+  // the committed space without CM observing the change. The check
+  // below is therefore a bit conservative, but better than nothing.
+  tmp_guarantee_CM( _g1h->g1_committed().contains(obj_addr),
+                    "address should be within the heap bounds" );
+
+  // parMark() does a CAS; the isMarked() check merely avoids it when
+  // the bit is already set.
+  if (!_nextMarkBitMap->isMarked(obj_addr)) {
+    _nextMarkBitMap->parMark(obj_addr);
+  }
+}
+
+// Decide whether a bulk-marked region needs to be pushed on the
+// global region stack so that its objects get scanned.
+void ConcurrentMark::grayRegionIfNecessary(MemRegion mr) {
+  // The caller has already marked the objects on the region "in
+  // bulk"; all that is decided here is whether the region goes on
+  // the region stack.
+
+  if (!concurrent_marking_in_progress() || !_should_gray_objects) {
+    // Marking is over and we are waiting for remark; nothing more
+    // needs to be pushed on the region stack.
+    return;
+  }
+
+  HeapWord* finger = _finger;
+
+  if (verbose_low()) {
+    gclog_or_tty->print_cr("[global] attempting to push "
+                           "region ["PTR_FORMAT", "PTR_FORMAT"), finger is at "
+                           PTR_FORMAT, mr.start(), mr.end(), finger);
+  }
+
+  if (mr.start() >= finger) {
+    // Regions at or above the finger will still be visited by the
+    // normal marking pass; no push is needed.
+    return;
+  }
+
+  // The finger is always heap region aligned and it is not possible
+  // for mr to span heap regions, so the whole region lies below it.
+  tmp_guarantee_CM( mr.end() <= finger, "invariant" );
+
+  tmp_guarantee_CM( mr.start() <= mr.end() &&
+                    _heap_start <= mr.start() &&
+                    mr.end() <= _heap_end,
+                "region boundaries should fall within the committed space" );
+
+  if (verbose_low()) {
+    gclog_or_tty->print_cr("[global] region ["PTR_FORMAT", "PTR_FORMAT") "
+                           "below the finger, pushing it",
+                           mr.start(), mr.end());
+  }
+
+  if (!region_stack_push(mr)) {
+    // Overflow is tolerated; it triggers a restart elsewhere.
+    if (verbose_low()) {
+      gclog_or_tty->print_cr("[global] region stack has overflown.");
+    }
+  }
+}
+
+// Mark the object on the next bitmap if not already marked and, when
+// it lies below the global finger, also push it on the global mark
+// stack so that it gets scanned.
+void ConcurrentMark::markAndGrayObjectIfNecessary(oop p) {
+  // The object is not marked by the caller. We need to at least mark
+  // it and maybe push in on the stack.
+
+  HeapWord* addr = (HeapWord*)p;
+  if (!_nextMarkBitMap->isMarked(addr)) {
+    // We definitely need to mark it, irrespective whether we bail out
+    // because we're done with marking.
+    if (_nextMarkBitMap->parMark(addr)) {
+      // We won the CAS race, so this thread is responsible for
+      // deciding whether the object also needs to be pushed.
+      if (!concurrent_marking_in_progress() || !_should_gray_objects)
+        // If we're done with concurrent marking and we're waiting for
+        // remark, then we're not pushing anything on the stack.
+        return;
+
+      // No OrderAccess:store_load() is needed. It is implicit in the
+      // CAS done in parMark(addr) above
+      HeapWord* finger = _finger;
+
+      // Objects at or above the finger will be visited by the normal
+      // marking pass; only those below it need an explicit push.
+      if (addr < finger) {
+        if (!mark_stack_push(oop(addr))) {
+          // Stack overflow is tolerated here; recovery happens via
+          // the overflow/restart machinery.
+          if (verbose_low())
+            gclog_or_tty->print_cr("[global] global stack overflow "
+                                   "during parMark");
+        }
+      }
+    }
+  }
+}
+
+// Gang task driving the concurrent marking phase. Each worker runs
+// do_marking_step() repeatedly in small timed chunks, yielding to
+// safepoints and sleeping between chunks to throttle the CPU taken
+// from the application.
+class CMConcurrentMarkingTask: public AbstractGangTask {
+private:
+  ConcurrentMark*       _cm;   // global concurrent marking state
+  ConcurrentMarkThread* _cmt;  // owning thread; not used in this body
+
+public:
+  void work(int worker_i) {
+    guarantee( Thread::current()->is_ConcurrentGC_thread(),
+               "this should only be done by a conc GC thread" );
+
+    double start_vtime = os::elapsedVTime();
+
+    // Join the suspendible thread set so safepoints can suspend us.
+    ConcurrentGCThread::stsJoin();
+
+    guarantee( (size_t)worker_i < _cm->active_tasks(), "invariant" );
+    CMTask* the_task = _cm->task(worker_i);
+    the_task->start_co_tracker();
+    the_task->record_start_time();
+    if (!_cm->has_aborted()) {
+      // Loop until either the task runs to completion or the whole
+      // concurrent mark is aborted (e.g. by a Full GC).
+      do {
+        double start_vtime_sec = os::elapsedVTime();
+        double start_time_sec = os::elapsedTime();
+        // Do a chunk of marking work with a target of ~10ms.
+        the_task->do_marking_step(10.0);
+        double end_time_sec = os::elapsedTime();
+        double end_vtime_sec = os::elapsedVTime();
+        double elapsed_vtime_sec = end_vtime_sec - start_vtime_sec;
+        double elapsed_time_sec = end_time_sec - start_time_sec;
+        _cm->clear_has_overflown();
+
+        bool ret = _cm->do_yield_check(worker_i);
+
+        // NOTE(review): sleep_time_ms is only assigned inside the if
+        // below; the #if 0 logging would read it uninitialized if it
+        // were ever re-enabled on the other path.
+        jlong sleep_time_ms;
+        if (!_cm->has_aborted() && the_task->has_aborted()) {
+          // The task aborted its step (time budget reached) but CM is
+          // still running: sleep proportionally to the vtime used, to
+          // cap the marking overhead. Leave the STS while sleeping so
+          // safepoints are not held up.
+          sleep_time_ms =
+            (jlong) (elapsed_vtime_sec * _cm->sleep_factor() * 1000.0);
+          ConcurrentGCThread::stsLeave();
+          os::sleep(Thread::current(), sleep_time_ms, false);
+          ConcurrentGCThread::stsJoin();
+        }
+        double end_time2_sec = os::elapsedTime();
+        double elapsed_time2_sec = end_time2_sec - start_time_sec;
+
+        the_task->update_co_tracker();
+
+#if 0
+          gclog_or_tty->print_cr("CM: elapsed %1.4lf ms, sleep %1.4lf ms, "
+                                 "overhead %1.4lf",
+                                 elapsed_vtime_sec * 1000.0, (double) sleep_time_ms,
+                                 the_task->conc_overhead(os::elapsedTime()) * 8.0);
+          gclog_or_tty->print_cr("elapsed time %1.4lf ms, time 2: %1.4lf ms",
+                                 elapsed_time_sec * 1000.0, elapsed_time2_sec * 1000.0);
+#endif
+      } while (!_cm->has_aborted() && the_task->has_aborted());
+    }
+    the_task->record_end_time();
+    // A task may only stop early if the whole concurrent mark aborted.
+    guarantee( !the_task->has_aborted() || _cm->has_aborted(), "invariant" );
+
+    ConcurrentGCThread::stsLeave();
+
+    double end_vtime = os::elapsedVTime();
+    the_task->update_co_tracker(true);
+    _cm->update_accum_task_vtime(worker_i, end_vtime - start_vtime);
+  }
+
+  CMConcurrentMarkingTask(ConcurrentMark* cm,
+                          ConcurrentMarkThread* cmt) :
+      AbstractGangTask("Concurrent Mark"), _cm(cm), _cmt(cmt) { }
+
+  ~CMConcurrentMarkingTask() { }
+};
+
+// Entry point for the concurrent marking phase proper: set up the
+// per-task state and run the marking gang task (or do the work
+// directly on this thread when no parallel workers are configured).
+void ConcurrentMark::markFromRoots() {
+  // One might be tempted to assert here that
+  //   assert(asynch == !SafepointSynchronize::is_at_safepoint(),
+  //          "inconsistent argument?");
+  // but that would be wrong: a younger generation stop-the-world GC
+  // can be at a safepoint even while we mark in this generation.
+
+  _restart_for_overflow = false;
+
+  // At least one marking task is always active.
+  size_t active_workers = MAX2((size_t) 1, parallel_marking_threads());
+  set_phase(active_workers, true);
+
+  CMConcurrentMarkingTask markingTask(this, cmThread());
+  if (parallel_marking_threads() == 0) {
+    // No worker gang: run the single task on the current thread.
+    markingTask.work(0);
+  } else {
+    _parallel_workers->run_task(&markingTask);
+  }
+  print_stats();
+}
+
+// The stop-the-world "remark" pause: finish the marking work left
+// over from the concurrent phase, process weak references, and either
+// declare marking complete or request a restart on stack overflow.
+void ConcurrentMark::checkpointRootsFinal(bool clear_all_soft_refs) {
+  // world is stopped at this checkpoint
+  assert(SafepointSynchronize::is_at_safepoint(),
+         "world should be stopped");
+  G1CollectedHeap* g1h = G1CollectedHeap::heap();
+
+  // If a full collection has happened, we shouldn't do this.
+  if (has_aborted()) {
+    g1h->set_marking_complete(); // So bitmap clearing isn't confused
+    return;
+  }
+
+  G1CollectorPolicy* g1p = g1h->g1_policy();
+  g1p->record_concurrent_mark_remark_start();
+
+  double start = os::elapsedTime();
+  GCOverheadReporter::recordSTWStart(start);
+
+  // Finish up the remaining marking work.
+  checkpointRootsFinalWork();
+
+  double mark_work_end = os::elapsedTime();
+
+  weakRefsWork(clear_all_soft_refs);
+
+  if (has_overflown()) {
+    // Oops.  We overflowed.  Restart concurrent marking.
+    _restart_for_overflow = true;
+    // Clear the flag. We do not need it any more.
+    clear_has_overflown();
+    if (G1TraceMarkStackOverflow)
+      gclog_or_tty->print_cr("\nRemark led to restart for overflow.");
+  } else {
+    // We're done with marking.
+    // Mark the SATB queues inactive for all Java threads.
+    JavaThread::satb_mark_queue_set().set_active_all_threads(false);
+  }
+
+#if VERIFY_OBJS_PROCESSED
+  _scan_obj_cl.objs_processed = 0;
+  ThreadLocalObjQueue::objs_enqueued = 0;
+#endif
+
+  // Statistics
+  double now = os::elapsedTime();
+  _remark_mark_times.add((mark_work_end - start) * 1000.0);
+  _remark_weak_ref_times.add((now - mark_work_end) * 1000.0);
+  _remark_times.add((now - start) * 1000.0);
+
+  GCOverheadReporter::recordSTWEnd(now);
+  // Hand overhead tracking from the marking tasks over to cleanup.
+  for (int i = 0; i < (int)_max_task_num; ++i)
+    _tasks[i]->disable_co_tracker();
+  _cleanup_co_tracker.enable();
+  _cleanup_co_tracker.reset(cleanup_task_overhead());
+  g1p->record_concurrent_mark_remark_end();
+}
+
+
+#define CARD_BM_TEST_MODE 0
+
+class CalcLiveObjectsClosure: public HeapRegionClosure {
+
+  CMBitMapRO* _bm;
+  ConcurrentMark* _cm;
+  COTracker* _co_tracker;
+  bool _changed;
+  bool _yield;
+  size_t _words_done;
+  size_t _tot_live;
+  size_t _tot_used;
+  size_t _regions_done;
+  double _start_vtime_sec;
+
+  BitMap* _region_bm;
+  BitMap* _card_bm;
+  intptr_t _bottom_card_num;
+  bool _final;
+
+  // Set the card bitmap bits for the inclusive card range
+  // [start_card_num, last_card_num]. Card numbers are absolute
+  // (address >> card_shift); _bottom_card_num converts them into
+  // indices into _card_bm.
+  void mark_card_num_range(intptr_t start_card_num, intptr_t last_card_num) {
+    for (intptr_t i = start_card_num; i <= last_card_num; i++) {
+#if CARD_BM_TEST_MODE
+      // In test mode the bits are set eagerly elsewhere, so here we
+      // only verify that they are already set.
+      guarantee(_card_bm->at(i - _bottom_card_num),
+                "Should already be set.");
+#else
+      _card_bm->par_at_put(i - _bottom_card_num, 1);
+#endif
+    }
+  }
+
+public:
+  // "final" selects the STW (final counting) behavior; co_tracker may
+  // be NULL when no concurrent-overhead tracking is wanted.
+  CalcLiveObjectsClosure(bool final,
+                         CMBitMapRO *bm, ConcurrentMark *cm,
+                         BitMap* region_bm, BitMap* card_bm,
+                         COTracker* co_tracker) :
+    // Initializers are listed in member declaration order — that is
+    // the order in which they actually run — avoiding -Wreorder
+    // warnings and any suggestion of inter-member dependencies.
+    _bm(bm), _cm(cm), _co_tracker(co_tracker),
+    _changed(false), _yield(true),
+    _words_done(0), _tot_live(0), _tot_used(0),
+    _regions_done(0), _start_vtime_sec(0.0),
+    _region_bm(region_bm), _card_bm(card_bm),
+    _final(final)
+  {
+    // Card number of the bottom of the reserved heap; used to turn
+    // absolute card numbers into _card_bm indices.
+    _bottom_card_num =
+      intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >>
+               CardTableModRefBS::card_shift);
+  }
+
+  bool doHeapRegion(HeapRegion* hr) {
+    if (_co_tracker != NULL)
+      _co_tracker->update();
+
+    if (!_final && _regions_done == 0)
+      _start_vtime_sec = os::elapsedVTime();
+
+    if (hr->continuesHumongous()) return false;
+
+    HeapWord* nextTop = hr->next_top_at_mark_start();
+    HeapWord* start   = hr->top_at_conc_mark_count();
+    assert(hr->bottom() <= start && start <= hr->end() &&
+           hr->bottom() <= nextTop && nextTop <= hr->end() &&
+           start <= nextTop,
+           "Preconditions.");
+    // Otherwise, record the number of word's we'll examine.
+    size_t words_done = (nextTop - start);
+    // Find the first marked object at or after "start".
+    start = _bm->getNextMarkedWordAddress(start, nextTop);
+    size_t marked_bytes = 0;
+
+    // Below, the term "card num" means the result of shifting an address
+    // by the card shift -- address 0 corresponds to card number 0.  One
+    // must subtract the card num of the bottom of the heap to obtain a
+    // card table index.
+    // The first card num of the sequence of live cards currently being
+    // constructed.  -1 ==> no sequence.
+    intptr_t start_card_num = -1;
+    // The last card num of the sequence of live cards currently being
+    // constructed.  -1 ==> no sequence.
+    intptr_t last_card_num = -1;
+
+    while (start < nextTop) {
+      if (_yield && _cm->do_yield_check()) {
+        // We yielded.  It might be for a full collection, in which case
+        // all bets are off; terminate the traversal.
+        if (_cm->has_aborted()) {
+          _changed = false;
+          return true;
+        } else {
+          // Otherwise, it might be a collection pause, and the region
+          // we're looking at might be in the collection set.  We'll
+          // abandon this region.
+          return false;
+        }
+      }
+      oop obj = oop(start);
+      int obj_sz = obj->size();
+      // The card num of the start of the current object.
+      intptr_t obj_card_num =
+        intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift);
+
+      HeapWord* obj_last = start + obj_sz - 1;
+      intptr_t obj_last_card_num =
+        intptr_t(uintptr_t(obj_last) >> CardTableModRefBS::card_shift);
+
+      if (obj_card_num != last_card_num) {
+        if (start_card_num == -1) {
+          assert(last_card_num == -1, "Both or neither.");
+          start_card_num = obj_card_num;
+        } else {
+          assert(last_card_num != -1, "Both or neither.");
+          assert(obj_card_num >= last_card_num, "Inv");
+          if ((obj_card_num - last_card_num) > 1) {
+            // Mark the last run, and start a new one.
+            mark_card_num_range(start_card_num, last_card_num);
+            start_card_num = obj_card_num;
+          }
+        }
+#if CARD_BM_TEST_MODE
+        /*
+        gclog_or_tty->print_cr("Setting bits from %d/%d.",
+                               obj_card_num - _bottom_card_num,
+                               obj_last_card_num - _bottom_card_num);
+        */
+        for (intptr_t j = obj_card_num; j <= obj_last_card_num; j++) {
+          _card_bm->par_at_put(j - _bottom_card_num, 1);
+        }
+#endif
+      }
+      // In any case, we set the last card num.
+      last_card_num = obj_last_card_num;
+
+      marked_bytes += obj_sz * HeapWordSize;
+      // Find the next marked object after this one.
+      start = _bm->getNextMarkedWordAddress(start + 1, nextTop);
+      _changed = true;
+    }
+    // Handle the last range, if any.
+    if (start_card_num != -1)
+      mark_card_num_range(start_card_num, last_card_num);
+    if (_final) {
+      // Mark the allocated-since-marking portion...
+      HeapWord* tp = hr->top();
+      if (nextTop < tp) {
+        start_card_num =
+          intptr_t(uintptr_t(nextTop) >> CardTableModRefBS::card_shift);
+        last_card_num =
+          intptr_t(uintptr_t(tp) >> CardTableModRefBS::card_shift);
+        mark_card_num_range(start_card_num, last_card_num);
+        // This definitely means the region has live objects.
+        _region_bm->par_at_put(hr->hrs_index(), 1);
+      }
+    }
+
+    hr->add_to_marked_bytes(marked_bytes);
+    // Update the live region bitmap.
+    if (marked_bytes > 0) {
+      _region_bm->par_at_put(hr->hrs_index(), 1);
+    }
+    hr->set_top_at_conc_mark_count(nextTop);
+    _tot_live += hr->next_live_bytes();
+    _tot_used += hr->used();
+    _words_done = words_done;
+
+    if (!_final) {
+      ++_regions_done;
+      if (_regions_done % 10 == 0) {
+        double end_vtime_sec = os::elapsedVTime();
+        double elapsed_vtime_sec = end_vtime_sec - _start_vtime_sec;
+        if (elapsed_vtime_sec > (10.0 / 1000.0)) {
+          jlong sleep_time_ms =
+            (jlong) (elapsed_vtime_sec * _cm->cleanup_sleep_factor() * 1000.0);
+#if 0
+          gclog_or_tty->print_cr("CL: elapsed %1.4lf ms, sleep %1.4lf ms, "
+                                 "overhead %1.4lf",
+                                 elapsed_vtime_sec * 1000.0, (double) sleep_time_ms,
+                                 _co_tracker->concOverhead(os::elapsedTime()));
+#endif
+          os::sleep(Thread::current(), sleep_time_ms, false);