changeset 50475:7b916885654d

8201786: Modularize interpreter GC barriers: leftovers for ARM32 Reviewed-by: enevill, eosterlund
author shade
date Wed, 02 May 2018 19:26:42 +0200
parents c8684fac37bb
children 65b13c206495
files src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.cpp src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.hpp src/hotspot/cpu/arm/gc/shared/cardTableBarrierSetAssembler_arm.cpp src/hotspot/cpu/arm/gc/shared/cardTableBarrierSetAssembler_arm.hpp src/hotspot/cpu/arm/gc/shared/modRefBarrierSetAssembler_arm.cpp src/hotspot/cpu/arm/gc/shared/modRefBarrierSetAssembler_arm.hpp src/hotspot/cpu/arm/interp_masm_arm.cpp src/hotspot/cpu/arm/interp_masm_arm.hpp src/hotspot/cpu/arm/macroAssembler_arm.cpp src/hotspot/cpu/arm/macroAssembler_arm.hpp src/hotspot/cpu/arm/stubGenerator_arm.cpp src/hotspot/cpu/arm/templateInterpreterGenerator_arm.cpp src/hotspot/cpu/arm/templateTable_arm.cpp
diffstat 15 files changed, 582 insertions(+), 477 deletions(-) [+]
line wrap: on
line diff
--- a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp	Wed May 02 19:26:42 2018 +0200
@@ -28,6 +28,7 @@
 #include "gc/g1/g1BarrierSetAssembler.hpp"
 #include "gc/g1/g1ThreadLocalData.hpp"
 #include "gc/g1/g1CardTable.hpp"
+#include "gc/g1/g1ThreadLocalData.hpp"
 #include "gc/g1/heapRegion.hpp"
 #include "interpreter/interp_masm.hpp"
 #include "runtime/sharedRuntime.hpp"
@@ -127,6 +128,239 @@
 #endif // !AARCH64
 }
 
+// G1 pre-barrier.
+// Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
+// If store_addr != noreg, then previous value is loaded from [store_addr];
+// in such case store_addr and new_val registers are preserved;
+// otherwise pre_val register is preserved.
+void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
+                                          Register store_addr,
+                                          Register new_val,
+                                          Register pre_val,
+                                          Register tmp1,
+                                          Register tmp2) {
+  Label done;
+  Label runtime;
+
+  if (store_addr != noreg) {
+    assert_different_registers(store_addr, new_val, pre_val, tmp1, tmp2, noreg);
+  } else {
+    assert (new_val == noreg, "should be");
+    assert_different_registers(pre_val, tmp1, tmp2, noreg);
+  }
+
+  Address in_progress(Rthread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()));
+  Address index(Rthread, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset()));
+  Address buffer(Rthread, in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset()));
+
+  // Is marking active?
+  assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "adjust this code");
+  __ ldrb(tmp1, in_progress);
+  __ cbz(tmp1, done);
+
+  // Do we need to load the previous value?
+  if (store_addr != noreg) {
+    __ load_heap_oop(pre_val, Address(store_addr, 0));
+  }
+
+  // Is the previous value null?
+  __ cbz(pre_val, done);
+
+  // Can we store original value in the thread's buffer?
+  // Is index == 0?
+  // (The index field is typed as size_t.)
+
+  __ ldr(tmp1, index);           // tmp1 := *index_adr
+  __ ldr(tmp2, buffer);
+
+  __ subs(tmp1, tmp1, wordSize); // tmp1 := tmp1 - wordSize
+  __ b(runtime, lt);             // If negative, goto runtime
+
+  __ str(tmp1, index);           // *index_adr := tmp1
+
+  // Record the previous value
+  __ str(pre_val, Address(tmp2, tmp1));
+  __ b(done);
+
+  __ bind(runtime);
+
+  // save the live input values
+#ifdef AARCH64
+  if (store_addr != noreg) {
+    __ raw_push(store_addr, new_val);
+  } else {
+    __ raw_push(pre_val, ZR);
+  }
+#else
+  if (store_addr != noreg) {
+    // avoid raw_push to support any ordering of store_addr and new_val
+    __ push(RegisterSet(store_addr) | RegisterSet(new_val));
+  } else {
+    __ push(pre_val);
+  }
+#endif // AARCH64
+
+  if (pre_val != R0) {
+    __ mov(R0, pre_val);
+  }
+  __ mov(R1, Rthread);
+
+  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), R0, R1);
+
+#ifdef AARCH64
+  if (store_addr != noreg) {
+    __ raw_pop(store_addr, new_val);
+  } else {
+    __ raw_pop(pre_val, ZR);
+  }
+#else
+  if (store_addr != noreg) {
+    __ pop(RegisterSet(store_addr) | RegisterSet(new_val));
+  } else {
+    __ pop(pre_val);
+  }
+#endif // AARCH64
+
+  __ bind(done);
+}
+
+// G1 post-barrier.
+// Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
+void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
+                                           Register store_addr,
+                                           Register new_val,
+                                           Register tmp1,
+                                           Register tmp2,
+                                           Register tmp3) {
+
+  Address queue_index(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()));
+  Address buffer(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()));
+
+  BarrierSet* bs = BarrierSet::barrier_set();
+  CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(bs);
+  CardTable* ct = ctbs->card_table();
+  Label done;
+  Label runtime;
+
+  // Does store cross heap regions?
+
+  __ eor(tmp1, store_addr, new_val);
+#ifdef AARCH64
+  __ logical_shift_right(tmp1, tmp1, HeapRegion::LogOfHRGrainBytes);
+  __ cbz(tmp1, done);
+#else
+  __ movs(tmp1, AsmOperand(tmp1, lsr, HeapRegion::LogOfHRGrainBytes));
+  __ b(done, eq);
+#endif
+
+  // crosses regions, storing NULL?
+
+  __ cbz(new_val, done);
+
+  // storing region crossing non-NULL, is card already dirty?
+  const Register card_addr = tmp1;
+  assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code");
+
+  __ mov_address(tmp2, (address)ct->byte_map_base(), symbolic_Relocation::card_table_reference);
+  __ add(card_addr, tmp2, AsmOperand(store_addr, lsr, CardTable::card_shift));
+
+  __ ldrb(tmp2, Address(card_addr));
+  __ cmp(tmp2, (int)G1CardTable::g1_young_card_val());
+  __ b(done, eq);
+
+  __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp2);
+
+  assert(CardTable::dirty_card_val() == 0, "adjust this code");
+  __ ldrb(tmp2, Address(card_addr));
+  __ cbz(tmp2, done);
+
+  // storing a region crossing, non-NULL oop, card is clean.
+  // dirty card and log.
+
+  __ strb(__ zero_register(tmp2), Address(card_addr));
+
+  __ ldr(tmp2, queue_index);
+  __ ldr(tmp3, buffer);
+
+  __ subs(tmp2, tmp2, wordSize);
+  __ b(runtime, lt); // go to runtime if now negative
+
+  __ str(tmp2, queue_index);
+
+  __ str(card_addr, Address(tmp3, tmp2));
+  __ b(done);
+
+  __ bind(runtime);
+
+  if (card_addr != R0) {
+    __ mov(R0, card_addr);
+  }
+  __ mov(R1, Rthread);
+  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), R0, R1);
+
+  __ bind(done);
+}
+
+void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
+                                    Register dst, Address src, Register tmp1, Register tmp2, Register tmp3) {
+  bool on_oop = type == T_OBJECT || type == T_ARRAY;
+  bool on_weak = (decorators & ON_WEAK_OOP_REF) != 0;
+  bool on_phantom = (decorators & ON_PHANTOM_OOP_REF) != 0;
+  bool on_reference = on_weak || on_phantom;
+
+  ModRefBarrierSetAssembler::load_at(masm, decorators, type, dst, src, tmp1, tmp2, tmp3);
+  if (on_oop && on_reference) {
+    // Generate the G1 pre-barrier code to log the value of
+    // the referent field in an SATB buffer.
+    g1_write_barrier_pre(masm, noreg, noreg, dst, tmp1, tmp2);
+  }
+}
+
+
+void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
+                                         Address obj, Register new_val, Register tmp1, Register tmp2, Register tmp3, bool is_null) {
+  bool in_heap = (decorators & IN_HEAP) != 0;
+  bool in_concurrent_root = (decorators & IN_CONCURRENT_ROOT) != 0;
+
+  bool needs_pre_barrier = in_heap || in_concurrent_root;
+  bool needs_post_barrier = (new_val != noreg) && in_heap;
+
+  // flatten object address if needed
+  assert (obj.mode() == basic_offset, "pre- or post-indexing is not supported here");
+
+  const Register store_addr = obj.base();
+  if (obj.index() != noreg) {
+    assert (obj.disp() == 0, "index or displacement, not both");
+#ifdef AARCH64
+    __ add(store_addr, obj.base(), obj.index(), obj.extend(), obj.shift_imm());
+#else
+    assert(obj.offset_op() == add_offset, "addition is expected");
+    __ add(store_addr, obj.base(), AsmOperand(obj.index(), obj.shift(), obj.shift_imm()));
+#endif // AARCH64
+  } else if (obj.disp() != 0) {
+    __ add(store_addr, obj.base(), obj.disp());
+  }
+
+  if (needs_pre_barrier) {
+    g1_write_barrier_pre(masm, store_addr, new_val, tmp1, tmp2, tmp3);
+  }
+
+  if (is_null) {
+    BarrierSetAssembler::store_at(masm, decorators, type, Address(store_addr), new_val, tmp1, tmp2, tmp3, true);
+  } else {
+    // G1 barrier needs uncompressed oop for region cross check.
+    Register val_to_store = new_val;
+    if (UseCompressedOops) {
+      val_to_store = tmp1;
+      __ mov(val_to_store, new_val);
+    }
+    BarrierSetAssembler::store_at(masm, decorators, type, Address(store_addr), val_to_store, tmp1, tmp2, tmp3, false);
+    if (needs_post_barrier) {
+      g1_write_barrier_post(masm, store_addr, new_val, tmp1, tmp2, tmp3);
+    }
+  }
+};
+
 #ifdef COMPILER1
 
 #undef __
--- a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp	Wed May 02 19:26:42 2018 +0200
@@ -41,6 +41,27 @@
   void gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators,
                                         Register addr, Register count, Register tmp);
 
+  void g1_write_barrier_pre(MacroAssembler* masm,
+                            Register store_addr,
+                            Register new_val,
+                            Register pre_val,
+                            Register tmp1,
+                            Register tmp2);
+
+  void g1_write_barrier_post(MacroAssembler* masm,
+                             Register store_addr,
+                             Register new_val,
+                             Register tmp1,
+                             Register tmp2,
+                             Register tmp3);
+
+  virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
+                            Address obj, Register new_val, Register tmp1, Register tmp2, Register tmp3, bool is_null);
+
+public:
+  virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
+                       Register dst, Address src, Register tmp1, Register tmp2, Register tmp3);
+
 #ifdef COMPILER1
 public:
   void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.cpp	Wed May 02 19:26:42 2018 +0200
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "gc/shared/barrierSetAssembler.hpp"
+
+#define __ masm->
+
+void BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
+                                  Register dst, Address src, Register tmp1, Register tmp2, Register tmp3) {
+  bool on_heap = (decorators & IN_HEAP) != 0;
+  bool on_root = (decorators & IN_ROOT) != 0;
+  switch (type) {
+  case T_OBJECT:
+  case T_ARRAY: {
+    if (on_heap) {
+#ifdef AARCH64
+      if (UseCompressedOops) {
+        __ ldr_w(dst, src);
+        __ decode_heap_oop(dst);
+      } else
+#endif // AARCH64
+      {
+        __ ldr(dst, src);
+      }
+    } else {
+      assert(on_root, "why else?");
+      __ ldr(dst, src);
+    }
+    break;
+  }
+  default: Unimplemented();
+  }
+
+}
+
+void BarrierSetAssembler::store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
+                                   Address obj, Register val, Register tmp1, Register tmp2, Register tmp3, bool is_null) {
+  bool on_heap = (decorators & IN_HEAP) != 0;
+  bool on_root = (decorators & IN_ROOT) != 0;
+  switch (type) {
+  case T_OBJECT:
+  case T_ARRAY: {
+    if (on_heap) {
+#ifdef AARCH64
+      if (UseCompressedOops) {
+        assert(!dst.uses(src), "not enough registers");
+        if (!is_null) {
+          __ encode_heap_oop(src);
+        }
+        __ str_w(val, obj);
+      } else
+#endif // AARCH64
+      {
+        __ str(val, obj);
+      }
+    } else {
+      assert(on_root, "why else?");
+      __ str(val, obj);
+    }
+    break;
+  }
+  default: Unimplemented();
+  }
+}
+
--- a/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.hpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.hpp	Wed May 02 19:26:42 2018 +0200
@@ -36,6 +36,11 @@
   virtual void arraycopy_epilogue(MacroAssembler* masm, DecoratorSet decorators, bool is_oop,
                                   Register addr, Register count, Register tmp) {}
 
+  virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
+                       Register dst, Address src, Register tmp1, Register tmp2, Register tmp3);
+  virtual void store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
+                        Address obj, Register new_val, Register tmp1, Register tmp2, Register tmp3, bool is_null);
+
   virtual void barrier_stubs_init() {}
 };
 
--- a/src/hotspot/cpu/arm/gc/shared/cardTableBarrierSetAssembler_arm.cpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/gc/shared/cardTableBarrierSetAssembler_arm.cpp	Wed May 02 19:26:42 2018 +0200
@@ -72,3 +72,107 @@
   __ b(L_cardtable_loop, ge);
   __ BIND(L_done);
 }
+
+void CardTableBarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
+                                             Address obj, Register new_val, Register tmp1, Register tmp2, Register tmp3, bool is_null) {
+  bool on_array = (decorators & IN_HEAP_ARRAY) != 0;
+  bool on_anonymous = (decorators & ON_UNKNOWN_OOP_REF) != 0;
+  bool precise = on_array || on_anonymous;
+
+  if (is_null) {
+    BarrierSetAssembler::store_at(masm, decorators, type, obj, new_val, tmp1, tmp2, tmp3, true);
+  } else {
+    assert (!precise || (obj.index() == noreg && obj.disp() == 0),
+            "store check address should be calculated beforehand");
+
+    store_check_part1(masm, tmp1);
+    BarrierSetAssembler::store_at(masm, decorators, type, obj, new_val, tmp1, tmp2, tmp3, false);
+    new_val = noreg;
+    store_check_part2(masm, obj.base(), tmp1, tmp2);
+  }
+}
+
+// The 1st part of the store check.
+// Sets card_table_base register.
+void CardTableBarrierSetAssembler::store_check_part1(MacroAssembler* masm, Register card_table_base) {
+  // Check barrier set type (should be card table) and element size
+  BarrierSet* bs = BarrierSet::barrier_set();
+  assert(bs->kind() == BarrierSet::CardTableBarrierSet,
+         "Wrong barrier set kind");
+
+  CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(bs);
+  CardTable* ct = ctbs->card_table();
+  assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "Adjust store check code");
+
+  // Load card table base address.
+
+  /* Performance note.
+
+     There is an alternative way of loading card table base address
+     from thread descriptor, which may look more efficient:
+
+     ldr(card_table_base, Address(Rthread, JavaThread::card_table_base_offset()));
+
+     However, performance measurements of micro benchmarks and specJVM98
+     showed that loading of card table base from thread descriptor is
+     7-18% slower compared to loading of literal embedded into the code.
+     Possible cause is a cache miss (card table base address resides in a
+     rarely accessed area of thread descriptor).
+  */
+  // TODO-AARCH64 Investigate if mov_slow is faster than ldr from Rthread on AArch64
+  __ mov_address(card_table_base, (address)ct->byte_map_base(), symbolic_Relocation::card_table_reference);
+}
+
+// The 2nd part of the store check.
+void CardTableBarrierSetAssembler::store_check_part2(MacroAssembler* masm, Register obj, Register card_table_base, Register tmp) {
+  assert_different_registers(obj, card_table_base, tmp);
+
+  assert(CardTable::dirty_card_val() == 0, "Dirty card value must be 0 due to optimizations.");
+#ifdef AARCH64
+  add(card_table_base, card_table_base, AsmOperand(obj, lsr, CardTable::card_shift));
+  Address card_table_addr(card_table_base);
+#else
+  Address card_table_addr(card_table_base, obj, lsr, CardTable::card_shift);
+#endif
+
+  if (UseCondCardMark) {
+#if INCLUDE_ALL_GCS
+    if (UseConcMarkSweepGC) {
+      __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), noreg);
+    }
+#endif
+    Label already_dirty;
+
+    __ ldrb(tmp, card_table_addr);
+    __ cbz(tmp, already_dirty);
+
+    set_card(masm, card_table_base, card_table_addr, tmp);
+    __ bind(already_dirty);
+
+  } else {
+#if INCLUDE_ALL_GCS
+    if (UseConcMarkSweepGC && CMSPrecleaningEnabled) {
+      __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreStore), noreg);
+    }
+#endif
+    set_card(masm, card_table_base, card_table_addr, tmp);
+  }
+}
+
+void CardTableBarrierSetAssembler::set_card(MacroAssembler* masm, Register card_table_base, Address card_table_addr, Register tmp) {
+#ifdef AARCH64
+  strb(ZR, card_table_addr);
+#else
+  CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
+  CardTable* ct = ctbs->card_table();
+  if ((((uintptr_t)ct->byte_map_base() & 0xff) == 0)) {
+    // Card table is aligned so the lowest byte of the table address base is zero.
+    // This works only if the code is not saved for later use, possibly
+    // in a context where the base would no longer be aligned.
+    __ strb(card_table_base, card_table_addr);
+  } else {
+    __ mov(tmp, 0);
+    __ strb(tmp, card_table_addr);
+  }
+#endif // AARCH64
+}
--- a/src/hotspot/cpu/arm/gc/shared/cardTableBarrierSetAssembler_arm.hpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/gc/shared/cardTableBarrierSetAssembler_arm.hpp	Wed May 02 19:26:42 2018 +0200
@@ -29,9 +29,18 @@
 #include "gc/shared/modRefBarrierSetAssembler.hpp"
 
 class CardTableBarrierSetAssembler: public ModRefBarrierSetAssembler {
+private:
+  void store_check(MacroAssembler* masm, Register obj, Address dst);
+  void store_check_part1(MacroAssembler* masm, Register card_table_base);
+  void store_check_part2(MacroAssembler* masm, Register obj, Register card_table_base, Register tmp);
+
+  void set_card(MacroAssembler* masm, Register card_table_base, Address card_table_addr, Register tmp);
+
 protected:
   virtual void gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators,
                                                 Register addr, Register count, Register tmp);
+  virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
+                            Address obj, Register new_val, Register tmp1, Register tmp2, Register tmp3, bool is_null);
 };
 
 #endif // #ifndef CPU_ARM_GC_SHARED_CARDTABLEBARRIERSETASSEMBLER_ARM_HPP
--- a/src/hotspot/cpu/arm/gc/shared/modRefBarrierSetAssembler_arm.cpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/gc/shared/modRefBarrierSetAssembler_arm.cpp	Wed May 02 19:26:42 2018 +0200
@@ -42,3 +42,12 @@
     gen_write_ref_array_post_barrier(masm, decorators, addr, count, tmp);
   }
 }
+
+void ModRefBarrierSetAssembler::store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
+                                         Address obj, Register new_val, Register tmp1, Register tmp2, Register tmp3, bool is_null) {
+  if (type == T_OBJECT || type == T_ARRAY) {
+    oop_store_at(masm, decorators, type, obj, new_val, tmp1, tmp2, tmp3, is_null);
+  } else {
+    BarrierSetAssembler::store_at(masm, decorators, type, obj, new_val, tmp1, tmp2, tmp3, is_null);
+  }
+}
--- a/src/hotspot/cpu/arm/gc/shared/modRefBarrierSetAssembler_arm.hpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/gc/shared/modRefBarrierSetAssembler_arm.hpp	Wed May 02 19:26:42 2018 +0200
@@ -28,6 +28,10 @@
 #include "asm/macroAssembler.hpp"
 #include "gc/shared/barrierSetAssembler.hpp"
 
+// The ModRefBarrierSetAssembler filters away accesses on BasicTypes other
+// than T_OBJECT/T_ARRAY (oops). The oop accesses call one of the protected
+// accesses, which are overridden in the concrete BarrierSetAssembler.
+
 class ModRefBarrierSetAssembler: public BarrierSetAssembler {
 protected:
   virtual void gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators,
@@ -35,11 +39,16 @@
   virtual void gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators,
                                                 Register addr, Register count, Register tmp) {}
 
+  virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
+                            Address obj, Register val, Register tmp1, Register tmp2, Register tmp3, bool is_null) = 0;
+
 public:
   virtual void arraycopy_prologue(MacroAssembler* masm, DecoratorSet decorators, bool is_oop,
                                   Register addr, Register count, int callee_saved_regs);
   virtual void arraycopy_epilogue(MacroAssembler* masm, DecoratorSet decorators, bool is_oop,
                                   Register addr, Register count, Register tmp);
+  virtual void store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
+                        Address obj, Register val, Register tmp1, Register tmp2, Register tmp3, bool is_null);
 };
 
 #endif // CPU_ARM_GC_SHARED_MODREFBARRIERSETASSEMBLER_ARM_HPP
--- a/src/hotspot/cpu/arm/interp_masm_arm.cpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/interp_masm_arm.cpp	Wed May 02 19:26:42 2018 +0200
@@ -406,91 +406,6 @@
 }
 
 
-// The 1st part of the store check.
-// Sets card_table_base register.
-void InterpreterMacroAssembler::store_check_part1(Register card_table_base) {
-  // Check barrier set type (should be card table) and element size
-  BarrierSet* bs = BarrierSet::barrier_set();
-  assert(bs->kind() == BarrierSet::CardTableBarrierSet,
-         "Wrong barrier set kind");
-
-  CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(bs);
-  CardTable* ct = ctbs->card_table();
-  assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "Adjust store check code");
-
-  // Load card table base address.
-
-  /* Performance note.
-
-     There is an alternative way of loading card table base address
-     from thread descriptor, which may look more efficient:
-
-     ldr(card_table_base, Address(Rthread, JavaThread::card_table_base_offset()));
-
-     However, performance measurements of micro benchmarks and specJVM98
-     showed that loading of card table base from thread descriptor is
-     7-18% slower compared to loading of literal embedded into the code.
-     Possible cause is a cache miss (card table base address resides in a
-     rarely accessed area of thread descriptor).
-  */
-  // TODO-AARCH64 Investigate if mov_slow is faster than ldr from Rthread on AArch64
-  mov_address(card_table_base, (address)ct->byte_map_base(), symbolic_Relocation::card_table_reference);
-}
-
-// The 2nd part of the store check.
-void InterpreterMacroAssembler::store_check_part2(Register obj, Register card_table_base, Register tmp) {
-  assert_different_registers(obj, card_table_base, tmp);
-
-  assert(CardTable::dirty_card_val() == 0, "Dirty card value must be 0 due to optimizations.");
-#ifdef AARCH64
-  add(card_table_base, card_table_base, AsmOperand(obj, lsr, CardTable::card_shift));
-  Address card_table_addr(card_table_base);
-#else
-  Address card_table_addr(card_table_base, obj, lsr, CardTable::card_shift);
-#endif
-
-  if (UseCondCardMark) {
-#if INCLUDE_ALL_GCS
-    if (UseConcMarkSweepGC) {
-      membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), noreg);
-    }
-#endif
-    Label already_dirty;
-
-    ldrb(tmp, card_table_addr);
-    cbz(tmp, already_dirty);
-
-    set_card(card_table_base, card_table_addr, tmp);
-    bind(already_dirty);
-
-  } else {
-#if INCLUDE_ALL_GCS
-    if (UseConcMarkSweepGC && CMSPrecleaningEnabled) {
-      membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreStore), noreg);
-    }
-#endif
-    set_card(card_table_base, card_table_addr, tmp);
-  }
-}
-
-void InterpreterMacroAssembler::set_card(Register card_table_base, Address card_table_addr, Register tmp) {
-#ifdef AARCH64
-  strb(ZR, card_table_addr);
-#else
-  CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
-  CardTable* ct = ctbs->card_table();
-  if ((((uintptr_t)ct->byte_map_base() & 0xff) == 0)) {
-    // Card table is aligned so the lowest byte of the table address base is zero.
-    // This works only if the code is not saved for later use, possibly
-    // in a context where the base would no longer be aligned.
-    strb(card_table_base, card_table_addr);
-  } else {
-    mov(tmp, 0);
-    strb(tmp, card_table_addr);
-  }
-#endif // AARCH64
-}
-
 //////////////////////////////////////////////////////////////////////////////////
 
 
--- a/src/hotspot/cpu/arm/interp_masm_arm.hpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/interp_masm_arm.hpp	Wed May 02 19:26:42 2018 +0200
@@ -144,11 +144,6 @@
   // load cpool->resolved_klass_at(index); Rtemp is corrupted upon return
   void load_resolved_klass_at_offset(Register Rcpool, Register Rindex, Register Rklass);
 
-  void store_check_part1(Register card_table_base);                // Sets card_table_base register.
-  void store_check_part2(Register obj, Register card_table_base, Register tmp);
-
-  void set_card(Register card_table_base, Address card_table_addr, Register tmp);
-
   void pop_ptr(Register r);
   void pop_i(Register r = R0_tos);
 #ifdef AARCH64
--- a/src/hotspot/cpu/arm/macroAssembler_arm.cpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/macroAssembler_arm.cpp	Wed May 02 19:26:42 2018 +0200
@@ -31,6 +31,7 @@
 #include "compiler/disassembler.hpp"
 #include "gc/shared/barrierSet.hpp"
 #include "gc/shared/cardTable.hpp"
+#include "gc/shared/barrierSetAssembler.hpp"
 #include "gc/shared/cardTableBarrierSet.hpp"
 #include "gc/shared/collectedHeap.inline.hpp"
 #include "interpreter/interpreter.hpp"
@@ -44,12 +45,6 @@
 #include "runtime/sharedRuntime.hpp"
 #include "runtime/stubRoutines.hpp"
 #include "utilities/macros.hpp"
-#if INCLUDE_ALL_GCS
-#include "gc/g1/g1BarrierSet.hpp"
-#include "gc/g1/g1CardTable.hpp"
-#include "gc/g1/g1ThreadLocalData.hpp"
-#include "gc/g1/heapRegion.hpp"
-#endif
 
 // Implementation of AddressLiteral
 
@@ -2131,22 +2126,15 @@
   cbz(value, done);             // Use NULL as-is.
   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
   tbz(value, 0, not_weak);      // Test for jweak tag.
+
   // Resolve jweak.
-  ldr(value, Address(value, -JNIHandles::weak_tag_value));
-  verify_oop(value);
-#if INCLUDE_ALL_GCS
-  if (UseG1GC) {
-    g1_write_barrier_pre(noreg, // store_addr
-                         noreg, // new_val
-                         value, // pre_val
-                         tmp1,  // tmp1
-                         tmp2); // tmp2
-    }
-#endif // INCLUDE_ALL_GCS
+  access_load_at(T_OBJECT, IN_ROOT | ON_PHANTOM_OOP_REF,
+                 Address(value, -JNIHandles::weak_tag_value), value, tmp1, tmp2, noreg);
   b(done);
   bind(not_weak);
   // Resolve (untagged) jobject.
-  ldr(value, Address(value));
+  access_load_at(T_OBJECT, IN_ROOT | IN_CONCURRENT_ROOT,
+                 Address(value, 0), value, tmp1, tmp2, noreg);
   verify_oop(value);
   bind(done);
 }
@@ -2154,183 +2142,6 @@
 
 //////////////////////////////////////////////////////////////////////////////////
 
-#if INCLUDE_ALL_GCS
-
-// G1 pre-barrier.
-// Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
-// If store_addr != noreg, then previous value is loaded from [store_addr];
-// in such case store_addr and new_val registers are preserved;
-// otherwise pre_val register is preserved.
-void MacroAssembler::g1_write_barrier_pre(Register store_addr,
-                                          Register new_val,
-                                          Register pre_val,
-                                          Register tmp1,
-                                          Register tmp2) {
-  Label done;
-  Label runtime;
-
-  if (store_addr != noreg) {
-    assert_different_registers(store_addr, new_val, pre_val, tmp1, tmp2, noreg);
-  } else {
-    assert (new_val == noreg, "should be");
-    assert_different_registers(pre_val, tmp1, tmp2, noreg);
-  }
-
-  Address in_progress(Rthread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()));
-  Address index(Rthread, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset()));
-  Address buffer(Rthread, in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset()));
-
-  // Is marking active?
-  assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "adjust this code");
-  ldrb(tmp1, in_progress);
-  cbz(tmp1, done);
-
-  // Do we need to load the previous value?
-  if (store_addr != noreg) {
-    load_heap_oop(pre_val, Address(store_addr, 0));
-  }
-
-  // Is the previous value null?
-  cbz(pre_val, done);
-
-  // Can we store original value in the thread's buffer?
-  // Is index == 0?
-  // (The index field is typed as size_t.)
-
-  ldr(tmp1, index);           // tmp1 := *index_adr
-  ldr(tmp2, buffer);
-
-  subs(tmp1, tmp1, wordSize); // tmp1 := tmp1 - wordSize
-  b(runtime, lt);             // If negative, goto runtime
-
-  str(tmp1, index);           // *index_adr := tmp1
-
-  // Record the previous value
-  str(pre_val, Address(tmp2, tmp1));
-  b(done);
-
-  bind(runtime);
-
-  // save the live input values
-#ifdef AARCH64
-  if (store_addr != noreg) {
-    raw_push(store_addr, new_val);
-  } else {
-    raw_push(pre_val, ZR);
-  }
-#else
-  if (store_addr != noreg) {
-    // avoid raw_push to support any ordering of store_addr and new_val
-    push(RegisterSet(store_addr) | RegisterSet(new_val));
-  } else {
-    push(pre_val);
-  }
-#endif // AARCH64
-
-  if (pre_val != R0) {
-    mov(R0, pre_val);
-  }
-  mov(R1, Rthread);
-
-  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), R0, R1);
-
-#ifdef AARCH64
-  if (store_addr != noreg) {
-    raw_pop(store_addr, new_val);
-  } else {
-    raw_pop(pre_val, ZR);
-  }
-#else
-  if (store_addr != noreg) {
-    pop(RegisterSet(store_addr) | RegisterSet(new_val));
-  } else {
-    pop(pre_val);
-  }
-#endif // AARCH64
-
-  bind(done);
-}
-
-// G1 post-barrier.
-// Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
-void MacroAssembler::g1_write_barrier_post(Register store_addr,
-                                           Register new_val,
-                                           Register tmp1,
-                                           Register tmp2,
-                                           Register tmp3) {
-
-  Address queue_index(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()));
-  Address buffer(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()));
-
-  BarrierSet* bs = BarrierSet::barrier_set();
-  CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(bs);
-  CardTable* ct = ctbs->card_table();
-  Label done;
-  Label runtime;
-
-  // Does store cross heap regions?
-
-  eor(tmp1, store_addr, new_val);
-#ifdef AARCH64
-  logical_shift_right(tmp1, tmp1, HeapRegion::LogOfHRGrainBytes);
-  cbz(tmp1, done);
-#else
-  movs(tmp1, AsmOperand(tmp1, lsr, HeapRegion::LogOfHRGrainBytes));
-  b(done, eq);
-#endif
-
-  // crosses regions, storing NULL?
-
-  cbz(new_val, done);
-
-  // storing region crossing non-NULL, is card already dirty?
-  const Register card_addr = tmp1;
-  assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code");
-
-  mov_address(tmp2, (address)ct->byte_map_base(), symbolic_Relocation::card_table_reference);
-  add(card_addr, tmp2, AsmOperand(store_addr, lsr, CardTable::card_shift));
-
-  ldrb(tmp2, Address(card_addr));
-  cmp(tmp2, (int)G1CardTable::g1_young_card_val());
-  b(done, eq);
-
-  membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp2);
-
-  assert(CardTable::dirty_card_val() == 0, "adjust this code");
-  ldrb(tmp2, Address(card_addr));
-  cbz(tmp2, done);
-
-  // storing a region crossing, non-NULL oop, card is clean.
-  // dirty card and log.
-
-  strb(zero_register(tmp2), Address(card_addr));
-
-  ldr(tmp2, queue_index);
-  ldr(tmp3, buffer);
-
-  subs(tmp2, tmp2, wordSize);
-  b(runtime, lt); // go to runtime if now negative
-
-  str(tmp2, queue_index);
-
-  str(card_addr, Address(tmp3, tmp2));
-  b(done);
-
-  bind(runtime);
-
-  if (card_addr != R0) {
-    mov(R0, card_addr);
-  }
-  mov(R1, Rthread);
-  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), R0, R1);
-
-  bind(done);
-}
-
-#endif // INCLUDE_ALL_GCS
-
-//////////////////////////////////////////////////////////////////////////////////
-
 #ifdef AARCH64
 
 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
@@ -2873,38 +2684,39 @@
 #endif // AARCH64
 
 
-void MacroAssembler::load_heap_oop(Register dst, Address src) {
-#ifdef AARCH64
-  if (UseCompressedOops) {
-    ldr_w(dst, src);
-    decode_heap_oop(dst);
-    return;
+void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, Register tmp2, Register tmp3, DecoratorSet decorators) {
+  access_load_at(T_OBJECT, IN_HEAP | decorators, src, dst, tmp1, tmp2, tmp3);
+}
+
+// Blows src and flags.
+void MacroAssembler::store_heap_oop(Address obj, Register new_val, Register tmp1, Register tmp2, Register tmp3, DecoratorSet decorators) {
+  access_store_at(T_OBJECT, IN_HEAP | decorators, obj, new_val, tmp1, tmp2, tmp3, false);
+}
+
+void MacroAssembler::store_heap_oop_null(Address obj, Register new_val, Register tmp1, Register tmp2, Register tmp3, DecoratorSet decorators) {
+  access_store_at(T_OBJECT, IN_HEAP, obj, new_val, tmp1, tmp2, tmp3, true);
+}
+
+void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
+                                    Address src, Register dst, Register tmp1, Register tmp2, Register tmp3) {
+  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
+  bool as_raw = (decorators & AS_RAW) != 0;
+  if (as_raw) {
+    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
+  } else {
+    bs->load_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
   }
-#endif // AARCH64
-  ldr(dst, src);
 }
 
-// Blows src and flags.
-void MacroAssembler::store_heap_oop(Register src, Address dst) {
-#ifdef AARCH64
-  if (UseCompressedOops) {
-    assert(!dst.uses(src), "not enough registers");
-    encode_heap_oop(src);
-    str_w(src, dst);
-    return;
+void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
+                                     Address obj, Register new_val, Register tmp1, Register tmp2, Register tmp3, bool is_null) {
+  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
+  bool as_raw = (decorators & AS_RAW) != 0;
+  if (as_raw) {
+    bs->BarrierSetAssembler::store_at(this, decorators, type, obj, new_val, tmp1, tmp2, tmp3, is_null);
+  } else {
+    bs->store_at(this, decorators, type, obj, new_val, tmp1, tmp2, tmp3, is_null);
   }
-#endif // AARCH64
-  str(src, dst);
-}
-
-void MacroAssembler::store_heap_oop_null(Register src, Address dst) {
-#ifdef AARCH64
-  if (UseCompressedOops) {
-    str_w(src, dst);
-    return;
-  }
-#endif // AARCH64
-  str(src, dst);
 }
 
 
--- a/src/hotspot/cpu/arm/macroAssembler_arm.hpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/macroAssembler_arm.hpp	Wed May 02 19:26:42 2018 +0200
@@ -401,27 +401,6 @@
 
   void resolve_jobject(Register value, Register tmp1, Register tmp2);
 
-#if INCLUDE_ALL_GCS
-  // G1 pre-barrier.
-  // Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
-  // If store_addr != noreg, then previous value is loaded from [store_addr];
-  // in such case store_addr and new_val registers are preserved;
-  // otherwise pre_val register is preserved.
-  void g1_write_barrier_pre(Register store_addr,
-                            Register new_val,
-                            Register pre_val,
-                            Register tmp1,
-                            Register tmp2);
-
-  // G1 post-barrier.
-  // Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
-  void g1_write_barrier_post(Register store_addr,
-                             Register new_val,
-                             Register tmp1,
-                             Register tmp2,
-                             Register tmp3);
-#endif // INCLUDE_ALL_GCS
-
 #ifndef AARCH64
   void nop() {
     mov(R0, R0);
@@ -1072,12 +1051,12 @@
 
     // oop manipulations
 
-  void load_heap_oop(Register dst, Address src);
-  void store_heap_oop(Register src, Address dst);
-  void store_heap_oop(Address dst, Register src) {
-    store_heap_oop(src, dst);
-  }
-  void store_heap_oop_null(Register src, Address dst);
+  void load_heap_oop(Register dst, Address src, Register tmp1 = noreg, Register tmp2 = noreg, Register tmp3 = noreg, DecoratorSet decorators = 0);
+  void store_heap_oop(Address obj, Register new_val, Register tmp1 = noreg, Register tmp2 = noreg, Register tmp3 = noreg, DecoratorSet decorators = 0);
+  void store_heap_oop_null(Address obj, Register new_val, Register tmp1 = noreg, Register tmp2 = noreg, Register tmp3 = noreg, DecoratorSet decorators = 0);
+
+  void access_load_at(BasicType type, DecoratorSet decorators, Address src, Register dst, Register tmp1, Register tmp2, Register tmp3);
+  void access_store_at(BasicType type, DecoratorSet decorators, Address obj, Register new_val, Register tmp1, Register tmp2, Register tmp3, bool is_null);
 
 #ifdef AARCH64
   void encode_heap_oop(Register dst, Register src);
--- a/src/hotspot/cpu/arm/stubGenerator_arm.cpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/stubGenerator_arm.cpp	Wed May 02 19:26:42 2018 +0200
@@ -3260,7 +3260,7 @@
     __ align(OptoLoopAlignment);
     __ BIND(store_element);
     if (UseCompressedOops) {
-      __ store_heap_oop(R5, Address(to, BytesPerHeapOop, post_indexed));  // store the oop, changes flags
+      __ store_heap_oop(Address(to, BytesPerHeapOop, post_indexed), R5);  // store the oop, changes flags
       __ subs_32(count,count,1);
     } else {
       __ subs_32(count,count,1);
--- a/src/hotspot/cpu/arm/templateInterpreterGenerator_arm.cpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/templateInterpreterGenerator_arm.cpp	Wed May 02 19:26:42 2018 +0200
@@ -852,80 +852,53 @@
 //
 
 address TemplateInterpreterGenerator::generate_Reference_get_entry(void) {
-#if INCLUDE_ALL_GCS
-  if (UseG1GC) {
-    // Code: _aload_0, _getfield, _areturn
-    // parameter size = 1
-    //
-    // The code that gets generated by this routine is split into 2 parts:
-    //    1. The "intrinsified" code for G1 (or any SATB based GC),
-    //    2. The slow path - which is an expansion of the regular method entry.
-    //
-    // Notes:-
-    // * In the G1 code we do not check whether we need to block for
-    //   a safepoint. If G1 is enabled then we must execute the specialized
-    //   code for Reference.get (except when the Reference object is null)
-    //   so that we can log the value in the referent field with an SATB
-    //   update buffer.
-    //   If the code for the getfield template is modified so that the
-    //   G1 pre-barrier code is executed when the current method is
-    //   Reference.get() then going through the normal method entry
-    //   will be fine.
-    // * The G1 code can, however, check the receiver object (the instance
-    //   of java.lang.Reference) and jump to the slow path if null. If the
-    //   Reference object is null then we obviously cannot fetch the referent
-    //   and so we don't need to call the G1 pre-barrier. Thus we can use the
-    //   regular method entry code to generate the NPE.
-    //
-    // This code is based on generate_accessor_enty.
-    //
-    // Rmethod: Method*
-    // Rthread: thread
-    // Rsender_sp: sender sp, must be preserved for slow path, set SP to it on fast path
-    // Rparams: parameters
+  // Code: _aload_0, _getfield, _areturn
+  // parameter size = 1
+  //
+  // The code that gets generated by this routine is split into 2 parts:
+  //    1. The "intrinsified" code performing an ON_WEAK_OOP_REF load,
+  //    2. The slow path - which is an expansion of the regular method entry.
+  //
+  // Notes:-
+  // * An intrinsic is always executed, where an ON_WEAK_OOP_REF load is performed.
+  // * We may jump to the slow path iff the receiver is null. If the
+  //   Reference object is null then we no longer perform an ON_WEAK_OOP_REF load
+  //   Thus we can use the regular method entry code to generate the NPE.
+  //
+  // Rmethod: Method*
+  // Rthread: thread
+  // Rsender_sp: sender sp, must be preserved for slow path, set SP to it on fast path
+  // Rparams: parameters
 
-    address entry = __ pc();
-    Label slow_path;
-    const Register Rthis = R0;
-    const Register Rret_addr = Rtmp_save1;
-    assert_different_registers(Rthis, Rret_addr, Rsender_sp);
+  address entry = __ pc();
+  Label slow_path;
+  const Register Rthis = R0;
+  const Register Rret_addr = Rtmp_save1;
+  assert_different_registers(Rthis, Rret_addr, Rsender_sp);
 
-    const int referent_offset = java_lang_ref_Reference::referent_offset;
-    guarantee(referent_offset > 0, "referent offset not initialized");
+  const int referent_offset = java_lang_ref_Reference::referent_offset;
+  guarantee(referent_offset > 0, "referent offset not initialized");
 
-    // Check if local 0 != NULL
-    // If the receiver is null then it is OK to jump to the slow path.
-    __ ldr(Rthis, Address(Rparams));
-    __ cbz(Rthis, slow_path);
+  // Check if local 0 != NULL
+  // If the receiver is null then it is OK to jump to the slow path.
+  __ ldr(Rthis, Address(Rparams));
+  __ cbz(Rthis, slow_path);
 
-    // Generate the G1 pre-barrier code to log the value of
-    // the referent field in an SATB buffer.
+  // Preserve LR
+  __ mov(Rret_addr, LR);
 
-    // Load the value of the referent field.
-    __ load_heap_oop(R0, Address(Rthis, referent_offset));
+  // Load the value of the referent field.
+  const Address field_address(Rthis, referent_offset);
+  __ load_heap_oop(R0, field_address, Rtemp, R1_tmp, R2_tmp, ON_WEAK_OOP_REF);
 
-    // Preserve LR
-    __ mov(Rret_addr, LR);
+  // _areturn
+  __ mov(SP, Rsender_sp);
+  __ ret(Rret_addr);
 
-    __ g1_write_barrier_pre(noreg,   // store_addr
-                            noreg,   // new_val
-                            R0,      // pre_val
-                            Rtemp,   // tmp1
-                            R1_tmp); // tmp2
-
-    // _areturn
-    __ mov(SP, Rsender_sp);
-    __ ret(Rret_addr);
-
-    // generate a vanilla interpreter entry as the slow path
-    __ bind(slow_path);
-    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals));
-    return entry;
-  }
-#endif // INCLUDE_ALL_GCS
-
-  // If G1 is not enabled then attempt to go through the normal entry point
-  return NULL;
+  // generate a vanilla interpreter entry as the slow path
+  __ bind(slow_path);
+  __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals));
+  return entry;
 }
 
 // Not supported
--- a/src/hotspot/cpu/arm/templateTable_arm.cpp	Wed May 02 10:48:48 2018 -0700
+++ b/src/hotspot/cpu/arm/templateTable_arm.cpp	Wed May 02 19:26:42 2018 +0200
@@ -24,6 +24,7 @@
 
 #include "precompiled.hpp"
 #include "asm/macroAssembler.hpp"
+#include "gc/shared/barrierSetAssembler.hpp"
 #include "interpreter/interp_masm.hpp"
 #include "interpreter/interpreter.hpp"
 #include "interpreter/interpreterRuntime.hpp"
@@ -187,72 +188,24 @@
                          Register tmp1,
                          Register tmp2,
                          Register tmp3,
-                         BarrierSet::Name barrier,
-                         bool precise,
-                         bool is_null) {
+                         bool is_null,
+                         DecoratorSet decorators = 0) {
 
   assert_different_registers(obj.base(), new_val, tmp1, tmp2, tmp3, noreg);
-  switch (barrier) {
-#if INCLUDE_ALL_GCS
-    case BarrierSet::G1BarrierSet:
-      {
-        // flatten object address if needed
-        assert (obj.mode() == basic_offset, "pre- or post-indexing is not supported here");
-
-        const Register store_addr = obj.base();
-        if (obj.index() != noreg) {
-          assert (obj.disp() == 0, "index or displacement, not both");
-#ifdef AARCH64
-          __ add(store_addr, obj.base(), obj.index(), obj.extend(), obj.shift_imm());
-#else
-          assert(obj.offset_op() == add_offset, "addition is expected");
-          __ add(store_addr, obj.base(), AsmOperand(obj.index(), obj.shift(), obj.shift_imm()));
-#endif // AARCH64
-        } else if (obj.disp() != 0) {
-          __ add(store_addr, obj.base(), obj.disp());
-        }
-
-        __ g1_write_barrier_pre(store_addr, new_val, tmp1, tmp2, tmp3);
-        if (is_null) {
-          __ store_heap_oop_null(new_val, Address(store_addr));
-        } else {
-          // G1 barrier needs uncompressed oop for region cross check.
-          Register val_to_store = new_val;
-          if (UseCompressedOops) {
-            val_to_store = tmp1;
-            __ mov(val_to_store, new_val);
-          }
-          __ store_heap_oop(val_to_store, Address(store_addr)); // blows val_to_store:
-          val_to_store = noreg;
-          __ g1_write_barrier_post(store_addr, new_val, tmp1, tmp2, tmp3);
-        }
-      }
-      break;
-#endif // INCLUDE_ALL_GCS
-    case BarrierSet::CardTableBarrierSet:
-      {
-        if (is_null) {
-          __ store_heap_oop_null(new_val, obj);
-        } else {
-          assert (!precise || (obj.index() == noreg && obj.disp() == 0),
-                  "store check address should be calculated beforehand");
-
-          __ store_check_part1(tmp1);
-          __ store_heap_oop(new_val, obj); // blows new_val:
-          new_val = noreg;
-          __ store_check_part2(obj.base(), tmp1, tmp2);
-        }
-      }
-      break;
-    case BarrierSet::ModRef:
-      ShouldNotReachHere();
-      break;
-    default:
-      ShouldNotReachHere();
-      break;
+  if (is_null) {
+    __ store_heap_oop_null(obj, new_val, tmp1, tmp2, tmp3, decorators);
+  } else {
+    __ store_heap_oop(obj, new_val, tmp1, tmp2, tmp3, decorators);
   }
 }
 
+static void do_oop_load(InterpreterMacroAssembler* _masm,
+                        Register dst,
+                        Address obj,
+                        DecoratorSet decorators = 0) {
+  __ load_heap_oop(dst, obj, noreg, noreg, noreg, decorators);
+}
+
 Address TemplateTable::at_bcp(int offset) {
   assert(_desc->uses_bcp(), "inconsistent uses_bcp information");
   return Address(Rbcp, offset);
@@ -863,7 +816,7 @@
   const Register Rindex = R0_tos;
 
   index_check(Rarray, Rindex);
-  __ load_heap_oop(R0_tos, get_array_elem_addr(T_OBJECT, Rarray, Rindex, Rtemp));
+  do_oop_load(_masm, R0_tos, get_array_elem_addr(T_OBJECT, Rarray, Rindex, Rtemp), IN_HEAP_ARRAY);
 }
 
 
@@ -1248,7 +1201,7 @@
   __ add(Raddr_1, Raddr_1, AsmOperand(Rindex_4, lsl, LogBytesPerHeapOop));
 
   // Now store using the appropriate barrier
-  do_oop_store(_masm, Raddr_1, Rvalue_2, Rtemp, R0_tmp, R3_tmp, _bs->kind(), true, false);
+  do_oop_store(_masm, Raddr_1, Rvalue_2, Rtemp, R0_tmp, R3_tmp, false, IN_HEAP_ARRAY);
   __ b(done);
 
   __ bind(throw_array_store);
@@ -1264,7 +1217,7 @@
   __ profile_null_seen(R0_tmp);
 
   // Store a NULL
-  do_oop_store(_masm, Address::indexed_oop(Raddr_1, Rindex_4), Rvalue_2, Rtemp, R0_tmp, R3_tmp, _bs->kind(), true, true);
+  do_oop_store(_masm, Address::indexed_oop(Raddr_1, Rindex_4), Rvalue_2, Rtemp, R0_tmp, R3_tmp, true, IN_HEAP_ARRAY);
 
   // Pop stack arguments
   __ bind(done);
@@ -3286,7 +3239,7 @@
     // atos case for AArch64 and slow version on 32-bit ARM
     if(!atos_merged_with_itos) {
       __ bind(Latos);
-      __ load_heap_oop(R0_tos, Address(Robj, Roffset));
+      do_oop_load(_masm, R0_tos, Address(Robj, Roffset));
       __ push(atos);
       // Rewrite bytecode to be faster
       if (!is_static && rc == may_rewrite) {
@@ -3638,7 +3591,7 @@
     __ pop(atos);
     if (!is_static) pop_and_check_object(Robj);
     // Store into the field
-    do_oop_store(_masm, Address(Robj, Roffset), R0_tos, Rtemp, R1_tmp, R5_tmp, _bs->kind(), false, false);
+    do_oop_store(_masm, Address(Robj, Roffset), R0_tos, Rtemp, R1_tmp, R5_tmp, false);
     if (!is_static && rc == may_rewrite) {
       patch_bytecode(Bytecodes::_fast_aputfield, R0_tmp, Rtemp, true, byte_no);
     }
@@ -3816,7 +3769,7 @@
 #endif // AARCH64
 
     case Bytecodes::_fast_aputfield:
-      do_oop_store(_masm, Address(Robj, Roffset), R0_tos, Rtemp, R1_tmp, R2_tmp, _bs->kind(), false, false);
+      do_oop_store(_masm, Address(Robj, Roffset), R0_tos, Rtemp, R1_tmp, R2_tmp, false);
       break;
 
     default:
@@ -3912,7 +3865,7 @@
     case Bytecodes::_fast_dgetfield: __ add(Roffset, Robj, Roffset); __ fldd(D0_tos, Address(Roffset)); break;
 #endif // __SOFTFP__
 #endif // AARCH64
-    case Bytecodes::_fast_agetfield: __ load_heap_oop(R0_tos, Address(Robj, Roffset)); __ verify_oop(R0_tos); break;
+    case Bytecodes::_fast_agetfield: do_oop_load(_masm, R0_tos, Address(Robj, Roffset)); __ verify_oop(R0_tos); break;
     default:
       ShouldNotReachHere();
   }
@@ -3992,7 +3945,7 @@
   if (state == itos) {
     __ ldr_s32(R0_tos, Address(Robj, Roffset));
   } else if (state == atos) {
-    __ load_heap_oop(R0_tos, Address(Robj, Roffset));
+    do_oop_load(_masm, R0_tos, Address(Robj, Roffset));
     __ verify_oop(R0_tos);
   } else if (state == ftos) {
 #ifdef AARCH64