changeset 55994:f6b2c8d757e7 fibers

Merge
author rpressler
date Fri, 19 Jul 2019 14:44:35 +0100
parents 423f4ecbf972 3d74bc8cc177
children eefca691e5d9
files src/hotspot/share/runtime/continuation.hpp
diffstat 21 files changed, 1139 insertions(+), 407 deletions(-) [+]
line wrap: on
line diff
--- a/src/hotspot/cpu/x86/continuation_x86.inline.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/cpu/x86/continuation_x86.inline.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -25,6 +25,7 @@
 #ifndef CPU_X86_CONTINUATION_X86_INLINE_HPP
 #define CPU_X86_CONTINUATION_X86_INLINE_HPP
 
+#include "compiler/oopMapStubGenerator.hpp"
 #include "runtime/frame.hpp"
 #include "runtime/frame.inline.hpp"
 
@@ -53,68 +54,205 @@
 //   print_vframe(thread->last_frame());
 // }
 
+#ifdef CONT_DOUBLE_NOP
+
+template<typename FrameT>
+__COLD NOINLINE static CachedCompiledMetadata patch_nop(NativePostCallNop* nop, const FrameT& f) {
+  f.get_cb();
+  f.oop_map();
+  assert(f.cb() != NULL && f.cb()->is_compiled() && f.oop_map() != NULL, "");
+  int fsize   = Compiled::size(f);
+  int oops    = Compiled::num_oops(f);
+  int argsize = Compiled::stack_argsize(f);
+
+  CachedCompiledMetadata md(fsize, oops, argsize);
+  if (!md.empty() && !f.cb()->as_compiled_method()->has_monitors()) {
+    nop->patch(md.int1(), 1); 
+    assert(nop->is_mode2(), "");
+  } else {
+    // TODO (R): if patching failed (metadata too large or method has monitors), prevent repeated patch attempts on this nop?
+  }
+  return md;
+}
+
+template<typename FrameT>
+__COLD NOINLINE void ContinuationHelper::patch_freeze_stub(const FrameT& f, address freeze_stub) {
+  assert(f.cb() != NULL && f.cb()->is_compiled() && f.oop_map() != NULL, "");
+  NativePostCallNop* nop = nativePostCallNop_unsafe_at(f.pc());
+  if (freeze_stub != NULL && nop->is_mode2()) {
+    intptr_t ptr = nop->int2_data();
+    if (ptr == 1) {
+      nop->patch_int2(OopMapStubGenerator::stub_to_offset((address)freeze_stub));
+    }
+  }
+}
+
+inline CachedCompiledMetadata ContinuationHelper::cached_metadata(address pc) {
+  NativePostCallNop* nop = nativePostCallNop_unsafe_at(pc);
+  if (LIKELY(nop->is_mode2())) {
+    return CachedCompiledMetadata(nop->int1_data());
+  } else {
+    return CachedCompiledMetadata(0);
+  }
+}
+
+template<op_mode mode, typename FrameT>
+inline CachedCompiledMetadata ContinuationHelper::cached_metadata(const FrameT& f) {
+  if (mode == mode_preempt) return CachedCompiledMetadata(0);
+
+  NativePostCallNop* nop = nativePostCallNop_unsafe_at(f.pc());
+  assert (!nop->is_mode2() || slow_get_cb(f)->is_compiled(), "");
+  if (LIKELY(nop->is_mode2())) {
+    // tty->print_cr(">>> PATCHED 33 -- %d", !md.empty());
+    return CachedCompiledMetadata(nop->int1_data());
+  } else {
+    return patch_nop(nop, f);
+  }
+}
+#endif
+
+template<op_mode mode, typename FrameT>
+FreezeFnT ContinuationHelper::freeze_stub(const FrameT& f) {
+  // static int __counter = 0;
+#ifdef CONT_DOUBLE_NOP
+  if (mode != mode_preempt) {
+    NativePostCallNop* nop = nativePostCallNop_unsafe_at(f.pc());
+    uint32_t ptr = nop->int2_data();
+    if (LIKELY(ptr > (uint32_t)1)) {
+      return (FreezeFnT)OopMapStubGenerator::offset_to_stub(ptr);
+    }
+    assert (ptr == 0 || ptr == 1, "");
+    if (f.cb() == NULL) return NULL; // f.get_cb();
+
+    // __counter++;
+    // if (__counter % 100 == 0) tty->print_cr(">>>> freeze_stub %d %d", ptr, __counter);
+    // if (mode == mode_fast) { 
+    //   tty->print_cr(">>>> freeze_stub"); f.print_on(tty); tty->print_cr("<<<< freeze_stub"); 
+    //   assert(false, "");
+    // }
+  }
+#endif
+
+  FreezeFnT f_fn = (FreezeFnT)f.oop_map()->freeze_stub();
+  if ((void*)f_fn == (void*)f.oop_map()) {
+    f_fn = NULL; // need CompressedOops for now ????
+  }
+#ifdef CONT_DOUBLE_NOP
+  // we currently patch explicitly, based on ConfigT etc.
+  // if (LIKELY(nop != NULL && f_fn != NULL && !nop->is_mode2())) {
+  //   nop->patch_int2(OopMapStubGenerator::stub_to_offset((address)f_fn));
+  // }
+#endif
+  return f_fn;
+}
+
+template<op_mode mode, typename FrameT>
+ThawFnT ContinuationHelper::thaw_stub(const FrameT& f) {
+#ifdef CONT_DOUBLE_NOP
+  if (mode != mode_preempt) {
+    NativePostCallNop* nop = nativePostCallNop_unsafe_at(f.pc());
+    uint32_t ptr = nop->int2_data();
+    if (LIKELY(ptr > (uint32_t)1)) {
+      address freeze_stub = OopMapStubGenerator::offset_to_stub(ptr);
+      address thaw_stub = OopMapStubGenerator::thaw_stub(freeze_stub);
+      if (f.cb() == NULL) { // TODO PERF: this is only necessary for new_frame called from thaw, because we need cb for deopt info
+        CodeBlob* cb = OopMapStubGenerator::code_blob(thaw_stub);
+        assert (cb == slow_get_cb(f), "");
+        const_cast<FrameT&>(f).set_cb(cb);
+      }
+      assert (f.cb() != NULL, "");
+      return (ThawFnT)thaw_stub;
+    }
+    assert (ptr == 0 || ptr == 1, "");
+    if (f.cb() == NULL) return NULL; // f.get_cb();
+  }
+#endif
+  ThawFnT t_fn = (ThawFnT)f.oop_map()->thaw_stub();
+  if ((void*)t_fn == (void*)f.oop_map()) {
+    t_fn = NULL; // need CompressedOops for now ????
+  }
+  return t_fn;
+}
+
 inline bool hframe::operator==(const hframe& other) const {
     return  HFrameBase::operator==(other) && _fp == other._fp;
 }
 
-inline void hframe::patch_real_fp_offset(int offset, intptr_t value) {
-  intptr_t* addr = (link_address() + offset);
-  *(link_address() + offset) = value;
-}
-
-template<> 
-inline intptr_t* hframe::link_address<Interpreted>(int sp, intptr_t fp, const CodeBlob* cb, const ContMirror& cont) {
-  assert (cont.valid_stack_index(fp), "fp: %ld stack_length: %d", fp, cont.stack_length());
-  return &cont.stack_address(fp)[frame::link_offset];
-}
-
-template<typename FKind> 
-inline intptr_t* hframe::link_address(int sp, intptr_t fp, const CodeBlob* cb, const ContMirror& cont) {
-  assert (cont.valid_stack_index(sp), "sp: %d stack_length: %d", sp, cont.stack_length());
-  assert (cb != NULL, "must be");
-  return (cont.stack_address(sp) + cb->frame_size()) - frame::sender_sp_offset;
-}
-
-template<typename FKind>
-inline void hframe::set_link_address(const ContMirror& cont) {
-  assert (FKind::is_instance(*this), "");
-  _link_address = link_address<FKind>(_sp, _fp, _cb, cont);
-}
-
-inline void hframe::set_link_address(const ContMirror& cont) {
-  _is_interpreted ? set_link_address<Interpreted>(cont) : set_link_address<NonInterpretedUnknown>(cont);
+intptr_t* hframe::interpreted_link_address(intptr_t fp, const ContMirror& cont) {
+  return cont.stack_address((int)fp + (frame::link_offset << LogElemsPerWord));
 }
 
 template<typename FKind>
 inline address* hframe::return_pc_address() const {
-  assert (FKind::is_instance(*this), "");
-  // for compiled frames, link_address = real_fp - frame::sender_sp_offset
-  return (address*)&link_address()[frame::return_addr_offset];
+  assert (FKind::interpreted, "");
+  return (address*)&interpreted_link_address()[frame::return_addr_offset];
 }
 
-inline int hframe::link_index(const ContMirror& cont) const {
-  return cont.stack_index(link_address());
+const CodeBlob* hframe::get_cb() const {
+  if (_cb_imd == NULL) {
+    int slot;
+    _cb_imd = CodeCache::find_blob_and_oopmap(_pc, slot);
+    if (_oop_map == NULL && slot >= 0) {
+      _oop_map = ((CodeBlob*)_cb_imd)->oop_map_for_slot(slot, _pc);
+    }
+  }
+  return (CodeBlob*)_cb_imd;
 }
 
-inline void hframe::patch_link_relative(intptr_t* fp) {
-  intptr_t* la = link_address();
-  intptr_t new_value = ContMirror::to_index((address)fp - (address)la);
+const ImmutableOopMap* hframe::get_oop_map() const {
+  if (_cb_imd == NULL) return NULL;
+  if (((CodeBlob*)_cb_imd)->oop_maps() != NULL) {
+    NativePostCallNop* nop = nativePostCallNop_at(_pc);
+    if (nop != NULL &&
+#ifdef CONT_DOUBLE_NOP
+      !nop->is_mode2() &&
+#endif
+      nop->displacement() != 0
+    ) {
+      int slot = ((nop->displacement() >> 24) & 0xff);
+      // tty->print_cr("hframe::get_oop_map slot: %d", slot);
+      return ((CodeBlob*)_cb_imd)->oop_map_for_slot(slot, _pc);
+    }
+    const ImmutableOopMap* oop_map = OopMapSet::find_map(cb(), pc());
+    return oop_map;
+  }
+  return NULL;
+}
+
+intptr_t* hframe::interpreter_frame_metadata_at(int offset) const {
+  return interpreted_link_address() + offset;
+}
+
+inline void hframe::patch_interpreter_metadata_offset(int offset, intptr_t value) {
+  *interpreter_frame_metadata_at(offset) = value;
+}
+
+inline void hframe::patch_interpreted_link(intptr_t value) {
+  intptr_t* la = interpreted_link_address();
+  log_develop_trace(jvmcont)("patch_interpreted_link patching link at %ld to %ld", _fp, value);
+  *la = value;
+}
+
+inline void hframe::patch_interpreted_link_relative(intptr_t fp) {
+  intptr_t* la = interpreted_link_address();
+  intptr_t new_value = fp - _fp;
+  log_develop_trace(jvmcont)("patch_interpreted_link_relative patching link at %ld to %ld", _fp, new_value);
   // assert (new_value == cont.stack_index(fp) - link_index(cont), "res: %d index delta: %d", new_value, cont.stack_index(fp) - link_index(cont));
   *la = new_value;
 }
 
 inline void hframe::patch_sender_sp_relative(intptr_t* value) {
   assert (_is_interpreted, "");
-  intptr_t* fp_address = link_address();
+  intptr_t* fp_address = interpreted_link_address();
   intptr_t* la = &fp_address[frame::interpreter_frame_sender_sp_offset];
   *la = ContMirror::to_index((address)value - (address)fp_address); // all relative indices are relative to fp
 }
 
 void hframe::interpreted_frame_oop_map(InterpreterOopMap* mask) const {
   assert (_is_interpreted, "");
-  Method* method = *(Method**)interpreter_frame_metadata_at(frame::interpreter_frame_method_offset);
-  int bci = method->bci_from(*(address*)interpreter_frame_metadata_at(frame::interpreter_frame_bcp_offset));
-  method->mask_for(bci, mask);
+  Method* m = method<Interpreted>();
+  int bci = m->bci_from(*(address*)interpreter_frame_metadata_at(frame::interpreter_frame_bcp_offset));
+  m->mask_for(bci, mask);
 }
 
 int hframe::interpreted_frame_num_monitors() const {
@@ -128,7 +266,7 @@
     interpreted_frame_oop_map(&mask);
     int top_offset = *(int*)interpreter_frame_metadata_at(frame::interpreter_frame_initial_sp_offset);
     int expression_stack_size = mask.expression_stack_size();
-    int index = _fp + top_offset - (expression_stack_size*elemsPerWord);
+    int index = _fp + top_offset - (expression_stack_size << LogElemsPerWord);
     return index;
   }
 #endif
@@ -140,14 +278,10 @@
     int bottom_offset = *(int*)interpreter_frame_metadata_at(frame::interpreter_frame_locals_offset) + (1*elemsPerWord); // exclusive, so we add 1 word
     return _fp + bottom_offset;
   } else {
-    return _sp + cb()->frame_size()*elemsPerWord;
+    return _sp + (cb()->frame_size() << LogElemsPerWord);
   }
 }
 
-intptr_t* hframe::interpreter_frame_metadata_at(int offset) const {
-  return link_address() + offset;
-}
-
 address hframe::interpreter_frame_bcp() const {
   address bcp;
   bcp = (address)*interpreter_frame_metadata_at(frame::interpreter_frame_bcp_offset);
@@ -156,7 +290,7 @@
 }
 
 intptr_t* hframe::interpreter_frame_local_at(int index) const {
-  intptr_t* fp = link_address();
+  intptr_t* fp = interpreted_link_address();
   const int n = Interpreter::local_offset_in_bytes(index)/wordSize;
   intptr_t* locals = (intptr_t*)((address)fp + ContMirror::to_bytes(*(intptr_t*)(fp + frame::interpreter_frame_locals_offset)));
   intptr_t* loc = &(locals[n]); // derelativize
@@ -166,7 +300,7 @@
 }
 
 intptr_t* hframe::interpreter_frame_expression_stack_at(int offset) const {
-  intptr_t* fp = link_address();
+  intptr_t* fp = interpreted_link_address();
   intptr_t* monitor_end = (intptr_t*)((address)fp + ContMirror::to_bytes(*(intptr_t*)(fp + frame::interpreter_frame_monitor_block_top_offset))); // derelativize
   intptr_t* expression_stack = monitor_end-1;
 
@@ -175,41 +309,86 @@
   return &(expression_stack[n]);
 }
 
+inline int hframe::callee_link_index() const {
+  return _sp - (frame::sender_sp_offset << LogElemsPerWord);
+}
+
+inline void hframe::patch_callee_link(intptr_t value, const ContMirror& cont) const {
+  *cont.stack_address(callee_link_index()) = value;
+}
+
+inline void hframe::patch_callee_link_relative(intptr_t fp, const ContMirror& cont) const {
+  int index = callee_link_index();
+  intptr_t* la = cont.stack_address(index);
+  intptr_t new_value = fp - index;
+  // assert (new_value == cont.stack_index(fp) - link_index(cont), "res: %d index delta: %d", new_value, cont.stack_index(fp) - link_index(cont));
+  *la = new_value;
+}
+
+inline int hframe::pc_index() const {
+  return _sp - (frame::return_addr_offset << LogElemsPerWord);
+}
+
+inline address hframe::real_pc(const ContMirror& cont) const {
+  return *(address*)cont.stack_address(pc_index());
+}
+
 template<typename FKind, op_mode mode>
 hframe hframe::sender(const ContMirror& cont, int num_oops) const {
   // tty->print_cr(">> sender of:");
   // print_on(cont, tty);
 
+  int sender_ref_sp = _ref_sp + num_oops;
+
+#ifdef CONT_DOUBLE_NOP
+  CachedCompiledMetadata md;
+  if (mode == mode_fast && LIKELY(!(md = ContinuationHelper::cached_metadata<mode>(*this)).empty())) {
+    int sender_sp = _sp + (md.size_words() << LogElemsPerWord);
+    assert (sender_sp > _sp, "");
+    if (sender_sp >= cont.stack_length())
+      return hframe();
+
+    int link_index = sender_sp - (frame::sender_sp_offset << LogElemsPerWord);
+    intptr_t sender_fp = *cont.stack_address(link_index);
+    address sender_pc  = (address)*cont.stack_address(link_index + (frame::return_addr_offset << LogElemsPerWord));
+    assert (mode != mode_fast || !Interpreter::contains(sender_pc), "");
+    return hframe(sender_sp, sender_ref_sp, sender_fp, sender_pc, NULL, false);
+  }
+#endif
+
   int sender_sp = frame_bottom_index<FKind>();
-  int sender_ref_sp = _ref_sp + num_oops;
   assert (sender_sp > _sp, "");
+
   if (sender_sp >= cont.stack_length())
-    return hframe();
+    return hframe(sender_sp, sender_ref_sp, 0, NULL, NULL, false); // hframe()
 
-  address sender_pc = return_pc<FKind>();
+  int link_index = FKind::interpreted ? _fp
+                                      : sender_sp - (frame::sender_sp_offset << LogElemsPerWord);
+
+  intptr_t sender_fp = *cont.stack_address(link_index);
+  address sender_pc  = FKind::interpreted ? return_pc<Interpreted>()
+                                          : (address)*cont.stack_address(sender_sp - (frame::return_addr_offset << LogElemsPerWord));
+
   assert (mode != mode_fast || !Interpreter::contains(sender_pc), "");
   bool is_sender_interpreted = mode == mode_fast ? false : Interpreter::contains(sender_pc); 
-  CodeBlob* sender_cb;
 
-  intptr_t sender_fp = link();
-
+  void* sender_md;
   if (mode != mode_fast && is_sender_interpreted) {
-    sender_fp += link_index(cont);
-    sender_cb = NULL;
+    sender_fp += link_index;
+    sender_md = cont.stack_address(sender_fp + (frame::link_offset << LogElemsPerWord));
     sender_sp += FKind::interpreted ? 0 : compiled_frame_stack_argsize() >> LogBytesPerElement;
-    // log_develop_trace(jvmcont)("real_fp: %d sender_fp: %ld", link_index(cont), sender_fp);
+    // log_develop_trace(jvmcont)("real_fp: %d sender_fp: %ld", link_index, sender_fp);
   } else {
-    sender_cb = ContinuationCodeBlobLookup::find_blob(sender_pc);
-    sender_pc = hframe::deopt_original_pc(cont, sender_pc, sender_cb, sender_sp); // TODO PERF: unnecessary in the long term solution of unrolling deopted frames on freeze
+    sender_md = ContinuationCodeBlobLookup::find_blob(sender_pc);
+    sender_pc = hframe::deopt_original_pc(cont, sender_pc, (CodeBlob*)sender_md, sender_sp); // TODO PERF: unnecessary in the long term solution of unrolling deopted frames on freeze
     // a stub can only appear as the topmost frame; all senders must be compiled/interpreted Java frames so we can call deopt_original_pc, which assumes a compiled Java frame
   }
-  return mode == mode_fast ? hframe::new_hframe<Compiled>(sender_sp, sender_ref_sp, sender_fp, sender_pc, cont)
-                           : hframe(sender_sp, sender_ref_sp, sender_fp, sender_pc, sender_cb, is_sender_interpreted, cont);
+  return hframe(sender_sp, sender_ref_sp, sender_fp, sender_pc, sender_md, is_sender_interpreted);
 }
 
 inline frame hframe::to_frame(ContMirror& cont, address pc, bool deopt) const {
   return frame(_sp, _ref_sp, _fp, pc,
-              _cb != NULL ? _cb : (_cb = CodeCache::find_blob(_pc)),
+              (!_is_interpreted && _cb_imd != NULL) ? cb() : (CodeBlob*)(_cb_imd = CodeCache::find_blob(_pc)),
               deopt);
 }
 
@@ -217,9 +396,9 @@
   if (is_empty()) {
     st->print_cr("\tempty");
   } else if (Interpreter::contains(pc())) { // in fast mode we cannot rely on _is_interpreted
-    st->print_cr("\tInterpreted sp: %d fp: %ld pc: " INTPTR_FORMAT " ref_sp: %d (is_interpreted: %d)", _sp, _fp, p2i(_pc), _ref_sp, _is_interpreted);
+    st->print_cr("\tInterpreted sp: %d fp: %ld pc: " INTPTR_FORMAT " ref_sp: %d (is_interpreted: %d) link address: " INTPTR_FORMAT, _sp, _fp, p2i(_pc), _ref_sp, _is_interpreted, p2i(interpreted_link_address()));
   } else {
-    st->print_cr("\tCompiled sp: %d fp: 0x%lx pc: " INTPTR_FORMAT " ref_sp: %d (is_interpreted: %d)", _sp, _fp, p2i(_pc), _ref_sp, _is_interpreted);
+    st->print_cr("\tCompiled sp: %d fp: 0x%lx pc: "  INTPTR_FORMAT " ref_sp: %d (is_interpreted: %d)", _sp, _fp, p2i(_pc), _ref_sp, _is_interpreted);
   }
 }
 
@@ -229,12 +408,12 @@
     return;
 
   if (Interpreter::contains(pc())) { // in fast mode we cannot rely on _is_interpreted
-    intptr_t* fp = link_address();
+    intptr_t* fp = cont.stack_address((int)_fp); // interpreted_link_address();
     Method** method_addr = (Method**)(fp + frame::interpreter_frame_method_offset);
     Method* method = *method_addr;
     st->print_cr("\tmethod: " INTPTR_FORMAT " (at " INTPTR_FORMAT ")", p2i(method), p2i(method_addr));
     st->print("\tmethod: "); method->print_short_name(st); st->cr();
-
+    st->print_cr("\tlink: %ld", *(intptr_t*) fp);
     st->print_cr("\tissp: %ld",             *(intptr_t*) (fp + frame::interpreter_frame_sender_sp_offset));
     st->print_cr("\tlast_sp: %ld",          *(intptr_t*) (fp + frame::interpreter_frame_last_sp_offset));
     st->print_cr("\tinitial_sp: %ld",       *(intptr_t*) (fp + frame::interpreter_frame_initial_sp_offset));
@@ -247,18 +426,19 @@
     st->print_cr("\tmirror: " INTPTR_FORMAT, p2i(*(void**)(fp + frame::interpreter_frame_mirror_offset)));
     // st->print("\tmirror: "); os::print_location(st, *(intptr_t*)(fp + frame::interpreter_frame_mirror_offset), true);
   } else {
+    if (_sp > 0) st->print_cr("\treal_pc: " INTPTR_FORMAT, p2i(real_pc(cont)));
     st->print_cr("\tcb: " INTPTR_FORMAT, p2i(cb()));
-    if (_cb != NULL) {
-      st->print("\tcb: "); _cb->print_value_on(st); st->cr();
-      st->print_cr("\tcb.frame_size: %d", _cb->frame_size());
+    if (cb() != NULL) {
+      st->print("\tcb: "); cb()->print_value_on(st); st->cr();
+      st->print_cr("\tcb.frame_size: %d", cb()->frame_size());
     }
   }
-  if (link_address() != NULL) {
-    st->print_cr("\tlink: 0x%lx %ld (at: " INTPTR_FORMAT ")", link(), link(), p2i(link_address()));
-    st->print_cr("\treturn_pc: " INTPTR_FORMAT " (at " INTPTR_FORMAT ")", p2i(CHOOSE2(_is_interpreted, return_pc)), p2i(CHOOSE2(_is_interpreted, return_pc_address)));
-  } else {
-    st->print_cr("\tlink address: NULL");
-  }
+  // if (link_address() != NULL) {
+  //   st->print_cr("\tlink: 0x%lx %ld (at: " INTPTR_FORMAT ")", link(), link(), p2i(link_address()));
+  //   st->print_cr("\treturn_pc: " INTPTR_FORMAT " (at " INTPTR_FORMAT ")", p2i(CHOOSE2(_is_interpreted, return_pc)), p2i(CHOOSE2(_is_interpreted, return_pc_address)));
+  // } else {
+  //   st->print_cr("\tlink address: NULL");
+  // }
 }
 
 /////
@@ -267,25 +447,77 @@
   set_fp(f.fp());
 }
 
+/*
+ * Here mode_preempt makes the fewest assumptions
+ */
 template<op_mode mode /* = mode_slow*/> // TODO: add default when switching to C++11+
 const hframe ContMirror::last_frame() {
   if (is_empty()) return hframe();
+
   assert (mode != mode_fast || !Interpreter::contains(_pc), "");
-  return mode == mode_fast ? hframe::new_hframe<Compiled>(_sp, _ref_sp, _fp, _pc, *this)
-                           : hframe(_sp, _ref_sp, _fp, _pc, *this);
+  assert (Interpreter::contains(_pc) == is_flag(FLAG_LAST_FRAME_INTERPRETED), "");
+
+  if (mode == mode_fast || !is_flag(FLAG_LAST_FRAME_INTERPRETED)) {
+    CodeBlob* cb;
+  #ifdef CONT_DOUBLE_NOP
+    if (mode != mode_preempt && LIKELY(!ContinuationHelper::cached_metadata(_pc).empty()))
+      cb = NULL;
+    else
+  #endif
+      cb = ContinuationCodeBlobLookup::find_blob(_pc);
+
+    return hframe(_sp, _ref_sp, _fp, _pc, cb, false);
+  } else {
+    return hframe(_sp, _ref_sp, _fp, _pc, hframe::interpreted_link_address(_fp, *this), true);
+  }
 }
 
 hframe ContMirror::from_frame(const frame& f) {
-  return hframe(f.cont_sp(), f.cont_ref_sp(), (intptr_t)f.fp(), f.pc(), f.cb(), f.is_interpreted_frame(), *this);
+  void* md = f.is_interpreted_frame() ? (void*)hframe::interpreted_link_address((intptr_t)f.fp(), *this) : (void*)f.cb();
+  return hframe(f.cont_sp(), f.cont_ref_sp(), (intptr_t)f.fp(), f.pc(), md, f.is_interpreted_frame());
 }
 
 ///////
 
+#ifdef ASSERT
+template<typename FKind>
+static intptr_t* slow_real_fp(const frame& f) {
+  assert (FKind::is_instance(f), "");
+  return FKind::interpreted ? f.fp() : f.unextended_sp() + slow_get_cb(f)->frame_size();
+}
+
+template<typename FKind> // TODO: maybe do the same CRTP trick with Interpreted and Compiled as with hframe
+static intptr_t** slow_link_address(const frame& f) {
+  assert (FKind::is_instance(f), "");
+  return FKind::interpreted
+            ? (intptr_t**)(f.fp() + frame::link_offset)
+            : (intptr_t**)(slow_real_fp<FKind>(f) - frame::sender_sp_offset);
+}
+
+template<typename FKind>
+static address* slow_return_pc_address(const frame& f) {
+  return (address*)(slow_real_fp<FKind>(f) - 1);
+}
+#endif
+
+inline intptr_t** Frame::callee_link_address(const frame& f) {
+  return (intptr_t**)(f.sp() - frame::sender_sp_offset);
+}
+
+static void patch_callee_link(const frame& f, intptr_t* fp) {
+  *Frame::callee_link_address(f) = fp;
+  log_trace(jvmcont)("patched link at " INTPTR_FORMAT ": " INTPTR_FORMAT, p2i(Frame::callee_link_address(f)), p2i(fp));
+}
+
 template <typename RegisterMapT>
 inline intptr_t** Frame::map_link_address(const RegisterMapT* map) {
   return (intptr_t**)map->location(rbp->as_VMReg());
 }
 
+static inline intptr_t* noninterpreted_real_fp(intptr_t* unextended_sp, int size_in_words) {
+  return unextended_sp + size_in_words;
+}
+
 template<typename FKind>
 static inline intptr_t* real_fp(const frame& f) {
   assert (FKind::is_instance(f), "");
@@ -294,6 +526,10 @@
   return FKind::interpreted ? f.fp() : f.unextended_sp() + f.cb()->frame_size();
 }
 
+static inline intptr_t** noninterpreted_link_address(intptr_t* unextended_sp, int size_in_words) {
+  return (intptr_t**)(noninterpreted_real_fp(unextended_sp, size_in_words) - frame::sender_sp_offset);
+}
+
 template<typename FKind> // TODO: maybe do the same CRTP trick with Interpreted and Compiled as with hframe
 static inline intptr_t** link_address(const frame& f) {
   assert (FKind::is_instance(f), "");
@@ -302,6 +538,13 @@
             : (intptr_t**)(real_fp<FKind>(f) - frame::sender_sp_offset);
 }
 
+template<typename FKind>
+static void patch_link(frame& f, intptr_t* fp) {
+  assert (FKind::interpreted, "");
+  *link_address<FKind>(f) = fp;
+  log_trace(jvmcont)("patched link at " INTPTR_FORMAT ": " INTPTR_FORMAT, p2i(link_address<FKind>(f)), p2i(fp));
+}
+
 // static inline intptr_t** link_address_stub(const frame& f) {
 //   assert (!f.is_java_frame(), "");
 //   return (intptr_t**)(f.fp() - frame::sender_sp_offset);
@@ -311,10 +554,8 @@
   return f.is_interpreted_frame() ? link_address<Interpreted>(f) : link_address<NonInterpretedUnknown>(f);
 }
 
-template<typename FKind>
-static void patch_link(frame& f, intptr_t* fp) {
-  *link_address<FKind>(f) = fp;
-  log_trace(jvmcont)("patched link at " INTPTR_FORMAT ": " INTPTR_FORMAT, p2i(link_address<FKind>(f)), p2i(fp));
+inline address* Interpreted::return_pc_address(const frame& f) {
+  return (address*)(f.fp() + frame::return_addr_offset);
 }
 
 void Interpreted::patch_sender_sp(frame& f, intptr_t* sp) {
@@ -323,22 +564,20 @@
   log_trace(jvmcont)("patched sender_sp: " INTPTR_FORMAT, p2i(sp));
 }
 
-
-inline address* Interpreted::return_pc_address(const frame& f) {
-  return (address*)(f.fp() + frame::return_addr_offset);
+inline address* Frame::return_pc_address(const frame& f) {
+  return (address*)(f.real_fp() - 1);
 }
 
-template<typename Self>
-inline address* NonInterpreted<Self>::return_pc_address(const frame& f) {
-  return (address*)(f.real_fp() - 1);
-}
+// inline address* Frame::pc_address(const frame& f) {
+//   return (address*)(f.sp() - frame::return_addr_offset);
+// }
 
 inline address Frame::real_pc(const frame& f) {
   address* pc_addr = &(((address*) f.sp())[-1]);
   return *pc_addr;
 }
 
-inline void Frame::patch_pc(frame& f, address pc) {
+inline void Frame::patch_pc(const frame& f, address pc) {
   address* pc_addr = &(((address*) f.sp())[-1]);
   *pc_addr = pc;
 }
@@ -346,9 +585,10 @@
 inline intptr_t* Interpreted::frame_top(const frame& f, InterpreterOopMap* mask) { // inclusive; this will be copied with the frame
   intptr_t* res = *(intptr_t**)f.addr_at(frame::interpreter_frame_initial_sp_offset) - expression_stack_size(f, mask);
   assert (res == (intptr_t*)f.interpreter_frame_monitor_end() - expression_stack_size(f, mask), "");
+  assert (res >= f.unextended_sp(), "");
   return res;
   // Not true, but using unextended_sp might work
-  // assert (res == f.unextended_sp() + 1, "res: " INTPTR_FORMAT " unextended_sp: " INTPTR_FORMAT, p2i(res), p2i(f.unextended_sp() + 1));
+  // assert (res == f.unextended_sp(), "res: " INTPTR_FORMAT " unextended_sp: " INTPTR_FORMAT, p2i(res), p2i(f.unextended_sp() + 1));
 }
 
 inline intptr_t* Interpreted::frame_bottom(const frame& f) { // exclusive; this will not be copied with the frame
@@ -358,6 +598,9 @@
 
 /////////
 
+static inline intptr_t** callee_link_address(const frame& f) {
+  return (intptr_t**)(f.sp() - frame::sender_sp_offset);
+}
 
 template<typename FKind, typename RegisterMapT>
 inline void ContinuationHelper::update_register_map(RegisterMapT* map, const frame& f) {
@@ -369,9 +612,14 @@
   frame::update_map_with_saved_link(map, link_address);
 }
 
-void ContinuationHelper::update_register_map(RegisterMap* map, const hframe& hf, const ContMirror& cont) {
+template<typename RegisterMapT>
+inline void ContinuationHelper::update_register_map_with_callee(RegisterMapT* map, const frame& f) {
+  frame::update_map_with_saved_link(map, callee_link_address(f));
+}
+
+void ContinuationHelper::update_register_map(RegisterMap* map, const hframe& caller, const ContMirror& cont) {
   // we save the link _index_ in the oop map; it is read and converted back in Continuation::reg_to_location
-  int link_index = cont.stack_index(hf.link_address());
+  int link_index = caller.callee_link_index();
   log_develop_trace(jvmcont)("ContinuationHelper::update_register_map: frame::update_map_with_saved_link: %d", link_index);
   intptr_t link_index0 = link_index;
   frame::update_map_with_saved_link(map, reinterpret_cast<intptr_t**>(link_index0));
@@ -402,7 +650,8 @@
 inline void ContinuationHelper::to_frame_info_pd(const frame& f, const frame& callee, FrameInfo* fi) {
   // we have an indirection for fp, because the link at the entry frame may hold a sender's oop, and it can be relocated
   // at a safpoint on the VM->Java transition, so we point at an address where the GC would find it
-  fi->fp = (intptr_t*)link_address<FKind>(callee); // f.fp();
+  assert (callee_link_address(f) == slow_link_address<FKind>(callee), "");
+  fi->fp = (intptr_t*)callee_link_address(f); // f.fp();
 }
 
 inline void ContinuationHelper::to_frame_info_pd(const frame& f, FrameInfo* fi) {
@@ -432,22 +681,49 @@
   assert (StubRoutines::cont_doYield_stub()->contains(anchor->last_Java_pc()), "must be");
   assert (StubRoutines::cont_doYield_stub()->oop_maps()->count() == 1, "must be");
 
-  return frame(anchor->last_Java_sp(), anchor->last_Java_sp(), anchor->last_Java_fp(), anchor->last_Java_pc(), 
-    StubRoutines::cont_doYield_stub(), StubRoutines::cont_doYield_stub()->oop_map_for_slot(0, anchor->last_Java_pc()));
+  return frame(anchor->last_Java_sp(), anchor->last_Java_sp(), anchor->last_Java_fp(), anchor->last_Java_pc(), NULL, NULL, true);
+  // return frame(anchor->last_Java_sp(), anchor->last_Java_sp(), anchor->last_Java_fp(), anchor->last_Java_pc(), 
+  //   StubRoutines::cont_doYield_stub(), StubRoutines::cont_doYield_stub()->oop_map_for_slot(0, anchor->last_Java_pc()), true);
 }
 
-template<bool fast>
-static inline frame sender_for_compiled_frame(const frame& f, intptr_t** link_addr) {
+template<typename FKind, op_mode mode>
+static inline frame sender_for_compiled_frame(const frame& f) {
+#ifdef CONT_DOUBLE_NOP
+  CachedCompiledMetadata md;
+  // tty->print_cr(">>> sender fast: %d !FKind::stub: %d", fast, !FKind::stub);
+  if (mode == mode_fast && !FKind::stub && LIKELY(!(md = ContinuationHelper::cached_metadata<mode>(f)).empty())) {
+    intptr_t* sender_sp = f.unextended_sp() + md.size_words();
+    intptr_t** link_addr = (intptr_t**)(sender_sp - frame::sender_sp_offset);
+    address sender_pc = (address) *(sender_sp-1);
+
+    assert(sender_sp != f.sp(), "must have changed");
+    return frame(sender_sp, sender_sp, *link_addr, sender_pc, NULL, NULL, true); // no deopt check TODO PERF: use a faster constructor that doesn't write cb (shows up in profile)
+  }
+  // tty->print_cr(">>> slow sender1");
+#endif
+
+  assert (mode == mode_preempt || !FKind::stub || StubRoutines::cont_doYield_stub()->contains(f.pc()), "must be");
+  assert (mode == mode_preempt || !FKind::stub || slow_get_cb(f)->frame_size() == 5, "must be");
+  intptr_t** link_addr = (mode != mode_preempt && FKind::stub) ? noninterpreted_link_address(f.unextended_sp(), 5) : link_address<FKind>(f);
+
   intptr_t* sender_sp = (intptr_t*)(link_addr + frame::sender_sp_offset); //  f.unextended_sp() + (fsize/wordSize); // 
   address sender_pc = (address) *(sender_sp-1);
   assert(sender_sp != f.sp(), "must have changed");
 
+#ifdef CONT_DOUBLE_NOP
+  if (mode == mode_fast) {
+    assert (!Interpreter::contains(sender_pc), "");
+    return frame(sender_sp, sender_sp, *link_addr, sender_pc, NULL, NULL, true); // no deopt check
+  }
+#endif
+
+  // tty->print_cr("33333 fast: %d stub: %d", fast, FKind::stub); if (fast) f.print_on(tty);
   int slot = 0;
   CodeBlob* sender_cb = ContinuationCodeBlobLookup::find_blob_and_oopmap(sender_pc, slot);
-  if (fast) {
+  if (mode == mode_fast) {
     assert (!Interpreter::contains(sender_pc), "");
     assert (sender_cb != NULL, "");
-    return frame(sender_sp, sender_sp, *link_addr, sender_pc, sender_cb, slot == -1 ? NULL : sender_cb->oop_map_for_slot(slot, sender_pc), true); // no deopt check; TODO: not sure about this
+    return frame(sender_sp, sender_sp, *link_addr, sender_pc, sender_cb, slot == -1 ? NULL : sender_cb->oop_map_for_slot(slot, sender_pc), true); // no deopt check TODO PERF: use a faster constructor that doesn't write cb (shows up in profile)
   } else {
     return sender_cb != NULL
       ? frame(sender_sp, sender_sp, *link_addr, sender_pc, sender_cb, slot == -1 ? NULL : sender_cb->oop_map_for_slot(slot, sender_pc))
@@ -455,8 +731,7 @@
   }
 }
 
-static inline frame sender_for_interpreted_frame(const frame& f, intptr_t** link_addr) {
-  assert (*link_addr == f.link(), "");
+static inline frame sender_for_interpreted_frame(const frame& f) {
   return frame(f.sender_sp(), f.interpreter_frame_sender_sp(), f.link(), f.sender_pc());
 }
 
@@ -466,25 +741,17 @@
 
 template <typename ConfigT, op_mode mode>
 template<typename FKind>
-inline frame Freeze<ConfigT, mode>::sender(const frame& f, intptr_t*** link_address_out) {
+inline frame Freeze<ConfigT, mode>::sender(const frame& f) {
   assert (FKind::is_instance(f), "");
-  intptr_t** link_addr = link_address<FKind>(f);
-  *link_address_out = link_addr;
-  return FKind::interpreted 
-    ? sender_for_interpreted_frame(f, link_addr) 
-    : (mode == mode_fast ? sender_for_compiled_frame<true> (f, link_addr) 
-                         : sender_for_compiled_frame<false>(f, link_addr));
+  if (FKind::interpreted) {
+    return sender_for_interpreted_frame(f);
+  } else {
+    return sender_for_compiled_frame<FKind, mode>(f);
+  }
 }
 
-template <typename ConfigT, op_mode mode>
-template<typename FKind>
-inline frame Freeze<ConfigT, mode>::sender(const frame& f) {
-  assert (FKind::is_instance(f), "");
-  intptr_t** link_addr = link_address<FKind>(f);
-  return FKind::interpreted 
-    ? sender_for_interpreted_frame(f, link_addr) 
-    : (mode == mode_fast ? sender_for_compiled_frame<true> (f, link_addr) 
-                         : sender_for_compiled_frame<false>(f, link_addr));
+static inline int callee_link_index(const hframe& f) {
+  return f.sp() - (frame::sender_sp_offset << LogElemsPerWord);
 }
 
 template <typename ConfigT, op_mode mode>
@@ -492,28 +759,34 @@
 hframe Freeze<ConfigT, mode>::new_bottom_hframe(int sp, int ref_sp, address pc, bool interpreted) {
   intptr_t fp = _cont.fp();
   assert (!cont_empty || fp == 0, "");
-  intptr_t* link_address = (cont_empty || !interpreted) ? NULL // if we're not interpreted, we're not interested in the link addresss
-                                                        : hframe::link_address<Interpreted>(sp, fp, NULL, _cont);
-  return hframe(sp, ref_sp, fp, pc, NULL, interpreted, link_address);
+  void* imd = NULL;
+  DEBUG_ONLY(imd = interpreted ? hframe::interpreted_link_address(fp, _cont) : NULL);
+  return hframe(sp, ref_sp, fp, pc, imd, interpreted);
 }
 
 template <typename ConfigT, op_mode mode>
-template<typename FKind> hframe Freeze<ConfigT, mode>::new_callee_hframe(const frame& f, intptr_t* vsp, const hframe& caller, int fsize, int num_oops) {
+template<typename FKind> hframe Freeze<ConfigT, mode>::new_hframe(const frame& f, intptr_t* vsp, const hframe& caller, int fsize, int num_oops, int argsize) {
   assert (FKind::is_instance(f), "");
+  assert (f.sp() <= vsp, "");
+  assert (mode != mode_fast || f.sp() == f.unextended_sp(), "");
 
   int sp = caller.sp() - ContMirror::to_index(fsize);
-
+  // int sp = mode == mode_fast ? usp : usp - ((vsp - f.sp()) << LogElemsPerWord);
+  int ref_sp = caller.ref_sp() - num_oops;
+  if (mode != mode_fast && caller.is_interpreted_frame()) { // must be done after computing sp above
+    const_cast<hframe&>(caller).set_sp(caller.sp() - (argsize >> LogBytesPerElement));
+  }
   intptr_t fp;
-  CodeBlob* cb;
+  void* cb_imd;
   if (FKind::interpreted) {
     fp = sp + ((f.fp() - vsp) << LogElemsPerWord);
-    cb = NULL;
+    cb_imd = hframe::interpreted_link_address(fp, _cont);
   } else {
     fp = (intptr_t)f.fp();
-    cb = f.cb();
+    cb_imd = f.cb();
   }
 
-  return hframe(sp, caller.ref_sp() - num_oops, fp, f.pc(), cb, FKind::interpreted, hframe::link_address<FKind>(sp, fp, cb, _cont));
+  return hframe(sp, ref_sp, fp, f.pc(), cb_imd, FKind::interpreted);
 }
 
 template <typename ConfigT, op_mode mode>
@@ -521,25 +794,29 @@
 inline void Freeze<ConfigT, mode>::patch_pd(const frame& f, hframe& hf, const hframe& caller) {
   if (!FKind::interpreted) {
     if (_fp_oop_info._has_fp_oop) {
-      hf.set_fp(_fp_oop_info._fp_index);
+      hf.set_fp(_fp_oop_info._fp_index); // TODO PERF non-temporal store
     }
   } else {
     assert (!_fp_oop_info._has_fp_oop, "only compiled frames");
   }
 
+  assert (!FKind::interpreted || hf.interpreted_link_address() == _cont.stack_address(hf.fp()), "");
   assert (mode != mode_fast || bottom || !Interpreter::contains(caller.pc()), "");
   assert (!bottom || caller.is_interpreted_frame() == _cont.is_flag(FLAG_LAST_FRAME_INTERPRETED), "");
 
   if ((mode != mode_fast || bottom) && caller.is_interpreted_frame()) {
-    hf.patch_link_relative(caller.link_address());
+    FKind::interpreted ? hf.patch_interpreted_link_relative(caller.fp())
+                       : caller.patch_callee_link_relative(caller.fp(), _cont); // TODO PERF non-temporal store
   } else {
     assert (!Interpreter::contains(caller.pc()), "");
-    hf.patch_link(caller.fp()); // caller.fp() already contains _fp_oop_info._fp_index if appropriate, as it was patched when patch is called on the caller
+    // TODO PERF non-temporal store
+    FKind::interpreted ? hf.patch_interpreted_link(caller.fp())
+                       : caller.patch_callee_link(caller.fp(), _cont); // caller.fp() already contains _fp_oop_info._fp_index if appropriate, as it was patched when patch is called on the caller
   }
   if (FKind::interpreted) {
     assert (mode != mode_fast, "");
     if (bottom && _cont.is_empty()) { // dynamic test, but we don't care because we're interpreted
-      hf.patch_real_fp_offset(frame::interpreter_frame_sender_sp_offset, 0);
+      hf.patch_interpreter_metadata_offset(frame::interpreter_frame_sender_sp_offset, 0);
     } else {
       hf.patch_sender_sp_relative(_cont.stack_address(caller.sp()));
     }
@@ -547,7 +824,7 @@
 }
 
 template <typename ConfigT, op_mode mode>
-template <bool bottom> 
+template <bool bottom>
 inline void Freeze<ConfigT, mode>::align(const hframe& caller) {
   assert (mode != mode_fast || bottom || !Interpreter::contains(caller.pc()), "");
   if ((mode != mode_fast || bottom) && caller.is_interpreted_frame()) {
@@ -574,21 +851,24 @@
 
 template <typename ConfigT, op_mode mode>
 inline frame Thaw<ConfigT, mode>::new_entry_frame() {
-  return frame(_cont.entrySP(), _cont.entryFP(), _cont.entryPC()); // TODO PERF: This find code blob and computes deopt state
+  return frame(_cont.entrySP(), _cont.entryFP(), _cont.entryPC()); // TODO PERF: This finds code blob and computes deopt state
 }
 
 template <typename ConfigT, op_mode mode>
 template<typename FKind> frame Thaw<ConfigT, mode>::new_frame(const hframe& hf, intptr_t* vsp) {
   assert (FKind::is_instance(hf), "");
 
-  intptr_t* fp;
   if (FKind::interpreted) {
+    // intptr_t* sp = vsp - ((hsp - hf.sp()) >> LogElemsPerWord);
     int hsp = hf.sp();
-    fp = vsp + ((hf.fp() - hsp) >> LogElemsPerWord);
+    intptr_t* fp = vsp + ((hf.fp() - hsp) >> LogElemsPerWord);
     return frame(vsp, vsp, fp, hf.pc());
   } else {
-    fp = (intptr_t*)hf.fp();
-    assert (hf.oop_map() != NULL, "");
+    intptr_t* fp = (intptr_t*)hf.fp();
+  #ifdef CONT_DOUBLE_NOP
+    hf.get_cb();
+  #endif
+    assert (hf.cb() != NULL && hf.oop_map() != NULL, "");
     return frame(vsp, vsp, fp, hf.pc(), hf.cb(), hf.oop_map()); // TODO PERF : this computes deopt state; is it necessary?
   }
 }
@@ -600,24 +880,29 @@
 
 template <typename ConfigT, op_mode mode>
 template<typename FKind, bool top, bool bottom>
-inline intptr_t* Thaw<ConfigT, mode>::align(const hframe& hf, intptr_t* vsp, const frame& caller) {
+inline intptr_t* Thaw<ConfigT, mode>::align(const hframe& hf, intptr_t* vsp, frame& caller) {
   assert (FKind::is_instance(hf), "");
+  assert (mode != mode_fast || bottom, "");
 
   if (!FKind::interpreted && !FKind::stub) {
+    assert (_cont.is_flag(FLAG_LAST_FRAME_INTERPRETED) == Interpreter::contains(_cont.pc()), "");
+    if ((!bottom && mode != mode_fast && caller.is_interpreted_frame())
+        || (bottom && _cont.is_flag(FLAG_LAST_FRAME_INTERPRETED))) {
+      _cont.sub_size(sizeof(intptr_t)); // we do this whether or not we've aligned because we add it in freeze_interpreted_frame
+    }
+
   #ifdef _LP64
     if ((intptr_t)vsp % 16 != 0) {
       log_develop_trace(jvmcont)("Aligning compiled frame: " INTPTR_FORMAT " -> " INTPTR_FORMAT, p2i(vsp), p2i(vsp - 1));
       assert(caller.is_interpreted_frame() 
         || (bottom && !FKind::stub && hf.compiled_frame_stack_argsize() % 16 != 0), "");
       vsp--;
+      caller.set_sp(caller.sp() - 1);
     }
     assert((intptr_t)vsp % 16 == 0, "");
   #endif
-  
-    if (Interpreter::contains(hf.return_pc<FKind>())) { // false if bottom-most frame, as the return address would be patched to NULL if interpreted
-      _cont.sub_size(sizeof(intptr_t)); // we do this whether or not we've aligned because we add it in freeze_interpreted_frame
-    }
   }
+
   return vsp;
 }
 
@@ -625,21 +910,22 @@
 template<typename FKind, bool top, bool bottom>
 inline void Thaw<ConfigT, mode>::patch_pd(frame& f, const frame& caller) {
   assert (!bottom || caller.fp() == _cont.entryFP(), "caller.fp: " INTPTR_FORMAT " entryFP: " INTPTR_FORMAT, p2i(caller.fp()), p2i(_cont.entryFP()));
-
-  patch_link<FKind>(f, caller.fp());
+  assert (FKind::interpreted || slow_link_address<FKind>(f) == Frame::callee_link_address(caller), "");
+  FKind::interpreted ? patch_link<FKind>(f, caller.fp())
+                     : patch_callee_link(caller, caller.fp());
 }
 
 template <typename ConfigT, op_mode mode>
 inline void Thaw<ConfigT, mode>::derelativize_interpreted_frame_metadata(const hframe& hf, const frame& f) {
-  intptr_t* hfp = _cont.stack_address(hf.fp());
   intptr_t* vfp = f.fp();
 
+  intptr_t* hfp = _cont.stack_address(hf.fp());
   if (*(hfp + frame::interpreter_frame_last_sp_offset) == 0) {
       *(vfp + frame::interpreter_frame_last_sp_offset) = 0;
   } else {
     ContMirror::derelativize(vfp, frame::interpreter_frame_last_sp_offset);
   }
-  ContMirror::derelativize(vfp, frame::interpreter_frame_initial_sp_offset); // == block_top == block_bottom
+  ContMirror::derelativize(vfp, frame::interpreter_frame_initial_sp_offset);
   ContMirror::derelativize(vfp, frame::interpreter_frame_locals_offset);
 }
 
--- a/src/hotspot/cpu/x86/frame_x86.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/cpu/x86/frame_x86.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -116,7 +116,7 @@
   // uses sp() to mean "raw" sp and unextended_sp() to mean the caller's
   // original sp we use that convention.
 
-  intptr_t*     _unextended_sp;
+  intptr_t* _unextended_sp;
   void adjust_unextended_sp() NOT_DEBUG_RETURN;
 
   intptr_t* ptr_at_addr(int offset) const {
--- a/src/hotspot/cpu/x86/frame_x86.inline.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/cpu/x86/frame_x86.inline.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -131,7 +131,7 @@
   _fp = fp;
   _pc = pc;
   assert(pc != NULL, "no pc?");
-  _cb = CodeCache::find_blob(pc);
+  _cb = CodeCache::find_blob(pc); // TODO R find_blob_fast
   _oop_map = NULL;
   setup(pc);
 }
@@ -453,7 +453,12 @@
   if (_cb == NULL) return NULL;
   if (_cb->oop_maps() != NULL) {
     NativePostCallNop* nop = nativePostCallNop_at(_pc);
-    if (nop != NULL && nop->displacement() != 0) {
+    if (nop != NULL &&
+#ifdef CONT_DOUBLE_NOP
+      !nop->is_mode2() &&
+#endif
+      nop->displacement() != 0
+    ) {
       int slot = ((nop->displacement() >> 24) & 0xff);
       return _cb->oop_map_for_slot(slot, _pc);
     }
--- a/src/hotspot/cpu/x86/hframe_x86.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/cpu/x86/hframe_x86.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -30,64 +30,52 @@
   // additional fields beyond _sp and _pc:
   intptr_t _fp;
 
-  intptr_t* _link_address;
-
-private:
-  inline int link_index(const ContMirror& cont) const;
-  inline intptr_t* interpreter_frame_metadata_at(int offset) const;
-
 public:
 
   typedef intptr_t** callee_info;
 
 public:
-  hframe() : HFrameBase(), _fp(0), _link_address(NULL) {}
+  hframe() : HFrameBase(), _fp(0) {}
 
-  hframe(const hframe& hf) : HFrameBase(hf), _fp(hf._fp), _link_address(hf._link_address) {}
+  hframe(const hframe& hf) : HFrameBase(hf), _fp(hf._fp) {}
 
   hframe(int sp, int ref_sp, intptr_t fp, address pc, const ContMirror& cont) // called by ContMirror::last_frame
-    : HFrameBase(sp, ref_sp, pc, cont), _fp(fp) { set_link_address(cont); }
+    : HFrameBase(sp, ref_sp, pc, cont), _fp(fp) {}
   
-  hframe(int sp, int ref_sp, intptr_t fp, address pc, CodeBlob* cb, bool is_interpreted, const ContMirror& cont)
-    : HFrameBase(sp, ref_sp, pc, cb, is_interpreted), _fp(fp) { set_link_address(cont); }
 
-  hframe(int sp, int ref_sp, intptr_t fp, address pc, CodeBlob* cb, bool is_interpreted, intptr_t* link_address) // called by new_callee/bottom_hframe
-    : HFrameBase(sp, ref_sp, pc, cb, is_interpreted), _fp(fp), _link_address(link_address) {}
-
-  template <typename FKind> static hframe new_hframe(int sp, int ref_sp, intptr_t fp, address pc, const ContMirror& cont) {
-    assert (FKind::interpreted == Interpreter::contains(pc), "");
-    CodeBlob* cb = FKind::interpreted ? NULL : ContinuationCodeBlobLookup::find_blob(pc);
-    return hframe(sp, ref_sp, fp, pc, cb, FKind::interpreted, link_address<FKind>(sp, fp, cb, cont));
-  }
+  hframe(int sp, int ref_sp, intptr_t fp, address pc, void* cb_md, bool is_interpreted) 
+    : HFrameBase(sp, ref_sp, pc, cb_md, is_interpreted), _fp(fp) {}
 
   inline bool operator==(const hframe& other) const;
 
   void copy_partial_pd(const hframe& other) {
     _fp = other._fp;
-    _link_address = other._link_address;
   } 
 
   inline intptr_t  fp()     const { return _fp; }
 
   inline void set_fp(intptr_t fp) { _fp = fp; }
 
-  // the link is an offset from the real fp to the sender's fp IFF the sender is interpreted; otherwise, it's the contents of the rbp register
-  intptr_t* link_address() const { return _link_address; }
-  intptr_t link() const          { return *link_address(); }
+  const CodeBlob* get_cb() const;
+  const ImmutableOopMap* get_oop_map() const;
 
-  template<typename FKind> static inline intptr_t* link_address(int sp, intptr_t fp, const CodeBlob* cb, const ContMirror& cont);
-  template<typename FKind> inline void set_link_address(const ContMirror& cont);
-  inline void set_link_address(const ContMirror& cont);
+  inline int callee_link_index() const;
+  inline int pc_index() const;
 
-  void patch_link(intptr_t value) {
-    intptr_t* la = link_address();
-    *la = value;
-  }
+  inline address real_pc(const ContMirror& cont) const;
 
-  inline void patch_link_relative(intptr_t* fp);
+  inline intptr_t* interpreted_link_address() const { assert (Interpreter::contains(_pc), ""); return (intptr_t*)_cb_imd; }
 
-  inline void patch_real_fp_offset(int offset, intptr_t value);
-  inline intptr_t* get_real_fp_offset(int offset) { return (intptr_t*)*(link_address() + offset); }
+  static intptr_t* interpreted_link_address(intptr_t fp, const ContMirror& cont);
+
+  inline void patch_interpreter_metadata_offset(int offset, intptr_t value);
+  inline intptr_t* interpreter_frame_metadata_at(int offset) const;
+
+  inline void patch_interpreted_link(intptr_t value);
+  inline void patch_interpreted_link_relative(intptr_t fp);
+
+  inline void patch_callee_link(intptr_t value, const ContMirror& cont) const;
+  inline void patch_callee_link_relative(intptr_t fp, const ContMirror& cont) const;
 
   inline void patch_sender_sp_relative(intptr_t* value);
 
@@ -121,4 +109,56 @@
   return *(Method**)interpreter_frame_metadata_at(frame::interpreter_frame_method_offset);
 }
 
+#ifdef CONT_DOUBLE_NOP
+
+// TODO R move to continuation_x86.inline.hpp once PD has been separated
+
+const int mdSizeBits    = 13;
+const int mdOopBits     = 14;
+const int mdArgsizeBits = 5;
+STATIC_ASSERT(mdSizeBits + mdOopBits + mdArgsizeBits == 32);
+
+class CachedCompiledMetadata {
+private:
+  union {
+    struct {
+      uint _size    : mdSizeBits; // in DWORDS
+      uint _oops    : mdOopBits;
+      uint _argsize : mdArgsizeBits;
+    };
+    uint32_t _int1;
+  };
+
+public:
+  CachedCompiledMetadata() {}
+  CachedCompiledMetadata(uint32_t int1) { _int1 = int1; }
+  CachedCompiledMetadata(int size, int oops, int argsize) {
+    assert (size % 8 == 0, "");
+    size >>= LogBytesPerWord;
+    if (size <= ((1 << mdSizeBits) - 1) && oops <= ((1 << mdOopBits) - 1) && argsize <= ((1 << mdArgsizeBits) - 1)) {
+      _size = size;
+      _oops = oops;
+      _argsize = argsize;
+    } else {
+      tty->print_cr(">> metadata failed: size: %d oops: %d argsize: %d", size, oops, argsize);
+      _int1 = 0;
+    }
+  }
+
+  bool empty()        const { return _size == 0; }
+  int size()          const { return ((int)_size) << LogBytesPerWord; }
+  int size_words()    const { return (int)_size; }
+  int num_oops()      const { return (int)_oops; }
+  int stack_argsize() const { return (int)_argsize; }
+
+  uint32_t int1() const { return _int1; }
+
+  void print_on(outputStream* st) { st->print("size: %d args: %d oops: %d", size(), stack_argsize(), num_oops()); }
+  void print() { print_on(tty); }
+};
+
+STATIC_ASSERT(sizeof(CachedCompiledMetadata) == 4);
+
+#endif
+
 #endif // CPU_X86_HFRAME_X86_HPP
\ No newline at end of file
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Fri Jul 19 14:44:35 2019 +0100
@@ -53,6 +53,8 @@
 #include "opto/intrinsicnode.hpp"
 #endif
 
+#include "runtime/continuation.hpp" // TODO LOOM remove after testing CONT_DOUBLE_NOP
+
 #ifdef PRODUCT
 #define BLOCK_COMMENT(str) /* nothing */
 #define STOP(error) stop(error)
@@ -2794,6 +2796,13 @@
   emit_int8((int8_t)0x84);
   emit_int8((int8_t)0x00);
   emit_int32(0x00);
+#ifdef CONT_DOUBLE_NOP
+  emit_int8((int8_t)0x0f);
+  emit_int8((int8_t)0x1f);
+  emit_int8((int8_t)0x84);
+  emit_int8((int8_t)0x00);
+  emit_int32(0x00);
+#endif
 }
 
 // A 5 byte nop that is safe for patching (see patch_verified_entry)
--- a/src/hotspot/cpu/x86/nativeInst_x86.cpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/cpu/x86/nativeInst_x86.cpp	Fri Jul 19 14:44:35 2019 +0100
@@ -699,3 +699,20 @@
   *((int32_t *)(code_pos)) = (int32_t) diff;
 }
 
+#ifdef CONT_DOUBLE_NOP
+void NativePostCallNop::patch(uint32_t int1, uint32_t int2) {
+  patch_int2(int2);
+  patch_int1(int1); // order matters
+
+  // *((uint8_t *)addr_at(3))  = (uint8_t) short1 >> 8;
+  // *((uint8_t *)addr_at(11)) = (uint8_t) short1;
+}
+void NativePostCallNop::patch_int1(uint32_t int1) {
+  *((uint32_t *)addr_at(4))  = (int32_t) int1;
+  wrote(4);
+}
+void NativePostCallNop::patch_int2(uint32_t int2) {
+  *((uint32_t *)addr_at(12)) = (int32_t) int2;
+  wrote(12);
+}
+#endif
--- a/src/hotspot/cpu/x86/nativeInst_x86.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/cpu/x86/nativeInst_x86.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -30,6 +30,8 @@
 #include "runtime/os.hpp"
 #include "runtime/safepointMechanism.hpp"
 
+#include "runtime/continuation.hpp" // TODO LOOM remove after testing CONT_DOUBLE_NOP
+
 // We have interfaces for the following instructions:
 // - NativeInstruction
 // - - NativeCall
@@ -762,17 +764,54 @@
     displacement_offset = 4
   };
 
-  bool check() const { return int_at(0) == 0x841f0f; }
+  bool check() const {
+  #ifdef CONT_DOUBLE_NOP
+    return check1() && int2_data() == 0;
+  #else
+    return int_at(0) == 0x841f0f; 
+  #endif
+  }
   int displacement() const { return (jint) int_at(displacement_offset); }
   void patch(jint diff);
+
+#ifdef CONT_DOUBLE_NOP
+  bool check1() const { return (int_at(0) & 0xffffff) == 0x841f0f && (int_at(8) & 0xffffff) == 0x841f0f; }
+  uint16_t short_data() const { return (uint16_t)((ubyte_at(3) << 8) | ubyte_at(11)); }
+  uint32_t int1_data()  const { return (uint32_t)int_at(4); }
+  uint32_t int2_data()  const { return (uint32_t)int_at(12); }
+  void patch(uint32_t int1, uint32_t int2);
+  void patch_int1(uint32_t int1);
+  void patch_int2(uint32_t int2);
+
+  // int mode() {
+  //   assert (int2_data() == 0 || int1_data() != 0, "");
+  //   return static_cast<bool>(int1_data()) + static_cast<bool>(int2_data());
+  // }
+
+  bool is_mode2() { return int2_data() != 0; } // mode2 is used for fast continuation freeze/thaw metadata
+#endif
 };
 
 inline NativePostCallNop* nativePostCallNop_at(address address) {
   NativePostCallNop* nop = (NativePostCallNop*) address;
+#ifdef CONT_DOUBLE_NOP
+  if (nop->check1()) {
+#else
   if (nop->check()) {
+#endif
     return nop;
   }
   return NULL;
 }
 
+inline NativePostCallNop* nativePostCallNop_unsafe_at(address address) {
+  NativePostCallNop* nop = (NativePostCallNop*) address;
+#ifdef CONT_DOUBLE_NOP
+  assert (nop->check1(), "");
+#else
+  assert (nop->check(), "");
+#endif
+  return nop;
+}
+
 #endif // CPU_X86_NATIVEINST_X86_HPP
--- a/src/hotspot/cpu/x86/oopMapStubGenerator_x86.cpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/cpu/x86/oopMapStubGenerator_x86.cpp	Fri Jul 19 14:44:35 2019 +0100
@@ -37,6 +37,8 @@
 #include "opto/optoreg.hpp"
 #endif
 
+// TODO PERF: Use non-temporal stores in freeze and non-temporal loads in thaw. See https://stackoverflow.com/a/45636785/750563
+
 static Register vsp = c_rarg0;
 static Register ref_stack = c_rarg1;
 static Register frame_ptr = c_rarg2;
@@ -1029,6 +1031,10 @@
     return _thaw;
   }
 
+  address freeze_stub() {
+    return _freeze;
+  }
+
   bool has_rbp_index() {
     return _written_rbp_index;
   }
@@ -1052,6 +1058,8 @@
     _link_offset_loaded = false;
     _written_rbp_index = false;
 
+    _masm->align(8);
+    _masm->emit_int64(0); // make room for CodeBlob pointer
     _thaw = _masm->pc();
 
     if (UseCompressedOops) {
@@ -1197,6 +1205,7 @@
 
   void generate_freeze(const ImmutableOopMap& map) {
     _masm->align(8);
+    _masm->emit_int64(0); // make room for thaw stub pointer
     _freeze = _masm->pc();
 
     freeze_prologue(_masm);
@@ -1361,8 +1370,17 @@
   cgen.generate_freeze(_oopmap);
   cgen.generate_thaw(_oopmap);
 
-  _freeze_stub = _blob->code_begin();
-  _thaw_stub = cgen.thaw_stub();
+  _freeze_stub = cgen.freeze_stub(); // _blob->code_begin();
+  _thaw_stub   = cgen.thaw_stub();
+
+  *((address*)_freeze_stub - 1) = _thaw_stub;
+  *((const CodeBlob**)_thaw_stub - 1) = _cb;
+
+  assert (_cb != NULL, "");
+
+  assert (thaw_stub(_freeze_stub) == _thaw_stub, "");
+  assert ((intptr_t)_freeze_stub % 8 == 0, "");
+  assert ((intptr_t)_thaw_stub   % 8 == 0, "");
 
   return true;
 }
@@ -1375,3 +1393,29 @@
   _freeze_stub = NULL;
   _thaw_stub = NULL;
 }
+
+address OopMapStubGenerator::thaw_stub(address freeze_stub_address) {
+  return *((address*)freeze_stub_address - 1);
+}
+
+intptr_t OopMapStubGenerator::stub_to_offset(address stub) {
+  assert (code_cache_base == (intptr_t)CodeCache::low_bound(CodeBlobType::NonNMethod), "");
+  intptr_t offset = (intptr_t)stub - (intptr_t)code_cache_base;
+  assert ((offset & 0xffffffff) == offset, "");
+  return offset;
+}
+
+address OopMapStubGenerator::offset_to_stub(intptr_t offset) {
+  assert (code_cache_base == (intptr_t)CodeCache::low_bound(CodeBlobType::NonNMethod), "");
+  return (address)(code_cache_base + offset);
+}
+
+CodeBlob* OopMapStubGenerator::code_blob(address thaw_stub_address) {
+  return (CodeBlob*)*((address*)thaw_stub_address - 1);
+}
+
+intptr_t OopMapStubGenerator::code_cache_base;
+
+void OopMapStubGenerator::init() {
+  code_cache_base = (intptr_t)CodeCache::low_bound(CodeBlobType::NonNMethod);
+}
--- a/src/hotspot/share/code/codeCache.cpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/share/code/codeCache.cpp	Fri Jul 19 14:44:35 2019 +0100
@@ -659,6 +659,9 @@
     slot = -1;
     log_debug(codecache)("failed to encode %d %d", oopmap_slot, (int) offset);
   }
+#ifdef CONT_DOUBLE_NOP
+  assert (!nop->is_mode2(), "");
+#endif
   return cb;
 }
 
--- a/src/hotspot/share/code/codeCache.inline.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/share/code/codeCache.inline.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -35,15 +35,24 @@
 
 inline CodeBlob* CodeCache::find_blob_and_oopmap(void* pc, int& slot) {
   NativePostCallNop* nop = nativePostCallNop_at((address) pc);
-  if (LIKELY(nop != NULL)) {
+  if (LIKELY(nop != NULL)
+#ifdef CONT_DOUBLE_NOP
+      && !nop->is_mode2()
+#endif
+  ) {
     CodeBlob* cb;
     if (LIKELY(nop->displacement() != 0)) {
       int offset = (nop->displacement() & 0xffffff);
       cb = (CodeBlob*) ((address) pc - offset);
       slot = ((nop->displacement() >> 24) & 0xff);
+      // tty->print_cr(">>> PATCHED 22"); cb->print_on(tty);
     } else {
+      // tty->print_cr(">>> patching");
       cb = CodeCache::patch_nop(nop, pc, slot);
     }
+#ifdef CONT_DOUBLE_NOP
+    assert(!nop->is_mode2() == 1, "");
+#endif
     assert(cb != NULL, "must be");
     return cb;
   } else {
--- a/src/hotspot/share/compiler/oopMap.cpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/share/compiler/oopMap.cpp	Fri Jul 19 14:44:35 2019 +0100
@@ -526,11 +526,11 @@
 }
 
 // NULL, fail, success (address)
-void ImmutableOopMap::generate_stub() const {
+void ImmutableOopMap::generate_stub(const CodeBlob* cb) const {
   /* The address of the ImmutableOopMap is put into the _freeze_stub and _thaw_stub 
    * if we can't generate the stub for some reason */
   if (_freeze_stub == NULL) {
-    OopMapStubGenerator cgen(*this);
+    OopMapStubGenerator cgen(cb, *this);
     if (Atomic::cmpxchg((address) this, &_freeze_stub, (address) NULL) == NULL) {
       if (!cgen.generate()) {
         Atomic::store((address) this, &_thaw_stub);
--- a/src/hotspot/share/compiler/oopMap.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/share/compiler/oopMap.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -352,7 +352,7 @@
   void print_on(outputStream* st) const;
   void print() const;
 
-  void generate_stub() const;
+  void generate_stub(const CodeBlob* cb) const;
   address freeze_stub() const { return _freeze_stub; }
   address thaw_stub() const { return _thaw_stub; }
 };
--- a/src/hotspot/share/compiler/oopMap.inline.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/share/compiler/oopMap.inline.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -30,7 +30,7 @@
 #include "oops/compressedOops.hpp"
 
 inline const ImmutableOopMap* ImmutableOopMapSet::find_map_at_slot(int slot, int pc_offset) const {
-  assert(slot >= 0 && slot < _count, "bounds");
+  assert(slot >= 0 && slot < _count, "bounds count: %d slot: %d", _count, slot);
   ImmutableOopMapPair* pairs = get_pairs();
   ImmutableOopMapPair* last = &pairs[slot];
   assert(last->pc_offset() == pc_offset, "oopmap not found");
--- a/src/hotspot/share/compiler/oopMapStubGenerator.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/share/compiler/oopMapStubGenerator.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -32,19 +32,28 @@
 class BufferBlob;
 
 class OopMapStubGenerator {
+  static intptr_t code_cache_base;
+
+  const CodeBlob* _cb;
   const ImmutableOopMap& _oopmap;
   BufferBlob* _blob;
   address _freeze_stub;
   address _thaw_stub;
 
 public:
-  OopMapStubGenerator(const ImmutableOopMap& oopmap) : _oopmap(oopmap), _blob(NULL), _freeze_stub(NULL), _thaw_stub(NULL) {}
+  OopMapStubGenerator(const CodeBlob* cb, const ImmutableOopMap& oopmap) : _cb(cb), _oopmap(oopmap), _blob(NULL), _freeze_stub(NULL), _thaw_stub(NULL) {}
 
   address freeze_stub() { return _freeze_stub; }
   address thaw_stub() { return _thaw_stub; }
-
   bool generate();
   void free();
+
+  static void init();
+  
+  static address thaw_stub(address freeze_stub_address);
+  static CodeBlob* code_blob(address thaw_stub_address);
+  static intptr_t stub_to_offset(address stub);
+  static address offset_to_stub(intptr_t offset);
 };
 
 #endif
--- a/src/hotspot/share/runtime/continuation.cpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/share/runtime/continuation.cpp	Fri Jul 19 14:44:35 2019 +0100
@@ -58,7 +58,7 @@
 #include "utilities/exceptions.hpp"
 #include "utilities/macros.hpp"
 
-// #define PERFTEST 1
+#define PERFTEST 1
 
 #ifdef PERFTEST
 #define PERFTEST_ONLY(code) code
@@ -88,6 +88,15 @@
   // static void callgrind() {}
 #endif
 
+// #undef log_develop_info
+// #undef log_develop_debug
+// #undef log_develop_trace
+// #undef log_develop_is_enabled
+// #define log_develop_info(...)  (!log_is_enabled(Info, __VA_ARGS__))   ? (void)0 : LogImpl<LOG_TAGS(__VA_ARGS__)>::write<LogLevel::Info>
+// #define log_develop_debug(...) (!log_is_enabled(Debug, __VA_ARGS__)) ? (void)0 : LogImpl<LOG_TAGS(__VA_ARGS__)>::write<LogLevel::Debug>
+// #define log_develop_trace(...) (!log_is_enabled(Trace, __VA_ARGS__))  ? (void)0 : LogImpl<LOG_TAGS(__VA_ARGS__)>::write<LogLevel::Trace>
+// #define log_develop_is_enabled(level, ...)  log_is_enabled(level, __VA_ARGS__)
+
 // #undef ASSERT
 // #undef assert
 // #define assert(p, ...)
@@ -117,7 +126,6 @@
 
 // TODO
 //
-// !!! Keep an eye out for deopt, and patch_pc
 //
 // Add:
 //  - method/nmethod metadata
@@ -129,6 +137,8 @@
 // Things to compress in interpreted frames: return address, monitors, last_sp
 //
 // See: deoptimization.cpp, vframeArray.cpp, abstractInterpreter_x86.cpp
+//
+// For non-temporal load/store in clang (__builtin_nontemporal_load/store) see: https://clang.llvm.org/docs/LanguageExtensions.html
 
 #define YIELD_SIG  "java.lang.Continuation.yield(Ljava/lang/ContinuationScope;)V"
 #define YIELD0_SIG  "java.lang.Continuation.yield0(Ljava/lang/ContinuationScope;Ljava/lang/Continuation;)Z"
@@ -164,6 +174,7 @@
 const int       elemsPerWord       = wordSize/elementSizeInBytes;
 const int       LogElemsPerWord    = 1;
 
+STATIC_ASSERT(elemsPerWord >= 1);
 STATIC_ASSERT(elementSizeInBytes == sizeof(ElemType));
 STATIC_ASSERT(elementSizeInBytes == (1 << LogBytesPerElement));
 STATIC_ASSERT(elementSizeInBytes <<  LogElemsPerWord == wordSize);
@@ -187,12 +198,22 @@
 template <typename ConfigT>
 class CompiledMethodKeepalive;
 
+#ifdef CONT_DOUBLE_NOP
+class CachedCompiledMetadata;
+#endif
+
+// TODO R remove
+template<typename FKind> static intptr_t** slow_link_address(const frame& f);
+
 class Frame {
 public:
   template<typename RegisterMapT> static inline intptr_t** map_link_address(const RegisterMapT* map);
+  static inline intptr_t** callee_link_address(const frame& f);
   static inline Method* frame_method(const frame& f);
   static inline address real_pc(const frame& f);
-  static inline void patch_pc(frame& f, address pc);
+  static inline void patch_pc(const frame& f, address pc);
+  static address* return_pc_address(const frame& f);
+  static address return_pc(const frame& f);
 
   DEBUG_ONLY(static inline intptr_t* frame_top(const frame &f);)
 };
@@ -202,11 +223,7 @@
 public:
   static inline Method* frame_method(const frame& f);
 
-  static inline address return_pc(const frame& f);
-  static void patch_return_pc(frame& f, address pc);
-
-  static bool is_instance(const frame& f);
-  static bool is_instance(const hframe& hf);
+  template <typename FrameT> static bool is_instance(const FrameT& f);
 };
 
 class Interpreted : public FrameCommon<Interpreted> {
@@ -218,11 +235,13 @@
   static const char type = 'i';
 
 public:
-  static inline address* return_pc_address(const frame& f);
 
   static inline intptr_t* frame_top(const frame& f, InterpreterOopMap* mask);
   static inline intptr_t* frame_bottom(const frame& f);
 
+  static inline address* return_pc_address(const frame& f);
+  static inline address return_pc(const frame& f);
+  static void patch_return_pc(frame& f, address pc);
   static void patch_sender_sp(frame& f, intptr_t* sp);
 
   static void oop_map(const frame& f, InterpreterOopMap* mask);
@@ -241,11 +260,11 @@
 public:
   static inline intptr_t* frame_top(const frame& f);
   static inline intptr_t* frame_bottom(const frame& f);
-  static inline int size(const frame& f);
+
+  template <typename FrameT> static inline int size(const FrameT& f);
+  template <typename FrameT> static inline int stack_argsize(const FrameT& f);
   static inline int num_oops(const frame& f);
-  static inline int stack_argsize(const frame& f);
-
-  static inline address* return_pc_address(const frame& f);
+ 
   template <typename RegisterMapT>
   static bool is_owning_locks(JavaThread* thread, const RegisterMapT* map, const frame& f);
 };
@@ -255,8 +274,7 @@
   DEBUG_ONLY(static const char* name;)
   static const bool interpreted = false;
 
-  static bool is_instance(const frame& f);
-  static bool is_instance(const hframe& hf);
+  template <typename FrameT> static bool is_instance(const FrameT& f);
 };
 
 DEBUG_ONLY(const char* NonInterpretedUnknown::name = "NonInterpretedUnknown";)
@@ -300,60 +318,68 @@
 
 // Represents a stack frame on the horizontal stack, analogous to the frame class, for vertical-stack frames.
 
+// We do not maintain an sp and an unexetended sp. Instead, sp represents frame's unextended_sp, and various patching of interpreted frames is especially handled.
 template<typename SelfPD>
 class HFrameBase {
 protected:
-  int _sp; // corresponds to unextended sp in frame
-  int _ref_sp;
+  int     _sp;
+  int     _ref_sp;
   address _pc;
-  mutable CodeBlob* _cb;
+
+  bool _is_interpreted;
+  mutable void* _cb_imd; // stores CodeBlob in compiled frames and interpreted frame metadata for interpretedd frames
   mutable const ImmutableOopMap* _oop_map; // oop map, for compiled/stubs frames only
-  bool _is_interpreted;
-  // int _ref_length;
 
   friend class ContMirror;
 private:
-  const ImmutableOopMap* get_oop_map() const;
-
   const SelfPD& self() const { return static_cast<const SelfPD&>(*this); }
   SelfPD& self() { return static_cast<SelfPD&>(*this); }
 
-  template<typename FKind> address* return_pc_address() const { return self().template return_pc_address<FKind>(); }
+  const ImmutableOopMap* get_oop_map() const { return self().get_oop_map(); };
 
   void set_codeblob(address pc) {
-    if (_cb == NULL && !_is_interpreted) {// compute lazily
-      _cb = ContinuationCodeBlobLookup::find_blob(_pc);
-      assert(_cb != NULL, "must be valid");
+    if (_cb_imd == NULL && !_is_interpreted) {// compute lazily
+      _cb_imd = ContinuationCodeBlobLookup::find_blob(_pc);
+      assert(_cb_imd != NULL, "must be valid");
     }
   }
 
 protected:
-  HFrameBase() : _sp(-1), _ref_sp(-1), _pc(NULL), _cb(NULL), _oop_map(NULL), _is_interpreted(true) {}
-
-  HFrameBase(const HFrameBase& hf) : _sp(hf._sp), _ref_sp(hf._ref_sp), _pc(hf._pc),
-                                     _cb(hf._cb), _oop_map(hf._oop_map), _is_interpreted(hf._is_interpreted) {}
+  HFrameBase() : _sp(-1), _ref_sp(-1), _pc(NULL), _is_interpreted(true), _cb_imd(NULL), _oop_map(NULL) {}
+
+  HFrameBase(const HFrameBase& hf) : _sp(hf._sp),_ref_sp(hf._ref_sp), _pc(hf._pc),
+                                     _is_interpreted(hf._is_interpreted), _cb_imd(hf._cb_imd), _oop_map(hf._oop_map) {}
+
+  HFrameBase(int sp, int ref_sp, address pc, void* cb_md, bool is_interpreted)
+    : _sp(sp), _ref_sp(ref_sp), _pc(pc),
+      _is_interpreted(is_interpreted), _cb_imd((intptr_t*)cb_md), _oop_map(NULL) {}
 
   HFrameBase(int sp, int ref_sp, address pc, const ContMirror& cont)
     : _sp(sp), _ref_sp(ref_sp), _pc(pc),
-      _oop_map(NULL), _is_interpreted(Interpreter::contains(pc)) {
-      _cb = NULL;
+      _is_interpreted(Interpreter::contains(pc)), _cb_imd(NULL), _oop_map(NULL) {
       set_codeblob(_pc);
     }
 
-  HFrameBase(int sp, int ref_sp, address pc, CodeBlob* cb, bool is_interpreted) // called by ContMirror::new_hframe
-    : _sp(sp), _ref_sp(ref_sp), _pc(pc),
-      _cb(cb), _oop_map(NULL), _is_interpreted(is_interpreted) {}
-
   static address deopt_original_pc(const ContMirror& cont, address pc, CodeBlob* cb, int sp);
 
 public:
   inline bool operator==(const HFrameBase& other) const;
-  bool is_empty() const { return _pc == NULL && _sp < 0; }
+  bool is_empty() const { return _pc == NULL; }
 
   inline int       sp()     const { return _sp; }
   inline address   pc()     const { return _pc; }
   inline int       ref_sp() const { return _ref_sp; }
-  inline CodeBlob* cb()     const { return _cb; }
+
+  inline void set_sp(int sp) { _sp = sp; }
+
+  inline CodeBlob* cb()     const { assert (!Interpreter::contains(pc()), ""); return (CodeBlob*)_cb_imd; }
+  void set_cb(CodeBlob* cb) {
+    assert (!_is_interpreted, "");
+    if (_cb_imd == NULL) _cb_imd = cb;
+    assert (cb == slow_get_cb(*this), "");
+    assert (_cb_imd == cb, "");
+    assert (((CodeBlob*)_cb_imd)->contains(_pc), "");
+  }
   inline bool is_interpreted_frame() const { return _is_interpreted; } // due to partial copy below, this may lie in mode_fast
 
   template<op_mode mode>
@@ -370,7 +396,9 @@
   inline void set_pc(address pc) { _pc = pc; }
   inline void set_ref_sp(int ref_sp) { _ref_sp = ref_sp; }
 
-  template<typename FKind> address return_pc() const { return *return_pc_address<FKind>(); }
+  template<typename FKind> address return_pc() const { return *self().template return_pc_address<FKind>(); }
+
+  const CodeBlob* get_cb() const { return self().get_cb(); }
 
   const ImmutableOopMap* oop_map() const {
     if (_oop_map == NULL) {
@@ -382,8 +410,10 @@
   template<typename FKind> int frame_top_index() const;
   template<typename FKind> int frame_bottom_index() const { return self().template frame_bottom_index<FKind>(); };
 
-  template<typename FKind> inline void patch_return_pc(address value);
-
+  address real_pc(const ContMirror& cont) const;
+  void patch_pc(address pc, const ContMirror& cont) const;
+  template<typename FKind> inline void patch_return_pc(address value); // only for interpreted frames
+  
   int compiled_frame_size() const;
   int compiled_frame_num_oops() const;
   int compiled_frame_stack_argsize() const;
@@ -420,11 +450,16 @@
 // defines hframe
 #include CPU_HEADER(hframe)
 
-template<typename Self> bool FrameCommon<Self>::is_instance(const frame& f)  { return (Self::interpreted == f.is_interpreted_frame()) && (Self::stub == is_stub(f.cb())); }
-template<typename Self> bool FrameCommon<Self>::is_instance(const hframe& f) { return (Self::interpreted == f.is_interpreted_frame()) && (Self::stub == is_stub(f.cb())); }
-
-bool NonInterpretedUnknown::is_instance(const frame& f)  { return (interpreted == f.is_interpreted_frame()); }
-bool NonInterpretedUnknown::is_instance(const hframe& f) { return (interpreted == f.is_interpreted_frame()); }
+template<typename Self> 
+template <typename FrameT> 
+bool FrameCommon<Self>::is_instance(const FrameT& f) { 
+  return (Self::interpreted == f.is_interpreted_frame()) && (Self::stub == (!Self::interpreted && is_stub(slow_get_cb(f))));
+}
+
+template <typename FrameT> 
+bool NonInterpretedUnknown::is_instance(const FrameT& f) {
+  return (interpreted == f.is_interpreted_frame()); 
+}
 
 // Mirrors the Java continuation objects.
 // This object is created when we begin a freeze/thaw operation for a continuation, and is destroyed when the operation completes.
@@ -482,6 +517,7 @@
   oop raw_allocate(Klass* klass, size_t words, size_t elements, bool zero);
 
 public:
+  // TODO R: get rid of these:
   static inline int to_index(int x) { return x >> LogBytesPerElement; }
   static inline int to_bytes(int x)    { return x << LogBytesPerElement; }
   static inline int to_index(const void* base, const void* ptr) { return to_index((const char*)ptr - (const char*)base); }
@@ -495,7 +531,14 @@
   ContMirror(JavaThread* thread, oop cont);
   ContMirror(const RegisterMap* map);
 
-  DEBUG_ONLY(intptr_t hash() { return Thread::current()->is_Java_thread() ? _cont->identity_hash() : -1; })
+  intptr_t hash() { 
+    #ifndef PRODUCT
+      return Thread::current()->is_Java_thread() ? _cont->identity_hash() : -1;
+    #else
+      return 0;
+    #endif
+  }
+
   void write();
 
   oop mirror() { return _cont; }
@@ -598,21 +641,6 @@
 }
 
 template<typename SelfPD>
-const ImmutableOopMap* HFrameBase<SelfPD>::get_oop_map() const {
-  if (_cb == NULL) return NULL;
-  if (_cb->oop_maps() != NULL) {
-    NativePostCallNop* nop = nativePostCallNop_at(_pc);
-    if (nop != NULL && nop->displacement() != 0) {
-      int slot = ((nop->displacement() >> 24) & 0xff);
-      return _cb->oop_map_for_slot(slot, _pc);
-    }
-    const ImmutableOopMap* oop_map = OopMapSet::find_map(cb(), pc());
-    return oop_map;
-  }
-  return NULL;
-}
-
-template<typename SelfPD>
 address HFrameBase<SelfPD>::deopt_original_pc(const ContMirror& cont, address pc, CodeBlob* cb, int sp) {
   // TODO DEOPT: unnecessary in the long term solution of unroll on freeze
 
@@ -631,12 +659,25 @@
 }
 
 template<typename SelfPD>
+address HFrameBase<SelfPD>::real_pc(const ContMirror& cont) const {
+  address* pc_addr = cont.stack_address(self().pc_index());
+  return *pc_addr;
+}
+
+template<typename SelfPD>
 template<typename FKind>
 inline void HFrameBase<SelfPD>::patch_return_pc(address value) {
   *(self().template return_pc_address<FKind>()) = value;
 }
 
 template<typename SelfPD>
+void HFrameBase<SelfPD>::patch_pc(address pc, const ContMirror& cont) const {
+  address* pc_addr = (address*)cont.stack_address(self().pc_index());
+  // tty->print_cr(">>>> patching %p with %p", pc_addr, pc);
+  *pc_addr = pc;
+}
+
+template<typename SelfPD>
 template<typename FKind>
 bool HFrameBase<SelfPD>::is_bottom(const ContMirror& cont) const {
   return frame_bottom_index<FKind>()
@@ -660,13 +701,6 @@
 }
 
 template<typename SelfPD>
-int HFrameBase<SelfPD>::compiled_frame_stack_argsize() const {
-  assert (!_is_interpreted, "");
-  assert (cb()->is_compiled(), "");
-  return cb()->as_compiled_method()->method()->num_stack_arg_slots() * VMRegImpl::stack_slot_size;
-}
-
-template<typename SelfPD>
 inline int HFrameBase<SelfPD>::compiled_frame_num_oops() const {
   assert (!_is_interpreted, "");
   return oop_map()->num_oops();
@@ -674,8 +708,12 @@
 
 template<typename SelfPD>
 int HFrameBase<SelfPD>::compiled_frame_size() const {
-  assert (!_is_interpreted, "");
-  return cb()->frame_size() * wordSize;
+  return NonInterpretedUnknown::size(*this);
+}
+
+template<typename SelfPD>
+int HFrameBase<SelfPD>::compiled_frame_stack_argsize() const {
+  return NonInterpretedUnknown::stack_argsize(*this);
 }
 
 template<typename SelfPD>
@@ -687,11 +725,35 @@
   return _sp;
 }
 
+#ifdef CONT_DOUBLE_NOP
+// TODO R remove after PD separation
+template<op_mode mode>
+static inline CachedCompiledMetadata cached_metadata(const hframe& hf);
+#endif
+
 template<typename SelfPD>
 template<typename FKind, op_mode mode>
 SelfPD HFrameBase<SelfPD>::sender(const ContMirror& cont, const InterpreterOopMap* mask, int extra_oops) const {
   assert (mode != mode_fast || !FKind::interpreted, "");
-  return sender<FKind, mode>(cont, extra_oops + (FKind::interpreted ? interpreted_frame_num_oops(*mask) : compiled_frame_num_oops()));
+  int num_oops;
+#ifdef CONT_DOUBLE_NOP
+  CachedCompiledMetadata md;
+#endif
+  if (FKind::interpreted) {
+    num_oops = interpreted_frame_num_oops(*mask);
+  } else
+#ifdef CONT_DOUBLE_NOP
+  if (mode == mode_fast && LIKELY(!(md = cached_metadata<mode>(self())).empty()))
+    num_oops = md.num_oops();
+  else {
+    get_cb();
+#endif
+    num_oops = compiled_frame_num_oops();
+#ifdef CONT_DOUBLE_NOP
+  }
+#endif
+
+  return sender<FKind, mode>(cont, extra_oops + num_oops);
 }
 
 template<typename SelfPD>
@@ -844,16 +906,16 @@
 }
 
 bool ContMirror::is_empty() {
-  bool empty = _sp < 0 || _sp >= _stack->length();
-  assert (empty == (_pc == NULL), "");
-  return empty;
+  assert ((_pc == NULL) == (_sp < 0 || _sp >= _stack->length()), "");
+  return _pc == NULL;
 }
 
 template<op_mode mode>
 inline void ContMirror::set_last_frame(const hframe& f) {
-  // assert (f._length = _stack_length, "");
+  assert (mode != mode_fast || !Interpreter::contains(f.pc()), "");
+  assert (mode == mode_fast || f.is_interpreted_frame() == Interpreter::contains(f.pc()), "");
+  set_pc(f.pc(), mode == mode_fast ? false : f.is_interpreted_frame());
   set_sp(f.sp());
-  set_pc(f.pc(), mode == mode_fast ? false : f.is_interpreted_frame());
   set_last_frame_pd(f);
   set_refSP(f.ref_sp());
 
@@ -861,7 +923,7 @@
 
   if (log_develop_is_enabled(Trace, jvmcont)) {
     log_develop_trace(jvmcont)("set_last_frame cont sp: %d fp: 0x%lx pc: " INTPTR_FORMAT " interpreted: %d flag: %d", sp(), fp(), p2i(pc()), f.is_interpreted_frame(), is_flag(FLAG_LAST_FRAME_INTERPRETED));
-    f.print_on(*this, tty);
+    f.print_on(tty);
   }
 }
 
@@ -919,6 +981,7 @@
   assert (stack_index(to) >= 0, "");
   assert (to_index(_hstack, (address)to + size) <= _sp, "");
 
+  // TODO PERF non-temporal store
   PERFTEST_ONLY(if (PERFTEST_LEVEL >= 25))
     memcpy(to, from, size); //Copy::conjoint_memory_atomic(from, to, size); // Copy::disjoint_words((HeapWord*)from, (HeapWord*)to, size/wordSize); //
 
@@ -933,6 +996,7 @@
   assert (stack_index(from) >= 0, "");
   assert (to_index(stack(), (address)from + size) <= stack_length(), "index: %d length: %d", to_index(stack(), (address)from + size), stack_length());
 
+  // TODO PERF non-temporal load
   PERFTEST_ONLY(if (PERFTEST_LEVEL >= 125))
     memcpy(to, from, size); //Copy::conjoint_memory_atomic(from, to, size);
 
@@ -976,12 +1040,30 @@
 
 //////////////////////////// frame functions ///////////////
 
+class CachedCompiledMetadata; // defined in PD
+struct FpOopInfo;
+
+typedef int (*FreezeFnT)(address, address, address, address, int, FpOopInfo*);
+typedef int (*ThawFnT)(address /* dst */, address /* objArray */, address /* map */);
+
+
 class ContinuationHelper {
 public:
+#ifdef CONT_DOUBLE_NOP
+  static inline CachedCompiledMetadata cached_metadata(address pc);
+  template<op_mode mode, typename FrameT> static inline CachedCompiledMetadata cached_metadata(const FrameT& f);
+  template<typename FrameT> static void patch_freeze_stub(const FrameT& f, address freeze_stub);
+#endif
+
+  template<op_mode mode, typename FrameT> static FreezeFnT freeze_stub(const FrameT& f);
+  template<op_mode mode, typename FrameT> static ThawFnT thaw_stub(const FrameT& f);
+  
   template<typename FKind, typename RegisterMapT> static inline void update_register_map(RegisterMapT* map, const frame& f);
+  template<typename RegisterMapT> static inline void update_register_map_with_callee(RegisterMapT* map, const frame& f);
   template<typename RegisterMapT> static inline void update_register_map(RegisterMapT* map, hframe::callee_info callee_info);
-  static void update_register_map(RegisterMap* map, const hframe& h, const ContMirror& cont);
+  static void update_register_map(RegisterMap* map, const hframe& sender, const ContMirror& cont);
   static void update_register_map_from_last_vstack_frame(RegisterMap* map);
+
   static inline frame frame_with(frame& f, intptr_t* sp, address pc);
   static inline frame last_frame(JavaThread* thread);
   static inline void to_frame_info(const frame& f, const frame& callee, FrameInfo* fi);
@@ -1002,6 +1084,13 @@
   static bool assert_top_java_frame_name(const frame& f, const char* name);
   static bool assert_bottom_java_frame_name(const frame& f, const char* name);
   static inline bool is_deopt_return(address pc, const frame& sender);
+
+  template <typename FrameT> static CodeBlob* slow_get_cb(const FrameT& f);
+  template <typename FrameT> static const ImmutableOopMap* slow_get_oopmap(const FrameT& f);
+  template <typename FrameT> static int slow_size(const FrameT& f);
+  template <typename FrameT> static address slow_return_pc(const FrameT& f);
+  template <typename FrameT> static int slow_stack_argsize(const FrameT& f);
+  template <typename FrameT> static int slow_num_oops(const FrameT& f);
 #endif
 
 
@@ -1014,22 +1103,22 @@
   return m;
 }
 
-template<typename Self>
-inline address FrameCommon<Self>::return_pc(const frame& f) {
-  return *Self::return_pc_address(f);
-}
-
-template<typename Self>
-void FrameCommon<Self>::patch_return_pc(frame& f, address pc) {
-  *Self::return_pc_address(f) = pc;
-  log_develop_trace(jvmcont)("patched return_pc at " INTPTR_FORMAT ": " INTPTR_FORMAT, p2i(Self::return_pc_address(f)), p2i(pc));
-  // os::print_location(tty, (intptr_t)pc);
+address Frame::return_pc(const frame& f) {
+  return *return_pc_address(f);
 }
 
 // static void patch_interpreted_bci(frame& f, int bci) {
 //   f.interpreter_frame_set_bcp(f.interpreter_frame_method()->bcp_from(bci));
 // }
 
+address Interpreted::return_pc(const frame& f) {
+  return *return_pc_address(f);
+}
+
+void Interpreted::patch_return_pc(frame& f, address pc) {
+  *return_pc_address(f) = pc;
+}
+
 void Interpreted::oop_map(const frame& f, InterpreterOopMap* mask) {
   assert (mask != NULL, "");
   Method* m = f.interpreter_frame_method();
@@ -1093,25 +1182,26 @@
 #endif
 
 template<typename Self>
-inline int NonInterpreted<Self>::size(const frame&f) {
+template<typename FrameT>
+inline int NonInterpreted<Self>::size(const FrameT& f) {
   assert (!f.is_interpreted_frame() && Self::is_instance(f), "");
   return f.cb()->frame_size() * wordSize;
 }
 
 template<typename Self>
-inline int NonInterpreted<Self>::num_oops(const frame&f) {
+template<typename FrameT>
+inline int NonInterpreted<Self>::stack_argsize(const FrameT& f) {  assert (f.cb()->is_compiled(), "");
+  return f.cb()->as_compiled_method()->method()->num_stack_arg_slots() * VMRegImpl::stack_slot_size;
+}
+
+template<typename Self>
+inline int NonInterpreted<Self>::num_oops(const frame& f) {
   assert (!f.is_interpreted_frame() && Self::is_instance(f), "");
   assert (f.oop_map() != NULL, "");
   return f.oop_map()->num_oops() + Self::extra_oops;
 }
 
 template<typename Self>
-inline int NonInterpreted<Self>::stack_argsize(const frame&f) {
-  assert (f.cb()->is_compiled(), "");
-  return f.cb()->as_compiled_method()->method()->num_stack_arg_slots() * VMRegImpl::stack_slot_size;
-}
-
-template<typename Self>
 template<typename RegisterMapT>
 bool NonInterpreted<Self>::is_owning_locks(JavaThread* thread, const RegisterMapT* map, const frame& f) {
   // if (!DetectLocksInCompiledFrames) return false;
@@ -1475,12 +1565,11 @@
 #endif
 
   template<typename FKind> static inline frame sender(const frame& f);
-  template<typename FKind> static inline frame sender(const frame& f, hframe::callee_info* callee_info);
   template <typename FKind, bool top, bool bottom> inline void patch_pd(const frame& f, hframe& callee, const hframe& caller);
   template <bool bottom> inline void align(const hframe& caller);
   inline void relativize_interpreted_frame_metadata(const frame& f, intptr_t* vsp, const hframe& hf);
-  template<typename FKind> hframe new_callee_hframe(const frame& f, intptr_t* vsp, const hframe& caller, int fsize, int num_oops);
   template<bool cont_empty> hframe new_bottom_hframe(int sp, int ref_sp, address pc, bool interpreted);
+  template<typename FKind> hframe new_hframe(const frame& f, intptr_t* vsp, const hframe& caller, int fsize, int num_oops, int argsize);
 
 public:
 
@@ -1502,11 +1591,13 @@
 
     HandleMark hm(_thread);
 
+    // tty->print_cr(">>> freeze mode: %d", mode);
+
     // assert (map.update_map(), "RegisterMap not set to update");
     assert (!_map.include_argument_oops(), "should be");
     frame f = freeze_start_frame(_map);
     hframe caller;
-    return freeze<true>(f, caller, Frame::map_link_address(&_map), 0);
+    return freeze<true>(f, caller, 0);
   }
 
   frame freeze_start_frame(SmallRegisterMap& ignored) {
@@ -1520,8 +1611,12 @@
     // Note: if the doYield stub does not have its own frame, we may need to consider deopt here, especially if yield is inlinable
     frame f = ContinuationHelper::last_frame(_thread); // thread->last_frame();
     assert(StubRoutines::cont_doYield_stub()->contains(f.pc()), "must be");
-    ContinuationHelper::update_register_map<StubF>(&_map, f);
-    f = sender<StubF>(f);  // this is the yield frame
+  #ifdef ASSERT
+    hframe::callee_info my_info = slow_link_address<StubF>(f);
+  #endif
+    f = sender<StubF>(f);
+    assert (Frame::callee_link_address(f) == my_info, "");
+    // ContinuationHelper::update_register_map_with_callee(&_map, f);
 
     // The following doesn't work because fi->fp can contain an oop, that a GC doesn't know about when walking.
     // frame::update_map_with_saved_link(&map, (intptr_t **)&fi->fp);
@@ -1563,30 +1658,33 @@
   }
 
   template<bool top>
-  NOINLINE freeze_result freeze(const frame& f, hframe& caller, hframe::callee_info callee_info, int callee_argsize) {
+  NOINLINE freeze_result freeze(const frame& f, hframe& caller, int callee_argsize) {
     assert (f.unextended_sp() < _bottom_address - SP_WIGGLE, ""); // see recurse_freeze_java_frame
     assert (f.is_interpreted_frame() || ((top && mode == mode_preempt) == is_stub(f.cb())), "");
-    assert (mode != mode_fast || (f.is_compiled_frame() && f.oop_map() != NULL), "");
+    assert (mode != mode_fast || (!f.is_interpreted_frame() && slow_get_cb(f)->is_compiled()), "");
     assert (mode != mode_fast || !f.is_deoptimized_frame(), "");
 
     // Dynamically branch on frame type
     if (mode == mode_fast || f.is_compiled_frame()) {
-      if (mode != mode_fast && f.oop_map() == NULL)            return freeze_pinned_native; // special native frame
-      if (Compiled::is_owning_locks(_cont.thread(), &_map, f)) return freeze_pinned_monitor;
-
-      assert (f.oop_map() != NULL, "");
+      if (UNLIKELY(mode != mode_fast && f.oop_map() == NULL))            return freeze_pinned_native; // special native frame
+      if (UNLIKELY(
+               #ifdef CONT_DOUBLE_NOP
+                   !(mode == mode_fast && !ContinuationHelper::cached_metadata<mode>(f).empty()) &&
+               #endif
+                   Compiled::is_owning_locks(_cont.thread(), &_map, f))) return freeze_pinned_monitor;
+
       // Keepalive info here...
       CompiledMethodKeepaliveT kd(f.cb()->as_compiled_method(), _keepalive, _thread);
       if (kd.required()) {
         _keepalive = &kd;
-        return recurse_freeze_compiled_frame<top, true>(f, caller, callee_info, &kd);
+        return recurse_freeze_compiled_frame<top, true>(f, caller, &kd);
       }
 
-      return recurse_freeze_compiled_frame<top, false>(f, caller, callee_info, &kd);
+      return recurse_freeze_compiled_frame<top, false>(f, caller, &kd);
     } else if (f.is_interpreted_frame()) {
       if (Interpreted::is_owning_locks(f)) return freeze_pinned_monitor;
 
-      return recurse_freeze_interpreted_frame<top>(f, caller, callee_info, callee_argsize);
+      return recurse_freeze_interpreted_frame<top>(f, caller, callee_argsize);
     } else if (mode == mode_preempt && top && is_stub(f.cb())) {
       return recurse_freeze_stub_frame(f, caller);
     } else {
@@ -1595,26 +1693,30 @@
   }
 
   template<typename FKind, bool top, bool IsKeepalive>
-  inline freeze_result recurse_freeze_java_frame(const frame& f, hframe& caller, hframe::callee_info callee_info, int fsize, int argsize, int oops, typename FKind::ExtraT extra, CompiledMethodKeepaliveT* kd) {
+  inline freeze_result recurse_freeze_java_frame(const frame& f, hframe& caller, int fsize, int argsize, int oops, typename FKind::ExtraT extra, CompiledMethodKeepaliveT* kd) {
     assert (FKind::is_instance(f), "");
     log_develop_trace(jvmcont)("recurse_freeze_java_frame fsize: %d oops: %d", fsize, oops);
 
-    hframe::callee_info my_info;
-    frame senderf = sender<FKind>(f, &my_info);
+  #ifdef ASSERT
+    hframe::callee_info my_info = slow_link_address<FKind>(f);
+  #endif
+    frame senderf = sender<FKind>(f); // f.sender_for_compiled_frame<ContinuationCodeBlobLookup>(&map);
+    assert (FKind::interpreted || senderf.sp() == senderf.unextended_sp(), "");
+    assert (Frame::callee_link_address(senderf) == my_info, "");
 
     // sometimes an interpreted caller's sp extends a bit below entrySP, plus another word for possible alignment of compiled callee
     if (senderf.unextended_sp() >= _bottom_address - SP_WIGGLE) { // dynamic branch
+      if (UNLIKELY(!allocate()))
+        return freeze_exception;
+
       // senderf is the entry frame
-      freeze_result result = finalize<FKind>(senderf, f, argsize, caller); // recursion end
-      if (UNLIKELY(result != freeze_ok))
-        return result;
-
-      ContinuationHelper::update_register_map(&_map, callee_info); // restore saved link
+      argsize = finalize<FKind>(senderf, f, caller); // recursion end
+
       freeze_java_frame<FKind, top, true, IsKeepalive>(f, caller, fsize, argsize, oops, extra, kd);
 
       if (log_develop_is_enabled(Trace, jvmcont)) {
         log_develop_trace(jvmcont)("bottom h-frame:");
-        caller.print(_cont); // caller is now the current hframe
+        caller.print_on(tty); // caller is now the current hframe
       }
     } else {
       bool safepoint_stub_caller; // the use of _safepoint_stub_caller is not nice, but given preemption being performance non-critical, we don't want to add either a template or a regular parameter
@@ -1623,12 +1725,11 @@
         _safepoint_stub_caller = false;
       }
 
-      freeze_result result = freeze<false>(senderf, caller, my_info, argsize); // recursive call
+      freeze_result result = freeze<false>(senderf, caller, argsize); // recursive call
       if (UNLIKELY(result != freeze_ok))
         return result;
 
       if (mode == mode_preempt) _safepoint_stub_caller = safepoint_stub_caller; // restore _stub_caller
-      ContinuationHelper::update_register_map(&_map, callee_info);  // restore saved link
 
       freeze_java_frame<FKind, top, false, IsKeepalive>(f, caller, fsize, argsize, oops, extra, kd);
     }
@@ -1651,8 +1752,13 @@
     }
   }
 
+  inline bool allocate() {
+    _cont.allocate_stacks<ConfigT>(_size, _oops, _frames);
+    return !_thread->has_pending_exception();
+  }
+
   template<typename FKind> // the callee's type
-  NOINLINE freeze_result finalize(const frame& f, const frame& callee, int argsize, hframe& caller) {
+  int finalize(const frame& f, const frame& callee, hframe& caller) {
   #ifdef CALLGRIND_START_INSTRUMENTATION
     if (_frames > 0 && _cgrind_interpreted_frames == 0 && callgrind_counter == 1) {
       callgrind_counter = 2;
@@ -1690,18 +1796,30 @@
 
     allocate_keepalive();
 
+    int argsize = 0;
     if (_cont.is_empty()) {
-      assert (argsize == 0, ""); // the entry frame has an argsize of 0
       caller = new_bottom_hframe<true>(_cont.sp(), _cont.refSP(), NULL, false);
     } else {
       assert (_cont.is_flag(FLAG_LAST_FRAME_INTERPRETED) == Interpreter::contains(_cont.pc()), "");
       int sp = _cont.sp();
-      if (_cont.is_flag(FLAG_LAST_FRAME_INTERPRETED)) {
-        log_develop_trace(jvmcont)("finalize _size: %d add argsize: %d", _size, argsize);
-        _size += argsize;
-      } else {
-        // the arguments of the bottom-most frame are part of the topmost compiled frame on the hstack; we overwrite that part
-        sp += argsize >> LogBytesPerElement;
+
+      if (!FKind::interpreted) {
+    #ifdef CONT_DOUBLE_NOP
+        CachedCompiledMetadata md = ContinuationHelper::cached_metadata<mode>(callee);
+        if (LIKELY(!md.empty())) {
+          argsize = md.stack_argsize();
+          assert(argsize == slow_stack_argsize(callee), "argsize: %d slow_stack_argsize: %d", argsize, slow_stack_argsize(callee));
+        } else
+    #endif
+          argsize = Compiled::stack_argsize(callee);
+
+        if (_cont.is_flag(FLAG_LAST_FRAME_INTERPRETED)) {
+          log_develop_trace(jvmcont)("finalize _size: %d add argsize: %d", _size, argsize);
+          _size += argsize;
+        } else {
+          // the arguments of the bottom-most frame are part of the topmost compiled frame on the hstack; we overwrite that part
+          sp += argsize >> LogBytesPerElement;
+        }
       }
       caller = new_bottom_hframe<false>(sp, _cont.refSP(), _cont.pc(), _cont.is_flag(FLAG_LAST_FRAME_INTERPRETED));
     }
@@ -1712,7 +1830,7 @@
     _cont.add_size(_size);
     _cont.e_add_refs(_oops);
 
-    return freeze_ok;
+    return argsize;
   }
 
   template<typename FKind> // the callee's type
@@ -1761,10 +1879,11 @@
     if (LIKELY(!FKind::interpreted && extra != NULL)) { // dynamic branch
       FreezeFnT f_fn = (FreezeFnT)extra;
       // tty->print_cr(">>>>0000<<<<<");
-      frozen = freeze_compiled_oops_stub(f_fn, vsp, hsp, &_map, index);
+      frozen = freeze_compiled_oops_stub(f_fn, f, vsp, hsp, index);
     } else {
       if (num_oops == 0)
         return;
+      ContinuationHelper::update_register_map_with_callee(&_map, f); // restore saved link
       frozen = FKind::interpreted ? freeze_intepreted_oops(f, vsp, hsp, index, *(InterpreterOopMap*)extra)
                                   : freeze_compiled_oops  (f, vsp, hsp, index);
     }
@@ -1775,22 +1894,26 @@
   void patch(const frame& f, hframe& hf, const hframe& caller) {
     assert (FKind::is_instance(f), "");
     assert (bottom || !caller.is_empty(), "");
-    assert (bottom || mode == mode_fast || Interpreter::contains(hf.return_pc<FKind>()) == caller.is_interpreted_frame(), "");
+    // in fast mode, partial copy does not copy _is_interpreted for the caller
+    assert (bottom || mode == mode_fast || Interpreter::contains(FKind::interpreted ? hf.return_pc<FKind>() : caller.real_pc(_cont)) == caller.is_interpreted_frame(), 
+      "FKind: %s contains: %d is_interpreted: %d", FKind::name, Interpreter::contains(FKind::interpreted ? hf.return_pc<FKind>() : caller.real_pc(_cont)), caller.is_interpreted_frame()); // fails for perftest < 25, but that's ok
     assert (!bottom || !_cont.is_empty() || (_cont.fp() == 0 && _cont.pc() == NULL), "");
     assert (!bottom || _cont.is_empty() || caller == _cont.last_frame<mode_slow>(), "");
     assert (!bottom || _cont.is_empty() || Continuation::is_cont_barrier_frame(f), "");
     assert (!bottom || _cont.is_flag(FLAG_LAST_FRAME_INTERPRETED) == Interpreter::contains(_cont.pc()), "");
+    assert (!FKind::interpreted || hf.interpreted_link_address() == _cont.stack_address(hf.fp()), "");
 
     if (bottom) {
       log_develop_trace(jvmcont)("Fixing return address on bottom frame: " INTPTR_FORMAT, p2i(_cont.pc()));
-      hf.patch_return_pc<FKind>(_cont.pc());
+      FKind::interpreted ? hf.patch_return_pc<FKind>(_cont.pc())
+                         : caller.patch_pc(_cont.pc(), _cont); // TODO PERF non-temporal store
     }
 
     patch_pd<FKind, top, bottom>(f, hf, caller);
 
 #ifdef ASSERT
     // TODO DEOPT: long term solution: unroll on freeze and patch pc
-    if (!FKind::interpreted && !FKind::stub) {
+    if (mode != mode_fast && !FKind::interpreted && !FKind::stub) {
       assert (hf.cb()->is_compiled(), "");
       if (f.is_deoptimized_frame()) {
         log_develop_trace(jvmcont)("Freezing deoptimized frame");
@@ -1802,7 +1925,7 @@
   }
 
   template<bool top>
-  NOINLINE freeze_result recurse_freeze_interpreted_frame(const frame& f, hframe& caller, hframe::callee_info callee_info, int callee_argsize) {
+  NOINLINE freeze_result recurse_freeze_interpreted_frame(const frame& f, hframe& caller, int callee_argsize) {
     // ResourceMark rm(_thread);
     InterpreterOopMap mask;
     Interpreted::oop_map(f, &mask);
@@ -1815,7 +1938,7 @@
     _frames++;
     _cgrind_interpreted_frames++;
 
-    return recurse_freeze_java_frame<Interpreted, top, false>(f, caller, callee_info, fsize, 0, oops, &mask, NULL);
+    return recurse_freeze_java_frame<Interpreted, top, false>(f, caller, fsize, 0, oops, &mask, NULL);
   }
 
   template <bool top, bool bottom>
@@ -1823,7 +1946,7 @@
     intptr_t* vsp = Interpreted::frame_top(f, mask);
     assert ((Interpreted::frame_bottom(f) - vsp) * sizeof(intptr_t) == (size_t)fsize, "");
 
-    hframe hf = new_callee_hframe<Interpreted>(f, vsp, caller, fsize, oops);
+    hframe hf = new_hframe<Interpreted>(f, vsp, caller, fsize, oops, 0);
     intptr_t* hsp = _cont.stack_address(hf.sp());
 
     freeze_raw_frame(vsp, hsp, fsize);
@@ -1846,10 +1969,25 @@
   }
 
   template<bool top, bool IsKeepalive>
-  freeze_result recurse_freeze_compiled_frame(const frame& f, hframe& caller, hframe::callee_info callee_info, CompiledMethodKeepaliveT* kd) {
-    int fsize = Compiled::size(f);
-    int oops  = Compiled::num_oops(f);
-    int argsize = Compiled::stack_argsize(f);
+  freeze_result recurse_freeze_compiled_frame(const frame& f, hframe& caller, CompiledMethodKeepaliveT* kd) {
+    int fsize, oops, argsize;
+#ifdef CONT_DOUBLE_NOP
+    CachedCompiledMetadata md = ContinuationHelper::cached_metadata<mode>(f); // MUST BE SAFE FOR STUB CALLER; we're not at a call instruction
+    fsize = md.size();
+    if (LIKELY(fsize != 0)) {
+      oops = md.num_oops();
+      argsize = md.stack_argsize();
+
+      assert(fsize == slow_size(f), "fsize: %d slow_size: %d", fsize, slow_size(f));
+      assert(oops  == slow_num_oops(f), "oops: %d slow_num_oops: %d", oops, slow_num_oops(f));
+      assert(argsize == slow_stack_argsize(f), "argsize: %d slow_stack_argsize: %d", argsize, slow_stack_argsize(f));
+    } else
+#endif
+    {
+      fsize = Compiled::size(f);
+      oops  = Compiled::num_oops(f);
+      argsize = mode == mode_fast ? 0 : Compiled::stack_argsize(f);
+    }
     FreezeFnT f_fn = get_oopmap_stub(f); // try to do this early, so we wouldn't need to look at the oopMap again.
 
     log_develop_trace(jvmcont)("recurse_freeze_compiled_frame _size: %d add fsize: %d", _size, fsize);
@@ -1857,8 +1995,8 @@
     _oops += oops;
     _frames++;
 
-    // TODO: consider recalculating fsize, argsize and oops in freeze_compiled_frame instead of passing them, as we now do in thaw
-    return recurse_freeze_java_frame<Compiled, top, IsKeepalive>(f, caller, callee_info, fsize, argsize, oops, f_fn, kd);
+    // TODO PERF: consider recalculating fsize, argsize and oops in freeze_compiled_frame instead of passing them, as we now do in thaw
+    return recurse_freeze_java_frame<Compiled, top, IsKeepalive>(f, caller, fsize, argsize, oops, f_fn, kd);
   }
 
   template <typename FKind, bool top, bool bottom, bool IsKeepalive>
@@ -1876,12 +2014,15 @@
     // in mode_fast we must not look at caller.is_interpreted_frame() because it may be wrong (hframe::partial_copy)
 
     if (bottom || (mode != mode_fast && caller.is_interpreted_frame())) {
+      if (!bottom) { // if we're bottom, argsize has been computed in finalize
+        argsize = Compiled::stack_argsize(f);
+      }
       log_develop_trace(jvmcont)("freeze_compiled_frame add argsize: fsize: %d argsize: %d fsize: %d", fsize, argsize, fsize + argsize);
       fsize += argsize;
       align<bottom>(caller); // TODO PERF
     }
 
-    hframe hf = new_callee_hframe<FKind>(f, vsp, caller, fsize, oops);
+    hframe hf = new_hframe<FKind>(f, vsp, caller, fsize, oops, argsize);
     intptr_t* hsp = _cont.stack_address(hf.sp());
 
     freeze_raw_frame(vsp, hsp, fsize);
@@ -1910,22 +2051,23 @@
 
     patch<FKind, top, bottom>(f, hf, caller);
 
-    log_develop_trace(jvmcont)("freeze_compiled_frame real_pc: %p address: %p sp: %p", Frame::real_pc(f), &(((address*) f.sp())[-1]), f.sp());
-
-    assert(bottom || mode == mode_fast || Interpreter::contains(hf.return_pc<FKind>()) == caller.is_interpreted_frame(), "");
+    // log_develop_trace(jvmcont)("freeze_compiled_frame real_pc: " INTPTR_FORMAT " address: " INTPTR_FORMAT " sp: " INTPTR_FORMAT, p2i(Frame::real_pc(f)), p2i(&(((address*) f.sp())[-1])), p2i(f.sp()));
+    assert(bottom || mode == mode_fast || Interpreter::contains(caller.real_pc(_cont)) == caller.is_interpreted_frame(), "");
 
     return hf;
   }
 
   int freeze_compiled_oops(const frame& f, intptr_t* vsp, intptr_t* hsp, int starting_index) {
-    const ImmutableOopMap* oopmap = f.oop_map();
-    assert(oopmap, "must have");
-    // if (oopmap->num_oops() == 0) {
-    //   return 0;
-    // }
-
-    if (mode != mode_preempt && ConfigT::allow_stubs && oopmap->freeze_stub() == NULL) {
-      oopmap->generate_stub();
+    if (mode != mode_preempt && ConfigT::allow_stubs && get_oopmap_stub(f) == NULL) {
+  #ifdef CONT_DOUBLE_NOP
+      f.get_cb();
+  #endif
+      const ImmutableOopMap* oopmap = f.oop_map();
+      assert(oopmap, "must have");
+      oopmap->generate_stub(f.cb());
+  #ifdef CONT_DOUBLE_NOP
+      ContinuationHelper::patch_freeze_stub(f, (address)get_oopmap_stub(f));
+  #endif
       log_develop_trace(jvmcont)("freeze_compiled_oops generating oopmap stub; success: %d", get_oopmap_stub(f) != NULL);
       // tty->print_cr(">>>> generating oopmap stub; success: %d <<<<<", get_oopmap_stub(f) != NULL);
       // f.print_on(tty);
@@ -1934,7 +2076,7 @@
 
     if (mode != mode_preempt && ConfigT::allow_stubs && stub != NULL) {
       assert (_safepoint_stub.is_empty(), "");
-      return freeze_compiled_oops_stub(stub, vsp, hsp, &_map, starting_index);
+      return freeze_compiled_oops_stub(stub, f, vsp, hsp, starting_index);
     } else {
       // tty->print_cr(">>>>33333<<<<<");
       intptr_t *stub_vsp = NULL;
@@ -1948,6 +2090,12 @@
   #endif
       }
 
+  #ifdef CONT_DOUBLE_NOP
+      f.get_cb();
+  #endif
+      const ImmutableOopMap* oopmap = f.oop_map();
+      assert(oopmap, "must have");
+
       FreezeOopFn oopFn(&_cont, &_fp_oop_info, &f, vsp, hsp, &_map, starting_index, stub_vsp, stub_hsp);
 
       OopMapDo<FreezeOopFn, FreezeOopFn, IncludeAllValues> visitor(&oopFn, &oopFn);
@@ -1958,10 +2106,12 @@
     }
   }
 
-  int freeze_compiled_oops_stub(FreezeFnT f_fn, intptr_t* vsp, intptr_t* hsp, RegisterMapT* map, int starting_index) {
+  inline int freeze_compiled_oops_stub(FreezeFnT f_fn, const frame& f, intptr_t* vsp, intptr_t* hsp, int starting_index) {
     // tty->print_cr(">>>>2222<<<<<");
+    // ContinuationHelper::update_register_map_with_callee(&_map, f);
+    intptr_t** link_addr = Frame::callee_link_address(f); // Frame::map_link_address(map);
     typename ConfigT::OopT* addr = _cont.refStack()->template obj_at_address<typename ConfigT::OopT>(starting_index);
-    int cnt = f_fn( (address) vsp,  (address) addr, (address) Frame::map_link_address(map), (address) hsp, _cont.refStack()->length() - starting_index, &_fp_oop_info);
+    int cnt = f_fn( (address) vsp,  (address) addr, (address) link_addr, (address) hsp, _cont.refStack()->length() - starting_index, &_fp_oop_info);
     return cnt;
   }
 
@@ -1976,11 +2126,10 @@
 
     if (log_develop_is_enabled(Trace, jvmcont)) {
       log_develop_trace(jvmcont)("top_hframe after (freeze):");
-      _cont.last_frame<mode_slow>().print_on(_cont, tty);
+      _cont.last_frame<mode_preempt>().print_on(_cont, tty);
     }
 
-    assert (_cont.is_flag(FLAG_LAST_FRAME_INTERPRETED) == _cont.last_frame<mode_slow>().is_interpreted_frame(),
-      "flag: %d is_interpreted: %d", _cont.is_flag(FLAG_LAST_FRAME_INTERPRETED), _cont.last_frame<mode_slow>().is_interpreted_frame());
+    assert (_cont.is_flag(FLAG_LAST_FRAME_INTERPRETED) == _cont.last_frame<mode>().is_interpreted_frame(), "");
   }
 
   NOINLINE freeze_result recurse_freeze_stub_frame(const frame& f, hframe& caller) {
@@ -1993,16 +2142,19 @@
     assert (mode == mode_preempt, "");
     _safepoint_stub = f;
 
-    hframe::callee_info my_info;
-    frame senderf = sender<StubF>(f, &my_info); // f.sender_for_compiled_frame<ContinuationCodeBlobLookup>(&map);
-
+  #ifdef ASSERT
+    hframe::callee_info my_info = slow_link_address<StubF>(f);
+  #endif
+    frame senderf = sender<StubF>(f); // f.sender_for_compiled_frame<ContinuationCodeBlobLookup>(&map);
+
+    assert (Frame::callee_link_address(senderf) == my_info, "");
     assert (senderf.unextended_sp() < _bottom_address - SP_WIGGLE, "");
-    assert (senderf.is_compiled_frame(), "");
+    assert (senderf.is_compiled_frame(), ""); // TODO has been seen to fail in Preempt.java with -XX:+DeoptimizeALot
     assert (senderf.oop_map() != NULL, "");
 
     // we can have stub_caller as a value template argument, but that's unnecessary
     _safepoint_stub_caller = true;
-    freeze_result result = recurse_freeze_compiled_frame<false, false>(senderf, caller, my_info, NULL);
+    freeze_result result = recurse_freeze_compiled_frame<false, false>(senderf, caller, NULL);
     if (result == freeze_ok) {
       finish(f, _safepoint_stub_h);
     }
@@ -2030,12 +2182,7 @@
   inline FreezeFnT get_oopmap_stub(const frame& f) {
     if (!ConfigT::allow_stubs)
       return NULL;
-
-    FreezeFnT f_fn = (FreezeFnT)f.oop_map()->freeze_stub();
-    if ((void*)f_fn == (void*)f.oop_map()) {
-      f_fn = NULL; // need CompressedOops for now ????
-    }
-    return f_fn;
+    return ContinuationHelper::freeze_stub<mode>(f);
   }
 
   inline void freeze_raw_frame(intptr_t* vsp, intptr_t* hsp, int fsize) {
@@ -2066,7 +2213,6 @@
       int index = add_oop(obj, _starting_index + this->_count - 1);
 
   #ifdef ASSERT
-      // oop obj = NativeAccess<>::oop_load(p);
       print_oop(p, obj);
       assert (oopDesc::is_oop_or_null(obj), "invalid oop");
       log_develop_trace(jvmcont)("narrow: %d", sizeof(T) < wordSize);
@@ -2488,9 +2634,7 @@
   template<typename FKind, bool top, bool bottom> inline void patch_pd(frame& f, const frame& sender);
   void derelativize_interpreted_frame_metadata(const hframe& hf, const frame& f);
   inline hframe::callee_info frame_callee_info_address(frame& f);
-  template<typename FKind, bool top, bool bottom> inline intptr_t* align(const hframe& hf, intptr_t* vsp, const frame& caller);
-
-  typedef int (*ThawFnT)(address /* dst */, address /* objArray */, address /* map */);
+  template<typename FKind, bool top, bool bottom> inline intptr_t* align(const hframe& hf, intptr_t* vsp, frame& caller);
 
   bool should_deoptimize() { return true; /* mode != mode_fast && _thread->is_interp_only_mode(); */ } // TODO PERF
 
@@ -2522,7 +2666,7 @@
 
     assert (_cont.num_frames() == orig_num_frames - _frames, "cont.is_empty: %d num_frames: %d orig_num_frames: %d frame_count: %d", _cont.is_empty(), _cont.num_frames(), orig_num_frames, _frames);
     assert (mode != mode_fast || _fastpath, "");
-    return mode == mode_fast ? true : _fastpath;
+    return _fastpath;
   }
 
   template<bool top>
@@ -2599,8 +2743,17 @@
     } else {
       _cont.set_last_frame<mode>(hf); // _last_frame = hf;
       if (!FKind::interpreted && !hf.is_interpreted_frame()) {
+        int argsize;
+    #ifdef CONT_DOUBLE_NOP
+        CachedCompiledMetadata md = ContinuationHelper::cached_metadata<mode>(callee);
+        if (LIKELY(!md.empty())) {
+          argsize = md.stack_argsize();
+          assert(argsize == slow_stack_argsize(callee), "argsize: %d slow_stack_argsize: %d", argsize, slow_stack_argsize(callee));
+        } else
+    #endif
+          argsize = callee.compiled_frame_stack_argsize();
         // we'll be subtracting the argsize in thaw_compiled_frame, but if the caller is compiled, we shouldn't
-        _cont.add_size(callee.compiled_frame_stack_argsize());
+        _cont.add_size(argsize);
       } 
       // else {
       //   _fastpath = false; // see discussion in Freeze::freeze_compiled_frame
@@ -2673,13 +2826,13 @@
 
   template<typename FKind, bool top, bool bottom>
   inline void patch(frame& f, const frame& caller) {
-    assert (!bottom || caller.sp() == _cont.entrySP(), "caller.sp: " INTPTR_FORMAT " entrySP: " INTPTR_FORMAT, p2i(caller.sp()), p2i(_cont.entrySP()));
-
     if (bottom && !_cont.is_empty()) {
       log_develop_trace(jvmcont)("Setting return address to return barrier: " INTPTR_FORMAT, p2i(StubRoutines::cont_returnBarrier()));
-      FKind::patch_return_pc(f, StubRoutines::cont_returnBarrier());
+      FKind::interpreted ? Interpreted::patch_return_pc(f, StubRoutines::cont_returnBarrier())
+                         : FKind::patch_pc(caller, StubRoutines::cont_returnBarrier());
     } else if (bottom || should_deoptimize()) {
-      FKind::patch_return_pc(f, caller.raw_pc()); // this patches the return address to the deopt handler if necessary
+      FKind::interpreted ? Interpreted::patch_return_pc(f, caller.raw_pc())
+                         : FKind::patch_pc(caller, caller.raw_pc()); // this patches the return address to the deopt handler if necessary
     }
     patch_pd<FKind, top, bottom>(f, caller);
 
@@ -2699,7 +2852,7 @@
     int fsize = hf.interpreted_frame_size();
     int oops  = hf.interpreted_frame_num_oops(mask);
 
-    return recurse_thaw_java_frame<Interpreted, top>(hf, caller, num_frames, (void*)&mask);
+    recurse_thaw_java_frame<Interpreted, top>(hf, caller, num_frames, (void*)&mask);
   }
 
   template<bool top, bool bottom>
@@ -2752,19 +2905,42 @@
   frame thaw_compiled_frame(const hframe& hf, const frame& caller, ThawFnT t_fn) {
     thaw_compiled_frame_bp();
     assert(FKind::stub == is_stub(hf.cb()), "");
-
-    int fsize = hf.compiled_frame_size();
+    assert (caller.sp() == caller.unextended_sp(), "");
+
+    int fsize;
+#ifdef CONT_DOUBLE_NOP
+    CachedCompiledMetadata md;
+    if (mode != mode_preempt) {
+      md = ContinuationHelper::cached_metadata(hf.pc());
+      fsize = md.size();
+    }
+    if (mode == mode_preempt || UNLIKELY(fsize == 0))
+#endif
+      fsize = hf.compiled_frame_size();
+    assert(fsize == slow_size(hf), "argsize: %d slow_size: %d", fsize, slow_size(hf));
     log_develop_trace(jvmcont)("fsize: %d", fsize);
 
-    intptr_t* vsp = (intptr_t*)((address)caller.sp() - fsize);
+    intptr_t* vsp = (intptr_t*)((address)caller.unextended_sp() - fsize);
     log_develop_trace(jvmcont)("vsp: " INTPTR_FORMAT, p2i(vsp));
 
     if (bottom || (mode != mode_fast && caller.is_interpreted_frame())) {
       log_develop_trace(jvmcont)("thaw_compiled_frame add argsize: fsize: %d argsize: %d fsize: %d", fsize, hf.compiled_frame_stack_argsize(), fsize + hf.compiled_frame_stack_argsize());
-      int argsize = hf.compiled_frame_stack_argsize();
+      int argsize;
+  #ifdef CONT_DOUBLE_NOP
+      if (mode != mode_preempt && LIKELY(!md.empty())) {
+        argsize = md.stack_argsize();
+        assert(argsize == slow_stack_argsize(hf), "argsize: %d slow_stack_argsize: %d", argsize, slow_stack_argsize(hf));
+      } else
+  #endif
+        argsize = hf.compiled_frame_stack_argsize();
+
       fsize += argsize;
       vsp   -= argsize >> LogBytesPerWord;
-      vsp = align<FKind, top, bottom>(hf, vsp, caller);
+
+      const_cast<frame&>(caller).set_sp((intptr_t*)((address)caller.sp() - argsize));
+      assert (caller.sp() == (intptr_t*)((address)vsp + (fsize-argsize)), "");
+
+      vsp = align<FKind, top, bottom>(hf, vsp, const_cast<frame&>(caller));
     }
 
     _cont.sub_size(fsize);
@@ -2790,7 +2966,9 @@
     _cont.dec_num_frames();
 
     if (!FKind::stub) {
-      if (should_deoptimize() && !f.is_deoptimized_frame()
+      if (f.is_deoptimized_frame()) { // TODO PERF
+        _fastpath = false;
+      } else if (should_deoptimize()
           && (hf.cb()->as_compiled_method()->is_marked_for_deoptimization() || (mode != mode_fast && _thread->is_interp_only_mode()))) {
         log_develop_trace(jvmcont)("Deoptimizing thawed frame");
         DEBUG_ONLY(Frame::patch_pc(f, NULL));
@@ -2803,7 +2981,8 @@
         assert (f.is_deoptimized_frame() && is_deopt_return(f.raw_pc(), f), 
           "f.is_deoptimized_frame(): %d is_deopt_return(f.raw_pc()): %d is_deopt_return(f.pc()): %d", 
           f.is_deoptimized_frame(), is_deopt_return(f.raw_pc(), f), is_deopt_return(f.pc(), f));
-      }
+        _fastpath = false;
+      } 
     }
 
     return f;
@@ -2911,12 +3090,7 @@
   inline ThawFnT get_oopmap_stub(const hframe& f) {
     if (!ConfigT::allow_stubs)
       return NULL;
-
-    ThawFnT t_fn = (ThawFnT)f.oop_map()->thaw_stub();
-    if ((void*)t_fn == (void*)f.oop_map()) {
-      t_fn = NULL; // need CompressedOops for now ????
-    }
-    return t_fn;
+    return ContinuationHelper::thaw_stub<mode>(f);
   }
 
   inline void thaw_raw_frame(intptr_t* hsp, intptr_t* vsp, int fsize) {
@@ -3110,6 +3284,12 @@
 // caller is still frozen on the h-stack.
 // The continuation object can be extracted from the thread.
 bool Continuation::is_cont_barrier_frame(const frame& f) {
+#ifdef CONT_DOUBLE_NOP
+  #ifdef ASSERT
+    if (!f.is_interpreted_frame()) return is_return_barrier_entry(slow_return_pc(f));
+  #endif
+#endif
+  assert (f.is_interpreted_frame() || f.cb() != NULL, "");
   return is_return_barrier_entry(f.is_interpreted_frame() ? Interpreted::return_pc(f) : Compiled::return_pc(f));
   // return is_return_barrier_entry(CHOOSE1(f.is_interpreted_frame(), return_pc, f));
 }
@@ -3264,7 +3444,7 @@
 static frame continuation_top_frame(oop contOop, RegisterMap* map) {
   ContMirror cont(NULL, contOop);
 
-  hframe hf = cont.last_frame<mode_slow>();
+  hframe hf = cont.last_frame<mode_preempt>(); // here mode_preempt merely makes the fewest assumptions
   assert (!hf.is_empty(), "");
 
   // tty->print_cr(">>>> continuation_top_frame");
@@ -3350,7 +3530,7 @@
       if (is_stub(f.cb())) {
         f.oop_map()->update_register_map(&f, map); // we have callee-save registers in this case
       }
-      ContinuationHelper::update_register_map(map, hf, cont);
+      ContinuationHelper::update_register_map(map, sender, cont);
     }
   }
 
@@ -3435,7 +3615,7 @@
   return ioc.index();
 }
 
-static int find_oop_in_interpreted_frame(const hframe& hf, int offset, const InterpreterOopMap& mask) {
+static int find_oop_in_interpreted_frame(const hframe& hf, int offset, const InterpreterOopMap& mask, const ContMirror& cont) {
   // see void frame::oops_interpreted_do
   InterpreterOopIndexClosure ioc(offset);
   mask.iterate_oop(&ioc);
@@ -3569,7 +3749,7 @@
   log_develop_trace(jvmcont)("interpreter_frame_expression_stack_at oop_address: stack index: %d, length: %d exp: %d index1: %d", cont.stack_index(loc), cont.stack_length(), index, index1);
 
   address res = oop_mask.is_oop(index1)
-    ? oop_address(cont.refStack(), cont.refSP(), hf.ref_sp() + find_oop_in_interpreted_frame(hf, index1, oop_mask))
+    ? oop_address(cont.refStack(), cont.refSP(), hf.ref_sp() + find_oop_in_interpreted_frame(hf, index1, oop_mask, cont))
     : loc;
   return res;
 }
@@ -3583,7 +3763,7 @@
 
   log_develop_trace(jvmcont)("interpreter_frame_local_at oop_address: stack index: %d length: %d local: %d", cont.stack_index(loc), cont.stack_length(), index);
   address res = oop_mask.is_oop(index)
-    ? oop_address(cont.refStack(), cont.refSP(), hf.ref_sp() + find_oop_in_interpreted_frame(hf, index, oop_mask))
+    ? oop_address(cont.refStack(), cont.refSP(), hf.ref_sp() + find_oop_in_interpreted_frame(hf, index, oop_mask, cont))
     : loc;
   return res;
 }
@@ -3591,14 +3771,12 @@
 Method* Continuation::interpreter_frame_method(const frame& fr, const RegisterMap* map) {
   assert (fr.is_interpreted_frame(), "");
   hframe hf = ContMirror(map).from_frame(fr);
-
   return hf.method<Interpreted>();
 }
 
 address Continuation::interpreter_frame_bcp(const frame& fr, const RegisterMap* map) {
   assert (fr.is_interpreted_frame(), "");
   hframe hf = ContMirror(map).from_frame(fr);
-
   return hf.interpreter_frame_bcp();
 }
 
@@ -4030,6 +4208,13 @@
 
 #include CPU_HEADER_INLINE(continuation)
 
+#ifdef CONT_DOUBLE_NOP
+template<op_mode mode>
+static inline CachedCompiledMetadata cached_metadata(const hframe& hf) {
+  return ContinuationHelper::cached_metadata<mode>(hf);
+}
+#endif
+
 template <bool compressed_oops, bool post_barrier, bool gen_stubs>
 class Config {
 public:
@@ -4072,10 +4257,12 @@
   static void resolve_gencode() {
     LoomGenCode ? resolve<use_compressed, is_modref, true>()
                 : resolve<use_compressed, is_modref, false>();
-  }
+  } 
 
   template <bool use_compressed, bool is_modref, bool gen_code>
   static void resolve() {
+    // tty->print_cr(">>> ConfigResolve::resolve use_compressed: %d is_modref: %d gen_code:%d", use_compressed, is_modref, gen_code);
+
     cont_freeze_fast    = Config<use_compressed, is_modref, gen_code>::template freeze<mode_fast>;
     cont_freeze_slow    = Config<use_compressed, is_modref, gen_code>::template freeze<mode_slow>;
     cont_freeze_preempt = Config<use_compressed, is_modref, gen_code>::template freeze<mode_preempt>;
@@ -4088,6 +4275,7 @@
 
 void Continuations::init() {
   ConfigResolve::resolve();
+  OopMapStubGenerator::init();
 }
 
 class KeepaliveCleanupClosure : public ThreadClosure {
@@ -4312,6 +4500,49 @@
   return cm->is_deopt_pc(pc);
 }
 
+template <typename FrameT>
+static CodeBlob* slow_get_cb(const FrameT& f) {
+  assert (!f.is_interpreted_frame(), "");
+  CodeBlob* cb = f.cb();
+  if (cb == NULL) {
+    cb = CodeCache::find_blob(f.pc());
+  }
+  assert (cb != NULL, "");
+  return cb;
+}
+
+template <typename FrameT>
+static const ImmutableOopMap* slow_get_oopmap(const FrameT& f) {
+  const ImmutableOopMap* oopmap = f.oop_map();
+  if (oopmap == NULL) {
+    oopmap = OopMapSet::find_map(slow_get_cb(f), f.pc());
+  }
+  assert (oopmap != NULL, "");
+  return oopmap;
+}
+
+template <typename FrameT>
+static int slow_size(const FrameT& f) { 
+  return slow_get_cb(f)->frame_size() * wordSize; 
+}
+
+template <typename FrameT>
+static address slow_return_pc(const FrameT& f) { 
+  return *slow_return_pc_address<NonInterpretedUnknown>(f); 
+}
+
+template <typename FrameT>
+static int slow_stack_argsize(const FrameT& f) { 
+  CodeBlob* cb = slow_get_cb(f);
+  assert (cb->is_compiled(), "");
+  return cb->as_compiled_method()->method()->num_stack_arg_slots() * VMRegImpl::stack_slot_size; 
+}
+
+template <typename FrameT>
+static int slow_num_oops(const FrameT& f) { 
+  return slow_get_oopmap(f)->num_oops(); 
+}
+
 static void print_blob(outputStream* st, address addr) {
   CodeBlob* b = CodeCache::find_blob_unsafe(addr);
   st->print("address: " INTPTR_FORMAT " blob: ", p2i(addr));
--- a/src/hotspot/share/runtime/continuation.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/share/runtime/continuation.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -30,6 +30,8 @@
 #include "runtime/globals.hpp"
 #include "jni.h"
 
+// #define CONT_DOUBLE_NOP 1
+
 #define CONT_FULL_STACK (!UseContinuationLazyCopy)
 
 // The order of this struct matters as it's directly manipulated by assembly code (push/pop)
--- a/src/hotspot/share/runtime/frame.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/share/runtime/frame.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -60,7 +60,7 @@
     } _cont_sp;
   };
   address   _pc; // program counter (the next instruction after the call)
-  CodeBlob* _cb; // CodeBlob that "owns" pc
+  mutable CodeBlob* _cb; // CodeBlob that "owns" pc
   mutable const ImmutableOopMap* _oop_map; // oop map, for compiled/stubs frames only
   enum deopt_state {
     not_deoptimized,
@@ -105,8 +105,12 @@
 
   int cont_sp()     const { return _cont_sp._sp; }
   int cont_ref_sp() const { return _cont_sp._ref_sp; }
+  int cont_unextended_sp() const;
 
   CodeBlob* cb() const           { return _cb; }
+  inline CodeBlob* get_cb() const;
+  // inline void set_cb(CodeBlob* cb);
+
   const ImmutableOopMap* oop_map() const {
     if (_oop_map == NULL) {
       _oop_map = get_oop_map();
--- a/src/hotspot/share/runtime/frame.inline.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/share/runtime/frame.inline.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -84,6 +84,24 @@
   }
 }
 
+inline CodeBlob* frame::get_cb() const {
+  // if (_cb == NULL) _cb = CodeCache::find_blob(_pc);
+  if (_cb == NULL) {
+    int slot;
+    _cb = CodeCache::find_blob_and_oopmap(_pc, slot);
+    if (_oop_map == NULL && slot >= 0) {
+      _oop_map = _cb->oop_map_for_slot(slot, _pc);
+    }
+  }
+  return _cb;
+}
+
+// inline void frame::set_cb(CodeBlob* cb) {
+//   if (_cb == NULL) _cb = cb;
+//   assert (_cb == cb, "");
+//   assert (_cb->contains(_pc), "");
+// }
+
 inline bool StackFrameStream::is_done() {
   return (_is_done) ? true : (_is_done = _fr.is_first_frame(), false);
 }
--- a/src/hotspot/share/utilities/globalDefinitions.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/share/utilities/globalDefinitions.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -38,10 +38,22 @@
 #ifndef ALWAYSINLINE
 #define ALWAYSINLINE inline
 #endif
+#ifndef __HOT
+#define __HOT
+#endif
+#ifndef __COLD
+#define __COLD
+#endif
 
 #ifndef ATTRIBUTE_ALIGNED
 #define ATTRIBUTE_ALIGNED(x)
 #endif
+#ifndef LIKELY
+#define LIKELY(condition)   (condition)
+#endif
+#ifndef UNLIKELY
+#define UNLIKELY(condition) (condition)
+#endif
 
 // These are #defines to selectively turn on/off the Print(Opto)Assembly
 // capabilities. Choices should be led by a tradeoff between
@@ -1134,13 +1146,4 @@
   return k0 == k1;
 }
 
-#if (defined(__GNUC__) || defined(__clang__))
-#define LIKELY(condition)   __builtin_expect(static_cast<bool>(condition), 1)
-#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
-#else
-#define LIKELY(condition)   (condition)
-#define UNLIKELY(condition) (condition)
-#endif
-
-
 #endif // SHARE_UTILITIES_GLOBALDEFINITIONS_HPP
--- a/src/hotspot/share/utilities/globalDefinitions_gcc.hpp	Fri Jul 19 11:10:25 2019 +0200
+++ b/src/hotspot/share/utilities/globalDefinitions_gcc.hpp	Fri Jul 19 14:44:35 2019 +0100
@@ -275,4 +275,10 @@
 //
 #define ATTRIBUTE_ALIGNED(x) __attribute__((aligned((__typeof__(x))x+0)))
 
+#define LIKELY(condition)   __builtin_expect(static_cast<bool>(condition), 1)
+#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
+
+#define __COLD __attribute__((cold))
+#define __HOT  __attribute__((hot))
+
 #endif // SHARE_UTILITIES_GLOBALDEFINITIONS_GCC_HPP
--- a/test/jdk/java/lang/Continuation/Basic.java	Fri Jul 19 11:10:25 2019 +0200
+++ b/test/jdk/java/lang/Continuation/Basic.java	Fri Jul 19 14:44:35 2019 +0100
@@ -50,8 +50,6 @@
 * @run testng/othervm -XX:+UnlockExperimentalVMOptions -XX:+UseJVMCICompiler -XX:+UnlockDiagnosticVMOptions -XX:-TieredCompilation -Xcomp -XX:CompileOnly=java/lang/Continuation,Basic -XX:+UseContinuationLazyCopy Basic
 * @run testng/othervm -XX:+UnlockExperimentalVMOptions -XX:+UseJVMCICompiler -XX:+UnlockDiagnosticVMOptions -XX:-TieredCompilation -Xcomp -XX:CompileOnly=java/lang/Continuation,Basic -XX:+UseContinuationLazyCopy -XX:CompileCommand=exclude,Basic.manyArgsDriver Basic
 * @run testng/othervm -XX:+UnlockExperimentalVMOptions -XX:+UseJVMCICompiler -XX:+UnlockDiagnosticVMOptions -XX:-TieredCompilation -Xcomp -XX:CompileOnly=java/lang/Continuation,Basic -XX:+UseContinuationLazyCopy -XX:CompileCommand=exclude,java/lang/Continuation.enter Basic
-* @run testng/othervm -XX:+UnlockExperimentalVMOptions -XX:+UseJVMCICompiler -XX:TieredStopAtLevel=3 -Xcomp -XX:CompileOnly=java/lang/Continuation,Basic -XX:-UseContinuationLazyCopy Basic
-* @run testng/othervm -XX:+UnlockExperimentalVMOptions -XX:+UseJVMCICompiler -XX:+UnlockDiagnosticVMOptions -XX:TieredStopAtLevel=3 -Xcomp -XX:CompileOnly=java/lang/Continuation,Basic -XX:+UseContinuationLazyCopy Basic
 */
 
 // Anything excluded or not compileonly is not compiled; see CompilerOracle::should_exclude
@@ -87,6 +85,15 @@
 
     static final ContinuationScope FOO = new ContinuationScope() {};
     
+    // @Test
+    // public void test0() {
+    //     fooooooo();
+    // }
+
+    // private static void fooooooo() {
+    //     new Basic().test1();
+    // }
+
     public void test1() {
         System.out.println("test1");
         final AtomicInteger res = new AtomicInteger(0);