changeset 13527:d9b9c6c9c4b2 mvt

8185265: [MVT] improve performance of return of value types with new calling convention
Reviewed-by: thartmann
author roland
date Mon, 21 Aug 2017 12:50:40 +0200
parents f5dd157e3889
children f017570123b2
files src/cpu/x86/vm/interp_masm_x86.cpp src/cpu/x86/vm/sharedRuntime_x86_32.cpp src/cpu/x86/vm/sharedRuntime_x86_64.cpp src/cpu/x86/vm/templateInterpreterGenerator_x86.cpp src/cpu/x86/vm/x86_64.ad src/share/vm/adlc/formssel.cpp src/share/vm/classfile/classLoaderData.cpp src/share/vm/classfile/classLoaderData.hpp src/share/vm/code/codeBlob.cpp src/share/vm/code/codeBlob.hpp src/share/vm/compiler/compileBroker.cpp src/share/vm/oops/instanceKlass.hpp src/share/vm/oops/klassVtable.cpp src/share/vm/oops/valueKlass.cpp src/share/vm/oops/valueKlass.hpp src/share/vm/opto/callGenerator.cpp src/share/vm/opto/callnode.cpp src/share/vm/opto/callnode.hpp src/share/vm/opto/castnode.cpp src/share/vm/opto/compile.cpp src/share/vm/opto/compile.hpp src/share/vm/opto/escape.cpp src/share/vm/opto/machnode.cpp src/share/vm/opto/macro.cpp src/share/vm/opto/macro.hpp src/share/vm/opto/matcher.cpp src/share/vm/opto/mulnode.cpp src/share/vm/opto/output.cpp src/share/vm/opto/phaseX.cpp src/share/vm/opto/phaseX.hpp src/share/vm/opto/runtime.cpp src/share/vm/opto/runtime.hpp src/share/vm/opto/type.cpp src/share/vm/opto/type.hpp src/share/vm/opto/valuetypenode.cpp src/share/vm/opto/valuetypenode.hpp src/share/vm/runtime/deoptimization.cpp src/share/vm/runtime/globals.hpp src/share/vm/runtime/sharedRuntime.cpp src/share/vm/runtime/sharedRuntime.hpp test/compiler/valhalla/valuetypes/ValueTypeTestBench.java
diffstat 41 files changed, 915 insertions(+), 224 deletions(-)
--- a/src/cpu/x86/vm/interp_masm_x86.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/cpu/x86/vm/interp_masm_x86.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -32,6 +32,7 @@
 #include "oops/markOop.hpp"
 #include "oops/methodData.hpp"
 #include "oops/method.hpp"
+#include "oops/valueKlass.hpp"
 #include "prims/jvmtiExport.hpp"
 #include "prims/jvmtiThreadState.hpp"
 #include "runtime/basicLock.hpp"
@@ -1129,8 +1130,22 @@
   }
   if (load_values) {
     // We are returning a value type, load its fields into registers
+#ifndef _LP64
     super_call_VM_leaf(StubRoutines::load_value_type_fields_in_regs());
+#else
+    load_klass(rdi, rax);
+    movptr(rdi, Address(rdi, ValueKlass::unpack_handler_offset()));
 
+    Label skip;
+    testptr(rdi, rdi);
+    jcc(Assembler::equal, skip);
+
+    // Load fields from a buffered value with a value class specific
+    // handler
+    call(rdi);
+
+    bind(skip);
+#endif
     // call above kills the value in rbx. Reload it.
     movptr(rbx, Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize));
   }
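
A note on the fast path added above: on 64-bit, instead of unconditionally calling the StubRoutines leaf, the interpreter loads the returned value's klass, fetches its unpack handler from a fixed offset, and calls it only if one was generated. A minimal standalone C++ sketch of that dispatch, with illustrative types (not HotSpot APIs):

    // Hypothetical model: unpack_handler stays null until
    // ValueKlass::initialize_calling_convention() generates one.
    struct ValueKlassModel {
      void (*unpack_handler)(void* buffered_value); // may be null
    };

    inline void return_fields_in_regs(ValueKlassModel* vk, void* buffered) {
      if (vk->unpack_handler != nullptr) {
        // value-class specific handler spreads fields into return registers
        vk->unpack_handler(buffered);
      } // else: nothing to do, the buffered pointer itself is returned
    }
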
--- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -3146,3 +3146,8 @@
   // frame_size_words or bytes??
   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_words, oop_maps, true);
 }
+
+BufferedValueTypeBlob* SharedRuntime::generate_buffered_value_type_adapter(const ValueKlass* vk) {
+  Unimplemented();
+  return NULL;
+}
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -4161,3 +4161,100 @@
   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
 }
 #endif // COMPILER2
+
+BufferedValueTypeBlob* SharedRuntime::generate_buffered_value_type_adapter(const ValueKlass* vk) {
+  BufferBlob* buf = BufferBlob::create("value types pack/unpack", 16 * K);
+  CodeBuffer buffer(buf);
+  short buffer_locs[20];
+  buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
+                                         sizeof(buffer_locs)/sizeof(relocInfo));
+
+  MacroAssembler _masm(&buffer);
+  MacroAssembler* masm = &_masm;
+
+  const Array<SigEntry>* sig_vk = vk->extended_sig();
+  const Array<VMRegPair>* regs = vk->return_regs();
+
+  int pack_fields_off = __ offset();
+
+  int j = 1;
+  for (int i = 0; i < sig_vk->length(); i++) {
+    BasicType bt = sig_vk->at(i)._bt;
+    if (bt == T_VALUETYPE) {
+      continue;
+    }
+    if (bt == T_VOID) {
+      if (sig_vk->at(i-1)._bt == T_LONG ||
+          sig_vk->at(i-1)._bt == T_DOUBLE) {
+        j++;
+      }
+      continue;
+    }
+    int off = sig_vk->at(i)._offset;
+    VMRegPair pair = regs->at(j);
+    VMReg r_1 = pair.first();
+    VMReg r_2 = pair.second();
+    Address to(rax, off);
+    if (bt == T_FLOAT) {
+      __ movflt(to, r_1->as_XMMRegister());
+    } else if (bt == T_DOUBLE) {
+      __ movdbl(to, r_1->as_XMMRegister());
+    } else if (bt == T_OBJECT || bt == T_ARRAY) {
+      __ store_heap_oop(to, r_1->as_Register());
+    } else {
+      assert(is_java_primitive(bt), "unexpected basic type");
+      size_t size_in_bytes = type2aelembytes(bt);
+      __ store_sized_value(to, r_1->as_Register(), size_in_bytes);
+    }
+    j++;
+  }
+  assert(j == regs->length(), "missed a field?");
+
+  __ ret(0);
+
+  int unpack_fields_off = __ offset();
+
+  j = 1;
+  for (int i = 0; i < sig_vk->length(); i++) {
+    BasicType bt = sig_vk->at(i)._bt;
+    if (bt == T_VALUETYPE) {
+      continue;
+    }
+    if (bt == T_VOID) {
+      if (sig_vk->at(i-1)._bt == T_LONG ||
+          sig_vk->at(i-1)._bt == T_DOUBLE) {
+        j++;
+      }
+      continue;
+    }
+    int off = sig_vk->at(i)._offset;
+    VMRegPair pair = regs->at(j);
+    VMReg r_1 = pair.first();
+    VMReg r_2 = pair.second();
+    Address from(rax, off);
+    if (bt == T_FLOAT) {
+      __ movflt(r_1->as_XMMRegister(), from);
+    } else if (bt == T_DOUBLE) {
+      __ movdbl(r_1->as_XMMRegister(), from);
+    } else if (bt == T_OBJECT || bt == T_ARRAY) {
+      __ load_heap_oop(r_1->as_Register(), from);
+    } else {
+      assert(is_java_primitive(bt), "unexpected basic type");
+      size_t size_in_bytes = type2aelembytes(bt);
+      __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
+    }
+    j++;
+  }
+  assert(j == regs->length(), "missed a field?");
+
+  if (StressValueTypeReturnedAsFields) {
+    __ load_klass(rax, rax);
+    __ orptr(rax, 1);
+  }
+
+  __ ret(0);
+
+  __ flush();
+
+  return BufferedValueTypeBlob::create(&buffer, pack_fields_off, unpack_fields_off);
+}
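
The two loops above share one indexing convention: the extended signature contains T_VALUETYPE markers (which consume no register) and T_VOID placeholders for the second halves of longs/doubles, while regs->at(0) is reserved for the returned klass, so the field walk starts at j == 1. A self-contained sketch of that walk, with an assumed BasicType subset:

    #include <vector>

    enum BasicType { T_INT, T_LONG, T_FLOAT, T_DOUBLE, T_VOID, T_VALUETYPE, T_OBJECT };

    // Returns the number of register slots the loops above consume; the
    // asserts check that this equals regs->length().
    int walk_extended_sig(const std::vector<BasicType>& sig) {
      int j = 1; // slot 0 holds the klass, not a field
      for (std::size_t i = 0; i < sig.size(); i++) {
        BasicType bt = sig[i];
        if (bt == T_VALUETYPE) continue;   // marker only, no register
        if (bt == T_VOID) {
          // placeholder for the upper half of a long/double
          if (i > 0 && (sig[i - 1] == T_LONG || sig[i - 1] == T_DOUBLE)) j++;
          continue;
        }
        j++;                               // one slot per scalar/oop field
      }
      return j;
    }
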
--- a/src/cpu/x86/vm/templateInterpreterGenerator_x86.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/cpu/x86/vm/templateInterpreterGenerator_x86.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -34,6 +34,7 @@
 #include "oops/methodData.hpp"
 #include "oops/method.hpp"
 #include "oops/oop.inline.hpp"
+#include "oops/valueKlass.hpp"
 #include "prims/jvmtiExport.hpp"
 #include "prims/jvmtiThreadState.hpp"
 #include "runtime/arguments.hpp"
@@ -204,15 +205,63 @@
   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD);
 
   if (state == qtos && ValueTypeReturnedAsFields) {
+#ifndef _LP64
+    __ super_call_VM_leaf(StubRoutines::store_value_type_fields_to_buf());
+#else
     // A value type is being returned. If fields are in registers we
     // need to allocate a value type instance and initialize it with
     // the value of the fields.
+    Label skip, slow_case;
+    // We only need a new buffered value if a new one is not returned
+    __ testptr(rax, 1);
+    __ jcc(Assembler::zero, skip);
+
+    // Try to allocate a new buffered value (from the heap)
+    if (UseTLAB) {
+      __ mov(rbx, rax);
+      __ andptr(rbx, -2);
+
+      __ movl(r14, Address(rbx, Klass::layout_helper_offset()));
+
+      __ movptr(r13, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
+      __ lea(r14, Address(r13, r14, Address::times_1));
+      __ cmpptr(r14, Address(r15_thread, in_bytes(JavaThread::tlab_end_offset())));
+      __ jcc(Assembler::above, slow_case);
+      __ movptr(Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())), r14);
+
+      if (UseBiasedLocking) {
+        __ movptr(rax, Address(rbx, Klass::prototype_header_offset()));
+        __ movptr(Address(r13, oopDesc::mark_offset_in_bytes()), rax);
+      } else {
+        __ movptr(Address(r13, oopDesc::mark_offset_in_bytes()),
+                  (intptr_t)markOopDesc::prototype());
+      }
+      __ xorl(rax, rax); // use zero reg to clear memory (shorter code)
+      __ store_klass_gap(r13, rax);  // zero klass gap for compressed oops
+      __ mov(rax, rbx);
+      __ store_klass(r13, rbx);  // klass
+
+      // We have our new buffered value, initialize its fields with a
+      // value class specific handler
+      __ movptr(rbx, Address(rax, ValueKlass::pack_handler_offset()));
+      __ mov(rax, r13);
+      __ call(rbx);
+      __ jmp(skip);
+    }
+
+    __ bind(slow_case);
+    // We failed to allocate a new value, fall back to a runtime
+    // call. Some oop fields may be live in registers but we can't
+    // tell: the runtime call will take care of preserving them
+    // across a GC if there is one.
     __ super_call_VM_leaf(StubRoutines::store_value_type_fields_to_buf());
+    __ bind(skip);
+#endif
   }
 
   __ restore_bcp();
   __ restore_locals();
-  
+
   if (state == atos) {
     Register mdp = rbx;
     Register tmp = rcx;
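
The 64-bit path above implements a tagged-return protocol: rax either holds a buffered value (low bit clear) or a value-klass pointer tagged with 1, in which case a buffer is bump-allocated from the TLAB and filled by the klass's pack handler. A standalone sketch of the tag test and the bump allocation, with illustrative names:

    #include <cstddef>
    #include <cstdint>

    inline bool needs_allocation(uintptr_t rax)       { return (rax & 1) != 0; }
    inline uintptr_t klass_from_tagged(uintptr_t rax) { return rax & ~uintptr_t(1); }

    struct TlabModel { char* top; char* end; };

    // Mirrors the tlab_top/tlab_end compare-and-bump above; a null result
    // corresponds to the jump to slow_case (the runtime buffering call).
    inline void* tlab_allocate(TlabModel& tlab, size_t size_in_bytes) {
      char* obj = tlab.top;
      char* new_top = obj + size_in_bytes;
      if (new_top > tlab.end) return nullptr;
      tlab.top = new_top;
      return obj;
    }
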
--- a/src/cpu/x86/vm/x86_64.ad	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/cpu/x86/vm/x86_64.ad	Mon Aug 21 12:50:40 2017 +0200
@@ -12106,8 +12106,24 @@
 %}
 
 // Call runtime without safepoint
+// The entry point is null; the target register holds the address to call
+instruct CallLeafNoFPInDirect(rRegP target)
+%{
+  predicate(n->as_Call()->entry_point() == NULL);
+  match(CallLeafNoFP target);
+
+  ins_cost(300);
+  format %{ "call_leaf_nofp,runtime indirect " %}
+  ins_encode %{
+     __ call($target$$Register);
+  %}
+
+  ins_pipe(pipe_slow);
+%}
+
 instruct CallLeafNoFPDirect(method meth)
 %{
+  predicate(n->as_Call()->entry_point() != NULL);
   match(CallLeafNoFP);
   effect(USE meth);
 
--- a/src/share/vm/adlc/formssel.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/adlc/formssel.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -890,7 +890,8 @@
       strcmp(_matrule->_opType,"TailCall"  )==0 ||
       strcmp(_matrule->_opType,"TailJump"  )==0 ||
       strcmp(_matrule->_opType,"SafePoint" )==0 ||
-      strcmp(_matrule->_opType,"Halt"      )==0 )
+      strcmp(_matrule->_opType,"Halt"      )==0 ||
+      strcmp(_matrule->_opType,"CallLeafNoFP")==0)
     return AdlcVMDeps::Parms;   // Skip the machine-state edges
 
   if( _matrule->_rChild &&
--- a/src/share/vm/classfile/classLoaderData.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/classfile/classLoaderData.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -64,6 +64,7 @@
 #include "memory/resourceArea.hpp"
 #include "oops/objArrayOop.inline.hpp"
 #include "oops/oop.inline.hpp"
+#include "oops/valueKlass.hpp"
 #include "runtime/atomic.hpp"
 #include "runtime/javaCalls.hpp"
 #include "runtime/jniHandles.hpp"
@@ -290,6 +291,16 @@
   }
 }
 
+void ClassLoaderData::value_classes_do(void f(ValueKlass*)) {
+  // Lock-free access requires load_ptr_acquire
+  for (Klass* k = load_ptr_acquire(&_klasses); k != NULL; k = k->next_link()) {
+    if (k->is_value()) {
+      f(ValueKlass::cast(k));
+    }
+    assert(k != k->next_link(), "no loops!");
+  }
+}
+
 void ClassLoaderData::modules_do(void f(ModuleEntry*)) {
   assert_locked_or_safepoint(Module_lock);
   if (_unnamed_module != NULL) {
@@ -474,6 +485,7 @@
 void ClassLoaderData::unload() {
   _unloading = true;
 
+  value_classes_do(ValueKlass::cleanup);
   // Tell serviceability tools these classes are unloading
   classes_do(InstanceKlass::notify_unload_class);
 
@@ -671,7 +683,11 @@
       } else if (m->is_constantPool()) {
         MetadataFactory::free_metadata(this, (ConstantPool*)m);
       } else if (m->is_klass()) {
-        MetadataFactory::free_metadata(this, (InstanceKlass*)m);
+        if (!((Klass*)m)->is_value()) {
+          MetadataFactory::free_metadata(this, (InstanceKlass*)m);
+        } else {
+          MetadataFactory::free_metadata(this, (ValueKlass*)m);
+        }
       } else {
         ShouldNotReachHere();
       }
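
value_classes_do above walks the klass list without taking a lock, which is why the head is read with load_ptr_acquire. A standalone model of that traversal using C++ atomics (illustrative types, not the HotSpot ClassLoaderData API):

    #include <atomic>

    struct KlassModel {
      bool is_value;
      std::atomic<KlassModel*> next;
    };

    template <typename F>
    void value_classes_do(std::atomic<KlassModel*>& klasses_head, F f) {
      // acquire pairs with the release store done when a klass is published,
      // so a concurrent reader never sees a half-initialized node
      for (KlassModel* k = klasses_head.load(std::memory_order_acquire);
           k != nullptr;
           k = k->next.load(std::memory_order_acquire)) {
        if (k->is_value) f(k);
      }
    }
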
--- a/src/share/vm/classfile/classLoaderData.hpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/classfile/classLoaderData.hpp	Mon Aug 21 12:50:40 2017 +0200
@@ -259,6 +259,7 @@
   void classes_do(void f(Klass*));
   void loaded_classes_do(KlassClosure* klass_closure);
   void classes_do(void f(InstanceKlass*));
+  void value_classes_do(void f(ValueKlass*));
   void methods_do(void f(Method*));
   void modules_do(void f(ModuleEntry*));
   void packages_do(void f(PackageEntry*));
--- a/src/share/vm/code/codeBlob.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/code/codeBlob.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -324,6 +324,30 @@
 }
 
 //----------------------------------------------------------------------------------------------------
+// Implementation of BufferedValueTypeBlob
+BufferedValueTypeBlob::BufferedValueTypeBlob(int size, CodeBuffer* cb, int pack_fields_off, int unpack_fields_off) :
+  BufferBlob("buffered value type", size, cb),
+  _pack_fields_off(pack_fields_off),
+  _unpack_fields_off(unpack_fields_off) {
+  CodeCache::commit(this);
+}
+
+BufferedValueTypeBlob* BufferedValueTypeBlob::create(CodeBuffer* cb, int pack_fields_off, int unpack_fields_off) {
+  ThreadInVMfromUnknown __tiv;  // get to VM state in case we block on CodeCache_lock
+
+  BufferedValueTypeBlob* blob = NULL;
+  unsigned int size = CodeBlob::allocation_size(cb, sizeof(BufferedValueTypeBlob));
+  {
+    MutexLockerEx mu(CodeCache_lock, Mutex::_no_safepoint_check_flag);
+    blob = new (size) BufferedValueTypeBlob(size, cb, pack_fields_off, unpack_fields_off);
+  }
+  // Track memory usage statistic after releasing CodeCache_lock
+  MemoryService::track_code_cache_memory_usage();
+
+  return blob;
+}
+
+//----------------------------------------------------------------------------------------------------
 // Implementation of RuntimeStub
 
 RuntimeStub::RuntimeStub(
--- a/src/share/vm/code/codeBlob.hpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/code/codeBlob.hpp	Mon Aug 21 12:50:40 2017 +0200
@@ -121,6 +121,7 @@
   virtual bool is_method_handles_adapter_blob() const { return false; }
   virtual bool is_aot() const                         { return false; }
   virtual bool is_compiled() const                    { return false; }
+  virtual bool is_buffered_value_type_blob() const    { return false; }
 
   inline bool is_compiled_by_c1() const    { return _type == compiler_c1; };
   inline bool is_compiled_by_c2() const    { return _type == compiler_c2; };
@@ -361,6 +362,7 @@
   friend class VMStructs;
   friend class AdapterBlob;
   friend class MethodHandlesAdapterBlob;
+  friend class BufferedValueTypeBlob;
   friend class WhiteBox;
 
  private:
@@ -427,6 +429,26 @@
   virtual bool is_method_handles_adapter_blob() const { return true; }
 };
 
+//----------------------------------------------------------------------------------------------------
+// BufferedValueTypeBlob : used for pack/unpack handlers
+
+class BufferedValueTypeBlob: public BufferBlob {
+private:
+  const int _pack_fields_off;
+  const int _unpack_fields_off;
+
+  BufferedValueTypeBlob(int size, CodeBuffer* cb, int pack_fields_off, int unpack_fields_off);
+
+public:
+  // Creation
+  static BufferedValueTypeBlob* create(CodeBuffer* cb, int pack_fields_off, int unpack_fields_off);
+
+  address pack_fields() const { return code_begin() + _pack_fields_off; }
+  address unpack_fields() const { return code_begin() + _unpack_fields_off; }
+
+  // Typing
+  virtual bool is_buffered_value_type_blob() const { return true; }
+};
 
 //----------------------------------------------------------------------------------------------------
 // RuntimeStub: describes stubs used by compiled code to call a (static) C++ runtime routine
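
BufferedValueTypeBlob above packs both handlers into a single code blob and records them as offsets from code_begin(), so pack_fields() and unpack_fields() are simply two entry points into the same generated code. A minimal model of that layout (illustrative, not the CodeBlob API):

    #include <cstdint>

    struct BufferedBlobModel {
      const uint8_t* code_begin;  // start of the generated instructions
      int pack_fields_off;        // offset of the pack handler entry
      int unpack_fields_off;      // offset of the unpack handler entry

      const uint8_t* pack_fields() const   { return code_begin + pack_fields_off; }
      const uint8_t* unpack_fields() const { return code_begin + unpack_fields_off; }
    };
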
--- a/src/share/vm/compiler/compileBroker.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/compiler/compileBroker.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -1067,13 +1067,6 @@
     }
   }
 
-  // Returning a value type as a pointer can break if the compiled
-  // call site knows the value type being returned and expects it in
-  // registers.
-  if (ValueTypeReturnedAsFields && method->is_compiled_lambda_form() && method->is_returning_vt()) {
-    return NULL;
-  }
-
   // lock, make sure that the compilation
   // isn't prohibited in a straightforward way.
   AbstractCompiler* comp = CompileBroker::compiler(comp_level);
--- a/src/share/vm/oops/instanceKlass.hpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/oops/instanceKlass.hpp	Mon Aug 21 12:50:40 2017 +0200
@@ -68,6 +68,7 @@
 class JNIid;
 class JvmtiCachedClassFieldMap;
 class SuperTypeClosure;
+class BufferedValueTypeBlob;
 
 // This is used in iterators below.
 class FieldClosure: public StackObj {
@@ -1094,6 +1095,8 @@
                   bool is_interface, bool is_anonymous, bool has_stored_fingerprint,
                   int java_fields, bool is_value_type) {
     return align_metadata_size(header_size() +
+           (is_value_type ? (int)sizeof(address) : 0) +
+           (is_value_type ? (int)sizeof(address) : 0) +
            vtable_length +
            itable_length +
            nonstatic_oop_map_size +
@@ -1117,7 +1120,7 @@
   virtual void collect_statistics(KlassSizeStats *sz) const;
 #endif
 
-  intptr_t* start_of_itable()   const { return (intptr_t*)start_of_vtable() + vtable_length(); }
+  intptr_t* start_of_itable()   const { return (intptr_t*)start_of_vtable() + (is_value() ? 2 : 0) + vtable_length(); }
   intptr_t* end_of_itable()     const { return start_of_itable() + itable_length(); }
 
   int  itable_offset_in_words() const { return start_of_itable() - (intptr_t*)this; }
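
The size() and start_of_itable() changes above reserve two extra words per value klass, right where the vtable would begin (value klasses are guaranteed an empty vtable), to hold the pack and unpack handler addresses. A sketch of the arithmetic, in assumed word units:

    #include <cstddef>

    // Sizes in words; value klasses carry two handler slots after the header.
    inline size_t klass_words(size_t header, size_t vtable_len,
                              size_t itable_len, size_t oop_map_size,
                              bool is_value) {
      size_t handler_slots = is_value ? 2 : 0; // pack + unpack handler
      return header + handler_slots + vtable_len + itable_len + oop_map_size;
    }
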
--- a/src/share/vm/oops/klassVtable.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/oops/klassVtable.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -1409,7 +1409,7 @@
   CountInterfacesClosure cic;
   visit_all_interfaces(transitive_interfaces, &cic);
 
-  // There's alway an extra itable entry so we can null-terminate it.
+  // There's always an extra itable entry so we can null-terminate it.
   int itable_size = calc_itable_size(cic.nof_interfaces() + 1, cic.nof_methods());
 
   // Statistics
--- a/src/share/vm/oops/valueKlass.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/oops/valueKlass.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -344,29 +344,67 @@
 }
 
 void ValueKlass::initialize_calling_convention() {
-  Thread* THREAD = Thread::current();
-  assert(!HAS_PENDING_EXCEPTION, "should have no exception");
-  ResourceMark rm;
-  const GrowableArray<SigEntry>& sig_vk = collect_fields();
-  int nb_fields = SigEntry::count_fields(sig_vk)+1;
-  Array<SigEntry>* extended_sig = MetadataFactory::new_array<SigEntry>(class_loader_data(), sig_vk.length(), CHECK_AND_CLEAR);
-  *((Array<SigEntry>**)adr_extended_sig()) = extended_sig;
-  for (int i = 0; i < sig_vk.length(); i++ ) {
-    extended_sig->at_put(i, sig_vk.at(i));
+  // Because the pack and unpack handler addresses need to be loadable from generated code,
+  // they are stored at a fixed offset in the klass metadata. Since value type klasses do
+  // not have a vtable, the vtable offset is used to store these addresses.
+  guarantee(vtable_length() == 0, "vtables are not supported in value klasses");
+  if (ValueTypeReturnedAsFields || ValueTypePassFieldsAsArgs) {
+    Thread* THREAD = Thread::current();
+    assert(!HAS_PENDING_EXCEPTION, "should have no exception");
+    ResourceMark rm;
+    const GrowableArray<SigEntry>& sig_vk = collect_fields();
+    int nb_fields = SigEntry::count_fields(sig_vk)+1;
+    Array<SigEntry>* extended_sig = MetadataFactory::new_array<SigEntry>(class_loader_data(), sig_vk.length(), CHECK_AND_CLEAR);
+    *((Array<SigEntry>**)adr_extended_sig()) = extended_sig;
+    for (int i = 0; i < sig_vk.length(); i++) {
+      extended_sig->at_put(i, sig_vk.at(i));
+    }
+
+    if (ValueTypeReturnedAsFields) {
+      BasicType* sig_bt = NEW_RESOURCE_ARRAY(BasicType, nb_fields);
+      sig_bt[0] = T_METADATA;
+      SigEntry::fill_sig_bt(sig_vk, sig_bt+1, nb_fields-1, true);
+      VMRegPair* regs = NEW_RESOURCE_ARRAY(VMRegPair, nb_fields);
+      int total = SharedRuntime::java_return_convention(sig_bt, regs, nb_fields);
+
+      if (total > 0) {
+        Array<VMRegPair>* return_regs = MetadataFactory::new_array<VMRegPair>(class_loader_data(), nb_fields, CHECK_AND_CLEAR);
+        *((Array<VMRegPair>**)adr_return_regs()) = return_regs;
+        for (int i = 0; i < nb_fields; i++) {
+          return_regs->at_put(i, regs[i]);
+        }
+
+        BufferedValueTypeBlob* buffered_blob = SharedRuntime::generate_buffered_value_type_adapter(this);
+        *((address*)adr_pack_handler()) = buffered_blob->pack_fields();
+        *((address*)adr_unpack_handler()) = buffered_blob->unpack_fields();
+        assert(CodeCache::find_blob(pack_handler()) == buffered_blob, "lost track of blob");
+      }
+    }
   }
+}
 
-  BasicType* sig_bt = NEW_RESOURCE_ARRAY(BasicType, nb_fields);
-  sig_bt[0] = T_METADATA;
-  SigEntry::fill_sig_bt(sig_vk, sig_bt+1, nb_fields-1, true);
-  VMRegPair* regs = NEW_RESOURCE_ARRAY(VMRegPair, nb_fields);
-  int total = SharedRuntime::java_return_convention(sig_bt, regs, nb_fields);
+void ValueKlass::deallocate_contents(ClassLoaderData* loader_data) {
+  if (extended_sig() != NULL) {
+    MetadataFactory::free_array<SigEntry>(loader_data, extended_sig());
+  }
+  if (return_regs() != NULL) {
+    MetadataFactory::free_array<VMRegPair>(loader_data, return_regs());
+  }
+  cleanup_blobs();
+  InstanceKlass::deallocate_contents(loader_data);
+}
 
-  if (total > 0) {
-    Array<VMRegPair>* return_regs = MetadataFactory::new_array<VMRegPair>(class_loader_data(), nb_fields, CHECK_AND_CLEAR);
-    *((Array<VMRegPair>**)adr_return_regs()) = return_regs;
-    for (int i = 0; i < nb_fields; i++ ) {
-      return_regs->at_put(i, regs[i]);
-    }
+void ValueKlass::cleanup(ValueKlass* ik) {
+  ik->cleanup_blobs();
+}
+
+void ValueKlass::cleanup_blobs() {
+  if (pack_handler() != NULL) {
+    CodeBlob* buffered_blob = CodeCache::find_blob(pack_handler());
+    assert(buffered_blob->is_buffered_value_type_blob(), "bad blob type");
+    BufferBlob::free((BufferBlob*)buffered_blob);
+    *((address*)adr_pack_handler()) = NULL;
+    *((address*)adr_unpack_handler()) = NULL;
   }
 }
 
@@ -444,8 +482,14 @@
 
 // Fields are in registers. Create an instance of the value type and
 // initialize it with the values of the fields.
-oop ValueKlass::realloc_result(const RegisterMap& reg_map, const GrowableArray<Handle>& handles, TRAPS) {
-  oop new_vt = allocate_instance(CHECK_NULL);
+oop ValueKlass::realloc_result(const RegisterMap& reg_map, const GrowableArray<Handle>& handles, bool buffered, TRAPS) {
+  bool ignored = false;
+  oop new_vt = NULL;
+  if (buffered) {
+    new_vt = allocate_buffered_or_heap_instance(&ignored, CHECK_NULL);
+  } else {
+    new_vt = allocate_instance(CHECK_NULL);
+  }
 
   const Array<SigEntry>* sig_vk = extended_sig();
   const Array<VMRegPair>* regs = return_regs();
@@ -528,6 +572,7 @@
     default:
       ShouldNotReachHere();
     }
+    *(intptr_t*)loc = 0xDEAD;
     j++;
   }
   assert(j == regs->length(), "missed a field?");
--- a/src/share/vm/oops/valueKlass.hpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/oops/valueKlass.hpp	Mon Aug 21 12:50:40 2017 +0200
@@ -42,6 +42,12 @@
   ValueKlass(const ClassFileParser& parser)
     : InstanceKlass(parser, InstanceKlass::_misc_kind_value_type) {
     set_has_vcc_klass();
+    // Addresses used for value type calling convention
+    *((Array<SigEntry>**)adr_extended_sig()) = NULL;
+    *((Array<VMRegPair>**)adr_return_regs()) = NULL;
+    *((address*)adr_pack_handler()) = NULL;
+    *((address*)adr_unpack_handler()) = NULL;
+    assert(pack_handler() == NULL, "pack handler not null");
   }
 
   address adr_extended_sig() const {
@@ -77,10 +83,29 @@
     return adr_extended_sig() + sizeof(intptr_t);
   }
 
+  // pack and unpack handlers for value type returns
+  address adr_pack_handler() const {
+    return (address)this + in_bytes(pack_handler_offset());
+  }
+
+  address adr_unpack_handler() const {
+    return (address)this + in_bytes(unpack_handler_offset());
+  }
+
+  address pack_handler() const {
+    return *(address*)adr_pack_handler();
+  }
+
+  address unpack_handler() const {
+    return *(address*)adr_unpack_handler();
+  }
+
   // static Klass* array_klass_impl(InstanceKlass* this_k, bool or_null, int n, TRAPS);
 
   GrowableArray<SigEntry> collect_fields(int base_off = 0) const;
 
+  void cleanup_blobs();
+
  protected:
   // Returns the array class for the n'th dimension
   Klass* array_klass_impl(bool or_null, int n, TRAPS);
@@ -121,28 +146,28 @@
     return ((address) (void*) o) + first_field_offset();
   }
 
-   oop oop_for_data(address data) const {
+  oop oop_for_data(address data) const {
     oop o = (oop) (data - first_field_offset());
     assert(o->is_oop(false), "Not an oop");
     return o;
   }
 
-   void set_if_bufferable() {
-     bool bufferable;
-     if (contains_oops()) {
-       bufferable = false;
-     } else {
-       int size_in_heap_words = size_helper();
-       int base_offset = instanceOopDesc::base_offset_in_bytes();
-       size_t size_in_bytes = size_in_heap_words * HeapWordSize - base_offset;
-       bufferable = size_in_bytes <= BigValueTypeThreshold;
-     }
-     if (bufferable) {
-       _extra_flags |= _extra_is_bufferable;
-     } else {
-       _extra_flags &= ~_extra_is_bufferable;
-     }
-   }
+  void set_if_bufferable() {
+    bool bufferable;
+    if (contains_oops()) {
+      bufferable = false;
+    } else {
+      int size_in_heap_words = size_helper();
+      int base_offset = instanceOopDesc::base_offset_in_bytes();
+      size_t size_in_bytes = size_in_heap_words * HeapWordSize - base_offset;
+      bufferable = size_in_bytes <= BigValueTypeThreshold;
+    }
+    if (bufferable) {
+      _extra_flags |= _extra_is_bufferable;
+    } else {
+      _extra_flags &= ~_extra_is_bufferable;
+    }
+  }
 
   bool is_bufferable() const          {
     return (_extra_flags & _extra_is_bufferable) != 0;
@@ -182,19 +207,32 @@
 
   // calling convention support
   void initialize_calling_convention();
-  const Array<SigEntry>* extended_sig() const {
+  Array<SigEntry>* extended_sig() const {
     assert(this != SystemDictionary::___Value_klass(), "make no sense for __Value");
     return *((Array<SigEntry>**)adr_extended_sig());
   }
-  const Array<VMRegPair>* return_regs() const {
+  Array<VMRegPair>* return_regs() const {
     assert(this != SystemDictionary::___Value_klass(), "make no sense for __Value");
     return *((Array<VMRegPair>**)adr_return_regs());
   }
   void save_oop_fields(const RegisterMap& map, GrowableArray<Handle>& handles) const;
   bool save_oop_results(RegisterMap& map, GrowableArray<Handle>& handles) const;
   void restore_oop_results(RegisterMap& map, GrowableArray<Handle>& handles) const;
-  oop realloc_result(const RegisterMap& reg_map, const GrowableArray<Handle>& handles, TRAPS);
+  oop realloc_result(const RegisterMap& reg_map, const GrowableArray<Handle>& handles, bool buffered, TRAPS);
   static ValueKlass* returned_value_type(const RegisterMap& reg_map);
+
+  // pack and unpack handlers. They need to be loadable from generated code
+  // so they are stored at a fixed offset from the base of the klass pointer.
+  static ByteSize pack_handler_offset() {
+    return in_ByteSize(InstanceKlass::header_size() * wordSize);
+  }
+
+  static ByteSize unpack_handler_offset() {
+    return in_ByteSize((InstanceKlass::header_size()+1) * wordSize);
+  }
+
+  void deallocate_contents(ClassLoaderData* loader_data);
+  static void cleanup(ValueKlass* ik);
 };
 
 #endif /* SHARE_VM_OOPS_VALUEKLASS_HPP */
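
pack_handler_offset() and unpack_handler_offset() above anchor the two slots at header_size() and header_size()+1 words from the klass base, so generated code can reach a handler with a single klass-relative load (as in the movptr from ValueKlass::unpack_handler_offset() in the interpreter change earlier). The same computation in plain C++ (names are illustrative):

    #include <cstdint>

    // klass_base stands for the Klass*, header_words for
    // InstanceKlass::header_size().
    inline uintptr_t pack_handler_slot(uintptr_t klass_base, int header_words) {
      return klass_base + (uintptr_t)header_words * sizeof(void*);
    }
    inline uintptr_t unpack_handler_slot(uintptr_t klass_base, int header_words) {
      return klass_base + (uintptr_t)(header_words + 1) * sizeof(void*);
    }
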
--- a/src/share/vm/opto/callGenerator.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/callGenerator.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -495,6 +495,7 @@
   C->set_inlining_progress(true);
 
   if (return_type->is_valuetype()) {
+    const Type* vt_t = call->_tf->range_sig()->field_at(TypeFunc::Parms);
     if (result->is_ValueType()) {
       ValueTypeNode* vt = result->as_ValueType();
       if (!call->tf()->returns_value_type_as_fields()) {
@@ -503,14 +504,20 @@
       } else {
         // Return of multiple values (the fields of a value type)
         vt->replace_call_results(call, C);
+        if (gvn.type(vt->get_oop()) == TypePtr::NULL_PTR) {
+          result = vt->tagged_klass(gvn);
+        } else {
+          result = vt->get_oop();
+        }
       }
     } else {
-      const Type* vt_t = call->_tf->range_sig()->field_at(TypeFunc::Parms);
       if (vt_t->is_valuetypeptr()->value_type()->value_klass() != C->env()->___Value_klass()) {
         if (gvn.type(result)->isa_valuetypeptr() && call->tf()->returns_value_type_as_fields()) {
-          Node* cast = gvn.transform(new CheckCastPPNode(NULL, result, vt_t));
-          ValueTypePtrNode* vtptr = ValueTypePtrNode::make(gvn, kit.merged_memory(), cast);
+          Node* cast = new CheckCastPPNode(NULL, result, vt_t);
+          gvn.record_for_igvn(cast);
+          ValueTypePtrNode* vtptr = ValueTypePtrNode::make(gvn, kit.merged_memory(), gvn.transform(cast));
           vtptr->replace_call_results(call, C);
+          result = cast;
         } else {
           assert(result->is_top(), "what else?");
           for (DUIterator_Fast imax, i = call->fast_outs(imax); i < imax; i++) {
--- a/src/share/vm/opto/callnode.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/callnode.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -1094,6 +1094,13 @@
 
 //------------------------------calling_convention-----------------------------
 void CallRuntimeNode::calling_convention( BasicType* sig_bt, VMRegPair *parm_regs, uint argcnt ) const {
+  if (_entry_point == NULL) {
+    // The call to that stub is a special case: its inputs are
+    // multiple values returned from a call and so it should follow
+    // the return convention.
+    SharedRuntime::java_return_convention(sig_bt, parm_regs, argcnt);
+    return;
+  }
   Matcher::c_calling_convention( sig_bt, parm_regs, argcnt );
 }
 
@@ -1110,6 +1117,12 @@
 }
 #endif
 
+uint CallLeafNoFPNode::match_edge(uint idx) const {
+  // Null entry point is a special case for which the target is in a
+  // register. Need to match that edge.
+  return entry_point() == NULL && idx == TypeFunc::Parms;
+}
+
 //=============================================================================
 
 void SafePointNode::set_local(JVMState* jvms, uint idx, Node *c) {
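
The calling_convention() special case above exists because the pack-handler stub's arguments are the raw values another call just returned: they must be picked up where the Java return convention left them, not where the C calling convention would place arguments. As a one-line decision, under the same null-entry-point assumption:

    enum Convention { C_ARGUMENTS, JAVA_RETURN_REGS };

    // entry_point == nullptr marks the in-register pack-handler call
    inline Convention convention_for(const void* entry_point) {
      return entry_point == nullptr ? JAVA_RETURN_REGS : C_ARGUMENTS;
    }
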
--- a/src/share/vm/opto/callnode.hpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/callnode.hpp	Mon Aug 21 12:50:40 2017 +0200
@@ -704,6 +704,17 @@
       init_flags(Flag_is_macro);
       C->add_macro_node(this);
     }
+    const TypeTuple *r = tf->range_sig();
+    if (ValueTypeReturnedAsFields &&
+        method != NULL &&
+        method->is_method_handle_intrinsic() &&
+        r->cnt() > TypeFunc::Parms &&
+        r->field_at(TypeFunc::Parms)->isa_valuetypeptr() &&
+        r->field_at(TypeFunc::Parms)->is_valuetypeptr()->value_type()->value_klass() == C->env()->___Value_klass()) {
+      init_flags(Flag_is_macro);
+      C->add_macro_node(this);
+    }
+
     _is_scalar_replaceable = false;
     _is_non_escaping = false;
   }
@@ -813,6 +824,7 @@
   {
   }
   virtual int   Opcode() const;
+  virtual uint match_edge(uint idx) const;
 };
 
 
--- a/src/share/vm/opto/castnode.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/castnode.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -404,18 +404,6 @@
   // return join;
 }
 
-static void replace_in_uses(PhaseIterGVN *igvn, Node* n, Node* m, uint last) {
-  for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
-    Node* u = n->fast_out(i);
-    if (u->_idx < last) {
-      assert(n != u && m != u, "cycle!");
-      igvn->rehash_node_delayed(u);
-      int nb = u->replace_edge(n, m);
-      --i, imax -= nb;
-    }
-  }
-}
-
 Node* CheckCastPPNode::Ideal(PhaseGVN *phase, bool can_reshape) {
   // This is a value type. Its input is the return of a call: the call
   // returns a value type and we now know its exact type: build a
@@ -426,13 +414,16 @@
       type()->isa_valuetypeptr() &&
       in(1) != NULL && in(1)->is_Proj() &&
       in(1)->in(0) != NULL && in(1)->in(0)->is_CallStaticJava() &&
+      in(1)->in(0)->as_CallStaticJava()->method() != NULL &&
       in(1)->as_Proj()->_con == TypeFunc::Parms) {
     ciValueKlass* vk = type()->is_valuetypeptr()->value_type()->value_klass();
     assert(vk != phase->C->env()->___Value_klass(), "why cast to __Value?");
     PhaseIterGVN *igvn = phase->is_IterGVN();
 
     if (ValueTypeReturnedAsFields && vk->can_be_returned_as_fields()) {
+      igvn->set_delay_transform(true);
       CallNode* call = in(1)->in(0)->as_Call();
+      phase->C->remove_macro_node(call);
       // We now know the return type of the call
       const TypeTuple *range_sig = TypeTuple::make_range(vk, false);
       const TypeTuple *range_cc = TypeTuple::make_range(vk, true);
@@ -440,25 +431,46 @@
       call->_tf = TypeFunc::make(call->_tf->domain_sig(), call->_tf->domain_cc(),
                                  range_sig, range_cc);
       phase->set_type(call, call->Value(phase));
+      phase->set_type(in(1), in(1)->Value(phase));
 
       CallProjections projs;
       call->extract_projections(&projs, true, true);
-      Node* ctl = projs.fallthrough_catchproj;
-      Node* mem = projs.fallthrough_memproj;
-      Node* io = projs.fallthrough_ioproj;
-      Node* ex_ctl = projs.catchall_catchproj;
-      Node* ex_mem = projs.catchall_memproj;
-      Node* ex_io = projs.catchall_ioproj;
 
-      uint last = phase->C->unique();
+      Node* init_ctl = new Node(1);
+      Node* init_mem = new Node(1);
+      Node* init_io = new Node(1);
+      Node* init_ex_ctl = new Node(1);
+      Node* init_ex_mem = new Node(1);
+      Node* init_ex_io = new Node(1);
+      Node* res = new Node(1);
 
-      Node* r = new RegionNode(3);
-      Node* mem_phi = new PhiNode(r, Type::MEMORY, TypePtr::BOTTOM);
-      Node* io_phi = new PhiNode(r, Type::ABIO);
+      Node* ctl = init_ctl;
+      Node* mem = init_mem;
+      Node* io = init_io;
+      Node* ex_ctl = init_ex_ctl;
+      Node* ex_mem = init_ex_mem;
+      Node* ex_io = init_ex_io;
 
-      r->init_req(2, ex_ctl);
-      mem_phi->init_req(2, ex_mem);
-      io_phi->init_req(2, ex_io);
+      // Either we get a buffered value pointer and we can use it
+      // or we get a tagged klass pointer and we need to allocate a
+      // value.
+      Node* cast = phase->transform(new CastP2XNode(ctl, res));
+      Node* masked = phase->transform(new AndXNode(cast, phase->MakeConX(0x1)));
+      Node* cmp = phase->transform(new CmpXNode(masked, phase->MakeConX(0x1)));
+      Node* bol = phase->transform(new BoolNode(cmp, BoolTest::eq));
+      IfNode* iff = phase->transform(new IfNode(ctl, bol, PROB_MAX, COUNT_UNKNOWN))->as_If();
+      Node* iftrue = phase->transform(new IfTrueNode(iff));
+      Node* iffalse = phase->transform(new IfFalseNode(iff));
+
+      ctl = iftrue;
+
+      Node* ex_r = new RegionNode(3);
+      Node* ex_mem_phi = new PhiNode(ex_r, Type::MEMORY, TypePtr::BOTTOM);
+      Node* ex_io_phi = new PhiNode(ex_r, Type::ABIO);
+
+      ex_r->init_req(2, ex_ctl);
+      ex_mem_phi->init_req(2, ex_mem);
+      ex_io_phi->init_req(2, ex_io);
 
       // We need an oop pointer in case allocation elimination
       // fails. Allocate a new instance here.
@@ -469,36 +481,72 @@
 
 
 
-      r->init_req(1, ex_ctl);
-      mem_phi->init_req(1, ex_mem);
-      io_phi->init_req(1, ex_io);
+      ex_r->init_req(1, ex_ctl);
+      ex_mem_phi->init_req(1, ex_mem);
+      ex_io_phi->init_req(1, ex_io);
+
+      ex_r = igvn->transform(ex_r);
+      ex_mem_phi = igvn->transform(ex_mem_phi);
+      ex_io_phi = igvn->transform(ex_io_phi);
+
+      // Create the ValueTypePtrNode. This will add extra projections
+      // to the call.
+      ValueTypePtrNode* vtptr = ValueTypePtrNode::make(igvn, this);
+      // Newly allocated value type must be initialized
+      vtptr->store(igvn, ctl, mem->as_MergeMem(), javaoop);
+      vtptr->set_oop(javaoop);
+
+      Node* r = new RegionNode(3);
+      Node* mem_phi = new PhiNode(r, Type::MEMORY, TypePtr::BOTTOM);
+      Node* io_phi = new PhiNode(r, Type::ABIO);
+      Node* res_phi = new PhiNode(r, type());
+
+      r->init_req(1, ctl);
+      mem_phi->init_req(1, mem);
+      io_phi->init_req(1, io);
+      res_phi->init_req(1, igvn->transform(vtptr));
+
+      ctl = iffalse;
+      mem = init_mem;
+      io = init_io;
+
+      Node* castnotnull = new CastPPNode(res, TypePtr::NOTNULL);
+      castnotnull->set_req(0, ctl);
+      castnotnull = phase->transform(castnotnull);
+      Node* ccast = clone();
+      ccast->set_req(0, ctl);
+      ccast->set_req(1, castnotnull);
+      ccast = phase->transform(ccast);
+
+      vtptr = ValueTypePtrNode::make(*phase, mem, ccast);
+
+      r->init_req(2, ctl);
+      mem_phi->init_req(2, mem);
+      io_phi->init_req(2, io);
+      res_phi->init_req(2, igvn->transform(vtptr));
 
       r = igvn->transform(r);
       mem_phi = igvn->transform(mem_phi);
       io_phi = igvn->transform(io_phi);
+      res_phi = igvn->transform(res_phi);
 
-      replace_in_uses(igvn, ex_ctl, r, last);
-      replace_in_uses(igvn, ex_mem, mem_phi, last);
-      replace_in_uses(igvn, ex_io, io_phi, last);
+      igvn->replace_in_uses(projs.fallthrough_catchproj, r);
+      igvn->replace_in_uses(projs.fallthrough_memproj, mem_phi);
+      igvn->replace_in_uses(projs.fallthrough_ioproj, io_phi);
+      igvn->replace_in_uses(projs.resproj, res_phi);
+      igvn->replace_in_uses(projs.catchall_catchproj, ex_r);
+      igvn->replace_in_uses(projs.catchall_memproj, ex_mem_phi);
+      igvn->replace_in_uses(projs.catchall_ioproj, ex_io_phi);
 
-      // Create the ValueTypePtrNode. This will add extra projections
-      // to the call.
-      ValueTypePtrNode* vtptr = ValueTypePtrNode::make(igvn, this);
-      igvn->set_delay_transform(true); // stores can be captured. If
-                                       // they are the whole subgraph
-                                       // shouldn't go away.
+      igvn->set_delay_transform(false);
 
-      // Newly allocated value type must be initialized
-      vtptr->store(igvn, ctl, mem->as_MergeMem(), javaoop);
-      igvn->set_delay_transform(false);
-      vtptr->set_oop(javaoop);
-
-      mem = igvn->transform(mem);
-      replace_in_uses(igvn, projs.fallthrough_catchproj, ctl, last);
-      replace_in_uses(igvn, projs.fallthrough_memproj, mem, last);
-      replace_in_uses(igvn, projs.fallthrough_ioproj, io, last);
-
-      igvn->replace_node(in(1), igvn->transform(vtptr));
+      igvn->replace_node(init_ctl, projs.fallthrough_catchproj);
+      igvn->replace_node(init_mem, projs.fallthrough_memproj);
+      igvn->replace_node(init_io, projs.fallthrough_ioproj);
+      igvn->replace_node(res, projs.resproj);
+      igvn->replace_node(init_ex_ctl, projs.catchall_catchproj);
+      igvn->replace_node(init_ex_mem, projs.catchall_memproj);
+      igvn->replace_node(init_ex_io, projs.catchall_ioproj);
 
       return this;
     } else {
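
The rewrite above builds the replacement subgraph against placeholder nodes (init_ctl, init_mem, and so on) and only wires the call's real projections in at the very end via replace_node, so uses recorded before the transformation are redirected exactly once and no cycle is created mid-rewrite. A generic sketch of that two-phase splice, with illustrative graph types:

    #include <unordered_map>
    #include <vector>

    struct NodeModel { std::vector<NodeModel*> in; };

    // Phase 2 of the pattern: every edge still pointing at a placeholder is
    // redirected to the real node it stood for.
    void splice_placeholders(std::vector<NodeModel*>& graph,
                             const std::unordered_map<NodeModel*, NodeModel*>& real_for) {
      for (NodeModel* n : graph) {
        for (NodeModel*& edge : n->in) {
          auto it = real_for.find(edge);
          if (it != real_for.end()) edge = it->second;
        }
      }
    }
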
--- a/src/share/vm/opto/compile.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/compile.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -2712,91 +2712,6 @@
   }
 }
 
-void Compile::value_type_return_from_mh_intrinsic(CallNode *call, Final_Reshape_Counts &frc) {
-  if (ValueTypeReturnedAsFields &&
-      call->is_CallStaticJava() &&
-      call->as_CallStaticJava()->method() != NULL &&
-      call->as_CallStaticJava()->method()->is_method_handle_intrinsic() &&
-      call->proj_out(TypeFunc::Parms) != NULL &&
-      call->proj_out(TypeFunc::Parms)->bottom_type()->isa_valuetypeptr()) {
-    // A value type is returned from the call but we don't know its
-    // type. One of the values being returned is the klass of the
-    // value type. We need to allocate a value type instance of that
-    // type and initialize it with other values being returned. This
-    // is done with the stub call below that we add right after this
-    // call.
-    Node* ret = call->proj_out(TypeFunc::Parms);
-    assert(ret->bottom_type()->is_valuetypeptr()->klass() == env()->___Value_klass(), "unexpected return type from MH intrinsic");
-    const TypeFunc* tf = call->_tf;
-    const TypeTuple* domain = OptoRuntime::store_value_type_fields_Type()->domain_cc();
-    const TypeFunc* new_tf = TypeFunc::make(tf->domain_sig(), tf->domain_cc(), tf->range_sig(), domain);
-    call->_tf = new_tf;
-
-    CallProjections projs;
-    call->extract_projections(&projs, true, true);
-    Node* ctl = projs.fallthrough_catchproj;
-    Node* mem = projs.fallthrough_memproj;
-    Node* io = projs.fallthrough_ioproj;
-    Node* ex_ctl = projs.catchall_catchproj;
-    Node* ex_mem = projs.catchall_memproj;
-    Node* ex_io = projs.catchall_ioproj;
-    CallStaticJavaNode* rt_call = new CallStaticJavaNode(OptoRuntime::store_value_type_fields_Type(),
-                                                         StubRoutines::store_value_type_fields_to_buf(),
-                                                         "store_value_type_fields",
-                                                         call->jvms()->bci(),
-                                                         TypePtr::BOTTOM);
-    Node* out_ctl = new ProjNode(rt_call, TypeFunc::Control);
-    Node* out_mem = new ProjNode(rt_call, TypeFunc::Memory);
-    Node* out_io = new ProjNode(rt_call, TypeFunc::I_O);
-    Node* res = new ProjNode(rt_call, TypeFunc::Parms);
-
-    Node* catc = new CatchNode(out_ctl, out_io, 2);
-    Node* norm = new CatchProjNode(catc, CatchProjNode::fall_through_index, CatchProjNode::no_handler_bci);
-    Node* excp = new CatchProjNode(catc, CatchProjNode::catch_all_index,    CatchProjNode::no_handler_bci);
-    Node* r = new RegionNode(3);
-    Node* mem_phi = new PhiNode(r, Type::MEMORY, TypePtr::BOTTOM);
-    Node* io_phi = new PhiNode(r, Type::ABIO);
-    r->init_req(1, excp);
-    mem_phi->init_req(1, out_mem);
-    io_phi->init_req(1, out_io);
-
-    frc._visited.set(norm->_idx);
-    frc._visited.set(excp->_idx);
-
-    ctl->replace_by(norm);
-    mem->replace_by(out_mem);
-    io->replace_by(out_io);
-    ret->replace_by(res);
-    ex_ctl->replace_by(r);
-    ex_mem->replace_by(mem_phi);
-    ex_io->replace_by(io_phi);
-
-    r->init_req(2, ex_ctl);
-    mem_phi->init_req(2, ex_mem);
-    io_phi->init_req(2, ex_io);
-
-    rt_call->init_req(TypeFunc::Control, ctl);
-    rt_call->init_req(TypeFunc::Memory, mem);
-    rt_call->init_req(TypeFunc::I_O, io);
-    rt_call->init_req(TypeFunc::FramePtr, call->in(TypeFunc::FramePtr));
-    rt_call->init_req(TypeFunc::ReturnAdr, call->in(TypeFunc::ReturnAdr));
-
-    rt_call->init_req(TypeFunc::Parms, ret);
-    // We don't know how many values are returned. This assumes the
-    // worst case, that all available registers are used.
-    for (uint i = TypeFunc::Parms+1; i < domain->cnt(); i++) {
-      if (domain->field_at(i) == Type::HALF) {
-        rt_call->init_req(i, top());
-        continue;
-      }
-      Node* proj = new ProjNode(call, i);
-      rt_call->init_req(i, proj);
-    }
-
-    // We can safepoint at that new call
-    add_safepoint_edges(rt_call, call->jvms());
-  }
-}
 
 //------------------------------final_graph_reshaping_impl----------------------
 // Implement items 1-5 from final_graph_reshaping below.
@@ -2912,7 +2827,6 @@
         call->set_req( TypeFunc::Parms, x );
       }
     }
-    value_type_return_from_mh_intrinsic(call, frc);
     break;
   }
 
@@ -3665,6 +3579,7 @@
       }
       // Recheck with a better notion of 'required_outcnt'
       if (n->outcnt() != required_outcnt) {
+        assert(false, "malformed control flow");
         record_method_not_compilable("malformed control flow");
         return true;            // Not all targets reachable!
       }
@@ -3674,6 +3589,7 @@
     for (DUIterator_Fast jmax, j = n->fast_outs(jmax); j < jmax; j++)
       if (!frc._visited.test(n->fast_out(j)->_idx)) {
         record_method_not_compilable("infinite loop");
+        assert(false, "infinite loop");
         return true;            // Found unvisited kid; must be unreach
       }
   }
--- a/src/share/vm/opto/compile.hpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/compile.hpp	Mon Aug 21 12:50:40 2017 +0200
@@ -1292,7 +1292,6 @@
   void final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc);
   void final_graph_reshaping_walk( Node_Stack &nstack, Node *root, Final_Reshape_Counts &frc );
   void eliminate_redundant_card_marks(Node* n);
-  void value_type_return_from_mh_intrinsic(CallNode *call, Final_Reshape_Counts &frc);
 
  public:
 
--- a/src/share/vm/opto/escape.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/escape.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -374,7 +374,7 @@
         bool returns_oop = false;
         for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax && !returns_oop; i++) {
           ProjNode* pn = n->fast_out(i)->as_Proj();
-          if (pn->_con >= TypeFunc::Parms && pn->bottom_type()->isa_oopptr()) {
+          if (pn->_con >= TypeFunc::Parms && pn->bottom_type()->isa_ptr()) {
             returns_oop = true;
           }
         }
@@ -486,7 +486,7 @@
     case Op_Proj: {
       // we are only interested in the oop result projection from a call
       if (n->as_Proj()->_con >= TypeFunc::Parms && n->in(0)->is_Call() &&
-          (n->in(0)->as_Call()->returns_pointer() || n->bottom_type()->isa_oopptr())) {
+          (n->in(0)->as_Call()->returns_pointer() || n->bottom_type()->isa_ptr())) {
         assert((n->as_Proj()->_con == TypeFunc::Parms && n->in(0)->as_Call()->returns_pointer()) ||
                n->in(0)->as_Call()->tf()->returns_value_type_as_fields(), "what kind of oop return is it?");
         add_local_var_and_edge(n, PointsToNode::NoEscape,
@@ -695,7 +695,7 @@
     case Op_Proj: {
       // we are only interested in the oop result projection from a call
       if (n->as_Proj()->_con >= TypeFunc::Parms && n->in(0)->is_Call() &&
-          (n->in(0)->as_Call()->returns_pointer()|| n->bottom_type()->isa_oopptr())) {
+          (n->in(0)->as_Call()->returns_pointer()|| n->bottom_type()->isa_ptr())) {
         assert((n->as_Proj()->_con == TypeFunc::Parms && n->in(0)->as_Call()->returns_pointer()) ||
                n->in(0)->as_Call()->tf()->returns_value_type_as_fields(), "what kind of oop return is it?");
         add_local_var_and_edge(n, PointsToNode::NoEscape, n->in(0), NULL);
--- a/src/share/vm/opto/machnode.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/machnode.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -717,6 +717,11 @@
 const RegMask &MachCallNode::in_RegMask(uint idx) const {
   // Values in the domain use the users calling convention, embodied in the
   // _in_rms array of RegMasks.
+  if (entry_point() == NULL && idx == TypeFunc::Parms) {
+    // Null entry point is a special case where the target of the call
+    // is in a register.
+    return MachNode::in_RegMask(idx);
+  }
   if (idx < tf()->domain_sig()->cnt()) {
     return _in_rms[idx];
   }
--- a/src/share/vm/opto/macro.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/macro.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -2638,6 +2638,216 @@
   _igvn.replace_node(_memproj_fallthrough, mem_phi);
 }
 
+// A value type is returned from the call but we don't know its
+// type. Either we get a buffered value (and nothing needs to be done)
+// or one of the values being returned is the klass of the value type
+// and we need to allocate a value type instance of that type and
+// initialize it with other values being returned. In that case, we
+// first try a fast path allocation and initialize the value with the
+// value klass's pack handler, falling back to a runtime call if the
+// allocation fails.
+void PhaseMacroExpand::expand_mh_intrinsic_return(CallStaticJavaNode* call) {
+  Node* ret = call->proj_out(TypeFunc::Parms);
+  if (ret == NULL) {
+    return;
+  }
+  assert(ret->bottom_type()->is_valuetypeptr()->klass() == C->env()->___Value_klass(), "unexpected return type from MH intrinsic");
+  const TypeFunc* tf = call->_tf;
+  const TypeTuple* domain = OptoRuntime::store_value_type_fields_Type()->domain_cc();
+  const TypeFunc* new_tf = TypeFunc::make(tf->domain_sig(), tf->domain_cc(), tf->range_sig(), domain);
+  call->_tf = new_tf;
+  // Make sure the change of type is applied before projections are
+  // processed by igvn
+  _igvn.set_type(call, call->Value(&_igvn));
+  _igvn.set_type(ret, ret->Value(&_igvn));
+
+  // Before any new projection is added:
+  CallProjections projs;
+  call->extract_projections(&projs, true, true);
+
+  Node* ctl = new Node(1);
+  Node* mem = new Node(1);
+  Node* io = new Node(1);
+  Node* ex_ctl = new Node(1);
+  Node* ex_mem = new Node(1);
+  Node* ex_io = new Node(1);
+  Node* res = new Node(1);
+
+  Node* cast = transform_later(new CastP2XNode(ctl, res));
+  Node* mask = MakeConX(0x1);
+  Node* masked = transform_later(new AndXNode(cast, mask));
+  Node* cmp = transform_later(new CmpXNode(masked, mask));
+  Node* bol = transform_later(new BoolNode(cmp, BoolTest::eq));
+  IfNode* allocation_iff = new IfNode(ctl, bol, PROB_MAX, COUNT_UNKNOWN);
+  transform_later(allocation_iff);
+  Node* allocation_ctl = transform_later(new IfTrueNode(allocation_iff));
+  Node* no_allocation_ctl = transform_later(new IfFalseNode(allocation_iff));
+
+  Node* no_allocation_res = transform_later(new CheckCastPPNode(no_allocation_ctl, res, TypeValueTypePtr::NOTNULL));
+
+  Node* mask2 = MakeConX(-2);
+  Node* masked2 = transform_later(new AndXNode(cast, mask2));
+  Node* rawklassptr = transform_later(new CastX2PNode(masked2));
+  Node* klass_node = transform_later(new CheckCastPPNode(allocation_ctl, rawklassptr, TypeKlassPtr::VALUE));
+
+  Node* top_adr;
+  Node* end_adr;
+
+  Node* slowpath_bol = NULL;
+  Node* old_top = NULL;
+  Node* new_top = NULL;
+  if (UseTLAB) {
+    set_eden_pointers(top_adr, end_adr);
+    Node* end = make_load(ctl, mem, end_adr, 0, TypeRawPtr::BOTTOM, T_ADDRESS);
+    old_top = new LoadPNode(ctl, mem, top_adr, TypeRawPtr::BOTTOM, TypeRawPtr::BOTTOM, MemNode::unordered);
+    transform_later(old_top);
+    Node* layout_val = make_load(NULL, mem, klass_node, in_bytes(Klass::layout_helper_offset()), TypeInt::INT, T_INT);
+    Node* size_in_bytes = ConvI2X(layout_val);
+    new_top = new AddPNode(top(), old_top, size_in_bytes);
+    transform_later(new_top);
+    Node* slowpath_cmp = new CmpPNode(new_top, end);
+    transform_later(slowpath_cmp);
+    slowpath_bol = new BoolNode(slowpath_cmp, BoolTest::ge);
+    transform_later(slowpath_bol);
+  } else {
+    slowpath_bol = intcon(1);
+    old_top = top();
+    new_top = top();
+  }
+  IfNode* slowpath_iff = new IfNode(allocation_ctl, slowpath_bol, PROB_UNLIKELY_MAG(4), COUNT_UNKNOWN);
+  transform_later(slowpath_iff);
+
+  Node* slowpath_true = new IfTrueNode(slowpath_iff);
+  transform_later(slowpath_true);
+
+
+  CallStaticJavaNode* slow_call = new CallStaticJavaNode(OptoRuntime::store_value_type_fields_Type(),
+                                                         StubRoutines::store_value_type_fields_to_buf(),
+                                                         "store_value_type_fields",
+                                                         call->jvms()->bci(),
+                                                         TypePtr::BOTTOM);
+  slow_call->init_req(TypeFunc::Control, slowpath_true);
+  slow_call->init_req(TypeFunc::Memory, mem);
+  slow_call->init_req(TypeFunc::I_O, io);
+  slow_call->init_req(TypeFunc::FramePtr, call->in(TypeFunc::FramePtr));
+  slow_call->init_req(TypeFunc::ReturnAdr, call->in(TypeFunc::ReturnAdr));
+  slow_call->init_req(TypeFunc::Parms, res);
+
+  Node* slow_ctl = transform_later(new ProjNode(slow_call, TypeFunc::Control));
+  Node* slow_mem = transform_later(new ProjNode(slow_call, TypeFunc::Memory));
+  Node* slow_io = transform_later(new ProjNode(slow_call, TypeFunc::I_O));
+  Node* slow_res = transform_later(new ProjNode(slow_call, TypeFunc::Parms));
+  Node* slow_catc = transform_later(new CatchNode(slow_ctl, slow_io, 2));
+  Node* slow_norm = transform_later(new CatchProjNode(slow_catc, CatchProjNode::fall_through_index, CatchProjNode::no_handler_bci));
+  Node* slow_excp = transform_later(new CatchProjNode(slow_catc, CatchProjNode::catch_all_index,    CatchProjNode::no_handler_bci));
+
+  Node* ex_r = new RegionNode(3);
+  Node* ex_mem_phi = new PhiNode(ex_r, Type::MEMORY, TypePtr::BOTTOM);
+  Node* ex_io_phi = new PhiNode(ex_r, Type::ABIO);
+  ex_r->init_req(1, slow_excp);
+  ex_mem_phi->init_req(1, slow_mem);
+  ex_io_phi->init_req(1, slow_io);
+  ex_r->init_req(2, ex_ctl);
+  ex_mem_phi->init_req(2, ex_mem);
+  ex_io_phi->init_req(2, ex_io);
+
+  transform_later(ex_r);
+  transform_later(ex_mem_phi);
+  transform_later(ex_io_phi);
+
+  Node* slowpath_false = new IfFalseNode(slowpath_iff);
+  transform_later(slowpath_false);
+  Node* rawmem = new StorePNode(slowpath_false, mem, top_adr, TypeRawPtr::BOTTOM, new_top, MemNode::unordered);
+  transform_later(rawmem);
+  Node* mark_node = NULL;
+  // For now only enable fast locking for non-array types
+  if (UseBiasedLocking) {
+    mark_node = make_load(slowpath_false, rawmem, klass_node, in_bytes(Klass::prototype_header_offset()), TypeRawPtr::BOTTOM, T_ADDRESS);
+  } else {
+    mark_node = makecon(TypeRawPtr::make((address)markOopDesc::prototype()));
+  }
+  rawmem = make_store(slowpath_false, rawmem, old_top, oopDesc::mark_offset_in_bytes(), mark_node, T_ADDRESS);
+  rawmem = make_store(slowpath_false, rawmem, old_top, oopDesc::klass_offset_in_bytes(), klass_node, T_METADATA);
+  rawmem = make_store(slowpath_false, rawmem, old_top, oopDesc::klass_gap_offset_in_bytes(), intcon(0), T_INT);
+  Node* pack_handler = make_load(slowpath_false, rawmem, klass_node, in_bytes(ValueKlass::pack_handler_offset()), TypeRawPtr::BOTTOM, T_ADDRESS);
+
+  CallLeafNoFPNode* handler_call = new CallLeafNoFPNode(OptoRuntime::pack_value_type_Type(),
+                                                        NULL,
+                                                        "pack handler",
+                                                        TypeRawPtr::BOTTOM);
+  handler_call->init_req(TypeFunc::Control, slowpath_false);
+  handler_call->init_req(TypeFunc::Memory, rawmem);
+  handler_call->init_req(TypeFunc::I_O, top());
+  handler_call->init_req(TypeFunc::FramePtr, call->in(TypeFunc::FramePtr));
+  handler_call->init_req(TypeFunc::ReturnAdr, top());
+  handler_call->init_req(TypeFunc::Parms, pack_handler);
+  handler_call->init_req(TypeFunc::Parms+1, old_top);
+
+  // We don't know how many values are returned. This assumes the
+  // worst case, that all available registers are used.
+  for (uint i = TypeFunc::Parms+1; i < domain->cnt(); i++) {
+    if (domain->field_at(i) == Type::HALF) {
+      slow_call->init_req(i, top());
+      handler_call->init_req(i+1, top());
+      continue;
+    }
+    Node* proj = transform_later(new ProjNode(call, i));
+    slow_call->init_req(i, proj);
+    handler_call->init_req(i+1, proj);
+  }
+
+  // We can safepoint at that new call
+  C->add_safepoint_edges(slow_call, call->jvms());
+  transform_later(slow_call);
+  transform_later(handler_call);
+
+  Node* handler_ctl = transform_later(new ProjNode(handler_call, TypeFunc::Control));
+  rawmem = transform_later(new ProjNode(handler_call, TypeFunc::Memory));
+  Node* slowpath_false_res = transform_later(new ProjNode(handler_call, TypeFunc::Parms));
+
+  MergeMemNode* slowpath_false_mem = MergeMemNode::make(mem);
+  slowpath_false_mem->set_memory_at(Compile::AliasIdxRaw, rawmem);
+  transform_later(slowpath_false_mem);
+
+  Node* r = new RegionNode(4);
+  Node* mem_phi = new PhiNode(r, Type::MEMORY, TypePtr::BOTTOM);
+  Node* io_phi = new PhiNode(r, Type::ABIO);
+  Node* res_phi = new PhiNode(r, ret->bottom_type());
+
+  r->init_req(1, no_allocation_ctl);
+  mem_phi->init_req(1, mem);
+  io_phi->init_req(1, io);
+  res_phi->init_req(1, no_allocation_res);
+  r->init_req(2, slow_norm);
+  mem_phi->init_req(2, slow_mem);
+  io_phi->init_req(2, slow_io);
+  res_phi->init_req(2, slow_res);
+  r->init_req(3, handler_ctl);
+  mem_phi->init_req(3, slowpath_false_mem);
+  io_phi->init_req(3, io);
+  res_phi->init_req(3, slowpath_false_res);
+
+  transform_later(r);
+  transform_later(mem_phi);
+  transform_later(io_phi);
+  transform_later(res_phi);
+
+  _igvn.replace_in_uses(projs.fallthrough_catchproj, r);
+  _igvn.replace_in_uses(projs.fallthrough_memproj, mem_phi);
+  _igvn.replace_in_uses(projs.fallthrough_ioproj, io_phi);
+  _igvn.replace_in_uses(projs.resproj, res_phi);
+  _igvn.replace_in_uses(projs.catchall_catchproj, ex_r);
+  _igvn.replace_in_uses(projs.catchall_memproj, ex_mem_phi);
+  _igvn.replace_in_uses(projs.catchall_ioproj, ex_io_phi);
+
+  _igvn.replace_node(ctl, projs.fallthrough_catchproj);
+  _igvn.replace_node(mem, projs.fallthrough_memproj);
+  _igvn.replace_node(io, projs.fallthrough_ioproj);
+  _igvn.replace_node(res, projs.resproj);
+  _igvn.replace_node(ex_ctl, projs.catchall_catchproj);
+  _igvn.replace_node(ex_mem, projs.catchall_memproj);
+  _igvn.replace_node(ex_io, projs.catchall_ioproj);
+}
+
 //---------------------------eliminate_macro_nodes----------------------
 // Eliminate scalar replaced allocations and associated locks.
 void PhaseMacroExpand::eliminate_macro_nodes() {
@@ -2682,9 +2892,13 @@
       case Node::Class_AllocateArray:
         success = eliminate_allocate_node(n->as_Allocate());
         break;
-      case Node::Class_CallStaticJava:
-        success = eliminate_boxing_node(n->as_CallStaticJava());
+      case Node::Class_CallStaticJava: {
+        CallStaticJavaNode* call = n->as_CallStaticJava();
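+        // Method handle intrinsics stay on the macro list; they are
+        // expanded later by expand_mh_intrinsic_return().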
+        if (!call->method()->is_method_handle_intrinsic()) {
+          success = eliminate_boxing_node(call);
+        }
         break;
+      }
       case Node::Class_Lock:
       case Node::Class_Unlock:
         assert(!n->as_AbstractLock()->is_eliminated(), "sanity");
@@ -2731,10 +2945,13 @@
         _igvn._worklist.push(n);
         success = true;
       } else if (n->Opcode() == Op_CallStaticJava) {
-        // Remove it from macro list and put on IGVN worklist to optimize.
-        C->remove_macro_node(n);
-        _igvn._worklist.push(n);
-        success = true;
+        CallStaticJavaNode* call = n->as_CallStaticJava();
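+        // Method handle intrinsics must stay on the macro list until expansion.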
+        if (!call->method()->is_method_handle_intrinsic()) {
+          // Remove it from macro list and put on IGVN worklist to optimize.
+          C->remove_macro_node(n);
+          _igvn._worklist.push(n);
+          success = true;
+        }
       } else if (n->Opcode() == Op_Opaque1 || n->Opcode() == Op_Opaque2) {
         _igvn.replace_node(n, n->in(1));
         success = true;
@@ -2814,6 +3031,10 @@
     case Node::Class_Unlock:
       expand_unlock_node(n->as_Unlock());
       break;
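+    // Only method handle intrinsics are left as CallStaticJava macro
+    // nodes at this point (see eliminate_macro_nodes() above).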
+    case Node::Class_CallStaticJava:
+      expand_mh_intrinsic_return(n->as_CallStaticJava());
+      C->remove_macro_node(n);
+      break;
     default:
       assert(false, "unknown node type in macro list");
     }
--- a/src/share/vm/opto/macro.hpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/macro.hpp	Mon Aug 21 12:50:40 2017 +0200
@@ -101,6 +101,7 @@
   bool eliminate_locking_node(AbstractLockNode *alock);
   void expand_lock_node(LockNode *lock);
   void expand_unlock_node(UnlockNode *unlock);
+  void expand_mh_intrinsic_return(CallStaticJavaNode* call);
 
   // More helper methods modeled after GraphKit for array copy
   void insert_mem_bar(Node** ctrl, Node** mem, int opcode, Node* precedent = NULL);
--- a/src/share/vm/opto/matcher.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/matcher.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -1259,13 +1259,16 @@
 
 
   // Do the normal argument list (parameters) register masks
-  int argcnt = cnt - TypeFunc::Parms;
+  // A null entry point is a special case where the target of the call
+  // is in a register.
+  int adj = (call != NULL && call->entry_point() == NULL) ? 1 : 0;
+  int argcnt = cnt - TypeFunc::Parms - adj;
   if( argcnt > 0 ) {          // Skip it all if we have no args
     BasicType *sig_bt  = NEW_RESOURCE_ARRAY( BasicType, argcnt );
     VMRegPair *parm_regs = NEW_RESOURCE_ARRAY( VMRegPair, argcnt );
     int i;
     for( i = 0; i < argcnt; i++ ) {
-      sig_bt[i] = domain->field_at(i+TypeFunc::Parms)->basic_type();
+      sig_bt[i] = domain->field_at(i+TypeFunc::Parms+adj)->basic_type();
     }
     // V-call to pick proper calling convention
     call->calling_convention( sig_bt, parm_regs, argcnt );
@@ -1306,7 +1309,7 @@
     // and over the entire method.
     for( i = 0; i < argcnt; i++ ) {
       // Address of incoming argument mask to fill in
-      RegMask *rm = &mcall->_in_rms[i+TypeFunc::Parms];
+      RegMask *rm = &mcall->_in_rms[i+TypeFunc::Parms+adj];
       if( !parm_regs[i].first()->is_valid() &&
           !parm_regs[i].second()->is_valid() ) {
         continue;               // Avoid Halves
--- a/src/share/vm/opto/mulnode.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/mulnode.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -155,6 +155,18 @@
     if( t2->higher_equal( zero ) ) return zero;
   }
 
+  // Code pattern on return from a call that returns an __Value.  Can
+  // be optimized away if the return value turns out to be an oop.
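+  // An oop is at least object-aligned, so its low alignment bits are zero
+  // and (CastP2X(oop) AndX mask) folds to the additive identity 0.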
+  if (op == Op_AndX &&
+      in(1) != NULL &&
+      in(1)->Opcode() == Op_CastP2X &&
+      in(1)->in(1) != NULL &&
+      phase->type(in(1)->in(1))->isa_oopptr() &&
+      t2->isa_intptr_t()->_lo >= 0 &&
+      t2->isa_intptr_t()->_hi <= MinObjAlignmentInBytesMask) {
+    return add_id();
+  }
+
   // Either input is BOTTOM ==> the result is the local BOTTOM
   if( t1 == Type::BOTTOM || t2 == Type::BOTTOM )
     return bottom_type();
--- a/src/share/vm/opto/output.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/output.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -287,7 +287,9 @@
           MachCallNode *mcall = mach->as_MachCall();
           // This destination address is NOT PC-relative
 
-          mcall->method_set((intptr_t)mcall->entry_point());
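+          // A null entry point means the call target sits in a register
+          // (e.g. the pack handler); there is no static destination to record.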
+          if (mcall->entry_point() != NULL) {
+            mcall->method_set((intptr_t)mcall->entry_point());
+          }
 
           if (mcall->is_MachCallJava() && mcall->as_MachCallJava()->_method) {
             stub_size  += CompiledStaticCall::to_interp_stub_size();
@@ -1237,8 +1239,10 @@
         if (is_mcall) {
           MachCallNode *mcall = mach->as_MachCall();
 
-          // This destination address is NOT PC-relative
-          mcall->method_set((intptr_t)mcall->entry_point());
+          if (mcall->entry_point() != NULL) {
+            // This destination address is NOT PC-relative
+            mcall->method_set((intptr_t)mcall->entry_point());
+          }
 
           // Save the return address
           call_returns[block->_pre_order] = current_offset + mcall->ret_addr_offset();
--- a/src/share/vm/opto/phaseX.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/phaseX.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -1483,6 +1483,17 @@
   temp->destruct();         // reuse the _idx of this little guy
 }
 
+void PhaseIterGVN::replace_in_uses(Node* n, Node* m) {
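+  // Replace every use of 'n' with 'm', queuing affected users for delayed rehash.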
+  for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
+    Node* u = n->fast_out(i);
+    if (u != n) {
+      rehash_node_delayed(u);
+      int nb = u->replace_edge(n, m);
+      --i, imax -= nb;
+    }
+  }
+}
+
 //------------------------------add_users_to_worklist--------------------------
 void PhaseIterGVN::add_users_to_worklist0( Node *n ) {
   for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
@@ -1628,6 +1639,14 @@
       Node* imem = use->as_Initialize()->proj_out(TypeFunc::Memory);
       if (imem != NULL)  add_users_to_worklist0(imem);
     }
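+    // The AndX-of-CastP2X pattern (oop check on an __Value return) may
+    // constant fold once the oop type improves; revisit those users.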
+    if (use_op == Op_CastP2X) {
+      for (DUIterator_Fast i2max, i2 = use->fast_outs(i2max); i2 < i2max; i2++) {
+        Node* u = use->fast_out(i2);
+        if (u->Opcode() == Op_AndX) {
+          _worklist.push(u);
+        }
+      }
+    }
   }
 }
 
@@ -1763,6 +1782,14 @@
             worklist.push(phi);
           }
         }
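+        // Same CastP2X/AndX oop-check pattern as in PhaseIterGVN above.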
+        if (m_op == Op_CastP2X) {
+          for (DUIterator_Fast i2max, i2 = m->fast_outs(i2max); i2 < i2max; i2++) {
+            Node* u = m->fast_out(i2);
+            if (u->Opcode() == Op_AndX) {
+              worklist.push(u);
+            }
+          }
+        }
       }
     }
   }
--- a/src/share/vm/opto/phaseX.hpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/phaseX.hpp	Mon Aug 21 12:50:40 2017 +0200
@@ -514,6 +514,8 @@
     subsume_node(old, nn);
   }
 
+  void replace_in_uses(Node* n, Node* m);
+
   // Delayed node rehash: remove a node from the hash table and rehash it during
   // next optimizing pass
   void rehash_node_delayed(Node* n) {
--- a/src/share/vm/opto/runtime.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/runtime.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -1718,3 +1718,31 @@
 
   return TypeFunc::make(domain, range);
 }
+
+const TypeFunc *OptoRuntime::pack_value_type_Type() {
+  // create input type (domain)
+  uint total = 1 + SharedRuntime::java_return_convention_max_int + SharedRuntime::java_return_convention_max_float*2;
+  const Type **fields = TypeTuple::fields(total);
+  // We don't know the number of returned values and their
+  // types. Assume all registers available to the return convention
+  // are used.
+  fields[TypeFunc::Parms] = TypeRawPtr::BOTTOM;
+  fields[TypeFunc::Parms+1] = TypeRawPtr::BOTTOM;
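+  // Parms+0 is the pack handler entry point, Parms+1 the buffered value.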
+  uint i = 2;
+  for (; i < SharedRuntime::java_return_convention_max_int+1; i++) {
+    fields[TypeFunc::Parms+i] = TypeInt::INT;
+  }
+  for (; i < total; i+=2) {
+    fields[TypeFunc::Parms+i] = Type::DOUBLE;
+    fields[TypeFunc::Parms+i+1] = Type::HALF;
+  }
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + total, fields);
+
+  // create result type (range)
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms+0] = TypeValueTypePtr::NOTNULL;
+
+  const TypeTuple *range = TypeTuple::make(TypeFunc::Parms+1, fields);
+
+  return TypeFunc::make(domain, range);
+}
--- a/src/share/vm/opto/runtime.hpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/runtime.hpp	Mon Aug 21 12:50:40 2017 +0200
@@ -330,6 +330,7 @@
   static const TypeFunc* dtrace_object_alloc_Type();
 
   static const TypeFunc* store_value_type_fields_Type();
+  static const TypeFunc* pack_value_type_Type();
 
  private:
  static NamedCounter * volatile _named_counters;
--- a/src/share/vm/opto/type.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/type.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -649,6 +649,7 @@
   TypeKlassPtr::OBJECT = TypeKlassPtr::make(TypePtr::NotNull, current->env()->Object_klass(), Offset(0) );
   TypeKlassPtr::OBJECT_OR_NULL = TypeKlassPtr::make(TypePtr::BotPTR, current->env()->Object_klass(), Offset(0) );
   TypeKlassPtr::BOTTOM = (EnableValhalla | EnableMVT) ? TypeKlassPtr::make(TypePtr::BotPTR, NULL, Offset(0)) : TypeKlassPtr::OBJECT_OR_NULL;
+  TypeKlassPtr::VALUE = TypeKlassPtr::make(TypePtr::NotNull, current->env()->___Value_klass(), Offset(0));
 
   const Type **fi2c = TypeTuple::fields(2);
   fi2c[TypeFunc::Parms+0] = TypeInstPtr::BOTTOM; // Method*
@@ -4803,13 +4804,16 @@
     case DoubleBot:
     case NarrowOop:
     case NarrowKlass:
+    case Bottom:                  // Ye Olde Default
+      return Type::BOTTOM;
+
     case MetadataPtr:
     case KlassPtr:
     case RawPtr:
     case AryPtr:
     case InstPtr:
-    case Bottom:                  // Ye Olde Default
-      return Type::BOTTOM;
+      return TypePtr::BOTTOM;
+
     case Top:
       return this;
 
@@ -5288,6 +5292,7 @@
 const TypeKlassPtr *TypeKlassPtr::OBJECT;
 const TypeKlassPtr *TypeKlassPtr::OBJECT_OR_NULL;
 const TypeKlassPtr* TypeKlassPtr::BOTTOM;
+const TypeKlassPtr *TypeKlassPtr::VALUE;
 
 //------------------------------TypeKlassPtr-----------------------------------
 TypeKlassPtr::TypeKlassPtr( PTR ptr, ciKlass* klass, Offset offset )
@@ -5708,7 +5713,8 @@
     domain_cc = TypeTuple::make_domain(method->holder(), method->signature(), ValueTypePassFieldsAsArgs);
   }
   const TypeTuple *range_sig = TypeTuple::make_range(method->signature(), false);
-  const TypeTuple *range_cc = TypeTuple::make_range(method->signature(), ValueTypeReturnedAsFields);
+  bool as_fields = ValueTypeReturnedAsFields;
+  const TypeTuple *range_cc = TypeTuple::make_range(method->signature(), as_fields);
   tf = TypeFunc::make(domain_sig, domain_cc, range_sig, range_cc);
   C->set_last_tf(method, tf);  // fill cache
   return tf;
--- a/src/share/vm/opto/type.hpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/type.hpp	Mon Aug 21 12:50:40 2017 +0200
@@ -1442,6 +1442,7 @@
   // Convenience common pre-built types.
   static const TypeKlassPtr* OBJECT; // Not-null object klass or below
   static const TypeKlassPtr* OBJECT_OR_NULL; // Maybe-null version of same
+  static const TypeKlassPtr* VALUE;
   static const TypeKlassPtr* BOTTOM;
 #ifndef PRODUCT
   virtual void dump2( Dict &d, uint depth, outputStream *st ) const; // Specialized per-Type dumping
--- a/src/share/vm/opto/valuetypenode.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/valuetypenode.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -614,13 +614,16 @@
   return oop_type->meet(TypePtr::NULL_PTR) != oop_type;
 }
 
-void ValueTypeNode::pass_klass(Node* n, uint pos, const GraphKit& kit) {
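+// Constant klass pointer with the low bit set. The tag lets callers
+// distinguish a klass (fields in registers) from a buffered value (oop).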
+Node* ValueTypeNode::tagged_klass(PhaseGVN& gvn) {
   ciValueKlass* vk = value_klass();
   const TypeKlassPtr* tk = TypeKlassPtr::make(vk);
   intptr_t bits = tk->get_con();
   set_nth_bit(bits, 0);
-  Node* klass_tagged = kit.MakeConX(bits);
-  n->init_req(pos, klass_tagged);
+  return gvn.makecon(TypeRawPtr::make((address)bits));
+}
+
+void ValueTypeNode::pass_klass(Node* n, uint pos, const GraphKit& kit) {
+  n->init_req(pos, tagged_klass(kit.gvn()));
 }
 
 uint ValueTypeNode::pass_fields(Node* n, int base_input, const GraphKit& kit, ciValueKlass* base_vk, int base_offset) {
--- a/src/share/vm/opto/valuetypenode.hpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/opto/valuetypenode.hpp	Mon Aug 21 12:50:40 2017 +0200
@@ -132,6 +132,7 @@
   Node* allocate(GraphKit* kit);
   bool  is_allocated(PhaseGVN* phase) const;
 
+  Node* tagged_klass(PhaseGVN& gvn);
   void pass_klass(Node* n, uint pos, const GraphKit& kit);
   uint pass_fields(Node* call, int base_input, const GraphKit& kit, ciValueKlass* base_vk = NULL, int base_offset = 0);
 
--- a/src/share/vm/runtime/deoptimization.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/runtime/deoptimization.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -880,7 +880,7 @@
 // reference to a value type instance. Allocate and initialize it from
 // the register values here.
 bool Deoptimization::realloc_value_type_result(ValueKlass* vk, const RegisterMap& map, GrowableArray<Handle>& return_oops, TRAPS) {
-  oop new_vt = vk->realloc_result(map, return_oops, THREAD);
+  oop new_vt = vk->realloc_result(map, return_oops, false, THREAD);
   if (new_vt == NULL) {
     CLEAR_PENDING_EXCEPTION;
     THROW_OOP_(Universe::out_of_memory_error_realloc_objects(), true);
--- a/src/share/vm/runtime/globals.hpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/runtime/globals.hpp	Mon Aug 21 12:50:40 2017 +0200
@@ -4118,6 +4118,10 @@
                                                                             \
   product(int, MinimumVTBufferChunkPerFrame, 2,                             \
           "Minimum number of VT buffer chunk allowed per frame")            \
+                                                                            \
+  develop(bool, StressValueTypeReturnedAsFields, false,                     \
+          "stress return of fields instead of a value type reference")      \
+
 
 
 
--- a/src/share/vm/runtime/sharedRuntime.cpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/runtime/sharedRuntime.cpp	Mon Aug 21 12:50:40 2017 +0200
@@ -3485,7 +3485,7 @@
   JRT_BLOCK;
   {
     Thread* THREAD = thread;
-    oop vt = vk->realloc_result(reg_map, handles, CHECK);
+    oop vt = vk->realloc_result(reg_map, handles, callerFrame.is_interpreted_frame(), CHECK);
     new_vt = Handle(thread, vt);
 
 #ifdef ASSERT
--- a/src/share/vm/runtime/sharedRuntime.hpp	Mon Aug 21 12:26:21 2017 +0200
+++ b/src/share/vm/runtime/sharedRuntime.hpp	Mon Aug 21 12:50:40 2017 +0200
@@ -523,6 +523,7 @@
 
   static address handle_unsafe_access(JavaThread* thread, address next_pc);
 
+  static BufferedValueTypeBlob* generate_buffered_value_type_adapter(const ValueKlass* vk);
 #ifndef PRODUCT
 
   // Collect and print inline cache miss statistics
--- a/test/compiler/valhalla/valuetypes/ValueTypeTestBench.java	Mon Aug 21 12:26:21 2017 +0200
+++ b/test/compiler/valhalla/valuetypes/ValueTypeTestBench.java	Mon Aug 21 12:50:40 2017 +0200
@@ -2753,6 +2753,13 @@
                                                                                 MethodHandles.dropArguments(MethodHandles.invoker(test104_mt3), 1, MethodHandle.class),
                                                                                 MethodHandles.dropArguments(MethodHandles.invoker(test104_mt3), 0, MethodHandle.class))
                                                     );
+
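+            // test105_mh: if test105_test() returns true, call test105_target1
+            // (dropping the leading MethodHandle argument); otherwise invoke
+            // that argument, which callers bind to test105_mh2 (test105_target2).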
+            MethodHandle test105_mh1 = lookup.findStatic(ValueTypeTestBench.class, "test105_target1", myvalue2_mt);
+            test105_mh2 = lookup.findStatic(ValueTypeTestBench.class, "test105_target2", myvalue2_mt);
+            MethodHandle test105_mh_test = lookup.findStatic(ValueTypeTestBench.class, "test105_test", boolean_mt);
+            test105_mh = MethodHandles.guardWithTest(test105_mh_test,
+                                                    MethodHandles.dropArguments(test105_mh1, 0, MethodHandle.class),
+                                                    MethodHandles.invoker(myvalue2_mt));
         } catch (NoSuchMethodException|IllegalAccessException|NoSuchFieldException e) {
             e.printStackTrace();
             throw new RuntimeException("method handle lookup fails");
@@ -2781,7 +2788,7 @@
 
     static final MethodHandle test99_mh;
 
-    @Test(valid = ValueTypeReturnedAsFieldsOn, failOn = ALLOC + ALLOCA + STORE)
+    @Test(valid = ValueTypeReturnedAsFieldsOn, failOn = ALLOC + ALLOCA + STORE + STOREVALUETYPEFIELDS)
     @Test(valid = ValueTypeReturnedAsFieldsOff)
     MyValue3 test99() throws Throwable {
         return (MyValue3)test99_mh.invokeExact(this);
@@ -2815,7 +2822,7 @@
     static final MethodHandle test100_mh;
     static MethodHandle test100_mh1;
 
-    @Test(valid = ValueTypeReturnedAsFieldsOn, failOn = ALLOC + ALLOCA + STORE)
+    @Test(valid = ValueTypeReturnedAsFieldsOn, failOn = ALLOC + ALLOCA + STORE + STOREVALUETYPEFIELDS)
     @Test(valid = ValueTypeReturnedAsFieldsOff)
     long test100() throws Throwable {
         return ((MyValue2)test100_mh.invokeExact(test100_mh1)).hash();
@@ -2849,7 +2856,7 @@
     static final MethodHandle test101_mh;
     static MethodHandle test101_mh2;
 
-    @Test(valid = ValueTypeReturnedAsFieldsOn, failOn = ALLOC + ALLOCA + STORE)
+    @Test(valid = ValueTypeReturnedAsFieldsOn, failOn = ALLOC + ALLOCA + STORE + STOREVALUETYPEFIELDS)
     @Test(valid = ValueTypeReturnedAsFieldsOff)
     long test101() throws Throwable {
         return ((MyValue2)test101_mh.invokeExact(test101_mh2)).hash();
@@ -2867,7 +2874,7 @@
     // loop should go away as the result is a constant.
     static final MethodHandle test102_mh;
 
-    @Test(failOn = ALLOC + STORE + LOOP)
+    @Test(failOn = ALLOC + STORE + LOOP + STOREVALUETYPEFIELDS)
     long test102() throws Throwable {
         return (long)test102_mh.invokeExact();
     }
@@ -2912,7 +2919,7 @@
 
     static final MethodHandle test103_mh;
 
-    @Test(valid = ValueTypeReturnedAsFieldsOn, failOn = ALLOC + ALLOCA + STORE)
+    @Test(valid = ValueTypeReturnedAsFieldsOn, failOn = ALLOC + ALLOCA + STORE + STOREVALUETYPEFIELDS)
     @Test(valid = ValueTypeReturnedAsFieldsOff)
     MyValue3 test103() throws Throwable {
         return (MyValue3)test103_mh.invokeExact(this);
@@ -2960,7 +2967,7 @@
     static MethodHandle test104_mh2;
     static MethodHandle test104_mh3;
 
-    @Test(valid = ValueTypeReturnedAsFieldsOn, failOn = ALLOC + ALLOCA + STORE)
+    @Test(valid = ValueTypeReturnedAsFieldsOn, failOn = ALLOC + ALLOCA + STORE + STOREVALUETYPEFIELDS)
     @Test(valid = ValueTypeReturnedAsFieldsOff)
     long test104() throws Throwable {
         return ((MyValue2)test104_mh.invokeExact(test104_mh2, test104_mh3)).hash();
@@ -2978,6 +2985,42 @@
         Asserts.assertEQ(hash, MyValue2.createWithFieldsInline(i, b).hash());
     }
 
+    @ForceInline
+    static MyValue2 test105_target1() {
+        return MyValue2.createWithFieldsInline(rI+test105_i, true);
+    }
+
+    @ForceInline
+    static MyValue2 test105_target2() {
+        return MyValue2.createWithFieldsInline(rI-test105_i, false);
+    }
+
+    static int test105_i = 0;
+    @ForceInline
+    static boolean test105_test() {
+        return (test105_i % 100) == 0;
+    }
+
+    static final MethodHandle test105_mh;
+    static MethodHandle test105_mh2;
+
+    // Check that a buffered value returned by a compiled lambda form
+    // is properly handled by the caller.
+    @Test(valid = ValueTypeReturnedAsFieldsOn, failOn = ALLOC + ALLOCA + STORE + STOREVALUETYPEFIELDS)
+    @Test(valid = ValueTypeReturnedAsFieldsOff)
+    @Warmup(11000)
+    long test105() throws Throwable {
+        return ((MyValue2)test105_mh.invokeExact(test105_mh2)).hash();
+    }
+
+    @DontCompile
+    public void test105_verifier(boolean warmup) throws Throwable {
+        test105_i++;
+        long hash = test105();
+        boolean b = (test105_i % 100) == 0;
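+        // b == true selects test105_target1 (rI + test105_i),
+        // otherwise test105_target2 (rI - test105_i)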
+        Asserts.assertEQ(hash, MyValue2.createWithFieldsInline(rI+test105_i * (b ? 1 : -1), b).hash());
+    }
+
     // ========== Test infrastructure ==========
 
     private static final WhiteBox WHITE_BOX = WhiteBox.getWhiteBox();
@@ -2991,7 +3034,7 @@
     private static final boolean ValueTypePassFieldsAsArgs = (Boolean)WHITE_BOX.getVMFlag("ValueTypePassFieldsAsArgs");
     private static final boolean ValueTypeArrayFlatten = (Boolean)WHITE_BOX.getVMFlag("ValueArrayFlatten");
     private static final boolean ValueTypeReturnedAsFields = (Boolean)WHITE_BOX.getVMFlag("ValueTypeReturnedAsFields");
-    private static final int COMP_LEVEL_ANY = -1;
+    private static final int COMP_LEVEL_ANY = -2;
     private static final int COMP_LEVEL_FULL_OPTIMIZATION = 4;
     private static final Hashtable<String, Method> tests = new Hashtable<String, Method>();
     private static final int WARMUP = 251;
@@ -3017,6 +3060,7 @@
     private static final String NPE = START + "CallStaticJava" + MID + "null_check" + END;
     private static final String CCE = START + "CallStaticJava" + MID + "class_check" + END;
     private static final String CALL = START + "CallStaticJava" + MID + END;
+    private static final String STOREVALUETYPEFIELDS = START + "CallStaticJava" + MID + "store_value_type_fields" + END;
     private static final String SCOBJ = "(.*# ScObj.*" + END;
 
     static {
@@ -3053,7 +3097,7 @@
     }
 
     public static void main(String[] args) throws Throwable {
-        //tests.values().removeIf(p -> !p.getName().equals("test85")); // Run single test
+        //tests.values().removeIf(p -> !p.getName().equals("test104")); // Run single test
         if (args.length == 0) {
             execute_vm("-XX:+IgnoreUnrecognizedVMOptions", "-XX:-BackgroundCompilation",
                     "-XX:+PrintCompilation", "-XX:+PrintInlining", "-XX:+PrintIdeal", "-XX:+PrintOptoAssembly",
@@ -3218,7 +3262,9 @@
         for (Method test : tests.values()) {
             Method verifier = getClass().getDeclaredMethod(test.getName() + "_verifier", boolean.class);
             // Warmup using verifier method
-            for (int i = 0; i < WARMUP; ++i) {
+            Warmup anno = test.getAnnotation(Warmup.class);
+            int warmup = anno == null ? WARMUP : anno.value();
+            for (int i = 0; i < warmup; ++i) {
                 verifier.invoke(this, true);
             }
             // Trigger compilation
@@ -3259,3 +3305,8 @@
 // Prevent method compilation
 @Retention(RetentionPolicy.RUNTIME)
 @interface DontCompile { }
+
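+// Override the default number of warmup iterations for a test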
+@Retention(RetentionPolicy.RUNTIME)
+@interface Warmup {
+    int value();
+}