--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tailc.patch Tue Feb 10 15:03:23 2009 -0800
@@ -0,0 +1,6044 @@
+diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp
+--- a/src/cpu/x86/vm/assembler_x86.cpp
++++ b/src/cpu/x86/vm/assembler_x86.cpp
+@@ -3276,6 +3276,16 @@
+ emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
+ }
+
++// Branches to the label when the return address at rbp[return_addr_offset] is outside the interpreter code begin/end values.
++void MacroAssembler::parent_is_not_interpreter_jcc(Register temp, Label& is_not_interpreter_continutation) {
++ movl(temp, Address(rbp, frame::return_addr_offset * wordSize));
++ cmp32 (temp, ExternalAddress(Interpreter::interpreter_code_begin_address()));
++ jcc(Assembler::below, is_not_interpreter_continutation); // unsigned compare: operands are code addresses
++ cmp32 (temp, ExternalAddress(Interpreter::interpreter_code_end_address()));
++ jcc(Assembler::above, is_not_interpreter_continutation); // unsigned compare: operands are code addresses
++}
++
++
+ #else // LP64
+
+ // 64bit only pieces of the assembler
+@@ -3738,6 +3748,15 @@
+ emit_byte(0xC0 | encode);
+ }
+
++void MacroAssembler::parent_is_not_interpreter_jcc(Register temp, Label& is_not_interpreter_continutation) {
++ assert(0, "update this code"); // LP64 placeholder: the 32-bit move/compares below have not been ported yet
++ movl(temp, Address(rbp, frame::return_addr_offset * wordSize));
++ cmp32 (temp, ExternalAddress(Interpreter::interpreter_code_begin_address()));
++ jcc(Assembler::below, is_not_interpreter_continutation); // unsigned compare: operands are code addresses
++ cmp32 (temp, ExternalAddress(Interpreter::interpreter_code_end_address()));
++ jcc(Assembler::above, is_not_interpreter_continutation); // unsigned compare: operands are code addresses
++}
++
+ void Assembler::decl(Register dst) {
+ // Don't use it directly. Use MacroAssembler::decrementl() instead.
+ // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
+diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp
+--- a/src/cpu/x86/vm/assembler_x86.hpp
++++ b/src/cpu/x86/vm/assembler_x86.hpp
+@@ -2033,6 +2033,8 @@
+ // Can push value or effective address
+ void pushptr(AddressLiteral src);
+
++ void parent_is_not_interpreter_jcc(Register temp, Label& is_not_interpreter_continutation);
++
+ void pushptr(Address src) { LP64_ONLY(pushq(src)) NOT_LP64(pushl(src)); }
+ void popptr(Address src) { LP64_ONLY(popq(src)) NOT_LP64(popl(src)); }
+
+@@ -2042,7 +2044,6 @@
+ void movl2ptr(Register dst, Address src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); }
+ void movl2ptr(Register dst, Register src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(if (dst != src) movl(dst, src)); }
+
+-
+ #undef VIRTUAL
+
+ };
+diff --git a/src/cpu/x86/vm/c1_CodeStubs_x86.cpp b/src/cpu/x86/vm/c1_CodeStubs_x86.cpp
+--- a/src/cpu/x86/vm/c1_CodeStubs_x86.cpp
++++ b/src/cpu/x86/vm/c1_CodeStubs_x86.cpp
+@@ -441,7 +441,7 @@
+ }
+ }
+
+- ce->align_call(lir_static_call);
++ ce->align_call(lir_static_call, false);
+
+ ce->emit_static_call_stub();
+ AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
+diff --git a/src/cpu/x86/vm/c1_FrameMap_x86.cpp b/src/cpu/x86/vm/c1_FrameMap_x86.cpp
+--- a/src/cpu/x86/vm/c1_FrameMap_x86.cpp
++++ b/src/cpu/x86/vm/c1_FrameMap_x86.cpp
+@@ -27,7 +27,7 @@
+
+ const int FrameMap::pd_c_runtime_reserved_arg_size = 0;
+
+-LIR_Opr FrameMap::map_to_opr(BasicType type, VMRegPair* reg, bool) {
++LIR_Opr FrameMap::map_to_opr(BasicType type, VMRegPair* reg, bool, bool is_tail_call) {
+ LIR_Opr opr = LIR_OprFact::illegalOpr;
+ VMReg r_1 = reg->first();
+ VMReg r_2 = reg->second();
+@@ -36,7 +36,13 @@
+ // The calling convention does not count the SharedRuntime::out_preserve_stack_slots() value
+ // so we must add it in here.
+ int st_off = (r_1->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
+- opr = LIR_OprFact::address(new LIR_Address(rsp_opr, st_off, type));
++ if (!is_tail_call)
++ opr = LIR_OprFact::address(new LIR_Address(rsp_opr, st_off, type));
++ else {
++ // Ignore RET address and RBP.
++ st_off += (2* VMRegImpl::stack_slot_size);
++ opr = LIR_OprFact::address(new LIR_Address(rbp_opr, st_off, type));
++ }
+ } else if (r_1->is_Register()) {
+ Register reg = r_1->as_Register();
+ if (r_2->is_Register() && (type == T_LONG || type == T_DOUBLE)) {
+diff --git a/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp b/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
+--- a/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
++++ b/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
+@@ -510,6 +510,270 @@
+
+ }
+
++void LIR_Assembler::emit_static_not_sibling_tail_call_stub() {
++ // if the last instruction is a call (typically to do a throw which
++ // is coming at the end after block reordering) the return address
++ // must still point into the code area in order to avoid assertion
++ // failures when searching for the corresponding bci => add a nop
++ // (was bug 5/14/1999 - gri)
++ __ nop();
++ CallingConvention * incoming_args = frame_map()->incoming_arguments();
++ Label call_to_interpreter;
++ assert(incoming_args!=NULL, "ops");
++ int arg_slots = incoming_args->reserved_stack_slots();
++ int stub_size = 8*arg_slots + static_not_sibling_tail_call_stub_size;
++ // Generate code for static tail call (we know that klass and protection
++ // domain is correct).
++ address handler_base = __ start_a_stub(stub_size);
++ if (handler_base == NULL) {
++ // not enough space left for the handler
++ bailout("static tail call stub overflow");
++ return;
++ }
++#ifdef ASSERT
++ int offset = code_offset();
++#endif // ASSERT
++ __ align(CodeEntryAlignment);
++ compilation()->offsets()->set_value(CodeOffsets::Static_Not_Sibling_Tail_Call_Entry, code_offset());
++ if (TraceTailCalls) __ warn("Compiled entry point: Static_Not_Sibling_Tail_Call_Entry");
++ // Fast case: parent is interpreter. This means we can extend its stack frame.
++ // Assume: rax, rbx are scratch. rbx would hold methodOop, rax the IC_klass token
++ Register tmp = rax; // scratch
++ Register last_sp = rbx; // scratch
++ __ parent_is_not_interpreter_jcc(rbx, call_to_interpreter);
++ // Store old rbp
++ __ movl(tmp, Address(rbp, frame::link_offset*wordSize));
++ __ push_reg(tmp);
++ // Store ret address.
++ __ movl(tmp, Address(rbp, frame::return_addr_offset * wordSize));
++ __ push_reg(tmp);
++
++ // when debugging the return addr pointer remove the 2 __ a_long() lines in
++ // templateInterpreter_x86_32.cpp to get a sensible assembler output
++ //__ stop("static_not_sibling_call, parent is interpreted rax contains ret_entry");
++
++ // Get last_sp from parent frame.
++ __ movl(tmp, Address(rbp, frame::link_offset * wordSize)); // old rbp
++ __ movl(last_sp, Address(tmp, frame::interpreter_frame_last_sp_offset * wordSize));
++ // Shuffle arguments
++ for (int src_slot = arg_slots, dest_slot=-1; src_slot > 0; src_slot--, dest_slot--) {
++ // saved old_rbp, old_retaddr on top of stack => +2
++ Address src(rsp, VMRegImpl::stack_slot_size * (2+src_slot));
++ Address dest(last_sp, VMRegImpl::stack_slot_size * (dest_slot));
++ __ movl(tmp, src);
++ __ movl(dest, tmp);
++ }
++ // Set return address.
++ __ subl(last_sp, (1+arg_slots)*wordSize);
++ __ pop_reg(tmp);
++ __ movl(Address(last_sp, 0), tmp);
++ // Set new rbp
++ __ pop_reg(rbp);
++ // Set new rsp. Need to do this after the pop!
++ __ movl(rsp, last_sp);
++ // jump to VEP
++ address vep_entry = compilation()->code()->insts()->start() +
++ compilation()->offsets()->value(CodeOffsets::Verified_Entry);
++ RelocationHolder rh = section_call_Relocation::spec(vep_entry, CodeBuffer::SECT_INSTS);
++ // Jump to vep.
++ __ jump(AddressLiteral((address)vep_entry, rh));
++ // Slow case: parent is not interpreted. Jump to interpreter entry of called
++ // function in order to lazily create an interpreted frame on the stack.
++ __ bind(call_to_interpreter);
++ compilation()->offsets()->set_value(CodeOffsets::Static_Not_Sibling_Tail_Call_Set_Data_Entry, code_offset());
++ // Set methodoop.
++ __ movoop(rbx, (jobject)Universe::non_oop_word());
++ // Jump to C2I Entry Point
++ __ jump(RuntimeAddress((address)-1));
++ // TODO: adapt static_tail_call_stub_size
++ assert(code_offset() - offset <= stub_size, "overflow");
++ __ end_a_stub();
++}
++
++void LIR_Assembler::emit_static_tail_call_stub() {
++ // if the last instruction is a call (typically to do a throw which
++ // is coming at the end after block reordering) the return address
++ // must still point into the code area in order to avoid assertion
++ // failures when searching for the corresponding bci => add a nop
++ // (was bug 5/14/1999 - gri)
++
++ __ nop();
++ CallingConvention * incoming_args = frame_map()->incoming_arguments();
++ assert(incoming_args!=NULL, "ops");
++ int arg_slots = incoming_args->reserved_stack_slots();
++ int stub_size = 8*arg_slots + static_tail_call_stub_size;
++ // Generate code for static tail call (we know that klass and protection
++ // domain is correct).
++ address handler_base = __ start_a_stub(stub_size);
++ if (handler_base == NULL) {
++ // not enough space left for the handler
++ bailout("static tail call stub overflow");
++ return;
++ }
++#ifdef ASSERT
++ int offset = code_offset();
++#endif // ASSERT
++ __ align(CodeEntryAlignment);
++ compilation()->offsets()->set_value(CodeOffsets::Static_Tail_Call_Entry, code_offset());
++ if (TraceTailCalls) __ warn("Compiled entry point: Static_Tail_Call_Entry");
++ // Move arguments.
++ emit_tail_call_argument_move(arg_slots);
++ // Remove tail calling caller's stack frame.
++ __ leave();
++
++ // Compute target of jump. Verified entry point of current method.
++ address vep_entry = compilation()->code()->insts()->start() +
++ compilation()->offsets()->value(CodeOffsets::Verified_Entry);
++ RelocationHolder rh = section_call_Relocation::spec(vep_entry, CodeBuffer::SECT_INSTS);
++ // Jump to vep.
++ __ jump(AddressLiteral((address)vep_entry, rh));
++
++ // TODO: adapt static_tail_call_stub_size
++ assert(code_offset() - offset <= stub_size, "overflow");
++ __ end_a_stub();
++}
++
++void LIR_Assembler::emit_tail_call_argument_move(int arg_slots) {
++ // Copy arg_slots stack slots from their rsp-relative position to the rbp-relative tail call position, using rbx as scratch.
++ Register tmp = rbx;
++ for (int slot = 1; slot <= arg_slots; slot++) {
++ Address src (rsp, VMRegImpl::stack_slot_size * (SharedRuntime::out_preserve_stack_slots()+slot));
++ // The destination must also skip the saved eip slot, hence slot+1 below.
++ Address dest(rbp, VMRegImpl::stack_slot_size * (SharedRuntime::out_preserve_stack_slots()+slot+1));
++ __ movl(tmp, src);
++ __ movl(dest, tmp);
++ }
++}
++
++void LIR_Assembler::emit_not_sibling_monomorphic_tail_call_stub() {
++ // Emits the stub recorded as CodeOffsets::Monomorphic_Not_Sibling_Tail_Call_Entry.
++ // Bails out of the compilation if start_a_stub() cannot provide stub_size bytes.
++ // The leading nop: if the last instruction is a call (typically a throw placed at
++ // the end after block reordering) the return address must still point into the
++ // code area, otherwise the bci lookup asserts (was bug 5/14/1999 - gri).
++ __ nop();
++ CallingConvention * incoming_args = frame_map()->incoming_arguments();
++ Label call_to_interpreter;
++ assert(incoming_args!=NULL, "ops");
++ int arg_slots = incoming_args->reserved_stack_slots();
++ int stub_size = 8*arg_slots + monomorphic_not_sibling_tail_call_stub_size;
++ // Generate code for the monomorphic not-sibling tail call; the inline cache is
++ // checked by check_icache() below.
++ address handler_base = __ start_a_stub(stub_size);
++ if (handler_base == NULL) {
++ // not enough space left for the handler
++ bailout("monomorphic not sibling tail call stub overflow");
++ return;
++ }
++#ifdef ASSERT
++ int offset = code_offset();
++#endif // ASSERT
++ __ align(CodeEntryAlignment);
++ compilation()->offsets()->set_value(CodeOffsets::Monomorphic_Not_Sibling_Tail_Call_Entry, code_offset());
++ if (TraceTailCalls) __ warn("Compiled entry point: Monomorphic_Not_Sibling_Tail_Call_Entry");
++ check_icache();
++ // Fast case: parent is interpreter. This means we can extend its stack frame.
++ // Assume: rax, rbx are scratch here since rax is needed only for
++ // check_icache. rbx would hold methodOop, rax the IC_klass token
++ Register tmp = rax; // scratch
++ Register last_sp = rbx; // scratch
++ __ parent_is_not_interpreter_jcc(tmp, call_to_interpreter);
++ // Store old rbp
++ __ movl(tmp, Address(rbp, frame::link_offset*wordSize));
++ __ push_reg(tmp);
++ // Store ret address.
++ __ movl(tmp, Address(rbp, frame::return_addr_offset * wordSize));
++ __ push_reg(tmp);
++
++ // when debugging the return addr pointer remove the 2 __ a_long() lines in
++ // templateInterpreter_x86_32.cpp to get a sensible assembler output
++ //__ stop("static_not_sibling_call, parent is interpreted rax contains ret_entry");
++
++ // Get last_sp from parent frame.
++ __ movl(tmp, Address(rbp, frame::link_offset * wordSize)); // old rbp
++ __ movl(last_sp, Address(tmp, frame::interpreter_frame_last_sp_offset * wordSize));
++ // Shuffle arguments
++ for (int src_slot = arg_slots, dest_slot=-1; src_slot > 0; src_slot--, dest_slot--) {
++ // saved old_rbp, old_retaddr on top of stack => +2
++ Address src(rsp, VMRegImpl::stack_slot_size * (2+src_slot));
++ Address dest(last_sp, VMRegImpl::stack_slot_size * (dest_slot));
++ __ movl(tmp, src);
++ __ movl(dest, tmp);
++ }
++ // Set return address.
++ __ subl(last_sp, (1+arg_slots)*wordSize);
++ __ pop_reg(tmp);
++ __ movl(Address(last_sp, 0), tmp);
++ // Set new rbp
++ __ pop_reg(rbp);
++ // Set new rsp. Need to do this after the pop!
++ __ movl(rsp, last_sp);
++ // jump to VEP
++ address vep_entry = compilation()->code()->insts()->start() +
++ compilation()->offsets()->value(CodeOffsets::Verified_Entry);
++ RelocationHolder rh = section_call_Relocation::spec(vep_entry, CodeBuffer::SECT_INSTS);
++ // Jump to vep.
++ __ jump(AddressLiteral((address)vep_entry, rh));
++ // Slow case: parent is not interpreted. Jump to interpreter entry of called
++ // function in order to lazily create an interpreted frame on the stack.
++ __ bind(call_to_interpreter);
++ compilation()->offsets()->set_value(CodeOffsets::Monomorphic_Not_Sibling_Tail_Call_Set_Data_Entry, code_offset());
++ // Set methodoop.
++ __ movoop(rbx, (jobject)Universe::non_oop_word());
++ // Jump to C2I Entry Point
++ __ jump(RuntimeAddress((address)-1));
++ // TODO: adapt monomorphic_not_sibling_tail_call_stub_size
++ assert(code_offset() - offset <= stub_size, "overflow");
++ __ end_a_stub();
++}
++
++void LIR_Assembler::emit_monomorphic_tail_call_stub() {
++ // Emits the stub recorded as CodeOffsets::Monomorphic_Tail_Call_Entry.
++ // Bails out of the compilation if start_a_stub() cannot provide stub_size bytes.
++ // The leading nop: if the last instruction is a call (typically a throw placed at
++ // the end after block reordering) the return address must still point into the
++ // code area, otherwise the bci lookup asserts (was bug 5/14/1999 - gri).
++
++ __ nop();
++ CallingConvention * incoming_args = frame_map()->incoming_arguments();
++ assert(incoming_args!=NULL, "ops");
++ int arg_slots = incoming_args->reserved_stack_slots();
++ int stub_size = 8*arg_slots + monomorphic_tail_call_stub_size;
++ // Generate code for monomorphic tail call (we know that protection
++ // domain is correct).
++ address handler_base = __ start_a_stub(stub_size);
++ if (handler_base == NULL) {
++ // not enough space left for the handler
++ bailout("monomorphic tail call stub overflow");
++ return;
++ }
++#ifdef ASSERT
++ int offset = code_offset();
++#endif // ASSERT
++ __ align(CodeEntryAlignment);
++ compilation()->offsets()->set_value(CodeOffsets::Monomorphic_Tail_Call_Entry, code_offset());
++ if (TraceTailCalls) __ warn("Compiled entry point: Monomorphic_Tail_Call_Entry");
++ // Check inline cache - needs to be done before popping the frame.
++ // If the check is done after popping the frame and the icache check fails,
++ // the frame would be popped again by handle_ic_miss code path.
++ check_icache();
++ // Move arguments.
++ emit_tail_call_argument_move(arg_slots);
++
++ // Remove tail calling caller's stack frame.
++ __ leave();
++
++ // Compute target of jump. Verified entry point of current method.
++ address vep_entry = compilation()->code()->insts()->start() +
++ compilation()->offsets()->value(CodeOffsets::Verified_Entry);
++ RelocationHolder rh = section_call_Relocation::spec(vep_entry, CodeBuffer::SECT_INSTS);
++ // Jump to vep.
++ __ jump(AddressLiteral((address)vep_entry, rh));
++
++ // TODO: adapt monomorphic_tail_call_stub_size
++ assert(code_offset() - offset <= stub_size, "overflow");
++ __ end_a_stub();
++}
+
+ // This is the fast version of java.lang.String.compare; it has not
+ // OSR-entry and therefore, we generate a slow version for OSR's
+@@ -2751,7 +3015,7 @@
+ }
+
+
+-void LIR_Assembler::align_call(LIR_Code code) {
++void LIR_Assembler::align_call(LIR_Code code, bool is_tail_call) {
+ if (os::is_MP()) {
+ // make sure that the displacement word of the call ends up word aligned
+ int offset = __ offset();
+@@ -2759,9 +3023,12 @@
+ case lir_static_call:
+ case lir_optvirtual_call:
+ offset += NativeCall::displacement_offset;
+- break;
++ break;
+ case lir_icvirtual_call:
+- offset += NativeCall::displacement_offset + NativeMovConstReg::instruction_size;
++ if (is_tail_call)
++ offset += NativeCall::displacement_offset + NativeMovConstReg::instruction_size+NativeMovConstProtectionDomain::instruction_size;
++ else
++ offset += NativeCall::displacement_offset + NativeMovConstReg::instruction_size;
+ break;
+ case lir_virtual_call: // currently, sparc-specific for niagara
+ default: ShouldNotReachHere();
+@@ -2772,17 +3039,62 @@
+ }
+ }
+
++void LIR_Assembler::set_protection_domain_token() {
++ // needs 7 bytes on x86-32
++ __ movoop(Address(rsp, 0), (jobject)Universe::non_oop_word());
++}
+
+ void LIR_Assembler::call(address entry, relocInfo::relocType rtype, CodeEmitInfo* info) {
+- assert(!os::is_MP() || (__ offset() + NativeCall::displacement_offset) % BytesPerWord == 0,
++ assert(!os::is_MP() || ((__ offset() + NativeCall::displacement_offset) % BytesPerWord == 0),
+ "must be aligned");
+- __ call(AddressLiteral(entry, rtype));
++ if (entry == SharedRuntime::get_resolve_not_sibling_static_tail_call_stub()) {
++ assert(rtype == relocInfo::static_call_type, "expect static call");
++ RelocationHolder rh = static_call_Relocation::spec(relocInfo::not_sibling_tail_call_type);
++ __ call(AddressLiteral(entry, rh));
++ } else if (entry == SharedRuntime::get_resolve_static_tail_call_stub() ) {
++ assert(rtype==relocInfo::static_call_type, "expect static call");
++ RelocationHolder rh = static_call_Relocation::spec(relocInfo::sibling_tail_call_type);
++ __ call(AddressLiteral(entry, rh));
++ } else if (entry == SharedRuntime::get_resolve_opt_virtual_tail_call_stub()) {
++ assert(rtype==relocInfo::opt_virtual_call_type, "expect opt virtual call");
++ RelocationHolder rh = opt_virtual_call_Relocation::spec(relocInfo::sibling_tail_call_type);
++ __ call(AddressLiteral(entry, rh));
++ } else if (entry == SharedRuntime::get_resolve_opt_not_sibling_virtual_tail_call_stub()) {
++ assert(rtype==relocInfo::opt_virtual_call_type, "expect opt virtual call");
++ RelocationHolder rh = opt_virtual_call_Relocation::spec(relocInfo::not_sibling_tail_call_type);
++ __ call(AddressLiteral(entry, rh));
++ } else {
++ __ call(AddressLiteral(entry, rtype));
++ }
+ add_call_info(code_offset(), info);
+ }
+
++static relocInfo::tailCallType tail_call_type_from_resolve_stub(address entry) {
++
++ assert((entry == SharedRuntime::get_resolve_virtual_tail_call_stub()) ||
++ (entry == SharedRuntime::get_resolve_virtual_call_stub()) ||
++ (entry == SharedRuntime::get_resolve_not_sibling_virtual_tail_call_stub()),
++ "must be virtual resolve stub");
++
++ relocInfo::tailCallType type = relocInfo::not_tail_call;
++ if (entry == SharedRuntime::get_resolve_virtual_tail_call_stub()) {
++ type = relocInfo::sibling_tail_call_type;
++
++ } else if (entry == SharedRuntime::get_resolve_not_sibling_virtual_tail_call_stub()) {
++ type = relocInfo::not_sibling_tail_call_type;
++ }
+
+-void LIR_Assembler::ic_call(address entry, CodeEmitInfo* info) {
+- RelocationHolder rh = virtual_call_Relocation::spec(pc());
++ return type;
++}
++
++void LIR_Assembler::ic_call(address entry, CodeEmitInfo* info, bool is_tail_call) {
++
++ if (is_tail_call) {
++ set_protection_domain_token();
++ }
++
++ relocInfo::tailCallType type = tail_call_type_from_resolve_stub(entry);
++ RelocationHolder rh = virtual_call_Relocation::spec(pc(), NULL, type);
+ __ movoop(IC_Klass, (jobject)Universe::non_oop_word());
+ assert(!os::is_MP() ||
+ (__ offset() + NativeCall::displacement_offset) % BytesPerWord == 0,
+diff --git a/src/cpu/x86/vm/c1_LIRAssembler_x86.hpp b/src/cpu/x86/vm/c1_LIRAssembler_x86.hpp
+--- a/src/cpu/x86/vm/c1_LIRAssembler_x86.hpp
++++ b/src/cpu/x86/vm/c1_LIRAssembler_x86.hpp
+@@ -36,6 +36,9 @@
+ address float_constant(float f);
+ address double_constant(double d);
+
++
++ void emit_tail_call_argument_move(int arg_slots);
++
+ bool is_literal_address(LIR_Address* addr);
+
+ // When we need to use something other than rscratch1 use this
+@@ -51,5 +54,9 @@
+
+ enum { call_stub_size = NOT_LP64(15) LP64_ONLY(28),
+ exception_handler_size = DEBUG_ONLY(1*K) NOT_DEBUG(175),
++ static_tail_call_stub_size = NOT_LP64(64) LP64_ONLY(xxx), //TODO: proper size ;)
++ static_not_sibling_tail_call_stub_size = NOT_LP64(300) LP64_ONLY(xxx),
++ monomorphic_tail_call_stub_size = NOT_LP64(128) LP64_ONLY(xxx),
++ monomorphic_not_sibling_tail_call_stub_size = NOT_LP64(300) LP64_ONLY(xxx),
+ deopt_handler_size = NOT_LP64(10) LP64_ONLY(17)
+ };
+diff --git a/src/cpu/x86/vm/frame_x86.hpp b/src/cpu/x86/vm/frame_x86.hpp
+--- a/src/cpu/x86/vm/frame_x86.hpp
++++ b/src/cpu/x86/vm/frame_x86.hpp
+@@ -109,7 +109,9 @@
+ interpreter_frame_cache_offset = interpreter_frame_mdx_offset - 1,
+ interpreter_frame_locals_offset = interpreter_frame_cache_offset - 1,
+ interpreter_frame_bcx_offset = interpreter_frame_locals_offset - 1,
+- interpreter_frame_initial_sp_offset = interpreter_frame_bcx_offset - 1,
++ interpreter_frame_osr_offset = interpreter_frame_bcx_offset - 1,
++ //interpreter_frame_initial_sp_offset = interpreter_frame_bcx_offset - 1,
++ interpreter_frame_initial_sp_offset = interpreter_frame_osr_offset - 1,
+
+ interpreter_frame_monitor_block_top_offset = interpreter_frame_initial_sp_offset,
+ interpreter_frame_monitor_block_bottom_offset = interpreter_frame_initial_sp_offset,
+diff --git a/src/cpu/x86/vm/frame_x86.inline.hpp b/src/cpu/x86/vm/frame_x86.inline.hpp
+--- a/src/cpu/x86/vm/frame_x86.inline.hpp
++++ b/src/cpu/x86/vm/frame_x86.inline.hpp
+@@ -191,6 +191,9 @@
+ #else /* asm interpreter */
+ inline intptr_t* frame::sender_sp() const { return addr_at( sender_sp_offset); }
+
++inline int32_t* frame::interpreter_frame_osr_addr() const {
++ return (int32_t*)addr_at(interpreter_frame_osr_offset);
++}
+ inline intptr_t** frame::interpreter_frame_locals_addr() const {
+ return (intptr_t**)addr_at(interpreter_frame_locals_offset);
+ }
+diff --git a/src/cpu/x86/vm/interp_masm_x86_32.cpp b/src/cpu/x86/vm/interp_masm_x86_32.cpp
+--- a/src/cpu/x86/vm/interp_masm_x86_32.cpp
++++ b/src/cpu/x86/vm/interp_masm_x86_32.cpp
+@@ -588,11 +588,15 @@
+
+ // Jump to from_interpreted entry of a call unless single stepping is possible
+ // in this thread in which case we must call the i2i entry
+-void InterpreterMacroAssembler::jump_from_interpreted(Register method, Register temp) {
++void InterpreterMacroAssembler::jump_from_interpreted(Register method, Register temp, bool is_tail_call) {
++ // the tail call code sets its own sender sp, so it is only set here for regular calls
+ // set sender sp
+- lea(rsi, Address(rsp, wordSize));
+- // record last_sp
+- movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), rsi);
++ //leal(rsi, Address(rsp, wordSize));
++ if (is_tail_call==false) {
++ leal(rsi, Address(rsp, wordSize));
++ // record last_sp
++ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), rsi);
++ }
+
+ if (JvmtiExport::can_post_interpreter_events()) {
+ Label run_compiled_code;
+diff --git a/src/cpu/x86/vm/interp_masm_x86_32.hpp b/src/cpu/x86/vm/interp_masm_x86_32.hpp
+--- a/src/cpu/x86/vm/interp_masm_x86_32.hpp
++++ b/src/cpu/x86/vm/interp_masm_x86_32.hpp
+@@ -161,7 +161,7 @@
+
+
+ // jump to an invoked target
+- void jump_from_interpreted(Register method, Register temp);
++ void jump_from_interpreted(Register method, Register temp, bool is_tail_call=false);
+
+ // Returning from interpreted functions
+ //
+diff --git a/src/cpu/x86/vm/nativeInst_x86.cpp b/src/cpu/x86/vm/nativeInst_x86.cpp
+--- a/src/cpu/x86/vm/nativeInst_x86.cpp
++++ b/src/cpu/x86/vm/nativeInst_x86.cpp
+@@ -114,7 +114,18 @@
+
+ }
+
+-
++void NativeCall::replace_with_jmp_mt_safe(address instr_addr, address jmp_dest) {
++ // Assemble a jmp rel32 in a scratch buffer: the opcode byte followed by the displacement
++ // bytes, measured from instr_addr + return_address_offset, then patch via replace_mt_safe.
++ unsigned char patch[5];
++ int32_t displacement = jmp_dest - (instr_addr+NativeCall::return_address_offset);
++ unsigned char * displacement_bytes = (unsigned char *)&displacement;
++ patch[0] = NativeJump::instruction_code;
++ for (int i = 0; i < 4; i++) {
++ patch[1 + i] = displacement_bytes[i];
++ }
++ replace_mt_safe(instr_addr, patch);
++}
+ // Similar to replace_mt_safe, but just changes the destination. The
+ // important thing is that free-running threads are able to execute this
+ // call instruction at all times. If the displacement field is aligned
+@@ -199,6 +210,21 @@
+ }
+
+
++void NativeMovConstProtectionDomain::verify() {
++#ifdef AMD64
++ assert(false, "not implemented");
++#else
++ u_char test_byte = *(u_char*)instruction_address();
++ if (test_byte != instruction_code) fatal("not a mov (%reg, #) imm32");
++#endif
++}
++
++void NativeMovConstProtectionDomain::print() {
++ tty->print_cr(PTR_FORMAT ": mov (%reg,#), " INTPTR_FORMAT,
++ instruction_address(), data());
++}
++
++
+ void NativeMovConstReg::verify() {
+ #ifdef AMD64
+ // make sure code pattern is actually a mov reg64, imm64 instruction
+@@ -565,8 +591,6 @@
+
+ }
+
+-
+-
+ address NativeGeneralJump::jump_destination() const {
+ int op_code = ubyte_at(0);
+ bool is_rel32off = (op_code == 0xE9 || op_code == 0x0F);
+diff --git a/src/cpu/x86/vm/nativeInst_x86.hpp b/src/cpu/x86/vm/nativeInst_x86.hpp
+--- a/src/cpu/x86/vm/nativeInst_x86.hpp
++++ b/src/cpu/x86/vm/nativeInst_x86.hpp
+@@ -154,6 +154,8 @@
+ static void insert(address code_pos, address entry);
+
+ static void replace_mt_safe(address instr_addr, address code_buffer);
++ // MT-safe patch to jmp (rel32) instruction.
++ static void replace_with_jmp_mt_safe(address instr_addr, address jmp_dest);
+ };
+
+ inline NativeCall* nativeCall_at(address address) {
+@@ -172,6 +174,48 @@
+ return call;
+ }
+
++// An interface for accessing/manipulating mov (%esp,offset), imm32
++// protection domain token instructions.
++class NativeMovConstProtectionDomain : public NativeInstruction {
++ public:
++ enum Intel_specific_constants {
++ instruction_code = 0xC7,
++ instruction_size = 1 + 2 + wordSize,
++ instruction_offset = 0,
++ data_offset = 1 + 2,
++ next_instruction_offset = instruction_size
++ };
++
++ address instruction_address() const { return addr_at(instruction_offset); }
++ address next_instruction_address() const { return addr_at(next_instruction_offset); }
++ oop * oop_address() const { return (oop*)addr_at(data_offset); }
++ intptr_t data() const { return ptr_at(data_offset); }
++ void set_data(intptr_t x) { set_ptr_at(data_offset, x); }
++
++ void verify();
++ void print();
++
++ // Creation
++ inline friend NativeMovConstProtectionDomain* nativeMovConstPD_at(address address);
++ inline friend NativeMovConstProtectionDomain* nativeMovConstPD_before(address address);
++};
++
++inline NativeMovConstProtectionDomain* nativeMovConstPD_at(address address) {
++ NativeMovConstProtectionDomain* test = (NativeMovConstProtectionDomain*)(address - NativeMovConstProtectionDomain::instruction_offset);
++#ifdef ASSERT
++ test->verify();
++#endif
++ return test;
++}
++
++inline NativeMovConstProtectionDomain* nativeMovConstPD_before(address address) {
++ NativeMovConstProtectionDomain* test = (NativeMovConstProtectionDomain*)(address - NativeMovConstProtectionDomain::instruction_size - NativeMovConstProtectionDomain::instruction_offset);
++#ifdef ASSERT
++ test->verify();
++#endif
++ return test;
++}
++
+ // An interface for accessing/manipulating native mov reg, imm32 instructions.
+ // (used to manipulate inlined 32bit data dll calls, etc.)
+ class NativeMovConstReg: public NativeInstruction {
+diff --git a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp
+--- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp
++++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp
+@@ -38,6 +38,12 @@
+ RuntimeStub* SharedRuntime::_resolve_opt_virtual_call_blob;
+ RuntimeStub* SharedRuntime::_resolve_virtual_call_blob;
+ RuntimeStub* SharedRuntime::_resolve_static_call_blob;
++RuntimeStub* SharedRuntime::_resolve_static_tail_call_blob;
++RuntimeStub* SharedRuntime::_resolve_not_sibling_static_tail_call_blob;
++RuntimeStub* SharedRuntime::_resolve_virtual_tail_call_blob;
++RuntimeStub* SharedRuntime::_resolve_not_sibling_virtual_tail_call_blob;
++RuntimeStub* SharedRuntime::_resolve_opt_virtual_tail_call_blob;
++RuntimeStub* SharedRuntime::_resolve_opt_not_sibling_virtual_tail_call_blob;
+
+ const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
+
+@@ -318,6 +324,13 @@
+ return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
+ }
+
++
++// Tail call support: number of slots used on top of stack for the protection
++// domain token: one when TailCalls is enabled, otherwise none.
++int SharedRuntime::tail_call_protection_domain_slots() {
++ if (TailCalls) return 1;
++ else return 0;
++}
+ // ---------------------------------------------------------------------------
+ // Read the array of BasicTypes from a signature, and compute where the
+ // arguments should go. Values in the VMRegPair regs array refer to 4-byte
+@@ -349,12 +362,14 @@
+ VMRegPair *regs,
+ int total_args_passed,
+ int is_outgoing) {
+- uint stack = 0; // Starting stack position for args on stack
++ uint stack = tail_call_protection_domain_slots(); // Starting stack position for args on stack
+
+
+ // Pass first two oop/int args in registers ECX and EDX.
+ uint reg_arg0 = 9999;
+ uint reg_arg1 = 9999;
++ // +TailCall edx is used for security token
++ // uint reg_arg1 = 0;
+
+ // Pass first two float/double args in registers XMM0 and XMM1.
+ // Doubles have precedence, so if you pass a mix of floats and doubles
+@@ -383,7 +398,7 @@
+ stack += 2;
+ }
+ }
+- int dstack = 0; // Separate counter for placing doubles
++ int dstack = tail_call_protection_domain_slots(); // Separate counter for placing doubles
+
+ // Now pick where all else goes.
+ for( i = 0; i < total_args_passed; i++) {
+@@ -538,7 +553,8 @@
+ int comp_args_on_stack,
+ const BasicType *sig_bt,
+ const VMRegPair *regs,
+- Label& skip_fixup) {
++ Label& skip_fixup,
++ bool is_tail_call) {
+ // Before we get into the guts of the C2I adapter, see if we should be here
+ // at all. We've come from compiled code and are attempting to jump to the
+ // interpreter, which means the caller made a static call to get here
+@@ -566,7 +582,8 @@
+ __ pop(rax);
+
+ // set senderSP value
+- __ movptr(rsi, rsp);
++ if (is_tail_call==false)
++ __ movptr(rsi, rsp);
+
+ __ subptr(rsp, extraspace);
+
+@@ -913,6 +930,163 @@
+ gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
+
+ // -------------------------------------------------------------------------
++ // Generate tail call C2I adapters.
++
++ assert (SharedRuntime::out_preserve_stack_slots() == 0, "This code assumes that there are not preserved stack slots");
++ Label skip_fixup_tail_call;
++ Label parent_is_not_interpreted;
++ Label static_tail_call;
++ Label static_tail_call_not_sibling;
++ Label continue_in_interpreter;
++ Label skip_fixup_tailcall;
++ Label skip_fixup;
++ address c2i_unverified_not_sibling_tail_call_entry = __ pc();
++ {
++ // Unverified entry for not-sibling tail calls: inline-cache check on the receiver.
++ Label missed;
++ Register holder = rax;
++ Register receiver = rcx;
++ Register temp = rbx;
++ // Load the receiver's klass into temp.
++ __ verify_oop(holder);
++ __ movl(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
++ __ verify_oop(temp);
++ // Compare with the klass recorded in the holder; movl keeps the flags for the jcc.
++ __ cmpl(temp, Address(holder, compiledICHolderOopDesc::holder_klass_offset()));
++ __ movl(rbx, Address(holder, compiledICHolderOopDesc::holder_method_offset()));
++ __ jcc(Assembler::notEqual, missed);
++ // Method might have been compiled since the call site was patched to
++ // interpreted; if that is the case, treat it as a miss so we can get
++ // the call site corrected.
++ __ cmpl(Address(rbx, in_bytes(methodOopDesc::code_offset())), NULL_WORD);
++ __ jcc(Assembler::equal, static_tail_call_not_sibling);
++ // Klass mismatch or non-NULL code field: hand over to the IC miss stub.
++ __ bind(missed);
++ __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
++ }
++
++ address c2i_static_not_sibling_tail_call_entry = __ pc();
++ {
++ //__ warn("c2i_not_sibling_tail_call_entry");
++ patch_callers_callsite(masm);
++ __ verify_oop(rbx); // rbx should contain the methodOop of the callee.
++ Register tmp = rax; // rax (IC_klass) is not used for static calls.
++ __ bind(static_tail_call_not_sibling); // The unverified entry joins here after its IC check.
++ __ parent_is_not_interpreter_jcc(tmp, continue_in_interpreter);
++ //__ warn("c2i_not_sibling_tail_call_entry: parent is interpreted");
++ // Parent is interpreted: can use the code path of the static tail call.
++ // It moves the arguments relative to the last_sp of the parent frame.
++ __ jmp(static_tail_call);
++ __ bind(continue_in_interpreter);
++ //__ warn("c2i_not_sibling_tail_call_entry: parent is compiled: continue in interpreted");
++ // Leave an interpreter frame - lazily create an interpreter frame (the callee frame).
++ // Since we want to guarantee an interpreter frame on the stack we turn off
++ // on stack replacement (OSR) for one run (frame) of the called function.
++ // See InterpreterGenerator::generate_native_entry(bool synchronized) for the
++ // corresponding handshake code.
++ const Address disable_osr_for_frame(tmp,
++ in_bytes(JavaThread::disable_osr_for_frame_offset()));
++ __ push(tmp); // Probably don't have to save it. Just to be safe.
++ __ get_thread(tmp);
++ __ movbool(disable_osr_for_frame, true);
++ __ pop(tmp);
++ __ jmp(skip_fixup); // to interpreter.
++ }
++
++ address c2i_unverified_tail_call_entry = __ pc();
++ {
++ // Unverified entry for (sibling) tail calls: inline-cache check on the receiver.
++ Label missed;
++ Register holder = rax;
++ Register receiver = rcx;
++ Register temp = rbx;
++ // Load the receiver's klass into temp.
++ __ verify_oop(holder);
++ __ movl(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
++ __ verify_oop(temp);
++ // Compare with the klass recorded in the holder; movl keeps the flags for the jcc.
++ __ cmpl(temp, Address(holder, compiledICHolderOopDesc::holder_klass_offset()));
++ __ movl(rbx, Address(holder, compiledICHolderOopDesc::holder_method_offset()));
++ __ jcc(Assembler::notEqual, missed);
++ // Method might have been compiled since the call site was patched to
++ // interpreted; if that is the case, treat it as a miss so we can get
++ // the call site corrected.
++ __ cmpl(Address(rbx, in_bytes(methodOopDesc::code_offset())), NULL_WORD);
++ __ jcc(Assembler::equal, static_tail_call);
++ // Klass mismatch or non-NULL code field: hand over to the IC miss stub.
++ __ bind(missed);
++ __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
++ }
++
++ // Static tail call entry: We know that the protection domain and klass are
++ // correct.
++
++ address static_tail_call_c2i_entry = __ pc();
++ patch_callers_callsite(masm);
++ __ bind (static_tail_call);
++ // Check whether parent frame is interpreter.
++ Register tmp = rax; // Klass not used if static call.
++ __ parent_is_not_interpreter_jcc(tmp, parent_is_not_interpreted);
++ //__ warn("c2i_static_tail_call_entry: parent is interpreted");
++ // Parent is interpreted
++ // Save rax and rbx: both are used as scratch registers below.
++ __ push(rax);
++ __ push(rbx);
++ // Store the parent's saved rbp (link) on the stack.
++ __ movl(tmp, Address(rbp, frame::link_offset*wordSize));
++ __ push(tmp);
++ // Store ret address.
++ __ movl(tmp, Address(rbp, frame::return_addr_offset * wordSize));
++ __ push(tmp);
++ // Get last_sp from parent frame (last_sp lives in rax, tmp moves to rbx).
++ Register last_sp = tmp; tmp = rbx;
++ __ movl(tmp, Address(rbp, frame::link_offset * wordSize)); // old rbp
++ __ movl(last_sp, Address(tmp, frame::interpreter_frame_last_sp_offset * wordSize));
++ __ movl(rsi, last_sp); // old_sp for interpreter
++ // Shuffle arguments: copy the stack arguments to the slots just below last_sp.
++ for (int src_slot = comp_args_on_stack, dest_slot=-1; src_slot > 0; src_slot--, dest_slot--) {
++ // rax,rbx,old_rbp,old_retaddr => +4
++ Address src(rsp, VMRegImpl::stack_slot_size * (4+src_slot));
++ Address dest(last_sp, VMRegImpl::stack_slot_size * (dest_slot));
++ __ movl(tmp, src);
++ __ movl(dest, tmp);
++ }
++ // Set return address: it goes into the slot below the moved arguments.
++ __ subl(last_sp, (1+comp_args_on_stack)*wordSize);
++ __ pop(tmp);
++ __ movl(Address(last_sp, 0), tmp);
++ // Set rbp to the link value pushed above.
++ __ pop(rbp);
++ // Restore rbx; the saved rax slot is not popped, it is dropped when rsp is reset below.
++ __ pop(rbx);
++ // Set new rsp.
++ __ movl(rsp, last_sp);
++ // Set new rbp
++ // BUG: might be overwritten !!!???!!! see not sibling entry point. push rbp
++ // at the beginning.
++ //__ movl(rbp, Address(rbp, frame::link_offset * wordSize)); // old rbp
++ __ jmp(skip_fixup_tail_call);
++
++ // Parent is compiled.
++ __ bind(parent_is_not_interpreted);
++ tmp = rax;
++ for (int slot = 1; slot <= comp_args_on_stack; slot++) {
++ Address src (rsp, VMRegImpl::stack_slot_size * (SharedRuntime::out_preserve_stack_slots()+slot));
++ // Need to add the saved rbp slot, hence slot+1 VVV
++ Address dest(rbp, VMRegImpl::stack_slot_size * (SharedRuntime::out_preserve_stack_slots()+slot+1));
++ __ movl(tmp, src);
++ __ movl(dest, tmp);
++ }
++ // pop frame
++ __ leave();
++ // set old_sp
++ __ leal(rsi,Address(rsp, wordSize)); // ret addr on stack
++ // jump to normal c2i entry
++ __ jmp(skip_fixup_tail_call);
++ gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup_tail_call, true);
++ // End of static tail call entry.
++
++ // -------------------------------------------------------------------------
+ // Generate a C2I adapter. On entry we know rbx, holds the methodOop during calls
+ // to the interpreter. The args start out packed in the compiled layout. They
+ // need to be unpacked into the interpreter layout. This will almost always
+@@ -922,7 +1096,7 @@
+ // compiled code, which relys solely on SP and not EBP, get sick).
+
+ address c2i_unverified_entry = __ pc();
+- Label skip_fixup;
++ //Label skip_fixup;
+
+ Register holder = rax;
+ Register receiver = rcx;
+@@ -948,13 +1122,21 @@
+ __ bind(missed);
+ __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
+ }
+-
++ // For tail calls (lazy adapter/ interpreter frame) we skip the fix up check.
++ address c2i_entry_skip_fixup = __ pc();
++ __ jmp(continue_in_interpreter);
++ //__ jmp(skip_fixup);
++
+ address c2i_entry = __ pc();
+
+- gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
++ gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup, false);
+
+ __ flush();
+- return new AdapterHandlerEntry(i2c_entry, c2i_entry, c2i_unverified_entry);
++ return new AdapterHandlerEntry(i2c_entry, c2i_entry, c2i_unverified_entry,
++ static_tail_call_c2i_entry, c2i_unverified_tail_call_entry,
++ c2i_entry_skip_fixup,
++ c2i_static_not_sibling_tail_call_entry,
++ c2i_unverified_not_sibling_tail_call_entry);
+ }
+
+ int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
+@@ -1453,7 +1635,9 @@
+ // sure we can capture all the incoming oop args from the
+ // caller.
+ //
+- OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
++ //OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
++ OopMap* map = new OopMap((tail_call_protection_domain_slots()+stack_slots) * 2, 0 /* arg_slots*/);
++ // oopmap that copes with size of tail_call_pd_slots
+
+ // Mark location of rbp,
+ // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, rbp->as_VMReg());
+@@ -2952,7 +3136,7 @@
+ // but since this is generic code we don't know what they are and the caller
+ // must do any gc of the args.
+ //
+-static RuntimeStub* generate_resolve_blob(address destination, const char* name) {
++static RuntimeStub* generate_resolve_blob(address destination, const char* name, bool is_tail_call=false) {
+ assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
+
+ // allocate space for the code
+@@ -3007,10 +3191,24 @@
+
+ __ movptr(Address(rsp, RegisterSaver::rax_offset() * wordSize), rax);
+
+- RegisterSaver::restore_live_registers(masm);
++ //RegisterSaver::restore_live_registers(masm);
+
+ // We are back the the original state on entry and ready to go.
+-
++ /* Don't need to do the following here since this is performed by jumping to
++ verified_static_tail_call_code_entry.
++ if (is_tail_call) {
++ Label normal_call;
++ __ movl(rbx, Address(rbx, methodOopDesc::code_offset()));
++ __ cmpl(rax, Address(rbx, nmethod::verified_entry_point_offset()));
++ __ jcc(Assembler::notEqual, normal_call);
++ // Remove caller's stack frame.
++ RegisterSaver::restore_live_registers(masm);
++ __ leave();
++ // Tail call.
++ __ jmp(rax);
++ __ bind (normal_call);
++ }*/
++ RegisterSaver::restore_live_registers(masm);
+ __ jmp(rax);
+
+ // Pending exception after the safepoint
+@@ -3052,6 +3250,23 @@
+ _resolve_static_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_static_call_C),
+ "resolve_static_call");
+
++ _resolve_static_tail_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_static_tail_call_C),
++ "resolve_static_tail_call", true);
++
++ _resolve_not_sibling_static_tail_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_not_sibling_static_tail_call_C),
++ "resolve_not_sibling_static_tail_call", true);
++
++ _resolve_virtual_tail_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_virtual_tail_call_C),
++ "resolve_virtual_tail_call", true);
++
++ _resolve_not_sibling_virtual_tail_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_not_sibling_virtual_tail_call_C),
++ "resolve_not_sibling_virtual_tail_call", true);
++
++ _resolve_opt_virtual_tail_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_opt_virtual_tail_call_C),
++ "resolve_opt_virtual_tail_call", true);
++ _resolve_opt_not_sibling_virtual_tail_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_opt_not_sibling_virtual_tail_call_C),
++ "resolve_opt_not_sibling_virtual_tail_call", true);
++
+ _polling_page_safepoint_handler_blob =
+ generate_handler_blob(CAST_FROM_FN_PTR(address,
+ SafepointSynchronize::handle_polling_page_exception), false);
+diff --git a/src/cpu/x86/vm/templateInterpreterGenerator_x86.hpp b/src/cpu/x86/vm/templateInterpreterGenerator_x86.hpp
+--- a/src/cpu/x86/vm/templateInterpreterGenerator_x86.hpp
++++ b/src/cpu/x86/vm/templateInterpreterGenerator_x86.hpp
+@@ -24,6 +24,6 @@
+
+ protected:
+
+- void generate_fixed_frame(bool native_call);
++void generate_fixed_frame(bool native_call, bool disable_osr=false);
+
+ // address generate_asm_interpreter_entry(bool synchronized);
+diff --git a/src/cpu/x86/vm/templateInterpreter_x86_32.cpp b/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
+--- a/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
++++ b/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
+@@ -132,7 +132,8 @@
+ address TemplateInterpreterGenerator::generate_return_entry_for(TosState state, int step) {
+ Label interpreter_entry;
+ address compiled_entry = __ pc();
+-
++ // Tail call invokes have wide prefix so their length is one bigger.
++ bool is_tail_call = (step == 4) || (step==6);
+ #ifdef COMPILER2
+ // The FPU stack is clean if UseSSE >= 2 but must be cleaned in other cases
+ if ((state == ftos && UseSSE < 1) || (state == dtos && UseSSE < 2)) {
+@@ -180,7 +181,10 @@
+
+ __ restore_bcp();
+ __ restore_locals();
+- __ get_cache_and_index_at_bcp(rbx, rcx, 1);
++ if (is_tail_call)
++ __ get_cache_and_index_at_bcp(rbx, rcx, 2);
++ else
++ __ get_cache_and_index_at_bcp(rbx, rcx, 1);
+ __ movl(rbx, Address(rbx, rcx,
+ Address::times_ptr, constantPoolCacheOopDesc::base_offset() +
+ ConstantPoolCacheEntry::flags_offset()));
+@@ -526,7 +530,7 @@
+ // Generate a fixed interpreter frame. This is identical setup for interpreted methods
+ // and for native methods hence the shared code.
+
+-void TemplateInterpreterGenerator::generate_fixed_frame(bool native_call) {
++void TemplateInterpreterGenerator::generate_fixed_frame(bool native_call, bool disable_osr) {
+ // initialize fixed part of activation frame
+ __ push(rax); // save return address
+ __ enter(); // save old & set new rbp,
+@@ -558,6 +562,11 @@
+ } else {
+ __ push(rsi); // set bcp
+ }
++ // OSR token
++ if (disable_osr)
++ __ push(1);
++ else
++ __ push(0);
+ __ push(0); // reserve word for pointer to expression stack bottom
+ __ movptr(Address(rsp, 0), rsp); // set expression stack bottom
+ }
+@@ -749,8 +758,38 @@
+ if (inc_counter) __ movl(rcx, invocation_counter); // (pre-)fetch invocation count
+ // initialize fixed part of activation frame
+
++ // Tail calls want to disable OSR if we are coming from a compiled frame that
++ // tried to do a non sibling tail call and failed because the parent was not
++ // interpreted.
++ Label disable_osr_frame_generated;
++ if (TailCalls) {
++ Register temp = rax;
++ Label normal_frame;
++ // Save the temp register.
++ __ push(temp);
++ __ get_thread(temp);
++ const Address disable_osr_for_frame(temp,
++ in_bytes(JavaThread::disable_osr_for_frame_offset()));
++ __ movbool(temp, disable_osr_for_frame); // temp = the thread's disable-OSR flag
++ __ testbool(temp);
++ __ jcc(Assembler::zero, normal_frame); // flag clear: build the ordinary frame
++ // Generate a frame which temporarily disables OSR.
++ __ pop(temp); // restore register
++ generate_fixed_frame(true, true);
++ if (TraceTailCalls) __ warn("saw disabled osr");
++ __ push(temp);
++ // Turn OSR back on (that is, if DoOnStackReplacement is set to true).
++ __ get_thread(temp);
++ __ movbool(disable_osr_for_frame, false);
++ __ pop(temp);
++ __ jmp(disable_osr_frame_generated);
++ __ bind(normal_frame);
++ // Restore the temp register.
++ __ pop(temp);
++ }
++
+ generate_fixed_frame(true);
+-
++ __ bind(disable_osr_frame_generated);
+ // make sure method is native & not abstract
+ #ifdef ASSERT
+ __ movl(rax, access_flags);
+diff --git a/src/cpu/x86/vm/templateTable_x86_32.cpp b/src/cpu/x86/vm/templateTable_x86_32.cpp
+--- a/src/cpu/x86/vm/templateTable_x86_32.cpp
++++ b/src/cpu/x86/vm/templateTable_x86_32.cpp
+@@ -1699,6 +1699,21 @@
+
+ // invocation counter overflow
+ __ bind(backedge_counter_overflow);
++
++ if (TailCalls) {
++ Label continue_osr;
++ // Tail calls might disable OSR during an execution of one method in
++ // the interpreter: a non-zero OSR token in the frame means "skip OSR".
++ __ push (rcx);
++ __ movl(rcx, Address(rbp, frame::interpreter_frame_osr_offset * wordSize)); // load the frame's OSR token
++ __ testl(rcx, rcx);
++ __ jcc(Assembler::zero, continue_osr);
++ __ pop(rcx);
++ if (TraceTailCalls) __ warn("continue with loop because osr disabled");
++ __ jmp(dispatch);
++ __ bind(continue_osr);
++ __ pop(rcx);
++ }
+ __ negptr(rdx);
+ __ addptr(rdx, rsi); // branch bcp
+ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::frequency_counter_overflow), rdx);
+@@ -2103,7 +2118,7 @@
+ __ membar(order_constraint);
+ }
+
+-void TemplateTable::resolve_cache_and_index(int byte_no, Register Rcache, Register index) {
++void TemplateTable::resolve_cache_and_index(int byte_no, Register Rcache, Register index, bool is_tail_call) {
+ assert(byte_no == 1 || byte_no == 2, "byte_no out of range");
+
+ Register temp = rbx;
+@@ -2112,7 +2127,10 @@
+
+ const int shift_count = (1 + byte_no)*BitsPerByte;
+ Label resolved;
+- __ get_cache_and_index_at_bcp(Rcache, index, 1);
++ if (is_tail_call)
++ __ get_cache_and_index_at_bcp(Rcache, index, 2); // Skip the wide instruction.
++ else
++ __ get_cache_and_index_at_bcp(Rcache, index, 1);
+ __ movl(temp, Address(Rcache,
+ index,
+ Address::times_ptr,
+@@ -2139,7 +2157,10 @@
+ __ movl(temp, (int)bytecode());
+ __ call_VM(noreg, entry, temp);
+ // Update registers with resolved info
+- __ get_cache_and_index_at_bcp(Rcache, index, 1);
++ if (is_tail_call)
++ __ get_cache_and_index_at_bcp(Rcache, index, 2);
++ else
++ __ get_cache_and_index_at_bcp(Rcache, index, 1);
+ __ bind(resolved);
+ }
+
+@@ -2169,6 +2190,7 @@
+ }
+
+ void TemplateTable::load_invoke_cp_cache_entry(int byte_no,
++ bool is_tail_call,
+ Register method,
+ Register itable_index,
+ Register flags,
+@@ -2195,7 +2217,7 @@
+ const int index_offset = in_bytes(constantPoolCacheOopDesc::base_offset() +
+ ConstantPoolCacheEntry::f2_offset());
+
+- resolve_cache_and_index(byte_no, cache, index);
++ resolve_cache_and_index(byte_no, cache, index, is_tail_call);
+
+ __ movptr(method, Address(cache, index, Address::times_ptr, method_offset));
+ if (itable_index != noreg) {
+@@ -2204,6 +2226,23 @@
+ __ movl(flags , Address(cache, index, Address::times_ptr, flags_offset ));
+ }
+
++void TemplateTable::load_invoke_cp_cache_flags(int byte_no,
++ Register Rcache,
++ Register Rindex,
++ Register OutFlags) {
++ // On return OutFlags holds the flags word of the resolved constant pool cache
++ // entry; Rcache and Rindex are used by resolve_cache_and_index.
++ assert_different_registers(OutFlags, Rcache, Rindex);
++ // tail_call is the only caller, so is_tail_call is hard-wired to true here.
++ resolve_cache_and_index(byte_no, Rcache, Rindex, true);
++
++ assert(wordSize == 4, "adjust code below");
++ const int cp_flags_off = in_bytes(constantPoolCacheOopDesc::base_offset() +
++ ConstantPoolCacheEntry::flags_offset());
++ const Address flags_slot(Rcache, Rindex, Address::times_4, cp_flags_off);
++ __ movl(OutFlags, flags_slot);
++}
++
+
+ // The registers cache and index expected to be set before call.
+ // Correct values of the cache and index registers are preserved.
+@@ -2255,7 +2294,7 @@
+ const Register off = rbx;
+ const Register flags = rax;
+
+- resolve_cache_and_index(byte_no, cache, index);
++ resolve_cache_and_index(byte_no, cache, index, false);
+ jvmti_post_field_access(cache, index, is_static, false);
+ load_field_cp_cache_entry(obj, cache, index, off, flags, is_static);
+
+@@ -2464,7 +2503,7 @@
+ const Register off = rbx;
+ const Register flags = rax;
+
+- resolve_cache_and_index(byte_no, cache, index);
++ resolve_cache_and_index(byte_no, cache, index, false);
+ jvmti_post_field_mod(cache, index, is_static);
+ load_field_cp_cache_entry(obj, cache, index, off, flags, is_static);
+
+@@ -2884,7 +2923,7 @@
+ }
+
+
+-void TemplateTable::prepare_invoke(Register method, Register index, int byte_no, Bytecodes::Code code) {
++void TemplateTable::prepare_invoke(Register method, Register index, int byte_no, Bytecodes::Code code, bool is_tail_call) {
+ // determine flags
+ const bool is_invokeinterface = code == Bytecodes::_invokeinterface;
+ const bool is_invokevirtual = code == Bytecodes::_invokevirtual;
+@@ -2900,7 +2939,7 @@
+ // save 'interpreter return address'
+ __ save_bcp();
+
+- load_invoke_cp_cache_entry(byte_no, method, index, flags, is_invokevirtual);
++ load_invoke_cp_cache_entry(byte_no, is_tail_call, method, index, flags, is_invokevirtual);
+
+ // load receiver if needed (note: no return address pushed yet)
+ if (load_receiver) {
+@@ -2926,8 +2965,9 @@
+ ConstantPoolCacheEntry::verify_tosBits();
+ // load return address
+ {
+- ExternalAddress table(is_invokeinterface ? (address)Interpreter::return_5_addrs_by_index_table() :
+- (address)Interpreter::return_3_addrs_by_index_table());
++ int instruction_size = is_invokeinterface ? 5 : 3;
++ if (is_tail_call) instruction_size += 1;
++ ExternalAddress table((address)Interpreter::return_addrs_by_index_table(instruction_size));
+ __ movptr(flags, ArrayAddress(table, Address(noreg, flags, Address::times_ptr)));
+ }
+
+@@ -2942,15 +2982,49 @@
+ }
+ }
+
++// Load into register 'klass' the pool holder of the constant pool of the method
++// oop passed in register 'method'. 'klass' may be the same register as 'method'.
++void TemplateTable::load_pool_holder_of_method(Register method, Register klass) {
++ __ verify_oop(method);
++ __ movl (klass, Address(method, methodOopDesc::constants_offset() )); // klass = the method's constant pool
++ __ verify_oop(klass);
++ __ movl (klass, Address(klass, constantPoolOopDesc::pool_holder_offset_in_bytes())); // klass = pool holder
++ __ verify_oop(klass);
++}
++
++void TemplateTable::jcc_protection_domain_mismatch(Register temp, Register temp2, Register recv, Label& mismatch_cont, bool receiver_holds_klass) {
++ // Branch to mismatch_cont if callee and caller protection domains differ; clobbers temp and temp2.
++ // Get receiver PD: recv is either a klass (receiver_holds_klass) or an object whose klass is loaded first.
++ if (receiver_holds_klass) {
++ __ movl (temp, Address(recv, instanceKlass::protection_domain_offset() * wordSize));
++ } else {
++ __ movl (temp, Address(recv, oopDesc::klass_offset_in_bytes()));
++ __ movl (temp, Address(temp, instanceKlass::protection_domain_offset() * wordSize));
++ }
++ __ verify_oop(temp);
++ // Get caller PD: method of the current interpreter frame -> pool holder -> protection domain.
++ __ movl (temp2, Address(rbp, frame::interpreter_frame_method_offset * wordSize));
++ __ verify_oop(temp2);
++ load_pool_holder_of_method(temp2, temp2);
++ __ movl (temp2, Address(temp2, instanceKlass::protection_domain_offset() * wordSize));
++ __ verify_oop(temp2);
++ __ cmpl(temp, temp2);
++ __ jcc(Assembler::notEqual, mismatch_cont);
++}
+
+ void TemplateTable::invokevirtual_helper(Register index, Register recv,
+- Register flags) {
++ Register flags, int byte_no, bool is_tail_call) {
+
+ // Uses temporary registers rax, rdx
+ assert_different_registers(index, recv, rax, rdx);
+
+ // Test for an invoke of a final method
+ Label notFinal;
++ // Tail call executed as a normal call (protection domain mismatch or parent
++ // frame is not an interpreter frame)
++ Label regular_call_continuation;
++ Label protection_domain_mismatch_cont;
++
+ __ movl(rax, flags);
+ __ andl(rax, (1 << ConstantPoolCacheEntry::vfinalMethod));
+ __ jcc(Assembler::zero, notFinal);
+@@ -2966,8 +3040,13 @@
+
+ // profile this call
+ __ profile_final_call(rax);
+-
+- __ jump_from_interpreted(method, rax);
++
++ // Prepare for tail call.
++ if (is_tail_call) {
++ jcc_protection_domain_mismatch(rax, rdx, recv, protection_domain_mismatch_cont, false);
++ tail_call(byte_no, regular_call_continuation);
++ }
++ __ jump_from_interpreted(method, rax, is_tail_call);
+
+ __ bind(notFinal);
+
+@@ -2984,41 +3063,212 @@
+ const int base = instanceKlass::vtable_start_offset() * wordSize;
+ assert(vtableEntry::size() * wordSize == 4, "adjust the scaling in the code below");
+ __ movptr(method, Address(rax, index, Address::times_ptr, base + vtableEntry::method_offset_in_bytes()));
+- __ jump_from_interpreted(method, rdx);
++
++ if (is_tail_call) {
++ // Check protection domain.
++ jcc_protection_domain_mismatch(rax, rdx, recv, protection_domain_mismatch_cont, false);
++ // Shift arguments onto caller's outgoing parameter area
++ // pop frame.
++ tail_call(byte_no, regular_call_continuation);
++ }
++ __ jump_from_interpreted(method, rdx, is_tail_call);
++
++ // We arrive here if tail call should be performed as regular call.
++ __ bind(regular_call_continuation);
++ __ jump_from_interpreted(method, rdx, false);
++ // Tail call exception on protection domain mismatch
++ __ bind(protection_domain_mismatch_cont);
++ __ restore_bcp();
++ __ call_VM(noreg, CAST_FROM_FN_PTR(address,
++ InterpreterRuntime::throw_TailCallException));
+ }
+
++void TemplateTable::tail_call(int byte_no, Label& regular_call_continuation) {
++ Register temp = rdx;
++ // Moves the outgoing parameters plus the return address down to this frame's
++ // locals pointer and resets rbp/rsp. Only tail call if the parent frame is
++ // interpreted; otherwise we might leave a c2i adapter around that nobody takes care
++ // of (wrong rsp). Also needed for lazy adapter frame creation (not-sibling path).
++ __ parent_is_not_interpreter_jcc(temp, regular_call_continuation);
++ // Assumption: we can use rdx and rdi since they contain no vital info. Rsi is
++ // computed from sender_sp so it is free too. Other registers need to be saved.
++ // Store return_addr, link (old fp), sender_sp (old sp) to top of stack to
++ // prevent those values from being overwritten when moving parameters further
++ // down.
++ __ movl(temp, Address(rbp, frame::return_addr_offset * wordSize));
++ __ movl(Address(rsp, 0), temp); // Save ret addr (overwrites the word at the top of the stack).
++ __ movl(temp, Address(rbp, frame::link_offset * wordSize));
++ __ push(temp); // Save old rbp.
++ __ movl(temp, Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize));
++ __ push(temp); // Save old rsp. needed if caller was compiled - c2i adapter
++ // parameters lurking around!
++ // Save rax, rbx, rcx.
++ __ push(rax); // callVM in resolve_cache_and_index saves?
++ __ push(rbx); // Clobbered by resolve_cache_and_index.
++ __ push(rcx); // callVM saves?
++ // rdx is reused from here on: first for the flags, then as the end marker of the copy.
++ Register param_end = rdx;
++
++ // Clobbers caller saved registers rax,rcx,rdx.
++ load_invoke_cp_cache_flags(byte_no, rdi, rsi, param_end);
++ __ andl(param_end, 0xFF); // keep the low byte of the flags (presumably the parameter size - confirm)
++ __ shll(param_end, 2); // a param needs 4 bytes. REFACTOR.
++ Register source = rsi;
++ Register dest = rdi;
++ temp = rbx;
++ assert_different_registers(param_end, source, dest);
++ assert_different_registers(temp, param_end, source);
++ assert_different_registers(dest, temp);
++
++ // Load copy-from start.
++ __ leal(source, Address(rsp, 5 * wordSize)); // retaddr slot, above the saved rcx, rbx, rax, oldsp, oldbp
++ __ addl(source, param_end);
++ __ leal(param_end, Address(rsp, 4 * wordSize)); // Also copy retaddr: stop at the saved old rbp slot.
++ // Load copy-to destination start.
++ __ movl(dest, Address(rbp, frame::interpreter_frame_locals_offset* wordSize));
++
++ // Copy one word at a time from source to dest, both moving towards lower
++ // addresses, until source reaches param_end. I am sure we can do a better memcpy.
++ { Label exit, loop;
++ // copy
++ __ bind(loop);
++ __ cmpl(param_end, source);
++ __ jcc(Assembler::equal, exit);
++ __ movl(temp, Address(source, 0));
++ __ movl(Address(dest, 0), temp);
++ __ subl(source, wordSize);
++ __ subl(dest, wordSize);
++
++ __ jmp(loop); // next iteration
++ __ bind(exit);
++ }
++
++ // Restore rcx, rbx, rax.
++ __ pop(rcx);
++ __ pop(rbx);
++ __ pop(rax);
++ // Get old stack pointer.
++ __ pop(rsi); // not needed unless our sender was compiled
++ // Get old frame pointer.
++ __ pop(rbp);
++ // Adjust stack pointer to ret addr: dest ended one word below the copied retaddr.
++ __ addl(dest, wordSize);
++ __ movl(rsp, dest);
++}
+
+ void TemplateTable::invokevirtual(int byte_no) {
+ transition(vtos, vtos);
+- prepare_invoke(rbx, noreg, byte_no, bytecode());
++ prepare_invoke(rbx, noreg, byte_no, bytecode(), false);
+
+ // rbx,: index
+ // rcx: receiver
+ // rdx: flags
+
+- invokevirtual_helper(rbx, rcx, rdx);
++ invokevirtual_helper(rbx, rcx, rdx, byte_no);
+ }
+
++void TemplateTable::wide_invokevirtual(int byte_no) {
++ transition(vtos, vtos);
++ // Tail-call form of invokevirtual. Bcp points to the wide prefix and is not advanced here.
++ //__ increment(rsi, 1);
++ if (TraceTailCalls) __ warn("Interpreter: wide_invokevirtual");
++ prepare_invoke(rbx, noreg, byte_no, bytecode(), true); // is_tail_call = true
++ // rbx,: index
++ // rcx: receiver
++ // rdx: flags
++ invokevirtual_helper(rbx, rcx, rdx, byte_no, true);
++}
+
+ void TemplateTable::invokespecial(int byte_no) {
+ transition(vtos, vtos);
+- prepare_invoke(rbx, noreg, byte_no, bytecode());
++ prepare_invoke(rbx, noreg, byte_no, bytecode(), false);
+ // do the call
+ __ verify_oop(rbx);
+ __ profile_call(rax);
+ __ jump_from_interpreted(rbx, rax);
+ }
+
++void TemplateTable::wide_invokespecial(int byte_no) {
++ transition(vtos, vtos);
++ Label regular_call_continuation;
++ Label protection_domain_mismatch_cont;
++ //__ increment(rsi, 1);
++ if (TraceTailCalls) __ warn("Interpreter: wide_invokespecial");
++ prepare_invoke(rbx, noreg, byte_no, bytecode(), true); // is_tail_call = true
++ // Tail-call form of invokespecial: check protection domains, then try the tail call.
++ __ verify_oop(rbx);
++ __ profile_call(rax);
++ __ push(rax); // rax and rdx serve as temps of the protection domain check,
++ __ push(rdx); // rcx receives the method's pool holder.
++ __ push(rcx);
++ // Load the 'receiver' class (i.e. the class holding the method).
++ load_pool_holder_of_method(rbx, rcx);
++ __ verify_oop(rcx);
++ jcc_protection_domain_mismatch(rax, rdx, rcx, protection_domain_mismatch_cont, true);
++ __ pop(rcx);
++ __ pop(rdx);
++ __ pop(rax);
++ tail_call(byte_no, regular_call_continuation);
++ __ jump_from_interpreted(rbx, rax, true);
++ // Not a tail call: tail_call branched here because the parent frame is not interpreted.
++ __ bind (regular_call_continuation);
++ __ jump_from_interpreted(rbx, rax, false);
++ // Tail call exception on protection domain mismatch (the three saved registers are still pushed).
++ __ bind(protection_domain_mismatch_cont);
++ __ pop(rcx);
++ __ pop(rdx);
++ __ pop(rax);
++ __ restore_bcp();
++ __ call_VM(noreg, CAST_FROM_FN_PTR(address,
++ InterpreterRuntime::throw_TailCallException));
++}
+
+ void TemplateTable::invokestatic(int byte_no) {
+ transition(vtos, vtos);
+- prepare_invoke(rbx, noreg, byte_no, bytecode());
++ prepare_invoke(rbx, noreg, byte_no, bytecode(), false);
+ // do the call
+ __ verify_oop(rbx);
+ __ profile_call(rax);
+ __ jump_from_interpreted(rbx, rax);
+ }
+
++void TemplateTable::wide_invokestatic(int byte_no) {
++ transition(vtos, vtos);
++ Label regular_call_continuation;
++ Label protection_domain_mismatch_cont;
++ // Tail-call form of invokestatic. Bcp points to the wide prefix and is not advanced here.
++ //__ increment(rsi, 1);
++ if (TraceTailCalls) __ warn("Interpreter: wide_invokestatic");
++ prepare_invoke(rbx, noreg, byte_no, bytecode(), true); // is_tail_call = true
++ // Check protection domains, then try the tail call.
++ __ verify_oop(rbx);
++ __ profile_call(rax);
++ __ null_check(rcx); // NOTE(review): labelled 'receiver', but a static call has none - confirm this check is intended
++ // Store clobbered registers.
++ __ push(rax);
++ __ push(rdx);
++ __ push(rcx);
++ // Load the 'receiver' class (i.e. the class holding the method).
++ load_pool_holder_of_method(rbx, rcx);
++ __ verify_oop(rcx);
++ jcc_protection_domain_mismatch(rax, rdx, rcx, protection_domain_mismatch_cont, true);
++ __ pop(rcx);
++ __ pop(rdx);
++ __ pop(rax);
++ tail_call(byte_no, regular_call_continuation);
++ __ jump_from_interpreted(rbx, rax, true);
++ // Not a tail call: tail_call branched here because the parent frame is not interpreted.
++ __ bind (regular_call_continuation);
++ __ jump_from_interpreted(rbx, rax, false);
++ // Tail call exception on protection domain mismatch (the three saved registers are still pushed).
++ __ bind(protection_domain_mismatch_cont);
++ __ pop(rcx);
++ __ pop(rdx);
++ __ pop(rax);
++ __ restore_bcp();
++ __ call_VM(noreg, CAST_FROM_FN_PTR(address,
++ InterpreterRuntime::throw_TailCallException));
++}
+
+ void TemplateTable::fast_invokevfinal(int byte_no) {
+ transition(vtos, vtos);
+@@ -3027,8 +3277,23 @@
+
+
+ void TemplateTable::invokeinterface(int byte_no) {
++ invokeinterface_helper(byte_no, false);
++}
++
++void TemplateTable::wide_invokeinterface(int byte_no) {
++ if (TraceTailCalls) __ warn("Interpreter: wide_invokeinterface");
++ invokeinterface_helper(byte_no, true); // is_tail_call = true: tail-call form of invokeinterface
++}
++
++void TemplateTable::invokeinterface_helper(int byte_no, bool is_tail_call) {
+ transition(vtos, vtos);
+- prepare_invoke(rax, rbx, byte_no, bytecode());
++
++ if (is_tail_call) {
++ // Bcp points to wide, advance to invoke instruction.
++ //__ increment(rsi, 1);
++ }
++
++ prepare_invoke(rax, rbx, byte_no, bytecode(), is_tail_call);
+
+ // rax,: Interface
+ // rbx,: index
+@@ -3044,7 +3309,7 @@
+ __ andl(rdi, (1 << ConstantPoolCacheEntry::methodInterface));
+ __ jcc(Assembler::zero, notMethod);
+
+- invokevirtual_helper(rbx, rcx, rdx);
++ invokevirtual_helper(rbx, rcx, rdx, byte_no, is_tail_call);
+ __ bind(notMethod);
+
+ // Get receiver klass into rdx - also a null check
+@@ -3123,10 +3388,35 @@
+ __ bind(L);
+ }
+
++ Label regular_call_continuation;
++ Label protection_domain_mismatch_cont;
++ // If tail calling, pop the stack and move the parameters.
++ if (is_tail_call) {
++ // rax, rdx are clobbered by jcc_protection_domain_mismatch
++ __ push(rax);
++ __ push(rdx);
++ jcc_protection_domain_mismatch(rax, rdx, rcx, protection_domain_mismatch_cont, false);
++ __ pop(rdx);
++ __ pop(rax);
++ // The bcp in rsi has been clobbered but is needed in tail_call.
++ __ restore_bcp();
++ tail_call(byte_no, regular_call_continuation);
++ }
++
+ // do the call
+ // rcx: receiver
+ // rbx,: methodOop
+- __ jump_from_interpreted(rbx, rdx);
++ __ jump_from_interpreted(rbx, rdx, is_tail_call);
++ // not a tail call
++ __ bind (regular_call_continuation);
++ __ jump_from_interpreted(rbx, rax, false);
++ // Tail call exception on protection domain mismatch
++ __ bind(protection_domain_mismatch_cont);
++ __ pop(rdx);
++ __ pop(rax);
++ __ restore_bcp();
++ __ call_VM(noreg, CAST_FROM_FN_PTR(address,
++ InterpreterRuntime::throw_TailCallException));
+ }
+
+ //----------------------------------------------------------------------------------------------------
+diff --git a/src/cpu/x86/vm/templateTable_x86_32.hpp b/src/cpu/x86/vm/templateTable_x86_32.hpp
+--- a/src/cpu/x86/vm/templateTable_x86_32.hpp
++++ b/src/cpu/x86/vm/templateTable_x86_32.hpp
+@@ -22,10 +22,21 @@
+ *
+ */
+
+- static void prepare_invoke(Register method, Register index, int byte_no,
+- Bytecodes::Code code);
++static void prepare_invoke(Register method, Register index, int byte_no, Bytecodes::Code code, bool is_tail_call);
++
+ static void invokevirtual_helper(Register index, Register recv,
+- Register flags);
++ Register flags, int byte_no, bool is_tail_call = false);
++
++ static void invokeinterface_helper(int byte_no, bool is_tail_call = false);
++
++ static void load_pool_holder_of_method(Register method, Register klass);
++ static void jcc_protection_domain_mismatch(Register temp, Register temp2, Register recv, Label& mismatch_cont, bool receiver_holds_klass);
++
++ static void tail_call(int byte_no, Label& regular_call_continuation);
++ // Tail call helper.
++ static void load_invoke_cp_cache_flags(int byte_no, Register Rcache,
++ Register Rindex, Register Flags);
++
+ static void volatile_barrier(Assembler::Membar_mask_bits order_constraint );
+
+ // Helpers
+diff --git a/src/cpu/x86/vm/vtableStubs_x86_32.cpp b/src/cpu/x86/vm/vtableStubs_x86_32.cpp
+--- a/src/cpu/x86/vm/vtableStubs_x86_32.cpp
++++ b/src/cpu/x86/vm/vtableStubs_x86_32.cpp
+@@ -39,9 +39,9 @@
+ // Leave receiver in rcx; required behavior when +OptoArgsInRegisters
+ // is modifed to put first oop in rcx.
+ //
+-VtableStub* VtableStubs::create_vtable_stub(int vtable_index) {
++VtableStub* VtableStubs::create_vtable_stub(int vtable_index, bool is_tail_call, bool is_sibling) {
+ const int i486_code_length = VtableStub::pd_code_size_limit(true);
+- VtableStub* s = new(i486_code_length) VtableStub(true, vtable_index);
++ VtableStub* s = new(i486_code_length) VtableStub(true, vtable_index, is_tail_call, is_sibling);
+ ResourceMark rm;
+ CodeBuffer cb(s->entry_point(), i486_code_length);
+ MacroAssembler* masm = new MacroAssembler(&cb);
+@@ -77,11 +77,21 @@
+
+ // load methodOop and target address
+ __ movptr(method, Address(rax, entry_offset*wordSize + vtableEntry::method_offset_in_bytes()));
++
++ ByteSize method_entry_offset = in_ByteSize(0);
++ if (is_tail_call && is_sibling) {
++ method_entry_offset = methodOopDesc::from_compiled_static_tail_call_offset();
++ } else if (is_tail_call) {
++ method_entry_offset = methodOopDesc::from_compiled_not_sibling_static_tail_call_offset();
++ } else {
++ method_entry_offset = methodOopDesc::from_compiled_offset();
++ }
++
+ if (DebugVtables) {
+ Label L;
+ __ cmpptr(method, (int32_t)NULL_WORD);
+ __ jcc(Assembler::equal, L);
+- __ cmpptr(Address(method, methodOopDesc::from_compiled_offset()), (int32_t)NULL_WORD);
++ __ cmpptr(Address(method, method_entry_offset), (int32_t)NULL_WORD);
+ __ jcc(Assembler::notZero, L);
+ __ stop("Vtable entry is NULL");
+ __ bind(L);
+@@ -91,7 +101,7 @@
+ // method (rbx): methodOop
+ // rcx: receiver
+ address ame_addr = __ pc();
+- __ jmp( Address(method, methodOopDesc::from_compiled_offset()));
++ __ jmp( Address(method, method_entry_offset));
+
+ masm->flush();
+ s->set_exception_points(npe_addr, ame_addr);
+@@ -99,11 +109,11 @@
+ }
+
+
+-VtableStub* VtableStubs::create_itable_stub(int vtable_index) {
++VtableStub* VtableStubs::create_itable_stub(int vtable_index, bool is_tail_call, bool is_sibling) {
+ // Note well: pd_code_size_limit is the absolute minimum we can get away with. If you
+ // add code here, bump the code stub size returned by pd_code_size_limit!
+ const int i486_code_length = VtableStub::pd_code_size_limit(false);
+- VtableStub* s = new(i486_code_length) VtableStub(false, vtable_index);
++ VtableStub* s = new(i486_code_length) VtableStub(false, vtable_index, is_tail_call, is_sibling);
+ ResourceMark rm;
+ CodeBuffer cb(s->entry_point(), i486_code_length);
+ MacroAssembler* masm = new MacroAssembler(&cb);
+@@ -170,13 +180,21 @@
+
+ // method (rbx): methodOop
+ // rcx: receiver
++ ByteSize method_entry_offset = in_ByteSize(0);
++ if (is_tail_call && is_sibling) {
++ method_entry_offset = methodOopDesc::from_compiled_static_tail_call_offset();
++ } else if (is_tail_call) {
++ method_entry_offset = methodOopDesc::from_compiled_not_sibling_static_tail_call_offset();
++ } else {
++ method_entry_offset = methodOopDesc::from_compiled_offset();
++ }
+
+ #ifdef ASSERT
+ if (DebugVtables) {
+ Label L1;
+ __ cmpptr(method, (int32_t)NULL_WORD);
+ __ jcc(Assembler::equal, L1);
+- __ cmpptr(Address(method, methodOopDesc::from_compiled_offset()), (int32_t)NULL_WORD);
++ __ cmpptr(Address(method, method_entry_offset), (int32_t)NULL_WORD);
+ __ jcc(Assembler::notZero, L1);
+ __ stop("methodOop is null");
+ __ bind(L1);
+@@ -184,7 +202,7 @@
+ #endif // ASSERT
+
+ address ame_addr = __ pc();
+- __ jmp(Address(method, methodOopDesc::from_compiled_offset()));
++ __ jmp(Address(method, method_entry_offset));
+
+ __ bind(throw_icce);
+ // Restore saved register
+diff --git a/src/share/tools/hsdis/Makefile b/src/share/tools/hsdis/Makefile
+--- a/src/share/tools/hsdis/Makefile
++++ b/src/share/tools/hsdis/Makefile
+@@ -65,6 +65,7 @@
+ OS = linux
+ CC = gcc
+ CCFLAGS += -O
++LDFLAGS += -ldl
+ DLDFLAGS += -shared
+ OUTFLAGS += -o $@
+ LIB_EXT = .so
+diff --git a/src/share/tools/hsdis/hsdis-demo.c b/src/share/tools/hsdis/hsdis-demo.c
+--- a/src/share/tools/hsdis/hsdis-demo.c
++++ b/src/share/tools/hsdis/hsdis-demo.c
+@@ -221,3 +221,4 @@
+ if (res != to)
+ printf("*** Result was %p!\n", res);
+ }
++
+diff --git a/src/share/tools/hsdis/hsdis.c b/src/share/tools/hsdis/hsdis.c
+--- a/src/share/tools/hsdis/hsdis.c
++++ b/src/share/tools/hsdis/hsdis.c
+@@ -29,6 +29,7 @@
+
+ #include "hsdis.h"
+
++#include <stdint.h>
+ #include <sysdep.h>
+ #include <libiberty.h>
+ #include <bfd.h>
+diff --git a/src/share/tools/hsdis/hsdis.h b/src/share/tools/hsdis/hsdis.h
+--- a/src/share/tools/hsdis/hsdis.h
++++ b/src/share/tools/hsdis/hsdis.h
+@@ -65,3 +65,4 @@
+ decode_instructions_printf_callback_ftype printf_callback,
+ void* printf_stream,
+ const char* options);
++
+diff --git a/src/share/vm/asm/codeBuffer.hpp b/src/share/vm/asm/codeBuffer.hpp
+--- a/src/share/vm/asm/codeBuffer.hpp
++++ b/src/share/vm/asm/codeBuffer.hpp
+@@ -39,6 +39,13 @@
+ Dtrace_trap = OSR_Entry, // dtrace probes can never have an OSR entry so reuse it
+ Exceptions, // Offset where exception handler lives
+ Deopt, // Offset where deopt handler lives
++ Static_Tail_Call_Entry, // Offset for static tail calls.
++ Static_Not_Sibling_Tail_Call_Entry,
++ Monomorphic_Tail_Call_Entry, // Offset for monomorphic tail
++ // calls.
++ Monomorphic_Not_Sibling_Tail_Call_Entry,
++ Static_Not_Sibling_Tail_Call_Set_Data_Entry,
++ Monomorphic_Not_Sibling_Tail_Call_Set_Data_Entry,
+ max_Entries };
+
+ // special value to note codeBlobs where profile (forte) stack walking is
+@@ -57,6 +64,12 @@
+ _values[OSR_Entry] = 0;
+ _values[Exceptions] = -1;
+ _values[Deopt] = -1;
++ _values[Static_Tail_Call_Entry] = 0;
++ _values[Monomorphic_Tail_Call_Entry] = 0;
++ _values[Monomorphic_Not_Sibling_Tail_Call_Entry] = 0;
++ _values[Static_Not_Sibling_Tail_Call_Entry] = 0;
++ _values[Static_Not_Sibling_Tail_Call_Set_Data_Entry] = 0;
++ _values[Monomorphic_Not_Sibling_Tail_Call_Set_Data_Entry] = 0;
+ }
+
+ int value(Entries e) { return _values[e]; }
+diff --git a/src/share/vm/c1/c1_Compilation.cpp b/src/share/vm/c1/c1_Compilation.cpp
+--- a/src/share/vm/c1/c1_Compilation.cpp
++++ b/src/share/vm/c1/c1_Compilation.cpp
+@@ -218,7 +218,14 @@
+ CHECK_BAILOUT();
+ assembler->emit_deopt_handler();
+ CHECK_BAILOUT();
+-
++ assembler->emit_static_tail_call_stub();
++ CHECK_BAILOUT();
++ assembler->emit_monomorphic_tail_call_stub();
++ CHECK_BAILOUT();
++ assembler->emit_not_sibling_monomorphic_tail_call_stub();
++ CHECK_BAILOUT();
++ assembler->emit_static_not_sibling_tail_call_stub();
++ CHECK_BAILOUT();
+ // done
+ masm()->flush();
+ }
+@@ -274,8 +281,13 @@
+
+ {
+ PhaseTraceTime timeit(_t_emit_lir);
+-
+- _frame_map = new FrameMap(method(), hir()->number_of_locks(), MAX2(4, hir()->max_stack()));
++ int tail_call_pd_slots = SharedRuntime::tail_call_protection_domain_slots();
++ // Really only need 1. The stack frame code depends on
++ // reserved_argument_area_size being the correct maximum.
++ // It seems that the stack offsets for incoming arguments are not
++ // calculated right: FrameMap::java_calling_convention() misses a '*BytesPerWord'.
++ _frame_map = new FrameMap(method(), hir()->number_of_locks(),
++ MAX2(4, hir()->max_stack()+tail_call_pd_slots));
+ emit_lir();
+ }
+ CHECK_BAILOUT_(no_frame_size);
+diff --git a/src/share/vm/c1/c1_FrameMap.cpp b/src/share/vm/c1/c1_FrameMap.cpp
+--- a/src/share/vm/c1/c1_FrameMap.cpp
++++ b/src/share/vm/c1/c1_FrameMap.cpp
+@@ -49,7 +49,7 @@
+ }
+
+
+-CallingConvention* FrameMap::java_calling_convention(const BasicTypeArray* signature, bool outgoing) {
++CallingConvention* FrameMap::java_calling_convention(const BasicTypeArray* signature, bool outgoing, bool is_tail_call) {
+ // compute the size of the arguments first. The signature array
+ // that java_calling_convention takes includes a T_VOID after double
+ // work items but our signatures do not.
+@@ -76,12 +76,14 @@
+ BasicType t = sig_bt[i];
+ assert(t != T_VOID, "should be skipping these");
+
+- LIR_Opr opr = map_to_opr(t, regs + i, outgoing);
++ LIR_Opr opr = map_to_opr(t, regs + i, outgoing, is_tail_call);
+ args->append(opr);
+ if (opr->is_address()) {
+ LIR_Address* addr = opr->as_address_ptr();
+ assert(addr->disp() == (int)addr->disp(), "out of range value");
++ assert(MAX2(out_preserve, (intptr_t)addr->disp() / 4) == out_preserve, "c1 different from java_call_conv");
+ out_preserve = MAX2(out_preserve, (intptr_t)addr->disp() / 4);
++
+ }
+ i += type2size[t];
+ }
+@@ -90,7 +92,10 @@
+
+ if (outgoing) {
+ // update the space reserved for arguments.
+- update_reserved_argument_area_size(out_preserve);
++ // shouldn't that be out_preserve * BytesPerWord
++ // like in FrameMap::FrameMap()
++ //update_reserved_argument_area_size(out_preserve);
++ update_reserved_argument_area_size(out_preserve*BytesPerWord);
+ }
+ return new CallingConvention(args, out_preserve);
+ }
+diff --git a/src/share/vm/c1/c1_FrameMap.hpp b/src/share/vm/c1/c1_FrameMap.hpp
+--- a/src/share/vm/c1/c1_FrameMap.hpp
++++ b/src/share/vm/c1/c1_FrameMap.hpp
+@@ -144,7 +144,7 @@
+ // stack addresses are expressable in a simm13.
+ bool validate_frame();
+
+- static LIR_Opr map_to_opr(BasicType type, VMRegPair* reg, bool incoming);
++ static LIR_Opr map_to_opr(BasicType type, VMRegPair* reg, bool incoming, bool is_tail_call=false);
+
+ public:
+ // Opr representing the stack_pointer on this platform
+@@ -156,7 +156,7 @@
+ // for outgoing calls, these also update the reserved area to
+ // include space for arguments and any ABI area.
+ CallingConvention* c_calling_convention (const BasicTypeArray* signature);
+- CallingConvention* java_calling_convention (const BasicTypeArray* signature, bool outgoing);
++ CallingConvention* java_calling_convention (const BasicTypeArray* signature, bool outgoing, bool is_tail_call=false);
+
+ // deopt support
+ ByteSize sp_offset_for_orig_pc() { return sp_offset_for_monitor_base(_num_monitors); }
+diff --git a/src/share/vm/c1/c1_GraphBuilder.cpp b/src/share/vm/c1/c1_GraphBuilder.cpp
+--- a/src/share/vm/c1/c1_GraphBuilder.cpp
++++ b/src/share/vm/c1/c1_GraphBuilder.cpp
+@@ -1504,6 +1504,8 @@
+
+ void GraphBuilder::invoke(Bytecodes::Code code) {
+ bool will_link;
++ bool is_tail_call = stream()->is_wide();
++
+ ciMethod* target = stream()->get_method(will_link);
+ // we have to make sure the argument size (incl. the receiver)
+ // is correct for compilation (the call would fail later during
+@@ -1718,7 +1720,7 @@
+ profile_call(recv, target_klass);
+ }
+
+- Invoke* result = new Invoke(code, result_type, recv, args, vtable_index, target);
++ Invoke* result = new Invoke(code, result_type, recv, args, vtable_index, target, is_tail_call);
+ // push result
+ append_split(result);
+
+@@ -3332,7 +3334,7 @@
+ !InlineSynchronizedMethods ) INLINE_BAILOUT("callee is synchronized");
+ if (!callee->holder()->is_initialized()) INLINE_BAILOUT("callee's klass not initialized yet");
+ if (!callee->has_balanced_monitors()) INLINE_BAILOUT("callee's monitors do not match");
+-
++ if (callee->contains_tail_call()) INLINE_BAILOUT("callee contains a tail call");
+ // Proper inlining of methods with jsrs requires a little more work.
+ if (callee->has_jsrs() ) INLINE_BAILOUT("jsrs not handled properly by inliner yet");
+
+diff --git a/src/share/vm/c1/c1_Instruction.cpp b/src/share/vm/c1/c1_Instruction.cpp
+--- a/src/share/vm/c1/c1_Instruction.cpp
++++ b/src/share/vm/c1/c1_Instruction.cpp
+@@ -334,7 +334,7 @@
+
+
+ Invoke::Invoke(Bytecodes::Code code, ValueType* result_type, Value recv, Values* args,
+- int vtable_index, ciMethod* target)
++ int vtable_index, ciMethod* target, bool is_tail_call)
+ : StateSplit(result_type)
+ , _code(code)
+ , _recv(recv)
+@@ -345,7 +345,7 @@
+ set_flag(TargetIsLoadedFlag, target->is_loaded());
+ set_flag(TargetIsFinalFlag, target_is_loaded() && target->is_final_method());
+ set_flag(TargetIsStrictfpFlag, target_is_loaded() && target->is_strict());
+-
++ set_flag(TailCallFlag, is_tail_call);
+ assert(args != NULL, "args must exist");
+ #ifdef ASSERT
+ values_do(assert_value);
+diff --git a/src/share/vm/c1/c1_Instruction.hpp b/src/share/vm/c1/c1_Instruction.hpp
+--- a/src/share/vm/c1/c1_Instruction.hpp
++++ b/src/share/vm/c1/c1_Instruction.hpp
+@@ -312,7 +312,8 @@
+ NeedsPatchingFlag,
+ ThrowIncompatibleClassChangeErrorFlag,
+ ProfileMDOFlag,
+- InstructionLastFlag
++ InstructionLastFlag,
++ TailCallFlag
+ };
+
+ public:
+@@ -1144,7 +1145,7 @@
+ public:
+ // creation
+ Invoke(Bytecodes::Code code, ValueType* result_type, Value recv, Values* args,
+- int vtable_index, ciMethod* target);
++ int vtable_index, ciMethod* target, bool is_tail_call=false);
+
+ // accessors
+ Bytecodes::Code code() const { return _code; }
+@@ -1162,6 +1163,9 @@
+ // Returns false if target is not loaded
+ bool target_is_strictfp() const { return check_flag(TargetIsStrictfpFlag); }
+
++ // Is this a tail call?
++ bool is_tail_call() const { return check_flag(TailCallFlag); }
++
+ // generic
+ virtual bool can_trap() const { return true; }
+ virtual void input_values_do(void f(Value*)) {
+diff --git a/src/share/vm/c1/c1_LIR.hpp b/src/share/vm/c1/c1_LIR.hpp
+--- a/src/share/vm/c1/c1_LIR.hpp
++++ b/src/share/vm/c1/c1_LIR.hpp
+@@ -1032,25 +1032,28 @@
+ private:
+ ciMethod* _method;
+ LIR_Opr _receiver;
+-
++ bool _is_tail_call;
+ public:
+ LIR_OpJavaCall(LIR_Code code, ciMethod* method,
+ LIR_Opr receiver, LIR_Opr result,
+ address addr, LIR_OprList* arguments,
+- CodeEmitInfo* info)
++ CodeEmitInfo* info, bool is_tail_call)
+ : LIR_OpCall(code, addr, result, arguments, info)
+ , _receiver(receiver)
+- , _method(method) { assert(is_in_range(code, begin_opJavaCall, end_opJavaCall), "code check"); }
++ , _method(method)
++ , _is_tail_call(is_tail_call) { assert(is_in_range(code, begin_opJavaCall, end_opJavaCall), "code check"); }
+
+ LIR_OpJavaCall(LIR_Code code, ciMethod* method,
+ LIR_Opr receiver, LIR_Opr result, intptr_t vtable_offset,
+- LIR_OprList* arguments, CodeEmitInfo* info)
++ LIR_OprList* arguments, CodeEmitInfo* info, bool is_tail_call)
+ : LIR_OpCall(code, (address)vtable_offset, result, arguments, info)
+ , _receiver(receiver)
+- , _method(method) { assert(is_in_range(code, begin_opJavaCall, end_opJavaCall), "code check"); }
++ , _method(method)
++ , _is_tail_call(is_tail_call) { assert(is_in_range(code, begin_opJavaCall, end_opJavaCall), "code check"); }
+
+ LIR_Opr receiver() const { return _receiver; }
+ ciMethod* method() const { return _method; }
++ bool is_tail_call() const { return _is_tail_call; }
+
+ intptr_t vtable_offset() const {
+ assert(_code == lir_virtual_call, "only have vtable for real vcall");
+@@ -1751,20 +1754,20 @@
+ //---------- instructions -------------
+ void call_opt_virtual(ciMethod* method, LIR_Opr receiver, LIR_Opr result,
+ address dest, LIR_OprList* arguments,
+- CodeEmitInfo* info) {
+- append(new LIR_OpJavaCall(lir_optvirtual_call, method, receiver, result, dest, arguments, info));
++ CodeEmitInfo* info, bool is_tail_call=false) {
++ append(new LIR_OpJavaCall(lir_optvirtual_call, method, receiver, result, dest, arguments, info, is_tail_call));
+ }
+ void call_static(ciMethod* method, LIR_Opr result,
+- address dest, LIR_OprList* arguments, CodeEmitInfo* info) {
+- append(new LIR_OpJavaCall(lir_static_call, method, LIR_OprFact::illegalOpr, result, dest, arguments, info));
++ address dest, LIR_OprList* arguments, CodeEmitInfo* info, bool is_tail_call=false) {
++ append(new LIR_OpJavaCall(lir_static_call, method, LIR_OprFact::illegalOpr, result, dest, arguments, info, is_tail_call));
+ }
+ void call_icvirtual(ciMethod* method, LIR_Opr receiver, LIR_Opr result,
+- address dest, LIR_OprList* arguments, CodeEmitInfo* info) {
+- append(new LIR_OpJavaCall(lir_icvirtual_call, method, receiver, result, dest, arguments, info));
++ address dest, LIR_OprList* arguments, CodeEmitInfo* info, bool is_tail_call=false) {
++ append(new LIR_OpJavaCall(lir_icvirtual_call, method, receiver, result, dest, arguments, info, is_tail_call));
+ }
+ void call_virtual(ciMethod* method, LIR_Opr receiver, LIR_Opr result,
+- intptr_t vtable_offset, LIR_OprList* arguments, CodeEmitInfo* info) {
+- append(new LIR_OpJavaCall(lir_virtual_call, method, receiver, result, vtable_offset, arguments, info));
++ intptr_t vtable_offset, LIR_OprList* arguments, CodeEmitInfo* info, bool is_tail_call=false) {
++ append(new LIR_OpJavaCall(lir_virtual_call, method, receiver, result, vtable_offset, arguments, info, is_tail_call));
+ }
+
+ void get_thread(LIR_Opr result) { append(new LIR_Op0(lir_get_thread, result)); }
+diff --git a/src/share/vm/c1/c1_LIRAssembler.cpp b/src/share/vm/c1/c1_LIRAssembler.cpp
+--- a/src/share/vm/c1/c1_LIRAssembler.cpp
++++ b/src/share/vm/c1/c1_LIRAssembler.cpp
+@@ -411,10 +411,11 @@
+
+ void LIR_Assembler::emit_call(LIR_OpJavaCall* op) {
+ verify_oop_map(op->info());
+-
++
+ if (os::is_MP()) {
+- // must align calls sites, otherwise they can't be updated atomically on MP hardware
+- align_call(op->code());
++ // Must align call sites, otherwise they can't be updated atomically on MP
++ // hardware.
++ align_call(op->code(), op->is_tail_call());
+ }
+
+ // emit the static call stub stuff out of line
+@@ -428,7 +429,7 @@
+ call(op->addr(), relocInfo::opt_virtual_call_type, op->info());
+ break;
+ case lir_icvirtual_call:
+- ic_call(op->addr(), op->info());
++ ic_call(op->addr(), op->info(), op->is_tail_call());
+ break;
+ case lir_virtual_call:
+ vtable_call(op->vtable_offset(), op->info());
+@@ -570,6 +571,7 @@
+
+ case lir_std_entry:
+ // init offsets
++ assert(_masm->offset()== 0 , "Depend on this constraint in emit_static_tail_call_stub");
+ offsets()->set_value(CodeOffsets::OSR_Entry, _masm->offset());
+ _masm->align(CodeEntryAlignment);
+ if (needs_icache(compilation()->method())) {
+diff --git a/src/share/vm/c1/c1_LIRAssembler.hpp b/src/share/vm/c1/c1_LIRAssembler.hpp
+--- a/src/share/vm/c1/c1_LIRAssembler.hpp
++++ b/src/share/vm/c1/c1_LIRAssembler.hpp
+@@ -129,6 +129,10 @@
+ // stubs
+ void emit_slow_case_stubs();
+ void emit_static_call_stub();
++ void emit_static_tail_call_stub();
++ void emit_static_not_sibling_tail_call_stub();
++ void emit_monomorphic_tail_call_stub();
++ void emit_not_sibling_monomorphic_tail_call_stub();
+ void emit_code_stub(CodeStub* op);
+ void add_call_info_here(CodeEmitInfo* info) { add_call_info(code_offset(), info); }
+
+@@ -205,7 +209,9 @@
+ void comp_fl2i(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr result, LIR_Op2* op);
+ void cmove(LIR_Condition code, LIR_Opr left, LIR_Opr right, LIR_Opr result);
+
+- void ic_call(address destination, CodeEmitInfo* info);
++ // Set protection domain token for virtual tail calls.
++ void set_protection_domain_token();
++ void ic_call(address destination, CodeEmitInfo* info, bool is_tail_call);
+ void vtable_call(int vtable_offset, CodeEmitInfo* info);
+ void call(address entry, relocInfo::relocType rtype, CodeEmitInfo* info);
+
+@@ -217,7 +223,7 @@
+ void monitor_address(int monitor_ix, LIR_Opr dst);
+
+ void align_backward_branch_target();
+- void align_call(LIR_Code code);
++ void align_call(LIR_Code code, bool is_tail_call);
+
+ void negate(LIR_Opr left, LIR_Opr dest);
+ void leal(LIR_Opr left, LIR_Opr dest);
+diff --git a/src/share/vm/c1/c1_LIRGenerator.cpp b/src/share/vm/c1/c1_LIRGenerator.cpp
+--- a/src/share/vm/c1/c1_LIRGenerator.cpp
++++ b/src/share/vm/c1/c1_LIRGenerator.cpp
+@@ -2257,6 +2257,16 @@
+ __ move(LIR_Assembler::osrBufferPointer(), result);
+ }
+
++bool LIRGenerator::is_sibling_call(CallingConvention * callee_cc) {
++ // Sibling call: the incoming argument area must accommodate the outgoing arguments.
++ // TODO: Check whether the number 4 below must match c1_FrameMap.cpp:FrameMap
++ // and c1_Compilation:compile_java_method.
++ //int incoming_args_slots = MAX2(4,
++ // frame_map()->incoming_arguments()->reserved_stack_slots());
++ int incoming_args_slots = frame_map()->incoming_arguments()->reserved_stack_slots();
++ int outgoing_args_slots = callee_cc->reserved_stack_slots();
++ return outgoing_args_slots <= incoming_args_slots;
++}
+
+ void LIRGenerator::invoke_load_arguments(Invoke* x, LIRItemList* args, const LIR_OprList* arg_list) {
+ int i = x->has_receiver() ? 1 : 0;
+@@ -2291,7 +2301,7 @@
+
+
+ // Visits all arguments, returns appropriate items without loading them
+-LIRItemList* LIRGenerator::invoke_visit_arguments(Invoke* x) {
++LIRItemList* LIRGenerator::invoke_visit_arguments(Invoke* x, bool is_tail_call) {
+ LIRItemList* argument_items = new LIRItemList();
+ if (x->has_receiver()) {
+ LIRItem* receiver = new LIRItem(x->receiver(), this);
+@@ -2300,6 +2310,16 @@
+ int idx = x->has_receiver() ? 1 : 0;
+ for (int i = 0; i < x->number_of_arguments(); i++) {
+ LIRItem* param = new LIRItem(x->argument_at(i), this);
++ Instruction * instr = instruction_for_opr(param->result());
++ if(instr && is_tail_call) {
++ if (instr->as_Local()!=NULL) {
++ // Generate another move to a virtual register. This is done in order to
++ // prevent arguments sourcing from a stack slot being overwritten when
++ // tail calling. Related code: c1_LinearScan::handle_method_arguments.
++ LIR_Opr dest = new_register(param->result()->type());
++ param->load_item_force(dest);
++ }
++ }
+ argument_items->append(param);
+ idx += (param->type()->is_double_word() ? 2 : 1);
+ }
+@@ -2331,11 +2351,15 @@
+ // - if we keep the receiver locked while doing spill-save,
+ // we cannot spill it as it is spill-locked
+ //
+-void LIRGenerator::do_Invoke(Invoke* x) {
++void LIRGenerator::do_Invoke(Invoke* x) {
+ CallingConvention* cc = frame_map()->java_calling_convention(x->signature(), true);
++ bool is_tail = x->is_tail_call();
++ bool is_sibling = is_tail? is_sibling_call(cc) : false;
++ //CallingConvention* cc_tail = frame_map()->java_calling_convention(x->signature(), true, is_sibling);
++ //cc = is_sibling ? cc_tail : cc;
+
+ LIR_OprList* arg_list = cc->args();
+- LIRItemList* args = invoke_visit_arguments(x);
++ LIRItemList* args = invoke_visit_arguments(x, false);
+ LIR_Opr receiver = LIR_OprFact::illegalOpr;
+
+ // setup result register
+@@ -2359,9 +2383,19 @@
+
+ switch (x->code()) {
+ case Bytecodes::_invokestatic:
+- __ call_static(x->target(), result_register,
+- SharedRuntime::get_resolve_static_call_stub(),
+- arg_list, info);
++ if (is_tail && is_sibling) {
++ __ call_static(x->target(), result_register,
++ SharedRuntime::get_resolve_static_tail_call_stub(),
++ arg_list, info, is_tail);
++ } else if (is_tail) {
++ // non sibling call
++ __ call_static(x->target(), result_register,
++ SharedRuntime::get_resolve_not_sibling_static_tail_call_stub(),
++ arg_list, info, is_tail);
++ } else {
++ __ call_static(x->target(), result_register,
++ SharedRuntime::get_resolve_static_call_stub(),
++ arg_list, info, is_tail); }
+ break;
+ case Bytecodes::_invokespecial:
+ case Bytecodes::_invokevirtual:
+@@ -2369,13 +2403,32 @@
+ // for final target we still produce an inline cache, in order
+ // to be able to call mixed mode
+ if (x->code() == Bytecodes::_invokespecial || optimized) {
+- __ call_opt_virtual(x->target(), receiver, result_register,
++ if (is_tail && is_sibling) {
++ __ call_opt_virtual(x->target(), receiver, result_register,
++ SharedRuntime::get_resolve_opt_virtual_tail_call_stub(),
++ arg_list, info);
++ } else if (is_tail) {
++ __ call_opt_virtual(x->target(), receiver, result_register,
++ SharedRuntime::get_resolve_opt_not_sibling_virtual_tail_call_stub(),
++ arg_list, info);
++ } else
++ __ call_opt_virtual(x->target(), receiver, result_register,
+ SharedRuntime::get_resolve_opt_virtual_call_stub(),
+ arg_list, info);
+ } else if (x->vtable_index() < 0) {
+- __ call_icvirtual(x->target(), receiver, result_register,
+- SharedRuntime::get_resolve_virtual_call_stub(),
+- arg_list, info);
++ if (is_tail && is_sibling) {
++ __ call_icvirtual(x->target(), receiver, result_register,
++ SharedRuntime::get_resolve_virtual_tail_call_stub(),
++ arg_list, info, is_tail);
++ } else if (is_tail) {
++ __ call_icvirtual(x->target(), receiver, result_register,
++ SharedRuntime::get_resolve_not_sibling_virtual_tail_call_stub(),
++ arg_list, info, is_tail);
++ } else {
++ __ call_icvirtual(x->target(), receiver, result_register,
++ SharedRuntime::get_resolve_virtual_call_stub(),
++ arg_list, info);
++ }
+ } else {
+ int entry_offset = instanceKlass::vtable_start_offset() + x->vtable_index() * vtableEntry::size();
+ int vtable_offset = entry_offset * wordSize + vtableEntry::method_offset_in_bytes();
+diff --git a/src/share/vm/c1/c1_LIRGenerator.hpp b/src/share/vm/c1/c1_LIRGenerator.hpp
+--- a/src/share/vm/c1/c1_LIRGenerator.hpp
++++ b/src/share/vm/c1/c1_LIRGenerator.hpp
+@@ -30,6 +30,7 @@
+ class Invoke;
+ class SwitchRange;
+ class LIRItem;
++class CallingConvention;
+
+ define_array(LIRItemArray, LIRItem*)
+ define_stack(LIRItemList, LIRItemArray)
+@@ -269,8 +270,10 @@
+
+ ciObject* get_jobject_constant(Value value);
+
+- LIRItemList* invoke_visit_arguments(Invoke* x);
++ LIRItemList* invoke_visit_arguments(Invoke* x, bool is_tail_call);
+ void invoke_load_arguments(Invoke* x, LIRItemList* args, const LIR_OprList* arg_list);
++ // Tail call optimization support
++ bool is_sibling_call(CallingConvention* callee_cc);
+
+ void trace_block_entry(BlockBegin* block);
+
+diff --git a/src/share/vm/c1/c1_Runtime1.cpp b/src/share/vm/c1/c1_Runtime1.cpp
+--- a/src/share/vm/c1/c1_Runtime1.cpp
++++ b/src/share/vm/c1/c1_Runtime1.cpp
+@@ -140,9 +140,18 @@
+ locs_buffer_size / sizeof(relocInfo));
+ code->initialize_consts_size(desired_max_constant_size());
+ // Call stubs + deopt/exception handler
++ int max_arg_count = 2 * 32;
++ int tail_call_stubs = 4;
++ int move_arg_size = 8; // SYNC: keep in sync with number in tail call stubs.
+ code->initialize_stubs_size((call_stub_estimate * LIR_Assembler::call_stub_size) +
+ LIR_Assembler::exception_handler_size +
+- LIR_Assembler::deopt_handler_size);
++ LIR_Assembler::deopt_handler_size +
++ LIR_Assembler::static_tail_call_stub_size+
++ LIR_Assembler::static_not_sibling_tail_call_stub_size+
++ LIR_Assembler::monomorphic_tail_call_stub_size+
++ LIR_Assembler::monomorphic_not_sibling_tail_call_stub_size+
++ (move_arg_size * max_arg_count * tail_call_stubs )); //TODO: decent estimate take parameter
++ //moving into account.
+ }
+
+
+diff --git a/src/share/vm/ci/ciMethod.cpp b/src/share/vm/ci/ciMethod.cpp
+--- a/src/share/vm/ci/ciMethod.cpp
++++ b/src/share/vm/ci/ciMethod.cpp
+@@ -49,6 +49,9 @@
+ _handler_count = h_m()->exception_table()->length() / 4;
+ _uses_monitors = h_m()->access_flags().has_monitor_bytecodes();
+ _balanced_monitors = !_uses_monitors || h_m()->access_flags().is_monitor_matching();
++ _contains_tail_call = h_m()->contains_tail_call();
++ _contains_tail_call_var_initialized = false;
++
+ _is_compilable = !h_m()->is_not_compilable();
+ // Lazy fields, filled in on demand. Require allocation.
+ _code = NULL;
+@@ -124,6 +127,8 @@
+ _bcea = NULL;
+ _method_blocks = NULL;
+ _method_data = NULL;
++ _contains_tail_call = false;
++ _contains_tail_call_var_initialized = false;
+ #ifdef COMPILER2
+ _flow = NULL;
+ #endif // COMPILER2
+@@ -290,6 +295,47 @@
+ return true;
+ }
+
++// ------------------------------------------------------------------
++// ciMethod::contains_tail_call
++//
++// Does this method contain a tail call? The answer is cached in this ciMethod.
++bool ciMethod::contains_tail_call() {
++ check_is_loaded();
++
++ if (_contains_tail_call) return true;
++
++ if (_contains_tail_call_var_initialized)
++ return _contains_tail_call;
++
++ // Not known yet: consult the methodOop and, if needed, analyze the method.
++ VM_ENTRY_MARK;
++ methodHandle method(THREAD, get_methodOop());
++
++ // Check to see if the methodOop is already marked as
++ // containing a tail call.
++ if (method->contains_tail_call()) {
++ _contains_tail_call = true;
++ return true;
++ }
++
++ {
++ EXCEPTION_MARK;
++ ResourceMark rm(THREAD);
++ ContainsTailCallInfo tci(method);
++ tci.compute_map(CATCH);
++ if (!tci.contains_tail_call()) {
++ _contains_tail_call =false;
++ // Only the negative result needs the initialized flag: if
++ // _contains_tail_call is true we know that its value has been computed.
++ _contains_tail_call_var_initialized = true;
++ return false;
++ }
++ method->set_contains_tail_call(true);
++ _contains_tail_call = true;
++ }
++ return true;
++}
++
+
+ // ------------------------------------------------------------------
+ // ciMethod::get_flow_analysis
+diff --git a/src/share/vm/ci/ciMethod.hpp b/src/share/vm/ci/ciMethod.hpp
+--- a/src/share/vm/ci/ciMethod.hpp
++++ b/src/share/vm/ci/ciMethod.hpp
+@@ -62,6 +62,8 @@
+ bool _balanced_monitors;
+ bool _is_compilable;
+ bool _can_be_statically_bound;
++ bool _contains_tail_call;
++ bool _contains_tail_call_var_initialized;
+
+ // Lazy fields, filled in on demand
+ address _code;
+@@ -148,6 +150,7 @@
+ bool uses_monitors() const { return _uses_monitors; } // this one should go away, it has a misleading name
+ bool has_monitor_bytecodes() const { return _uses_monitors; }
+ bool has_balanced_monitors();
++ bool contains_tail_call();
+
+ MethodLivenessResult liveness_at_bci(int bci);
+
+diff --git a/src/share/vm/ci/ciStreams.cpp b/src/share/vm/ci/ciStreams.cpp
+--- a/src/share/vm/ci/ciStreams.cpp
++++ b/src/share/vm/ci/ciStreams.cpp
+@@ -88,10 +88,9 @@
+ {
+ // Get following bytecode; do not return wide
+ Bytecodes::Code bc = (Bytecodes::Code)_pc[1];
+- _pc += 2; // Skip both bytecodes
+- _pc += 2; // Skip index always
+- if( bc == Bytecodes::_iinc )
+- _pc += 2; // Skip optional constant
++ Bytecodes::Prefix pfx = Bytecodes::allowed_prefix(bc);
++ int advance = Bytecodes::length_for(pfx, bc);
++ _pc += advance;
+ _was_wide = _pc; // Flag last wide bytecode found
+ return bc;
+ }
+@@ -303,11 +302,15 @@
+ int ciBytecodeStream::get_method_index() {
+ switch (cur_bc()) {
+ case Bytecodes::_invokeinterface:
++ // This should also work for wide invokeinterfaces.
+ return Bytes::get_Java_u2(_pc-4);
+ case Bytecodes::_invokevirtual:
+ case Bytecodes::_invokespecial:
+ case Bytecodes::_invokestatic:
+- return get_index_big();
++ if(!is_wide())
++ return get_index_big();
++ else
++ return Bytes::get_Java_u2(_bc_start+2);
+ default:
+ ShouldNotReachHere();
+ return 0;
+diff --git a/src/share/vm/classfile/verifier.cpp b/src/share/vm/classfile/verifier.cpp
+--- a/src/share/vm/classfile/verifier.cpp
++++ b/src/share/vm/classfile/verifier.cpp
+@@ -323,6 +323,7 @@
+ bool no_control_flow = false; // Set to true when there is no direct control
+ // flow from current instruction to the next
+ // instruction in sequence
++ bool handlers_prohibited = false; // Set to true immediately after a tail-call.
+ Bytecodes::Code opcode;
+ while (!bcs.is_last_bytecode()) {
+ opcode = bcs.raw_next();
+@@ -331,6 +332,8 @@
+ // Set current frame's offset to bci
+ current_frame.set_offset(bci);
+
++ assert(handlers_prohibited == false, "should be set true only briefly");
++
+ // Make sure every offset in stackmap table point to the beginning to
+ // an instruction. Match current_frame to stackmap_table entry with
+ // the same offset if exists.
+@@ -350,22 +353,46 @@
+ #ifndef PRODUCT
+ if (_verify_verbose) {
+ current_frame.print();
+- tty->print_cr("offset = %d, opcode = %s", bci, Bytecodes::name(opcode));
++ if (bcs.has_prefix())
++ tty->print_cr("offset = %d, opcode = %s:%s",
++ bci, Bytecodes::prefix_name(bcs.prefix()), Bytecodes::name(opcode));
++ else
++ tty->print_cr("offset = %d, opcode = %s", bci, Bytecodes::name(opcode));
+ }
+ #endif
+
+ // Make sure wide instruction is in correct format
+- if (bcs.is_wide()) {
++ if (bcs.prefix() == Bytecodes::Prefix_illegal) {
++ // BytecodeStream checks and decodes all prefixes.
++ verify_error(bci, "Bad wide instruction");
++ return;
++ }
++
++#ifdef ASSERT
++ switch (bcs.prefix()) {
++ case Bytecodes::Prefix_none:
++ break;
++ case Bytecodes::Prefix_wide_index:
+ if (opcode != Bytecodes::_iinc && opcode != Bytecodes::_iload &&
+ opcode != Bytecodes::_aload && opcode != Bytecodes::_lload &&
+ opcode != Bytecodes::_istore && opcode != Bytecodes::_astore &&
+ opcode != Bytecodes::_lstore && opcode != Bytecodes::_fload &&
+ opcode != Bytecodes::_dload && opcode != Bytecodes::_fstore &&
+ opcode != Bytecodes::_dstore) {
+- verify_error(bci, "Bad wide instruction");
+- return;
++ assert(false, "should have seen Prefix_unknown");
+ }
++ break;
++ case Bytecodes::Prefix_tail_call: // debug-only sanity check: the prefix may only decorate an invoke
++ assert(TailCalls, "");
++ if (opcode != Bytecodes::_invokevirtual && opcode != Bytecodes::_invokespecial &&
++ opcode != Bytecodes::_invokeinterface && opcode != Bytecodes::_invokestatic) {
++ assert(false, "should have seen Prefix_unknown");
++ }
++ break; // must not return: debug builds would otherwise skip verifying the rest of the method
++ default:
++ ShouldNotReachHere();
+ }
++#endif //ASSERT
+
+ switch (opcode) {
+ case Bytecodes::_nop :
+@@ -1169,14 +1196,15 @@
+ case Bytecodes::_invokevirtual :
+ case Bytecodes::_invokespecial :
+ case Bytecodes::_invokestatic :
+- verify_invoke_instructions(
+- &bcs, code_length, &current_frame,
+- &this_uninit, return_type, cp, CHECK_VERIFY(this));
+- no_control_flow = false; break;
+ case Bytecodes::_invokeinterface :
+ verify_invoke_instructions(
+ &bcs, code_length, &current_frame,
+ &this_uninit, return_type, cp, CHECK_VERIFY(this));
++ // A tail-call prefix adds the extra checks of verify_tail_call().
++ if (bcs.prefix() == Bytecodes::Prefix_tail_call) {
++ verify_tail_call(&bcs, bci, CHECK_VERIFY(this));
++ handlers_prohibited = true; // verify_exception_handler_targets() rejects handlers covering this bci
++ }
+ no_control_flow = false; break;
+ case Bytecodes::_new :
+ {
+@@ -1278,8 +1306,10 @@
+ // matches current_frame
+ if (bci >= ex_min && bci < ex_max) {
+ verify_exception_handler_targets(
+- bci, this_uninit, &current_frame, &stackmap_table, CHECK_VERIFY(this));
++ bci, this_uninit, handlers_prohibited, &current_frame, &stackmap_table, CHECK_VERIFY(this));
+ }
++ handlers_prohibited = false; // only ever true for the tail-call instruction itself; reset before the next iteration
++
+ } // end while
+
+ // Make sure that control flow does not fall through end of the method
+@@ -1415,8 +1445,9 @@
+ return stackmap_index;
+ }
+
+-void ClassVerifier::verify_exception_handler_targets(u2 bci, bool this_uninit, StackMapFrame* current_frame,
+- StackMapTable* stackmap_table, TRAPS) {
++void ClassVerifier::verify_exception_handler_targets(
++ u2 bci, bool this_uninit, bool handlers_prohibited,
++ StackMapFrame* current_frame, StackMapTable* stackmap_table, TRAPS) {
+ constantPoolHandle cp (THREAD, _method->constants());
+ typeArrayHandle exhandlers (THREAD, _method->exception_table());
+ if (exhandlers() != NULL) {
+@@ -1426,6 +1457,13 @@
+ u2 handler_pc = exhandlers->int_at(i++);
+ int catch_type_index = exhandlers->int_at(i++);
+ if(bci >= start_pc && bci < end_pc) {
++ if (handlers_prohibited) {
++ verify_error(bci,
++ "Tail-call covered by exception handler %d",
++ handler_pc);
++ return;
++ }
++
+ u1 flags = current_frame->flags();
+ if (this_uninit) { flags |= FLAG_THIS_UNINIT; }
+
+@@ -1723,8 +1761,9 @@
+ }
+
+ // Get referenced class type
+- VerificationType ref_class_type = cp_ref_index_to_type(
+- index, cp, CHECK_VERIFY(this));
++ symbolHandle ref_class_name = symbolHandle(THREAD,
++ cp->klass_name_at(cp->klass_ref_index_at(index)));
++ VerificationType ref_class_type = VerificationType::reference_type(ref_class_name);
+ if (!ref_class_type.is_object()) {
+ verify_error(
+ "Expecting reference to class in class %s at constant pool index %d",
+@@ -1793,8 +1832,6 @@
+ check_protected: {
+ if (_this_type == stack_object_type)
+ break; // stack_object_type must be assignable to _current_class_type
+- symbolHandle ref_class_name = symbolHandle(THREAD,
+- cp->klass_name_at(cp->klass_ref_index_at(index)));
+ if (!name_in_supers(ref_class_name(), current_class()))
+ // stack_object_type must be assignable to _current_class_type since:
+ // 1. stack_object_type must be assignable to ref_class.
+@@ -1890,13 +1927,20 @@
+ RawBytecodeStream* bcs, u4 code_length, StackMapFrame* current_frame,
+ bool *this_uninit, VerificationType return_type,
+ constantPoolHandle cp, TRAPS) {
++
+ // Make sure the constant pool item is the right type
+ u2 index = bcs->get_index_big();
+ Bytecodes::Code opcode = bcs->code();
+- unsigned int types = (opcode == Bytecodes::_invokeinterface
+- ? 1 << JVM_CONSTANT_InterfaceMethodref
+- : 1 << JVM_CONSTANT_Methodref);
+- verify_cp_type(index, cp, types, CHECK_VERIFY(this));
++ switch (opcode) {
++ case Bytecodes::_invokeinterface:
++ verify_cp_type(index, cp, 1 << JVM_CONSTANT_InterfaceMethodref, CHECK_VERIFY(this));
++ //nt_pair_index = cp->name_and_type_ref_index_at(raw_index);
++ break;
++ default:
++ verify_cp_type(index, cp, 1 << JVM_CONSTANT_Methodref, CHECK_VERIFY(this));
++ //nt_pair_index = cp->name_and_type_ref_index_at(raw_index);
++ break;
++ }
+
+ // Get method name and signature
+ symbolHandle method_name(THREAD, cp->name_ref_at(index));
+@@ -1973,6 +2017,7 @@
+ if (method_name->byte_at(0) == '<') {
+ // Make sure <init> can only be invoked by invokespecial
+ if (opcode != Bytecodes::_invokespecial ||
++ bcs->has_prefix() || // no tail-calls to <init>
+ method_name() != vmSymbols::object_initializer_name()) {
+ verify_error(bci, "Illegal call to internal method");
+ return;
+@@ -2007,8 +2052,8 @@
+ current_frame->pop_stack(ref_class_type, CHECK_VERIFY(this));
+ if (current_type() != stack_object_type) {
+ assert(cp->cache() == NULL, "not rewritten yet");
+- symbolHandle ref_class_name = symbolHandle(THREAD,
+- cp->klass_name_at(cp->klass_ref_index_at(index)));
++ symbolHandle ref_class_name = symbolHandle(THREAD,
++ cp->klass_name_at(cp->klass_ref_index_at(index)));
+ // See the comments in verify_field_instructions() for
+ // the rationale behind this.
+ if (name_in_supers(ref_class_name(), current_class())) {
+@@ -2056,6 +2101,43 @@
+ }
+ }
+
++// Checks the tail-call rules that the ordinary invoke verification does not cover:
++// the call must be directly followed by a return and the caller must not be synchronized.
++void ClassVerifier::verify_tail_call(
++    RawBytecodeStream* bcs, u2 bci, TRAPS) {
++  assert(TailCalls, "BCS will produce tailcalls only if feature is enabled");
++  // Rules for tail call:
++  // - Must be immediately followed by a return opcode. (Checked here.)
++  // - Return values are consistent. (Checked by return opcode.)
++  // - No exception handlers. (Checked by caller, via handlers_prohibited.)
++  // - Caller method not synchronized. (Checked here.)
++  // - Caller holding no object locks. (IllegalMonitorStateException test.)
++  // - Callee accessible from caller. (Checked as in non-tail case.)
++  RawBytecodeStream lookahead_bcs(_method);
++  lookahead_bcs.set_start(bcs->next_bci());
++  if (lookahead_bcs.is_last_bytecode()) { // nothing follows the call: do not decode past the code
++    verify_error(bci, "Tail call must be followed by a return instruction");
++    return;
++  }
++  switch (lookahead_bcs.raw_next()) {
++    case Bytecodes::_ireturn : case Bytecodes::_lreturn :
++    case Bytecodes::_freturn : case Bytecodes::_dreturn :
++    case Bytecodes::_areturn : case Bytecodes::_return :
++      // Next iteration of main loop will verify compatibility of return value.
++      // Note: this allows some "widening": a void method can tail-call a non-void method, etc.
++      break;
++    default:
++      verify_error(bci, "Tail call must be followed by a return instruction");
++      return;
++  }
++
++  if (_method()->is_synchronized()) {
++    // An implicit exception handler...
++    verify_error(bci, "Tail call from synchronized method");
++    return;
++  }
++}
++
+ VerificationType ClassVerifier::get_newarray_type(
+ u2 index, u2 bci, TRAPS) {
+ const char* from_bt[] = {
+diff --git a/src/share/vm/classfile/verifier.hpp b/src/share/vm/classfile/verifier.hpp
+--- a/src/share/vm/classfile/verifier.hpp
++++ b/src/share/vm/classfile/verifier.hpp
+@@ -107,7 +107,7 @@
+ StackMapTable* stackmap_table, bool no_control_flow, TRAPS);
+
+ void verify_exception_handler_targets(
+- u2 bci, bool this_uninit, StackMapFrame* current_frame,
++ u2 bci, bool this_uninit, bool handlers_prohibited, StackMapFrame* current_frame,
+ StackMapTable* stackmap_table, TRAPS);
+
+ void verify_ldc(
+@@ -132,6 +132,9 @@
+ bool* this_uninit, VerificationType return_type,
+ constantPoolHandle cp, TRAPS);
+
++ void verify_tail_call(
++ RawBytecodeStream* bcs, u2 bci, TRAPS);
++
+ VerificationType get_newarray_type(u2 index, u2 bci, TRAPS);
+ void verify_anewarray(
+ u2 index, constantPoolHandle cp, StackMapFrame* current_frame, TRAPS);
+diff --git a/src/share/vm/classfile/vmSymbols.hpp b/src/share/vm/classfile/vmSymbols.hpp
+--- a/src/share/vm/classfile/vmSymbols.hpp
++++ b/src/share/vm/classfile/vmSymbols.hpp
+@@ -151,6 +151,8 @@
+ template(java_lang_RuntimeException, "java/lang/RuntimeException") \
+ template(java_io_IOException, "java/io/IOException") \
+ template(java_security_PrivilegedActionException, "java/security/PrivilegedActionException") \
++ /* Tail call exception */ \
++ template(java_lang_TailCallException, "java/lang/TailCallException") \
+ \
+ /* error klasses: at least all errors thrown by the VM have entries here */ \
+ template(java_lang_AbstractMethodError, "java/lang/AbstractMethodError") \
+diff --git a/src/share/vm/code/compiledIC.cpp b/src/share/vm/code/compiledIC.cpp
+--- a/src/share/vm/code/compiledIC.cpp
++++ b/src/share/vm/code/compiledIC.cpp
+@@ -116,12 +116,37 @@
+ return _ic_call->destination();
+ }
+
++// Stores protection_domain in the mov found before _first_set_oop_inst and fixes its oop relocation.
++// Considered 'thread safe' by the author: only the megamorphic call site is a client of the token.
++void CompiledIC::set_protection_domain_token(oop protection_domain) {
++ // Wrap the cache-oop mov and the protection-domain mov before it (creation also verifies the object).
++ NativeMovConstReg* set_cache_oop = nativeMovConstReg_at(_first_set_oop_inst);
++ NativeMovConstProtectionDomain * set_pd_oop = nativeMovConstPD_before(_first_set_oop_inst);
++
++ // Set the protection domain as the data of that instruction.
++ set_pd_oop->set_data((intptr_t) protection_domain);
++ // Fix the oop relocation whose address matches the patched oop cell.
++ oop* oop_addr = set_pd_oop->oop_address();
++ bool is_fixed = false;
++ assert(_oops.code()!=NULL, "oops");
++ RelocIterator iter = RelocIterator(_oops.code(), (address)set_pd_oop, ((address)set_pd_oop)+1);
++ while (iter.next()) {
++ if (iter.type() == relocInfo::oop_type) {
++ oop_Relocation* r = iter.oop_reloc();
++ if (r->oop_addr() == oop_addr) {
++ r->fix_oop_relocation();
++ is_fixed = true;
++ }
++ }
++ }
++ assert (is_fixed, "Oop relocation fixed"); // a matching oop relocation is expected to exist
++}
+
+ //-----------------------------------------------------------------------------
+ // High-level access to an inline cache. Guaranteed to be MT-safe.
+
+
+-void CompiledIC::set_to_megamorphic(CallInfo* call_info, Bytecodes::Code bytecode, TRAPS) {
++void CompiledIC::set_to_megamorphic(CallInfo* call_info, Bytecodes::Code bytecode, oop protection_domain, TRAPS) {
+ methodHandle method = call_info->selected_method();
+ bool is_invoke_interface = (bytecode == Bytecodes::_invokeinterface && !call_info->has_vtable_index());
+ assert(CompiledIC_lock->is_locked() || SafepointSynchronize::is_at_safepoint(), "");
+@@ -130,9 +155,13 @@
+ assert(is_call_to_compiled() || is_call_to_interpreted(), "going directly to megamorphic?");
+
+ address entry;
++ if (call_info->is_tail_call()) {
++ assert(is_tail_call(), "Should match call_info->is_tail_call()");
++ set_protection_domain_token((oop)protection_domain);
++ }
+ if (is_invoke_interface) {
+ int index = klassItable::compute_itable_index(call_info->resolved_method()());
+- entry = VtableStubs::create_stub(false, index, method());
++ entry = VtableStubs::create_stub(false, index, method(), is_tail_call(), is_sibling());
+ assert(entry != NULL, "entry not computed");
+ klassOop k = call_info->resolved_method()->method_holder();
+ assert(Klass::cast(k)->is_interface(), "sanity check");
+@@ -140,7 +169,7 @@
+ } else {
+ // Can be different than method->vtable_index(), due to package-private etc.
+ int vtable_index = call_info->vtable_index();
+- entry = VtableStubs::create_stub(true, vtable_index, method());
++ entry = VtableStubs::create_stub(true, vtable_index, method(), is_tail_call(), is_sibling());
+ InlineCacheBuffer::create_transition_stub(this, method(), entry);
+ }
+
+@@ -238,9 +267,34 @@
+
+ address entry;
+ if (is_optimized()) {
+- entry = SharedRuntime::get_resolve_opt_virtual_call_stub();
++ switch (tail_call_type()) { // pick the optimized-virtual resolve stub matching the call kind
++ case relocInfo::not_tail_call:
++ entry = SharedRuntime::get_resolve_opt_virtual_call_stub();
++ break;
++ case relocInfo::sibling_tail_call_type:
++ entry = SharedRuntime::get_resolve_opt_virtual_tail_call_stub();
++ break;
++ case relocInfo::not_sibling_tail_call_type:
++ entry = SharedRuntime::get_resolve_opt_not_sibling_virtual_tail_call_stub();
++ break;
++ default: ShouldNotReachHere(); break; // unknown kind: stop in product too instead of using an unset entry
++ }
++
+ } else {
+- entry = SharedRuntime::get_resolve_virtual_call_stub();
++ switch (tail_call_type()) { // pick the virtual resolve stub matching the call kind
++ case relocInfo::not_tail_call:
++ entry = SharedRuntime::get_resolve_virtual_call_stub();
++ break;
++ case relocInfo::sibling_tail_call_type:
++ entry = SharedRuntime::get_resolve_virtual_tail_call_stub();
++ break;
++ case relocInfo::not_sibling_tail_call_type:
++ entry = SharedRuntime::get_resolve_not_sibling_virtual_tail_call_stub();
++ break;
++ default:
++ ShouldNotReachHere(); // unknown kind: stop in product too instead of using an unset entry
++ break;
++ }
+ }
+
+ // A zombie transition will always be safe, since the oop has already been set to NULL, so
+@@ -274,7 +328,11 @@
+ bool is_clean = false;
+ address dest = ic_destination();
+ is_clean = dest == SharedRuntime::get_resolve_opt_virtual_call_stub() ||
+- dest == SharedRuntime::get_resolve_virtual_call_stub();
++ dest == SharedRuntime::get_resolve_virtual_call_stub() ||
++ dest == SharedRuntime::get_resolve_virtual_tail_call_stub() ||
++ dest == SharedRuntime::get_resolve_not_sibling_virtual_tail_call_stub() ||
++ dest == SharedRuntime::get_resolve_opt_virtual_tail_call_stub() ||
++ dest == SharedRuntime::get_resolve_opt_not_sibling_virtual_tail_call_stub();
+ assert(!is_clean || is_optimized() || cached_oop() == NULL, "sanity check");
+ return is_clean;
+ }
+@@ -370,6 +428,8 @@
+ KlassHandle receiver_klass,
+ bool is_optimized,
+ bool static_bound,
++ bool is_tail_call,
++ bool is_sibling,
+ CompiledICInfo& info,
+ TRAPS) {
+ info._is_optimized = is_optimized;
+@@ -379,9 +439,41 @@
+ if (method_code != NULL) {
+ // Call to compiled code
+ if (static_bound || is_optimized) {
+- entry = method_code->verified_entry_point();
++ if (is_tail_call && is_sibling) {
++ if (TraceTailCalls) {
++ tty->print("CompiledIC::compute_monomorphic_entry():");
++ method->print_value_string();
++ tty->print_cr(" to compiled: static_tail_call_entry_point");
++ }
++ entry = method_code->static_tail_call_entry_point();
++ } else if (is_tail_call) {
++ if (TraceTailCalls) {
++ tty->print("CompiledIC::compute_monomorphic_entry():");
++ method->print_value_string();
++ tty->print_cr(" to compiled: static_not_sibling_tail_call_entry_point");
++ }
++ entry = method_code->static_not_sibling_tail_call_entry_point();
++ } else {
++ entry = method_code->verified_entry_point();
++ }
+ } else {
+- entry = method_code->entry_point();
++ if (is_tail_call && is_sibling) { // sibling tail call through the inline cache
++ if (TraceTailCalls) {
++ tty->print("CompiledIC::compute_monomorphic_entry():");
++ method->print_value_string();
++ tty->print_cr(" to compiled: monomorphic_tail_call_entry_point");
++ }
++ entry = method_code->monomorphic_tail_call_entry_point();
++ } else if (is_tail_call) { // non-sibling tail call through the inline cache
++ if (TraceTailCalls) {
++ tty->print("CompiledIC::compute_monomorphic_entry():");
++ method->print_value_string();
++ tty->print_cr(" to compiled: monomorphic_not_sibling_tail_call_entry_point");
++ }
++ entry = method_code->monomorphic_not_sibling_tail_call_entry_point();
++ } else {
++ entry = method_code->entry_point();
++ }
+ }
+ }
+ if (entry != NULL) {
+@@ -429,35 +521,66 @@
+ #endif // COMPILER2
+ if (is_optimized) {
+ // Use stub entry
+- info._entry = method()->get_c2i_entry();
++ if (is_tail_call && is_sibling) {
++ if (TraceTailCalls) {
++ tty->print("CompiledIC::compute_monomorphic_entry():");
++ method->print_value_string();
++ tty->print_cr(" to interpreter: get_c2i_static_tail_call_entry_point");
++ }
++ info._entry = method()->get_c2i_static_tail_call_entry();
++ } else if(is_tail_call) {
++ if (TraceTailCalls) {
++ tty->print("CompiledIC::compute_monomorphic_entry():");
++ method->print_value_string();
++ tty->print_cr(" to interpreter: get_c2i_not_sibling_static_tail_call_entry_point");
++ }
++ info._entry = method()->get_c2i_static_not_sibling_tail_call_entry();
++ } else
++ info._entry = method()->get_c2i_entry();
+ info._cached_oop = method;
+ } else {
+ // Use mkh entry
+ oop holder = oopFactory::new_compiledICHolder(method, receiver_klass, CHECK);
+ info._cached_oop = Handle(THREAD, holder);
+- info._entry = method()->get_c2i_unverified_entry();
++ if (is_tail_call && is_sibling) {
++ if (TraceTailCalls) {
++ tty->print("CompiledIC::compute_monomorphic_entry():");
++ method->print_value_string();
++ tty->print_cr(" to interpreter: get_c2i_unverified_tail_call_entry_point");
++ }
++ info._entry = method()->get_c2i_unverified_tail_call_entry();
++ } else if (is_tail_call) {
++ if (TraceTailCalls) {
++ tty->print("CompiledIC::compute_monomorphic_entry():");
++ method->print_value_string();
++ tty->print_cr(" to interpreter: get_c2i_unverified_not_sibling_tail_call_entry_point");
++ }
++ info._entry = method()->get_c2i_unverified_not_sibling_tail_call_entry();
++ } else
++ info._entry = method()->get_c2i_unverified_entry();
+ }
+ }
+ }
+
+
+-inline static RelocIterator parse_ic(CodeBlob* code, address ic_call, oop* &_oop_addr, bool *is_optimized) {
++inline static RelocIterator parse_ic(CodeBlob* code, address ic_call, oop* &_oop_addr, bool *is_optimized, relocInfo::tailCallType* type, address& first_set_oop_addr) {
+ address first_oop = NULL;
++ first_set_oop_addr = NULL;
+ // Mergers please note: Sun SC5.x CC insists on an lvalue for a reference parameter.
+ CodeBlob *code1 = code;
+- return virtual_call_Relocation::parse_ic(code1, ic_call, first_oop, _oop_addr, is_optimized);
++ return virtual_call_Relocation::parse_ic(code1, ic_call, first_set_oop_addr, _oop_addr, is_optimized, type);
+ }
+
+ CompiledIC::CompiledIC(NativeCall* ic_call)
+ : _ic_call(ic_call),
+- _oops(parse_ic(NULL, ic_call->instruction_address(), _oop_addr, &_is_optimized))
++ _oops(parse_ic(NULL, ic_call->instruction_address(), _oop_addr, &_is_optimized, &_tail_call_type, _first_set_oop_inst))
+ {
+ }
+
+
+ CompiledIC::CompiledIC(Relocation* ic_reloc)
+ : _ic_call(nativeCall_at(ic_reloc->addr())),
+- _oops(parse_ic(ic_reloc->code(), ic_reloc->addr(), _oop_addr, &_is_optimized))
++ _oops(parse_ic(ic_reloc->code(), ic_reloc->addr(), _oop_addr, &_is_optimized, &_tail_call_type, _first_set_oop_inst))
+ {
+ assert(ic_reloc->type() == relocInfo::virtual_call_type ||
+ ic_reloc->type() == relocInfo::opt_virtual_call_type, "wrong reloc. info");
+@@ -474,8 +597,17 @@
+ CodeBlob* cb = CodeCache::find_blob_unsafe(this);
+ assert(cb != NULL && cb->is_nmethod(), "must be nmethod");
+ #endif
+- set_destination_mt_safe(SharedRuntime::get_resolve_static_call_stub());
+-
++ address addr = instruction_address();
++ relocInfo::tailCallType tail_call_type;
++ static_call_Relocation::parse_static_call(addr, tail_call_type);
++ // Reset the call to the resolve stub matching the kind reported by parse_static_call().
++ if (tail_call_type == relocInfo::not_tail_call)
++ set_destination_mt_safe(SharedRuntime::get_resolve_static_call_stub());
++ else if (tail_call_type == relocInfo::sibling_tail_call_type)
++ set_destination_mt_safe(SharedRuntime::get_resolve_static_tail_call_stub());
++ else if (tail_call_type == relocInfo::not_sibling_tail_call_type)
++ set_destination_mt_safe(SharedRuntime::get_resolve_not_sibling_static_tail_call_stub());
++ else guarantee(false, "Something went terribly wrong here."); // checked in product too: the site must not silently stay un-cleaned
+ // Do not reset stub here: It is too expensive to call find_stub.
+ // Instead, rely on caller (nmethod::clear_inline_caches) to clear
+ // both the call and its stub.
+@@ -486,6 +618,12 @@
+ return destination() == SharedRuntime::get_resolve_static_call_stub();
+ }
+
++bool CompiledStaticCall::is_clean_static_tail_call() const {
++  const address target = destination(); // clean == still pointing at one of the two static tail-call resolve stubs
++  return target == SharedRuntime::get_resolve_static_tail_call_stub() || target == SharedRuntime::get_resolve_not_sibling_static_tail_call_stub();
++}
++
++
+ bool CompiledStaticCall::is_call_to_compiled() const {
+ return CodeCache::contains(destination());
+ }
+@@ -495,7 +633,13 @@
+ // It is a call to interpreted, if it calls to a stub. Hence, the destination
+ // must be in the stub part of the nmethod that contains the call
+ nmethod* nm = CodeCache::find_nmethod(instruction_address());
+- return nm->stub_contains(destination());
++ // Change: Because of tail calls there is another stub destination. So we need
++ // to check that the destination is not the tail call stub.
++ address dest = destination();
++ bool is_stub = nm->stub_contains(dest) && dest != nm->static_tail_call_entry_point() &&
++ dest != nm->static_not_sibling_tail_call_entry_point();
++ return is_stub;
++ //return nm->stub_contains(destination());
+ }
+
+
+@@ -550,20 +694,76 @@
+ }
+ }
+
++// Points a clean static tail-call site at the entry described by info: the
++// interpreter path goes through set_to_interpreted(), compiled code is patched directly.
++void CompiledStaticCall::set_tail_call(const StaticCallInfo& info) {
++  assert (CompiledIC_lock->is_locked() || SafepointSynchronize::is_at_safepoint(), "mt unsafe call");
++  MutexLockerEx pl(Patching_lock, Mutex::_no_safepoint_check_flag);
++  // Wrong cache entries cause bugs that are very hard to track down, so an invalid entry is
++  // cleaned rather than updated: one code path updates and resolves an inline cache.
++  assert(is_clean_static_tail_call(), "do not update a call entry - use clean");
++
++  if (info._to_interpreter) {
++    // Call to interpreted code
++    set_to_interpreted(info.callee(), info.entry());
++    return;
++  }
++
++  // Call to compiled code
++  if (TraceICs) {
++    ResourceMark rm;
++    tty->print_cr("CompiledStaticCall@" INTPTR_FORMAT ": set_to_static_tail_compiled " INTPTR_FORMAT,
++                  instruction_address(), info.entry());
++  }
++  assert (CodeCache::contains(info.entry()), "wrong entry point");
++  set_destination_mt_safe(info.entry());
++}
++
+
+ // Compute settings for a CompiledStaticCall. Since we might have to set
+ // the stub when calling to the interpreter, we need to return arguments.
+-void CompiledStaticCall::compute_entry(methodHandle m, StaticCallInfo& info) {
++void CompiledStaticCall::compute_entry(methodHandle m, StaticCallInfo& info, bool is_tail_call, bool is_sibling_call) {
+ nmethod* m_code = m->code();
+ info._callee = m;
+ if (m_code != NULL) {
+ info._to_interpreter = false;
+- info._entry = m_code->verified_entry_point();
++ if (is_tail_call && is_sibling_call) {
++ if (TraceTailCalls) {
++ tty->print("CompiledStaticCall:compute_entry() ");
++ m->print_value_string();
++ tty->print_cr(" to compiled: static_tail_call_entry_point");
++ }
++ info._entry = m_code->static_tail_call_entry_point();
++ } else if (is_tail_call) {
++ if (TraceTailCalls) {
++ tty->print("CompiledStaticCall:compute_entry() ");
++ m->print_value_string();
++ tty->print_cr(" to compiled: static_not_sibling_tail_call_entry_point");
++ }
++ info._entry = m_code->static_not_sibling_tail_call_entry_point();
++ } else {
++ info._entry = m_code->verified_entry_point();
++ }
+ } else {
+ // Callee is interpreted code. In any case entering the interpreter
+ // puts a converter-frame on the stack to save arguments.
+ info._to_interpreter = true;
+- info._entry = m()->get_c2i_entry();
++ if (is_tail_call && is_sibling_call) {
++ if (TraceTailCalls) {
++ tty->print("CompiledStaticCall:compute_entry() ");
++ m->print_value_string();
++ tty->print_cr(" to interpreter: get_c2i_static_tail_call_entry");
++ }
++ info._entry = m()->get_c2i_static_tail_call_entry();
++ } else if (is_tail_call) {
++ if (TraceTailCalls) {
++ tty->print("CompiledStaticCall:compute_entry() ");
++ m->print_value_string();
++ tty->print_cr(" to interpreter: get_c2i_not_sibling_static_tail_call_entry");
++ }
++ info._entry = m()->get_c2i_static_not_sibling_tail_call_entry();
++ } else
++ info._entry = m()->get_c2i_entry();
+ }
+ }
+
+@@ -642,6 +842,22 @@
+ tty->cr();
+ }
+
++void CompiledStaticCall::verify_static_tail_call() {
++ // Verify the call instruction itself; on MP systems also check its alignment.
++ NativeCall::verify();
++ if (os::is_MP()) {
++ verify_alignment_static_tail_call();
++ }
++
++ // Verify stub: one must exist and is wrapped as a mov-constant followed by a jump.
++ address stub = find_stub();
++ assert(stub != NULL, "no stub found for static call");
++ NativeMovConstReg* method_holder = nativeMovConstReg_at(stub); // creation also verifies the object
++ NativeJump* jump = nativeJump_at(method_holder->next_instruction_address());
++ // Verify state: unlike verify(), only a clean static tail-call site is accepted here.
++ assert(is_clean_static_tail_call(), "sanity check");
++}
++
+ void CompiledStaticCall::verify() {
+ // Verify call
+ NativeCall::verify();
+@@ -654,7 +870,6 @@
+ assert(stub != NULL, "no stub found for static call");
+ NativeMovConstReg* method_holder = nativeMovConstReg_at(stub); // creation also verifies the object
+ NativeJump* jump = nativeJump_at(method_holder->next_instruction_address());
+-
+ // Verify state
+ assert(is_clean() || is_call_to_compiled() || is_call_to_interpreted(), "sanity check");
+ }
+diff --git a/src/share/vm/code/compiledIC.hpp b/src/share/vm/code/compiledIC.hpp
+--- a/src/share/vm/code/compiledIC.hpp
++++ b/src/share/vm/code/compiledIC.hpp
+@@ -76,6 +76,8 @@
+ oop* _oop_addr; // patchable oop cell for this IC
+ RelocIterator _oops; // iteration over any and all set-oop instructions
+ bool _is_optimized; // an optimized virtual call (i.e., no compiled IC)
++ relocInfo::tailCallType _tail_call_type; // Whether a tail call and what kind.
++ address _first_set_oop_inst;
+
+ CompiledIC(NativeCall* ic_call);
+ CompiledIC(Relocation* ic_reloc); // Must be of virtual_call_type/opt_virtual_call_type
+@@ -86,6 +88,9 @@
+ void set_ic_destination(address entry_point);
+ void set_cached_oop(oop cache);
+
++ // Megamorphic tail calls need to set the protection domain token.
++ void set_protection_domain_token(oop protection_domain);
++
+ // Reads the location of the transition stub. This will fail with an assertion, if no transition stub is
+ // associated with the inline cache.
+ address stub_address() const;
+@@ -103,6 +108,9 @@
+ address ic_destination() const;
+
+ bool is_optimized() const { return _is_optimized; }
++ relocInfo::tailCallType tail_call_type() { return _tail_call_type; }
++ bool is_tail_call() { return _tail_call_type != relocInfo::not_tail_call; }
++ bool is_sibling() { return _tail_call_type == relocInfo::sibling_tail_call_type; }
+
+ // State
+ bool is_clean() const;
+@@ -122,10 +130,11 @@
+ //
+ void set_to_clean(); // Can only be called during a safepoint operation
+ void set_to_monomorphic(const CompiledICInfo& info);
+- void set_to_megamorphic(CallInfo* call_info, Bytecodes::Code bytecode, TRAPS);
++ void set_to_megamorphic(CallInfo* call_info, Bytecodes::Code bytecode, oop protection_domain, TRAPS);
+
+ static void compute_monomorphic_entry(methodHandle method, KlassHandle receiver_klass,
+- bool is_optimized, bool static_bound, CompiledICInfo& info, TRAPS);
++ bool is_optimized, bool static_bound, bool is_tail_call,
++ bool is_sibling, CompiledICInfo& info, TRAPS);
+
+ // Location
+ address instruction_address() const { return _ic_call->instruction_address(); }
+@@ -191,7 +200,7 @@
+
+ // Also used by CompiledIC
+ void set_to_interpreted(methodHandle callee, address entry);
+- bool is_optimized_virtual();
++ //bool is_optimized_virtual(); seems to be dead?
+
+ public:
+ friend CompiledStaticCall* compiledStaticCall_before(address return_addr);
+@@ -200,6 +209,8 @@
+
+ // State
+ bool is_clean() const;
++ bool is_clean_static_tail_call() const;
++
+ bool is_call_to_compiled() const;
+ bool is_call_to_interpreted() const;
+
+@@ -210,9 +221,11 @@
+ // Computation and setting is split up, since the actions are separate during
+ // a OptoRuntime::resolve_xxx.
+ void set(const StaticCallInfo& info);
++ // Convert this call to a jump (to the tail call stub).
++ void set_tail_call(const StaticCallInfo& info);
+
+ // Compute entry point given a method
+- static void compute_entry(methodHandle m, StaticCallInfo& info);
++ static void compute_entry(methodHandle m, StaticCallInfo& info, bool is_tail_call=false, bool is_sibling_call=false);
+
+ // Stub support
+ address find_stub();
+@@ -221,6 +234,8 @@
+ // Misc.
+ void print() PRODUCT_RETURN;
+ void verify() PRODUCT_RETURN;
++ void verify_static_tail_call() PRODUCT_RETURN;
++ void verify_alignment_static_tail_call() {assert((intptr_t)addr_at(instruction_offset) % BytesPerInt == 0, "must be aligned");}
+ };
+
+
+@@ -230,6 +245,12 @@
+ return st;
+ }
+
++inline CompiledStaticCall* compiledStaticTailCall_before(address return_addr) {
++  CompiledStaticCall* const call = (CompiledStaticCall*)nativeCall_before(return_addr); // call ending at return_addr
++  call->verify_static_tail_call(); // tail-call flavour of the sanity check done by compiledStaticCall_before
++  return call;
++}
++
+ inline CompiledStaticCall* compiledStaticCall_at(address native_call) {
+ CompiledStaticCall* st = (CompiledStaticCall*)native_call;
+ st->verify();
+diff --git a/src/share/vm/code/nmethod.cpp b/src/share/vm/code/nmethod.cpp
+--- a/src/share/vm/code/nmethod.cpp
++++ b/src/share/vm/code/nmethod.cpp
+@@ -786,6 +786,12 @@
+ // Exception handler and deopt handler are in the stub section
+ _exception_offset = _stub_offset + offsets->value(CodeOffsets::Exceptions);
+ _deoptimize_offset = _stub_offset + offsets->value(CodeOffsets::Deopt);
++ _static_tail_call_offset = _stub_offset + offsets->value(CodeOffsets::Static_Tail_Call_Entry);
++ _static_not_sibling_tail_call_offset = _stub_offset + offsets->value(CodeOffsets::Static_Not_Sibling_Tail_Call_Entry);
++ _static_not_sibling_tail_call_set_data_offset = _stub_offset + offsets->value(CodeOffsets::Static_Not_Sibling_Tail_Call_Set_Data_Entry);
++ _monomorphic_tail_call_offset = _stub_offset + offsets->value(CodeOffsets::Monomorphic_Tail_Call_Entry);
++ _monomorphic_not_sibling_tail_call_offset = _stub_offset + offsets->value(CodeOffsets::Monomorphic_Not_Sibling_Tail_Call_Entry);
++ _monomorphic_not_sibling_tail_call_set_data_offset = _stub_offset + offsets->value(CodeOffsets::Monomorphic_Not_Sibling_Tail_Call_Set_Data_Entry);
+ _consts_offset = instructions_offset() + code_buffer->total_offset_of(code_buffer->consts()->start());
+ _scopes_data_offset = data_offset();
+ _scopes_pcs_offset = _scopes_data_offset + round_to(debug_info->data_size (), oopSize);
+@@ -797,6 +803,19 @@
+ _entry_point = instructions_begin();
+ _verified_entry_point = instructions_begin() + offsets->value(CodeOffsets::Verified_Entry);
+ _osr_entry_point = instructions_begin() + offsets->value(CodeOffsets::OSR_Entry);
++ _static_tail_call_entry_point = static_tail_call_begin();
++ _static_not_sibling_tail_call_entry_point = static_not_sibling_tail_call_begin();
++ _monomorphic_tail_call_entry_point = monomorphic_tail_call_begin();
++ _monomorphic_not_sibling_tail_call_entry_point = monomorphic_not_sibling_tail_call_begin();
++
++#ifdef ASSERT
++ // Check entry point alignment: when making the nmethod not entrant or
++ // zombie the tail call entry points need to be aligned properly.
++ NativeJump::check_verified_entry_alignment(0, _static_tail_call_entry_point);
++ NativeJump::check_verified_entry_alignment(0, _static_not_sibling_tail_call_entry_point);
++ NativeJump::check_verified_entry_alignment(0, _monomorphic_tail_call_entry_point);
++ NativeJump::check_verified_entry_alignment(0, _monomorphic_not_sibling_tail_call_entry_point);
++#endif // ASSERT (tested with #ifdef so an empty definition of ASSERT also works)
+ _exception_cache = NULL;
+ _pc_desc_cache.reset_to(scopes_pcs_begin());
+
+@@ -970,6 +989,44 @@
+ }
+
+
++// Patches the tail call stub at move_addr: stores method in the mov, fixes the matching oop relocation in buffer, and points the following jump at the adapter's c2i entry.
++static void set_adapter_info_in_tail_call_stub_helper(CodeBlob * buffer, methodOop method, address move_addr, AdapterHandlerEntry* adapter) {
++ // Expected instruction sequence at move_addr:
++ // mov ebx, [oop]
++ // jmp [c2i]
++ NativeMovConstReg* method_holder = nativeMovConstReg_at(move_addr);
++ NativeJump* jump = nativeJump_at(method_holder->next_instruction_address()); // the jmp directly after the mov
++ method_holder->set_data((intptr_t)method); // embed the methodOop as the mov's constant
++ assert(method->is_perm(), "Must be in permament gen.");
++ // Fix up the oop relocation that describes the mov's constant.
++ RelocIterator iter(buffer, (address)method_holder, ((address) method_holder)+1);
++ oop* oop_addr = (oop*) (((address)method_holder)+NativeMovConstReg::data_offset); // where the constant lives inside the mov
++ bool is_fixed = false;
++ while (iter.next()) {
++ if (iter.type() == relocInfo::oop_type) {
++ oop_Relocation* r = iter.oop_reloc();
++ if (r->oop_addr() == oop_addr) {
++ r->fix_oop_relocation();
++ is_fixed = true;
++ }
++ }
++ }
++ assert (is_fixed, "Oop relocation fixed"); // fires if no oop relocation matched oop_addr
++ jump->set_jump_destination(adapter->get_c2i_entry_skip_fixup());
++}
++
++void nmethod::set_adapter_info_in_tail_call_stubs(methodOop method, AdapterHandlerEntry* adapter) { // Writes method and adapter into both "not sibling" tail call stubs of this nmethod.
++ guarantee(is_java_method(), "only works for java methods");
++ // Currently only c1 supports tail calls.
++ guarantee(is_compiled_by_c1(), "currently only supported by c1");
++
++ // Static not sibling tail call entry.
++ address move_addr = header_begin() + _static_not_sibling_tail_call_set_data_offset;
++ set_adapter_info_in_tail_call_stub_helper(this, method, move_addr, adapter);
++ move_addr = header_begin() + _monomorphic_not_sibling_tail_call_set_data_offset; // Monomorphic not sibling tail call entry.
++ set_adapter_info_in_tail_call_stub_helper(this, method, move_addr, adapter);
++}
++
+ void nmethod::clear_inline_caches() {
+ assert(SafepointSynchronize::is_at_safepoint(), "cleaning of IC's only allowed at safepoint");
+ if (is_zombie()) {
+@@ -1173,6 +1230,18 @@
+ // The caller can be calling the method statically or through an inline
+ // cache call.
+ if (!is_not_entrant()) {
++ // Patch tail call entries before the verified entry point otherwise we
++ // might get to a patched verified entry point with a popped frame and
++ // get_handle_wrong_method_stub would see a wrong return address (of the
++ // parent frame).
++ NativeJump::patch_verified_entry(0, static_tail_call_entry_point(),
++ SharedRuntime::get_handle_wrong_method_stub());
++ NativeJump::patch_verified_entry(0, static_not_sibling_tail_call_entry_point(),
++ SharedRuntime::get_handle_wrong_method_stub());
++ NativeJump::patch_verified_entry(0, monomorphic_tail_call_entry_point(),
++ SharedRuntime::get_handle_wrong_method_stub());
++ NativeJump::patch_verified_entry(0, monomorphic_not_sibling_tail_call_entry_point(),
++ SharedRuntime::get_handle_wrong_method_stub());
+ NativeJump::patch_verified_entry(entry_point(), verified_entry_point(),
+ SharedRuntime::get_handle_wrong_method_stub());
+ assert (NativeJump::instruction_size == nmethod::_zombie_instruction_size, "");
+@@ -2171,6 +2240,7 @@
+ case relocInfo::poll_type: return "poll";
+ case relocInfo::poll_return_type: return "poll_return";
+ case relocInfo::type_mask: return "type_bit_mask";
++ case relocInfo::section_call_type: return "section_call";
+ }
+ }
+ return have_one ? "other" : NULL;
+@@ -2220,8 +2290,9 @@
+ st->print("method is native");
+ } else {
+ address bcp = sd->method()->bcp_from(sd->bci());
+- Bytecodes::Code bc = Bytecodes::java_code_at(bcp);
+- st->print(";*%s", Bytecodes::name(bc));
++ Bytecodes::Prefix pfx;
++ Bytecodes::Code bc = Bytecodes::java_code_at(bcp, pfx);
++ st->print(";*%s%s", Bytecodes::prefix_name(pfx), Bytecodes::name(bc));
+ switch (bc) {
+ case Bytecodes::_invokevirtual:
+ case Bytecodes::_invokespecial:
+diff --git a/src/share/vm/code/nmethod.hpp b/src/share/vm/code/nmethod.hpp
+--- a/src/share/vm/code/nmethod.hpp
++++ b/src/share/vm/code/nmethod.hpp
+@@ -151,7 +151,12 @@
+ int _handler_table_offset;
+ int _nul_chk_table_offset;
+ int _nmethod_end_offset;
+-
++ int _static_tail_call_offset;
++ int _static_not_sibling_tail_call_offset;
++ int _static_not_sibling_tail_call_set_data_offset;
++ int _monomorphic_tail_call_offset;
++ int _monomorphic_not_sibling_tail_call_offset;
++ int _monomorphic_not_sibling_tail_call_set_data_offset;
+ // location in frame (offset for sp) that deopt can store the original
+ // pc during a deopt.
+ int _orig_pc_offset;
+@@ -163,7 +168,11 @@
+ address _entry_point; // entry point with class check
+ address _verified_entry_point; // entry point without class check
+ address _osr_entry_point; // entry point for on stack replacement
+-
++ address _static_tail_call_entry_point; // entry point without class and
++ // protection domain check
++ address _static_not_sibling_tail_call_entry_point;
++ address _monomorphic_tail_call_entry_point; // with class, without pd check
++ address _monomorphic_not_sibling_tail_call_entry_point;
+ nmFlags flags; // various flags to keep track of nmethod state
+ bool _markedForDeoptimization; // Used for stack deoptimization
+ enum { alive = 0,
+@@ -338,7 +347,10 @@
+ address handler_table_end () const { return header_begin() + _nul_chk_table_offset ; }
+ address nul_chk_table_begin() const { return header_begin() + _nul_chk_table_offset ; }
+ address nul_chk_table_end () const { return header_begin() + _nmethod_end_offset ; }
+-
++ address static_tail_call_begin() const { return header_begin() + _static_tail_call_offset; }
++ address static_not_sibling_tail_call_begin() const { return header_begin() + _static_not_sibling_tail_call_offset; }
++ address monomorphic_tail_call_begin() const { return header_begin() + _monomorphic_tail_call_offset; }
++ address monomorphic_not_sibling_tail_call_begin() const { return header_begin() + _monomorphic_not_sibling_tail_call_offset; }
+ int code_size () const { return code_end () - code_begin (); }
+ int stub_size () const { return stub_end () - stub_begin (); }
+ int consts_size () const { return consts_end () - consts_begin (); }
+@@ -361,7 +373,14 @@
+ // entry points
+ address entry_point() const { return _entry_point; } // normal entry point
+ address verified_entry_point() const { return _verified_entry_point; } // if klass is correct
+-
++ address static_tail_call_entry_point() const { return _static_tail_call_entry_point; } // if klass and protection domain (pd) are correct
++ address static_not_sibling_tail_call_entry_point() const { return _static_not_sibling_tail_call_entry_point; } // if klass and protection domain (pd) are correct
++ address monomorphic_tail_call_entry_point() const { return _monomorphic_tail_call_entry_point; } // entry with klass
++ // check; the
++ // protection domain
++ // (pd) is taken
++ // to be correct
++ address monomorphic_not_sibling_tail_call_entry_point() const { return _monomorphic_not_sibling_tail_call_entry_point; }
+ // flag accessing and manipulation
+ bool is_in_use() const { return flags.state == alive; }
+ bool is_alive() const { return flags.state == alive || flags.state == not_entrant; }
+@@ -432,6 +451,8 @@
+ // note: native wrappers cannot be deoptimized.
+ bool can_be_deoptimized() const { return is_java_method(); }
+
++ // Tail call support
++ void set_adapter_info_in_tail_call_stubs(methodOop method, AdapterHandlerEntry* adapter);
+ // Inline cache support
+ void clear_inline_caches();
+ void cleanup_inline_caches();
+@@ -581,7 +602,7 @@
+ static int verified_entry_point_offset() { return offset_of(nmethod, _verified_entry_point); }
+ static int osr_entry_point_offset() { return offset_of(nmethod, _osr_entry_point); }
+ static int entry_bci_offset() { return offset_of(nmethod, _entry_bci); }
+-
++ static int static_tail_call_entry_point_offset() { return offset_of(nmethod, _static_tail_call_entry_point); }
+ };
+
+ // Locks an nmethod so its code will not get removed, even if it is a zombie/not_entrant method
+diff --git a/src/share/vm/code/relocInfo.cpp b/src/share/vm/code/relocInfo.cpp
+--- a/src/share/vm/code/relocInfo.cpp
++++ b/src/share/vm/code/relocInfo.cpp
+@@ -106,6 +106,37 @@
+ assert(found, "no relocInfo found for pc");
+ }
+
++void relocInfo::change_reloc_info_for_address(RelocIterator *itr, address pc, relocType old_type, relocType new_type, address target, int section) { // Retypes the static_call reloc at pc to section_call and rewrites its data in place.
++ bool found = false;
++ assert(old_type == relocInfo::static_call_type, "only works for static call");
++ assert(new_type == relocInfo::section_call_type, "only works for inter section call");
++ while (itr->next() && !found) { // note: next() is evaluated before the !found test
++ if (itr->addr() == pc) {
++ assert(itr->type()==old_type, "wrong relocInfo type found");
++ itr->current()->set_type(new_type);
++ // Overwrite the reloc's data with the section_call encoding.
++ short* p = (short*)itr->data();
++ assert(itr->datalen() == 2, "Sanity check."); // NOTE(review): static_call_Relocation::unpack_data asserts datalen() == 3 -- confirm which length is right
++ assert(itr->code()->is_nmethod(), "oops");
++ nmethod * code = (nmethod*)itr->code();
++ int sindex = section;
++
++ assert(sindex != CodeBuffer::SECT_NONE, "must belong somewhere");
++ assert(target != NULL, "sanity");
++ assert(sindex == CodeBuffer::SECT_STUBS, "assume section stub");
++ address base = code->stub_begin(); // offset is taken relative to the nmethod's stub section
++ jint offset = Relocation::scaled_offset(target, base);
++ assert((uint)sindex < (uint)CodeBuffer::SECT_LIMIT, "sanity");
++ assert(CodeBuffer::SECT_LIMIT <= (1 << section_call_Relocation::section_width), "section_width++");
++ p = Relocation::add_jint(p, (offset << section_call_Relocation::section_width) | sindex); // same layout section_call_Relocation::unpack_data decodes
++
++ // TODO(review): the original marker here read "TODO: end"; it is unclear what remains to be done.
++ found=true;
++ }
++ }
++ assert(found, "no relocInfo found for pc");
++}
++
+
+ void relocInfo::remove_reloc_info_for_address(RelocIterator *itr, address pc, relocType old_type) {
+ change_reloc_info_for_address(itr, pc, old_type, none);
+@@ -404,6 +435,31 @@
+ }
+
+
++relocInfo::tailCallType RelocIterator::tail_call_type() { // Returns the tail call type stored in the current static/opt_virtual/virtual call relocation.
++ assert(has_current(), "must have a reloc");
++
++ relocInfo::relocType type = reloc()->type();
++ assert(type==relocInfo::static_call_type || type==relocInfo::virtual_call_type ||
++ type==relocInfo::opt_virtual_call_type, "must be a call type");
++
++ relocInfo::tailCallType tail_call_type; // assigned in each of the three call cases below
++ switch(type) {
++ case relocInfo::static_call_type: {
++ static_call_Relocation * r = static_call_reloc();
++ tail_call_type = r->tail_call_type(); }
++ break;
++ case relocInfo::opt_virtual_call_type: {
++ opt_virtual_call_Relocation* r = opt_virtual_call_reloc();
++ tail_call_type = r->tail_call_type(); }
++ break;
++ case relocInfo::virtual_call_type: {
++ virtual_call_Relocation * r = virtual_call_reloc();
++ tail_call_type = r->tail_call_type(); }
++ break;
++ default: guarantee(0, "should not get here");break; // any other relocation type is rejected here
++ }
++ return tail_call_type;
++}
+ //////// Methods for flyweight Relocation types
+
+
+@@ -579,18 +635,43 @@
+ normalize_address(_oop_limit, dest);
+ jint x0 = scaled_offset_null_special(_first_oop, point);
+ jint x1 = scaled_offset_null_special(_oop_limit, point);
+- p = pack_2_ints_to(p, x0, x1);
++ //p = pack_2_ints_to(p, x0, x1); added add_short so pack_2_ints_logic might
++ //not work.
++ p = add_jint(p, x0);
++ p = add_jint(p, x1);
++ p = add_short(p, (short)_tail_call_type);
+ dest->set_locs_end((relocInfo*) p);
+ }
+
+
+ void virtual_call_Relocation::unpack_data() {
+- jint x0, x1; unpack_2_ints(x0, x1);
++ jint x0, x1;
++ short * p = data();
++ int dlen = datalen();
++ //short* p = unpack_2_ints(x0, x1);
++ x0 = relocInfo::jint_data_at(0, p, dlen);
++ x1 = relocInfo::jint_data_at(2, p, dlen);
++
+ address point = addr();
+ _first_oop = x0==0? NULL: address_from_scaled_offset(x0, point);
+ _oop_limit = x1==0? NULL: address_from_scaled_offset(x1, point);
++ //_tail_call_type = (relocInfo::tailCallType)relocInfo::short_data_at(0, p,
++ //1);
++ _tail_call_type = (relocInfo::tailCallType)relocInfo::short_data_at(4, p, dlen);
+ }
+
++// Packs the tail call type as a single short appended at dest's locs_end.
++void opt_virtual_call_Relocation::pack_data_to(CodeSection* dest) {
++ short* p = (short*) dest->locs_end();
++ p = add_short(p, (short)_tail_call_type);
++ dest->set_locs_end((relocInfo*) p); // advance locs_end past the short just written
++}
++
++void opt_virtual_call_Relocation::unpack_data() { // Reads the tail call type back from the single data short.
++ assert(datalen()==1, "data length must be 1");
++ short* p = data();
++ _tail_call_type = (relocInfo::tailCallType)relocInfo::short_data_at(0, p, 1);
++}
+
+ void static_stub_Relocation::pack_data_to(CodeSection* dest) {
+ short* p = (short*) dest->locs_end();
+@@ -702,6 +783,69 @@
+ _target = address_from_scaled_offset(offset, base);
+ }
+
++void static_call_Relocation::pack_data_to(CodeSection* dest) { // Packs a placeholder jint followed by the tail call type.
++ short* p = (short*) dest->locs_end();
++ // Store a dummy jint; the header says this size accommodates a later transformation to section_call_Relocation.
++ p = add_jint(p, 0);
++ p = add_short(p, (short)_tail_call_type);
++ dest->set_locs_end((relocInfo*) p);
++}
++
++void static_call_Relocation::unpack_data() { // Reads the tail call type that follows the placeholder jint.
++ assert(datalen() == 3, "data length must be 3");
++ // Mirror pack_data_to: read past the placeholder jint, then fetch the tail call type.
++ short* p = data();
++ jint dummy = relocInfo::jint_from_data(p); // value is not used afterwards
++ p+=2; // advance over int
++ _tail_call_type = (relocInfo::tailCallType)relocInfo::short_data_at(0, p, 1);
++
++}
++
++void section_call_Relocation::pack_data_to(CodeSection* dest) { // Packs (scaled offset << section_width) | section index as one jint.
++ short* p = (short*) dest->locs_end();
++ normalize_address(_target, dest, true);
++
++ // The section index is taken from _section as given; the guarantee below
++ // requires the target to lie inside that section (nothing is re-targeted).
++ int sindex = _section;
++
++ assert(sindex != CodeBuffer::SECT_NONE, "must belong somewhere");
++ assert(_target != NULL, "sanity");
++
++ CodeSection* sect = dest->outer()->code_section(sindex);
++ guarantee(sect->allocates2(_target), "must be in correct section");
++ address base = sect->start(); // offset is measured from the start of the chosen section
++ jint offset = scaled_offset(_target, base);
++ assert((uint)sindex < (uint)CodeBuffer::SECT_LIMIT, "sanity");
++ assert(CodeBuffer::SECT_LIMIT <= (1 << section_width), "section_width++");
++ //p = pack_1_int_to(p, (offset << section_width) | sindex);
++ // Use add_jint instead of the call above: the record should have a fixed size.
++ p = add_jint(p, (offset << section_width) | sindex);
++
++ dest->set_locs_end((relocInfo*) p);
++}
++
++void section_call_Relocation::unpack_data() { // Splits the packed int into section index and offset and rebuilds _target.
++ jint x = unpack_1_int();
++ jint offset = (x >> section_width); // upper bits: scaled offset
++ int sindex = (x & ((1<<section_width)-1)); // low section_width bits: section index
++ address base = binding()->section_start(sindex);
++
++ _section = sindex;
++ _target = address_from_scaled_offset(offset, base);
++}
++
++void section_call_Relocation::fix_relocation_after_move(const CodeBuffer* src, CodeBuffer* dest) { // Re-applies the target via set_value() after the code moved from src to dest.
++ address target = _target;
++ if (target == NULL) { // no recorded target: translate the address currently stored at this site
++ if (addr_in_const()) {
++ target = new_addr_for(*(address*)addr(), src, dest); // site is in the constants: read the stored address directly
++ } else {
++ target = new_addr_for(pd_get_address_from_code(), src, dest); // otherwise take it from the instruction
++ }
++ }
++ set_value(target);
++}
+
+ void breakpoint_Relocation::pack_data_to(CodeSection* dest) {
+ short* p = (short*) dest->locs_end();
+@@ -775,9 +919,21 @@
+ }
+ }
+
+-
++void static_call_Relocation::parse_static_call(address &static_call, relocInfo::tailCallType & tail_call_type) { // Looks up the reloc at static_call and reports its tail call type through tail_call_type.
++ CodeBlob * code = CodeCache::find_blob(static_call);
++ RelocIterator iter(code, static_call, static_call+1); // presumably limits iteration to this one address -- confirm
++ bool ret = iter.next();
++ assert(ret == true, "relocInfo must exist at this address");
++ assert(iter.addr() == static_call, "must find static call");
++ if (iter.type()==relocInfo::static_call_type) {
++ static_call_Relocation * r = iter.static_call_reloc();
++ tail_call_type = r->tail_call_type();
++ } else {
++ assert(0, "must be a static call"); // tail_call_type is left unmodified on this path
++ }
++}
+ RelocIterator virtual_call_Relocation::parse_ic(CodeBlob* &code, address &ic_call, address &first_oop,
+- oop* &oop_addr, bool *is_optimized) {
++ oop* &oop_addr, bool *is_optimized, relocInfo::tailCallType* tail_call_type) {
+ assert(ic_call != NULL, "ic_call address must be set");
+ assert(ic_call != NULL || first_oop != NULL, "must supply a non-null input");
+ if (code == NULL) {
+@@ -803,12 +959,15 @@
+ virtual_call_Relocation* r = iter.virtual_call_reloc();
+ first_oop = r->first_oop();
+ oop_limit = r->oop_limit();
++ *tail_call_type = r->tail_call_type();
+ *is_optimized = false;
+ } else {
+ assert(iter.type() == relocInfo::opt_virtual_call_type, "must be a virtual call");
++ opt_virtual_call_Relocation* r = iter.opt_virtual_call_reloc();
+ *is_optimized = true;
+ oop_addr = NULL;
+ first_oop = NULL;
++ *tail_call_type = r->tail_call_type();
+ return iter;
+ }
+ }
+@@ -865,7 +1024,9 @@
+ return _oop_limit;
+ }
+
+-
++relocInfo::tailCallType virtual_call_Relocation::tail_call_type() { // plain accessor for _tail_call_type
++ return _tail_call_type;
++}
+
+ void virtual_call_Relocation::clear_inline_cache() {
+ // No stubs for ICs
+@@ -875,6 +1036,9 @@
+ icache->set_to_clean();
+ }
+
++relocInfo::tailCallType opt_virtual_call_Relocation::tail_call_type() { // plain accessor for _tail_call_type
++ return _tail_call_type;
++}
+
+ void opt_virtual_call_Relocation::clear_inline_cache() {
+ // No stubs for ICs
+@@ -900,6 +1064,10 @@
+ }
+
+
++relocInfo::tailCallType static_call_Relocation::tail_call_type() { // plain accessor for _tail_call_type
++ return _tail_call_type;
++}
++
+ void static_call_Relocation::clear_inline_cache() {
+ // Safe call site info
+ CompiledStaticCall* handler = compiledStaticCall_at(this);
+diff --git a/src/share/vm/code/relocInfo.hpp b/src/share/vm/code/relocInfo.hpp
+--- a/src/share/vm/code/relocInfo.hpp
++++ b/src/share/vm/code/relocInfo.hpp
+@@ -261,12 +261,17 @@
+ poll_type = 10, // polling instruction for safepoints
+ poll_return_type = 11, // polling instruction for safepoints at return
+ breakpoint_type = 12, // an initialization barrier or safepoint
+- yet_unused_type = 13, // Still unused
++ section_call_type = 13,
++ //yet_unused_type = 13, // Still unused
+ yet_unused_type_2 = 14, // Still unused
+ data_prefix_tag = 15, // tag for a prefix (carries data arguments)
+ type_mask = 15 // A mask which selects only the above values
+ };
+-
++ enum tailCallType { // tail call kind stored with static, opt_virtual and virtual call relocations
++ not_tail_call = 0, // default used by the spec() factories
++ sibling_tail_call_type = 1,
++ not_sibling_tail_call_type = 2
++ };
+ protected:
+ unsigned short _value;
+
+@@ -302,7 +307,7 @@
+ visitor(poll_return) \
+ visitor(breakpoint) \
+ visitor(section_word) \
+-
++ visitor(section_call) \
+
+ public:
+ enum {
+@@ -412,6 +417,7 @@
+ // (since code is dynamically patched, we also need to dynamically update the relocation info)
+ // Both methods takes old_type, so it is able to performe sanity checks on the information removed.
+ static void change_reloc_info_for_address(RelocIterator *itr, address pc, relocType old_type, relocType new_type);
++ static void change_reloc_info_for_address(RelocIterator *itr, address pc, relocType old_type, relocType new_type, address target, int section);
+ static void remove_reloc_info_for_address(RelocIterator *itr, address pc, relocType old_type);
+
+ // Machine dependent stuff
+@@ -616,6 +622,8 @@
+ #undef EACH_TYPE
+ // generic relocation accessor; switches on type to call the above
+ Relocation* reloc();
++ // Get the tail call type of current relocation provided it is a call.
++ relocInfo::tailCallType tail_call_type();
+
+ // CodeBlob's have relocation indexes for faster random access:
+ static int locs_and_index_size(int code_size, int locs_size);
+@@ -732,7 +740,7 @@
+ }
+ return p;
+ }
+- void unpack_2_ints(jint& x0, jint& x1) {
++ short* unpack_2_ints(jint& x0, jint& x1) {
+ int dlen = datalen();
+ short* dp = data();
+ if (dlen <= 2) {
+@@ -743,6 +751,7 @@
+ x0 = relocInfo::jint_data_at(0, dp, dlen);
+ x1 = relocInfo::jint_data_at(2, dp, dlen);
+ }
++ return dp;
+ }
+
+ protected:
+@@ -821,6 +830,8 @@
+ virtual void fix_relocation_after_move(const CodeBuffer* src, CodeBuffer* dest) { }
+
+ void print();
++
++ friend class relocInfo;
+ };
+
+
+@@ -948,20 +959,24 @@
+ // "first_oop" points to the first associated set-oop.
+ // The oop_limit helps find the last associated set-oop.
+ // (See comments at the top of this file.)
+- static RelocationHolder spec(address first_oop, address oop_limit = NULL) {
++ static RelocationHolder spec(address first_oop, address oop_limit = NULL,
++ relocInfo::tailCallType tail_call_type = relocInfo::not_tail_call) {
+ RelocationHolder rh = newHolder();
+- new(rh) virtual_call_Relocation(first_oop, oop_limit);
++ new(rh) virtual_call_Relocation(first_oop, oop_limit, tail_call_type);
+ return rh;
+ }
+
+- virtual_call_Relocation(address first_oop, address oop_limit) {
++ virtual_call_Relocation(address first_oop, address oop_limit,
++ relocInfo::tailCallType tail_call_type = relocInfo::not_tail_call) {
+ _first_oop = first_oop; _oop_limit = oop_limit;
++ _tail_call_type = tail_call_type;
+ assert(first_oop != NULL, "first oop address must be specified");
+ }
+
+ private:
+ address _first_oop; // location of first set-oop instruction
+ address _oop_limit; // search limit for set-oop instructions
++ relocInfo::tailCallType _tail_call_type;
+
+ friend class RelocIterator;
+ virtual_call_Relocation() { }
+@@ -970,6 +985,7 @@
+ public:
+ address first_oop();
+ address oop_limit();
++ relocInfo::tailCallType tail_call_type();
+
+ // data is packed as scaled offsets in "2_ints" format: [f l] or [Ff Ll]
+ // oop_limit is set to 0 if the limit falls somewhere within the call.
+@@ -987,7 +1003,7 @@
+ // The returned iterator will enumerate over the oops and the ic_call,
+ // as well as any other relocations that happen to be in that span of code.
+ // Recognize relevant set_oops with: oop_reloc()->oop_addr() == oop_addr.
+- static RelocIterator parse_ic(CodeBlob* &code, address &ic_call, address &first_oop, oop* &oop_addr, bool *is_optimized);
++ static RelocIterator parse_ic(CodeBlob* &code, address &ic_call, address &first_oop, oop* &oop_addr, bool *is_optimized, relocInfo::tailCallType* tail_call_type);
+ };
+
+
+@@ -995,21 +1011,27 @@
+ relocInfo::relocType type() { return relocInfo::opt_virtual_call_type; }
+
+ public:
+- static RelocationHolder spec() {
++ static RelocationHolder spec(relocInfo::tailCallType tail_call_type = relocInfo::not_tail_call) {
+ RelocationHolder rh = newHolder();
+- new(rh) opt_virtual_call_Relocation();
++ new(rh) opt_virtual_call_Relocation(tail_call_type);
+ return rh;
+ }
+
+ private:
+ friend class RelocIterator;
+- opt_virtual_call_Relocation() { }
+-
++ opt_virtual_call_Relocation(relocInfo::tailCallType tail_call_type= relocInfo::not_tail_call) {
++ _tail_call_type = tail_call_type;
++ }
++ relocInfo::tailCallType _tail_call_type;
+ public:
+ void clear_inline_cache();
+-
++ relocInfo::tailCallType tail_call_type();
+ // find the matching static_stub
+ address static_stub();
++
++ // un-/pack tail_call_type data
++ void pack_data_to(CodeSection* dest);
++ void unpack_data();
+ };
+
+
+@@ -1017,21 +1039,28 @@
+ relocInfo::relocType type() { return relocInfo::static_call_type; }
+
+ public:
+- static RelocationHolder spec() {
++ static RelocationHolder spec(relocInfo::tailCallType tail_call_type = relocInfo::not_tail_call) {
+ RelocationHolder rh = newHolder();
+- new(rh) static_call_Relocation();
++ new(rh) static_call_Relocation(tail_call_type);
+ return rh;
+ }
+
+ private:
+ friend class RelocIterator;
+- static_call_Relocation() { }
+-
++ static_call_Relocation(relocInfo::tailCallType tail_call_type= relocInfo::not_tail_call) {
++ _tail_call_type=tail_call_type;
++ }
++ relocInfo::tailCallType _tail_call_type;
+ public:
++ relocInfo::tailCallType tail_call_type();
+ void clear_inline_cache();
+-
++ // Added to accomodate size for transformation to section_call_Relocation.
++ void pack_data_to(CodeSection* dest);
++ void unpack_data();
+ // find the matching static_stub
+ address static_stub();
++
++ static void parse_static_call(address &static_call, relocInfo::tailCallType &tail_call_type);
+ };
+
+ class static_stub_Relocation : public Relocation {
+@@ -1197,6 +1226,41 @@
+ };
+
+
++class section_call_Relocation : public CallRelocation { // Call relocation whose target is stored as an offset into a code section.
++ relocInfo::relocType type() { return relocInfo::section_call_type; }
++ public:
++ static RelocationHolder spec(address target, int section) { // builds a holder containing a section_call_Relocation(target, section)
++ RelocationHolder rh = newHolder();
++ new(rh) section_call_Relocation(target, section);
++ return rh;
++ }
++
++ section_call_Relocation(address target, int section) {
++ assert(target != NULL, "must not be null");
++ assert(section >= 0, "must be a valid section");
++ _target = target;
++ _section = section;
++ };
++
++ // Data is packed as (scaled offset << section_width) | section index.
++ void pack_data_to(CodeSection* dest);
++ void unpack_data();
++ void fix_relocation_after_move(const CodeBuffer* src, CodeBuffer* dest);
++
++protected:
++ address _target; // address in CodeBlob
++ int _section; // index of the section whose start is the base for _target
++
++public:
++ // bit-width of the section-index field in the low bits of the packed value
++ enum { section_width = 2 }; // must equal CodeBuffer::sect_bits
++
++private:
++ friend class RelocIterator;
++ section_call_Relocation() { } // private default ctor, reachable through the RelocIterator friend
++};
++
++
+ class poll_Relocation : public Relocation {
+ bool is_data() { return true; }
+ relocInfo::relocType type() { return relocInfo::poll_type; }
+diff --git a/src/share/vm/code/vtableStubs.cpp b/src/share/vm/code/vtableStubs.cpp
+--- a/src/share/vm/code/vtableStubs.cpp
++++ b/src/share/vm/code/vtableStubs.cpp
+@@ -96,17 +96,17 @@
+ }
+
+
+-address VtableStubs::create_stub(bool is_vtable_stub, int vtable_index, methodOop method) {
++address VtableStubs::create_stub(bool is_vtable_stub, int vtable_index, methodOop method, bool is_tail_call, bool is_sibling) {
+ assert(vtable_index >= 0, "must be positive");
+
+- VtableStub* s = ShareVtableStubs ? lookup(is_vtable_stub, vtable_index) : NULL;
++ VtableStub* s = ShareVtableStubs ? lookup(is_vtable_stub, vtable_index, is_tail_call, is_sibling) : NULL;
+ if (s == NULL) {
+ if (is_vtable_stub) {
+- s = create_vtable_stub(vtable_index);
++ s = create_vtable_stub(vtable_index, is_tail_call, is_sibling);
+ } else {
+- s = create_itable_stub(vtable_index);
++ s = create_itable_stub(vtable_index, is_tail_call, is_sibling);
+ }
+- enter(is_vtable_stub, vtable_index, s);
++ enter(is_vtable_stub, vtable_index, s, is_tail_call, is_sibling);
+ #ifndef PRODUCT
+ if (PrintAdapterHandlers) {
+ tty->print_cr("Decoding VtableStub %s[%d]@%d",
+@@ -119,26 +119,28 @@
+ }
+
+
+-inline uint VtableStubs::hash(bool is_vtable_stub, int vtable_index){
++inline uint VtableStubs::hash(bool is_vtable_stub, int vtable_index, bool is_tail_call, bool is_sibling) { // is_tail_call/is_sibling are not mixed into the hash; VtableStub::matches() compares them
+ // Assumption: receiver_location < 4 in most cases.
+ int hash = ((vtable_index << 2) ^ VtableStub::receiver_location()->value()) + vtable_index;
+ return (is_vtable_stub ? ~hash : hash) & mask;
+ }
+
+
+-VtableStub* VtableStubs::lookup(bool is_vtable_stub, int vtable_index) {
++VtableStub* VtableStubs::lookup(bool is_vtable_stub, int vtable_index,
++ bool is_tail_call, bool is_sibling) {
+ MutexLocker ml(VtableStubs_lock);
+- unsigned hash = VtableStubs::hash(is_vtable_stub, vtable_index);
++ unsigned hash = VtableStubs::hash(is_vtable_stub, vtable_index, is_tail_call, is_sibling);
+ VtableStub* s = _table[hash];
+- while( s && !s->matches(is_vtable_stub, vtable_index)) s = s->next();
++ while( s && !s->matches(is_vtable_stub, vtable_index, is_tail_call, is_sibling)) s = s->next();
+ return s;
+ }
+
+
+-void VtableStubs::enter(bool is_vtable_stub, int vtable_index, VtableStub* s) {
++void VtableStubs::enter(bool is_vtable_stub, int vtable_index, VtableStub* s,
++ bool is_tail_call, bool is_sibling) {
+ MutexLocker ml(VtableStubs_lock);
+- assert(s->matches(is_vtable_stub, vtable_index), "bad vtable stub");
+- unsigned int h = VtableStubs::hash(is_vtable_stub, vtable_index);
++ assert(s->matches(is_vtable_stub, vtable_index, is_tail_call, is_sibling), "bad vtable stub");
++ unsigned int h = VtableStubs::hash(is_vtable_stub, vtable_index, is_tail_call, is_sibling);
+ // enter s at the beginning of the corresponding list
+ s->set_next(_table[h]);
+ _table[h] = s;
+@@ -149,7 +151,7 @@
+ bool VtableStubs::is_entry_point(address pc) {
+ MutexLocker ml(VtableStubs_lock);
+ VtableStub* stub = (VtableStub*)(pc - VtableStub::entry_offset());
+- uint hash = VtableStubs::hash(stub->is_vtable_stub(), stub->index());
++ uint hash = VtableStubs::hash(stub->is_vtable_stub(), stub->index(), stub->is_tail_call(), stub->is_sibling());
+ VtableStub* s;
+ for (s = _table[hash]; s != NULL && s != stub; s = s->next()) {}
+ return s == stub;
+diff --git a/src/share/vm/code/vtableStubs.hpp b/src/share/vm/code/vtableStubs.hpp
+--- a/src/share/vm/code/vtableStubs.hpp
++++ b/src/share/vm/code/vtableStubs.hpp
+@@ -37,14 +37,18 @@
+ const short _index; // vtable index
+ short _ame_offset; // Where an AbstractMethodError might occur
+ short _npe_offset; // Where a NullPointerException might occur
+- bool _is_vtable_stub; // True if vtable stub, false, is itable stub
++ bool _is_vtable_stub; // True if vtable stub, false, is itable
++ // stub
++ bool _is_tail_call;
++ bool _is_sibling;
+ /* code follows here */ // The vtableStub code
+
+ void* operator new(size_t size, int code_size);
+
+- VtableStub(bool is_vtable_stub, int index)
++ VtableStub(bool is_vtable_stub, int index, bool is_tail_call, bool is_sibling)
+ : _next(NULL), _is_vtable_stub(is_vtable_stub),
+- _index(index), _ame_offset(-1), _npe_offset(-1) {}
++ _index(index), _ame_offset(-1), _npe_offset(-1),
++ _is_tail_call(is_tail_call), _is_sibling(is_sibling) {}
+ VtableStub* next() const { return _next; }
+ int index() const { return _index; }
+ static VMReg receiver_location() { return _receiver_location; }
+@@ -54,8 +58,9 @@
+ address entry_point() const { return code_begin(); }
+ static int entry_offset() { return sizeof(class VtableStub); }
+
+- bool matches(bool is_vtable_stub, int index) const {
+- return _index == index && _is_vtable_stub == is_vtable_stub;
++ bool matches(bool is_vtable_stub, int index, bool is_tail_call, bool is_sibling) const {
++ return _index == index && _is_vtable_stub == is_vtable_stub && _is_tail_call==is_tail_call &&
++ _is_sibling == is_sibling;
+ }
+ bool contains(address pc) const { return code_begin() <= pc && pc < code_end(); }
+
+@@ -83,6 +88,8 @@
+ // Query
+ bool is_itable_stub() { return !_is_vtable_stub; }
+ bool is_vtable_stub() { return _is_vtable_stub; }
++ bool is_tail_call() { return _is_tail_call; }
++ bool is_sibling() { return _is_sibling; }
+ bool is_abstract_method_error(address epc) { return epc == code_begin()+_ame_offset; }
+ bool is_null_pointer_exception(address epc) { return epc == code_begin()+_npe_offset; }
+
+@@ -105,14 +112,14 @@
+ static VtableStub* _table[N]; // table of existing stubs
+ static int _number_of_vtable_stubs; // number of stubs created so far (for statistics)
+
+- static VtableStub* create_vtable_stub(int vtable_index);
+- static VtableStub* create_itable_stub(int vtable_index);
+- static VtableStub* lookup (bool is_vtable_stub, int vtable_index);
+- static void enter (bool is_vtable_stub, int vtable_index, VtableStub* s);
+- static inline uint hash (bool is_vtable_stub, int vtable_index);
++ static VtableStub* create_vtable_stub(int vtable_index, bool is_tail_call, bool is_sibling);
++ static VtableStub* create_itable_stub(int vtable_index, bool is_tail_call, bool is_sibling);
++ static VtableStub* lookup (bool is_vtable_stub, int vtable_index, bool is_tail_call, bool is_sibling);
++ static void enter (bool is_vtable_stub, int vtable_index, VtableStub* s, bool is_tail_call, bool is_sibling);
++ static inline uint hash (bool is_vtable_stub, int vtable_index, bool is_tail_call, bool is_sibling);
+
+ public:
+- static address create_stub(bool is_vtable_stub, int vtable_index, methodOop method); // return the entry point of a stub for this call
++ static address create_stub(bool is_vtable_stub, int vtable_index, methodOop method, bool is_tail_call, bool is_sibling); // return the entry point of a stub for this call
+ static bool is_entry_point(address pc); // is pc a vtable stub entry point?
+ static bool contains(address pc); // is pc within any stub?
+ static VtableStub* stub_containing(address pc); // stub containing pc or NULL
+diff --git a/src/share/vm/compiler/disassembler.cpp b/src/share/vm/compiler/disassembler.cpp
+--- a/src/share/vm/compiler/disassembler.cpp
++++ b/src/share/vm/compiler/disassembler.cpp
+@@ -293,6 +293,9 @@
+ if (p == nm->exception_begin()) st->print_cr("[Exception Handler]");
+ if (p == nm->stub_begin()) st->print_cr("[Stub Code]");
+ if (p == nm->consts_begin()) st->print_cr("[Constants]");
++ if (p == nm->static_tail_call_begin()) st->print_cr("[Static Tail Call Entry Point] %x", nm->static_tail_call_entry_point());
++ if (p == nm->monomorphic_tail_call_begin()) st->print_cr("[Monomorphic Tail Call Entry Point] %x", nm->monomorphic_tail_call_entry_point());
++ if (p == nm->static_not_sibling_tail_call_begin()) st->print_cr("[Static Not Sibling Tail Call Entry Point] %x", nm->monomorphic_tail_call_entry_point()); // NOTE(review): prints the monomorphic entry point under the static-not-sibling label - looks like a copy-paste slip, confirm the intended accessor; also %x is given a pointer-sized argument - confirm on LP64
+ }
+ CodeBlob* cb = _code;
+ if (cb != NULL) {
+diff --git a/src/share/vm/compiler/disassembler.hpp b/src/share/vm/compiler/disassembler.hpp
+--- a/src/share/vm/compiler/disassembler.hpp
++++ b/src/share/vm/compiler/disassembler.hpp
+@@ -29,7 +29,7 @@
+
+ class Disassembler {
+ friend class decode_env;
+- private:
++public:
+ // this is the type of the dll entry point:
+ typedef void* (*decode_func)(void* start, void* end,
+ void* (*event_callback)(void*, const char*, void*),
+@@ -37,6 +37,8 @@
+ int (*printf_callback)(void*, const char*, ...),
+ void* printf_stream,
+ const char* options);
++ private:
++
+ // points to the library.
+ static void* _library;
+ // bailout
+diff --git a/src/share/vm/interpreter/abstractInterpreter.hpp b/src/share/vm/interpreter/abstractInterpreter.hpp
+--- a/src/share/vm/interpreter/abstractInterpreter.hpp
++++ b/src/share/vm/interpreter/abstractInterpreter.hpp
+@@ -73,7 +73,9 @@
+ };
+
+ enum SomeConstants {
+- number_of_result_handlers = 10 // number of result handlers for native calls
++ number_of_result_handlers = 10, // number of result handlers for native calls
++ min_invoke_length = 3,
++ max_invoke_length = 6 // wide invokeinterface
+ };
+
+ protected:
+@@ -91,7 +93,11 @@
+
+ static address _rethrow_exception_entry; // rethrows an activation in previous frame
+
+-
++ // Tail calls in interpreter need to check whether parent frame is an
++ // interpreter frame. To support this we need the address range of interpreter
++ // code.
++ static address _interpreter_code_begin;
++ static address _interpreter_code_end;
+
+ friend class AbstractInterpreterGenerator;
+ friend class InterpreterGenerator;
+@@ -118,6 +124,10 @@
+
+ static address rethrow_exception_entry() { return _rethrow_exception_entry; }
+
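++ // Tail call support: these return the addresses of _interpreter_code_begin/_end themselves (not the values stored in them).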
++ static address interpreter_code_begin_address() { return (address)&_interpreter_code_begin; }
++ static address interpreter_code_end_address() { return (address)&_interpreter_code_end; }
++
+ // Activation size in words for a method that is just being called.
+ // Parameters haven't been pushed so count them too.
+ static int size_top_interpreter_activation(methodOop method);
+diff --git a/src/share/vm/interpreter/bytecode.hpp b/src/share/vm/interpreter/bytecode.hpp
+--- a/src/share/vm/interpreter/bytecode.hpp
++++ b/src/share/vm/interpreter/bytecode.hpp
+@@ -167,10 +167,20 @@
+
+ class Bytecode_invoke: public ResourceObj {
+ protected:
+- methodHandle _method; // method containing the bytecode
+- int _bci; // position of the bytecode
++ methodHandle _method; // method containing the bytecode
++ int _bci; // position of the bytecode
++ Bytecodes::Prefix _prefix; // prefix, if any
+
+- Bytecode_invoke(methodHandle method, int bci) : _method(method), _bci(bci) {}
++ Bytecode_invoke(methodHandle method, int bci)
++ : _method(method), _bci(bci)
++ {
++ _prefix = Bytecodes::Prefix_none;
++ if (code() == Bytecodes::_wide) {
++ // unlike bytecode streams, note the prefix and skip over it:
++ _prefix = Bytecodes::Prefix_tail_call;
++ _bci += 1;
++ }
++ }
+
+ public:
+ void verify() const;
+@@ -191,6 +201,7 @@
+ methodHandle static_target(TRAPS); // "specified" method (from constant pool)
+
+ // Testers
++ bool is_tailcall() const { return _prefix == Bytecodes::Prefix_tail_call; }
+ bool is_invokeinterface() const { return adjusted_invoke_code() == Bytecodes::_invokeinterface; }
+ bool is_invokevirtual() const { return adjusted_invoke_code() == Bytecodes::_invokevirtual; }
+ bool is_invokestatic() const { return adjusted_invoke_code() == Bytecodes::_invokestatic; }
+diff --git a/src/share/vm/interpreter/bytecodeStream.cpp b/src/share/vm/interpreter/bytecodeStream.cpp
+--- a/src/share/vm/interpreter/bytecodeStream.cpp
++++ b/src/share/vm/interpreter/bytecodeStream.cpp
+@@ -37,14 +37,24 @@
+ _next_bci += l;
+ assert(_bci < _next_bci, "length must be > 0");
+ // set attributes
+- _is_wide = false;
++ _prefix = Bytecodes::Prefix_none;
+ // check for special (uncommon) cases
+ if (code == Bytecodes::_wide) {
+ if (bcp + 1 >= end) {
+ code = Bytecodes::_illegal;
+ } else {
+ code = (Bytecodes::Code)bcp[1];
+- _is_wide = true;
++ switch(code) {
++ case Bytecodes::_invokevirtual:
++ case Bytecodes::_invokeinterface:
++ case Bytecodes::_invokespecial:
++ case Bytecodes::_invokestatic:
++ _prefix = Bytecodes::Prefix_tail_call;
++ break;
++ default:
++ _prefix = Bytecodes::Prefix_wide_index;
++ break;
++ }
+ }
+ }
+ }
+diff --git a/src/share/vm/interpreter/bytecodeStream.hpp b/src/share/vm/interpreter/bytecodeStream.hpp
+--- a/src/share/vm/interpreter/bytecodeStream.hpp
++++ b/src/share/vm/interpreter/bytecodeStream.hpp
+@@ -47,16 +47,17 @@
+ int _bci; // bci if current bytecode
+ int _next_bci; // bci of next bytecode
+ int _end_bci; // bci after the current iteration interval
+-
++ //bool _is_wide;
+ // last bytecode read
+ Bytecodes::Code _code;
+- bool _is_wide;
++ Bytecodes::Prefix _prefix;
+
+ public:
+ // Construction
+ RawBytecodeStream(methodHandle method) : _method(method) {
+ set_interval(0, _method->code_size());
+ }
++ //bool is_wide() { return _is_wide;}
+
+ // Iteration control
+ void set_interval(int beg_bci, int end_bci) {
+@@ -84,12 +85,12 @@
+ code = Bytecodes::code_or_bp_at(bcp);
+
+ // set next bytecode position
+- int l = Bytecodes::length_for(code);
+- if (l > 0 && (_bci + l) <= _end_bci) {
++ int len = Bytecodes::length_for(code);
++ if (len > 0 && (_bci + len) <= _end_bci) {
+ assert(code != Bytecodes::_wide && code != Bytecodes::_tableswitch
+ && code != Bytecodes::_lookupswitch, "can't be special bytecode");
+- _is_wide = false;
+- _next_bci += l;
++ _prefix = Bytecodes::Prefix_none;
++ _next_bci += len;
+ _code = code;
+ return code;
+ } else if (code == Bytecodes::_wide && _bci + 1 >= _end_bci) {
+@@ -108,7 +109,11 @@
+ int end_bci() const { return _end_bci; }
+
+ Bytecodes::Code code() const { return _code; }
+- bool is_wide() const { return _is_wide; }
++ Bytecodes::Prefix prefix() const { return _prefix; }
++ bool has_prefix() const { return _prefix != Bytecodes::Prefix_none; }
++ int prefix_length() const { return has_prefix() ? 1 : 0; } // all prefixes are 1 byte
++ bool is_wide_index() const { return _prefix == Bytecodes::Prefix_wide_index; }
++ bool is_tail_call() const { return _prefix == Bytecodes::Prefix_tail_call;}
+ bool is_last_bytecode() const { return _next_bci >= _end_bci; }
+
+ address bcp() const { return method()->code_base() + _bci; }
+@@ -122,8 +127,8 @@
+ int dest_w() const { return bci() + (int )Bytes::get_Java_u4(bcp() + 1); }
+
+ // Unsigned indices, widening
+- int get_index() const { return (is_wide()) ? Bytes::get_Java_u2(bcp() + 2) : bcp()[1]; }
+- int get_index_big() const { return (int)Bytes::get_Java_u2(bcp() + 1); }
++ int get_index() const { return (is_wide_index()) ? Bytes::get_Java_u2(bcp() + 2) : bcp()[1]; }
++ int get_index_big() const { return (int)Bytes::get_Java_u2(bcp() + prefix_length() + 1); }
+ };
+
+ // In BytecodeStream, non-java bytecodes will be translated into the
+@@ -151,18 +156,23 @@
+ // note that we cannot advance before having the
+ // tty bytecode otherwise the stepping is wrong!
+ // (carefull: length_for(...) must be used first!)
+- int l = Bytecodes::length_for(code);
+- if (l == 0) l = Bytecodes::length_at(bcp);
+- _next_bci += l;
++ int len = Bytecodes::length_for(code);
++ _prefix = Bytecodes::Prefix_none;
++ if (len == 0) {
++ len = Bytecodes::length_at(bcp);
++ if (code == Bytecodes::_wide) {
++ code = (Bytecodes::Code)bcp[1];
++ _prefix = Bytecodes::allowed_prefix(code);
++ assert(prefix_length() == 1, "");
++ if (_prefix == Bytecodes::Prefix_illegal) {
++ code = Bytecodes::_illegal;
++ }
++ }
++ }
++ _next_bci += len;
+ assert(_bci < _next_bci, "length must be > 0");
+- // set attributes
+- _is_wide = false;
+- // check for special (uncommon) cases
+- if (code == Bytecodes::_wide) {
+- code = (Bytecodes::Code)bcp[1];
+- _is_wide = true;
+- }
+- assert(Bytecodes::is_java_code(code), "sanity check");
++ if (code != Bytecodes::_illegal)
++ assert(Bytecodes::is_java_code(code), "sanity check");
+ }
+ _code = code;
+ return _code;
+diff --git a/src/share/vm/interpreter/bytecodeTracer.cpp b/src/share/vm/interpreter/bytecodeTracer.cpp
+--- a/src/share/vm/interpreter/bytecodeTracer.cpp
++++ b/src/share/vm/interpreter/bytecodeTracer.cpp
+@@ -38,7 +38,7 @@
+ // operations on the pointer, except within a critical section.
+ // (Also, ensure that occasional false positives are benign.)
+ methodOop _current_method;
+- bool _is_wide;
++ Bytecodes::Prefix _prefix;
+ address _next_pc; // current decoding position
+
+ void align() { _next_pc = (address)round_to((intptr_t)_next_pc, sizeof(jint)); }
+@@ -48,10 +48,11 @@
+
+ int get_index() { return *(address)_next_pc++; }
+ int get_big_index() { int i=Bytes::get_Java_u2(_next_pc); _next_pc+=2; return i; }
+- int get_index_special() { return (is_wide()) ? get_big_index() : get_index(); }
++ int get_index_special() { return (is_wide_index()) ? get_big_index() : get_index(); }
+ methodOop method() { return _current_method; }
+- bool is_wide() { return _is_wide; }
+-
++ bool has_prefix() const { return _prefix != Bytecodes::Prefix_none; }
++ int prefix_length() const { return has_prefix() ? 1 : 0; }
++ bool is_wide_index() const { return _prefix == Bytecodes::Prefix_wide_index; }
+
+ void print_constant(int i, outputStream* st = tty);
+ void print_attributes(Bytecodes::Code code, int bci, outputStream* st = tty);
+@@ -59,7 +60,7 @@
+
+ public:
+ BytecodePrinter() {
+- _is_wide = false;
++ _prefix = Bytecodes::Prefix_none;
+ }
+
+ // This method is called while executing the raw bytecodes, so none of
+@@ -80,26 +81,32 @@
+ _current_method = method();
+ }
+ Bytecodes::Code code;
+- if (is_wide()) {
++ if (has_prefix()) {
+ // bcp wasn't advanced if previous bytecode was _wide.
+- code = Bytecodes::code_at(bcp+1);
++ code = Bytecodes::code_at(bcp+prefix_length());
+ } else {
+ code = Bytecodes::code_at(bcp);
+ }
+ int bci = bcp - method->code_base();
+ st->print("[%d] ", (int) Thread::current()->osthread()->thread_id());
++ const char* bcn = Bytecodes::name(code);
++ const char* pfn = Bytecodes::prefix_name(_prefix);
+ if (Verbose) {
+- st->print("%8d %4d " INTPTR_FORMAT " " INTPTR_FORMAT " %s",
+- BytecodeCounter::counter_value(), bci, tos, tos2, Bytecodes::name(code));
++ st->print("%8d %4d " INTPTR_FORMAT " " INTPTR_FORMAT " %s%s",
++ BytecodeCounter::counter_value(), bci, tos, tos2, pfn, bcn);
+ } else {
+- st->print("%8d %4d %s",
+- BytecodeCounter::counter_value(), bci, Bytecodes::name(code));
++ st->print("%8d %4d %s%s",
++ BytecodeCounter::counter_value(), bci, pfn, bcn);
+ }
+- _next_pc = is_wide() ? bcp+2 : bcp+1;
++ _next_pc = bcp + prefix_length() + 1;
+ print_attributes(code, bci);
+- // Set is_wide for the next one, since the caller of this doesn't skip
++ // Set is_prefix for the next one, since the caller of this doesn't skip
+ // the next bytecode.
+- _is_wide = (code == Bytecodes::_wide);
++ _prefix = Bytecodes::Prefix_none;
++ if (code == Bytecodes::_wide) {
++ code = Bytecodes::code_at(bcp+1);
++ _prefix = Bytecodes::allowed_prefix(code);
++ }
+ }
+
+ // Used for methodOop::print_codes(). The input bcp comes from
+@@ -108,19 +115,18 @@
+ _current_method = method();
+ ResourceMark rm;
+ Bytecodes::Code code = Bytecodes::code_at(bcp);
+- // Set is_wide
+- _is_wide = (code == Bytecodes::_wide);
+- if (is_wide()) {
++ // Set prefix
++ _prefix = Bytecodes::Prefix_none;
++ if (code == Bytecodes::_wide) {
+ code = Bytecodes::code_at(bcp+1);
++ _prefix = Bytecodes::allowed_prefix(code);
+ }
+ int bci = bcp - method->code_base();
+ // Print bytecode index and name
+- if (is_wide()) {
+- st->print("%d %s_w", bci, Bytecodes::name(code));
+- } else {
+- st->print("%d %s", bci, Bytecodes::name(code));
+- }
+- _next_pc = is_wide() ? bcp+2 : bcp+1;
++ const char* bcn = Bytecodes::name(code);
++ const char* pfn = Bytecodes::prefix_name(_prefix);
++ st->print("%d %s%s", bci, pfn, bcn);
++ _next_pc = bcp + prefix_length() + 1;
+ print_attributes(code, bci, st);
+ bytecode_epilog(bci, st);
+ }
+@@ -250,7 +256,7 @@
+
+ case Bytecodes::_iinc:
+ { int index = get_index_special();
+- jint offset = is_wide() ? get_short(): get_byte();
++ jint offset = is_wide_index() ? get_short(): get_byte();
+ st->print_cr(" #%d " INT32_FORMAT, index, offset);
+ }
+ break;
+diff --git a/src/share/vm/interpreter/bytecodes.cpp b/src/share/vm/interpreter/bytecodes.cpp
+--- a/src/share/vm/interpreter/bytecodes.cpp
++++ b/src/share/vm/interpreter/bytecodes.cpp
+@@ -40,6 +40,8 @@
+ BasicType Bytecodes::_result_type [Bytecodes::number_of_codes];
+ s_char Bytecodes::_depth [Bytecodes::number_of_codes];
+ u_char Bytecodes::_length [Bytecodes::number_of_codes];
++u_char Bytecodes::_wide_length [Bytecodes::number_of_codes];
++s_char Bytecodes::_allowed_prefix[Bytecodes::number_of_codes];
+ bool Bytecodes::_can_trap [Bytecodes::number_of_codes];
+ Bytecodes::Code Bytecodes::_java_code [Bytecodes::number_of_codes];
+ bool Bytecodes::_can_rewrite [Bytecodes::number_of_codes];
+@@ -61,7 +63,7 @@
+ if (end != NULL && bcp + 1 >= end) {
+ return -1; // don't read past end of code buffer
+ }
+- return wide_length_for(cast(*(bcp + 1)));
++ return is_defined(*(bcp + 1)) ? _wide_length[*(bcp + 1)] : 0; // the byte after _wide may be >= number_of_codes; never index the table with an undefined code
+ case _tableswitch:
+ { address aligned_bcp = (address)round_to((intptr_t)bcp + 1, jintSize);
+ if (end != NULL && aligned_bcp + 3*jintSize >= end) {
+@@ -92,6 +94,32 @@
+ return 0;
+ }
+
++const char* Bytecodes::_prefix_name[Bytecodes::number_of_prefixes] = {
++ "", // 'none' displays as empty string
++ // the others display with a trailing colon:
++ "wide_index:", "tail_call:"
++};
++const char* Bytecodes::prefix_name(Prefix pfx) {
++ if (0 <= pfx && pfx < number_of_prefixes)
++ return _prefix_name[pfx];
++ else
++ return "(unknown)";
++}
++
++const char* Bytecodes::name_for(Bytecodes::Prefix pfx, Bytecodes::Code code) {
++ const char* cn = name(code);
++ if (pfx == Prefix_none)
++ return cn;
++ else {
++ const char* pn = prefix_name(pfx);
++ char* buf = NEW_RESOURCE_ARRAY(char, strlen(pn) + strlen(cn) + 1);
++ strcpy(buf, pn);
++ strcat(buf, cn);
++ return buf;
++ }
++}
++
++
+ // At a breakpoint instruction, this returns the breakpoint's length,
+ // otherwise, it's the same as special_length_at(). This is used by
+ // the RawByteCodeStream, which wants to see the actual bytecode
+@@ -114,7 +142,6 @@
+ }
+
+
+-
+ void Bytecodes::def(Code code, const char* name, const char* format, const char* wide_format, BasicType result_type, int depth, bool can_trap) {
+ def(code, name, format, wide_format, result_type, depth, can_trap, code);
+ }
+@@ -122,13 +149,39 @@
+
+ void Bytecodes::def(Code code, const char* name, const char* format, const char* wide_format, BasicType result_type, int depth, bool can_trap, Code java_code) {
+ assert(wide_format == NULL || format != NULL, "short form must exist if there's a wide form");
++
++ const char* wide_fmt = NULL; // will be wide_format or NULL
++ Prefix allowed_prefix = Prefix_illegal;
++ if (wide_format != NULL) {
++ switch (wide_format[0]) {
++ case 'w':
++ allowed_prefix = Prefix_wide_index;
++ wide_fmt = wide_format;
++ break;
++
++ // certain new features allow the _wide prefix to be a semantic modifier:
++ case 'T':
++ assert(wide_format[1] == 'w', "");
++ if (TailCalls) {
++ allowed_prefix = Prefix_tail_call;
++ wide_fmt = &wide_format[1];
++ }
++ break;
++
++ default:
++ ShouldNotReachHere();
++ }
++ }
++
+ _name [code] = name;
+ _format [code] = format;
+- _wide_format [code] = wide_format;
++ _wide_format [code] = wide_fmt;
+ _result_type [code] = result_type;
+ _depth [code] = depth;
+ _can_trap [code] = can_trap;
+- _length [code] = format != NULL ? (u_char)strlen(format) : 0;
++ _length [code] = (u_char)( (format != NULL) ? strlen(format) : 0 );
++ _wide_length [code] = (u_char)( (wide_fmt != NULL) ? strlen(wide_fmt) : 0 );
++ _allowed_prefix[code] = allowed_prefix;
+ _java_code [code] = java_code;
+ if (java_code != code) _can_rewrite[java_code] = true;
+ }
+@@ -159,6 +212,9 @@
+ if (_is_initialized) return;
+ assert(number_of_codes <= 256, "too many bytecodes");
+
++ assert(0 == strcmp("wide_index:", prefix_name(Prefix_wide_index)),
++ "_prefix_name initialized correctly"); // spot-check only
++
+ // initialize bytecode tables - didn't use static array initializers
+ // (such as {}) so we can do additional consistency checks and init-
+ // code is independent of actual bytecode numbering.
+@@ -353,10 +409,10 @@
+ def(_putstatic , "putstatic" , "bjj" , NULL , T_ILLEGAL, -1, true );
+ def(_getfield , "getfield" , "bjj" , NULL , T_ILLEGAL, 0, true );
+ def(_putfield , "putfield" , "bjj" , NULL , T_ILLEGAL, -2, true );
+- def(_invokevirtual , "invokevirtual" , "bjj" , NULL , T_ILLEGAL, -1, true);
+- def(_invokespecial , "invokespecial" , "bjj" , NULL , T_ILLEGAL, -1, true);
+- def(_invokestatic , "invokestatic" , "bjj" , NULL , T_ILLEGAL, 0, true);
+- def(_invokeinterface , "invokeinterface" , "bjj__", NULL , T_ILLEGAL, -1, true);
++ def(_invokevirtual , "invokevirtual" , "bjj" , "Twbjj" , T_ILLEGAL, -1, true); // w=tailcall
++ def(_invokespecial , "invokespecial" , "bjj" , "Twbjj" , T_ILLEGAL, -1, true); // w=tailcall
++ def(_invokestatic , "invokestatic" , "bjj" , "Twbjj" , T_ILLEGAL, 0, true); // w=tailcall
++ def(_invokeinterface , "invokeinterface" , "bjj__", "Twbjj__", T_ILLEGAL, -1, true); // w=tailcall
+ def(_xxxunusedxxx , "xxxunusedxxx" , NULL , NULL , T_VOID , 0, false);
+ def(_new , "new" , "bii" , NULL , T_OBJECT , 1, true );
+ def(_newarray , "newarray" , "bc" , NULL , T_OBJECT , 0, true );
+diff --git a/src/share/vm/interpreter/bytecodes.hpp b/src/share/vm/interpreter/bytecodes.hpp
+--- a/src/share/vm/interpreter/bytecodes.hpp
++++ b/src/share/vm/interpreter/bytecodes.hpp
+@@ -280,6 +280,16 @@
+ number_of_codes
+ };
+
++ // Internal codes for various kinds of bytecode prefixes.
++ enum Prefix {
++ Prefix_illegal = -1, // stray _wide bytecode; an error
++ Prefix_none = 0,
++ Prefix_wide_index,
++ Prefix_tail_call,
++ number_of_prefixes
++ // Note: Keep this enum in sync with Bytecodes::_prefix_name.
++ };
++
+ private:
+ static bool _is_initialized;
+ static const char* _name [number_of_codes];
+@@ -288,9 +298,12 @@
+ static BasicType _result_type [number_of_codes];
+ static s_char _depth [number_of_codes];
+ static u_char _length [number_of_codes];
++ static u_char _wide_length [number_of_codes];
++ static s_char _allowed_prefix[number_of_codes];
+ static bool _can_trap [number_of_codes];
+ static Code _java_code [number_of_codes];
+ static bool _can_rewrite [number_of_codes];
++ static const char* _prefix_name [number_of_prefixes];
+
+ static void def(Code code, const char* name, const char* format, const char* wide_format, BasicType result_type, int depth, bool can_trap);
+ static void def(Code code, const char* name, const char* format, const char* wide_format, BasicType result_type, int depth, bool can_trap, Code java_code);
+@@ -300,9 +313,9 @@
+ public:
+ // Conversion
+ static void check (Code code) { assert(is_defined(code), "illegal code"); }
+- static void wide_check (Code code) { assert(wide_is_defined(code), "illegal code"); }
+ static Code cast (int code) { return (Code)code; }
+
++ static void prefix_check(Prefix pfx, Code code) { assert(pfx == allowed_prefix(code), "illegal prefix"); }
+
+ // Fetch a bytecode, hiding breakpoints as necessary:
+ static Code code_at(address bcp, methodOop method = NULL) {
+@@ -311,6 +324,16 @@
+ static Code java_code_at(address bcp, methodOop method = NULL) {
+ return java_code(code_at(bcp, method));
+ }
++ static Code java_code_at(address bcp, Prefix& prefix_return) {
++ Code code = java_code(code_at(bcp));
++ if (code != _wide) {
++ prefix_return = Prefix_none;
++ } else {
++ code = java_code(code_at(bcp + 1));
++ prefix_return = allowed_prefix(code);
++ }
++ return code;
++ }
+
+ // Fetch a bytecode or a breakpoint:
+ static Code code_or_bp_at(address bcp) { return (Code)cast(*bcp); }
+@@ -323,23 +346,28 @@
+
+ // Bytecode attributes
+ static bool is_defined (int code) { return 0 <= code && code < number_of_codes && _format[code] != NULL; }
+- static bool wide_is_defined(int code) { return is_defined(code) && _wide_format[code] != NULL; }
+ static const char* name (Code code) { check(code); return _name [code]; }
+ static const char* format (Code code) { check(code); return _format [code]; }
+ static const char* wide_format (Code code) { return _wide_format[code]; }
+ static BasicType result_type (Code code) { check(code); return _result_type [code]; }
+ static int depth (Code code) { check(code); return _depth [code]; }
+ static int length_for (Code code) { return _length[code]; }
++ static bool can_have_prefix(Code code) { return ( allowed_prefix(code) > Prefix_none ); }
++ static Prefix allowed_prefix (Code code) { check(code); return (Prefix)_allowed_prefix[code]; }
+ static bool can_trap (Code code) { check(code); return _can_trap [code]; }
+ static Code java_code (Code code) { check(code); return _java_code [code]; }
+ static bool can_rewrite (Code code) { check(code); return _can_rewrite [code]; }
+- static int wide_length_for(Code code) {
+- if (!is_defined(code)) {
++ static int length_for(Prefix pfx, Code code) {
++ if (pfx == Prefix_none)
++ return length_for(code);
++ else if (pfx == allowed_prefix(code))
++ return _wide_length[code];
++ else
++ // no other kind of prefix allowed on this code:
+ return 0;
+- }
+- const char* wf = wide_format(code);
+- return (wf == NULL) ? 0 : (int)strlen(wf);
+ }
++ static const char* prefix_name (Prefix pfx);
++ static const char* name_for (Prefix pfx, Code code); // may resource-allocate
+ // if 'end' is provided, it indicates the end of the code buffer which
+ // should not be read past when parsing.
+ static int special_length_at(address bcp, address end = NULL);
+diff --git a/src/share/vm/interpreter/interpreter.cpp b/src/share/vm/interpreter/interpreter.cpp
+--- a/src/share/vm/interpreter/interpreter.cpp
++++ b/src/share/vm/interpreter/interpreter.cpp
+@@ -131,6 +131,9 @@
+ address AbstractInterpreter::_entry_table [AbstractInterpreter::number_of_method_entries];
+ address AbstractInterpreter::_native_abi_to_tosca [AbstractInterpreter::number_of_result_handlers];
+
++address AbstractInterpreter::_interpreter_code_begin = NULL;
++address AbstractInterpreter::_interpreter_code_end = NULL;
++
+ //------------------------------------------------------------------------------------------------------------------------
+ // Generation of complete interpreter
+
+diff --git a/src/share/vm/interpreter/interpreterRuntime.cpp b/src/share/vm/interpreter/interpreterRuntime.cpp
+--- a/src/share/vm/interpreter/interpreterRuntime.cpp
++++ b/src/share/vm/interpreter/interpreterRuntime.cpp
+@@ -432,6 +432,10 @@
+ THROW(vmSymbols::java_lang_IncompatibleClassChangeError());
+ IRT_END
+
++IRT_ENTRY(void, InterpreterRuntime::throw_TailCallException(JavaThread* thread))
++ THROW(vmSymbols::java_lang_TailCallException());
++IRT_END
++
+
+ //------------------------------------------------------------------------------------------------------------------------
+ // Fields
+diff --git a/src/share/vm/interpreter/interpreterRuntime.hpp b/src/share/vm/interpreter/interpreterRuntime.hpp
+--- a/src/share/vm/interpreter/interpreterRuntime.hpp
++++ b/src/share/vm/interpreter/interpreterRuntime.hpp
+@@ -33,7 +33,13 @@
+ // Helper functions to access current interpreter state
+ static frame last_frame(JavaThread *thread) { return thread->last_frame(); }
+ static methodOop method(JavaThread *thread) { return last_frame(thread).interpreter_frame_method(); }
+- static address bcp(JavaThread *thread) { return last_frame(thread).interpreter_frame_bcp(); }
++ static address bcp(JavaThread *thread) { // bcp of the last frame, stepped past a tail-call prefix byte if one is present
++ Bytecodes::Prefix pfx;
++ address p = last_frame(thread).interpreter_frame_bcp();
++ Bytecodes::java_code_at(p, pfx); // called only for the prefix it reports; the returned code is not used
++ if (pfx == Bytecodes::Prefix_tail_call) return (p+1); // Prefix is an enumeration, not a bit mask: '&' would also match Prefix_illegal (-1)
++ return p;
++ }
+ static void set_bcp_and_mdp(address bcp, JavaThread*thread);
+ static Bytecodes::Code code(JavaThread *thread) {
+ // pass method to avoid calling unsafe bcp_to_method (partial fix 4926272)
+@@ -70,6 +76,7 @@
+ static void create_klass_exception(JavaThread* thread, char* name, oopDesc* obj);
+ static address exception_handler_for_exception(JavaThread* thread, oopDesc* exception);
+ static void throw_pending_exception(JavaThread* thread);
++ static void throw_TailCallException(JavaThread* thread);
+
+ // Statics & fields
+ static void resolve_get_put(JavaThread* thread, Bytecodes::Code bytecode);
+diff --git a/src/share/vm/interpreter/linkResolver.hpp b/src/share/vm/interpreter/linkResolver.hpp
+--- a/src/share/vm/interpreter/linkResolver.hpp
++++ b/src/share/vm/interpreter/linkResolver.hpp
+@@ -69,7 +69,7 @@
+ methodHandle _resolved_method; // static target method
+ methodHandle _selected_method; // dynamic (actual) target method
+ int _vtable_index; // vtable index of selected method
+-
++ bool _is_tail_call; // written by set_tail_call(); NOTE(review): no initialization is visible in this hunk - confirm it is set before is_tail_call() is read
+ void set_static( KlassHandle resolved_klass, methodHandle resolved_method , TRAPS);
+ void set_interface(KlassHandle resolved_klass, KlassHandle selected_klass, methodHandle resolved_method, methodHandle selected_method , TRAPS);
+ void set_virtual( KlassHandle resolved_klass, KlassHandle selected_klass, methodHandle resolved_method, methodHandle selected_method, int vtable_index, TRAPS);
+@@ -84,6 +84,8 @@
+ methodHandle selected_method() const { return _selected_method; }
+
+ BasicType result_type() const { return selected_method()->result_type(); }
++ bool is_tail_call() const { return _is_tail_call; }
++ void set_tail_call(bool is_tail_call) { _is_tail_call=is_tail_call; }
+ bool has_vtable_index() const { return _vtable_index >= 0; }
+ bool is_statically_bound() const { return _vtable_index == methodOopDesc::nonvirtual_vtable_index; }
+ int vtable_index() const {
+diff --git a/src/share/vm/interpreter/rewriter.cpp b/src/share/vm/interpreter/rewriter.cpp
+--- a/src/share/vm/interpreter/rewriter.cpp
++++ b/src/share/vm/interpreter/rewriter.cpp
+@@ -113,7 +113,7 @@
+ // moves, the bytecodes will also move.
+ No_Safepoint_Verifier nsv;
+ Bytecodes::Code c;
+-
++ bool is_wide = false;
+ // Bytecodes and their length
+ const address code_base = method->code_base();
+ const int code_length = method->code_size();
+@@ -122,6 +122,7 @@
+ for (int bci = 0; bci < code_length; bci += bc_length) {
+ address bcp = code_base + bci;
+ c = (Bytecodes::Code)(*bcp);
++ is_wide = false;
+
+ // Since we have the code, see if we can get the length
+ // directly. Some more complicated bytecodes will report
+@@ -135,6 +136,7 @@
+ // by 'wide'. We don't currently examine any of the bytecodes
+ // modified by wide, but in case we do in the future...
+ if (c == Bytecodes::_wide) {
++ is_wide = true;
+ c = (Bytecodes::Code)bcp[1];
+ }
+ }
+@@ -162,6 +164,8 @@
+ case Bytecodes::_invokestatic : // fall through
+ case Bytecodes::_invokeinterface: {
+ address p = bcp + 1;
++ // Skip wide prefix
++ if (is_wide) p++;
+ Bytes::put_native_u2(p, index_map[Bytes::get_Java_u2(p)]);
+ break;
+ }
+diff --git a/src/share/vm/interpreter/templateInterpreter.cpp b/src/share/vm/interpreter/templateInterpreter.cpp
+--- a/src/share/vm/interpreter/templateInterpreter.cpp
++++ b/src/share/vm/interpreter/templateInterpreter.cpp
+@@ -50,6 +50,10 @@
+ if (PrintInterpreter) print();
+ }
+
++ // Needed for tail calls.
++ _interpreter_code_begin = _code->code_start();
++ _interpreter_code_end = _code->code_end();
++
+ // initialize dispatch table
+ _active_table = _normal_table;
+ }
+@@ -181,8 +185,8 @@
+ EntryPoint TemplateInterpreter::_continuation_entry;
+ EntryPoint TemplateInterpreter::_safept_entry;
+
+-address TemplateInterpreter::_return_3_addrs_by_index[TemplateInterpreter::number_of_return_addrs];
+-address TemplateInterpreter::_return_5_addrs_by_index[TemplateInterpreter::number_of_return_addrs];
++address TemplateInterpreter::_return_addr_tables[TemplateInterpreter::number_of_invoke_lengths]
++ [TemplateInterpreter::number_of_return_addrs];
+
+ DispatchTable TemplateInterpreter::_active_table;
+ DispatchTable TemplateInterpreter::_normal_table;
+@@ -297,8 +301,11 @@
+
+ for (int j = 0; j < number_of_states; j++) {
+ const TosState states[] = {btos, ctos, stos, itos, ltos, ftos, dtos, atos, vtos};
+- Interpreter::_return_3_addrs_by_index[Interpreter::TosState_as_index(states[j])] = Interpreter::return_entry(states[j], 3);
+- Interpreter::_return_5_addrs_by_index[Interpreter::TosState_as_index(states[j])] = Interpreter::return_entry(states[j], 5);
++ for (int k = Interpreter::min_invoke_length; k <= Interpreter::max_invoke_length; k++) {
++ if (!TailCalls && k != 3 && k != 5) continue;
++ int jx = Interpreter::TosState_as_index(states[j]);
++ Interpreter::return_addrs_by_index_table(k)[jx] = Interpreter::return_entry(states[j], k);
++ }
+ }
+
+ { CodeletMark cm(_masm, "continuation entry points");
+@@ -438,11 +445,13 @@
+ Template* t = TemplateTable::template_for(code);
+ assert(t->is_valid(), "just checking");
+ set_short_entry_points(t, bep, cep, sep, aep, iep, lep, fep, dep, vep);
+- }
+- if (Bytecodes::wide_is_defined(code)) {
+- Template* t = TemplateTable::template_for_wide(code);
+- assert(t->is_valid(), "just checking");
+- set_wide_entry_point(t, wep);
++
++ if (Bytecodes::can_have_prefix(code)) {
++ Bytecodes::Prefix pfx = Bytecodes::allowed_prefix(code);
++ Template* t = TemplateTable::template_for_prefix(pfx, code);
++ assert(t->is_valid(), "just checking");
++ set_wide_entry_point(t, wep);
++ }
+ }
+ // set entry points
+ EntryPoint entry(bep, cep, sep, aep, iep, lep, fep, dep, vep);
+@@ -489,7 +498,7 @@
+ #endif // !PRODUCT
+ int step;
+ if (!t->does_dispatch()) {
+- step = t->is_wide() ? Bytecodes::wide_length_for(t->bytecode()) : Bytecodes::length_for(t->bytecode());
++ step = Bytecodes::length_for(t->prefix(), t->bytecode());
+ if (tos_out == ilgl) tos_out = t->tos_out();
+ // compute bytecode size
+ assert(step > 0, "just checkin'");
+@@ -519,6 +528,12 @@
+
+ address TemplateInterpreter::return_entry(TosState state, int length) {
+ guarantee(0 <= length && length < Interpreter::number_of_return_entries, "illegal length");
++ // Tail call stuff: We assume that the only client of return_entry is
++ // generate_return_entry_for in generate_all, and that we can distinguish a
++ // tail call from a regular call by looking at the length (e.g. 4 instead
++ // of 3, or 6 instead of 5). The following check should catch other
++ // uses of return_entry.
++ assert(length >= 3, "oops");
+ return _return_entry[length].entry(state);
+ }
+
+diff --git a/src/share/vm/interpreter/templateInterpreter.hpp b/src/share/vm/interpreter/templateInterpreter.hpp
+--- a/src/share/vm/interpreter/templateInterpreter.hpp
++++ b/src/share/vm/interpreter/templateInterpreter.hpp
+@@ -84,7 +84,8 @@
+ enum MoreConstants {
+ number_of_return_entries = 9, // number of return entry points
+ number_of_deopt_entries = 9, // number of deoptimization entry points
+- number_of_return_addrs = 9 // number of return addresses
++ number_of_return_addrs = 9, // number of return addresses
++ number_of_invoke_lengths = 4 // 3 - 6
+ };
+
+ protected:
+@@ -112,8 +113,11 @@
+ static EntryPoint _continuation_entry;
+ static EntryPoint _safept_entry;
+
++#if 0
+ static address _return_3_addrs_by_index[number_of_return_addrs]; // for invokevirtual return entries
+ static address _return_5_addrs_by_index[number_of_return_addrs]; // for invokeinterface return entries
++#endif
++ static address _return_addr_tables[number_of_invoke_lengths][number_of_return_addrs];
+
+ static DispatchTable _active_table; // the active dispatch table (used by the interpreter for dispatch)
+ static DispatchTable _normal_table; // the normal dispatch table (used to set the active table in normal mode)
+@@ -152,14 +156,15 @@
+ static address* normal_table() { return _normal_table.table_for(); }
+
+ // Support for invokes
+- static address* return_3_addrs_by_index_table() { return _return_3_addrs_by_index; }
+- static address* return_5_addrs_by_index_table() { return _return_5_addrs_by_index; }
++ static address* return_3_addrs_by_index_table() { return return_addrs_by_index_table(3); }
++ static address* return_5_addrs_by_index_table() { return return_addrs_by_index_table(5); }
++ static address* return_addrs_by_index_table(int inst_size) { const int idx = inst_size - AbstractInterpreter::min_invoke_length; assert(idx >= 0 && idx < number_of_invoke_lengths, "wrong size"); return _return_addr_tables[idx]; } // row of _return_addr_tables for an invoke of inst_size bytes; the assert checks the actual array bounds
+ static int TosState_as_index(TosState state); // computes index into return_3_entry_by_index table
+
+ static address return_entry (TosState state, int length);
+ static address deopt_entry (TosState state, int length);
+
+- // Safepoint support
++ // Safepoint support
+ static void notice_safepoints(); // stops the thread when reaching a safepoint
+ static void ignore_safepoints(); // ignores safepoints
+
+diff --git a/src/share/vm/interpreter/templateTable.cpp b/src/share/vm/interpreter/templateTable.cpp
+--- a/src/share/vm/interpreter/templateTable.cpp
++++ b/src/share/vm/interpreter/templateTable.cpp
+@@ -37,19 +37,21 @@
+ // Implementation of Template
+
+
+-void Template::initialize(int flags, TosState tos_in, TosState tos_out, generator gen, int arg) {
+- _flags = flags;
+- _tos_in = tos_in;
+- _tos_out = tos_out;
+- _gen = gen;
+- _arg = arg;
+-}
++void Template::initialize(Bytecodes::Code code, Bytecodes::Prefix pfx,
++ int flags, TosState tos_in, TosState tos_out, generator gen, int arg) {
++ _code = (jubyte) code;
++ _prefix = (jubyte) pfx;
++ _flags = (jubyte) flags;
++ _tos_in = tos_in;
++ _tos_out = tos_out;
++ _gen = gen;
++ _arg = (jshort) arg;
+
+-
+-Bytecodes::Code Template::bytecode() const {
+- int i = this - TemplateTable::_template_table;
+- if (i < 0 || i >= Bytecodes::number_of_codes) i = this - TemplateTable::_template_table_wide;
+- return Bytecodes::cast(i);
++ // Make sure we haven't lost any precision:
++ assert(_flags == (int) flags, "");
++ assert(_tos_in == (int) tos_in, "");
++ assert(_tos_out == (int) tos_out, "");
++ assert(_arg == (int) arg, "");
+ }
+
+
+@@ -180,24 +182,30 @@
+ def(code, flags, in, out, (Template::generator)gen, 0);
+ }
+
++#define PrefixShift BitsPerByte // kludge to make prefix be a pseudo-flag
+
+ void TemplateTable::def(Bytecodes::Code code, int flags, TosState in, TosState out, void (*gen)(int arg), int arg) {
+ // should factor out these constants
+ const int ubcp = 1 << Template::uses_bcp_bit;
+ const int disp = 1 << Template::does_dispatch_bit;
+ const int clvm = 1 << Template::calls_vm_bit;
+- const int iswd = 1 << Template::wide_bit;
++
++ // extract prefix from flag bits:
++ Bytecodes::Prefix pfx = (Bytecodes::Prefix)(flags >> PrefixShift);
++ flags -= (int)pfx << PrefixShift; // erase fake flag bits
++
+ // determine which table to use
+- bool is_wide = (flags & iswd) != 0;
++ bool is_wide = (pfx != Bytecodes::Prefix_none);
+ // make sure that wide instructions have a vtos entry point
+ // (since they are executed extremely rarely, it doesn't pay out to have an
+ // extra set of 5 dispatch tables for the wide instructions - for simplicity
+ // they all go with one table)
+ assert(in == vtos || !is_wide, "wide instructions have vtos entry point only");
+- Template* t = is_wide ? template_for_wide(code) : template_for(code);
++ Template* t = is_wide ? template_for_prefix(pfx, code) : template_for(code);
+ // setup entry
+- t->initialize(flags, in, out, gen, arg);
++ t->initialize(code, pfx, flags, in, out, gen, arg);
+ assert(t->bytecode() == code, "just checkin'");
++ assert(t->prefix() == pfx, "just checkin'");
+ }
+
+
+@@ -253,9 +261,11 @@
+ const int ubcp = 1 << Template::uses_bcp_bit;
+ const int disp = 1 << Template::does_dispatch_bit;
+ const int clvm = 1 << Template::calls_vm_bit;
+- const int iswd = 1 << Template::wide_bit;
++ // Various kinds of prefixes:
++ const int tail = (int)(Bytecodes::Prefix_tail_call) << PrefixShift;
++ const int iswd = (int)(Bytecodes::Prefix_wide_index) << PrefixShift;
+ // interpr. templates
+- // Java spec bytecodes ubcp|disp|clvm|iswd in out generator argument
++ // Java spec bytecodes ubcp|disp|clvm|prfx in out generator argument
+ def(Bytecodes::_nop , ____|____|____|____, vtos, vtos, nop , _ );
+ def(Bytecodes::_aconst_null , ____|____|____|____, vtos, atos, aconst_null , _ );
+ def(Bytecodes::_iconst_m1 , ____|____|____|____, vtos, itos, iconst , -1 );
+@@ -473,6 +483,13 @@
+ def(Bytecodes::_ret , ubcp|disp|____|iswd, vtos, vtos, wide_ret , _ );
+ def(Bytecodes::_breakpoint , ubcp|disp|clvm|____, vtos, vtos, _breakpoint , _ );
+
++ if (TailCalls) {
++ def(Bytecodes::_invokevirtual , ubcp|disp|clvm|tail, vtos, vtos, wide_invokevirtual , 2 );
++ def(Bytecodes::_invokespecial , ubcp|disp|clvm|tail, vtos, vtos, wide_invokespecial , 1 );
++ def(Bytecodes::_invokestatic , ubcp|disp|clvm|tail, vtos, vtos, wide_invokestatic , 1 );
++ def(Bytecodes::_invokeinterface , ubcp|disp|clvm|tail, vtos, vtos, wide_invokeinterface , 1 );
++ }
++
+ // JVM bytecodes
+ def(Bytecodes::_fast_agetfield , ubcp|____|____|____, atos, atos, fast_accessfield , atos );
+ def(Bytecodes::_fast_bgetfield , ubcp|____|____|____, atos, itos, fast_accessfield , itos );
+@@ -536,6 +553,12 @@
+
+
+ void TemplateTable::unimplemented_bc() {
+- _masm->unimplemented( Bytecodes::name(_desc->bytecode()));
++ char buf[1024];
++ buf[0] = '\0';
++ if (_desc->has_prefix()) {
++ sprintf(buf, "%s:", Bytecodes::prefix_name(_desc->prefix()));
++ }
++ strcat(buf, Bytecodes::name(_desc->bytecode()));
++ _masm->unimplemented(buf);
+ }
+ #endif /* !CC_INTERP */
+diff --git a/src/share/vm/interpreter/templateTable.hpp b/src/share/vm/interpreter/templateTable.hpp
+--- a/src/share/vm/interpreter/templateTable.hpp
++++ b/src/share/vm/interpreter/templateTable.hpp
+@@ -47,14 +47,19 @@
+ TosState _tos_in; // tos cache state before template execution
+ TosState _tos_out; // tos cache state after template execution
+ generator _gen; // template code generator
+- int _arg; // argument for template code generator
+-
+- void initialize(int flags, TosState tos_in, TosState tos_out, generator gen, int arg);
++ int _arg; // argument for template code
++ // generator
++ int _prefix; // prefix
++ int _code;
++ void initialize(Bytecodes::Code code, Bytecodes::Prefix pfx,
++ int flags, TosState tos_in, TosState tos_out, generator gen, int arg);
+
+ friend class TemplateTable;
+
+ public:
+- Bytecodes::Code bytecode() const;
++ Bytecodes::Prefix prefix() const { return (Bytecodes::Prefix)_prefix; }
++ bool has_prefix() const { return (_prefix & Bytecodes::Prefix_none) || (_prefix & Bytecodes::Prefix_wide_index) || (_prefix & Bytecodes::Prefix_tail_call);} // NOTE(review): the Prefix_none term looks redundant if Prefix_none == 0 - confirm
++ Bytecodes::Code bytecode() const { return (Bytecodes::Code)_code; }
+ bool is_valid() const { return _gen != NULL; }
+ bool uses_bcp() const { return (_flags & (1 << uses_bcp_bit )) != 0; }
+ bool does_dispatch() const { return (_flags & (1 << does_dispatch_bit)) != 0; }
+@@ -244,8 +249,9 @@
+
+ static void _return(TosState state);
+
+- static void resolve_cache_and_index(int byte_no, Register cache, Register index);
++ static void resolve_cache_and_index(int byte_no, Register cache, Register index, bool is_tail_call);
+ static void load_invoke_cp_cache_entry(int byte_no,
++ bool is_tail_call,
+ Register method,
+ Register itable_index,
+ Register flags,
+@@ -262,6 +268,15 @@
+ static void invokestatic(int byte_no);
+ static void invokeinterface(int byte_no);
+ static void fast_invokevfinal(int byte_no);
++ // Tail calls
++ static void wide_invokevirtual(int byte_no);
++
++ static void wide_invokespecial(int byte_no);
++ static void wide_invokestatic(int byte_no);
++ static void wide_invokeinterface(int byte_no);
++#if 0
++ static void wide_fast_invokevfinal(int byte_no);
++#endif
+
+ static void getfield_or_static(int byte_no, bool is_static);
+ static void putfield_or_static(int byte_no, bool is_static);
+@@ -322,7 +337,15 @@
+
+ // Templates
+ static Template* template_for (Bytecodes::Code code) { Bytecodes::check (code); return &_template_table [code]; }
+- static Template* template_for_wide(Bytecodes::Code code) { Bytecodes::wide_check(code); return &_template_table_wide[code]; }
++ //static Template* template_for_wide(Bytecodes::Code code) {
++ //Bytecodes::wide_check(code); return &_template_table_wide[code]; }
++ static Template* template_for_prefix(Bytecodes::Prefix pfx, Bytecodes::Code code) {
++ Bytecodes::prefix_check(pfx, code);
++ assert((pfx & Bytecodes::Prefix_wide_index) || (pfx & Bytecodes::Prefix_tail_call),
++ "Only support wide now");
++
++ return &_template_table_wide[code];
++ }
+
+ // Platform specifics
+ #include "incls/_templateTable_pd.hpp.incl"
+diff --git a/src/share/vm/oops/generateOopMap.cpp b/src/share/vm/oops/generateOopMap.cpp
+--- a/src/share/vm/oops/generateOopMap.cpp
++++ b/src/share/vm/oops/generateOopMap.cpp
+@@ -549,7 +549,7 @@
+ break;
+ }
+ case Bytecodes::_jsr:
+- assert(bcs->is_wide()==false, "sanity check");
++ assert(bcs->is_wide_index()==false, "sanity check");
+ (*jmpFct)(this, bcs->dest(), data);
+
+
+@@ -1319,7 +1319,14 @@
+ break;
+ }
+ }
+-
++ // Check for tail calls and remember if one was encountered.
++ if (itr->is_tail_call()) {
++ assert(itr->code() == Bytecodes::_invokevirtual ||
++ itr->code() == Bytecodes::_invokestatic ||
++ itr->code() == Bytecodes::_invokeinterface ||
++ itr->code() == Bytecodes::_invokespecial, "Expect a method");
++ _contains_tail_call = true;
++ }
+ // abstract interpretation of current opcode
+ switch(itr->code()) {
+ case Bytecodes::_nop: break;
+@@ -2043,6 +2050,7 @@
+ _ret_adr_tos = new GrowableArray<intptr_t>(5); // 5 seems like a good number;
+ _did_rewriting = false;
+ _did_relocation = false;
++ _contains_tail_call = false;
+
+ if (TraceNewOopMapGeneration) {
+ tty->print("Method name: %s\n", method()->name()->as_C_string());
+diff --git a/src/share/vm/oops/generateOopMap.hpp b/src/share/vm/oops/generateOopMap.hpp
+--- a/src/share/vm/oops/generateOopMap.hpp
++++ b/src/share/vm/oops/generateOopMap.hpp
+@@ -298,6 +298,7 @@
+ bool _did_relocation; // was relocation neccessary
+ bool _monitor_safe; // The monitors in this method have been determined
+ // to be safe.
++ bool _contains_tail_call; // This method contains a tail call.
+
+ // Working Cell type state
+ int _state_len; // Size of states
+@@ -469,6 +470,8 @@
+
+ static void print_time();
+
++ // Method contains tail call.
++ bool contains_tail_call() { return _contains_tail_call; }
+ // Monitor query
+ bool monitor_safe() { return _monitor_safe; }
+
+@@ -557,3 +560,26 @@
+
+ // Call compute_map(CHECK) to generate info.
+ };
++
++//
++// Subclass used by the compiler to query whether a method contains tail calls.
++//
++class ContainsTailCallInfo : public GenerateOopMap {
++ public:
++  // Wraps 'method'. Call compute_map(CHECK) to generate info, then query
++  // contains_tail_call().
++  ContainsTailCallInfo(methodHandle method) : GenerateOopMap(method) {}
++
++ private:
++  // The report_*/allow_* queries all answer false and the fill_* hooks have
++  // empty bodies.
++  virtual bool report_results() const                  { return false; }
++  virtual bool report_init_vars() const                { return false; }
++  virtual bool allow_rewrites() const                  { return false; }
++  virtual bool possible_gc_point(BytecodeStream* bcs)  { return false; }
++  virtual void fill_stackmap_prolog(int nof_gc_points) {}
++  virtual void fill_stackmap_epilog()                  {}
++  virtual void fill_stackmap_for_opcodes(BytecodeStream* bcs, CellTypeState* vars,
++                                         CellTypeState* stack, int stack_top) {}
++  virtual void fill_init_vars(GrowableArray<intptr_t>* init_vars) {}
++};
+diff --git a/src/share/vm/oops/instanceKlass.hpp b/src/share/vm/oops/instanceKlass.hpp
+--- a/src/share/vm/oops/instanceKlass.hpp
++++ b/src/share/vm/oops/instanceKlass.hpp
+@@ -578,6 +578,7 @@
+ int object_size() const { return object_size(align_object_offset(vtable_length()) + align_object_offset(itable_length()) + static_field_size() + nonstatic_oop_map_size()); }
+ static int vtable_start_offset() { return header_size(); }
+ static int vtable_length_offset() { return oopDesc::header_size() + offset_of(instanceKlass, _vtable_len) / HeapWordSize; }
++ static int protection_domain_offset() { return oopDesc::header_size() + offset_of(instanceKlass, _protection_domain) / HeapWordSize; }
+ static int object_size(int extra) { return align_object_size(header_size() + extra); }
+
+ intptr_t* start_of_vtable() const { return ((intptr_t*)as_klassOop()) + vtable_start_offset(); }
+diff --git a/src/share/vm/oops/methodKlass.cpp b/src/share/vm/oops/methodKlass.cpp
+--- a/src/share/vm/oops/methodKlass.cpp
++++ b/src/share/vm/oops/methodKlass.cpp
+@@ -72,6 +72,7 @@
+ m->set_method_data(NULL);
+ m->set_interpreter_throwout_count(0);
+ m->set_vtable_index(methodOopDesc::garbage_vtable_index);
++ m->set_contains_tail_call(false);
+
+ // Fix and bury in methodOop
+ m->set_interpreter_entry(NULL); // sets i2i entry and from_int
+diff --git a/src/share/vm/oops/methodOop.cpp b/src/share/vm/oops/methodOop.cpp
+--- a/src/share/vm/oops/methodOop.cpp
++++ b/src/share/vm/oops/methodOop.cpp
+@@ -43,6 +43,26 @@
+ return _adapter->get_c2i_unverified_entry();
+ }
+
++address methodOopDesc::get_c2i_static_tail_call_entry() {
++ assert(_adapter != NULL, "must have");
++ return _adapter->get_c2i_static_tail_call_entry();
++}
++
++address methodOopDesc::get_c2i_unverified_tail_call_entry() {
++ assert(_adapter != NULL, "must have");
++ return _adapter->get_c2i_unverified_tail_call_entry();
++}
++
++address methodOopDesc::get_c2i_static_not_sibling_tail_call_entry() {
++ assert(_adapter != NULL, "must have");
++ return _adapter->get_c2i_static_not_sibling_tail_call_entry();
++}
++
++address methodOopDesc::get_c2i_unverified_not_sibling_tail_call_entry() {
++ assert(_adapter != NULL, "must have");
++ return _adapter->get_c2i_unverified_not_sibling_tail_call_entry();
++}
++
+ char* methodOopDesc::name_and_sig_as_C_string() {
+ return name_and_sig_as_C_string(Klass::cast(constants()->pool_holder()), name(), signature());
+ }
+@@ -610,6 +630,8 @@
+ _from_compiled_entry = NULL;
+ } else {
+ _from_compiled_entry = _adapter->get_c2i_entry();
++ _from_compiled_static_tail_call_entry = _adapter->get_c2i_static_tail_call_entry();
++ _from_compiled_not_sibling_static_tail_call_entry = _adapter->get_c2i_static_not_sibling_tail_call_entry();
+ }
+ OrderAccess::storestore();
+ _from_interpreted_entry = _i2i_entry;
+@@ -631,6 +653,8 @@
+ backedge_counter()->reset();
+ _adapter = NULL;
+ _from_compiled_entry = NULL;
++ _from_compiled_static_tail_call_entry = NULL;
++ _from_compiled_not_sibling_static_tail_call_entry = NULL;
+ assert(_method_data == NULL, "unexpected method data?");
+ set_method_data(NULL);
+ set_interpreter_throwout_count(0);
+@@ -671,7 +695,7 @@
+
+ }
+
+-address methodOopDesc::make_adapters(methodHandle mh, TRAPS) {
++ address methodOopDesc::make_adapters(methodHandle mh, TRAPS) {
+ // Adapters for compiled code are made eagerly here. They are fairly
+ // small (generally < 100 bytes) and quick to make (and cached and shared)
+ // so making them eagerly shouldn't be too expensive.
+@@ -682,6 +706,10 @@
+
+ mh->set_adapter_entry(adapter);
+ mh->_from_compiled_entry = adapter->get_c2i_entry();
++ mh->_from_compiled_static_tail_call_entry = adapter->get_c2i_static_tail_call_entry();
++ mh->_from_compiled_not_sibling_static_tail_call_entry =
++ adapter->get_c2i_static_not_sibling_tail_call_entry();
++
+ return adapter->get_c2i_entry();
+ }
+
+@@ -698,6 +726,19 @@
+ return _from_compiled_entry;
+ }
+
++
++address methodOopDesc::verified_static_tail_call_code_entry() {
++ debug_only(No_Safepoint_Verifier nsv;)
++ assert(_from_compiled_static_tail_call_entry != NULL, "must be set");
++ return _from_compiled_static_tail_call_entry;
++}
++
++address methodOopDesc::verified_not_sibling_static_tail_call_code_entry() {
++ debug_only(No_Safepoint_Verifier nsv;)
++ assert(_from_compiled_not_sibling_static_tail_call_entry != NULL, "must be set");
++ return _from_compiled_not_sibling_static_tail_call_entry;
++}
++
+ // Check that if an nmethod ref exists, it has a backlink to this or no backlink at all
+ // (could be racing a deopt).
+ // Not inline to avoid circular ref.
+@@ -725,13 +766,19 @@
+ if (comp_level > highest_tier_compile()) {
+ set_highest_tier_compile(comp_level);
+ }
++ // Set methodoop and c2i entry point in tail call stubs.
++ if ( code->is_java_method() && code->is_compiled_by_c1())
++ code->set_adapter_info_in_tail_call_stubs(mh(), mh->adapter());
+
+ OrderAccess::storestore();
+ mh->_from_compiled_entry = code->verified_entry_point();
+ OrderAccess::storestore();
++ mh->_from_compiled_static_tail_call_entry = code->static_tail_call_entry_point();
++ OrderAccess::storestore();
++ mh->_from_compiled_not_sibling_static_tail_call_entry = code->static_not_sibling_tail_call_entry_point();
+ // Instantly compiled code can execute.
+ mh->_from_interpreted_entry = mh->get_i2c_entry();
+-
++
+ }
+
+
+diff --git a/src/share/vm/oops/methodOop.hpp b/src/share/vm/oops/methodOop.hpp
+--- a/src/share/vm/oops/methodOop.hpp
++++ b/src/share/vm/oops/methodOop.hpp
+@@ -128,6 +128,12 @@
+ nmethod* volatile _code; // Points to the corresponding piece of native code
+ volatile address _from_interpreted_entry; // Cache of _code ? _adapter->i2c_entry() : _i2i_entry
+
++ // Entry point for static tail calling from compiled code.
++ volatile address _from_compiled_static_tail_call_entry;
++ volatile address _from_compiled_not_sibling_static_tail_call_entry;
++ // Probably should go in access flags, but I am unsure whether there is a bit
++ // left.
++ bool _contains_tail_call;
+ public:
+
+ static const bool IsUnsafeConc = false;
+@@ -140,7 +146,10 @@
+
+ static address make_adapters(methodHandle mh, TRAPS);
+ volatile address from_compiled_entry() const { return (address)OrderAccess::load_ptr_acquire(&_from_compiled_entry); }
++ volatile address from_compiled_static_tail_call_entry() const { return (address)OrderAccess::load_ptr_acquire(&_from_compiled_static_tail_call_entry); }
++ volatile address from_compiled_not_sibling_static_tail_call_entry() const { return (address)OrderAccess::load_ptr_acquire(&_from_compiled_not_sibling_static_tail_call_entry); }
+ volatile address from_interpreted_entry() const{ return (address)OrderAccess::load_ptr_acquire(&_from_interpreted_entry); }
++
+
+ // access flag
+ AccessFlags access_flags() const { return _access_flags; }
+@@ -302,6 +311,9 @@
+
+ // nmethod/verified compiler entry
+ address verified_code_entry();
++ address verified_static_tail_call_code_entry();
++ address verified_not_sibling_static_tail_call_code_entry();
++
+ bool check_code() const; // Not inline to avoid circular ref
+ nmethod* volatile code() const { assert( check_code(), "" ); return (nmethod *)OrderAccess::load_ptr_acquire(&_code); }
+ void clear_code(); // Clear out any compiled code
+@@ -310,6 +322,11 @@
+ address get_i2c_entry();
+ address get_c2i_entry();
+ address get_c2i_unverified_entry();
++ address get_c2i_static_tail_call_entry();
++ address get_c2i_unverified_tail_call_entry();
++ address get_c2i_static_not_sibling_tail_call_entry();
++ address get_c2i_unverified_not_sibling_tail_call_entry();
++
+ AdapterHandlerEntry* adapter() { return _adapter; }
+ // setup entry points
+ void link_method(methodHandle method, TRAPS);
+@@ -458,6 +475,11 @@
+ bool guaranteed_monitor_matching() const { return access_flags().is_monitor_matching(); }
+ void set_guaranteed_monitor_matching() { _access_flags.set_monitor_matching(); }
+
++ // Does this method contain a tail call? Might return false because the info
++ // has not been computed yet, so rely only on a true value.
++ bool contains_tail_call() const { return _contains_tail_call; }
++ void set_contains_tail_call(bool does_contain) { _contains_tail_call = does_contain; }
++
+ // returns true if the method is an accessor function (setter/getter).
+ bool is_accessor() const;
+
+@@ -486,6 +508,9 @@
+ static ByteSize size_of_locals_offset() { return byte_offset_of(methodOopDesc, _max_locals ); }
+ static ByteSize size_of_parameters_offset() { return byte_offset_of(methodOopDesc, _size_of_parameters); }
+ static ByteSize from_compiled_offset() { return byte_offset_of(methodOopDesc, _from_compiled_entry); }
++ static ByteSize from_compiled_static_tail_call_offset() { return byte_offset_of(methodOopDesc, _from_compiled_static_tail_call_entry); }
++ static ByteSize from_compiled_not_sibling_static_tail_call_offset() { return byte_offset_of(methodOopDesc, _from_compiled_not_sibling_static_tail_call_entry); }
++
+ static ByteSize code_offset() { return byte_offset_of(methodOopDesc, _code); }
+ static ByteSize invocation_counter_offset() { return byte_offset_of(methodOopDesc, _invocation_counter); }
+ static ByteSize backedge_counter_offset() { return byte_offset_of(methodOopDesc, _backedge_counter); }
+diff --git a/src/share/vm/prims/jvmtiClassFileReconstituter.cpp b/src/share/vm/prims/jvmtiClassFileReconstituter.cpp
+--- a/src/share/vm/prims/jvmtiClassFileReconstituter.cpp
++++ b/src/share/vm/prims/jvmtiClassFileReconstituter.cpp
+@@ -642,7 +642,7 @@
+ assert(len > 0, "length must be > 0");
+
+ // copy the bytecodes
+- *p = (unsigned char) (bs.is_wide()? Bytecodes::_wide : code);
++ *p = (unsigned char) (bs.is_wide_index() || bs.is_tail_call()? Bytecodes::_wide : code);
+ if (len > 1) {
+ memcpy(p+1, bcp+1, len-1);
+ }
+diff --git a/src/share/vm/prims/methodComparator.cpp b/src/share/vm/prims/methodComparator.cpp
+--- a/src/share/vm/prims/methodComparator.cpp
++++ b/src/share/vm/prims/methodComparator.cpp
+@@ -236,7 +236,7 @@
+ case Bytecodes::_lload : // fall through
+ case Bytecodes::_lstore : // fall through
+ case Bytecodes::_ret :
+- if (_s_old->is_wide() != _s_new->is_wide())
++ if (_s_old->is_wide_index() != _s_new->is_wide_index())
+ return false;
+ if (_s_old->get_index() != _s_new->get_index())
+ return false;
+@@ -282,9 +282,9 @@
+ }
+
+ case Bytecodes::_iinc :
+- if (_s_old->is_wide() != _s_new->is_wide())
++ if (_s_old->is_wide_index() != _s_new->is_wide_index())
+ return false;
+- if (! _s_old->is_wide()) {
++ if (! _s_old->is_wide_index()) {
+ if (_s_old->get_index_big() != _s_new->get_index_big())
+ return false;
+ } else {
+diff --git a/src/share/vm/runtime/frame.cpp b/src/share/vm/runtime/frame.cpp
+--- a/src/share/vm/runtime/frame.cpp
++++ b/src/share/vm/runtime/frame.cpp
+@@ -326,7 +326,10 @@
+
+ // Interpreter frames
+
+-
++void frame::interpreter_frame_set_osr(int turn_off_osr) {  // name matches the declaration in frame.hpp
++  assert(is_interpreted_frame(), "Not an interpreted frame");
++  *interpreter_frame_osr_addr() = turn_off_osr;  // stores the value into this frame's OSR slot
++}
+ void frame::interpreter_frame_set_locals(intptr_t* locs) {
+ assert(is_interpreted_frame(), "Not an interpreted frame");
+ *interpreter_frame_locals_addr() = locs;
+diff --git a/src/share/vm/runtime/frame.hpp b/src/share/vm/runtime/frame.hpp
+--- a/src/share/vm/runtime/frame.hpp
++++ b/src/share/vm/runtime/frame.hpp
+@@ -186,6 +186,7 @@
+ // Interpreter frames:
+
+ private:
++ int32_t * interpreter_frame_osr_addr() const;
+ intptr_t** interpreter_frame_locals_addr() const;
+ intptr_t* interpreter_frame_bcx_addr() const;
+ intptr_t* interpreter_frame_mdx_addr() const;
+@@ -218,6 +219,9 @@
+ intptr_t interpreter_frame_bcx() const { return *interpreter_frame_bcx_addr(); }
+ void interpreter_frame_set_bcx(intptr_t bcx);
+
++ // OSR disabling stuff: used by tail calls. If passed a value other than 0,
++ // OSR is turned off for this frame.
++ void interpreter_frame_set_osr(int turn_off_osr);
+ // byte code index
+ jint interpreter_frame_bci() const;
+ void interpreter_frame_set_bci(jint bci);
+diff --git a/src/share/vm/runtime/globals.hpp b/src/share/vm/runtime/globals.hpp
+--- a/src/share/vm/runtime/globals.hpp
++++ b/src/share/vm/runtime/globals.hpp
+@@ -3260,6 +3260,11 @@
+ product(bool, AnonymousClasses, false, \
+ "support sun.misc.Unsafe.defineAnonymousClass") \
+ \
++ product(bool, TailCalls, false, \
++ "recognize the tailcall instruction prefix") \
++ develop(bool, TraceTailCalls, false, \
++ "trace tail calls") \
++ \
+ product(bool, TaggedStackInterpreter, false, \
+ "Insert tags in interpreter execution stack for oopmap generaion")\
+ \
+diff --git a/src/share/vm/runtime/sharedRuntime.cpp b/src/share/vm/runtime/sharedRuntime.cpp
+--- a/src/share/vm/runtime/sharedRuntime.cpp
++++ b/src/share/vm/runtime/sharedRuntime.cpp
+@@ -783,9 +783,9 @@
+ // Find caller and bci from vframe
+ methodHandle caller (THREAD, vfst.method());
+ int bci = vfst.bci();
+-
+ // Find bytecode
+ Bytecode_invoke* bytecode = Bytecode_invoke_at(caller, bci);
++ callinfo.set_tail_call(bytecode->is_tailcall());
+ bc = bytecode->adjusted_invoke_code();
+ int bytecode_index = bytecode->index();
+
+@@ -869,9 +869,11 @@
+ // Resolves a call.
+ methodHandle SharedRuntime::resolve_helper(JavaThread *thread,
+ bool is_virtual,
+- bool is_optimized, TRAPS) {
++ bool is_optimized,
++ bool is_tail_call,
++ bool is_sibling_call, TRAPS) {
+ methodHandle callee_method;
+- callee_method = resolve_sub_helper(thread, is_virtual, is_optimized, THREAD);
++ callee_method = resolve_sub_helper(thread, is_virtual, is_optimized, is_tail_call, is_sibling_call, THREAD);
+ if (JvmtiExport::can_hotswap_or_post_breakpoint()) {
+ int retry_count = 0;
+ while (!HAS_PENDING_EXCEPTION && callee_method->is_old() &&
+@@ -888,17 +890,47 @@
+ guarantee((retry_count++ < 100),
+ "Could not resolve to latest version of redefined method");
+ // method is redefined in the middle of resolve so re-try.
+- callee_method = resolve_sub_helper(thread, is_virtual, is_optimized, THREAD);
++ callee_method = resolve_sub_helper(thread, is_virtual, is_optimized, is_tail_call, is_sibling_call, THREAD);
+ }
+ }
+ return callee_method;
+ }
+
++instanceKlassHandle caller_klass_from_frame(JavaThread * thread, Thread * the_thread) {
++  // Walk 'thread' with a vframeStream; all handles are created on 'the_thread'.
++  vframeStream vfst(thread, true);
++  assert(!vfst.at_end(), "Java frame must exist");
++  methodHandle caller(the_thread, vfst.method());
++  KlassHandle holder(the_thread, caller->method_holder());
++  assert(holder->oop_is_instance(), "caller should be instanceoop");
++  return instanceKlassHandle(the_thread, holder());
++}
++
++// Check whether the protection domains of caller and callee are equal.
++// The callee is the selected klass of 'call_info'; the caller klass is the
++// one returned by caller_klass_from_frame(thread, THREAD).
++bool SharedRuntime::protection_domains_match(JavaThread * thread,
++                                             CallInfo& call_info,
++                                             TRAPS) {
++  KlassHandle callee_klass(call_info.selected_klass());
++  assert(callee_klass->oop_is_instance(), "callee should be instanceoop");
++
++  instanceKlassHandle callee_instance_klass(THREAD, callee_klass());
++  instanceKlassHandle caller_instance_klass(caller_klass_from_frame(thread, THREAD));
++
++  // The domains match only if both klasses refer to the same object.
++  const bool same_domain = caller_instance_klass->protection_domain() ==
++                           callee_instance_klass->protection_domain();
++  return same_domain;
++}
++
+ // Resolves a call. The compilers generate code for calls that go here
+ // and are patched with the real destination of the call.
+ methodHandle SharedRuntime::resolve_sub_helper(JavaThread *thread,
+ bool is_virtual,
+- bool is_optimized, TRAPS) {
++ bool is_optimized,
++ bool is_tail_call,
++ bool is_sibling_call, TRAPS) {
+
+ ResourceMark rm(thread);
+ RegisterMap cbl_map(thread, false);
+@@ -941,6 +973,20 @@
+ }
+ #endif
+
++ // Check whether protection domains match. Currently we throw an exception if
++ // they don't. Might change to changing is_tail_call -> false in the future
++ if (is_tail_call) {
++ methodHandle nullHandle;
++ bool pd_match = protection_domains_match(thread, call_info, CHECK_(nullHandle));
++ if (pd_match == false) {
++ tty->print_cr("protection domains don't match");
++ THROW_0(vmSymbols::java_lang_TailCallException());
++ // instead we could set
++ //is_tail_call = false;
++ // and compress the stack after a while
++ }
++ }
++
+ // Compute entry points. This might require generation of C2I converter
+ // frames, so we cannot be holding any locks here. Furthermore, the
+ // computation of the entry points is independent of patching the call. We
+@@ -966,11 +1012,11 @@
+ bool static_bound = call_info.resolved_method()->can_be_statically_bound();
+ KlassHandle h_klass(THREAD, receiver->klass());
+ CompiledIC::compute_monomorphic_entry(callee_method, h_klass,
+- is_optimized, static_bound, virtual_call_info,
+- CHECK_(methodHandle()));
++ is_optimized, static_bound, is_tail_call, is_sibling_call,
++ virtual_call_info, CHECK_(methodHandle()));
+ } else {
+ // static call
+- CompiledStaticCall::compute_entry(callee_method, static_call_info);
++ CompiledStaticCall::compute_entry(callee_method, static_call_info, is_tail_call, is_sibling_call);
+ }
+
+ // grab lock, check for deoptimization and potentially patch caller
+@@ -989,13 +1035,38 @@
+ }
+ #endif
+ if (is_virtual) {
++ if (is_tail_call) {
++ // last java frame on stack (which includes native call frames)
++ vframeStream vfst(thread, true); // Do not skip and javaCalls
++ assert(!vfst.at_end(), "Arnold Java frame must exist");
++ // Find caller and bci from vframe
++ methodHandle caller (THREAD, vfst.method());
++ // check caller class
++ KlassHandle caller_klass(THREAD, caller->method_holder());
++ if (caller_klass->oop_is_instance()) {
++ instanceKlassHandle caller_instance_klass(THREAD, caller_klass());
++ tty->print_cr("ARNOLD: caller class");
++ //caller_klass.print();
++ } else {
++ tty->print_cr("ARNOLD: error caller klass not instance klass");
++ }
++ }
+ CompiledIC* inline_cache = CompiledIC_before(caller_frame.pc());
+ if (inline_cache->is_clean()) {
+ inline_cache->set_to_monomorphic(virtual_call_info);
+ }
+ } else {
+- CompiledStaticCall* ssc = compiledStaticCall_before(caller_frame.pc());
+- if (ssc->is_clean()) ssc->set(static_call_info);
++ if (is_tail_call) {
++ CompiledStaticCall * ssc = compiledStaticCall_before(caller_frame.pc());
++ if (ssc->is_clean_static_tail_call())
++ ssc->set_tail_call(static_call_info);
++ else assert(false, "Something is wrong here.");
++ } else {
++ CompiledStaticCall* ssc = compiledStaticCall_before(caller_frame.pc());
++ if (ssc->is_clean()) {
++ ssc->set(static_call_info);
++ }
++ }
+ }
+ }
+
+@@ -1004,14 +1075,51 @@
+ return callee_method;
+ }
+
++// Compute entry point for the type of a call when we are in a
++// handle_wrong_method_xxx stub.
++static address get_entry_for_tail_call_type(methodOop callee, address caller_pc, bool want_c2i_entry) {
++  // The relocation on the call that precedes 'caller_pc' records the call kind.
++  assert(NativeCall::is_call_before(caller_pc), "must be a call");
++  NativeCall* call_site = nativeCall_before(caller_pc);
++  CodeBlob* cb = CodeCache::find_blob(caller_pc);
++  RelocIterator iter(cb, call_site->instruction_address(), call_site->next_instruction_address());
++  iter.next();
++  assert(iter.has_current(), "must have a reloc at java call site");
++  // Both entries are looked up; 'want_c2i_entry' selects which one is returned.
++  address c2i_entry = NULL, verified_entry = NULL;
++  switch (iter.tail_call_type()) {
++  case relocInfo::not_tail_call:
++    c2i_entry = callee->get_c2i_entry();
++    verified_entry = callee->verified_code_entry();
++    break;
++  case relocInfo::sibling_tail_call_type:
++    c2i_entry = callee->get_c2i_static_tail_call_entry();
++    verified_entry = callee->verified_static_tail_call_code_entry();
++    break;
++  case relocInfo::not_sibling_tail_call_type:
++    c2i_entry = callee->get_c2i_static_not_sibling_tail_call_entry();
++    verified_entry = callee->verified_not_sibling_static_tail_call_code_entry();
++    break;
++  default: assert(0, "oops"); break;
++  }
++  if (want_c2i_entry) {
++    assert(c2i_entry!=NULL, "c2i_entry not null");
++    return c2i_entry;
++  }
++  assert(verified_entry!=NULL, "verified_entry not null");
++  return verified_entry;
++}
+
+ // Inline caches exist only in compiled code
+ JRT_BLOCK_ENTRY(address, SharedRuntime::handle_wrong_method_ic_miss(JavaThread* thread))
+-#ifdef ASSERT
++
+ RegisterMap reg_map(thread, false);
+ frame stub_frame = thread->last_frame();
++#ifdef ASSERT
+ assert(stub_frame.is_runtime_frame(), "sanity check");
++#endif
+ frame caller_frame = stub_frame.sender(&reg_map);
++#ifdef ASSERT
+ assert(!caller_frame.is_interpreted_frame() && !caller_frame.is_entry_frame(), "unexpected frame");
+ #endif /* ASSERT */
+
+@@ -1023,7 +1131,8 @@
+ JRT_BLOCK_END
+ // return compiled code entry point after potential safepoints
+ assert(callee_method->verified_code_entry() != NULL, " Jump to zero!");
+- return callee_method->verified_code_entry();
++ return get_entry_for_tail_call_type(callee_method(), caller_frame.pc(), false);
++ //return callee_method->verified_code_entry();
+ JRT_END
+
+
+@@ -1042,12 +1151,16 @@
+ frame stub_frame = thread->last_frame();
+ assert(stub_frame.is_runtime_frame(), "sanity check");
+ frame caller_frame = stub_frame.sender(&reg_map);
++ address pc = caller_frame.pc();
++
+ if (caller_frame.is_interpreted_frame() || caller_frame.is_entry_frame() ) {
+ methodOop callee = thread->callee_target();
+ guarantee(callee != NULL && callee->is_method(), "bad handshake");
+ thread->set_vm_result(callee);
+ thread->set_callee_target(NULL);
+- return callee->get_c2i_entry();
++
++ return get_entry_for_tail_call_type(callee, pc, true);
++ //return callee->get_c2i_entry();
+ }
+
+ // Must be compiled to compiled path which is safe to stackwalk
+@@ -1059,7 +1172,8 @@
+ JRT_BLOCK_END
+ // return compiled code entry point after potential safepoints
+ assert(callee_method->verified_code_entry() != NULL, " Jump to zero!");
+- return callee_method->verified_code_entry();
++ //return callee_method->verified_code_entry();
++ return get_entry_for_tail_call_type(callee_method(), pc, false);
+ JRT_END
+
+
+@@ -1067,7 +1181,7 @@
+ JRT_BLOCK_ENTRY(address, SharedRuntime::resolve_static_call_C(JavaThread *thread ))
+ methodHandle callee_method;
+ JRT_BLOCK
+- callee_method = SharedRuntime::resolve_helper(thread, false, false, CHECK_NULL);
++ callee_method = SharedRuntime::resolve_helper(thread, false, false, false, false, CHECK_NULL);
+ thread->set_vm_result(callee_method());
+ JRT_BLOCK_END
+ // return compiled code entry point after potential safepoints
+@@ -1075,12 +1189,34 @@
+ return callee_method->verified_code_entry();
+ JRT_END
+
++// Resolve a static tail call and patch code; returns the callee's verified static tail call entry.
++JRT_BLOCK_ENTRY(address, SharedRuntime::resolve_static_tail_call_C(JavaThread *thread ))
++ methodHandle callee_method;
++ JRT_BLOCK
++ callee_method = SharedRuntime::resolve_helper(thread, false, false, true, true, CHECK_NULL); // NOTE(review): flags presumably (is_virtual, is_optimized, is_tail_call, is_sibling) - confirm against resolve_helper
++ thread->set_vm_result(callee_method()); // publish the resolved method through the thread's vm_result
++ JRT_BLOCK_END
++ // return compiled code entry point after potential safepoints
++ assert(callee_method->verified_static_tail_call_code_entry() != NULL, " Jump to zero!");
++ return callee_method->verified_static_tail_call_code_entry();
++JRT_END
++
++JRT_BLOCK_ENTRY(address, SharedRuntime::resolve_not_sibling_static_tail_call_C(JavaThread *thread )) // resolve a not-sibling static tail call
++ methodHandle callee_method;
++ JRT_BLOCK
++ callee_method = SharedRuntime::resolve_helper(thread, false, false, true, false, CHECK_NULL); // NOTE(review): flags presumably (is_virtual, is_optimized, is_tail_call, is_sibling) - confirm against resolve_helper
++ thread->set_vm_result(callee_method()); // publish the resolved method through the thread's vm_result
++ JRT_BLOCK_END
++ // return the not-sibling static tail call entry point after potential safepoints
++ assert(callee_method->verified_not_sibling_static_tail_call_code_entry() != NULL, " Jump to zero!");
++ return callee_method->verified_not_sibling_static_tail_call_code_entry();
++JRT_END
+
+ // resolve virtual call and update inline cache to monomorphic
+ JRT_BLOCK_ENTRY(address, SharedRuntime::resolve_virtual_call_C(JavaThread *thread ))
+ methodHandle callee_method;
+ JRT_BLOCK
+- callee_method = SharedRuntime::resolve_helper(thread, true, false, CHECK_NULL);
++ callee_method = SharedRuntime::resolve_helper(thread, true, false, false, false, CHECK_NULL);
+ thread->set_vm_result(callee_method());
+ JRT_BLOCK_END
+ // return compiled code entry point after potential safepoints
+@@ -1088,13 +1224,60 @@
+ return callee_method->verified_code_entry();
+ JRT_END
+
++// Resolve a virtual tail call and update inline cache to monomorphic; returns the callee's verified static tail call entry.
++JRT_BLOCK_ENTRY(address, SharedRuntime::resolve_virtual_tail_call_C(JavaThread *thread ))
++ methodHandle callee_method;
++ JRT_BLOCK
++ callee_method = SharedRuntime::resolve_helper(thread, true, false, true, true, CHECK_NULL); // NOTE(review): flags presumably (is_virtual, is_optimized, is_tail_call, is_sibling) - confirm against resolve_helper
++ thread->set_vm_result(callee_method()); // publish the resolved method through the thread's vm_result
++ JRT_BLOCK_END
++ // return compiled code entry point after potential safepoints
++ assert(callee_method->verified_static_tail_call_code_entry() != NULL, " Jump to zero!");
++ return callee_method->verified_static_tail_call_code_entry();
++JRT_END
++
++// Resolve a not-sibling virtual tail call and update inline cache to monomorphic; returns the callee's verified not-sibling tail call entry.
++JRT_BLOCK_ENTRY(address, SharedRuntime::resolve_not_sibling_virtual_tail_call_C(JavaThread *thread ))
++ methodHandle callee_method;
++ JRT_BLOCK
++ callee_method = SharedRuntime::resolve_helper(thread, true, false, true, false, CHECK_NULL);
++ thread->set_vm_result(callee_method()); // publish the resolved method through the thread's vm_result
++ JRT_BLOCK_END
++ // return compiled code entry point after potential safepoints; the assert must guard the entry that is actually returned
++ assert(callee_method->verified_not_sibling_static_tail_call_code_entry() != NULL, " Jump to zero!");
++ return callee_method->verified_not_sibling_static_tail_call_code_entry();
++JRT_END
++
++// Resolve a virtual tail call that can be statically bound (e.g., always
++// monomorphic, so it has no inline cache). Patch code to resolved target and return its verified static tail call entry.
++JRT_BLOCK_ENTRY(address, SharedRuntime::resolve_opt_virtual_tail_call_C(JavaThread *thread ))
++ methodHandle callee_method;
++ JRT_BLOCK
++ callee_method = SharedRuntime::resolve_helper(thread, true, true, true, true, CHECK_NULL); // NOTE(review): flags presumably (is_virtual, is_optimized, is_tail_call, is_sibling) - confirm against resolve_helper
++ thread->set_vm_result(callee_method()); // publish the resolved method through the thread's vm_result
++ JRT_BLOCK_END
++ // return compiled code entry point after potential safepoints
++ assert(callee_method->verified_static_tail_call_code_entry() != NULL, " Jump to zero!");
++ return callee_method->verified_static_tail_call_code_entry();
++JRT_END
++
++JRT_BLOCK_ENTRY(addre