annotate src/cpu/ppc/vm/stubGenerator_ppc.cpp @ 8979:bcccbecdde63

8131048: ppc implement CRC32 intrinsic
Reviewed-by: goetz
author gromero
date Mon, 24 Sep 2018 17:18:38 -0400
parents 9575483cce09
children f892c3b6b651
rev   line source
goetz@6023 1 /*
goetz@6023 2 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
goetz@6077 3 * Copyright 2012, 2014 SAP AG. All rights reserved.
goetz@6023 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
goetz@6023 5 *
goetz@6023 6 * This code is free software; you can redistribute it and/or modify it
goetz@6023 7 * under the terms of the GNU General Public License version 2 only, as
goetz@6023 8 * published by the Free Software Foundation.
goetz@6023 9 *
goetz@6023 10 * This code is distributed in the hope that it will be useful, but WITHOUT
goetz@6023 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
goetz@6023 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
goetz@6023 13 * version 2 for more details (a copy is included in the LICENSE file that
goetz@6023 14 * accompanied this code).
goetz@6023 15 *
goetz@6023 16 * You should have received a copy of the GNU General Public License version
goetz@6023 17 * 2 along with this work; if not, write to the Free Software Foundation,
goetz@6023 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
goetz@6023 19 *
goetz@6023 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
goetz@6023 21 * or visit www.oracle.com if you need additional information or have any
goetz@6023 22 * questions.
goetz@6023 23 *
goetz@6023 24 */
goetz@6023 25
goetz@6023 26 #include "precompiled.hpp"
goetz@6023 27 #include "asm/macroAssembler.inline.hpp"
goetz@6023 28 #include "interpreter/interpreter.hpp"
goetz@6023 29 #include "nativeInst_ppc.hpp"
goetz@6023 30 #include "oops/instanceOop.hpp"
goetz@6023 31 #include "oops/method.hpp"
goetz@6023 32 #include "oops/objArrayKlass.hpp"
goetz@6023 33 #include "oops/oop.inline.hpp"
goetz@6023 34 #include "prims/methodHandles.hpp"
goetz@6023 35 #include "runtime/frame.inline.hpp"
goetz@6023 36 #include "runtime/handles.inline.hpp"
goetz@6023 37 #include "runtime/sharedRuntime.hpp"
goetz@6023 38 #include "runtime/stubCodeGenerator.hpp"
goetz@6023 39 #include "runtime/stubRoutines.hpp"
goetz@6023 40 #include "utilities/top.hpp"
goetz@6077 41 #include "runtime/thread.inline.hpp"
goetz@6023 42
goetz@6023 43 #define __ _masm->
goetz@6023 44
goetz@6023 45 #ifdef PRODUCT
goetz@6023 46 #define BLOCK_COMMENT(str) // nothing
goetz@6023 47 #else
goetz@6023 48 #define BLOCK_COMMENT(str) __ block_comment(str)
goetz@6023 49 #endif
goetz@6023 50
goetz@6023 51 class StubGenerator: public StubCodeGenerator {
goetz@6023 52 private:
goetz@6023 53
goetz@6023 54 // Call stubs are used to call Java from C
goetz@6023 55 //
goetz@6023 56 // Arguments:
goetz@6023 57 //
goetz@6023 58 // R3 - call wrapper address : address
goetz@6023 59 // R4 - result : intptr_t*
goetz@6023 60 // R5 - result type : BasicType
goetz@6023 61 // R6 - method : Method
goetz@6023 62 // R7 - frame mgr entry point : address
goetz@6023 63 // R8 - parameter block : intptr_t*
goetz@6023 64 // R9 - parameter count in words : int
goetz@6023 65 // R10 - thread : Thread*
goetz@6023 66 //
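// For illustration only (editor's sketch, not part of the annotated source): the
// register assignment above corresponds roughly to the C prototype the VM uses to
// enter Java, i.e. something of the form
//
//   void call_stub(address   call_wrapper,    // R3
//                  intptr_t* result,          // R4
//                  BasicType result_type,     // R5
//                  Method*   method,          // R6
//                  address   entry_point,     // R7 (frame manager / native entry)
//                  intptr_t* parameters,      // R8
//                  int       parameter_words, // R9
//                  Thread*   thread);         // R10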
goetz@6023 67 address generate_call_stub(address& return_address) {
goetz@6023 68 // Setup a new c frame, copy java arguments, call frame manager or
goetz@6023 69 // native_entry, and process result.
goetz@6023 70
goetz@6023 71 StubCodeMark mark(this, "StubRoutines", "call_stub");
goetz@6023 72
goetz@6076 73 address start = __ function_entry();
goetz@6023 74
goetz@6023 75 // some sanity checks
goetz@6076 76 assert((sizeof(frame::abi_minframe) % 16) == 0, "unaligned");
goetz@6076 77 assert((sizeof(frame::abi_reg_args) % 16) == 0, "unaligned");
goetz@6023 78 assert((sizeof(frame::spill_nonvolatiles) % 16) == 0, "unaligned");
goetz@6023 79 assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
goetz@6023 80 assert((sizeof(frame::entry_frame_locals) % 16) == 0, "unaligned");
goetz@6023 81
goetz@6023 82 Register r_arg_call_wrapper_addr = R3;
goetz@6023 83 Register r_arg_result_addr = R4;
goetz@6023 84 Register r_arg_result_type = R5;
goetz@6023 85 Register r_arg_method = R6;
goetz@6023 86 Register r_arg_entry = R7;
goetz@6023 87 Register r_arg_thread = R10;
goetz@6023 88
goetz@6023 89 Register r_temp = R24;
goetz@6023 90 Register r_top_of_arguments_addr = R25;
goetz@6023 91 Register r_entryframe_fp = R26;
goetz@6023 92
goetz@6023 93 {
goetz@6023 94 // Stack on entry to call_stub:
goetz@6023 95 //
goetz@6023 96 // F1 [C_FRAME]
goetz@6023 97 // ...
goetz@6023 98
goetz@6023 99 Register r_arg_argument_addr = R8;
goetz@6023 100 Register r_arg_argument_count = R9;
goetz@6023 101 Register r_frame_alignment_in_bytes = R27;
goetz@6023 102 Register r_argument_addr = R28;
goetz@6023 103 Register r_argumentcopy_addr = R29;
goetz@6023 104 Register r_argument_size_in_bytes = R30;
goetz@6023 105 Register r_frame_size = R23;
goetz@6023 106
goetz@6023 107 Label arguments_copied;
goetz@6023 108
goetz@6023 109 // Save LR/CR to caller's C_FRAME.
goetz@6023 110 __ save_LR_CR(R0);
goetz@6023 111
goetz@6023 112 // Zero extend arg_argument_count.
goetz@6023 113 __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
goetz@6023 114
goetz@6023 115 // Save non-volatile GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
goetz@6023 116 __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
goetz@6023 117
goetz@6023 118 // Keep copy of our frame pointer (caller's SP).
goetz@6023 119 __ mr(r_entryframe_fp, R1_SP);
goetz@6023 120
goetz@6023 121 BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
goetz@6023 122 // Push ENTRY_FRAME including arguments:
goetz@6023 123 //
goetz@6023 124 // F0 [TOP_IJAVA_FRAME_ABI]
goetz@6023 125 // alignment (optional)
goetz@6023 126 // [outgoing Java arguments]
goetz@6023 127 // [ENTRY_FRAME_LOCALS]
goetz@6023 128 // F1 [C_FRAME]
goetz@6023 129 // ...
goetz@6023 130
goetz@6023 131 // calculate frame size
goetz@6023 132
goetz@6023 133 // unaligned size of arguments
goetz@6023 134 __ sldi(r_argument_size_in_bytes,
goetz@6023 135 r_arg_argument_count, Interpreter::logStackElementSize);
goetz@6023 136 // arguments alignment (max 1 slot)
goetz@6023 137 // FIXME: use round_to() here
goetz@6023 138 __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
goetz@6023 139 __ sldi(r_frame_alignment_in_bytes,
goetz@6060 140 r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
goetz@6023 141
goetz@6023 142 // size = unaligned size of arguments + top abi's size
goetz@6023 143 __ addi(r_frame_size, r_argument_size_in_bytes,
goetz@6023 144 frame::top_ijava_frame_abi_size);
goetz@6023 145 // size += arguments alignment
goetz@6023 146 __ add(r_frame_size,
goetz@6060 147 r_frame_size, r_frame_alignment_in_bytes);
goetz@6023 148 // size += size of call_stub locals
goetz@6023 149 __ addi(r_frame_size,
goetz@6023 150 r_frame_size, frame::entry_frame_locals_size);
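// Editor's note (illustration only, not part of the annotated source): for an odd
// argument count the computation above is, in plain arithmetic,
//   frame_size = count*8 + 8 (alignment slot) + top_ijava_frame_abi_size
//              + entry_frame_locals_size
// e.g. count = 5  ->  40 + 8 + ABI + locals, keeping the pushed frame 16-byte aligned
// (the ABI and locals sizes are asserted to be multiples of 16 above).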
goetz@6023 151
goetz@6023 152 // push ENTRY_FRAME
goetz@6023 153 __ push_frame(r_frame_size, r_temp);
goetz@6023 154
goetz@6023 155 // initialize call_stub locals (step 1)
goetz@6023 156 __ std(r_arg_call_wrapper_addr,
goetz@6023 157 _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
goetz@6023 158 __ std(r_arg_result_addr,
goetz@6023 159 _entry_frame_locals_neg(result_address), r_entryframe_fp);
goetz@6023 160 __ std(r_arg_result_type,
goetz@6023 161 _entry_frame_locals_neg(result_type), r_entryframe_fp);
goetz@6023 162 // we will save arguments_tos_address later
goetz@6023 163
goetz@6023 164
goetz@6023 165 BLOCK_COMMENT("Copy Java arguments");
goetz@6023 166 // copy Java arguments
goetz@6023 167
goetz@6023 168 // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
goetz@6023 169 // FIXME: why not simply use SP+frame::top_ijava_frame_size?
goetz@6023 170 __ addi(r_top_of_arguments_addr,
goetz@6023 171 R1_SP, frame::top_ijava_frame_abi_size);
goetz@6023 172 __ add(r_top_of_arguments_addr,
goetz@6060 173 r_top_of_arguments_addr, r_frame_alignment_in_bytes);
goetz@6023 174
goetz@6023 175 // any arguments to copy?
goetz@6023 176 __ cmpdi(CCR0, r_arg_argument_count, 0);
goetz@6023 177 __ beq(CCR0, arguments_copied);
goetz@6023 178
goetz@6023 179 // prepare loop and copy arguments in reverse order
goetz@6023 180 {
goetz@6023 181 // init CTR with arg_argument_count
goetz@6023 182 __ mtctr(r_arg_argument_count);
goetz@6023 183
goetz@6023 184 // let r_argumentcopy_addr point to last outgoing Java argument
goetz@6023 185 __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
goetz@6023 186
goetz@6023 187 // let r_argument_addr point to last incoming java argument
goetz@6023 188 __ add(r_argument_addr,
goetz@6023 189 r_arg_argument_addr, r_argument_size_in_bytes);
goetz@6023 190 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
goetz@6023 191
goetz@6023 192 // now loop while CTR > 0 and copy arguments
goetz@6023 193 {
goetz@6023 194 Label next_argument;
goetz@6023 195 __ bind(next_argument);
goetz@6023 196
goetz@6023 197 __ ld(r_temp, 0, r_argument_addr);
goetz@6023 198 // argument_addr--;
goetz@6023 199 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
goetz@6023 200 __ std(r_temp, 0, r_argumentcopy_addr);
goetz@6023 201 // argumentcopy_addr++;
goetz@6023 202 __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
goetz@6023 203
goetz@6023 204 __ bdnz(next_argument);
goetz@6023 205 }
goetz@6023 206 }
goetz@6023 207
goetz@6023 208 // Arguments copied, continue.
goetz@6023 209 __ bind(arguments_copied);
goetz@6023 210 }
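// Editor's sketch (illustration only, not part of the annotated source): the loop
// above copies the incoming Java arguments into the outgoing area in reverse order,
// roughly equivalent to this C code (names are hypothetical):
//
//   const intptr_t* src = args + count - 1;   // last incoming argument (R8 + size - 8)
//   intptr_t*       dst = top_of_arguments;   // first outgoing slot
//   for (int i = 0; i < count; i++) {
//     *dst++ = *src--;                        // one stack element per iteration (CTR loop)
//   }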
goetz@6023 211
goetz@6023 212 {
goetz@6023 213 BLOCK_COMMENT("Call frame manager or native entry.");
goetz@6023 214 // Call frame manager or native entry.
goetz@6706 215 Register r_new_arg_entry = R14;
goetz@6023 216 assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
goetz@6023 217 r_arg_method, r_arg_thread);
goetz@6023 218
goetz@6023 219 __ mr(r_new_arg_entry, r_arg_entry);
goetz@6023 220
goetz@6023 221 // Register state on entry to frame manager / native entry:
goetz@6023 222 //
goetz@6060 223 // tos - intptr_t* sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
goetz@6023 224 // R19_method - Method
goetz@6023 225 // R16_thread - JavaThread*
goetz@6023 226
goetz@6060 227 // Tos must point to last argument - element_size.
goetz@6077 228 #ifdef CC_INTERP
goetz@6060 229 const Register tos = R17_tos;
goetz@6077 230 #else
goetz@6077 231 const Register tos = R15_esp;
goetz@6077 232 #endif
goetz@6060 233 __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
goetz@6023 234
goetz@6023 235 // initialize call_stub locals (step 2)
goetz@6060 236 // now save tos as arguments_tos_address
goetz@6060 237 __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
goetz@6023 238
goetz@6023 239 // load argument registers for call
goetz@6023 240 __ mr(R19_method, r_arg_method);
goetz@6023 241 __ mr(R16_thread, r_arg_thread);
goetz@6060 242 assert(tos != r_arg_method, "trashed r_arg_method");
goetz@6060 243 assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
goetz@6023 244
goetz@6023 245 // Set R15_prev_state to 0 for simplifying checks in callee.
goetz@6077 246 #ifdef CC_INTERP
goetz@6023 247 __ li(R15_prev_state, 0);
goetz@6077 248 #else
goetz@6077 249 __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1);
goetz@6077 250 #endif
goetz@6023 251 // Stack on entry to frame manager / native entry:
goetz@6023 252 //
goetz@6023 253 // F0 [TOP_IJAVA_FRAME_ABI]
goetz@6023 254 // alignment (optional)
goetz@6023 255 // [outgoing Java arguments]
goetz@6023 256 // [ENTRY_FRAME_LOCALS]
goetz@6023 257 // F1 [C_FRAME]
goetz@6023 258 // ...
goetz@6023 259 //
goetz@6023 260
goetz@6023 261 // global toc register
goetz@6023 262 __ load_const(R29, MacroAssembler::global_toc(), R11_scratch1);
goetz@6023 263
goetz@6023 264 // Load narrow oop base.
goetz@6023 265 __ reinit_heapbase(R30, R11_scratch1);
goetz@6023 266
goetz@6023 267 // Remember the senderSP so the interpreter can pop c2i arguments off of the stack
goetz@6023 268 // when called via a c2i.
goetz@6023 269
goetz@6023 270 // Pass initial_caller_sp to framemanager.
goetz@6023 271 __ mr(R21_tmp1, R1_SP);
goetz@6023 272
goetz@6023 273 // Do a light-weight C-call here, r_new_arg_entry holds the address
goetz@6023 274 // of the interpreter entry point (frame manager or native entry)
goetz@6023 275 // and save runtime-value of LR in return_address.
goetz@6060 276 assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
goetz@6023 277 "trashed r_new_arg_entry");
goetz@6023 278 return_address = __ call_stub(r_new_arg_entry);
goetz@6023 279 }
goetz@6023 280
goetz@6023 281 {
goetz@6023 282 BLOCK_COMMENT("Returned from frame manager or native entry.");
goetz@6023 283 // Returned from frame manager or native entry.
goetz@6023 284 // Now pop frame, process result, and return to caller.
goetz@6023 285
goetz@6023 286 // Stack on exit from frame manager / native entry:
goetz@6023 287 //
goetz@6023 288 // F0 [ABI]
goetz@6023 289 // ...
goetz@6023 290 // [ENTRY_FRAME_LOCALS]
goetz@6023 291 // F1 [C_FRAME]
goetz@6023 292 // ...
goetz@6023 293 //
goetz@6023 294 // Just pop the topmost frame ...
goetz@6023 295 //
goetz@6023 296
goetz@6023 297 Label ret_is_object;
goetz@6023 298 Label ret_is_long;
goetz@6023 299 Label ret_is_float;
goetz@6023 300 Label ret_is_double;
goetz@6023 301
goetz@6023 302 Register r_entryframe_fp = R30;
goetz@6023 303 Register r_lr = R7_ARG5;
goetz@6023 304 Register r_cr = R8_ARG6;
goetz@6023 305
goetz@6023 306 // Reload some volatile registers which we've spilled before the call
goetz@6023 307 // to frame manager / native entry.
goetz@6023 308 // Access all locals via frame pointer, because we know nothing about
goetz@6023 309 // the topmost frame's size.
goetz@6023 310 __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP);
goetz@6023 311 assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
goetz@6023 312 __ ld(r_arg_result_addr,
goetz@6023 313 _entry_frame_locals_neg(result_address), r_entryframe_fp);
goetz@6023 314 __ ld(r_arg_result_type,
goetz@6023 315 _entry_frame_locals_neg(result_type), r_entryframe_fp);
goetz@6023 316 __ ld(r_cr, _abi(cr), r_entryframe_fp);
goetz@6023 317 __ ld(r_lr, _abi(lr), r_entryframe_fp);
goetz@6023 318
goetz@6023 319 // pop frame and restore non-volatiles, LR and CR
goetz@6023 320 __ mr(R1_SP, r_entryframe_fp);
goetz@6023 321 __ mtcr(r_cr);
goetz@6023 322 __ mtlr(r_lr);
goetz@6023 323
goetz@6023 324 // Store result depending on type. Everything that is not
goetz@6023 325 // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
goetz@6023 326 __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
goetz@6023 327 __ cmpwi(CCR1, r_arg_result_type, T_LONG);
goetz@6060 328 __ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
goetz@6060 329 __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
goetz@6023 330
goetz@6023 331 // restore non-volatile registers
goetz@6023 332 __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
goetz@6023 333
goetz@6023 334
goetz@6023 335 // Stack on exit from call_stub:
goetz@6023 336 //
goetz@6023 337 // 0 [C_FRAME]
goetz@6023 338 // ...
goetz@6023 339 //
goetz@6023 340 // no call_stub frames left.
goetz@6023 341
goetz@6023 342 // All non-volatiles have been restored at this point!!
goetz@6023 343 assert(R3_RET == R3, "R3_RET should be R3");
goetz@6023 344
goetz@6023 345 __ beq(CCR0, ret_is_object);
goetz@6023 346 __ beq(CCR1, ret_is_long);
goetz@6060 347 __ beq(CCR5, ret_is_float);
goetz@6060 348 __ beq(CCR6, ret_is_double);
goetz@6023 349
goetz@6023 350 // default:
goetz@6023 351 __ stw(R3_RET, 0, r_arg_result_addr);
goetz@6023 352 __ blr(); // return to caller
goetz@6023 353
goetz@6023 354 // case T_OBJECT:
goetz@6023 355 __ bind(ret_is_object);
goetz@6023 356 __ std(R3_RET, 0, r_arg_result_addr);
goetz@6023 357 __ blr(); // return to caller
goetz@6023 358
goetz@6023 359 // case T_LONG:
goetz@6023 360 __ bind(ret_is_long);
goetz@6023 361 __ std(R3_RET, 0, r_arg_result_addr);
goetz@6023 362 __ blr(); // return to caller
goetz@6023 363
goetz@6023 364 // case T_FLOAT:
goetz@6023 365 __ bind(ret_is_float);
goetz@6023 366 __ stfs(F1_RET, 0, r_arg_result_addr);
goetz@6023 367 __ blr(); // return to caller
goetz@6023 368
goetz@6023 369 // case T_DOUBLE:
goetz@6023 370 __ bind(ret_is_double);
goetz@6023 371 __ stfd(F1_RET, 0, r_arg_result_addr);
goetz@6023 372 __ blr(); // return to caller
goetz@6023 373 }
goetz@6023 374
goetz@6023 375 return start;
goetz@6023 376 }
goetz@6023 377
goetz@6023 378 // Return point for a Java call if there's an exception thrown in
goetz@6023 379 // Java code. The exception is caught and transformed into a
goetz@6023 380 // pending exception stored in JavaThread that can be tested from
goetz@6023 381 // within the VM.
goetz@6023 382 //
goetz@6023 383 address generate_catch_exception() {
goetz@6023 384 StubCodeMark mark(this, "StubRoutines", "catch_exception");
goetz@6023 385
goetz@6023 386 address start = __ pc();
goetz@6023 387
goetz@6023 388 // Registers alive
goetz@6023 389 //
goetz@6023 390 // R16_thread
goetz@6023 391 // R3_ARG1 - address of pending exception
goetz@6023 392 // R4_ARG2 - return address in call stub
goetz@6023 393
goetz@6023 394 const Register exception_file = R21_tmp1;
goetz@6023 395 const Register exception_line = R22_tmp2;
goetz@6023 396
goetz@6023 397 __ load_const(exception_file, (void*)__FILE__);
goetz@6023 398 __ load_const(exception_line, (void*)__LINE__);
goetz@6023 399
goetz@6023 400 __ std(R3_ARG1, thread_(pending_exception));
goetz@6023 401 // store into `char *'
goetz@6023 402 __ std(exception_file, thread_(exception_file));
goetz@6023 403 // store into `int'
goetz@6023 404 __ stw(exception_line, thread_(exception_line));
goetz@6023 405
goetz@6023 406 // complete return to VM
goetz@6023 407 assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
goetz@6023 408
goetz@6023 409 __ mtlr(R4_ARG2);
goetz@6023 410 // continue in call stub
goetz@6023 411 __ blr();
goetz@6023 412
goetz@6023 413 return start;
goetz@6023 414 }
goetz@6023 415
goetz@6023 416 // Continuation point for runtime calls returning with a pending
goetz@6023 417 // exception. The pending exception check happened in the runtime
goetz@6023 418 // or native call stub. The pending exception in Thread is
goetz@6023 419 // converted into a Java-level exception.
goetz@6023 420 //
goetz@6023 421 address generate_forward_exception() {
goetz@6023 422 StubCodeMark mark(this, "StubRoutines", "forward_exception");
goetz@6023 423 address start = __ pc();
goetz@6023 424
goetz@6023 425 #if !defined(PRODUCT)
goetz@6023 426 if (VerifyOops) {
goetz@6023 427 // Get pending exception oop.
goetz@6023 428 __ ld(R3_ARG1,
goetz@6023 429 in_bytes(Thread::pending_exception_offset()),
goetz@6023 430 R16_thread);
goetz@6023 431 // Make sure that this code is only executed if there is a pending exception.
goetz@6023 432 {
goetz@6023 433 Label L;
goetz@6023 434 __ cmpdi(CCR0, R3_ARG1, 0);
goetz@6023 435 __ bne(CCR0, L);
goetz@6023 436 __ stop("StubRoutines::forward exception: no pending exception (1)");
goetz@6023 437 __ bind(L);
goetz@6023 438 }
goetz@6023 439 __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
goetz@6023 440 }
goetz@6023 441 #endif
goetz@6023 442
goetz@6023 443 // Save LR/CR and copy exception pc (LR) into R4_ARG2.
goetz@6023 444 __ save_LR_CR(R4_ARG2);
goetz@6076 445 __ push_frame_reg_args(0, R0);
goetz@6023 446 // Find exception handler.
goetz@6023 447 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
goetz@6023 448 SharedRuntime::exception_handler_for_return_address),
goetz@6023 449 R16_thread,
goetz@6023 450 R4_ARG2);
goetz@6023 451 // Copy handler's address.
goetz@6023 452 __ mtctr(R3_RET);
goetz@6023 453 __ pop_frame();
goetz@6023 454 __ restore_LR_CR(R0);
goetz@6023 455
goetz@6023 456 // Set up the arguments for the exception handler:
goetz@6023 457 // - R3_ARG1: exception oop
goetz@6023 458 // - R4_ARG2: exception pc.
goetz@6023 459
goetz@6023 460 // Load pending exception oop.
goetz@6023 461 __ ld(R3_ARG1,
goetz@6023 462 in_bytes(Thread::pending_exception_offset()),
goetz@6023 463 R16_thread);
goetz@6023 464
goetz@6023 465 // The exception pc is the return address in the caller.
goetz@6023 466 // Must load it into R4_ARG2.
goetz@6023 467 __ mflr(R4_ARG2);
goetz@6023 468
goetz@6023 469 #ifdef ASSERT
goetz@6023 470 // Make sure exception is set.
goetz@6023 471 {
goetz@6023 472 Label L;
goetz@6023 473 __ cmpdi(CCR0, R3_ARG1, 0);
goetz@6023 474 __ bne(CCR0, L);
goetz@6023 475 __ stop("StubRoutines::forward exception: no pending exception (2)");
goetz@6023 476 __ bind(L);
goetz@6023 477 }
goetz@6023 478 #endif
goetz@6023 479
goetz@6023 480 // Clear the pending exception.
goetz@6023 481 __ li(R0, 0);
goetz@6023 482 __ std(R0,
goetz@6023 483 in_bytes(Thread::pending_exception_offset()),
goetz@6023 484 R16_thread);
goetz@6023 485 // Jump to exception handler.
goetz@6023 486 __ bctr();
goetz@6023 487
goetz@6023 488 return start;
goetz@6023 489 }
goetz@6023 490
goetz@6023 491 #undef __
goetz@6023 492 #define __ masm->
goetz@6023 493 // Continuation point for throwing of implicit exceptions that are
goetz@6023 494 // not handled in the current activation. Fabricates an exception
goetz@6023 495 // oop and initiates normal exception dispatching in this
goetz@6023 496 // frame. Only callee-saved registers are preserved (through the
goetz@6023 497 // normal register window / RegisterMap handling). If the compiler
goetz@6023 498 // needs all registers to be preserved between the fault point and
goetz@6023 499 // the exception handler then it must assume responsibility for that
goetz@6023 500 // in AbstractCompiler::continuation_for_implicit_null_exception or
goetz@6023 501 // continuation_for_implicit_division_by_zero_exception. All other
goetz@6023 502 // implicit exceptions (e.g., NullPointerException or
goetz@6023 503 // AbstractMethodError on entry) are either at call sites or
goetz@6023 504 // otherwise assume that stack unwinding will be initiated, so
goetz@6023 505 // caller saved registers were assumed volatile in the compiler.
goetz@6023 506 //
goetz@6023 507 // Note that we generate only this stub into a RuntimeStub, because
goetz@6023 508 // it needs to be properly traversed and ignored during GC, so we
goetz@6023 509 // change the meaning of the "__" macro within this method.
goetz@6023 510 //
goetz@6023 511 // Note: the routine set_pc_not_at_call_for_caller in
goetz@6023 512 // SharedRuntime.cpp requires that this code be generated into a
goetz@6023 513 // RuntimeStub.
goetz@6023 514 address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
goetz@6023 515 Register arg1 = noreg, Register arg2 = noreg) {
goetz@6023 516 CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
goetz@6023 517 MacroAssembler* masm = new MacroAssembler(&code);
goetz@6023 518
goetz@6023 519 OopMapSet* oop_maps = new OopMapSet();
goetz@6076 520 int frame_size_in_bytes = frame::abi_reg_args_size;
goetz@6023 521 OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
goetz@6023 522
goetz@6023 523 StubCodeMark mark(this, "StubRoutines", "throw_exception");
goetz@6023 524
goetz@6023 525 address start = __ pc();
goetz@6023 526
goetz@6023 527 __ save_LR_CR(R11_scratch1);
goetz@6023 528
goetz@6023 529 // Push a frame.
goetz@6076 530 __ push_frame_reg_args(0, R11_scratch1);
goetz@6023 531
goetz@6023 532 address frame_complete_pc = __ pc();
goetz@6023 533
goetz@6023 534 if (restore_saved_exception_pc) {
goetz@6023 535 __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc", 74);
goetz@6023 536 }
goetz@6023 537
goetz@6023 538 // Note that we always have a runtime stub frame on the top of
goetz@6023 539 // stack by this point. Remember the offset of the instruction
goetz@6023 540 // whose address will be moved to R11_scratch1.
goetz@6023 541 address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
goetz@6023 542
goetz@6023 543 __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
goetz@6023 544
goetz@6023 545 __ mr(R3_ARG1, R16_thread);
goetz@6023 546 if (arg1 != noreg) {
goetz@6023 547 __ mr(R4_ARG2, arg1);
goetz@6023 548 }
goetz@6023 549 if (arg2 != noreg) {
goetz@6023 550 __ mr(R5_ARG3, arg2);
goetz@6023 551 }
goetz@6076 552 #if defined(ABI_ELFv2)
goetz@6076 553 __ call_c(runtime_entry, relocInfo::none);
goetz@6076 554 #else
goetz@6076 555 __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry), relocInfo::none);
goetz@6076 556 #endif
goetz@6023 557
goetz@6023 558 // Set an oopmap for the call site.
goetz@6023 559 oop_maps->add_gc_map((int)(gc_map_pc - start), map);
goetz@6023 560
goetz@6023 561 __ reset_last_Java_frame();
goetz@6023 562
goetz@6023 563 #ifdef ASSERT
goetz@6023 564 // Make sure that this code is only executed if there is a pending
goetz@6023 565 // exception.
goetz@6023 566 {
goetz@6023 567 Label L;
goetz@6023 568 __ ld(R0,
goetz@6023 569 in_bytes(Thread::pending_exception_offset()),
goetz@6023 570 R16_thread);
goetz@6023 571 __ cmpdi(CCR0, R0, 0);
goetz@6023 572 __ bne(CCR0, L);
goetz@6023 573 __ stop("StubRoutines::throw_exception: no pending exception");
goetz@6023 574 __ bind(L);
goetz@6023 575 }
goetz@6023 576 #endif
goetz@6023 577
goetz@6023 578 // Pop frame.
goetz@6023 579 __ pop_frame();
goetz@6023 580
goetz@6023 581 __ restore_LR_CR(R11_scratch1);
goetz@6023 582
goetz@6023 583 __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
goetz@6023 584 __ mtctr(R11_scratch1);
goetz@6023 585 __ bctr();
goetz@6023 586
goetz@6023 587 // Create runtime stub with OopMap.
goetz@6023 588 RuntimeStub* stub =
goetz@6023 589 RuntimeStub::new_runtime_stub(name, &code,
goetz@6023 590 /*frame_complete=*/ (int)(frame_complete_pc - start),
goetz@6023 591 frame_size_in_bytes/wordSize,
goetz@6023 592 oop_maps,
goetz@6023 593 false);
goetz@6023 594 return stub->entry_point();
goetz@6023 595 }
goetz@6023 596 #undef __
goetz@6023 597 #define __ _masm->
goetz@6023 598
goetz@6023 599 // Generate G1 pre-write barrier for array.
goetz@6023 600 //
goetz@6023 601 // Input:
goetz@6023 602 // from - register containing src address (only needed for spilling)
goetz@6023 603 // to - register containing starting address
goetz@6023 604 // count - register containing element count
goetz@6023 605 // tmp - scratch register
goetz@6023 606 //
goetz@6023 607 // Kills:
goetz@6023 608 // nothing
goetz@6023 609 //
goetz@6023 610 void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1) {
goetz@6023 611 BarrierSet* const bs = Universe::heap()->barrier_set();
goetz@6023 612 switch (bs->kind()) {
goetz@6023 613 case BarrierSet::G1SATBCT:
goetz@6023 614 case BarrierSet::G1SATBCTLogging:
goetz@6023 615 // With G1, don't generate the call if we statically know that the target is uninitialized
goetz@6023 616 if (!dest_uninitialized) {
goetz@6023 617 const int spill_slots = 4 * wordSize;
goetz@6076 618 const int frame_size = frame::abi_reg_args_size + spill_slots;
goetz@6060 619 Label filtered;
goetz@6060 620
goetz@6060 621 // Is marking active?
goetz@6060 622 if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
goetz@6060 623 __ lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
goetz@6060 624 } else {
goetz@6060 625 guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
goetz@6060 626 __ lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
goetz@6060 627 }
goetz@6060 628 __ cmpdi(CCR0, Rtmp1, 0);
goetz@6060 629 __ beq(CCR0, filtered);
goetz@6023 630
goetz@6023 631 __ save_LR_CR(R0);
goetz@6076 632 __ push_frame_reg_args(spill_slots, R0);
goetz@6023 633 __ std(from, frame_size - 1 * wordSize, R1_SP);
goetz@6023 634 __ std(to, frame_size - 2 * wordSize, R1_SP);
goetz@6023 635 __ std(count, frame_size - 3 * wordSize, R1_SP);
goetz@6023 636
goetz@6023 637 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
goetz@6023 638
goetz@6023 639 __ ld(from, frame_size - 1 * wordSize, R1_SP);
goetz@6023 640 __ ld(to, frame_size - 2 * wordSize, R1_SP);
goetz@6023 641 __ ld(count, frame_size - 3 * wordSize, R1_SP);
goetz@6023 642 __ pop_frame();
goetz@6023 643 __ restore_LR_CR(R0);
goetz@6060 644
goetz@6060 645 __ bind(filtered);
goetz@6023 646 }
goetz@6023 647 break;
goetz@6023 648 case BarrierSet::CardTableModRef:
goetz@6023 649 case BarrierSet::CardTableExtension:
goetz@6023 650 case BarrierSet::ModRef:
goetz@6023 651 break;
goetz@6023 652 default:
goetz@6023 653 ShouldNotReachHere();
goetz@6023 654 }
goetz@6023 655 }
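// Editor's sketch (illustration only, not part of the annotated source): for G1 the
// code emitted above behaves like the following C logic, assuming a hypothetical
// query for the thread's SATB "marking active" flag:
//
//   if (!dest_uninitialized && thread_satb_marking_active()) {
//     // spill from/to/count around the call, then:
//     BarrierSet::static_write_ref_array_pre(to, count);   // record the old values
//   }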
goetz@6023 656
goetz@6023 657 // Generate CMS/G1 post-write barrier for array.
goetz@6023 658 //
goetz@6023 659 // Input:
goetz@6023 660 // addr - register containing starting address
goetz@6023 661 // count - register containing element count
goetz@6023 662 // tmp - scratch register
goetz@6023 663 //
goetz@6023 664 // The input registers and R0 are overwritten.
goetz@6023 665 //
goetz@6060 666 void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, bool branchToEnd) {
goetz@6023 667 BarrierSet* const bs = Universe::heap()->barrier_set();
goetz@6023 668
goetz@6023 669 switch (bs->kind()) {
goetz@6023 670 case BarrierSet::G1SATBCT:
goetz@6023 671 case BarrierSet::G1SATBCTLogging:
goetz@6023 672 {
goetz@6060 673 if (branchToEnd) {
goetz@6060 674 __ save_LR_CR(R0);
goetz@6060 675 // We need this frame only to spill LR.
goetz@6076 676 __ push_frame_reg_args(0, R0);
goetz@6060 677 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
goetz@6060 678 __ pop_frame();
goetz@6060 679 __ restore_LR_CR(R0);
goetz@6060 680 } else {
goetz@6060 681 // Tail call: fake call from stub caller by branching without linking.
goetz@6060 682 address entry_point = (address)CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post);
goetz@6060 683 __ mr_if_needed(R3_ARG1, addr);
goetz@6060 684 __ mr_if_needed(R4_ARG2, count);
goetz@6060 685 __ load_const(R11, entry_point, R0);
goetz@6060 686 __ call_c_and_return_to_caller(R11);
goetz@6060 687 }
goetz@6023 688 }
goetz@6023 689 break;
goetz@6023 690 case BarrierSet::CardTableModRef:
goetz@6023 691 case BarrierSet::CardTableExtension:
goetz@6023 692 {
goetz@6023 693 Label Lskip_loop, Lstore_loop;
goetz@6023 694 if (UseConcMarkSweepGC) {
goetz@6023 695 // TODO PPC port: contribute optimization / requires shared changes
goetz@6023 696 __ release();
goetz@6023 697 }
goetz@6023 698
goetz@6023 699 CardTableModRefBS* const ct = (CardTableModRefBS*)bs;
goetz@6023 700 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
goetz@6023 701 assert_different_registers(addr, count, tmp);
goetz@6023 702
goetz@6023 703 __ sldi(count, count, LogBytesPerHeapOop);
goetz@6023 704 __ addi(count, count, -BytesPerHeapOop);
goetz@6023 705 __ add(count, addr, count);
goetz@6023 706 // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
goetz@6023 707 __ srdi(addr, addr, CardTableModRefBS::card_shift);
goetz@6023 708 __ srdi(count, count, CardTableModRefBS::card_shift);
goetz@6023 709 __ subf(count, addr, count);
goetz@6023 710 assert_different_registers(R0, addr, count, tmp);
goetz@6023 711 __ load_const(tmp, (address)ct->byte_map_base);
goetz@6023 712 __ addic_(count, count, 1);
goetz@6023 713 __ beq(CCR0, Lskip_loop);
goetz@6023 714 __ li(R0, 0);
goetz@6023 715 __ mtctr(count);
goetz@6023 716 // Byte store loop
goetz@6023 717 __ bind(Lstore_loop);
goetz@6023 718 __ stbx(R0, tmp, addr);
goetz@6023 719 __ addi(addr, addr, 1);
goetz@6023 720 __ bdnz(Lstore_loop);
goetz@6023 721 __ bind(Lskip_loop);
goetz@6060 722
goetz@6060 723 if (!branchToEnd) __ blr();
goetz@6023 724 }
goetz@6023 725 break;
goetz@6023 726 case BarrierSet::ModRef:
goetz@6060 727 if (!branchToEnd) __ blr();
goetz@6023 728 break;
goetz@6023 729 default:
goetz@6023 730 ShouldNotReachHere();
goetz@6023 731 }
goetz@6023 732 }
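  // Editor's sketch (illustration only, not part of the annotated source): the
  // CardTableModRef branch above dirties every card spanned by the written oop
  // range. A plain C equivalent, with the (biased) table base and the shifts
  // passed in explicitly rather than taken from the barrier set:
  static void sketch_dirty_cards(volatile jbyte* byte_map_base, uintptr_t addr,
                                 size_t count, int log_bytes_per_heap_oop, int card_shift) {
    if (count == 0) return;
    uintptr_t last = addr + ((count - 1) << log_bytes_per_heap_oop); // address of last element
    for (uintptr_t card = addr >> card_shift; card <= (last >> card_shift); ++card) {
      byte_map_base[card] = 0;                                       // 0 means "dirty"
    }
  }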
goetz@6023 733
goetz@6023 734 // Support for void zero_words_aligned8(HeapWord* to, size_t count)
goetz@6023 735 //
goetz@6023 736 // Arguments:
goetz@6023 737 // to:
goetz@6023 738 // count:
goetz@6023 739 //
goetz@6023 740 // Destroys:
goetz@6023 741 //
goetz@6023 742 address generate_zero_words_aligned8() {
goetz@6023 743 StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
goetz@6023 744
goetz@6023 745 // Implemented as in ClearArray.
goetz@6076 746 address start = __ function_entry();
goetz@6023 747
goetz@6023 748 Register base_ptr_reg = R3_ARG1; // tohw (needs to be 8b aligned)
goetz@6023 749 Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
goetz@6023 750 Register tmp1_reg = R5_ARG3;
goetz@6023 751 Register tmp2_reg = R6_ARG4;
goetz@6023 752 Register zero_reg = R7_ARG5;
goetz@6023 753
goetz@6023 754 // Procedure for large arrays (uses data cache block zero instruction).
goetz@6023 755 Label dwloop, fast, fastloop, restloop, lastdword, done;
goetz@6023 756 int cl_size=VM_Version::get_cache_line_size(), cl_dwords=cl_size>>3, cl_dwordaddr_bits=exact_log2(cl_dwords);
goetz@6023 757 int min_dcbz=2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
goetz@6023 758
goetz@6023 759 // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
goetz@6023 760 __ dcbtst(base_ptr_reg); // Indicate write access to first cache line ...
goetz@6023 761 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if number of dwords is even.
goetz@6023 762 __ srdi_(tmp1_reg, cnt_dwords_reg, 1); // number of double dwords
goetz@6023 763 __ load_const_optimized(zero_reg, 0L); // Use as zero register.
goetz@6023 764
goetz@6023 765 __ cmpdi(CCR1, tmp2_reg, 0); // cnt_dwords even?
goetz@6023 766 __ beq(CCR0, lastdword); // size <= 1
goetz@6023 767 __ mtctr(tmp1_reg); // Speculatively preload counter for rest loop (>0).
goetz@6023 768 __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
goetz@6023 769 __ neg(tmp1_reg, base_ptr_reg); // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
goetz@6023 770
goetz@6023 771 __ blt(CCR0, restloop); // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
goetz@6023 772 __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
goetz@6023 773
goetz@6023 774 __ beq(CCR0, fast); // already 128byte aligned
goetz@6023 775 __ mtctr(tmp1_reg); // Set ctr to hit 128byte boundary (0<ctr<cnt).
goetz@6023 776 __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
goetz@6023 777
goetz@6023 778 // Clear in first cache line dword-by-dword if not already 128byte aligned.
goetz@6023 779 __ bind(dwloop);
goetz@6023 780 __ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block.
goetz@6023 781 __ addi(base_ptr_reg, base_ptr_reg, 8);
goetz@6023 782 __ bdnz(dwloop);
goetz@6023 783
goetz@6023 784 // clear 128byte blocks
goetz@6023 785 __ bind(fast);
goetz@6023 786 __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
goetz@6023 787 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if rest even
goetz@6023 788
goetz@6023 789 __ mtctr(tmp1_reg); // load counter
goetz@6023 790 __ cmpdi(CCR1, tmp2_reg, 0); // rest even?
goetz@6023 791 __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
goetz@6023 792
goetz@6023 793 __ bind(fastloop);
goetz@6023 794 __ dcbz(base_ptr_reg); // Clear 128byte aligned block.
goetz@6023 795 __ addi(base_ptr_reg, base_ptr_reg, cl_size);
goetz@6023 796 __ bdnz(fastloop);
goetz@6023 797
goetz@6023 798 //__ dcbtst(base_ptr_reg); // Indicate write access to last cache line.
goetz@6023 799 __ beq(CCR0, lastdword); // rest<=1
goetz@6023 800 __ mtctr(tmp1_reg); // load counter
goetz@6023 801
goetz@6023 802 // Clear rest.
goetz@6023 803 __ bind(restloop);
goetz@6023 804 __ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block.
goetz@6023 805 __ std(zero_reg, 8, base_ptr_reg); // Clear 8byte aligned block.
goetz@6023 806 __ addi(base_ptr_reg, base_ptr_reg, 16);
goetz@6023 807 __ bdnz(restloop);
goetz@6023 808
goetz@6023 809 __ bind(lastdword);
goetz@6023 810 __ beq(CCR1, done);
goetz@6023 811 __ std(zero_reg, 0, base_ptr_reg);
goetz@6023 812 __ bind(done);
goetz@6023 813 __ blr(); // return
goetz@6023 814
goetz@6023 815 return start;
goetz@6023 816 }
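  // Editor's sketch (illustration only, not part of the annotated source): a
  // simplified C equivalent of the stub above; the middle loop is what the stub
  // performs with a single dcbz per cache line (cl_dwords dwords at a time).
  static void sketch_zero_words_aligned8(julong* to, size_t cnt_dwords, size_t cl_dwords) {
    // 1. Clear dword by dword up to the next cache line boundary.
    while (cnt_dwords > 0 && (((uintptr_t)to) & (cl_dwords * 8 - 1)) != 0) {
      *to++ = 0; --cnt_dwords;
    }
    // 2. Clear whole cache lines (dcbz in the generated code).
    while (cnt_dwords >= cl_dwords) {
      for (size_t i = 0; i < cl_dwords; i++) to[i] = 0;
      to += cl_dwords; cnt_dwords -= cl_dwords;
    }
    // 3. Clear the remaining dwords.
    while (cnt_dwords-- > 0) *to++ = 0;
  }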
goetz@6023 817
goetz@6023 818 // The following routine generates a subroutine to throw an asynchronous
goetz@6023 819 // UnknownError when an unsafe access gets a fault that could not be
goetz@6023 820 // reasonably prevented by the programmer. (Example: SIGBUS/OBJERR.)
goetz@6023 821 //
goetz@6023 822 address generate_handler_for_unsafe_access() {
goetz@6023 823 StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
goetz@6076 824 address start = __ function_entry();
goetz@6023 825 __ unimplemented("StubRoutines::handler_for_unsafe_access", 93);
goetz@6023 826 return start;
goetz@6023 827 }
goetz@6023 828
goetz@6023 829 #if !defined(PRODUCT)
goetz@6023 830 // Wrapper which calls oopDesc::is_oop_or_null()
goetz@6023 831 // Only called by MacroAssembler::verify_oop
goetz@6023 832 static void verify_oop_helper(const char* message, oop o) {
goetz@6023 833 if (!o->is_oop_or_null()) {
goetz@6023 834 fatal(message);
goetz@6023 835 }
goetz@6023 836 ++ StubRoutines::_verify_oop_count;
goetz@6023 837 }
goetz@6023 838 #endif
goetz@6023 839
goetz@6023 840 // Return address of code to be called from code generated by
goetz@6023 841 // MacroAssembler::verify_oop.
goetz@6023 842 //
goetz@6023 843 // Don't generate, rather use C++ code.
goetz@6023 844 address generate_verify_oop() {
goetz@6023 845 StubCodeMark mark(this, "StubRoutines", "verify_oop");
goetz@6023 846
goetz@6023 847 // this is actually a `FunctionDescriptor*'.
goetz@6023 848 address start = 0;
goetz@6023 849
goetz@6023 850 #if !defined(PRODUCT)
goetz@6023 851 start = CAST_FROM_FN_PTR(address, verify_oop_helper);
goetz@6023 852 #endif
goetz@6023 853
goetz@6023 854 return start;
goetz@6023 855 }
goetz@6023 856
goetz@6023 857 // Fairer handling of safepoints for native methods.
goetz@6023 858 //
goetz@6023 859 // Generate code which reads from the polling page. This special handling is needed as the
goetz@6023 860 // linux-ppc64 kernel before 2.6.6 doesn't set si_addr on some segfaults in 64bit mode
goetz@6023 861 // (cf. http://www.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.6), especially when we try
goetz@6023 862 // to read from the safepoint polling page.
goetz@6023 863 address generate_load_from_poll() {
goetz@6023 864 StubCodeMark mark(this, "StubRoutines", "generate_load_from_poll");
goetz@6076 865 address start = __ function_entry();
goetz@6023 866 __ unimplemented("StubRoutines::verify_oop", 95); // TODO PPC port
goetz@6023 867 return start;
goetz@6023 868 }
goetz@6023 869
goetz@6023 870 // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
goetz@6023 871 //
goetz@6023 872 // The code is implemented (ported from sparc) as we believe it benefits JVM98; however,
goetz@6023 873 // tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
goetz@6023 874 //
goetz@6060 875 // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
goetz@6023 876 // for turning on loop predication optimization, and hence the behavior of "array range check"
goetz@6023 877 // and "loop invariant check" could be influenced, which potentially boosted JVM98.
goetz@6023 878 //
goetz@6060 879 // Generate stub for disjoint short fill. If "aligned" is true, the
goetz@6060 880 // "to" address is assumed to be heapword aligned.
goetz@6023 881 //
goetz@6023 882 // Arguments for generated stub:
goetz@6060 883 // to: R3_ARG1
goetz@6060 884 // value: R4_ARG2
goetz@6060 885 // count: R5_ARG3 treated as signed
goetz@6023 886 //
goetz@6023 887 address generate_fill(BasicType t, bool aligned, const char* name) {
goetz@6023 888 StubCodeMark mark(this, "StubRoutines", name);
goetz@6076 889 address start = __ function_entry();
goetz@6023 890
goetz@6060 891 const Register to = R3_ARG1; // destination array address
goetz@6060 892 const Register value = R4_ARG2; // fill value
goetz@6060 893 const Register count = R5_ARG3; // elements count
goetz@6060 894 const Register temp = R6_ARG4; // temp register
goetz@6023 895
goetz@6060 896 //assert_clean_int(count, O3); // Make sure 'count' is clean int.
goetz@6023 897
goetz@6023 898 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
goetz@6023 899 Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
goetz@6023 900
goetz@6023 901 int shift = -1;
goetz@6023 902 switch (t) {
goetz@6023 903 case T_BYTE:
goetz@6023 904 shift = 2;
goetz@6060 905 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
goetz@6023 906 __ rldimi(value, value, 8, 48); // 8 bit -> 16 bit
goetz@6060 907 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
goetz@6023 908 __ blt(CCR0, L_fill_elements);
goetz@6023 909 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
goetz@6023 910 break;
goetz@6023 911 case T_SHORT:
goetz@6023 912 shift = 1;
goetz@6060 913 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
goetz@6023 914 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
goetz@6060 915 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
goetz@6023 916 __ blt(CCR0, L_fill_elements);
goetz@6023 917 break;
goetz@6023 918 case T_INT:
goetz@6023 919 shift = 0;
goetz@6060 920 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
goetz@6023 921 __ blt(CCR0, L_fill_4_bytes);
goetz@6023 922 break;
goetz@6023 923 default: ShouldNotReachHere();
goetz@6023 924 }
goetz@6023 925
goetz@6023 926 if (!aligned && (t == T_BYTE || t == T_SHORT)) {
goetz@6060 927 // Align source address at 4 bytes address boundary.
goetz@6023 928 if (t == T_BYTE) {
goetz@6060 929 // One byte misalignment happens only for byte arrays.
goetz@6023 930 __ andi_(temp, to, 1);
goetz@6023 931 __ beq(CCR0, L_skip_align1);
goetz@6023 932 __ stb(value, 0, to);
goetz@6023 933 __ addi(to, to, 1);
goetz@6023 934 __ addi(count, count, -1);
goetz@6023 935 __ bind(L_skip_align1);
goetz@6023 936 }
goetz@6023 937 // Two bytes misalignment happens only for byte and short (char) arrays.
goetz@6023 938 __ andi_(temp, to, 2);
goetz@6023 939 __ beq(CCR0, L_skip_align2);
goetz@6023 940 __ sth(value, 0, to);
goetz@6023 941 __ addi(to, to, 2);
goetz@6023 942 __ addi(count, count, -(1 << (shift - 1)));
goetz@6023 943 __ bind(L_skip_align2);
goetz@6023 944 }
goetz@6023 945
goetz@6023 946 if (!aligned) {
goetz@6023 947 // Align to 8 bytes, we know we are 4 byte aligned to start.
goetz@6023 948 __ andi_(temp, to, 7);
goetz@6023 949 __ beq(CCR0, L_fill_32_bytes);
goetz@6023 950 __ stw(value, 0, to);
goetz@6023 951 __ addi(to, to, 4);
goetz@6023 952 __ addi(count, count, -(1 << shift));
goetz@6023 953 __ bind(L_fill_32_bytes);
goetz@6023 954 }
goetz@6023 955
goetz@6060 956 __ li(temp, 8<<shift); // Prepare for 32 byte loop.
goetz@6060 957 // Clone bytes int->long as above.
goetz@6060 958 __ rldimi(value, value, 32, 0); // 32 bit -> 64 bit
goetz@6023 959
goetz@6023 960 Label L_check_fill_8_bytes;
goetz@6060 961 // Fill 32-byte chunks.
goetz@6023 962 __ subf_(count, temp, count);
goetz@6023 963 __ blt(CCR0, L_check_fill_8_bytes);
goetz@6023 964
goetz@6023 965 Label L_fill_32_bytes_loop;
goetz@6023 966 __ align(32);
goetz@6023 967 __ bind(L_fill_32_bytes_loop);
goetz@6023 968
goetz@6023 969 __ std(value, 0, to);
goetz@6023 970 __ std(value, 8, to);
goetz@6060 971 __ subf_(count, temp, count); // Update count.
goetz@6023 972 __ std(value, 16, to);
goetz@6023 973 __ std(value, 24, to);
goetz@6023 974
goetz@6023 975 __ addi(to, to, 32);
goetz@6023 976 __ bge(CCR0, L_fill_32_bytes_loop);
goetz@6023 977
goetz@6023 978 __ bind(L_check_fill_8_bytes);
goetz@6023 979 __ add_(count, temp, count);
goetz@6023 980 __ beq(CCR0, L_exit);
goetz@6023 981 __ addic_(count, count, -(2 << shift));
goetz@6023 982 __ blt(CCR0, L_fill_4_bytes);
goetz@6023 983
goetz@6023 984 //
goetz@6023 985 // Length is too short, just fill 8 bytes at a time.
goetz@6023 986 //
goetz@6023 987 Label L_fill_8_bytes_loop;
goetz@6023 988 __ bind(L_fill_8_bytes_loop);
goetz@6023 989 __ std(value, 0, to);
goetz@6023 990 __ addic_(count, count, -(2 << shift));
goetz@6023 991 __ addi(to, to, 8);
goetz@6023 992 __ bge(CCR0, L_fill_8_bytes_loop);
goetz@6023 993
goetz@6060 994 // Fill trailing 4 bytes.
goetz@6023 995 __ bind(L_fill_4_bytes);
goetz@6023 996 __ andi_(temp, count, 1<<shift);
goetz@6023 997 __ beq(CCR0, L_fill_2_bytes);
goetz@6023 998
goetz@6023 999 __ stw(value, 0, to);
goetz@6023 1000 if (t == T_BYTE || t == T_SHORT) {
goetz@6023 1001 __ addi(to, to, 4);
goetz@6060 1002 // Fill trailing 2 bytes.
goetz@6023 1003 __ bind(L_fill_2_bytes);
goetz@6023 1004 __ andi_(temp, count, 1<<(shift-1));
goetz@6023 1005 __ beq(CCR0, L_fill_byte);
goetz@6023 1006 __ sth(value, 0, to);
goetz@6023 1007 if (t == T_BYTE) {
goetz@6023 1008 __ addi(to, to, 2);
goetz@6060 1009 // Fill trailing byte.
goetz@6023 1010 __ bind(L_fill_byte);
goetz@6023 1011 __ andi_(count, count, 1);
goetz@6023 1012 __ beq(CCR0, L_exit);
goetz@6023 1013 __ stb(value, 0, to);
goetz@6023 1014 } else {
goetz@6023 1015 __ bind(L_fill_byte);
goetz@6023 1016 }
goetz@6023 1017 } else {
goetz@6023 1018 __ bind(L_fill_2_bytes);
goetz@6023 1019 }
goetz@6023 1020 __ bind(L_exit);
goetz@6023 1021 __ blr();
goetz@6023 1022
goetz@6060 1023 // Handle copies less than 8 bytes. Int is handled elsewhere.
goetz@6023 1024 if (t == T_BYTE) {
goetz@6023 1025 __ bind(L_fill_elements);
goetz@6023 1026 Label L_fill_2, L_fill_4;
goetz@6023 1027 __ andi_(temp, count, 1);
goetz@6023 1028 __ beq(CCR0, L_fill_2);
goetz@6023 1029 __ stb(value, 0, to);
goetz@6023 1030 __ addi(to, to, 1);
goetz@6023 1031 __ bind(L_fill_2);
goetz@6023 1032 __ andi_(temp, count, 2);
goetz@6023 1033 __ beq(CCR0, L_fill_4);
goetz@6023 1034 __ stb(value, 0, to);
goetz@6023 1035 __ stb(value, 1, to);
goetz@6023 1036 __ addi(to, to, 2);
goetz@6023 1037 __ bind(L_fill_4);
goetz@6023 1038 __ andi_(temp, count, 4);
goetz@6023 1039 __ beq(CCR0, L_exit);
goetz@6023 1040 __ stb(value, 0, to);
goetz@6023 1041 __ stb(value, 1, to);
goetz@6023 1042 __ stb(value, 2, to);
goetz@6023 1043 __ stb(value, 3, to);
goetz@6023 1044 __ blr();
goetz@6023 1045 }
goetz@6023 1046
goetz@6023 1047 if (t == T_SHORT) {
goetz@6023 1048 Label L_fill_2;
goetz@6023 1049 __ bind(L_fill_elements);
goetz@6023 1050 __ andi_(temp, count, 1);
goetz@6023 1051 __ beq(CCR0, L_fill_2);
goetz@6023 1052 __ sth(value, 0, to);
goetz@6023 1053 __ addi(to, to, 2);
goetz@6023 1054 __ bind(L_fill_2);
goetz@6023 1055 __ andi_(temp, count, 2);
goetz@6023 1056 __ beq(CCR0, L_exit);
goetz@6023 1057 __ sth(value, 0, to);
goetz@6023 1058 __ sth(value, 2, to);
goetz@6023 1059 __ blr();
goetz@6023 1060 }
goetz@6023 1061 return start;
goetz@6023 1062 }
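  // Editor's sketch (illustration only, not part of the annotated source): the
  // rldimi sequence in generate_fill widens the fill value by cloning its low-order
  // bytes; for a byte fill it computes the same result as this C code:
  static julong sketch_replicate_byte(julong value) {
    value = (value << 8)  | (value & 0xff);        //  8 bit -> 16 bit
    value = (value << 16) | (value & 0xffff);      // 16 bit -> 32 bit
    value = (value << 32) | (value & 0xffffffff);  // 32 bit -> 64 bit
    return value;                                  // eight copies of the original byte
  }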
goetz@6023 1063
goetz@6023 1064
goetz@6060 1065 // Generate overlap test for array copy stubs.
goetz@6023 1066 //
goetz@6023 1067 // Input:
goetz@6023 1068 // R3_ARG1 - from
goetz@6023 1069 // R4_ARG2 - to
goetz@6023 1070 // R5_ARG3 - element count
goetz@6023 1071 //
goetz@6023 1072 void array_overlap_test(address no_overlap_target, int log2_elem_size) {
goetz@6023 1073 Register tmp1 = R6_ARG4;
goetz@6023 1074 Register tmp2 = R7_ARG5;
goetz@6023 1075
goetz@6023 1076 Label l_overlap;
goetz@6023 1077 #ifdef ASSERT
goetz@6023 1078 __ srdi_(tmp2, R5_ARG3, 31);
goetz@6023 1079 __ asm_assert_eq("missing zero extend", 0xAFFE);
goetz@6023 1080 #endif
goetz@6023 1081
goetz@6023 1082 __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
goetz@6023 1083 __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
goetz@6023 1084 __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
goetz@6023 1085 __ cmpld(CCR1, tmp1, tmp2);
goetz@6023 1086 __ crand(/*CCR0 lt*/0, /*CCR1 lt*/4+0, /*CCR0 lt*/0);
goetz@6023 1087 __ blt(CCR0, l_overlap); // Src before dst and distance smaller than size.
goetz@6023 1088
goetz@6023 1089 // need to copy forwards
goetz@6023 1090 if (__ is_within_range_of_b(no_overlap_target, __ pc())) {
goetz@6023 1091 __ b(no_overlap_target);
goetz@6023 1092 } else {
goetz@6023 1093 __ load_const(tmp1, no_overlap_target, tmp2);
goetz@6023 1094 __ mtctr(tmp1);
goetz@6023 1095 __ bctr();
goetz@6023 1096 }
goetz@6023 1097
goetz@6023 1098 __ bind(l_overlap);
goetz@6023 1099 // need to copy backwards
goetz@6023 1100 }
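  // Editor's sketch (illustration only, not part of the annotated source): the
  // unsigned compares and the crand above implement the usual overlap test, i.e.
  // a backward copy is chosen exactly when the destination starts inside the
  // source range:
  static bool sketch_must_copy_backwards(uintptr_t from, uintptr_t to,
                                         size_t count, int log2_elem_size) {
    return (to > from) && ((to - from) < (count << log2_elem_size));
  }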
goetz@6023 1101
goetz@6023 1102 // The guideline in the implementations of generate_disjoint_xxx_copy
goetz@6023 1103 // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
goetz@6023 1104 // single instructions, but to avoid alignment interrupts (see subsequent
goetz@6023 1105 // comment). Furthermore, we try to minimize misaligned accesses, even
goetz@6023 1106 // though they cause no alignment interrupt.
goetz@6023 1107 //
goetz@6023 1108 // In Big-Endian mode, the PowerPC architecture requires implementations to
goetz@6023 1109 // handle automatically misaligned integer halfword and word accesses,
goetz@6023 1110 // word-aligned integer doubleword accesses, and word-aligned floating-point
goetz@6023 1111 // accesses. Other accesses may or may not generate an Alignment interrupt
goetz@6023 1112 // depending on the implementation.
goetz@6023 1113 // Alignment interrupt handling may require on the order of hundreds of cycles,
goetz@6023 1114 // so every effort should be made to avoid misaligned memory values.
goetz@6023 1115 //
goetz@6023 1116 //
goetz@6023 1117 // Generate stub for disjoint byte copy. If "aligned" is true, the
goetz@6023 1118 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6023 1119 //
goetz@6023 1120 // Arguments for generated stub:
goetz@6023 1121 // from: R3_ARG1
goetz@6023 1122 // to: R4_ARG2
goetz@6023 1123 // count: R5_ARG3 treated as signed
goetz@6023 1124 //
goetz@6023 1125 address generate_disjoint_byte_copy(bool aligned, const char * name) {
goetz@6023 1126 StubCodeMark mark(this, "StubRoutines", name);
goetz@6076 1127 address start = __ function_entry();
goetz@6023 1128
goetz@6023 1129 Register tmp1 = R6_ARG4;
goetz@6023 1130 Register tmp2 = R7_ARG5;
goetz@6023 1131 Register tmp3 = R8_ARG6;
goetz@6023 1132 Register tmp4 = R9_ARG7;
goetz@6023 1133
goetz@6023 1134
goetz@6023 1135 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
goetz@6023 1136 // Don't try anything fancy if arrays don't have many elements.
goetz@6023 1137 __ li(tmp3, 0);
goetz@6023 1138 __ cmpwi(CCR0, R5_ARG3, 17);
goetz@6023 1139 __ ble(CCR0, l_6); // copy 4 at a time
goetz@6023 1140
goetz@6023 1141 if (!aligned) {
goetz@6023 1142 __ xorr(tmp1, R3_ARG1, R4_ARG2);
goetz@6023 1143 __ andi_(tmp1, tmp1, 3);
goetz@6023 1144 __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
goetz@6023 1145
goetz@6023 1146 // Copy elements if necessary to align to 4 bytes.
goetz@6023 1147 __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
goetz@6023 1148 __ andi_(tmp1, tmp1, 3);
goetz@6023 1149 __ beq(CCR0, l_2);
goetz@6023 1150
goetz@6023 1151 __ subf(R5_ARG3, tmp1, R5_ARG3);
goetz@6023 1152 __ bind(l_9);
goetz@6023 1153 __ lbz(tmp2, 0, R3_ARG1);
goetz@6023 1154 __ addic_(tmp1, tmp1, -1);
goetz@6023 1155 __ stb(tmp2, 0, R4_ARG2);
goetz@6023 1156 __ addi(R3_ARG1, R3_ARG1, 1);
goetz@6023 1157 __ addi(R4_ARG2, R4_ARG2, 1);
goetz@6023 1158 __ bne(CCR0, l_9);
goetz@6023 1159
goetz@6023 1160 __ bind(l_2);
goetz@6023 1161 }
goetz@6023 1162
goetz@6023 1163 // copy 8 elements at a time
goetz@6023 1164 __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
goetz@6023 1165 __ andi_(tmp1, tmp2, 7);
goetz@6023 1166 __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
goetz@6023 1167
goetz@6023 1168 // copy a 2-element word if necessary to align to 8 bytes
goetz@6023 1169 __ andi_(R0, R3_ARG1, 7);
goetz@6023 1170 __ beq(CCR0, l_7);
goetz@6023 1171
goetz@6023 1172 __ lwzx(tmp2, R3_ARG1, tmp3);
goetz@6023 1173 __ addi(R5_ARG3, R5_ARG3, -4);
goetz@6023 1174 __ stwx(tmp2, R4_ARG2, tmp3);
goetz@6023 1175 { // FasterArrayCopy
goetz@6023 1176 __ addi(R3_ARG1, R3_ARG1, 4);
goetz@6023 1177 __ addi(R4_ARG2, R4_ARG2, 4);
goetz@6023 1178 }
goetz@6023 1179 __ bind(l_7);
goetz@6023 1180
goetz@6023 1181 { // FasterArrayCopy
goetz@6023 1182 __ cmpwi(CCR0, R5_ARG3, 31);
goetz@6023 1183 __ ble(CCR0, l_6); // copy 4 at a time if less than 32 elements remain
goetz@6023 1184
goetz@6023 1185 __ srdi(tmp1, R5_ARG3, 5);
goetz@6023 1186 __ andi_(R5_ARG3, R5_ARG3, 31);
goetz@6023 1187 __ mtctr(tmp1);
goetz@6023 1188
goetz@6023 1189 __ bind(l_8);
goetz@6023 1190 // Use unrolled version for mass copying (copy 32 elements a time)
goetz@6023 1191 // Load feeding store gets zero latency on Power6, however not on Power5.
goetz@6023 1192 // Therefore, the following sequence is made for the good of both.
goetz@6023 1193 __ ld(tmp1, 0, R3_ARG1);
goetz@6023 1194 __ ld(tmp2, 8, R3_ARG1);
goetz@6023 1195 __ ld(tmp3, 16, R3_ARG1);
goetz@6023 1196 __ ld(tmp4, 24, R3_ARG1);
goetz@6023 1197 __ std(tmp1, 0, R4_ARG2);
goetz@6023 1198 __ std(tmp2, 8, R4_ARG2);
goetz@6023 1199 __ std(tmp3, 16, R4_ARG2);
goetz@6023 1200 __ std(tmp4, 24, R4_ARG2);
goetz@6023 1201 __ addi(R3_ARG1, R3_ARG1, 32);
goetz@6023 1202 __ addi(R4_ARG2, R4_ARG2, 32);
goetz@6023 1203 __ bdnz(l_8);
goetz@6023 1204 }
goetz@6023 1205
goetz@6023 1206 __ bind(l_6);
goetz@6023 1207
goetz@6023 1208 // copy 4 elements at a time
goetz@6023 1209 __ cmpwi(CCR0, R5_ARG3, 4);
goetz@6023 1210 __ blt(CCR0, l_1);
goetz@6023 1211 __ srdi(tmp1, R5_ARG3, 2);
goetz@6023 1212 __ mtctr(tmp1); // is > 0
goetz@6023 1213 __ andi_(R5_ARG3, R5_ARG3, 3);
goetz@6023 1214
goetz@6023 1215 { // FasterArrayCopy
goetz@6023 1216 __ addi(R3_ARG1, R3_ARG1, -4);
goetz@6023 1217 __ addi(R4_ARG2, R4_ARG2, -4);
goetz@6023 1218 __ bind(l_3);
goetz@6023 1219 __ lwzu(tmp2, 4, R3_ARG1);
goetz@6023 1220 __ stwu(tmp2, 4, R4_ARG2);
goetz@6023 1221 __ bdnz(l_3);
goetz@6023 1222 __ addi(R3_ARG1, R3_ARG1, 4);
goetz@6023 1223 __ addi(R4_ARG2, R4_ARG2, 4);
goetz@6023 1224 }
goetz@6023 1225
goetz@6023 1226 // do single element copy
goetz@6023 1227 __ bind(l_1);
goetz@6023 1228 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6023 1229 __ beq(CCR0, l_4);
goetz@6023 1230
goetz@6023 1231 { // FasterArrayCopy
goetz@6023 1232 __ mtctr(R5_ARG3);
goetz@6023 1233 __ addi(R3_ARG1, R3_ARG1, -1);
goetz@6023 1234 __ addi(R4_ARG2, R4_ARG2, -1);
goetz@6023 1235
goetz@6023 1236 __ bind(l_5);
goetz@6023 1237 __ lbzu(tmp2, 1, R3_ARG1);
goetz@6023 1238 __ stbu(tmp2, 1, R4_ARG2);
goetz@6023 1239 __ bdnz(l_5);
goetz@6023 1240 }
goetz@6023 1241
goetz@6023 1242 __ bind(l_4);
goetz@6023 1243 __ blr();
goetz@6023 1244
goetz@6023 1245 return start;
goetz@6023 1246 }
goetz@6023 1247
goetz@6023 1248 // Generate stub for conjoint byte copy. If "aligned" is true, the
goetz@6023 1249 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6023 1250 //
goetz@6023 1251 // Arguments for generated stub:
goetz@6023 1252 // from: R3_ARG1
goetz@6023 1253 // to: R4_ARG2
goetz@6023 1254 // count: R5_ARG3 treated as signed
goetz@6023 1255 //
goetz@6023 1256 address generate_conjoint_byte_copy(bool aligned, const char * name) {
goetz@6023 1257 StubCodeMark mark(this, "StubRoutines", name);
goetz@6076 1258 address start = __ function_entry();
goetz@6023 1259
goetz@6023 1260 Register tmp1 = R6_ARG4;
goetz@6023 1261 Register tmp2 = R7_ARG5;
goetz@6023 1262 Register tmp3 = R8_ARG6;
goetz@6023 1263
goetz@6076 1264 #if defined(ABI_ELFv2)
goetz@6076 1265 address nooverlap_target = aligned ?
goetz@6076 1266 StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
goetz@6076 1267 StubRoutines::jbyte_disjoint_arraycopy();
goetz@6076 1268 #else
goetz@6023 1269 address nooverlap_target = aligned ?
goetz@6023 1270 ((FunctionDescriptor*)StubRoutines::arrayof_jbyte_disjoint_arraycopy())->entry() :
goetz@6023 1271 ((FunctionDescriptor*)StubRoutines::jbyte_disjoint_arraycopy())->entry();
goetz@6076 1272 #endif
goetz@6023 1273
goetz@6023 1274 array_overlap_test(nooverlap_target, 0);
goetz@6023 1275 // Do reverse copy. We assume the case of actual overlap is rare enough
goetz@6023 1276 // that we don't have to optimize it.
goetz@6023 1277 Label l_1, l_2;
goetz@6023 1278
goetz@6023 1279 __ b(l_2);
goetz@6023 1280 __ bind(l_1);
goetz@6023 1281 __ stbx(tmp1, R4_ARG2, R5_ARG3);
goetz@6023 1282 __ bind(l_2);
goetz@6023 1283 __ addic_(R5_ARG3, R5_ARG3, -1);
goetz@6023 1284 __ lbzx(tmp1, R3_ARG1, R5_ARG3);
goetz@6023 1285 __ bge(CCR0, l_1);
goetz@6023 1286
goetz@6023 1287 __ blr();
goetz@6023 1288
goetz@6023 1289 return start;
goetz@6023 1290 }
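  // The reverse loop above is equivalent to the following sketch (illustration
  // only, not used by the stub; the helper name is made up). Copying from the
  // highest index downwards is what makes the copy safe when the regions
  // overlap and 'to' lies above 'from'.
  static void conjoint_byte_copy_reference(const jbyte* from, jbyte* to, long count) {
    for (long i = count - 1; i >= 0; i--) {
      to[i] = from[i];  // last element first, so unread source bytes are never clobbered
    }
  }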
goetz@6023 1291
goetz@6023 1292 // Generate stub for disjoint short copy. If "aligned" is true, the
goetz@6023 1293 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6023 1294 //
goetz@6023 1295 // Arguments for generated stub:
goetz@6023 1296 // from: R3_ARG1
goetz@6023 1297 // to: R4_ARG2
goetz@6023 1298 // elm.count: R5_ARG3 treated as signed
goetz@6023 1299 //
goetz@6023 1300 // Strategy for aligned==true:
goetz@6023 1301 //
goetz@6023 1302 // If length <= 9:
goetz@6023 1303 // 1. copy 2 elements at a time (l_6)
goetz@6023 1304 // 2. copy last element if original element count was odd (l_1)
goetz@6023 1305 //
goetz@6023 1306 // If length > 9:
goetz@6023 1307 // 1. copy 4 elements at a time until less than 4 elements are left (l_7)
goetz@6023 1308 // 2. copy 2 elements at a time until less than 2 elements are left (l_6)
goetz@6023 1309 // 3. copy last element if one was left in step 2. (l_1)
goetz@6023 1310 //
goetz@6023 1311 //
goetz@6023 1312 // Strategy for aligned==false:
goetz@6023 1313 //
goetz@6023 1314 // If length <= 9: same as aligned==true case, but NOTE: load/stores
goetz@6023 1315 // can be unaligned (see comment below)
goetz@6023 1316 //
goetz@6023 1317 // If length > 9:
goetz@6023 1318 // 1. continue with step 6 if the alignment of from and to mod 4
goetz@6023 1319 // is different.
goetz@6023 1320 // 2. align from and to to 4 bytes by copying 1 element if necessary
goetz@6023 1321 // 3. at l_2 from and to are 4 byte aligned; continue with
goetz@6023 1322 // step 5 if they cannot be aligned to 8 bytes because they
goetz@6023 1323 // have different alignment mod 8.
goetz@6023 1324 // 4. at this point we know that both from and to have the same
goetz@6023 1325 // alignment mod 8; now copy one element if necessary to get
goetz@6023 1326 // 8 byte alignment of from and to.
goetz@6023 1327 // 5. copy 4 elements at a time until less than 4 elements are
goetz@6023 1328 // left; depending on step 3. all load/stores are aligned or
goetz@6023 1329 // either all loads or all stores are unaligned.
goetz@6023 1330 // 6. copy 2 elements at a time until less than 2 elements are
goetz@6023 1331 // left (l_6); arriving here from step 1., there is a chance
goetz@6023 1332 // that all accesses are unaligned.
goetz@6023 1333 // 7. copy last element if one was left in step 6. (l_1)
goetz@6023 1334 //
goetz@6023 1335 // There are unaligned data accesses using integer load/store
goetz@6023 1336 // instructions in this stub. POWER allows such accesses.
goetz@6023 1337 //
goetz@6023 1338 // According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
goetz@6023 1339 // Chapter 2: Effect of Operand Placement on Performance) unaligned
goetz@6023 1340 // integer load/stores have good performance. Only unaligned
goetz@6023 1341 // floating point load/stores can have poor performance.
goetz@6023 1342 //
goetz@6023 1343 // TODO:
goetz@6023 1344 //
goetz@6023 1345 // 1. check if aligning the backbranch target of loops is beneficial
goetz@6023 1346 //
goetz@6023 1347 address generate_disjoint_short_copy(bool aligned, const char * name) {
goetz@6023 1348 StubCodeMark mark(this, "StubRoutines", name);
goetz@6023 1349
goetz@6023 1350 Register tmp1 = R6_ARG4;
goetz@6023 1351 Register tmp2 = R7_ARG5;
goetz@6023 1352 Register tmp3 = R8_ARG6;
goetz@6023 1353 Register tmp4 = R9_ARG7;
goetz@6023 1354
goetz@6076 1355 address start = __ function_entry();
goetz@6023 1356
goetz@6023 1357 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
goetz@6023 1358 // don't try anything fancy if arrays don't have many elements
goetz@6023 1359 __ li(tmp3, 0);
goetz@6023 1360 __ cmpwi(CCR0, R5_ARG3, 9);
goetz@6023 1361 __ ble(CCR0, l_6); // copy 2 at a time
goetz@6023 1362
goetz@6023 1363 if (!aligned) {
goetz@6023 1364 __ xorr(tmp1, R3_ARG1, R4_ARG2);
goetz@6023 1365 __ andi_(tmp1, tmp1, 3);
goetz@6023 1366 __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
goetz@6023 1367
goetz@6023 1368 // At this point it is guaranteed that both from and to have the same alignment mod 4.
goetz@6023 1369
goetz@6023 1370 // Copy 1 element if necessary to align to 4 bytes.
goetz@6023 1371 __ andi_(tmp1, R3_ARG1, 3);
goetz@6023 1372 __ beq(CCR0, l_2);
goetz@6023 1373
goetz@6023 1374 __ lhz(tmp2, 0, R3_ARG1);
goetz@6023 1375 __ addi(R3_ARG1, R3_ARG1, 2);
goetz@6023 1376 __ sth(tmp2, 0, R4_ARG2);
goetz@6023 1377 __ addi(R4_ARG2, R4_ARG2, 2);
goetz@6023 1378 __ addi(R5_ARG3, R5_ARG3, -1);
goetz@6023 1379 __ bind(l_2);
goetz@6023 1380
goetz@6023 1381 // At this point the positions of both from and to are at least 4 byte aligned.
goetz@6023 1382
goetz@6023 1383 // Copy 4 elements at a time.
goetz@6023 1384 // Align to 8 bytes, but only if both from and to have the same alignment mod 8.
goetz@6023 1385 __ xorr(tmp2, R3_ARG1, R4_ARG2);
goetz@6023 1386 __ andi_(tmp1, tmp2, 7);
goetz@6023 1387 __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
goetz@6023 1388
goetz@6023 1389 // Copy a 2-element word if necessary to align to 8 bytes.
goetz@6023 1390 __ andi_(R0, R3_ARG1, 7);
goetz@6023 1391 __ beq(CCR0, l_7);
goetz@6023 1392
goetz@6023 1393 __ lwzx(tmp2, R3_ARG1, tmp3);
goetz@6023 1394 __ addi(R5_ARG3, R5_ARG3, -2);
goetz@6023 1395 __ stwx(tmp2, R4_ARG2, tmp3);
goetz@6023 1396 { // FasterArrayCopy
goetz@6023 1397 __ addi(R3_ARG1, R3_ARG1, 4);
goetz@6023 1398 __ addi(R4_ARG2, R4_ARG2, 4);
goetz@6023 1399 }
goetz@6023 1400 }
goetz@6023 1401
goetz@6023 1402 __ bind(l_7);
goetz@6023 1403
goetz@6023 1404 // Copy 4 elements at a time; either the loads or the stores can
goetz@6023 1405 // be unaligned if aligned == false.
goetz@6023 1406
goetz@6023 1407 { // FasterArrayCopy
goetz@6023 1408 __ cmpwi(CCR0, R5_ARG3, 15);
goetz@6023 1409 __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
goetz@6023 1410
goetz@6023 1411 __ srdi(tmp1, R5_ARG3, 4);
goetz@6023 1412 __ andi_(R5_ARG3, R5_ARG3, 15);
goetz@6023 1413 __ mtctr(tmp1);
goetz@6023 1414
goetz@6023 1415 __ bind(l_8);
goetz@6023 1416 // Use unrolled version for mass copying (copy 16 elements at a time).
goetz@6023 1417 // Load feeding store gets zero latency on Power6, however not on Power5.
goetz@6023 1418 // Therefore, the following sequence is made for the good of both.
goetz@6023 1419 __ ld(tmp1, 0, R3_ARG1);
goetz@6023 1420 __ ld(tmp2, 8, R3_ARG1);
goetz@6023 1421 __ ld(tmp3, 16, R3_ARG1);
goetz@6023 1422 __ ld(tmp4, 24, R3_ARG1);
goetz@6023 1423 __ std(tmp1, 0, R4_ARG2);
goetz@6023 1424 __ std(tmp2, 8, R4_ARG2);
goetz@6023 1425 __ std(tmp3, 16, R4_ARG2);
goetz@6023 1426 __ std(tmp4, 24, R4_ARG2);
goetz@6023 1427 __ addi(R3_ARG1, R3_ARG1, 32);
goetz@6023 1428 __ addi(R4_ARG2, R4_ARG2, 32);
goetz@6023 1429 __ bdnz(l_8);
goetz@6023 1430 }
goetz@6023 1431 __ bind(l_6);
goetz@6023 1432
goetz@6023 1433 // copy 2 elements at a time
goetz@6023 1434 { // FasterArrayCopy
goetz@6023 1435 __ cmpwi(CCR0, R5_ARG3, 2);
goetz@6023 1436 __ blt(CCR0, l_1);
goetz@6023 1437 __ srdi(tmp1, R5_ARG3, 1);
goetz@6023 1438 __ andi_(R5_ARG3, R5_ARG3, 1);
goetz@6023 1439
goetz@6023 1440 __ addi(R3_ARG1, R3_ARG1, -4);
goetz@6023 1441 __ addi(R4_ARG2, R4_ARG2, -4);
goetz@6023 1442 __ mtctr(tmp1);
goetz@6023 1443
goetz@6023 1444 __ bind(l_3);
goetz@6023 1445 __ lwzu(tmp2, 4, R3_ARG1);
goetz@6023 1446 __ stwu(tmp2, 4, R4_ARG2);
goetz@6023 1447 __ bdnz(l_3);
goetz@6023 1448
goetz@6023 1449 __ addi(R3_ARG1, R3_ARG1, 4);
goetz@6023 1450 __ addi(R4_ARG2, R4_ARG2, 4);
goetz@6023 1451 }
goetz@6023 1452
goetz@6023 1453 // do single element copy
goetz@6023 1454 __ bind(l_1);
goetz@6023 1455 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6023 1456 __ beq(CCR0, l_4);
goetz@6023 1457
goetz@6023 1458 { // FasterArrayCopy
goetz@6023 1459 __ mtctr(R5_ARG3);
goetz@6023 1460 __ addi(R3_ARG1, R3_ARG1, -2);
goetz@6023 1461 __ addi(R4_ARG2, R4_ARG2, -2);
goetz@6023 1462
goetz@6023 1463 __ bind(l_5);
goetz@6023 1464 __ lhzu(tmp2, 2, R3_ARG1);
goetz@6023 1465 __ sthu(tmp2, 2, R4_ARG2);
goetz@6023 1466 __ bdnz(l_5);
goetz@6023 1467 }
goetz@6023 1468 __ bind(l_4);
goetz@6023 1469 __ blr();
goetz@6023 1470
goetz@6023 1471 return start;
goetz@6023 1472 }
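  // For aligned==false, the checks above reduce to the following sketch
  // (comments only; the uintptr_t casts are an illustration, not stub code):
  //
  //   bool same_mod4 = (((uintptr_t)from ^ (uintptr_t)to) & 3) == 0;  // step 1
  //   bool same_mod8 = (((uintptr_t)from ^ (uintptr_t)to) & 7) == 0;  // step 3
  //
  //   !same_mod4: only the 2-element (l_6) and 1-element (l_1) loops are used.
  //   same_mod4:  copy at most 1 element to make both 4-byte aligned; if
  //               same_mod8 also holds, copy at most 2 more elements so the
  //               unrolled loop (l_8) sees 8-byte aligned loads and stores,
  //               otherwise either the loads or the stores stay unaligned.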
goetz@6023 1473
goetz@6023 1474 // Generate stub for conjoint short copy. If "aligned" is true, the
goetz@6023 1475 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6023 1476 //
goetz@6023 1477 // Arguments for generated stub:
goetz@6023 1478 // from: R3_ARG1
goetz@6023 1479 // to: R4_ARG2
goetz@6023 1480 // count: R5_ARG3 treated as signed
goetz@6023 1481 //
goetz@6023 1482 address generate_conjoint_short_copy(bool aligned, const char * name) {
goetz@6023 1483 StubCodeMark mark(this, "StubRoutines", name);
goetz@6076 1484 address start = __ function_entry();
goetz@6023 1485
goetz@6023 1486 Register tmp1 = R6_ARG4;
goetz@6023 1487 Register tmp2 = R7_ARG5;
goetz@6023 1488 Register tmp3 = R8_ARG6;
goetz@6023 1489
goetz@6076 1490 #if defined(ABI_ELFv2)
goetz@6076 1491 address nooverlap_target = aligned ?
goetz@6076 1492 StubRoutines::arrayof_jshort_disjoint_arraycopy() :
goetz@6076 1493 StubRoutines::jshort_disjoint_arraycopy();
goetz@6076 1494 #else
goetz@6023 1495 address nooverlap_target = aligned ?
goetz@6023 1496 ((FunctionDescriptor*)StubRoutines::arrayof_jshort_disjoint_arraycopy())->entry() :
goetz@6023 1497 ((FunctionDescriptor*)StubRoutines::jshort_disjoint_arraycopy())->entry();
goetz@6076 1498 #endif
goetz@6023 1499
goetz@6023 1500 array_overlap_test(nooverlap_target, 1);
goetz@6023 1501
goetz@6023 1502 Label l_1, l_2;
goetz@6023 1503 __ sldi(tmp1, R5_ARG3, 1);
goetz@6023 1504 __ b(l_2);
goetz@6023 1505 __ bind(l_1);
goetz@6023 1506 __ sthx(tmp2, R4_ARG2, tmp1);
goetz@6023 1507 __ bind(l_2);
goetz@6023 1508 __ addic_(tmp1, tmp1, -2);
goetz@6023 1509 __ lhzx(tmp2, R3_ARG1, tmp1);
goetz@6023 1510 __ bge(CCR0, l_1);
goetz@6023 1511
goetz@6023 1512 __ blr();
goetz@6023 1513
goetz@6023 1514 return start;
goetz@6023 1515 }
goetz@6023 1516
goetz@6023 1517 // Generate core code for disjoint int copy (and oop copy on 32-bit). If "aligned"
goetz@6023 1518 // is true, the "from" and "to" addresses are assumed to be heapword aligned.
goetz@6023 1519 //
goetz@6023 1520 // Arguments:
goetz@6023 1521 // from: R3_ARG1
goetz@6023 1522 // to: R4_ARG2
goetz@6023 1523 // count: R5_ARG3 treated as signed
goetz@6023 1524 //
goetz@6023 1525 void generate_disjoint_int_copy_core(bool aligned) {
goetz@6023 1526 Register tmp1 = R6_ARG4;
goetz@6023 1527 Register tmp2 = R7_ARG5;
goetz@6023 1528 Register tmp3 = R8_ARG6;
goetz@6023 1529 Register tmp4 = R0;
goetz@6023 1530
goetz@6023 1531 Label l_1, l_2, l_3, l_4, l_5, l_6;
goetz@6023 1532 // for short arrays, just do single element copy
goetz@6023 1533 __ li(tmp3, 0);
goetz@6023 1534 __ cmpwi(CCR0, R5_ARG3, 5);
goetz@6023 1535 __ ble(CCR0, l_2);
goetz@6023 1536
goetz@6023 1537 if (!aligned) {
goetz@6023 1538 // check if arrays have same alignment mod 8.
goetz@6023 1539 __ xorr(tmp1, R3_ARG1, R4_ARG2);
goetz@6023 1540 __ andi_(R0, tmp1, 7);
goetz@6023 1541 // Not the same alignment, but ld and std just need to be 4 byte aligned.
goetz@6023 1542 __ bne(CCR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
goetz@6023 1543
goetz@6023 1544 // copy 1 element to align 'to' and 'from' on an 8 byte boundary
goetz@6023 1545 __ andi_(R0, R3_ARG1, 7);
goetz@6023 1546 __ beq(CCR0, l_4);
goetz@6023 1547
goetz@6023 1548 __ lwzx(tmp2, R3_ARG1, tmp3);
goetz@6023 1549 __ addi(R5_ARG3, R5_ARG3, -1);
goetz@6023 1550 __ stwx(tmp2, R4_ARG2, tmp3);
goetz@6023 1551 { // FasterArrayCopy
goetz@6023 1552 __ addi(R3_ARG1, R3_ARG1, 4);
goetz@6023 1553 __ addi(R4_ARG2, R4_ARG2, 4);
goetz@6023 1554 }
goetz@6023 1555 __ bind(l_4);
goetz@6023 1556 }
goetz@6023 1557
goetz@6023 1558 { // FasterArrayCopy
goetz@6023 1559 __ cmpwi(CCR0, R5_ARG3, 7);
goetz@6023 1560 __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
goetz@6023 1561
goetz@6023 1562 __ srdi(tmp1, R5_ARG3, 3);
goetz@6023 1563 __ andi_(R5_ARG3, R5_ARG3, 7);
goetz@6023 1564 __ mtctr(tmp1);
goetz@6023 1565
goetz@6023 1566 __ bind(l_6);
goetz@6023 1567 // Use unrolled version for mass copying (copy 8 elements at a time).
goetz@6023 1568 // Load feeding store gets zero latency on Power6, however not on Power5.
goetz@6023 1569 // Therefore, the following sequence is made for the good of both.
goetz@6023 1570 __ ld(tmp1, 0, R3_ARG1);
goetz@6023 1571 __ ld(tmp2, 8, R3_ARG1);
goetz@6023 1572 __ ld(tmp3, 16, R3_ARG1);
goetz@6023 1573 __ ld(tmp4, 24, R3_ARG1);
goetz@6023 1574 __ std(tmp1, 0, R4_ARG2);
goetz@6023 1575 __ std(tmp2, 8, R4_ARG2);
goetz@6023 1576 __ std(tmp3, 16, R4_ARG2);
goetz@6023 1577 __ std(tmp4, 24, R4_ARG2);
goetz@6023 1578 __ addi(R3_ARG1, R3_ARG1, 32);
goetz@6023 1579 __ addi(R4_ARG2, R4_ARG2, 32);
goetz@6023 1580 __ bdnz(l_6);
goetz@6023 1581 }
goetz@6023 1582
goetz@6023 1583 // copy 1 element at a time
goetz@6023 1584 __ bind(l_2);
goetz@6023 1585 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6023 1586 __ beq(CCR0, l_1);
goetz@6023 1587
goetz@6023 1588 { // FasterArrayCopy
goetz@6023 1589 __ mtctr(R5_ARG3);
goetz@6023 1590 __ addi(R3_ARG1, R3_ARG1, -4);
goetz@6023 1591 __ addi(R4_ARG2, R4_ARG2, -4);
goetz@6023 1592
goetz@6023 1593 __ bind(l_3);
goetz@6023 1594 __ lwzu(tmp2, 4, R3_ARG1);
goetz@6023 1595 __ stwu(tmp2, 4, R4_ARG2);
goetz@6023 1596 __ bdnz(l_3);
goetz@6023 1597 }
goetz@6023 1598
goetz@6023 1599 __ bind(l_1);
goetz@6023 1600 return;
goetz@6023 1601 }
goetz@6023 1602
goetz@6023 1603 // Generate stub for disjoint int copy. If "aligned" is true, the
goetz@6023 1604 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6023 1605 //
goetz@6023 1606 // Arguments for generated stub:
goetz@6023 1607 // from: R3_ARG1
goetz@6023 1608 // to: R4_ARG2
goetz@6023 1609 // count: R5_ARG3 treated as signed
goetz@6023 1610 //
goetz@6023 1611 address generate_disjoint_int_copy(bool aligned, const char * name) {
goetz@6023 1612 StubCodeMark mark(this, "StubRoutines", name);
goetz@6076 1613 address start = __ function_entry();
goetz@6023 1614 generate_disjoint_int_copy_core(aligned);
goetz@6023 1615 __ blr();
goetz@6023 1616 return start;
goetz@6023 1617 }
goetz@6023 1618
goetz@6023 1619 // Generate core code for conjoint int copy (and oop copy on
goetz@6023 1620 // 32-bit). If "aligned" is true, the "from" and "to" addresses
goetz@6023 1621 // are assumed to be heapword aligned.
goetz@6023 1622 //
goetz@6023 1623 // Arguments:
goetz@6023 1624 // from: R3_ARG1
goetz@6023 1625 // to: R4_ARG2
goetz@6023 1626 // count: R5_ARG3 treated as signed
goetz@6023 1627 //
goetz@6023 1628 void generate_conjoint_int_copy_core(bool aligned) {
goetz@6023 1629 // Do reverse copy. We assume the case of actual overlap is rare enough
goetz@6023 1630 // that we don't have to optimize it.
goetz@6023 1631
goetz@6023 1632 Label l_1, l_2, l_3, l_4, l_5, l_6;
goetz@6023 1633
goetz@6023 1634 Register tmp1 = R6_ARG4;
goetz@6023 1635 Register tmp2 = R7_ARG5;
goetz@6023 1636 Register tmp3 = R8_ARG6;
goetz@6023 1637 Register tmp4 = R0;
goetz@6023 1638
goetz@6023 1639 { // FasterArrayCopy
goetz@6023 1640 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6023 1641 __ beq(CCR0, l_6);
goetz@6023 1642
goetz@6023 1643 __ sldi(R5_ARG3, R5_ARG3, 2);
goetz@6023 1644 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
goetz@6023 1645 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
goetz@6023 1646 __ srdi(R5_ARG3, R5_ARG3, 2);
goetz@6023 1647
goetz@6023 1648 __ cmpwi(CCR0, R5_ARG3, 7);
goetz@6023 1649 __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
goetz@6023 1650
goetz@6023 1651 __ srdi(tmp1, R5_ARG3, 3);
goetz@6023 1652 __ andi(R5_ARG3, R5_ARG3, 7);
goetz@6023 1653 __ mtctr(tmp1);
goetz@6023 1654
goetz@6023 1655 __ bind(l_4);
goetz@6023 1656 // Use unrolled version for mass copying (copy 8 elements at a time).
goetz@6023 1657 // Load feeding store gets zero latency on Power6, however not on Power5.
goetz@6023 1658 // Therefore, the following sequence is made for the good of both.
goetz@6023 1659 __ addi(R3_ARG1, R3_ARG1, -32);
goetz@6023 1660 __ addi(R4_ARG2, R4_ARG2, -32);
goetz@6023 1661 __ ld(tmp4, 24, R3_ARG1);
goetz@6023 1662 __ ld(tmp3, 16, R3_ARG1);
goetz@6023 1663 __ ld(tmp2, 8, R3_ARG1);
goetz@6023 1664 __ ld(tmp1, 0, R3_ARG1);
goetz@6023 1665 __ std(tmp4, 24, R4_ARG2);
goetz@6023 1666 __ std(tmp3, 16, R4_ARG2);
goetz@6023 1667 __ std(tmp2, 8, R4_ARG2);
goetz@6023 1668 __ std(tmp1, 0, R4_ARG2);
goetz@6023 1669 __ bdnz(l_4);
goetz@6023 1670
goetz@6023 1671 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6023 1672 __ beq(CCR0, l_6);
goetz@6023 1673
goetz@6023 1674 __ bind(l_5);
goetz@6023 1675 __ mtctr(R5_ARG3);
goetz@6023 1676 __ bind(l_3);
goetz@6023 1677 __ lwz(R0, -4, R3_ARG1);
goetz@6023 1678 __ stw(R0, -4, R4_ARG2);
goetz@6023 1679 __ addi(R3_ARG1, R3_ARG1, -4);
goetz@6023 1680 __ addi(R4_ARG2, R4_ARG2, -4);
goetz@6023 1681 __ bdnz(l_3);
goetz@6023 1682
goetz@6023 1683 __ bind(l_6);
goetz@6023 1684 }
goetz@6023 1685 }
goetz@6023 1686
goetz@6023 1687 // Generate stub for conjoint int copy. If "aligned" is true, the
goetz@6023 1688 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6023 1689 //
goetz@6023 1690 // Arguments for generated stub:
goetz@6023 1691 // from: R3_ARG1
goetz@6023 1692 // to: R4_ARG2
goetz@6023 1693 // count: R5_ARG3 treated as signed
goetz@6023 1694 //
goetz@6023 1695 address generate_conjoint_int_copy(bool aligned, const char * name) {
goetz@6023 1696 StubCodeMark mark(this, "StubRoutines", name);
goetz@6076 1697 address start = __ function_entry();
goetz@6023 1698
goetz@6076 1699 #if defined(ABI_ELFv2)
goetz@6076 1700 address nooverlap_target = aligned ?
goetz@6076 1701 StubRoutines::arrayof_jint_disjoint_arraycopy() :
goetz@6076 1702 StubRoutines::jint_disjoint_arraycopy();
goetz@6076 1703 #else
goetz@6023 1704 address nooverlap_target = aligned ?
goetz@6023 1705 ((FunctionDescriptor*)StubRoutines::arrayof_jint_disjoint_arraycopy())->entry() :
goetz@6023 1706 ((FunctionDescriptor*)StubRoutines::jint_disjoint_arraycopy())->entry();
goetz@6076 1707 #endif
goetz@6023 1708
goetz@6023 1709 array_overlap_test(nooverlap_target, 2);
goetz@6023 1710
goetz@6023 1711 generate_conjoint_int_copy_core(aligned);
goetz@6023 1712
goetz@6023 1713 __ blr();
goetz@6023 1714
goetz@6023 1715 return start;
goetz@6023 1716 }
goetz@6023 1717
goetz@6023 1718 // Generate core code for disjoint long copy (and oop copy on
goetz@6023 1719 // 64-bit). If "aligned" is true, the "from" and "to" addresses
goetz@6023 1720 // are assumed to be heapword aligned.
goetz@6023 1721 //
goetz@6023 1722 // Arguments:
goetz@6023 1723 // from: R3_ARG1
goetz@6023 1724 // to: R4_ARG2
goetz@6023 1725 // count: R5_ARG3 treated as signed
goetz@6023 1726 //
goetz@6023 1727 void generate_disjoint_long_copy_core(bool aligned) {
goetz@6023 1728 Register tmp1 = R6_ARG4;
goetz@6023 1729 Register tmp2 = R7_ARG5;
goetz@6023 1730 Register tmp3 = R8_ARG6;
goetz@6023 1731 Register tmp4 = R0;
goetz@6023 1732
goetz@6023 1733 Label l_1, l_2, l_3, l_4;
goetz@6023 1734
goetz@6023 1735 { // FasterArrayCopy
goetz@6023 1736 __ cmpwi(CCR0, R5_ARG3, 3);
goetz@6023 1737 __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
goetz@6023 1738
goetz@6023 1739 __ srdi(tmp1, R5_ARG3, 2);
goetz@6023 1740 __ andi_(R5_ARG3, R5_ARG3, 3);
goetz@6023 1741 __ mtctr(tmp1);
goetz@6023 1742
goetz@6023 1743 __ bind(l_4);
goetz@6023 1744 // Use unrolled version for mass copying (copy 4 elements at a time).
goetz@6023 1745 // Load feeding store gets zero latency on Power6, however not on Power5.
goetz@6023 1746 // Therefore, the following sequence is made for the good of both.
goetz@6023 1747 __ ld(tmp1, 0, R3_ARG1);
goetz@6023 1748 __ ld(tmp2, 8, R3_ARG1);
goetz@6023 1749 __ ld(tmp3, 16, R3_ARG1);
goetz@6023 1750 __ ld(tmp4, 24, R3_ARG1);
goetz@6023 1751 __ std(tmp1, 0, R4_ARG2);
goetz@6023 1752 __ std(tmp2, 8, R4_ARG2);
goetz@6023 1753 __ std(tmp3, 16, R4_ARG2);
goetz@6023 1754 __ std(tmp4, 24, R4_ARG2);
goetz@6023 1755 __ addi(R3_ARG1, R3_ARG1, 32);
goetz@6023 1756 __ addi(R4_ARG2, R4_ARG2, 32);
goetz@6023 1757 __ bdnz(l_4);
goetz@6023 1758 }
goetz@6023 1759
goetz@6023 1760 // copy 1 element at a time
goetz@6023 1761 __ bind(l_3);
goetz@6023 1762 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6023 1763 __ beq(CCR0, l_1);
goetz@6023 1764
goetz@6023 1765 { // FasterArrayCopy
goetz@6023 1766 __ mtctr(R5_ARG3);
goetz@6023 1767 __ addi(R3_ARG1, R3_ARG1, -8);
goetz@6023 1768 __ addi(R4_ARG2, R4_ARG2, -8);
goetz@6023 1769
goetz@6023 1770 __ bind(l_2);
goetz@6023 1771 __ ldu(R0, 8, R3_ARG1);
goetz@6023 1772 __ stdu(R0, 8, R4_ARG2);
goetz@6023 1773 __ bdnz(l_2);
goetz@6023 1774
goetz@6023 1775 }
goetz@6023 1776 __ bind(l_1);
goetz@6023 1777 }
goetz@6023 1778
goetz@6023 1779 // Generate stub for disjoint long copy. If "aligned" is true, the
goetz@6023 1780 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6023 1781 //
goetz@6023 1782 // Arguments for generated stub:
goetz@6023 1783 // from: R3_ARG1
goetz@6023 1784 // to: R4_ARG2
goetz@6023 1785 // count: R5_ARG3 treated as signed
goetz@6023 1786 //
goetz@6023 1787 address generate_disjoint_long_copy(bool aligned, const char * name) {
goetz@6023 1788 StubCodeMark mark(this, "StubRoutines", name);
goetz@6076 1789 address start = __ function_entry();
goetz@6023 1790 generate_disjoint_long_copy_core(aligned);
goetz@6023 1791 __ blr();
goetz@6023 1792
goetz@6023 1793 return start;
goetz@6023 1794 }
goetz@6023 1795
goetz@6023 1796 // Generate core code for conjoint long copy (and oop copy on
goetz@6023 1797 // 64-bit). If "aligned" is true, the "from" and "to" addresses
goetz@6023 1798 // are assumed to be heapword aligned.
goetz@6023 1799 //
goetz@6023 1800 // Arguments:
goetz@6023 1801 // from: R3_ARG1
goetz@6023 1802 // to: R4_ARG2
goetz@6023 1803 // count: R5_ARG3 treated as signed
goetz@6023 1804 //
goetz@6023 1805 void generate_conjoint_long_copy_core(bool aligned) {
goetz@6023 1806 Register tmp1 = R6_ARG4;
goetz@6023 1807 Register tmp2 = R7_ARG5;
goetz@6023 1808 Register tmp3 = R8_ARG6;
goetz@6023 1809 Register tmp4 = R0;
goetz@6023 1810
goetz@6023 1811 Label l_1, l_2, l_3, l_4, l_5;
goetz@6023 1812
goetz@6023 1813 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6023 1814 __ beq(CCR0, l_1);
goetz@6023 1815
goetz@6023 1816 { // FasterArrayCopy
goetz@6023 1817 __ sldi(R5_ARG3, R5_ARG3, 3);
goetz@6023 1818 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
goetz@6023 1819 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
goetz@6023 1820 __ srdi(R5_ARG3, R5_ARG3, 3);
goetz@6023 1821
goetz@6023 1822 __ cmpwi(CCR0, R5_ARG3, 3);
goetz@6023 1823 __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
goetz@6023 1824
goetz@6023 1825 __ srdi(tmp1, R5_ARG3, 2);
goetz@6023 1826 __ andi(R5_ARG3, R5_ARG3, 3);
goetz@6023 1827 __ mtctr(tmp1);
goetz@6023 1828
goetz@6023 1829 __ bind(l_4);
goetz@6023 1830 // Use unrolled version for mass copying (copy 4 elements at a time).
goetz@6023 1831 // Load feeding store gets zero latency on Power6, however not on Power5.
goetz@6023 1832 // Therefore, the following sequence is made for the good of both.
goetz@6023 1833 __ addi(R3_ARG1, R3_ARG1, -32);
goetz@6023 1834 __ addi(R4_ARG2, R4_ARG2, -32);
goetz@6023 1835 __ ld(tmp4, 24, R3_ARG1);
goetz@6023 1836 __ ld(tmp3, 16, R3_ARG1);
goetz@6023 1837 __ ld(tmp2, 8, R3_ARG1);
goetz@6023 1838 __ ld(tmp1, 0, R3_ARG1);
goetz@6023 1839 __ std(tmp4, 24, R4_ARG2);
goetz@6023 1840 __ std(tmp3, 16, R4_ARG2);
goetz@6023 1841 __ std(tmp2, 8, R4_ARG2);
goetz@6023 1842 __ std(tmp1, 0, R4_ARG2);
goetz@6023 1843 __ bdnz(l_4);
goetz@6023 1844
goetz@6023 1845 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6023 1846 __ beq(CCR0, l_1);
goetz@6023 1847
goetz@6023 1848 __ bind(l_5);
goetz@6023 1849 __ mtctr(R5_ARG3);
goetz@6023 1850 __ bind(l_3);
goetz@6023 1851 __ ld(R0, -8, R3_ARG1);
goetz@6023 1852 __ std(R0, -8, R4_ARG2);
goetz@6023 1853 __ addi(R3_ARG1, R3_ARG1, -8);
goetz@6023 1854 __ addi(R4_ARG2, R4_ARG2, -8);
goetz@6023 1855 __ bdnz(l_3);
goetz@6023 1856
goetz@6023 1857 }
goetz@6023 1858 __ bind(l_1);
goetz@6023 1859 }
goetz@6023 1860
goetz@6023 1861 // Generate stub for conjoint long copy. If "aligned" is true, the
goetz@6023 1862 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6023 1863 //
goetz@6023 1864 // Arguments for generated stub:
goetz@6023 1865 // from: R3_ARG1
goetz@6023 1866 // to: R4_ARG2
goetz@6023 1867 // count: R5_ARG3 treated as signed
goetz@6023 1868 //
goetz@6023 1869 address generate_conjoint_long_copy(bool aligned, const char * name) {
goetz@6023 1870 StubCodeMark mark(this, "StubRoutines", name);
goetz@6076 1871 address start = __ function_entry();
goetz@6023 1872
goetz@6076 1873 #if defined(ABI_ELFv2)
goetz@6076 1874 address nooverlap_target = aligned ?
goetz@6076 1875 StubRoutines::arrayof_jlong_disjoint_arraycopy() :
goetz@6076 1876 StubRoutines::jlong_disjoint_arraycopy();
goetz@6076 1877 #else
goetz@6023 1878 address nooverlap_target = aligned ?
goetz@6023 1879 ((FunctionDescriptor*)StubRoutines::arrayof_jlong_disjoint_arraycopy())->entry() :
goetz@6023 1880 ((FunctionDescriptor*)StubRoutines::jlong_disjoint_arraycopy())->entry();
goetz@6076 1881 #endif
goetz@6023 1882
goetz@6023 1883 array_overlap_test(nooverlap_target, 3);
goetz@6023 1884 generate_conjoint_long_copy_core(aligned);
goetz@6023 1885
goetz@6023 1886 __ blr();
goetz@6023 1887
goetz@6023 1888 return start;
goetz@6023 1889 }
goetz@6023 1890
goetz@6023 1891 // Generate stub for conjoint oop copy. If "aligned" is true, the
goetz@6023 1892 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6023 1893 //
goetz@6023 1894 // Arguments for generated stub:
goetz@6023 1895 // from: R3_ARG1
goetz@6023 1896 // to: R4_ARG2
goetz@6023 1897 // count: R5_ARG3 treated as signed
goetz@6023 1898 // dest_uninitialized: G1 support
goetz@6023 1899 //
goetz@6023 1900 address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
goetz@6023 1901 StubCodeMark mark(this, "StubRoutines", name);
goetz@6023 1902
goetz@6076 1903 address start = __ function_entry();
goetz@6023 1904
goetz@6076 1905 #if defined(ABI_ELFv2)
goetz@6076 1906 address nooverlap_target = aligned ?
goetz@6076 1907 StubRoutines::arrayof_oop_disjoint_arraycopy() :
goetz@6076 1908 StubRoutines::oop_disjoint_arraycopy();
goetz@6076 1909 #else
goetz@6023 1910 address nooverlap_target = aligned ?
goetz@6023 1911 ((FunctionDescriptor*)StubRoutines::arrayof_oop_disjoint_arraycopy())->entry() :
goetz@6023 1912 ((FunctionDescriptor*)StubRoutines::oop_disjoint_arraycopy())->entry();
goetz@6076 1913 #endif
goetz@6023 1914
goetz@6023 1915 gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
goetz@6023 1916
goetz@6023 1917 // Save arguments.
goetz@6023 1918 __ mr(R9_ARG7, R4_ARG2);
goetz@6023 1919 __ mr(R10_ARG8, R5_ARG3);
goetz@6023 1920
goetz@6023 1921 if (UseCompressedOops) {
goetz@6023 1922 array_overlap_test(nooverlap_target, 2);
goetz@6023 1923 generate_conjoint_int_copy_core(aligned);
goetz@6023 1924 } else {
goetz@6023 1925 array_overlap_test(nooverlap_target, 3);
goetz@6023 1926 generate_conjoint_long_copy_core(aligned);
goetz@6023 1927 }
goetz@6023 1928
goetz@6060 1929 gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
goetz@6023 1930 return start;
goetz@6023 1931 }
goetz@6023 1932
goetz@6023 1933 // Generate stub for disjoint oop copy. If "aligned" is true, the
goetz@6023 1934 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6023 1935 //
goetz@6023 1936 // Arguments for generated stub:
goetz@6023 1937 // from: R3_ARG1
goetz@6023 1938 // to: R4_ARG2
goetz@6023 1939 // count: R5_ARG3 treated as signed
goetz@6023 1940 // dest_uninitialized: G1 support
goetz@6023 1941 //
goetz@6023 1942 address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
goetz@6023 1943 StubCodeMark mark(this, "StubRoutines", name);
goetz@6076 1944 address start = __ function_entry();
goetz@6023 1945
goetz@6023 1946 gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
goetz@6023 1947
goetz@6023 1948 // Save some arguments; the disjoint copy cores destroy them.
goetz@6023 1949 // They are needed for the post barrier.
goetz@6023 1950 __ mr(R9_ARG7, R4_ARG2);
goetz@6023 1951 __ mr(R10_ARG8, R5_ARG3);
goetz@6023 1952
goetz@6023 1953 if (UseCompressedOops) {
goetz@6023 1954 generate_disjoint_int_copy_core(aligned);
goetz@6023 1955 } else {
goetz@6023 1956 generate_disjoint_long_copy_core(aligned);
goetz@6023 1957 }
goetz@6023 1958
goetz@6060 1959 gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
goetz@6023 1960
goetz@6023 1961 return start;
goetz@6023 1962 }
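  // Both oop copy stubs above share the same outline (a sketch of the emitted
  // code, not literal source): a GC pre barrier on the destination range,
  // an element copy using the int cores if UseCompressedOops (4-byte oops) or
  // the long cores otherwise (8-byte oops), and a GC post barrier (card
  // marking) on the copied range.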
goetz@6023 1963
simonis@8308 1964 // Arguments for generated stub (little endian only):
simonis@8308 1965 // R3_ARG1 - source byte array address
simonis@8308 1966 // R4_ARG2 - destination byte array address
simonis@8308 1967 // R5_ARG3 - round key array
simonis@8308 1968 address generate_aescrypt_encryptBlock() {
simonis@8308 1969 assert(UseAES, "need AES instructions");
simonis@8308 1970 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
simonis@8308 1971
simonis@8308 1972 address start = __ function_entry();
simonis@8308 1973
simonis@8308 1974 Label L_doLast;
simonis@8308 1975
simonis@8308 1976 Register from = R3_ARG1; // source array address
simonis@8308 1977 Register to = R4_ARG2; // destination array address
simonis@8308 1978 Register key = R5_ARG3; // round key array
simonis@8308 1979
simonis@8308 1980 Register keylen = R8;
simonis@8308 1981 Register temp = R9;
simonis@8308 1982 Register keypos = R10;
simonis@8308 1983 Register hex = R11;
simonis@8308 1984 Register fifteen = R12;
simonis@8308 1985
simonis@8308 1986 VectorRegister vRet = VR0;
simonis@8308 1987
simonis@8308 1988 VectorRegister vKey1 = VR1;
simonis@8308 1989 VectorRegister vKey2 = VR2;
simonis@8308 1990 VectorRegister vKey3 = VR3;
simonis@8308 1991 VectorRegister vKey4 = VR4;
simonis@8308 1992
simonis@8308 1993 VectorRegister fromPerm = VR5;
simonis@8308 1994 VectorRegister keyPerm = VR6;
simonis@8308 1995 VectorRegister toPerm = VR7;
simonis@8308 1996 VectorRegister fSplt = VR8;
simonis@8308 1997
simonis@8308 1998 VectorRegister vTmp1 = VR9;
simonis@8308 1999 VectorRegister vTmp2 = VR10;
simonis@8308 2000 VectorRegister vTmp3 = VR11;
simonis@8308 2001 VectorRegister vTmp4 = VR12;
simonis@8308 2002
simonis@8308 2003 VectorRegister vLow = VR13;
simonis@8308 2004 VectorRegister vHigh = VR14;
simonis@8308 2005
simonis@8308 2006 __ li (hex, 16);
simonis@8308 2007 __ li (fifteen, 15);
simonis@8308 2008 __ vspltisb (fSplt, 0x0f);
simonis@8308 2009
simonis@8308 2010 // load unaligned from[0-15] to vRet
simonis@8308 2011 __ lvx (vRet, from);
simonis@8308 2012 __ lvx (vTmp1, fifteen, from);
simonis@8308 2013 __ lvsl (fromPerm, from);
simonis@8308 2014 __ vxor (fromPerm, fromPerm, fSplt);
simonis@8308 2015 __ vperm (vRet, vRet, vTmp1, fromPerm);
simonis@8308 2016
simonis@8308 2017 // load keylen (44 or 52 or 60)
simonis@8308 2018 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
simonis@8308 2019
simonis@8308 2020 // Set up keyPerm, the permute control used below to load the round keys.
simonis@8308 2021 __ lvsr (keyPerm, key);
simonis@8308 2022 __ vxor (vTmp2, vTmp2, vTmp2);
simonis@8308 2023 __ vspltisb (vTmp2, -16);
simonis@8308 2024 __ vrld (keyPerm, keyPerm, vTmp2);
simonis@8308 2025 __ vrld (keyPerm, keyPerm, vTmp2);
simonis@8308 2026 __ vsldoi (keyPerm, keyPerm, keyPerm, -8);
simonis@8308 2027
simonis@8308 2028 // load the 1st round key to vKey1
simonis@8308 2029 __ li (keypos, 0);
simonis@8308 2030 __ lvx (vKey1, keypos, key);
simonis@8308 2031 __ addi (keypos, keypos, 16);
simonis@8308 2032 __ lvx (vTmp1, keypos, key);
simonis@8308 2033 __ vperm (vKey1, vTmp1, vKey1, keyPerm);
simonis@8308 2034
simonis@8308 2035 // 1st round
simonis@8308 2036 __ vxor (vRet, vRet, vKey1);
simonis@8308 2037
simonis@8308 2038 // load the 2nd round key to vKey1
simonis@8308 2039 __ addi (keypos, keypos, 16);
simonis@8308 2040 __ lvx (vTmp2, keypos, key);
simonis@8308 2041 __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
simonis@8308 2042
simonis@8308 2043 // load the 3rd round key to vKey2
simonis@8308 2044 __ addi (keypos, keypos, 16);
simonis@8308 2045 __ lvx (vTmp1, keypos, key);
simonis@8308 2046 __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
simonis@8308 2047
simonis@8308 2048 // load the 4th round key to vKey3
simonis@8308 2049 __ addi (keypos, keypos, 16);
simonis@8308 2050 __ lvx (vTmp2, keypos, key);
simonis@8308 2051 __ vperm (vKey3, vTmp2, vTmp1, keyPerm);
simonis@8308 2052
simonis@8308 2053 // load the 5th round key to vKey4
simonis@8308 2054 __ addi (keypos, keypos, 16);
simonis@8308 2055 __ lvx (vTmp1, keypos, key);
simonis@8308 2056 __ vperm (vKey4, vTmp1, vTmp2, keyPerm);
simonis@8308 2057
simonis@8308 2058 // 2nd - 5th rounds
simonis@8308 2059 __ vcipher (vRet, vRet, vKey1);
simonis@8308 2060 __ vcipher (vRet, vRet, vKey2);
simonis@8308 2061 __ vcipher (vRet, vRet, vKey3);
simonis@8308 2062 __ vcipher (vRet, vRet, vKey4);
simonis@8308 2063
simonis@8308 2064 // load the 6th round key to vKey1
simonis@8308 2065 __ addi (keypos, keypos, 16);
simonis@8308 2066 __ lvx (vTmp2, keypos, key);
simonis@8308 2067 __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
simonis@8308 2068
simonis@8308 2069 // load the 7th round key to vKey2
simonis@8308 2070 __ addi (keypos, keypos, 16);
simonis@8308 2071 __ lvx (vTmp1, keypos, key);
simonis@8308 2072 __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
simonis@8308 2073
simonis@8308 2074 // load the 8th round key to vKey3
simonis@8308 2075 __ addi (keypos, keypos, 16);
simonis@8308 2076 __ lvx (vTmp2, keypos, key);
simonis@8308 2077 __ vperm (vKey3, vTmp2, vTmp1, keyPerm);
simonis@8308 2078
simonis@8308 2079 // load the 9th round key to vKey4
simonis@8308 2080 __ addi (keypos, keypos, 16);
simonis@8308 2081 __ lvx (vTmp1, keypos, key);
simonis@8308 2082 __ vperm (vKey4, vTmp1, vTmp2, keyPerm);
simonis@8308 2083
simonis@8308 2084 // 6th - 9th rounds
simonis@8308 2085 __ vcipher (vRet, vRet, vKey1);
simonis@8308 2086 __ vcipher (vRet, vRet, vKey2);
simonis@8308 2087 __ vcipher (vRet, vRet, vKey3);
simonis@8308 2088 __ vcipher (vRet, vRet, vKey4);
simonis@8308 2089
simonis@8308 2090 // load the 10th round key to vKey1
simonis@8308 2091 __ addi (keypos, keypos, 16);
simonis@8308 2092 __ lvx (vTmp2, keypos, key);
simonis@8308 2093 __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
simonis@8308 2094
simonis@8308 2095 // load the 11th round key to vKey2
simonis@8308 2096 __ addi (keypos, keypos, 16);
simonis@8308 2097 __ lvx (vTmp1, keypos, key);
simonis@8308 2098 __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
simonis@8308 2099
simonis@8308 2100 // if all round keys are loaded, skip next 4 rounds
simonis@8308 2101 __ cmpwi (CCR0, keylen, 44);
simonis@8308 2102 __ beq (CCR0, L_doLast);
simonis@8308 2103
simonis@8308 2104 // 10th - 11th rounds
simonis@8308 2105 __ vcipher (vRet, vRet, vKey1);
simonis@8308 2106 __ vcipher (vRet, vRet, vKey2);
simonis@8308 2107
simonis@8308 2108 // load the 12th round key to vKey1
simonis@8308 2109 __ addi (keypos, keypos, 16);
simonis@8308 2110 __ lvx (vTmp2, keypos, key);
simonis@8308 2111 __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
simonis@8308 2112
simonis@8308 2113 // load the 13th round key to vKey2
simonis@8308 2114 __ addi (keypos, keypos, 16);
simonis@8308 2115 __ lvx (vTmp1, keypos, key);
simonis@8308 2116 __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
simonis@8308 2117
simonis@8308 2118 // if all round keys are loaded, skip next 2 rounds
simonis@8308 2119 __ cmpwi (CCR0, keylen, 52);
simonis@8308 2120 __ beq (CCR0, L_doLast);
simonis@8308 2121
simonis@8308 2122 // 12th - 13th rounds
simonis@8308 2123 __ vcipher (vRet, vRet, vKey1);
simonis@8308 2124 __ vcipher (vRet, vRet, vKey2);
simonis@8308 2125
simonis@8308 2126 // load the 14th round key to vKey1
simonis@8308 2127 __ addi (keypos, keypos, 16);
simonis@8308 2128 __ lvx (vTmp2, keypos, key);
simonis@8308 2129 __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
simonis@8308 2130
simonis@8308 2131 // load the 15th round key to vKey2
simonis@8308 2132 __ addi (keypos, keypos, 16);
simonis@8308 2133 __ lvx (vTmp1, keypos, key);
simonis@8308 2134 __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
simonis@8308 2135
simonis@8308 2136 __ bind(L_doLast);
simonis@8308 2137
simonis@8308 2138 // last two rounds
simonis@8308 2139 __ vcipher (vRet, vRet, vKey1);
simonis@8308 2140 __ vcipherlast (vRet, vRet, vKey2);
simonis@8308 2141
simonis@8308 2142 __ neg (temp, to);
simonis@8308 2143 __ lvsr (toPerm, temp);
simonis@8308 2144 __ vspltisb (vTmp2, -1);
simonis@8308 2145 __ vxor (vTmp1, vTmp1, vTmp1);
simonis@8308 2146 __ vperm (vTmp2, vTmp2, vTmp1, toPerm);
simonis@8308 2147 __ vxor (toPerm, toPerm, fSplt);
simonis@8308 2148 __ lvx (vTmp1, to);
simonis@8308 2149 __ vperm (vRet, vRet, vRet, toPerm);
simonis@8308 2150 __ vsel (vTmp1, vTmp1, vRet, vTmp2);
simonis@8308 2151 __ lvx (vTmp4, fifteen, to);
simonis@8308 2152 __ stvx (vTmp1, to);
simonis@8308 2153 __ vsel (vRet, vRet, vTmp4, vTmp2);
simonis@8308 2154 __ stvx (vRet, fifteen, to);
simonis@8308 2155
simonis@8308 2156 __ blr();
simonis@8308 2157 return start;
simonis@8308 2158 }
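  // The keylen values checked above (44, 52, 60 ints) identify AES-128/192/256,
  // whose expanded keys hold 4*(rounds+1) ints for 10/12/14 rounds. A tiny
  // reference helper (illustration only, not used by the stub; the name is
  // made up):
  static int aes_rounds_for_expanded_keylen(int keylen_in_ints) {
    return keylen_in_ints / 4 - 1;  // 44 -> 10 (AES-128), 52 -> 12 (AES-192), 60 -> 14 (AES-256)
  }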
simonis@8308 2159
simonis@8308 2160 // Arguments for generated stub (little endian only):
simonis@8308 2161 // R3_ARG1 - source byte array address
simonis@8308 2162 // R4_ARG2 - destination byte array address
simonis@8308 2163 // R5_ARG3 - K (key) in little endian int array
simonis@8308 2164 address generate_aescrypt_decryptBlock() {
simonis@8308 2165 assert(UseAES, "need AES instructions");
simonis@8308 2166 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
simonis@8308 2167
simonis@8308 2168 address start = __ function_entry();
simonis@8308 2169
simonis@8308 2170 Label L_doLast;
simonis@8308 2171 Label L_do44;
simonis@8308 2172 Label L_do52;
simonis@8308 2173 Label L_do60;
simonis@8308 2174
simonis@8308 2175 Register from = R3_ARG1; // source array address
simonis@8308 2176 Register to = R4_ARG2; // destination array address
simonis@8308 2177 Register key = R5_ARG3; // round key array
simonis@8308 2178
simonis@8308 2179 Register keylen = R8;
simonis@8308 2180 Register temp = R9;
simonis@8308 2181 Register keypos = R10;
simonis@8308 2182 Register hex = R11;
simonis@8308 2183 Register fifteen = R12;
simonis@8308 2184
simonis@8308 2185 VectorRegister vRet = VR0;
simonis@8308 2186
simonis@8308 2187 VectorRegister vKey1 = VR1;
simonis@8308 2188 VectorRegister vKey2 = VR2;
simonis@8308 2189 VectorRegister vKey3 = VR3;
simonis@8308 2190 VectorRegister vKey4 = VR4;
simonis@8308 2191 VectorRegister vKey5 = VR5;
simonis@8308 2192
simonis@8308 2193 VectorRegister fromPerm = VR6;
simonis@8308 2194 VectorRegister keyPerm = VR7;
simonis@8308 2195 VectorRegister toPerm = VR8;
simonis@8308 2196 VectorRegister fSplt = VR9;
simonis@8308 2197
simonis@8308 2198 VectorRegister vTmp1 = VR10;
simonis@8308 2199 VectorRegister vTmp2 = VR11;
simonis@8308 2200 VectorRegister vTmp3 = VR12;
simonis@8308 2201 VectorRegister vTmp4 = VR13;
simonis@8308 2202
simonis@8308 2203 VectorRegister vLow = VR14;
simonis@8308 2204 VectorRegister vHigh = VR15;
simonis@8308 2205
simonis@8308 2206 __ li (hex, 16);
simonis@8308 2207 __ li (fifteen, 15);
simonis@8308 2208 __ vspltisb (fSplt, 0x0f);
simonis@8308 2209
simonis@8308 2210 // load unaligned from[0-15] to vRet
simonis@8308 2211 __ lvx (vRet, from);
simonis@8308 2212 __ lvx (vTmp1, fifteen, from);
simonis@8308 2213 __ lvsl (fromPerm, from);
simonis@8308 2214 __ vxor (fromPerm, fromPerm, fSplt);
simonis@8308 2215 __ vperm (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
simonis@8308 2216
simonis@8308 2217 // load keylen (44 or 52 or 60)
simonis@8308 2218 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
simonis@8308 2219
simonis@8308 2220 // Set up keyPerm, the permute control used below to load the round keys.
simonis@8308 2221 __ lvsr (keyPerm, key);
simonis@8308 2222 __ vxor (vTmp2, vTmp2, vTmp2);
simonis@8308 2223 __ vspltisb (vTmp2, -16);
simonis@8308 2224 __ vrld (keyPerm, keyPerm, vTmp2);
simonis@8308 2225 __ vrld (keyPerm, keyPerm, vTmp2);
simonis@8308 2226 __ vsldoi (keyPerm, keyPerm, keyPerm, -8);
simonis@8308 2227
simonis@8308 2228 __ cmpwi (CCR0, keylen, 44);
simonis@8308 2229 __ beq (CCR0, L_do44);
simonis@8308 2230
simonis@8308 2231 __ cmpwi (CCR0, keylen, 52);
simonis@8308 2232 __ beq (CCR0, L_do52);
simonis@8308 2233
simonis@8308 2234 // load the 15th round key to vKey1
simonis@8308 2235 __ li (keypos, 240);
simonis@8308 2236 __ lvx (vTmp1, keypos, key);
simonis@8308 2237 __ addi (keypos, keypos, -16);
simonis@8308 2238 __ lvx (vTmp2, keypos, key);
simonis@8308 2239 __ vperm (vKey1, vTmp1, vTmp2, keyPerm);
simonis@8308 2240
simonis@8308 2241 // load the 14th round key to vKey2
simonis@8308 2242 __ addi (keypos, keypos, -16);
simonis@8308 2243 __ lvx (vTmp1, keypos, key);
simonis@8308 2244 __ vperm (vKey2, vTmp2, vTmp1, keyPerm);
simonis@8308 2245
simonis@8308 2246 // load the 13th round key to vKey3
simonis@8308 2247 __ addi (keypos, keypos, -16);
simonis@8308 2248 __ lvx (vTmp2, keypos, key);
simonis@8308 2249 __ vperm (vKey3, vTmp1, vTmp2, keyPerm);
simonis@8308 2250
simonis@8308 2251 // load the 12th round key to vKey4
simonis@8308 2252 __ addi (keypos, keypos, -16);
simonis@8308 2253 __ lvx (vTmp1, keypos, key);
simonis@8308 2254 __ vperm (vKey4, vTmp2, vTmp1, keyPerm);
simonis@8308 2255
simonis@8308 2256 // load the 11th round key to vKey5
simonis@8308 2257 __ addi (keypos, keypos, -16);
simonis@8308 2258 __ lvx (vTmp2, keypos, key);
simonis@8308 2259 __ vperm (vKey5, vTmp1, vTmp2, keyPerm);
simonis@8308 2260
simonis@8308 2261 // 1st - 5th rounds
simonis@8308 2262 __ vxor (vRet, vRet, vKey1);
simonis@8308 2263 __ vncipher (vRet, vRet, vKey2);
simonis@8308 2264 __ vncipher (vRet, vRet, vKey3);
simonis@8308 2265 __ vncipher (vRet, vRet, vKey4);
simonis@8308 2266 __ vncipher (vRet, vRet, vKey5);
simonis@8308 2267
simonis@8308 2268 __ b (L_doLast);
simonis@8308 2269
simonis@8308 2270 __ bind (L_do52);
simonis@8308 2271
simonis@8308 2272 // load the 13th round key to vKey1
simonis@8308 2273 __ li (keypos, 208);
simonis@8308 2274 __ lvx (vTmp1, keypos, key);
simonis@8308 2275 __ addi (keypos, keypos, -16);
simonis@8308 2276 __ lvx (vTmp2, keypos, key);
simonis@8308 2277 __ vperm (vKey1, vTmp1, vTmp2, keyPerm);
simonis@8308 2278
simonis@8308 2279 // load the 12th round key to vKey2
simonis@8308 2280 __ addi (keypos, keypos, -16);
simonis@8308 2281 __ lvx (vTmp1, keypos, key);
simonis@8308 2282 __ vperm (vKey2, vTmp2, vTmp1, keyPerm);
simonis@8308 2283
simonis@8308 2284 // load the 11th round key to vKey3
simonis@8308 2285 __ addi (keypos, keypos, -16);
simonis@8308 2286 __ lvx (vTmp2, keypos, key);
simonis@8308 2287 __ vperm (vKey3, vTmp1, vTmp2, keyPerm);
simonis@8308 2288
simonis@8308 2289 // 1st - 3rd rounds
simonis@8308 2290 __ vxor (vRet, vRet, vKey1);
simonis@8308 2291 __ vncipher (vRet, vRet, vKey2);
simonis@8308 2292 __ vncipher (vRet, vRet, vKey3);
simonis@8308 2293
simonis@8308 2294 __ b (L_doLast);
simonis@8308 2295
simonis@8308 2296 __ bind (L_do44);
simonis@8308 2297
simonis@8308 2298 // load the 11th round key to vKey1
simonis@8308 2299 __ li (keypos, 176);
simonis@8308 2300 __ lvx (vTmp1, keypos, key);
simonis@8308 2301 __ addi (keypos, keypos, -16);
simonis@8308 2302 __ lvx (vTmp2, keypos, key);
simonis@8308 2303 __ vperm (vKey1, vTmp1, vTmp2, keyPerm);
simonis@8308 2304
simonis@8308 2305 // 1st round
simonis@8308 2306 __ vxor (vRet, vRet, vKey1);
simonis@8308 2307
simonis@8308 2308 __ bind (L_doLast);
simonis@8308 2309
simonis@8308 2310 // load the 10th round key to vKey1
simonis@8308 2311 __ addi (keypos, keypos, -16);
simonis@8308 2312 __ lvx (vTmp1, keypos, key);
simonis@8308 2313 __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
simonis@8308 2314
simonis@8308 2315 // load the 9th round key to vKey2
simonis@8308 2316 __ addi (keypos, keypos, -16);
simonis@8308 2317 __ lvx (vTmp2, keypos, key);
simonis@8308 2318 __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
simonis@8308 2319
simonis@8308 2320 // load the 8th round key to vKey3
simonis@8308 2321 __ addi (keypos, keypos, -16);
simonis@8308 2322 __ lvx (vTmp1, keypos, key);
simonis@8308 2323 __ vperm (vKey3, vTmp2, vTmp1, keyPerm);
simonis@8308 2324
simonis@8308 2325 // load the 7th round key to vKey4
simonis@8308 2326 __ addi (keypos, keypos, -16);
simonis@8308 2327 __ lvx (vTmp2, keypos, key);
simonis@8308 2328 __ vperm (vKey4, vTmp1, vTmp2, keyPerm);
simonis@8308 2329
simonis@8308 2330 // load the 6th round key to vKey5
simonis@8308 2331 __ addi (keypos, keypos, -16);
simonis@8308 2332 __ lvx (vTmp1, keypos, key);
simonis@8308 2333 __ vperm (vKey5, vTmp2, vTmp1, keyPerm);
simonis@8308 2334
simonis@8308 2335 // last 10th - 6th rounds
simonis@8308 2336 __ vncipher (vRet, vRet, vKey1);
simonis@8308 2337 __ vncipher (vRet, vRet, vKey2);
simonis@8308 2338 __ vncipher (vRet, vRet, vKey3);
simonis@8308 2339 __ vncipher (vRet, vRet, vKey4);
simonis@8308 2340 __ vncipher (vRet, vRet, vKey5);
simonis@8308 2341
simonis@8308 2342 // load the 5th round key to vKey1
simonis@8308 2343 __ addi (keypos, keypos, -16);
simonis@8308 2344 __ lvx (vTmp2, keypos, key);
simonis@8308 2345 __ vperm (vKey1, vTmp1, vTmp2, keyPerm);
simonis@8308 2346
simonis@8308 2347 // load the 4th round key to vKey2
simonis@8308 2348 __ addi (keypos, keypos, -16);
simonis@8308 2349 __ lvx (vTmp1, keypos, key);
simonis@8308 2350 __ vperm (vKey2, vTmp2, vTmp1, keyPerm);
simonis@8308 2351
simonis@8308 2352 // load the 3rd round key to vKey3
simonis@8308 2353 __ addi (keypos, keypos, -16);
simonis@8308 2354 __ lvx (vTmp2, keypos, key);
simonis@8308 2355 __ vperm (vKey3, vTmp1, vTmp2, keyPerm);
simonis@8308 2356
simonis@8308 2357 // load the 2nd round key to vKey4
simonis@8308 2358 __ addi (keypos, keypos, -16);
simonis@8308 2359 __ lvx (vTmp1, keypos, key);
simonis@8308 2360 __ vperm (vKey4, vTmp2, vTmp1, keyPerm);
simonis@8308 2361
simonis@8308 2362 // load the 1st round key to vKey5
simonis@8308 2363 __ addi (keypos, keypos, -16);
simonis@8308 2364 __ lvx (vTmp2, keypos, key);
simonis@8308 2365 __ vperm (vKey5, vTmp1, vTmp2, keyPerm);
simonis@8308 2366
simonis@8308 2367 // last 5th - 1st rounds
simonis@8308 2368 __ vncipher (vRet, vRet, vKey1);
simonis@8308 2369 __ vncipher (vRet, vRet, vKey2);
simonis@8308 2370 __ vncipher (vRet, vRet, vKey3);
simonis@8308 2371 __ vncipher (vRet, vRet, vKey4);
simonis@8308 2372 __ vncipherlast (vRet, vRet, vKey5);
simonis@8308 2373
simonis@8308 2374 __ neg (temp, to);
simonis@8308 2375 __ lvsr (toPerm, temp);
simonis@8308 2376 __ vspltisb (vTmp2, -1);
simonis@8308 2377 __ vxor (vTmp1, vTmp1, vTmp1);
simonis@8308 2378 __ vperm (vTmp2, vTmp2, vTmp1, toPerm);
simonis@8308 2379 __ vxor (toPerm, toPerm, fSplt);
simonis@8308 2380 __ lvx (vTmp1, to);
simonis@8308 2381 __ vperm (vRet, vRet, vRet, toPerm);
simonis@8308 2382 __ vsel (vTmp1, vTmp1, vRet, vTmp2);
simonis@8308 2383 __ lvx (vTmp4, fifteen, to);
simonis@8308 2384 __ stvx (vTmp1, to);
simonis@8308 2385 __ vsel (vRet, vRet, vTmp4, vTmp2);
simonis@8308 2386 __ stvx (vRet, fifteen, to);
simonis@8308 2387
simonis@8308 2388 __ blr();
simonis@8308 2389 return start;
simonis@8308 2390 }
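  // Decryption above applies the expanded key in reverse order with vncipher /
  // vncipherlast. The initial keypos values (176, 208, 240) are simply
  // keylen * 4, i.e. the byte size of the expanded key for AES-128/192/256;
  // the last round key is fetched first and earlier keys follow in -16 byte
  // steps.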
simonis@8308 2391
goetz@6023 2392 void generate_arraycopy_stubs() {
goetz@6023 2393 // Note: the disjoint stubs must be generated first, some of
goetz@6023 2394 // the conjoint stubs use them.
goetz@6023 2395
goetz@6023 2396 // non-aligned disjoint versions
goetz@6023 2397 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
goetz@6023 2398 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
goetz@6023 2399 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
goetz@6023 2400 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
goetz@6023 2401 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
goetz@6023 2402 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
goetz@6023 2403
goetz@6023 2404 // aligned disjoint versions
goetz@6023 2405 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
goetz@6023 2406 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
goetz@6023 2407 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
goetz@6023 2408 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
goetz@6023 2409 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
goetz@6023 2410 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "oop_disjoint_arraycopy_uninit", true);
goetz@6023 2411
goetz@6023 2412 // non-aligned conjoint versions
goetz@6023 2413 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
goetz@6023 2414 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
goetz@6023 2415 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, "jint_arraycopy");
goetz@6023 2416 StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(false, "jlong_arraycopy");
goetz@6023 2417 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
goetz@6023 2418 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
goetz@6023 2419
goetz@6023 2420 // aligned conjoint versions
goetz@6023 2421 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
goetz@6023 2422 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
goetz@6023 2423 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
goetz@6023 2424 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
goetz@6023 2425 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
goetz@6023 2426 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", true);
goetz@6023 2427
goetz@6023 2428 // fill routines
goetz@6023 2429 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
goetz@6023 2430 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
goetz@6023 2431 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
goetz@6023 2432 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
goetz@6023 2433 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
goetz@6023 2434 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
goetz@6023 2435 }
goetz@6023 2436
goetz@6023 2437 // Safefetch stubs.
goetz@6023 2438 void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
goetz@6023 2439 // safefetch signatures:
goetz@6023 2440 // int SafeFetch32(int* adr, int errValue);
goetz@6023 2441 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
goetz@6023 2442 //
goetz@6023 2443 // arguments:
goetz@6023 2444 // R3_ARG1 = adr
goetz@6023 2445 // R4_ARG2 = errValue
goetz@6023 2446 //
goetz@6023 2447 // result:
goetz@6023 2448 // R3_RET = *adr or errValue
goetz@6023 2449
goetz@6023 2450 StubCodeMark mark(this, "StubRoutines", name);
goetz@6023 2451
goetz@6023 2452 // Entry point, pc or function descriptor.
goetz@6076 2453 *entry = __ function_entry();
goetz@6023 2454
goetz@6023 2455 // Load *adr into R4_ARG2, may fault.
goetz@6023 2456 *fault_pc = __ pc();
goetz@6023 2457 switch (size) {
goetz@6023 2458 case 4:
goetz@6023 2459 // int32_t, signed extended
goetz@6023 2460 __ lwa(R4_ARG2, 0, R3_ARG1);
goetz@6023 2461 break;
goetz@6023 2462 case 8:
goetz@6023 2463 // int64_t
goetz@6023 2464 __ ld(R4_ARG2, 0, R3_ARG1);
goetz@6023 2465 break;
goetz@6023 2466 default:
goetz@6023 2467 ShouldNotReachHere();
goetz@6023 2468 }
goetz@6023 2469
goetz@6023 2470 // return errValue or *adr
goetz@6023 2471 *continuation_pc = __ pc();
goetz@6023 2472 __ mr(R3_RET, R4_ARG2);
goetz@6023 2473 __ blr();
goetz@6023 2474 }
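    // Hypothetical usage sketch (the 'addr' variable is an assumption; the
    // SafeFetch32/SafeFetchN wrappers that dispatch to these stubs already
    // exist elsewhere in HotSpot):
    //
    //   int      v = SafeFetch32((int*)      addr, -1);  // -1 instead of a fault
    //   intptr_t p = SafeFetchN ((intptr_t*) addr,  0);  // 0 if addr is unmapped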
goetz@6023 2475
gromero@8979 2476 /**
gromero@8979 2477 * Arguments:
gromero@8979 2478 *
gromero@8979 2479 * Inputs:
gromero@8979 2480 * R3_ARG1 - int crc
gromero@8979 2481 * R4_ARG2 - byte* buf
gromero@8979 2482 * R5_ARG3 - int length (of buffer)
gromero@8979 2483 *
gromero@8979 2484 * scratch:
gromero@8979 2485 * R6_ARG4 - crc table address
gromero@8979 2486 * R7_ARG5 - tmp1
gromero@8979 2487 * R8_ARG6 - tmp2
gromero@8979 2488 *
gromero@8979 2489 * Output:
gromero@8979 2490 * R3_RET - int crc result
gromero@8979 2491 */
gromero@8979 2492 // Compute CRC32 function.
gromero@8979 2493 address generate_CRC32_updateBytes(const char* name) {
gromero@8979 2494 __ align(CodeEntryAlignment);
gromero@8979 2495 StubCodeMark mark(this, "StubRoutines", name);
gromero@8979 2496 address start = __ function_entry(); // Remember stub start address (is rtn value).
gromero@8979 2497
gromero@8979 2498 // arguments to kernel_crc32:
gromero@8979 2499 Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
gromero@8979 2500 Register data = R4_ARG2; // source byte array
gromero@8979 2501 Register dataLen = R5_ARG3; // #bytes to process
gromero@8979 2502 Register table = R6_ARG4; // crc table address
gromero@8979 2503
gromero@8979 2504 Register t0 = R9; // work reg for kernel* emitters
gromero@8979 2505 Register t1 = R10; // work reg for kernel* emitters
gromero@8979 2506 Register t2 = R11; // work reg for kernel* emitters
gromero@8979 2507 Register t3 = R12; // work reg for kernel* emitters
gromero@8979 2508
gromero@8979 2509 BLOCK_COMMENT("Stub body {");
gromero@8979 2510 assert_different_registers(crc, data, dataLen, table);
gromero@8979 2511
gromero@8979 2512 StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
gromero@8979 2513
gromero@8979 2514 __ kernel_crc32_1byte(crc, data, dataLen, table, t0, t1, t2, t3);
gromero@8979 2515
gromero@8979 2516 BLOCK_COMMENT("return");
gromero@8979 2517 __ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
gromero@8979 2518 __ blr();
gromero@8979 2519
gromero@8979 2520 BLOCK_COMMENT("} Stub body");
gromero@8979 2521 return start;
gromero@8979 2522 }
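  // What kernel_crc32_1byte computes corresponds to the classic byte-at-a-time,
  // table-driven CRC-32 (reflected 0xEDB88320 polynomial, 256-entry table).
  // Reference sketch only -- it is not used by the stub and the helper name is
  // made up:
  static juint crc32_reference(juint crc, const jbyte* buf, int len, const juint* table) {
    crc = ~crc;                                   // CRC-32 operates on the inverted value
    for (int i = 0; i < len; i++) {
      crc = (crc >> 8) ^ table[(crc ^ (jubyte)buf[i]) & 0xff];
    }
    return ~crc;
  }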
gromero@8979 2523
goetz@6023 2524 // Initialization
goetz@6023 2525 void generate_initial() {
goetz@6023 2526 // Generates all stubs and initializes the entry points
goetz@6023 2527
goetz@6023 2528 // Entry points that exist in all platforms.
goetz@6023 2529 // Note: This is code that could be shared among different platforms - however the
goetz@6023 2530 // benefit seems to be smaller than the disadvantage of having a
goetz@6023 2531 // much more complicated generator structure. See also comment in
goetz@6023 2532 // stubRoutines.hpp.
goetz@6023 2533
goetz@6023 2534 StubRoutines::_forward_exception_entry = generate_forward_exception();
goetz@6023 2535 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
goetz@6023 2536 StubRoutines::_catch_exception_entry = generate_catch_exception();
goetz@6066 2537
goetz@6066 2538 // Build this early so it's available for the interpreter.
goetz@6066 2539 StubRoutines::_throw_StackOverflowError_entry =
goetz@6066 2540 generate_throw_exception("StackOverflowError throw_exception",
goetz@6066 2541 CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
gromero@8979 2542
gromero@8979 2543 // CRC32 Intrinsics.
gromero@8979 2544 if (UseCRC32Intrinsics) {
gromero@8979 2545 StubRoutines::_crc_table_adr = (address)StubRoutines::ppc64::_crc_table;
gromero@8979 2546 StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes("CRC32_updateBytes");
gromero@8979 2547 }
goetz@6023 2548 }
goetz@6023 2549
goetz@6023 2550 void generate_all() {
goetz@6023 2551 // Generates all stubs and initializes the entry points
goetz@6023 2552
goetz@6023 2553 // These entry points require SharedInfo::stack0 to be set up in
goetz@6023 2554 // non-core builds
goetz@6023 2555 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
goetz@6023 2556 // Handle IncompatibleClassChangeError in itable stubs.
goetz@6023 2557 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false);
goetz@6023 2558 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
goetz@6023 2559
goetz@6023 2560 StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
goetz@6023 2561
goetz@6023 2562 // support for verify_oop (must happen after universe_init)
goetz@6023 2563 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
goetz@6023 2564
goetz@6023 2565 // arraycopy stubs used by compilers
goetz@6023 2566 generate_arraycopy_stubs();
goetz@6023 2567
goetz@6077 2568 // Safefetch stubs.
goetz@6023 2569 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
goetz@6023 2570 &StubRoutines::_safefetch32_fault_pc,
goetz@6023 2571 &StubRoutines::_safefetch32_continuation_pc);
goetz@6023 2572 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
goetz@6023 2573 &StubRoutines::_safefetchN_fault_pc,
goetz@6023 2574 &StubRoutines::_safefetchN_continuation_pc);
simonis@8308 2575
simonis@8308 2576 if (UseAESIntrinsics) {
simonis@8308 2577 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
simonis@8308 2578 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
simonis@8308 2579 }
simonis@8308 2580
mdoerr@8595 2581 if (UseMontgomeryMultiplyIntrinsic) {
mdoerr@8595 2582 StubRoutines::_montgomeryMultiply
mdoerr@8595 2583 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
mdoerr@8595 2584 }
mdoerr@8595 2585 if (UseMontgomerySquareIntrinsic) {
mdoerr@8595 2586 StubRoutines::_montgomerySquare
mdoerr@8595 2587 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
mdoerr@8595 2588 }
goetz@6023 2589 }
goetz@6023 2590
goetz@6023 2591 public:
goetz@6023 2592 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
goetz@6023 2593 // replace the standard masm with a special one:
goetz@6023 2594 _masm = new MacroAssembler(code);
goetz@6023 2595 if (all) {
goetz@6023 2596 generate_all();
goetz@6023 2597 } else {
goetz@6023 2598 generate_initial();
goetz@6023 2599 }
goetz@6023 2600 }
goetz@6023 2601 };
goetz@6023 2602
goetz@6023 2603 void StubGenerator_generate(CodeBuffer* code, bool all) {
goetz@6023 2604 StubGenerator g(code, all);
goetz@6023 2605 }