changeset 47522:03ce88449e9d

Merge
author jwilhelm
date Mon, 25 Sep 2017 19:54:58 +0000
parents ae970828ec0c 122833427b36
children ceafc169d2a4
files
diffstat 7 files changed, 334 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/src/hotspot/cpu/ppc/assembler_ppc.hpp	Mon Sep 25 20:32:44 2017 +0530
+++ b/src/hotspot/cpu/ppc/assembler_ppc.hpp	Mon Sep 25 19:54:58 2017 +0000
@@ -1308,6 +1308,7 @@
   inline void li(   Register d, int si16);
   inline void lis(  Register d, int si16);
   inline void addir(Register d, int si16, Register a);
+  inline void subi( Register d, Register a, int si16);  // d = a - si16; implemented as addi(d, a, -si16).
 
   static bool is_addi(int x) {
      return ADDI_OPCODE == (x & ADDI_OPCODE_MASK);
--- a/src/hotspot/cpu/ppc/assembler_ppc.inline.hpp	Mon Sep 25 20:32:44 2017 +0530
+++ b/src/hotspot/cpu/ppc/assembler_ppc.inline.hpp	Mon Sep 25 19:54:58 2017 +0000
@@ -164,6 +164,7 @@
 inline void Assembler::li(   Register d, int si16)             { Assembler::addi_r0ok( d, R0, si16); }
 inline void Assembler::lis(  Register d, int si16)             { Assembler::addis_r0ok(d, R0, si16); }
 inline void Assembler::addir(Register d, int si16, Register a) { Assembler::addi(d, a, si16); }
+inline void Assembler::subi( Register d, Register a, int si16) { Assembler::addi(d, a, -si16); } // Subtract immediate via addi with the negated constant. NOTE(review): -si16 must itself fit in 16 signed bits (si16 != -32768) -- confirm addi range-checks its immediate.
 
 // PPC 1, section 3.3.9, Fixed-Point Compare Instructions
 inline void Assembler::cmpi(  ConditionRegister f, int l, Register a, int si16)   { emit_int32( CMPI_OPCODE  | bf(f) | l10(l) | ra(a) | simm(si16,16)); }
--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Mon Sep 25 20:32:44 2017 +0530
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Mon Sep 25 19:54:58 2017 +0000
@@ -129,7 +129,7 @@
   }
 }
 
-int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
+address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
   const int offset = MacroAssembler::offset_to_global_toc(addr);
 
   const address inst2_addr = a;
@@ -155,7 +155,7 @@
   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
-  return (int)((intptr_t)addr - (intptr_t)inst1_addr);
+  return inst1_addr;
 }
 
 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
@@ -201,7 +201,7 @@
 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 //    ori rx = rx | const.lo
 // Clrldi will be passed by.
-int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
+address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
   assert(UseCompressedOops, "Should only patch compressed oops");
 
   const address inst2_addr = a;
@@ -227,7 +227,7 @@
 
   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
   set_imm((int *)inst2_addr,        (xd)); // unsigned int
-  return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
+  return inst1_addr;
 }
 
 // Get compressed oop or klass constant.
@@ -5234,6 +5234,40 @@
   bind(L_post_third_loop_done);
 }   // multiply_128_x_128_loop
 
+void MacroAssembler::muladd(Register out, Register in,                      // Emits a loop computing out[] += in[] * k on 32-bit words, from high to low addresses.
+                            Register offset, Register len, Register k,      // offset: byte offset into out (first word touched is out + offset - 4); len: word count of in. Both are clobbered.
+                            Register tmp1, Register tmp2, Register carry) { // tmp1/tmp2: scratch; carry: final carry-out (0 when len <= 0). Also uses CCR0 and CTR.
+
+  // Labels
+  Label LOOP, SKIP;
+
+  // Make sure length is positive.
+  cmpdi  (CCR0,    len,     0);
+
+  // Prepare variables
+  subi   (offset,  offset,  4);          // offset now addresses the first out word to update
+  li     (carry,   0);                   // no carry-in; also the result when the loop is skipped
+  ble    (CCR0,    SKIP);                // len <= 0: nothing to do
+
+  mtctr  (len);                          // CTR = number of words to process
+  subi   (len,     len,     1    );
+  sldi   (len,     len,     2    );      // len = (len - 1) * 4: byte offset of the last in word
+
+  // Main loop
+  // NOTE(review): the 64-bit sum below only stays exact if k is a zero-extended 32-bit value;
+  // the callers visible in this change clear/zero-extend it -- confirm for any new caller.
+  bind(LOOP);
+  lwzx   (tmp1,    len,     in   );      // tmp1 = current in word
+  lwzx   (tmp2,    offset,  out  );      // tmp2 = current out word
+  mulld  (tmp1,    tmp1,    k    );      // tmp1 = in word * k
+  add    (tmp2,    carry,   tmp2 );      // tmp2 = out word + carry
+  add    (tmp2,    tmp1,    tmp2 );      // tmp2 = in word * k + out word + carry
+  stwx   (tmp2,    offset,  out  );      // write the low 32 bits back to out
+  srdi   (carry,   tmp2,    32   );      // carry = upper 32 bits of the sum
+  subi   (offset,  offset,  4    );      // step both arrays one word towards lower addresses
+  subi   (len,     len,     4    );
+  bdnz   (LOOP);                         // decrement CTR, loop while it is non-zero
+  bind(SKIP);
+}
+
 void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                      Register y, Register ylen,
                                      Register z, Register zlen,
--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp	Mon Sep 25 20:32:44 2017 +0530
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp	Mon Sep 25 19:54:58 2017 +0000
@@ -105,13 +105,15 @@
   };
 
   inline static bool is_calculate_address_from_global_toc_at(address a, address bound);
-  static int patch_calculate_address_from_global_toc_at(address a, address addr, address bound);
+  // Returns address of first instruction in sequence.
+  static address patch_calculate_address_from_global_toc_at(address a, address bound, address addr);
   static address get_address_of_calculate_address_from_global_toc_at(address a, address addr);
 
 #ifdef _LP64
   // Patch narrow oop constant.
   inline static bool is_set_narrow_oop(address a, address bound);
-  static int patch_set_narrow_oop(address a, address bound, narrowOop data);
+  // Returns address of first instruction in sequence.
+  static address patch_set_narrow_oop(address a, address bound, narrowOop data);
   static narrowOop get_narrow_oop(address a, address bound);
 #endif
 
@@ -813,6 +815,8 @@
                                Register yz_idx, Register idx, Register carry,
                                Register product_high, Register product,
                                Register carry2, Register tmp);
+  void muladd(Register out, Register in, Register offset, Register len, Register k, // out[] += in[] * k over len 32-bit words; offset is a byte offset into out.
+              Register tmp1, Register tmp2, Register carry);                        // Clobbers offset, len, tmp1, tmp2, CCR0, CTR; carry-out is left in carry.
   void multiply_to_len(Register x, Register xlen,
                        Register y, Register ylen,
                        Register z, Register zlen,
--- a/src/hotspot/cpu/ppc/nativeInst_ppc.cpp	Mon Sep 25 20:32:44 2017 +0530
+++ b/src/hotspot/cpu/ppc/nativeInst_ppc.cpp	Mon Sep 25 19:54:58 2017 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -221,13 +221,13 @@
     // A calculation relative to the global TOC.
     if (MacroAssembler::get_address_of_calculate_address_from_global_toc_at(addr, cb->content_begin()) !=
         (address)data) {
-      const int invalidated_range =
-        MacroAssembler::patch_calculate_address_from_global_toc_at(addr, cb->content_begin(),
+      const address inst2_addr = addr;
+      const address inst1_addr =
+        MacroAssembler::patch_calculate_address_from_global_toc_at(inst2_addr, cb->content_begin(),
                                                                    (address)data);
-      const address start = invalidated_range < 0 ? addr + invalidated_range : addr;
-      // FIXME:
-      const int range = invalidated_range < 0 ? 4 - invalidated_range : 8;
-      ICache::ppc64_flush_icache_bytes(start, range);
+      assert(inst1_addr != NULL && inst1_addr < inst2_addr, "first instruction must be found");
+      const int range = inst2_addr - inst1_addr + BytesPerInstWord;
+      ICache::ppc64_flush_icache_bytes(inst1_addr, range);
     }
     next_address = addr + 1 * BytesPerInstWord;
   } else if (MacroAssembler::is_load_const_at(addr)) {
@@ -288,15 +288,15 @@
 }
 
 void NativeMovConstReg::set_narrow_oop(narrowOop data, CodeBlob *code /* = NULL */) {
-  address   addr = addr_at(0);
+  address   inst2_addr = addr_at(0); // handed to patch_set_narrow_oop as the second (last) instruction of the sequence
   CodeBlob* cb = (code) ? code : CodeCache::find_blob(instruction_address());
-  if (MacroAssembler::get_narrow_oop(addr, cb->content_begin()) == (long)data) return;
-  const int invalidated_range =
-    MacroAssembler::patch_set_narrow_oop(addr, cb->content_begin(), (long)data);
-  const address start = invalidated_range < 0 ? addr + invalidated_range : addr;
-  // FIXME:
-  const int range = invalidated_range < 0 ? 4 - invalidated_range : 8;
-  ICache::ppc64_flush_icache_bytes(start, range);
+  if (MacroAssembler::get_narrow_oop(inst2_addr, cb->content_begin()) == (long)data)
+    return; // constant already has the requested value: nothing to patch or flush
+  const address inst1_addr = // patcher returns the address of the first instruction it rewrote
+    MacroAssembler::patch_set_narrow_oop(inst2_addr, cb->content_begin(), (long)data);
+  assert(inst1_addr != NULL && inst1_addr < inst2_addr, "first instruction must be found");
+  const int range = inst2_addr - inst1_addr + BytesPerInstWord; // bytes from inst1 through inst2 inclusive
+  ICache::ppc64_flush_icache_bytes(inst1_addr, range); // flush exactly the span containing both patched instructions
 }
 
 // Do not use an assertion here. Let clients decide whether they only
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp	Mon Sep 25 20:32:44 2017 +0530
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp	Mon Sep 25 19:54:58 2017 +0000
@@ -3306,6 +3306,267 @@
       BLOCK_COMMENT("} Stub body");
   }
 
+  /**
+  *  Generates the "mulAdd" stub: normalizes the 32-bit arguments, emits
+  *  MacroAssembler::muladd and returns the stub's function entry address.
+  *  Input:
+  *   R3_ARG1    - out address
+  *   R4_ARG2    - in address
+  *   R5_ARG3    - offset (int; multiplied by 4 below before being passed on)
+  *   R6_ARG4    - len (int; upper 32 bits cleared below)
+  *   R7_ARG5    - k (int; upper 32 bits cleared below)
+  *  Output:
+  *   R3_RET     - carry
+  */
+  address generate_mulAdd() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "mulAdd");
+
+    address start = __ function_entry();
+
+    // C2 does not sign extend signed parameters to full 64 bits registers:
+    __ rldic (R5_ARG3, R5_ARG3, 2, 32);  // offset << 2 with the upper 32 bits cleared (treated as non-negative)
+    __ clrldi(R6_ARG4, R6_ARG4, 32);     // force zero bits on higher word
+    __ clrldi(R7_ARG5, R7_ARG5, 32);     // force zero bits on higher word
+
+    __ muladd(R3_ARG1, R4_ARG2, R5_ARG3, R6_ARG4, R7_ARG5, R8, R9, R10); // R8/R9 are scratch, R10 receives the carry
+
+    // Moves output carry to return register
+    __ mr    (R3_RET,  R10);
+
+    __ blr();
+
+    return start;
+  }
+
+  /**
+  *  Generates the "squareToLen" stub, which squares the word array 'in' into 'out'.
+  *  The generated stub returns the out address in R3_RET.
+  *  Input:
+  *   R3_ARG1    - in address
+  *   R4_ARG2    - in length (in 32-bit words)
+  *   R5_ARG3    - out address
+  *   R6_ARG4    - out length (in 32-bit words)
+  */
+  address generate_squareToLen() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "squareToLen");
+
+    address start = __ function_entry();
+
+    // Arguments: the lengths are ints, so their upper 32 bits are cleared (zero-extended) before use.
+    const Register in        = R3_ARG1;
+    const Register in_len    = R4_ARG2;
+    __ clrldi(in_len, in_len, 32);
+    const Register out       = R5_ARG3;
+    const Register out_len   = R6_ARG4;
+    __ clrldi(out_len, out_len, 32);
+
+    // output
+    const Register ret       = R3_RET;
+
+    // temporaries (R14..R27 are saved below and restored before returning)
+    const Register lplw_s    = R7;   // LSB of the previous product, moved to bit 63
+    const Register in_aux    = R8;   // running pointer into in (pre-biased for lwzu)
+    const Register out_aux   = R9;   // running pointer into out (pre-biased for stdu)
+    const Register piece     = R10;  // current in word
+    const Register product   = R14;  // piece * piece
+    const Register lplw      = R15;  // previous product (only its lowest bit is used)
+    const Register i_minus1  = R16;  // diagonal loop counter, in_len - 1 down to 0
+    const Register carry     = R17;  // carry-out of muladd
+    const Register offset    = R18;  // byte offset that grows by 8 per diagonal iteration
+    const Register off_aux   = R19;  // byte offset into out
+    const Register t         = R20;  // multiplier in[i_minus1]
+    const Register mlen      = R21;  // byte length of the current multiplicand
+    const Register len       = R22;  // word length passed to muladd
+    const Register a         = R23;  // scratch
+    const Register b         = R24;  // scratch
+    const Register i         = R25;  // byte index in the left-shift loop / scratch
+    const Register c         = R26;  // scratch
+    const Register cs        = R27;  // scratch
+
+    // Labels
+    Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_MULADD, SKIP_LOOP_SQUARE;
+    Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_MULADD, LOOP_SQUARE;
+
+    // Save non-volatile regs (frameless): stored below SP at offsets -8 (R28) down to -120 (R14).
+    int current_offs = -8;
+    __ std(R28, current_offs, R1_SP); current_offs -= 8; // NOTE(review): R28 is saved/restored but not bound to any temporary above
+    __ std(R27, current_offs, R1_SP); current_offs -= 8;
+    __ std(R26, current_offs, R1_SP); current_offs -= 8;
+    __ std(R25, current_offs, R1_SP); current_offs -= 8;
+    __ std(R24, current_offs, R1_SP); current_offs -= 8;
+    __ std(R23, current_offs, R1_SP); current_offs -= 8;
+    __ std(R22, current_offs, R1_SP); current_offs -= 8;
+    __ std(R21, current_offs, R1_SP); current_offs -= 8;
+    __ std(R20, current_offs, R1_SP); current_offs -= 8;
+    __ std(R19, current_offs, R1_SP); current_offs -= 8;
+    __ std(R18, current_offs, R1_SP); current_offs -= 8;
+    __ std(R17, current_offs, R1_SP); current_offs -= 8;
+    __ std(R16, current_offs, R1_SP); current_offs -= 8;
+    __ std(R15, current_offs, R1_SP); current_offs -= 8;
+    __ std(R14, current_offs, R1_SP);
+
+    // Store the squares, right shifted one bit (i.e., divided by 2)
+    __ subi   (out_aux,   out,       8);             // pre-bias: the stdu below advances by 8 before storing
+    __ subi   (in_aux,    in,        4);             // pre-bias: the lwzu below advances by 4 before loading
+    __ cmpwi  (CCR0,      in_len,    0);
+    // Initialize lplw outside of the loop
+    __ xorr   (lplw,      lplw,      lplw);          // lplw = 0
+    __ ble    (CCR0,      SKIP_LOOP_SQUARE);    // in_len <= 0
+    __ mtctr  (in_len);                              // one iteration per in word
+
+    __ bind(LOOP_SQUARE);
+    __ lwzu   (piece,     4,         in_aux);        // piece = next in word, in_aux += 4
+    __ mulld  (product,   piece,     piece);         // product = piece^2
+    // shift left 63 bits and only keep the MSB
+    __ rldic  (lplw_s,    lplw,      63, 0);
+    __ mr     (lplw,      product);                  // remember the unshifted product for the next iteration
+    // shift right 1 bit without sign extension
+    __ srdi   (product,   product,   1);
+    // join them to the same register and store it
+    __ orr    (product,   lplw_s,    product);
+#ifdef VM_LITTLE_ENDIAN
+    // Swap low and high words for little endian
+    __ rldicl (product,   product,   32, 0);
+#endif
+    __ stdu   (product,   8,         out_aux);       // store 8 bytes (two out words), out_aux += 8
+    __ bdnz   (LOOP_SQUARE);
+
+    __ bind(SKIP_LOOP_SQUARE);
+
+    // Add in off-diagonal sums
+    __ cmpwi  (CCR0,      in_len,    0);
+    __ ble    (CCR0,      SKIP_DIAGONAL_SUM);
+    // Avoid CTR usage here in order to use it at mulAdd
+    __ subi   (i_minus1,  in_len,    1);             // start at the last in word
+    __ li     (offset,    4);
+
+    __ bind(LOOP_DIAGONAL_SUM);
+
+    __ sldi   (off_aux,   out_len,   2);
+    __ sub    (off_aux,   off_aux,   offset);        // off_aux = out_len*4 - offset (bytes)
+
+    __ mr     (len,       i_minus1);                 // muladd clobbers its len and offset arguments, so pass copies
+    __ sldi   (mlen,      i_minus1,  2);             // mlen = i_minus1 * 4 (bytes)
+    __ lwzx   (t,         in,        mlen);          // t = in[i_minus1]
+
+    __ muladd (out, in, off_aux, len, t, a, b, carry); // out[...] += in[0..i_minus1) * t; carry-out in carry (0 when len == 0)
+
+    // begin<addOne>
+    // off_aux = out_len*4 - 4 - mlen - offset*4 - 4;
+    // (as computed below, in bytes and with mlen already bumped by 4: off_aux = out_len*4 - mlen - offset)
+    __ addi   (mlen,      mlen,      4);
+    __ sldi   (a,         out_len,   2);
+    __ subi   (a,         a,         4);
+    __ sub    (a,         a,         mlen);
+    __ subi   (off_aux,   offset,    4);
+    __ sub    (off_aux,   a,         off_aux);
+
+    __ lwzx   (b,         off_aux,   out);
+    __ add    (b,         b,         carry);         // add the muladd carry into the next out word
+    __ stwx   (b,         off_aux,   out);
+
+    // if (((uint64_t)s >> 32) != 0) {
+    __ srdi_  (a,         b,         32);            // record form: the beq below tests the shifted result against zero
+    __ beq    (CCR0,      SKIP_ADDONE);
+
+    // while (--mlen >= 0) {
+    // NOTE(review): the emitted test leaves the loop when mlen reaches exactly 0 (beq), not when it goes negative.
+    __ bind(LOOP_ADDONE);
+    __ subi   (mlen,      mlen,      4);
+    __ cmpwi  (CCR0,      mlen,      0);
+    __ beq    (CCR0,      SKIP_ADDONE);
+
+    // if (--off_aux < 0) { // Carry out of number
+    __ subi   (off_aux,   off_aux,   4);
+    __ cmpwi  (CCR0,      off_aux,   0);
+    __ blt    (CCR0,      SKIP_ADDONE);
+
+    // } else {
+    __ lwzx   (b,         off_aux,   out);
+    __ addi   (b,         b,         1);             // propagate the carry into this word
+    __ stwx   (b,         off_aux,   out);
+    __ cmpwi  (CCR0,      b,         0);             // low 32 bits wrapped to 0 => carry continues
+    __ bne    (CCR0,      SKIP_ADDONE);
+    __ b      (LOOP_ADDONE);
+
+    __ bind(SKIP_ADDONE);
+    // } } } end<addOne>
+
+    __ addi   (offset,    offset,    8);
+    __ subi   (i_minus1,  i_minus1,  1);
+    __ cmpwi  (CCR0,      i_minus1,  0);
+    __ bge    (CCR0,      LOOP_DIAGONAL_SUM);        // iterate i_minus1 = in_len-1 .. 0 inclusive
+
+    __ bind(SKIP_DIAGONAL_SUM);
+
+    // Shift back up and set low bit
+    // Shifts 1 bit left up to len positions. Assumes no leading zeros
+    // begin<primitiveLeftShift>
+    __ cmpwi  (CCR0,      out_len,   0);
+    __ ble    (CCR0,      SKIP_LSHIFT);
+    __ li     (i,         0);
+    __ lwz    (c,         0,         out);           // c = out[0]
+    __ subi   (b,         out_len,   1);
+    __ mtctr  (b);                                   // NOTE(review): CTR = out_len - 1, which is 0 if out_len == 1 -- presumably callers never pass that; confirm
+
+    __ bind(LOOP_LSHIFT);
+    __ mr     (b,         c);                        // b = current word
+    __ addi   (cs,        i,         4);
+    __ lwzx   (c,         out,       cs);            // c = next word
+
+    __ sldi   (b,         b,         1);
+    __ srwi   (cs,        c,         31);            // top bit of the next word
+    __ orr    (b,         b,         cs);            // b = (current << 1) | (next >> 31)
+    __ stwx   (b,         i,         out);
+
+    __ addi   (i,         i,         4);
+    __ bdnz   (LOOP_LSHIFT);
+
+    __ sldi   (c,         out_len,   2);
+    __ subi   (c,         c,         4);             // c = byte offset of the last out word
+    __ lwzx   (b,         out,       c);
+    __ sldi   (b,         b,         1);             // last word: plain shift, nothing to shift in
+    __ stwx   (b,         out,       c);
+
+    __ bind(SKIP_LSHIFT);
+    // end<primitiveLeftShift>
+
+    // Set low bit
+    // NOTE(review): this tail is not guarded by the length checks above, so with in_len == 0 or
+    // out_len == 0 it would access the word just before in/out -- presumably callers guarantee
+    // positive lengths; confirm.
+    __ sldi   (i,         in_len,    2);
+    __ subi   (i,         i,         4);
+    __ lwzx   (i,         in,        i);             // i = last in word
+    __ sldi   (c,         out_len,   2);
+    __ subi   (c,         c,         4);
+    __ lwzx   (b,         out,       c);             // b = last out word
+
+    __ andi   (i,         i,         1);             // keep only the lowest bit of the last in word
+    __ orr    (i,         b,         i);
+
+    __ stwx   (i,         out,       c);
+
+    // Restore non-volatile regs (same offsets as the save sequence above).
+    current_offs = -8;
+    __ ld(R28, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R27, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R26, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R25, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R24, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R23, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R22, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R21, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R20, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R19, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R18, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R17, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R16, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R15, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R14, current_offs, R1_SP);
+
+    __ mr(ret, out);                                 // return the out address
+    __ blr();
+
+    return start;
+  }
 
   /**
    * Arguments:
@@ -3500,6 +3761,12 @@
     }
 #endif
 
+    if (UseSquareToLenIntrinsic) {
+      StubRoutines::_squareToLen = generate_squareToLen();
+    }
+    if (UseMulAddIntrinsic) {
+      StubRoutines::_mulAdd = generate_mulAdd();
+    }
     if (UseMontgomeryMultiplyIntrinsic) {
       StubRoutines::_montgomeryMultiply
         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
--- a/src/hotspot/cpu/ppc/vm_version_ppc.cpp	Mon Sep 25 20:32:44 2017 +0530
+++ b/src/hotspot/cpu/ppc/vm_version_ppc.cpp	Mon Sep 25 19:54:58 2017 +0000
@@ -258,6 +258,12 @@
     FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
   }
 
+  if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
+    UseSquareToLenIntrinsic = true;
+  }
+  if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
+    UseMulAddIntrinsic = true;
+  }
   if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
     UseMultiplyToLenIntrinsic = true;
   }