changeset 51388:ccb8aa083958

8189105: AARCH64: create intrinsic for sin and cos Reviewed-by: aph, adinn
author dpochepk
date Mon, 25 Jun 2018 16:30:49 +0300
parents 4449b45900f1
children 680d04ae76e9
files src/hotspot/cpu/aarch64/assembler_aarch64.hpp src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp src/hotspot/cpu/aarch64/macroAssembler_aarch64_trig.cpp src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp
diffstat 6 files changed, 1654 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp	Mon Jun 25 16:30:16 2018 +0300
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp	Mon Jun 25 16:30:49 2018 +0300
@@ -2024,6 +2024,57 @@
       fmovd(Vn, zr);
   }
 
+   // Floating-point rounding
+   // type: half-precision = 11
+   //       single         = 00
+   //       double         = 01
+   // rmode: A = Away     = 100
+   //        I = current  = 111
+   //        M = MinusInf = 010
+   //        N = eveN     = 000
+   //        P = PlusInf  = 001
+   //        X = eXact    = 110
+   //        Z = Zero     = 011
+  void float_round(unsigned type, unsigned rmode, FloatRegister Rd, FloatRegister Rn) {
+    starti;
+    f(0b00011110, 31, 24);
+    f(type, 23, 22);
+    f(0b1001, 21, 18);
+    f(rmode, 17, 15);
+    f(0b10000, 14, 10);
+    rf(Rn, 5), rf(Rd, 0);
+  }
+#define INSN(NAME, type, rmode)                   \
+  void NAME(FloatRegister Vd, FloatRegister Vn) { \
+    float_round(type, rmode, Vd, Vn);             \
+  }
+
+public:
+  INSN(frintah, 0b11, 0b100);
+  INSN(frintih, 0b11, 0b111);
+  INSN(frintmh, 0b11, 0b010);
+  INSN(frintnh, 0b11, 0b000);
+  INSN(frintph, 0b11, 0b001);
+  INSN(frintxh, 0b11, 0b110);
+  INSN(frintzh, 0b11, 0b011);
+
+  INSN(frintas, 0b00, 0b100);
+  INSN(frintis, 0b00, 0b111);
+  INSN(frintms, 0b00, 0b010);
+  INSN(frintns, 0b00, 0b000);
+  INSN(frintps, 0b00, 0b001);
+  INSN(frintxs, 0b00, 0b110);
+  INSN(frintzs, 0b00, 0b011);
+
+  INSN(frintad, 0b01, 0b100);
+  INSN(frintid, 0b01, 0b111);
+  INSN(frintmd, 0b01, 0b010);
+  INSN(frintnd, 0b01, 0b000);
+  INSN(frintpd, 0b01, 0b001);
+  INSN(frintxd, 0b01, 0b110);
+  INSN(frintzd, 0b01, 0b011);
+#undef INSN
+
 /* SIMD extensions
  *
  * We just use FloatRegister in the following. They are exactly the same
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Mon Jun 25 16:30:16 2018 +0300
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Mon Jun 25 16:30:49 2018 +0300
@@ -1256,7 +1256,15 @@
                 FloatRegister tmpC1, FloatRegister tmpC2, FloatRegister tmpC3,
                 FloatRegister tmpC4, Register tmp1, Register tmp2,
                 Register tmp3, Register tmp4, Register tmp5);
-private:
+  void generate_dsin_dcos(bool isCos, address npio2_hw, address two_over_pi,
+      address pio2, address dsin_coef, address dcos_coef);
+ private:
+  // begin trigonometric functions support block
+  void generate__ieee754_rem_pio2(address npio2_hw, address two_over_pi, address pio2);
+  void generate__kernel_rem_pio2(address two_over_pi, address pio2);
+  void generate_kernel_sin(FloatRegister x, bool iyIsOne, address dsin_coef);
+  void generate_kernel_cos(FloatRegister x, address dcos_coef);
+  // end trigonometric functions support block
   void add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                        Register src1, Register src2);
   void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64_trig.cpp	Mon Jun 25 16:30:49 2018 +0300
@@ -0,0 +1,1488 @@
+/* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, Cavium. All rights reserved. (By BELLSOFT)
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "macroAssembler_aarch64.hpp"
+
+// The following code is a optimized version of fdlibm sin/cos implementation
+// (C code is in share/runtime/sharedRuntimeTrig.cpp) adapted for AARCH64.
+
+// Please refer to sin/cos approximation via polynomial and
+// trigonometric argument reduction techniques to the following literature:
+//
+// [1] Muller, Jean-Michel, Nicolas Brisebarre, Florent De Dinechin,
+// Claude-Pierre Jeannerod, Vincent Lefevre, Guillaume Melquiond,
+// Nathalie Revol, Damien Stehlé, and Serge Torres:
+// Handbook of floating-point arithmetic.
+// Springer Science & Business Media, 2009.
+// [2] K. C. Ng
+// Argument Reduction for Huge Arguments: Good to the Last Bit
+// July 13, 1992, SunPro
+//
+// HOW TO READ THIS CODE:
+// This code consists of several functions. Each function has following header:
+// 1) Description
+// 2) C-pseudo code with differences from fdlibm marked by comments starting
+//        with "NOTE". Check unmodified fdlibm code in
+//        share/runtime/SharedRuntimeTrig.cpp
+// 3) Brief textual description of changes between fdlibm and current
+//        implementation along with optimization notes (if applicable)
+// 4) Assumptions, input and output
+// 5) (Optional) additional notes about intrinsic implementation
+// Each function is separated in blocks which follow the pseudo-code structure
+//
+// HIGH-LEVEL ALGORITHM DESCRIPTION:
+//    - entry point: generate_dsin_dcos(...);
+//    - check corner cases: NaN, INF, tiny argument.
+//    - check if |x| < Pi/4. Then approximate sin/cos via polynomial (kernel_sin/kernel_cos)
+//    -- else proceed to argument reduction routine (__ieee754_rem_pio2) and
+//           use reduced argument to get result via kernel_sin/kernel_cos
+//
+// HIGH-LEVEL CHANGES BETWEEN INTRINSICS AND FDLIBM:
+// 1) two_over_pi table fdlibm representation is int[], while intrinsic version
+// has these int values converted to double representation to load converted
+// double values directly (see stubRoutines_aarch4::_two_over_pi)
+// 2) Several loops are unrolled and vectorized: see comments in code after
+// labels: SKIP_F_LOAD, RECOMP_FOR1_CHECK, RECOMP_FOR2
+// 3) fdlibm npio2_hw table now has "prefix" with constants used in
+// calculation. These constants are loaded from npio2_hw table instead of
+// constructing it in code (see stubRoutines_aarch64.cpp)
+// 4) Polynomial coefficients for sin and cos are moved to table sin_coef
+// and cos_coef to use the same optimization as in 3). It allows to load most of
+// required constants via single instruction
+//
+//
+//
+///* __ieee754_rem_pio2(x,y)
+// *
+// * returns the remainder of x rem pi/2 in y[0]+y[1] (i.e. like x div pi/2)
+// * x is input argument, y[] is hi and low parts of reduced argument (x)
+// * uses __kernel_rem_pio2()
+// */
+// // use tables(see stubRoutines_aarch64.cpp): two_over_pi and modified npio2_hw
+//
+// BEGIN __ieee754_rem_pio2 PSEUDO CODE
+//
+//static int __ieee754_rem_pio2(double x, double *y) {
+//  double z,w,t,r,fn;
+//  double tx[3];
+//  int e0,i,j,nx,n,ix,hx,i0;
+//
+//  i0 = ((*(int*)&two24A)>>30)^1;        /* high word index */
+//  hx = *(i0+(int*)&x);          /* high word of x */
+//  ix = hx&0x7fffffff;
+//  if(ix<0x4002d97c) {  /* |x| < 3pi/4, special case with n=+-1 */
+//    if(hx>0) {
+//      z = x - pio2_1;
+//      if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
+//        y[0] = z - pio2_1t;
+//        y[1] = (z-y[0])-pio2_1t;
+//      } else {                /* near pi/2, use 33+33+53 bit pi */
+//        z -= pio2_2;
+//        y[0] = z - pio2_2t;
+//        y[1] = (z-y[0])-pio2_2t;
+//      }
+//      return 1;
+//    } else {    /* negative x */
+//      z = x + pio2_1;
+//      if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
+//        y[0] = z + pio2_1t;
+//        y[1] = (z-y[0])+pio2_1t;
+//      } else {                /* near pi/2, use 33+33+53 bit pi */
+//        z += pio2_2;
+//        y[0] = z + pio2_2t;
+//        y[1] = (z-y[0])+pio2_2t;
+//      }
+//      return -1;
+//    }
+//  }
+//  if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
+//    t  = fabsd(x);
+//    n  = (int) (t*invpio2+half);
+//    fn = (double)n;
+//    r  = t-fn*pio2_1;
+//    w  = fn*pio2_1t;    /* 1st round good to 85 bit */
+//    // NOTE: y[0] = r-w; is moved from if/else below to be before "if"
+//    y[0] = r-w;
+//    if(n<32&&ix!=npio2_hw[n-1]) {
+//      // y[0] = r-w;       /* quick check no cancellation */ // NOTE: moved earlier
+//    } else {
+//      j  = ix>>20;
+//      // y[0] = r-w; // NOTE: moved earlier
+//      i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
+//      if(i>16) {  /* 2nd iteration needed, good to 118 */
+//        t  = r;
+//        w  = fn*pio2_2;
+//        r  = t-w;
+//        w  = fn*pio2_2t-((t-r)-w);
+//        y[0] = r-w;
+//        i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
+//        if(i>49)  {     /* 3rd iteration need, 151 bits acc */
+//          t  = r;       /* will cover all possible cases */
+//          w  = fn*pio2_3;
+//          r  = t-w;
+//          w  = fn*pio2_3t-((t-r)-w);
+//          y[0] = r-w;
+//        }
+//      }
+//    }
+//    y[1] = (r-y[0])-w;
+//    if(hx<0)    {y[0] = -y[0]; y[1] = -y[1]; return -n;}
+//    else         return n;
+//  }
+//  /*
+//   * all other (large) arguments
+//   */
+//  // NOTE: this check is removed, because it was checked in dsin/dcos
+//  // if(ix>=0x7ff00000) {          /* x is inf or NaN */
+//  //  y[0]=y[1]=x-x; return 0;
+//  // }
+//  /* set z = scalbn(|x|,ilogb(x)-23) */
+//  *(1-i0+(int*)&z) = *(1-i0+(int*)&x);
+//  e0    = (ix>>20)-1046;        /* e0 = ilogb(z)-23; */
+//  *(i0+(int*)&z) = ix - (e0<<20);
+//
+//  // NOTE: "for" loop below in unrolled. See comments in asm code
+//  for(i=0;i<2;i++) {
+//    tx[i] = (double)((int)(z));
+//    z     = (z-tx[i])*two24A;
+//  }
+//
+//  tx[2] = z;
+//  nx = 3;
+//
+//  // NOTE: while(tx[nx-1]==zeroA) nx--;  is unrolled. See comments in asm code
+//  while(tx[nx-1]==zeroA) nx--;  /* skip zero term */
+//
+//  n  =  __kernel_rem_pio2(tx,y,e0,nx,2,two_over_pi);
+//  if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
+//  return n;
+//}
+//
+// END __ieee754_rem_pio2 PSEUDO CODE
+//
+// Changes between fdlibm and intrinsic for __ieee754_rem_pio2:
+//     1. INF/NaN check for huge argument is removed in comparison with fdlibm
+//     code, because this check is already done in dcos/dsin code
+//     2. Most constants are now loaded from table instead of direct initialization
+//     3. Two loops are unrolled
+// Assumptions:
+//     1. Assume |X| >= PI/4
+//     2. Assume rscratch1 = 0x3fe921fb00000000  (~ PI/4)
+//     3. Assume ix = r3
+// Input and output:
+//     1. Input: X = r0
+//     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
+// NOTE: general purpose register names match local variable names in C code
+// NOTE: fpu registers are actively reused. See comments in code about their usage
+void MacroAssembler::generate__ieee754_rem_pio2(address npio2_hw,
+    address two_over_pi, address pio2) {
+  const long PIO2_1t = 0x3DD0B4611A626331UL;
+  const long PIO2_2  = 0x3DD0B4611A600000UL;
+  const long PIO2_2t = 0x3BA3198A2E037073UL;
+  Label X_IS_NEGATIVE, X_IS_MEDIUM_OR_LARGE, X_IS_POSITIVE_LONG_PI, LARGE_ELSE,
+      REDUCTION_DONE, X_IS_MEDIUM_BRANCH_DONE, X_IS_LARGE, NX_SET,
+      X_IS_NEGATIVE_LONG_PI;
+  Register X = r0, n = r2, ix = r3, jv = r4, tmp5 = r5, jx = r6,
+      tmp3 = r7, iqBase = r10, ih = r11, i = r17;
+    // initializing constants first
+    // rscratch1 = 0x3fe921fb00000000 (see assumptions)
+    movk(rscratch1, 0x3ff9, 48); // was 0x3fe921fb0..0 now it's 0x3ff921fb0..0
+    mov(rscratch2, 0x4002d97c); // 3*PI/4 high word
+    movk(rscratch1, 0x5440, 16); // now rscratch1 == PIO2_1
+    fmovd(v1, rscratch1); // v1 = PIO2_1
+    cmp(rscratch2, ix);
+    br(LE, X_IS_MEDIUM_OR_LARGE);
+
+    block_comment("if(ix<0x4002d97c) {...  /* |x| ~< 3pi/4 */ "); {
+      cmp(X, zr);
+      br(LT, X_IS_NEGATIVE);
+
+      block_comment("if(hx>0) {"); {
+        fsubd(v2, v0, v1); // v2 = z = x - pio2_1
+        cmp(ix, rscratch1, LSR, 32);
+        mov(n, 1);
+        br(EQ, X_IS_POSITIVE_LONG_PI);
+
+        block_comment("case: hx > 0 &&  ix!=0x3ff921fb {"); { /* 33+53 bit pi is good enough */
+          mov(rscratch2, PIO2_1t);
+          fmovd(v27, rscratch2);
+          fsubd(v4, v2, v27); // v4 = y[0] = z - pio2_1t;
+          fsubd(v5, v2, v4);
+          fsubd(v5, v5, v27); // v5 = y[1] = (z-y[0])-pio2_1t
+          b(REDUCTION_DONE);
+        }
+
+        block_comment("case: hx > 0 &*& ix==0x3ff921fb {"); { /* near pi/2, use 33+33+53 bit pi */
+          bind(X_IS_POSITIVE_LONG_PI);
+            mov(rscratch1, PIO2_2);
+            mov(rscratch2, PIO2_2t);
+            fmovd(v27, rscratch1);
+            fmovd(v6, rscratch2);
+            fsubd(v2, v2, v27); // z-= pio2_2
+            fsubd(v4, v2, v6);  // y[0] = z - pio2_2t
+            fsubd(v5, v2, v4);
+            fsubd(v5, v5, v6);  // v5 = (z - y[0]) - pio2_2t
+            b(REDUCTION_DONE);
+        }
+      }
+
+      block_comment("case: hx <= 0)"); {
+        bind(X_IS_NEGATIVE);
+          faddd(v2, v0, v1); // v2 = z = x + pio2_1
+          cmp(ix, rscratch1, LSR, 32);
+          mov(n, -1);
+          br(EQ, X_IS_NEGATIVE_LONG_PI);
+
+          block_comment("case: hx <= 0 && ix!=0x3ff921fb) {"); { /* 33+53 bit pi is good enough */
+            mov(rscratch2, PIO2_1t);
+            fmovd(v27, rscratch2);
+            faddd(v4, v2, v27); // v4 = y[0] = z + pio2_1t;
+            fsubd(v5, v2, v4);
+            faddd(v5, v5, v27); // v5 = y[1] = (z-y[0]) + pio2_1t
+            b(REDUCTION_DONE);
+          }
+
+          block_comment("case: hx <= 0 && ix==0x3ff921fb"); { /* near pi/2, use 33+33+53 bit pi */
+            bind(X_IS_NEGATIVE_LONG_PI);
+              mov(rscratch1, PIO2_2);
+              mov(rscratch2, PIO2_2t);
+              fmovd(v27, rscratch1);
+              fmovd(v6, rscratch2);
+              faddd(v2, v2, v27); // z += pio2_2
+              faddd(v4, v2, v6);  // y[0] = z + pio2_2t
+              fsubd(v5, v2, v4);
+              faddd(v5, v5, v6);  // v5 = (z - y[0]) + pio2_2t
+              b(REDUCTION_DONE);
+          }
+      }
+  }
+  bind(X_IS_MEDIUM_OR_LARGE);
+    mov(rscratch1, 0x413921fb);
+    cmp(ix, rscratch1); // ix < = 0x413921fb ?
+    br(GT, X_IS_LARGE);
+
+    block_comment("|x| ~<= 2^19*(pi/2), medium size"); {
+      lea(ih, ExternalAddress(npio2_hw));
+      ld1(v4, v5, v6, v7, T1D, ih);
+      fabsd(v31, v0);          // v31 = t = |x|
+      add(ih, ih, 64);
+      fmaddd(v2, v31, v5, v4); // v2 = t * invpio2 + half (invpio2 = 53 bits of 2/pi, half = 0.5)
+      fcvtzdw(n, v2);          // n = (int) v2
+      frintzd(v2, v2);
+      fmsubd(v3, v2, v6, v31); // v3 = r = t - fn * pio2_1
+      fmuld(v26, v2, v7);      // v26 = w = fn * pio2_1t
+      fsubd(v4, v3, v26);      // y[0] = r - w. Calculated before branch
+      cmp(n, 32);
+      br(GT, LARGE_ELSE);
+      subw(tmp5, n, 1);        // tmp5 = n - 1
+      ldrw(jv, Address(ih, tmp5, Address::lsl(2)));
+      cmp(ix, jv);
+      br(NE, X_IS_MEDIUM_BRANCH_DONE);
+
+      block_comment("else block for if(n<32&&ix!=npio2_hw[n-1])"); {
+        bind(LARGE_ELSE);
+          fmovd(jx, v4);
+          lsr(tmp5, ix, 20);                       // j = ix >> 20
+          lsl(jx, jx, 1);
+          sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1);   // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
+
+          block_comment("if(i>16)"); {
+            cmp(tmp3, 16);
+            br(LE, X_IS_MEDIUM_BRANCH_DONE);
+            // i > 16. 2nd iteration needed
+            ldpd(v6, v7, Address(ih, -32));
+            fmovd(v28, v3);                        // t = r
+            fmuld(v29, v2, v6);                    // w = v29 = fn * pio2_2
+            fsubd(v3, v28, v29);                   // r = t - w
+            fsubd(v31, v28, v3);                   // v31 = (t - r)
+            fsubd(v31, v29, v31);                  // v31 = w - (t - r) = - ((t - r) - w)
+            fmaddd(v26, v2, v7, v31);              // v26 = w = fn*pio2_2t - ((t - r) - w)
+            fsubd(v4, v3, v26);                    // y[0] = r - w
+            fmovd(jx, v4);
+            lsl(jx, jx, 1);
+            sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1); // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
+
+            block_comment("if(i>49)"); {
+              cmp(tmp3, 49);
+              br(LE, X_IS_MEDIUM_BRANCH_DONE);
+              // 3rd iteration need, 151 bits acc
+              ldpd(v6, v7, Address(ih, -16));
+              fmovd(v28, v3);                      // save "r"
+              fmuld(v29, v2, v6);                  // v29 = fn * pio2_3
+              fsubd(v3, v28, v29);                 // r = r - w
+              fsubd(v31, v28, v3);                 // v31 = (t - r)
+              fsubd(v31, v29, v31);                // v31 = w - (t - r) = - ((t - r) - w)
+              fmaddd(v26, v2, v7, v31);            // v26 = w = fn*pio2_3t - ((t - r) - w)
+              fsubd(v4, v3, v26);                  // y[0] = r - w
+            }
+          }
+      }
+    block_comment("medium x tail"); {
+      bind(X_IS_MEDIUM_BRANCH_DONE);
+        fsubd(v5, v3, v4);                         // v5 = y[1] = (r - y[0])
+        fsubd(v5, v5, v26);                        // v5 = y[1] = (r - y[0]) - w
+        cmp(X, zr);
+        br(GT, REDUCTION_DONE);
+        fnegd(v4, v4);
+        negw(n, n);
+        fnegd(v5, v5);
+        b(REDUCTION_DONE);
+    }
+  }
+
+  block_comment("all other (large) arguments"); {
+    bind(X_IS_LARGE);
+      lsr(rscratch1, ix, 20);                      // ix >> 20
+      movz(tmp5, 0x4170, 48);
+      subw(rscratch1, rscratch1, 1046);            // e0
+      fmovd(v10, tmp5);                            // init two24A value
+      subw(jv, ix, rscratch1, LSL, 20);            // ix - (e0<<20)
+      lsl(jv, jv, 32);
+      subw(rscratch2, rscratch1, 3);
+      bfm(jv, X, 0, 31);                           // jv = z
+      movw(i, 24);
+      fmovd(v26, jv);                              // v26 = z
+
+      block_comment("unrolled for(i=0;i<2;i++) {tx[i] = (double)((int)(z));z = (z-tx[i])*two24A;}"); {
+        // tx[0,1,2] = v6,v7,v26
+        frintzd(v6, v26);                          // v6 = (double)((int)v26)
+        sdivw(jv, rscratch2, i);                   // jv = (e0 - 3)/24
+        fsubd(v26, v26, v6);
+        sub(sp, sp, 560);
+        fmuld(v26, v26, v10);
+        frintzd(v7, v26);                          // v7 = (double)((int)v26)
+        movw(jx, 2); // calculate jx as nx - 1, which is initially 2. Not a part of unrolled loop
+        fsubd(v26, v26, v7);
+      }
+
+      block_comment("nx calculation with unrolled while(tx[nx-1]==zeroA) nx--;"); {
+        fcmpd(v26, 0.0d);                          // if NE then jx == 2. else it's 1 or 0
+        add(iqBase, sp, 480);                      // base of iq[]
+        fmuld(v3, v26, v10);
+        br(NE, NX_SET);
+        fcmpd(v7, 0.0d);                           // v7 == 0 => jx = 0. Else jx = 1
+        csetw(jx, NE);
+      }
+    bind(NX_SET);
+      generate__kernel_rem_pio2(two_over_pi, pio2);
+      // now we have y[0] = v4, y[1] = v5 and n = r2
+      cmp(X, zr);
+      br(GE, REDUCTION_DONE);
+      fnegd(v4, v4);
+      fnegd(v5, v5);
+      negw(n, n);
+  }
+  bind(REDUCTION_DONE);
+}
+
+///*
+// * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
+// * double x[],y[]; int e0,nx,prec; int ipio2[];
+// *
+// * __kernel_rem_pio2 return the last three digits of N with
+// *              y = x - N*pi/2
+// * so that |y| < pi/2.
+// *
+// * The method is to compute the integer (mod 8) and fraction parts of
+// * (2/pi)*x without doing the full multiplication. In general we
+// * skip the part of the product that are known to be a huge integer (
+// * more accurately, = 0 mod 8 ). Thus the number of operations are
+// * independent of the exponent of the input.
+// *
+// * NOTE: 2/pi int representation is converted to double
+// * // (2/pi) is represented by an array of 24-bit integers in ipio2[].
+// *
+// * Input parameters:
+// *      x[]     The input value (must be positive) is broken into nx
+// *              pieces of 24-bit integers in double precision format.
+// *              x[i] will be the i-th 24 bit of x. The scaled exponent
+// *              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
+// *              match x's up to 24 bits.
+// *
+// *              Example of breaking a double positive z into x[0]+x[1]+x[2]:
+// *                      e0 = ilogb(z)-23
+// *                      z  = scalbn(z,-e0)
+// *              for i = 0,1,2
+// *                      x[i] = floor(z)
+// *                      z    = (z-x[i])*2**24
+// *
+// *
+// *      y[]     ouput result in an array of double precision numbers.
+// *              The dimension of y[] is:
+// *                      24-bit  precision       1
+// *                      53-bit  precision       2
+// *                      64-bit  precision       2
+// *                      113-bit precision       3
+// *              The actual value is the sum of them. Thus for 113-bit
+// *              precsion, one may have to do something like:
+// *
+// *              long double t,w,r_head, r_tail;
+// *              t = (long double)y[2] + (long double)y[1];
+// *              w = (long double)y[0];
+// *              r_head = t+w;
+// *              r_tail = w - (r_head - t);
+// *
+// *      e0      The exponent of x[0]
+// *
+// *      nx      dimension of x[]
+// *
+// *      prec    an interger indicating the precision:
+// *                      0       24  bits (single)
+// *                      1       53  bits (double)
+// *                      2       64  bits (extended)
+// *                      3       113 bits (quad)
+// *
+// *      NOTE: ipio2[] array below is converted to double representation
+// *      //ipio2[]
+// *      //        integer array, contains the (24*i)-th to (24*i+23)-th
+// *      //        bit of 2/pi after binary point. The corresponding
+// *      //        floating value is
+// *
+// *                      ipio2[i] * 2^(-24(i+1)).
+// *
+// * Here is the description of some local variables:
+// *
+// *      jk      jk+1 is the initial number of terms of ipio2[] needed
+// *              in the computation. The recommended value is 2,3,4,
+// *              6 for single, double, extended,and quad.
+// *
+// *      jz      local integer variable indicating the number of
+// *              terms of ipio2[] used.
+// *
+// *      jx      nx - 1
+// *
+// *      jv      index for pointing to the suitable ipio2[] for the
+// *              computation. In general, we want
+// *                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
+// *              is an integer. Thus
+// *                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
+// *              Hence jv = max(0,(e0-3)/24).
+// *
+// *      jp      jp+1 is the number of terms in PIo2[] needed, jp = jk.
+// *
+// *      q[]     double array with integral value, representing the
+// *              24-bits chunk of the product of x and 2/pi.
+// *
+// *      q0      the corresponding exponent of q[0]. Note that the
+// *              exponent for q[i] would be q0-24*i.
+// *
+// *      PIo2[]  double precision array, obtained by cutting pi/2
+// *              into 24 bits chunks.
+// *
+// *      f[]     ipio2[] in floating point
+// *
+// *      iq[]    integer array by breaking up q[] in 24-bits chunk.
+// *
+// *      fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
+// *
+// *      ih      integer. If >0 it indicates q[] is >= 0.5, hence
+// *              it also indicates the *sign* of the result.
+// *
+// */
+//
+// Use PIo2 table(see stubRoutines_aarch64.cpp)
+//
+// BEGIN __kernel_rem_pio2 PSEUDO CODE
+//
+//static int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, /* NOTE: converted to double */ const double *ipio2 // const int *ipio2) {
+//  int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
+//  double z,fw,f[20],fq[20],q[20];
+//
+//  /* initialize jk*/
+//  // jk = init_jk[prec]; // NOTE: prec==2 for double. jk is always 4.
+//  jp = jk; // NOTE: always 4
+//
+//  /* determine jx,jv,q0, note that 3>q0 */
+//  jx =  nx-1;
+//  jv = (e0-3)/24; if(jv<0) jv=0;
+//  q0 =  e0-24*(jv+1);
+//
+//  /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
+//  j = jv-jx; m = jx+jk;
+//
+//  // NOTE: split into two for-loops: one with zeroB and one with ipio2[j]. It
+//  //       allows the use of wider loads/stores
+//  for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; //(double) ipio2[j];
+//
+//  // NOTE: unrolled and vectorized "for". See comments in asm code
+//  /* compute q[0],q[1],...q[jk] */
+//  for (i=0;i<=jk;i++) {
+//    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
+//  }
+//
+//  jz = jk;
+//recompute:
+//  /* distill q[] into iq[] reversingly */
+//  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
+//    fw    =  (double)((int)(twon24* z));
+//    iq[i] =  (int)(z-two24B*fw);
+//    z     =  q[j-1]+fw;
+//  }
+//
+//  /* compute n */
+//  z  = scalbnA(z,q0);           /* actual value of z */
+//  z -= 8.0*floor(z*0.125);              /* trim off integer >= 8 */
+//  n  = (int) z;
+//  z -= (double)n;
+//  ih = 0;
+//  if(q0>0) {    /* need iq[jz-1] to determine n */
+//    i  = (iq[jz-1]>>(24-q0)); n += i;
+//    iq[jz-1] -= i<<(24-q0);
+//    ih = iq[jz-1]>>(23-q0);
+//  }
+//  else if(q0==0) ih = iq[jz-1]>>23;
+//  else if(z>=0.5) ih=2;
+//
+//  if(ih>0) {    /* q > 0.5 */
+//    n += 1; carry = 0;
+//    for(i=0;i<jz ;i++) {        /* compute 1-q */
+//      j = iq[i];
+//      if(carry==0) {
+//        if(j!=0) {
+//          carry = 1; iq[i] = 0x1000000- j;
+//        }
+//      } else  iq[i] = 0xffffff - j;
+//    }
+//    if(q0>0) {          /* rare case: chance is 1 in 12 */
+//      switch(q0) {
+//      case 1:
+//        iq[jz-1] &= 0x7fffff; break;
+//      case 2:
+//        iq[jz-1] &= 0x3fffff; break;
+//      }
+//    }
+//    if(ih==2) {
+//      z = one - z;
+//      if(carry!=0) z -= scalbnA(one,q0);
+//    }
+//  }
+//
+//  /* check if recomputation is needed */
+//  if(z==zeroB) {
+//    j = 0;
+//    for (i=jz-1;i>=jk;i--) j |= iq[i];
+//    if(j==0) { /* need recomputation */
+//      for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
+//
+//      for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
+//        f[jx+i] = /* NOTE: converted to double */ ipio2[jv+i]; //(double) ipio2[jv+i];
+//        for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
+//        q[i] = fw;
+//      }
+//      jz += k;
+//      goto recompute;
+//    }
+//  }
+//
+//  /* chop off zero terms */
+//  if(z==0.0) {
+//    jz -= 1; q0 -= 24;
+//    while(iq[jz]==0) { jz--; q0-=24;}
+//  } else { /* break z into 24-bit if necessary */
+//    z = scalbnA(z,-q0);
+//    if(z>=two24B) {
+//      fw = (double)((int)(twon24*z));
+//      iq[jz] = (int)(z-two24B*fw);
+//      jz += 1; q0 += 24;
+//      iq[jz] = (int) fw;
+//    } else iq[jz] = (int) z ;
+//  }
+//
+//  /* convert integer "bit" chunk to floating-point value */
+//  fw = scalbnA(one,q0);
+//  for(i=jz;i>=0;i--) {
+//    q[i] = fw*(double)iq[i]; fw*=twon24;
+//  }
+//
+//  /* compute PIo2[0,...,jp]*q[jz,...,0] */
+//  for(i=jz;i>=0;i--) {
+//    for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
+//    fq[jz-i] = fw;
+//  }
+//
+//  // NOTE: switch below is eliminated, because prec is always 2 for doubles
+//  /* compress fq[] into y[] */
+//  //switch(prec) {
+//  //case 0:
+//  //  fw = 0.0;
+//  //  for (i=jz;i>=0;i--) fw += fq[i];
+//  //  y[0] = (ih==0)? fw: -fw;
+//  //  break;
+//  //case 1:
+//  //case 2:
+//    fw = 0.0;
+//    for (i=jz;i>=0;i--) fw += fq[i];
+//    y[0] = (ih==0)? fw: -fw;
+//    fw = fq[0]-fw;
+//    for (i=1;i<=jz;i++) fw += fq[i];
+//    y[1] = (ih==0)? fw: -fw;
+//  //  break;
+//  //case 3:       /* painful */
+//  //  for (i=jz;i>0;i--) {
+//  //    fw      = fq[i-1]+fq[i];
+//  // fq[i]  += fq[i-1]-fw;
+//  //    fq[i-1] = fw;
+//  //  }
+//  //  for (i=jz;i>1;i--) {
+//  //    fw      = fq[i-1]+fq[i];
+//  //    fq[i]  += fq[i-1]-fw;
+//  //    fq[i-1] = fw;
+//  //  }
+//  //  for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
+//  //  if(ih==0) {
+//  //    y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
+//  //  } else {
+//  //    y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
+//  //  }
+//  //}
+//  return n&7;
+//}
+//
+// END __kernel_rem_pio2 PSEUDO CODE
+//
+// Changes between fdlibm and intrinsic:
+//     1. One loop is unrolled and vectorized (see comments in code)
+//     2. One loop is split into 2 loops (see comments in code)
+//     3. Non-double code is removed(last switch). Sevaral variables became
+//         constants because of that (see comments in code)
+//     4. Use of jx, which is nx-1 instead of nx
+// Assumptions:
+//     1. Assume |X| >= PI/4
+// Input and output:
+//     1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1
+//     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
+// NOTE: general purpose register names match local variable names in C code
+// NOTE: fpu registers are actively reused. See comments in code about their usage
+void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) {
+  Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT,
+      RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ,
+      INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE,
+      Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION,
+      RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED,
+      CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE,
+      IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY,
+      RECOMP_FOR1_CHECK;
+  Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
+      tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
+      jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r18;
+    // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
+    // jx = nx - 1
+    lea(twoOverPiBase, ExternalAddress(two_over_pi));
+    cmpw(jv, zr);
+    addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6]
+    cselw(jv, jv, zr, GE);
+    fmovd(v26, 0.0d);
+    addw(tmp5, jv, 1);                    // jv+1
+    subsw(j, jv, jx);
+    add(qBase, sp, 320);                  // base of q[]
+    msubw(rscratch1, i, tmp5, rscratch1); // q0 =  e0-24*(jv+1)
+    // use double f[20], fq[20], q[20], iq[20] on stack, which is
+    // (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses it
+    // will contain f[20], fq[20], q[20], iq[20]
+    // now initialize f[20] indexes 0..m (inclusive)
+    // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];
+    mov(tmp5, sp);
+
+    block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); {
+        eorw(i, i, i);
+        br(GE, INIT_F_COPY);
+      bind(INIT_F_ZERO);
+        stpq(v26, v26, Address(post(tmp5, 32)));
+        addw(i, i, 4);
+        addsw(j, j, 4);
+        br(LT, INIT_F_ZERO);
+        subw(i, i, j);
+        movw(j, zr);
+      bind(INIT_F_COPY);
+        add(tmp1, twoOverPiBase, j, LSL, 3); // ipio2[j] start address
+        ld1(v18, v19, v20, v21, T16B, tmp1);
+        add(tmp5, sp, i, ext::uxtx, 3);
+        st1(v18, v19, v20, v21, T16B, tmp5);
+    }
+    // v18..v21 can actually contain f[0..7]
+    cbz(i, SKIP_F_LOAD); // i == 0 => f[i] == f[0] => already loaded
+    ld1(v18, v19, v20, v21, T2D, Address(sp)); // load f[0..7]
+  bind(SKIP_F_LOAD);
+    // calculate 2^q0 and 2^-q0, which we'll need further.
+    // q0 is exponent. So, calculate biased exponent(q0+1023)
+    negw(tmp4, rscratch1);
+    addw(tmp5, rscratch1, 1023);
+    addw(tmp4, tmp4, 1023);
+    // Unroll following for(s) depending on jx in [0,1,2]
+    // for (i=0;i<=jk;i++) {
+    //   for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
+    // }
+    // Unrolling for jx == 0 case:
+    //   q[0] = x[0] * f[0]
+    //   q[1] = x[0] * f[1]
+    //   q[2] = x[0] * f[2]
+    //   q[3] = x[0] * f[3]
+    //   q[4] = x[0] * f[4]
+    //
+    // Vectorization for unrolled jx == 0 case:
+    //   {q[0], q[1]} = {f[0], f[1]} * x[0]
+    //   {q[2], q[3]} = {f[2], f[3]} * x[0]
+    //   q[4] = f[4] * x[0]
+    //
+    // Unrolling for jx == 1 case:
+    //   q[0] = x[0] * f[1] + x[1] * f[0]
+    //   q[1] = x[0] * f[2] + x[1] * f[1]
+    //   q[2] = x[0] * f[3] + x[1] * f[2]
+    //   q[3] = x[0] * f[4] + x[1] * f[3]
+    //   q[4] = x[0] * f[5] + x[1] * f[4]
+    //
+    // Vectorization for unrolled jx == 1 case:
+    //   {q[0], q[1]} = {f[0], f[1]} * x[1]
+    //   {q[2], q[3]} = {f[2], f[3]} * x[1]
+    //   q[4] = f[4] * x[1]
+    //   {q[0], q[1]} += {f[1], f[2]} * x[0]
+    //   {q[2], q[3]} += {f[3], f[4]} * x[0]
+    //   q[4] += f[5] * x[0]
+    //
+    // Unrolling for jx == 2 case:
+    //   q[0] = x[0] * f[2] + x[1] * f[1] + x[2] * f[0]
+    //   q[1] = x[0] * f[3] + x[1] * f[2] + x[2] * f[1]
+    //   q[2] = x[0] * f[4] + x[1] * f[3] + x[2] * f[2]
+    //   q[3] = x[0] * f[5] + x[1] * f[4] + x[2] * f[3]
+    //   q[4] = x[0] * f[6] + x[1] * f[5] + x[2] * f[4]
+    //
+    // Vectorization for unrolled jx == 2 case:
+    //   {q[0], q[1]} = {f[0], f[1]} * x[2]
+    //   {q[2], q[3]} = {f[2], f[3]} * x[2]
+    //   q[4] = f[4] * x[2]
+    //   {q[0], q[1]} += {f[1], f[2]} * x[1]
+    //   {q[2], q[3]} += {f[3], f[4]} * x[1]
+    //   q[4] += f[5] * x[1]
+    //   {q[0], q[1]} += {f[2], f[3]} * x[0]
+    //   {q[2], q[3]} += {f[4], f[5]} * x[0]
+    //   q[4] += f[6] * x[0]
+  block_comment("unrolled and vectorized computation of q[0]..q[jk]"); {
+      cmpw(jx, 1);
+      lsl(tmp5, tmp5, 52);                     // now it's 2^q0 double value
+      lsl(tmp4, tmp4, 52);                     // now it's 2^-q0 double value
+      br(LT, JX_IS_0);
+      add(i, sp, 8);
+      ldpq(v26, v27, i);                       // load f[1..4]
+      br(GT, JX_IS_2);
+      // jx == 1
+      fmulxvs(v28, T2D, v18, v7);              // f[0,1] * x[1]
+      fmulxvs(v29, T2D, v19, v7);              // f[2,3] * x[1]
+      fmuld(v30, v20, v7);                     // f[4] * x[1]
+      fmlavs(v28, T2D, v26, v6, 0);
+      fmlavs(v29, T2D, v27, v6, 0);
+      fmlavs(v30, T2D, v6, v20, 1);            // v30 += f[5] * x[0]
+      b(Q_DONE);
+    bind(JX_IS_2);
+      fmulxvs(v28, T2D, v18, v3);              // f[0,1] * x[2]
+      fmulxvs(v29, T2D, v19, v3);              // f[2,3] * x[2]
+      fmuld(v30, v20, v3);                     // f[4] * x[2]
+      fmlavs(v28, T2D, v26, v7, 0);
+      fmlavs(v29, T2D, v27, v7, 0);
+      fmlavs(v30, T2D, v7, v20, 1);            // v30 += f[5] * x[1]
+      fmlavs(v28, T2D, v19, v6, 0);
+      fmlavs(v29, T2D, v20, v6, 0);
+      fmlavs(v30, T2D, v6, v21, 0);            // v30 += f[6] * x[0]
+      b(Q_DONE);
+    bind(JX_IS_0);
+      fmulxvs(v28, T2D, v18, v6);              // f[0,1] * x[0]
+      fmulxvs(v29, T2D, v19, v6);              // f[2,3] * x[0]
+      fmuld(v30, v20, v6);                     // f[4] * x[0]
+    bind(Q_DONE);
+      st1(v28, v29, v30, T2D, Address(qBase)); // save calculated q[0]...q[jk]
+  }
+  movz(i, 0x3E70, 48);
+  movw(jz, 4);
+  fmovd(v17, i);                               // v17 = twon24
+  fmovd(v30, tmp5);                            // 2^q0
+  fmovd(v21, 0.125d);
+  fmovd(v20, 8.0d);
+  fmovd(v22, tmp4);                            // 2^-q0
+
+  block_comment("recompute loop"); {
+    bind(RECOMPUTE);
+      //  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
+      //    fw    =  (double)((int)(twon24* z));
+      //    iq[i] =  (int)(z-two24A*fw);
+      //    z     =  q[j-1]+fw;
+      //  }
+      block_comment("distill q[] into iq[] reversingly"); {
+          eorw(i, i, i);
+          movw(j, jz);
+          add(tmp2, qBase, jz, LSL, 3);                    // q[jz] address
+          ldrd(v18, post(tmp2, -8));                       // z = q[j] and moving address to q[j-1]
+        bind(RECOMP_FIRST_FOR);
+          ldrd(v27, post(tmp2, -8));
+          fmuld(v29, v17, v18);                            // twon24*z
+          frintzd(v29, v29);                               // (double)(int)
+          fmsubd(v28, v10, v29, v18);                      // v28 = z-two24A*fw
+          fcvtzdw(tmp1, v28);                              // (int)(z-two24A*fw)
+          strw(tmp1, Address(iqBase, i, Address::lsl(2)));
+          faddd(v18, v27, v29);
+          add(i, i, 1);
+          subs(j, j, 1);
+          br(GT, RECOMP_FIRST_FOR);
+      }
+      // compute n
+      fmuld(v18, v18, v30);
+      fmuld(v2, v18, v21);
+      frintmd(v2, v2);                                     // v2 = floor(v2) == rounding towards -inf
+      fmsubd(v18, v2, v20, v18);                           // z -= 8.0*floor(z*0.125);
+      movw(ih, 2);
+      frintzd(v2, v18);                                    // v2 = (double)((int)z)
+      fcvtzdw(n, v18);                                     // n  = (int) z;
+      fsubd(v18, v18, v2);                                 // z -= (double)n;
+
+      block_comment("q0-dependent initialization"); {
+          cmpw(rscratch1, 0);                              // if (q0 > 0)
+          br(LT, Q0_ZERO_CMP_LT);
+          subw(j, jz, 1);                                  // j = jz - 1
+          ldrw(tmp2, Address(iqBase, j, Address::lsl(2))); // tmp2 = iq[jz-1]
+          br(EQ, Q0_ZERO_CMP_EQ);
+          movw(tmp4, 24);
+          subw(tmp4, tmp4, rscratch1);                     // == 24 - q0
+          lsrvw(i, tmp2, tmp4);                            // i = iq[jz-1] >> (24-q0)
+          lslvw(tmp5, i, tmp4);
+          subw(tmp2, tmp2, tmp5);                          // iq[jz-1] -= i<<(24-q0);
+          strw(tmp2, Address(iqBase, j, Address::lsl(2))); // store iq[jz-1]
+          subw(rscratch2, tmp4, 1);                        // == 23 - q0
+          addw(n, n, i);                                   // n+=i
+          lsrvw(ih, tmp2, rscratch2);                      // ih = iq[jz-1] >> (23-q0)
+          b(Q0_ZERO_CMP_DONE);
+        bind(Q0_ZERO_CMP_EQ);
+          lsr(ih, tmp2, 23);                               // ih = iq[z-1] >> 23
+          b(Q0_ZERO_CMP_DONE);
+        bind(Q0_ZERO_CMP_LT);
+          fmovd(v4, 0.5d);
+          fcmpd(v18, v4);
+          cselw(ih, zr, ih, LT);                           // if (z<0.5) ih = 0
+      }
+    bind(Q0_ZERO_CMP_DONE);
+      cmpw(ih, zr);
+      br(LE, IH_HANDLED);
+
+    block_comment("if(ih>) {"); {
+      // use rscratch2 as carry
+
+      block_comment("for(i=0;i<jz ;i++) {...}"); {
+          addw(n, n, 1);
+          eorw(i, i, i);
+          eorw(rscratch2, rscratch2, rscratch2);
+        bind(IH_FOR);
+          ldrw(j, Address(iqBase, i, Address::lsl(2)));    // j = iq[i]
+          movw(tmp3, 0x1000000);
+          subw(tmp3, tmp3, rscratch2);
+          cbnzw(rscratch2, IH_FOR_STORE);
+          cbzw(j, IH_FOR_INCREMENT);
+          movw(rscratch2, 1);
+        bind(IH_FOR_STORE);
+          subw(tmp3, tmp3, j);
+          strw(tmp3, Address(iqBase, i, Address::lsl(2))); // iq[i] = 0xffffff - j
+        bind(IH_FOR_INCREMENT);
+          addw(i, i, 1);
+          cmpw(i, jz);
+          br(LT, IH_FOR);
+      }
+
+      block_comment("if(q0>0) {"); {
+        cmpw(rscratch1, zr);
+        br(LE, IH_AFTER_SWITCH);
+        // tmp3 still has iq[jz-1] value. no need to reload
+        // now, zero high tmp3 bits (rscratch1 number of bits)
+        movw(j, -1);
+        subw(i, jz, 1);                                    // set i to jz-1
+        lsrv(j, j, rscratch1);
+        andw(tmp3, tmp3, j, LSR, 8);                       // we have 24-bit-based constants
+        strw(tmp3, Address(iqBase, i, Address::lsl(2)));   // save iq[jz-1]
+      }
+      bind(IH_AFTER_SWITCH);
+        cmpw(ih, 2);
+        br(NE, IH_HANDLED);
+
+        block_comment("if(ih==2) {"); {
+          fmovd(v25, 1.0d);
+          fsubd(v18, v25, v18);                            // z = one - z;
+          cbzw(rscratch2, IH_HANDLED);
+          fsubd(v18, v18, v30);                            // z -= scalbnA(one,q0);
+        }
+    }
+    bind(IH_HANDLED);
+      // check if recomputation is needed
+      fcmpd(v18, 0.0d);
+      br(NE, RECOMP_CHECK_DONE_NOT_ZERO);
+
+      block_comment("if(z==zeroB) {"); {
+
+        block_comment("for (i=jz-1;i>=jk;i--) j |= iq[i];"); {
+            subw(i, jz, 1);
+            eorw(j, j, j);
+            b(RECOMP_FOR1_CHECK);
+          bind(RECOMP_FOR1);
+            ldrw(tmp1, Address(iqBase, i, Address::lsl(2)));
+            orrw(j, j, tmp1);
+            subw(i, i, 1);
+          bind(RECOMP_FOR1_CHECK);
+            cmpw(i, 4);
+            br(GE, RECOMP_FOR1);
+        }
+        cbnzw(j, RECOMP_CHECK_DONE);
+
+        block_comment("if(j==0) {"); {
+            // for(k=1;iq[jk-k]==0;k++); // let's unroll it. jk == 4. So, read
+            // iq[3], iq[2], iq[1], iq[0] until non-zero value
+            ldp(tmp1, tmp3, iqBase);               // iq[0..3]
+            movw(j, 2);
+            cmp(tmp3, zr);
+            csel(tmp1, tmp1, tmp3, EQ);            // set register for further consideration
+            cselw(j, j, zr, EQ);                   // set initial k. Use j as k
+            cmp(zr, tmp1, LSR, 32);
+            addw(i, jz, 1);
+            csincw(j, j, j, NE);
+
+          block_comment("for(i=jz+1;i<=jz+k;i++) {...}"); {
+              addw(jz, i, j); // i = jz+1, j = k-1. j+i = jz+k (which is a new jz)
+            bind(RECOMP_FOR2);
+              addw(tmp1, jv, i);
+              ldrd(v29, Address(twoOverPiBase, tmp1, Address::lsl(3)));
+              addw(tmp2, jx, i);
+              strd(v29, Address(sp, tmp2, Address::lsl(3)));
+              // f[jx+i] = /* NOTE: converted to double */ ipio2[jv+i]; //(double) ipio2[jv+i];
+              // since jx = 0, 1 or 2 we can unroll it:
+              // for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
+              // f[jx+i-j] == (for first iteration) f[jx+i], which is already v29
+              add(tmp2, sp, tmp2, ext::uxtx, 3); // address of f[jx+i]
+              ldpd(v4, v5, Address(tmp2, -16)); // load f[jx+i-2] and f[jx+i-1]
+              fmuld(v26, v6, v29); // initial fw
+              cbzw(jx, RECOMP_FW_UPDATED);
+              fmaddd(v26, v7, v5, v26);
+              cmpw(jx, 1);
+              br(EQ, RECOMP_FW_UPDATED);
+              fmaddd(v26, v3, v4, v26);
+            bind(RECOMP_FW_UPDATED);
+              strd(v26, Address(qBase, i, Address::lsl(3))); // q[i] = fw;
+              addw(i, i, 1);
+              cmpw(i, jz);                                   // jz here is "old jz" + k
+              br(LE, RECOMP_FOR2);
+          }
+            b(RECOMPUTE);
+        }
+      }
+    }
+    bind(RECOMP_CHECK_DONE);
+      // chop off zero terms
+      fcmpd(v18, 0.0d);
+      br(EQ, Z_IS_ZERO);
+
+      block_comment("else block of if(z==0.0) {"); {
+        bind(RECOMP_CHECK_DONE_NOT_ZERO);
+          fmuld(v18, v18, v22);
+          fcmpd(v18, v10);                                   // v10 is stil two24A
+          br(LT, Z_IS_LESS_THAN_TWO24B);
+          fmuld(v1, v18, v17);                               // twon24*z
+          frintzd(v1, v1);                                   // v1 = (double)(int)(v1)
+          fmaddd(v2, v10, v1, v18);
+          fcvtzdw(tmp3, v1);                                 // (int)fw
+          fcvtzdw(tmp2, v2);                                 // double to int
+          strw(tmp2, Address(iqBase, jz, Address::lsl(2)));
+          addw(rscratch1, rscratch1, 24);
+          addw(jz, jz, 1);
+          strw(tmp3, Address(iqBase, jz, Address::lsl(2)));  // iq[jz] = (int) fw
+          b(Z_ZERO_CHECK_DONE);
+        bind(Z_IS_LESS_THAN_TWO24B);
+          fcvtzdw(tmp3, v18);                                // (int)z
+          strw(tmp3, Address(iqBase, jz, Address::lsl(2)));  // iq[jz] = (int) z
+          b(Z_ZERO_CHECK_DONE);
+      }
+
+      block_comment("if(z==0.0) {"); {
+        bind(Z_IS_ZERO);
+          subw(jz, jz, 1);
+          ldrw(tmp1, Address(iqBase, jz, Address::lsl(2)));
+          subw(rscratch1, rscratch1, 24);
+          cbz(tmp1, Z_IS_ZERO);
+      }
+      bind(Z_ZERO_CHECK_DONE);
+        // convert integer "bit" chunk to floating-point value
+        // v17 = twon24
+        // update v30, which was scalbnA(1.0, <old q0>);
+        addw(tmp2, rscratch1, 1023); // biased exponent
+        lsl(tmp2, tmp2, 52); // put at correct position
+        mov(i, jz);
+        fmovd(v30, tmp2);
+
+        block_comment("for(i=jz;i>=0;i--) {q[i] = fw*(double)iq[i]; fw*=twon24;}"); {
+          bind(CONVERTION_FOR);
+            ldrw(tmp1, Address(iqBase, i, Address::lsl(2)));
+            scvtfwd(v31, tmp1);
+            fmuld(v31, v31, v30);
+            strd(v31, Address(qBase, i, Address::lsl(3)));
+            fmuld(v30, v30, v17);
+            subsw(i, i, 1);
+            br(GE, CONVERTION_FOR);
+        }
+        add(rscratch2, sp, 160); // base for fq
+        // reusing twoOverPiBase
+        lea(twoOverPiBase, ExternalAddress(pio2));
+
+      block_comment("compute PIo2[0,...,jp]*q[jz,...,0]. for(i=jz;i>=0;i--) {...}"); {
+          movw(i, jz);
+          movw(tmp2, zr); // tmp2 will keep jz - i == 0 at start
+        bind(COMP_FOR);
+          // for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
+          fmovd(v30, 0.0d);
+          add(tmp5, qBase, i, LSL, 3); // address of q[i+k] for k==0
+          movw(tmp3, 4);
+          movw(tmp4, zr);              // used as k
+          cmpw(tmp2, 4);
+          add(tmp1, qBase, i, LSL, 3); // used as q[i] address
+          cselw(tmp3, tmp2, tmp3, LE); // min(jz - i, jp)
+
+          block_comment("for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];"); {
+            bind(COMP_INNER_LOOP);
+              ldrd(v18, Address(tmp1, tmp4, Address::lsl(3)));          // q[i+k]
+              ldrd(v19, Address(twoOverPiBase, tmp4, Address::lsl(3))); // PIo2[k]
+              fmaddd(v30, v18, v19, v30);                               // fw += PIo2[k]*q[i+k];
+              addw(tmp4, tmp4, 1);                                      // k++
+              cmpw(tmp4, tmp3);
+              br(LE, COMP_INNER_LOOP);
+          }
+          strd(v30, Address(rscratch2, tmp2, Address::lsl(3)));         // fq[jz-i]
+          add(tmp2, tmp2, 1);
+          subsw(i, i, 1);
+          br(GE, COMP_FOR);
+      }
+
+      block_comment("switch(prec) {...}. case 2:"); {
+        // compress fq into y[]
+        // remember prec == 2
+
+        block_comment("for (i=jz;i>=0;i--) fw += fq[i];"); {
+            fmovd(v4, 0.0d);
+            mov(i, jz);
+          bind(FW_FOR1);
+            ldrd(v1, Address(rscratch2, i, Address::lsl(3)));
+            subsw(i, i, 1);
+            faddd(v4, v4, v1);
+            br(GE, FW_FOR1);
+        }
+        bind(FW_FOR1_DONE);
+          // v1 contains fq[0]. so, keep it so far
+          fsubd(v5, v1, v4); // fw = fq[0] - fw
+          cbzw(ih, FW_Y0_NO_NEGATION);
+          fnegd(v4, v4);
+        bind(FW_Y0_NO_NEGATION);
+
+        block_comment("for (i=1;i<=jz;i++) fw += fq[i];"); {
+            movw(i, 1);
+              cmpw(jz, 1);
+            br(LT, FW_FOR2_DONE);
+          bind(FW_FOR2);
+            ldrd(v1, Address(rscratch2, i, Address::lsl(3)));
+            addw(i, i, 1);
+            cmp(i, jz);
+            faddd(v5, v5, v1);
+            br(LE, FW_FOR2);
+        }
+        bind(FW_FOR2_DONE);
+          cbz(ih, FW_Y1_NO_NEGATION);
+          fnegd(v5, v5);
+        bind(FW_Y1_NO_NEGATION);
+          add(sp, sp, 560);
+      }
+}
+
+///* __kernel_sin( x, y, iy)
+// * kernel sin function on [-pi/4, pi/4], pi/4 ~ 0.7854
+// * Input x is assumed to be bounded by ~pi/4 in magnitude.
+// * Input y is the tail of x.
+// * Input iy indicates whether y is 0. (if iy=0, y assume to be 0).
+// *
+// * Algorithm
+// *      1. Since sin(-x) = -sin(x), we need only to consider positive x.
+// *      2. if x < 2^-27 (hx<0x3e400000 0), return x with inexact if x!=0.
+// *      3. sin(x) is approximated by a polynomial of degree 13 on
+// *         [0,pi/4]
+// *                               3            13
+// *              sin(x) ~ x + S1*x + ... + S6*x
+// *         where
+// *
+// *      |sin(x)         2     4     6     8     10     12  |     -58
+// *      |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
+// *      |  x                                               |
+// *
+// *      4. sin(x+y) = sin(x) + sin'(x')*y
+// *                  ~ sin(x) + (1-x*x/2)*y
+// *         For better accuracy, let
+// *                   3      2      2      2      2
+// *              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
+// *         then                   3    2
+// *              sin(x) = x + (S1*x + (x *(r-y/2)+y))
+// */
+//static const double
+//S1  = -1.66666666666666324348e-01, /* 0xBFC55555, 0x55555549 */
+//S2  =  8.33333333332248946124e-03, /* 0x3F811111, 0x1110F8A6 */
+//S3  = -1.98412698298579493134e-04, /* 0xBF2A01A0, 0x19C161D5 */
+//S4  =  2.75573137070700676789e-06, /* 0x3EC71DE3, 0x57B1FE7D */
+//S5  = -2.50507602534068634195e-08, /* 0xBE5AE5E6, 0x8A2B9CEB */
+//S6  =  1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
+//
+// NOTE: S1..S6 were moved into a table: StubRoutines::aarch64::_dsin_coef
+//
+// BEGIN __kernel_sin PSEUDO CODE
+//
+//static double __kernel_sin(double x, double y, bool iy)
+//{
+//        double z,r,v;
+//
+//        // NOTE: not needed. moved to dsin/dcos
+//        //int ix;
+//        //ix = high(x)&0x7fffffff;                /* high word of x */
+//
+//        // NOTE: moved to dsin/dcos
+//        //if(ix<0x3e400000)                       /* |x| < 2**-27 */
+//        //   {if((int)x==0) return x;}            /* generate inexact */
+//
+//        z       =  x*x;
+//        v       =  z*x;
+//        r       =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
+//        if(iy==0) return x+v*(S1+z*r);
+//        else      return x-((z*(half*y-v*r)-y)-v*S1);
+//}
+//
+// END __kernel_sin PSEUDO CODE
+//
+// Changes between fdlibm and intrinsic:
+//     1. Removed |x| < 2**-27 check, because if was done earlier in dsin/dcos
+//     2. Constants are now loaded from table dsin_coef
+//     3. C code parameter "int iy" was modified to "bool iyIsOne", because
+//         iy is always 0 or 1. Also, iyIsOne branch was moved into
+//         generation phase instead of taking it during code execution
+// Input ans output:
+//     1. Input for generated function: X argument = x
+//     2. Input for generator: x = register to read argument from, iyIsOne
+//         = flag to use low argument low part or not, dsin_coef = coefficients
+//         table address
+//     3. Return sin(x) value in v0
+void MacroAssembler::generate_kernel_sin(FloatRegister x, bool iyIsOne,
+    address dsin_coef) {
+  FloatRegister y = v5, z = v6, v = v7, r = v16, S1 = v17, S2 = v18,
+      S3 = v19, S4 = v20, S5 = v21, S6 = v22, half = v23;
+  lea(rscratch2, ExternalAddress(dsin_coef));
+  ldpd(S5, S6, Address(rscratch2, 32));
+  fmuld(z, x, x); // z =  x*x;
+  ld1(S1, S2, S3, S4, T1D, Address(rscratch2));
+  fmuld(v, z, x); // v =  z*x;
+
+  block_comment("calculate r =  S2+z*(S3+z*(S4+z*(S5+z*S6)))"); {
+    fmaddd(r, z, S6, S5);
+    // initialize "half" in current block to utilize 2nd FPU. However, it's
+    // not a part of this block
+    fmovd(half, 0.5);
+    fmaddd(r, z, r, S4);
+    fmaddd(r, z, r, S3);
+    fmaddd(r, z, r, S2);
+  }
+
+  if (!iyIsOne) {
+    // return x+v*(S1+z*r);
+    fmaddd(S1, z, r, S1);
+    fmaddd(v0, v, S1, x);
+  } else {
+    // return x-((z*(half*y-v*r)-y)-v*S1);
+    fmuld(S6, half, y);    // half*y
+    fmsubd(S6, v, r, S6);  // half*y-v*r
+    fmsubd(S6, z, S6, y);  // y - z*(half*y-v*r) = - (z*(half*y-v*r)-y)
+    fmaddd(S6, v, S1, S6); // - (z*(half*y-v*r)-y) + v*S1 == -((z*(half*y-v*r)-y)-v*S1)
+    faddd(v0, x, S6);
+  }
+}
+
+///*
+// * __kernel_cos( x,  y )
+// * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
+// * Input x is assumed to be bounded by ~pi/4 in magnitude.
+// * Input y is the tail of x.
+// *
+// * Algorithm
+// *      1. Since cos(-x) = cos(x), we need only to consider positive x.
+// *      2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
+// *      3. cos(x) is approximated by a polynomial of degree 14 on
+// *         [0,pi/4]
+// *                                       4            14
+// *              cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
+// *         where the remez error is
+// *
+// *      |              2     4     6     8     10    12     14 |     -58
+// *      |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
+// *      |                                                      |
+// *
+// *                     4     6     8     10    12     14
+// *      4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
+// *             cos(x) = 1 - x*x/2 + r
+// *         since cos(x+y) ~ cos(x) - sin(x)*y
+// *                        ~ cos(x) - x*y,
+// *         a correction term is necessary in cos(x) and hence
+// *              cos(x+y) = 1 - (x*x/2 - (r - x*y))
+// *         For better accuracy when x > 0.3, let qx = |x|/4 with
+// *         the last 32 bits mask off, and if x > 0.78125, let qx = 0.28125.
+// *         Then
+// *              cos(x+y) = (1-qx) - ((x*x/2-qx) - (r-x*y)).
+// *         Note that 1-qx and (x*x/2-qx) is EXACT here, and the
+// *         magnitude of the latter is at least a quarter of x*x/2,
+// *         thus, reducing the rounding error in the subtraction.
+// */
+//
+//static const double
+//C1  =  4.16666666666666019037e-02, /* 0x3FA55555, 0x5555554C */
+//C2  = -1.38888888888741095749e-03, /* 0xBF56C16C, 0x16C15177 */
+//C3  =  2.48015872894767294178e-05, /* 0x3EFA01A0, 0x19CB1590 */
+//C4  = -2.75573143513906633035e-07, /* 0xBE927E4F, 0x809C52AD */
+//C5  =  2.08757232129817482790e-09, /* 0x3E21EE9E, 0xBDB4B1C4 */
+//C6  = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
+//
+// NOTE: C1..C6 were moved into a table: StubRoutines::aarch64::_dcos_coef
+//
+// BEGIN __kernel_cos PSEUDO CODE
+//
+//static double __kernel_cos(double x, double y)
+//{
+//  double a,h,z,r,qx=0;
+//
+//  // NOTE: ix is already initialized in dsin/dcos. Reuse value from register
+//  //int ix;
+//  //ix = high(x)&0x7fffffff;              /* ix = |x|'s high word*/
+//
+//  // NOTE: moved to dsin/dcos
+//  //if(ix<0x3e400000) {                   /* if x < 2**27 */
+//  //  if(((int)x)==0) return one;         /* generate inexact */
+//  //}
+//
+//  z  = x*x;
+//  r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
+//  if(ix < 0x3FD33333)                   /* if |x| < 0.3 */
+//    return one - (0.5*z - (z*r - x*y));
+//  else {
+//    if(ix > 0x3fe90000) {               /* x > 0.78125 */
+//      qx = 0.28125;
+//    } else {
+//      set_high(&qx, ix-0x00200000); /* x/4 */
+//      set_low(&qx, 0);
+//    }
+//    h = 0.5*z-qx;
+//    a = one-qx;
+//    return a - (h - (z*r-x*y));
+//  }
+//}
+//
+// END __kernel_cos PSEUDO CODE
+//
+// Changes between fdlibm and intrinsic:
+//     1. Removed |x| < 2**-27 check, because if was done earlier in dsin/dcos
+//     2. Constants are now loaded from table dcos_coef
+// Input and output:
+//     1. Input for generated function: X argument = x
+//     2. Input for generator: x = register to read argument from, dcos_coef
+//        = coefficients table address
+//     2. Return cos(x) value in v0
+void MacroAssembler::generate_kernel_cos(FloatRegister x, address dcos_coef) {
+  Register ix = r3;
+  FloatRegister qx = v1, h = v2, a = v3, y = v5, z = v6, r = v7, C1 = v18,
+      C2 = v19, C3 = v20, C4 = v21, C5 = v22, C6 = v23, one = v25, half = v26;
+  Label IX_IS_LARGE, SET_QX_CONST, DONE, QX_SET;
+    lea(rscratch2, ExternalAddress(dcos_coef));
+    ldpd(C5, C6, Address(rscratch2, 32));         // load C5, C6
+    fmuld(z, x, x);                               // z=x^2
+    ld1(C1, C2, C3, C4, T1D, Address(rscratch2)); // load C1..C3\4
+    block_comment("calculate r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))))"); {
+      fmaddd(r, z, C6, C5);
+      fmovd(half, 0.5d);
+      fmaddd(r, z, r, C4);
+      fmuld(y, x, y);
+      fmaddd(r, z, r, C3);
+      mov(rscratch1, 0x3FD33333);
+      fmaddd(r, z, r, C2);
+      fmuld(x, z, z);                             // x = z^2
+      fmaddd(r, z, r, C1);                        // r = C1+z(C2+z(C4+z(C5+z*C6)))
+    }
+    // need to multiply r by z to have "final" r value
+    fmovd(one, 1.0d);
+    cmp(ix, rscratch1);
+    br(GT, IX_IS_LARGE);
+    block_comment("if(ix < 0x3FD33333) return one - (0.5*z - (z*r - x*y))"); {
+      // return 1.0 - (0.5*z - (z*r - x*y)) = 1.0 - (0.5*z + (x*y - z*r))
+      fmsubd(v0, x, r, y);
+      fmaddd(v0, half, z, v0);
+      fsubd(v0, one, v0);
+      b(DONE);
+    }
+  block_comment("if(ix >= 0x3FD33333)"); {
+    bind(IX_IS_LARGE);
+      movz(rscratch2, 0x3FE9, 16);
+      cmp(ix, rscratch2);
+      br(GT, SET_QX_CONST);
+      block_comment("set_high(&qx, ix-0x00200000); set_low(&qx, 0);"); {
+        subw(rscratch2, ix, 0x00200000);
+        lsl(rscratch2, rscratch2, 32);
+        fmovd(qx, rscratch2);
+      }
+      b(QX_SET);
+    bind(SET_QX_CONST);
+      block_comment("if(ix > 0x3fe90000) qx = 0.28125;"); {
+        fmovd(qx, 0.28125d);
+      }
+    bind(QX_SET);
+      fnmsub(C6, x, r, y);    // z*r - xy
+      fnmsub(h, half, z, qx); // h = 0.5*z - qx
+      fsubd(a, one, qx);      // a = 1-qx
+      fsubd(C6, h, C6);       // = h - (z*r - x*y)
+      fsubd(v0, a, C6);
+  }
+  bind(DONE);
+}
+
+// generate_dsin_dcos creates stub for dsin and dcos
+// Generation is done via single call because dsin and dcos code is almost the
+// same(see C code below). These functions work as follows:
+// 1) handle corner cases: |x| ~< pi/4, x is NaN or INF, |x| < 2**-27
+// 2) perform argument reduction if required
+// 3) call kernel_sin or kernel_cos which approximate sin/cos via polynomial
+//
+// BEGIN dsin/dcos PSEUDO CODE
+//
+//dsin_dcos(jdouble x, bool isCos) {
+//  double y[2],z=0.0;
+//  int n, ix;
+//
+//  /* High word of x. */
+//  ix = high(x);
+//
+//  /* |x| ~< pi/4 */
+//  ix &= 0x7fffffff;
+//  if(ix <= 0x3fe921fb) return isCos ? __kernel_cos : __kernel_sin(x,z,0);
+//
+//  /* sin/cos(Inf or NaN) is NaN */
+//  else if (ix>=0x7ff00000) return x-x;
+//  else if (ix<0x3e400000) {                   /* if ix < 2**27 */
+//    if(((int)x)==0) return isCos ? one : x;         /* generate inexact */
+//  }
+//  /* argument reduction needed */
+//  else {
+//    n = __ieee754_rem_pio2(x,y);
+//    switch(n&3) {
+//    case 0: return isCos ?  __kernel_cos(y[0],y[1])      :  __kernel_sin(y[0],y[1], true);
+//    case 1: return isCos ? -__kernel_sin(y[0],y[1],true) :  __kernel_cos(y[0],y[1]);
+//    case 2: return isCos ? -__kernel_cos(y[0],y[1])      : -__kernel_sin(y[0],y[1], true);
+//    default:
+//      return isCos ? __kernel_sin(y[0],y[1],1) : -__kernel_cos(y[0],y[1]);
+//    }
+//  }
+//}
+// END dsin/dcos PSEUDO CODE
+//
+// Changes between fdlibm and intrinsic:
+//     1. Moved ix < 2**27 from kernel_sin/kernel_cos into dsin/dcos
+//     2. Final switch use equivalent bit checks(tbz/tbnz)
+// Input ans output:
+//     1. Input for generated function: X = r0
+//     2. Input for generator: isCos = generate sin or cos, npio2_hw = address
+//         of npio2_hw table, two_over_pi = address of two_over_pi table,
+//         pio2 = address if pio2 table, dsin_coef = address if dsin_coef table,
+//         dcos_coef = address of dcos_coef table
+//     3. Return result in v0
+// NOTE: general purpose register names match local variable names in C code
+void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
+    address two_over_pi, address pio2, address dsin_coef, address dcos_coef) {
+  const int POSITIVE_INFINITY_OR_NAN_PREFIX = 0x7FF0;
+
+  Label DONE, ARG_REDUCTION, TINY_X, RETURN_SIN, EARLY_CASE;
+  Register X = r0, absX = r1, n = r2, ix = r3;
+  FloatRegister y0 = v4, y1 = v5;
+    block_comment("check |x| ~< pi/4, NaN, Inf and |x| < 2**-27 cases"); {
+      fmovd(X, v0);
+      mov(rscratch2, 0x3e400000);
+      mov(rscratch1, 0x3fe921fb00000000);            // pi/4. shifted to reuse later
+      ubfm(absX, X, 0, 62);                          // absX
+      movz(r10, POSITIVE_INFINITY_OR_NAN_PREFIX, 48);
+      cmp(rscratch2, absX, LSR, 32);
+      lsr(ix, absX, 32);                             // set ix
+      br(GT, TINY_X);                                // handle tiny x (|x| < 2^-27)
+      cmp(ix, rscratch1, LSR, 32);
+      br(LE, EARLY_CASE);                            // if(ix <= 0x3fe921fb) return
+      cmp(absX, r10);
+      br(LT, ARG_REDUCTION);
+      // X is NaN or INF(i.e. 0x7FF* or 0xFFF*). Return NaN (mantissa != 0).
+      // Set last bit unconditionally to make it NaN
+      orr(r10, r10, 1);
+      fmovd(v0, r10);
+      ret(lr);
+    }
+  block_comment("kernel_sin/kernel_cos: if(ix<0x3e400000) {<fast return>}"); {
+    bind(TINY_X);
+      if (isCos) {
+        fmovd(v0, 1.0d);
+      }
+      ret(lr);
+  }
+  bind(ARG_REDUCTION); /* argument reduction needed */
+    block_comment("n = __ieee754_rem_pio2(x,y);"); {
+      generate__ieee754_rem_pio2(npio2_hw, two_over_pi, pio2);
+    }
+    block_comment("switch(n&3) {case ... }"); {
+      if (isCos) {
+        eorw(absX, n, n, LSR, 1);
+        tbnz(n, 0, RETURN_SIN);
+      } else {
+        tbz(n, 0, RETURN_SIN);
+      }
+      generate_kernel_cos(y0, dcos_coef);
+      if (isCos) {
+        tbz(absX, 0, DONE);
+      } else {
+        tbz(n, 1, DONE);
+      }
+      fnegd(v0, v0);
+      ret(lr);
+    bind(RETURN_SIN);
+      generate_kernel_sin(y0, true, dsin_coef);
+      if (isCos) {
+        tbz(absX, 0, DONE);
+      } else {
+        tbz(n, 1, DONE);
+      }
+      fnegd(v0, v0);
+      ret(lr);
+    }
+  bind(EARLY_CASE);
+    eor(y1, T8B, y1, y1);
+    if (isCos) {
+      generate_kernel_cos(v0, dcos_coef);
+    } else {
+      generate_kernel_sin(v0, false, dsin_coef);
+    }
+  bind(DONE);
+    ret(lr);
+}
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Mon Jun 25 16:30:16 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Mon Jun 25 16:30:49 2018 +0300
@@ -3990,6 +3990,18 @@
     return entry;
   }
 
+  address generate_dsin_dcos(bool isCos) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
+    address start = __ pc();
+    __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
+        (address)StubRoutines::aarch64::_two_over_pi,
+        (address)StubRoutines::aarch64::_pio2,
+        (address)StubRoutines::aarch64::_dsin_coef,
+        (address)StubRoutines::aarch64::_dcos_coef);
+    return start;
+  }
+
   address generate_dlog() {
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "dlog");
@@ -5057,6 +5069,8 @@
     }
 
     StubRoutines::_dlog = generate_dlog();
+    StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
+    StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
   }
 
   void generate_all() {
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Mon Jun 25 16:30:16 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Mon Jun 25 16:30:49 2018 +0300
@@ -278,3 +278,87 @@
     0x02D578EDUL, 0x7DAEED62UL,         // word swap
     0xD502ED78UL, 0xAE7D62EDUL,         // byte swap of word swap
 };
+
+juint StubRoutines::aarch64::_npio2_hw[] __attribute__ ((aligned(64))) = {
+    // first, various coefficient values: 0.5, invpio2, pio2_1, pio2_1t, pio2_2,
+    // pio2_2t, pio2_3, pio2_3t
+    // This is a small optimization wich keeping double[8] values in int[] table
+    // to have less address calculation instructions
+    //
+    // invpio2:  53 bits of 2/pi (enough for cases when trigonometric argument is small)
+    // pio2_1:   first  33 bit of pi/2
+    // pio2_1t:  pi/2 - pio2_1
+    // pio2_2:   second 33 bit of pi/2
+    // pio2_2t:  pi/2 - (pio2_1+pio2_2)
+    // pio2_3:   third  33 bit of pi/2
+    // pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
+    0x00000000, 0x3fe00000, // 0.5
+    0x6DC9C883, 0x3FE45F30, // invpio2 = 6.36619772367581382433e-01
+    0x54400000, 0x3FF921FB, // pio2_1 = 1.57079632673412561417e+00
+    0x1A626331, 0x3DD0B461, // pio2_1t = 6.07710050650619224932e-11
+    0x1A600000, 0x3DD0B461, // pio2_2 = 6.07710050630396597660e-11
+    0x2E037073, 0x3BA3198A, // pio2_2t = 2.02226624879595063154e-21
+    0x2E000000, 0x3BA3198A, // pio2_3 = 2.02226624871116645580e-21
+    0x252049C1, 0x397B839A, // pio2_3t = 8.47842766036889956997e-32
+    // now, npio2_hw itself
+    0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C,
+    0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
+    0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A,
+    0x403DD85A, 0x403F6A7A, 0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
+    0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB, 0x4046C6CB, 0x40478FDB,
+    0x404858EB, 0x404921FB
+};
+
+// Coefficients for sin(x) polynomial approximation: S1..S6.
+// See kernel_sin comments in macroAssembler_aarch64_trig.cpp for details
+jdouble StubRoutines::aarch64::_dsin_coef[] __attribute__ ((aligned(64))) = {
+    -1.66666666666666324348e-01, // 0xBFC5555555555549
+     8.33333333332248946124e-03, // 0x3F8111111110F8A6
+    -1.98412698298579493134e-04, // 0xBF2A01A019C161D5
+     2.75573137070700676789e-06, // 0x3EC71DE357B1FE7D
+    -2.50507602534068634195e-08, // 0xBE5AE5E68A2B9CEB
+     1.58969099521155010221e-10  // 0x3DE5D93A5ACFD57C
+};
+
+// Coefficients for cos(x) polynomial approximation: C1..C6.
+// See kernel_cos comments in macroAssembler_aarch64_trig.cpp for details
+jdouble StubRoutines::aarch64::_dcos_coef[] __attribute__ ((aligned(64))) = {
+     4.16666666666666019037e-02, // c0x3FA555555555554C
+    -1.38888888888741095749e-03, // 0xBF56C16C16C15177
+     2.48015872894767294178e-05, // 0x3EFA01A019CB1590
+    -2.75573143513906633035e-07, // 0xBE927E4F809C52AD
+     2.08757232129817482790e-09, // 0x3E21EE9EBDB4B1C4
+    -1.13596475577881948265e-11  // 0xBDA8FAE9BE8838D4
+};
+
+// Table of constants for 2/pi, 396 Hex digits (476 decimal) of 2/pi.
+// Used in cases of very large argument. 396 hex digits is enough to support
+// required precision.
+// Converted to double to avoid unnecessary conversion in code
+// NOTE: table looks like original int table: {0xA2F983, 0x6E4E44,...} with
+//       only (double) conversion added
+jdouble StubRoutines::aarch64::_two_over_pi[] __attribute__ ((aligned(64))) = {
+  (double)0xA2F983, (double)0x6E4E44, (double)0x1529FC, (double)0x2757D1, (double)0xF534DD, (double)0xC0DB62,
+  (double)0x95993C, (double)0x439041, (double)0xFE5163, (double)0xABDEBB, (double)0xC561B7, (double)0x246E3A,
+  (double)0x424DD2, (double)0xE00649, (double)0x2EEA09, (double)0xD1921C, (double)0xFE1DEB, (double)0x1CB129,
+  (double)0xA73EE8, (double)0x8235F5, (double)0x2EBB44, (double)0x84E99C, (double)0x7026B4, (double)0x5F7E41,
+  (double)0x3991D6, (double)0x398353, (double)0x39F49C, (double)0x845F8B, (double)0xBDF928, (double)0x3B1FF8,
+  (double)0x97FFDE, (double)0x05980F, (double)0xEF2F11, (double)0x8B5A0A, (double)0x6D1F6D, (double)0x367ECF,
+  (double)0x27CB09, (double)0xB74F46, (double)0x3F669E, (double)0x5FEA2D, (double)0x7527BA, (double)0xC7EBE5,
+  (double)0xF17B3D, (double)0x0739F7, (double)0x8A5292, (double)0xEA6BFB, (double)0x5FB11F, (double)0x8D5D08,
+  (double)0x560330, (double)0x46FC7B, (double)0x6BABF0, (double)0xCFBC20, (double)0x9AF436, (double)0x1DA9E3,
+  (double)0x91615E, (double)0xE61B08, (double)0x659985, (double)0x5F14A0, (double)0x68408D, (double)0xFFD880,
+  (double)0x4D7327, (double)0x310606, (double)0x1556CA, (double)0x73A8C9, (double)0x60E27B, (double)0xC08C6B,
+};
+
+// Pi over 2 value
+jdouble StubRoutines::aarch64::_pio2[] __attribute__ ((aligned(64))) = {
+  1.57079625129699707031e+00, // 0x3FF921FB40000000
+  7.54978941586159635335e-08, // 0x3E74442D00000000
+  5.39030252995776476554e-15, // 0x3CF8469880000000
+  3.28200341580791294123e-22, // 0x3B78CC5160000000
+  1.27065575308067607349e-29, // 0x39F01B8380000000
+  1.22933308981111328932e-36, // 0x387A252040000000
+  2.73370053816464559624e-44, // 0x36E3822280000000
+  2.16741683877804819444e-51, // 0x3569F31D00000000
+};
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Mon Jun 25 16:30:16 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Mon Jun 25 16:30:49 2018 +0300
@@ -41,7 +41,7 @@
 
 enum platform_dependent_constants {
   code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 22000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 28000           // simply increase if too small (assembler will crash if too small)
 };
 
 class aarch64 {
@@ -146,7 +146,13 @@
 
 private:
   static juint    _crc_table[];
-
+  // begin trigonometric tables block. See comments in .cpp file
+  static juint    _npio2_hw[];
+  static jdouble   _two_over_pi[];
+  static jdouble   _pio2[];
+  static jdouble   _dsin_coef[];
+  static jdouble  _dcos_coef[];
+  // end trigonometric tables block
 };
 
 #endif // CPU_AARCH64_VM_STUBROUTINES_AARCH64_HPP