view src/hotspot/cpu/x86/stubRoutines_x86.cpp @ 54526:ee29b516a36a

revert changes
author jlaskey
date Wed, 23 Jan 2019 16:09:20 -0400
parents 480a96a43b62
children 1851a532ddfe
line wrap: on
line source
/*
 * Copyright (c) 2013, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "runtime/deoptimization.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/globalDefinitions.hpp"
#include "crc32c.h"

// Implementation of the platform-specific part of StubRoutines - for
// a description of how to extend it, see the stubRoutines.hpp file.

address StubRoutines::x86::_verify_mxcsr_entry = NULL;
address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
address StubRoutines::x86::_counter_shuffle_mask_addr = NULL;
address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
address StubRoutines::x86::_ghash_poly_addr = NULL;
address StubRoutines::x86::_ghash_shuffmask_addr = NULL;
address StubRoutines::x86::_upper_word_mask_addr = NULL;
address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL;
address StubRoutines::x86::_k256_adr = NULL;
#ifdef _LP64
address StubRoutines::x86::_k256_W_adr = NULL;
address StubRoutines::x86::_k512_W_addr = NULL;
address StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = NULL;
// Base64 masks
address StubRoutines::x86::_bswap_mask = NULL;
address StubRoutines::x86::_base64_charset = NULL;
address StubRoutines::x86::_gather_mask = NULL;
address StubRoutines::x86::_right_shift_mask = NULL;
address StubRoutines::x86::_left_shift_mask = NULL;
address StubRoutines::x86::_and_mask = NULL;
address StubRoutines::x86::_url_charset = NULL;

#endif
address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = NULL;

//tables common for sin and cos
address StubRoutines::x86::_ONEHALF_adr = NULL;
address StubRoutines::x86::_P_2_adr = NULL;
address StubRoutines::x86::_SC_4_adr = NULL;
address StubRoutines::x86::_Ctable_adr = NULL;
address StubRoutines::x86::_SC_2_adr = NULL;
address StubRoutines::x86::_SC_3_adr = NULL;
address StubRoutines::x86::_SC_1_adr = NULL;
address StubRoutines::x86::_PI_INV_TABLE_adr = NULL;
address StubRoutines::x86::_PI_4_adr = NULL;
address StubRoutines::x86::_PI32INV_adr = NULL;
address StubRoutines::x86::_SIGN_MASK_adr = NULL;
address StubRoutines::x86::_P_1_adr = NULL;
address StubRoutines::x86::_P_3_adr = NULL;
address StubRoutines::x86::_NEG_ZERO_adr = NULL;

//tables common for sincos and tancot
address StubRoutines::x86::_L_2il0floatpacket_0_adr = NULL;
address StubRoutines::x86::_Pi4Inv_adr = NULL;
address StubRoutines::x86::_Pi4x3_adr = NULL;
address StubRoutines::x86::_Pi4x4_adr = NULL;
address StubRoutines::x86::_ones_adr = NULL;

uint64_t StubRoutines::x86::_crc_by128_masks[] =
{
  /* The fields in this structure are arranged so that they can be
   * picked up two at a time with 128-bit loads.
   *
   * Because of flipped bit order for this CRC polynomials
   * the constant for X**N is left-shifted by 1.  This is because
   * a 64 x 64 polynomial multiply produces a 127-bit result
   * but the highest term is always aligned to bit 0 in the container.
   * Pre-shifting by one fixes this, at the cost of potentially making
   * the 32-bit constant no longer fit in a 32-bit container (thus the
   * use of uint64_t, though this is also the size used by the carry-
   * less multiply instruction.
   *
   * In addition, the flipped bit order and highest-term-at-least-bit
   * multiply changes the constants used.  The 96-bit result will be
   * aligned to the high-term end of the target 128-bit container,
   * not the low-term end; that is, instead of a 512-bit or 576-bit fold,
   * instead it is a 480 (=512-32) or 544 (=512+64-32) bit fold.
   *
   * This cause additional problems in the 128-to-64-bit reduction; see the
   * code for details.  By storing a mask in the otherwise unused half of
   * a 128-bit constant, bits can be cleared before multiplication without
   * storing and reloading.  Note that staying on a 128-bit datapath means
   * that some data is uselessly stored and some unused data is intersected
   * with an irrelevant constant.
   */

  ((uint64_t) 0xffffffffUL),     /* low  of K_M_64    */
  ((uint64_t) 0xb1e6b092U << 1), /* high of K_M_64    */
  ((uint64_t) 0xba8ccbe8U << 1), /* low  of K_160_96  */
  ((uint64_t) 0x6655004fU << 1), /* high of K_160_96  */
  ((uint64_t) 0xaa2215eaU << 1), /* low  of K_544_480 */
  ((uint64_t) 0xe3720acbU << 1)  /* high of K_544_480 */
};

/**
 *  crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h
 */
juint StubRoutines::x86::_crc_table[] =
{
    0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
    0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
    0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
    0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
    0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
    0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
    0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
    0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
    0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
    0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
    0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
    0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
    0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
    0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
    0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
    0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
    0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
    0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
    0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
    0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
    0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
    0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
    0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
    0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
    0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
    0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
    0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
    0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
    0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
    0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
    0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
    0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
    0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
    0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
    0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
    0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
    0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
    0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
    0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
    0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
    0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
    0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
    0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
    0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
    0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
    0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
    0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
    0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
    0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
    0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
    0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
    0x2d02ef8dUL
};

#define D 32
#define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41)

#define TILL_CYCLE 31
uint32_t _crc32c_pow_2k_table[TILL_CYCLE]; // because _crc32c_pow_2k_table[TILL_CYCLE == 31] == _crc32c_pow_2k_table[0]

// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 8
// Listing 1: Multiplication of normalized polynomials
// "a" and "b" occupy D least significant bits.
uint32_t crc32c_multiply(uint32_t a, uint32_t b) {
  uint32_t product = 0;
  uint32_t b_pow_x_table[D + 1]; // b_pow_x_table[k] = (b * x**k) mod P
  b_pow_x_table[0] = b;
  for (int k = 0; k < D; ++k) {
    // If "a" has non-zero coefficient at x**k,/ add ((b * x**k) mod P) to the result.
    if ((a & (((uint32_t)1) << (D - 1 - k))) != 0) product ^= b_pow_x_table[k];

    // Compute b_pow_x_table[k+1] = (b ** x**(k+1)) mod P.
    if (b_pow_x_table[k] & 1) {
      // If degree of (b_pow_x_table[k] * x) is D, then
      // degree of (b_pow_x_table[k] * x - P) is less than D.
      b_pow_x_table[k + 1] = (b_pow_x_table[k] >> 1) ^ P;
    }
    else {
      b_pow_x_table[k + 1] = b_pow_x_table[k] >> 1;
    }
  }
  return product;
}
#undef D
#undef P

// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 9
void crc32c_init_pow_2k(void) {
  // _crc32c_pow_2k_table(0) =
  // x^(2^k) mod P(x) = x mod P(x) = x
  // Since we are operating on a reflected values
  // x = 10b, reflect(x) = 0x40000000
  _crc32c_pow_2k_table[0] = 0x40000000;

  for (int k = 1; k < TILL_CYCLE; k++) {
    // _crc32c_pow_2k_table(k+1) = _crc32c_pow_2k_table(k-1)^2 mod P(x)
    uint32_t tmp = _crc32c_pow_2k_table[k - 1];
    _crc32c_pow_2k_table[k] = crc32c_multiply(tmp, tmp);
  }
}

// x^N mod P(x)
uint32_t crc32c_f_pow_n(uint32_t n) {
  //            result = 1 (polynomial)
  uint32_t one, result = 0x80000000, i = 0;

  while (one = (n & 1), (n == 1 || n - one > 0)) {
    if (one) {
      result = crc32c_multiply(result, _crc32c_pow_2k_table[i]);
    }
    n >>= 1;
    i++;
  }

  return result;
}

juint *StubRoutines::x86::_crc32c_table;

void StubRoutines::x86::generate_CRC32C_table(bool is_pclmulqdq_table_supported) {

  static juint pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS];

  crc32c_init_pow_2k();

  pow_n[0] = crc32c_f_pow_n(CRC32C_HIGH * 8);      // 8N * 8 = 64N
  pow_n[1] = crc32c_f_pow_n(CRC32C_HIGH * 8 * 2);  // 128N

  pow_n[2] = crc32c_f_pow_n(CRC32C_MIDDLE * 8);
  pow_n[3] = crc32c_f_pow_n(CRC32C_MIDDLE * 8 * 2);

  pow_n[4] = crc32c_f_pow_n(CRC32C_LOW * 8);
  pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1] =
            crc32c_f_pow_n(CRC32C_LOW * 8 * 2);

  if (is_pclmulqdq_table_supported) {
    _crc32c_table = pow_n;
  } else {
    static julong pclmulqdq_table[CRC32C_NUM_PRECOMPUTED_CONSTANTS * 256];

    for (int j = 0; j < CRC32C_NUM_PRECOMPUTED_CONSTANTS; j++) {
      static juint X_CONST = pow_n[j];
      for (int64_t i = 0; i < 256; i++) { // to force 64 bit wide computations
      // S. Gueron / Information Processing Letters 112 (2012) 184
      // Algorithm 3: Generating a carry-less multiplication lookup table.
      // Input: A 32-bit constant, X_CONST.
      // Output: A table of 256 entries, each one is a 64-bit quadword,
      // that can be used for computing "byte" * X_CONST, for a given byte.
        pclmulqdq_table[j * 256 + i] =
          ((i & 1) * X_CONST) ^ ((i & 2) * X_CONST) ^ ((i & 4) * X_CONST) ^
          ((i & 8) * X_CONST) ^ ((i & 16) * X_CONST) ^ ((i & 32) * X_CONST) ^
          ((i & 64) * X_CONST) ^ ((i & 128) * X_CONST);
      }
    }
    _crc32c_table = (juint*)pclmulqdq_table;
  }
}

ATTRIBUTE_ALIGNED(64) juint StubRoutines::x86::_k256[] =
{
    0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
    0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
    0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
    0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
    0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
    0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
    0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
    0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
    0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
    0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
    0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
    0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
    0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
    0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
    0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
    0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
};

#ifdef _LP64
// used in MacroAssembler::sha256_AVX2
// dynamically built from _k256
ATTRIBUTE_ALIGNED(64) juint StubRoutines::x86::_k256_W[2*sizeof(StubRoutines::x86::_k256)];

// used in MacroAssembler::sha512_AVX2
ATTRIBUTE_ALIGNED(64) julong StubRoutines::x86::_k512_W[] =
{
    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
    0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
    0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
    0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
    0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
    0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
    0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
    0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
    0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
    0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
    0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
    0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
    0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
    0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
    0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
    0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
    0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
    0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
    0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
    0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
    0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
    0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
    0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
    0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
    0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
    0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
    0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
    0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
    0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
    0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
    0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
    0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
    0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
    0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
    0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
    0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
    0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
    0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
    0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
    0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};
#endif