view src/hotspot/cpu/x86/assembler_x86.hpp @ 51333:cc7fc46cc8c1

8205398: AES-CBC decryption algorithm using AVX512 instructions Reviewed-by: kvn Contributed-by: regev.shemy@intel.com, shay.gueron@intel.com, smita.kamath@intel.com, shravya.rukmannagari@intel.com
author kvn
date Thu, 21 Jun 2018 10:54:07 -0700
parents bf7e2684cd0a
children 480a96a43b62
line wrap: on
line source
/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef CPU_X86_VM_ASSEMBLER_X86_HPP
#define CPU_X86_VM_ASSEMBLER_X86_HPP

#include "asm/register.hpp"
#include "vm_version_x86.hpp"

class BiasedLockingCounters;

// Contains all the definitions needed for x86 assembly code generation.

// Calling convention
class Argument {
 public:
  // Number of arguments that are passed in registers.  The "_c" constants
  // describe the native C calling convention, the "_j" constants the Java
  // calling convention.
  enum {
#if !defined(_LP64)
    n_register_parameters = 0   // 0 registers used to pass arguments
#else
    // Java convention: the same on every 64-bit platform.
    n_int_register_parameters_j   = 6,  // j_rarg0, j_rarg1, ...
    n_float_register_parameters_j = 8,  // j_farg0, j_farg1, ...
#  if defined(_WIN64)
    n_int_register_parameters_c   = 4,  // rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    n_float_register_parameters_c = 4   // xmm0 - xmm3 (c_farg0, c_farg1, ... )
#  else
    n_int_register_parameters_c   = 6,  // rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
    n_float_register_parameters_c = 8   // xmm0 - xmm7 (c_farg0, c_farg1, ... )
#  endif // _WIN64
#endif // _LP64
  };
};


#ifndef _LP64

// 32-bit VM: rscratch1 and rscratch2 only appear in code that is dead on this
// platform but must still compile.  Defining them as noreg ensures that such
// code, should it incorrectly become live and execute, fails an assertion.
#define rscratch1 noreg
#define rscratch2 noreg

#else // _LP64

// Symbolic names for the argument registers of the C calling convention.
// Windows is different from linux/solaris.

// The first four float argument registers are the same in both conventions.
REGISTER_DECLARATION(XMMRegister, c_farg0, xmm0);
REGISTER_DECLARATION(XMMRegister, c_farg1, xmm1);
REGISTER_DECLARATION(XMMRegister, c_farg2, xmm2);
REGISTER_DECLARATION(XMMRegister, c_farg3, xmm3);

#ifndef _WIN64

// linux/solaris: four more float argument registers and six integer ones.
REGISTER_DECLARATION(XMMRegister, c_farg4, xmm4);
REGISTER_DECLARATION(XMMRegister, c_farg5, xmm5);
REGISTER_DECLARATION(XMMRegister, c_farg6, xmm6);
REGISTER_DECLARATION(XMMRegister, c_farg7, xmm7);

REGISTER_DECLARATION(Register, c_rarg0, rdi);
REGISTER_DECLARATION(Register, c_rarg1, rsi);
REGISTER_DECLARATION(Register, c_rarg2, rdx);
REGISTER_DECLARATION(Register, c_rarg3, rcx);
REGISTER_DECLARATION(Register, c_rarg4, r8);
REGISTER_DECLARATION(Register, c_rarg5, r9);

#else // _WIN64

// Windows: only four integer argument registers.
REGISTER_DECLARATION(Register, c_rarg0, rcx);
REGISTER_DECLARATION(Register, c_rarg1, rdx);
REGISTER_DECLARATION(Register, c_rarg2, r8);
REGISTER_DECLARATION(Register, c_rarg3, r9);

#endif // _WIN64

// Symbolic names for the argument registers of the Java calling convention.
// The Java convention is ours to define, so it is offset by one register
// against the C convention: when a suitable jni method is called the
// arguments are then already lined up and little shuffling is needed.  A
// suitable jni method is non-static and has a small number of arguments
// (two fewer args on windows).
//
//        |-------------------------------------------------------|
//        | c_rarg0   c_rarg1  c_rarg2 c_rarg3 c_rarg4 c_rarg5    |
//        |-------------------------------------------------------|
//        | rcx       rdx      r8      r9      rdi*    rsi*       | windows (* not a c_rarg)
//        | rdi       rsi      rdx     rcx     r8      r9         | solaris/linux
//        |-------------------------------------------------------|
//        | j_rarg5   j_rarg0  j_rarg1 j_rarg2 j_rarg3 j_rarg4    |
//        |-------------------------------------------------------|

REGISTER_DECLARATION(Register, j_rarg0, c_rarg1);
REGISTER_DECLARATION(Register, j_rarg1, c_rarg2);
REGISTER_DECLARATION(Register, j_rarg2, c_rarg3);
#ifndef _WIN64
REGISTER_DECLARATION(Register, j_rarg3, c_rarg4);
REGISTER_DECLARATION(Register, j_rarg4, c_rarg5);
#else
// Windows runs out of register args here, so rdi/rsi are named directly.
REGISTER_DECLARATION(Register, j_rarg3, rdi);
REGISTER_DECLARATION(Register, j_rarg4, rsi);
#endif /* _WIN64 */
REGISTER_DECLARATION(Register, j_rarg5, c_rarg0);

REGISTER_DECLARATION(XMMRegister, j_farg0, xmm0);
REGISTER_DECLARATION(XMMRegister, j_farg1, xmm1);
REGISTER_DECLARATION(XMMRegister, j_farg2, xmm2);
REGISTER_DECLARATION(XMMRegister, j_farg3, xmm3);
REGISTER_DECLARATION(XMMRegister, j_farg4, xmm4);
REGISTER_DECLARATION(XMMRegister, j_farg5, xmm5);
REGISTER_DECLARATION(XMMRegister, j_farg6, xmm6);
REGISTER_DECLARATION(XMMRegister, j_farg7, xmm7);

// Scratch registers.
REGISTER_DECLARATION(Register, rscratch1, r10);  // volatile
REGISTER_DECLARATION(Register, rscratch2, r11);  // volatile

// Registers with a fixed role.
REGISTER_DECLARATION(Register, r12_heapbase, r12); // callee-saved
REGISTER_DECLARATION(Register, r15_thread, r15); // callee-saved

#endif // _LP64

// JSR 292
// On x86, the SP does not have to be saved when invoking method handle intrinsics
// or compiled lambda forms. We indicate that by setting rbp_mh_SP_save to noreg,
// i.e. the name is declared but deliberately bound to "no register".
REGISTER_DECLARATION(Register, rbp_mh_SP_save, noreg);

// Address is an abstraction used to represent a memory location
// using any of the amd64 addressing modes with one object.
//
// Note: A register location is represented via a Register, not
//       via an address for efficiency & simplicity reasons.

class ArrayAddress;

class Address {
 public:
  enum ScaleFactor {
    no_scale = -1,
    times_1  =  0,
    times_2  =  1,
    times_4  =  2,
    times_8  =  3,
    times_ptr = LP64_ONLY(times_8) NOT_LP64(times_4)
  };
  static ScaleFactor times(int size) {
    assert(size >= 1 && size <= 8 && is_power_of_2(size), "bad scale size");
    if (size == 8)  return times_8;
    if (size == 4)  return times_4;
    if (size == 2)  return times_2;
    return times_1;
  }
  static int scale_size(ScaleFactor scale) {
    assert(scale != no_scale, "");
    assert(((1 << (int)times_1) == 1 &&
            (1 << (int)times_2) == 2 &&
            (1 << (int)times_4) == 4 &&
            (1 << (int)times_8) == 8), "");
    return (1 << (int)scale);
  }

 private:
  Register         _base;
  Register         _index;
  ScaleFactor      _scale;
  int              _disp;
  RelocationHolder _rspec;

  // Easily misused constructors make them private
  // %%% can we make these go away?
  NOT_LP64(Address(address loc, RelocationHolder spec);)
  Address(int disp, address loc, relocInfo::relocType rtype);
  Address(int disp, address loc, RelocationHolder spec);

 public:

 int disp() { return _disp; }
  // creation
  Address()
    : _base(noreg),
      _index(noreg),
      _scale(no_scale),
      _disp(0) {
  }

  // No default displacement otherwise Register can be implicitly
  // converted to 0(Register) which is quite a different animal.

  Address(Register base, int disp)
    : _base(base),
      _index(noreg),
      _scale(no_scale),
      _disp(disp) {
  }

  Address(Register base, Register index, ScaleFactor scale, int disp = 0)
    : _base (base),
      _index(index),
      _scale(scale),
      _disp (disp) {
    assert(!index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

  Address(Register base, RegisterOrConstant index, ScaleFactor scale = times_1, int disp = 0)
    : _base (base),
      _index(index.register_or_noreg()),
      _scale(scale),
      _disp (disp + (index.constant_or_zero() * scale_size(scale))) {
    if (!index.is_register())  scale = Address::no_scale;
    assert(!_index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

  Address plus_disp(int disp) const {
    Address a = (*this);
    a._disp += disp;
    return a;
  }
  Address plus_disp(RegisterOrConstant disp, ScaleFactor scale = times_1) const {
    Address a = (*this);
    a._disp += disp.constant_or_zero() * scale_size(scale);
    if (disp.is_register()) {
      assert(!a.index()->is_valid(), "competing indexes");
      a._index = disp.as_register();
      a._scale = scale;
    }
    return a;
  }
  bool is_same_address(Address a) const {
    // disregard _rspec
    return _base == a._base && _disp == a._disp && _index == a._index && _scale == a._scale;
  }

  // The following two overloads are used in connection with the
  // ByteSize type (see sizes.hpp).  They simplify the use of
  // ByteSize'd arguments in assembly code. Note that their equivalent
  // for the optimized build are the member functions with int disp
  // argument since ByteSize is mapped to an int type in that case.
  //
  // Note: DO NOT introduce similar overloaded functions for WordSize
  // arguments as in the optimized mode, both ByteSize and WordSize
  // are mapped to the same type and thus the compiler cannot make a
  // distinction anymore (=> compiler errors).

#ifdef ASSERT
  Address(Register base, ByteSize disp)
    : _base(base),
      _index(noreg),
      _scale(no_scale),
      _disp(in_bytes(disp)) {
  }

  Address(Register base, Register index, ScaleFactor scale, ByteSize disp)
    : _base(base),
      _index(index),
      _scale(scale),
      _disp(in_bytes(disp)) {
    assert(!index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

  Address(Register base, RegisterOrConstant index, ScaleFactor scale, ByteSize disp)
    : _base (base),
      _index(index.register_or_noreg()),
      _scale(scale),
      _disp (in_bytes(disp) + (index.constant_or_zero() * scale_size(scale))) {
    if (!index.is_register())  scale = Address::no_scale;
    assert(!_index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

#endif // ASSERT

  // accessors
  bool        uses(Register reg) const { return _base == reg || _index == reg; }
  Register    base()             const { return _base;  }
  Register    index()            const { return _index; }
  ScaleFactor scale()            const { return _scale; }
  int         disp()             const { return _disp;  }

  // Convert the raw encoding form into the form expected by the constructor for
  // Address.  An index of 4 (rsp) corresponds to having no index, so convert
  // that to noreg for the Address constructor.
  static Address make_raw(int base, int index, int scale, int disp, relocInfo::relocType disp_reloc);

  static Address make_array(ArrayAddress);

 private:
  bool base_needs_rex() const {
    return _base != noreg && _base->encoding() >= 8;
  }

  bool index_needs_rex() const {
    return _index != noreg &&_index->encoding() >= 8;
  }

  relocInfo::relocType reloc() const { return _rspec.type(); }

  friend class Assembler;
  friend class MacroAssembler;
  friend class LIR_Assembler; // base/index/scale/disp
};

//
// AddressLiteral has been split out from Address because operands of this type
// need to be treated specially on 32bit vs. 64bit platforms. By splitting it out,
// the few instructions that need to deal with address literals are unique and the
// MacroAssembler does not have to implement every instruction in the Assembler
// in order to search for address literals that may need special handling depending
// on the instruction and the platform. This is a small step on the way to merging
// the i486/amd64 directories.
//
class AddressLiteral {
  friend class ArrayAddress;
  RelocationHolder _rspec;
  // Typically when we use AddressLiterals we want to use their rval.
  // However in some situations we want the lval (effective address) of the
  // item.  addr() is the special factory for making those lvals.
  bool _is_lval;

  // If the target is far we'll need to load the ea of this to
  // a register to reach it. Otherwise if near we can do rip
  // relative addressing.

  address          _target;

 protected:
  // creation: a literal with NULL target that is not an lval
  AddressLiteral()
    : _is_lval(false),
      _target(NULL)
  {}

  public:


  AddressLiteral(address target, relocInfo::relocType rtype);

  AddressLiteral(address target, RelocationHolder const& rspec)
    : _rspec(rspec),
      _is_lval(false),
      _target(target)
  {}

  // Returns a copy of this literal that is marked as an lval; this object is
  // left unchanged.
  AddressLiteral addr() const {
    AddressLiteral ret = *this;
    ret._is_lval = true;
    return ret;
  }


 private:

  // Read-only accessors for the friend classes listed below.
  address target() const { return _target; }
  bool is_lval() const { return _is_lval; }

  relocInfo::relocType reloc() const { return _rspec.type(); }
  const RelocationHolder& rspec() const { return _rspec; }

  friend class Assembler;
  friend class MacroAssembler;
  friend class Address;
  friend class LIR_Assembler;
};

// Convenience classes
class RuntimeAddress: public AddressLiteral {
 public:
  // An AddressLiteral for 'target' that carries a runtime_call_type relocation.
  RuntimeAddress(address target)
    : AddressLiteral(target, relocInfo::runtime_call_type)
  {}
};

class ExternalAddress: public AddressLiteral {
 private:
  // Picks the relocation type for 'target'.
  // Sometimes ExternalAddress is used for values which aren't
  // exactly addresses, like the card table base.
  // external_word_type can't be used for values in the first page
  // so the reloc is skipped (relocInfo::none) in that case.
  static relocInfo::relocType reloc_for_target(address target) {
    if (external_word_Relocation::can_be_relocated(target)) {
      return relocInfo::external_word_type;
    }
    return relocInfo::none;
  }

 public:
  // An AddressLiteral for 'target' with the relocation type chosen by
  // reloc_for_target().
  ExternalAddress(address target)
    : AddressLiteral(target, reloc_for_target(target))
  {}
};

class InternalAddress: public AddressLiteral {
 public:
  // An AddressLiteral for 'target' that carries an internal_word_type relocation.
  InternalAddress(address target)
    : AddressLiteral(target, relocInfo::internal_word_type)
  {}
};

// x86 can do array addressing as a single operation since disp can be an absolute
// address; amd64 can't. We create a class that expresses the concept but does extra
// magic on amd64 to get the final result.

class ArrayAddress {
  private:

  AddressLiteral _base;   // literal address the array access starts from
  Address        _index;  // register-based part of the access

  public:

  // Default construction leaves both parts default-constructed.
  ArrayAddress() {}
  ArrayAddress(AddressLiteral base, Address index): _base(base), _index(index) {}

  // Accessors return copies and do not modify this object.
  AddressLiteral base() const { return _base; }
  Address index() const { return _index; }

};

class InstructionAttr;

// 64-bit: reflects the fxsave size, which is 512 bytes, plus the new xsave area on EVEX, which is another 2176 bytes
// See fxsave and xsave(EVEX enabled) documentation for layout
const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY(2688 / wordSize);

// The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
// level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
// is what you get. The Assembler is generating code into a CodeBuffer.

class Assembler : public AbstractAssembler  {
  friend class AbstractAssembler; // for the non-virtual hack
  friend class LIR_Assembler; // as_Address()
  friend class StubGenerator;

 public:
  // The x86 condition codes used for conditional jumps/moves, listed in
  // encoding order.  Some encodings are known under two names; the second
  // name is defined in terms of the first.
  enum Condition {
    overflow      = 0x0,
    noOverflow    = 0x1,
    below         = 0x2,
    carrySet      = below,
    aboveEqual    = 0x3,
    carryClear    = aboveEqual,
    zero          = 0x4,
    equal         = zero,
    notZero       = 0x5,
    notEqual      = notZero,
    belowEqual    = 0x6,
    above         = 0x7,
    negative      = 0x8,
    positive      = 0x9,
    parity        = 0xa,
    noParity      = 0xb,
    less          = 0xc,
    greaterEqual  = 0xd,
    lessEqual     = 0xe,
    greater       = 0xf
  };

  // Instruction prefix bytes.
  enum Prefix {
    // segment overrides
    ES_segment = 0x26,
    CS_segment = 0x2e,
    SS_segment = 0x36,
    DS_segment = 0x3e,
    FS_segment = 0x64,
    GS_segment = 0x65,

    // REX prefixes: REX (0x40) combined with the B (0x1), X (0x2), R (0x4)
    // and W (0x8) bits, as the enumerator names spell out.
    REX        = 0x40,

    REX_B      = REX   | 0x1,
    REX_X      = REX   | 0x2,
    REX_XB     = REX_X | REX_B,
    REX_R      = REX   | 0x4,
    REX_RB     = REX_R | REX_B,
    REX_RX     = REX_R | REX_X,
    REX_RXB    = REX_R | REX_XB,

    REX_W      = REX   | 0x8,

    REX_WB     = REX_W | REX_B,
    REX_WX     = REX_W | REX_X,
    REX_WXB    = REX_W | REX_XB,
    REX_WR     = REX_W | REX_R,
    REX_WRB    = REX_W | REX_RB,
    REX_WRX    = REX_W | REX_RX,
    REX_WRXB   = REX_W | REX_RXB,

    // First byte of the multi-byte vector prefixes.
    VEX_3bytes  = 0xC4,
    VEX_2bytes  = 0xC5,
    EVEX_4bytes = 0x62,

    Prefix_EMPTY = 0x0
  };

  // Single-bit masks named after VEX prefix fields.  VEX_R and VEX_W have the
  // same value.
  // NOTE(review): presumably R and W live in different prefix bytes; the users
  // are in the .cpp file - confirm there.
  enum VexPrefix {
    VEX_B = 1 << 5,  // 0x20
    VEX_X = 1 << 6,  // 0x40
    VEX_R = 1 << 7,  // 0x80
    VEX_W = 1 << 7   // 0x80
  };

  // Single-bit masks named after EVEX prefix fields.
  // NOTE(review): the enum name reads like a typo for "EvexPrefix"; it is kept
  // because other code may refer to the type by this name.
  enum ExexPrefix {
    EVEX_F  = 1 << 2,  // 0x04
    EVEX_V  = 1 << 3,  // 0x08
    EVEX_Rb = 1 << 4,  // 0x10
    EVEX_X  = 1 << 6,  // 0x40
    EVEX_Z  = 1 << 7   // 0x80
  };

  // SIMD prefix selector; the values are consecutive, starting at 0.
  enum VexSimdPrefix {
    VEX_SIMD_NONE = 0,  // 0x0
    VEX_SIMD_66,        // 0x1
    VEX_SIMD_F3,        // 0x2
    VEX_SIMD_F2         // 0x3
  };

  // Opcode selector values (consecutive from 0) plus the mask covering the
  // five low bits.
  enum VexOpcode {
    VEX_OPCODE_NONE = 0,     // 0x0
    VEX_OPCODE_0F,           // 0x1
    VEX_OPCODE_0F_38,        // 0x2
    VEX_OPCODE_0F_3A,        // 0x3
    VEX_OPCODE_MASK = 0x1F
  };

  // Vector length selector.  The three real lengths are numbered 0..2;
  // AVX_NoVec is set apart at 4.
  enum AvxVectorLen {
    AVX_128bit = 0,  // 0x0
    AVX_256bit,      // 0x1
    AVX_512bit,      // 0x2
    AVX_NoVec  = 4   // 0x4
  };

  // EVEX tuple type identifiers.  The numbering has gaps up to EVEX_T8 and is
  // consecutive from there on.
  // NOTE(review): the gaps presumably leave room for per-type sub-entries in a
  // lookup table in the .cpp file - confirm there.
  enum EvexTupleType {
    EVEX_FV   = 0,
    EVEX_HV   = 4,
    EVEX_FVM  = 6,
    EVEX_T1S  = 7,
    EVEX_T1F  = 11,
    EVEX_T2   = 13,
    EVEX_T4   = 15,
    EVEX_T8   = 17,
    EVEX_HVM,         // 18
    EVEX_QVM,         // 19
    EVEX_OVM,         // 20
    EVEX_M128,        // 21
    EVEX_DUP,         // 22
    EVEX_ETUP         // 23
  };

  // Input size selector; the values are consecutive, starting at 0.
  enum EvexInputSizeInBits {
    EVEX_8bit = 0,  // 0
    EVEX_16bit,     // 1
    EVEX_32bit,     // 2
    EVEX_64bit,     // 3
    EVEX_NObit      // 4
  };

  // Input to locate_operand, and format code for relocations.
  enum WhichOperand {
    imm_operand    = 0,          // embedded 32-bit|64-bit immediate operand
    disp32_operand = 1,          // embedded 32-bit displacement or address
    call32_operand = 2,          // embedded 32-bit self-relative displacement
#ifdef _LP64
    narrow_oop_operand  = 3,     // embedded 32-bit immediate narrow oop
    _WhichOperand_limit = 4      // one past the last 64-bit operand kind
#else
    _WhichOperand_limit = 3      // one past the last 32-bit operand kind
#endif
  };

  // Comparison predicates, numbered consecutively from 0.  Each of the
  // enumerators 4..7 follows its counterpart in 0..3 by exactly four.
  enum ComparisonPredicate {
    eq = 0,   // 0
    lt,       // 1
    le,       // 2
    _false,   // 3
    neq,      // 4
    nlt,      // 5
    nle,      // 6
    _true     // 7
  };


  // NOTE: The general philosophy of the declarations here is that 64bit versions
  // of instructions are freely declared without the need for wrapping them in an ifdef.
  // (Some dangerous instructions are ifdef'ed out of inappropriate jvm's.)
  // In the .cpp file the implementations are wrapped so that they are dropped out
  // of the resulting jvm. This is done mostly to keep the footprint of MINIMAL
  // to the size it was prior to merging up the 32bit and 64bit assemblers.
  //
  // This does mean you'll get a linker/runtime error if you use a 64bit only instruction
  // in a 32bit vm. This is somewhat unfortunate but keeps the ifdef noise down.

private:

  bool _legacy_mode_bw;
  bool _legacy_mode_dq;
  bool _legacy_mode_vl;
  bool _legacy_mode_vlbw;
  bool _is_managed;
  bool _vector_masking;    // For stub code use only

  class InstructionAttr *_attributes;

  // 64bit prefixes
  int prefix_and_encode(int reg_enc, bool byteinst = false);
  int prefixq_and_encode(int reg_enc);

  // Convenience form that forwards to the four-argument overload, passing
  // false for both dst_is_byte and src_is_byte.
  int prefix_and_encode(int dst_enc, int src_enc) {
    const bool dst_is_byte = false;
    const bool src_is_byte = false;
    return prefix_and_encode(dst_enc, dst_is_byte, src_enc, src_is_byte);
  }
  int prefix_and_encode(int dst_enc, bool dst_is_byte, int src_enc, bool src_is_byte);
  int prefixq_and_encode(int dst_enc, int src_enc);

  void prefix(Register reg);
  void prefix(Register dst, Register src, Prefix p);
  void prefix(Register dst, Address adr, Prefix p);
  void prefix(Address adr);
  void prefixq(Address adr);

  void prefix(Address adr, Register reg,  bool byteinst = false);
  void prefix(Address adr, XMMRegister reg);
  void prefixq(Address adr, Register reg);
  void prefixq(Address adr, XMMRegister reg);

  void prefetch_prefix(Address src);

  void rex_prefix(Address adr, XMMRegister xreg,
                  VexSimdPrefix pre, VexOpcode opc, bool rex_w);
  int  rex_prefix_and_encode(int dst_enc, int src_enc,
                             VexSimdPrefix pre, VexOpcode opc, bool rex_w);

  void vex_prefix(bool vex_r, bool vex_b, bool vex_x, int nds_enc, VexSimdPrefix pre, VexOpcode opc);

  void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool evex_r, bool evex_v,
                   int nds_enc, VexSimdPrefix pre, VexOpcode opc);

  void vex_prefix(Address adr, int nds_enc, int xreg_enc,
                  VexSimdPrefix pre, VexOpcode opc,
                  InstructionAttr *attributes);

  int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
                             VexSimdPrefix pre, VexOpcode opc,
                             InstructionAttr *attributes);

  void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre,
                   VexOpcode opc, InstructionAttr *attributes);

  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre,
                             VexOpcode opc, InstructionAttr *attributes);

  // Helper functions for groups of instructions
  void emit_arith_b(int op1, int op2, Register dst, int imm8);

  void emit_arith(int op1, int op2, Register dst, int32_t imm32);
  // Force generation of a 4 byte immediate value even if it fits into 8bit
  void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
  void emit_arith(int op1, int op2, Register dst, Register src);

  bool emit_compressed_disp_byte(int &disp);

  void emit_operand(Register reg,
                    Register base, Register index, Address::ScaleFactor scale,
                    int disp,
                    RelocationHolder const& rspec,
                    int rip_relative_correction = 0);

  void emit_operand(Register reg, Address adr, int rip_relative_correction = 0);

  // operands that only take the original 32bit registers
  void emit_operand32(Register reg, Address adr);

  void emit_operand(XMMRegister reg,
                    Register base, Register index, Address::ScaleFactor scale,
                    int disp,
                    RelocationHolder const& rspec);

  void emit_operand(XMMRegister reg, Address adr);

  void emit_operand(MMXRegister reg, Address adr);

  // workaround gcc (3.2.1-7) bug
  void emit_operand(Address adr, MMXRegister reg);


  // Immediate-to-memory forms
  void emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32);

  void emit_farith(int b1, int b2, int i);


 protected:
  #ifdef ASSERT
  void check_relocation(RelocationHolder const& rspec, int format);
  #endif

  void emit_data(jint data, relocInfo::relocType    rtype, int format);
  void emit_data(jint data, RelocationHolder const& rspec, int format);
  void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
  void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);

  bool reachable(AddressLiteral adr) NOT_LP64({ return true;});

  // These are all easily abused and hence protected

  // 32BIT ONLY SECTION
#ifndef _LP64
  // Make these disappear in 64bit mode since they would never be correct
  void cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec);   // 32BIT ONLY
  void cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec);    // 32BIT ONLY

  void mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec);    // 32BIT ONLY
  void mov_literal32(Address dst, int32_t imm32, RelocationHolder const& rspec);     // 32BIT ONLY

  void push_literal32(int32_t imm32, RelocationHolder const& rspec);                 // 32BIT ONLY
#else
  // 64BIT ONLY SECTION
  void mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec);   // 64BIT ONLY

  void cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec);
  void cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec);

  void mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec);
  void mov_narrow_oop(Address dst, int32_t imm32, RelocationHolder const& rspec);
#endif // _LP64

  // These are unique in that we are ensured by the caller that the 32bit
  // relative in these instructions will always be able to reach the potentially
  // 64bit address described by entry. Since they can take a 64bit address they
  // don't have the 32 suffix like the other instructions in this class.

  void call_literal(address entry, RelocationHolder const& rspec);
  void jmp_literal(address entry, RelocationHolder const& rspec);

  // Avoid using directly section
  // Instructions in this section are actually usable by anyone without danger
  // of failure but have performance issues that are addressed by enhanced
  // instructions which will do the proper thing based on the particular cpu.
  // We protect them because we don't trust you...

  // Don't use next inc() and dec() methods directly. INC & DEC instructions
  // could cause a partial flag stall since they don't set CF flag.
  // Use MacroAssembler::decrement() & MacroAssembler::increment() methods
  // which call inc() & dec() or add() & sub() in accordance with
  // the product flag UseIncDec value.

  void decl(Register dst);
  void decl(Address dst);
  void decq(Register dst);
  void decq(Address dst);

  void incl(Register dst);
  void incl(Address dst);
  void incq(Register dst);
  void incq(Address dst);

  // New cpus require use of movsd and movss to avoid partial register stall
  // when loading from memory. But for old Opteron use movlpd instead of movsd.
  // The selection is done in MacroAssembler::movdbl() and movflt().

  // Move Scalar Single-Precision Floating-Point Values
  void movss(XMMRegister dst, Address src);
  void movss(XMMRegister dst, XMMRegister src);
  void movss(Address dst, XMMRegister src);

  // Move Scalar Double-Precision Floating-Point Values
  void movsd(XMMRegister dst, Address src);
  void movsd(XMMRegister dst, XMMRegister src);
  void movsd(Address dst, XMMRegister src);
  void movlpd(XMMRegister dst, Address src);

  // New cpus require use of movaps and movapd to avoid partial register stall
  // when moving between registers.
  void movaps(XMMRegister dst, XMMRegister src);
  void movapd(XMMRegister dst, XMMRegister src);

  // End avoid using directly


  // Instruction prefixes
  void prefix(Prefix p);

  public:

  // Creation
  // Builds an assembler on top of the given CodeBuffer (handed on to the
  // AbstractAssembler base) and then puts the assembler's own flags and the
  // _attributes pointer into their initial state via init_attributes().
  Assembler(CodeBuffer* code) : AbstractAssembler(code) {
    init_attributes();
  }

  // Decoding
  static address locate_operand(address inst, WhichOperand which);
  static address locate_next_instruction(address inst);

  // Utilities
  static bool is_polling_page_far() NOT_LP64({ return false;});
  static bool query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len,
                                         int cur_tuple_type, int in_size_in_bits, int cur_encoding);

  // Generic instructions
  // Does 32bit or 64bit as needed for the platform. In some sense these
  // belong in macro assembler but there is no need for both varieties to exist

  // Puts the assembler's own state into its initial configuration: no
  // instruction attributes installed, managed mode and vector masking off,
  // and each _legacy_mode_* flag set when VM_Version reports the matching
  // AVX-512 feature as unsupported.
  void init_attributes(void) {
    _attributes     = NULL;
    _vector_masking = false;
    _is_managed     = false;

    _legacy_mode_bw   = !VM_Version::supports_avx512bw();
    _legacy_mode_dq   = !VM_Version::supports_avx512dq();
    _legacy_mode_vl   = !VM_Version::supports_avx512vl();
    _legacy_mode_vlbw = !VM_Version::supports_avx512vlbw();
  }

  void set_attributes(InstructionAttr *attributes) { _attributes = attributes; }
  void clear_attributes(void) { _attributes = NULL; }

  void set_managed(void) { _is_managed = true; }
  void clear_managed(void) { _is_managed = false; }
  bool is_managed(void) { return _is_managed; }

  // Following functions are for stub code use only
  void set_vector_masking(void) { _vector_masking = true; }
  void clear_vector_masking(void) { _vector_masking = false; }
  bool is_vector_masking(void) { return _vector_masking; }

  void lea(Register dst, Address src);

  void mov(Register dst, Register src);

  void pusha();
  void popa();

  void pushf();
  void popf();

  void push(int32_t imm32);

  void push(Register src);

  void pop(Register dst);

  // These are dummies to prevent surprise implicit conversions to Register
  void push(void* v);
  void pop(void* v);

  // These do register sized moves/scans
  void rep_mov();
  void rep_stos();
  void rep_stosb();
  void repne_scan();
#ifdef _LP64
  void repne_scanl();
#endif

  // Vanilla instructions in lexical order

  void adcl(Address dst, int32_t imm32);
  void adcl(Address dst, Register src);
  void adcl(Register dst, int32_t imm32);
  void adcl(Register dst, Address src);
  void adcl(Register dst, Register src);

  void adcq(Register dst, int32_t imm32);
  void adcq(Register dst, Address src);
  void adcq(Register dst, Register src);

  void addb(Address dst, int imm8);
  void addw(Address dst, int imm16);

  void addl(Address dst, int32_t imm32);
  void addl(Address dst, Register src);
  void addl(Register dst, int32_t imm32);
  void addl(Register dst, Address src);
  void addl(Register dst, Register src);

  void addq(Address dst, int32_t imm32);
  void addq(Address dst, Register src);
  void addq(Register dst, int32_t imm32);
  void addq(Register dst, Address src);
  void addq(Register dst, Register src);

#ifdef _LP64
 //Add Unsigned Integers with Carry Flag
  void adcxq(Register dst, Register src);

 //Add Unsigned Integers with Overflow Flag
  void adoxq(Register dst, Register src);
#endif

  void addr_nop_4();
  void addr_nop_5();
  void addr_nop_7();
  void addr_nop_8();

  // Add Scalar Double-Precision Floating-Point Values
  void addsd(XMMRegister dst, Address src);
  void addsd(XMMRegister dst, XMMRegister src);

  // Add Scalar Single-Precision Floating-Point Values
  void addss(XMMRegister dst, Address src);
  void addss(XMMRegister dst, XMMRegister src);

  // AES instructions
  void aesdec(XMMRegister dst, Address src);
  void aesdec(XMMRegister dst, XMMRegister src);
  void aesdeclast(XMMRegister dst, Address src);
  void aesdeclast(XMMRegister dst, XMMRegister src);
  void aesenc(XMMRegister dst, Address src);
  void aesenc(XMMRegister dst, XMMRegister src);
  void aesenclast(XMMRegister dst, Address src);
  void aesenclast(XMMRegister dst, XMMRegister src);
  void vaesdec(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vaesdeclast(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  void andl(Address  dst, int32_t imm32);
  void andl(Register dst, int32_t imm32);
  void andl(Register dst, Address src);
  void andl(Register dst, Register src);

  void andq(Address  dst, int32_t imm32);
  void andq(Register dst, int32_t imm32);
  void andq(Register dst, Address src);
  void andq(Register dst, Register src);

  // BMI instructions
  void andnl(Register dst, Register src1, Register src2);
  void andnl(Register dst, Register src1, Address src2);
  void andnq(Register dst, Register src1, Register src2);
  void andnq(Register dst, Register src1, Address src2);

  void blsil(Register dst, Register src);
  void blsil(Register dst, Address src);
  void blsiq(Register dst, Register src);
  void blsiq(Register dst, Address src);

  void blsmskl(Register dst, Register src);
  void blsmskl(Register dst, Address src);
  void blsmskq(Register dst, Register src);
  void blsmskq(Register dst, Address src);

  void blsrl(Register dst, Register src);
  void blsrl(Register dst, Address src);
  void blsrq(Register dst, Register src);
  void blsrq(Register dst, Address src);

  void bsfl(Register dst, Register src);
  void bsrl(Register dst, Register src);

#ifdef _LP64
  void bsfq(Register dst, Register src);
  void bsrq(Register dst, Register src);
#endif

  void bswapl(Register reg);

  void bswapq(Register reg);

  void call(Label& L, relocInfo::relocType rtype);
  void call(Register reg);  // push pc; pc <- reg
  void call(Address adr);   // push pc; pc <- adr

  void cdql();

  void cdqq();

  void cld();

  void clflush(Address adr);

  void cmovl(Condition cc, Register dst, Register src);
  void cmovl(Condition cc, Register dst, Address src);

  void cmovq(Condition cc, Register dst, Register src);
  void cmovq(Condition cc, Register dst, Address src);


  void cmpb(Address dst, int imm8);

  void cmpl(Address dst, int32_t imm32);

  void cmpl(Register dst, int32_t imm32);
  void cmpl(Register dst, Register src);
  void cmpl(Register dst, Address src);

  void cmpq(Address dst, int32_t imm32);
  void cmpq(Address dst, Register src);

  void cmpq(Register dst, int32_t imm32);
  void cmpq(Register dst, Register src);
  void cmpq(Register dst, Address src);

  // These are dummies used to catch attempts to convert NULL to Register
  void cmpl(Register dst, void* junk); // dummy
  void cmpq(Register dst, void* junk); // dummy

  void cmpw(Address dst, int imm16);

  void cmpxchg8 (Address adr);

  void cmpxchgb(Register reg, Address adr);
  void cmpxchgl(Register reg, Address adr);

  void cmpxchgq(Register reg, Address adr);

  // Ordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
  void comisd(XMMRegister dst, Address src);
  void comisd(XMMRegister dst, XMMRegister src);

  // Ordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
  void comiss(XMMRegister dst, Address src);
  void comiss(XMMRegister dst, XMMRegister src);

  // Identify processor type and features
  void cpuid();

  // CRC32C
  void crc32(Register crc, Register v, int8_t sizeInBytes);
  void crc32(Register crc, Address adr, int8_t sizeInBytes);

  // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
  void cvtsd2ss(XMMRegister dst, XMMRegister src);
  void cvtsd2ss(XMMRegister dst, Address src);

  // Convert Doubleword Integer to Scalar Double-Precision Floating-Point Value
  void cvtsi2sdl(XMMRegister dst, Register src);
  void cvtsi2sdl(XMMRegister dst, Address src);
  void cvtsi2sdq(XMMRegister dst, Register src);
  void cvtsi2sdq(XMMRegister dst, Address src);

  // Convert Doubleword Integer to Scalar Single-Precision Floating-Point Value
  void cvtsi2ssl(XMMRegister dst, Register src);
  void cvtsi2ssl(XMMRegister dst, Address src);
  void cvtsi2ssq(XMMRegister dst, Register src);
  void cvtsi2ssq(XMMRegister dst, Address src);

  // Convert Packed Signed Doubleword Integers to Packed Double-Precision Floating-Point Value
  void cvtdq2pd(XMMRegister dst, XMMRegister src);

  // Convert Packed Signed Doubleword Integers to Packed Single-Precision Floating-Point Value
  void cvtdq2ps(XMMRegister dst, XMMRegister src);

  // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value
  void cvtss2sd(XMMRegister dst, XMMRegister src);
  void cvtss2sd(XMMRegister dst, Address src);

  // Convert with Truncation Scalar Double-Precision Floating-Point Value to Doubleword Integer
  void cvttsd2sil(Register dst, Address src);
  void cvttsd2sil(Register dst, XMMRegister src);
  void cvttsd2siq(Register dst, XMMRegister src);

  // Convert with Truncation Scalar Single-Precision Floating-Point Value to Doubleword Integer
  void cvttss2sil(Register dst, XMMRegister src);
  void cvttss2siq(Register dst, XMMRegister src);

  void cvttpd2dq(XMMRegister dst, XMMRegister src);

  // Divide Scalar Double-Precision Floating-Point Values
  void divsd(XMMRegister dst, Address src);
  void divsd(XMMRegister dst, XMMRegister src);

  // Divide Scalar Single-Precision Floating-Point Values
  void divss(XMMRegister dst, Address src);
  void divss(XMMRegister dst, XMMRegister src);

  void emms();

  void fabs();

  void fadd(int i);

  void fadd_d(Address src);
  void fadd_s(Address src);

  // "Alternate" versions of x87 instructions place result down in FPU
  // stack instead of on TOS

  void fadda(int i); // "alternate" fadd
  void faddp(int i = 1);

  void fchs();

  void fcom(int i);

  void fcomp(int i = 1);
  void fcomp_d(Address src);
  void fcomp_s(Address src);

  void fcompp();

  void fcos();

  void fdecstp();

  void fdiv(int i);
  void fdiv_d(Address src);
  void fdivr_s(Address src);
  void fdiva(int i);  // "alternate" fdiv
  void fdivp(int i = 1);

  void fdivr(int i);
  void fdivr_d(Address src);
  void fdiv_s(Address src);

  void fdivra(int i); // "alternate" reversed fdiv

  void fdivrp(int i = 1);

  void ffree(int i = 0);

  void fild_d(Address adr);
  void fild_s(Address adr);

  void fincstp();

  void finit();

  void fist_s (Address adr);
  void fistp_d(Address adr);
  void fistp_s(Address adr);

  void fld1();

  void fld_d(Address adr);
  void fld_s(Address adr);
  void fld_s(int index);
  void fld_x(Address adr);  // extended-precision (80-bit) format

  void fldcw(Address src);

  void fldenv(Address src);

  void fldlg2();

  void fldln2();

  void fldz();

  void flog();
  void flog10();

  void fmul(int i);

  void fmul_d(Address src);
  void fmul_s(Address src);

  void fmula(int i);  // "alternate" fmul

  void fmulp(int i = 1);

  void fnsave(Address dst);

  void fnstcw(Address src);

  void fnstsw_ax();

  void fprem();
  void fprem1();

  void frstor(Address src);

  void fsin();

  void fsqrt();

  void fst_d(Address adr);
  void fst_s(Address adr);

  void fstp_d(Address adr);
  void fstp_d(int index);
  void fstp_s(Address adr);
  void fstp_x(Address adr); // extended-precision (80-bit) format

  void fsub(int i);
  void fsub_d(Address src);
  void fsub_s(Address src);

  void fsuba(int i);  // "alternate" fsub

  void fsubp(int i = 1);

  void fsubr(int i);
  void fsubr_d(Address src);
  void fsubr_s(Address src);

  void fsubra(int i); // "alternate" reversed fsub

  void fsubrp(int i = 1);

  void ftan();

  void ftst();

  void fucomi(int i = 1);
  void fucomip(int i = 1);

  void fwait();

  void fxch(int i = 1);

  void fxrstor(Address src);
  void xrstor(Address src);

  void fxsave(Address dst);
  void xsave(Address dst);

  void fyl2x();
  void frndint();
  void f2xm1();
  void fldl2e();

  void hlt();

  void idivl(Register src);
  void divl(Register src); // Unsigned division

#ifdef _LP64
  void idivq(Register src);
#endif

  void imull(Register src);
  void imull(Register dst, Register src);
  void imull(Register dst, Register src, int value);
  void imull(Register dst, Address src);

#ifdef _LP64
  void imulq(Register dst, Register src);
  void imulq(Register dst, Register src, int value);
  void imulq(Register dst, Address src);
#endif

  // jcc is the generic conditional branch generator to run-
  // time routines, jcc is used for branches to labels. jcc
  // takes a branch opcode (cc) and a label (L) and generates
  // either a backward branch or a forward branch and links it
  // to the label fixup chain. Usage:
  //
  // Label L;      // unbound label
  // jcc(cc, L);   // forward branch to unbound label
  // bind(L);      // bind label to the current pc
  // jcc(cc, L);   // backward branch to bound label
  // bind(L);      // illegal: a label may be bound only once
  //
  // Note: The same Label can be used for forward and backward branches
  // but it may be bound only once.

  void jcc(Condition cc, Label& L, bool maybe_short = true);

  // Conditional jump to an 8-bit offset to L.
  // WARNING: be very careful using this for forward jumps.  If the label is
  // not bound within an 8-bit offset of this instruction, a run-time error
  // will occur.
  void jccb(Condition cc, Label& L);

  void jmp(Address entry);    // pc <- entry

  // Label operations & relative jumps (PPUM Appendix D)
  void jmp(Label& L, bool maybe_short = true);   // unconditional jump to L

  void jmp(Register entry); // pc <- entry

  // Unconditional 8-bit offset jump to L.
  // WARNING: be very careful using this for forward jumps.  If the label is
  // not bound within an 8-bit offset of this instruction, a run-time error
  // will occur.
  void jmpb(Label& L);

  void ldmxcsr( Address src );

  void leal(Register dst, Address src);

  void leaq(Register dst, Address src);

  void lfence();

  void lock();

  void lzcntl(Register dst, Register src);

#ifdef _LP64
  void lzcntq(Register dst, Register src);
#endif

  // Bit flags naming the ordering constraints accepted by membar().
  // Each enumerator occupies its own bit, so a constraint can be tested
  // with a bitwise AND against the value passed in.
  enum Membar_mask_bits {
    LoadLoad   = 0x1,
    StoreLoad  = 0x2,
    LoadStore  = 0x4,
    StoreStore = 0x8
  };

  // Emits a memory barrier for the given ordering constraint.
  // Serializes memory and blows flags.
  void membar(Membar_mask_bits order_constraint) {
    // Nothing is emitted unless os::is_MP() holds, and StoreLoad is the
    // only constraint we have to handle.
    if (!os::is_MP() || !(order_constraint & StoreLoad)) {
      return;
    }

    // All usable chips support "locked" instructions, which suffice as
    // barriers and are much faster than the alternative of using the
    // cpuid instruction. The barrier used here is a locked add [esp-C],0.
    // Apart from blowing flags and introducing a false dependency on the
    // target memory location, this is conveniently a no-op. Nothing can
    // be done about the flags, but memory dependencies in the current
    // method can be avoided by locked-adding somewhere else on the stack:
    // [esp+C] would collide with something on the stack of the current
    // method, hence [esp-C]. That location is almost always in the data
    // cache for any small C. We need to step back from SP to avoid data
    // dependencies with other things below SP (callee-saves, for
    // example). Without a clear way to figure out the minimal safe
    // distance from SP, stepping back a complete cache line makes sense,
    // as this also avoids possible second-order effects with locked ops
    // against the cache line. The choice of offset is bounded by x86
    // operand encoding: it should stay within [-128; +127] to get the
    // 8-bit displacement encoding.
    //
    // Any change to this code may need to revisit other places in the
    // code where this idiom is used, in particular the orderAccess code.
    const int lowest_disp8 = -128;
    const int one_line_back = -VM_Version::L1_line_size();
    const int disp = (one_line_back < lowest_disp8) ? lowest_disp8 : one_line_back;

    lock();
    addl(Address(rsp, disp), 0); // Assert the lock# signal here
  }

  void mfence();

  // Moves

  void mov64(Register dst, int64_t imm64);

  void movb(Address dst, Register src);
  void movb(Address dst, int imm8);
  void movb(Register dst, Address src);

  void movddup(XMMRegister dst, XMMRegister src);

  void kmovbl(KRegister dst, Register src);
  void kmovbl(Register dst, KRegister src);
  void kmovwl(KRegister dst, Register src);
  void kmovwl(KRegister dst, Address src);
  void kmovwl(Register dst, KRegister src);
  void kmovdl(KRegister dst, Register src);
  void kmovdl(Register dst, KRegister src);
  void kmovql(KRegister dst, KRegister src);
  void kmovql(Address dst, KRegister src);
  void kmovql(KRegister dst, Address src);
  void kmovql(KRegister dst, Register src);
  void kmovql(Register dst, KRegister src);

  void knotwl(KRegister dst, KRegister src);

  void kortestbl(KRegister dst, KRegister src);
  void kortestwl(KRegister dst, KRegister src);
  void kortestdl(KRegister dst, KRegister src);
  void kortestql(KRegister dst, KRegister src);

  void ktestq(KRegister src1, KRegister src2);
  void ktestd(KRegister src1, KRegister src2);

  void ktestql(KRegister dst, KRegister src);

  void movdl(XMMRegister dst, Register src);
  void movdl(Register dst, XMMRegister src);
  void movdl(XMMRegister dst, Address src);
  void movdl(Address dst, XMMRegister src);

  // Move Double Quadword
  void movdq(XMMRegister dst, Register src);
  void movdq(Register dst, XMMRegister src);

  // Move Aligned Double Quadword
  void movdqa(XMMRegister dst, XMMRegister src);
  void movdqa(XMMRegister dst, Address src);

  // Move Unaligned Double Quadword
  void movdqu(Address     dst, XMMRegister src);
  void movdqu(XMMRegister dst, Address src);
  void movdqu(XMMRegister dst, XMMRegister src);

  // Move Unaligned 256bit Vector
  void vmovdqu(Address dst, XMMRegister src);
  void vmovdqu(XMMRegister dst, Address src);
  void vmovdqu(XMMRegister dst, XMMRegister src);

  // Move Unaligned 512bit Vector
  void evmovdqub(Address dst, XMMRegister src, int vector_len);
  void evmovdqub(XMMRegister dst, Address src, int vector_len);
  void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len);
  void evmovdqub(XMMRegister dst, KRegister mask, Address src, int vector_len);
  void evmovdquw(Address dst, XMMRegister src, int vector_len);
  void evmovdquw(Address dst, KRegister mask, XMMRegister src, int vector_len);
  void evmovdquw(XMMRegister dst, Address src, int vector_len);
  void evmovdquw(XMMRegister dst, KRegister mask, Address src, int vector_len);
  void evmovdqul(Address dst, XMMRegister src, int vector_len);
  void evmovdqul(XMMRegister dst, Address src, int vector_len);
  void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
  void evmovdquq(Address dst, XMMRegister src, int vector_len);
  void evmovdquq(XMMRegister dst, Address src, int vector_len);
  void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);

  // Move lower 64bit to high 64bit in 128bit register
  void movlhps(XMMRegister dst, XMMRegister src);

  void movl(Register dst, int32_t imm32);
  void movl(Address dst, int32_t imm32);
  void movl(Register dst, Register src);
  void movl(Register dst, Address src);
  void movl(Address dst, Register src);

  // These dummies prevent using movl from converting a zero (like NULL) into Register
  // by giving the compiler two choices it can't resolve

  void movl(Address  dst, void* junk);
  void movl(Register dst, void* junk);

#ifdef _LP64
  void movq(Register dst, Register src);
  void movq(Register dst, Address src);
  void movq(Address  dst, Register src);
#endif

  void movq(Address     dst, MMXRegister src );
  void movq(MMXRegister dst, Address src );

#ifdef _LP64
  // These dummies prevent using movq from converting a zero (like NULL) into Register
  // by giving the compiler two choices it can't resolve

  void movq(Address  dst, void* dummy);
  void movq(Register dst, void* dummy);
#endif

  // Move Quadword
  void movq(Address     dst, XMMRegister src);
  void movq(XMMRegister dst, Address src);

  void movsbl(Register dst, Address src);
  void movsbl(Register dst, Register src);

#ifdef _LP64
  void movsbq(Register dst, Address src);
  void movsbq(Register dst, Register src);

  // Move signed 32bit immediate to 64bit extending sign
  void movslq(Address  dst, int32_t imm64);
  void movslq(Register dst, int32_t imm64);

  void movslq(Register dst, Address src);
  void movslq(Register dst, Register src);
  void movslq(Register dst, void* src); // Dummy declaration to cause NULL to be ambiguous
#endif

  void movswl(Register dst, Address src);
  void movswl(Register dst, Register src);

#ifdef _LP64
  void movswq(Register dst, Address src);
  void movswq(Register dst, Register src);
#endif

  void movw(Address dst, int imm16);
  void movw(Register dst, Address src);
  void movw(Address dst, Register src);

  void movzbl(Register dst, Address src);
  void movzbl(Register dst, Register src);

#ifdef _LP64
  void movzbq(Register dst, Address src);
  void movzbq(Register dst, Register src);
#endif

  void movzwl(Register dst, Address src);
  void movzwl(Register dst, Register src);

#ifdef _LP64
  void movzwq(Register dst, Address src);
  void movzwq(Register dst, Register src);
#endif

  // Unsigned multiply with RAX destination register
  void mull(Address src);
  void mull(Register src);

#ifdef _LP64
  void mulq(Address src);
  void mulq(Register src);
  void mulxq(Register dst1, Register dst2, Register src);
#endif

  // Multiply Scalar Double-Precision Floating-Point Values
  void mulsd(XMMRegister dst, Address src);
  void mulsd(XMMRegister dst, XMMRegister src);

  // Multiply Scalar Single-Precision Floating-Point Values
  void mulss(XMMRegister dst, Address src);
  void mulss(XMMRegister dst, XMMRegister src);

  void negl(Register dst);

#ifdef _LP64
  void negq(Register dst);
#endif

  void nop(int i = 1);

  void notl(Register dst);

#ifdef _LP64
  void notq(Register dst);
#endif

  void orl(Address dst, int32_t imm32);
  void orl(Register dst, int32_t imm32);
  void orl(Register dst, Address src);
  void orl(Register dst, Register src);
  void orl(Address dst, Register src);

  void orb(Address dst, int imm8);

  void orq(Address dst, int32_t imm32);
  void orq(Register dst, int32_t imm32);
  void orq(Register dst, Address src);
  void orq(Register dst, Register src);

  // Pack with unsigned saturation
  void packuswb(XMMRegister dst, XMMRegister src);
  void packuswb(XMMRegister dst, Address src);
  void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  // Permutation of 64bit words
  void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
  void vpermq(XMMRegister dst, XMMRegister src, int imm8);
  void vperm2i128(XMMRegister dst,  XMMRegister nds, XMMRegister src, int imm8);
  void vperm2f128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);

  void pause();

  // Undefined Instruction
  void ud2();

  // SSE4.2 string instructions
  void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
  void pcmpestri(XMMRegister xmm1, Address src, int imm8);

  void pcmpeqb(XMMRegister dst, XMMRegister src);
  void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
  void evpcmpeqb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);

  void evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
  void evpcmpgtb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);

  void evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
  void evpcmpuw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, ComparisonPredicate of, int vector_len);
  void evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len);

  void pcmpeqw(XMMRegister dst, XMMRegister src);
  void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len);

  void pcmpeqd(XMMRegister dst, XMMRegister src);
  void vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len);

  void pcmpeqq(XMMRegister dst, XMMRegister src);
  void vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqq(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqq(KRegister kdst, XMMRegister nds, Address src, int vector_len);

  void pmovmskb(Register dst, XMMRegister src);
  void vpmovmskb(Register dst, XMMRegister src);

  // SSE 4.1 extract
  void pextrd(Register dst, XMMRegister src, int imm8);
  void pextrq(Register dst, XMMRegister src, int imm8);
  void pextrd(Address dst, XMMRegister src, int imm8);
  void pextrq(Address dst, XMMRegister src, int imm8);
  void pextrb(Address dst, XMMRegister src, int imm8);
  // SSE 2 extract
  void pextrw(Register dst, XMMRegister src, int imm8);
  void pextrw(Address dst, XMMRegister src, int imm8);

  // SSE 4.1 insert
  void pinsrd(XMMRegister dst, Register src, int imm8);
  void pinsrq(XMMRegister dst, Register src, int imm8);
  void pinsrd(XMMRegister dst, Address src, int imm8);
  void pinsrq(XMMRegister dst, Address src, int imm8);
  void pinsrb(XMMRegister dst, Address src, int imm8);
  // SSE 2 insert
  void pinsrw(XMMRegister dst, Register src, int imm8);
  void pinsrw(XMMRegister dst, Address src, int imm8);

  // SSE4.1 packed move
  void pmovzxbw(XMMRegister dst, XMMRegister src);
  void pmovzxbw(XMMRegister dst, Address src);

  void vpmovzxbw( XMMRegister dst, Address src, int vector_len);
  void evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len);

  void evpmovwb(Address dst, XMMRegister src, int vector_len);
  void evpmovwb(Address dst, KRegister mask, XMMRegister src, int vector_len);

#ifndef _LP64 // no 32bit push/pop on amd64
  void popl(Address dst);
#endif

#ifdef _LP64
  void popq(Address dst);
#endif

  void popcntl(Register dst, Address src);
  void popcntl(Register dst, Register src);

  void vpopcntd(XMMRegister dst, XMMRegister src, int vector_len);

#ifdef _LP64
  void popcntq(Register dst, Address src);
  void popcntq(Register dst, Register src);
#endif

  // Prefetches (SSE, SSE2, 3DNOW only)

  void prefetchnta(Address src);
  void prefetchr(Address src);
  void prefetcht0(Address src);
  void prefetcht1(Address src);
  void prefetcht2(Address src);
  void prefetchw(Address src);

  // Shuffle Bytes
  void pshufb(XMMRegister dst, XMMRegister src);
  void pshufb(XMMRegister dst, Address src);
  void vpshufb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  // Shuffle Packed Doublewords
  void pshufd(XMMRegister dst, XMMRegister src, int mode);
  void pshufd(XMMRegister dst, Address src,     int mode);
  void vpshufd(XMMRegister dst, XMMRegister src, int mode, int vector_len);

  // Shuffle Packed Low Words
  void pshuflw(XMMRegister dst, XMMRegister src, int mode);
  void pshuflw(XMMRegister dst, Address src,     int mode);

  // Shuffle packed values at 128 bit granularity
  void evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);

  // Shift Right by bytes Logical DoubleQuadword Immediate
  void psrldq(XMMRegister dst, int shift);
  // Shift Left by bytes Logical DoubleQuadword Immediate
  void pslldq(XMMRegister dst, int shift);

  // Logical Compare 128bit
  void ptest(XMMRegister dst, XMMRegister src);
  void ptest(XMMRegister dst, Address src);
  // Logical Compare 256bit
  void vptest(XMMRegister dst, XMMRegister src);
  void vptest(XMMRegister dst, Address src);

  // Interleave Low Bytes
  void punpcklbw(XMMRegister dst, XMMRegister src);
  void punpcklbw(XMMRegister dst, Address src);

  // Interleave Low Doublewords
  void punpckldq(XMMRegister dst, XMMRegister src);
  void punpckldq(XMMRegister dst, Address src);

  // Interleave Low Quadwords
  void punpcklqdq(XMMRegister dst, XMMRegister src);

#ifndef _LP64 // no 32bit push/pop on amd64
  void pushl(Address src);
#endif

  void pushq(Address src);

  void rcll(Register dst, int imm8);

  void rclq(Register dst, int imm8);

  void rcrq(Register dst, int imm8);

  void rcpps(XMMRegister dst, XMMRegister src);

  void rcpss(XMMRegister dst, XMMRegister src);

  void rdtsc();

  void ret(int imm16);

#ifdef _LP64
  void rorq(Register dst, int imm8);
  void rorxq(Register dst, Register src, int imm8);
  void rorxd(Register dst, Register src, int imm8);
#endif

  void sahf();

  void sarl(Register dst, int imm8);
  void sarl(Register dst);

  void sarq(Register dst, int imm8);
  void sarq(Register dst);

  void sbbl(Address dst, int32_t imm32);
  void sbbl(Register dst, int32_t imm32);
  void sbbl(Register dst, Address src);
  void sbbl(Register dst, Register src);

  void sbbq(Address dst, int32_t imm32);
  void sbbq(Register dst, int32_t imm32);
  void sbbq(Register dst, Address src);
  void sbbq(Register dst, Register src);

  void setb(Condition cc, Register dst);

  void palignr(XMMRegister dst, XMMRegister src, int imm8);
  void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
  void evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);

  void pblendw(XMMRegister dst, XMMRegister src, int imm8);

  void sha1rnds4(XMMRegister dst, XMMRegister src, int imm8);
  void sha1nexte(XMMRegister dst, XMMRegister src);
  void sha1msg1(XMMRegister dst, XMMRegister src);
  void sha1msg2(XMMRegister dst, XMMRegister src);
  // xmm0 is implicit additional source to the following instruction.
  void sha256rnds2(XMMRegister dst, XMMRegister src);
  void sha256msg1(XMMRegister dst, XMMRegister src);
  void sha256msg2(XMMRegister dst, XMMRegister src);

  void shldl(Register dst, Register src);
  void shldl(Register dst, Register src, int8_t imm8);

  void shll(Register dst, int imm8);
  void shll(Register dst);

  void shlq(Register dst, int imm8);
  void shlq(Register dst);

  void shrdl(Register dst, Register src);

  void shrl(Register dst, int imm8);
  void shrl(Register dst);

  void shrq(Register dst, int imm8);
  void shrq(Register dst);

  void smovl(); // QQQ generic?

  // Compute Square Root of Scalar Double-Precision Floating-Point Value
  void sqrtsd(XMMRegister dst, Address src);
  void sqrtsd(XMMRegister dst, XMMRegister src);

  // Compute Square Root of Scalar Single-Precision Floating-Point Value
  void sqrtss(XMMRegister dst, Address src);
  void sqrtss(XMMRegister dst, XMMRegister src);

  void std();

  void stmxcsr( Address dst );

  void subl(Address dst, int32_t imm32);
  void subl(Address dst, Register src);
  void subl(Register dst, int32_t imm32);
  void subl(Register dst, Address src);
  void subl(Register dst, Register src);

  void subq(Address dst, int32_t imm32);
  void subq(Address dst, Register src);
  void subq(Register dst, int32_t imm32);
  void subq(Register dst, Address src);
  void subq(Register dst, Register src);

  // Force generation of a 4 byte immediate value even if it fits into 8bit
  void subl_imm32(Register dst, int32_t imm32);
  void subq_imm32(Register dst, int32_t imm32);

  // Subtract Scalar Double-Precision Floating-Point Values
  void subsd(XMMRegister dst, Address src);
  void subsd(XMMRegister dst, XMMRegister src);

  // Subtract Scalar Single-Precision Floating-Point Values
  void subss(XMMRegister dst, Address src);
  void subss(XMMRegister dst, XMMRegister src);

  void testb(Register dst, int imm8);
  void testb(Address dst, int imm8);

  void testl(Register dst, int32_t imm32);
  void testl(Register dst, Register src);
  void testl(Register dst, Address src);

  void testq(Register dst, int32_t imm32);
  void testq(Register dst, Register src);
  void testq(Register dst, Address src);

  // BMI - count trailing zeros
  void tzcntl(Register dst, Register src);
  void tzcntq(Register dst, Register src);

  // Unordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
  void ucomisd(XMMRegister dst, Address src);
  void ucomisd(XMMRegister dst, XMMRegister src);

  // Unordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
  void ucomiss(XMMRegister dst, Address src);
  void ucomiss(XMMRegister dst, XMMRegister src);

  void xabort(int8_t imm8);

  void xaddb(Address dst, Register src);
  void xaddw(Address dst, Register src);
  void xaddl(Address dst, Register src);
  void xaddq(Address dst, Register src);

  void xbegin(Label& abort, relocInfo::relocType rtype = relocInfo::none);

  void xchgb(Register reg, Address adr);
  void xchgw(Register reg, Address adr);
  void xchgl(Register reg, Address adr);
  void xchgl(Register dst, Register src);

  void xchgq(Register reg, Address adr);
  void xchgq(Register dst, Register src);

  void xend();

  // Get Value of Extended Control Register
  void xgetbv();

  void xorl(Register dst, int32_t imm32);
  void xorl(Register dst, Address src);
  void xorl(Register dst, Register src);

  void xorb(Register dst, Address src);

  void xorq(Register dst, Address src);
  void xorq(Register dst, Register src);

  void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0

  // AVX 3-operands scalar instructions (encoded with VEX prefix)

  void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vaddss(XMMRegister dst, XMMRegister nds, Address src);
  void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vdivss(XMMRegister dst, XMMRegister nds, Address src);
  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vfmadd231sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vfmadd231ss(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vmulss(XMMRegister dst, XMMRegister nds, Address src);
  void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vsubsd(XMMRegister dst, XMMRegister nds, Address src);
  void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vsubss(XMMRegister dst, XMMRegister nds, Address src);
  void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);

  void shlxl(Register dst, Register src1, Register src2);
  void shlxq(Register dst, Register src1, Register src2);

  //====================VECTOR ARITHMETIC=====================================

  // Add Packed Floating-Point Values
  void addpd(XMMRegister dst, XMMRegister src);
  void addpd(XMMRegister dst, Address src);
  void addps(XMMRegister dst, XMMRegister src);
  void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Subtract Packed Floating-Point Values
  void subpd(XMMRegister dst, XMMRegister src);
  void subps(XMMRegister dst, XMMRegister src);
  void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Multiply Packed Floating-Point Values
  void mulpd(XMMRegister dst, XMMRegister src);
  void mulpd(XMMRegister dst, Address src);
  void mulps(XMMRegister dst, XMMRegister src);
  void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  void vfmadd231pd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vfmadd231ps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vfmadd231pd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vfmadd231ps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Divide Packed Floating-Point Values
  void divpd(XMMRegister dst, XMMRegister src);
  void divps(XMMRegister dst, XMMRegister src);
  void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Sqrt Packed Floating-Point Values
  // Only the v-prefixed forms are declared in this group. They take a single
  // source (XMMRegister or Address) and a vector_len; there is no nds operand.
  void vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len);
  void vsqrtpd(XMMRegister dst, Address src, int vector_len);
  void vsqrtps(XMMRegister dst, XMMRegister src, int vector_len);
  void vsqrtps(XMMRegister dst, Address src, int vector_len);

  // Bitwise Logical AND of Packed Floating-Point Values
  // Only the register-source andpd/andps overloads are public; the
  // (dst, Address) overloads of andpd/andps are declared in the protected
  // section at the end of this class. The vand* forms accept either an
  // XMMRegister or an Address source and take a vector_len.
  void andpd(XMMRegister dst, XMMRegister src);
  void andps(XMMRegister dst, XMMRegister src);
  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Unpack and Interleave High (unpckhpd) / Low (unpcklpd) Packed
  // Double-Precision Floating-Point Values; register-source forms only.
  void unpckhpd(XMMRegister dst, XMMRegister src);
  void unpcklpd(XMMRegister dst, XMMRegister src);

  // Bitwise Logical XOR of Packed Floating-Point Values
  // Only the register-source xorpd/xorps overloads are public; the
  // (dst, Address) overloads of xorpd/xorps are declared in the protected
  // section at the end of this class. The vxor* forms accept either an
  // XMMRegister or an Address source and take a vector_len.
  void xorpd(XMMRegister dst, XMMRegister src);
  void xorps(XMMRegister dst, XMMRegister src);
  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Add horizontal packed integers
  // Word (w) and dword (d) variants. The vphadd* forms take nds and
  // vector_len; the phadd* forms are two-operand. Only register-source
  // overloads are declared in this group.
  void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void phaddw(XMMRegister dst, XMMRegister src);
  void phaddd(XMMRegister dst, XMMRegister src);

  // Add packed integers
  // Suffix selects the element width: b (byte), w (word), d (dword), q (qword).
  // Among the two-operand forms only paddd also has an Address-source overload.
  // Every vpadd* width is declared with both an XMMRegister and an Address
  // source, plus a vector_len.
  void paddb(XMMRegister dst, XMMRegister src);
  void paddw(XMMRegister dst, XMMRegister src);
  void paddd(XMMRegister dst, XMMRegister src);
  void paddd(XMMRegister dst, Address src);
  void paddq(XMMRegister dst, XMMRegister src);
  void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Subtract packed integers
  // Suffix selects the element width: b (byte), w (word), d (dword), q (qword).
  // The two-operand psub* forms are register-source only; every vpsub* width
  // is declared with both an XMMRegister and an Address source, plus a
  // vector_len.
  void psubb(XMMRegister dst, XMMRegister src);
  void psubw(XMMRegister dst, XMMRegister src);
  void psubd(XMMRegister dst, XMMRegister src);
  void psubq(XMMRegister dst, XMMRegister src);
  void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Multiply packed integers
  // The two-operand forms cover only shorts (pmullw) and ints (pmulld). The
  // v-prefixed forms additionally declare a quadword variant (vpmullq), and
  // each of them has both an XMMRegister and an Address source overload.
  // NOTE(review): vpmullq is presumably only encodable with EVEX (AVX-512DQ)
  // - confirm the guard in assembler_x86.cpp.
  void pmullw(XMMRegister dst, XMMRegister src);
  void pmulld(XMMRegister dst, XMMRegister src);
  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Shift left packed integers
  // Word (w), dword (d) and qword (q) variants. The shift count is supplied
  // either as an int or in an XMMRegister. The psll* forms shift dst in
  // place; the vpsll* forms take a separate src and a vector_len.
  void psllw(XMMRegister dst, int shift);
  void pslld(XMMRegister dst, int shift);
  void psllq(XMMRegister dst, int shift);
  void psllw(XMMRegister dst, XMMRegister shift);
  void pslld(XMMRegister dst, XMMRegister shift);
  void psllq(XMMRegister dst, XMMRegister shift);
  void vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

  // Logical shift right packed integers
  // Word (w), dword (d) and qword (q) variants. The shift count is supplied
  // either as an int or in an XMMRegister. The psrl* forms shift dst in
  // place; the vpsrl* forms take a separate src and a vector_len.
  void psrlw(XMMRegister dst, int shift);
  void psrld(XMMRegister dst, int shift);
  void psrlq(XMMRegister dst, int shift);
  void psrlw(XMMRegister dst, XMMRegister shift);
  void psrld(XMMRegister dst, XMMRegister shift);
  void psrlq(XMMRegister dst, XMMRegister shift);
  void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

  // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
  // The shift count is supplied either as an int or in an XMMRegister. The
  // psra* forms shift dst in place; the vpsra* forms take a separate src and
  // a vector_len.
  // NOTE(review): "no instructions for longs" holds for SSE/AVX2; AVX-512
  // defines VPSRAQ, but no quadword form is declared here.
  void psraw(XMMRegister dst, int shift);
  void psrad(XMMRegister dst, int shift);
  void psraw(XMMRegister dst, XMMRegister shift);
  void psrad(XMMRegister dst, XMMRegister shift);
  void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

  // And packed integers
  // pand is the two-operand register form; vpand takes nds and vector_len and
  // is declared with both an XMMRegister and an Address source.
  void pand(XMMRegister dst, XMMRegister src);
  void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Andn (and-not) packed integers
  // Only the two-operand register form is declared.
  // NOTE(review): per the Intel PANDN definition this presumably computes
  // dst = (~dst) & src - confirm against assembler_x86.cpp.
  void pandn(XMMRegister dst, XMMRegister src);

  // Or packed integers
  // por is the two-operand register form; vpor takes nds and vector_len and
  // is declared with both an XMMRegister and an Address source.
  void por(XMMRegister dst, XMMRegister src);
  void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Xor packed integers
  // pxor is the two-operand register form. vpxor and evpxorq both take nds
  // and vector_len and are declared with an XMMRegister and an Address source.
  // NOTE(review): the ev- prefix presumably marks an EVEX-only encoding
  // (quadword granularity) - confirm in assembler_x86.cpp.
  void pxor(XMMRegister dst, XMMRegister src);
  void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void evpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpxorq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);


  // vinserti forms (integer lane insert)
  // All take (dst, nds, src, imm8). The 128 and 32x4 variants accept src as
  // XMMRegister or Address; vinserti64x4 is declared with a register source
  // only.
  // NOTE(review): imm8 presumably selects the destination lane - confirm the
  // accepted range in assembler_x86.cpp.
  void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
  void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
  void vinserti32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
  void vinserti32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
  void vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);

  // vinsertf forms (floating-point lane insert)
  // All take (dst, nds, src, imm8). Each of the 128, 32x4 and 64x4 variants
  // is declared with both an XMMRegister and an Address source.
  // NOTE(review): imm8 presumably selects the destination lane - confirm the
  // accepted range in assembler_x86.cpp.
  void vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
  void vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
  void vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
  void vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
  void vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
  void vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);

  // vextracti forms (integer lane extract)
  // All take (dst, src, imm8). The 128 and 32x4 variants can write to an
  // XMMRegister or to an Address; the 64x2 and 64x4 variants are declared
  // with a register destination only.
  // NOTE(review): imm8 presumably selects the source lane - confirm the
  // accepted range in assembler_x86.cpp.
  void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextracti128(Address dst, XMMRegister src, uint8_t imm8);
  void vextracti32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextracti32x4(Address dst, XMMRegister src, uint8_t imm8);
  void vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);

  // vextractf forms (floating-point lane extract)
  // All take (dst, src, imm8). The 128, 32x4 and 64x4 variants can write to
  // an XMMRegister or to an Address; vextractf64x2 is declared with a
  // register destination only.
  // NOTE(review): imm8 presumably selects the source lane - confirm the
  // accepted range in assembler_x86.cpp.
  void vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextractf128(Address dst, XMMRegister src, uint8_t imm8);
  void vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextractf32x4(Address dst, XMMRegister src, uint8_t imm8);
  void vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8);

  // legacy xmm sourced word/dword replicate
  // Unlike the evpbroadcast* declarations in this class, these two take no
  // vector_len argument.
  void vpbroadcastw(XMMRegister dst, XMMRegister src);
  void vpbroadcastd(XMMRegister dst, XMMRegister src);

  // xmm/mem sourced byte/word/dword/qword replicate
  // Each element width (b, w, d, q) is declared with an XMMRegister source
  // and with an Address source; vector_len selects the vector length used for
  // the encoding.
  void evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
  void evpbroadcastb(XMMRegister dst, Address src, int vector_len);
  void evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
  void evpbroadcastw(XMMRegister dst, Address src, int vector_len);
  void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
  void evpbroadcastd(XMMRegister dst, Address src, int vector_len);
  void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
  void evpbroadcastq(XMMRegister dst, Address src, int vector_len);

  // xmm/mem sourced 64x2 integer replicate
  // NOTE(review): presumably replicates a 128-bit block of two qwords across
  // dst (Intel VBROADCASTI64X2) - confirm in assembler_x86.cpp.
  void evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len);
  void evbroadcasti64x2(XMMRegister dst, Address src, int vector_len);

  // scalar single/double precision replicate
  // ss = single precision, sd = double precision; each is declared with an
  // XMMRegister source and with an Address source, plus a vector_len.
  void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
  void evpbroadcastss(XMMRegister dst, Address src, int vector_len);
  void evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
  void evpbroadcastsd(XMMRegister dst, Address src, int vector_len);

  // gpr sourced byte/word/dword/qword replicate
  // Overloads of evpbroadcast{b,w,d,q} whose source is a general-purpose
  // Register instead of an XMMRegister or Address.
  void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
  void evpbroadcastw(XMMRegister dst, Register src, int vector_len);
  void evpbroadcastd(XMMRegister dst, Register src, int vector_len);
  void evpbroadcastq(XMMRegister dst, Register src, int vector_len);

  // Carry-Less Multiplication Quadword
  // pclmulqdq is the two-operand form, vpclmulqdq adds the nds operand, and
  // evpclmulqdq additionally takes a vector_len.
  // NOTE(review): mask is presumably the instruction's imm8 selecting which
  // quadword of each source is multiplied - confirm in assembler_x86.cpp.
  void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
  void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
  void evpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask, int vector_len);
  // AVX instruction which is used to clear the upper 128 bits of the YMM
  // registers and to avoid the transition penalty between AVX and SSE states.
  // There is no penalty if legacy SSE instructions are encoded using the VEX
  // prefix because they always clear the upper 128 bits. It should be used
  // before calling runtime code and native libraries.
  void vzeroupper();

  // AVX support for vectorized conditional move (float/double). Per the
  // original note, the compare and blend instructions are used only coupled,
  // i.e. as a pair: cmppd with blendvpd (double), cmpps with blendvps (float).
  // cop is the comparison operand passed to the compare instruction.
  // vpblendd is a separate blend form that takes an imm8 instead of a second
  // source register.
  void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
  void blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
  void cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
  void blendvps(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
  void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);

 protected:
  // The following instructions require the address to be 16-byte aligned in
  // SSE mode. They should be called only from the corresponding
  // MacroAssembler instructions, which is why they are protected here.
  void andpd(XMMRegister dst, Address src);
  void andps(XMMRegister dst, Address src);
  void xorpd(XMMRegister dst, Address src);
  void xorps(XMMRegister dst, Address src);

};

// The Intel x86/Amd64 Assembler attributes: all fields enclosed here guide encoding-level decisions.
// The specific set functions are for specialized use; otherwise the defaults, or whatever was supplied
// at object construction, are applied.
//
// NOTE(review): the destructor calls clear_attributes() on the assembler registered through
// set_current_assembler(), and the class is implicitly copyable, so destroying a copy would issue
// that call a second time. Presumably instances are only ever passed by pointer/reference - confirm.
class InstructionAttr {
public:
  InstructionAttr(
    int vector_len,     // The length of vector to be applied in encoding - for both AVX and EVEX
    bool rex_vex_w,     // Width of data: if 32-bits or less, false, else if 64-bit or specially defined, true
    bool legacy_mode,   // Details if either this instruction is conditionally encoded to AVX or earlier if true else possibly EVEX
    bool no_reg_mask,   // when true, k0 is used when EVEX encoding is chosen, else k1 is used under the same condition
    bool uses_vl)       // This instruction may have legacy constraints based on vector length for EVEX
    :
      // Initializers are listed in member declaration order, which is the order
      // in which C++ actually runs them.
      _avx_vector_len(vector_len),
      _rex_vex_w(rex_vex_w),
      _rex_vex_w_reverted(false),
      _legacy_mode(legacy_mode),
      _no_reg_mask(no_reg_mask),
      _uses_vl(uses_vl),
      _tuple_type(Assembler::EVEX_ETUP),
      _input_size_in_bits(Assembler::EVEX_NObit),
      _is_evex_instruction(false),
      _evex_encoding(0),
      _is_clear_context(true),
      _is_extended_context(false),
      _embedded_opmask_register_specifier(1), // defaults to k1 until set_embedded_opmask_register_specifier() is called
      _current_assembler(NULL) {
    // UseAVX levels below 3 always force legacy mode, whatever the caller passed.
    if (UseAVX < 3) _legacy_mode = true;
  }

  // If an assembler was registered via set_current_assembler(), tell it to
  // drop its attributes before this object goes away.
  ~InstructionAttr() {
    if (_current_assembler != NULL) {
      _current_assembler->clear_attributes();
    }
    _current_assembler = NULL;
  }

private:
  int  _avx_vector_len;        // vector length used for the encoding
  bool _rex_vex_w;             // REX/VEX.W data width flag
  bool _rex_vex_w_reverted;    // set once rex_vex_w has been reverted for AVX encoding
  bool _legacy_mode;           // encode as AVX or earlier rather than EVEX
  bool _no_reg_mask;           // see the no_reg_mask constructor argument
  bool _uses_vl;               // see the uses_vl constructor argument
  int  _tuple_type;            // tuple type for compressed displacement calculation
  int  _input_size_in_bits;    // input size for compressed displacement calculation
  bool _is_evex_instruction;   // set once the instruction is marked as EVEX encoded
  int  _evex_encoding;         // internal data for compressed immediate offset programming
  bool _is_clear_context;      // starts true; turned off by reset_is_clear_context()
  bool _is_extended_context;   // starts false; no setter is provided in this class
  int  _embedded_opmask_register_specifier; // low 3 bits of the opmask register encoding

  Assembler *_current_assembler; // assembler to notify on destruction, or NULL

public:
  // query functions for field accessors
  int  get_vector_len(void) const { return _avx_vector_len; }
  bool is_rex_vex_w(void) const { return _rex_vex_w; }
  bool is_rex_vex_w_reverted(void) const { return _rex_vex_w_reverted; }
  bool is_legacy_mode(void) const { return _legacy_mode; }
  bool is_no_reg_mask(void) const { return _no_reg_mask; }
  bool uses_vl(void) const { return _uses_vl; }
  int  get_tuple_type(void) const { return _tuple_type; }
  int  get_input_size(void) const { return _input_size_in_bits; }
  // Declared to return int although the underlying flag is a bool; the
  // signature is kept as-is for source compatibility. The value is 0 or 1.
  int  is_evex_instruction(void) const { return _is_evex_instruction; }
  int  get_evex_encoding(void) const { return _evex_encoding; }
  bool is_clear_context(void) const { return _is_clear_context; }
  bool is_extended_context(void) const { return _is_extended_context; }
  int  get_embedded_opmask_register_specifier(void) const { return _embedded_opmask_register_specifier; }

  // Set the vector len manually
  void set_vector_len(int vector_len) { _avx_vector_len = vector_len; }

  // Record that rex_vex_w was reverted for AVX encoding
  void set_rex_vex_w_reverted(void) { _rex_vex_w_reverted = true; }

  // Set rex_vex_w based on state
  void set_rex_vex_w(bool state) { _rex_vex_w = state; }

  // Set the instruction to be encoded in AVX mode
  void set_is_legacy_mode(void) { _legacy_mode = true; }

  // Set the current instruction to be encoded as an EVEX instruction
  void set_is_evex_instruction(void) { _is_evex_instruction = true; }

  // Internal encoding data used in compressed immediate offset programming
  void set_evex_encoding(int value) { _evex_encoding = value; }

  // Turn OFF the clear-context flag. The flag controls the EVEX.z field, which
  // is used to clear all non directed XMM/YMM/ZMM components; it defaults to
  // true, so calling this disables that clearing.
  void reset_is_clear_context(void) { _is_clear_context = false; }

  // Map back to the current assembler so that we can manage the object level association
  void set_current_assembler(Assembler *current_assembler) { _current_assembler = current_assembler; }

  // Address modifiers used for compressed displacement calculation.
  // The values are only recorded when VM_Version::supports_evex() is true;
  // otherwise the call leaves the constructor defaults in place.
  void set_address_attributes(int tuple_type, int input_size_in_bits) {
    if (VM_Version::supports_evex()) {
      _tuple_type = tuple_type;
      _input_size_in_bits = input_size_in_bits;
    }
  }

  // Set embedded opmask register specifier.
  // Only the low three bits of the register encoding are kept.
  void set_embedded_opmask_register_specifier(KRegister mask) {
    _embedded_opmask_register_specifier = (*mask).encoding() & 0x7;
  }

};

#endif // CPU_X86_VM_ASSEMBLER_X86_HPP