changeset 47551:4dd1715f647f

Merge
author jwilhelm
date Sat, 21 Oct 2017 01:23:52 +0200
parents fb677b3f0888 fc4cfca10556
children 50aa24ce898c
files make/conf/jib-profiles.js src/hotspot/os/windows/decoder_windows.hpp src/hotspot/share/code/jvmticmlr.h src/hotspot/share/gc/g1/suspendibleThreadSet.cpp src/hotspot/share/gc/g1/suspendibleThreadSet.hpp src/hotspot/share/prims/jni.h src/java.base/share/classes/jdk/internal/vm/cds/resources/ModuleLoaderMap.dat test/jdk/ProblemList.txt
diffstat 591 files changed, 16179 insertions(+), 7900 deletions(-)
--- a/make/common/Modules.gmk	Fri Oct 20 17:16:05 2017 +0530
+++ b/make/common/Modules.gmk	Sat Oct 21 01:23:52 2017 +0200
@@ -113,6 +113,7 @@
     jdk.dynalink \
     jdk.httpserver \
     jdk.incubator.httpclient \
+    jdk.internal.vm.compiler.management \
     jdk.jsobject \
     jdk.localedata \
     jdk.naming.dns \
@@ -215,6 +216,7 @@
 
 ifeq ($(INCLUDE_GRAAL), false)
   MODULES_FILTER += jdk.internal.vm.compiler
+  MODULES_FILTER += jdk.internal.vm.compiler.management
 endif
 
 ################################################################################
--- a/make/conf/jib-profiles.js	Fri Oct 20 17:16:05 2017 +0530
+++ b/make/conf/jib-profiles.js	Sat Oct 21 01:23:52 2017 +0200
@@ -1052,7 +1052,7 @@
         jtreg: {
             server: "javare",
             revision: "4.2",
-            build_number: "b08",
+            build_number: "b09",
             checksum_file: "MD5_VALUES",
             file: "jtreg_bin-4.2.zip",
             environment_name: "JT_HOME",
--- a/make/gensrc/GensrcModuleLoaderMap.gmk	Fri Oct 20 17:16:05 2017 +0530
+++ b/make/gensrc/GensrcModuleLoaderMap.gmk	Sat Oct 21 01:23:52 2017 +0200
@@ -54,15 +54,4 @@
 
 GENSRC_JAVA_BASE += $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/module/ModuleLoaderMap.java
 
-$(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/vm/cds/resources/ModuleLoaderMap.dat: \
-    $(TOPDIR)/src/java.base/share/classes/jdk/internal/vm/cds/resources/ModuleLoaderMap.dat \
-    $(VARDEPS_FILE) $(BUILD_TOOLS_JDK)
-	$(MKDIR) -p $(@D)
-	$(RM) $@ $@.tmp
-	$(TOOL_GENCLASSLOADERMAP) -boot $(BOOT_MODULES_LIST) \
-	    -platform $(PLATFORM_MODULES_LIST) -o $@.tmp $<
-	$(MV) $@.tmp $@
-
-GENSRC_JAVA_BASE += $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/vm/cds/resources/ModuleLoaderMap.dat
-
 ################################################################################
--- a/make/hotspot/lib/CompileJvm.gmk	Fri Oct 20 17:16:05 2017 +0530
+++ b/make/hotspot/lib/CompileJvm.gmk	Sat Oct 21 01:23:52 2017 +0200
@@ -58,6 +58,7 @@
     -I$(JVM_VARIANT_OUTPUTDIR)/gensrc \
     -I$(TOPDIR)/src/hotspot/share/precompiled \
     -I$(TOPDIR)/src/hotspot/share/prims \
+    -I$(TOPDIR)/src/java.base/share/native/include \
     #
 
 # INCLUDE_SUFFIX_* is only meant for including the proper
--- a/make/hotspot/lib/JvmFeatures.gmk	Fri Oct 20 17:16:05 2017 +0530
+++ b/make/hotspot/lib/JvmFeatures.gmk	Sat Oct 21 01:23:52 2017 +0200
@@ -47,6 +47,9 @@
 ifeq ($(call check-jvm-feature, zero), true)
   JVM_CFLAGS_FEATURES += -DZERO -DCC_INTERP -DZERO_LIBARCH='"$(OPENJDK_TARGET_CPU_LEGACY_LIB)"' $(LIBFFI_CFLAGS)
   JVM_LIBS_FEATURES += $(LIBFFI_LIBS)
+  ifeq ($(OPENJDK_TARGET_CPU), sparcv9)
+    BUILD_LIBJVM_EXTRA_FILES := $(TOPDIR)/src/hotspot/cpu/sparc/memset_with_concurrent_readers_sparc.cpp
+  endif
 endif
 
 ifeq ($(call check-jvm-feature, shark), true)
@@ -129,6 +132,7 @@
       cms/ g1/ parallel/
   JVM_EXCLUDE_FILES += \
       concurrentGCThread.cpp \
+      suspendibleThreadSet.cpp \
       plab.cpp
   JVM_EXCLUDE_FILES += \
       g1MemoryPool.cpp \
--- a/make/jdk/src/classes/build/tools/module/GenModuleLoaderMap.java	Fri Oct 20 17:16:05 2017 +0530
+++ b/make/jdk/src/classes/build/tools/module/GenModuleLoaderMap.java	Sat Oct 21 01:23:52 2017 +0200
@@ -77,30 +77,22 @@
             throw new IllegalArgumentException(source + " does not exist");
         }
 
-        boolean needsQuotes = outfile.toString().contains(".java.tmp");
-
         try (BufferedWriter bw = Files.newBufferedWriter(outfile, StandardCharsets.UTF_8);
              PrintWriter writer = new PrintWriter(bw)) {
             for (String line : Files.readAllLines(source)) {
                 if (line.contains("@@BOOT_MODULE_NAMES@@")) {
-                    line = patch(line, "@@BOOT_MODULE_NAMES@@", bootModules, needsQuotes);
+                    line = patch(line, "@@BOOT_MODULE_NAMES@@", bootModules);
                 } else if (line.contains("@@PLATFORM_MODULE_NAMES@@")) {
-                    line = patch(line, "@@PLATFORM_MODULE_NAMES@@", platformModules, needsQuotes);
+                    line = patch(line, "@@PLATFORM_MODULE_NAMES@@", platformModules);
                 }
                 writer.println(line);
             }
         }
     }
 
-    private static String patch(String s, String tag, Stream<String> stream, boolean needsQuotes) {
-        String mns = null;
-        if (needsQuotes) {
-            mns = stream.sorted()
-                .collect(Collectors.joining("\",\n            \""));
-        } else {
-            mns = stream.sorted()
-                .collect(Collectors.joining("\n"));
-        }
+    private static String patch(String s, String tag, Stream<String> stream) {
+        String mns = stream.sorted()
+            .collect(Collectors.joining("\",\n            \""));
         return s.replace(tag, mns);
     }
 
--- a/make/test/JtregNativeHotspot.gmk	Fri Oct 20 17:16:05 2017 +0530
+++ b/make/test/JtregNativeHotspot.gmk	Sat Oct 21 01:23:52 2017 +0200
@@ -50,6 +50,7 @@
     $(TOPDIR)/test/hotspot/jtreg/runtime/jni/8025979 \
     $(TOPDIR)/test/hotspot/jtreg/runtime/jni/8033445 \
     $(TOPDIR)/test/hotspot/jtreg/runtime/jni/checked \
+    $(TOPDIR)/test/hotspot/jtreg/runtime/jni/FindClass \
     $(TOPDIR)/test/hotspot/jtreg/runtime/jni/PrivateInterfaceMethods \
     $(TOPDIR)/test/hotspot/jtreg/runtime/jni/ToStringInInterfaceTest \
     $(TOPDIR)/test/hotspot/jtreg/runtime/jni/CalleeSavedRegisters \
@@ -59,6 +60,7 @@
     $(TOPDIR)/test/hotspot/jtreg/runtime/SameObject \
     $(TOPDIR)/test/hotspot/jtreg/runtime/BoolReturn \
     $(TOPDIR)/test/hotspot/jtreg/runtime/noClassDefFoundMsg \
+    $(TOPDIR)/test/hotspot/jtreg/runtime/RedefineTests \
     $(TOPDIR)/test/hotspot/jtreg/compiler/floatingpoint/ \
     $(TOPDIR)/test/hotspot/jtreg/compiler/calls \
     $(TOPDIR)/test/hotspot/jtreg/serviceability/jvmti/GetOwnedMonitorInfo \
@@ -103,6 +105,7 @@
     BUILD_HOTSPOT_JTREG_LIBRARIES_LIBS_libMAAClassLoadPrepare := -lc
     BUILD_HOTSPOT_JTREG_LIBRARIES_LIBS_libMAAThreadStart := -lc
     BUILD_HOTSPOT_JTREG_LIBRARIES_LIBS_libAllowedFunctions := -lc
+    BUILD_HOTSPOT_JTREG_LIBRARIES_LIBS_libRedefineDoubleDelete := -lc
 endif
 
 ifeq ($(OPENJDK_TARGET_OS), linux)
--- a/src/hotspot/.mx.jvmci/hotspot/templates/eclipse/cproject	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/.mx.jvmci/hotspot/templates/eclipse/cproject	Sat Oct 21 01:23:52 2017 +0200
@@ -70,7 +70,7 @@
 						</toolChain>
 					</folderInfo>
 					<sourceEntries>
-						<entry excluding="cpu/vm/templateTable_x86_32.cpp|cpu/vm/templateInterpreter_x86_32.cpp|cpu/vm/stubRoutines_x86_32.cpp|cpu/vm/stubGenerator_x86_32.cpp|cpu/vm/sharedRuntime_x86_32.cpp|cpu/vm/jniFastGetField_x86_32.cpp|cpu/vm/interpreterRT_x86_32.cpp|cpu/vm/interpreter_x86_32.cpp|cpu/vm/interp_masm_x86_32.cpp|cpu/vm/vtableStubs_x86_32.cpp" flags="VALUE_WORKSPACE_PATH" kind="sourcePath" name=""/>
+            <entry excluding="cpu/x86/templateTable_x86_32.cpp|cpu/x86/templateInterpreter_x86_32.cpp|cpu/x86/stubRoutines_x86_32.cpp|cpu/x86/stubGenerator_x86_32.cpp|cpu/x86/sharedRuntime_x86_32.cpp|cpu/x86/jniFastGetField_x86_32.cpp|cpu/x86/interpreterRT_x86_32.cpp|cpu/x86/interpreter_x86_32.cpp|cpu/x86/interp_masm_x86_32.cpp|cpu/x86/vtableStubs_x86_32.cpp" flags="VALUE_WORKSPACE_PATH" kind="sourcePath" name=""/>
 					</sourceEntries>
 				</configuration>
 			</storageModule>
--- a/src/hotspot/.mx.jvmci/mx_jvmci.py	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/.mx.jvmci/mx_jvmci.py	Sat Oct 21 01:23:52 2017 +0200
@@ -256,14 +256,10 @@
         """
 
         roots = [
-            'ASSEMBLY_EXCEPTION',
-            'LICENSE',
-            'README',
-            'THIRD_PARTY_README',
-            'agent',
-            'make',
-            'src',
-            'test'
+            'cpu',
+            'os',
+            'os_cpu',
+            'share'
         ]
 
         for jvmVariant in _jdkJvmVariants:
@@ -605,6 +601,16 @@
 def _get_openjdk_os_cpu():
     return _get_openjdk_os() + '-' + _get_openjdk_cpu()
 
+def _get_jdk_dir():
+    suiteParentDir = dirname(_suite.dir)
+    # suiteParentDir is now something like: /some_prefix/jdk10-hs/open/src
+    pathComponents = suiteParentDir.split(os.sep)
+    for i in range(0, len(pathComponents)):
+        if pathComponents[i] in ["open", "src"]:
+            del pathComponents[i:]
+            break
+    return os.path.join(os.sep, *pathComponents)
+
 def _get_jdk_build_dir(debugLevel=None):
     """
     Gets the directory into which the JDK is built. This directory contains
@@ -613,7 +619,7 @@
     if debugLevel is None:
         debugLevel = _vm.debugLevel
     name = '{}-{}-{}-{}'.format(_get_openjdk_os_cpu(), 'normal', _vm.jvmVariant, debugLevel)
-    return join(dirname(_suite.dir), 'build', name)
+    return join(_get_jdk_dir(), 'build', name)
 
 _jvmci_bootclasspath_prepends = []
 
--- a/src/hotspot/.mx.jvmci/suite.py	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/.mx.jvmci/suite.py	Sat Oct 21 01:23:52 2017 +0200
@@ -24,9 +24,7 @@
 
   "defaultLicense" : "GPLv2-CPE",
 
-  # This puts mx/ as a sibling of the JDK build configuration directories
-  # (e.g., macosx-x86_64-normal-server-release).
-  "outputRoot" : "../build/mx/hotspot",
+  "outputRoot" : "../../build/mx/hotspot",
 
     # ------------- Libraries -------------
 
@@ -43,7 +41,7 @@
     # ------------- JVMCI:Service -------------
 
     "jdk.vm.ci.services" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "sourceDirs" : ["src"],
       "javaCompliance" : "9",
       "workingSets" : "API,JVMCI",
@@ -52,7 +50,7 @@
     # ------------- JVMCI:API -------------
 
     "jdk.vm.ci.common" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "sourceDirs" : ["src"],
       "checkstyle" : "jdk.vm.ci.services",
       "javaCompliance" : "9",
@@ -60,7 +58,7 @@
     },
 
     "jdk.vm.ci.meta" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "sourceDirs" : ["src"],
       "checkstyle" : "jdk.vm.ci.services",
       "javaCompliance" : "9",
@@ -68,7 +66,7 @@
     },
 
     "jdk.vm.ci.code" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "sourceDirs" : ["src"],
       "dependencies" : ["jdk.vm.ci.meta"],
       "checkstyle" : "jdk.vm.ci.services",
@@ -77,7 +75,7 @@
     },
 
     "jdk.vm.ci.code.test" : {
-      "subDir" : "test/compiler/jvmci",
+      "subDir" : "../../test/hotspot/jtreg/compiler/jvmci",
       "sourceDirs" : ["src"],
       "dependencies" : [
         "mx:JUNIT",
@@ -92,7 +90,7 @@
     },
 
     "jdk.vm.ci.runtime" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "sourceDirs" : ["src"],
       "dependencies" : [
         "jdk.vm.ci.code",
@@ -104,7 +102,7 @@
     },
 
     "jdk.vm.ci.runtime.test" : {
-      "subDir" : "test/compiler/jvmci",
+      "subDir" : "../../test/hotspot/jtreg/compiler/jvmci",
       "sourceDirs" : ["src"],
       "dependencies" : [
         "mx:JUNIT",
@@ -119,7 +117,7 @@
     # ------------- JVMCI:HotSpot -------------
 
     "jdk.vm.ci.aarch64" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "sourceDirs" : ["src"],
       "dependencies" : ["jdk.vm.ci.code"],
       "checkstyle" : "jdk.vm.ci.services",
@@ -128,7 +126,7 @@
     },
 
     "jdk.vm.ci.amd64" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "sourceDirs" : ["src"],
       "dependencies" : ["jdk.vm.ci.code"],
       "checkstyle" : "jdk.vm.ci.services",
@@ -137,7 +135,7 @@
     },
 
     "jdk.vm.ci.sparc" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "sourceDirs" : ["src"],
       "dependencies" : ["jdk.vm.ci.code"],
       "checkstyle" : "jdk.vm.ci.services",
@@ -146,7 +144,7 @@
     },
 
     "jdk.vm.ci.hotspot" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "sourceDirs" : ["src"],
       "dependencies" : [
         "jdk.vm.ci.common",
@@ -163,7 +161,7 @@
     },
 
     "jdk.vm.ci.hotspot.test" : {
-      "subDir" : "test/compiler/jvmci",
+      "subDir" : "../../test/hotspot/jtreg/compiler/jvmci",
       "sourceDirs" : ["src"],
       "dependencies" : [
         "TESTNG",
@@ -175,7 +173,7 @@
     },
 
     "jdk.vm.ci.hotspot.aarch64" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "sourceDirs" : ["src"],
       "dependencies" : [
         "jdk.vm.ci.aarch64",
@@ -187,7 +185,7 @@
     },
 
     "jdk.vm.ci.hotspot.amd64" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "sourceDirs" : ["src"],
       "dependencies" : [
         "jdk.vm.ci.amd64",
@@ -199,7 +197,7 @@
     },
 
     "jdk.vm.ci.hotspot.sparc" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "sourceDirs" : ["src"],
       "dependencies" : [
         "jdk.vm.ci.sparc",
@@ -221,12 +219,12 @@
     # ------------- Distributions -------------
 
     "JVMCI_SERVICES" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "dependencies" : ["jdk.vm.ci.services"],
     },
 
     "JVMCI_API" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "dependencies" : [
         "jdk.vm.ci.runtime",
         "jdk.vm.ci.common",
@@ -240,7 +238,7 @@
     },
 
     "JVMCI_HOTSPOT" : {
-      "subDir" : "src/jdk.internal.vm.ci/share/classes",
+      "subDir" : "../jdk.internal.vm.ci/share/classes",
       "dependencies" : [
         "jdk.vm.ci.hotspot.aarch64",
         "jdk.vm.ci.hotspot.amd64",
@@ -253,7 +251,7 @@
     },
 
     "JVMCI_TEST" : {
-      "subDir" : "test/compiler/jvmci",
+      "subDir" : "../../test/hotspot/jtreg/compiler/jvmci",
       "dependencies" : [
         "jdk.vm.ci.runtime.test",
       ],
--- a/src/hotspot/cpu/aarch64/jniTypes_aarch64.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/aarch64/jniTypes_aarch64.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2017, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -26,9 +26,9 @@
 #ifndef CPU_AARCH64_VM_JNITYPES_AARCH64_HPP
 #define CPU_AARCH64_VM_JNITYPES_AARCH64_HPP
 
+#include "jni.h"
 #include "memory/allocation.hpp"
 #include "oops/oop.hpp"
-#include "prims/jni.h"
 
 // This file holds platform-dependent routines used to write primitive jni
 // types to the array of arguments passed into JavaCalls::call
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -2840,6 +2840,44 @@
   bind(L_done);
 }
 
+// Code for BigInteger::mulAdd intrinsic
+// out     = r0
+// in      = r1
+// offset  = r2  (already out.length-offset)
+// len     = r3
+// k       = r4
+//
+// pseudocode from the Java implementation:
+// carry = 0;
+// offset = out.length-offset - 1;
+// for (int j=len-1; j >= 0; j--) {
+//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
+//     out[offset--] = (int)product;
+//     carry = product >>> 32;
+// }
+// return (int)carry;
+void MacroAssembler::mul_add(Register out, Register in, Register offset,
+      Register len, Register k) {
+    Label LOOP, END;
+    // pre-loop
+    cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => fewer branches
+    csel(out, zr, out, Assembler::EQ);
+    br(Assembler::EQ, END);
+    add(in, in, len, LSL, 2); // in[j+1] address
+    add(offset, out, offset, LSL, 2); // out[offset + 1] address
+    mov(out, zr); // used to keep carry now
+    BIND(LOOP);
+    ldrw(rscratch1, Address(pre(in, -4)));
+    madd(rscratch1, rscratch1, k, out);
+    ldrw(rscratch2, Address(pre(offset, -4)));
+    add(rscratch1, rscratch1, rscratch2);
+    strw(rscratch1, Address(offset));
+    lsr(out, rscratch1, 32);
+    subs(len, len, 1);
+    br(Assembler::NE, LOOP);
+    BIND(END);
+}
+
 /**
  * Emits code to update CRC-32 with a byte value according to constants in table
  *
@@ -3291,6 +3329,7 @@
   ldr(dst, Address(dst, ConstMethod::constants_offset()));
   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
   ldr(dst, Address(dst, mirror_offset));
+  resolve_oop_handle(dst);
 }
 
 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
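The mul_add pseudocode above is easy to validate against a scalar reference. Below is a minimal standalone C++ sketch of the same semantics, mirroring java.math.BigInteger.mulAdd; the function name is hypothetical, and offset is assumed to arrive pre-adjusted to out.length - offset, as the register comment notes:

#include <cstdint>

// Illustrative scalar reference (not HotSpot code) for the mul_add stub:
// multiply each 32-bit limb of 'in' by 'k', add into 'out' at descending
// positions, and return the final carry.
static uint32_t mul_add_ref(uint32_t* out, const uint32_t* in,
                            int offset, int len, uint32_t k) {
  uint64_t carry = 0;
  offset = offset - 1;                  // caller passed out.length - offset
  for (int j = len - 1; j >= 0; j--) {
    uint64_t product = (uint64_t)in[j] * k + out[offset] + carry;
    out[offset--] = (uint32_t)product;  // low 32 bits go back into out
    carry = product >> 32;              // high 32 bits carry into the next limb
  }
  return (uint32_t)carry;
}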
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1265,6 +1265,7 @@
   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z,
                        Register zlen, Register tmp1, Register tmp2, Register tmp3,
                        Register tmp4, Register tmp5, Register tmp6, Register tmp7);
+  void mul_add(Register out, Register in, Register offs, Register len, Register k);
   // ISB may be needed because of a safepoint
   void maybe_isb() { isb(); }
 
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -3607,6 +3607,63 @@
     return start;
   }
 
+  address generate_squareToLen() {
+    // The squareToLen algorithm for sizes 1..127 described in the Java code runs
+    // faster than multiply_to_len on some CPUs and slower on others, but
+    // multiply_to_len shows slightly better results overall.
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "squareToLen");
+    address start = __ pc();
+
+    const Register x     = r0;
+    const Register xlen  = r1;
+    const Register z     = r2;
+    const Register zlen  = r3;
+    const Register y     = r4; // == x
+    const Register ylen  = r5; // == xlen
+
+    const Register tmp1  = r10;
+    const Register tmp2  = r11;
+    const Register tmp3  = r12;
+    const Register tmp4  = r13;
+    const Register tmp5  = r14;
+    const Register tmp6  = r15;
+    const Register tmp7  = r16;
+
+    RegSet spilled_regs = RegSet::of(y, ylen);
+    BLOCK_COMMENT("Entry:");
+    __ enter();
+    __ push(spilled_regs, sp);
+    __ mov(y, x);
+    __ mov(ylen, xlen);
+    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+    __ pop(spilled_regs, sp);
+    __ leave();
+    __ ret(lr);
+    return start;
+  }
+
+  address generate_mulAdd() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "mulAdd");
+
+    address start = __ pc();
+
+    const Register out     = r0;
+    const Register in      = r1;
+    const Register offset  = r2;
+    const Register len     = r3;
+    const Register k       = r4;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter();
+    __ mul_add(out, in, offset, len, k);
+    __ leave();
+    __ ret(lr);
+
+    return start;
+  }
+
   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
@@ -4913,6 +4970,14 @@
       StubRoutines::_multiplyToLen = generate_multiplyToLen();
     }
 
+    if (UseSquareToLenIntrinsic) {
+      StubRoutines::_squareToLen = generate_squareToLen();
+    }
+
+    if (UseMulAddIntrinsic) {
+      StubRoutines::_mulAdd = generate_mulAdd();
+    }
+
     if (UseMontgomeryMultiplyIntrinsic) {
       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
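Note that generate_squareToLen emits no dedicated squaring loop: it saves y and ylen, aliases y to x, and reuses multiply_to_len. A minimal C++ sketch of that reduction over a schoolbook limb multiply (least-significant limb first here for brevity; all names are illustrative, not HotSpot's):

#include <cstddef>
#include <cstdint>
#include <vector>

// Schoolbook multiply over 32-bit limbs, least significant limb first.
static std::vector<uint32_t> mul_ref(const std::vector<uint32_t>& x,
                                     const std::vector<uint32_t>& y) {
  std::vector<uint32_t> z(x.size() + y.size(), 0);
  for (size_t i = 0; i < x.size(); i++) {
    uint64_t carry = 0;
    for (size_t j = 0; j < y.size(); j++) {
      uint64_t t = (uint64_t)x[i] * y[j] + z[i + j] + carry;
      z[i + j] = (uint32_t)t;
      carry = t >> 32;
    }
    z[i + y.size()] = (uint32_t)carry;  // top limb of this row is still zero
  }
  return z;
}

// The stub takes the same shortcut: squaring is just a multiply with y == x.
static std::vector<uint32_t> square_ref(const std::vector<uint32_t>& x) {
  return mul_ref(x, x);
}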
--- a/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -2297,6 +2297,7 @@
                                         ConstantPoolCacheEntry::f1_offset())));
     const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ ldr(obj, Address(obj, mirror_offset));
+    __ resolve_oop_handle(obj);
   }
 }
 
--- a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -340,6 +340,14 @@
     UseMultiplyToLenIntrinsic = true;
   }
 
+  if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
+    UseSquareToLenIntrinsic = true;
+  }
+
+  if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
+    UseMulAddIntrinsic = true;
+  }
+
   if (FLAG_IS_DEFAULT(UseBarriersForVolatile)) {
     UseBarriersForVolatile = (_features & CPU_DMB_ATOMICS) != 0;
   }
--- a/src/hotspot/cpu/arm/jniTypes_arm.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/arm/jniTypes_arm.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2008, 2017, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -25,9 +25,9 @@
 #ifndef CPU_ARM_VM_JNITYPES_ARM_HPP
 #define CPU_ARM_VM_JNITYPES_ARM_HPP
 
+#include "jni.h"
 #include "memory/allocation.hpp"
 #include "oops/oop.hpp"
-#include "prims/jni.h"
 
 // This file holds platform-dependent routines used to write primitive jni
 // types to the array of arguments passed into JavaCalls::call
--- a/src/hotspot/cpu/arm/macroAssembler_arm.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/arm/macroAssembler_arm.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -2899,6 +2899,7 @@
   ldr(tmp, Address(tmp,  ConstMethod::constants_offset()));
   ldr(tmp, Address(tmp, ConstantPool::pool_holder_offset_in_bytes()));
   ldr(mirror, Address(tmp, mirror_offset));
+  resolve_oop_handle(mirror);
 }
 
 
--- a/src/hotspot/cpu/arm/stubGenerator_arm.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/arm/stubGenerator_arm.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2008, 2017, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -2867,46 +2867,51 @@
   //  Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR) except for callee_saved_regs.
   void gen_write_ref_array_pre_barrier(Register addr, Register count, int callee_saved_regs) {
     BarrierSet* bs = Universe::heap()->barrier_set();
-    if (bs->has_write_ref_pre_barrier()) {
-      assert(bs->has_write_ref_array_pre_opt(),
-             "Else unsupported barrier set.");
-
-      assert( addr->encoding() < callee_saved_regs, "addr must be saved");
-      assert(count->encoding() < callee_saved_regs, "count must be saved");
-
-      BLOCK_COMMENT("PreBarrier");
+    switch (bs->kind()) {
+    case BarrierSet::G1SATBCTLogging:
+      {
+        assert( addr->encoding() < callee_saved_regs, "addr must be saved");
+        assert(count->encoding() < callee_saved_regs, "count must be saved");
+
+        BLOCK_COMMENT("PreBarrier");
 
 #ifdef AARCH64
-      callee_saved_regs = align_up(callee_saved_regs, 2);
-      for (int i = 0; i < callee_saved_regs; i += 2) {
-        __ raw_push(as_Register(i), as_Register(i+1));
+        callee_saved_regs = align_up(callee_saved_regs, 2);
+        for (int i = 0; i < callee_saved_regs; i += 2) {
+          __ raw_push(as_Register(i), as_Register(i+1));
+        }
+#else
+        RegisterSet saved_regs = RegisterSet(R0, as_Register(callee_saved_regs-1));
+        __ push(saved_regs | R9ifScratched);
+#endif // AARCH64
+
+        if (addr != R0) {
+          assert_different_registers(count, R0);
+          __ mov(R0, addr);
+        }
+#ifdef AARCH64
+        __ zero_extend(R1, count, 32); // BarrierSet::static_write_ref_array_pre takes size_t
+#else
+        if (count != R1) {
+          __ mov(R1, count);
+        }
+#endif // AARCH64
+
+        __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
+
+#ifdef AARCH64
+        for (int i = callee_saved_regs - 2; i >= 0; i -= 2) {
+          __ raw_pop(as_Register(i), as_Register(i+1));
+        }
+#else
+        __ pop(saved_regs | R9ifScratched);
+#endif // AARCH64
       }
-#else
-      RegisterSet saved_regs = RegisterSet(R0, as_Register(callee_saved_regs-1));
-      __ push(saved_regs | R9ifScratched);
-#endif // AARCH64
-
-      if (addr != R0) {
-        assert_different_registers(count, R0);
-        __ mov(R0, addr);
-      }
-#ifdef AARCH64
-      __ zero_extend(R1, count, 32); // BarrierSet::static_write_ref_array_pre takes size_t
-#else
-      if (count != R1) {
-        __ mov(R1, count);
-      }
-#endif // AARCH64
-
-      __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
-
-#ifdef AARCH64
-      for (int i = callee_saved_regs - 2; i >= 0; i -= 2) {
-        __ raw_pop(as_Register(i), as_Register(i+1));
-      }
-#else
-      __ pop(saved_regs | R9ifScratched);
-#endif // AARCH64
+    case BarrierSet::CardTableForRS:
+    case BarrierSet::CardTableExtension:
+      break;
+    default:
+      ShouldNotReachHere();
     }
   }
 #endif // INCLUDE_ALL_GCS
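The G1SATBCTLogging case above saves the live registers and calls BarrierSet::static_write_ref_array_pre before the copy runs. Conceptually, that runtime call hands the about-to-be-overwritten references to the collector so snapshot-at-the-beginning marking stays correct; a sketch of the idea (not HotSpot's implementation, all names illustrative):

#include <cstddef>

typedef void* oop_sketch;

static void enqueue_for_satb(oop_sketch o) { /* record in a marking queue */ (void)o; }

// Before 'count' reference slots starting at 'dst' are overwritten, expose
// the old values to the concurrent marker.
static void write_ref_array_pre_sketch(oop_sketch* dst, size_t count) {
  for (size_t i = 0; i < count; i++) {
    if (dst[i] != nullptr) {
      enqueue_for_satb(dst[i]);  // old value must stay visible to marking
    }
  }
}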
--- a/src/hotspot/cpu/arm/templateTable_arm.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/arm/templateTable_arm.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -2963,6 +2963,7 @@
              cp_base_offset + ConstantPoolCacheEntry::f1_offset()));
     const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ ldr(Robj, Address(Robj, mirror_offset));
+    __ resolve_oop_handle(Robj);
   }
 }
 
--- a/src/hotspot/cpu/ppc/assembler_ppc.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/assembler_ppc.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -517,6 +517,9 @@
     XXPERMDI_OPCODE= (60u << OPCODE_SHIFT |   10u << 3),
     XXMRGHW_OPCODE = (60u << OPCODE_SHIFT |   18u << 3),
     XXMRGLW_OPCODE = (60u << OPCODE_SHIFT |   50u << 3),
+    XXSPLTW_OPCODE = (60u << OPCODE_SHIFT |  164u << 2),
+    XXLXOR_OPCODE  = (60u << OPCODE_SHIFT |  154u << 3),
+    XXLEQV_OPCODE  = (60u << OPCODE_SHIFT |  186u << 3),
 
     // Vector Permute and Formatting
     VPKPX_OPCODE   = (4u  << OPCODE_SHIFT |  782u     ),
@@ -1125,6 +1128,7 @@
   static int vsplti_sim(int        x)  { return  opp_u_field(x,             15, 11); } // for vsplti* instructions
   static int vsldoi_shb(int        x)  { return  opp_u_field(x,             25, 22); } // for vsldoi instruction
   static int vcmp_rc(   int        x)  { return  opp_u_field(x,             21, 21); } // for vcmp* instructions
+  static int xxsplt_uim(int        x)  { return  opp_u_field(x,             15, 14); } // for xxsplt* instructions
 
   //static int xo1(     int        x)  { return  opp_u_field(x,             29, 21); }// is contained in our opcodes
   //static int xo2(     int        x)  { return  opp_u_field(x,             30, 21); }// is contained in our opcodes
@@ -1308,6 +1312,7 @@
   inline void li(   Register d, int si16);
   inline void lis(  Register d, int si16);
   inline void addir(Register d, int si16, Register a);
+  inline void subi( Register d, Register a, int si16);
 
   static bool is_addi(int x) {
      return ADDI_OPCODE == (x & ADDI_OPCODE_MASK);
@@ -2154,6 +2159,11 @@
   inline void xxpermdi( VectorSRegister d, VectorSRegister a, VectorSRegister b, int dm);
   inline void xxmrghw(  VectorSRegister d, VectorSRegister a, VectorSRegister b);
   inline void xxmrglw(  VectorSRegister d, VectorSRegister a, VectorSRegister b);
+  inline void mtvsrd(   VectorSRegister d, Register a);
+  inline void mtvsrwz(  VectorSRegister d, Register a);
+  inline void xxspltw(  VectorSRegister d, VectorSRegister b, int ui2);
+  inline void xxlxor(   VectorSRegister d, VectorSRegister a, VectorSRegister b);
+  inline void xxleqv(   VectorSRegister d, VectorSRegister a, VectorSRegister b);
 
   // VSX Extended Mnemonics
   inline void xxspltd(  VectorSRegister d, VectorSRegister a, int x);
@@ -2174,7 +2184,8 @@
   inline void vsbox(       VectorRegister d, VectorRegister a);
 
   // SHA (introduced with Power 8)
-  // Not yet implemented.
+  inline void vshasigmad(VectorRegister d, VectorRegister a, bool st, int six);
+  inline void vshasigmaw(VectorRegister d, VectorRegister a, bool st, int six);
 
   // Vector Binary Polynomial Multiplication (introduced with Power 8)
   inline void vpmsumb(  VectorRegister d, VectorRegister a, VectorRegister b);
@@ -2285,6 +2296,11 @@
   inline void lvsl(  VectorRegister d, Register s2);
   inline void lvsr(  VectorRegister d, Register s2);
 
+  // Endianness-specific concatenation of 2 loaded vectors.
+  inline void load_perm(VectorRegister perm, Register addr);
+  inline void vec_perm(VectorRegister first_dest, VectorRegister second, VectorRegister perm);
+  inline void vec_perm(VectorRegister dest, VectorRegister first, VectorRegister second, VectorRegister perm);
+
   // RegisterOrConstant versions.
   // These emitters choose between the versions using two registers and
   // those with register and immediate, depending on the content of roc.
--- a/src/hotspot/cpu/ppc/assembler_ppc.inline.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/assembler_ppc.inline.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -164,6 +164,7 @@
 inline void Assembler::li(   Register d, int si16)             { Assembler::addi_r0ok( d, R0, si16); }
 inline void Assembler::lis(  Register d, int si16)             { Assembler::addis_r0ok(d, R0, si16); }
 inline void Assembler::addir(Register d, int si16, Register a) { Assembler::addi(d, a, si16); }
+inline void Assembler::subi( Register d, Register a, int si16) { Assembler::addi(d, a, -si16); }
 
 // PPC 1, section 3.3.9, Fixed-Point Compare Instructions
 inline void Assembler::cmpi(  ConditionRegister f, int l, Register a, int si16)   { emit_int32( CMPI_OPCODE  | bf(f) | l10(l) | ra(a) | simm(si16,16)); }
@@ -760,9 +761,14 @@
 // Vector-Scalar (VSX) instructions.
 inline void Assembler::lxvd2x(  VectorSRegister d, Register s1)              { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra(0) | rb(s1)); }
 inline void Assembler::lxvd2x(  VectorSRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra0mem(s1) | rb(s2)); }
-inline void Assembler::stxvd2x( VectorSRegister d, Register s1)              { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra(0) | rb(s1)); }
-inline void Assembler::stxvd2x( VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra0mem(s1) | rb(s2)); }
-inline void Assembler::mtvrd(   VectorRegister  d, Register a)               { emit_int32( MTVSRD_OPCODE  | vsrt(d->to_vsr()) | ra(a)); }
+inline void Assembler::stxvd2x( VectorSRegister d, Register s1)              { emit_int32( STXVD2X_OPCODE | vsrs(d) | ra(0) | rb(s1)); }
+inline void Assembler::stxvd2x( VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrs(d) | ra0mem(s1) | rb(s2)); }
+inline void Assembler::mtvsrd(  VectorSRegister d, Register a)               { emit_int32( MTVSRD_OPCODE  | vsrt(d)  | ra(a)); }
+inline void Assembler::mtvsrwz( VectorSRegister d, Register a)               { emit_int32( MTVSRWZ_OPCODE | vsrt(d) | ra(a)); }
+inline void Assembler::xxspltw( VectorSRegister d, VectorSRegister b, int ui2)           { emit_int32( XXSPLTW_OPCODE | vsrt(d) | vsrb(b) | xxsplt_uim(uimm(ui2,2))); }
+inline void Assembler::xxlxor(  VectorSRegister d, VectorSRegister a, VectorSRegister b) { emit_int32( XXLXOR_OPCODE | vsrt(d) | vsra(a) | vsrb(b)); }
+inline void Assembler::xxleqv(  VectorSRegister d, VectorSRegister a, VectorSRegister b) { emit_int32( XXLEQV_OPCODE | vsrt(d) | vsra(a) | vsrb(b)); }
+inline void Assembler::mtvrd(    VectorRegister d, Register a)               { emit_int32( MTVSRD_OPCODE  | vsrt(d->to_vsr()) | ra(a)); }
 inline void Assembler::mfvrd(   Register        a, VectorRegister d)         { emit_int32( MFVSRD_OPCODE  | vsrt(d->to_vsr()) | ra(a)); }
 inline void Assembler::mtvrwz(  VectorRegister  d, Register a)               { emit_int32( MTVSRWZ_OPCODE | vsrt(d->to_vsr()) | ra(a)); }
 inline void Assembler::mfvrwz(  Register        a, VectorRegister d)         { emit_int32( MFVSRWZ_OPCODE | vsrt(d->to_vsr()) | ra(a)); }
@@ -925,7 +931,8 @@
 inline void Assembler::vsbox(       VectorRegister d, VectorRegister a)                   { emit_int32( VSBOX_OPCODE        | vrt(d) | vra(a)         ); }
 
 // SHA (introduced with Power 8)
-// Not yet implemented.
+inline void Assembler::vshasigmad(VectorRegister d, VectorRegister a, bool st, int six) { emit_int32( VSHASIGMAD_OPCODE | vrt(d) | vra(a) | vst(st) | vsix(six)); }
+inline void Assembler::vshasigmaw(VectorRegister d, VectorRegister a, bool st, int six) { emit_int32( VSHASIGMAW_OPCODE | vrt(d) | vra(a) | vst(st) | vsix(six)); }
 
 // Vector Binary Polynomial Multiplication (introduced with Power 8)
 inline void Assembler::vpmsumb(  VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPMSUMB_OPCODE | vrt(d) | vra(a) | vrb(b)); }
@@ -1034,6 +1041,30 @@
 inline void Assembler::lvsl(  VectorRegister d, Register s2) { emit_int32( LVSL_OPCODE   | vrt(d) | rb(s2)); }
 inline void Assembler::lvsr(  VectorRegister d, Register s2) { emit_int32( LVSR_OPCODE   | vrt(d) | rb(s2)); }
 
+inline void Assembler::load_perm(VectorRegister perm, Register addr) {
+#if defined(VM_LITTLE_ENDIAN)
+  lvsr(perm, addr);
+#else
+  lvsl(perm, addr);
+#endif
+}
+
+inline void Assembler::vec_perm(VectorRegister first_dest, VectorRegister second, VectorRegister perm) {
+#if defined(VM_LITTLE_ENDIAN)
+  vperm(first_dest, second, first_dest, perm);
+#else
+  vperm(first_dest, first_dest, second, perm);
+#endif
+}
+
+inline void Assembler::vec_perm(VectorRegister dest, VectorRegister first, VectorRegister second, VectorRegister perm) {
+#if defined(VM_LITTLE_ENDIAN)
+  vperm(dest, second, first, perm);
+#else
+  vperm(dest, first, second, perm);
+#endif
+}
+
 inline void Assembler::load_const(Register d, void* x, Register tmp) {
    load_const(d, (long)x, tmp);
 }
--- a/src/hotspot/cpu/ppc/globals_ppc.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/globals_ppc.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -32,7 +32,7 @@
 // Sets the default values for platform dependent flags used by the runtime system.
 // (see globals.hpp)
 
-define_pd_global(bool, ShareVtableStubs,      false); // Improves performance markedly for mtrt and compress.
+define_pd_global(bool, ShareVtableStubs,      true);
 define_pd_global(bool, NeedsDeoptSuspend,     false); // Only register window machines need this.
 
 
@@ -103,6 +103,9 @@
           "CPU Version: x for PowerX. Currently recognizes Power5 to "      \
           "Power8. Default is 0. Newer CPUs will be recognized as Power8.") \
                                                                             \
+  product(bool, SuperwordUseVSX, false,                                     \
+          "Use Power8 VSX instructions for superword optimization.")        \
+                                                                            \
   /* Reoptimize code-sequences of calls at runtime, e.g. replace an */      \
   /* indirect call by a direct call.                                */      \
   product(bool, ReoptimizeCallSequences, true,                              \
--- a/src/hotspot/cpu/ppc/interp_masm_ppc_64.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/interp_masm_ppc_64.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -863,7 +863,7 @@
     //
     // markOop displaced_header = obj->mark().set_unlocked();
     // monitor->lock()->set_displaced_header(displaced_header);
-    // if (Atomic::cmpxchg_ptr(/*ex=*/monitor, /*addr*/obj->mark_addr(), /*cmp*/displaced_header) == displaced_header) {
+    // if (Atomic::cmpxchg(/*ex=*/monitor, /*addr*/obj->mark_addr(), /*cmp*/displaced_header) == displaced_header) {
     //   // We stored the monitor address into the object's mark word.
     // } else if (THREAD->is_lock_owned((address)displaced_header))
     //   // Simple recursive case.
@@ -901,7 +901,7 @@
     std(displaced_header, BasicObjectLock::lock_offset_in_bytes() +
         BasicLock::displaced_header_offset_in_bytes(), monitor);
 
-    // if (Atomic::cmpxchg_ptr(/*ex=*/monitor, /*addr*/obj->mark_addr(), /*cmp*/displaced_header) == displaced_header) {
+    // if (Atomic::cmpxchg(/*ex=*/monitor, /*addr*/obj->mark_addr(), /*cmp*/displaced_header) == displaced_header) {
 
     // Store stack address of the BasicObjectLock (this is monitor) into object.
     addi(object_mark_addr, object, oopDesc::mark_offset_in_bytes());
@@ -977,7 +977,7 @@
     // if ((displaced_header = monitor->displaced_header()) == NULL) {
     //   // Recursive unlock. Mark the monitor unlocked by setting the object field to NULL.
     //   monitor->set_obj(NULL);
-    // } else if (Atomic::cmpxchg_ptr(displaced_header, obj->mark_addr(), monitor) == monitor) {
+    // } else if (Atomic::cmpxchg(displaced_header, obj->mark_addr(), monitor) == monitor) {
     //   // We swapped the unlocked mark in displaced_header into the object's mark word.
     //   monitor->set_obj(NULL);
     // } else {
@@ -1010,7 +1010,7 @@
     cmpdi(CCR0, displaced_header, 0);
     beq(CCR0, free_slot); // recursive unlock
 
-    // } else if (Atomic::cmpxchg_ptr(displaced_header, obj->mark_addr(), monitor) == monitor) {
+    // } else if (Atomic::cmpxchg(displaced_header, obj->mark_addr(), monitor) == monitor) {
     //   // We swapped the unlocked mark in displaced_header into the object's mark word.
     //   monitor->set_obj(NULL);
 
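The commented pseudocode in these hunks describes HotSpot's displaced-header stack locking: the unlocked mark word is saved into the on-stack BasicLock, then a compare-and-swap publishes the lock address into the object's mark word. A heavily simplified, self-contained C++ analogue of just the fast lock path, using std::atomic as a stand-in for Atomic::cmpxchg (all types and names here are illustrative, not HotSpot's):

#include <atomic>
#include <cstdint>

struct BasicLockSketch { std::uintptr_t displaced_header; };

// Fast path only: save the (unlocked) mark word into the on-stack lock, then
// try to publish the lock's address into the object's mark word with a CAS.
// Failure falls through to the recursive/inflated cases handled elsewhere.
static bool try_stack_lock(std::atomic<std::uintptr_t>& mark_word,
                           BasicLockSketch* monitor) {
  std::uintptr_t displaced = mark_word.load() | 1;  // set_unlocked(): tag bit on
  monitor->displaced_header = displaced;
  std::uintptr_t expected = displaced;
  return mark_word.compare_exchange_strong(
      expected, reinterpret_cast<std::uintptr_t>(monitor));
}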
--- a/src/hotspot/cpu/ppc/jniTypes_ppc.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/jniTypes_ppc.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2017, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2013 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -26,9 +26,9 @@
 #ifndef CPU_PPC_VM_JNITYPES_PPC_HPP
 #define CPU_PPC_VM_JNITYPES_PPC_HPP
 
+#include "jni.h"
 #include "memory/allocation.hpp"
 #include "oops/oop.hpp"
-#include "prims/jni.h"
 
 // This file holds platform-dependent routines used to write primitive
 // jni types to the array of arguments passed into JavaCalls::call.
--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -129,7 +129,7 @@
   }
 }
 
-int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
+address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
   const int offset = MacroAssembler::offset_to_global_toc(addr);
 
   const address inst2_addr = a;
@@ -155,7 +155,7 @@
   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
-  return (int)((intptr_t)addr - (intptr_t)inst1_addr);
+  return inst1_addr;
 }
 
 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
@@ -201,7 +201,7 @@
 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 //    ori rx = rx | const.lo
 // Clrldi will be passed by.
-int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
+address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
   assert(UseCompressedOops, "Should only patch compressed oops");
 
   const address inst2_addr = a;
@@ -227,7 +227,7 @@
 
   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
   set_imm((int *)inst2_addr,        (xd)); // unsigned int
-  return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
+  return inst1_addr;
 }
 
 // Get compressed oop or klass constant.
@@ -3382,6 +3382,7 @@
   ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
   ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
   ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
+  resolve_oop_handle(mirror);
 }
 
 // Clear Array
@@ -5234,6 +5235,40 @@
   bind(L_post_third_loop_done);
 }   // multiply_128_x_128_loop
 
+void MacroAssembler::muladd(Register out, Register in,
+                            Register offset, Register len, Register k,
+                            Register tmp1, Register tmp2, Register carry) {
+
+  // Labels
+  Label LOOP, SKIP;
+
+  // Make sure length is positive.
+  cmpdi  (CCR0,    len,     0);
+
+  // Prepare variables
+  subi   (offset,  offset,  4);
+  li     (carry,   0);
+  ble    (CCR0,    SKIP);
+
+  mtctr  (len);
+  subi   (len,     len,     1    );
+  sldi   (len,     len,     2    );
+
+  // Main loop
+  bind(LOOP);
+  lwzx   (tmp1,    len,     in   );
+  lwzx   (tmp2,    offset,  out  );
+  mulld  (tmp1,    tmp1,    k    );
+  add    (tmp2,    carry,   tmp2 );
+  add    (tmp2,    tmp1,    tmp2 );
+  stwx   (tmp2,    offset,  out  );
+  srdi   (carry,   tmp2,    32   );
+  subi   (offset,  offset,  4    );
+  subi   (len,     len,     4    );
+  bdnz   (LOOP);
+  bind(SKIP);
+}
+
 void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                      Register y, Register ylen,
                                      Register z, Register zlen,
--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -105,13 +105,15 @@
   };
 
   inline static bool is_calculate_address_from_global_toc_at(address a, address bound);
-  static int patch_calculate_address_from_global_toc_at(address a, address addr, address bound);
+  // Returns address of first instruction in sequence.
+  static address patch_calculate_address_from_global_toc_at(address a, address bound, address addr);
   static address get_address_of_calculate_address_from_global_toc_at(address a, address addr);
 
 #ifdef _LP64
   // Patch narrow oop constant.
   inline static bool is_set_narrow_oop(address a, address bound);
-  static int patch_set_narrow_oop(address a, address bound, narrowOop data);
+  // Returns address of first instruction in sequence.
+  static address patch_set_narrow_oop(address a, address bound, narrowOop data);
   static narrowOop get_narrow_oop(address a, address bound);
 #endif
 
@@ -813,6 +815,8 @@
                                Register yz_idx, Register idx, Register carry,
                                Register product_high, Register product,
                                Register carry2, Register tmp);
+  void muladd(Register out, Register in, Register offset, Register len, Register k,
+              Register tmp1, Register tmp2, Register carry);
   void multiply_to_len(Register x, Register xlen,
                        Register y, Register ylen,
                        Register z, Register zlen,
@@ -862,6 +866,40 @@
   void kernel_crc32_singleByteReg(Register crc, Register val, Register table,
                                   bool invertCRC);
 
+  // SHA-2 auxiliary functions and public interfaces
+ private:
+  void sha256_deque(const VectorRegister src,
+      const VectorRegister dst1, const VectorRegister dst2, const VectorRegister dst3);
+  void sha256_load_h_vec(const VectorRegister a, const VectorRegister e, const Register hptr);
+  void sha256_round(const VectorRegister* hs, const int total_hs, int& h_cnt, const VectorRegister kpw);
+  void sha256_load_w_plus_k_vec(const Register buf_in, const VectorRegister* ws,
+      const int total_ws, const Register k, const VectorRegister* kpws,
+      const int total_kpws);
+  void sha256_calc_4w(const VectorRegister w0, const VectorRegister w1,
+      const VectorRegister w2, const VectorRegister w3, const VectorRegister kpw0,
+      const VectorRegister kpw1, const VectorRegister kpw2, const VectorRegister kpw3,
+      const Register j, const Register k);
+  void sha256_update_sha_state(const VectorRegister a, const VectorRegister b,
+      const VectorRegister c, const VectorRegister d, const VectorRegister e,
+      const VectorRegister f, const VectorRegister g, const VectorRegister h,
+      const Register hptr);
+
+  void sha512_load_w_vec(const Register buf_in, const VectorRegister* ws, const int total_ws);
+  void sha512_update_sha_state(const Register state, const VectorRegister* hs, const int total_hs);
+  void sha512_round(const VectorRegister* hs, const int total_hs, int& h_cnt, const VectorRegister kpw);
+  void sha512_load_h_vec(const Register state, const VectorRegister* hs, const int total_hs);
+  void sha512_calc_2w(const VectorRegister w0, const VectorRegister w1,
+      const VectorRegister w2, const VectorRegister w3,
+      const VectorRegister w4, const VectorRegister w5,
+      const VectorRegister w6, const VectorRegister w7,
+      const VectorRegister kpw0, const VectorRegister kpw1, const Register j,
+      const VectorRegister vRb, const Register k);
+
+ public:
+  void sha256(bool multi_block);
+  void sha512(bool multi_block);
+
+
   //
   // Debugging
   //
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc_sha.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -0,0 +1,1136 @@
+// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
+// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+//
+// This code is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License version 2 only, as
+// published by the Free Software Foundation.
+//
+// This code is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// version 2 for more details (a copy is included in the LICENSE file that
+// accompanied this code).
+//
+// You should have received a copy of the GNU General Public License version
+// 2 along with this work; if not, write to the Free Software Foundation,
+// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+//
+// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+// or visit www.oracle.com if you need additional information or have any
+// questions.
+
+// Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512"
+// (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf).
+
+#include "asm/macroAssembler.inline.hpp"
+#include "runtime/stubRoutines.hpp"
+
+/**********************************************************************
+ * SHA 256
+ *********************************************************************/
+
+void MacroAssembler::sha256_deque(const VectorRegister src,
+                                  const VectorRegister dst1,
+                                  const VectorRegister dst2,
+                                  const VectorRegister dst3) {
+  vsldoi (dst1, src, src, 12);
+  vsldoi (dst2, src, src, 8);
+  vsldoi (dst3, src, src, 4);
+}
+
+void MacroAssembler::sha256_round(const VectorRegister* hs,
+                                  const int total_hs,
+                                  int& h_cnt,
+                                  const VectorRegister kpw) {
+  // convenience registers: cycle from 0-7 downwards
+  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
+  // temporaries
+  VectorRegister ch  = VR0;
+  VectorRegister maj = VR1;
+  VectorRegister bsa = VR2;
+  VectorRegister bse = VR3;
+  VectorRegister vt0 = VR4;
+  VectorRegister vt1 = VR5;
+  VectorRegister vt2 = VR6;
+  VectorRegister vt3 = VR7;
+
+  vsel       (ch,  g,   f, e);
+  vxor       (maj, a,   b);
+  vshasigmaw (bse, e,   1, 0xf);
+  vadduwm    (vt2, ch,  kpw);
+  vadduwm    (vt1, h,   bse);
+  vsel       (maj, b,   c, maj);
+  vadduwm    (vt3, vt1, vt2);
+  vshasigmaw (bsa, a,   1, 0);
+  vadduwm    (vt0, bsa, maj);
+
+  vadduwm    (d,   d,   vt3);
+  vadduwm    (h,   vt3, vt0);
+
+  // advance vector pointer to the next iteration
+  h_cnt++;
+}
+
+void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
+                                       const VectorRegister e,
+                                       const Register hptr) {
+  // temporaries
+  Register tmp = R8;
+  VectorRegister vt0 = VR0;
+  VectorRegister vRb = VR6;
+  // labels
+  Label sha256_aligned;
+
+  andi_  (tmp,  hptr, 0xf);
+  lvx    (a,    hptr);
+  addi   (tmp,  hptr, 16);
+  lvx    (e,    tmp);
+  beq    (CCR0, sha256_aligned);
+
+  // handle unaligned accesses
+  load_perm(vRb, hptr);
+  addi   (tmp, hptr, 32);
+  vec_perm(a,   e,    vRb);
+
+  lvx    (vt0,  tmp);
+  vec_perm(e,   vt0,  vRb);
+
+  // aligned accesses
+  bind(sha256_aligned);
+}
+
+void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
+                                              const VectorRegister* ws,
+                                              const int total_ws,
+                                              const Register k,
+                                              const VectorRegister* kpws,
+                                              const int total_kpws) {
+  Label w_aligned, after_w_load;
+
+  Register tmp       = R8;
+  VectorRegister vt0 = VR0;
+  VectorRegister vt1 = VR1;
+  VectorRegister vRb = VR6;
+
+  andi_ (tmp, buf_in, 0xF);
+  beq   (CCR0, w_aligned); // address ends with 0x0, not 0x8
+
+  // deal with unaligned addresses
+  lvx    (ws[0], buf_in);
+  load_perm(vRb, buf_in);
+
+  for (int n = 1; n < total_ws; n++) {
+    VectorRegister w_cur = ws[n];
+    VectorRegister w_prev = ws[n-1];
+
+    addi (tmp, buf_in, n * 16);
+    lvx  (w_cur, tmp);
+    vec_perm(w_prev, w_cur, vRb);
+  }
+  addi   (tmp, buf_in, total_ws * 16);
+  lvx    (vt0, tmp);
+  vec_perm(ws[total_ws-1], vt0, vRb);
+  b      (after_w_load);
+
+  bind(w_aligned);
+
+  // deal with aligned addresses
+  lvx(ws[0], buf_in);
+  for (int n = 1; n < total_ws; n++) {
+    VectorRegister w = ws[n];
+    addi (tmp, buf_in, n * 16);
+    lvx  (w, tmp);
+  }
+
+  bind(after_w_load);
+
+#if defined(VM_LITTLE_ENDIAN)
+  // Byte swapping within int values
+  li       (tmp, 8);
+  lvsl     (vt0, tmp);
+  vspltisb (vt1, 0xb);
+  vxor     (vt1, vt0, vt1);
+  for (int n = 0; n < total_ws; n++) {
+    VectorRegister w = ws[n];
+    vec_perm(w, w, vt1);
+  }
+#endif
+
+  // Loading k, which is always aligned to 16-bytes
+  lvx    (kpws[0], k);
+  for (int n = 1; n < total_kpws; n++) {
+    VectorRegister kpw = kpws[n];
+    addi (tmp, k, 16 * n);
+    lvx  (kpw, tmp);
+  }
+
+  // Add w to K
+  assert(total_ws == total_kpws, "Redesign the loop below");
+  for (int n = 0; n < total_kpws; n++) {
+    VectorRegister kpw = kpws[n];
+    VectorRegister w   = ws[n];
+
+    vadduwm  (kpw, kpw, w);
+  }
+}
+
+void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
+                                    const VectorRegister w1,
+                                    const VectorRegister w2,
+                                    const VectorRegister w3,
+                                    const VectorRegister kpw0,
+                                    const VectorRegister kpw1,
+                                    const VectorRegister kpw2,
+                                    const VectorRegister kpw3,
+                                    const Register j,
+                                    const Register k) {
+  // Temporaries
+  const VectorRegister  vt0  = VR0;
+  const VectorRegister  vt1  = VR1;
+  const VectorSRegister vsrt1 = vt1->to_vsr();
+  const VectorRegister  vt2  = VR2;
+  const VectorRegister  vt3  = VR3;
+  const VectorSRegister vst3 = vt3->to_vsr();
+  const VectorRegister  vt4  = VR4;
+
+  // load to k[j]
+  lvx        (vt0, j,   k);
+
+  // advance j
+  addi       (j,   j,   16); // 16 bytes were read
+
+#if defined(VM_LITTLE_ENDIAN)
+  // b = w[j-15], w[j-14], w[j-13], w[j-12]
+  vsldoi     (vt1, w1,  w0, 12);
+
+  // c = w[j-7], w[j-6], w[j-5], w[j-4]
+  vsldoi     (vt2, w3,  w2, 12);
+
+#else
+  // b = w[j-15], w[j-14], w[j-13], w[j-12]
+  vsldoi     (vt1, w0,  w1, 4);
+
+  // c = w[j-7], w[j-6], w[j-5], w[j-4]
+  vsldoi     (vt2, w2,  w3, 4);
+#endif
+
+  // d = w[j-2], w[j-1], w[j-4], w[j-3]
+  vsldoi     (vt3, w3,  w3, 8);
+
+  // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
+  vshasigmaw (vt1, vt1, 0,  0);
+
+  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
+  vshasigmaw (vt3, vt3, 0,  0xf);
+
+  // c = s0(w[j-15]) + w[j-7],
+  //     s0(w[j-14]) + w[j-6],
+  //     s0(w[j-13]) + w[j-5],
+  //     s0(w[j-12]) + w[j-4]
+  vadduwm    (vt2, vt1, vt2);
+
+  // c = s0(w[j-15]) + w[j-7] + w[j-16],
+  //     s0(w[j-14]) + w[j-6] + w[j-15],
+  //     s0(w[j-13]) + w[j-5] + w[j-14],
+  //     s0(w[j-12]) + w[j-4] + w[j-13]
+  vadduwm    (vt2, vt2, w0);
+
+  // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
+  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
+  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
+  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
+  vadduwm    (vt4, vt2, vt3);
+
+  // At this point, e[0] and e[1] are the correct values to be stored at w[j]
+  // and w[j+1].
+  // e[2] and e[3] are not considered.
+  // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
+  vshasigmaw (vt1, vt4, 0,  0xf);
+
+  // v5 = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
+#if defined(VM_LITTLE_ENDIAN)
+  xxmrgld    (vst3, vsrt1, vst3);
+#else
+  xxmrghd    (vst3, vst3, vsrt1);
+#endif
+
+  // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
+  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
+  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
+  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
+  vadduwm    (vt2, vt2, vt3);
+
+  // Updating w0 to w3 to hold the new previous 16 values from w.
+  vmr        (w0,  w1);
+  vmr        (w1,  w2);
+  vmr        (w2,  w3);
+  vmr        (w3,  vt2);
+
+  // store k + w to v9 (4 values at once)
+#if defined(VM_LITTLE_ENDIAN)
+  vadduwm    (kpw0, vt2, vt0);
+
+  vsldoi     (kpw1, kpw0, kpw0, 12);
+  vsldoi     (kpw2, kpw0, kpw0, 8);
+  vsldoi     (kpw3, kpw0, kpw0, 4);
+#else
+  vadduwm    (kpw3, vt2, vt0);
+
+  vsldoi     (kpw2, kpw3, kpw3, 12);
+  vsldoi     (kpw1, kpw3, kpw3, 8);
+  vsldoi     (kpw0, kpw3, kpw3, 4);
+#endif
+}
+
+void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
+                                             const VectorRegister b_,
+                                             const VectorRegister c,
+                                             const VectorRegister d,
+                                             const VectorRegister e,
+                                             const VectorRegister f,
+                                             const VectorRegister g,
+                                             const VectorRegister h,
+                                             const Register hptr) {
+  // temporaries
+  VectorRegister vt0  = VR0;
+  VectorRegister vt1  = VR1;
+  VectorRegister vt2  = VR2;
+  VectorRegister vt3  = VR3;
+  VectorRegister vt4  = VR4;
+  VectorRegister vt5  = VR5;
+  VectorRegister vaux = VR6;
+  VectorRegister vRb  = VR6;
+  Register tmp        = R8;
+  Register of16       = R8;
+  Register of32       = R9;
+  Label state_load_aligned;
+
+  // Load hptr
+  andi_   (tmp, hptr, 0xf);
+  li      (of16, 16);
+  lvx     (vt0, hptr);
+  lvx     (vt5, of16, hptr);
+  beq     (CCR0, state_load_aligned);
+
+  // handle unaligned accesses
+  li      (of32, 32);
+  load_perm(vRb, hptr);
+
+  vec_perm(vt0, vt5,  vRb);        // vt0 = hptr[0]..hptr[3]
+
+  lvx     (vt1, hptr, of32);
+  vec_perm(vt5, vt1,  vRb);        // vt5 = hptr[4]..hptr[7]
+
+  // aligned accesses
+  bind(state_load_aligned);
+
+#if defined(VM_LITTLE_ENDIAN)
+  vmrglw  (vt1, b_, a);            // vt1 = {a, b, ?, ?}
+  vmrglw  (vt2, d, c);             // vt2 = {c, d, ?, ?}
+  vmrglw  (vt3, f, e);             // vt3 = {e, f, ?, ?}
+  vmrglw  (vt4, h, g);             // vt4 = {g, h, ?, ?}
+  xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
+  xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
+  vadduwm (a,   vt0, vt1);         // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
+  vadduwm (e,   vt5, vt3);         // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}
+
+  // Save hptr back, works for any alignment
+  xxswapd (vt0->to_vsr(), a->to_vsr());
+  stxvd2x (vt0->to_vsr(), hptr);
+  xxswapd (vt5->to_vsr(), e->to_vsr());
+  stxvd2x (vt5->to_vsr(), of16, hptr);
+#else
+  vmrglw  (vt1, a, b_);            // vt1 = {a, b, ?, ?}
+  vmrglw  (vt2, c, d);             // vt2 = {c, d, ?, ?}
+  vmrglw  (vt3, e, f);             // vt3 = {e, f, ?, ?}
+  vmrglw  (vt4, g, h);             // vt4 = {g, h, ?, ?}
+  xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d}
+  xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h}
+  vadduwm (d,   vt0, vt1);         // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
+  vadduwm (h,   vt5, vt3);         // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}
+
+  // Save hptr back, works for any alignment
+  stxvd2x (d->to_vsr(), hptr);
+  stxvd2x (h->to_vsr(), of16, hptr);
+#endif
+}
+
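+// SHA-256 round constants K[0..63]: the first 32 bits of the fractional
+// parts of the cube roots of the first 64 primes.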
+static const uint32_t sha256_round_table[64] __attribute((aligned(16))) = {
+  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+};
+static const uint32_t *sha256_round_consts = sha256_round_table;
+
+//   R3_ARG1   - byte[]  input, already padded, in big-endian byte order
+//   R4_ARG2   - int[]   SHA-256 state (initially the H constants, derived
+//                       from the square roots of the first eight primes)
+//   R5_ARG3   - int     offset
+//   R6_ARG4   - int     limit
+//
+//   Internal Register usage:
+//   R7        - k
+//   R8        - tmp | j | of16
+//   R9        - of32
+//   VR0-VR8   - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
+//   VR9-VR16  - a-h
+//   VR17-VR20 - w0-w3
+//   VR21-VR23 - vRb | vaux0-vaux2
+//   VR24-VR27 - kpw0-kpw3
+void MacroAssembler::sha256(bool multi_block) {
+  static const ssize_t buf_size = 64;
+  static const uint8_t w_size = sizeof(sha256_round_table)/sizeof(uint32_t);
+#ifdef AIX
+  // The AIX toolchain may not honor the 16-byte alignment of static data,
+  // but malloc does provide it; copy the table into a malloc'ed buffer.
+  if (((uintptr_t)sha256_round_consts & 0xF) != 0) {
+    uint32_t *new_round_consts = (uint32_t*)malloc(sizeof(sha256_round_table));
+    guarantee(new_round_consts, "oom");
+    memcpy(new_round_consts, sha256_round_consts, sizeof(sha256_round_table));
+    sha256_round_consts = (const uint32_t*)new_round_consts;
+  }
+#endif
+
+  Register buf_in = R3_ARG1;
+  Register state  = R4_ARG2;
+  Register ofs    = R5_ARG3;
+  Register limit  = R6_ARG4;
+
+  Label sha_loop, core_loop;
+
+  // Save non-volatile vector registers in the red zone
+  static const VectorRegister nv[] = {
+    VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
+  };
+  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);
+
+  for (int c = 0; c < nv_size; c++) {
+    Register tmp = R8;
+    li  (tmp, (c - (nv_size)) * 16);
+    stvx(nv[c], tmp, R1);
+  }
+
+  // Load hash state to registers
+  VectorRegister a = VR9;
+  VectorRegister b = VR10;
+  VectorRegister c = VR11;
+  VectorRegister d = VR12;
+  VectorRegister e = VR13;
+  VectorRegister f = VR14;
+  VectorRegister g = VR15;
+  VectorRegister h = VR16;
+  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
+  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
+  // counter for cycling through hs vector to avoid register moves between iterations
+  int h_cnt = 0;
+
+  // Load a-h registers from the memory pointed by state
+#if defined(VM_LITTLE_ENDIAN)
+  sha256_load_h_vec(a, e, state);
+#else
+  sha256_load_h_vec(d, h, state);
+#endif
+
+  // Keep k loaded across multi-block iterations to avoid redundant loads.
+  Register k = R7;
+  assert(((uintptr_t)sha256_round_consts & 0xF) == 0, "k alignment");
+  load_const_optimized(k, (address)sha256_round_consts, R0);
+
+  // Align the top of the multi-block loop.
+  if (multi_block) {
+    align(OptoLoopAlignment);
+  }
+  bind(sha_loop);
+#if defined(VM_LITTLE_ENDIAN)
+  sha256_deque(a, b, c, d);
+  sha256_deque(e, f, g, h);
+#else
+  sha256_deque(d, c, b, a);
+  sha256_deque(h, g, f, e);
+#endif
+
+  // Load the first 16 elements of w outside the loop.
+  // The order of the int values is endianness-specific.
+  VectorRegister w0 = VR17;
+  VectorRegister w1 = VR18;
+  VectorRegister w2 = VR19;
+  VectorRegister w3 = VR20;
+  static const VectorRegister ws[] = {w0, w1, w2, w3};
+  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);
+
+  VectorRegister kpw0 = VR24;
+  VectorRegister kpw1 = VR25;
+  VectorRegister kpw2 = VR26;
+  VectorRegister kpw3 = VR27;
+  static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
+  static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);
+
+  sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);
+
+  // Cycle through the first 16 elements
+  assert(total_ws == total_kpws, "Redesign the loop below");
+  for (int n = 0; n < total_ws; n++) {
+    VectorRegister vaux0 = VR21;
+    VectorRegister vaux1 = VR22;
+    VectorRegister vaux2 = VR23;
+
+    sha256_deque(kpws[n], vaux0, vaux1, vaux2);
+
+#if defined(VM_LITTLE_ENDIAN)
+    sha256_round(hs, total_hs, h_cnt, kpws[n]);
+    sha256_round(hs, total_hs, h_cnt, vaux0);
+    sha256_round(hs, total_hs, h_cnt, vaux1);
+    sha256_round(hs, total_hs, h_cnt, vaux2);
+#else
+    sha256_round(hs, total_hs, h_cnt, vaux2);
+    sha256_round(hs, total_hs, h_cnt, vaux1);
+    sha256_round(hs, total_hs, h_cnt, vaux0);
+    sha256_round(hs, total_hs, h_cnt, kpws[n]);
+#endif
+  }
+
+  Register tmp = R8;
+  // Loop over rounds 16 to 63, eight rounds per iteration.
+  li   (tmp, (w_size - 16) / total_hs);
+  mtctr(tmp);
+
+  // j is the byte offset into k (and w); it is kept 16-byte aligned for
+  // vector loads and is advanced whenever it is read (e.g. in sha256_calc_4w).
+  Register j = R8;
+  li   (j, 16*4);
+
+  align(OptoLoopAlignment);
+  bind(core_loop);
+
+  // due to VectorRegister rotate, always iterate in multiples of total_hs
+  for (int n = 0; n < total_hs/4; n++) {
+    sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
+    sha256_round(hs, total_hs, h_cnt, kpw0);
+    sha256_round(hs, total_hs, h_cnt, kpw1);
+    sha256_round(hs, total_hs, h_cnt, kpw2);
+    sha256_round(hs, total_hs, h_cnt, kpw3);
+  }
+
+  bdnz   (core_loop);
+
+  // Update hash state
+  sha256_update_sha_state(a, b, c, d, e, f, g, h, state);
+
+  if (multi_block) {
+    addi(buf_in, buf_in, buf_size);
+    addi(ofs, ofs, buf_size);
+    cmplw(CCR0, ofs, limit);
+    ble(CCR0, sha_loop);
+
+    // return ofs
+    mr(R3_RET, ofs);
+  }
+
+  // Restore non-volatile registers
+  for (int c = 0; c < nv_size; c++) {
+    Register tmp = R8;
+    li  (tmp, (c - (nv_size)) * 16);
+    lvx(nv[c], tmp, R1);
+  }
+}
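+// Note on multi_block above: each iteration of sha_loop consumes one 64-byte
+// block and advances ofs; the loop repeats while ofs <= limit, and the final
+// offset is returned in R3_RET for the caller's compress loop.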
+
+
+/**********************************************************************
+ * SHA 512
+ *********************************************************************/
+
+void MacroAssembler::sha512_load_w_vec(const Register buf_in,
+                                       const VectorRegister* ws,
+                                       const int total_ws) {
+  Register tmp       = R8;
+  VectorRegister vRb = VR8;
+  VectorRegister aux = VR9;
+  Label is_aligned, after_alignment;
+
+  andi_  (tmp, buf_in, 0xF);
+  beq    (CCR0, is_aligned); // address ends with 0x0, not 0x8
+
+  // deal with unaligned addresses
+  lvx    (ws[0], buf_in);
+  load_perm(vRb, buf_in);
+
+  for (int n = 1; n < total_ws; n++) {
+    VectorRegister w_cur = ws[n];
+    VectorRegister w_prev = ws[n-1];
+    addi (tmp, buf_in, n * 16);
+    lvx  (w_cur, tmp);
+    vec_perm(w_prev, w_cur, vRb);
+  }
+  addi   (tmp, buf_in, total_ws * 16);
+  lvx    (aux, tmp);
+  vec_perm(ws[total_ws-1], aux, vRb);
+  b      (after_alignment);
+
+  bind(is_aligned);
+  lvx  (ws[0], buf_in);
+  for (int n = 1; n < total_ws; n++) {
+    VectorRegister w = ws[n];
+    addi (tmp, buf_in, n * 16);
+    lvx  (w, tmp);
+  }
+
+  bind(after_alignment);
+}
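+
+// Alignment note for the code above: lvx ignores the low four bits of the
+// effective address, so for an unaligned buf_in each lvx fetches an aligned
+// 16-byte chunk and vec_perm splices two adjacent chunks (using the permute
+// control that load_perm set up in vRb) into the desired unaligned 16 bytes.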
+
+// Update hash state
+void MacroAssembler::sha512_update_sha_state(const Register state,
+                                             const VectorRegister* hs,
+                                             const int total_hs) {
+
+#if defined(VM_LITTLE_ENDIAN)
+  int start_idx = 0;
+#else
+  int start_idx = 1;
+#endif
+
+  // load initial hash from the memory pointed by state
+  VectorRegister ini_a = VR10;
+  VectorRegister ini_c = VR12;
+  VectorRegister ini_e = VR14;
+  VectorRegister ini_g = VR16;
+  static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
+  static const int total_inis = sizeof(inis)/sizeof(VectorRegister);
+
+  Label state_save_aligned, after_state_save_aligned;
+
+  Register addr      = R7;
+  Register tmp       = R8;
+  VectorRegister vRb = VR8;
+  VectorRegister aux = VR9;
+
+  andi_(tmp, state, 0xf);
+  beq(CCR0, state_save_aligned);
+  // deal with unaligned addresses
+
+  {
+    VectorRegister a = hs[0];
+    VectorRegister b_ = hs[1];
+    VectorRegister c = hs[2];
+    VectorRegister d = hs[3];
+    VectorRegister e = hs[4];
+    VectorRegister f = hs[5];
+    VectorRegister g = hs[6];
+    VectorRegister h = hs[7];
+    load_perm(vRb, state);
+    lvx    (ini_a, state);
+    addi   (addr, state, 16);
+
+    lvx    (ini_c, addr);
+    addi   (addr, state, 32);
+    vec_perm(ini_a, ini_c, vRb);
+
+    lvx    (ini_e, addr);
+    addi   (addr, state, 48);
+    vec_perm(ini_c, ini_e, vRb);
+
+    lvx    (ini_g, addr);
+    addi   (addr, state, 64);
+    vec_perm(ini_e, ini_g, vRb);
+
+    lvx    (aux, addr);
+    vec_perm(ini_g, aux, vRb);
+
+#if defined(VM_LITTLE_ENDIAN)
+    xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
+    xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
+    xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
+    xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());
+#else
+    xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr());
+    xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr());
+    xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr());
+    xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr());
+#endif
+
+    for (int n = start_idx; n < total_hs; n += 2) {
+      VectorRegister h_cur = hs[n];
+      VectorRegister ini_cur = inis[n/2];
+
+      vaddudm(h_cur, ini_cur, h_cur);
+    }
+
+    for (int n = start_idx; n < total_hs; n += 2) {
+      VectorRegister h_cur = hs[n];
+
+      mfvrd  (tmp, h_cur);
+#if defined(VM_LITTLE_ENDIAN)
+      std    (tmp, 8*n + 8, state);
+#else
+      std    (tmp, 8*n - 8, state);
+#endif
+      vsldoi (aux, h_cur, h_cur, 8);
+      mfvrd  (tmp, aux);
+      std    (tmp, 8*n + 0, state);
+    }
+
+    b      (after_state_save_aligned);
+  }
+
+  bind(state_save_aligned);
+  {
+    for (int n = 0; n < total_hs; n += 2) {
+#if defined(VM_LITTLE_ENDIAN)
+      VectorRegister h_cur = hs[n];
+      VectorRegister h_next = hs[n+1];
+#else
+      VectorRegister h_cur = hs[n+1];
+      VectorRegister h_next = hs[n];
+#endif
+      VectorRegister ini_cur = inis[n/2];
+
+      if (n/2 == 0) {
+        lvx(ini_cur, state);
+      } else {
+        addi(addr, state, (n/2) * 16);
+        lvx(ini_cur, addr);
+      }
+      xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
+    }
+
+    for (int n = start_idx; n < total_hs; n += 2) {
+      VectorRegister h_cur = hs[n];
+      VectorRegister ini_cur = inis[n/2];
+
+      vaddudm(h_cur, ini_cur, h_cur);
+    }
+
+    for (int n = start_idx; n < total_hs; n += 2) {
+      VectorRegister h_cur = hs[n];
+
+      if (n/2 == 0) {
+        stvx(h_cur, state);
+      } else {
+        addi(addr, state, (n/2) * 16);
+        stvx(h_cur, addr);
+      }
+    }
+  }
+
+  bind(after_state_save_aligned);
+}
+
+// Use h_cnt to cycle through hs elements but also increment it at the end
+void MacroAssembler::sha512_round(const VectorRegister* hs,
+                                  const int total_hs, int& h_cnt,
+                                  const VectorRegister kpw) {
+
+  // convenience registers: cycle from 0-7 downwards
+  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
+  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
+  // temporaries
+  const VectorRegister Ch   = VR20;
+  const VectorRegister Maj  = VR21;
+  const VectorRegister bsa  = VR22;
+  const VectorRegister bse  = VR23;
+  const VectorRegister tmp1 = VR24;
+  const VectorRegister tmp2 = VR25;
+
+  vsel      (Ch,   g,    f,   e);
+  vxor      (Maj,  a,    b);
+  vshasigmad(bse,  e,    1,   0xf);
+  vaddudm   (tmp2, Ch,   kpw);
+  vaddudm   (tmp1, h,    bse);
+  vsel      (Maj,  b,    c,   Maj);
+  vaddudm   (tmp1, tmp1, tmp2);
+  vshasigmad(bsa,  a,    1,   0);
+  vaddudm   (tmp2, bsa,  Maj);
+  vaddudm   (d,    d,    tmp1);
+  vaddudm   (h,    tmp1, tmp2);
+
+  // advance vector pointer to the next iteration
+  h_cnt++;
+}
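+
+// For reference, the scalar SHA-512 round computed by the sequence above,
+// with kpw = K[j] + w[j] (vsel forms Ch directly and, combined with the
+// vxor of a^b, Maj):
+//
+//   Ch  = (e & f) ^ (~e & g)
+//   Maj = (a & b) ^ (a & c) ^ (b & c)
+//   T1  = h + S1(e) + Ch + kpw    // S1: ror 14 ^ ror 18 ^ ror 41
+//   T2  = S0(a) + Maj             // S0: ror 28 ^ ror 34 ^ ror 39
+//   d   = d + T1
+//   h   = T1 + T2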
+
+void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
+                                    const VectorRegister w1,
+                                    const VectorRegister w2,
+                                    const VectorRegister w3,
+                                    const VectorRegister w4,
+                                    const VectorRegister w5,
+                                    const VectorRegister w6,
+                                    const VectorRegister w7,
+                                    const VectorRegister kpw0,
+                                    const VectorRegister kpw1,
+                                    const Register j,
+                                    const VectorRegister vRb,
+                                    const Register k) {
+  // Temporaries
+  const VectorRegister VR_a = VR20;
+  const VectorRegister VR_b = VR21;
+  const VectorRegister VR_c = VR22;
+  const VectorRegister VR_d = VR23;
+
+  // load k[j] and k[j+1]
+  lvx        (VR_a, j,    k);
+  // advance j
+  addi       (j,    j,    16); // 16 bytes were read
+
+#if defined(VM_LITTLE_ENDIAN)
+  // VR_b = w[j-15], w[j-14]
+  vperm      (VR_b, w1,   w0,  vRb);
+  // VR_c = w[j-7], w[j-6]
+  vperm      (VR_c, w5,   w4,  vRb);
+#else
+  // VR_b = w[j-15], w[j-14]
+  vperm      (VR_b, w0,   w1,  vRb);
+  // VR_c = w[j-7], w[j-6]
+  vperm      (VR_c, w4,   w5,  vRb);
+#endif
+
+  // VR_b = s0(w[j-15]) , s0(w[j-14])
+  vshasigmad (VR_b, VR_b,    0,   0);
+  // VR_d = s1(w[j-2]) , s1(w[j-1])
+  vshasigmad (VR_d, w7,      0,   0xf);
+  // VR_b = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
+  vaddudm    (VR_b, VR_b, VR_c);
+  // VR_d = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
+  vaddudm    (VR_d, VR_d, w0);
+  // VR_c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
+  //        s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
+  vaddudm    (VR_c, VR_d, VR_b);
+  // Updating w0 to w7 to hold the new previous 16 values from w.
+  vmr        (w0,   w1);
+  vmr        (w1,   w2);
+  vmr        (w2,   w3);
+  vmr        (w3,   w4);
+  vmr        (w4,   w5);
+  vmr        (w5,   w6);
+  vmr        (w6,   w7);
+  vmr        (w7,   VR_c);
+
+#if defined(VM_LITTLE_ENDIAN)
+  // store k + w to kpw0 (2 values at once)
+  vaddudm    (kpw0, VR_c, VR_a);
+  // kpw1 holds (k + w)[1]
+  vsldoi     (kpw1, kpw0, kpw0, 8);
+#else
+  // store k + w to kpw1 (2 values at once)
+  vaddudm    (kpw1, VR_c, VR_a);
+  // kpw0 holds (k + w)[0]
+  vsldoi     (kpw0, kpw1, kpw1, 8);
+#endif
+}
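+
+// For reference, a scalar sketch of the recurrence that sha512_calc_2w
+// computes two lanes at a time (illustrative only, not emitted code;
+// ror64 stands for a 64-bit right-rotate helper):
+//
+//   for (int j = 16; j < 80; j++) {
+//     uint64_t s0 = ror64(w[j-15],  1) ^ ror64(w[j-15],  8) ^ (w[j-15] >> 7);
+//     uint64_t s1 = ror64(w[j- 2], 19) ^ ror64(w[j- 2], 61) ^ (w[j- 2] >> 6);
+//     w[j] = w[j-16] + s0 + w[j-7] + s1;
+//   }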
+
+void MacroAssembler::sha512_load_h_vec(const Register state,
+                                       const VectorRegister* hs,
+                                       const int total_hs) {
+#if defined(VM_LITTLE_ENDIAN)
+  VectorRegister a   = hs[0];
+  VectorRegister g   = hs[6];
+  int start_idx = 0;
+#else
+  VectorRegister a   = hs[1];
+  VectorRegister g   = hs[7];
+  int start_idx = 1;
+#endif
+
+  Register addr      = R7;
+  VectorRegister vRb = VR8;
+  Register tmp       = R8;
+  Label state_aligned, after_state_aligned;
+
+  andi_(tmp, state, 0xf);
+  beq(CCR0, state_aligned);
+
+  // deal with unaligned addresses
+  VectorRegister aux = VR9;
+
+  lvx(hs[start_idx], state);
+  load_perm(vRb, state);
+
+  for (int n = start_idx + 2; n < total_hs; n += 2) {
+    VectorRegister h_cur   = hs[n];
+    VectorRegister h_prev2 = hs[n - 2];
+    addi(addr, state, (n/2) * 16);
+    lvx(h_cur, addr);
+    vec_perm(h_prev2, h_cur, vRb);
+  }
+  addi(addr, state, (total_hs/2) * 16);
+  lvx    (aux, addr);
+  vec_perm(hs[total_hs - 2 + start_idx], aux, vRb);
+  b      (after_state_aligned);
+
+  bind(state_aligned);
+
+  // deal with aligned addresses
+  lvx(hs[start_idx], state);
+
+  for (int n = start_idx + 2; n < total_hs; n += 2) {
+    VectorRegister h_cur = hs[n];
+    addi(addr, state, (n/2) * 16);
+    lvx(h_cur, addr);
+  }
+
+  bind(after_state_aligned);
+}
+
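+// SHA-512 round constants K[0..79]: the first 64 bits of the fractional
+// parts of the cube roots of the first 80 primes.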
+static const uint64_t sha512_round_table[80] __attribute((aligned(16))) = {
+  0x428a2f98d728ae22, 0x7137449123ef65cd,
+  0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
+  0x3956c25bf348b538, 0x59f111f1b605d019,
+  0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
+  0xd807aa98a3030242, 0x12835b0145706fbe,
+  0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
+  0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
+  0x9bdc06a725c71235, 0xc19bf174cf692694,
+  0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
+  0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
+  0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
+  0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
+  0x983e5152ee66dfab, 0xa831c66d2db43210,
+  0xb00327c898fb213f, 0xbf597fc7beef0ee4,
+  0xc6e00bf33da88fc2, 0xd5a79147930aa725,
+  0x06ca6351e003826f, 0x142929670a0e6e70,
+  0x27b70a8546d22ffc, 0x2e1b21385c26c926,
+  0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
+  0x650a73548baf63de, 0x766a0abb3c77b2a8,
+  0x81c2c92e47edaee6, 0x92722c851482353b,
+  0xa2bfe8a14cf10364, 0xa81a664bbc423001,
+  0xc24b8b70d0f89791, 0xc76c51a30654be30,
+  0xd192e819d6ef5218, 0xd69906245565a910,
+  0xf40e35855771202a, 0x106aa07032bbd1b8,
+  0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
+  0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
+  0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
+  0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
+  0x748f82ee5defb2fc, 0x78a5636f43172f60,
+  0x84c87814a1f0ab72, 0x8cc702081a6439ec,
+  0x90befffa23631e28, 0xa4506cebde82bde9,
+  0xbef9a3f7b2c67915, 0xc67178f2e372532b,
+  0xca273eceea26619c, 0xd186b8c721c0c207,
+  0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
+  0x06f067aa72176fba, 0x0a637dc5a2c898a6,
+  0x113f9804bef90dae, 0x1b710b35131c471b,
+  0x28db77f523047d84, 0x32caab7b40c72493,
+  0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
+  0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
+  0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
+};
+static const uint64_t *sha512_round_consts = sha512_round_table;
+
+//   R3_ARG1   - byte[]  input, already padded, in big-endian byte order
+//   R4_ARG2   - long[]  SHA-512 state (initially the H constants, derived
+//                       from the square roots of the first eight primes)
+//   R5_ARG3   - int     offset
+//   R6_ARG4   - int     limit
+//
+//   Internal Register usage:
+//   R7 R8 R9  - volatile temporaries
+//   VR0-VR7   - a-h
+//   VR8       - vRb
+//   VR9       - aux (highly volatile, use with care)
+//   VR10-VR17 - w0-w7 | ini_a-ini_h
+//   VR18      - vsp16 | kplusw0
+//   VR19      - vsp32 | kplusw1
+//   VR20-VR25 - sha512_calc_2w and sha512_round temporaries
+void MacroAssembler::sha512(bool multi_block) {
+  static const ssize_t buf_size = 128;
+  static const uint8_t w_size = sizeof(sha512_round_table)/sizeof(uint64_t);
+#ifdef AIX
+  // The AIX toolchain may not honor the 16-byte alignment of static data,
+  // but malloc does provide it; copy the table into a malloc'ed buffer.
+  if (((uintptr_t)sha512_round_consts & 0xF) != 0) {
+    uint64_t *new_round_consts = (uint64_t*)malloc(sizeof(sha512_round_table));
+    guarantee(new_round_consts, "oom");
+    memcpy(new_round_consts, sha512_round_consts, sizeof(sha512_round_table));
+    sha512_round_consts = (const uint64_t*)new_round_consts;
+  }
+#endif
+
+  Register buf_in = R3_ARG1;
+  Register state  = R4_ARG2;
+  Register ofs    = R5_ARG3;
+  Register limit  = R6_ARG4;
+
+  Label sha_loop, core_loop;
+
+  // Save non-volatile vector registers in the red zone
+  static const VectorRegister nv[] = {
+    VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
+  };
+  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);
+
+  for (int c = 0; c < nv_size; c++) {
+    Register idx = R7;
+    li  (idx, (c - (nv_size)) * 16);
+    stvx(nv[c], idx, R1);
+  }
+
+  // Load hash state to registers
+  VectorRegister a = VR0;
+  VectorRegister b = VR1;
+  VectorRegister c = VR2;
+  VectorRegister d = VR3;
+  VectorRegister e = VR4;
+  VectorRegister f = VR5;
+  VectorRegister g = VR6;
+  VectorRegister h = VR7;
+  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
+  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
+  // counter for cycling through hs vector to avoid register moves between iterations
+  int h_cnt = 0;
+
+  // Load a-h registers from the memory pointed by state
+  sha512_load_h_vec(state, hs, total_hs);
+
+  Register k = R9;
+  assert(((uintptr_t)sha512_round_consts & 0xF) == 0, "k alignment");
+  load_const_optimized(k, (address)sha512_round_consts, R0);
+
+  if (multi_block) {
+    align(OptoLoopAlignment);
+  }
+  bind(sha_loop);
+
+  for (int n = 0; n < total_hs; n += 2) {
+#if defined(VM_LITTLE_ENDIAN)
+    VectorRegister h_cur = hs[n];
+    VectorRegister h_next = hs[n + 1];
+#else
+    VectorRegister h_cur = hs[n + 1];
+    VectorRegister h_next = hs[n];
+#endif
+    vsldoi (h_next, h_cur, h_cur, 8);
+  }
+
+  // Load the first 16 elements of w outside the loop.
+  // The order of the long values is endianness-specific.
+  VectorRegister w0 = VR10;
+  VectorRegister w1 = VR11;
+  VectorRegister w2 = VR12;
+  VectorRegister w3 = VR13;
+  VectorRegister w4 = VR14;
+  VectorRegister w5 = VR15;
+  VectorRegister w6 = VR16;
+  VectorRegister w7 = VR17;
+  static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
+  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);
+
+  // Load the 16 w values into vectors; unaligned input is handled via vperm.
+  sha512_load_w_vec(buf_in, ws, total_ws);
+
+#if defined(VM_LITTLE_ENDIAN)
+  VectorRegister vsp16 = VR18;
+  VectorRegister vsp32 = VR19;
+  VectorRegister shiftarg = VR9;
+
+  vspltisw(vsp16,    8);
+  vspltisw(shiftarg, 1);
+  vsl     (vsp16,    vsp16, shiftarg);
+  vsl     (vsp32,    vsp16, shiftarg);
+
+  VectorRegister vsp8 = VR9;
+  vspltish(vsp8,     8);
+
+  // Convert input from Big Endian to Little Endian
+  for (int c = 0; c < total_ws; c++) {
+    VectorRegister w = ws[c];
+    vrlh  (w, w, vsp8);
+  }
+  for (int c = 0; c < total_ws; c++) {
+    VectorRegister w = ws[c];
+    vrlw  (w, w, vsp16);
+  }
+  for (int c = 0; c < total_ws; c++) {
+    VectorRegister w = ws[c];
+    vrld  (w, w, vsp32);
+  }
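+  // Net effect of the three passes above: rotating halfwords by 8 swaps the
+  // bytes within each halfword, rotating words by 16 swaps the halfwords
+  // within each word, and rotating doublewords by 32 swaps the words; taken
+  // together, every 64-bit lane is byte-reversed.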
+#endif
+
+  Register Rb        = R10;
+  VectorRegister vRb = VR8;
+  li      (Rb, 8);
+  load_perm(vRb, Rb);
+
+  VectorRegister kplusw0 = VR18;
+  VectorRegister kplusw1 = VR19;
+
+  Register addr      = R7;
+
+  for (int n = 0; n < total_ws; n++) {
+    VectorRegister w = ws[n];
+
+    if (n == 0) {
+      lvx  (kplusw0, k);
+    } else {
+      addi (addr, k, n * 16);
+      lvx  (kplusw0, addr);
+    }
+#if defined(VM_LITTLE_ENDIAN)
+    vaddudm(kplusw0, kplusw0, w);
+    vsldoi (kplusw1, kplusw0, kplusw0, 8);
+#else
+    vaddudm(kplusw1, kplusw0, w);
+    vsldoi (kplusw0, kplusw1, kplusw1, 8);
+#endif
+
+    sha512_round(hs, total_hs, h_cnt, kplusw0);
+    sha512_round(hs, total_hs, h_cnt, kplusw1);
+  }
+
+  Register tmp       = R8;
+  li    (tmp, (w_size-16)/total_hs);
+  mtctr (tmp);
+  // j is the byte offset into k (and w); it is kept 16-byte aligned for
+  // vector loads and is advanced whenever it is read (e.g. in sha512_calc_2w).
+  Register j = tmp;
+  li     (j, 8*16);
+
+  align(OptoLoopAlignment);
+  bind(core_loop);
+
+  // due to VectorRegister rotate, always iterate in multiples of total_hs
+  for (int n = 0; n < total_hs/2; n++) {
+    sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
+    sha512_round(hs, total_hs, h_cnt, kplusw0);
+    sha512_round(hs, total_hs, h_cnt, kplusw1);
+  }
+
+  bdnz   (core_loop);
+
+  sha512_update_sha_state(state, hs, total_hs);
+
+  if (multi_block) {
+    addi(buf_in, buf_in, buf_size);
+    addi(ofs, ofs, buf_size);
+    cmplw(CCR0, ofs, limit);
+    ble(CCR0, sha_loop);
+
+    // return ofs
+    mr(R3_RET, ofs);
+  }
+
+  // Restore non-volatile registers
+  for (int c = 0; c < nv_size; c++) {
+    Register idx = R7;
+    li  (idx, (c - (nv_size)) * 16);
+    lvx(nv[c], idx, R1);
+  }
+}
--- a/src/hotspot/cpu/ppc/nativeInst_ppc.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/nativeInst_ppc.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -221,13 +221,13 @@
     // A calculation relative to the global TOC.
     if (MacroAssembler::get_address_of_calculate_address_from_global_toc_at(addr, cb->content_begin()) !=
         (address)data) {
-      const int invalidated_range =
-        MacroAssembler::patch_calculate_address_from_global_toc_at(addr, cb->content_begin(),
+      const address inst2_addr = addr;
+      const address inst1_addr =
+        MacroAssembler::patch_calculate_address_from_global_toc_at(inst2_addr, cb->content_begin(),
                                                                    (address)data);
-      const address start = invalidated_range < 0 ? addr + invalidated_range : addr;
-      // FIXME:
-      const int range = invalidated_range < 0 ? 4 - invalidated_range : 8;
-      ICache::ppc64_flush_icache_bytes(start, range);
+      assert(inst1_addr != NULL && inst1_addr < inst2_addr, "first instruction must be found");
+      const int range = inst2_addr - inst1_addr + BytesPerInstWord;
+      ICache::ppc64_flush_icache_bytes(inst1_addr, range);
     }
     next_address = addr + 1 * BytesPerInstWord;
   } else if (MacroAssembler::is_load_const_at(addr)) {
@@ -288,15 +288,15 @@
 }
 
 void NativeMovConstReg::set_narrow_oop(narrowOop data, CodeBlob *code /* = NULL */) {
-  address   addr = addr_at(0);
+  address   inst2_addr = addr_at(0);
   CodeBlob* cb = (code) ? code : CodeCache::find_blob(instruction_address());
-  if (MacroAssembler::get_narrow_oop(addr, cb->content_begin()) == (long)data) return;
-  const int invalidated_range =
-    MacroAssembler::patch_set_narrow_oop(addr, cb->content_begin(), (long)data);
-  const address start = invalidated_range < 0 ? addr + invalidated_range : addr;
-  // FIXME:
-  const int range = invalidated_range < 0 ? 4 - invalidated_range : 8;
-  ICache::ppc64_flush_icache_bytes(start, range);
+  if (MacroAssembler::get_narrow_oop(inst2_addr, cb->content_begin()) == (long)data)
+    return;
+  const address inst1_addr =
+    MacroAssembler::patch_set_narrow_oop(inst2_addr, cb->content_begin(), (long)data);
+  assert(inst1_addr != NULL && inst1_addr < inst2_addr, "first instruction must be found");
+  const int range = inst2_addr - inst1_addr + BytesPerInstWord;
+  ICache::ppc64_flush_icache_bytes(inst1_addr, range);
 }
 
 // Do not use an assertion here. Let clients decide whether they only
--- a/src/hotspot/cpu/ppc/ppc.ad	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/ppc.ad	Sat Oct 21 01:23:52 2017 +0200
@@ -254,6 +254,73 @@
   reg_def SR_SPEFSCR(SOC, SOC, Op_RegP, 4, SR_SPEFSCR->as_VMReg()); // v
   reg_def SR_PPR(    SOC, SOC, Op_RegP, 5, SR_PPR->as_VMReg());     // v
 
+// ----------------------------
+// Vector-Scalar Registers
+// ----------------------------
+  reg_def VSR0 ( SOC, SOC, Op_VecX, 0, NULL);
+  reg_def VSR1 ( SOC, SOC, Op_VecX, 1, NULL);
+  reg_def VSR2 ( SOC, SOC, Op_VecX, 2, NULL);
+  reg_def VSR3 ( SOC, SOC, Op_VecX, 3, NULL);
+  reg_def VSR4 ( SOC, SOC, Op_VecX, 4, NULL);
+  reg_def VSR5 ( SOC, SOC, Op_VecX, 5, NULL);
+  reg_def VSR6 ( SOC, SOC, Op_VecX, 6, NULL);
+  reg_def VSR7 ( SOC, SOC, Op_VecX, 7, NULL);
+  reg_def VSR8 ( SOC, SOC, Op_VecX, 8, NULL);
+  reg_def VSR9 ( SOC, SOC, Op_VecX, 9, NULL);
+  reg_def VSR10 ( SOC, SOC, Op_VecX, 10, NULL);
+  reg_def VSR11 ( SOC, SOC, Op_VecX, 11, NULL);
+  reg_def VSR12 ( SOC, SOC, Op_VecX, 12, NULL);
+  reg_def VSR13 ( SOC, SOC, Op_VecX, 13, NULL);
+  reg_def VSR14 ( SOC, SOC, Op_VecX, 14, NULL);
+  reg_def VSR15 ( SOC, SOC, Op_VecX, 15, NULL);
+  reg_def VSR16 ( SOC, SOC, Op_VecX, 16, NULL);
+  reg_def VSR17 ( SOC, SOC, Op_VecX, 17, NULL);
+  reg_def VSR18 ( SOC, SOC, Op_VecX, 18, NULL);
+  reg_def VSR19 ( SOC, SOC, Op_VecX, 19, NULL);
+  reg_def VSR20 ( SOC, SOC, Op_VecX, 20, NULL);
+  reg_def VSR21 ( SOC, SOC, Op_VecX, 21, NULL);
+  reg_def VSR22 ( SOC, SOC, Op_VecX, 22, NULL);
+  reg_def VSR23 ( SOC, SOC, Op_VecX, 23, NULL);
+  reg_def VSR24 ( SOC, SOC, Op_VecX, 24, NULL);
+  reg_def VSR25 ( SOC, SOC, Op_VecX, 25, NULL);
+  reg_def VSR26 ( SOC, SOC, Op_VecX, 26, NULL);
+  reg_def VSR27 ( SOC, SOC, Op_VecX, 27, NULL);
+  reg_def VSR28 ( SOC, SOC, Op_VecX, 28, NULL);
+  reg_def VSR29 ( SOC, SOC, Op_VecX, 29, NULL);
+  reg_def VSR30 ( SOC, SOC, Op_VecX, 30, NULL);
+  reg_def VSR31 ( SOC, SOC, Op_VecX, 31, NULL);
+  reg_def VSR32 ( SOC, SOC, Op_VecX, 32, NULL);
+  reg_def VSR33 ( SOC, SOC, Op_VecX, 33, NULL);
+  reg_def VSR34 ( SOC, SOC, Op_VecX, 34, NULL);
+  reg_def VSR35 ( SOC, SOC, Op_VecX, 35, NULL);
+  reg_def VSR36 ( SOC, SOC, Op_VecX, 36, NULL);
+  reg_def VSR37 ( SOC, SOC, Op_VecX, 37, NULL);
+  reg_def VSR38 ( SOC, SOC, Op_VecX, 38, NULL);
+  reg_def VSR39 ( SOC, SOC, Op_VecX, 39, NULL);
+  reg_def VSR40 ( SOC, SOC, Op_VecX, 40, NULL);
+  reg_def VSR41 ( SOC, SOC, Op_VecX, 41, NULL);
+  reg_def VSR42 ( SOC, SOC, Op_VecX, 42, NULL);
+  reg_def VSR43 ( SOC, SOC, Op_VecX, 43, NULL);
+  reg_def VSR44 ( SOC, SOC, Op_VecX, 44, NULL);
+  reg_def VSR45 ( SOC, SOC, Op_VecX, 45, NULL);
+  reg_def VSR46 ( SOC, SOC, Op_VecX, 46, NULL);
+  reg_def VSR47 ( SOC, SOC, Op_VecX, 47, NULL);
+  reg_def VSR48 ( SOC, SOC, Op_VecX, 48, NULL);
+  reg_def VSR49 ( SOC, SOC, Op_VecX, 49, NULL);
+  reg_def VSR50 ( SOC, SOC, Op_VecX, 50, NULL);
+  reg_def VSR51 ( SOC, SOC, Op_VecX, 51, NULL);
+  reg_def VSR52 ( SOC, SOC, Op_VecX, 52, NULL);
+  reg_def VSR53 ( SOC, SOC, Op_VecX, 53, NULL);
+  reg_def VSR54 ( SOC, SOC, Op_VecX, 54, NULL);
+  reg_def VSR55 ( SOC, SOC, Op_VecX, 55, NULL);
+  reg_def VSR56 ( SOC, SOC, Op_VecX, 56, NULL);
+  reg_def VSR57 ( SOC, SOC, Op_VecX, 57, NULL);
+  reg_def VSR58 ( SOC, SOC, Op_VecX, 58, NULL);
+  reg_def VSR59 ( SOC, SOC, Op_VecX, 59, NULL);
+  reg_def VSR60 ( SOC, SOC, Op_VecX, 60, NULL);
+  reg_def VSR61 ( SOC, SOC, Op_VecX, 61, NULL);
+  reg_def VSR62 ( SOC, SOC, Op_VecX, 62, NULL);
+  reg_def VSR63 ( SOC, SOC, Op_VecX, 63, NULL);
 
 // ----------------------------
 // Specify priority of register selection within phases of register
@@ -385,6 +452,73 @@
 );
 
 alloc_class chunk3 (
+  VSR0,
+  VSR1,
+  VSR2,
+  VSR3,
+  VSR4,
+  VSR5,
+  VSR6,
+  VSR7,
+  VSR8,
+  VSR9,
+  VSR10,
+  VSR11,
+  VSR12,
+  VSR13,
+  VSR14,
+  VSR15,
+  VSR16,
+  VSR17,
+  VSR18,
+  VSR19,
+  VSR20,
+  VSR21,
+  VSR22,
+  VSR23,
+  VSR24,
+  VSR25,
+  VSR26,
+  VSR27,
+  VSR28,
+  VSR29,
+  VSR30,
+  VSR31,
+  VSR32,
+  VSR33,
+  VSR34,
+  VSR35,
+  VSR36,
+  VSR37,
+  VSR38,
+  VSR39,
+  VSR40,
+  VSR41,
+  VSR42,
+  VSR43,
+  VSR44,
+  VSR45,
+  VSR46,
+  VSR47,
+  VSR48,
+  VSR49,
+  VSR50,
+  VSR51,
+  VSR52,
+  VSR53,
+  VSR54,
+  VSR55,
+  VSR56,
+  VSR57,
+  VSR58,
+  VSR59,
+  VSR60,
+  VSR61,
+  VSR62,
+  VSR63
+);
+
+alloc_class chunk4 (
   // special registers
   // These registers are not allocated, but used for nodes generated by postalloc expand.
   SR_XER,
@@ -769,6 +903,45 @@
   F31, F31_H     // nv!
 );
 
+// ----------------------------
+// Vector-Scalar Register Class
+// ----------------------------
+
+reg_class vs_reg(
+  VSR32,
+  VSR33,
+  VSR34,
+  VSR35,
+  VSR36,
+  VSR37,
+  VSR38,
+  VSR39,
+  VSR40,
+  VSR41,
+  VSR42,
+  VSR43,
+  VSR44,
+  VSR45,
+  VSR46,
+  VSR47,
+  VSR48,
+  VSR49,
+  VSR50,
+  VSR51
+//  VSR52,     // nv!
+//  VSR53,     // nv!
+//  VSR54,     // nv!
+//  VSR55,     // nv!
+//  VSR56,     // nv!
+//  VSR57,     // nv!
+//  VSR58,     // nv!
+//  VSR59,     // nv!
+//  VSR60,     // nv!
+//  VSR61,     // nv!
+//  VSR62,     // nv!
+//  VSR63      // nv!
+);
+
  %}
 
 //----------DEFINITION BLOCK---------------------------------------------------
@@ -1502,7 +1675,7 @@
   if (reg < 64+64) return rc_float;
 
   // Between float regs & stack are the flags regs.
-  assert(OptoReg::is_stack(reg), "blow up if spilling flags");
+  assert(OptoReg::is_stack(reg) || reg < 64+64+64, "blow up if spilling flags");
 
   return rc_stack;
 }
@@ -2048,14 +2221,24 @@
 
 // Vector width in bytes.
 const int Matcher::vector_width_in_bytes(BasicType bt) {
-  assert(MaxVectorSize == 8, "");
-  return 8;
+  if (SuperwordUseVSX) {
+    assert(MaxVectorSize == 16, "");
+    return 16;
+  } else {
+    assert(MaxVectorSize == 8, "");
+    return 8;
+  }
 }
 
 // Vector ideal reg.
 const uint Matcher::vector_ideal_reg(int size) {
-  assert(MaxVectorSize == 8 && size == 8, "");
-  return Op_RegL;
+  if (SuperwordUseVSX) {
+    assert(MaxVectorSize == 16 && size == 16, "");
+    return Op_VecX;
+  } else {
+    assert(MaxVectorSize == 8 && size == 8, "");
+    return Op_RegL;
+  }
 }
 
 const uint Matcher::vector_shift_count_ideal_reg(int size) {
@@ -2075,7 +2258,7 @@
 
 // PPC doesn't support misaligned vectors store/load.
 const bool Matcher::misaligned_vectors_ok() {
-  return false;
+  return !AlignVector; // can be changed by flag
 }
 
 // PPC AES support not yet implemented
@@ -2217,10 +2400,31 @@
   F13_num
 };
 
+const MachRegisterNumbers vsarg_reg[64] = {
+  VSR0_num, VSR1_num, VSR2_num, VSR3_num,
+  VSR4_num, VSR5_num, VSR6_num, VSR7_num,
+  VSR8_num, VSR9_num, VSR10_num, VSR11_num,
+  VSR12_num, VSR13_num, VSR14_num, VSR15_num,
+  VSR16_num, VSR17_num, VSR18_num, VSR19_num,
+  VSR20_num, VSR21_num, VSR22_num, VSR23_num,
+  VSR24_num, VSR25_num, VSR26_num, VSR27_num,
+  VSR28_num, VSR29_num, VSR30_num, VSR31_num,
+  VSR32_num, VSR33_num, VSR34_num, VSR35_num,
+  VSR36_num, VSR37_num, VSR38_num, VSR39_num,
+  VSR40_num, VSR41_num, VSR42_num, VSR43_num,
+  VSR44_num, VSR45_num, VSR46_num, VSR47_num,
+  VSR48_num, VSR49_num, VSR50_num, VSR51_num,
+  VSR52_num, VSR53_num, VSR54_num, VSR55_num,
+  VSR56_num, VSR57_num, VSR58_num, VSR59_num,
+  VSR60_num, VSR61_num, VSR62_num, VSR63_num
+};
+
 const int num_iarg_registers = sizeof(iarg_reg) / sizeof(iarg_reg[0]);
 
 const int num_farg_registers = sizeof(farg_reg) / sizeof(farg_reg[0]);
 
+const int num_vsarg_registers = sizeof(vsarg_reg) / sizeof(vsarg_reg[0]);
+
 // Return whether or not this register is ever used as an argument. This
 // function is used on startup to build the trampoline stubs in generateOptoStub.
 // Registers not mentioned will be killed by the VM call in the trampoline, and
@@ -2552,6 +2756,115 @@
   return nodes;
 }
 
+typedef struct {
+  loadConL_hiNode *_large_hi;
+  loadConL_loNode *_large_lo;
+  mtvsrdNode      *_moved;
+  xxspltdNode     *_replicated;
+  loadConLNode    *_small;
+  MachNode        *_last;
+} loadConLReplicatedNodesTuple;
+
+loadConLReplicatedNodesTuple loadConLReplicatedNodesTuple_create(Compile *C, PhaseRegAlloc *ra_, Node *toc, immLOper *immSrc, 
+                                                 vecXOper *dst, immI_0Oper *zero,
+                                                 OptoReg::Name reg_second, OptoReg::Name reg_first,
+                                                 OptoReg::Name reg_vec_second, OptoReg::Name reg_vec_first) {
+  loadConLReplicatedNodesTuple nodes;
+
+  const bool large_constant_pool = true; // TODO: PPC port C->cfg()->_consts_size > 4000;
+  if (large_constant_pool) {
+    // Create new nodes.
+    loadConL_hiNode *m1 = new  loadConL_hiNode();
+    loadConL_loNode *m2 = new  loadConL_loNode();
+    mtvsrdNode *m3 = new  mtvsrdNode();
+    xxspltdNode *m4 = new  xxspltdNode();
+
+    // inputs for new nodes
+    m1->add_req(NULL, toc);
+    m2->add_req(NULL, m1);
+    m3->add_req(NULL, m2);
+    m4->add_req(NULL, m3);
+
+    // operands for new nodes
+    m1->_opnds[0] = new  iRegLdstOper(); // dst
+    m1->_opnds[1] = immSrc;              // src
+    m1->_opnds[2] = new  iRegPdstOper(); // toc
+
+    m2->_opnds[0] = new  iRegLdstOper(); // dst
+    m2->_opnds[1] = immSrc;              // src
+    m2->_opnds[2] = new  iRegLdstOper(); // base
+
+    m3->_opnds[0] = new  vecXOper();     // dst
+    m3->_opnds[1] = new  iRegLdstOper(); // src
+
+    m4->_opnds[0] = new  vecXOper();     // dst
+    m4->_opnds[1] = new  vecXOper();     // src
+    m4->_opnds[2] = zero;
+
+    // Initialize ins_attrib TOC fields.
+    m1->_const_toc_offset = -1;
+    m2->_const_toc_offset_hi_node = m1;
+
+    // Initialize ins_attrib instruction offset.
+    m1->_cbuf_insts_offset = -1;
+
+    // register allocation for new nodes
+    ra_->set_pair(m1->_idx, reg_second, reg_first);
+    ra_->set_pair(m2->_idx, reg_second, reg_first);
+    ra_->set1(m3->_idx, reg_second);
+    ra_->set2(m3->_idx, reg_vec_first);
+    ra_->set_pair(m4->_idx, reg_vec_second, reg_vec_first);
+
+    // Create result.
+    nodes._large_hi = m1;
+    nodes._large_lo = m2;
+    nodes._moved = m3;
+    nodes._replicated = m4;
+    nodes._small = NULL;
+    nodes._last = nodes._replicated;
+    assert(m2->bottom_type()->isa_long(), "must be long");
+  } else {
+    loadConLNode *m2 = new  loadConLNode();
+    mtvsrdNode *m3 = new  mtvsrdNode();
+    xxspltdNode *m4 = new  xxspltdNode();
+
+    // inputs for new nodes
+    m2->add_req(NULL, toc);
+
+    // operands for new nodes
+    m2->_opnds[0] = new  iRegLdstOper(); // dst
+    m2->_opnds[1] = immSrc;              // src
+    m2->_opnds[2] = new  iRegPdstOper(); // toc
+
+    m3->_opnds[0] = new  vecXOper();     // dst
+    m3->_opnds[1] = new  iRegLdstOper(); // src
+
+    m4->_opnds[0] = new  vecXOper();     // dst
+    m4->_opnds[1] = new  vecXOper();     // src
+    m4->_opnds[2] = zero;
+
+    // Initialize ins_attrib instruction offset.
+    m2->_cbuf_insts_offset = -1;
+    ra_->set1(m3->_idx, reg_second);
+    ra_->set2(m3->_idx, reg_vec_first);
+    ra_->set_pair(m4->_idx, reg_vec_second, reg_vec_first);
+
+    // register allocation for new nodes
+    ra_->set_pair(m2->_idx, reg_second, reg_first);
+
+    // Create result.
+    nodes._large_hi = NULL;
+    nodes._large_lo = NULL;
+    nodes._small = m2;
+    nodes._moved = m3;
+    nodes._replicated = m4;
+    nodes._last = nodes._replicated;
+    assert(m2->bottom_type()->isa_long(), "must be long");
+  }
+
+  return nodes;
+}
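+
+// The tuple above expands a replicated 64-bit constant roughly as follows:
+// a TOC-relative load of the bit pattern into a GPR (split into hi/lo nodes
+// when the constant pool is large), then mtvsrd to move the doubleword into
+// a vector-scalar register, then xxspltd to splat it into both lanes.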
+
 %} // source
 
 encode %{
@@ -3212,6 +3525,27 @@
     assert(loadConLNodes._last->bottom_type()->isa_long(), "must be long");
   %}
 
+  enc_class postalloc_expand_load_replF_constant_vsx(vecX dst, immF src, iRegLdst toc) %{
+    // Create new nodes.
+
+    // Make an operand with the bit pattern to load as float.
+    immLOper *op_repl = new  immLOper((jlong)replicate_immF(op_src->constantF()));
+    immI_0Oper *op_zero = new  immI_0Oper(0);
+
+    loadConLReplicatedNodesTuple loadConLNodes =
+      loadConLReplicatedNodesTuple_create(C, ra_, n_toc, op_repl, op_dst, op_zero,
+                                OptoReg::Name(R20_H_num), OptoReg::Name(R20_num),
+                                OptoReg::Name(VSR11_num), OptoReg::Name(VSR10_num));
+
+    // Push new nodes.
+    if (loadConLNodes._large_hi) { nodes->push(loadConLNodes._large_hi); }
+    if (loadConLNodes._large_lo) { nodes->push(loadConLNodes._large_lo); }
+    if (loadConLNodes._moved)    { nodes->push(loadConLNodes._moved); }
+    if (loadConLNodes._last)     { nodes->push(loadConLNodes._last); }
+
+    assert(nodes->length() >= 1, "must have created at least 1 node");
+  %}
+
   // This enc_class is needed so that scheduler gets proper
   // input mapping for latency computation.
   enc_class enc_poll(immI dst, iRegLdst poll) %{
@@ -3840,6 +4174,14 @@
 //
 // Formats are generated automatically for constants and base registers.
 
+operand vecX() %{
+  constraint(ALLOC_IN_RC(vs_reg));
+  match(VecX);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 //----------Simple Operands----------------------------------------------------
 // Immediate Operands
 
@@ -5372,6 +5714,20 @@
   ins_pipe(pipe_class_memory);
 %}
 
+// Load Aligned Packed 16-Byte Vector
+instruct loadV16(vecX dst, indirect mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 16);
+  match(Set dst (LoadVector mem));
+  ins_cost(MEMORY_REF_COST);
+
+  format %{ "LXVD2X      $dst, $mem \t// load 16-byte Vector" %}
+  size(4);
+  ins_encode %{
+    __ lxvd2x($dst$$VectorSRegister, $mem$$Register);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 // Load Range, range = array length (=jint)
 instruct loadRange(iRegIdst dst, memory mem) %{
   match(Set dst (LoadRange mem));
@@ -6368,6 +6724,20 @@
   ins_pipe(pipe_class_memory);
 %}
 
+// Store Aligned Packed 16-Byte Vector to memory
+instruct storeV16(indirect mem, vecX src) %{
+  predicate(n->as_StoreVector()->memory_size() == 16);
+  match(Set mem (StoreVector mem src));
+  ins_cost(MEMORY_REF_COST);
+
+  format %{ "STXVD2X     $mem, $src \t// store 16-byte Vector" %}
+  size(4);
+  ins_encode %{
+    __ stxvd2x($src$$VectorSRegister, $mem$$Register);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 // Store Compressed Oop
 instruct storeN(memory dst, iRegN_P2N src) %{
   match(Set dst (StoreN dst src));
@@ -13239,6 +13609,26 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct mtvsrwz(vecX temp1, iRegIsrc src) %{
+  effect(DEF temp1, USE src);
+  
+  size(4);
+  ins_encode %{
+    __ mtvsrwz($temp1$$VectorSRegister, $src$$Register);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct xxspltw(vecX dst, vecX src, immI8 imm1) %{
+  effect(DEF dst, USE src, USE imm1);
+
+  size(4);
+  ins_encode %{
+    __ xxspltw($dst$$VectorSRegister, $src$$VectorSRegister, $imm1$$constant); 
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 //---------- Replicate Vector Instructions ------------------------------------
 
 // Insrdi does replicate if src == dst.
@@ -13318,6 +13708,46 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct repl16B_reg_Ex(vecX dst, iRegIsrc src) %{
+  match(Set dst (ReplicateB src));
+  predicate(n->as_Vector()->length() == 16);
+
+  expand %{
+    iRegLdst tmpL;
+    vecX tmpV;
+    immI8  imm1 %{ (int)  1 %}
+    moveReg(tmpL, src);
+    repl56(tmpL);
+    repl48(tmpL);
+    mtvsrwz(tmpV, tmpL);
+    xxspltw(dst, tmpV, imm1);
+  %}
+%}
+
+instruct repl16B_immI0(vecX dst, immI_0 zero) %{
+  match(Set dst (ReplicateB zero));
+  predicate(n->as_Vector()->length() == 16);
+
+  format %{ "XXLXOR      $dst, $zero \t// replicate16B" %}
+  size(4);
+  ins_encode %{
+    __ xxlxor($dst$$VectorSRegister, $dst$$VectorSRegister, $dst$$VectorSRegister);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct repl16B_immIminus1(vecX dst, immI_minus1 src) %{
+  match(Set dst (ReplicateB src));
+  predicate(n->as_Vector()->length() == 16);
+
+  format %{ "XXLEQV      $dst, $src \t// replicate16B" %}
+  size(4);
+  ins_encode %{
+    __ xxleqv($dst$$VectorSRegister, $dst$$VectorSRegister, $dst$$VectorSRegister);
+  %}
+  ins_pipe(pipe_class_default);
+%}
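+
+// Note: XXLXOR vr,vr,vr always yields all-zero bits and XXLEQV vr,vr,vr
+// always yields all-one bits, so the replicate-0 and replicate(-1) patterns
+// here and in the repl*_immI0 / repl*_immIminus1 forms below encode $dst
+// three times and never read an actual source operand.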
+
 instruct repl4S_reg_Ex(iRegLdst dst, iRegIsrc src) %{
   match(Set dst (ReplicateS src));
   predicate(n->as_Vector()->length() == 4);
@@ -13352,6 +13782,46 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct repl8S_reg_Ex(vecX dst, iRegIsrc src) %{
+  match(Set dst (ReplicateS src));
+  predicate(n->as_Vector()->length() == 8);
+
+  expand %{
+    iRegLdst tmpL;
+    vecX tmpV;
+    immI8  zero %{ (int)  0 %} 
+    moveReg(tmpL, src);
+    repl48(tmpL);
+    repl32(tmpL);
+    mtvsrd(tmpV, tmpL);
+    xxpermdi(dst, tmpV, tmpV, zero);
+  %}
+%}
+
+instruct repl8S_immI0(vecX dst, immI_0 zero) %{
+  match(Set dst (ReplicateS zero));
+  predicate(n->as_Vector()->length() == 8);
+
+  format %{ "XXLXOR      $dst, $zero \t// replicate8S" %}
+  size(4);
+  ins_encode %{
+    __ xxlxor($dst$$VectorSRegister, $dst$$VectorSRegister, $dst$$VectorSRegister);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct repl8S_immIminus1(vecX dst, immI_minus1 src) %{
+  match(Set dst (ReplicateS src));
+  predicate(n->as_Vector()->length() == 8);
+
+  format %{ "XXLEQV      $dst, $src \t// replicate16B" %}
+  size(4);
+  ins_encode %{
+    __ xxleqv($dst$$VectorSRegister, $dst$$VectorSRegister, $dst$$VectorSRegister);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct repl2I_reg_Ex(iRegLdst dst, iRegIsrc src) %{
   match(Set dst (ReplicateI src));
   predicate(n->as_Vector()->length() == 2);
@@ -13386,6 +13856,46 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct repl4I_reg_Ex(vecX dst, iRegIsrc src) %{
+  match(Set dst (ReplicateI src));
+  predicate(n->as_Vector()->length() == 4);
+  ins_cost(2 * DEFAULT_COST);
+
+  expand %{ 
+    iRegLdst tmpL;
+    vecX tmpV;
+    immI8  zero %{ (int)  0 %} 
+    moveReg(tmpL, src);
+    repl32(tmpL);
+    mtvsrd(tmpV, tmpL);
+    xxpermdi(dst, tmpV, tmpV, zero);
+  %}
+%}
+
+instruct repl4I_immI0(vecX dst, immI_0 zero) %{
+  match(Set dst (ReplicateI zero));
+  predicate(n->as_Vector()->length() == 4);
+
+  format %{ "XXLXOR      $dst, $zero \t// replicate4I" %}
+  size(4);
+  ins_encode %{
+    __ xxlxor($dst$$VectorSRegister, $dst$$VectorSRegister, $dst$$VectorSRegister);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct repl4I_immIminus1(vecX dst, immI_minus1 src) %{
+  match(Set dst (ReplicateI src));
+  predicate(n->as_Vector()->length() == 4);
+
+  format %{ "XXLEQV      $dst, $dst, $dst \t// replicate4I" %}
+  size(4);
+  ins_encode %{
+    __ xxleqv($dst$$VectorSRegister, $dst$$VectorSRegister, $dst$$VectorSRegister);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 // Move float to int register via stack, replicate.
 instruct repl2F_reg_Ex(iRegLdst dst, regF src) %{
   match(Set dst (ReplicateF src));
@@ -13484,6 +13994,154 @@
 %}
 
 
+instruct repl4F_reg_Ex(vecX dst, regF src) %{
+  match(Set dst (ReplicateF src));
+  predicate(n->as_Vector()->length() == 4);
+  ins_cost(2 * MEMORY_REF_COST + DEFAULT_COST);
+  expand %{
+    stackSlotL tmpS;
+    iRegIdst tmpI;
+    iRegLdst tmpL;
+    vecX tmpV;
+    immI8  zero %{ (int)  0 %} 
+
+    moveF2I_reg_stack(tmpS, src);   // Move float to stack.
+    moveF2I_stack_reg(tmpI, tmpS);  // Move stack to int reg.
+    moveReg(tmpL, tmpI);             // Move int to long reg.
+    repl32(tmpL);                    // Replicate bitpattern.
+    mtvsrd(tmpV, tmpL);
+    xxpermdi(dst, tmpV, tmpV, zero);
+  %}
+%}
+
+instruct repl4F_immF_Ex(vecX dst, immF src) %{
+  match(Set dst (ReplicateF src));
+  predicate(n->as_Vector()->length() == 4);
+  ins_cost(10 * DEFAULT_COST);
+
+  postalloc_expand( postalloc_expand_load_replF_constant_vsx(dst, src, constanttablebase) );
+%}
+
+instruct repl4F_immF0(vecX dst, immF_0 zero) %{
+  match(Set dst (ReplicateF zero));
+  predicate(n->as_Vector()->length() == 4);
+
+  format %{ "XXLXOR      $dst, $zero \t// replicate4F" %}
+  ins_encode %{
+    __ xxlxor($dst$$VectorSRegister, $dst$$VectorSRegister, $dst$$VectorSRegister);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct repl2D_reg_Ex(vecX dst, regD src) %{
+  match(Set dst (ReplicateD src));
+  predicate(n->as_Vector()->length() == 2);
+  expand %{
+    stackSlotL tmpS;
+    iRegLdst tmpL;
+    iRegLdst tmp;
+    vecX tmpV;
+    immI8  zero %{ (int)  0 %} 
+    moveD2L_reg_stack(tmpS, src);
+    moveD2L_stack_reg(tmpL, tmpS);
+    mtvsrd(tmpV, tmpL);
+    xxpermdi(dst, tmpV, tmpV, zero);
+  %}
+%}
+
+instruct repl2D_immI0(vecX dst, immI_0 zero) %{
+  match(Set dst (ReplicateD zero));
+  predicate(n->as_Vector()->length() == 2);
+
+  format %{ "XXLXOR      $dst, $zero \t// replicate2D" %}
+  size(4);
+  ins_encode %{
+    __ xxlxor($dst$$VectorSRegister, $dst$$VectorSRegister, $dst$$VectorSRegister);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct repl2D_immIminus1(vecX dst, immI_minus1 src) %{
+  match(Set dst (ReplicateD src));
+  predicate(n->as_Vector()->length() == 2);
+
+  format %{ "XXLEQV      $dst, $src \t// replicate16B" %}
+  size(4);
+  ins_encode %{
+    __ xxleqv($dst$$VectorSRegister, $dst$$VectorSRegister, $dst$$VectorSRegister);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct mtvsrd(vecX dst, iRegLsrc src) %{
+  predicate(false);
+  effect(DEF dst, USE src);
+
+  format %{ "MTVSRD      $dst, $src \t// Move to 16-byte register"%} 
+  size(4);
+  ins_encode %{
+    __ mtvsrd($dst$$VectorSRegister, $src$$Register);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct xxspltd(vecX dst, vecX src, immI8 zero) %{
+  effect(DEF dst, USE src, USE zero);
+
+  format %{ "XXSPLATD      $dst, $src, $zero \t// Permute 16-byte register"%}
+  size(4);
+  ins_encode %{
+    __ xxpermdi($dst$$VectorSRegister, $src$$VectorSRegister, $src$$VectorSRegister, $zero$$constant);
+  %} 
+  ins_pipe(pipe_class_default);
+%}
+
+instruct xxpermdi(vecX dst, vecX src1, vecX src2, immI8 zero) %{
+  effect(DEF dst, USE src1, USE src2, USE zero);
+
+  format %{ "XXPERMDI      $dst, $src1, $src2, $zero \t// Permute 16-byte register"%}
+  size(4);
+  ins_encode %{
+    __ xxpermdi($dst$$VectorSRegister, $src1$$VectorSRegister, $src2$$VectorSRegister, $zero$$constant);
+  %} 
+  ins_pipe(pipe_class_default);
+%}
+
+instruct repl2L_reg_Ex(vecX dst, iRegLsrc src) %{
+  match(Set dst (ReplicateL src));
+  predicate(n->as_Vector()->length() == 2);
+  expand %{
+    vecX tmpV;
+    immI8  zero %{ (int)  0 %} 
+    mtvsrd(tmpV, src); 
+    xxpermdi(dst, tmpV, tmpV, zero);
+  %}
+%}
+
+instruct repl2L_immI0(vecX dst, immI_0 zero) %{
+  match(Set dst (ReplicateL zero));
+  predicate(n->as_Vector()->length() == 2);
+
+  format %{ "XXLXOR      $dst, $zero \t// replicate2L" %}
+  size(4);
+  ins_encode %{
+    __ xxlxor($dst$$VectorSRegister, $dst$$VectorSRegister, $dst$$VectorSRegister);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct repl2L_immIminus1(vecX dst, immI_minus1 src) %{
+  match(Set dst (ReplicateL src));
+  predicate(n->as_Vector()->length() == 2);
+
+  format %{ "XXLEQV      $dst, $src \t// replicate16B" %}
+  size(4);
+  ins_encode %{
+    __ xxleqv($dst$$VectorSRegister, $dst$$VectorSRegister, $dst$$VectorSRegister);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 // ============================================================================
 // Safepoint Instruction
 
--- a/src/hotspot/cpu/ppc/register_definitions_ppc.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/register_definitions_ppc.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -31,3 +31,5 @@
 REGISTER_DEFINITION(Register, noreg);
 
 REGISTER_DEFINITION(FloatRegister, fnoreg);
+
+REGISTER_DEFINITION(VectorSRegister, vsnoreg);
--- a/src/hotspot/cpu/ppc/register_ppc.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/register_ppc.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -677,7 +677,7 @@
       * 2                                          // register halves
       + ConditionRegisterImpl::number_of_registers // condition code registers
       + SpecialRegisterImpl::number_of_registers   // special registers
-      + VectorRegisterImpl::number_of_registers    // VSX registers
+      + VectorSRegisterImpl::number_of_registers   // VSX registers
   };
 
   static const int max_gpr;
--- a/src/hotspot/cpu/ppc/sharedRuntime_ppc.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/sharedRuntime_ppc.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -479,8 +479,8 @@
 
 // Is vector's size (in bytes) bigger than a size saved by default?
 bool SharedRuntime::is_wide_vector(int size) {
-  // Note, MaxVectorSize == 8 on PPC64.
-  assert(size <= 8, "%d bytes vectors are not supported", size);
+  // Note, MaxVectorSize == 8/16 on PPC64.
+  assert(size <= (SuperwordUseVSX ? 16 : 8), "%d bytes vectors are not supported", size);
   return size > 8;
 }
 
@@ -2234,9 +2234,6 @@
   __ release();
   // TODO: PPC port assert(4 == JavaThread::sz_thread_state(), "unexpected field size");
   __ stw(R0, thread_(thread_state));
-  if (UseMembar) {
-    __ fence();
-  }
 
 
   // The JNI call
@@ -2393,9 +2390,6 @@
   __ release();
   // TODO: PPC port assert(4 == JavaThread::sz_thread_state(), "unexpected field size");
   __ stw(R0, thread_(thread_state));
-  if (UseMembar) {
-    __ fence();
-  }
   __ bind(after_transition);
 
   // Reguard any pages if necessary.
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -2667,7 +2667,7 @@
     return start;
   }
 
-  // Arguments for generated stub (little endian only):
+  // Arguments for generated stub:
   //   R3_ARG1   - source byte array address
   //   R4_ARG2   - destination byte array address
   //   R5_ARG3   - round key array
@@ -2686,7 +2686,6 @@
     Register keylen         = R8;
     Register temp           = R9;
     Register keypos         = R10;
-    Register hex            = R11;
     Register fifteen        = R12;
 
     VectorRegister vRet     = VR0;
@@ -2706,164 +2705,170 @@
     VectorRegister vTmp3    = VR11;
     VectorRegister vTmp4    = VR12;
 
-    VectorRegister vLow     = VR13;
-    VectorRegister vHigh    = VR14;
-
-    __ li              (hex, 16);
     __ li              (fifteen, 15);
-    __ vspltisb        (fSplt, 0x0f);
 
     // load unaligned from[0-15] to vsRet
     __ lvx             (vRet, from);
     __ lvx             (vTmp1, fifteen, from);
     __ lvsl            (fromPerm, from);
+#ifdef VM_LITTLE_ENDIAN
+    __ vspltisb        (fSplt, 0x0f);
     __ vxor            (fromPerm, fromPerm, fSplt);
+#endif
     __ vperm           (vRet, vRet, vTmp1, fromPerm);
 
     // load keylen (44 or 52 or 60)
     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
 
     // to load keys
-    __ lvsr            (keyPerm, key);
-    __ vxor            (vTmp2, vTmp2, vTmp2);
+    __ load_perm       (keyPerm, key);
+#ifdef VM_LITTLE_ENDIAN
     __ vspltisb        (vTmp2, -16);
     __ vrld            (keyPerm, keyPerm, vTmp2);
     __ vrld            (keyPerm, keyPerm, vTmp2);
     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
-
-    // load the 1st round key to vKey1
-    __ li              (keypos, 0);
+#endif
+
+    // load the 1st round key to vTmp1
+    __ lvx             (vTmp1, key);
+    __ li              (keypos, 16);
     __ lvx             (vKey1, keypos, key);
-    __ addi            (keypos, keypos, 16);
+    __ vec_perm        (vTmp1, vKey1, keyPerm);
+
+    // 1st round
+    __ vxor            (vRet, vRet, vTmp1);
+
+    // load the 2nd round key to vKey1
+    __ li              (keypos, 32);
+    __ lvx             (vKey2, keypos, key);
+    __ vec_perm        (vKey1, vKey2, keyPerm);
+
+    // load the 3rd round key to vKey2
+    __ li              (keypos, 48);
+    __ lvx             (vKey3, keypos, key);
+    __ vec_perm        (vKey2, vKey3, keyPerm);
+
+    // load the 4th round key to vKey3
+    __ li              (keypos, 64);
+    __ lvx             (vKey4, keypos, key);
+    __ vec_perm        (vKey3, vKey4, keyPerm);
+
+    // load the 5th round key to vKey4
+    __ li              (keypos, 80);
     __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey1, vTmp1, vKey1, keyPerm);
-
-    // 1st round
-    __ vxor (vRet, vRet, vKey1);
-
-    // load the 2nd round key to vKey1
-    __ addi            (keypos, keypos, 16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
-
-    // load the 3rd round key to vKey2
-    __ addi            (keypos, keypos, 16);
+    __ vec_perm        (vKey4, vTmp1, keyPerm);
+
+    // 2nd - 5th rounds
+    __ vcipher         (vRet, vRet, vKey1);
+    __ vcipher         (vRet, vRet, vKey2);
+    __ vcipher         (vRet, vRet, vKey3);
+    __ vcipher         (vRet, vRet, vKey4);
+
+    // load the 6th round key to vKey1
+    __ li              (keypos, 96);
+    __ lvx             (vKey2, keypos, key);
+    __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
+
+    // load the 7th round key to vKey2
+    __ li              (keypos, 112);
+    __ lvx             (vKey3, keypos, key);
+    __ vec_perm        (vKey2, vKey3, keyPerm);
+
+    // load the 8th round key to vKey3
+    __ li              (keypos, 128);
+    __ lvx             (vKey4, keypos, key);
+    __ vec_perm        (vKey3, vKey4, keyPerm);
+
+    // load the 9th round key to vKey4
+    __ li              (keypos, 144);
     __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
-
-    // load the 4th round key to vKey3
-    __ addi            (keypos, keypos, 16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
-
-    // load the 5th round key to vKey4
-    __ addi            (keypos, keypos, 16);
+    __ vec_perm        (vKey4, vTmp1, keyPerm);
+
+    // 6th - 9th rounds
+    __ vcipher         (vRet, vRet, vKey1);
+    __ vcipher         (vRet, vRet, vKey2);
+    __ vcipher         (vRet, vRet, vKey3);
+    __ vcipher         (vRet, vRet, vKey4);
+
+    // load the 10th round key to vKey1
+    __ li              (keypos, 160);
+    __ lvx             (vKey2, keypos, key);
+    __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
+
+    // load the 11th round key to vKey2
+    __ li              (keypos, 176);
     __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
-
-    // 2nd - 5th rounds
-    __ vcipher (vRet, vRet, vKey1);
-    __ vcipher (vRet, vRet, vKey2);
-    __ vcipher (vRet, vRet, vKey3);
-    __ vcipher (vRet, vRet, vKey4);
-
-    // load the 6th round key to vKey1
-    __ addi            (keypos, keypos, 16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
-
-    // load the 7th round key to vKey2
-    __ addi            (keypos, keypos, 16);
-    __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
-
-    // load the 8th round key to vKey3
-    __ addi            (keypos, keypos, 16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
-
-    // load the 9th round key to vKey4
-    __ addi            (keypos, keypos, 16);
-    __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
-
-    // 6th - 9th rounds
-    __ vcipher (vRet, vRet, vKey1);
-    __ vcipher (vRet, vRet, vKey2);
-    __ vcipher (vRet, vRet, vKey3);
-    __ vcipher (vRet, vRet, vKey4);
-
-    // load the 10th round key to vKey1
-    __ addi            (keypos, keypos, 16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
-
-    // load the 11th round key to vKey2
-    __ addi            (keypos, keypos, 16);
-    __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+    __ vec_perm        (vKey2, vTmp1, keyPerm);
 
     // if all round keys are loaded, skip next 4 rounds
     __ cmpwi           (CCR0, keylen, 44);
     __ beq             (CCR0, L_doLast);
 
     // 10th - 11th rounds
-    __ vcipher (vRet, vRet, vKey1);
-    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher         (vRet, vRet, vKey1);
+    __ vcipher         (vRet, vRet, vKey2);
 
     // load the 12th round key to vKey1
-    __ addi            (keypos, keypos, 16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+    __ li              (keypos, 192);
+    __ lvx             (vKey2, keypos, key);
+    __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
 
     // load the 13th round key to vKey2
-    __ addi            (keypos, keypos, 16);
+    __ li              (keypos, 208);
     __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+    __ vec_perm        (vKey2, vTmp1, keyPerm);
 
     // if all round keys are loaded, skip next 2 rounds
     __ cmpwi           (CCR0, keylen, 52);
     __ beq             (CCR0, L_doLast);
 
     // 12th - 13th rounds
-    __ vcipher (vRet, vRet, vKey1);
-    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher         (vRet, vRet, vKey1);
+    __ vcipher         (vRet, vRet, vKey2);
 
     // load the 14th round key to vKey1
-    __ addi            (keypos, keypos, 16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+    __ li              (keypos, 224);
+    __ lvx             (vKey2, keypos, key);
+    __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
 
     // load the 15th round key to vKey2
-    __ addi            (keypos, keypos, 16);
+    __ li              (keypos, 240);
     __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+    __ vec_perm        (vKey2, vTmp1, keyPerm);
 
     __ bind(L_doLast);
 
     // last two rounds
-    __ vcipher (vRet, vRet, vKey1);
-    __ vcipherlast (vRet, vRet, vKey2);
-
-    __ neg             (temp, to);
-    __ lvsr            (toPerm, temp);
-    __ vspltisb        (vTmp2, -1);
-    __ vxor            (vTmp1, vTmp1, vTmp1);
-    __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
-    __ vxor            (toPerm, toPerm, fSplt);
+    __ vcipher         (vRet, vRet, vKey1);
+    __ vcipherlast     (vRet, vRet, vKey2);
+
+    // store result (unaligned)
+#ifdef VM_LITTLE_ENDIAN
+    __ lvsl            (toPerm, to);
+#else
+    __ lvsr            (toPerm, to);
+#endif
+    __ vspltisb        (vTmp3, -1);
+    __ vspltisb        (vTmp4, 0);
     __ lvx             (vTmp1, to);
-    __ vperm           (vRet, vRet, vRet, toPerm);
-    __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
-    __ lvx             (vTmp4, fifteen, to);
+    __ lvx             (vTmp2, fifteen, to);
+#ifdef VM_LITTLE_ENDIAN
+    __ vperm           (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
+    __ vxor            (toPerm, toPerm, fSplt);       // swap bytes
+#else
+    __ vperm           (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
+#endif
+    __ vperm           (vTmp4, vRet, vRet, toPerm);   // rotate data
+    __ vsel            (vTmp2, vTmp4, vTmp2, vTmp3);
+    __ vsel            (vTmp1, vTmp1, vTmp4, vTmp3);
+    __ stvx            (vTmp2, fifteen, to);          // store this one first (may alias)
     __ stvx            (vTmp1, to);
-    __ vsel            (vRet, vRet, vTmp4, vTmp2);
-    __ stvx            (vRet, fifteen, to);
 
     __ blr();
      return start;
   }
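
The rewritten store epilogue is the usual Power idiom for an unaligned 16-byte store: load the two aligned quadwords covering [to, to+15], rotate the result into position with vperm, merge it in under a byte-select mask with vsel, and store the block containing to+15 first so that, when 'to' happens to be 16-byte aligned and both lvx hit the same block, the store carrying the full result wins. A scalar model of what the sequence computes (our sketch, ignoring the little-endian byte swap folded into the permute vector; none of these names come from HotSpot):

    #include <stdint.h>
    #include <string.h>

    // Scalar model of the lvx/vperm/vsel/stvx epilogue (illustrative only).
    static void store_unaligned_16(uint8_t* to, const uint8_t ret[16]) {
      uint8_t* lo = (uint8_t*)((uintptr_t)to & ~(uintptr_t)15);         // lvx vTmp1
      uint8_t* hi = (uint8_t*)(((uintptr_t)to + 15) & ~(uintptr_t)15);  // lvx vTmp2
      uint8_t lo_img[16], hi_img[16];
      memcpy(lo_img, lo, 16);
      memcpy(hi_img, hi, 16);
      size_t off = (uintptr_t)to & 15;
      for (size_t i = 0; i < 16; i++) {            // vperm rotate + two vsel merges
        if (off + i < 16) lo_img[off + i]      = ret[i];
        else              hi_img[off + i - 16] = ret[i];
      }
      memcpy(hi, hi_img, 16);  // "store this one first": when 'to' is aligned,
      memcpy(lo, lo_img, 16);  //   lo == hi, and this full-result store wins
    }
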
 
-  // Arguments for generated stub (little endian only):
+  // Arguments for generated stub:
   //   R3_ARG1   - source byte array address
   //   R4_ARG2   - destination byte array address
   //   R5_ARG3   - K (key) in little endian int array
@@ -2885,7 +2890,6 @@
     Register keylen         = R8;
     Register temp           = R9;
     Register keypos         = R10;
-    Register hex            = R11;
     Register fifteen        = R12;
 
     VectorRegister vRet     = VR0;
@@ -2906,30 +2910,30 @@
     VectorRegister vTmp3    = VR12;
     VectorRegister vTmp4    = VR13;
 
-    VectorRegister vLow     = VR14;
-    VectorRegister vHigh    = VR15;
-
-    __ li              (hex, 16);
     __ li              (fifteen, 15);
-    __ vspltisb        (fSplt, 0x0f);
 
     // load unaligned from[0-15] to vRet
     __ lvx             (vRet, from);
     __ lvx             (vTmp1, fifteen, from);
     __ lvsl            (fromPerm, from);
+#ifdef VM_LITTLE_ENDIAN
+    __ vspltisb        (fSplt, 0x0f);
     __ vxor            (fromPerm, fromPerm, fSplt);
+#endif
     __ vperm           (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
 
     // load keylen (44 or 52 or 60)
     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
 
     // to load keys
-    __ lvsr            (keyPerm, key);
+    __ load_perm       (keyPerm, key);
+#ifdef VM_LITTLE_ENDIAN
     __ vxor            (vTmp2, vTmp2, vTmp2);
     __ vspltisb        (vTmp2, -16);
     __ vrld            (keyPerm, keyPerm, vTmp2);
     __ vrld            (keyPerm, keyPerm, vTmp2);
     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
+#endif
 
     __ cmpwi           (CCR0, keylen, 44);
     __ beq             (CCR0, L_do44);
@@ -2937,32 +2941,32 @@
     __ cmpwi           (CCR0, keylen, 52);
     __ beq             (CCR0, L_do52);
 
-    // load the 15th round key to vKey11
+    // load the 15th round key to vKey1
     __ li              (keypos, 240);
+    __ lvx             (vKey1, keypos, key);
+    __ li              (keypos, 224);
+    __ lvx             (vKey2, keypos, key);
+    __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
+
+    // load the 14th round key to vKey2
+    __ li              (keypos, 208);
+    __ lvx             (vKey3, keypos, key);
+    __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
+
+    // load the 13th round key to vKey3
+    __ li              (keypos, 192);
+    __ lvx             (vKey4, keypos, key);
+    __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
+
+    // load the 12th round key to vKey4
+    __ li              (keypos, 176);
+    __ lvx             (vKey5, keypos, key);
+    __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
+
+    // load the 11th round key to vKey5
+    __ li              (keypos, 160);
     __ lvx             (vTmp1, keypos, key);
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
-
-    // load the 14th round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
-
-    // load the 13th round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
-
-    // load the 12th round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
-
-    // load the 11th round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
+    __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
 
     // 1st - 5th rounds
     __ vxor            (vRet, vRet, vKey1);
@@ -2975,22 +2979,22 @@
 
     __ bind            (L_do52);
 
-    // load the 13th round key to vKey11
+    // load the 13th round key to vKey1
     __ li              (keypos, 208);
+    __ lvx             (vKey1, keypos, key);
+    __ li              (keypos, 192);
+    __ lvx             (vKey2, keypos, key);
+    __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
+
+    // load the 12th round key to vKey2
+    __ li              (keypos, 176);
+    __ lvx             (vKey3, keypos, key);
+    __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
+
+    // load the 11th round key to vKey3
+    __ li              (keypos, 160);
     __ lvx             (vTmp1, keypos, key);
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
-
-    // load the 12th round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
-
-    // load the 11th round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
+    __ vec_perm        (vKey3, vTmp1, vKey3, keyPerm);
 
     // 1st - 3rd rounds
     __ vxor            (vRet, vRet, vKey1);
@@ -3001,42 +3005,42 @@
 
     __ bind            (L_do44);
 
-    // load the 11th round key to vKey11
+    // load the 11th round key to vKey1
     __ li              (keypos, 176);
+    __ lvx             (vKey1, keypos, key);
+    __ li              (keypos, 160);
     __ lvx             (vTmp1, keypos, key);
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
+    __ vec_perm        (vKey1, vTmp1, vKey1, keyPerm);
 
     // 1st round
     __ vxor            (vRet, vRet, vKey1);
 
     __ bind            (L_doLast);
 
-    // load the 10th round key to vKey10
-    __ addi            (keypos, keypos, -16);
+    // load the 10th round key to vKey1
+    __ li              (keypos, 144);
+    __ lvx             (vKey2, keypos, key);
+    __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
+
+    // load the 9th round key to vKey2
+    __ li              (keypos, 128);
+    __ lvx             (vKey3, keypos, key);
+    __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
+
+    // load the 8th round key to vKey3
+    __ li              (keypos, 112);
+    __ lvx             (vKey4, keypos, key);
+    __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
+
+    // load the 7th round key to vKey4
+    __ li              (keypos, 96);
+    __ lvx             (vKey5, keypos, key);
+    __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
+
+    // load the 6th round key to vKey5
+    __ li              (keypos, 80);
     __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
-
-    // load the 9th round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
-
-    // load the 8th round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
-
-    // load the 7th round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
-
-    // load the 6th round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey5, vTmp2, vTmp1, keyPerm);
+    __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
 
     // last 10th - 6th rounds
     __ vncipher        (vRet, vRet, vKey1);
@@ -3045,30 +3049,29 @@
     __ vncipher        (vRet, vRet, vKey4);
     __ vncipher        (vRet, vRet, vKey5);
 
-    // load the 5th round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
-
-    // load the 4th round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
-
-    // load the 3rd round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
-
-    // load the 2nd round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp1, keypos, key);
-    __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
-
-    // load the 1st round key to vKey10
-    __ addi            (keypos, keypos, -16);
-    __ lvx             (vTmp2, keypos, key);
-    __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
+    // load the 5th round key to vKey1
+    __ li              (keypos, 64);
+    __ lvx             (vKey2, keypos, key);
+    __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
+
+    // load the 4th round key to vKey2
+    __ li              (keypos, 48);
+    __ lvx             (vKey3, keypos, key);
+    __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
+
+    // load the 3rd round key to vKey3
+    __ li              (keypos, 32);
+    __ lvx             (vKey4, keypos, key);
+    __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
+
+    // load the 2nd round key to vKey4
+    __ li              (keypos, 16);
+    __ lvx             (vKey5, keypos, key);
+    __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
+
+    // load the 1st round key to vKey5
+    __ lvx             (vTmp1, key);
+    __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
 
     // last 5th - 1st rounds
     __ vncipher        (vRet, vRet, vKey1);
@@ -3077,24 +3080,54 @@
     __ vncipher        (vRet, vRet, vKey4);
     __ vncipherlast    (vRet, vRet, vKey5);
 
-    __ neg             (temp, to);
-    __ lvsr            (toPerm, temp);
-    __ vspltisb        (vTmp2, -1);
-    __ vxor            (vTmp1, vTmp1, vTmp1);
-    __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
-    __ vxor            (toPerm, toPerm, fSplt);
+    // store result (unaligned)
+#ifdef VM_LITTLE_ENDIAN
+    __ lvsl            (toPerm, to);
+#else
+    __ lvsr            (toPerm, to);
+#endif
+    __ vspltisb        (vTmp3, -1);
+    __ vspltisb        (vTmp4, 0);
     __ lvx             (vTmp1, to);
-    __ vperm           (vRet, vRet, vRet, toPerm);
-    __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
-    __ lvx             (vTmp4, fifteen, to);
+    __ lvx             (vTmp2, fifteen, to);
+#ifdef VM_LITTLE_ENDIAN
+    __ vperm           (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
+    __ vxor            (toPerm, toPerm, fSplt);       // swap bytes
+#else
+    __ vperm           (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
+#endif
+    __ vperm           (vTmp4, vRet, vRet, toPerm);   // rotate data
+    __ vsel            (vTmp2, vTmp4, vTmp2, vTmp3);
+    __ vsel            (vTmp1, vTmp1, vTmp4, vTmp3);
+    __ stvx            (vTmp2, fifteen, to);          // store this one first (may alias)
     __ stvx            (vTmp1, to);
-    __ vsel            (vRet, vRet, vTmp4, vTmp2);
-    __ stvx            (vRet, fifteen, to);
 
     __ blr();
      return start;
   }
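
The decrypt stub gets the same key-loading rework, but walks the expanded key backwards: the highest round key is XOR'ed in first, vncipher consumes the remaining keys in descending order, and vncipherlast finishes with the first one. Setting aside the vec_perm alignment (each logical key is stitched together from two adjacent lvx blocks), the logical byte offsets consumed per key length look like this (our sketch, illustrative only):

    #include <stdio.h>

    // Order in which the decrypt stub above consumes the expanded key, by
    // logical byte offset. keylen is in 32-bit words:
    // 44 = AES-128 (10 rounds), 52 = AES-192 (12), 60 = AES-256 (14).
    static void print_decrypt_key_order(int keylen) {
      int rounds = (keylen == 44) ? 10 : (keylen == 52) ? 12 : 14;
      printf("vxor        : offset %3d\n", 16 * rounds);   // last round key first
      for (int r = rounds - 1; r >= 1; r--)
        printf("vncipher    : offset %3d\n", 16 * r);      // descending
      printf("vncipherlast: offset   0\n");                // first round key last
    }

The two compare/branch pairs on keylen above implement exactly this: they only select how many keys are consumed before the common L_doLast tail.
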
 
+  address generate_sha256_implCompress(bool multi_block, const char *name) {
+    assert(UseSHA, "need SHA instructions");
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ function_entry();
+
+    __ sha256 (multi_block);
+
+    __ blr();
+    return start;
+  }
+
+  address generate_sha512_implCompress(bool multi_block, const char *name) {
+    assert(UseSHA, "need SHA instructions");
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ function_entry();
+
+    __ sha512 (multi_block);
+
+    __ blr();
+    return start;
+  }
+
   void generate_arraycopy_stubs() {
     // Note: the disjoint stubs must be generated first, some of
     // the conjoint stubs use them.
@@ -3306,6 +3339,267 @@
       BLOCK_COMMENT("} Stub body");
   }
 
+  /**
+  *  Arguments:
+  *
+  *  Input:
+  *   R3_ARG1    - out address
+  *   R4_ARG2    - in address
+  *   R5_ARG3    - offset
+  *   R6_ARG4    - len
+  *   R7_ARG5    - k
+  *  Output:
+  *   R3_RET     - carry
+  */
+  address generate_mulAdd() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "mulAdd");
+
+    address start = __ function_entry();
+
+    // C2 does not sign-extend int parameters to full 64-bit registers,
+    // so clear the high words (and byte-scale the offset) explicitly:
+    __ rldic (R5_ARG3, R5_ARG3, 2, 32);  // offset *= 4; always positive
+    __ clrldi(R6_ARG4, R6_ARG4, 32);     // zero the high word
+    __ clrldi(R7_ARG5, R7_ARG5, 32);     // zero the high word
+
+    __ muladd(R3_ARG1, R4_ARG2, R5_ARG3, R6_ARG4, R7_ARG5, R8, R9, R10);
+
+    // Moves output carry to return register
+    __ mr    (R3_RET,  R10);
+
+    __ blr();
+
+    return start;
+  }
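
In scalar terms the stub computes what the Java fallback, BigInteger.implMulAdd, computes: multiply a vector of 32-bit limbs (most significant first) by k and accumulate into out at the given limb offset, returning the final carry. The stub works with byte offsets after the rldic scaling; the reference sketch below (ours, not HotSpot code) uses limb indices:

    #include <stdint.h>

    // Scalar model of the mulAdd intrinsic; mirrors BigInteger.implMulAdd.
    static uint32_t mul_add(uint32_t* out, int out_len, const uint32_t* in,
                            int offset, int len, uint32_t k) {
      uint64_t carry = 0;
      int o = out_len - offset - 1;               // least significant target limb
      for (int j = len - 1; j >= 0; j--, o--) {
        uint64_t p = (uint64_t)in[j] * k + out[o] + carry;
        out[o] = (uint32_t)p;
        carry  = p >> 32;
      }
      return (uint32_t)carry;
    }

The prologue's clearing of the upper register halves is needed precisely because offset, len and k arrive as Java ints whose high words are undefined under C2's calling convention.
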
+
+  /**
+  *  Arguments:
+  *
+  *  Input:
+  *   R3_ARG1    - in address
+  *   R4_ARG2    - in length
+  *   R5_ARG3    - out address
+  *   R6_ARG4    - out length
+  *  Output:
+  *   R3_RET     - out address
+  */
+  address generate_squareToLen() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "squareToLen");
+
+    address start = __ function_entry();
+
+    // args - zero-extend the high words here (unsigned, as in an int-to-long cast)
+    const Register in        = R3_ARG1;
+    const Register in_len    = R4_ARG2;
+    __ clrldi(in_len, in_len, 32);
+    const Register out       = R5_ARG3;
+    const Register out_len   = R6_ARG4;
+    __ clrldi(out_len, out_len, 32);
+
+    // output
+    const Register ret       = R3_RET;
+
+    // temporaries
+    const Register lplw_s    = R7;
+    const Register in_aux    = R8;
+    const Register out_aux   = R9;
+    const Register piece     = R10;
+    const Register product   = R14;
+    const Register lplw      = R15;
+    const Register i_minus1  = R16;
+    const Register carry     = R17;
+    const Register offset    = R18;
+    const Register off_aux   = R19;
+    const Register t         = R20;
+    const Register mlen      = R21;
+    const Register len       = R22;
+    const Register a         = R23;
+    const Register b         = R24;
+    const Register i         = R25;
+    const Register c         = R26;
+    const Register cs        = R27;
+
+    // Labels
+    Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_MULADD, SKIP_LOOP_SQUARE;
+    Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_MULADD, LOOP_SQUARE;
+
+    // Save non-volatile regs (frameless).
+    int current_offs = -8;
+    __ std(R28, current_offs, R1_SP); current_offs -= 8;
+    __ std(R27, current_offs, R1_SP); current_offs -= 8;
+    __ std(R26, current_offs, R1_SP); current_offs -= 8;
+    __ std(R25, current_offs, R1_SP); current_offs -= 8;
+    __ std(R24, current_offs, R1_SP); current_offs -= 8;
+    __ std(R23, current_offs, R1_SP); current_offs -= 8;
+    __ std(R22, current_offs, R1_SP); current_offs -= 8;
+    __ std(R21, current_offs, R1_SP); current_offs -= 8;
+    __ std(R20, current_offs, R1_SP); current_offs -= 8;
+    __ std(R19, current_offs, R1_SP); current_offs -= 8;
+    __ std(R18, current_offs, R1_SP); current_offs -= 8;
+    __ std(R17, current_offs, R1_SP); current_offs -= 8;
+    __ std(R16, current_offs, R1_SP); current_offs -= 8;
+    __ std(R15, current_offs, R1_SP); current_offs -= 8;
+    __ std(R14, current_offs, R1_SP);
+
+    // Store the squares, right shifted one bit (i.e., divided by 2)
+    __ subi   (out_aux,   out,       8);
+    __ subi   (in_aux,    in,        4);
+    __ cmpwi  (CCR0,      in_len,    0);
+    // Initialize lplw outside of the loop
+    __ xorr   (lplw,      lplw,      lplw);
+    __ ble    (CCR0,      SKIP_LOOP_SQUARE);    // in_len <= 0
+    __ mtctr  (in_len);
+
+    __ bind(LOOP_SQUARE);
+    __ lwzu   (piece,     4,         in_aux);
+    __ mulld  (product,   piece,     piece);
+    // shift left 63 bits and only keep the MSB
+    __ rldic  (lplw_s,    lplw,      63, 0);
+    __ mr     (lplw,      product);
+    // shift right 1 bit without sign extension
+    __ srdi   (product,   product,   1);
+    // join them to the same register and store it
+    __ orr    (product,   lplw_s,    product);
+#ifdef VM_LITTLE_ENDIAN
+    // Swap low and high words for little endian
+    __ rldicl (product,   product,   32, 0);
+#endif
+    __ stdu   (product,   8,         out_aux);
+    __ bdnz   (LOOP_SQUARE);
+
+    __ bind(SKIP_LOOP_SQUARE);
+
+    // Add in off-diagonal sums
+    __ cmpwi  (CCR0,      in_len,    0);
+    __ ble    (CCR0,      SKIP_DIAGONAL_SUM);
+    // Avoid using CTR here so that it remains available for the muladd below
+    __ subi   (i_minus1,  in_len,    1);
+    __ li     (offset,    4);
+
+    __ bind(LOOP_DIAGONAL_SUM);
+
+    __ sldi   (off_aux,   out_len,   2);
+    __ sub    (off_aux,   off_aux,   offset);
+
+    __ mr     (len,       i_minus1);
+    __ sldi   (mlen,      i_minus1,  2);
+    __ lwzx   (t,         in,        mlen);
+
+    __ muladd (out, in, off_aux, len, t, a, b, carry);
+
+    // begin<addOne>
+    // off_aux = out_len*4 - 4 - mlen - offset;
+    __ addi   (mlen,      mlen,      4);
+    __ sldi   (a,         out_len,   2);
+    __ subi   (a,         a,         4);
+    __ sub    (a,         a,         mlen);
+    __ subi   (off_aux,   offset,    4);
+    __ sub    (off_aux,   a,         off_aux);
+
+    __ lwzx   (b,         off_aux,   out);
+    __ add    (b,         b,         carry);
+    __ stwx   (b,         off_aux,   out);
+
+    // if (((uint64_t)s >> 32) != 0) {
+    __ srdi_  (a,         b,         32);
+    __ beq    (CCR0,      SKIP_ADDONE);
+
+    // while (--mlen >= 0) {
+    __ bind(LOOP_ADDONE);
+    __ subi   (mlen,      mlen,      4);
+    __ cmpwi  (CCR0,      mlen,      0);
+    __ beq    (CCR0,      SKIP_ADDONE);
+
+    // if (--off_aux < 0) { // carry out of the number
+    __ subi   (off_aux,   off_aux,   4);
+    __ cmpwi  (CCR0,      off_aux,   0);
+    __ blt    (CCR0,      SKIP_ADDONE);
+
+    // } else {
+    __ lwzx   (b,         off_aux,   out);
+    __ addi   (b,         b,         1);
+    __ stwx   (b,         off_aux,   out);
+    __ cmpwi  (CCR0,      b,         0);
+    __ bne    (CCR0,      SKIP_ADDONE);
+    __ b      (LOOP_ADDONE);
+
+    __ bind(SKIP_ADDONE);
+    // } } } end<addOne>
+
+    __ addi   (offset,    offset,    8);
+    __ subi   (i_minus1,  i_minus1,  1);
+    __ cmpwi  (CCR0,      i_minus1,  0);
+    __ bge    (CCR0,      LOOP_DIAGONAL_SUM);
+
+    __ bind(SKIP_DIAGONAL_SUM);
+
+    // Shift back up and set low bit
+    // Shifts the whole out_len-word number left by 1 bit. Assumes no leading zeros
+    // begin<primitiveLeftShift>
+    __ cmpwi  (CCR0,      out_len,   0);
+    __ ble    (CCR0,      SKIP_LSHIFT);
+    __ li     (i,         0);
+    __ lwz    (c,         0,         out);
+    __ subi   (b,         out_len,   1);
+    __ mtctr  (b);
+
+    __ bind(LOOP_LSHIFT);
+    __ mr     (b,         c);
+    __ addi   (cs,        i,         4);
+    __ lwzx   (c,         out,       cs);
+
+    __ sldi   (b,         b,         1);
+    __ srwi   (cs,        c,         31);
+    __ orr    (b,         b,         cs);
+    __ stwx   (b,         i,         out);
+
+    __ addi   (i,         i,         4);
+    __ bdnz   (LOOP_LSHIFT);
+
+    __ sldi   (c,         out_len,   2);
+    __ subi   (c,         c,         4);
+    __ lwzx   (b,         out,       c);
+    __ sldi   (b,         b,         1);
+    __ stwx   (b,         out,       c);
+
+    __ bind(SKIP_LSHIFT);
+    // end<primitiveLeftShift>
+
+    // Set low bit
+    __ sldi   (i,         in_len,    2);
+    __ subi   (i,         i,         4);
+    __ lwzx   (i,         in,        i);
+    __ sldi   (c,         out_len,   2);
+    __ subi   (c,         c,         4);
+    __ lwzx   (b,         out,       c);
+
+    __ andi   (i,         i,         1);
+    __ orr    (i,         b,         i);
+
+    __ stwx   (i,         out,       c);
+
+    // Restore non-volatile regs.
+    current_offs = -8;
+    __ ld(R28, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R27, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R26, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R25, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R24, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R23, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R22, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R21, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R20, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R19, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R18, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R17, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R16, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R15, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R14, current_offs, R1_SP);
+
+    __ mr(ret, out);
+    __ blr();
+
+    return start;
+  }
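
The three labeled phases mirror the Java fallback, BigInteger.implSquareToLen: LOOP_SQUARE stores each limb's 64-bit square shifted right one bit, LOOP_DIAGONAL_SUM adds in the off-diagonal products (the earlier right shift stands in for doubling them), and LOOP_LSHIFT shifts everything back left and restores the low bit. A scalar reference (our sketch; 32-bit limbs, most significant first, out_len == 2 * in_len):

    #include <stdint.h>

    static void square_to_len(const uint32_t* in, int in_len,
                              uint32_t* out, int out_len) {
      // Phase 1 (LOOP_SQUARE): squares, shifted right one bit.
      uint32_t last_low = 0;
      for (int j = 0, i = 0; j < in_len; j++) {
        uint64_t p = (uint64_t)in[j] * in[j];
        out[i++] = (last_low << 31) | (uint32_t)(p >> 33);
        out[i++] = (uint32_t)(p >> 1);
        last_low = (uint32_t)p;
      }
      // Phase 2 (LOOP_DIAGONAL_SUM): off-diagonal products, plus the carry
      // propagation done by the begin<addOne> block above.
      for (int i = in_len, offset = 1; i > 0; i--, offset += 2) {
        uint64_t k = in[i - 1], carry = 0;
        int o = out_len - offset - 1;
        for (int j = i - 2; j >= 0; j--, o--) {   // the muladd() call
          uint64_t p = (uint64_t)in[j] * k + out[o] + carry;
          out[o] = (uint32_t)p;
          carry  = p >> 32;
        }
        for (; carry != 0 && o >= 0; o--) {       // addOne
          uint64_t s = out[o] + carry;
          out[o] = (uint32_t)s;
          carry  = s >> 32;
        }
      }
      // Phase 3 (LOOP_LSHIFT): shift back left, set the low bit.
      for (int i = 0; i < out_len - 1; i++) {
        out[i] = (out[i] << 1) | (out[i + 1] >> 31);
      }
      out[out_len - 1] = (out[out_len - 1] << 1) | (in[in_len - 1] & 1);
    }
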
 
   /**
    * Arguments:
@@ -3500,6 +3794,12 @@
     }
 #endif
 
+    if (UseSquareToLenIntrinsic) {
+      StubRoutines::_squareToLen = generate_squareToLen();
+    }
+    if (UseMulAddIntrinsic) {
+      StubRoutines::_mulAdd = generate_mulAdd();
+    }
     if (UseMontgomeryMultiplyIntrinsic) {
       StubRoutines::_montgomeryMultiply
         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
@@ -3514,6 +3814,14 @@
       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
     }
 
+    if (UseSHA256Intrinsics) {
+      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
+      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
+    }
+    if (UseSHA512Intrinsics) {
+      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
+      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
+    }
   }
 
  public:
--- a/src/hotspot/cpu/ppc/stubRoutines_ppc.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/stubRoutines_ppc.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -34,7 +34,7 @@
 
 enum platform_dependent_constants {
   code_size1 = 20000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 20000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 24000           // simply increase if too small (assembler will crash if too small)
 };
 
 // CRC32 Intrinsics.
--- a/src/hotspot/cpu/ppc/templateInterpreterGenerator_ppc.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/templateInterpreterGenerator_ppc.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1470,10 +1470,6 @@
   // TODO PPC port assert(4 == JavaThread::sz_thread_state(), "unexpected field size");
   __ stw(R0, thread_(thread_state));
 
-  if (UseMembar) {
-    __ fence();
-  }
-
   //=============================================================================
   // Call the native method. Argument registers must not have been
   // overwritten since "__ call_stub(signature_handler);" (except for
@@ -1594,9 +1590,6 @@
   __ li(R0/*thread_state*/, _thread_in_Java);
   __ release();
   __ stw(R0/*thread_state*/, thread_(thread_state));
-  if (UseMembar) {
-    __ fence();
-  }
 
   if (CheckJNICalls) {
     // clear_pending_jni_exception_check
--- a/src/hotspot/cpu/ppc/templateTable_ppc_64.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/templateTable_ppc_64.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -2224,6 +2224,7 @@
   if (is_static) {
     __ ld(Robj, in_bytes(cp_base_offset) + in_bytes(ConstantPoolCacheEntry::f1_offset()), Rcache);
     __ ld(Robj, in_bytes(Klass::java_mirror_offset()), Robj);
+    __ resolve_oop_handle(Robj);
     // Acquire not needed here. Following access has an address dependency on this value.
   }
 }
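
The added resolve_oop_handle accounts for Klass::_java_mirror now being stored as an OopHandle (a pointer to the oop) instead of a raw oop, so fetching the mirror takes one more dependent load, which is also why the retained comment about the address dependency still holds. On PPC the helper is presumably just that load (our sketch; the real body is not part of this diff):

    // Presumed shape of the helper: dereference the handle.
    void MacroAssembler::resolve_oop_handle(Register result) {
      ld(result, 0, result);   // result = *(oop*)result
    }
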
--- a/src/hotspot/cpu/ppc/vm_version_ppc.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/vm_version_ppc.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -107,13 +107,23 @@
   // TODO: PPC port PdScheduling::power6SectorSize = 0x20;
   }
 
-  MaxVectorSize = 8;
+  if (PowerArchitecturePPC64 >= 8) {
+    if (FLAG_IS_DEFAULT(SuperwordUseVSX)) {
+      FLAG_SET_ERGO(bool, SuperwordUseVSX, true);
+    }
+  } else {
+    if (SuperwordUseVSX) {
+      warning("SuperwordUseVSX specified, but needs at least Power8.");
+      FLAG_SET_DEFAULT(SuperwordUseVSX, false);
+    }
+  }
+  MaxVectorSize = SuperwordUseVSX ? 16 : 8;
 #endif
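
Net effect of the new ergonomics, summarized as a decision function (our sketch, not HotSpot code):

    #include <utility>

    // Returns {SuperwordUseVSX, MaxVectorSize} for a given CPU generation,
    // mirroring the ergonomics above.
    static std::pair<bool, int> vsx_ergonomics(int power_arch,
                                               bool flag_is_default,
                                               bool requested) {
      bool use_vsx = (power_arch >= 8)
          ? (flag_is_default ? true : requested)  // default on for Power8+
          : false;                                // pre-Power8: warn and reset
      return std::make_pair(use_vsx, use_vsx ? 16 : 8);
    }
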
 
   // Create and print feature-string.
   char buf[(num_features+1) * 16]; // Max 16 chars per feature.
   jio_snprintf(buf, sizeof(buf),
-               "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+               "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                (has_fsqrt()   ? " fsqrt"   : ""),
                (has_isel()    ? " isel"    : ""),
                (has_lxarxeh() ? " lxarxeh" : ""),
@@ -130,7 +140,8 @@
                (has_mfdscr()  ? " mfdscr"  : ""),
                (has_vsx()     ? " vsx"     : ""),
                (has_ldbrx()   ? " ldbrx"   : ""),
-               (has_stdbrx()  ? " stdbrx"  : "")
+               (has_stdbrx()  ? " stdbrx"  : ""),
+               (has_vshasig() ? " sha"     : "")
                // Make sure number of %s matches num_features!
               );
   _features_string = os::strdup(buf);
@@ -138,8 +149,7 @@
     print_features();
   }
 
-  // PPC64 supports 8-byte compare-exchange operations (see
-  // Atomic::cmpxchg and StubGenerator::generate_atomic_cmpxchg_ptr)
+  // PPC64 supports 8-byte compare-exchange operations (see Atomic::cmpxchg)
   // and 'atomic long memory ops' (see Unsafe_GetLongVolatile).
   _supports_cx8 = true;
 
@@ -200,7 +210,6 @@
   }
 
   // The AES intrinsic stubs require AES instruction support.
-#if defined(VM_LITTLE_ENDIAN)
   if (has_vcipher()) {
     if (FLAG_IS_DEFAULT(UseAES)) {
       UseAES = true;
@@ -221,18 +230,6 @@
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
 
-#else
-  if (UseAES) {
-    warning("AES instructions are not available on this CPU");
-    FLAG_SET_DEFAULT(UseAES, false);
-  }
-  if (UseAESIntrinsics) {
-    if (!FLAG_IS_DEFAULT(UseAESIntrinsics))
-      warning("AES intrinsics are not available on this CPU");
-    FLAG_SET_DEFAULT(UseAESIntrinsics, false);
-  }
-#endif
-
   if (UseAESCTRIntrinsics) {
     warning("AES/CTR intrinsics are not available on this CPU");
     FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
@@ -247,17 +244,49 @@
     FLAG_SET_DEFAULT(UseFMA, true);
   }
 
-  if (UseSHA) {
-    warning("SHA instructions are not available on this CPU");
+  if (has_vshasig()) {
+    if (FLAG_IS_DEFAULT(UseSHA)) {
+      UseSHA = true;
+    }
+  } else if (UseSHA) {
+    if (!FLAG_IS_DEFAULT(UseSHA))
+      warning("SHA instructions are not available on this CPU");
     FLAG_SET_DEFAULT(UseSHA, false);
   }
-  if (UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics) {
-    warning("SHA intrinsics are not available on this CPU");
+
+  if (UseSHA1Intrinsics) {
+    warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU.");
     FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
+  }
+
+  if (UseSHA && has_vshasig()) {
+    if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
+      FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
+    }
+  } else if (UseSHA256Intrinsics) {
+    warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU.");
     FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
+  }
+
+  if (UseSHA && has_vshasig()) {
+    if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
+      FLAG_SET_DEFAULT(UseSHA512Intrinsics, true);
+    }
+  } else if (UseSHA512Intrinsics) {
+    warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU.");
     FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
   }
 
+  if (!(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics)) {
+    FLAG_SET_DEFAULT(UseSHA, false);
+  }
+
+  if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
+    UseSquareToLenIntrinsic = true;
+  }
+  if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
+    UseMulAddIntrinsic = true;
+  }
   if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
     UseMultiplyToLenIntrinsic = true;
   }
@@ -657,6 +686,7 @@
   a->lxvd2x(VSR0, R3_ARG1);                    // code[14] -> vsx
   a->ldbrx(R7, R3_ARG1, R4_ARG2);              // code[15] -> ldbrx
   a->stdbrx(R7, R3_ARG1, R4_ARG2);             // code[16] -> stdbrx
+  a->vshasigmaw(VR0, VR1, 1, 0xF);             // code[17] -> vshasig
   a->blr();
 
   // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
@@ -708,6 +738,7 @@
   if (code[feature_cntr++]) features |= vsx_m;
   if (code[feature_cntr++]) features |= ldbrx_m;
   if (code[feature_cntr++]) features |= stdbrx_m;
+  if (code[feature_cntr++]) features |= vshasig_m;
 
   // Print the detection code.
   if (PrintAssembly) {
--- a/src/hotspot/cpu/ppc/vm_version_ppc.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/ppc/vm_version_ppc.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -49,6 +49,7 @@
     vsx,
     ldbrx,
     stdbrx,
+    vshasig,
     num_features // last entry to count features
   };
   enum Feature_Flag_Set {
@@ -64,6 +65,7 @@
     vand_m                = (1 << vand   ),
     lqarx_m               = (1 << lqarx  ),
     vcipher_m             = (1 << vcipher),
+    vshasig_m             = (1 << vshasig),
     vpmsumb_m             = (1 << vpmsumb),
     tcheck_m              = (1 << tcheck ),
     mfdscr_m              = (1 << mfdscr ),
@@ -106,6 +108,7 @@
   static bool has_vsx()     { return (_features & vsx_m) != 0; }
   static bool has_ldbrx()   { return (_features & ldbrx_m) != 0; }
   static bool has_stdbrx()  { return (_features & stdbrx_m) != 0; }
+  static bool has_vshasig() { return (_features & vshasig_m) != 0; }
   static bool has_mtfprd()  { return has_vpmsumb(); } // alias for P8
 
   // Assembler testing
--- a/src/hotspot/cpu/s390/assembler_s390.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/s390/assembler_s390.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2016 SAP SE. All rights reserved.
+ * Copyright (c) 2016, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2017 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -250,7 +250,6 @@
   bool is_RSform()  { return has_base() && !has_index() && is_disp12(); }
   bool is_RSYform() { return has_base() && !has_index() && is_disp20(); }
   bool is_RXform()  { return has_base() &&  has_index() && is_disp12(); }
-  bool is_RXEform() { return has_base() &&  has_index() && is_disp12(); }
   bool is_RXYform() { return has_base() &&  has_index() && is_disp20(); }
 
   bool uses(Register r) { return _base == r || _index == r; };
@@ -1093,7 +1092,201 @@
 #define TRTT_ZOPC   (unsigned  int)(0xb9 << 24 | 0x90 << 16)
 
 
-// Miscellaneous Operations
+//---------------------------
+//--  Vector Instructions  --
+//---------------------------
+
+//---<  Vector Support Instructions  >---
+
+//---  Load (memory)  ---
+
+#define VLM_ZOPC    (unsigned long)(0xe7L << 40 | 0x36L << 0)   // load full vreg range (n * 128 bit)
+#define VL_ZOPC     (unsigned long)(0xe7L << 40 | 0x06L << 0)   // load full vreg (128 bit)
+#define VLEB_ZOPC   (unsigned long)(0xe7L << 40 | 0x00L << 0)   // load vreg element (8 bit)
+#define VLEH_ZOPC   (unsigned long)(0xe7L << 40 | 0x01L << 0)   // load vreg element (16 bit)
+#define VLEF_ZOPC   (unsigned long)(0xe7L << 40 | 0x03L << 0)   // load vreg element (32 bit)
+#define VLEG_ZOPC   (unsigned long)(0xe7L << 40 | 0x02L << 0)   // load vreg element (64 bit)
+
+#define VLREP_ZOPC  (unsigned long)(0xe7L << 40 | 0x05L << 0)   // load and replicate into all vector elements
+#define VLLEZ_ZOPC  (unsigned long)(0xe7L << 40 | 0x04L << 0)   // load logical element and zero.
+
+// vector register gather
+#define VGEF_ZOPC   (unsigned long)(0xe7L << 40 | 0x13L << 0)   // gather element (32 bit), V1(M3) = [D2(V2(M3),B2)]
+#define VGEG_ZOPC   (unsigned long)(0xe7L << 40 | 0x12L << 0)   // gather element (64 bit), V1(M3) = [D2(V2(M3),B2)]
+// vector register scatter
+#define VSCEF_ZOPC  (unsigned long)(0xe7L << 40 | 0x1bL << 0)   // vector scatter element FW
+#define VSCEG_ZOPC  (unsigned long)(0xe7L << 40 | 0x1aL << 0)   // vector scatter element DW
+
+#define VLBB_ZOPC   (unsigned long)(0xe7L << 40 | 0x07L << 0)   // load vreg to block boundary (load to alignment).
+#define VLL_ZOPC    (unsigned long)(0xe7L << 40 | 0x37L << 0)   // load vreg with length.
+
+//---  Load (register)  ---
+
+#define VLR_ZOPC    (unsigned long)(0xe7L << 40 | 0x56L << 0)   // copy full vreg (128 bit)
+#define VLGV_ZOPC   (unsigned long)(0xe7L << 40 | 0x21L << 0)   // copy vreg element -> GR
+#define VLVG_ZOPC   (unsigned long)(0xe7L << 40 | 0x22L << 0)   // copy GR -> vreg element
+#define VLVGP_ZOPC  (unsigned long)(0xe7L << 40 | 0x62L << 0)   // copy GR2, GR3 (disjoint pair) -> vreg
+
+// vector register pack: halve the size of the source vector elements
+#define VPK_ZOPC    (unsigned long)(0xe7L << 40 | 0x94L << 0)   // just cut
+#define VPKS_ZOPC   (unsigned long)(0xe7L << 40 | 0x97L << 0)   // saturate as signed values
+#define VPKLS_ZOPC  (unsigned long)(0xe7L << 40 | 0x95L << 0)   // saturate as unsigned values
+
+// vector register unpack: double the size of the source vector elements
+#define VUPH_ZOPC   (unsigned long)(0xe7L << 40 | 0xd7L << 0)   // signed, left half of the source vector elements
+#define VUPLH_ZOPC  (unsigned long)(0xe7L << 40 | 0xd5L << 0)   // unsigned, left half of the source vector elements
+#define VUPL_ZOPC   (unsigned long)(0xe7L << 40 | 0xd6L << 0)   // signed, right half of the source vector elements
+#define VUPLL_ZOPC  (unsigned long)(0xe7L << 40 | 0xd4L << 0)   // unsigned, right half of the source vector elements
+
+// vector register merge
+#define VMRH_ZOPC   (unsigned long)(0xe7L << 40 | 0x61L << 0)   // register merge high (left half of source registers)
+#define VMRL_ZOPC   (unsigned long)(0xe7L << 40 | 0x60L << 0)   // register merge low (right half of source registers)
+
+// vector register permute
+#define VPERM_ZOPC  (unsigned long)(0xe7L << 40 | 0x8cL << 0)   // vector permute
+#define VPDI_ZOPC   (unsigned long)(0xe7L << 40 | 0x84L << 0)   // vector permute DW immediate
+
+// vector register replicate
+#define VREP_ZOPC   (unsigned long)(0xe7L << 40 | 0x4dL << 0)   // vector replicate
+#define VREPI_ZOPC  (unsigned long)(0xe7L << 40 | 0x45L << 0)   // vector replicate immediate
+#define VSEL_ZOPC   (unsigned long)(0xe7L << 40 | 0x8dL << 0)   // vector select
+
+#define VSEG_ZOPC   (unsigned long)(0xe7L << 40 | 0x5fL << 0)   // vector sign-extend to DW (rightmost element in each DW).
+
+//---  Load (immediate)  ---
+
+#define VLEIB_ZOPC  (unsigned long)(0xe7L << 40 | 0x40L << 0)   // load vreg element (16 bit imm to 8 bit)
+#define VLEIH_ZOPC  (unsigned long)(0xe7L << 40 | 0x41L << 0)   // load vreg element (16 bit imm to 16 bit)
+#define VLEIF_ZOPC  (unsigned long)(0xe7L << 40 | 0x43L << 0)   // load vreg element (16 bit imm to 32 bit)
+#define VLEIG_ZOPC  (unsigned long)(0xe7L << 40 | 0x42L << 0)   // load vreg element (16 bit imm to 64 bit)
+
+//---  Store  ---
+
+#define VSTM_ZOPC   (unsigned long)(0xe7L << 40 | 0x3eL << 0)   // store full vreg range (n * 128 bit)
+#define VST_ZOPC    (unsigned long)(0xe7L << 40 | 0x0eL << 0)   // store full vreg (128 bit)
+#define VSTEB_ZOPC  (unsigned long)(0xe7L << 40 | 0x08L << 0)   // store vreg element (8 bit)
+#define VSTEH_ZOPC  (unsigned long)(0xe7L << 40 | 0x09L << 0)   // store vreg element (16 bit)
+#define VSTEF_ZOPC  (unsigned long)(0xe7L << 40 | 0x0bL << 0)   // store vreg element (32 bit)
+#define VSTEG_ZOPC  (unsigned long)(0xe7L << 40 | 0x0aL << 0)   // store vreg element (64 bit)
+#define VSTL_ZOPC   (unsigned long)(0xe7L << 40 | 0x3fL << 0)   // store vreg with length.
+
+//---  Misc  ---
+
+#define VGM_ZOPC    (unsigned long)(0xe7L << 40 | 0x46L << 0)   // generate bit  mask, [start..end] = '1', else '0'
+#define VGBM_ZOPC   (unsigned long)(0xe7L << 40 | 0x44L << 0)   // generate byte mask, bits(imm16) -> bytes
+
+//---<  Vector Arithmetic Instructions  >---
+
+// Load
+#define VLC_ZOPC    (unsigned long)(0xe7L << 40 | 0xdeL << 0)   // V1 := -V2,   element size = 2**m
+#define VLP_ZOPC    (unsigned long)(0xe7L << 40 | 0xdfL << 0)   // V1 := |V2|,  element size = 2**m
+
+// ADD
+#define VA_ZOPC     (unsigned long)(0xe7L << 40 | 0xf3L << 0)   // V1 := V2 + V3, element size = 2**m
+#define VACC_ZOPC   (unsigned long)(0xe7L << 40 | 0xf1L << 0)   // V1 := carry(V2 + V3), element size = 2**m
+
+// SUB
+#define VS_ZOPC     (unsigned long)(0xe7L << 40 | 0xf7L << 0)   // V1 := V2 - V3, element size = 2**m
+#define VSCBI_ZOPC  (unsigned long)(0xe7L << 40 | 0xf5L << 0)   // V1 := borrow(V2 - V3), element size = 2**m
+
+// MUL
+#define VML_ZOPC    (unsigned long)(0xe7L << 40 | 0xa2L << 0)   // V1 := V2 * V3, element size = 2**m
+#define VMH_ZOPC    (unsigned long)(0xe7L << 40 | 0xa3L << 0)   // V1 := V2 * V3, element size = 2**m
+#define VMLH_ZOPC   (unsigned long)(0xe7L << 40 | 0xa1L << 0)   // V1 := V2 * V3, element size = 2**m, unsigned
+#define VME_ZOPC    (unsigned long)(0xe7L << 40 | 0xa6L << 0)   // V1 := V2 * V3, element size = 2**m
+#define VMLE_ZOPC   (unsigned long)(0xe7L << 40 | 0xa4L << 0)   // V1 := V2 * V3, element size = 2**m, unsigned
+#define VMO_ZOPC    (unsigned long)(0xe7L << 40 | 0xa7L << 0)   // V1 := V2 * V3, element size = 2**m
+#define VMLO_ZOPC   (unsigned long)(0xe7L << 40 | 0xa5L << 0)   // V1 := V2 * V3, element size = 2**m, unsigned
+
+// MUL & ADD
+#define VMAL_ZOPC   (unsigned long)(0xe7L << 40 | 0xaaL << 0)   // V1 := V2 * V3 + V4, element size = 2**m
+#define VMAH_ZOPC   (unsigned long)(0xe7L << 40 | 0xabL << 0)   // V1 := V2 * V3 + V4, element size = 2**m
+#define VMALH_ZOPC  (unsigned long)(0xe7L << 40 | 0xa9L << 0)   // V1 := V2 * V3 + V4, element size = 2**m, unsigned
+#define VMAE_ZOPC   (unsigned long)(0xe7L << 40 | 0xaeL << 0)   // V1 := V2 * V3 + V4, element size = 2**m
+#define VMALE_ZOPC  (unsigned long)(0xe7L << 40 | 0xacL << 0)   // V1 := V2 * V3 + V4, element size = 2**m, unsigned
+#define VMAO_ZOPC   (unsigned long)(0xe7L << 40 | 0xafL << 0)   // V1 := V2 * V3 + V4, element size = 2**m
+#define VMALO_ZOPC  (unsigned long)(0xe7L << 40 | 0xadL << 0)   // V1 := V2 * V3 + V4, element size = 2**m, unsigned
+
+// Vector SUM
+#define VSUM_ZOPC   (unsigned long)(0xe7L << 40 | 0x64L << 0)   // V1[j] := toFW(sum(V2[i]) + V3[j]), subelements: byte or HW
+#define VSUMG_ZOPC  (unsigned long)(0xe7L << 40 | 0x65L << 0)   // V1[j] := toDW(sum(V2[i]) + V3[j]), subelements: HW or FW
+#define VSUMQ_ZOPC  (unsigned long)(0xe7L << 40 | 0x67L << 0)   // V1[j] := toQW(sum(V2[i]) + V3[j]), subelements: FW or DW
+
+// Average
+#define VAVG_ZOPC   (unsigned long)(0xe7L << 40 | 0xf2L << 0)   // V1 := (V2+V3+1)/2, signed,   element size = 2**m
+#define VAVGL_ZOPC  (unsigned long)(0xe7L << 40 | 0xf0L << 0)   // V1 := (V2+V3+1)/2, unsigned, element size = 2**m
+
+// VECTOR Galois Field Multiply Sum
+#define VGFM_ZOPC   (unsigned long)(0xe7L << 40 | 0xb4L << 0)
+#define VGFMA_ZOPC  (unsigned long)(0xe7L << 40 | 0xbcL << 0)
+
+//---<  Vector Logical Instructions  >---
+
+// AND
+#define VN_ZOPC     (unsigned long)(0xe7L << 40 | 0x68L << 0)   // V1 := V2 & V3,  element size = 2**m
+#define VNC_ZOPC    (unsigned long)(0xe7L << 40 | 0x69L << 0)   // V1 := V2 & ~V3, element size = 2**m
+
+// XOR
+#define VX_ZOPC     (unsigned long)(0xe7L << 40 | 0x6dL << 0)   // V1 := V2 ^ V3,  element size = 2**m
+
+// NOR
+#define VNO_ZOPC    (unsigned long)(0xe7L << 40 | 0x6bL << 0)   // V1 := !(V2 | V3),  element size = 2**m
+
+// OR
+#define VO_ZOPC     (unsigned long)(0xe7L << 40 | 0x6aL << 0)   // V1 := V2 | V3,  element size = 2**m
+
+// Comparison (element-wise)
+#define VCEQ_ZOPC   (unsigned long)(0xe7L << 40 | 0xf8L << 0)   // V1 := (V2 == V3) ? 0xffff : 0x0000, element size = 2**m
+#define VCH_ZOPC    (unsigned long)(0xe7L << 40 | 0xfbL << 0)   // V1 := (V2  > V3) ? 0xffff : 0x0000, element size = 2**m, signed
+#define VCHL_ZOPC   (unsigned long)(0xe7L << 40 | 0xf9L << 0)   // V1 := (V2  > V3) ? 0xffff : 0x0000, element size = 2**m, unsigned
+
+// Max/Min (element-wise)
+#define VMX_ZOPC    (unsigned long)(0xe7L << 40 | 0xffL << 0)   // V1 := (V2 > V3) ? V2 : V3, element size = 2**m, signed
+#define VMXL_ZOPC   (unsigned long)(0xe7L << 40 | 0xfdL << 0)   // V1 := (V2 > V3) ? V2 : V3, element size = 2**m, unsigned
+#define VMN_ZOPC    (unsigned long)(0xe7L << 40 | 0xfeL << 0)   // V1 := (V2 < V3) ? V2 : V3, element size = 2**m, signed
+#define VMNL_ZOPC   (unsigned long)(0xe7L << 40 | 0xfcL << 0)   // V1 := (V2 < V3) ? V2 : V3, element size = 2**m, unsigned
+
+// Leading/Trailing Zeros, population count
+#define VCLZ_ZOPC   (unsigned long)(0xe7L << 40 | 0x53L << 0)   // V1 := leadingzeros(V2),  element size = 2**m
+#define VCTZ_ZOPC   (unsigned long)(0xe7L << 40 | 0x52L << 0)   // V1 := trailingzeros(V2), element size = 2**m
+#define VPOPCT_ZOPC (unsigned long)(0xe7L << 40 | 0x50L << 0)   // V1 := popcount(V2), bytewise!!
+
+// Rotate/Shift
+#define VERLLV_ZOPC (unsigned long)(0xe7L << 40 | 0x73L << 0)   // V1 := rotateleft(V2), rotate count in V3 element
+#define VERLL_ZOPC  (unsigned long)(0xe7L << 40 | 0x33L << 0)   // V1 := rotateleft(V3), rotate count from d2(b2).
+#define VERIM_ZOPC  (unsigned long)(0xe7L << 40 | 0x72L << 0)   // Rotate then insert under mask. Read Principles of Operation!!
+
+#define VESLV_ZOPC  (unsigned long)(0xe7L << 40 | 0x70L << 0)   // V1 := SLL(V2, V3), unsigned, element-wise
+#define VESL_ZOPC   (unsigned long)(0xe7L << 40 | 0x30L << 0)   // V1 := SLL(V3), unsigned, shift count from d2(b2).
+
+#define VESRAV_ZOPC (unsigned long)(0xe7L << 40 | 0x7AL << 0)   // V1 := SRA(V2, V3), signed, element-wise
+#define VESRA_ZOPC  (unsigned long)(0xe7L << 40 | 0x3AL << 0)   // V1 := SRA(V3), signed, shift count from d2(b2).
+#define VESRLV_ZOPC (unsigned long)(0xe7L << 40 | 0x78L << 0)   // V1 := SRL(V2, V3), unsigned, element-wise
+#define VESRL_ZOPC  (unsigned long)(0xe7L << 40 | 0x38L << 0)   // V1 := SRL(V3), unsigned, shift count from d2(b2).
+
+#define VSL_ZOPC    (unsigned long)(0xe7L << 40 | 0x74L << 0)   // V1 := SLL(V2), unsigned, bit-count
+#define VSLB_ZOPC   (unsigned long)(0xe7L << 40 | 0x75L << 0)   // V1 := SLL(V2), unsigned, byte-count
+#define VSLDB_ZOPC  (unsigned long)(0xe7L << 40 | 0x77L << 0)   // V1 := SLL((V2,V3)), unsigned, byte-count
+
+#define VSRA_ZOPC   (unsigned long)(0xe7L << 40 | 0x7eL << 0)   // V1 := SRA(V2), signed, bit-count
+#define VSRAB_ZOPC  (unsigned long)(0xe7L << 40 | 0x7fL << 0)   // V1 := SRA(V2), signed, byte-count
+#define VSRL_ZOPC   (unsigned long)(0xe7L << 40 | 0x7cL << 0)   // V1 := SRL(V2), unsigned, bit-count
+#define VSRLB_ZOPC  (unsigned long)(0xe7L << 40 | 0x7dL << 0)   // V1 := SRL(V2), unsigned, byte-count
+
+// Test under Mask
+#define VTM_ZOPC    (unsigned long)(0xe7L << 40 | 0xd8L << 0)   // Like TM, set CC according to state of selected bits.
+
+//---<  Vector String Instructions  >---
+#define VFAE_ZOPC   (unsigned long)(0xe7L << 40 | 0x82L << 0)   // Find any element
+#define VFEE_ZOPC   (unsigned long)(0xe7L << 40 | 0x80L << 0)   // Find element equal
+#define VFENE_ZOPC  (unsigned long)(0xe7L << 40 | 0x81L << 0)   // Find element not equal
+#define VSTRC_ZOPC  (unsigned long)(0xe7L << 40 | 0x8aL << 0)   // String range compare
+#define VISTR_ZOPC  (unsigned long)(0xe7L << 40 | 0x5cL << 0)   // Isolate String
+
+
+//--------------------------------
+//--  Miscellaneous Operations  --
+//--------------------------------
 
 // Execute
 #define EX_ZOPC     (unsigned  int)(68L << 24)
@@ -1244,10 +1437,18 @@
     // unsigned arithmetic calculation instructions
     // Mask bit#0 is not used by these instructions.
     // There is no indication of overflow for these instr.
-    bcondLogZero             =  2,
-    bcondLogNotZero          =  5,
+    bcondLogZero_NoCarry     =  8,
+    bcondLogZero_Carry       =  2,
+    // bcondLogZero_Borrow      =  8,  // This CC is never generated.
+    bcondLogZero_NoBorrow    =  2,
+    bcondLogZero             =  bcondLogZero_Carry | bcondLogZero_NoCarry,
+    bcondLogNotZero_NoCarry  =  4,
+    bcondLogNotZero_Carry    =  1,
     bcondLogNotZero_Borrow   =  4,
     bcondLogNotZero_NoBorrow =  1,
+    bcondLogNotZero          =  bcondLogNotZero_Carry | bcondLogNotZero_NoCarry,
+    bcondLogCarry            =  bcondLogZero_Carry | bcondLogNotZero_Carry,
+    bcondLogBorrow           =  /* bcondLogZero_Borrow | */ bcondLogNotZero_Borrow,
     // string search instructions
     bcondFound       =  4,
     bcondNotFound    =  2,
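
For reference, a z/Architecture branch mask selects condition codes CC0..CC3 with bits 8, 4, 2 and 1. Logical (unsigned) add sets CC0 = zero/no carry, CC1 = not zero/no carry, CC2 = zero/carry, CC3 = not zero/carry; for logical subtract, 'carry' means 'no borrow', which is why the borrow variants reuse masks 4 and 1 and why CC0 (the commented-out bcondLogZero_Borrow) never occurs. The combined masks above check out (our cross-check, compiles iff consistent):

    // Encodes the mask arithmetic of the enum above.
    enum { CC0 = 8, CC1 = 4, CC2 = 2, CC3 = 1 };
    static_assert((CC0 | CC2) == 10, "bcondLogZero    = NoCarry | Carry");
    static_assert((CC1 | CC3) ==  5, "bcondLogNotZero = NoCarry | Carry");
    static_assert((CC2 | CC3) ==  3, "bcondLogCarry");
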
@@ -1280,6 +1481,29 @@
     to_minus_infinity = 7
   };
 
+  // Vector Register Element Type.
+  enum VRegElemType {
+    VRET_BYTE   = 0,
+    VRET_HW     = 1,
+    VRET_FW     = 2,
+    VRET_DW     = 3,
+    VRET_QW     = 4
+  };
+
+  // Vector Operation Result Control.
+  //   This is a set of flags used in some vector instructions to control
+  //   the result (side) effects of instruction execution.
+  enum VOpRC {
+    VOPRC_CCSET    = 0b0001, // set the CC.
+    VOPRC_CCIGN    = 0b0000, // ignore, don't set CC.
+    VOPRC_ZS       = 0b0010, // Zero Search: additional element-wise comparison against zero.
+    VOPRC_NOZS     = 0b0000, // No Zero Search.
+    VOPRC_RTBYTEIX = 0b0100, // generate the byte index of the lowest element with a true comparison.
+    VOPRC_RTBITVEC = 0b0000, // generate a bit vector: all 1s for true, all 0s for false element comparisons.
+    VOPRC_INVERT   = 0b1000, // invert comparison results.
+    VOPRC_NOINVERT = 0b0000  // use comparison results as is, do not invert.
+  };
+
   // Inverse condition code, i.e. determine "15 - cc" for a given condition code cc.
   static branch_condition inverse_condition(branch_condition cc);
   static branch_condition inverse_float_condition(branch_condition cc);
@@ -1376,6 +1600,65 @@
     return r;
   }
 
+  static int64_t rsmask_48( Address a) { assert(a.is_RSform(),  "bad address format"); return rsmask_48( a.disp12(), a.base()); }
+  static int64_t rxmask_48( Address a) {      if (a.is_RXform())  { return rxmask_48( a.disp12(), a.index(), a.base()); }
+                                         else if (a.is_RSform())  { return rsmask_48( a.disp12(),            a.base()); }
+                                         else                     { guarantee(false, "bad address format");  return 0;  }
+                                       }
+  static int64_t rsymask_48(Address a) { assert(a.is_RSYform(), "bad address format"); return rsymask_48(a.disp20(), a.base()); }
+  static int64_t rxymask_48(Address a) {      if (a.is_RXYform()) { return rxymask_48( a.disp20(), a.index(), a.base()); }
+                                         else if (a.is_RSYform()) { return rsymask_48( a.disp20(),            a.base()); }
+                                         else                     { guarantee(false, "bad address format");  return 0;   }
+                                       }
+
+  static int64_t rsmask_48( int64_t d2, Register b2)              { return uimm12(d2, 20, 48)                   | regz(b2, 16, 48); }
+  static int64_t rxmask_48( int64_t d2, Register x2, Register b2) { return uimm12(d2, 20, 48) | reg(x2, 12, 48) | regz(b2, 16, 48); }
+  static int64_t rsymask_48(int64_t d2, Register b2)              { return simm20(d2)                           | regz(b2, 16, 48); }
+  static int64_t rxymask_48(int64_t d2, Register x2, Register b2) { return simm20(d2)         | reg(x2, 12, 48) | regz(b2, 16, 48); }
+
+  // Address calculated from d12(vx,b) - vx is vector index register.
+  static int64_t rvmask_48( int64_t d2, VectorRegister x2, Register b2) { return uimm12(d2, 20, 48) | vreg(x2, 12) | regz(b2, 16, 48); }
+
+  static int64_t vreg_mask(VectorRegister v, int pos) {
+    return vreg(v, pos) | v->RXB_mask(pos);
+  }
+
+  // Vector Element Size Control. 4-bit field which indicates the size of the vector elements.
+  static int64_t vesc_mask(int64_t size, int min_size, int max_size, int pos) {
+    // min_size - minimum element size. Not all instructions support element sizes beginning with "byte".
+    // max_size - maximum element size. Not all instructions support element sizes up to "QW".
+    assert((min_size <= size) && (size <= max_size), "element size control out of range");
+    return uimm4(size, pos, 48);
+  }
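+  // Example: instructions limited to byte..doubleword elements encode their
+  // size field as vesc_mask(m4, VRET_BYTE, VRET_DW, 32); passing VRET_QW to
+  // such an instruction trips the assert above.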
+
+  // Vector Element IndeX. 4-bit field which indexes the target vector element.
+  static int64_t veix_mask(int64_t ix, int el_size, int pos) {
+    // el_size - size of the vector element. This is a VRegElemType enum value.
+    // ix      - vector element index.
+    int max_ix = -1;
+    switch (el_size) {
+      case VRET_BYTE: max_ix = 15; break;
+      case VRET_HW:   max_ix =  7; break;
+      case VRET_FW:   max_ix =  3; break;
+      case VRET_DW:   max_ix =  1; break;
+      case VRET_QW:   max_ix =  0; break;
+      default:        guarantee(false, "bad vector element size %d", el_size); break;
+    }
+    assert((0 <= ix) && (ix <= max_ix), "element index out of range (0 <= %ld <= %d)", ix, max_ix);
+    return uimm4(ix, pos, 48);
+  }
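+  // Example: fullword elements provide four slots, so veix_mask(ix, VRET_FW, 32)
+  // accepts indices 0..3 and rejects anything larger via the assert above.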
+
+  // Vector Operation Result Control. 4-bit field.
+  static int64_t voprc_any(int64_t flags, int pos, int64_t allowed_flags = 0b1111) {
+    assert((flags & allowed_flags) == flags, "Invalid VOPRC_* flag combination: %d", (int)flags);
+    return uimm4(flags, pos, 48);
+  }
+
+  // Vector Operation Result Control. Condition code setting.
+  static int64_t voprc_ccmask(int64_t flags, int pos) {
+    return voprc_any(flags, pos, VOPRC_CCIGN | VOPRC_CCSET);
+  }
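+  // Example: voprc_ccmask(VOPRC_CCSET, 24) requests CC setting on instructions
+  // that honor only that bit; passing VOPRC_ZS here would fail the
+  // allowed_flags check in voprc_any().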
+
  public:
 
   //--------------------------------------------------
@@ -1453,6 +1736,8 @@
   static long imm24(int64_t i24, int s, int len)   { return imm(i24, 24) << (len-s-24); }
   static long imm32(int64_t i32, int s, int len)   { return imm(i32, 32) << (len-s-32); }
 
+  static long vreg(VectorRegister v, int pos)      { const int len = 48; return u_field(v->encoding()&0x0f, (len-pos)-1, (len-pos)-4) | v->RXB_mask(pos); }
+
   static long fregt(FloatRegister r, int s, int len) { return freg(r,s,len); }
   static long freg( FloatRegister r, int s, int len) { return u_field(r->encoding(), (len-s)-1, (len-s)-4); }
 
@@ -1840,13 +2125,16 @@
   inline void z_alsi( const Address& d, int64_t i2);              // add logical   *(d) += i2_imm8           ; uint32  -- z10
   inline void z_algsi(const Address& d, int64_t i2);              // add logical   *(d) += i2_imm8           ; uint64  -- z10
 
-  // negate
+  // sign adjustment
   inline void z_lcr(  Register r1, Register r2 = noreg);              // neg r1 = -r2   ; int32
   inline void z_lcgr( Register r1, Register r2 = noreg);              // neg r1 = -r2   ; int64
   inline void z_lcgfr(Register r1, Register r2);                      // neg r1 = -r2   ; int64 <- int32
   inline void z_lnr(  Register r1, Register r2 = noreg);              // neg r1 = -|r2| ; int32
   inline void z_lngr( Register r1, Register r2 = noreg);              // neg r1 = -|r2| ; int64
   inline void z_lngfr(Register r1, Register r2);                      // neg r1 = -|r2| ; int64 <- int32
+  inline void z_lpr(  Register r1, Register r2 = noreg);              //     r1 =  |r2| ; int32
+  inline void z_lpgr( Register r1, Register r2 = noreg);              //     r1 =  |r2| ; int64
+  inline void z_lpgfr(Register r1, Register r2);                      //     r1 =  |r2| ; int64 <- int32
 
   // subtract instructions
   // sub registers
@@ -2125,6 +2413,422 @@
   inline void z_trtt(Register r1, Register r2, int64_t m3);
 
 
+  //---------------------------
+  //--  Vector Instructions  --
+  //---------------------------
+
+  //---<  Vector Support Instructions  >---
+
+  // Load (transfer from memory)
+  inline void z_vlm(   VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_vl(    VectorRegister v1, int64_t d2, Register x2, Register b2);
+  inline void z_vleb(  VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3);
+  inline void z_vleh(  VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3);
+  inline void z_vlef(  VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3);
+  inline void z_vleg(  VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3);
+
+  // Gather/Scatter
+  inline void z_vgef(  VectorRegister v1, int64_t d2, VectorRegister vx2, Register b2, int64_t m3);
+  inline void z_vgeg(  VectorRegister v1, int64_t d2, VectorRegister vx2, Register b2, int64_t m3);
+
+  inline void z_vscef( VectorRegister v1, int64_t d2, VectorRegister vx2, Register b2, int64_t m3);
+  inline void z_vsceg( VectorRegister v1, int64_t d2, VectorRegister vx2, Register b2, int64_t m3);
+
+  // load and replicate
+  inline void z_vlrep( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3);
+  inline void z_vlrepb(VectorRegister v1, int64_t d2, Register x2, Register b2);
+  inline void z_vlreph(VectorRegister v1, int64_t d2, Register x2, Register b2);
+  inline void z_vlrepf(VectorRegister v1, int64_t d2, Register x2, Register b2);
+  inline void z_vlrepg(VectorRegister v1, int64_t d2, Register x2, Register b2);
+
+  inline void z_vllez( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3);
+  inline void z_vllezb(VectorRegister v1, int64_t d2, Register x2, Register b2);
+  inline void z_vllezh(VectorRegister v1, int64_t d2, Register x2, Register b2);
+  inline void z_vllezf(VectorRegister v1, int64_t d2, Register x2, Register b2);
+  inline void z_vllezg(VectorRegister v1, int64_t d2, Register x2, Register b2);
+
+  inline void z_vlbb(  VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3);
+  inline void z_vll(   VectorRegister v1, Register r3, int64_t d2, Register b2);
+
+  // Load (register to register)
+  inline void z_vlr(   VectorRegister v1, VectorRegister v2);
+
+  inline void z_vlgv(  Register r1, VectorRegister v3, int64_t d2, Register b2, int64_t m4);
+  inline void z_vlgvb( Register r1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_vlgvh( Register r1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_vlgvf( Register r1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_vlgvg( Register r1, VectorRegister v3, int64_t d2, Register b2);
+
+  inline void z_vlvg(  VectorRegister v1, Register r3, int64_t d2, Register b2, int64_t m4);
+  inline void z_vlvgb( VectorRegister v1, Register r3, int64_t d2, Register b2);
+  inline void z_vlvgh( VectorRegister v1, Register r3, int64_t d2, Register b2);
+  inline void z_vlvgf( VectorRegister v1, Register r3, int64_t d2, Register b2);
+  inline void z_vlvgg( VectorRegister v1, Register r3, int64_t d2, Register b2);
+
+  inline void z_vlvgp( VectorRegister v1, Register r2, Register r3);
+
+  // vector register pack
+  inline void z_vpk(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vpkh(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vpkf(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vpkg(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  inline void z_vpks(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4, int64_t cc5);
+  inline void z_vpksh( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vpksf( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vpksg( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vpkshs(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vpksfs(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vpksgs(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  inline void z_vpkls(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4, int64_t cc5);
+  inline void z_vpklsh( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vpklsf( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vpklsg( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vpklshs(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vpklsfs(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vpklsgs(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  // vector register unpack (sign-extended)
+  inline void z_vuph(   VectorRegister v1, VectorRegister v2, int64_t m3);
+  inline void z_vuphb(  VectorRegister v1, VectorRegister v2);
+  inline void z_vuphh(  VectorRegister v1, VectorRegister v2);
+  inline void z_vuphf(  VectorRegister v1, VectorRegister v2);
+  inline void z_vupl(   VectorRegister v1, VectorRegister v2, int64_t m3);
+  inline void z_vuplb(  VectorRegister v1, VectorRegister v2);
+  inline void z_vuplh(  VectorRegister v1, VectorRegister v2);
+  inline void z_vuplf(  VectorRegister v1, VectorRegister v2);
+
+  // vector register unpack (zero-extended)
+  inline void z_vuplh(  VectorRegister v1, VectorRegister v2, int64_t m3);
+  inline void z_vuplhb( VectorRegister v1, VectorRegister v2);
+  inline void z_vuplhh( VectorRegister v1, VectorRegister v2);
+  inline void z_vuplhf( VectorRegister v1, VectorRegister v2);
+  inline void z_vupll(  VectorRegister v1, VectorRegister v2, int64_t m3);
+  inline void z_vupllb( VectorRegister v1, VectorRegister v2);
+  inline void z_vupllh( VectorRegister v1, VectorRegister v2);
+  inline void z_vupllf( VectorRegister v1, VectorRegister v2);
+
+  // vector register merge high/low
+  inline void z_vmrh( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vmrhb(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmrhh(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmrhf(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmrhg(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  inline void z_vmrl( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vmrlb(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmrlh(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmrlf(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmrlg(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  // vector register permute
+  inline void z_vperm( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4);
+  inline void z_vpdi(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t        m4);
+
+  // vector register replicate
+  inline void z_vrep(  VectorRegister v1, VectorRegister v3, int64_t imm2, int64_t m4);
+  inline void z_vrepb( VectorRegister v1, VectorRegister v3, int64_t imm2);
+  inline void z_vreph( VectorRegister v1, VectorRegister v3, int64_t imm2);
+  inline void z_vrepf( VectorRegister v1, VectorRegister v3, int64_t imm2);
+  inline void z_vrepg( VectorRegister v1, VectorRegister v3, int64_t imm2);
+  inline void z_vrepi( VectorRegister v1, int64_t imm2,      int64_t m3);
+  inline void z_vrepib(VectorRegister v1, int64_t imm2);
+  inline void z_vrepih(VectorRegister v1, int64_t imm2);
+  inline void z_vrepif(VectorRegister v1, int64_t imm2);
+  inline void z_vrepig(VectorRegister v1, int64_t imm2);
+
+  inline void z_vsel(  VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4);
+  inline void z_vseg(  VectorRegister v1, VectorRegister v2, int64_t imm3);
+
+  // Load (immediate)
+  inline void z_vleib( VectorRegister v1, int64_t imm2, int64_t m3);
+  inline void z_vleih( VectorRegister v1, int64_t imm2, int64_t m3);
+  inline void z_vleif( VectorRegister v1, int64_t imm2, int64_t m3);
+  inline void z_vleig( VectorRegister v1, int64_t imm2, int64_t m3);
+
+  // Store
+  inline void z_vstm(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_vst(   VectorRegister v1, int64_t d2, Register x2, Register b2);
+  inline void z_vsteb( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3);
+  inline void z_vsteh( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3);
+  inline void z_vstef( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3);
+  inline void z_vsteg( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3);
+  inline void z_vstl(  VectorRegister v1, Register r3, int64_t d2, Register b2);
+
+  // Misc
+  inline void z_vgm(   VectorRegister v1, int64_t imm2, int64_t imm3, int64_t m4);
+  inline void z_vgmb(  VectorRegister v1, int64_t imm2, int64_t imm3);
+  inline void z_vgmh(  VectorRegister v1, int64_t imm2, int64_t imm3);
+  inline void z_vgmf(  VectorRegister v1, int64_t imm2, int64_t imm3);
+  inline void z_vgmg(  VectorRegister v1, int64_t imm2, int64_t imm3);
+
+  inline void z_vgbm(  VectorRegister v1, int64_t imm2);
+  inline void z_vzero( VectorRegister v1); // preferred method to set vreg to all zeroes
+  inline void z_vone(  VectorRegister v1); // preferred method to set vreg to all ones
+
+  //---<  Vector Arithmetic Instructions  >---
+
+  // Load
+  inline void z_vlc(    VectorRegister v1, VectorRegister v2, int64_t m3);
+  inline void z_vlcb(   VectorRegister v1, VectorRegister v2);
+  inline void z_vlch(   VectorRegister v1, VectorRegister v2);
+  inline void z_vlcf(   VectorRegister v1, VectorRegister v2);
+  inline void z_vlcg(   VectorRegister v1, VectorRegister v2);
+  inline void z_vlp(    VectorRegister v1, VectorRegister v2, int64_t m3);
+  inline void z_vlpb(   VectorRegister v1, VectorRegister v2);
+  inline void z_vlph(   VectorRegister v1, VectorRegister v2);
+  inline void z_vlpf(   VectorRegister v1, VectorRegister v2);
+  inline void z_vlpg(   VectorRegister v1, VectorRegister v2);
+
+  // ADD
+  inline void z_va(     VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vab(    VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vah(    VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vaf(    VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vag(    VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vaq(    VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vacc(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vaccb(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vacch(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vaccf(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vaccg(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vaccq(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  // SUB
+  inline void z_vs(     VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vsb(    VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vsh(    VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vsf(    VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vsg(    VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vsq(    VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vscbi(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vscbib( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vscbih( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vscbif( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vscbig( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vscbiq( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  // MULTIPLY
+  inline void z_vml(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vmh(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vmlh(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vme(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vmle(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vmo(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vmlo(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+
+  // MULTIPLY & ADD
+  inline void z_vmal(   VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5);
+  inline void z_vmah(   VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5);
+  inline void z_vmalh(  VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5);
+  inline void z_vmae(   VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5);
+  inline void z_vmale(  VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5);
+  inline void z_vmao(   VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5);
+  inline void z_vmalo(  VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5);
+
+  // VECTOR SUM
+  inline void z_vsum(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vsumb(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vsumh(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vsumg(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vsumgh( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vsumgf( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vsumq(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vsumqf( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vsumqg( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  // Average
+  inline void z_vavg(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vavgb(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vavgh(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vavgf(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vavgg(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vavgl(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vavglb( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vavglh( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vavglf( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vavglg( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  // VECTOR Galois Field Multiply Sum
+  inline void z_vgfm(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vgfmb(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vgfmh(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vgfmf(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vgfmg(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  // VECTOR Galois Field Multiply Sum and Accumulate
+  inline void z_vgfma(  VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5);
+  inline void z_vgfmab( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4);
+  inline void z_vgfmah( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4);
+  inline void z_vgfmaf( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4);
+  inline void z_vgfmag( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4);
+
+  //---<  Vector Logical Instructions  >---
+
+  // AND
+  inline void z_vn(     VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vnc(    VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  // XOR
+  inline void z_vx(     VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  // NOR
+  inline void z_vno(    VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  // OR
+  inline void z_vo(     VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  // Comparison (element-wise)
+  inline void z_vceq(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4, int64_t cc5);
+  inline void z_vceqb(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vceqh(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vceqf(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vceqg(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vceqbs( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vceqhs( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vceqfs( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vceqgs( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vch(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4, int64_t cc5);
+  inline void z_vchb(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchh(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchf(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchg(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchbs(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchhs(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchfs(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchgs(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchl(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4, int64_t cc5);
+  inline void z_vchlb(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchlh(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchlf(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchlg(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchlbs( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchlhs( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchlfs( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vchlgs( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  // Max/Min (element-wise)
+  inline void z_vmx(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vmxb(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmxh(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmxf(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmxg(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmxl(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vmxlb(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmxlh(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmxlf(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmxlg(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmn(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vmnb(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmnh(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmnf(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmng(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmnl(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4);
+  inline void z_vmnlb(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmnlh(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmnlf(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vmnlg(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  // Leading/Trailing Zeros, population count
+  inline void z_vclz(   VectorRegister v1, VectorRegister v2, int64_t m3);
+  inline void z_vclzb(  VectorRegister v1, VectorRegister v2);
+  inline void z_vclzh(  VectorRegister v1, VectorRegister v2);
+  inline void z_vclzf(  VectorRegister v1, VectorRegister v2);
+  inline void z_vclzg(  VectorRegister v1, VectorRegister v2);
+  inline void z_vctz(   VectorRegister v1, VectorRegister v2, int64_t m3);
+  inline void z_vctzb(  VectorRegister v1, VectorRegister v2);
+  inline void z_vctzh(  VectorRegister v1, VectorRegister v2);
+  inline void z_vctzf(  VectorRegister v1, VectorRegister v2);
+  inline void z_vctzg(  VectorRegister v1, VectorRegister v2);
+  inline void z_vpopct( VectorRegister v1, VectorRegister v2, int64_t m3);
+
+  // Rotate/Shift
+  inline void z_verllv( VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t m4);
+  inline void z_verllvb(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_verllvh(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_verllvf(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_verllvg(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_verll(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2,         int64_t m4);
+  inline void z_verllb( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_verllh( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_verllf( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_verllg( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_verim(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4, int64_t m5);
+  inline void z_verimb( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4);
+  inline void z_verimh( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4);
+  inline void z_verimf( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4);
+  inline void z_verimg( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4);
+
+  inline void z_veslv(  VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t m4);
+  inline void z_veslvb( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_veslvh( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_veslvf( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_veslvg( VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vesl(   VectorRegister v1, VectorRegister v3, int64_t d2, Register b2,         int64_t m4);
+  inline void z_veslb(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_veslh(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_veslf(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_veslg(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+
+  inline void z_vesrav( VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t m4);
+  inline void z_vesravb(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vesravh(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vesravf(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vesravg(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vesra(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2,         int64_t m4);
+  inline void z_vesrab( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_vesrah( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_vesraf( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_vesrag( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_vesrlv( VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t m4);
+  inline void z_vesrlvb(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vesrlvh(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vesrlvf(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vesrlvg(VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vesrl(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2,         int64_t m4);
+  inline void z_vesrlb( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_vesrlh( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_vesrlf( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+  inline void z_vesrlg( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2);
+
+  inline void z_vsl(    VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vslb(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vsldb(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4);
+
+  inline void z_vsra(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vsrab(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vsrl(   VectorRegister v1, VectorRegister v2, VectorRegister v3);
+  inline void z_vsrlb(  VectorRegister v1, VectorRegister v2, VectorRegister v3);
+
+  // Test under Mask
+  inline void z_vtm(    VectorRegister v1, VectorRegister v2);
+
+  //---<  Vector String Instructions  >---
+  inline void z_vfae(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4, int64_t cc5);   // Find any element
+  inline void z_vfaeb(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t cc5);
+  inline void z_vfaeh(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t cc5);
+  inline void z_vfaef(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t cc5);
+  inline void z_vfee(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4, int64_t cc5);   // Find element equal
+  inline void z_vfeeb(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t cc5);
+  inline void z_vfeeh(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t cc5);
+  inline void z_vfeef(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t cc5);
+  inline void z_vfene(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4, int64_t cc5);   // Find element not equal
+  inline void z_vfeneb( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t cc5);
+  inline void z_vfeneh( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t cc5);
+  inline void z_vfenef( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t cc5);
+  inline void z_vstrc(  VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t imm5, int64_t cc6);   // String range compare
+  inline void z_vstrcb( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t cc6);
+  inline void z_vstrch( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t cc6);
+  inline void z_vstrcf( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t cc6);
+  inline void z_vistr(  VectorRegister v1, VectorRegister v2, int64_t imm3, int64_t cc5);                      // Isolate String
+  inline void z_vistrb( VectorRegister v1, VectorRegister v2, int64_t cc5);
+  inline void z_vistrh( VectorRegister v1, VectorRegister v2, int64_t cc5);
+  inline void z_vistrf( VectorRegister v1, VectorRegister v2, int64_t cc5);
+  inline void z_vistrbs(VectorRegister v1, VectorRegister v2);
+  inline void z_vistrhs(VectorRegister v1, VectorRegister v2);
+  inline void z_vistrfs(VectorRegister v1, VectorRegister v2);
+
+
   // Floating point instructions
   // ===========================
 
--- a/src/hotspot/cpu/s390/assembler_s390.inline.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/s390/assembler_s390.inline.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2016 SAP SE. All rights reserved.
+ * Copyright (c) 2016, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2017 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -309,6 +309,9 @@
 inline void Assembler::z_lnr(  Register r1, Register r2) { emit_16( LNR_ZOPC   | regt( r1,  8, 16) | reg((r2 == noreg) ? r1:r2, 12, 16)); }
 inline void Assembler::z_lngr( Register r1, Register r2) { emit_32( LNGR_ZOPC  | regt( r1, 24, 32) | reg((r2 == noreg) ? r1:r2, 28, 32)); }
 inline void Assembler::z_lngfr(Register r1, Register r2) { emit_32( LNGFR_ZOPC | regt( r1, 24, 32) | reg((r2 == noreg) ? r1:r2, 28, 32)); }
+inline void Assembler::z_lpr(  Register r1, Register r2) { emit_16( LPR_ZOPC   | regt( r1,  8, 16) | reg((r2 == noreg) ? r1:r2, 12, 16)); }
+inline void Assembler::z_lpgr( Register r1, Register r2) { emit_32( LPGR_ZOPC  | regt( r1, 24, 32) | reg((r2 == noreg) ? r1:r2, 28, 32)); }
+inline void Assembler::z_lpgfr(Register r1, Register r2) { emit_32( LPGFR_ZOPC | regt( r1, 24, 32) | reg((r2 == noreg) ? r1:r2, 28, 32)); }
 
 inline void Assembler::z_lrvr( Register r1, Register r2) { emit_32( LRVR_ZOPC  | regt(r1, 24, 32) | reg(r2, 28, 32)); }
 inline void Assembler::z_lrvgr(Register r1, Register r2) { emit_32( LRVGR_ZOPC | regt(r1, 24, 32) | reg(r2, 28, 32)); }
@@ -702,6 +705,421 @@
 inline void Assembler::z_cvdg(Register r1, int64_t d2, Register x2, Register b2) { emit_48( CVDG_ZOPC | regt(r1, 8, 48) | reg(x2, 12, 48) | reg(b2, 16, 48) | simm20(d2)); }
 
 
+//---------------------------
+//--  Vector Instructions  --
+//---------------------------
+
+//---<  Vector Support Instructions  >---
+
+// Load (transfer from memory)
+inline void Assembler::z_vlm(    VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {emit_48(VLM_ZOPC   | vreg(v1,  8)     | vreg(v3, 12)     | rsmask_48(d2,     b2)); }
+inline void Assembler::z_vl(     VectorRegister v1, int64_t d2, Register x2, Register b2)             {emit_48(VL_ZOPC    | vreg(v1,  8)                        | rxmask_48(d2, x2, b2)); }
+inline void Assembler::z_vleb(   VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3) {emit_48(VLEB_ZOPC  | vreg(v1,  8)                        | rxmask_48(d2, x2, b2) | veix_mask(m3, VRET_BYTE, 32)); }
+inline void Assembler::z_vleh(   VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3) {emit_48(VLEH_ZOPC  | vreg(v1,  8)                        | rxmask_48(d2, x2, b2) | veix_mask(m3, VRET_HW,   32)); }
+inline void Assembler::z_vlef(   VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3) {emit_48(VLEF_ZOPC  | vreg(v1,  8)                        | rxmask_48(d2, x2, b2) | veix_mask(m3, VRET_FW,   32)); }
+inline void Assembler::z_vleg(   VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3) {emit_48(VLEG_ZOPC  | vreg(v1,  8)                        | rxmask_48(d2, x2, b2) | veix_mask(m3, VRET_DW,   32)); }
+
+// Gather/Scatter
+inline void Assembler::z_vgef(   VectorRegister v1, int64_t d2, VectorRegister vx2, Register b2, int64_t m3) {emit_48(VGEF_ZOPC  | vreg(v1,  8)                 | rvmask_48(d2, vx2, b2) | veix_mask(m3, VRET_FW,   32)); }
+inline void Assembler::z_vgeg(   VectorRegister v1, int64_t d2, VectorRegister vx2, Register b2, int64_t m3) {emit_48(VGEG_ZOPC  | vreg(v1,  8)                 | rvmask_48(d2, vx2, b2) | veix_mask(m3, VRET_DW,   32)); }
+
+inline void Assembler::z_vscef(  VectorRegister v1, int64_t d2, VectorRegister vx2, Register b2, int64_t m3) {emit_48(VSCEF_ZOPC | vreg(v1,  8)                 | rvmask_48(d2, vx2, b2) | veix_mask(m3, VRET_FW,   32)); }
+inline void Assembler::z_vsceg(  VectorRegister v1, int64_t d2, VectorRegister vx2, Register b2, int64_t m3) {emit_48(VSCEG_ZOPC | vreg(v1,  8)                 | rvmask_48(d2, vx2, b2) | veix_mask(m3, VRET_DW,   32)); }
+
+// load and replicate
+inline void Assembler::z_vlrep(  VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3) {emit_48(VLREP_ZOPC | vreg(v1,  8)                        | rxmask_48(d2, x2, b2) | vesc_mask(m3, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vlrepb( VectorRegister v1, int64_t d2, Register x2, Register b2)             {z_vlrep(v1, d2, x2, b2, VRET_BYTE); } // load byte and replicate to all vector elements of type 'B'
+inline void Assembler::z_vlreph( VectorRegister v1, int64_t d2, Register x2, Register b2)             {z_vlrep(v1, d2, x2, b2, VRET_HW); }  // load HW   and replicate to all vector elements of type 'H'
+inline void Assembler::z_vlrepf( VectorRegister v1, int64_t d2, Register x2, Register b2)             {z_vlrep(v1, d2, x2, b2, VRET_FW); }  // load FW   and replicate to all vector elements of type 'F'
+inline void Assembler::z_vlrepg( VectorRegister v1, int64_t d2, Register x2, Register b2)             {z_vlrep(v1, d2, x2, b2, VRET_DW); }  // load DW   and replicate to all vector elements of type 'G'
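+// Usage sketch: broadcast the fullword at d2(x2,b2) into all four FW elements
+// of v1 with z_vlrepf(v1, d2, x2, b2), equivalent to
+// z_vlrep(v1, d2, x2, b2, VRET_FW).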
+
+inline void Assembler::z_vllez(  VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3) {emit_48(VLLEZ_ZOPC | vreg(v1,  8)                        | rxmask_48(d2, x2, b2) | vesc_mask(m3, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vllezb( VectorRegister v1, int64_t d2, Register x2, Register b2)             {z_vllez(v1, d2, x2, b2, VRET_BYTE); }// load logical byte into left DW of VR, zero all other bit positions.
+inline void Assembler::z_vllezh( VectorRegister v1, int64_t d2, Register x2, Register b2)             {z_vllez(v1, d2, x2, b2, VRET_HW); }  // load logical HW   into left DW of VR, zero all other bit positions.
+inline void Assembler::z_vllezf( VectorRegister v1, int64_t d2, Register x2, Register b2)             {z_vllez(v1, d2, x2, b2, VRET_FW); }  // load logical FW   into left DW of VR, zero all other bit positions.
+inline void Assembler::z_vllezg( VectorRegister v1, int64_t d2, Register x2, Register b2)             {z_vllez(v1, d2, x2, b2, VRET_DW); }  // load logical DW   into left DW of VR, zero all other bit positions.
+
+inline void Assembler::z_vlbb(   VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3) {emit_48(VLBB_ZOPC  | vreg(v1,  8)                        | rxmask_48(d2, x2, b2) | uimm4(m3, 32, 48)); }
+inline void Assembler::z_vll(    VectorRegister v1, Register r3, int64_t d2, Register b2)             {emit_48(VLL_ZOPC   | vreg(v1,  8)     |  reg(r3, 12, 48) | rsmask_48(d2,     b2)); }
+
+// Load (register to register)
+inline void Assembler::z_vlr(    VectorRegister v1, VectorRegister v2)                                {emit_48(VLR_ZOPC   | vreg(v1,  8)     | vreg(v2, 12)); }
+
+inline void Assembler::z_vlgv(   Register r1, VectorRegister v3, int64_t d2, Register b2, int64_t m4) {emit_48(VLGV_ZOPC  |  reg(r1,  8, 48) | vreg(v3, 12)     | rsmask_48(d2,     b2) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vlgvb(  Register r1, VectorRegister v3, int64_t d2, Register b2)             {z_vlgv(r1, v3, d2, b2, VRET_BYTE); } // load byte from VR element (index d2(b2)) into GR (logical)
+inline void Assembler::z_vlgvh(  Register r1, VectorRegister v3, int64_t d2, Register b2)             {z_vlgv(r1, v3, d2, b2, VRET_HW); }   // load HW   from VR element (index d2(b2)) into GR (logical)
+inline void Assembler::z_vlgvf(  Register r1, VectorRegister v3, int64_t d2, Register b2)             {z_vlgv(r1, v3, d2, b2, VRET_FW); }   // load FW   from VR element (index d2(b2)) into GR (logical)
+inline void Assembler::z_vlgvg(  Register r1, VectorRegister v3, int64_t d2, Register b2)             {z_vlgv(r1, v3, d2, b2, VRET_DW); }   // load DW   from VR element (index d2(b2)) into GR.
+
+inline void Assembler::z_vlvg(   VectorRegister v1, Register r3, int64_t d2, Register b2, int64_t m4) {emit_48(VLVG_ZOPC  | vreg(v1,  8)     |  reg(r3, 12, 48) | rsmask_48(d2,     b2) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vlvgb(  VectorRegister v1, Register r3, int64_t d2, Register b2)             {z_vlvg(v1, r3, d2, b2, VRET_BYTE); }
+inline void Assembler::z_vlvgh(  VectorRegister v1, Register r3, int64_t d2, Register b2)             {z_vlvg(v1, r3, d2, b2, VRET_HW); }
+inline void Assembler::z_vlvgf(  VectorRegister v1, Register r3, int64_t d2, Register b2)             {z_vlvg(v1, r3, d2, b2, VRET_FW); }
+inline void Assembler::z_vlvgg(  VectorRegister v1, Register r3, int64_t d2, Register b2)             {z_vlvg(v1, r3, d2, b2, VRET_DW); }
+
+inline void Assembler::z_vlvgp(  VectorRegister v1, Register r2, Register r3)                         {emit_48(VLVGP_ZOPC | vreg(v1,  8)     |  reg(r2, 12, 48) |  reg(r3, 16, 48)); }
+
+// vector register pack
+inline void Assembler::z_vpk(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VPK_ZOPC   | vreg(v1,  8)     | vreg(v2, 12)     | vreg(v3, 16)     | vesc_mask(m4, VRET_HW, VRET_DW, 32)); }
+inline void Assembler::z_vpkh(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpk(v1, v2, v3, VRET_HW); }       // vector element type 'H'
+inline void Assembler::z_vpkf(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpk(v1, v2, v3, VRET_FW); }       // vector element type 'F'
+inline void Assembler::z_vpkg(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpk(v1, v2, v3, VRET_DW); }       // vector element type 'G'
+
+inline void Assembler::z_vpks(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4, int64_t cc5) {emit_48(VPKS_ZOPC  | vreg(v1,  8) |  vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_HW, VRET_DW, 32) | voprc_ccmask(cc5, 24)); }
+inline void Assembler::z_vpksh(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpks(v1, v2, v3, VRET_HW, VOPRC_CCIGN); }   // vector element type 'H', don't set CC
+inline void Assembler::z_vpksf(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpks(v1, v2, v3, VRET_FW, VOPRC_CCIGN); }   // vector element type 'F', don't set CC
+inline void Assembler::z_vpksg(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpks(v1, v2, v3, VRET_DW, VOPRC_CCIGN); }   // vector element type 'G', don't set CC
+inline void Assembler::z_vpkshs( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpks(v1, v2, v3, VRET_HW, VOPRC_CCSET); }   // vector element type 'H', set CC
+inline void Assembler::z_vpksfs( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpks(v1, v2, v3, VRET_FW, VOPRC_CCSET); }   // vector element type 'F', set CC
+inline void Assembler::z_vpksgs( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpks(v1, v2, v3, VRET_DW, VOPRC_CCSET); }   // vector element type 'G', set CC
+
+inline void Assembler::z_vpkls(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4, int64_t cc5) {emit_48(VPKLS_ZOPC | vreg(v1,  8) |  vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_HW, VRET_DW, 32) | voprc_ccmask(cc5, 24)); }
+inline void Assembler::z_vpklsh( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpkls(v1, v2, v3, VRET_HW, VOPRC_CCIGN); }  // vector element type 'H', don't set CC
+inline void Assembler::z_vpklsf( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpkls(v1, v2, v3, VRET_FW, VOPRC_CCIGN); }  // vector element type 'F', don't set CC
+inline void Assembler::z_vpklsg( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpkls(v1, v2, v3, VRET_DW, VOPRC_CCIGN); }  // vector element type 'G', don't set CC
+inline void Assembler::z_vpklshs(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpkls(v1, v2, v3, VRET_HW, VOPRC_CCSET); }  // vector element type 'H', set CC
+inline void Assembler::z_vpklsfs(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpkls(v1, v2, v3, VRET_FW, VOPRC_CCSET); }  // vector element type 'F', set CC
+inline void Assembler::z_vpklsgs(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vpkls(v1, v2, v3, VRET_DW, VOPRC_CCSET); }  // vector element type 'G', set CC
+
+// vector register unpack (sign-extended)
+inline void Assembler::z_vuph(   VectorRegister v1, VectorRegister v2, int64_t m3)                    {emit_48(VUPH_ZOPC  | vreg(v1,  8)     | vreg(v2, 12)     | vesc_mask(m3, VRET_BYTE, VRET_FW, 32)); }
+inline void Assembler::z_vuphb(  VectorRegister v1, VectorRegister v2)                                {z_vuph(v1, v2, VRET_BYTE); }        // vector element type 'B'
+inline void Assembler::z_vuphh(  VectorRegister v1, VectorRegister v2)                                {z_vuph(v1, v2, VRET_HW); }          // vector element type 'H'
+inline void Assembler::z_vuphf(  VectorRegister v1, VectorRegister v2)                                {z_vuph(v1, v2, VRET_FW); }          // vector element type 'F'
+inline void Assembler::z_vupl(   VectorRegister v1, VectorRegister v2, int64_t m3)                    {emit_48(VUPL_ZOPC  | vreg(v1,  8)     | vreg(v2, 12)     | vesc_mask(m3, VRET_BYTE, VRET_FW, 32)); }
+inline void Assembler::z_vuplb(  VectorRegister v1, VectorRegister v2)                                {z_vupl(v1, v2, VRET_BYTE); }        // vector element type 'B'
+inline void Assembler::z_vuplh(  VectorRegister v1, VectorRegister v2)                                {z_vupl(v1, v2, VRET_HW); }          // vector element type 'H'
+inline void Assembler::z_vuplf(  VectorRegister v1, VectorRegister v2)                                {z_vupl(v1, v2, VRET_FW); }          // vector element type 'F'
+
+// vector register unpack (zero-extended)
+inline void Assembler::z_vuplh(  VectorRegister v1, VectorRegister v2, int64_t m3)                    {emit_48(VUPLH_ZOPC | vreg(v1,  8)     | vreg(v2, 12)     | vesc_mask(m3, VRET_BYTE, VRET_FW, 32)); }
+inline void Assembler::z_vuplhb( VectorRegister v1, VectorRegister v2)                                {z_vuplh(v1, v2, VRET_BYTE); }       // vector element type 'B'
+inline void Assembler::z_vuplhh( VectorRegister v1, VectorRegister v2)                                {z_vuplh(v1, v2, VRET_HW); }         // vector element type 'H'
+inline void Assembler::z_vuplhf( VectorRegister v1, VectorRegister v2)                                {z_vuplh(v1, v2, VRET_FW); }         // vector element type 'F'
+inline void Assembler::z_vupll(  VectorRegister v1, VectorRegister v2, int64_t m3)                    {emit_48(VUPLL_ZOPC | vreg(v1,  8)     | vreg(v2, 12)     | vesc_mask(m3, VRET_BYTE, VRET_FW, 32)); }
+inline void Assembler::z_vupllb( VectorRegister v1, VectorRegister v2)                                {z_vupll(v1, v2, VRET_BYTE); }       // vector element type 'B'
+inline void Assembler::z_vupllh( VectorRegister v1, VectorRegister v2)                                {z_vupll(v1, v2, VRET_HW); }         // vector element type 'H'
+inline void Assembler::z_vupllf( VectorRegister v1, VectorRegister v2)                                {z_vupll(v1, v2, VRET_FW); }         // vector element type 'F'
+
+// vector register merge high/low
+inline void Assembler::z_vmrh(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMRH_ZOPC  | vreg(v1,  8)     | vreg(v2, 12)     | vreg(v3, 16)     | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vmrhb(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmrh(v1, v2, v3, VRET_BYTE); }    // vector element type 'B'
+inline void Assembler::z_vmrhh(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmrh(v1, v2, v3, VRET_HW); }      // vector element type 'H'
+inline void Assembler::z_vmrhf(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmrh(v1, v2, v3, VRET_FW); }      // vector element type 'F'
+inline void Assembler::z_vmrhg(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmrh(v1, v2, v3, VRET_DW); }      // vector element type 'G'
+
+inline void Assembler::z_vmrl(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMRL_ZOPC  | vreg(v1,  8)     | vreg(v2, 12)     | vreg(v3, 16)     | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vmrlb(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmrl(v1, v2, v3, VRET_BYTE); }    // vector element type 'B'
+inline void Assembler::z_vmrlh(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmrl(v1, v2, v3, VRET_HW); }      // vector element type 'H'
+inline void Assembler::z_vmrlf(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmrl(v1, v2, v3, VRET_FW); }      // vector element type 'F'
+inline void Assembler::z_vmrlg(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmrl(v1, v2, v3, VRET_DW); }      // vector element type 'G'
+
+// vector register permute
+inline void Assembler::z_vperm(  VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4) {emit_48(VPERM_ZOPC | vreg(v1,  8) |  vreg(v2, 12) | vreg(v3, 16) | vreg(v4, 32)); }
+inline void Assembler::z_vpdi(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t        m4) {emit_48(VPDI_ZOPC  | vreg(v1,  8) |  vreg(v2, 12) | vreg(v3, 16) | uimm4(m4, 32, 48)); }
+
+// vector register replicate
+inline void Assembler::z_vrep(   VectorRegister v1, VectorRegister v3, int64_t imm2, int64_t m4)      {emit_48(VREP_ZOPC  | vreg(v1,  8)     | vreg(v3, 12)     | simm16(imm2, 16, 48) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vrepb(  VectorRegister v1, VectorRegister v3, int64_t imm2)                  {z_vrep(v1, v3, imm2, VRET_BYTE); }  // vector element type 'B'
+inline void Assembler::z_vreph(  VectorRegister v1, VectorRegister v3, int64_t imm2)                  {z_vrep(v1, v3, imm2, VRET_HW); }    // vector element type 'H'
+inline void Assembler::z_vrepf(  VectorRegister v1, VectorRegister v3, int64_t imm2)                  {z_vrep(v1, v3, imm2, VRET_FW); }    // vector element type 'F'
+inline void Assembler::z_vrepg(  VectorRegister v1, VectorRegister v3, int64_t imm2)                  {z_vrep(v1, v3, imm2, VRET_DW); }    // vector element type 'G'
+inline void Assembler::z_vrepi(  VectorRegister v1, int64_t imm2,      int64_t m3)                    {emit_48(VREPI_ZOPC | vreg(v1,  8)                        | simm16(imm2, 16, 48) | vesc_mask(m3, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vrepib( VectorRegister v1, int64_t imm2)                                     {z_vrepi(v1, imm2, VRET_BYTE); }     // vector element type 'B'
+inline void Assembler::z_vrepih( VectorRegister v1, int64_t imm2)                                     {z_vrepi(v1, imm2, VRET_HW); }       // vector element type 'H'
+inline void Assembler::z_vrepif( VectorRegister v1, int64_t imm2)                                     {z_vrepi(v1, imm2, VRET_FW); }       // vector element type 'F'
+inline void Assembler::z_vrepig( VectorRegister v1, int64_t imm2)                                     {z_vrepi(v1, imm2, VRET_DW); }       // vector element type 'G'
+
+inline void Assembler::z_vsel(   VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4) {emit_48(VSEL_ZOPC  | vreg(v1,  8) |  vreg(v2, 12) |  vreg(v3, 16) |  vreg(v4, 32)); }
+inline void Assembler::z_vseg(   VectorRegister v1, VectorRegister v2, int64_t m3)                    {emit_48(VSEG_ZOPC  | vreg(v1,  8)     | vreg(v2, 12)     | uimm4(m3, 32, 48)); }
+
+// Load (immediate)
+inline void Assembler::z_vleib(  VectorRegister v1, int64_t imm2, int64_t m3)                         {emit_48(VLEIB_ZOPC | vreg(v1,  8)                        | simm16(imm2, 32, 48)  | veix_mask(m3, VRET_BYTE, 32)); }
+inline void Assembler::z_vleih(  VectorRegister v1, int64_t imm2, int64_t m3)                         {emit_48(VLEIH_ZOPC | vreg(v1,  8)                        | simm16(imm2, 32, 48)  | veix_mask(m3, VRET_HW,   32)); }
+inline void Assembler::z_vleif(  VectorRegister v1, int64_t imm2, int64_t m3)                         {emit_48(VLEIF_ZOPC | vreg(v1,  8)                        | simm16(imm2, 32, 48)  | veix_mask(m3, VRET_FW,   32)); }
+inline void Assembler::z_vleig(  VectorRegister v1, int64_t imm2, int64_t m3)                         {emit_48(VLEIG_ZOPC | vreg(v1,  8)                        | simm16(imm2, 32, 48)  | veix_mask(m3, VRET_DW,   32)); }
+
+// Store
+inline void Assembler::z_vstm(   VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {emit_48(VSTM_ZOPC  | vreg(v1,  8)     | vreg(v3, 12)     | rsmask_48(d2,     b2)); }
+inline void Assembler::z_vst(    VectorRegister v1, int64_t d2, Register x2, Register b2)             {emit_48(VST_ZOPC   | vreg(v1,  8)                        | rxmask_48(d2, x2, b2)); }
+inline void Assembler::z_vsteb(  VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3) {emit_48(VSTEB_ZOPC | vreg(v1,  8)                        | rxmask_48(d2, x2, b2) | veix_mask(m3, VRET_BYTE, 32)); }
+inline void Assembler::z_vsteh(  VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3) {emit_48(VSTEH_ZOPC | vreg(v1,  8)                        | rxmask_48(d2, x2, b2) | veix_mask(m3, VRET_HW,   32)); }
+inline void Assembler::z_vstef(  VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3) {emit_48(VSTEF_ZOPC | vreg(v1,  8)                        | rxmask_48(d2, x2, b2) | veix_mask(m3, VRET_FW,   32)); }
+inline void Assembler::z_vsteg(  VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3) {emit_48(VSTEG_ZOPC | vreg(v1,  8)                        | rxmask_48(d2, x2, b2) | veix_mask(m3, VRET_DW,   32)); }
+inline void Assembler::z_vstl(   VectorRegister v1, Register r3, int64_t d2, Register b2)             {emit_48(VSTL_ZOPC  | vreg(v1,  8)     |  reg(r3, 12, 48) | rsmask_48(d2,     b2)); }
+
+// Misc
+inline void Assembler::z_vgm(    VectorRegister v1, int64_t imm2, int64_t imm3, int64_t m4)           {emit_48(VGM_ZOPC   | vreg(v1,  8)     | uimm8( imm2, 16, 48) | uimm8(imm3, 24, 48) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vgmb(   VectorRegister v1, int64_t imm2, int64_t imm3)                       {z_vgm(v1, imm2, imm3, VRET_BYTE); } // vector element type 'B'
+inline void Assembler::z_vgmh(   VectorRegister v1, int64_t imm2, int64_t imm3)                       {z_vgm(v1, imm2, imm3, VRET_HW); }   // vector element type 'H'
+inline void Assembler::z_vgmf(   VectorRegister v1, int64_t imm2, int64_t imm3)                       {z_vgm(v1, imm2, imm3, VRET_FW); }   // vector element type 'F'
+inline void Assembler::z_vgmg(   VectorRegister v1, int64_t imm2, int64_t imm3)                       {z_vgm(v1, imm2, imm3, VRET_DW); }   // vector element type 'G'
+
+inline void Assembler::z_vgbm(   VectorRegister v1, int64_t imm2)                                     {emit_48(VGBM_ZOPC  | vreg(v1,  8)     | uimm16(imm2, 16, 48)); }
+inline void Assembler::z_vzero(  VectorRegister v1)                                                   {z_vgbm(v1, 0); }      // preferred method to set vreg to all zeroes
+inline void Assembler::z_vone(   VectorRegister v1)                                                   {z_vgbm(v1, 0xffff); } // preferred method to set vreg to all ones
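+// Note: both helpers expand to VGBM, where each of the 16 immediate bits
+// selects an all-zeros (0) or all-ones (1) byte of v1; 0x0000 thus clears the
+// register and 0xffff sets every bit.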
+
+//---<  Vector Arithmetic Instructions  >---
+
+// Load
+inline void Assembler::z_vlc(    VectorRegister v1, VectorRegister v2, int64_t m3)                    {emit_48(VLC_ZOPC   | vreg(v1,  8) | vreg(v2, 12) | vesc_mask(m3, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vlcb(   VectorRegister v1, VectorRegister v2)                                {z_vlc(v1, v2, VRET_BYTE); }         // vector element type 'B'
+inline void Assembler::z_vlch(   VectorRegister v1, VectorRegister v2)                                {z_vlc(v1, v2, VRET_HW); }           // vector element type 'H'
+inline void Assembler::z_vlcf(   VectorRegister v1, VectorRegister v2)                                {z_vlc(v1, v2, VRET_FW); }           // vector element type 'F'
+inline void Assembler::z_vlcg(   VectorRegister v1, VectorRegister v2)                                {z_vlc(v1, v2, VRET_DW); }           // vector element type 'G'
+inline void Assembler::z_vlp(    VectorRegister v1, VectorRegister v2, int64_t m3)                    {emit_48(VLP_ZOPC   | vreg(v1,  8) | vreg(v2, 12) | vesc_mask(m3, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vlpb(   VectorRegister v1, VectorRegister v2)                                {z_vlp(v1, v2, VRET_BYTE); }         // vector element type 'B'
+inline void Assembler::z_vlph(   VectorRegister v1, VectorRegister v2)                                {z_vlp(v1, v2, VRET_HW); }           // vector element type 'H'
+inline void Assembler::z_vlpf(   VectorRegister v1, VectorRegister v2)                                {z_vlp(v1, v2, VRET_FW); }           // vector element type 'F'
+inline void Assembler::z_vlpg(   VectorRegister v1, VectorRegister v2)                                {z_vlp(v1, v2, VRET_DW); }           // vector element type 'G'
+
+// ADD
+inline void Assembler::z_va(     VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VA_ZOPC    | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_QW, 32)); }
+inline void Assembler::z_vab(    VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_va(v1, v2, v3, VRET_BYTE); }      // vector element type 'B'
+inline void Assembler::z_vah(    VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_va(v1, v2, v3, VRET_HW); }        // vector element type 'H'
+inline void Assembler::z_vaf(    VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_va(v1, v2, v3, VRET_FW); }        // vector element type 'F'
+inline void Assembler::z_vag(    VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_va(v1, v2, v3, VRET_DW); }        // vector element type 'G'
+inline void Assembler::z_vaq(    VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_va(v1, v2, v3, VRET_QW); }        // vector element type 'Q'
+inline void Assembler::z_vacc(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VACC_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_QW, 32)); }
+inline void Assembler::z_vaccb(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vacc(v1, v2, v3, VRET_BYTE); }    // vector element type 'B'
+inline void Assembler::z_vacch(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vacc(v1, v2, v3, VRET_HW); }      // vector element type 'H'
+inline void Assembler::z_vaccf(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vacc(v1, v2, v3, VRET_FW); }      // vector element type 'F'
+inline void Assembler::z_vaccg(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vacc(v1, v2, v3, VRET_DW); }      // vector element type 'G'
+inline void Assembler::z_vaccq(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vacc(v1, v2, v3, VRET_QW); }      // vector element type 'Q'
+
+// SUB
+inline void Assembler::z_vs(     VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VS_ZOPC    | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_QW, 32)); }
+inline void Assembler::z_vsb(    VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vs(v1, v2, v3, VRET_BYTE); }      // vector element type 'B'
+inline void Assembler::z_vsh(    VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vs(v1, v2, v3, VRET_HW); }        // vector element type 'H'
+inline void Assembler::z_vsf(    VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vs(v1, v2, v3, VRET_FW); }        // vector element type 'F'
+inline void Assembler::z_vsg(    VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vs(v1, v2, v3, VRET_DW); }        // vector element type 'G'
+inline void Assembler::z_vsq(    VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vs(v1, v2, v3, VRET_QW); }        // vector element type 'Q'
+inline void Assembler::z_vscbi(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VSCBI_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_QW, 32)); }
+inline void Assembler::z_vscbib( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vscbi(v1, v2, v3, VRET_BYTE); }   // vector element type 'B'
+inline void Assembler::z_vscbih( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vscbi(v1, v2, v3, VRET_HW); }     // vector element type 'H'
+inline void Assembler::z_vscbif( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vscbi(v1, v2, v3, VRET_FW); }     // vector element type 'F'
+inline void Assembler::z_vscbig( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vscbi(v1, v2, v3, VRET_DW); }     // vector element type 'G'
+inline void Assembler::z_vscbiq( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vscbi(v1, v2, v3, VRET_QW); }     // vector element type 'Q'
+
+// MULTIPLY
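+// Informal overview: VML keeps the low half of each element product, while
+// VMH/VMLH keep the signed/unsigned high half; VME/VMLE and VMO/VMLO multiply
+// the even/odd indexed elements into double-width results. See the
+// z/Architecture Principles of Operation for the authoritative definitions.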
+inline void Assembler::z_vml(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VML_ZOPC   | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_FW, 32)); }
+inline void Assembler::z_vmh(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMH_ZOPC   | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_FW, 32)); }
+inline void Assembler::z_vmlh(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMLH_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_FW, 32)); }
+inline void Assembler::z_vme(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VME_ZOPC   | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_FW, 32)); }
+inline void Assembler::z_vmle(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMLE_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_FW, 32)); }
+inline void Assembler::z_vmo(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMO_ZOPC   | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_FW, 32)); }
+inline void Assembler::z_vmlo(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMLO_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_FW, 32)); }
+
+// MULTIPLY & ADD
+inline void Assembler::z_vmal(   VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5) {emit_48(VMAL_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vreg(v4, 32) | vesc_mask(m5, VRET_BYTE, VRET_FW, 20)); }
+inline void Assembler::z_vmah(   VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5) {emit_48(VMAH_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vreg(v4, 32) | vesc_mask(m5, VRET_BYTE, VRET_FW, 20)); }
+inline void Assembler::z_vmalh(  VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5) {emit_48(VMALH_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vreg(v4, 32) | vesc_mask(m5, VRET_BYTE, VRET_FW, 20)); }
+inline void Assembler::z_vmae(   VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5) {emit_48(VMAE_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vreg(v4, 32) | vesc_mask(m5, VRET_BYTE, VRET_FW, 20)); }
+inline void Assembler::z_vmale(  VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5) {emit_48(VMALE_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vreg(v4, 32) | vesc_mask(m5, VRET_BYTE, VRET_FW, 20)); }
+inline void Assembler::z_vmao(   VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5) {emit_48(VMAO_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vreg(v4, 32) | vesc_mask(m5, VRET_BYTE, VRET_FW, 20)); }
+inline void Assembler::z_vmalo(  VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5) {emit_48(VMALO_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vreg(v4, 32) | vesc_mask(m5, VRET_BYTE, VRET_FW, 20)); }
+
+// VECTOR SUM
+inline void Assembler::z_vsum(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VSUM_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_HW, 32)); }
+inline void Assembler::z_vsumb(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vsum(v1, v2, v3, VRET_BYTE); }    // vector element type 'B'
+inline void Assembler::z_vsumh(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vsum(v1, v2, v3, VRET_HW); }      // vector element type 'H'
+inline void Assembler::z_vsumg(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VSUMG_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_HW,   VRET_FW, 32)); }
+inline void Assembler::z_vsumgh( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vsumg(v1, v2, v3, VRET_HW); }     // vector element type 'H'
+inline void Assembler::z_vsumgf( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vsumg(v1, v2, v3, VRET_FW); }     // vector element type 'F'
+inline void Assembler::z_vsumq(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VSUMQ_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_FW,   VRET_DW, 32)); }
+inline void Assembler::z_vsumqf( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vsumq(v1, v2, v3, VRET_FW); }     // vector element type 'F'
+inline void Assembler::z_vsumqg( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vsumq(v1, v2, v3, VRET_DW); }     // vector element type 'G'
+
+// Average
+inline void Assembler::z_vavg(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VAVG_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vavgb(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vavg(v1, v2, v3, VRET_BYTE); }    // vector element type 'B'
+inline void Assembler::z_vavgh(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vavg(v1, v2, v3, VRET_HW); }      // vector element type 'H'
+inline void Assembler::z_vavgf(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vavg(v1, v2, v3, VRET_FW); }      // vector element type 'F'
+inline void Assembler::z_vavgg(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vavg(v1, v2, v3, VRET_DW); }      // vector element type 'G'
+inline void Assembler::z_vavgl(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VAVGL_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vavglb( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vavgl(v1, v2, v3, VRET_BYTE); }   // vector element type 'B'
+inline void Assembler::z_vavglh( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vavgl(v1, v2, v3, VRET_HW); }     // vector element type 'H'
+inline void Assembler::z_vavglf( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vavgl(v1, v2, v3, VRET_FW); }     // vector element type 'F'
+inline void Assembler::z_vavglg( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vavgl(v1, v2, v3, VRET_DW); }     // vector element type 'G'
+
+// VECTOR Galois Field Multiply Sum
+inline void Assembler::z_vgfm(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VGFM_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vgfmb(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vgfm(v1, v2, v3, VRET_BYTE); }    // vector element type 'B'
+inline void Assembler::z_vgfmh(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vgfm(v1, v2, v3, VRET_HW); }      // vector element type 'H'
+inline void Assembler::z_vgfmf(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vgfm(v1, v2, v3, VRET_FW); }      // vector element type 'F'
+inline void Assembler::z_vgfmg(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vgfm(v1, v2, v3, VRET_DW); }      // vector element type 'G'
+inline void Assembler::z_vgfma(  VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5) {emit_48(VGFMA_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vreg(v4, 32) | vesc_mask(m5, VRET_BYTE, VRET_DW, 20)); }
+inline void Assembler::z_vgfmab( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4) {z_vgfma(v1, v2, v3, v4, VRET_BYTE); } // vector element type 'B'
+inline void Assembler::z_vgfmah( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4) {z_vgfma(v1, v2, v3, v4, VRET_HW); }   // vector element type 'H'
+inline void Assembler::z_vgfmaf( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4) {z_vgfma(v1, v2, v3, v4, VRET_FW); }   // vector element type 'F'
+inline void Assembler::z_vgfmag( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4) {z_vgfma(v1, v2, v3, v4, VRET_DW); }   // vector element type 'G'
+
+//---<  Vector Logical Instructions  >---
+
+// AND
+inline void Assembler::z_vn(     VectorRegister v1, VectorRegister v2, VectorRegister v3)             {emit_48(VN_ZOPC    | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)); }
+inline void Assembler::z_vnc(    VectorRegister v1, VectorRegister v2, VectorRegister v3)             {emit_48(VNC_ZOPC   | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)); }
+
+// XOR
+inline void Assembler::z_vx(     VectorRegister v1, VectorRegister v2, VectorRegister v3)             {emit_48(VX_ZOPC    | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)); }
+
+// NOR
+inline void Assembler::z_vno(    VectorRegister v1, VectorRegister v2, VectorRegister v3)             {emit_48(VNO_ZOPC   | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)); }
+
+// OR
+inline void Assembler::z_vo(     VectorRegister v1, VectorRegister v2, VectorRegister v3)             {emit_48(VO_ZOPC    | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)); }
+
+// Comparison (element-wise)
+inline void Assembler::z_vceq(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4, int64_t cc5) {emit_48(VCEQ_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32) | voprc_ccmask(cc5, 24)); }
+inline void Assembler::z_vceqb(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vceq(v1, v2, v3, VRET_BYTE, VOPRC_CCIGN); } // vector element type 'B', don't set CC
+inline void Assembler::z_vceqh(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vceq(v1, v2, v3, VRET_HW,   VOPRC_CCIGN); } // vector element type 'H', don't set CC
+inline void Assembler::z_vceqf(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vceq(v1, v2, v3, VRET_FW,   VOPRC_CCIGN); } // vector element type 'F', don't set CC
+inline void Assembler::z_vceqg(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vceq(v1, v2, v3, VRET_DW,   VOPRC_CCIGN); } // vector element type 'G', don't set CC
+inline void Assembler::z_vceqbs( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vceq(v1, v2, v3, VRET_BYTE, VOPRC_CCSET); } // vector element type 'B', set CC
+inline void Assembler::z_vceqhs( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vceq(v1, v2, v3, VRET_HW,   VOPRC_CCSET); } // vector element type 'H', set CC
+inline void Assembler::z_vceqfs( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vceq(v1, v2, v3, VRET_FW,   VOPRC_CCSET); } // vector element type 'F', set CC
+inline void Assembler::z_vceqgs( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vceq(v1, v2, v3, VRET_DW,   VOPRC_CCSET); } // vector element type 'G', set CC
+inline void Assembler::z_vch(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4, int64_t cc5) {emit_48(VCH_ZOPC   | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32) | voprc_ccmask(cc5, 24)); }
+inline void Assembler::z_vchb(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vch(v1, v2, v3, VRET_BYTE,  VOPRC_CCIGN); }  // vector element type 'B', don't set CC
+inline void Assembler::z_vchh(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vch(v1, v2, v3, VRET_HW,    VOPRC_CCIGN); }  // vector element type 'H', don't set CC
+inline void Assembler::z_vchf(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vch(v1, v2, v3, VRET_FW,    VOPRC_CCIGN); }  // vector element type 'F', don't set CC
+inline void Assembler::z_vchg(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vch(v1, v2, v3, VRET_DW,    VOPRC_CCIGN); }  // vector element type 'G', don't set CC
+inline void Assembler::z_vchbs(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vch(v1, v2, v3, VRET_BYTE,  VOPRC_CCSET); }  // vector element type 'B', set CC
+inline void Assembler::z_vchhs(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vch(v1, v2, v3, VRET_HW,    VOPRC_CCSET); }  // vector element type 'H', set CC
+inline void Assembler::z_vchfs(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vch(v1, v2, v3, VRET_FW,    VOPRC_CCSET); }  // vector element type 'F', set CC
+inline void Assembler::z_vchgs(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vch(v1, v2, v3, VRET_DW,    VOPRC_CCSET); }  // vector element type 'G', set CC
+inline void Assembler::z_vchl(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4, int64_t cc5) {emit_48(VCHL_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32) | voprc_ccmask(cc5, 24)); }
+inline void Assembler::z_vchlb(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vchl(v1, v2, v3, VRET_BYTE, VOPRC_CCIGN); }  // vector element type 'B', don't set CC
+inline void Assembler::z_vchlh(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vchl(v1, v2, v3, VRET_HW,   VOPRC_CCIGN); }  // vector element type 'H', don't set CC
+inline void Assembler::z_vchlf(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vchl(v1, v2, v3, VRET_FW,   VOPRC_CCIGN); }  // vector element type 'F', don't set CC
+inline void Assembler::z_vchlg(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vchl(v1, v2, v3, VRET_DW,   VOPRC_CCIGN); }  // vector element type 'G', don't set CC
+inline void Assembler::z_vchlbs( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vchl(v1, v2, v3, VRET_BYTE, VOPRC_CCSET); }  // vector element type 'B', set CC
+inline void Assembler::z_vchlhs( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vchl(v1, v2, v3, VRET_HW,   VOPRC_CCSET); }  // vector element type 'H', set CC
+inline void Assembler::z_vchlfs( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vchl(v1, v2, v3, VRET_FW,   VOPRC_CCSET); }  // vector element type 'F', set CC
+inline void Assembler::z_vchlgs( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vchl(v1, v2, v3, VRET_DW,   VOPRC_CCSET); }  // vector element type 'G', set CC
+
+// Max/Min (element-wise)
+inline void Assembler::z_vmx(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMX_ZOPC   | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vmxb(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmx(v1, v2, v3, VRET_BYTE); }     // vector element type 'B'
+inline void Assembler::z_vmxh(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmx(v1, v2, v3, VRET_HW); }       // vector element type 'H'
+inline void Assembler::z_vmxf(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmx(v1, v2, v3, VRET_FW); }       // vector element type 'F'
+inline void Assembler::z_vmxg(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmx(v1, v2, v3, VRET_DW); }       // vector element type 'G'
+inline void Assembler::z_vmxl(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMXL_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vmxlb(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmxl(v1, v2, v3, VRET_BYTE); }    // vector element type 'B'
+inline void Assembler::z_vmxlh(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmxl(v1, v2, v3, VRET_HW); }      // vector element type 'H'
+inline void Assembler::z_vmxlf(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmxl(v1, v2, v3, VRET_FW); }      // vector element type 'F'
+inline void Assembler::z_vmxlg(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmxl(v1, v2, v3, VRET_DW); }      // vector element type 'G'
+inline void Assembler::z_vmn(    VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMN_ZOPC   | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vmnb(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmn(v1, v2, v3, VRET_BYTE); }     // vector element type 'B'
+inline void Assembler::z_vmnh(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmn(v1, v2, v3, VRET_HW); }       // vector element type 'H'
+inline void Assembler::z_vmnf(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmn(v1, v2, v3, VRET_FW); }       // vector element type 'F'
+inline void Assembler::z_vmng(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmn(v1, v2, v3, VRET_DW); }       // vector element type 'G'
+inline void Assembler::z_vmnl(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMNL_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vmnlb(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmnl(v1, v2, v3, VRET_BYTE); }    // vector element type 'B'
+inline void Assembler::z_vmnlh(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmnl(v1, v2, v3, VRET_HW); }      // vector element type 'H'
+inline void Assembler::z_vmnlf(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmnl(v1, v2, v3, VRET_FW); }      // vector element type 'F'
+inline void Assembler::z_vmnlg(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vmnl(v1, v2, v3, VRET_DW); }      // vector element type 'G'
+
+// Leading/Trailing Zeros, population count
+inline void Assembler::z_vclz(   VectorRegister v1, VectorRegister v2, int64_t m3)                    {emit_48(VCLZ_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vesc_mask(m3, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vclzb(  VectorRegister v1, VectorRegister v2)                                {z_vclz(v1, v2, VRET_BYTE); }        // vector element type 'B'
+inline void Assembler::z_vclzh(  VectorRegister v1, VectorRegister v2)                                {z_vclz(v1, v2, VRET_HW); }          // vector element type 'H'
+inline void Assembler::z_vclzf(  VectorRegister v1, VectorRegister v2)                                {z_vclz(v1, v2, VRET_FW); }          // vector element type 'F'
+inline void Assembler::z_vclzg(  VectorRegister v1, VectorRegister v2)                                {z_vclz(v1, v2, VRET_DW); }          // vector element type 'G'
+inline void Assembler::z_vctz(   VectorRegister v1, VectorRegister v2, int64_t m3)                    {emit_48(VCTZ_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vesc_mask(m3, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vctzb(  VectorRegister v1, VectorRegister v2)                                {z_vctz(v1, v2, VRET_BYTE); }        // vector element type 'B'
+inline void Assembler::z_vctzh(  VectorRegister v1, VectorRegister v2)                                {z_vctz(v1, v2, VRET_HW); }          // vector element type 'H'
+inline void Assembler::z_vctzf(  VectorRegister v1, VectorRegister v2)                                {z_vctz(v1, v2, VRET_FW); }          // vector element type 'F'
+inline void Assembler::z_vctzg(  VectorRegister v1, VectorRegister v2)                                {z_vctz(v1, v2, VRET_DW); }          // vector element type 'G'
+inline void Assembler::z_vpopct( VectorRegister v1, VectorRegister v2, int64_t m3)                    {emit_48(VPOPCT_ZOPC| vreg(v1,  8) | vreg(v2, 12) | vesc_mask(m3, VRET_BYTE, VRET_DW, 32)); }
+
+// Rotate/Shift
+inline void Assembler::z_verllv( VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t m4) {emit_48(VERLLV_ZOPC| vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)      | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_verllvb(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_verllv(v1, v2, v3, VRET_BYTE); }  // vector element type 'B'
+inline void Assembler::z_verllvh(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_verllv(v1, v2, v3, VRET_HW); }    // vector element type 'H'
+inline void Assembler::z_verllvf(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_verllv(v1, v2, v3, VRET_FW); }    // vector element type 'F'
+inline void Assembler::z_verllvg(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_verllv(v1, v2, v3, VRET_DW); }    // vector element type 'G'
+inline void Assembler::z_verll(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2,         int64_t m4) {emit_48(VERLL_ZOPC | vreg(v1,  8) | vreg(v3, 12) | rsmask_48(d2, b2) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_verllb( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_verll(v1, v3, d2, b2, VRET_BYTE);}// vector element type 'B'
+inline void Assembler::z_verllh( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_verll(v1, v3, d2, b2, VRET_HW);}  // vector element type 'H'
+inline void Assembler::z_verllf( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_verll(v1, v3, d2, b2, VRET_FW);}  // vector element type 'F'
+inline void Assembler::z_verllg( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_verll(v1, v3, d2, b2, VRET_DW);}  // vector element type 'G'
+inline void Assembler::z_verim(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4, int64_t m5) {emit_48(VERIM_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)      | uimm8(imm4, 24, 48) | vesc_mask(m5, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_verimb( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4) {z_verim(v1, v2, v3, imm4, VRET_BYTE); }   // vector element type 'B'
+inline void Assembler::z_verimh( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4) {z_verim(v1, v2, v3, imm4, VRET_HW); }     // vector element type 'H'
+inline void Assembler::z_verimf( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4) {z_verim(v1, v2, v3, imm4, VRET_FW); }     // vector element type 'F'
+inline void Assembler::z_verimg( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4) {z_verim(v1, v2, v3, imm4, VRET_DW); }     // vector element type 'G'
+
+inline void Assembler::z_veslv(  VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t m4) {emit_48(VESLV_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)      | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_veslvb( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_veslv(v1, v2, v3, VRET_BYTE); }   // vector element type 'B'
+inline void Assembler::z_veslvh( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_veslv(v1, v2, v3, VRET_HW); }     // vector element type 'H'
+inline void Assembler::z_veslvf( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_veslv(v1, v2, v3, VRET_FW); }     // vector element type 'F'
+inline void Assembler::z_veslvg( VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_veslv(v1, v2, v3, VRET_DW); }     // vector element type 'G'
+inline void Assembler::z_vesl(   VectorRegister v1, VectorRegister v3, int64_t d2, Register b2,         int64_t m4) {emit_48(VESL_ZOPC  | vreg(v1,  8) | vreg(v3, 12) | rsmask_48(d2, b2) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_veslb(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_vesl(v1, v3, d2, b2, VRET_BYTE);} // vector element type 'B'
+inline void Assembler::z_veslh(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_vesl(v1, v3, d2, b2, VRET_HW);}   // vector element type 'H'
+inline void Assembler::z_veslf(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_vesl(v1, v3, d2, b2, VRET_FW);}   // vector element type 'F'
+inline void Assembler::z_veslg(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_vesl(v1, v3, d2, b2, VRET_DW);}   // vector element type 'G'
+
+inline void Assembler::z_vesrav( VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t m4) {emit_48(VESRAV_ZOPC| vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)      | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vesravb(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vesrav(v1, v2, v3, VRET_BYTE); }  // vector element type 'B'
+inline void Assembler::z_vesravh(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vesrav(v1, v2, v3, VRET_HW); }    // vector element type 'H'
+inline void Assembler::z_vesravf(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vesrav(v1, v2, v3, VRET_FW); }    // vector element type 'F'
+inline void Assembler::z_vesravg(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vesrav(v1, v2, v3, VRET_DW); }    // vector element type 'G'
+inline void Assembler::z_vesra(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2,         int64_t m4) {emit_48(VESRA_ZOPC | vreg(v1,  8) | vreg(v3, 12) | rsmask_48(d2, b2) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vesrab( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_vesra(v1, v3, d2, b2, VRET_BYTE);}// vector element type 'B'
+inline void Assembler::z_vesrah( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_vesra(v1, v3, d2, b2, VRET_HW);}  // vector element type 'H'
+inline void Assembler::z_vesraf( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_vesra(v1, v3, d2, b2, VRET_FW);}  // vector element type 'F'
+inline void Assembler::z_vesrag( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_vesra(v1, v3, d2, b2, VRET_DW);}  // vector element type 'G'
+inline void Assembler::z_vesrlv( VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t m4) {emit_48(VESRLV_ZOPC| vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)      | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vesrlvb(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vesrlv(v1, v2, v3, VRET_BYTE); }  // vector element type 'B'
+inline void Assembler::z_vesrlvh(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vesrlv(v1, v2, v3, VRET_HW); }    // vector element type 'H'
+inline void Assembler::z_vesrlvf(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vesrlv(v1, v2, v3, VRET_FW); }    // vector element type 'F'
+inline void Assembler::z_vesrlvg(VectorRegister v1, VectorRegister v2, VectorRegister v3)             {z_vesrlv(v1, v2, v3, VRET_DW); }    // vector element type 'G'
+inline void Assembler::z_vesrl(  VectorRegister v1, VectorRegister v3, int64_t d2, Register b2,         int64_t m4) {emit_48(VESRL_ZOPC | vreg(v1,  8) | vreg(v3, 12) | rsmask_48(d2, b2) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); }
+inline void Assembler::z_vesrlb( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_vesrl(v1, v3, d2, b2, VRET_BYTE);}// vector element type 'B'
+inline void Assembler::z_vesrlh( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_vesrl(v1, v3, d2, b2, VRET_HW);}  // vector element type 'H'
+inline void Assembler::z_vesrlf( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_vesrl(v1, v3, d2, b2, VRET_FW);}  // vector element type 'F'
+inline void Assembler::z_vesrlg( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2)       {z_vesrl(v1, v3, d2, b2, VRET_DW);}  // vector element type 'G'
+
+inline void Assembler::z_vsl(    VectorRegister v1, VectorRegister v2, VectorRegister v3)               {emit_48(VSL_ZOPC   | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)); }
+inline void Assembler::z_vslb(   VectorRegister v1, VectorRegister v2, VectorRegister v3)               {emit_48(VSLB_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)); }
+inline void Assembler::z_vsldb(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4) {emit_48(VSLDB_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | uimm8(imm4, 24, 48)); }
+
+inline void Assembler::z_vsra(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {emit_48(VSRA_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)); }
+inline void Assembler::z_vsrab(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {emit_48(VSRAB_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)); }
+inline void Assembler::z_vsrl(   VectorRegister v1, VectorRegister v2, VectorRegister v3)             {emit_48(VSRL_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)); }
+inline void Assembler::z_vsrlb(  VectorRegister v1, VectorRegister v2, VectorRegister v3)             {emit_48(VSRLB_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)); }
+
+// Test under Mask
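+// Informal summary: VTM inspects the v1 bits selected by the v2 mask and sets
+// CC 0 (all zeros, or mask all zero), CC 1 (mixed), or CC 3 (all ones); see
+// the z/Architecture Principles of Operation for details.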
+inline void Assembler::z_vtm(    VectorRegister v1, VectorRegister v2)                                {emit_48(VTM_ZOPC   | vreg(v1,  8) | vreg(v2, 12)); }
+
+//---<  Vector String Instructions  >---
+inline void Assembler::z_vfae(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4, int64_t cc5) {emit_48(VFAE_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)      | vesc_mask(imm4, VRET_BYTE, VRET_FW, 32) | voprc_any(cc5, 24) ); }  // Find any element
+inline void Assembler::z_vfaeb(  VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t cc5) {z_vfae(v1, v2, v3, VRET_BYTE, cc5); }
+inline void Assembler::z_vfaeh(  VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t cc5) {z_vfae(v1, v2, v3, VRET_HW,   cc5); }
+inline void Assembler::z_vfaef(  VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t cc5) {z_vfae(v1, v2, v3, VRET_FW,   cc5); }
+inline void Assembler::z_vfee(   VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4, int64_t cc5) {emit_48(VFEE_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)      | vesc_mask(imm4, VRET_BYTE, VRET_FW, 32) | voprc_any(cc5, 24) ); }  // Find element equal
+inline void Assembler::z_vfeeb(  VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t cc5) {z_vfee(v1, v2, v3, VRET_BYTE, cc5); }
+inline void Assembler::z_vfeeh(  VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t cc5) {z_vfee(v1, v2, v3, VRET_HW,   cc5); }
+inline void Assembler::z_vfeef(  VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t cc5) {z_vfee(v1, v2, v3, VRET_FW,   cc5); }
+inline void Assembler::z_vfene(  VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t imm4, int64_t cc5) {emit_48(VFENE_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16)      | vesc_mask(imm4, VRET_BYTE, VRET_FW, 32) | voprc_any(cc5, 24) ); }  // Find element not equal
+inline void Assembler::z_vfeneb( VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t cc5) {z_vfene(v1, v2, v3, VRET_BYTE, cc5); }
+inline void Assembler::z_vfeneh( VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t cc5) {z_vfene(v1, v2, v3, VRET_HW,   cc5); }
+inline void Assembler::z_vfenef( VectorRegister v1, VectorRegister v2, VectorRegister v3,               int64_t cc5) {z_vfene(v1, v2, v3, VRET_FW,   cc5); }
+inline void Assembler::z_vstrc(  VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t imm5, int64_t cc6) {emit_48(VSTRC_ZOPC | vreg(v1,  8) | vreg(v2, 12) | vreg(v3, 16) | vreg(v4, 32)     | vesc_mask(imm5, VRET_BYTE, VRET_FW, 20) | voprc_any(cc6, 24) ); }  // String range compare
+inline void Assembler::z_vstrcb( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4,               int64_t cc6) {z_vstrc(v1, v2, v3, v4, VRET_BYTE, cc6); }
+inline void Assembler::z_vstrch( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4,               int64_t cc6) {z_vstrc(v1, v2, v3, v4, VRET_HW,   cc6); }
+inline void Assembler::z_vstrcf( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4,               int64_t cc6) {z_vstrc(v1, v2, v3, v4, VRET_FW,   cc6); }
+inline void Assembler::z_vistr(  VectorRegister v1, VectorRegister v2, int64_t imm3, int64_t cc5) {emit_48(VISTR_ZOPC  | vreg(v1,  8) | vreg(v2, 12) | vesc_mask(imm3, VRET_BYTE, VRET_FW, 32) | voprc_any(cc5, 24) ); }  // Isolate string
+inline void Assembler::z_vistrb( VectorRegister v1, VectorRegister v2,               int64_t cc5) {z_vistr(v1, v2, VRET_BYTE, cc5); }
+inline void Assembler::z_vistrh( VectorRegister v1, VectorRegister v2,               int64_t cc5) {z_vistr(v1, v2, VRET_HW,   cc5); }
+inline void Assembler::z_vistrf( VectorRegister v1, VectorRegister v2,               int64_t cc5) {z_vistr(v1, v2, VRET_FW,   cc5); }
+inline void Assembler::z_vistrbs(VectorRegister v1, VectorRegister v2)                            {z_vistr(v1, v2, VRET_BYTE, VOPRC_CCSET); }
+inline void Assembler::z_vistrhs(VectorRegister v1, VectorRegister v2)                            {z_vistr(v1, v2, VRET_HW,   VOPRC_CCSET); }
+inline void Assembler::z_vistrfs(VectorRegister v1, VectorRegister v2)                            {z_vistr(v1, v2, VRET_FW,   VOPRC_CCSET); }
+
+
 //-------------------------------
 // FLOAT INSTRUCTIONS
 //-------------------------------
--- a/src/hotspot/cpu/s390/globals_s390.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/s390/globals_s390.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2016 SAP SE. All rights reserved.
+ * Copyright (c) 2016, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2017 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -34,7 +34,7 @@
 // Sorted according to sparc.
 
 // z/Architecture remembers branch targets, so don't share vtables.
-define_pd_global(bool,  ShareVtableStubs,            false);
+define_pd_global(bool,  ShareVtableStubs,            true);
 define_pd_global(bool,  NeedsDeoptSuspend,           false); // Only register window machines need this.
 
 define_pd_global(bool,  ImplicitNullChecks,          true);  // Generate code for implicit null checks.
--- a/src/hotspot/cpu/s390/interp_masm_s390.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/s390/interp_masm_s390.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -914,7 +914,7 @@
   //
   // markOop displaced_header = obj->mark().set_unlocked();
   // monitor->lock()->set_displaced_header(displaced_header);
-  // if (Atomic::cmpxchg_ptr(/*ex=*/monitor, /*addr*/obj->mark_addr(), /*cmp*/displaced_header) == displaced_header) {
+  // if (Atomic::cmpxchg(/*ex=*/monitor, /*addr*/obj->mark_addr(), /*cmp*/displaced_header) == displaced_header) {
   //   // We stored the monitor address into the object's mark word.
   // } else if (THREAD->is_lock_owned((address)displaced_header))
   //   // Simple recursive case.
@@ -949,7 +949,7 @@
   z_stg(displaced_header, BasicObjectLock::lock_offset_in_bytes() +
                           BasicLock::displaced_header_offset_in_bytes(), monitor);
 
-  // if (Atomic::cmpxchg_ptr(/*ex=*/monitor, /*addr*/obj->mark_addr(), /*cmp*/displaced_header) == displaced_header) {
+  // if (Atomic::cmpxchg(/*ex=*/monitor, /*addr*/obj->mark_addr(), /*cmp*/displaced_header) == displaced_header) {
 
   // Store stack address of the BasicObjectLock (this is monitor) into object.
   add2reg(object_mark_addr, oopDesc::mark_offset_in_bytes(), object);
@@ -1021,7 +1021,7 @@
   // if ((displaced_header = monitor->displaced_header()) == NULL) {
   //   // Recursive unlock. Mark the monitor unlocked by setting the object field to NULL.
   //   monitor->set_obj(NULL);
-  // } else if (Atomic::cmpxchg_ptr(displaced_header, obj->mark_addr(), monitor) == monitor) {
+  // } else if (Atomic::cmpxchg(displaced_header, obj->mark_addr(), monitor) == monitor) {
   //   // We swapped the unlocked mark in displaced_header into the object's mark word.
   //   monitor->set_obj(NULL);
   // } else {
@@ -1062,7 +1062,7 @@
                                                       BasicLock::displaced_header_offset_in_bytes()));
   z_bre(done); // displaced_header == 0 -> goto done
 
-  // } else if (Atomic::cmpxchg_ptr(displaced_header, obj->mark_addr(), monitor) == monitor) {
+  // } else if (Atomic::cmpxchg(displaced_header, obj->mark_addr(), monitor) == monitor) {
   //   // We swapped the unlocked mark in displaced_header into the object's mark word.
   //   monitor->set_obj(NULL);
 
--- a/src/hotspot/cpu/s390/jniTypes_s390.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/s390/jniTypes_s390.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2017, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2016 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -29,9 +29,9 @@
 // This file holds platform-dependent routines used to write primitive
 // jni types to the array of arguments passed into JavaCalls::call.
 
+#include "jni.h"
 #include "memory/allocation.hpp"
 #include "oops/oop.hpp"
-#include "prims/jni.h"
 
 class JNITypes : AllStatic {
   // These functions write a java primitive type (in native format) to
--- a/src/hotspot/cpu/s390/macroAssembler_s390.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/s390/macroAssembler_s390.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -4671,6 +4671,7 @@
   mem2reg_opt(mirror, Address(mirror, ConstMethod::constants_offset()));
   mem2reg_opt(mirror, Address(mirror, ConstantPool::pool_holder_offset_in_bytes()));
   mem2reg_opt(mirror, Address(mirror, Klass::java_mirror_offset()));
+  resolve_oop_handle(mirror);
 }
 
 //---------------------------------------------------------------
--- a/src/hotspot/cpu/s390/register_definitions_s390.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/s390/register_definitions_s390.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2016 SAP SE. All rights reserved.
+ * Copyright (c) 2016, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2017 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -35,3 +35,5 @@
 REGISTER_DEFINITION(Register, noreg);
 
 REGISTER_DEFINITION(FloatRegister, fnoreg);
+
+REGISTER_DEFINITION(VectorRegister, vnoreg);
--- a/src/hotspot/cpu/s390/register_s390.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/s390/register_s390.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2016 SAP SE. All rights reserved.
+ * Copyright (c) 2016, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2017 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -46,3 +46,13 @@
   };
   return is_valid() ? names[encoding()] : "fnoreg";
 }
+
+const char* VectorRegisterImpl::name() const {
+  const char* names[number_of_registers] = {
+    "Z_V0",  "Z_V1",  "Z_V2",  "Z_V3",  "Z_V4",  "Z_V5",  "Z_V6",  "Z_V7",
+    "Z_V8",  "Z_V9",  "Z_V10", "Z_V11", "Z_V12", "Z_V13", "Z_V14", "Z_V15",
+    "Z_V16", "Z_V17", "Z_V18", "Z_V19", "Z_V20", "Z_V21", "Z_V22", "Z_V23",
+    "Z_V24", "Z_V25", "Z_V26", "Z_V27", "Z_V28", "Z_V29", "Z_V30", "Z_V31"
+  };
+  return is_valid() ? names[encoding()] : "vnoreg";
+}
--- a/src/hotspot/cpu/s390/register_s390.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/s390/register_s390.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2016 SAP SE. All rights reserved.
+ * Copyright (c) 2016, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2017 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -34,11 +34,6 @@
 
 typedef VMRegImpl* VMReg;
 
-// Use Register as shortcut.
-class RegisterImpl;
-typedef RegisterImpl* Register;
-
-// The implementation of integer registers for z/Architecture.
 
 // z/Architecture registers, see "LINUX for zSeries ELF ABI Supplement", IBM March 2001
 //
@@ -57,6 +52,17 @@
 //   f1,f3,f5,f7 General purpose (volatile)
 //   f8-f15      General purpose (nonvolatile)
 
+
+//===========================
+//===  Integer Registers  ===
+//===========================
+
+// Use Register as shortcut.
+class RegisterImpl;
+typedef RegisterImpl* Register;
+
+// The implementation of integer registers for z/Architecture.
+
 inline Register as_Register(int encoding) {
   return (Register)(long)encoding;
 }
@@ -110,6 +116,11 @@
 CONSTANT_REGISTER_DECLARATION(Register, Z_R14, (14));
 CONSTANT_REGISTER_DECLARATION(Register, Z_R15, (15));
 
+
+//=============================
+//===  Condition Registers  ===
+//=============================
+
 // Use ConditionRegister as shortcut
 class ConditionRegisterImpl;
 typedef ConditionRegisterImpl* ConditionRegister;
@@ -159,7 +170,7 @@
 // dangers of defines.
 // If a particular file has a problem with these defines then it's possible
 // to turn them off in that file by defining
-// DONT_USE_REGISTER_DEFINES. Register_definition_s390.cpp does that
+// DONT_USE_REGISTER_DEFINES. Register_definitions_s390.cpp does that
 // so that it's able to provide real definitions of these registers
 // for use in debuggers and such.
 
@@ -186,6 +197,11 @@
 #define Z_CR ((ConditionRegister)(Z_CR_ConditionRegisterEnumValue))
 #endif // DONT_USE_REGISTER_DEFINES
 
+
+//=========================
+//===  Float Registers  ===
+//=========================
+
 // Use FloatRegister as shortcut
 class FloatRegisterImpl;
 typedef FloatRegisterImpl* FloatRegister;
@@ -263,22 +279,6 @@
 #define Z_F15 ((FloatRegister)(  Z_F15_FloatRegisterEnumValue))
 #endif // DONT_USE_REGISTER_DEFINES
 
-// Need to know the total number of registers of all sorts for SharedInfo.
-// Define a class that exports it.
-
-class ConcreteRegisterImpl : public AbstractRegisterImpl {
- public:
-  enum {
-    number_of_registers =
-      (RegisterImpl::number_of_registers +
-      FloatRegisterImpl::number_of_registers)
-      * 2 // register halves
-      + 1 // condition code register
-  };
-  static const int max_gpr;
-  static const int max_fpr;
-};
-
 // Single, Double and Quad fp reg classes. These exist to map the ADLC
 // encoding for a floating point register, to the FloatRegister number
 // desired by the macroassembler. A FloatRegister is a number between
@@ -329,6 +329,161 @@
 };
 
 
+//==========================
+//===  Vector Registers  ===
+//==========================
+
+// Use VectorRegister as shortcut
+class VectorRegisterImpl;
+typedef VectorRegisterImpl* VectorRegister;
+
+// The implementation of vector registers for z/Architecture.
+
+inline VectorRegister as_VectorRegister(int encoding) {
+  return (VectorRegister)(long)encoding;
+}
+
+class VectorRegisterImpl: public AbstractRegisterImpl {
+ public:
+  enum {
+    number_of_registers     = 32,
+    number_of_arg_registers = 0
+  };
+
+  // construction
+  inline friend VectorRegister as_VectorRegister(int encoding);
+
+  inline VMReg as_VMReg();
+
+  // accessors
+  int encoding() const                                {
+     assert(is_valid(), "invalid register"); return value();
+  }
+
+  bool is_valid() const           { return  0 <= value() && value() < number_of_registers; }
+  bool is_volatile() const        { return true; }
+  bool is_nonvolatile() const     { return false; }
+
+  // Register fields in z/Architecture instructions are 4 bits wide, restricting the
+  // addressable register set size to 16.
+  // The vector register set size is 32, requiring an extension, by one bit, of the
+  // register encoding. This is accomplished by the introduction of a RXB field in the
+  // instruction. RXB = Register eXtension Bits.
+  // The RXB field contains the MSBs (most significant bits) of the vector register numbers
+  // used for this instruction. Assignment of an MSB in RXB is by bit position of the
+  // register field in the instruction.
+  // Example:
+  //   The register field starting at bit position 12 in the instruction is assigned RXB bit 0b0100.
+  int64_t RXB_mask(int pos) {
+    if (encoding() >= number_of_registers/2) {
+      switch (pos) {
+        case 8:   return ((int64_t)0b1000) << 8; // actual bit pos: 36
+        case 12:  return ((int64_t)0b0100) << 8; // actual bit pos: 37
+        case 16:  return ((int64_t)0b0010) << 8; // actual bit pos: 38
+        case 32:  return ((int64_t)0b0001) << 8; // actual bit pos: 39
+        default:
+          ShouldNotReachHere();
+      }
+    }
+    return 0;
+  }
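+  // Worked example (illustrative): Z_V17 has encoding 17 = 0b10001. Placed in
+  // the register field at bit position 12, its low four bits (0b0001) go into
+  // the 4-bit register field, while RXB_mask(12) supplies the fifth bit as
+  // 0b0100 << 8. For V0..V15, the RXB contribution is always 0.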
+
+  const char* name() const;
+
+  VectorRegister successor() const { return as_VectorRegister(encoding() + 1); }
+};
+
+// The Vector registers of z/Architecture.
+
+CONSTANT_REGISTER_DECLARATION(VectorRegister, vnoreg, (-1));
+
+CONSTANT_REGISTER_DECLARATION(VectorRegister,  Z_V0,  (0));
+CONSTANT_REGISTER_DECLARATION(VectorRegister,  Z_V1,  (1));
+CONSTANT_REGISTER_DECLARATION(VectorRegister,  Z_V2,  (2));
+CONSTANT_REGISTER_DECLARATION(VectorRegister,  Z_V3,  (3));
+CONSTANT_REGISTER_DECLARATION(VectorRegister,  Z_V4,  (4));
+CONSTANT_REGISTER_DECLARATION(VectorRegister,  Z_V5,  (5));
+CONSTANT_REGISTER_DECLARATION(VectorRegister,  Z_V6,  (6));
+CONSTANT_REGISTER_DECLARATION(VectorRegister,  Z_V7,  (7));
+CONSTANT_REGISTER_DECLARATION(VectorRegister,  Z_V8,  (8));
+CONSTANT_REGISTER_DECLARATION(VectorRegister,  Z_V9,  (9));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V10, (10));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V11, (11));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V12, (12));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V13, (13));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V14, (14));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V15, (15));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V16, (16));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V17, (17));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V18, (18));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V19, (19));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V20, (20));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V21, (21));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V22, (22));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V23, (23));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V24, (24));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V25, (25));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V26, (26));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V27, (27));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V28, (28));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V29, (29));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V30, (30));
+CONSTANT_REGISTER_DECLARATION(VectorRegister, Z_V31, (31));
+
+#ifndef DONT_USE_REGISTER_DEFINES
+#define vnoreg ((VectorRegister)(vnoreg_VectorRegisterEnumValue))
+#define Z_V0  ((VectorRegister)(   Z_V0_VectorRegisterEnumValue))
+#define Z_V1  ((VectorRegister)(   Z_V1_VectorRegisterEnumValue))
+#define Z_V2  ((VectorRegister)(   Z_V2_VectorRegisterEnumValue))
+#define Z_V3  ((VectorRegister)(   Z_V3_VectorRegisterEnumValue))
+#define Z_V4  ((VectorRegister)(   Z_V4_VectorRegisterEnumValue))
+#define Z_V5  ((VectorRegister)(   Z_V5_VectorRegisterEnumValue))
+#define Z_V6  ((VectorRegister)(   Z_V6_VectorRegisterEnumValue))
+#define Z_V7  ((VectorRegister)(   Z_V7_VectorRegisterEnumValue))
+#define Z_V8  ((VectorRegister)(   Z_V8_VectorRegisterEnumValue))
+#define Z_V9  ((VectorRegister)(   Z_V9_VectorRegisterEnumValue))
+#define Z_V10 ((VectorRegister)(  Z_V10_VectorRegisterEnumValue))
+#define Z_V11 ((VectorRegister)(  Z_V11_VectorRegisterEnumValue))
+#define Z_V12 ((VectorRegister)(  Z_V12_VectorRegisterEnumValue))
+#define Z_V13 ((VectorRegister)(  Z_V13_VectorRegisterEnumValue))
+#define Z_V14 ((VectorRegister)(  Z_V14_VectorRegisterEnumValue))
+#define Z_V15 ((VectorRegister)(  Z_V15_VectorRegisterEnumValue))
+#define Z_V16 ((VectorRegister)(  Z_V16_VectorRegisterEnumValue))
+#define Z_V17 ((VectorRegister)(  Z_V17_VectorRegisterEnumValue))
+#define Z_V18 ((VectorRegister)(  Z_V18_VectorRegisterEnumValue))
+#define Z_V19 ((VectorRegister)(  Z_V19_VectorRegisterEnumValue))
+#define Z_V20 ((VectorRegister)(  Z_V20_VectorRegisterEnumValue))
+#define Z_V21 ((VectorRegister)(  Z_V21_VectorRegisterEnumValue))
+#define Z_V22 ((VectorRegister)(  Z_V22_VectorRegisterEnumValue))
+#define Z_V23 ((VectorRegister)(  Z_V23_VectorRegisterEnumValue))
+#define Z_V24 ((VectorRegister)(  Z_V24_VectorRegisterEnumValue))
+#define Z_V25 ((VectorRegister)(  Z_V25_VectorRegisterEnumValue))
+#define Z_V26 ((VectorRegister)(  Z_V26_VectorRegisterEnumValue))
+#define Z_V27 ((VectorRegister)(  Z_V27_VectorRegisterEnumValue))
+#define Z_V28 ((VectorRegister)(  Z_V28_VectorRegisterEnumValue))
+#define Z_V29 ((VectorRegister)(  Z_V29_VectorRegisterEnumValue))
+#define Z_V30 ((VectorRegister)(  Z_V30_VectorRegisterEnumValue))
+#define Z_V31 ((VectorRegister)(  Z_V31_VectorRegisterEnumValue))
+#endif // DONT_USE_REGISTER_DEFINES
+
+
+// Need to know the total number of registers of all sorts for SharedInfo.
+// Define a class that exports it.
+
+class ConcreteRegisterImpl : public AbstractRegisterImpl {
+ public:
+  enum {
+    number_of_registers =
+      (RegisterImpl::number_of_registers +
+      FloatRegisterImpl::number_of_registers)
+      * 2 // register halves
+      + 1 // condition code register
+  };
+  static const int max_gpr;
+  static const int max_fpr;
+};
+
+
 // Common register declarations used in assembler code.
 REGISTER_DECLARATION(Register,      Z_EXC_OOP, Z_R2);
 REGISTER_DECLARATION(Register,      Z_EXC_PC,  Z_R3);
--- a/src/hotspot/cpu/s390/s390.ad	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/s390/s390.ad	Sat Oct 21 01:23:52 2017 +0200
@@ -3149,7 +3149,7 @@
   interface(REG_INTER);
 %}
 
-// Revenregi and roddRegI constitute and even-odd-pair.
+// revenRegI and roddRegI constitute an even-odd pair.
 operand revenRegI() %{
   constraint(ALLOC_IN_RC(z_rarg3_int_reg));
   match(iRegI);
@@ -3157,7 +3157,7 @@
   interface(REG_INTER);
 %}
 
-// Revenregi and roddRegI constitute and even-odd-pair.
+// revenRegI and roddRegI constitute an even-odd pair.
 operand roddRegI() %{
   constraint(ALLOC_IN_RC(z_rarg4_int_reg));
   match(iRegI);
@@ -3283,7 +3283,7 @@
   interface(REG_INTER);
 %}
 
-// Revenregp and roddRegP constitute and even-odd-pair.
+// revenRegP and roddRegP constitute an even-odd pair.
 operand revenRegP() %{
   constraint(ALLOC_IN_RC(z_rarg3_ptr_reg));
   match(iRegP);
@@ -3291,7 +3291,7 @@
   interface(REG_INTER);
 %}
 
-// Revenregl and roddRegL constitute and even-odd-pair.
+// revenRegP and roddRegP constitute an even-odd pair.
 operand roddRegP() %{
   constraint(ALLOC_IN_RC(z_rarg4_ptr_reg));
   match(iRegP);
@@ -3380,7 +3380,7 @@
   interface(REG_INTER);
 %}
 
-// Revenregl and roddRegL constitute and even-odd-pair.
+// revenRegL and roddRegL constitute an even-odd pair.
 operand revenRegL() %{
   constraint(ALLOC_IN_RC(z_rarg3_long_reg));
   match(iRegL);
@@ -3388,7 +3388,7 @@
   interface(REG_INTER);
 %}
 
-// Revenregl and roddRegL constitute and even-odd-pair.
+// revenRegL and roddRegL constitute an even-odd pair.
 operand roddRegL() %{
   constraint(ALLOC_IN_RC(z_rarg4_long_reg));
   match(iRegL);
@@ -6443,6 +6443,32 @@
   ins_pipe(pipe_class_dummy);
 %}
 
+instruct mulHiL_reg_reg(revenRegL Rdst, roddRegL Rsrc1, iRegL Rsrc2, iRegL Rtmp1, flagsReg cr) %{
+  match(Set Rdst (MulHiL Rsrc1 Rsrc2));
+  effect(TEMP_DEF Rdst, USE_KILL Rsrc1, TEMP Rtmp1, KILL cr);
+  ins_cost(7*DEFAULT_COST);
+  // TODO: s390 port size(VARIABLE_SIZE);
+  format %{ "MulHiL  $Rdst, $Rsrc1, $Rsrc2\t # Multiply High Long" %}
+  ins_encode %{
+    Register dst  = $Rdst$$Register;
+    Register src1 = $Rsrc1$$Register;
+    Register src2 = $Rsrc2$$Register;
+    Register tmp1 = $Rtmp1$$Register;
+    Register tmp2 = $Rdst$$Register;
+    // z/Architecture has only unsigned multiply (64 * 64 -> 128).
+    // Implementing mulhs(a,b) = mulhu(a,b) - (a & (b>>63)) - (b & (a>>63)).
+    __ z_srag(tmp2, src1, 63);  // a>>63
+    __ z_srag(tmp1, src2, 63);  // b>>63
+    __ z_ngr(tmp2, src2);       // b & (a>>63)
+    __ z_ngr(tmp1, src1);       // a & (b>>63)
+    __ z_agr(tmp1, tmp2);       // ((a & (b>>63)) + (b & (a>>63)))
+    __ z_mlgr(dst, src2);       // tricky: 128-bit product is written to even/odd pair (dst,src1),
+                                //         multiplicand is taken from oddReg (src1), multiplier in src2.
+    __ z_sgr(dst, tmp1);
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
 //  DIV
 
 // Integer DIVMOD with Register, both quotient and mod results
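
The mulHiL rule above relies on a standard identity to recover the signed high half of a 64x64 product from z/Architecture's unsigned-only MLGR. A minimal C++ sketch of that identity (reference model only, not HotSpot code; the 128-bit intermediate uses the GCC/Clang `unsigned __int128` extension):

    #include <cassert>
    #include <cstdint>

    // Unsigned high 64 bits of a 64x64 multiply (what MLGR delivers).
    static uint64_t mulhu(uint64_t a, uint64_t b) {
      return (uint64_t)(((unsigned __int128)a * b) >> 64);
    }

    // Signed high 64 bits via the identity used by the encoding above:
    //   mulhs(a,b) = mulhu(a,b) - (a & (b>>63)) - (b & (a>>63))
    // where x>>63 is an arithmetic shift (all ones iff x < 0; this is what
    // mainstream compilers produce for signed >>, matching z_srag).
    static int64_t mulhs(int64_t a, int64_t b) {
      uint64_t hu = mulhu((uint64_t)a, (uint64_t)b);
      return (int64_t)(hu - ((uint64_t)a & (uint64_t)(b >> 63))
                          - ((uint64_t)b & (uint64_t)(a >> 63)));
    }

    int main() {
      assert(mulhs(-3, 5) == -1);         // full product -15; high word all ones
      assert(mulhs(INT64_MIN, 2) == -1);  // full product -2^64
      return 0;
    }
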
--- a/src/hotspot/cpu/s390/templateTable_s390.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/s390/templateTable_s390.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -2382,6 +2382,7 @@
   if (is_static) {
     __ mem2reg_opt(obj, Address(cache, index, cp_base_offset + ConstantPoolCacheEntry::f1_offset()));
     __ mem2reg_opt(obj, Address(obj, Klass::java_mirror_offset()));
+    __ resolve_oop_handle(obj);
   }
 }
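
The added `resolve_oop_handle(obj)` reflects that the class mirror is now reached through an OopHandle, i.e. one extra indirection after loading the field. A hedged sketch of the access shape (types simplified for illustration, not the VM's actual declarations):

    struct oopDesc;              // opaque heap object
    typedef oopDesc* oop;
    typedef oop* OopHandle;      // simplified: a pointer to an oop slot

    // After loading the mirror field the register holds an OopHandle, so one
    // more load yields the mirror oop itself -- which is all the generated
    // resolve_oop_handle amounts to here.
    static oop resolve_oop_handle(OopHandle h) {
      return *h;
    }
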
 
--- a/src/hotspot/cpu/s390/vm_version_s390.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/s390/vm_version_s390.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -224,7 +224,7 @@
   }
 
   // z/Architecture supports 8-byte compare-exchange operations
-  // (see Atomic::cmpxchg and StubGenerator::generate_atomic_cmpxchg_ptr)
+  // (see Atomic::cmpxchg)
   // and 'atomic long memory ops' (see Unsafe_GetLongVolatile).
   _supports_cx8 = true;
 
@@ -706,12 +706,13 @@
   Label    getCPUFEATURES;                   // fcode = -1 (cache)
   Label    getCIPHERFEATURES;                // fcode = -2 (cipher)
   Label    getMSGDIGESTFEATURES;             // fcode = -3 (SHA)
+  Label    getVECTORFEATURES;                // fcode = -4 (OS support for vector instructions)
   Label    checkLongDispFast;
   Label    noLongDisp;
   Label    posDisp, negDisp;
   Label    errRTN;
   a->z_ltgfr(Z_R0, Z_ARG2);                  // Buf len to r0 and test.
-  a->z_brl(getFEATURES);                     // negative -> Get machine features.
+  a->z_brl(getFEATURES);                     // negative -> Get machine features not covered by facility list.
   a->z_brz(checkLongDispFast);               // zero -> Check for high-speed Long Displacement Facility.
   a->z_aghi(Z_R0, -1);
   a->z_stfle(0, Z_ARG1);
@@ -736,6 +737,8 @@
   a->z_bre(getCIPHERFEATURES);
   a->z_cghi(Z_R0, -3);                       // -3: Extract detailed crypto capabilities (msg digest instructions).
   a->z_bre(getMSGDIGESTFEATURES);
+  a->z_cghi(Z_R0, -4);                       // -4: Verify vector instruction availability (OS support).
+  a->z_bre(getVECTORFEATURES);
 
   a->z_xgr(Z_RET, Z_RET);                    // Not a valid function code.
   a->z_br(Z_R14);                            // Return "operation aborted".
@@ -766,6 +769,11 @@
   a->z_ecag(Z_RET,Z_R0,0,Z_ARG3);            // Extract information as requested by Z_ARG1 contents.
   a->z_br(Z_R14);
 
+  // Use a vector instruction to verify OS support. Will fail with SIGFPE if OS support is missing.
+  a->bind(getVECTORFEATURES);
+  a->z_vtm(Z_V0,Z_V0);                       // non-destructive vector instruction. Will cause SIGFPE if not supported.
+  a->z_br(Z_R14);
+
   // Check the performance of the Long Displacement Facility, i.e. find out if we are running on z900 or newer.
   a->bind(checkLongDispFast);
   a->z_llill(Z_R0, 0xffff);                  // preset #iterations
@@ -962,6 +970,19 @@
     _nfeatures = 0;
   }
 
+  if (has_VectorFacility()) {
+    // Verify that feature can actually be used. OS support required.
+    call_getFeatures(buffer, -4, 0);
+    if (printVerbose) {
+      ttyLocker ttyl;
+      if (has_VectorFacility()) {
+        tty->print_cr("  Vector Facility has been verified to be supported by OS");
+      } else {
+        tty->print_cr("  Vector Facility has been disabled - not supported by OS");
+      }
+    }
+  }
+
   // Extract Crypto Facility details.
   if (has_Crypto()) {
     // Get cipher features.
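
The new fcode -4 probe works by executing VTM once and relying on the VM's signal handling to flag the facility off (see `reset_has_VectorFacility()` in vm_version_s390.hpp below) when the kernel delivers SIGFPE. A standalone sketch of that probe-under-signal-handler pattern (POSIX C++; the HotSpot implementation instead routes through the JVM's own signal handler):

    #include <setjmp.h>
    #include <signal.h>

    static sigjmp_buf probe_env;

    static void probe_handler(int) {
      siglongjmp(probe_env, 1);        // unwind out of the faulting instruction
    }

    // Returns true iff body() ran to completion without raising 'sig'.
    template <typename F>
    static bool runs_without_signal(int sig, F body) {
      struct sigaction sa = {}, old;
      sa.sa_handler = probe_handler;
      sigaction(sig, &sa, &old);
      bool ok = (sigsetjmp(probe_env, 1) == 0);
      if (ok) body();                  // e.g. inline asm issuing VTM %v0,%v0
      sigaction(sig, &old, nullptr);   // restore the previous handler
      return ok;
    }
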
--- a/src/hotspot/cpu/s390/vm_version_s390.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/s390/vm_version_s390.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -473,6 +473,8 @@
   static void set_has_CryptoExt5()                { _features[0] |= CryptoExtension5Mask; }
   static void set_has_VectorFacility()            { _features[2] |= VectorFacilityMask; }
 
+  static void reset_has_VectorFacility()          { _features[2] &= ~VectorFacilityMask; }
+
   // Assembler testing.
   static void allow_all();
   static void revert();
--- a/src/hotspot/cpu/sparc/assembler_sparc.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/sparc/assembler_sparc.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -122,6 +122,7 @@
     fpop1_op3    = 0x34,
     fpop2_op3    = 0x35,
     impdep1_op3  = 0x36,
+    addx_op3     = 0x36,
     aes3_op3     = 0x36,
     sha_op3      = 0x36,
     bmask_op3    = 0x36,
@@ -133,6 +134,8 @@
     fzero_op3    = 0x36,
     fsrc_op3     = 0x36,
     fnot_op3     = 0x36,
+    mpmul_op3    = 0x36,
+    umulx_op3    = 0x36,
     xmulx_op3    = 0x36,
     crc32c_op3   = 0x36,
     impdep2_op3  = 0x37,
@@ -195,6 +198,9 @@
     fnegs_opf          = 0x05,
     fnegd_opf          = 0x06,
 
+    addxc_opf          = 0x11,
+    addxccc_opf        = 0x13,
+    umulxhi_opf        = 0x16,
     alignaddr_opf      = 0x18,
     bmask_opf          = 0x19,
 
@@ -240,7 +246,8 @@
     sha256_opf         = 0x142,
     sha512_opf         = 0x143,
 
-    crc32c_opf         = 0x147
+    crc32c_opf         = 0x147,
+    mpmul_opf          = 0x148
   };
 
   enum op5s {
@@ -380,7 +387,7 @@
     assert_signed_range(x, nbits + 2);
   }
 
-  static void assert_unsigned_const(int x, int nbits) {
+  static void assert_unsigned_range(int x, int nbits) {
     assert(juint(x) < juint(1 << nbits), "unsigned constant out of range");
   }
 
@@ -534,6 +541,12 @@
     return x & ((1 << nbits) - 1);
   }
 
+  // unsigned immediate, in low bits, at most nbits long.
+  static int uimm(int x, int nbits) {
+    assert_unsigned_range(x, nbits);
+    return x & ((1 << nbits) - 1);
+  }
+
   // compute inverse of wdisp16
   static intptr_t inv_wdisp16(int x, intptr_t pos) {
     int lo = x & ((1 << 14) - 1);
@@ -631,6 +644,9 @@
   // FMAf instructions supported only on certain processors
   static void fmaf_only() { assert(VM_Version::has_fmaf(), "This instruction only works on SPARC with FMAf"); }
 
+  // MPMUL instruction supported only on certain processors
+  static void mpmul_only() { assert(VM_Version::has_mpmul(), "This instruction only works on SPARC with MPMUL"); }
+
   // instruction only in VIS1
   static void vis1_only() { assert(VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); }
 
@@ -772,11 +788,12 @@
     AbstractAssembler::flush();
   }
 
-  inline void emit_int32(int);  // shadows AbstractAssembler::emit_int32
-  inline void emit_data(int);
-  inline void emit_data(int, RelocationHolder const &rspec);
-  inline void emit_data(int, relocInfo::relocType rtype);
-  // helper for above functions
+  inline void emit_int32(int32_t);  // shadows AbstractAssembler::emit_int32
+  inline void emit_data(int32_t);
+  inline void emit_data(int32_t, RelocationHolder const&);
+  inline void emit_data(int32_t, relocInfo::relocType rtype);
+
+  // Helper for the above functions.
   inline void check_delay();
 
 
@@ -929,6 +946,10 @@
   // fmaf instructions.
 
   inline void fmadd(FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d);
+  inline void fmsub(FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d);
+
+  inline void fnmadd(FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d);
+  inline void fnmsub(FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d);
 
   // pp 165
 
@@ -960,6 +981,8 @@
   inline void ldf(FloatRegisterImpl::Width w, Register s1, int simm13a, FloatRegister d,
                   RelocationHolder const &rspec = RelocationHolder());
 
+  inline void ldd(Register s1, Register s2, FloatRegister d);
+  inline void ldd(Register s1, int simm13a, FloatRegister d);
 
   inline void ldfsr(Register s1, Register s2);
   inline void ldfsr(Register s1, int simm13a);
@@ -987,8 +1010,6 @@
   inline void lduw(Register s1, int simm13a, Register d);
   inline void ldx(Register s1, Register s2, Register d);
   inline void ldx(Register s1, int simm13a, Register d);
-  inline void ldd(Register s1, Register s2, Register d);
-  inline void ldd(Register s1, int simm13a, Register d);
 
   // pp 177
 
@@ -1157,6 +1178,9 @@
   inline void stf(FloatRegisterImpl::Width w, FloatRegister d, Register s1, Register s2);
   inline void stf(FloatRegisterImpl::Width w, FloatRegister d, Register s1, int simm13a);
 
+  inline void std(FloatRegister d, Register s1, Register s2);
+  inline void std(FloatRegister d, Register s1, int simm13a);
+
   inline void stfsr(Register s1, Register s2);
   inline void stfsr(Register s1, int simm13a);
   inline void stxfsr(Register s1, Register s2);
@@ -1177,8 +1201,6 @@
   inline void stw(Register d, Register s1, int simm13a);
   inline void stx(Register d, Register s1, Register s2);
   inline void stx(Register d, Register s1, int simm13a);
-  inline void std(Register d, Register s1, Register s2);
-  inline void std(Register d, Register s1, int simm13a);
 
   // pp 177
 
@@ -1267,6 +1289,9 @@
 
   // VIS3 instructions
 
+  inline void addxc(Register s1, Register s2, Register d);
+  inline void addxccc(Register s1, Register s2, Register d);
+
   inline void movstosw(FloatRegister s, Register d);
   inline void movstouw(FloatRegister s, Register d);
   inline void movdtox(FloatRegister s, Register d);
@@ -1276,6 +1301,7 @@
 
   inline void xmulx(Register s1, Register s2, Register d);
   inline void xmulxhi(Register s1, Register s2, Register d);
+  inline void umulxhi(Register s1, Register s2, Register d);
 
   // Crypto SHA instructions
 
@@ -1287,6 +1313,10 @@
 
   inline void crc32c(FloatRegister s1, FloatRegister s2, FloatRegister d);
 
+  // MPMUL instruction
+
+  inline void mpmul(int uimm5);
+
   // Creation
   Assembler(CodeBuffer* code) : AbstractAssembler(code) {
 #ifdef VALIDATE_PIPELINE
--- a/src/hotspot/cpu/sparc/assembler_sparc.inline.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/sparc/assembler_sparc.inline.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -59,7 +59,7 @@
 #endif
 }
 
-inline void Assembler::emit_int32(int x) {
+inline void Assembler::emit_int32(int32_t x) {
   check_delay();
 #ifdef VALIDATE_PIPELINE
   _hazard_state = NoHazard;
@@ -67,16 +67,16 @@
   AbstractAssembler::emit_int32(x);
 }
 
-inline void Assembler::emit_data(int x) {
+inline void Assembler::emit_data(int32_t x) {
   emit_int32(x);
 }
 
-inline void Assembler::emit_data(int x, relocInfo::relocType rtype) {
+inline void Assembler::emit_data(int32_t x, relocInfo::relocType rtype) {
   relocate(rtype);
   emit_int32(x);
 }
 
-inline void Assembler::emit_data(int x, RelocationHolder const &rspec) {
+inline void Assembler::emit_data(int32_t x, RelocationHolder const &rspec) {
   relocate(rspec);
   emit_int32(x);
 }
@@ -359,6 +359,19 @@
   fmaf_only();
   emit_int32(op(arith_op) | fd(d, w) | op3(stpartialf_op3) | fs1(s1, w) | fs3(s3, w) | op5(w) | fs2(s2, w));
 }
+inline void Assembler::fmsub(FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d) {
+  fmaf_only();
+  emit_int32(op(arith_op) | fd(d, w) | op3(stpartialf_op3) | fs1(s1, w) | fs3(s3, w) | op5(0x4 + w) | fs2(s2, w));
+}
+
+inline void Assembler::fnmadd(FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d) {
+  fmaf_only();
+  emit_int32(op(arith_op) | fd(d, w) | op3(stpartialf_op3) | fs1(s1, w) | fs3(s3, w) | op5(0xc + w) | fs2(s2, w));
+}
+inline void Assembler::fnmsub(FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d) {
+  fmaf_only();
+  emit_int32(op(arith_op) | fd(d, w) | op3(stpartialf_op3) | fs1(s1, w) | fs3(s3, w) | op5(0x8 + w) | fs2(s2, w));
+}
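+// Note: in the FMAf encodings above, op5<3:2> selects the variant
+// (00 = fmadd, 01 = fmsub, 10 = fnmsub, 11 = fnmadd) and op5<1:0> carries
+// the operand width w, hence the 0x4, 0x8 and 0xc offsets.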
 
 inline void Assembler::flush(Register s1, Register s2) {
   emit_int32(op(arith_op) | op3(flush_op3) | rs1(s1) | rs2(s2));
@@ -402,6 +415,15 @@
   emit_data(op(ldst_op) | fd(d, w) | alt_op3(ldf_op3, w) | rs1(s1) | immed(true) | simm(simm13a, 13), rspec);
 }
 
+inline void Assembler::ldd(Register s1, Register s2, FloatRegister d) {
+  assert(d->is_even(), "not even");
+  ldf(FloatRegisterImpl::D, s1, s2, d);
+}
+inline void Assembler::ldd(Register s1, int simm13a, FloatRegister d) {
+  assert(d->is_even(), "not even");
+  ldf(FloatRegisterImpl::D, s1, simm13a, d);
+}
+
 inline void Assembler::ldxfsr(Register s1, Register s2) {
   emit_int32(op(ldst_op) | rd(G1) | op3(ldfsr_op3) | rs1(s1) | rs2(s2));
 }
@@ -460,16 +482,6 @@
 inline void Assembler::ldx(Register s1, int simm13a, Register d) {
   emit_data(op(ldst_op) | rd(d) | op3(ldx_op3) | rs1(s1) | immed(true) | simm(simm13a, 13));
 }
-inline void Assembler::ldd(Register s1, Register s2, Register d) {
-  v9_dep();
-  assert(d->is_even(), "not even");
-  emit_int32(op(ldst_op) | rd(d) | op3(ldd_op3) | rs1(s1) | rs2(s2));
-}
-inline void Assembler::ldd(Register s1, int simm13a, Register d) {
-  v9_dep();
-  assert(d->is_even(), "not even");
-  emit_data(op(ldst_op) | rd(d) | op3(ldd_op3) | rs1(s1) | immed(true) | simm(simm13a, 13));
-}
 
 inline void Assembler::ldsba(Register s1, Register s2, int ia, Register d) {
   emit_int32(op(ldst_op) | rd(d) | op3(ldsb_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2));
@@ -806,6 +818,15 @@
   emit_data(op(ldst_op) | fd(d, w) | alt_op3(stf_op3, w) | rs1(s1) | immed(true) | simm(simm13a, 13));
 }
 
+inline void Assembler::std(FloatRegister d, Register s1, Register s2) {
+  assert(d->is_even(), "not even");
+  stf(FloatRegisterImpl::D, d, s1, s2);
+}
+inline void Assembler::std(FloatRegister d, Register s1, int simm13a) {
+  assert(d->is_even(), "not even");
+  stf(FloatRegisterImpl::D, d, s1, simm13a);
+}
+
 inline void Assembler::stxfsr(Register s1, Register s2) {
   emit_int32(op(ldst_op) | rd(G1) | op3(stfsr_op3) | rs1(s1) | rs2(s2));
 }
@@ -848,16 +869,6 @@
 inline void Assembler::stx(Register d, Register s1, int simm13a) {
   emit_data(op(ldst_op) | rd(d) | op3(stx_op3) | rs1(s1) | immed(true) | simm(simm13a, 13));
 }
-inline void Assembler::std(Register d, Register s1, Register s2) {
-  v9_dep();
-  assert(d->is_even(), "not even");
-  emit_int32(op(ldst_op) | rd(d) | op3(std_op3) | rs1(s1) | rs2(s2));
-}
-inline void Assembler::std(Register d, Register s1, int simm13a) {
-  v9_dep();
-  assert(d->is_even(), "not even");
-  emit_data(op(ldst_op) | rd(d) | op3(std_op3) | rs1(s1) | immed(true) | simm(simm13a, 13));
-}
 
 inline void Assembler::stba(Register d, Register s1, Register s2, int ia) {
   emit_int32(op(ldst_op) | rd(d) | op3(stb_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2));
@@ -1043,6 +1054,15 @@
 
 // VIS3 instructions
 
+inline void Assembler::addxc(Register s1, Register s2, Register d) {
+  vis3_only();
+  emit_int32(op(arith_op) | rd(d) | op3(addx_op3) | rs1(s1) | opf(addxc_opf) | rs2(s2));
+}
+inline void Assembler::addxccc(Register s1, Register s2, Register d) {
+  vis3_only();
+  emit_int32(op(arith_op) | rd(d) | op3(addx_op3) | rs1(s1) | opf(addxccc_opf) | rs2(s2));
+}
+
 inline void Assembler::movstosw(FloatRegister s, Register d) {
   vis3_only();
   emit_int32(op(arith_op) | rd(d) | op3(mftoi_op3) | opf(mstosw_opf) | fs2(s, FloatRegisterImpl::S));
@@ -1073,6 +1093,10 @@
   vis3_only();
   emit_int32(op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2));
 }
+inline void Assembler::umulxhi(Register s1, Register s2, Register d) {
+  vis3_only();
+  emit_int32(op(arith_op) | rd(d) | op3(umulx_op3) | rs1(s1) | opf(umulxhi_opf) | rs2(s2));
+}
 
 // Crypto SHA instructions
 
@@ -1096,4 +1120,11 @@
   emit_int32(op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(crc32c_op3) | fs1(s1, FloatRegisterImpl::D) | opf(crc32c_opf) | fs2(s2, FloatRegisterImpl::D));
 }
 
+// MPMUL instruction
+
+inline void Assembler::mpmul(int uimm5) {
+  mpmul_only();
+  emit_int32(op(arith_op) | rd(0) | op3(mpmul_op3) | rs1(0) | opf(mpmul_opf) | uimm(uimm5, 5));
+}
+
 #endif // CPU_SPARC_VM_ASSEMBLER_SPARC_INLINE_HPP
--- a/src/hotspot/cpu/sparc/frame_sparc.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/sparc/frame_sparc.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -119,8 +119,8 @@
     reg = regname->as_Register();
   }
   if (reg->is_out()) {
-    assert(_younger_window != NULL, "Younger window should be available");
-    return second_word + (address)&_younger_window[reg->after_save()->sp_offset_in_saved_window()];
+    return _younger_window == NULL ? NULL :
+      second_word + (address)&_younger_window[reg->after_save()->sp_offset_in_saved_window()];
   }
   if (reg->is_local() || reg->is_in()) {
     assert(_window != NULL, "Window should be available");
--- a/src/hotspot/cpu/sparc/globals_sparc.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/sparc/globals_sparc.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -97,12 +97,15 @@
                    writeable) \
                                                                             \
   product(intx, UseVIS, 99,                                                 \
-          "Highest supported VIS instructions set on Sparc")                \
+          "Highest supported VIS instructions set on SPARC")                \
           range(0, 99)                                                      \
                                                                             \
   product(bool, UseCBCond, false,                                           \
           "Use compare and branch instruction on SPARC")                    \
                                                                             \
+  product(bool, UseMPMUL, false,                                            \
+          "Use multi-precision multiply instruction (mpmul) on SPARC")      \
+                                                                            \
   product(bool, UseBlockZeroing, false,                                     \
           "Use special cpu instructions for block zeroing")                 \
                                                                             \
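
UseMPMUL is a product flag defaulting to false, so it is enabled explicitly on the command line, e.g.:

    java -XX:+UseMPMUL -version

Whether the mpmul path is then actually taken also depends on the hardware check: the `mpmul_only()` guard in assembler_sparc.hpp asserts `VM_Version::has_mpmul()`.
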
--- a/src/hotspot/cpu/sparc/jniTypes_sparc.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/sparc/jniTypes_sparc.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -25,9 +25,9 @@
 #ifndef CPU_SPARC_VM_JNITYPES_SPARC_HPP
 #define CPU_SPARC_VM_JNITYPES_SPARC_HPP
 
+#include "jni.h"
 #include "memory/allocation.hpp"
 #include "oops/oop.hpp"
-#include "prims/jni.h"
 
 // This file holds platform-dependent routines used to write primitive jni
 // types to the array of arguments passed into JavaCalls::call
--- a/src/hotspot/cpu/sparc/macroAssembler_sparc.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/sparc/macroAssembler_sparc.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -1574,29 +1574,39 @@
   assert_not_delayed();
   if (use_cbcond(L)) {
     Assembler::cbcond(zero, ptr_cc, s1, 0, L);
-    return;
+  } else {
+    br_null(s1, false, p, L);
+    delayed()->nop();
   }
-  br_null(s1, false, p, L);
-  delayed()->nop();
 }
 
 void MacroAssembler::br_notnull_short(Register s1, Predict p, Label& L) {
   assert_not_delayed();
   if (use_cbcond(L)) {
     Assembler::cbcond(notZero, ptr_cc, s1, 0, L);
-    return;
+  } else {
+    br_notnull(s1, false, p, L);
+    delayed()->nop();
   }
-  br_notnull(s1, false, p, L);
-  delayed()->nop();
 }
 
 // Unconditional short branch
 void MacroAssembler::ba_short(Label& L) {
+  assert_not_delayed();
   if (use_cbcond(L)) {
     Assembler::cbcond(equal, icc, G0, G0, L);
-    return;
+  } else {
+    br(always, false, pt, L);
+    delayed()->nop();
   }
-  br(always, false, pt, L);
+}
+
+// Branch if 'icc' says zero or not (i.e. icc.z == 1|0).
+
+void MacroAssembler::br_icc_zero(bool iszero, Predict p, Label &L) {
+  assert_not_delayed();
+  Condition cf = (iszero ? Assembler::zero : Assembler::notZero);
+  br(cf, false, p, L);
   delayed()->nop();
 }
 
@@ -3834,6 +3844,7 @@
   ld_ptr(mirror, in_bytes(ConstMethod::constants_offset()), mirror);
   ld_ptr(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
   ld_ptr(mirror, mirror_offset, mirror);
+  resolve_oop_handle(mirror);
 }
 
 void MacroAssembler::load_klass(Register src_oop, Register klass) {
--- a/src/hotspot/cpu/sparc/macroAssembler_sparc.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/sparc/macroAssembler_sparc.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -606,7 +606,7 @@
   // offset.  No explicit code generation is needed if the offset is within a certain
   // range (0 <= offset <= page_size).
   //
-  // %%%%%% Currently not done for SPARC
+  // FIXME: Currently not done for SPARC
 
   void null_check(Register reg, int offset = -1);
   static bool needs_explicit_null_check(intptr_t offset);
@@ -648,6 +648,9 @@
   // unconditional short branch
   void ba_short(Label& L);
 
+  // Branch on icc.z (true or not).
+  void br_icc_zero(bool iszero, Predict p, Label &L);
+
   inline void bp( Condition c, bool a, CC cc, Predict p, address d, relocInfo::relocType rt = relocInfo::none );
   inline void bp( Condition c, bool a, CC cc, Predict p, Label& L );
 
@@ -663,19 +666,19 @@
   inline void fbp( Condition c, bool a, CC cc, Predict p, Label& L );
 
   // Sparc shorthands(pp 85, V8 manual, pp 289 V9 manual)
-  inline void cmp(  Register s1, Register s2 );
-  inline void cmp(  Register s1, int simm13a );
+  inline void cmp( Register s1, Register s2 );
+  inline void cmp( Register s1, int simm13a );
 
   inline void jmp( Register s1, Register s2 );
   inline void jmp( Register s1, int simm13a, RelocationHolder const& rspec = RelocationHolder() );
 
   // Check if the call target is out of wdisp30 range (relative to the code cache)
   static inline bool is_far_target(address d);
-  inline void call( address d,  relocInfo::relocType rt = relocInfo::runtime_call_type );
-  inline void call( address d,  RelocationHolder const& rspec);
+  inline void call( address d, relocInfo::relocType rt = relocInfo::runtime_call_type );
+  inline void call( address d, RelocationHolder const& rspec);
 
-  inline void call( Label& L,   relocInfo::relocType rt = relocInfo::runtime_call_type );
-  inline void call( Label& L,  RelocationHolder const& rspec);
+  inline void call( Label& L, relocInfo::relocType rt = relocInfo::runtime_call_type );
+  inline void call( Label& L, RelocationHolder const& rspec);
 
   inline void callr( Register s1, Register s2 );
   inline void callr( Register s1, int simm13a, RelocationHolder const& rspec = RelocationHolder() );
--- a/src/hotspot/cpu/sparc/macroAssembler_sparc.inline.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/sparc/macroAssembler_sparc.inline.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -185,7 +185,7 @@
 }
 
 inline void MacroAssembler::br( Condition c, bool a, Predict p, Label& L ) {
-  // See note[+] on 'avoid_pipeline_stalls()', in "assembler_sparc.inline.hpp".
+  // See note[+] on 'avoid_pipeline_stall()', in "assembler_sparc.inline.hpp".
   avoid_pipeline_stall();
   br(c, a, p, target(L));
 }
--- a/src/hotspot/cpu/sparc/register_sparc.hpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/sparc/register_sparc.hpp	Sat Oct 21 01:23:52 2017 +0200
@@ -236,7 +236,7 @@
   inline VMReg as_VMReg( );
 
   // accessors
-  int encoding() const                                { assert(is_valid(), "invalid register"); return value(); }
+  int encoding() const { assert(is_valid(), "invalid register"); return value(); }
 
  public:
   int encoding(Width w) const {
@@ -258,10 +258,12 @@
     return -1;
   }
 
-  bool  is_valid() const                              { return 0 <= value() && value() < number_of_registers; }
+  bool is_valid() const { return 0 <= value() && value() < number_of_registers; }
+  bool is_even()  const { return (encoding() & 1) == 0; }
+
   const char* name() const;
 
-  FloatRegister successor() const                     { return as_FloatRegister(encoding() + 1); }
+  FloatRegister successor() const { return as_FloatRegister(encoding() + 1); }
 };
 
 
--- a/src/hotspot/cpu/sparc/sparc.ad	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/sparc/sparc.ad	Sat Oct 21 01:23:52 2017 +0200
@@ -2628,7 +2628,6 @@
 %}
 
 
-
 enc_class fmadds (sflt_reg dst, sflt_reg a, sflt_reg b, sflt_reg c) %{
     MacroAssembler _masm(&cbuf);
 
@@ -2651,7 +2650,71 @@
     __ fmadd(FloatRegisterImpl::D, Fra, Frb, Frc, Frd);
 %}
 
-
+enc_class fmsubs (sflt_reg dst, sflt_reg a, sflt_reg b, sflt_reg c) %{
+    MacroAssembler _masm(&cbuf);
+
+    FloatRegister Frd = reg_to_SingleFloatRegister_object($dst$$reg);
+    FloatRegister Fra = reg_to_SingleFloatRegister_object($a$$reg);
+    FloatRegister Frb = reg_to_SingleFloatRegister_object($b$$reg);
+    FloatRegister Frc = reg_to_SingleFloatRegister_object($c$$reg);
+
+    __ fmsub(FloatRegisterImpl::S, Fra, Frb, Frc, Frd);
+%}
+
+enc_class fmsubd (dflt_reg dst, dflt_reg a, dflt_reg b, dflt_reg c) %{
+    MacroAssembler _masm(&cbuf);
+
+    FloatRegister Frd = reg_to_DoubleFloatRegister_object($dst$$reg);
+    FloatRegister Fra = reg_to_DoubleFloatRegister_object($a$$reg);
+    FloatRegister Frb = reg_to_DoubleFloatRegister_object($b$$reg);
+    FloatRegister Frc = reg_to_DoubleFloatRegister_object($c$$reg);
+
+    __ fmsub(FloatRegisterImpl::D, Fra, Frb, Frc, Frd);
+%}
+
+enc_class fnmadds (sflt_reg dst, sflt_reg a, sflt_reg b, sflt_reg c) %{
+    MacroAssembler _masm(&cbuf);
+
+    FloatRegister Frd = reg_to_SingleFloatRegister_object($dst$$reg);
+    FloatRegister Fra = reg_to_SingleFloatRegister_object($a$$reg);
+    FloatRegister Frb = reg_to_SingleFloatRegister_object($b$$reg);
+    FloatRegister Frc = reg_to_SingleFloatRegister_object($c$$reg);
+
+    __ fnmadd(FloatRegisterImpl::S, Fra, Frb, Frc, Frd);
+%}
+
+enc_class fnmaddd (dflt_reg dst, dflt_reg a, dflt_reg b, dflt_reg c) %{
+    MacroAssembler _masm(&cbuf);
+
+    FloatRegister Frd = reg_to_DoubleFloatRegister_object($dst$$reg);
+    FloatRegister Fra = reg_to_DoubleFloatRegister_object($a$$reg);
+    FloatRegister Frb = reg_to_DoubleFloatRegister_object($b$$reg);
+    FloatRegister Frc = reg_to_DoubleFloatRegister_object($c$$reg);
+
+    __ fnmadd(FloatRegisterImpl::D, Fra, Frb, Frc, Frd);
+%}
+
+enc_class fnmsubs (sflt_reg dst, sflt_reg a, sflt_reg b, sflt_reg c) %{
+    MacroAssembler _masm(&cbuf);
+
+    FloatRegister Frd = reg_to_SingleFloatRegister_object($dst$$reg);
+    FloatRegister Fra = reg_to_SingleFloatRegister_object($a$$reg);
+    FloatRegister Frb = reg_to_SingleFloatRegister_object($b$$reg);
+    FloatRegister Frc = reg_to_SingleFloatRegister_object($c$$reg);
+
+    __ fnmsub(FloatRegisterImpl::S, Fra, Frb, Frc, Frd);
+%}
+
+enc_class fnmsubd (dflt_reg dst, dflt_reg a, dflt_reg b, dflt_reg c) %{
+    MacroAssembler _masm(&cbuf);
+
+    FloatRegister Frd = reg_to_DoubleFloatRegister_object($dst$$reg);
+    FloatRegister Fra = reg_to_DoubleFloatRegister_object($a$$reg);
+    FloatRegister Frb = reg_to_DoubleFloatRegister_object($b$$reg);
+    FloatRegister Frc = reg_to_DoubleFloatRegister_object($c$$reg);
+
+    __ fnmsub(FloatRegisterImpl::D, Fra, Frb, Frc, Frd);
+%}
 
 
 enc_class fmovs (dflt_reg dst, dflt_reg src) %{
@@ -7597,7 +7660,7 @@
   ins_pipe(fdivD_reg_reg);
 %}
 
-// Single precision fused floating-point multiply-add (d = a * b + c).
+// Single/Double precision fused floating-point multiply-add (d = a * b + c).
 instruct fmaF_regx4(regF dst, regF a, regF b, regF c) %{
   predicate(UseFMA);
   match(Set dst (FmaF c (Binary a b)));
@@ -7606,7 +7669,6 @@
   ins_pipe(fmaF_regx4);
 %}
 
-// Double precision fused floating-point multiply-add (d = a * b + c).
 instruct fmaD_regx4(regD dst, regD a, regD b, regD c) %{
   predicate(UseFMA);
   match(Set dst (FmaD c (Binary a b)));
@@ -7615,6 +7677,66 @@
   ins_pipe(fmaD_regx4);
 %}
 
+// Additional patterns matching complement versions that we can map directly to
+// variants of the fused multiply-add instructions.
+
+// Single/Double precision fused floating-point multiply-sub (d = a * b - c)
+instruct fmsubF_regx4(regF dst, regF a, regF b, regF c) %{
+  predicate(UseFMA);
+  match(Set dst (FmaF (NegF c) (Binary a b)));
+  format %{ "fmsubs $a,$b,$c,$dst\t# $dst = $a * $b - $c" %}
+  ins_encode(fmsubs(dst, a, b, c));
+  ins_pipe(fmaF_regx4);
+%}
+
+instruct fmsubD_regx4(regD dst, regD a, regD b, regD c) %{
+  predicate(UseFMA);
+  match(Set dst (FmaD (NegD c) (Binary a b)));
+  format %{ "fmsubd $a,$b,$c,$dst\t# $dst = $a * $b - $c" %}
+  ins_encode(fmsubd(dst, a, b, c));
+  ins_pipe(fmaD_regx4);
+%}
+
+// Single/Double precision fused floating-point neg. multiply-add,
+//      d = -1 * a * b - c = -(a * b + c)
+instruct fnmaddF_regx4(regF dst, regF a, regF b, regF c) %{
+  predicate(UseFMA);
+  match(Set dst (FmaF (NegF c) (Binary (NegF a) b)));
+  match(Set dst (FmaF (NegF c) (Binary a (NegF b))));
+  format %{ "fnmadds $a,$b,$c,$dst\t# $dst = -($a * $b + $c)" %}
+  ins_encode(fnmadds(dst, a, b, c));
+  ins_pipe(fmaF_regx4);
+%}
+
+instruct fnmaddD_regx4(regD dst, regD a, regD b, regD c) %{
+  predicate(UseFMA);
+  match(Set dst (FmaD (NegD c) (Binary (NegD a) b)));
+  match(Set dst (FmaD (NegD c) (Binary a (NegD b))));
+  format %{ "fnmaddd $a,$b,$c,$dst\t# $dst = -($a * $b + $c)" %}
+  ins_encode(fnmaddd(dst, a, b, c));
+  ins_pipe(fmaD_regx4);
+%}
+
+// Single/Double precision fused floating-point neg. multiply-sub,
+//      d = -1 * a * b + c = -(a * b - c)
+instruct fnmsubF_regx4(regF dst, regF a, regF b, regF c) %{
+  predicate(UseFMA);
+  match(Set dst (FmaF c (Binary (NegF a) b)));
+  match(Set dst (FmaF c (Binary a (NegF b))));
+  format %{ "fnmsubs $a,$b,$c,$dst\t# $dst = -($a * $b - $c)" %}
+  ins_encode(fnmsubs(dst, a, b, c));
+  ins_pipe(fmaF_regx4);
+%}
+
+instruct fnmsubD_regx4(regD dst, regD a, regD b, regD c) %{
+  predicate(UseFMA);
+  match(Set dst (FmaD c (Binary (NegD a) b)));
+  match(Set dst (FmaD c (Binary a (NegD b))));
+  format %{ "fnmsubd $a,$b,$c,$dst\t# $dst = -($a * $b - $c)" %}
+  ins_encode(fnmsubd(dst, a, b, c));
+  ins_pipe(fmaD_regx4);
+%}
+
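
The negated-operand matches above rest on sign identities that hold exactly under the round-to-nearest mode Java uses, because negation commutes with rounding. A small standalone check against the IEEE fused operation std::fma (illustration only, not matcher code):

    #include <cassert>
    #include <cmath>

    int main() {
      double a = 1.5, b = -2.25, c = 0.75;
      // fnmadd: fused(-a, b, -c) == -(a*b + c), the (FmaX (NegX c) (Binary (NegX a) b)) shape
      assert(std::fma(-a, b, -c) == -std::fma(a, b, c));
      // fnmsub: fused(-a, b, c) == -(a*b - c), the (FmaX c (Binary (NegX a) b)) shape
      assert(std::fma(-a, b, c) == -std::fma(a, b, -c));
      return 0;
    }
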
 //----------Logical Instructions-----------------------------------------------
 // And Instructions
 // Register And
--- a/src/hotspot/cpu/sparc/stubGenerator_sparc.cpp	Fri Oct 20 17:16:05 2017 +0530
+++ b/src/hotspot/cpu/sparc/stubGenerator_sparc.cpp	Sat Oct 21 01:23:52 2017 +0200
@@ -58,7 +58,6 @@
 // Note:  The register L7 is used as L7_thread_cache, and may not be used
 //        any other way within this module.
 
-
 static const Register& Lstub_temp = L2;
 
 // -------------------------------------------------------------------------------------------------------------------------
@@ -4943,7 +4942,7 @@
     return start;
   }
 
-/**
+  /**
    *  Arguments:
    *
    * Inputs:
@@ -4975,6 +4974,773 @@
     return start;
   }
 
+  /**
+   * Arguments:
+   *
+   * Inputs:
+   *   I0   - int* x-addr
+   *   I1   - int  x-len
+   *   I2   - int* y-addr
+   *   I3   - int  y-len
+   *   I4   - int* z-addr   (output vector)
+   *   I5   - int  z-len
+   */
+  address generate_multiplyToLen() {
+    assert(UseMultiplyToLenIntrinsic, "need VIS3 instructions");
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
+    address start = __ pc();
+
+    __ save_frame(0);
+
+    const Register xptr = I0; // input address
+    const Register xlen = I1; // ...and length in 32b-words
+    const Register yptr = I2; //
+    const Register ylen = I3; //
+    const Register zptr = I4; // output address
+    const Register zlen = I5; // ...and length in 32b-words
+
+    /* The minimal "limb" representation suggests that odd length vectors are as
+     * likely as even length dittos. This in turn suggests that we need to cope
+     * with odd/even length arrays and data not aligned properly for 64-bit read
+     * and write operations. We thus use a number of different kernels:
+     *
+     *   if (is_even(x.len) && is_even(y.len))
+     *      if (is_align64(x) && is_align64(y) && is_align64(z))
+     *         if (x.len == y.len && 16 <= x.len && x.len <= 64)
+     *            memv_mult_mpmul(...)
+     *         else
+     *            memv_mult_64x64(...)
+     *      else
+     *         memv_mult_64x64u(...)
+     *   else
+     *      memv_mult_32x32(...)
+     *
+     * Here we assume VIS3 support (for 'umulxhi', 'addxc' and 'addxccc').
+     * In case CBCOND instructions are supported, we will use 'cxbX'. If the
+     * MPMUL instruction is supported, we will generate a kernel using 'mpmul'
+     * (for vectors with proper characteristics).
+     */
+    const Register tmp0 = L0;
+    const Register tmp1 = L1;
+
+    Label L_mult_32x32;
+    Label L_mult_64x64u;
+    Label L_mult_64x64;
+    Label L_exit;
+
+    if_both_even(xlen, ylen, tmp0, false, L_mult_32x32);
+    if_all3_aligned(xptr, yptr, zptr, tmp1, 64, false, L_mult_64x64u);
+
+    if (UseMPMUL) {
+      if_eq(xlen, ylen, false, L_mult_64x64);
+      if_in_rng(xlen, 16, 64, tmp0, tmp1, false, L_mult_64x64);
+
+      // 1. Multiply naturally aligned 64b-datums using a generic 'mpmul' kernel,
+      //    operating on equal length vectors of size [16..64].
+      gen_mult_mpmul(xlen, xptr, yptr, zptr, L_exit);
+    }
+
+    // 2. Multiply naturally aligned 64-bit datums (64x64).
+    __ bind(L_mult_64x64);
+    gen_mult_64x64(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);
+
+    // 3. Multiply unaligned 64-bit datums (64x64).
+    __ bind(L_mult_64x64u);
+    gen_mult_64x64_unaligned(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);
+
+    // 4. Multiply naturally aligned 32-bit datums (32x32).
+    __ bind(L_mult_32x32);
+    gen_mult_32x32(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);
+
+    __ bind(L_exit);
+    __ ret();
+    __ delayed()->restore();
+
+    return start;
+  }
+
+  // Additional helper functions used by multiplyToLen generation.
+
+  void if_both_even(Register r1, Register r2, Register tmp, bool iseven, Label &L)
+  {
+    __ or3(r1, r2, tmp);
+    __ andcc(tmp, 0x1, tmp);
+    __ br_icc_zero(iseven, Assembler::pn, L);
+  }
+
+  void if_all3_aligned(Register r1, Register r2, Register r3,
+                       Register tmp, uint align, bool isalign, Label &L)
+  {
+    __ or3(r1, r2, tmp);
+    __ or3(r3, tmp, tmp);
+    __ andcc(tmp, (align - 1), tmp);
+    __ br_icc_zero(isalign, Assembler::pn, L);
+  }
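
Both helpers above fold several operands into one test by OR-combining them first: a bit position that is clear in every operand is clear in their OR, so a single AND-with-mask checks all of them at once. Hypothetical standalone equivalents (not stub code):

    #include <cassert>
    #include <cstdint>

    static bool both_even(uint64_t a, uint64_t b) {
      return ((a | b) & 1) == 0;                   // low bit clear in both
    }

    static bool all3_aligned(uintptr_t p, uintptr_t q, uintptr_t r,
                             uintptr_t align) {    // align: power of two
      return ((p | q | r) & (align - 1)) == 0;
    }

    int main() {
      assert(both_even(4, 10) && !both_even(4, 7));
      assert(all3_aligned(0x40, 0x80, 0xC0, 64));
      assert(!all3_aligned(0x40, 0x84, 0xC0, 64));
      return 0;
    }
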
+
+  void if_eq(Register x, Register y, bool iseq, Label &L)
+  {
+    Assembler::Condition cf = (iseq ? Assembler::equal : Assembler::notEqual);
+    __ cmp_and_br_short(x, y, cf, Assembler::pt, L);
+  }
+
+  void if_in_rng(Register x, int lb, int ub, Register t1, Register t2, bool inrng, Label &L)
+  {
+    assert(Assembler::is_simm13(lb), "Small ints only!");
+    assert(Assembler::is_simm13(ub), "Small ints only!");
+    // Compute (x - lb) * (ub - x) >= 0
+    // NOTE: With the local use of this routine, we rely on small integers to
+    //       guarantee that we do not overflow in the multiplication.
+    __ add(G0, ub, t2);
+    __ sub(x, lb, t1);
+    __ sub(t2, x, t2);
+    __ mulx(t1, t2, t1);
+    Assembler::Condition cf = (inrng ? Assembler::greaterEqual : Assembler::less);
+    __ cmp_and_br_short(t1, G0, cf, Assembler::pt, L);
+  }
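
if_in_rng uses the branch-free containment test lb <= x <= ub  <=>  (x - lb) * (ub - x) >= 0, valid as long as the product cannot overflow (guaranteed here by the simm13 bounds). A hypothetical standalone equivalent:

    #include <cassert>
    #include <cstdint>

    static bool in_rng(int64_t x, int64_t lb, int64_t ub) {
      return (x - lb) * (ub - x) >= 0;   // both factors >= 0 iff lb <= x <= ub
    }

    int main() {
      assert( in_rng(16, 16, 64));       // bounds are inclusive
      assert( in_rng(64, 16, 64));
      assert(!in_rng(15, 16, 64));
      assert(!in_rng(65, 16, 64));
      return 0;
    }
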
+
+  void ldd_entry(Register base, Register offs, FloatRegister dest)
+  {
+    __ ldd(base, offs, dest);
+    __ inc(offs, 8);
+  }
+
+  void ldx_entry(Register base, Register offs, Register dest)
+  {
+    __ ldx(base, offs, dest);
+    __ inc(offs, 8);
+  }
+
+  void mpmul_entry(int m, Label &next)
+  {
+    __ mpmul(m);
+    __ cbcond(Assembler::equal, Assembler::icc, G0, G0, next);
+  }
+
+  void stx_entry(Label &L, Register r1, Register r2, Register base, Register offs)
+  {
+    __ bind(L);
+    __ stx(r1, base, offs);
+    __ inc(offs, 8);
+    __ stx(r2, base, offs);
+    __ inc(offs, 8);
+  }
+
+  void offs_entry(Label &Lbl0, Label &Lbl1)
+  {
+    assert(Lbl0.is_bound(), "must be");
+    assert(Lbl1.is_bound(), "must be");
+
+    int offset = Lbl0.loc_pos() - Lbl1.loc_pos();
+
+    __ emit_data(offset);
+  }
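
offs_entry emits a 32-bit label difference, building the offset table that the rdpc-based dispatch at the end of gen_mult_mpmul reads back (see L_zst_offs / L_zst_rdpc below). The overall shape is an indexed jump through a table of code offsets; with the GCC/Clang labels-as-values extension the same technique looks like this (purely illustrative):

    #include <cstdio>

    int main() {
      // The address table stands in for the emitted offset vector; the
      // indexed goto stands in for jmp(temp, offs) after rdpc.
      static const void* table[] = { &&z2, &&z1, &&z0 };
      int idx = 1;
      goto *table[idx];
    z2: std::puts("store entry 2"); return 0;
    z1: std::puts("store entry 1"); return 0;
    z0: std::puts("store entry 0"); return 0;
    }
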
+
+  /* Generate the actual multiplication kernels for BigInteger vectors:
+   *
+   *   1. gen_mult_mpmul(...)
+   *
+   *   2. gen_mult_64x64(...)
+   *
+   *   3. gen_mult_64x64_unaligned(...)
+   *
+   *   4. gen_mult_32x32(...)
+   */
+  void gen_mult_mpmul(Register len, Register xptr, Register yptr, Register zptr,
+                      Label &L_exit)
+  {
+    const Register zero = G0;
+    const Register gxp  = G1;   // Need to use global registers across RWs.
+    const Register gyp  = G2;
+    const Register gzp  = G3;
+    const Register offs = G4;
+    const Register disp = G5;
+
+    __ mov(xptr, gxp);
+    __ mov(yptr, gyp);
+    __ mov(zptr, gzp);
+
+    /* Compute jump vector entry:
+     *
+     *   1. mpmul input size (0..31) x 64b
+     *   2. vector input size in 32b limbs (even number)
+     *   3. branch entries in reverse order (31..0), using two
+     *      instructions per entry (2 * 4 bytes).
+     *
+     *   displacement = byte_offset(bra_offset(len))
+     *                = byte_offset((64 - len)/2)
+     *                = 8 * (64 - len)/2
+     *                = 4 * (64 - len)
+     */
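+    // Worked example: for len = 16 (32b limbs, i.e. 8 64b words) we get
+    // disp = 4 * (64 - 16) = 192 bytes = 24 skipped 8-byte entries, so the
+    // jump lands on the last 8 entries of the 32-entry load table below.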
+    Register temp = I5;         // Alright to use input regs. in first batch.
+
+    __ sub(zero, len, temp);
+    __ add(temp, 64, temp);
+    __ sllx(temp, 2, disp);     // disp := (64 - len) << 2
+
+    // Dispatch relative current PC, into instruction table below.
+    __ rdpc(temp);
+    __ add(temp, 16, temp);
+    __ jmp(temp, disp);
+    __ delayed()->clr(offs);
+
+    ldd_entry(gxp, offs, F22);
+    ldd_entry(gxp, offs, F20);
+    ldd_entry(gxp, offs, F18);
+    ldd_entry(gxp, offs, F16);
+    ldd_entry(gxp, offs, F14);
+    ldd_entry(gxp, offs, F12);
+    ldd_entry(gxp, offs, F10);
+    ldd_entry(gxp, offs, F8);
+    ldd_entry(gxp, offs, F6);
+    ldd_entry(gxp, offs, F4);
+    ldx_entry(gxp, offs, I5);
+    ldx_entry(gxp, offs, I4);
+    ldx_entry(gxp, offs, I3);
+    ldx_entry(gxp, offs, I2);
+    ldx_entry(gxp, offs, I1);
+    ldx_entry(gxp, offs, I0);
+    ldx_entry(gxp, offs, L7);
+    ldx_entry(gxp, offs, L6);
+    ldx_entry(gxp, offs, L5);
+    ldx_entry(gxp, offs, L4);
+    ldx_entry(gxp, offs, L3);
+    ldx_entry(gxp, offs, L2);
+    ldx_entry(gxp, offs, L1);
+    ldx_entry(gxp, offs, L0);
+    ldd_entry(gxp, offs, F2);
+    ldd_entry(gxp, offs, F0);
+    ldx_entry(gxp, offs, O5);
+    ldx_entry(gxp, offs, O4);
+    ldx_entry(gxp, offs, O3);
+    ldx_entry(gxp, offs, O2);
+    ldx_entry(gxp, offs, O1);
+    ldx_entry(gxp, offs, O0);
+
+    __ save(SP, -176, SP);
+
+    const Register addr = gxp;  // Alright to reuse 'gxp'.
+
+    // Dispatch relative current PC, into instruction table below.
+    __ rdpc(addr);
+    __ add(addr, 16, addr);
+    __ jmp(addr, disp);
+    __ delayed()->clr(offs);
+
+    ldd_entry(gyp, offs, F58);
+    ldd_entry(gyp, offs, F56);
+    ldd_entry(gyp, offs, F54);
+    ldd_entry(gyp, offs, F52);
+    ldd_entry(gyp, offs, F50);
+    ldd_entry(gyp, offs, F48);
+    ldd_entry(gyp, offs, F46);
+    ldd_entry(gyp, offs, F44);
+    ldd_entry(gyp, offs, F42);
+    ldd_entry(gyp, offs, F40);
+    ldd_entry(gyp, offs, F38);
+    ldd_entry(gyp, offs, F36);
+    ldd_entry(gyp, offs, F34);
+    ldd_entry(gyp, offs, F32);
+    ldd_entry(gyp, offs, F30);
+    ldd_entry(gyp, offs, F28);
+    ldd_entry(gyp, offs, F26);
+    ldd_entry(gyp, offs, F24);
+    ldx_entry(gyp, offs, O5);
+    ldx_entry(gyp, offs, O4);
+    ldx_entry(gyp, offs, O3);
+    ldx_entry(gyp, offs, O2);
+    ldx_entry(gyp, offs, O1);
+    ldx_entry(gyp, offs, O0);
+    ldx_entry(gyp, offs, L7);
+    ldx_entry(gyp, offs, L6);
+    ldx_entry(gyp, offs, L5);
+    ldx_entry(gyp, offs, L4);
+    ldx_entry(gyp, offs, L3);
+    ldx_entry(gyp, offs, L2);
+    ldx_entry(gyp, offs, L1);
+    ldx_entry(gyp, offs, L0);
+
+    __ save(SP, -176, SP);
+    __ save(SP, -176, SP);
+    __ save(SP, -176, SP);
+    __ save(SP, -176, SP);
+    __ save(SP, -176, SP);
+
+    Label L_mpmul_restore_4, L_mpmul_restore_3, L_mpmul_restore_2;
+    Label L_mpmul_restore_1, L_mpmul_restore_0;
+
+    // Dispatch relative current PC, into instruction table below.
+    __ rdpc(addr);
+    __ add(addr, 16, addr);
+    __ jmp(addr, disp);
+    __ delayed()->clr(offs);
+
+    mpmul_entry(31, L_mpmul_restore_0);
+    mpmul_entry(30, L_mpmul_restore_0);
+    mpmul_entry(29, L_mpmul_restore_0);
+    mpmul_entry(28, L_mpmul_restore_0);
+    mpmul_entry(27, L_mpmul_restore_1);
+    mpmul_entry(26, L_mpmul_restore_1);
+    mpmul_entry(25, L_mpmul_restore_1);
+    mpmul_entry(24, L_mpmul_restore_1);
+    mpmul_entry(23, L_mpmul_restore_1);
+    mpmul_entry(22, L_mpmul_restore_1);
+    mpmul_entry(21, L_mpmul_restore_1);
+    mpmul_entry(20, L_mpmul_restore_2);
+    mpmul_entry(19, L_mpmul_restore_2);
+    mpmul_entry(18, L_mpmul_restore_2);
+    mpmul_entry(17, L_mpmul_restore_2);
+    mpmul_entry(16, L_mpmul_restore_2);
+    mpmul_entry(15, L_mpmul_restore_2);
+    mpmul_entry(14, L_mpmul_restore_2);
+    mpmul_entry(13, L_mpmul_restore_3);
+    mpmul_entry(12, L_mpmul_restore_3);
+    mpmul_entry(11, L_mpmul_restore_3);
+    mpmul_entry(10, L_mpmul_restore_3);
+    mpmul_entry( 9, L_mpmul_restore_3);
+    mpmul_entry( 8, L_mpmul_restore_3);
+    mpmul_entry( 7, L_mpmul_restore_3);
+    mpmul_entry( 6, L_mpmul_restore_4);
+    mpmul_entry( 5, L_mpmul_restore_4);
+    mpmul_entry( 4, L_mpmul_restore_4);
+    mpmul_entry( 3, L_mpmul_restore_4);
+    mpmul_entry( 2, L_mpmul_restore_4);
+    mpmul_entry( 1, L_mpmul_restore_4);
+    mpmul_entry( 0, L_mpmul_restore_4);
+
+    Label L_z31, L_z30, L_z29, L_z28, L_z27, L_z26, L_z25, L_z24;
+    Label L_z23, L_z22, L_z21, L_z20, L_z19, L_z18, L_z17, L_z16;
+    Label L_z15, L_z14, L_z13, L_z12, L_z11, L_z10, L_z09, L_z08;
+    Label L_z07, L_z06, L_z05, L_z04, L_z03, L_z02, L_z01, L_z00;
+
+    Label L_zst_base;    // Store sequence base address.
+    __ bind(L_zst_base);
+
+    stx_entry(L_z31, L7, L6, gzp, offs);
+    stx_entry(L_z30, L5, L4, gzp, offs);
+    stx_entry(L_z29, L3, L2, gzp, offs);
+    stx_entry(L_z28, L1, L0, gzp, offs);
+    __ restore();
+    stx_entry(L_z27, O5, O4, gzp, offs);
+    stx_entry(L_z26, O3, O2, gzp, offs);
+    stx_entry(L_z25, O1, O0, gzp, offs);
+    stx_entry(L_z24, L7, L6, gzp, offs);
+    stx_entry(L_z23, L5, L4, gzp, offs);
+    stx_entry(L_z22, L3, L2, gzp, offs);
+    stx_entry(L_z21, L1, L0, gzp, offs);
+    __ restore();
+    stx_entry(L_z20, O5, O4, gzp, offs);
+    stx_entry(L_z19, O3, O2, gzp, offs);
+    stx_entry(L_z18, O1, O0, gzp, offs);
+    stx_entry(L_z17, L7, L6, gzp, offs);
+    stx_entry(L_z16, L5, L4, gzp, offs);
+    stx_entry(L_z15, L3, L2, gzp, offs);
+    stx_entry(L_z14, L1, L0, gzp, offs);
+    __ restore();
+    stx_entry(L_z13, O5, O4, gzp, offs);
+    stx_entry(L_z12, O3, O2, gzp, offs);
+    stx_entry(L_z11, O1, O0, gzp, offs);
+    stx_entry(L_z10, L7, L6, gzp, offs);
+    stx_entry(L_z09, L5, L4, gzp, offs);
+    stx_entry(L_z08, L3, L2, gzp, offs);
+    stx_entry(L_z07, L1, L0, gzp, offs);
+    __ restore();
+    stx_entry(L_z06, O5, O4, gzp, offs);
+    stx_entry(L_z05, O3, O2, gzp, offs);
+    stx_entry(L_z04, O1, O0, gzp, offs);
+    stx_entry(L_z03, L7, L6, gzp, offs);
+    stx_entry(L_z02, L5, L4, gzp, offs);
+    stx_entry(L_z01, L3, L2, gzp, offs);
+    stx_entry(L_z00, L1, L0, gzp, offs);
+
+    __ restore();
+    __ restore();
+    // Exit out of 'mpmul' routine, back to multiplyToLen.
+    __ ba_short(L_exit);
+
+    Label L_zst_offs;
+    __ bind(L_zst_offs);
+
+    offs_entry(L_z31, L_zst_base);  // index 31: 2048x2048
+    offs_entry(L_z30, L_zst_base);
+    offs_entry(L_z29, L_zst_base);
+    offs_entry(L_z28, L_zst_base);
+    offs_entry(L_z27, L_zst_base);
+    offs_entry(L_z26, L_zst_base);
+    offs_entry(L_z25, L_zst_base);
+    offs_entry(L_z24, L_zst_base);
+    offs_entry(L_z23, L_zst_base);
+    offs_entry(L_z22, L_zst_base);
+    offs_entry(L_z21, L_zst_base);
+    offs_entry(L_z20, L_zst_base);
+    offs_entry(L_z19, L_zst_base);
+    offs_entry(L_z18, L_zst_base);
+    offs_entry(L_z17, L_zst_base);
+    offs_entry(L_z16, L_zst_base);
+    offs_entry(L_z15, L_zst_base);
+    offs_entry(L_z14, L_zst_base);
+    offs_entry(L_z13, L_zst_base);
+    offs_entry(L_z12, L_zst_base);
+    offs_entry(L_z11, L_zst_base);
+    offs_entry(L_z10, L_zst_base);
+    offs_entry(L_z09, L_zst_base);
+    offs_entry(L_z08, L_zst_base);
+    offs_entry(L_z07, L_zst_base);
+    offs_entry(L_z06, L_zst_base);
+    offs_entry(L_z05, L_zst_base);
+    offs_entry(L_z04, L_zst_base);
+    offs_entry(L_z03, L_zst_base);
+    offs_entry(L_z02, L_zst_base);
+    offs_entry(L_z01, L_zst_base);
+    offs_entry(L_z00, L_zst_base);  // index  0:   64x64
+
+    __ bind(L_mpmul_restore_4);
+    __ restore();
+    __ bind(L_mpmul_restore_3);
+    __ restore();
+    __ bind(L_mpmul_restore_2);
+    __ restore();
+    __ bind(L_mpmul_restore_1);
+    __ restore();
+    __ bind(L_mpmul_restore_0);
+
+    // Dispatch via offset vector entry, into z-store sequence.
+    Label L_zst_rdpc;
+    __ bind(L_zst_rdpc);
+
+    assert(L_zst_base.is_bound(), "must be");
+    assert(L_zst_offs.is_bound(), "must be");
+    assert(L_zst_rdpc.is_bound(), "must be");
+
+    int dbase = L_zst_rdpc.loc_pos() - L_zst_base.loc_pos();
+    int doffs = L_zst_rdpc.loc_pos() - L_zst_offs.loc_pos();
+
+    temp = gyp;   // Alright to reuse 'gyp'.
+
+    __ rdpc(addr);
+    __ sub(addr, doffs, temp);
+    __ srlx(disp, 1, disp);
+    __ lduw(temp, disp, offs);
+    __ sub(addr, dbase, temp);
+    __ jmp(temp, offs);
+    __ delayed()->clr(offs);
+  }
+
+  void gen_mult_64x64(Register xp, Register xn,
+                      Register yp, Register yn,
+                      Register zp, Register zn, Label &L_exit)
+  {
+    // Assuming that a stack frame has already been created, i.e. local and
+    // output registers are available for immediate use.
+
+    const Register ri = L0;     // Outer loop index, xv[i]
+    const Register rj = L1;     // Inner loop index, yv[j]
+    const Register rk = L2;     // Output loop index, zv[k]
+    const Register rx = L4;     // x-vector datum [i]
+    const Register ry = L5;     // y-vector datum [j]
+    const Register rz = L6;     // z-vector datum [k]
+    const Register rc = L7;     // carry over (to z-vector datum [k-1])
+
+    const Register lop = O0;    // lo-64b product
+    const Register hip = O1;    // hi-64b product
+
+    const Register zero = G0;
+
+    Label L_loop_i,  L_exit_loop_i;
+    Label L_loop_j;
+    Label L_loop_i2, L_exit_loop_i2;
+
+    __ srlx(xn, 1, xn);         // index for u32 to u64 ditto
+    __ srlx(yn, 1, yn);         // index for u32 to u64 ditto
+    __ srlx(zn, 1, zn);         // index for u32 to u64 ditto
+    __ dec(xn);                 // Adjust [0..(N/2)-1]
+    __ dec(yn);
+    __ dec(zn);
+    __ clr(rc);                 // u64 c = 0
+    __ sllx(xn, 3, ri);         // int i = xn (byte offset i = 8*xn)
+    __ sllx(yn, 3, rj);         // int j = yn (byte offset j = 8*yn)
+    __ sllx(zn, 3, rk);         // int k = zn (byte offset k = 8*zn)
+    __ ldx(yp, rj, ry);         // u64 y = yp[yn]
+
+    // for (int i = xn; i >= 0; i--)
+    __ bind(L_loop_i);
+
+    __ cmp_and_br_short(ri, 0,  // i >= 0
+                        Assembler::less, Assembler::pn, L_exit_loop_i);
+    __ ldx(xp, ri, rx);         // x = xp[i]
+    __ mulx(rx, ry, lop);       // lo-64b-part of result 64x64
+    __ umulxhi(rx, ry, hip);    // hi-64b-part of result 64x64
+    __ addcc(rc, lop, lop);     // Accumulate lower order bits (producing carry)
+    __ addxc(hip, zero, rc);    // carry over to next datum [k-1]
+    __ stx(lop, zp, rk);        // z[k] = lop
+    __ dec(rk, 8);              // k--
+    __ dec(ri, 8);              // i--
+    __ ba_short(L_loop_i);
+
+    __ bind(L_exit_loop_i);
+    __ stx(rc, zp, rk);         // z[k] = c
+
+    // for (int j = yn - 1; j >= 0; j--)
+    __ sllx(yn, 3, rj);         // int j = yn - 1 (byte offset j = 8*yn)
+    __ dec(rj, 8);
+
+    __ bind(L_loop_j);
+
+    __ cmp_and_br_short(rj, 0,  // j >= 0
+                        Assembler::less, Assembler::pn, L_exit);
+    __ clr(rc);                 // u64 c = 0
+    __ ldx(yp, rj, ry);         // u64 y = yp[j]
+
+    // for (int i = xn, k = --zn; i >= 0; i--)
+    __ dec(zn);                 // --zn
+    __ sllx(xn, 3, ri);         // int i = xn (byte offset i = 8*xn)
+    __ sllx(zn, 3, rk);         // int k = zn (byte offset k = 8*zn)
+
+    __ bind(L_loop_i2);
+
+    __ cmp_and_br_short(ri, 0,  // i >= 0
+                        Assembler::less, Assembler::pn, L_exit_loop_i2);
+    __ ldx(xp, ri, rx);         // x = xp[i]
+    __ ldx(zp, rk, rz);         // z = zp[k], accumulator
+    __ mulx(rx, ry, lop);       // lo-64b-part of result 64x64
+    __ umulxhi(rx, ry, hip);    // hi-64b-part of result 64x64
+    __ addcc(rz, rc, rz);       // Accumulate lower order bits,
+    __ addxc(hip, zero, rc);    // Accumulate higher order bits to carry
+    __ addcc(rz, lop, rz);      //    z += lo(p) + c
+    __ addxc(rc, zero, rc);
+    __ stx(rz, zp, rk);         // zp[k] = z
+    __ dec(rk, 8);              // k--
+    __ dec(ri, 8);              // i--
+    __ ba_short(L_loop_i2);
+
+    __ bind(L_exit_loop_i2);
+    __ stx(rc, zp, rk);         // z[k] = c
+    __ dec(rj, 8);              // j--
+    __ ba_short(L_loop_j);
+  }
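
gen_mult_64x64 is a straight schoolbook multiply over 64-bit words: the first loop produces the partial row for the most significant y word, the second loop accumulates the remaining rows into z. The same control structure in portable form (a reference sketch with the GCC/Clang `unsigned __int128` extension standing in for mulx/umulxhi/addxc; not the generated code):

    #include <cstdint>

    typedef unsigned __int128 u128;   // GCC/Clang extension

    // z must hold xn + yn words, most significant word first, mirroring the
    // byte offsets the stub walks downwards.
    static void mult_64x64(const uint64_t* x, int xn,
                           const uint64_t* y, int yn, uint64_t* z) {
      uint64_t c = 0;
      for (int i = xn - 1, k = xn + yn - 1; i >= 0; i--, k--) {  // first row
        u128 p = (u128)x[i] * y[yn - 1] + c;
        z[k] = (uint64_t)p;
        c    = (uint64_t)(p >> 64);
      }
      z[yn - 1] = c;
      for (int j = yn - 2; j >= 0; j--) {                        // other rows
        c = 0;
        for (int i = xn - 1, k = xn + j; i >= 0; i--, k--) {
          u128 p = (u128)x[i] * y[j] + z[k] + c;  // fits: sum < 2^128
          z[k] = (uint64_t)p;
          c    = (uint64_t)(p >> 64);
        }
        z[j] = c;
      }
    }
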
+
+  void gen_mult_64x64_unaligned(Register xp, Register xn,
+                                Register yp, Register yn,
+                                Register zp, Register zn, Label &L_exit)
+  {
+    // Assuming that a stack frame has already been created, i.e. local and
+    // output registers are available for use.
+
+    const Register xpc = L0;    // Outer loop cursor, xp[i]
+    const Register ypc = L1;    // Inner loop cursor, yp[j]
+    const Register zpc = L2;    // Output loop cursor, zp[k]
+    const Register rx  = L4;    // x-vector datum [i]
+    const Register ry  = L5;    // y-vector datum [j]
+    const Register rz  = L6;    // z-vector datum [k]
+    const Register rc  = L7;    // carry over (to z-vector datum [k-1])
+    const Register rt  = O2;
+
+    const Register lop = O0;    // lo-64b product
+    const Register hip = O1;    // hi-64b product
+
+    const Register zero = G0;
+
+    Label L_loop_i,  L_exit_loop_i;
+    Label L_loop_j;
+    Label L_loop_i2, L_exit_loop_i2;
+
+    __ srlx(xn, 1, xn);         // index for u32 to u64 ditto
+    __ srlx(yn, 1, yn);         // index for u32 to u64 ditto
+    __ srlx(zn, 1, zn);         // index for u32 to u64 ditto
+    __ dec(xn);                 // Adjust [0..(N/2)-1]
+    __ dec(yn);
+    __ dec(zn);
+    __ clr(rc);                 // u64 c = 0
+    __ sllx(xn, 3, xpc);        // u32* xpc = &xp[xn] (byte offset 8*xn)
+    __ add(xp, xpc, xpc);
+    __ sllx(yn, 3, ypc);        // u32* ypc = &yp[yn] (byte offset 8*yn)
+    __ add(yp, ypc, ypc);
+    __ sllx(zn, 3, zpc);        // u32* zpc = &zp[zn] (byte offset 8*zn)
+    __ add(zp, zpc, zpc);
+    __ lduw(ypc, 0, rt);        // u64 y = yp[yn]
+    __ lduw(ypc, 4, ry);        //   ...
+    __ sllx(rt, 32, rt);
+    __ or3(rt, ry, ry);
+
+    // for (int i = xn; i >= 0; i--)
+    __ bind(L_loop_i);
+
+    __ cmp_and_br_short(xpc, xp, // i >= 0
+                        Assembler::less, Assembler::pn, L_exit_loop_i);
+    __ lduw(xpc, 0, rt);        // u64 x = xp[i]
+    __ lduw(xpc, 4, rx);        //   ...
+    __ sllx(rt, 32, rt);
+    __ or3(rt, rx, rx);
+    __ mulx(rx, ry, lop);       // lo-64b-part of result 64x64
+    __ umulxhi(rx, ry, hip);    // hi-64b-part of result 64x64
+    __ addcc(rc, lop, lop);     // Accumulate lower order bits (producing carry)
+    __ addxc(hip, zero, rc);    // carry over to next datum [k-1]
+    __ srlx(lop, 32, rt);
+    __ stw(rt, zpc, 0);         // z[k] = lop
+    __ stw(lop, zpc, 4);        //   ...
+    __ dec(zpc, 8);             // k-- (zpc--)
+    __ dec(xpc, 8);             // i-- (xpc--)
+    __ ba_short(L_loop_i);
+
+    __ bind(L_exit_loop_i);
+    __ srlx(rc, 32, rt);
+    __ stw(rt, zpc, 0);         // z[k] = c
+    __ stw(rc, zpc, 4);
+
+    // for (int j = yn - 1; j >= 0; j--)
+    __ sllx(yn, 3, ypc);        // u32* ypc = &yp[yn] (byte offset 8*yn)
+    __ add(yp, ypc, ypc);
+    __ dec(ypc, 8);             // yn - 1 (ypc--)
+
+    __ bind(L_loop_j);
+
+    __ cmp_and_br_short(ypc, yp, // j >= 0
+                        Assembler::less, Assembler::pn, L_exit);
+    __ clr(rc);                 // u64 c = 0
+    __ lduw(ypc, 0, rt);        // u64 y = yp[j] (= *ypc)
+    __ lduw(ypc, 4, ry);        //   ...
+    __ sllx(rt, 32, rt);
+    __ or3(rt, ry, ry);
+
+    // for (int i = xn, k = --zn; i >= 0; i--)
+    __ sllx(xn, 3, xpc);        // u32* xpc = &xp[xn] (byte offset 8*xn)
+    __ add(xp, xpc, xpc);
+    __ dec(zn);                 // --zn
+    __ sllx(zn, 3, zpc);        // u32* zpc = &zp[zn] (byte offset 8*zn)
+    __ add(zp, zpc, zpc);
+
+    __ bind(L_loop_i2);
+
+    __ cmp_and_br_short(xpc, xp, // i >= 0
+                        Assembler::less, Assembler::pn, L_exit_loop_i2);
+    __ lduw(xpc, 0, rt);        // u64 x = xp[i] (= *xpc)
+    __ lduw(xpc, 4, rx);        //   ...
+    __ sllx(rt, 32, rt);
+    __ or3(rt, rx, rx);
+
+    __ lduw(zpc, 0, rt);        // u64 z = zp[k] (= *zpc)
+    __ lduw(zpc, 4, rz);        //   ...
+    __ sllx(rt, 32, rt);
+    __ or3(rt, rz, rz);
+
+    __ mulx(rx, ry, lop);       // lo-64b-part of result 64x64
+    __ umulxhi(rx, ry, hip);    // hi-64b-part of result 64x64
+    __ addcc(rz, rc, rz);       // Accumulate lower order bits...
+    __ addxc(hip, zero, rc);    // Accumulate higher order bits to carry
+    __ addcc(rz, lop, rz);      // ... z += lo(p) + c
+    __ addxccc(rc, zero, rc);
+    __ srlx(rz, 32, rt);
+    __ stw(rt, zpc, 0);         // zp[k] = z    (*zpc = z)
+    __ stw(rz, zpc, 4);
+    __ dec(zpc, 8);             // k-- (zpc--)
+    __ dec(xpc, 8);             // i-- (xpc--)
+    __ ba_short(L_loop_i2);
+
+    __ bind(L_exit_loop_i2);
+    __ srlx(rc, 32, rt);
+    __ stw(rt, zpc, 0);         // z[k] = c
+    __ stw(rc, zpc, 4);
+    __ dec(ypc, 8);             // j-- (ypc--)
+    __ ba_short(L_loop_j);
+  }
+
+  void gen_mult_32x32(Register xp, Register xn,
+                      Register yp, Register yn,
+                      Register zp, Register zn, Label &L_exit)
+  {
+    // Assuming that a stack frame has already been created, i.e. local and
+    // output registers are available for use.
+
+    const Register ri = L0;     // Outer loop index, xv[i]
+    const Register rj = L1;     // Inner loop index, yv[j]
+    const Register rk = L2;     // Output loop index, zv[k]
+    const Register rx = L4;     // x-vector datum [i]
+    const Register ry = L5;     // y-vector datum [j]
+    const Register rz = L6;     // z-vector datum [k]
+    const Register rc = L7;     // carry over (to z-vector datum [k-1])
+
+    const Register p64 = O0;    // 64b product
+    const Register z65 = O1;    // carry+64b accumulator
+    const Register c65 = O2;    // carry at bit 65
+    const Register c33 = O2;    // carry at bit 33 (after shift)
+
+    const Register zero = G0;
+
+    Label L_loop_i,  L_exit_loop_i;
+    Label L_loop_j;
+    Label L_loop_i2, L_exit_loop_i2;
+
+    __ dec(xn);                 // Adjust [0..N-1]
+    __ dec(yn);
+    __ dec(zn);
+    __ clr(rc);                 // u32 c = 0
+    __ sllx(xn, 2, ri);         // int i = xn (byte offset i = 4*xn)
+    __ sllx(yn, 2, rj);         // int j = yn (byte offset j = 4*yn)
+    __ sllx(zn, 2, rk);         // int k = zn (byte offset k = 4*zn)
+    __ lduw(yp, rj, ry);        // u32 y = yp[yn]
+
+    // for (int i = xn; i >= 0; i--)
+    __ bind(L_loop_i);
+
+    __ cmp_and_br_short(ri, 0,  // i >= 0
+                        Assembler::less, Assembler::pn, L_exit_loop_i);
+    __ lduw(xp, ri, rx);        // x = xp[i]
+    __ mulx(rx, ry, p64);       // 64b result of 32x32
+    __ addcc(rc, p64, z65);     // Accumulate to 65 bits (producing carry)
+    __ addxc(zero, zero, c65);  // Materialise carry (in bit 65) into lsb,
+    __ sllx(c65, 32, c33);      // and shift into bit 33
+    __ srlx(z65, 32, rc);       // carry = c33 | hi(z65) >> 32
+    __ add(c33, rc, rc);        // carry over to next datum [k-1]
+    __ stw(z65, zp, rk);        // z[k] = lo(z65)
+    __ dec(rk, 4);              // k--
+    __ dec(ri, 4);              // i--
+    __ ba_short(L_loop_i);
+
+    __ bind(L_exit_loop_i);
+    __ stw(rc, zp, rk);         // z[k] = c
+
+    // for (int j = yn - 1; j >= 0; j--)
+    __ sllx(yn, 2, rj);         // int j = yn - 1 (byte offset j = 4*yn)
+    __ dec(rj, 4);
+
+    __ bind(L_loop_j);
+
+    __ cmp_and_br_short(rj, 0,  // j >= 0
+                        Assembler::less, Assembler::pn, L_exit);
+    __ clr(rc);                 // u32 c = 0
+    __ lduw(yp, rj, ry);        // u32 y = yp[j]
+
+    // for (int i = xn, k = --zn; i >= 0; i--)