changeset 4395:1b08add4c387

8005544: Use 256bit YMM registers in arraycopy stubs on x86 Summary: Use YMM registers in arraycopy and array_fill stubs. Reviewed-by: roland, twisti
author kvn
date Thu, 03 Jan 2013 16:30:47 -0800
parents 1da9509ab853
children 0deee949d657
files src/cpu/x86/vm/assembler_x86.cpp src/cpu/x86/vm/assembler_x86.hpp src/cpu/x86/vm/stubGenerator_x86_32.cpp src/cpu/x86/vm/stubGenerator_x86_64.cpp
diffstat 4 files changed, 198 insertions(+), 92 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/x86/vm/assembler_x86.cpp	Thu Jan 03 15:09:55 2013 -0800
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Thu Jan 03 16:30:47 2013 -0800
@@ -3653,6 +3653,15 @@
   emit_byte(0x01);
 }
 
+// duplicate 4-bytes integer data from src into 8 locations in dest
+void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_avx2(), "");
+  bool vector256 = true;
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
+  emit_byte(0x58);
+  emit_byte((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::vzeroupper() {
   assert(VM_Version::supports_avx(), "");
   (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
@@ -11276,29 +11285,53 @@
     {
       assert( UseSSE >= 2, "supported cpu only" );
       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
-      // Fill 32-byte chunks
       movdl(xtmp, value);
-      pshufd(xtmp, xtmp, 0);
-
-      subl(count, 8 << shift);
-      jcc(Assembler::less, L_check_fill_8_bytes);
-      align(16);
-
-      BIND(L_fill_32_bytes_loop);
-
-      if (UseUnalignedLoadStores) {
-        movdqu(Address(to, 0), xtmp);
-        movdqu(Address(to, 16), xtmp);
+      if (UseAVX >= 2 && UseUnalignedLoadStores) {
+        // Fill 64-byte chunks
+        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
+        vpbroadcastd(xtmp, xtmp);
+
+        subl(count, 16 << shift);
+        jcc(Assembler::less, L_check_fill_32_bytes);
+        align(16);
+
+        BIND(L_fill_64_bytes_loop);
+        vmovdqu(Address(to, 0), xtmp);
+        vmovdqu(Address(to, 32), xtmp);
+        addptr(to, 64);
+        subl(count, 16 << shift);
+        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
+
+        BIND(L_check_fill_32_bytes);
+        addl(count, 8 << shift);
+        jccb(Assembler::less, L_check_fill_8_bytes);
+        vmovdqu(Address(to, 0), xtmp);
+        addptr(to, 32);
+        subl(count, 8 << shift);
       } else {
-        movq(Address(to, 0), xtmp);
-        movq(Address(to, 8), xtmp);
-        movq(Address(to, 16), xtmp);
-        movq(Address(to, 24), xtmp);
+        // Fill 32-byte chunks
+        pshufd(xtmp, xtmp, 0);
+
+        subl(count, 8 << shift);
+        jcc(Assembler::less, L_check_fill_8_bytes);
+        align(16);
+
+        BIND(L_fill_32_bytes_loop);
+
+        if (UseUnalignedLoadStores) {
+          movdqu(Address(to, 0), xtmp);
+          movdqu(Address(to, 16), xtmp);
+        } else {
+          movq(Address(to, 0), xtmp);
+          movq(Address(to, 8), xtmp);
+          movq(Address(to, 16), xtmp);
+          movq(Address(to, 24), xtmp);
+        }
+
+        addptr(to, 32);
+        subl(count, 8 << shift);
+        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
       }
-
-      addptr(to, 32);
-      subl(count, 8 << shift);
-      jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
       BIND(L_check_fill_8_bytes);
       addl(count, 8 << shift);
       jccb(Assembler::zero, L_exit);
--- a/src/cpu/x86/vm/assembler_x86.hpp	Thu Jan 03 15:09:55 2013 -0800
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Thu Jan 03 16:30:47 2013 -0800
@@ -1775,6 +1775,9 @@
   void vextractf128h(Address dst, XMMRegister src);
   void vextracti128h(Address dst, XMMRegister src);
 
+  // duplicate 4-bytes integer data from src into 8 locations in dest
+  void vpbroadcastd(XMMRegister dst, XMMRegister src);
+
   // AVX instruction which is used to clear upper 128 bits of YMM registers and
   // to avoid transaction penalty between AVX and SSE states. There is no
   // penalty if legacy SSE instructions are encoded using VEX prefix because
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Jan 03 15:09:55 2013 -0800
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Jan 03 16:30:47 2013 -0800
@@ -825,16 +825,22 @@
     __ align(OptoLoopAlignment);
   __ BIND(L_copy_64_bytes_loop);
 
-    if(UseUnalignedLoadStores) {
-      __ movdqu(xmm0, Address(from, 0));
-      __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
-      __ movdqu(xmm1, Address(from, 16));
-      __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
-      __ movdqu(xmm2, Address(from, 32));
-      __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
-      __ movdqu(xmm3, Address(from, 48));
-      __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
-
+    if (UseUnalignedLoadStores) {
+      if (UseAVX >= 2) {
+        __ vmovdqu(xmm0, Address(from,  0));
+        __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
+        __ vmovdqu(xmm1, Address(from, 32));
+        __ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
+      } else {
+        __ movdqu(xmm0, Address(from, 0));
+        __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
+        __ movdqu(xmm1, Address(from, 16));
+        __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
+        __ movdqu(xmm2, Address(from, 32));
+        __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
+        __ movdqu(xmm3, Address(from, 48));
+        __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
+      }
     } else {
       __ movq(xmm0, Address(from, 0));
       __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Jan 03 15:09:55 2013 -0800
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Jan 03 16:30:47 2013 -0800
@@ -1314,23 +1314,54 @@
   //   end_to       - destination array end address
   //   qword_count  - 64-bits element count, negative
   //   to           - scratch
-  //   L_copy_32_bytes - entry label
+  //   L_copy_bytes - entry label
   //   L_copy_8_bytes  - exit  label
   //
-  void copy_32_bytes_forward(Register end_from, Register end_to,
+  void copy_bytes_forward(Register end_from, Register end_to,
                              Register qword_count, Register to,
-                             Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
+                             Label& L_copy_bytes, Label& L_copy_8_bytes) {
     DEBUG_ONLY(__ stop("enter at entry label, not here"));
     Label L_loop;
     __ align(OptoLoopAlignment);
-  __ BIND(L_loop);
-    if(UseUnalignedLoadStores) {
-      __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
-      __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
-      __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
-      __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
-
+    if (UseUnalignedLoadStores) {
+      Label L_end;
+      // Copy 64-bytes per iteration
+      __ BIND(L_loop);
+      if (UseAVX >= 2) {
+        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
+        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
+        __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
+        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
+      } else {
+        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
+        __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
+        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
+        __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
+        __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
+        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
+        __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
+        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
+      }
+      __ BIND(L_copy_bytes);
+      __ addptr(qword_count, 8);
+      __ jcc(Assembler::lessEqual, L_loop);
+      __ subptr(qword_count, 4);  // sub(8) and add(4)
+      __ jccb(Assembler::greater, L_end);
+      // Copy trailing 32 bytes
+      if (UseAVX >= 2) {
+        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
+      } else {
+        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
+        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
+        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
+      }
+      __ addptr(qword_count, 4);
+      __ BIND(L_end);
     } else {
+      // Copy 32-bytes per iteration
+      __ BIND(L_loop);
       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
@@ -1339,15 +1370,15 @@
       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
+
+      __ BIND(L_copy_bytes);
+      __ addptr(qword_count, 4);
+      __ jcc(Assembler::lessEqual, L_loop);
     }
-  __ BIND(L_copy_32_bytes);
-    __ addptr(qword_count, 4);
-    __ jcc(Assembler::lessEqual, L_loop);
     __ subptr(qword_count, 4);
     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
   }
 
-
   // Copy big chunks backward
   //
   // Inputs:
@@ -1355,23 +1386,55 @@
   //   dest         - destination array address
   //   qword_count  - 64-bits element count
   //   to           - scratch
-  //   L_copy_32_bytes - entry label
+  //   L_copy_bytes - entry label
   //   L_copy_8_bytes  - exit  label
   //
-  void copy_32_bytes_backward(Register from, Register dest,
+  void copy_bytes_backward(Register from, Register dest,
                               Register qword_count, Register to,
-                              Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
+                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
     DEBUG_ONLY(__ stop("enter at entry label, not here"));
     Label L_loop;
     __ align(OptoLoopAlignment);
-  __ BIND(L_loop);
-    if(UseUnalignedLoadStores) {
-      __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
-      __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
-      __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
-      __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
-
+    if (UseUnalignedLoadStores) {
+      Label L_end;
+      // Copy 64-bytes per iteration
+      __ BIND(L_loop);
+      if (UseAVX >= 2) {
+        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
+        __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
+        __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
+        __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
+      } else {
+        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
+        __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
+        __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
+        __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
+        __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
+        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
+        __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
+        __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
+      }
+      __ BIND(L_copy_bytes);
+      __ subptr(qword_count, 8);
+      __ jcc(Assembler::greaterEqual, L_loop);
+
+      __ addptr(qword_count, 4);  // add(8) and sub(4)
+      __ jccb(Assembler::less, L_end);
+      // Copy trailing 32 bytes
+      if (UseAVX >= 2) {
+        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
+        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
+      } else {
+        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
+        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
+        __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
+        __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
+      }
+      __ subptr(qword_count, 4);
+      __ BIND(L_end);
     } else {
+      // Copy 32-bytes per iteration
+      __ BIND(L_loop);
       __ movq(to, Address(from, qword_count, Address::times_8, 24));
       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
       __ movq(to, Address(from, qword_count, Address::times_8, 16));
@@ -1380,10 +1443,11 @@
       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
       __ movq(to, Address(from, qword_count, Address::times_8,  0));
       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
+
+      __ BIND(L_copy_bytes);
+      __ subptr(qword_count, 4);
+      __ jcc(Assembler::greaterEqual, L_loop);
     }
-  __ BIND(L_copy_32_bytes);
-    __ subptr(qword_count, 4);
-    __ jcc(Assembler::greaterEqual, L_loop);
     __ addptr(qword_count, 4);
     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
   }
@@ -1413,7 +1477,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
 
-    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
+    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
     Label L_copy_byte, L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
@@ -1445,7 +1509,7 @@
     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
     __ negptr(qword_count); // make the count negative
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -1488,8 +1552,8 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
-    // Copy in 32-bytes chunks
-    copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
     __ jmp(L_copy_4_bytes);
 
     return start;
@@ -1516,7 +1580,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
 
-    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
+    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register count       = rdx;  // elements count
@@ -1559,10 +1623,10 @@
     // Check for and copy trailing dword
   __ BIND(L_copy_4_bytes);
     __ testl(byte_count, 4);
-    __ jcc(Assembler::zero, L_copy_32_bytes);
+    __ jcc(Assembler::zero, L_copy_bytes);
     __ movl(rax, Address(from, qword_count, Address::times_8));
     __ movl(Address(to, qword_count, Address::times_8), rax);
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -1577,8 +1641,8 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
-    // Copy in 32-bytes chunks
-    copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
 
     restore_arg_regs();
     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
@@ -1613,7 +1677,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
 
-    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
+    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register count       = rdx;  // elements count
@@ -1644,7 +1708,7 @@
     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
     __ negptr(qword_count);
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -1680,8 +1744,8 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
-    // Copy in 32-bytes chunks
-    copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
     __ jmp(L_copy_4_bytes);
 
     return start;
@@ -1728,7 +1792,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
 
-    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes;
+    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register count       = rdx;  // elements count
@@ -1763,10 +1827,10 @@
     // Check for and copy trailing dword
   __ BIND(L_copy_4_bytes);
     __ testl(word_count, 2);
-    __ jcc(Assembler::zero, L_copy_32_bytes);
+    __ jcc(Assembler::zero, L_copy_bytes);
     __ movl(rax, Address(from, qword_count, Address::times_8));
     __ movl(Address(to, qword_count, Address::times_8), rax);
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -1781,8 +1845,8 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
-    // Copy in 32-bytes chunks
-    copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
 
     restore_arg_regs();
     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
@@ -1818,7 +1882,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
 
-    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
+    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register count       = rdx;  // elements count
@@ -1854,7 +1918,7 @@
     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
     __ negptr(qword_count);
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -1881,8 +1945,8 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
-    // Copy 32-bytes chunks
-    copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
     __ jmp(L_copy_4_bytes);
 
     return start;
@@ -1910,7 +1974,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
 
-    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
+    Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register count       = rdx;  // elements count
@@ -1944,10 +2008,10 @@
 
     // Check for and copy trailing dword
     __ testl(dword_count, 1);
-    __ jcc(Assembler::zero, L_copy_32_bytes);
+    __ jcc(Assembler::zero, L_copy_bytes);
     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -1965,8 +2029,8 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
-    // Copy in 32-bytes chunks
-    copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
 
    __ bind(L_exit);
      if (is_oop) {
@@ -2004,7 +2068,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
 
-    Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
+    Label L_copy_bytes, L_copy_8_bytes, L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register qword_count = rdx;  // elements count
@@ -2036,7 +2100,7 @@
     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
     __ negptr(qword_count);
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -2055,8 +2119,8 @@
       __ ret(0);
     }
 
-    // Copy 64-byte chunks
-    copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
 
     if (is_oop) {
     __ BIND(L_exit);
@@ -2093,7 +2157,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
 
-    Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
+    Label L_copy_bytes, L_copy_8_bytes, L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register qword_count = rdx;  // elements count
@@ -2119,7 +2183,7 @@
       gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
     }
 
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -2138,8 +2202,8 @@
       __ ret(0);
     }
 
-    // Copy in 32-bytes chunks
-    copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
 
     if (is_oop) {
     __ BIND(L_exit);