comparison src/cpu/ppc/vm/stubGenerator_ppc.cpp @ 12219:0f2a78897867

8166684: PPC64: implement intrinsic code with vector instructions for Unsafe.copyMemory()
Reviewed-by: simonis, mdoerr
Contributed-by: Michihiro Horie <horie@jp.ibm.com>
author mdoerr
date Fri, 21 Oct 2016 10:27:32 +0200
parents ebbfdf26a4ee
children
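Note: the change switches the bulk-copy loops of the copy stubs to the VSX lxvd2x/stxvd2x instructions, which move 16 bytes per instruction and therefore 32 bytes per loop iteration. As orientation only, the memory-access pattern those loops implement corresponds to the scalar C++ sketch below; the function name and structure are illustrative and not part of the changeset, which emits real machine code through the MacroAssembler.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Illustrative only: what one trip through the forward VSX loop moves.
    static void copy_forward_32byte_blocks(const std::uint8_t* src, std::uint8_t* dst,
                                           std::size_t iterations) {  // value placed in CTR
      while (iterations-- > 0) {              // bdnz: decrement CTR, branch if non-zero
        std::memcpy(dst,      src,      16);  // lxvd2x/stxvd2x at offset 0
        std::memcpy(dst + 16, src + 16, 16);  // lxvd2x/stxvd2x at offset 16 (tmp1)
        src += 32;                            // addi(R3_ARG1, R3_ARG1, 32)
        dst += 32;                            // addi(R4_ARG2, R4_ARG2, 32)
      }
      // Any remainder smaller than 32 bytes is handled by the stubs' tail code.
    }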
@@ -1218,12 +1218,12 @@
 __ align(32);

 __ bind(l_10);
 // Use loop with VSX load/store instructions to
 // copy 32 elements a time.
- __ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src
- __ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst
+ __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
+ __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
 __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
 __ bdnz(l_10); // Dec CTR and loop if not zero.
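Note: the only change in this hunk (and in the two similar hunks that follow) is the dropped explicit zero base operand. lxvd2x/stxvd2x are X-form instructions whose effective address is (RA|0) + RB, so lxvd2x(tmp_vsr1, 0, R3_ARG1) already addressed the location held in R3_ARG1; the two-operand MacroAssembler overload used on the new side is assumed to encode RA = 0 and is therefore equivalent. A hedged sketch of that addressing rule:

    #include <cstdint>

    // Hedged sketch of the Power ISA X-form address computation behind
    // lxvd2x/stxvd2x: EA = ((RA == 0) ? 0 : GPR[RA]) + GPR[RB].  With RA
    // encoded as 0, the effective address is just the pointer held in RB.
    static std::uint64_t x_form_effective_address(bool ra_is_zero, std::uint64_t ra_value,
                                                  std::uint64_t rb_value) {
      return (ra_is_zero ? 0 : ra_value) + rb_value;
    }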
@@ -1484,12 +1484,12 @@
 __ align(32);

 __ bind(l_9);
 // Use loop with VSX load/store instructions to
 // copy 16 elements a time.
- __ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load from src.
- __ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst.
+ __ lxvd2x(tmp_vsr1, R3_ARG1); // Load from src.
+ __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst.
 __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1); // Load from src + 16.
 __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32.
 __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32.
 __ bdnz(l_9); // Dec CTR and loop if not zero.
@@ -1675,12 +1675,12 @@
 __ align(32);

 __ bind(l_7);
 // Use loop with VSX load/store instructions to
 // copy 8 elements a time.
- __ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src
- __ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst
+ __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
+ __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
 __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
 __ bdnz(l_7); // Dec CTR and loop if not zero.
@@ -1743,33 +1743,56 @@
 //
 void generate_conjoint_int_copy_core(bool aligned) {
 // Do reverse copy. We assume the case of actual overlap is rare enough
 // that we don't have to optimize it.

- Label l_1, l_2, l_3, l_4, l_5, l_6;
+ Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;

 Register tmp1 = R6_ARG4;
 Register tmp2 = R7_ARG5;
 Register tmp3 = R8_ARG6;
 Register tmp4 = R0;

+ VectorSRegister tmp_vsr1 = VSR1;
+ VectorSRegister tmp_vsr2 = VSR2;
+
 { // FasterArrayCopy
 __ cmpwi(CCR0, R5_ARG3, 0);
 __ beq(CCR0, l_6);

 __ sldi(R5_ARG3, R5_ARG3, 2);
 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
 __ srdi(R5_ARG3, R5_ARG3, 2);

+ if (!aligned) {
+ // check if arrays have same alignment mod 8.
+ __ xorr(tmp1, R3_ARG1, R4_ARG2);
+ __ andi_(R0, tmp1, 7);
+ // Not the same alignment, but ld and std just need to be 4 byte aligned.
+ __ bne(CCR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
+
+ // copy 1 element to align to and from on an 8 byte boundary
+ __ andi_(R0, R3_ARG1, 7);
+ __ beq(CCR0, l_7);
+
+ __ addi(R3_ARG1, R3_ARG1, -4);
+ __ addi(R4_ARG2, R4_ARG2, -4);
+ __ addi(R5_ARG3, R5_ARG3, -1);
+ __ lwzx(tmp2, R3_ARG1);
+ __ stwx(tmp2, R4_ARG2);
+ __ bind(l_7);
+ }
+
 __ cmpwi(CCR0, R5_ARG3, 7);
 __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain

 __ srdi(tmp1, R5_ARG3, 3);
 __ andi(R5_ARG3, R5_ARG3, 7);
 __ mtctr(tmp1);

+ if (!VM_Version::has_vsx()) {
 __ bind(l_4);
 // Use unrolled version for mass copying (copy 4 elements a time).
 // Load feeding store gets zero latency on Power6, however not on Power5.
 // Therefore, the following sequence is made for the good of both.
 __ addi(R3_ARG1, R3_ARG1, -32);
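Note: the new !aligned prologue above runs after src and dst have been advanced past the last element (this is the reverse copy). It only does work when both pointers share the same alignment mod 8 but are not yet 8-byte aligned, in which case a single 4-byte element is copied from the top end so the bulk loop runs on 8-byte-aligned addresses. A scalar sketch of that logic follows; the helper is hypothetical and assumes count > 0, which the stub has already checked.

    #include <cstddef>
    #include <cstdint>

    // Illustrative only: mirrors the xorr/andi_ checks and the single lwzx/stwx.
    static void align_reverse_int_copy(const std::uint32_t*& src_end, std::uint32_t*& dst_end,
                                       std::size_t& count) {
      std::uintptr_t s = reinterpret_cast<std::uintptr_t>(src_end);
      std::uintptr_t d = reinterpret_cast<std::uintptr_t>(dst_end);
      if (((s ^ d) & 7) != 0) return;  // different alignment mod 8 -> bne to l_7
      if ((s & 7) == 0) return;        // already 8-byte aligned    -> beq to l_7
      --src_end; --dst_end; --count;   // addi ..., -4 / addi ..., -4 / addi ..., -1
      *dst_end = *src_end;             // lwzx(tmp2, R3_ARG1); stwx(tmp2, R4_ARG2)
    }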
@@ -1781,10 +1804,44 @@
 __ std(tmp4, 24, R4_ARG2);
 __ std(tmp3, 16, R4_ARG2);
 __ std(tmp2, 8, R4_ARG2);
 __ std(tmp1, 0, R4_ARG2);
 __ bdnz(l_4);
+ } else { // Processor supports VSX, so use it to mass copy.
+ // Prefetch the data into the L2 cache.
+ __ dcbt(R3_ARG1, 0);
+
+ // If supported set DSCR pre-fetch to deepest.
+ if (VM_Version::has_mfdscr()) {
+ __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+ __ mtdscr(tmp2);
+ }
+
+ __ li(tmp1, 16);
+
+ // Backbranch target aligned to 32-byte. Not 16-byte align as
+ // loop contains < 8 instructions that fit inside a single
+ // i-cache sector.
+ __ align(32);
+
+ __ bind(l_4);
+ // Use loop with VSX load/store instructions to
+ // copy 8 elements a time.
+ __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
+ __ addi(R4_ARG2, R4_ARG2, -32); // Update dsc-=32
+ __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
+ __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
+ __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
+ __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
+ __ bdnz(l_4);
+
+ // Restore DSCR pre-fetch value.
+ if (VM_Version::has_mfdscr()) {
+ __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+ __ mtdscr(tmp2);
+ }
+ }

 __ cmpwi(CCR0, R5_ARG3, 0);
 __ beq(CCR0, l_6);

 __ bind(l_5);
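Note: because generate_conjoint_int_copy_core copies in reverse, the new VSX branch decrements both pointers by 32 before each pair of 16-byte transfers; the dcbt and DSCR lines ahead of the loop are prefetch hints only. In scalar terms the block order is roughly as below; the helper is illustrative, and std::memmove keeps the sketch well defined even where a 16-byte block overlaps.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Illustrative only: the block order of the backward VSX loop (l_4 above).
    static void copy_backward_32byte_blocks(const std::uint8_t* src_end, std::uint8_t* dst_end,
                                            std::size_t iterations) {
      while (iterations-- > 0) {                       // bdnz on CTR
        src_end -= 32;                                 // addi(R3_ARG1, R3_ARG1, -32)
        dst_end -= 32;                                 // addi(R4_ARG2, R4_ARG2, -32)
        std::memmove(dst_end + 16, src_end + 16, 16);  // lxvd2x/stxvd2x, offset 16
        std::memmove(dst_end,      src_end,      16);  // lxvd2x/stxvd2x, offset 0
      }
    }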
@@ -1890,12 +1947,12 @@
 __ align(32);

 __ bind(l_5);
 // Use loop with VSX load/store instructions to
 // copy 4 elements a time.
- __ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src
- __ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst
+ __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
+ __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
 __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
 __ bdnz(l_5); // Dec CTR and loop if not zero.
@@ -1960,10 +2017,13 @@
 Register tmp1 = R6_ARG4;
 Register tmp2 = R7_ARG5;
 Register tmp3 = R8_ARG6;
 Register tmp4 = R0;

+ VectorSRegister tmp_vsr1 = VSR1;
+ VectorSRegister tmp_vsr2 = VSR2;
+
 Label l_1, l_2, l_3, l_4, l_5;

 __ cmpwi(CCR0, R5_ARG3, 0);
 __ beq(CCR0, l_1);

@@ -1978,10 +2038,11 @@

 __ srdi(tmp1, R5_ARG3, 2);
 __ andi(R5_ARG3, R5_ARG3, 3);
 __ mtctr(tmp1);

+ if (!VM_Version::has_vsx()) {
 __ bind(l_4);
 // Use unrolled version for mass copying (copy 4 elements a time).
 // Load feeding store gets zero latency on Power6, however not on Power5.
 // Therefore, the following sequence is made for the good of both.
 __ addi(R3_ARG1, R3_ARG1, -32);
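Note: this hunk (presumably inside the conjoint long copy core; the enclosing function header lies outside the shown context) wraps the existing 4x-unrolled scalar loop in a !VM_Version::has_vsx() check, so the stub generator picks either the scalar or the VSX body once, at stub-generation time, rather than per call. The srdi/andi/mtctr bookkeeping above splits the element count into a CTR trip count and a tail, roughly:

    #include <cstddef>

    // Illustrative only: 4 eight-byte elements, i.e. 32 bytes, per CTR iteration.
    struct LoopCounts {
      std::size_t ctr_iterations;  // srdi(tmp1, R5_ARG3, 2); mtctr(tmp1)
      std::size_t tail_elements;   // andi(R5_ARG3, R5_ARG3, 3), copied one at a time
    };

    static LoopCounts split_copy_count(std::size_t element_count) {
      return LoopCounts{ element_count >> 2, element_count & 3 };
    }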
@@ -1993,10 +2054,44 @@
 __ std(tmp4, 24, R4_ARG2);
 __ std(tmp3, 16, R4_ARG2);
 __ std(tmp2, 8, R4_ARG2);
 __ std(tmp1, 0, R4_ARG2);
 __ bdnz(l_4);
+ } else { // Processor supports VSX, so use it to mass copy.
+ // Prefetch the data into the L2 cache.
+ __ dcbt(R3_ARG1, 0);
+
+ // If supported set DSCR pre-fetch to deepest.
+ if (VM_Version::has_mfdscr()) {
+ __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+ __ mtdscr(tmp2);
+ }
+
+ __ li(tmp1, 16);
+
+ // Backbranch target aligned to 32-byte. Not 16-byte align as
+ // loop contains < 8 instructions that fit inside a single
+ // i-cache sector.
+ __ align(32);
+
+ __ bind(l_4);
+ // Use loop with VSX load/store instructions to
+ // copy 4 elements a time.
+ __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
+ __ addi(R4_ARG2, R4_ARG2, -32); // Update dsc-=32
+ __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
+ __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
+ __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
+ __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
+ __ bdnz(l_4);
+
+ // Restore DSCR pre-fetch value.
+ if (VM_Version::has_mfdscr()) {
+ __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+ __ mtdscr(tmp2);
+ }
+ }

 __ cmpwi(CCR0, R5_ARG3, 0);
 __ beq(CCR0, l_1);

 __ bind(l_5);
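Note: on the VSX path the Data Stream Control Register is temporarily switched to a deeper hardware prefetch setting and restored afterwards, guarded by VM_Version::has_mfdscr(). On my reading, the OR-ed constant 7 targets the DSCR default prefetch depth (DPFD) field in the low-order bits; a hedged sketch of that bit manipulation, with the register access itself left to mfdscr/mtdscr as in the stub:

    #include <cstdint>

    // Hedged sketch: assuming DPFD occupies the three least-significant DSCR bits,
    // OR-ing in 7 requests the deepest prefetch depth while leaving the other
    // fields of the saved value (VM_Version::_dscr_val in the stub) untouched.
    static std::uint64_t dscr_with_deepest_prefetch(std::uint64_t saved_dscr) {
      return saved_dscr | 7;  // load_const_optimized(tmp2, VM_Version::_dscr_val | 7)
    }

    // The stub writes this value with mtdscr before the copy loop and writes the
    // saved value back afterwards.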