changeset 51391:866c9aa29ee4

8189103: AARCH64: optimize String indexOf intrinsic Reviewed-by: aph
author dpochepk
date Mon, 25 Jun 2018 16:32:02 +0300
parents 7ad092f40454
children afca3c78ea0f
files src/hotspot/cpu/aarch64/aarch64.ad src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp
diffstat 6 files changed, 516 insertions(+), 154 deletions(-) [+]
line wrap: on
line diff
--- a/src/hotspot/cpu/aarch64/aarch64.ad	Mon Jun 25 16:31:37 2018 +0300
+++ b/src/hotspot/cpu/aarch64/aarch64.ad	Mon Jun 25 16:32:02 2018 +0300
@@ -15927,12 +15927,13 @@
 %}
 
 instruct string_indexofUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
-       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr)
+       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3,
+       iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr)
 %{
   predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU);
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
   effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
-         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr);
   format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UU)" %}
 
   ins_encode %{
@@ -15940,18 +15941,20 @@
                       $cnt1$$Register, $cnt2$$Register,
                       $tmp1$$Register, $tmp2$$Register,
                       $tmp3$$Register, $tmp4$$Register,
+                      $tmp5$$Register, $tmp6$$Register,
                       -1, $result$$Register, StrIntrinsicNode::UU);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
 instruct string_indexofLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
-       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr)
+       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3,
+       iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr)
 %{
   predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL);
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
   effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
-         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr);
   format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (LL)" %}
 
   ins_encode %{
@@ -15959,18 +15962,20 @@
                       $cnt1$$Register, $cnt2$$Register,
                       $tmp1$$Register, $tmp2$$Register,
                       $tmp3$$Register, $tmp4$$Register,
+                      $tmp5$$Register, $tmp6$$Register,
                       -1, $result$$Register, StrIntrinsicNode::LL);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
 instruct string_indexofUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
-       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr)
+       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3,
+       iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr)
 %{
   predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL);
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
   effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
-         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr);
   format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UL)" %}
 
   ins_encode %{
@@ -15978,30 +15983,12 @@
                       $cnt1$$Register, $cnt2$$Register,
                       $tmp1$$Register, $tmp2$$Register,
                       $tmp3$$Register, $tmp4$$Register,
+                      $tmp5$$Register, $tmp6$$Register,
                       -1, $result$$Register, StrIntrinsicNode::UL);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
-instruct string_indexofLU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
-       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr)
-%{
-  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LU);
-  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
-  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
-         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
-  format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (LU)" %}
-
-  ins_encode %{
-    __ string_indexof($str1$$Register, $str2$$Register,
-                      $cnt1$$Register, $cnt2$$Register,
-                      $tmp1$$Register, $tmp2$$Register,
-                      $tmp3$$Register, $tmp4$$Register,
-                      -1, $result$$Register, StrIntrinsicNode::LU);
-  %}
-  ins_pipe(pipe_class_memory);
-%}
-
 instruct string_indexof_conUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2,
                  immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2,
                  iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr)
@@ -16017,7 +16004,7 @@
     __ string_indexof($str1$$Register, $str2$$Register,
                       $cnt1$$Register, zr,
                       $tmp1$$Register, $tmp2$$Register,
-                      $tmp3$$Register, $tmp4$$Register,
+                      $tmp3$$Register, $tmp4$$Register, zr, zr,
                       icnt2, $result$$Register, StrIntrinsicNode::UU);
   %}
   ins_pipe(pipe_class_memory);
@@ -16038,7 +16025,7 @@
     __ string_indexof($str1$$Register, $str2$$Register,
                       $cnt1$$Register, zr,
                       $tmp1$$Register, $tmp2$$Register,
-                      $tmp3$$Register, $tmp4$$Register,
+                      $tmp3$$Register, $tmp4$$Register, zr, zr,
                       icnt2, $result$$Register, StrIntrinsicNode::LL);
   %}
   ins_pipe(pipe_class_memory);
@@ -16059,33 +16046,12 @@
     __ string_indexof($str1$$Register, $str2$$Register,
                       $cnt1$$Register, zr,
                       $tmp1$$Register, $tmp2$$Register,
-                      $tmp3$$Register, $tmp4$$Register,
+                      $tmp3$$Register, $tmp4$$Register, zr, zr,
                       icnt2, $result$$Register, StrIntrinsicNode::UL);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
-instruct string_indexof_conLU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2,
-                 immI_1 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2,
-                 iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr)
-%{
-  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LU);
-  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
-  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1,
-         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
-  format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (LU)" %}
-
-  ins_encode %{
-    int icnt2 = (int)$int_cnt2$$constant;
-    __ string_indexof($str1$$Register, $str2$$Register,
-                      $cnt1$$Register, zr,
-                      $tmp1$$Register, $tmp2$$Register,
-                      $tmp3$$Register, $tmp4$$Register,
-                      icnt2, $result$$Register, StrIntrinsicNode::LU);
-  %}
-  ins_pipe(pipe_class_memory);
-%}
-
 instruct string_indexofU_char(iRegP_R1 str1, iRegI_R2 cnt1, iRegI_R3 ch,
                               iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2,
                               iRegINoSp tmp3, rFlagsReg cr)
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Mon Jun 25 16:31:37 2018 +0300
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Mon Jun 25 16:32:02 2018 +0300
@@ -4316,8 +4316,10 @@
                                     Register cnt2, Register cnt1,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
+                                    Register tmp5, Register tmp6,
                                     int icnt1, Register result, int ae) {
-  Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
+  // NOTE: tmp5, tmp6 can be zr depending on specific method version
+  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 
   Register ch1 = rscratch1;
   Register ch2 = rscratch2;
@@ -4346,18 +4348,21 @@
   // if (substr.count > string.count) return -1;
   // if (substr.count == 0) return 0;
 
-// We have two strings, a source string in str2, cnt2 and a pattern string
-// in str1, cnt1. Find the 1st occurence of pattern in source or return -1.
-
-// For larger pattern and source we use a simplified Boyer Moore algorithm.
-// With a small pattern and source we use linear scan.
+  // We have two strings, a source string in str2, cnt2 and a pattern string
+  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
+
+  // For larger pattern and source we use a simplified Boyer Moore algorithm.
+  // With a small pattern and source we use linear scan.
 
   if (icnt1 == -1) {
-    cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
-    ccmp(cnt1, 8, 0b0000, LO);  // Can't handle skip >= 256 because we use
-    br(LO, LINEARSEARCH);       // a byte array.
-    cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
-    br(HS, LINEARSEARCH);
+    sub(result_tmp, cnt2, cnt1);
+    cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
+    br(LT, LINEARSEARCH);
+    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
+    cmp(cnt1, 256);
+    lsr(tmp1, cnt2, 2);
+    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
+    br(GE, LINEARSTUB);
   }
 
 // The Boyer Moore alogorithm is based on the description here:-
@@ -4377,7 +4382,9 @@
 //
 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 //
-// #define ASIZE 128
+// This particular implementation has a few Java-specific optimizations.
+//
+// #define ASIZE 256
 //
 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 //       int i, j;
@@ -4386,11 +4393,16 @@
 //
 //       /* Preprocessing */
 //       for (i = 0; i < ASIZE; ++i)
-//          bc[i] = 0;
+//          bc[i] = m;
 //       for (i = 0; i < m - 1; ) {
 //          c = x[i];
 //          ++i;
-//          if (c < ASIZE) bc[c] = i;
+//          // c < 256 for Latin1 string, so, no need for branch
+//          #ifdef PATTERN_STRING_IS_LATIN1
+//          bc[c] = m - i;
+//          #else
+//          if (c < ASIZE) bc[c] = m - i;
+//          #endif
 //       }
 //
 //       /* Searching */
@@ -4400,84 +4412,160 @@
 //          if (x[m-1] == c)
 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 //          if (i < 0) return j;
+//          // c < 256 for Latin1 string, so, no need for branch
+//          #ifdef SOURCE_STRING_IS_LATIN1
+//          // LL case: (c< 256) always true. Remove branch
+//          j += bc[y[j+m-1]];
+//          #endif
+//          #ifdef PATTERN_STRING_IS_UTF
+//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 //          if (c < ASIZE)
-//            j = j - bc[y[j+m-1]] + m;
+//            j += bc[y[j+m-1]];
 //          else
-//            j += 1; // Advance by 1 only if char >= ASIZE
+//            j += 1;
+//          #endif
+//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
+//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
+//          if (c < ASIZE)
+//            j += bc[y[j+m-1]];
+//          else
+//            j += m;
+//          #endif
 //       }
 //    }
 
   if (icnt1 == -1) {
-    BIND(BM);
-
-    Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
-    Label BMADV, BMMATCH, BMCHECKEND;
-
+    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
+        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
     Register cnt1end = tmp2;
     Register str2end = cnt2;
     Register skipch = tmp2;
 
-    // Restrict ASIZE to 128 to reduce stack space/initialisation.
-    // The presence of chars >= ASIZE in the target string does not affect
-    // performance, but we must be careful not to initialise them in the stack
-    // array.
-    // The presence of chars >= ASIZE in the source string may adversely affect
-    // performance since we can only advance by one when we encounter one.
-
-      stp(zr, zr, pre(sp, -128));
-      for (int i = 1; i < 8; i++)
-          stp(zr, zr, Address(sp, i*16));
-
-      mov(cnt1tmp, 0);
-      sub(cnt1end, cnt1, 1);
+    // str1 length is >=8, so, we can read at least 1 register for cases when
+    // Latin1->UTF16 conversion is not needed (8 LL or 4 UU symbols) and half
+    // a register for the UL case. We'll re-read the last character in the
+    // inner pre-loop code to have a single outer pre-loop load
+    const int firstStep = isL ? 7 : 3;
+
+    const int ASIZE = 256;
+    const int STORED_BYTES = 32; // amount of bytes stored per instruction
+    sub(sp, sp, ASIZE);
+    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
+    mov(ch1, sp);
+    BIND(BM_INIT_LOOP);
+      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
+      subs(tmp5, tmp5, 1);
+      br(GT, BM_INIT_LOOP);
+
+      sub(cnt1tmp, cnt1, 1);
+      mov(tmp5, str2);
+      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
+      sub(ch2, cnt1, 1);
+      mov(tmp3, str1);
     BIND(BCLOOP);
-      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
-      cmp(ch1, 128);
-      add(cnt1tmp, cnt1tmp, 1);
-      br(HS, BCSKIP);
-      strb(cnt1tmp, Address(sp, ch1));
+      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
+      if (!str1_isL) {
+        cmp(ch1, ASIZE);
+        br(HS, BCSKIP);
+      }
+      strb(ch2, Address(sp, ch1));
     BIND(BCSKIP);
-      cmp(cnt1tmp, cnt1end);
-      br(LT, BCLOOP);
-
-      mov(result_tmp, str2);
-
-      sub(cnt2, cnt2, cnt1);
-      add(str2end, str2, cnt2, LSL, str2_chr_shift);
+      subs(ch2, ch2, 1);
+      br(GT, BCLOOP);
+
+      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
+      if (str1_isL == str2_isL) {
+        // load last 8 bytes (8LL/4UU symbols)
+        ldr(tmp6, Address(tmp6, -wordSize));
+      } else {
+        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
+        // convert Latin1 to UTF. We'll have to wait until load completed, but
+        // it's still faster than per-character loads+checks
+        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
+        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
+        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
+        andr(tmp6, tmp6, 0xFF); // str1[N-4]
+        orr(ch2, ch1, ch2, LSL, 16);
+        orr(tmp6, tmp6, tmp3, LSL, 48);
+        orr(tmp6, tmp6, ch2, LSL, 16);
+      }
     BIND(BMLOOPSTR2);
-      sub(cnt1tmp, cnt1, 1);
-      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
-      cmp(ch1, skipch);
+      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
+      if (str1_isL == str2_isL) {
+        // re-init tmp3. It's for free because it's executed in parallel with
+        // load above. Alternative is to initialize it before loop, but it'll
+        // affect performance on in-order systems with 2 or more ld/st pipelines
+        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
+      }
+      if (!isL) { // UU/UL case
+        lsl(ch2, cnt1tmp, 1); // offset in bytes
+      }
+      cmp(tmp3, skipch);
       br(NE, BMSKIP);
-      subs(cnt1tmp, cnt1tmp, 1);
-      br(LT, BMMATCH);
+      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
+      mov(ch1, tmp6);
+      if (isL) {
+        b(BMLOOPSTR1_AFTER_LOAD);
+      } else {
+        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
+        b(BMLOOPSTR1_CMP);
+      }
     BIND(BMLOOPSTR1);
       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
+    BIND(BMLOOPSTR1_AFTER_LOAD);
+      subs(cnt1tmp, cnt1tmp, 1);
+      br(LT, BMLOOPSTR1_LASTCMP);
+    BIND(BMLOOPSTR1_CMP);
+      cmp(ch1, ch2);
+      br(EQ, BMLOOPSTR1);
+    BIND(BMSKIP);
+      if (!isL) {
+        // if we've met UTF symbol while searching Latin1 pattern, then we can
+        // skip cnt1 symbols
+        if (str1_isL != str2_isL) {
+          mov(result_tmp, cnt1);
+        } else {
+          mov(result_tmp, 1);
+        }
+        cmp(skipch, ASIZE);
+        br(HS, BMADV);
+      }
+      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
+    BIND(BMADV);
+      sub(cnt1tmp, cnt1, 1);
+      add(str2, str2, result_tmp, LSL, str2_chr_shift);
+      cmp(str2, str2end);
+      br(LE, BMLOOPSTR2);
+      add(sp, sp, ASIZE);
+      b(NOMATCH);
+    BIND(BMLOOPSTR1_LASTCMP);
       cmp(ch1, ch2);
       br(NE, BMSKIP);
-      subs(cnt1tmp, cnt1tmp, 1);
-      br(GE, BMLOOPSTR1);
     BIND(BMMATCH);
-      sub(result, str2, result_tmp);
+      sub(result, str2, tmp5);
       if (!str2_isL) lsr(result, result, 1);
-      add(sp, sp, 128);
+      add(sp, sp, ASIZE);
       b(DONE);
-    BIND(BMADV);
-      add(str2, str2, str2_chr_size);
-      b(BMCHECKEND);
-    BIND(BMSKIP);
-      cmp(skipch, 128);
-      br(HS, BMADV);
-      ldrb(ch2, Address(sp, skipch));
-      add(str2, str2, cnt1, LSL, str2_chr_shift);
-      sub(str2, str2, ch2, LSL, str2_chr_shift);
-    BIND(BMCHECKEND);
-      cmp(str2, str2end);
-      br(LE, BMLOOPSTR2);
-      add(sp, sp, 128);
-      b(NOMATCH);
+
+    BIND(LINEARSTUB);
+    cmp(cnt1, 16); // small patterns still should be handled by simple algorithm
+    br(LT, LINEAR_MEDIUM);
+    mov(result, zr);
+    RuntimeAddress stub = NULL;
+    if (isL) {
+      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
+      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
+    } else if (str1_isL) {
+      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
+       assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
+    } else {
+      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
+      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
+    }
+    trampoline_call(stub);
+    b(DONE);
   }
 
   BIND(LINEARSEARCH);
@@ -4493,15 +4581,12 @@
 
         cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
         br(LT, DOSHORT);
-
-        sub(cnt2, cnt2, cnt1);
-        mov(result_tmp, cnt2);
-
+      BIND(LINEAR_MEDIUM);
+        (this->*str1_load_1chr)(first, Address(str1));
         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
-        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
-        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
-        (this->*str1_load_1chr)(first, Address(str1, cnt1_neg));
+        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
+        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 
       BIND(FIRST_LOOP);
         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
@@ -4539,10 +4624,9 @@
       Label CH1_LOOP;
 
         (this->*load_4chr)(ch1, str1);
-        sub(cnt2, cnt2, 4);
-        mov(result_tmp, cnt2);
-        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
-        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
+        sub(result_tmp, cnt2, 4);
+        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
+        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 
       BIND(CH1_LOOP);
         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
@@ -4551,18 +4635,18 @@
         adds(cnt2_neg, cnt2_neg, str2_chr_size);
         br(LE, CH1_LOOP);
         b(NOMATCH);
-    }
+      }
 
     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
       Label CH1_LOOP;
 
       BIND(DO2);
         (this->*load_2chr)(ch1, str1);
-        sub(cnt2, cnt2, 2);
-        mov(result_tmp, cnt2);
-        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
-        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
-
+        if (icnt1 == 2) {
+          sub(result_tmp, cnt2, 2);
+        }
+        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
+        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
       BIND(CH1_LOOP);
         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
         cmp(ch1, ch2);
@@ -4578,12 +4662,11 @@
       BIND(DO3);
         (this->*load_2chr)(first, str1);
         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
-
-        sub(cnt2, cnt2, 3);
-        mov(result_tmp, cnt2);
-        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
-        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
-
+        if (icnt1 == 3) {
+          sub(result_tmp, cnt2, 3);
+        }
+        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
+        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
       BIND(FIRST_LOOP);
         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
         cmpw(first, ch2);
@@ -4602,30 +4685,23 @@
     }
 
     if (icnt1 == -1 || icnt1 == 1) {
-      Label CH1_LOOP, HAS_ZERO;
-      Label DO1_SHORT, DO1_LOOP;
+      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 
       BIND(DO1);
         (this->*str1_load_1chr)(ch1, str1);
         cmp(cnt2, 8);
         br(LT, DO1_SHORT);
 
+        sub(result_tmp, cnt2, 8/str2_chr_size);
+        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
+        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
+        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
+
         if (str2_isL) {
-          if (!str1_isL) {
-            tst(ch1, 0xff00);
-            br(NE, NOMATCH);
-          }
           orr(ch1, ch1, ch1, LSL, 8);
         }
         orr(ch1, ch1, ch1, LSL, 16);
         orr(ch1, ch1, ch1, LSL, 32);
-
-        sub(cnt2, cnt2, 8/str2_chr_size);
-        mov(result_tmp, cnt2);
-        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
-        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
-
-        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
       BIND(CH1_LOOP);
         ldr(ch2, Address(str2, cnt2_neg));
         eor(ch2, ch1, ch2);
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Mon Jun 25 16:31:37 2018 +0300
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Mon Jun 25 16:32:02 2018 +0300
@@ -1247,6 +1247,7 @@
                       Register cnt1, Register cnt2,
                       Register tmp1, Register tmp2,
                       Register tmp3, Register tmp4,
+                      Register tmp5, Register tmp6,
                       int int_cnt1, Register result, int ae);
   void string_indexof_char(Register str1, Register cnt1,
                            Register ch, Register result,
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Mon Jun 25 16:31:37 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Mon Jun 25 16:32:02 2018 +0300
@@ -4325,6 +4325,305 @@
           = generate_compare_long_string_different_encoding(false);
   }
 
+  // R0 = result
+  // R1 = str2
+  // R2 = cnt1
+  // R3 = str1
+  // R4 = cnt2
+  // This generic linear code uses a few additional ideas, which make it faster:
+  // 1) we can safely keep at least 1st register of pattern(since length >= 8)
+  // in order to skip initial loading(help in systems with 1 ld pipeline)
+  // 2) we can use "fast" algorithm of finding single character to search for
+  // first symbol with fewer branches(1 branch per each loaded register instead
+  // of branch for each symbol), so, this is where constants like
+  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
+  // 3) after loading and analyzing 1st register of source string, it can be
+  // used to search for every 1st character entry, saving a few loads in
+  // comparison with "simpler-but-slower" implementation
+  // 4) in order to avoid lots of push/pop operations, code below is heavily
+  // re-using/re-initializing/compressing register values, which makes code
+  // larger and a bit less readable, however, most of extra operations are
+  // issued during loads or branches, so, penalty is minimal
+  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
+    const char* stubName = str1_isL
+        ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
+        : "indexof_linear_uu";
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", stubName);
+    address entry = __ pc();
+
+    int str1_chr_size = str1_isL ? 1 : 2;
+    int str2_chr_size = str2_isL ? 1 : 2;
+    int str1_chr_shift = str1_isL ? 0 : 1;
+    int str2_chr_shift = str2_isL ? 0 : 1;
+    bool isL = str1_isL && str2_isL;
+    // parameters
+    Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
+    // temporary registers
+    Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
+    RegSet spilled_regs = RegSet::range(tmp1, tmp4);
+    // redefinitions
+    Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
+
+    __ push(spilled_regs, sp);
+    Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, L_SMALL_MATCH_LOOP,
+        L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
+        L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
+        L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
+        L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
+        L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
+    // Read whole register from str1. It is safe, because length >=8 here
+    __ ldr(ch1, Address(str1));
+    // Read whole register from str2. It is safe, because length >=8 here
+    __ ldr(ch2, Address(str2));
+    __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
+    if (str1_isL != str2_isL) {
+      __ eor(v0, __ T16B, v0, v0);
+    }
+    __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
+    __ mul(first, first, tmp1);
+    // check if we have less than 1 register to check
+    __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
+    if (str1_isL != str2_isL) {
+      __ fmovd(v1, ch1);
+    }
+    __ br(__ LE, L_SMALL);
+    __ eor(ch2, first, ch2);
+    if (str1_isL != str2_isL) {
+      __ zip1(v1, __ T16B, v1, v0);
+    }
+    __ sub(tmp2, ch2, tmp1);
+    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
+    __ bics(tmp2, tmp2, ch2);
+    if (str1_isL != str2_isL) {
+      __ fmovd(ch1, v1);
+    }
+    __ br(__ NE, L_HAS_ZERO);
+    __ subs(cnt2, cnt2, wordSize/str2_chr_size);
+    __ add(result, result, wordSize/str2_chr_size);
+    __ add(str2, str2, wordSize);
+    __ br(__ LT, L_POST_LOOP);
+    __ BIND(L_LOOP);
+      __ ldr(ch2, Address(str2));
+      __ eor(ch2, first, ch2);
+      __ sub(tmp2, ch2, tmp1);
+      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
+      __ bics(tmp2, tmp2, ch2);
+      __ br(__ NE, L_HAS_ZERO);
+    __ BIND(L_LOOP_PROCEED);
+      __ subs(cnt2, cnt2, wordSize/str2_chr_size);
+      __ add(str2, str2, wordSize);
+      __ add(result, result, wordSize/str2_chr_size);
+      __ br(__ GE, L_LOOP);
+    __ BIND(L_POST_LOOP);
+      __ cmp(cnt2, -wordSize/str2_chr_size); // no extra characters to check
+      __ br(__ LE, NOMATCH);
+      __ ldr(ch2, Address(str2));
+      __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
+      __ eor(ch2, first, ch2);
+      __ sub(tmp2, ch2, tmp1);
+      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
+      __ mov(tmp4, -1); // all bits set
+      __ b(L_SMALL_PROCEED);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_SMALL);
+      __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
+      __ eor(ch2, first, ch2);
+      if (str1_isL != str2_isL) {
+        __ zip1(v1, __ T16B, v1, v0);
+      }
+      __ sub(tmp2, ch2, tmp1);
+      __ mov(tmp4, -1); // all bits set
+      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
+      if (str1_isL != str2_isL) {
+        __ fmovd(ch1, v1); // move converted 4 symbols
+      }
+    __ BIND(L_SMALL_PROCEED);
+      __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
+      __ bic(tmp2, tmp2, ch2);
+      __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
+      __ rbit(tmp2, tmp2);
+      __ br(__ EQ, NOMATCH);
+    __ BIND(L_SMALL_HAS_ZERO_LOOP);
+      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
+      __ cmp(cnt1, wordSize/str2_chr_size);
+      __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
+      if (str2_isL) { // LL
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
+        __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
+        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
+      } else {
+        __ mov(ch2, 0xE); // all bits in byte set except last one
+        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+      }
+      __ cmp(ch1, ch2);
+      __ mov(tmp4, wordSize/str2_chr_size);
+      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
+    __ BIND(L_SMALL_CMP_LOOP);
+      str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
+               : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
+      str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
+               : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
+      __ add(tmp4, tmp4, 1);
+      __ cmp(tmp4, cnt1);
+      __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
+      __ cmp(first, ch2);
+      __ br(__ EQ, L_SMALL_CMP_LOOP);
+    __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
+      __ cbz(tmp2, NOMATCH); // no more matches. exit
+      __ clz(tmp4, tmp2);
+      __ add(result, result, 1); // advance index
+      __ add(str2, str2, str2_chr_size); // advance pointer
+      __ b(L_SMALL_HAS_ZERO_LOOP);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
+      __ cmp(first, ch2);
+      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
+      __ b(DONE);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
+      if (str2_isL) { // LL
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
+        __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
+        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
+      } else {
+        __ mov(ch2, 0xE); // all bits in byte set except last one
+        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+      }
+      __ cmp(ch1, ch2);
+      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
+      __ b(DONE);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_HAS_ZERO);
+      __ rbit(tmp2, tmp2);
+      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
+      // Now, perform compression of counters(cnt2 and cnt1) into one register.
+      // It's fine because both counters are 32bit and are not changed in this
+      // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
+      __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
+      __ sub(result, result, 1);
+    __ BIND(L_HAS_ZERO_LOOP);
+      __ mov(cnt1, wordSize/str2_chr_size);
+      __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
+      __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
+      if (str2_isL) {
+        __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(tmp4, tmp4, 1);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ lsl(tmp2, tmp2, 1);
+        __ mov(tmp4, wordSize/str2_chr_size);
+      } else {
+        __ mov(ch2, 0xE);
+        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(tmp4, tmp4, 1);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
+        __ lsl(tmp2, tmp2, 1);
+        __ mov(tmp4, wordSize/str2_chr_size);
+        __ sub(str2, str2, str2_chr_size);
+      }
+      __ cmp(ch1, ch2);
+      __ mov(tmp4, wordSize/str2_chr_size);
+      __ br(__ NE, L_CMP_LOOP_NOMATCH);
+    __ BIND(L_CMP_LOOP);
+      str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
+               : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
+      str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
+               : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
+      __ add(tmp4, tmp4, 1);
+      __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
+      __ br(__ GE, L_CMP_LOOP_LAST_CMP);
+      __ cmp(cnt1, ch2);
+      __ br(__ EQ, L_CMP_LOOP);
+    __ BIND(L_CMP_LOOP_NOMATCH);
+      // here we're not matched
+      __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
+      __ clz(tmp4, tmp2);
+      __ add(str2, str2, str2_chr_size); // advance pointer
+      __ b(L_HAS_ZERO_LOOP);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_CMP_LOOP_LAST_CMP);
+      __ cmp(cnt1, ch2);
+      __ br(__ NE, L_CMP_LOOP_NOMATCH);
+      __ b(DONE);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_CMP_LOOP_LAST_CMP2);
+      if (str2_isL) {
+        __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(tmp4, tmp4, 1);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ lsl(tmp2, tmp2, 1);
+      } else {
+        __ mov(ch2, 0xE);
+        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(tmp4, tmp4, 1);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
+        __ lsl(tmp2, tmp2, 1);
+        __ sub(str2, str2, str2_chr_size);
+      }
+      __ cmp(ch1, ch2);
+      __ br(__ NE, L_CMP_LOOP_NOMATCH);
+      __ b(DONE);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
+      // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
+      // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
+      // so, result was increased at max by wordSize/str2_chr_size - 1, so,
+      // respective high bit wasn't changed. L_LOOP_PROCEED will increase
+      // result by analyzed characters value, so, we can just reset lower bits
+      // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
+      // 2) restore cnt1 and cnt2 values from "compressed" cnt2
+      // 3) advance str2 value to represent next str2 octet. result & 7/3 is
+      // index of last analyzed substring inside current octet. So, str2 is at
+      // respective start address. We need to advance it to next octet
+      __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
+      __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
+      __ bfm(result, zr, 0, 2 - str2_chr_shift);
+      __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
+      __ movw(cnt2, cnt2);
+      __ b(L_LOOP_PROCEED);
+    __ align(OptoLoopAlignment);
+    __ BIND(NOMATCH);
+      __ mov(result, -1);
+    __ BIND(DONE);
+      __ pop(spilled_regs, sp);
+      __ ret(lr);
+    return entry;
+  }
+
+  void generate_string_indexof_stubs() { // Generates the ll, uu and ul linear indexOf stubs and stores each returned entry address in StubRoutines::aarch64.
+    StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); // both flags true; presumably (str1_isL, str2_isL) -- confirm against the generator's signature
+    StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); // both flags false
+    StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); // mixed variant: first flag true, second flag false
+  }
+
   /**
    *  Arguments:
    *
@@ -5426,6 +5725,8 @@
 
     generate_compare_long_strings();
 
+    generate_string_indexof_stubs();
+
     if (UseMultiplyToLenIntrinsic) {
       StubRoutines::_multiplyToLen = generate_multiplyToLen();
     }
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Mon Jun 25 16:31:37 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Mon Jun 25 16:32:02 2018 +0300
@@ -52,6 +52,9 @@
 address StubRoutines::aarch64::_compare_long_string_UU = NULL;
 address StubRoutines::aarch64::_compare_long_string_LU = NULL;
 address StubRoutines::aarch64::_compare_long_string_UL = NULL;
+address StubRoutines::aarch64::_string_indexof_linear_ll = NULL; // stays NULL until generate_string_indexof_stubs() assigns the ll stub entry
+address StubRoutines::aarch64::_string_indexof_linear_uu = NULL; // stays NULL until generate_string_indexof_stubs() assigns the uu stub entry
+address StubRoutines::aarch64::_string_indexof_linear_ul = NULL; // stays NULL until generate_string_indexof_stubs() assigns the ul stub entry
 bool StubRoutines::aarch64::_completed = false;
 
 /**
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Mon Jun 25 16:31:37 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Mon Jun 25 16:32:02 2018 +0300
@@ -70,6 +70,9 @@
   static address _compare_long_string_LU;
   static address _compare_long_string_UL;
   static address _compare_long_string_UU;
+  static address _string_indexof_linear_ll; // ll linear indexOf stub entry; exposed through string_indexof_linear_ll()
+  static address _string_indexof_linear_uu; // uu linear indexOf stub entry; exposed through string_indexof_linear_uu()
+  static address _string_indexof_linear_ul; // ul linear indexOf stub entry; exposed through string_indexof_linear_ul()
   static bool _completed;
 
  public:
@@ -156,6 +159,18 @@
       return _compare_long_string_UU;
   }
 
+  static address string_indexof_linear_ul() { // Accessor: returns the stored _string_indexof_linear_ul address unchanged.
+      return _string_indexof_linear_ul;
+  }
+
+  static address string_indexof_linear_ll() { // Accessor: returns the stored _string_indexof_linear_ll address unchanged.
+      return _string_indexof_linear_ll;
+  }
+
+  static address string_indexof_linear_uu() { // Accessor: returns the stored _string_indexof_linear_uu address unchanged.
+      return _string_indexof_linear_uu;
+  }
+
   static bool complete() {
     return _completed;
   }