comparison src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @ 51391:866c9aa29ee4

8189103: AARCH64: optimize String indexOf intrinsic Reviewed-by: aph
author dpochepk
date Mon, 25 Jun 2018 16:32:02 +0300
parents 7ad092f40454
children afca3c78ea0f
comparison
equal deleted inserted replaced
47:d43a1f688a8f 48:47986e11366f
4314 // Search for str1 in str2 and return index or -1 4314 // Search for str1 in str2 and return index or -1
4315 void MacroAssembler::string_indexof(Register str2, Register str1, 4315 void MacroAssembler::string_indexof(Register str2, Register str1,
4316 Register cnt2, Register cnt1, 4316 Register cnt2, Register cnt1,
4317 Register tmp1, Register tmp2, 4317 Register tmp1, Register tmp2,
4318 Register tmp3, Register tmp4, 4318 Register tmp3, Register tmp4,
4319 Register tmp5, Register tmp6,
4319 int icnt1, Register result, int ae) { 4320 int icnt1, Register result, int ae) {
4320 Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH; 4321 // NOTE: tmp5, tmp6 can be zr depending on specific method version
4322 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4321 4323
4322 Register ch1 = rscratch1; 4324 Register ch1 = rscratch1;
4323 Register ch2 = rscratch2; 4325 Register ch2 = rscratch2;
4324 Register cnt1tmp = tmp1; 4326 Register cnt1tmp = tmp1;
4325 Register cnt2tmp = tmp2; 4327 Register cnt2tmp = tmp2;
4344 4346
4345 // Note, inline_string_indexOf() generates checks: 4347 // Note, inline_string_indexOf() generates checks:
4346 // if (substr.count > string.count) return -1; 4348 // if (substr.count > string.count) return -1;
4347 // if (substr.count == 0) return 0; 4349 // if (substr.count == 0) return 0;
4348 4350
4349 // We have two strings, a source string in str2, cnt2 and a pattern string 4351 // We have two strings, a source string in str2, cnt2 and a pattern string
4350 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1. 4352 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
4351 4353
4352 // For larger pattern and source we use a simplified Boyer Moore algorithm. 4354 // For larger pattern and source we use a simplified Boyer Moore algorithm.
4353 // With a small pattern and source we use linear scan. 4355 // With a small pattern and source we use linear scan.
4354 4356
4355 if (icnt1 == -1) { 4357 if (icnt1 == -1) {
4356 cmp(cnt1, 256); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256 4358 sub(result_tmp, cnt2, cnt1);
4357 ccmp(cnt1, 8, 0b0000, LO); // Can't handle skip >= 256 because we use 4359 cmp(cnt1, 8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4358 br(LO, LINEARSEARCH); // a byte array. 4360 br(LT, LINEARSEARCH);
4359 cmp(cnt1, cnt2, LSR, 2); // Source must be 4 * pattern for BM 4361 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4360 br(HS, LINEARSEARCH); 4362 cmp(cnt1, 256);
4363 lsr(tmp1, cnt2, 2);
4364 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4365 br(GE, LINEARSTUB);
4361 } 4366 }
4362 4367
4363 // The Boyer Moore algorithm is based on the description here:- 4368 // The Boyer Moore algorithm is based on the description here:-
4364 // 4369 //
4365 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 4370 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4375 // 4380 //
4376 // This is also known as the Boyer-Moore-Horspool algorithm:- 4381 // This is also known as the Boyer-Moore-Horspool algorithm:-
4377 // 4382 //
4378 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 4383 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4379 // 4384 //
4380 // #define ASIZE 128 4385 // This particular implementation has a few Java-specific optimizations.
4386 //
4387 // #define ASIZE 256
4381 // 4388 //
4382 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 4389 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
4383 // int i, j; 4390 // int i, j;
4384 // unsigned c; 4391 // unsigned c;
4385 // unsigned char bc[ASIZE]; 4392 // unsigned char bc[ASIZE];
4386 // 4393 //
4387 // /* Preprocessing */ 4394 // /* Preprocessing */
4388 // for (i = 0; i < ASIZE; ++i) 4395 // for (i = 0; i < ASIZE; ++i)
4389 // bc[i] = 0; 4396 // bc[i] = m;
4390 // for (i = 0; i < m - 1; ) { 4397 // for (i = 0; i < m - 1; ) {
4391 // c = x[i]; 4398 // c = x[i];
4392 // ++i; 4399 // ++i;
4393 // if (c < ASIZE) bc[c] = i; 4400 // // c < 256 for Latin1 string, so, no need for branch
4401 // #ifdef PATTERN_STRING_IS_LATIN1
4402 // bc[c] = m - i;
4403 // #else
4404 // if (c < ASIZE) bc[c] = m - i;
4405 // #endif
4394 // } 4406 // }
4395 // 4407 //
4396 // /* Searching */ 4408 // /* Searching */
4397 // j = 0; 4409 // j = 0;
4398 // while (j <= n - m) { 4410 // while (j <= n - m) {
4399 // c = y[i+j]; 4411 // c = y[i+j];
4400 // if (x[m-1] == c) 4412 // if (x[m-1] == c)
4401 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 4413 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4402 // if (i < 0) return j; 4414 // if (i < 0) return j;
4415 // // c < 256 for Latin1 string, so, no need for branch
4416 // #ifdef SOURCE_STRING_IS_LATIN1
4417 // // LL case: (c< 256) always true. Remove branch
4418 // j += bc[y[j+m-1]];
4419 // #endif
4420 // #ifdef PATTERN_STRING_IS_UTF
4421 // // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4403 // if (c < ASIZE) 4422 // if (c < ASIZE)
4404 // j = j - bc[y[j+m-1]] + m; 4423 // j += bc[y[j+m-1]];
4405 // else 4424 // else
4406 // j += 1; // Advance by 1 only if char >= ASIZE 4425 // j += 1
4426 // #endif
4427 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4428 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4429 // if (c < ASIZE)
4430 // j += bc[y[j+m-1]];
4431 // else
4432 // j += m
4433 // #endif
4407 // } 4434 // }
4408 // } 4435 // }
4409 4436
4410 if (icnt1 == -1) { 4437 if (icnt1 == -1) {
4411 BIND(BM); 4438 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4412 4439 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4413 Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
4414 Label BMADV, BMMATCH, BMCHECKEND;
4415
4416 Register cnt1end = tmp2; 4440 Register cnt1end = tmp2;
4417 Register str2end = cnt2; 4441 Register str2end = cnt2;
4418 Register skipch = tmp2; 4442 Register skipch = tmp2;
4419 4443
4420 // Restrict ASIZE to 128 to reduce stack space/initialisation. 4444 // str1 length is >=8, so, we can read at least 1 register for cases when
4421 // The presence of chars >= ASIZE in the target string does not affect 4445 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
4422 // performance, but we must be careful not to initialise them in the stack 4446 // UL case. We'll re-read last character in inner pre-loop code to have
4423 // array. 4447 // single outer pre-loop load
4424 // The presence of chars >= ASIZE in the source string may adversely affect 4448 const int firstStep = isL ? 7 : 3;
4425 // performance since we can only advance by one when we encounter one. 4449
4426 4450 const int ASIZE = 256;
4427 stp(zr, zr, pre(sp, -128)); 4451 const int STORED_BYTES = 32; // amount of bytes stored per instruction
4428 for (int i = 1; i < 8; i++) 4452 sub(sp, sp, ASIZE);
4429 stp(zr, zr, Address(sp, i*16)); 4453 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4430 4454 mov(ch1, sp);
4431 mov(cnt1tmp, 0); 4455 BIND(BM_INIT_LOOP);
4432 sub(cnt1end, cnt1, 1); 4456 stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4457 subs(tmp5, tmp5, 1);
4458 br(GT, BM_INIT_LOOP);
4459
4460 sub(cnt1tmp, cnt1, 1);
4461 mov(tmp5, str2);
4462 add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4463 sub(ch2, cnt1, 1);
4464 mov(tmp3, str1);
4433 BIND(BCLOOP); 4465 BIND(BCLOOP);
4434 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4466 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4435 cmp(ch1, 128); 4467 if (!str1_isL) {
4436 add(cnt1tmp, cnt1tmp, 1); 4468 cmp(ch1, ASIZE);
4437 br(HS, BCSKIP); 4469 br(HS, BCSKIP);
4438 strb(cnt1tmp, Address(sp, ch1)); 4470 }
4471 strb(ch2, Address(sp, ch1));
4439 BIND(BCSKIP); 4472 BIND(BCSKIP);
4440 cmp(cnt1tmp, cnt1end); 4473 subs(ch2, ch2, 1);
4441 br(LT, BCLOOP); 4474 br(GT, BCLOOP);
4442 4475
4443 mov(result_tmp, str2); 4476 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4444 4477 if (str1_isL == str2_isL) {
4445 sub(cnt2, cnt2, cnt1); 4478 // load last 8 bytes (8LL/4UU symbols)
4446 add(str2end, str2, cnt2, LSL, str2_chr_shift); 4479 ldr(tmp6, Address(tmp6, -wordSize));
4480 } else {
4481 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4482 // convert Latin1 to UTF. We'll have to wait until load completed, but
4483 // it's still faster than per-character loads+checks
4484 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4485 ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4486 ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4487 andr(tmp6, tmp6, 0xFF); // str1[N-4]
4488 orr(ch2, ch1, ch2, LSL, 16);
4489 orr(tmp6, tmp6, tmp3, LSL, 48);
4490 orr(tmp6, tmp6, ch2, LSL, 16);
4491 }
4447 BIND(BMLOOPSTR2); 4492 BIND(BMLOOPSTR2);
4448 sub(cnt1tmp, cnt1, 1);
4449 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4450 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4493 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4451 cmp(ch1, skipch); 4494 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4495 if (str1_isL == str2_isL) {
4496 // re-init tmp3. It's for free because it's executed in parallel with
4497 // load above. Alternative is to initialize it before loop, but it'll
4498 // affect performance on in-order systems with 2 or more ld/st pipelines
4499 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4500 }
4501 if (!isL) { // UU/UL case
4502 lsl(ch2, cnt1tmp, 1); // offset in bytes
4503 }
4504 cmp(tmp3, skipch);
4452 br(NE, BMSKIP); 4505 br(NE, BMSKIP);
4453 subs(cnt1tmp, cnt1tmp, 1); 4506 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4454 br(LT, BMMATCH); 4507 mov(ch1, tmp6);
4508 if (isL) {
4509 b(BMLOOPSTR1_AFTER_LOAD);
4510 } else {
4511 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4512 b(BMLOOPSTR1_CMP);
4513 }
4455 BIND(BMLOOPSTR1); 4514 BIND(BMLOOPSTR1);
4456 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4515 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4457 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4516 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4517 BIND(BMLOOPSTR1_AFTER_LOAD);
4518 subs(cnt1tmp, cnt1tmp, 1);
4519 br(LT, BMLOOPSTR1_LASTCMP);
4520 BIND(BMLOOPSTR1_CMP);
4521 cmp(ch1, ch2);
4522 br(EQ, BMLOOPSTR1);
4523 BIND(BMSKIP);
4524 if (!isL) {
4525 // if we've met UTF symbol while searching Latin1 pattern, then we can
4526 // skip cnt1 symbols
4527 if (str1_isL != str2_isL) {
4528 mov(result_tmp, cnt1);
4529 } else {
4530 mov(result_tmp, 1);
4531 }
4532 cmp(skipch, ASIZE);
4533 br(HS, BMADV);
4534 }
4535 ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4536 BIND(BMADV);
4537 sub(cnt1tmp, cnt1, 1);
4538 add(str2, str2, result_tmp, LSL, str2_chr_shift);
4539 cmp(str2, str2end);
4540 br(LE, BMLOOPSTR2);
4541 add(sp, sp, ASIZE);
4542 b(NOMATCH);
4543 BIND(BMLOOPSTR1_LASTCMP);
4458 cmp(ch1, ch2); 4544 cmp(ch1, ch2);
4459 br(NE, BMSKIP); 4545 br(NE, BMSKIP);
4460 subs(cnt1tmp, cnt1tmp, 1);
4461 br(GE, BMLOOPSTR1);
4462 BIND(BMMATCH); 4546 BIND(BMMATCH);
4463 sub(result, str2, result_tmp); 4547 sub(result, str2, tmp5);
4464 if (!str2_isL) lsr(result, result, 1); 4548 if (!str2_isL) lsr(result, result, 1);
4465 add(sp, sp, 128); 4549 add(sp, sp, ASIZE);
4466 b(DONE); 4550 b(DONE);
4467 BIND(BMADV); 4551
4468 add(str2, str2, str2_chr_size); 4552 BIND(LINEARSTUB);
4469 b(BMCHECKEND); 4553 cmp(cnt1, 16); // small patterns still should be handled by simple algorithm
4470 BIND(BMSKIP); 4554 br(LT, LINEAR_MEDIUM);
4471 cmp(skipch, 128); 4555 mov(result, zr);
4472 br(HS, BMADV); 4556 RuntimeAddress stub = NULL;
4473 ldrb(ch2, Address(sp, skipch)); 4557 if (isL) {
4474 add(str2, str2, cnt1, LSL, str2_chr_shift); 4558 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4475 sub(str2, str2, ch2, LSL, str2_chr_shift); 4559 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4476 BIND(BMCHECKEND); 4560 } else if (str1_isL) {
4477 cmp(str2, str2end); 4561 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4478 br(LE, BMLOOPSTR2); 4562 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4479 add(sp, sp, 128); 4563 } else {
4480 b(NOMATCH); 4564 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4565 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4566 }
4567 trampoline_call(stub);
4568 b(DONE);
4481 } 4569 }
4482 4570
4483 BIND(LINEARSEARCH); 4571 BIND(LINEARSEARCH);
4484 { 4572 {
4485 Label DO1, DO2, DO3; 4573 Label DO1, DO2, DO3;
4491 { 4579 {
4492 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4580 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4493 4581
4494 cmp(cnt1, str1_isL == str2_isL ? 4 : 2); 4582 cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
4495 br(LT, DOSHORT); 4583 br(LT, DOSHORT);
4496 4584 BIND(LINEAR_MEDIUM);
4497 sub(cnt2, cnt2, cnt1); 4585 (this->*str1_load_1chr)(first, Address(str1));
4498 mov(result_tmp, cnt2);
4499
4500 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4586 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4501 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4502 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4587 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4503 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4588 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4504 (this->*str1_load_1chr)(first, Address(str1, cnt1_neg)); 4589 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4505 4590
4506 BIND(FIRST_LOOP); 4591 BIND(FIRST_LOOP);
4507 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4592 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4508 cmp(first, ch2); 4593 cmp(first, ch2);
4509 br(EQ, STR1_LOOP); 4594 br(EQ, STR1_LOOP);
4537 4622
4538 if (icnt1 == 4) { 4623 if (icnt1 == 4) {
4539 Label CH1_LOOP; 4624 Label CH1_LOOP;
4540 4625
4541 (this->*load_4chr)(ch1, str1); 4626 (this->*load_4chr)(ch1, str1);
4542 sub(cnt2, cnt2, 4); 4627 sub(result_tmp, cnt2, 4);
4543 mov(result_tmp, cnt2); 4628 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4544 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4629 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4545 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4546 4630
4547 BIND(CH1_LOOP); 4631 BIND(CH1_LOOP);
4548 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4632 (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4549 cmp(ch1, ch2); 4633 cmp(ch1, ch2);
4550 br(EQ, MATCH); 4634 br(EQ, MATCH);
4551 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4635 adds(cnt2_neg, cnt2_neg, str2_chr_size);
4552 br(LE, CH1_LOOP); 4636 br(LE, CH1_LOOP);
4553 b(NOMATCH); 4637 b(NOMATCH);
4554 } 4638 }
4555 4639
4556 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4640 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4557 Label CH1_LOOP; 4641 Label CH1_LOOP;
4558 4642
4559 BIND(DO2); 4643 BIND(DO2);
4560 (this->*load_2chr)(ch1, str1); 4644 (this->*load_2chr)(ch1, str1);
4561 sub(cnt2, cnt2, 2); 4645 if (icnt1 == 2) {
4562 mov(result_tmp, cnt2); 4646 sub(result_tmp, cnt2, 2);
4563 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4647 }
4564 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4648 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4565 4649 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4566 BIND(CH1_LOOP); 4650 BIND(CH1_LOOP);
4567 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4651 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4568 cmp(ch1, ch2); 4652 cmp(ch1, ch2);
4569 br(EQ, MATCH); 4653 br(EQ, MATCH);
4570 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4654 adds(cnt2_neg, cnt2_neg, str2_chr_size);
4576 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4660 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4577 4661
4578 BIND(DO3); 4662 BIND(DO3);
4579 (this->*load_2chr)(first, str1); 4663 (this->*load_2chr)(first, str1);
4580 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4664 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4581 4665 if (icnt1 == 3) {
4582 sub(cnt2, cnt2, 3); 4666 sub(result_tmp, cnt2, 3);
4583 mov(result_tmp, cnt2); 4667 }
4584 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4668 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4585 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4669 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4586
4587 BIND(FIRST_LOOP); 4670 BIND(FIRST_LOOP);
4588 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4671 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4589 cmpw(first, ch2); 4672 cmpw(first, ch2);
4590 br(EQ, STR1_LOOP); 4673 br(EQ, STR1_LOOP);
4591 BIND(STR2_NEXT); 4674 BIND(STR2_NEXT);
4600 br(NE, STR2_NEXT); 4683 br(NE, STR2_NEXT);
4601 b(MATCH); 4684 b(MATCH);
4602 } 4685 }
4603 4686
4604 if (icnt1 == -1 || icnt1 == 1) { 4687 if (icnt1 == -1 || icnt1 == 1) {
4605 Label CH1_LOOP, HAS_ZERO; 4688 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4606 Label DO1_SHORT, DO1_LOOP;
4607 4689
4608 BIND(DO1); 4690 BIND(DO1);
4609 (this->*str1_load_1chr)(ch1, str1); 4691 (this->*str1_load_1chr)(ch1, str1);
4610 cmp(cnt2, 8); 4692 cmp(cnt2, 8);
4611 br(LT, DO1_SHORT); 4693 br(LT, DO1_SHORT);
4612 4694
4695 sub(result_tmp, cnt2, 8/str2_chr_size);
4696 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4697 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4698 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4699
4613 if (str2_isL) { 4700 if (str2_isL) {
4614 if (!str1_isL) {
4615 tst(ch1, 0xff00);
4616 br(NE, NOMATCH);
4617 }
4618 orr(ch1, ch1, ch1, LSL, 8); 4701 orr(ch1, ch1, ch1, LSL, 8);
4619 } 4702 }
4620 orr(ch1, ch1, ch1, LSL, 16); 4703 orr(ch1, ch1, ch1, LSL, 16);
4621 orr(ch1, ch1, ch1, LSL, 32); 4704 orr(ch1, ch1, ch1, LSL, 32);
4622
4623 sub(cnt2, cnt2, 8/str2_chr_size);
4624 mov(result_tmp, cnt2);
4625 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4626 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4627
4628 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4629 BIND(CH1_LOOP); 4705 BIND(CH1_LOOP);
4630 ldr(ch2, Address(str2, cnt2_neg)); 4706 ldr(ch2, Address(str2, cnt2_neg));
4631 eor(ch2, ch1, ch2); 4707 eor(ch2, ch1, ch2);
4632 sub(tmp1, ch2, tmp3); 4708 sub(tmp1, ch2, tmp3);
4633 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4709 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);