annotate modules/javafx.web/src/main/native/Source/ThirdParty/icu/source/common/util.cpp @ 11038:20a8447c71c6

8207159: Update ICU to version 62.1 Reviewed-by: mbilla, kcr, ghb
author arajkumar
date Fri, 24 Aug 2018 15:06:40 +0530
parents fee4ef5c87df
children
rev   line source
arajkumar@11038 1 // © 2016 and later: Unicode, Inc. and others.
arajkumar@11038 2 // License & terms of use: http://www.unicode.org/copyright.html
ghb@10550 3 /*
ghb@10550 4 **********************************************************************
ghb@10550 5 * Copyright (c) 2001-2011, International Business Machines
ghb@10550 6 * Corporation and others. All Rights Reserved.
ghb@10550 7 **********************************************************************
ghb@10550 8 * Date Name Description
ghb@10550 9 * 11/19/2001 aliu Creation.
ghb@10550 10 **********************************************************************
ghb@10550 11 */
ghb@10550 12
ghb@10550 13 #include "unicode/unimatch.h"
ghb@10550 14 #include "unicode/utf16.h"
ghb@10550 15 #include "patternprops.h"
ghb@10550 16 #include "util.h"
ghb@10550 17
ghb@10550 18 // Define UChar constants using hex for EBCDIC compatibility
ghb@10550 19
ghb@10550 20 static const UChar BACKSLASH = 0x005C; /*\*/
ghb@10550 21 static const UChar UPPER_U = 0x0055; /*U*/
ghb@10550 22 static const UChar LOWER_U = 0x0075; /*u*/
ghb@10550 23 static const UChar APOSTROPHE = 0x0027; // '\''
ghb@10550 24 static const UChar SPACE = 0x0020; // ' '
ghb@10550 25
ghb@10550 26 // "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
ghb@10550 27 static const UChar DIGITS[] = {
ghb@10550 28 48,49,50,51,52,53,54,55,56,57,
ghb@10550 29 65,66,67,68,69,70,71,72,73,74,
ghb@10550 30 75,76,77,78,79,80,81,82,83,84,
ghb@10550 31 85,86,87,88,89,90
ghb@10550 32 };
ghb@10550 33
ghb@10550 34 U_NAMESPACE_BEGIN
ghb@10550 35
ghb@10550 36 UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
ghb@10550 37 int32_t radix, int32_t minDigits) {
ghb@10550 38 if (radix < 2 || radix > 36) {
ghb@10550 39 // Bogus radix
ghb@10550 40 return result.append((UChar)63/*?*/);
ghb@10550 41 }
ghb@10550 42 // Handle negatives
ghb@10550 43 if (n < 0) {
ghb@10550 44 n = -n;
ghb@10550 45 result.append((UChar)45/*-*/);
ghb@10550 46 }
ghb@10550 47 // First determine the number of digits
ghb@10550 48 int32_t nn = n;
ghb@10550 49 int32_t r = 1;
ghb@10550 50 while (nn >= radix) {
ghb@10550 51 nn /= radix;
ghb@10550 52 r *= radix;
ghb@10550 53 --minDigits;
ghb@10550 54 }
ghb@10550 55 // Now generate the digits
ghb@10550 56 while (--minDigits > 0) {
ghb@10550 57 result.append(DIGITS[0]);
ghb@10550 58 }
ghb@10550 59 while (r > 0) {
ghb@10550 60 int32_t digit = n / r;
ghb@10550 61 result.append(DIGITS[digit]);
ghb@10550 62 n -= digit * r;
ghb@10550 63 r /= radix;
ghb@10550 64 }
ghb@10550 65 return result;
ghb@10550 66 }
ghb@10550 67
ghb@10550 68 /**
ghb@10550 69 * Return true if the character is NOT printable ASCII.
ghb@10550 70 */
ghb@10550 71 UBool ICU_Utility::isUnprintable(UChar32 c) {
ghb@10550 72 return !(c >= 0x20 && c <= 0x7E);
ghb@10550 73 }
ghb@10550 74
ghb@10550 75 /**
ghb@10550 76 * Escape unprintable characters using \uxxxx notation for U+0000 to
ghb@10550 77 * U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is
ghb@10550 78 * printable ASCII, then do nothing and return FALSE. Otherwise,
ghb@10550 79 * append the escaped notation and return TRUE.
ghb@10550 80 */
ghb@10550 81 UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
ghb@10550 82 if (isUnprintable(c)) {
ghb@10550 83 result.append(BACKSLASH);
ghb@10550 84 if (c & ~0xFFFF) {
ghb@10550 85 result.append(UPPER_U);
ghb@10550 86 result.append(DIGITS[0xF&(c>>28)]);
ghb@10550 87 result.append(DIGITS[0xF&(c>>24)]);
ghb@10550 88 result.append(DIGITS[0xF&(c>>20)]);
ghb@10550 89 result.append(DIGITS[0xF&(c>>16)]);
ghb@10550 90 } else {
ghb@10550 91 result.append(LOWER_U);
ghb@10550 92 }
ghb@10550 93 result.append(DIGITS[0xF&(c>>12)]);
ghb@10550 94 result.append(DIGITS[0xF&(c>>8)]);
ghb@10550 95 result.append(DIGITS[0xF&(c>>4)]);
ghb@10550 96 result.append(DIGITS[0xF&c]);
ghb@10550 97 return TRUE;
ghb@10550 98 }
ghb@10550 99 return FALSE;
ghb@10550 100 }
ghb@10550 101
ghb@10550 102 /**
ghb@10550 103 * Returns the index of a character, ignoring quoted text.
ghb@10550 104 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
ghb@10550 105 * found by a search for 'h'.
ghb@10550 106 */
ghb@10550 107 // FOR FUTURE USE. DISABLE FOR NOW for coverage reasons.
ghb@10550 108 /*
ghb@10550 109 int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,
ghb@10550 110 int32_t start, int32_t limit,
ghb@10550 111 UChar charToFind) {
ghb@10550 112 for (int32_t i=start; i<limit; ++i) {
ghb@10550 113 UChar c = text.charAt(i);
ghb@10550 114 if (c == BACKSLASH) {
ghb@10550 115 ++i;
ghb@10550 116 } else if (c == APOSTROPHE) {
ghb@10550 117 while (++i < limit
ghb@10550 118 && text.charAt(i) != APOSTROPHE) {}
ghb@10550 119 } else if (c == charToFind) {
ghb@10550 120 return i;
ghb@10550 121 }
ghb@10550 122 }
ghb@10550 123 return -1;
ghb@10550 124 }
ghb@10550 125 */
ghb@10550 126
ghb@10550 127 /**
ghb@10550 128 * Skip over a sequence of zero or more white space characters at pos.
ghb@10550 129 * @param advance if true, advance pos to the first non-white-space
ghb@10550 130 * character at or after pos, or str.length(), if there is none.
ghb@10550 131 * Otherwise leave pos unchanged.
ghb@10550 132 * @return the index of the first non-white-space character at or
ghb@10550 133 * after pos, or str.length(), if there is none.
ghb@10550 134 */
ghb@10550 135 int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
ghb@10550 136 UBool advance) {
ghb@10550 137 int32_t p = pos;
ghb@10550 138 const UChar* s = str.getBuffer();
ghb@10550 139 p = (int32_t)(PatternProps::skipWhiteSpace(s + p, str.length() - p) - s);
ghb@10550 140 if (advance) {
ghb@10550 141 pos = p;
ghb@10550 142 }
ghb@10550 143 return p;
ghb@10550 144 }
ghb@10550 145
ghb@10550 146 /**
ghb@10550 147 * Skip over Pattern_White_Space in a Replaceable.
ghb@10550 148 * Skipping may be done in the forward or
ghb@10550 149 * reverse direction. In either case, the leftmost index will be
ghb@10550 150 * inclusive, and the rightmost index will be exclusive. That is,
ghb@10550 151 * given a range defined as [start, limit), the call
ghb@10550 152 * skipWhitespace(text, start, limit) will advance start past leading
ghb@10550 153 * whitespace, whereas the call skipWhitespace(text, limit, start),
ghb@10550 154 * will back up limit past trailing whitespace.
ghb@10550 155 * @param text the text to be analyzed
ghb@10550 156 * @param pos either the start or limit of a range of 'text', to skip
ghb@10550 157 * leading or trailing whitespace, respectively
ghb@10550 158 * @param stop either the limit or start of a range of 'text', to skip
ghb@10550 159 * leading or trailing whitespace, respectively
ghb@10550 160 * @return the new start or limit, depending on what was passed in to
ghb@10550 161 * 'pos'
ghb@10550 162 */
ghb@10550 163 //?FOR FUTURE USE. DISABLE FOR NOW for coverage reasons.
ghb@10550 164 //?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,
ghb@10550 165 //? int32_t pos, int32_t stop) {
ghb@10550 166 //? UChar32 c;
ghb@10550 167 //? UBool isForward = (stop >= pos);
ghb@10550 168 //?
ghb@10550 169 //? if (!isForward) {
ghb@10550 170 //? --pos; // pos is a limit, so back up by one
ghb@10550 171 //? }
ghb@10550 172 //?
ghb@10550 173 //? while (pos != stop &&
ghb@10550 174 //? PatternProps::isWhiteSpace(c = text.char32At(pos))) {
ghb@10550 175 //? if (isForward) {
ghb@10550 176 //? pos += U16_LENGTH(c);
ghb@10550 177 //? } else {
ghb@10550 178 //? pos -= U16_LENGTH(c);
ghb@10550 179 //? }
ghb@10550 180 //? }
ghb@10550 181 //?
ghb@10550 182 //? if (!isForward) {
ghb@10550 183 //? ++pos; // make pos back into a limit
ghb@10550 184 //? }
ghb@10550 185 //?
ghb@10550 186 //? return pos;
ghb@10550 187 //?}
ghb@10550 188
ghb@10550 189 /**
ghb@10550 190 * Parse a single non-whitespace character 'ch', optionally
ghb@10550 191 * preceded by whitespace.
ghb@10550 192 * @param id the string to be parsed
ghb@10550 193 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the
ghb@10550 194 * offset of the first character to be parsed. On output, pos[0]
ghb@10550 195 * is the index after the last parsed character. If the parse
ghb@10550 196 * fails, pos[0] will be unchanged.
ghb@10550 197 * @param ch the non-whitespace character to be parsed.
ghb@10550 198 * @return true if 'ch' is seen preceded by zero or more
ghb@10550 199 * whitespace characters.
ghb@10550 200 */
ghb@10550 201 UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
ghb@10550 202 int32_t start = pos;
ghb@10550 203 skipWhitespace(id, pos, TRUE);
ghb@10550 204 if (pos == id.length() ||
ghb@10550 205 id.charAt(pos) != ch) {
ghb@10550 206 pos = start;
ghb@10550 207 return FALSE;
ghb@10550 208 }
ghb@10550 209 ++pos;
ghb@10550 210 return TRUE;
ghb@10550 211 }
ghb@10550 212
ghb@10550 213 /**
ghb@10550 214 * Parse a pattern string within the given Replaceable and a parsing
ghb@10550 215 * pattern. Characters are matched literally and case-sensitively
ghb@10550 216 * except for the following special characters:
ghb@10550 217 *
ghb@10550 218 * ~ zero or more Pattern_White_Space chars
ghb@10550 219 *
ghb@10550 220 * If end of pattern is reached with all matches along the way,
ghb@10550 221 * pos is advanced to the first unparsed index and returned.
ghb@10550 222 * Otherwise -1 is returned.
ghb@10550 223 * @param pat pattern that controls parsing
ghb@10550 224 * @param text text to be parsed, starting at index
ghb@10550 225 * @param index offset to first character to parse
ghb@10550 226 * @param limit offset after last character to parse
ghb@10550 227 * @return index after last parsed character, or -1 on parse failure.
ghb@10550 228 */
ghb@10550 229 int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
ghb@10550 230 const Replaceable& text,
ghb@10550 231 int32_t index,
ghb@10550 232 int32_t limit) {
ghb@10550 233 int32_t ipat = 0;
ghb@10550 234
ghb@10550 235 // empty pattern matches immediately
ghb@10550 236 if (ipat == pat.length()) {
ghb@10550 237 return index;
ghb@10550 238 }
ghb@10550 239
ghb@10550 240 UChar32 cpat = pat.char32At(ipat);
ghb@10550 241
ghb@10550 242 while (index < limit) {
ghb@10550 243 UChar32 c = text.char32At(index);
ghb@10550 244
ghb@10550 245 // parse \s*
ghb@10550 246 if (cpat == 126 /*~*/) {
ghb@10550 247 if (PatternProps::isWhiteSpace(c)) {
ghb@10550 248 index += U16_LENGTH(c);
ghb@10550 249 continue;
ghb@10550 250 } else {
ghb@10550 251 if (++ipat == pat.length()) {
ghb@10550 252 return index; // success; c unparsed
ghb@10550 253 }
ghb@10550 254 // fall thru; process c again with next cpat
ghb@10550 255 }
ghb@10550 256 }
ghb@10550 257
ghb@10550 258 // parse literal
ghb@10550 259 else if (c == cpat) {
ghb@10550 260 index += U16_LENGTH(c);
ghb@10550 261 ipat += U16_LENGTH(cpat);
ghb@10550 262 if (ipat == pat.length()) {
ghb@10550 263 return index; // success; c parsed
ghb@10550 264 }
ghb@10550 265 // fall thru; get next cpat
ghb@10550 266 }
ghb@10550 267
ghb@10550 268 // match failure of literal
ghb@10550 269 else {
ghb@10550 270 return -1;
ghb@10550 271 }
ghb@10550 272
ghb@10550 273 cpat = pat.char32At(ipat);
ghb@10550 274 }
ghb@10550 275
ghb@10550 276 return -1; // text ended before end of pat
ghb@10550 277 }
ghb@10550 278
ghb@10550 279 /**
ghb@10550 280 * Append a character to a rule that is being built up. To flush
ghb@10550 281 * the quoteBuf to rule, make one final call with isLiteral == TRUE.
ghb@10550 282 * If there is no final character, pass in (UChar32)-1 as c.
ghb@10550 283 * @param rule the string to append the character to
ghb@10550 284 * @param c the character to append, or (UChar32)-1 if none.
ghb@10550 285 * @param isLiteral if true, then the given character should not be
ghb@10550 286 * quoted or escaped. Usually this means it is a syntactic element
ghb@10550 287 * such as > or $
ghb@10550 288 * @param escapeUnprintable if true, then unprintable characters
ghb@10550 289 * should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will
ghb@10550 290 * appear outside of quotes.
ghb@10550 291 * @param quoteBuf a buffer which is used to build up quoted
ghb@10550 292 * substrings. The caller should initially supply an empty buffer,
ghb@10550 293 * and thereafter should not modify the buffer. The buffer should be
ghb@10550 294 * cleared out by, at the end, calling this method with a literal
ghb@10550 295 * character.
ghb@10550 296 */
ghb@10550 297 void ICU_Utility::appendToRule(UnicodeString& rule,
ghb@10550 298 UChar32 c,
ghb@10550 299 UBool isLiteral,
ghb@10550 300 UBool escapeUnprintable,
ghb@10550 301 UnicodeString& quoteBuf) {
ghb@10550 302 // If we are escaping unprintables, then escape them outside
ghb@10550 303 // quotes. \u and \U are not recognized within quotes. The same
ghb@10550 304 // logic applies to literals, but literals are never escaped.
ghb@10550 305 if (isLiteral ||
ghb@10550 306 (escapeUnprintable && ICU_Utility::isUnprintable(c))) {
ghb@10550 307 if (quoteBuf.length() > 0) {
ghb@10550 308 // We prefer backslash APOSTROPHE to double APOSTROPHE
ghb@10550 309 // (more readable, less similar to ") so if there are
ghb@10550 310 // double APOSTROPHEs at the ends, we pull them outside
ghb@10550 311 // of the quote.
ghb@10550 312
ghb@10550 313 // If the first thing in the quoteBuf is APOSTROPHE
ghb@10550 314 // (doubled) then pull it out.
ghb@10550 315 while (quoteBuf.length() >= 2 &&
ghb@10550 316 quoteBuf.charAt(0) == APOSTROPHE &&
ghb@10550 317 quoteBuf.charAt(1) == APOSTROPHE) {
ghb@10550 318 rule.append(BACKSLASH).append(APOSTROPHE);
ghb@10550 319 quoteBuf.remove(0, 2);
ghb@10550 320 }
ghb@10550 321 // If the last thing in the quoteBuf is APOSTROPHE
ghb@10550 322 // (doubled) then remove and count it and add it after.
ghb@10550 323 int32_t trailingCount = 0;
ghb@10550 324 while (quoteBuf.length() >= 2 &&
ghb@10550 325 quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
ghb@10550 326 quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
ghb@10550 327 quoteBuf.truncate(quoteBuf.length()-2);
ghb@10550 328 ++trailingCount;
ghb@10550 329 }
ghb@10550 330 if (quoteBuf.length() > 0) {
ghb@10550 331 rule.append(APOSTROPHE);
ghb@10550 332 rule.append(quoteBuf);
ghb@10550 333 rule.append(APOSTROPHE);
ghb@10550 334 quoteBuf.truncate(0);
ghb@10550 335 }
ghb@10550 336 while (trailingCount-- > 0) {
ghb@10550 337 rule.append(BACKSLASH).append(APOSTROPHE);
ghb@10550 338 }
ghb@10550 339 }
ghb@10550 340 if (c != (UChar32)-1) {
ghb@10550 341 /* Since spaces are ignored during parsing, they are
ghb@10550 342 * emitted only for readability. We emit one here
ghb@10550 343 * only if there isn't already one at the end of the
ghb@10550 344 * rule.
ghb@10550 345 */
ghb@10550 346 if (c == SPACE) {
ghb@10550 347 int32_t len = rule.length();
ghb@10550 348 if (len > 0 && rule.charAt(len-1) != c) {
ghb@10550 349 rule.append(c);
ghb@10550 350 }
ghb@10550 351 } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {
ghb@10550 352 rule.append(c);
ghb@10550 353 }
ghb@10550 354 }
ghb@10550 355 }
ghb@10550 356
ghb@10550 357 // Escape ' and '\' and don't begin a quote just for them
ghb@10550 358 else if (quoteBuf.length() == 0 &&
ghb@10550 359 (c == APOSTROPHE || c == BACKSLASH)) {
ghb@10550 360 rule.append(BACKSLASH);
ghb@10550 361 rule.append(c);
ghb@10550 362 }
ghb@10550 363
ghb@10550 364 // Specials (printable ascii that isn't [0-9a-zA-Z]) and
ghb@10550 365 // whitespace need quoting. Also append stuff to quotes if we are
ghb@10550 366 // building up a quoted substring already.
ghb@10550 367 else if (quoteBuf.length() > 0 ||
ghb@10550 368 (c >= 0x0021 && c <= 0x007E &&
ghb@10550 369 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
ghb@10550 370 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
ghb@10550 371 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
ghb@10550 372 PatternProps::isWhiteSpace(c)) {
ghb@10550 373 quoteBuf.append(c);
ghb@10550 374 // Double ' within a quote
ghb@10550 375 if (c == APOSTROPHE) {
ghb@10550 376 quoteBuf.append(c);
ghb@10550 377 }
ghb@10550 378 }
ghb@10550 379
ghb@10550 380 // Otherwise just append
ghb@10550 381 else {
ghb@10550 382 rule.append(c);
ghb@10550 383 }
ghb@10550 384 }
ghb@10550 385
ghb@10550 386 void ICU_Utility::appendToRule(UnicodeString& rule,
ghb@10550 387 const UnicodeString& text,
ghb@10550 388 UBool isLiteral,
ghb@10550 389 UBool escapeUnprintable,
ghb@10550 390 UnicodeString& quoteBuf) {
ghb@10550 391 for (int32_t i=0; i<text.length(); ++i) {
ghb@10550 392 appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
ghb@10550 393 }
ghb@10550 394 }
ghb@10550 395
ghb@10550 396 /**
ghb@10550 397 * Given a matcher reference, which may be null, append its
ghb@10550 398 * pattern as a literal to the given rule.
ghb@10550 399 */
ghb@10550 400 void ICU_Utility::appendToRule(UnicodeString& rule,
ghb@10550 401 const UnicodeMatcher* matcher,
ghb@10550 402 UBool escapeUnprintable,
ghb@10550 403 UnicodeString& quoteBuf) {
ghb@10550 404 if (matcher != NULL) {
ghb@10550 405 UnicodeString pat;
ghb@10550 406 appendToRule(rule, matcher->toPattern(pat, escapeUnprintable),
ghb@10550 407 TRUE, escapeUnprintable, quoteBuf);
ghb@10550 408 }
ghb@10550 409 }
ghb@10550 410
ghb@10550 411 U_NAMESPACE_END