changeset 58367:7898edac8a27

8216332: Grapheme regex does not work with emoji sequences
Reviewed-by: rriggs
author naoto
date Thu, 12 Mar 2020 08:31:26 -0700
parents d527da8f8f9b
children 910e8900f11d
files src/java.base/share/classes/java/util/regex/Grapheme.java test/jdk/java/util/regex/GraphemeTestCases.txt test/jdk/java/util/regex/RegExTest.java
diffstat 3 files changed, 78 insertions(+), 39 deletions(-) [+]
line wrap: on
line diff
--- a/src/java.base/share/classes/java/util/regex/Grapheme.java	Tue Feb 25 12:17:26 2020 +0100
+++ b/src/java.base/share/classes/java/util/regex/Grapheme.java	Thu Mar 12 08:31:26 2020 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -71,7 +71,7 @@
             int t1 = getGraphemeType(ch1);
 
             if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
-                gb11 = false;
+                // continue for gb11
             } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
                 // continue for gb12
             } else if (rules[t0][t1]) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/jdk/java/util/regex/GraphemeTestCases.txt	Thu Mar 12 08:31:26 2020 -0700
@@ -0,0 +1,33 @@
+#
+# Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This code is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 only, as
+# published by the Free Software Foundation.
+#
+# This code is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# version 2 for more details (a copy is included in the LICENSE file that
+# accompanied this code).
+#
+# You should have received a copy of the GNU General Public License version
+# 2 along with this work; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+# or visit www.oracle.com if you need additional information or have any
+# questions.
+#
+
+# test cases for Grapheme support. Format follows GraphemeBreakTest.txt
+# from Unicode.
+# https://unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
+
+# bug 8216332
+÷ 1F468 × 1F3FE ÷ 1F468 × 200D × 1F469 × 200D × 1F466 ÷
+÷ 1F468 × 200D × 1F469 × 200D × 1F466 × 200d ÷ 0041 ÷
+÷ 1F468 × 200D × 1F469 × 200D × 1F466 ÷ 0041 ÷
+÷ 0041 ÷ 1F468 × 200D × 1F469 × 200D × 1F466 ÷ 0041 ÷
+÷ 1F468 × 200D × 1F3FE × 200D × 1F469 × 1F3FE × 200D × 1F466 ÷ 0041 ÷
--- a/test/jdk/java/util/regex/RegExTest.java	Tue Feb 25 12:17:26 2020 +0100
+++ b/test/jdk/java/util/regex/RegExTest.java	Thu Mar 12 08:31:26 2020 -0700
@@ -36,6 +36,7 @@
  * 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
  * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
  * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
+ * 8216332
  *
  * @library /test/lib
  * @library /lib/testlibrary/java/lang
@@ -55,6 +56,8 @@
 import java.math.BigInteger;
 import java.nio.CharBuffer;
 import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -68,6 +71,8 @@
 import java.util.regex.MatchResult;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
+import java.util.stream.Stream;
+
 import jdk.test.lib.RandomFactory;
 
 /**
@@ -4790,47 +4795,48 @@
     }
 
     private static void grapheme() throws Exception {
-        Files.lines(UCDFiles.GRAPHEME_BREAK_TEST)
+        Stream.concat(Files.lines(UCDFiles.GRAPHEME_BREAK_TEST),
+                Files.lines(Paths.get(System.getProperty("test.src", "."), "GraphemeTestCases.txt")))
             .filter( ln -> ln.length() != 0 && !ln.startsWith("#") )
             .forEach( ln -> {
-                    ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
-                    // System.out.println(str);
-                    String[] strs = ln.split("\u00f7|\u00d7");
-                    StringBuilder src = new StringBuilder();
-                    ArrayList<String> graphemes = new ArrayList<>();
-                    StringBuilder buf = new StringBuilder();
-                    int offBk = 0;
-                    for (String str : strs) {
-                        if (str.length() == 0)  // first empty str
-                            continue;
-                        int cp = Integer.parseInt(str, 16);
-                        src.appendCodePoint(cp);
-                        buf.appendCodePoint(cp);
-                        offBk += (str.length() + 1);
-                        if (ln.charAt(offBk) == '\u00f7') {    // DIV
-                            graphemes.add(buf.toString());
-                            buf = new StringBuilder();
-                        }
+                ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
+                // System.out.println(str);
+                String[] strs = ln.split("\u00f7|\u00d7");
+                StringBuilder src = new StringBuilder();
+                ArrayList<String> graphemes = new ArrayList<>();
+                StringBuilder buf = new StringBuilder();
+                int offBk = 0;
+                for (String str : strs) {
+                    if (str.length() == 0)  // first empty str
+                        continue;
+                    int cp = Integer.parseInt(str, 16);
+                    src.appendCodePoint(cp);
+                    buf.appendCodePoint(cp);
+                    offBk += (str.length() + 1);
+                    if (ln.charAt(offBk) == '\u00f7') {    // DIV
+                        graphemes.add(buf.toString());
+                        buf = new StringBuilder();
                     }
-                    Pattern p = Pattern.compile("\\X");
-                    Matcher m = p.matcher(src.toString());
-                    Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
-                    for (String g : graphemes) {
-                        // System.out.printf("     grapheme:=[%s]%n", g);
-                        // (1) test \\X directly
-                        if (!m.find() || !m.group().equals(g)) {
-                            System.out.println("Failed \\X [" + ln + "] : " + g);
-                            failCount++;
-                        }
-                        // (2) test \\b{g} + \\X  via Scanner
-                        boolean hasNext = s.hasNext(p);
-                        // if (!s.hasNext() || !s.next().equals(next)) {
-                        if (!s.hasNext(p) || !s.next(p).equals(g)) {
-                            System.out.println("Failed b{g} [" + ln + "] : " + g);
-                            failCount++;
-                        }
+                }
+                Pattern p = Pattern.compile("\\X");
+                Matcher m = p.matcher(src.toString());
+                Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
+                for (String g : graphemes) {
+                    // System.out.printf("     grapheme:=[%s]%n", g);
+                    // (1) test \\X directly
+                    if (!m.find() || !m.group().equals(g)) {
+                        System.out.println("Failed \\X [" + ln + "] : " + g);
+                        failCount++;
                     }
-                });
+                    // (2) test \\b{g} + \\X  via Scanner
+                    boolean hasNext = s.hasNext(p);
+                    // if (!s.hasNext() || !s.next().equals(next)) {
+                    if (!s.hasNext(p) || !s.next(p).equals(g)) {
+                        System.out.println("Failed b{g} [" + ln + "] : " + g);
+                        failCount++;
+                    }
+                }
+            });
         // some sanity checks
         if (!Pattern.compile("\\X{10}").matcher("abcdefghij").matches() ||
             !Pattern.compile("\\b{g}(?:\\X\\b{g}){5}\\b{g}").matcher("abcde").matches() ||