2 * Copyright 2007-2009 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Utility routines for dealing with bytecode-level names.
30 * Includes universal mangling rules for the JVM.
32 * <h3>Avoiding Dangerous Characters </h3>
35 * The JVM defines a very small set of characters which are illegal
36 * in name spellings. We will slightly extend and regularize this set
37 * into a group of <cite>dangerous characters</cite>.
38 * These characters will then be replaced, in mangled names, by escape sequences.
39 * In addition, accidental escape sequences must be further escaped.
40 * Finally, a special prefix will be applied if and only if
41 * the mangling would otherwise fail to begin with the escape character.
42 * This happens to cover the corner case of the null string,
43 * and also clearly marks symbols which need demangling.
46 * Dangerous characters are the union of all characters forbidden
47 * or otherwise restricted by the JVM specification,
48 * plus their mates, if they are brackets
49 * (<code><big><b>[</b></big></code> and <code><big><b>]</b></big></code>,
50 * <code><big><b>&lt;</b></big></code> and <code><big><b>&gt;</b></big></code>),
51 * plus, arbitrarily, the colon character <code><big><b>:</b></big></code>.
52 * There is no distinction between type, method, and field names.
53 * This makes it easier to convert between mangled names of different
54 * types, since they do not need to be decoded (demangled).
57 * The escape character is backslash <code><big><b>\</b></big></code>
58 * (also known as reverse solidus).
59 * This character is, until now, unheard of in bytecode names,
60 * but traditional in the proposed role.
63 * <h3> Replacement Characters </h3>
67 * Every escape sequence is two characters
68 * (in fact, two UTF8 bytes) beginning with
69 * the escape character and followed by a
70 * <cite>replacement character</cite>.
71 * (Since the replacement character is never a backslash,
72 * iterated manglings do not double in size.)
75 * Each dangerous character has some rough visual similarity
76 * to its corresponding replacement character.
77 * This makes mangled symbols easier to recognize by sight.
80 * The dangerous characters are
81 * <code><big><b>/</b></big></code> (forward slash, used to delimit package components),
82 * <code><big><b>.</b></big></code> (dot, also a package delimiter),
83 * <code><big><b>;</b></big></code> (semicolon, used in signatures),
84 * <code><big><b>$</b></big></code> (dollar, used in inner classes and synthetic members),
85 * <code><big><b>&lt;</b></big></code> (left angle),
86 * <code><big><b>&gt;</b></big></code> (right angle),
87 * <code><big><b>[</b></big></code> (left square bracket, used in array types),
88 * <code><big><b>]</b></big></code> (right square bracket, reserved in this scheme for language use),
89 * and <code><big><b>:</b></big></code> (colon, reserved in this scheme for language use).
90 * Their replacements are, respectively,
91 * <code><big><b>|</b></big></code> (vertical bar),
92 * <code><big><b>,</b></big></code> (comma),
93 * <code><big><b>?</b></big></code> (question mark),
94 * <code><big><b>%</b></big></code> (percent),
95 * <code><big><b>^</b></big></code> (caret),
96 * <code><big><b>_</b></big></code> (underscore),
97 * <code><big><b>{</b></big></code> (left curly bracket),
98 * <code><big><b>}</b></big></code> (right curly bracket), and
99 * <code><big><b>!</b></big></code> (exclamation mark).
100 * In addition, the replacement character for the escape character itself is
101 * <code><big><b>-</b></big></code> (hyphen),
102 * and the replacement character for the null prefix is
103 * <code><big><b>=</b></big></code> (equal sign).
106 * An escape character <code><big><b>\</b></big></code>
107 * followed by any of these replacement characters
108 * is an escape sequence, and there are no other escape sequences.
109 * An equal sign is only part of an escape sequence
110 * if it is the second character in the whole string, following a backslash.
111 * Two consecutive backslashes do <em>not</em> form an escape sequence.
114 * Each escape sequence replaces a so-called <cite>original character</cite>
115 * which is either one of the dangerous characters or the escape character.
116 * A null prefix replaces an initial null string, not a character.
119 * All this implies that escape sequences cannot overlap and may be
120 * determined all at once for a whole string. Note that a spelling
121 * string can contain <cite>accidental escapes</cite>, apparent escape
122 * sequences which must not be interpreted as manglings.
123 * These are disabled by replacing their leading backslash with an
124 * escape sequence (<code><big><b>\-</b></big></code>). To mangle a string, three logical steps
125 * are required, though they may be carried out in one pass:
128 * <li>In each accidental escape, replace the backslash with an escape sequence
129 * (<code><big><b>\-</b></big></code>).</li>
130 * <li>Replace each dangerous character with an escape sequence
131 * (<code><big><b>\|</b></big></code> for <code><big><b>/</b></big></code>, etc.).</li>
132 * <li>If the first two steps introduced any change, <em>and</em>
133 * if the string does not already begin with a backslash, prepend a null prefix (<code><big><b>\=</b></big></code>).</li>
136 * To demangle a mangled string that begins with an escape,
137 * remove any null prefix, and then replace (in parallel)
138 * each escape sequence by its original character.
139 * <p>Spelling strings which contain accidental
140 * escapes <em>must</em> have them replaced, even if those
141 * strings do not contain dangerous characters.
142 * This restriction means that mangling a string always
143 * requires a scan of the string for escapes.
144 * But then, a scan would be required anyway,
145 * to check for dangerous characters.
148 * <h3> Nice Properties </h3>
151 * If a bytecode name does not contain any escape sequence,
152 * demangling is a no-op: The string demangles to itself.
153 * Such a string is called <cite>self-mangling</cite>.
154 * Almost all strings are self-mangling.
155 * In practice, to demangle almost any name “found in nature”,
156 * simply verify that it does not begin with a backslash.
159 * Mangling is a one-to-one function, while demangling
160 * is a many-to-one function.
161 * A mangled string is defined as <cite>validly mangled</cite> if
162 * it is in fact the unique mangling of its spelling string.
163 * Three examples of invalidly mangled strings are <code><big><b>\=foo</b></big></code>,
164 * <code><big><b>\-bar</b></big></code>, and <code><big><b>baz\!</b></big></code>, which demangle to <code><big><b>foo</b></big></code>, <code><big><b>\bar</b></big></code>, and
165 * <code><big><b>baz\!</b></big></code>, but then remangle to <code><big><b>foo</b></big></code>, <code><big><b>\bar</b></big></code>, and <code><big><b>\=baz\-!</b></big></code>.
166 * If a language back-end or runtime is using mangled names,
167 * it should never present an invalidly mangled bytecode
168 * name to the JVM. If the runtime encounters one,
169 * it should also report an error, since such an occurrence
170 * probably indicates a bug in name encoding which
171 * will lead to errors in linkage.
172 * However, this note does not propose that the JVM verifier
173 * detect invalidly mangled names.
176 * As a result of these rules, it is a simple matter to
177 * compute validly mangled substrings and concatenations
178 * of validly mangled strings, and (with a little care)
179 * these correspond to the same operations on their spelling strings:
183 * <li>Any prefix of a validly mangled string is also validly mangled,
184 * although a null prefix may need to be removed.</li>
185 * <li>Any suffix of a validly mangled string is also validly mangled,
186 * although a null prefix may need to be added.</li>
187 * <li>Two validly mangled strings, when concatenated,
188 * are also validly mangled, although any null prefix
189 * must be removed from the second string,
190 * and a trailing backslash on the first string may need escaping,
191 * if it would participate in an accidental escape when followed
192 * by the first character of the second string.</li>
194 * <p>If languages that include non-Java symbol spellings use this
195 * mangling convention, they will enjoy the following advantages:
198 * <li>They can interoperate via symbols they share in common.</li>
199 * <li>Low-level tools, such as backtrace printers, will have readable displays.</li>
200 * <li>Future JVM and language extensions can safely use the dangerous characters
201 * for structuring symbols, but will never interfere with valid spellings.</li>
202 * <li>Runtimes and compilers can use standard libraries for mangling and demangling.</li>
203 * <li>Occasional transliterations and name composition will be simple and regular,
204 * for classes, methods, and fields.</li>
205 * <li>Bytecode names will continue to be compact.
206 * When mangled, spellings will at most double in length, either in
207 * UTF8 or UTF16 format, and most will not change at all.</li>
211 * <h3> Suggestions for Human Readable Presentations </h3>
215 * For human readable displays of symbols,
216 * it will be better to present a string-like quoted
217 * representation of the spelling, because JVM users
218 * are generally familiar with such tokens.
219 * We suggest using single or double quotes before and after
220 * mangled symbols which are not valid Java identifiers,
221 * with quotes, backslashes, and non-printing characters
222 * escaped as if for literals in the Java language.
225 * For example, an HTML-like spelling
226 * <code><big><b>&lt;pre&gt;</b></big></code> mangles to
227 * <code><big><b>\^pre\_</b></big></code> and could
228 * display more cleanly as
229 * <code><big><b>'&lt;pre&gt;'</b></big></code>,
230 * with the quotes included.
231 * Such string-like conventions are <em>not</em> suitable
232 * for mangled bytecode names, in part because
233 * dangerous characters must be eliminated, rather
234 * than just quoted. Otherwise internally structured
235 * strings like package prefixes and method signatures
236 * could not be reliably parsed.
239 * In such human-readable displays, invalidly mangled
240 * names should <em>not</em> be demangled and quoted,
241 * for this would be misleading. Likewise, JVM symbols
242 * which contain dangerous characters (like dots in field
243 * names or brackets in method names) should not be
244 * simply quoted. The bytecode names
245 * <code><big><b>\=phase\,1</b></big></code> and
246 * <code><big><b>phase.1</b></big></code> are distinct,
247 * and in demangled displays they should be presented as
248 * <code><big><b>'phase.1'</b></big></code> and something like
249 * <code><big><b>'phase'.1</b></big></code>, respectively.
253 * @version 1.2, 02/06/2008
254 * @see <a href="http://blogs.sun.com/jrose/entry/symbolic_freedom_in_the_vm">http://blogs.sun.com/jrose/entry/symbolic_freedom_in_the_vm</a>
256 public class BytecodeName {
// Private no-argument constructor: the class cannot be instantiated from outside.
257 private BytecodeName() { } // static only class
259 /** Given a source name, produce the corresponding bytecode name.
260 * The source name should not be qualified, because any syntactic
261 * markers (dots, slashes, dollar signs, colons, etc.) will be mangled.
262 * @param s the source name
263 * @return a valid bytecode name which represents the source name
// Mangles a source-level name into its bytecode-level spelling by delegating to mangle(s).
// The two assert statements only run when assertions are enabled (-ea).
265 public static String toBytecodeName(String s) {
266 String bn = mangle(s);
// Identity (not equals) comparison: passes if mangle handed back s itself;
// otherwise the result must start with the escape character.
267 assert((Object)bn == s || looksMangled(bn)) : bn;
// Round-trip check: converting the result back must reproduce the input.
268 assert(s.equals(toSourceName(bn))) : s;
272 /** Given an unqualified bytecode name, produce the corresponding source name.
273 * The bytecode name must not contain dangerous characters.
274 * In particular, it must not be qualified or segmented by colon {@code ':'}.
275 * @param s the bytecode name
276 * @return the source name, which may possibly have unsafe characters
277 * @throws IllegalArgumentException if the bytecode name is not {@link #isSafeBytecodeName safe}
278 * @see #isSafeBytecodeName(java.lang.String)
// Converts a bytecode name back to its source spelling.
280 public static String toSourceName(String s) {
// Throws IllegalArgumentException unless isSafeBytecodeName(s) holds.
281 checkSafeBytecodeName(s);
// Only a name whose first character is the escape character is treated as mangled.
283 if (looksMangled(s)) {
// Round-trip check (assertions enabled only): re-mangling sn must give back s.
285 assert(s.equals(mangle(sn))) : s+" => "+sn+" => "+mangle(sn);
291 * Given a bytecode name from a classfile, separate it into
292 * components delimited by dangerous characters.
293 * Each resulting array element will be either a dangerous character,
294 * or else a safe bytecode name.
295 * (The safe name might possibly be mangled to hide further dangerous characters.)
296 * For example, the qualified class name {@code java/lang/String}
297 * will be parsed into the array {@code {"java", '/', "lang", '/', "String"}}.
298 * The name {@code <init>} will be parsed into {@code {'<', "init", '>'}}.
299 * The name {@code foo/bar$:baz} will be parsed into
300 * {@code {"foo", '/', "bar", '$', ':', "baz"}}.
302 public static Object[] parseBytecodeName(String s) {
303 int slen = s.length();
305 for (int pass = 0; pass <= 1; pass++) {
308 for (int i = 0; i <= slen; i++) {
311 whichDC = DANGEROUS_CHARS.indexOf(s.charAt(i));
312 if (whichDC < DANGEROUS_CHAR_FIRST_INDEX) continue;
314 // got to end of string or next dangerous char
318 res[fillp] = s.substring(lasti, i);
322 if (whichDC >= DANGEROUS_CHAR_FIRST_INDEX) {
324 res[fillp] = DANGEROUS_CHARS_CA[whichDC];
328 if (pass != 0) break;
329 // between passes, build the result array
330 res = new String[fillp];
332 if (fillp != 0) res[0] = s;
340 * Given a series of components, create a bytecode name for a classfile.
341 * This is the inverse of {@link #parseBytecodeName(java.lang.String)}.
342 * Each component must either be an interned one-character string of
343 * a dangerous character, or else a safe bytecode name.
344 * @param components a series of name components
345 * @return the concatenation of all components
346 * @throws IllegalArgumentException if any component contains an unsafe
347 * character, and is not an interned one-character string
348 * @throws NullPointerException if any component is null
// Validates the String components, then joins all components via appendAll.
350 public static String unparseBytecodeName(Object[] components) {
351 for (Object c : components) {
// Only String components are checked; anything else (including null, which is
// not an instance of String) passes through this loop untouched.
// NOTE(review): the documented NullPointerException is therefore not raised here -- confirm.
352 if (c instanceof String)
353 checkSafeBytecodeName((String) c); // may fail
355 return appendAll(components);
// Concatenates the string forms of the components into one String.
357 private static String appendAll(Object[] components) {
// Arrays of length 0 or 1 are handled without a StringBuilder.
358 if (components.length <= 1) {
359 if (components.length == 1) {
// String.valueOf yields "null" for a null element rather than throwing.
360 return String.valueOf(components[0]);
// First pass: add up the lengths of the String components so the builder can be pre-sized.
365 for (Object c : components) {
366 if (c instanceof String)
367 slen += String.valueOf(c).length();
371 StringBuilder sb = new StringBuilder(slen);
// Second pass over the components, presumably appending each one to sb
// (the append statement is not visible in this excerpt).
372 for (Object c : components) {
375 return sb.toString();
379 * Given a bytecode name, produce the corresponding display name.
380 * This is the source name, plus quotes if needed.
381 * If the bytecode name contains dangerous characters,
382 * assume that they are being used as punctuation,
383 * and pass them through unchanged.
384 * @param s the original bytecode name (which may be qualified)
385 * @return a human-readable presentation
// Builds a human-readable form: each String component is demangled and, where needed, quoted.
387 public static String toDisplayName(String s) {
388 Object[] components = parseBytecodeName(s);
389 for (int i = 0; i < components.length; i++) {
// Non-String components are the dangerous-character entries produced by the parser.
390 if (!(components[i] instanceof String))
392 String c = (String) components[i];
393 // pretty up the name by demangling it
394 String sn = toSourceName(c);
// Quote when demangling returned a different String object (identity test),
// or when the spelling does not pass isJavaIdent.
395 if ((Object)sn != c || !isJavaIdent(sn)) {
396 components[i] = quoteDisplay(sn);
399 return appendAll(components);
401 private static boolean isJavaIdent(String s) {
402 int slen = s.length();
403 if (slen == 0) return false;
404 if (!Character.isUnicodeIdentifierStart(s.charAt(0)))
406 for (int i = 1; i < slen; i++) {
407 if (!Character.isUnicodeIdentifierPart(s.charAt(0)))
// Wraps s in single quotes; every single quote or backslash inside s is preceded by a backslash.
412 private static String quoteDisplay(String s) {
413 // TO DO: Replace weird characters in s by C-style escapes.
// The pattern is a character class matching a single quote or a backslash; the
// replacement emits a literal backslash followed by the matched character ($0).
414 return "'"+s.replaceAll("['\\\\]", "\\\\$0")+"'";
// Guard used by the public entry points: rejects names for which isSafeBytecodeName is false.
// The exception message is the offending name itself.
417 private static void checkSafeBytecodeName(String s)
418 throws IllegalArgumentException {
419 if (!isSafeBytecodeName(s)) {
420 throw new IllegalArgumentException(s);
425 * Report whether a simple name is safe as a bytecode name.
426 * Such names are acceptable in class files as class, method, and field names.
427 * Additionally, they are free of "dangerous" characters, even if those
428 * characters are legal in some (or all) names in class files.
429 * @param s the proposed bytecode name
430 * @return true if the name is non-empty and all of its characters are safe
432 public static boolean isSafeBytecodeName(String s) {
// The empty name is never safe.
433 if (s.length() == 0) return false;
434 // check occurrences of each DANGEROUS char
435 for (char xc : DANGEROUS_CHARS_A) {
// The backslash is listed in DANGEROUS_CHARS but is tolerated here: it is the escape character.
436 if (xc == ESCAPE_C) continue; // not really that dangerous
437 if (s.indexOf(xc) >= 0) return false;
443 * Report whether a character is safe in a bytecode name.
444 * This is true of any unicode character except the following
445 * <em>dangerous characters</em>: {@code ".;:$[]<>/"}.
446 * @param c the proposed character
447 * @return true if the character is safe to use in classfiles
// indexOf gives -1 for characters outside DANGEROUS_CHARS and 0 for the backslash;
// both are below DANGEROUS_CHAR_FIRST_INDEX, so the backslash is reported as safe.
449 public static boolean isSafeBytecodeChar(char c) {
450 return DANGEROUS_CHARS.indexOf(c) < DANGEROUS_CHAR_FIRST_INDEX;
// True when the first character of s is the escape character.
// charAt(0) throws StringIndexOutOfBoundsException for an empty string,
// so callers must pass a non-empty name.
453 private static boolean looksMangled(String s) {
454 return s.charAt(0) == ESCAPE_C;
// Produces the mangled spelling of s. The output buffer is created only when the
// first character needing an escape is found. Several statements of this method
// are not visible in this excerpt.
457 private static String mangle(String s) {
461 // build this lazily, when we first need an escape:
462 StringBuilder sb = null;
464 for (int i = 0, slen = s.length(); i < slen; i++) {
465 char c = s.charAt(i);
467 boolean needEscape = false;
470 char c1 = s.charAt(i+1);
// c1 != originalOfReplacement(c1) is true when c1 is one of REPLACEMENT_CHARS.
// The null-prefix character '=' is only considered directly after a backslash at index 0.
471 if ((i == 0 && c1 == NULL_ESCAPE_C)
472 || c1 != originalOfReplacement(c1)) {
473 // an accidental escape
478 needEscape = isDangerous(c);
// Once the buffer exists, unescaped characters are copied through as they are.
482 if (sb != null) sb.append(c);
486 // build sb if this is the first escape
488 sb = new StringBuilder(s.length()+10);
489 // mangled names must begin with a backslash:
// The null prefix is skipped when s already starts with a backslash, and also when
// the escape being written is at index 0.
490 if (s.charAt(0) != ESCAPE_C && i > 0)
491 sb.append(NULL_ESCAPE);
492 // append the string so far, which is unremarkable:
493 sb.append(s.substring(0, i));
496 // rewrite \ to \-, / to \|, etc.
498 sb.append(replacementOf(c));
501 if (sb != null) return sb.toString();
// Reverses mangle. The output buffer is created only when the first escape sequence
// is met. Several statements of this method are not visible in this excerpt.
506 private static String demangle(String s) {
507 // build this lazily, when we first meet an escape:
508 StringBuilder sb = null;
// A leading null prefix (NULL_ESCAPE) is recognized here; stringStart is the index
// at which scanning begins.
511 if (s.startsWith(NULL_ESCAPE))
514 for (int i = stringStart, slen = s.length(); i < slen; i++) {
515 char c = s.charAt(i);
// A backslash in the last position has no following character and is not examined further.
517 if (c == ESCAPE_C && i+1 < slen) {
518 // might be an escape sequence
519 char rc = s.charAt(i+1);
520 char oc = originalOfReplacement(rc);
522 // build sb if this is the first escape
524 sb = new StringBuilder(s.length());
525 // append the string so far, which is unremarkable:
526 sb.append(s.substring(stringStart, i));
528 ++i; // skip both characters
537 if (sb != null) return sb.toString();
// No buffer was created: return the input from stringStart onward.
539 return s.substring(stringStart);
// Escape character and null-prefix constants.
// NOTE(review): ESCAPE_C, NULL_ESCAPE_C and NULL_ESCAPE are not declared final.
542 static char ESCAPE_C = '\\';
543 // empty escape sequence to avoid a null name or illegal prefix
544 static char NULL_ESCAPE_C = '=';
545 static String NULL_ESCAPE = ESCAPE_C+""+NULL_ESCAPE_C;
// DANGEROUS_CHARS and REPLACEMENT_CHARS are parallel tables: replacementOf and
// originalOfReplacement translate by looking up the same index in the other string.
547 static final String DANGEROUS_CHARS = "\\/.;:$[]<>"; // \\ must be first
548 static final String REPLACEMENT_CHARS = "-|,?!%{}^_";
549 static final int DANGEROUS_CHAR_FIRST_INDEX = 1; // index after \\
// char[] copies of the two tables (also not declared final).
550 static char[] DANGEROUS_CHARS_A = DANGEROUS_CHARS.toCharArray();
551 static char[] REPLACEMENT_CHARS_A = REPLACEMENT_CHARS.toCharArray();
// Boxed copies of the dangerous characters, indexed like DANGEROUS_CHARS.
552 static final Character[] DANGEROUS_CHARS_CA;
554 Character[] dcca = new Character[DANGEROUS_CHARS.length()];
555 for (int i = 0; i < dcca.length; i++)
556 dcca[i] = Character.valueOf(DANGEROUS_CHARS.charAt(i));
557 DANGEROUS_CHARS_CA = dcca;
// Membership bitmap for the special characters (the dangerous characters plus the
// replacement characters), covering char codes 0..127.
560 static final long[] SPECIAL_BITMAP = new long[2]; // 128 bits
562 String SPECIAL = DANGEROUS_CHARS + REPLACEMENT_CHARS;
563 //System.out.println("SPECIAL = "+SPECIAL);
564 for (char c : SPECIAL.toCharArray()) {
// Word index is c >>> 6; a long shift uses only the low six bits of the distance,
// so 1L << c selects bit (c & 63) within that word.
565 SPECIAL_BITMAP[c >>> 6] |= 1L << c;
// Tests the bit for c in SPECIAL_BITMAP; the range check keeps the array index in bounds.
568 static boolean isSpecial(char c) {
569 if ((c >>> 6) < SPECIAL_BITMAP.length)
570 return ((SPECIAL_BITMAP[c >>> 6] >> c) & 1) != 0;
// Maps a character found in DANGEROUS_CHARS to the character at the same index of
// REPLACEMENT_CHARS. Characters that are not special are returned unchanged.
574 static char replacementOf(char c) {
// Cheap bitmap test first, so most characters skip the indexOf search.
575 if (!isSpecial(c)) return c;
576 int i = DANGEROUS_CHARS.indexOf(c);
578 return REPLACEMENT_CHARS.charAt(i);
// Inverse of replacementOf: maps a character found in REPLACEMENT_CHARS to the character
// at the same index of DANGEROUS_CHARS. Characters that are not special are returned unchanged.
580 static char originalOfReplacement(char c) {
// Cheap bitmap test first, so most characters skip the indexOf search.
581 if (!isSpecial(c)) return c;
582 int i = REPLACEMENT_CHARS.indexOf(c);
584 return DANGEROUS_CHARS.charAt(i);
// True for the characters of DANGEROUS_CHARS other than the backslash at index 0.
586 static boolean isDangerous(char c) {
587 if (!isSpecial(c)) return false;
588 return (DANGEROUS_CHARS.indexOf(c) >= DANGEROUS_CHAR_FIRST_INDEX);
// Scans forward from index 'from' for a character satisfying isDangerous.
590 static int indexOfDangerousChar(String s, int from) {
591 for (int i = from, slen = s.length(); i < slen; i++) {
592 if (isDangerous(s.charAt(i)))
// Scans backward, starting at min(from, s.length()-1), for a character satisfying isDangerous.
597 static int lastIndexOfDangerousChar(String s, int from) {
598 for (int i = Math.min(from, s.length()-1); i >= 0; i--) {
599 if (isDangerous(s.charAt(i)))
// Self-test driver: parses leading flag arguments, prepares a token list, and runs a Tester.
// It is declared static but not public. Several statements are not visible in this excerpt.
606 static void main(String[] av) {
607 // If verbose is enabled, quietly check everything.
608 // Otherwise, print the output for the user to check.
609 boolean verbose = false;
// Consume leading arguments that start with "-".
613 while (av.length > 0 && av[0].startsWith("-")) {
// intern() makes the == comparisons against string literals below valid.
614 String flag = av[0].intern();
615 av = java.util.Arrays.copyOfRange(av, 1, av.length); // Java 1.6 or later
// "-" or "--" ends flag processing.
616 if (flag == "-" || flag == "--") break;
617 else if (flag == "-q")
619 else if (flag == "-v")
// -l<N>: the text after "-l" is parsed as the value of maxlen.
621 else if (flag.startsWith("-l"))
622 maxlen = Integer.valueOf(flag.substring(2));
624 throw new Error("Illegal flag argument: "+flag);
628 maxlen = (verbose ? 2 : 4);
629 if (verbose) System.out.println("Note: maxlen = "+maxlen);
// With no remaining arguments, a default string built from the character tables is used.
632 case 0: av = new String[] {
633 DANGEROUS_CHARS.substring(0) +
634 REPLACEMENT_CHARS.substring(0, 1) +
636 }; // and fall through:
// A single argument is split into one entry per character.
638 char[] cv = av[0].toCharArray();
639 av = new String[cv.length];
642 String s = String.valueOf(c);
643 if (c == 'x') s = "foo"; // tradition...
648 System.out.println("Note: Verbose output mode enabled. Use '-q' to suppress.");
649 Tester t = new Tester();
// Round-trip tester used by main. Several field declarations and statements
// are not visible in this excerpt.
656 static class Tester {
// Consulted by testSourceName to skip names it has seen; presumably filled there as well.
659 java.util.Map<String,String> map = new java.util.HashMap<String,String>();
// Recursively extends stringSoFar with tokens while the token count allows.
662 void test(String stringSoFar, int tokensSoFar) {
664 if (tokensSoFar <= maxlen) {
665 for (String token : tokens) {
666 if (token.length() == 0) continue; // skip empty tokens
// indexOf != lastIndexOf means the token occurs at two different positions already.
667 if (stringSoFar.indexOf(token) != stringSoFar.lastIndexOf(token))
668 continue; // there are already two occs. of this token
// A lone backslash token is added without increasing the token count when maxlen < 4.
669 if (token.charAt(0) == ESCAPE_C && token.length() == 1 && maxlen < 4)
670 test(stringSoFar+token, tokensSoFar); // want lots of \'s
671 else if (tokensSoFar < maxlen)
672 test(stringSoFar+token, tokensSoFar+1);
// Checks one string, then feeds each result back through testSourceName up to two more times.
677 void test(String s) {
678 // for small batches, do not test the null string
679 if (s.length() == 0 && maxlen >=1 && maxlen <= 2) return;
680 String bn = testSourceName(s);
// null means testSourceName skipped this name.
681 if (bn == null) return;
683 //if (verbose) System.out.println(s+" == id");
685 if (verbose) System.out.println(s+" => "+bn+" "+toDisplayName(bn));
686 String bnbn = testSourceName(bn);
687 if (bnbn == null) return;
688 if (verbose) System.out.println(bn+" => "+bnbn+" "+toDisplayName(bnbn));
690 String bn3 = testSourceName(bnbn);
691 if (bn3 == null) return;
692 if (verbose) System.out.println(bnbn+" => "+bn3);
// Mangles s and converts the result back; a mismatch raises an Error unless verbose,
// in which case it is printed instead.
697 String testSourceName(String s) {
698 if (map.containsKey(s)) return null;
699 String bn = toBytecodeName(s);
701 String sn = toSourceName(bn);
703 String bad = (s+" => "+bn+" != "+sn);
704 if (!verbose) throw new Error("Bad mangling: "+bad);
705 System.out.println("*** "+bad);