001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.base; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkPositionIndex; 020 021import com.google.common.annotations.GwtCompatible; 022import com.google.common.annotations.GwtIncompatible; 023import com.google.common.annotations.VisibleForTesting; 024import java.util.Arrays; 025import java.util.BitSet; 026 027/** 028 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 029 * for any {@link Object}. Also offers basic text processing methods based on this function. 030 * Implementations are strongly encouraged to be side-effect-free and immutable. 031 * 032 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 033 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}". 034 * 035 * <p><b>Warning:</b> This class deals only with {@code char} values, that is, <a 036 * href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>. It does not understand 037 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode code 038 * points</a> in the range {@code 0x10000} to {@code 0x10FFFF} which includes the majority of 039 * assigned characters, including important CJK characters and emoji. 040 * 041 * <p>Supplementary characters are <a 042 * href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">encoded 043 * into a {@code String} using surrogate pairs</a>, and a {@code CharMatcher} treats these just as 044 * two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s. 045 * 046 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for 047 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For 048 * basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner. 049 * 050 * <p>Example usages: 051 * 052 * <pre> 053 * String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput); 054 * if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 055 * 056 * <p>See the Guava User Guide article on <a 057 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher} 058 * </a>. 059 * 060 * @author Kevin Bourrillion 061 * @since 1.0 062 */ 063@GwtCompatible(emulated = true) 064public abstract class CharMatcher implements Predicate<Character> { 065 /* 066 * N777777777NO 067 * N7777777777777N 068 * M777777777777777N 069 * $N877777777D77777M 070 * N M77777777ONND777M 071 * MN777777777NN D777 072 * N7ZN777777777NN ~M7778 073 * N777777777777MMNN88777N 074 * N777777777777MNZZZ7777O 075 * DZN7777O77777777777777 076 * N7OONND7777777D77777N 077 * 8$M++++?N???$77777$ 078 * M7++++N+M77777777N 079 * N77O777777777777$ M 080 * DNNM$$$$777777N D 081 * N$N:=N$777N7777M NZ 082 * 77Z::::N777777777 ODZZZ 083 * 77N::::::N77777777M NNZZZ$ 084 * $777:::::::77777777MN ZM8ZZZZZ 085 * 777M::::::Z7777777Z77 N++ZZZZNN 086 * 7777M:::::M7777777$777M $++IZZZZM 087 * M777$:::::N777777$M7777M +++++ZZZDN 088 * NN$::::::7777$$M777777N N+++ZZZZNZ 089 * N::::::N:7$O:77777777 N++++ZZZZN 090 * M::::::::::::N77777777+ +?+++++ZZZM 091 * 8::::::::::::D77777777M O+++++ZZ 092 * ::::::::::::M777777777N O+?D 093 * M:::::::::::M77777777778 77= 094 * D=::::::::::N7777777777N 777 095 * INN===::::::=77777777777N I777N 096 * ?777N========N7777777777787M N7777 097 * 77777$D======N77777777777N777N? N777777 098 * I77777$$$N7===M$$77777777$77777777$MMZ77777777N 099 * $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON 100 * M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND 101 * O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N 102 * 7 :N MNN$$$$M$ $$$777$8 8D8I 103 * NMM.:7O 777777778 104 * 7777777MN 105 * M NO .7: 106 * M : M 107 * 8 108 */ 109 110 // Constant matcher factory methods 111 112 /** 113 * Matches any character. 114 * 115 * @since 19.0 (since 1.0 as constant {@code ANY}) 116 */ 117 public static CharMatcher any() { 118 return Any.INSTANCE; 119 } 120 121 /** 122 * Matches no characters. 123 * 124 * @since 19.0 (since 1.0 as constant {@code NONE}) 125 */ 126 public static CharMatcher none() { 127 return None.INSTANCE; 128 } 129 130 /** 131 * Determines whether a character is whitespace according to the latest Unicode standard, as 132 * illustrated <a 133 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 134 * This is not the same definition used by other Java APIs. (See a <a 135 * href="https://goo.gl/Y6SLWx">comparison of several definitions of "whitespace"</a>.) 136 * 137 * <p>All Unicode White_Space characters are on the BMP and thus supported by this API. 138 * 139 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to 140 * date. 141 * 142 * @since 19.0 (since 1.0 as constant {@code WHITESPACE}) 143 */ 144 public static CharMatcher whitespace() { 145 return Whitespace.INSTANCE; 146 } 147 148 /** 149 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 150 * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a 151 * discussion of that term. 152 * 153 * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE}) 154 */ 155 public static CharMatcher breakingWhitespace() { 156 return BreakingWhitespace.INSTANCE; 157 } 158 159 /** 160 * Determines whether a character is ASCII, meaning that its code point is less than 128. 161 * 162 * @since 19.0 (since 1.0 as constant {@code ASCII}) 163 */ 164 public static CharMatcher ascii() { 165 return Ascii.INSTANCE; 166 } 167 168 /** 169 * Determines whether a character is a BMP digit according to <a 170 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If 171 * you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 172 * 173 * @deprecated Many digits are supplementary characters; see the class documentation. 174 * @since 19.0 (since 1.0 as constant {@code DIGIT}) 175 */ 176 @Deprecated 177 public static CharMatcher digit() { 178 return Digit.INSTANCE; 179 } 180 181 /** 182 * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char) 183 * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0', 184 * '9')}. 185 * 186 * @deprecated Many digits are supplementary characters; see the class documentation. 187 * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT}) 188 */ 189 @Deprecated 190 public static CharMatcher javaDigit() { 191 return JavaDigit.INSTANCE; 192 } 193 194 /** 195 * Determines whether a character is a BMP letter according to {@linkplain 196 * Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin 197 * alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}. 198 * 199 * @deprecated Most letters are supplementary characters; see the class documentation. 200 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER}) 201 */ 202 @Deprecated 203 public static CharMatcher javaLetter() { 204 return JavaLetter.INSTANCE; 205 } 206 207 /** 208 * Determines whether a character is a BMP letter or digit according to {@linkplain 209 * Character#isLetterOrDigit(char) Java's definition}. 210 * 211 * @deprecated Most letters and digits are supplementary characters; see the class documentation. 212 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}). 213 */ 214 @Deprecated 215 public static CharMatcher javaLetterOrDigit() { 216 return JavaLetterOrDigit.INSTANCE; 217 } 218 219 /** 220 * Determines whether a BMP character is upper case according to {@linkplain 221 * Character#isUpperCase(char) Java's definition}. 222 * 223 * @deprecated Some uppercase characters are supplementary characters; see the class 224 * documentation. 225 * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE}) 226 */ 227 @Deprecated 228 public static CharMatcher javaUpperCase() { 229 return JavaUpperCase.INSTANCE; 230 } 231 232 /** 233 * Determines whether a BMP character is lower case according to {@linkplain 234 * Character#isLowerCase(char) Java's definition}. 235 * 236 * @deprecated Some lowercase characters are supplementary characters; see the class 237 * documentation. 238 * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE}) 239 */ 240 @Deprecated 241 public static CharMatcher javaLowerCase() { 242 return JavaLowerCase.INSTANCE; 243 } 244 245 /** 246 * Determines whether a character is an ISO control character as specified by {@link 247 * Character#isISOControl(char)}. 248 * 249 * <p>All ISO control codes are on the BMP and thus supported by this API. 250 * 251 * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL}) 252 */ 253 public static CharMatcher javaIsoControl() { 254 return JavaIsoControl.INSTANCE; 255 } 256 257 /** 258 * Determines whether a character is invisible; that is, if its Unicode category is any of 259 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 260 * PRIVATE_USE according to ICU4J. 261 * 262 * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU). 263 * 264 * @deprecated Most invisible characters are supplementary characters; see the class 265 * documentation. 266 * @since 19.0 (since 1.0 as constant {@code INVISIBLE}) 267 */ 268 @Deprecated 269 public static CharMatcher invisible() { 270 return Invisible.INSTANCE; 271 } 272 273 /** 274 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 275 * errs on the side of returning {@code false} (that is, it tends to assume a character is 276 * double-width). 277 * 278 * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to 279 * date. 280 * 281 * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>. 282 * 283 * @deprecated Many such characters are supplementary characters; see the class documentation. 284 * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH}) 285 */ 286 @Deprecated 287 public static CharMatcher singleWidth() { 288 return SingleWidth.INSTANCE; 289 } 290 291 // Static factories 292 293 /** Returns a {@code char} matcher that matches only one specified BMP character. */ 294 public static CharMatcher is(final char match) { 295 return new Is(match); 296 } 297 298 /** 299 * Returns a {@code char} matcher that matches any character except the BMP character specified. 300 * 301 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 302 */ 303 public static CharMatcher isNot(final char match) { 304 return new IsNot(match); 305 } 306 307 /** 308 * Returns a {@code char} matcher that matches any BMP character present in the given character 309 * sequence. Returns a bogus matcher if the sequence contains supplementary characters. 310 */ 311 public static CharMatcher anyOf(final CharSequence sequence) { 312 switch (sequence.length()) { 313 case 0: 314 return none(); 315 case 1: 316 return is(sequence.charAt(0)); 317 case 2: 318 return isEither(sequence.charAt(0), sequence.charAt(1)); 319 default: 320 // TODO(lowasser): is it potentially worth just going ahead and building a precomputed 321 // matcher? 322 return new AnyOf(sequence); 323 } 324 } 325 326 /** 327 * Returns a {@code char} matcher that matches any BMP character not present in the given 328 * character sequence. Returns a bogus matcher if the sequence contains supplementary characters. 329 */ 330 public static CharMatcher noneOf(CharSequence sequence) { 331 return anyOf(sequence).negate(); 332 } 333 334 /** 335 * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints 336 * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 337 * CharMatcher.inRange('a', 'z')}. 338 * 339 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 340 */ 341 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 342 return new InRange(startInclusive, endInclusive); 343 } 344 345 /** 346 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 347 * which operates on primitive {@code char} instances instead. 348 */ 349 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 350 return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate); 351 } 352 353 // Constructors 354 355 /** 356 * Constructor for use by subclasses. When subclassing, you may want to override {@code 357 * toString()} to provide a useful description. 358 */ 359 protected CharMatcher() {} 360 361 // Abstract methods 362 363 /** Determines a true or false value for the given character. */ 364 public abstract boolean matches(char c); 365 366 // Non-static factories 367 368 /** Returns a matcher that matches any character not matched by this matcher. */ 369 // @Override under Java 8 but not under Java 7 370 public CharMatcher negate() { 371 return new Negated(this); 372 } 373 374 /** 375 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 376 */ 377 public CharMatcher and(CharMatcher other) { 378 return new And(this, other); 379 } 380 381 /** 382 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 383 */ 384 public CharMatcher or(CharMatcher other) { 385 return new Or(this, other); 386 } 387 388 /** 389 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 390 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 391 * worthwhile only if the precomputed matcher is queried many thousands of times. 392 * 393 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 394 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 395 * worthwhile tradeoff in a browser. 396 */ 397 public CharMatcher precomputed() { 398 return Platform.precomputeCharMatcher(this); 399 } 400 401 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 402 403 /** 404 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 405 * on {@link Platform} so that we can have different behavior in GWT. 406 * 407 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the 408 * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables 409 * for matchers that only match a few characters, and so on. In the worst-case scenario, it 410 * constructs an eight-kilobyte bit array and queries that. In many situations this produces a 411 * matcher which is faster to query than the original. 412 */ 413 @GwtIncompatible // SmallCharMatcher 414 CharMatcher precomputedInternal() { 415 final BitSet table = new BitSet(); 416 setBits(table); 417 int totalCharacters = table.cardinality(); 418 if (totalCharacters * 2 <= DISTINCT_CHARS) { 419 return precomputedPositive(totalCharacters, table, toString()); 420 } else { 421 // TODO(lowasser): is it worth it to worry about the last character of large matchers? 422 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 423 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 424 String suffix = ".negate()"; 425 final String description = toString(); 426 String negatedDescription = 427 description.endsWith(suffix) 428 ? description.substring(0, description.length() - suffix.length()) 429 : description + suffix; 430 return new NegatedFastMatcher( 431 precomputedPositive(negatedCharacters, table, negatedDescription)) { 432 @Override 433 public String toString() { 434 return description; 435 } 436 }; 437 } 438 } 439 440 /** 441 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 442 */ 443 @GwtIncompatible // SmallCharMatcher 444 private static CharMatcher precomputedPositive( 445 int totalCharacters, BitSet table, String description) { 446 switch (totalCharacters) { 447 case 0: 448 return none(); 449 case 1: 450 return is((char) table.nextSetBit(0)); 451 case 2: 452 char c1 = (char) table.nextSetBit(0); 453 char c2 = (char) table.nextSetBit(c1 + 1); 454 return isEither(c1, c2); 455 default: 456 return isSmall(totalCharacters, table.length()) 457 ? SmallCharMatcher.from(table, description) 458 : new BitSetMatcher(table, description); 459 } 460 } 461 462 @GwtIncompatible // SmallCharMatcher 463 private static boolean isSmall(int totalCharacters, int tableLength) { 464 return totalCharacters <= SmallCharMatcher.MAX_SIZE 465 && tableLength > (totalCharacters * 4 * Character.SIZE); 466 // err on the side of BitSetMatcher 467 } 468 469 /** Sets bits in {@code table} matched by this matcher. */ 470 @GwtIncompatible // used only from other GwtIncompatible code 471 void setBits(BitSet table) { 472 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 473 if (matches((char) c)) { 474 table.set(c); 475 } 476 } 477 } 478 479 // Text processing routines 480 481 /** 482 * Returns {@code true} if a character sequence contains at least one matching BMP character. 483 * Equivalent to {@code !matchesNoneOf(sequence)}. 484 * 485 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 486 * character, until this returns {@code true} or the end is reached. 487 * 488 * @param sequence the character sequence to examine, possibly empty 489 * @return {@code true} if this matcher matches at least one character in the sequence 490 * @since 8.0 491 */ 492 public boolean matchesAnyOf(CharSequence sequence) { 493 return !matchesNoneOf(sequence); 494 } 495 496 /** 497 * Returns {@code true} if a character sequence contains only matching BMP characters. 498 * 499 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 500 * character, until this returns {@code false} or the end is reached. 501 * 502 * @param sequence the character sequence to examine, possibly empty 503 * @return {@code true} if this matcher matches every character in the sequence, including when 504 * the sequence is empty 505 */ 506 public boolean matchesAllOf(CharSequence sequence) { 507 for (int i = sequence.length() - 1; i >= 0; i--) { 508 if (!matches(sequence.charAt(i))) { 509 return false; 510 } 511 } 512 return true; 513 } 514 515 /** 516 * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to 517 * {@code !matchesAnyOf(sequence)}. 518 * 519 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 520 * character, until this returns {@code true} or the end is reached. 521 * 522 * @param sequence the character sequence to examine, possibly empty 523 * @return {@code true} if this matcher matches no characters in the sequence, including when the 524 * sequence is empty 525 */ 526 public boolean matchesNoneOf(CharSequence sequence) { 527 return indexIn(sequence) == -1; 528 } 529 530 /** 531 * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if 532 * no matching character is present. 533 * 534 * <p>The default implementation iterates over the sequence in forward order calling {@link 535 * #matches} for each character. 536 * 537 * @param sequence the character sequence to examine from the beginning 538 * @return an index, or {@code -1} if no character matches 539 */ 540 public int indexIn(CharSequence sequence) { 541 return indexIn(sequence, 0); 542 } 543 544 /** 545 * Returns the index of the first matching BMP character in a character sequence, starting from a 546 * given position, or {@code -1} if no character matches after that position. 547 * 548 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 549 * start}, calling {@link #matches} for each character. 550 * 551 * @param sequence the character sequence to examine 552 * @param start the first index to examine; must be nonnegative and no greater than {@code 553 * sequence.length()} 554 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 555 * or {@code -1} if no character matches 556 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 557 * sequence.length()} 558 */ 559 public int indexIn(CharSequence sequence, int start) { 560 int length = sequence.length(); 561 checkPositionIndex(start, length); 562 for (int i = start; i < length; i++) { 563 if (matches(sequence.charAt(i))) { 564 return i; 565 } 566 } 567 return -1; 568 } 569 570 /** 571 * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if 572 * no matching character is present. 573 * 574 * <p>The default implementation iterates over the sequence in reverse order calling {@link 575 * #matches} for each character. 576 * 577 * @param sequence the character sequence to examine from the end 578 * @return an index, or {@code -1} if no character matches 579 */ 580 public int lastIndexIn(CharSequence sequence) { 581 for (int i = sequence.length() - 1; i >= 0; i--) { 582 if (matches(sequence.charAt(i))) { 583 return i; 584 } 585 } 586 return -1; 587 } 588 589 /** 590 * Returns the number of matching {@code char}s found in a character sequence. 591 * 592 * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}(). 593 */ 594 public int countIn(CharSequence sequence) { 595 int count = 0; 596 for (int i = 0; i < sequence.length(); i++) { 597 if (matches(sequence.charAt(i))) { 598 count++; 599 } 600 } 601 return count; 602 } 603 604 /** 605 * Returns a string containing all non-matching characters of a character sequence, in order. For 606 * example: 607 * 608 * <pre>{@code 609 * CharMatcher.is('a').removeFrom("bazaar") 610 * }</pre> 611 * 612 * ... returns {@code "bzr"}. 613 */ 614 public String removeFrom(CharSequence sequence) { 615 String string = sequence.toString(); 616 int pos = indexIn(string); 617 if (pos == -1) { 618 return string; 619 } 620 621 char[] chars = string.toCharArray(); 622 int spread = 1; 623 624 // This unusual loop comes from extensive benchmarking 625 OUT: 626 while (true) { 627 pos++; 628 while (true) { 629 if (pos == chars.length) { 630 break OUT; 631 } 632 if (matches(chars[pos])) { 633 break; 634 } 635 chars[pos - spread] = chars[pos]; 636 pos++; 637 } 638 spread++; 639 } 640 return new String(chars, 0, pos - spread); 641 } 642 643 /** 644 * Returns a string containing all matching BMP characters of a character sequence, in order. For 645 * example: 646 * 647 * <pre>{@code 648 * CharMatcher.is('a').retainFrom("bazaar") 649 * }</pre> 650 * 651 * ... returns {@code "aaa"}. 652 */ 653 public String retainFrom(CharSequence sequence) { 654 return negate().removeFrom(sequence); 655 } 656 657 /** 658 * Returns a string copy of the input character sequence, with each matching BMP character 659 * replaced by a given replacement character. For example: 660 * 661 * <pre>{@code 662 * CharMatcher.is('a').replaceFrom("radar", 'o') 663 * }</pre> 664 * 665 * ... returns {@code "rodor"}. 666 * 667 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 668 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 669 * character. 670 * 671 * @param sequence the character sequence to replace matching characters in 672 * @param replacement the character to append to the result string in place of each matching 673 * character in {@code sequence} 674 * @return the new string 675 */ 676 public String replaceFrom(CharSequence sequence, char replacement) { 677 String string = sequence.toString(); 678 int pos = indexIn(string); 679 if (pos == -1) { 680 return string; 681 } 682 char[] chars = string.toCharArray(); 683 chars[pos] = replacement; 684 for (int i = pos + 1; i < chars.length; i++) { 685 if (matches(chars[i])) { 686 chars[i] = replacement; 687 } 688 } 689 return new String(chars); 690 } 691 692 /** 693 * Returns a string copy of the input character sequence, with each matching BMP character 694 * replaced by a given replacement sequence. For example: 695 * 696 * <pre>{@code 697 * CharMatcher.is('a').replaceFrom("yaha", "oo") 698 * }</pre> 699 * 700 * ... returns {@code "yoohoo"}. 701 * 702 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 703 * off calling {@link #replaceFrom(CharSequence, char)} directly. 704 * 705 * @param sequence the character sequence to replace matching characters in 706 * @param replacement the characters to append to the result string in place of each matching 707 * character in {@code sequence} 708 * @return the new string 709 */ 710 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 711 int replacementLen = replacement.length(); 712 if (replacementLen == 0) { 713 return removeFrom(sequence); 714 } 715 if (replacementLen == 1) { 716 return replaceFrom(sequence, replacement.charAt(0)); 717 } 718 719 String string = sequence.toString(); 720 int pos = indexIn(string); 721 if (pos == -1) { 722 return string; 723 } 724 725 int len = string.length(); 726 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 727 728 int oldpos = 0; 729 do { 730 buf.append(string, oldpos, pos); 731 buf.append(replacement); 732 oldpos = pos + 1; 733 pos = indexIn(string, oldpos); 734 } while (pos != -1); 735 736 buf.append(string, oldpos, len); 737 return buf.toString(); 738 } 739 740 /** 741 * Returns a substring of the input character sequence that omits all matching BMP characters from 742 * the beginning and from the end of the string. For example: 743 * 744 * <pre>{@code 745 * CharMatcher.anyOf("ab").trimFrom("abacatbab") 746 * }</pre> 747 * 748 * ... returns {@code "cat"}. 749 * 750 * <p>Note that: 751 * 752 * <pre>{@code 753 * CharMatcher.inRange('\0', ' ').trimFrom(str) 754 * }</pre> 755 * 756 * ... is equivalent to {@link String#trim()}. 757 */ 758 public String trimFrom(CharSequence sequence) { 759 int len = sequence.length(); 760 int first; 761 int last; 762 763 for (first = 0; first < len; first++) { 764 if (!matches(sequence.charAt(first))) { 765 break; 766 } 767 } 768 for (last = len - 1; last > first; last--) { 769 if (!matches(sequence.charAt(last))) { 770 break; 771 } 772 } 773 774 return sequence.subSequence(first, last + 1).toString(); 775 } 776 777 /** 778 * Returns a substring of the input character sequence that omits all matching BMP characters from 779 * the beginning of the string. For example: 780 * 781 * <pre>{@code 782 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab") 783 * }</pre> 784 * 785 * ... returns {@code "catbab"}. 786 */ 787 public String trimLeadingFrom(CharSequence sequence) { 788 int len = sequence.length(); 789 for (int first = 0; first < len; first++) { 790 if (!matches(sequence.charAt(first))) { 791 return sequence.subSequence(first, len).toString(); 792 } 793 } 794 return ""; 795 } 796 797 /** 798 * Returns a substring of the input character sequence that omits all matching BMP characters from 799 * the end of the string. For example: 800 * 801 * <pre>{@code 802 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab") 803 * }</pre> 804 * 805 * ... returns {@code "abacat"}. 806 */ 807 public String trimTrailingFrom(CharSequence sequence) { 808 int len = sequence.length(); 809 for (int last = len - 1; last >= 0; last--) { 810 if (!matches(sequence.charAt(last))) { 811 return sequence.subSequence(0, last + 1).toString(); 812 } 813 } 814 return ""; 815 } 816 817 /** 818 * Returns a string copy of the input character sequence, with each group of consecutive matching 819 * BMP characters replaced by a single replacement character. For example: 820 * 821 * <pre>{@code 822 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-') 823 * }</pre> 824 * 825 * ... returns {@code "b-p-r"}. 826 * 827 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 828 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 829 * character. 830 * 831 * @param sequence the character sequence to replace matching groups of characters in 832 * @param replacement the character to append to the result string in place of each group of 833 * matching characters in {@code sequence} 834 * @return the new string 835 */ 836 public String collapseFrom(CharSequence sequence, char replacement) { 837 // This implementation avoids unnecessary allocation. 838 int len = sequence.length(); 839 for (int i = 0; i < len; i++) { 840 char c = sequence.charAt(i); 841 if (matches(c)) { 842 if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 843 // a no-op replacement 844 i++; 845 } else { 846 StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement); 847 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 848 } 849 } 850 } 851 // no replacement needed 852 return sequence.toString(); 853 } 854 855 /** 856 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 857 * groups of matching BMP characters at the start or end of the sequence are removed without 858 * replacement. 859 */ 860 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 861 // This implementation avoids unnecessary allocation. 862 int len = sequence.length(); 863 int first = 0; 864 int last = len - 1; 865 866 while (first < len && matches(sequence.charAt(first))) { 867 first++; 868 } 869 870 while (last > first && matches(sequence.charAt(last))) { 871 last--; 872 } 873 874 return (first == 0 && last == len - 1) 875 ? collapseFrom(sequence, replacement) 876 : finishCollapseFrom( 877 sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false); 878 } 879 880 private String finishCollapseFrom( 881 CharSequence sequence, 882 int start, 883 int end, 884 char replacement, 885 StringBuilder builder, 886 boolean inMatchingGroup) { 887 for (int i = start; i < end; i++) { 888 char c = sequence.charAt(i); 889 if (matches(c)) { 890 if (!inMatchingGroup) { 891 builder.append(replacement); 892 inMatchingGroup = true; 893 } 894 } else { 895 builder.append(c); 896 inMatchingGroup = false; 897 } 898 } 899 return builder.toString(); 900 } 901 902 /** 903 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 904 * instead. 905 */ 906 @Deprecated 907 @Override 908 public boolean apply(Character character) { 909 return matches(character); 910 } 911 912 /** 913 * Returns a string representation of this {@code CharMatcher}, such as {@code 914 * CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 915 */ 916 @Override 917 public String toString() { 918 return super.toString(); 919 } 920 921 /** 922 * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where 923 * "12AB" is the four hexadecimal digits representing the 16-bit code unit. 924 */ 925 private static String showCharacter(char c) { 926 String hex = "0123456789ABCDEF"; 927 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 928 for (int i = 0; i < 4; i++) { 929 tmp[5 - i] = hex.charAt(c & 0xF); 930 c = (char) (c >> 4); 931 } 932 return String.copyValueOf(tmp); 933 } 934 935 // Fast matchers 936 937 /** A matcher for which precomputation will not yield any significant benefit. */ 938 abstract static class FastMatcher extends CharMatcher { 939 940 @Override 941 public final CharMatcher precomputed() { 942 return this; 943 } 944 945 @Override 946 public CharMatcher negate() { 947 return new NegatedFastMatcher(this); 948 } 949 } 950 951 /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */ 952 abstract static class NamedFastMatcher extends FastMatcher { 953 954 private final String description; 955 956 NamedFastMatcher(String description) { 957 this.description = checkNotNull(description); 958 } 959 960 @Override 961 public final String toString() { 962 return description; 963 } 964 } 965 966 /** Negation of a {@link FastMatcher}. */ 967 static class NegatedFastMatcher extends Negated { 968 969 NegatedFastMatcher(CharMatcher original) { 970 super(original); 971 } 972 973 @Override 974 public final CharMatcher precomputed() { 975 return this; 976 } 977 } 978 979 /** Fast matcher using a {@link BitSet} table of matching characters. */ 980 @GwtIncompatible // used only from other GwtIncompatible code 981 private static final class BitSetMatcher extends NamedFastMatcher { 982 983 private final BitSet table; 984 985 private BitSetMatcher(BitSet table, String description) { 986 super(description); 987 if (table.length() + Long.SIZE < table.size()) { 988 table = (BitSet) table.clone(); 989 // If only we could actually call BitSet.trimToSize() ourselves... 990 } 991 this.table = table; 992 } 993 994 @Override 995 public boolean matches(char c) { 996 return table.get(c); 997 } 998 999 @Override 1000 void setBits(BitSet bitSet) { 1001 bitSet.or(table); 1002 } 1003 } 1004 1005 // Static constant implementation classes 1006 1007 /** Implementation of {@link #any()}. */ 1008 private static final class Any extends NamedFastMatcher { 1009 1010 static final Any INSTANCE = new Any(); 1011 1012 private Any() { 1013 super("CharMatcher.any()"); 1014 } 1015 1016 @Override 1017 public boolean matches(char c) { 1018 return true; 1019 } 1020 1021 @Override 1022 public int indexIn(CharSequence sequence) { 1023 return (sequence.length() == 0) ? -1 : 0; 1024 } 1025 1026 @Override 1027 public int indexIn(CharSequence sequence, int start) { 1028 int length = sequence.length(); 1029 checkPositionIndex(start, length); 1030 return (start == length) ? -1 : start; 1031 } 1032 1033 @Override 1034 public int lastIndexIn(CharSequence sequence) { 1035 return sequence.length() - 1; 1036 } 1037 1038 @Override 1039 public boolean matchesAllOf(CharSequence sequence) { 1040 checkNotNull(sequence); 1041 return true; 1042 } 1043 1044 @Override 1045 public boolean matchesNoneOf(CharSequence sequence) { 1046 return sequence.length() == 0; 1047 } 1048 1049 @Override 1050 public String removeFrom(CharSequence sequence) { 1051 checkNotNull(sequence); 1052 return ""; 1053 } 1054 1055 @Override 1056 public String replaceFrom(CharSequence sequence, char replacement) { 1057 char[] array = new char[sequence.length()]; 1058 Arrays.fill(array, replacement); 1059 return new String(array); 1060 } 1061 1062 @Override 1063 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1064 StringBuilder result = new StringBuilder(sequence.length() * replacement.length()); 1065 for (int i = 0; i < sequence.length(); i++) { 1066 result.append(replacement); 1067 } 1068 return result.toString(); 1069 } 1070 1071 @Override 1072 public String collapseFrom(CharSequence sequence, char replacement) { 1073 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 1074 } 1075 1076 @Override 1077 public String trimFrom(CharSequence sequence) { 1078 checkNotNull(sequence); 1079 return ""; 1080 } 1081 1082 @Override 1083 public int countIn(CharSequence sequence) { 1084 return sequence.length(); 1085 } 1086 1087 @Override 1088 public CharMatcher and(CharMatcher other) { 1089 return checkNotNull(other); 1090 } 1091 1092 @Override 1093 public CharMatcher or(CharMatcher other) { 1094 checkNotNull(other); 1095 return this; 1096 } 1097 1098 @Override 1099 public CharMatcher negate() { 1100 return none(); 1101 } 1102 } 1103 1104 /** Implementation of {@link #none()}. */ 1105 private static final class None extends NamedFastMatcher { 1106 1107 static final None INSTANCE = new None(); 1108 1109 private None() { 1110 super("CharMatcher.none()"); 1111 } 1112 1113 @Override 1114 public boolean matches(char c) { 1115 return false; 1116 } 1117 1118 @Override 1119 public int indexIn(CharSequence sequence) { 1120 checkNotNull(sequence); 1121 return -1; 1122 } 1123 1124 @Override 1125 public int indexIn(CharSequence sequence, int start) { 1126 int length = sequence.length(); 1127 checkPositionIndex(start, length); 1128 return -1; 1129 } 1130 1131 @Override 1132 public int lastIndexIn(CharSequence sequence) { 1133 checkNotNull(sequence); 1134 return -1; 1135 } 1136 1137 @Override 1138 public boolean matchesAllOf(CharSequence sequence) { 1139 return sequence.length() == 0; 1140 } 1141 1142 @Override 1143 public boolean matchesNoneOf(CharSequence sequence) { 1144 checkNotNull(sequence); 1145 return true; 1146 } 1147 1148 @Override 1149 public String removeFrom(CharSequence sequence) { 1150 return sequence.toString(); 1151 } 1152 1153 @Override 1154 public String replaceFrom(CharSequence sequence, char replacement) { 1155 return sequence.toString(); 1156 } 1157 1158 @Override 1159 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1160 checkNotNull(replacement); 1161 return sequence.toString(); 1162 } 1163 1164 @Override 1165 public String collapseFrom(CharSequence sequence, char replacement) { 1166 return sequence.toString(); 1167 } 1168 1169 @Override 1170 public String trimFrom(CharSequence sequence) { 1171 return sequence.toString(); 1172 } 1173 1174 @Override 1175 public String trimLeadingFrom(CharSequence sequence) { 1176 return sequence.toString(); 1177 } 1178 1179 @Override 1180 public String trimTrailingFrom(CharSequence sequence) { 1181 return sequence.toString(); 1182 } 1183 1184 @Override 1185 public int countIn(CharSequence sequence) { 1186 checkNotNull(sequence); 1187 return 0; 1188 } 1189 1190 @Override 1191 public CharMatcher and(CharMatcher other) { 1192 checkNotNull(other); 1193 return this; 1194 } 1195 1196 @Override 1197 public CharMatcher or(CharMatcher other) { 1198 return checkNotNull(other); 1199 } 1200 1201 @Override 1202 public CharMatcher negate() { 1203 return any(); 1204 } 1205 } 1206 1207 /** Implementation of {@link #whitespace()}. */ 1208 @VisibleForTesting 1209 static final class Whitespace extends NamedFastMatcher { 1210 1211 static final String TABLE = 1212 "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 1213 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 1214 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 1215 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 1216 static final int MULTIPLIER = 1682554634; 1217 static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1); 1218 1219 static final Whitespace INSTANCE = new Whitespace(); 1220 1221 Whitespace() { 1222 super("CharMatcher.whitespace()"); 1223 } 1224 1225 @Override 1226 public boolean matches(char c) { 1227 return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c; 1228 } 1229 1230 @GwtIncompatible // used only from other GwtIncompatible code 1231 @Override 1232 void setBits(BitSet table) { 1233 for (int i = 0; i < TABLE.length(); i++) { 1234 table.set(TABLE.charAt(i)); 1235 } 1236 } 1237 } 1238 1239 /** Implementation of {@link #breakingWhitespace()}. */ 1240 private static final class BreakingWhitespace extends CharMatcher { 1241 1242 static final CharMatcher INSTANCE = new BreakingWhitespace(); 1243 1244 @Override 1245 public boolean matches(char c) { 1246 switch (c) { 1247 case '\t': 1248 case '\n': 1249 case '\013': 1250 case '\f': 1251 case '\r': 1252 case ' ': 1253 case '\u0085': 1254 case '\u1680': 1255 case '\u2028': 1256 case '\u2029': 1257 case '\u205f': 1258 case '\u3000': 1259 return true; 1260 case '\u2007': 1261 return false; 1262 default: 1263 return c >= '\u2000' && c <= '\u200a'; 1264 } 1265 } 1266 1267 @Override 1268 public String toString() { 1269 return "CharMatcher.breakingWhitespace()"; 1270 } 1271 } 1272 1273 /** Implementation of {@link #ascii()}. */ 1274 private static final class Ascii extends NamedFastMatcher { 1275 1276 static final Ascii INSTANCE = new Ascii(); 1277 1278 Ascii() { 1279 super("CharMatcher.ascii()"); 1280 } 1281 1282 @Override 1283 public boolean matches(char c) { 1284 return c <= '\u007f'; 1285 } 1286 } 1287 1288 /** Implementation that matches characters that fall within multiple ranges. */ 1289 private static class RangesMatcher extends CharMatcher { 1290 1291 private final String description; 1292 private final char[] rangeStarts; 1293 private final char[] rangeEnds; 1294 1295 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 1296 this.description = description; 1297 this.rangeStarts = rangeStarts; 1298 this.rangeEnds = rangeEnds; 1299 checkArgument(rangeStarts.length == rangeEnds.length); 1300 for (int i = 0; i < rangeStarts.length; i++) { 1301 checkArgument(rangeStarts[i] <= rangeEnds[i]); 1302 if (i + 1 < rangeStarts.length) { 1303 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 1304 } 1305 } 1306 } 1307 1308 @Override 1309 public boolean matches(char c) { 1310 int index = Arrays.binarySearch(rangeStarts, c); 1311 if (index >= 0) { 1312 return true; 1313 } else { 1314 index = ~index - 1; 1315 return index >= 0 && c <= rangeEnds[index]; 1316 } 1317 } 1318 1319 @Override 1320 public String toString() { 1321 return description; 1322 } 1323 } 1324 1325 /** Implementation of {@link #digit()}. */ 1326 private static final class Digit extends RangesMatcher { 1327 // Plug the following UnicodeSet pattern into 1328 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1329 // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]] 1330 // and get the zeroes from there. 1331 1332 // Must be in ascending order. 1333 private static final String ZEROES = 1334 "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6" 1335 + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0" 1336 + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10"; 1337 1338 private static char[] zeroes() { 1339 return ZEROES.toCharArray(); 1340 } 1341 1342 private static char[] nines() { 1343 char[] nines = new char[ZEROES.length()]; 1344 for (int i = 0; i < ZEROES.length(); i++) { 1345 nines[i] = (char) (ZEROES.charAt(i) + 9); 1346 } 1347 return nines; 1348 } 1349 1350 static final Digit INSTANCE = new Digit(); 1351 1352 private Digit() { 1353 super("CharMatcher.digit()", zeroes(), nines()); 1354 } 1355 } 1356 1357 /** Implementation of {@link #javaDigit()}. */ 1358 private static final class JavaDigit extends CharMatcher { 1359 1360 static final JavaDigit INSTANCE = new JavaDigit(); 1361 1362 @Override 1363 public boolean matches(char c) { 1364 return Character.isDigit(c); 1365 } 1366 1367 @Override 1368 public String toString() { 1369 return "CharMatcher.javaDigit()"; 1370 } 1371 } 1372 1373 /** Implementation of {@link #javaLetter()}. */ 1374 private static final class JavaLetter extends CharMatcher { 1375 1376 static final JavaLetter INSTANCE = new JavaLetter(); 1377 1378 @Override 1379 public boolean matches(char c) { 1380 return Character.isLetter(c); 1381 } 1382 1383 @Override 1384 public String toString() { 1385 return "CharMatcher.javaLetter()"; 1386 } 1387 } 1388 1389 /** Implementation of {@link #javaLetterOrDigit()}. */ 1390 private static final class JavaLetterOrDigit extends CharMatcher { 1391 1392 static final JavaLetterOrDigit INSTANCE = new JavaLetterOrDigit(); 1393 1394 @Override 1395 public boolean matches(char c) { 1396 return Character.isLetterOrDigit(c); 1397 } 1398 1399 @Override 1400 public String toString() { 1401 return "CharMatcher.javaLetterOrDigit()"; 1402 } 1403 } 1404 1405 /** Implementation of {@link #javaUpperCase()}. */ 1406 private static final class JavaUpperCase extends CharMatcher { 1407 1408 static final JavaUpperCase INSTANCE = new JavaUpperCase(); 1409 1410 @Override 1411 public boolean matches(char c) { 1412 return Character.isUpperCase(c); 1413 } 1414 1415 @Override 1416 public String toString() { 1417 return "CharMatcher.javaUpperCase()"; 1418 } 1419 } 1420 1421 /** Implementation of {@link #javaLowerCase()}. */ 1422 private static final class JavaLowerCase extends CharMatcher { 1423 1424 static final JavaLowerCase INSTANCE = new JavaLowerCase(); 1425 1426 @Override 1427 public boolean matches(char c) { 1428 return Character.isLowerCase(c); 1429 } 1430 1431 @Override 1432 public String toString() { 1433 return "CharMatcher.javaLowerCase()"; 1434 } 1435 } 1436 1437 /** Implementation of {@link #javaIsoControl()}. */ 1438 private static final class JavaIsoControl extends NamedFastMatcher { 1439 1440 static final JavaIsoControl INSTANCE = new JavaIsoControl(); 1441 1442 private JavaIsoControl() { 1443 super("CharMatcher.javaIsoControl()"); 1444 } 1445 1446 @Override 1447 public boolean matches(char c) { 1448 return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f'); 1449 } 1450 } 1451 1452 /** Implementation of {@link #invisible()}. */ 1453 private static final class Invisible extends RangesMatcher { 1454 // Plug the following UnicodeSet pattern into 1455 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1456 // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]] 1457 // with the "Abbreviate" option, and get the ranges from there. 1458 private static final String RANGE_STARTS = 1459 "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u08e2\u1680\u180e\u2000\u2028\u205f\u2066" 1460 + "\u3000\ud800\ufeff\ufff9"; 1461 private static final String RANGE_ENDS = // inclusive ends 1462 "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u08e2\u1680\u180e\u200f\u202f\u2064\u206f" 1463 + "\u3000\uf8ff\ufeff\ufffb"; 1464 1465 static final Invisible INSTANCE = new Invisible(); 1466 1467 private Invisible() { 1468 super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray()); 1469 } 1470 } 1471 1472 /** Implementation of {@link #singleWidth()}. */ 1473 private static final class SingleWidth extends RangesMatcher { 1474 1475 static final SingleWidth INSTANCE = new SingleWidth(); 1476 1477 private SingleWidth() { 1478 super( 1479 "CharMatcher.singleWidth()", 1480 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 1481 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 1482 } 1483 } 1484 1485 // Non-static factory implementation classes 1486 1487 /** Implementation of {@link #negate()}. */ 1488 private static class Negated extends CharMatcher { 1489 1490 final CharMatcher original; 1491 1492 Negated(CharMatcher original) { 1493 this.original = checkNotNull(original); 1494 } 1495 1496 @Override 1497 public boolean matches(char c) { 1498 return !original.matches(c); 1499 } 1500 1501 @Override 1502 public boolean matchesAllOf(CharSequence sequence) { 1503 return original.matchesNoneOf(sequence); 1504 } 1505 1506 @Override 1507 public boolean matchesNoneOf(CharSequence sequence) { 1508 return original.matchesAllOf(sequence); 1509 } 1510 1511 @Override 1512 public int countIn(CharSequence sequence) { 1513 return sequence.length() - original.countIn(sequence); 1514 } 1515 1516 @GwtIncompatible // used only from other GwtIncompatible code 1517 @Override 1518 void setBits(BitSet table) { 1519 BitSet tmp = new BitSet(); 1520 original.setBits(tmp); 1521 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 1522 table.or(tmp); 1523 } 1524 1525 @Override 1526 public CharMatcher negate() { 1527 return original; 1528 } 1529 1530 @Override 1531 public String toString() { 1532 return original + ".negate()"; 1533 } 1534 } 1535 1536 /** Implementation of {@link #and(CharMatcher)}. */ 1537 private static final class And extends CharMatcher { 1538 1539 final CharMatcher first; 1540 final CharMatcher second; 1541 1542 And(CharMatcher a, CharMatcher b) { 1543 first = checkNotNull(a); 1544 second = checkNotNull(b); 1545 } 1546 1547 @Override 1548 public boolean matches(char c) { 1549 return first.matches(c) && second.matches(c); 1550 } 1551 1552 @GwtIncompatible // used only from other GwtIncompatible code 1553 @Override 1554 void setBits(BitSet table) { 1555 BitSet tmp1 = new BitSet(); 1556 first.setBits(tmp1); 1557 BitSet tmp2 = new BitSet(); 1558 second.setBits(tmp2); 1559 tmp1.and(tmp2); 1560 table.or(tmp1); 1561 } 1562 1563 @Override 1564 public String toString() { 1565 return "CharMatcher.and(" + first + ", " + second + ")"; 1566 } 1567 } 1568 1569 /** Implementation of {@link #or(CharMatcher)}. */ 1570 private static final class Or extends CharMatcher { 1571 1572 final CharMatcher first; 1573 final CharMatcher second; 1574 1575 Or(CharMatcher a, CharMatcher b) { 1576 first = checkNotNull(a); 1577 second = checkNotNull(b); 1578 } 1579 1580 @GwtIncompatible // used only from other GwtIncompatible code 1581 @Override 1582 void setBits(BitSet table) { 1583 first.setBits(table); 1584 second.setBits(table); 1585 } 1586 1587 @Override 1588 public boolean matches(char c) { 1589 return first.matches(c) || second.matches(c); 1590 } 1591 1592 @Override 1593 public String toString() { 1594 return "CharMatcher.or(" + first + ", " + second + ")"; 1595 } 1596 } 1597 1598 // Static factory implementations 1599 1600 /** Implementation of {@link #is(char)}. */ 1601 private static final class Is extends FastMatcher { 1602 1603 private final char match; 1604 1605 Is(char match) { 1606 this.match = match; 1607 } 1608 1609 @Override 1610 public boolean matches(char c) { 1611 return c == match; 1612 } 1613 1614 @Override 1615 public String replaceFrom(CharSequence sequence, char replacement) { 1616 return sequence.toString().replace(match, replacement); 1617 } 1618 1619 @Override 1620 public CharMatcher and(CharMatcher other) { 1621 return other.matches(match) ? this : none(); 1622 } 1623 1624 @Override 1625 public CharMatcher or(CharMatcher other) { 1626 return other.matches(match) ? other : super.or(other); 1627 } 1628 1629 @Override 1630 public CharMatcher negate() { 1631 return isNot(match); 1632 } 1633 1634 @GwtIncompatible // used only from other GwtIncompatible code 1635 @Override 1636 void setBits(BitSet table) { 1637 table.set(match); 1638 } 1639 1640 @Override 1641 public String toString() { 1642 return "CharMatcher.is('" + showCharacter(match) + "')"; 1643 } 1644 } 1645 1646 /** Implementation of {@link #isNot(char)}. */ 1647 private static final class IsNot extends FastMatcher { 1648 1649 private final char match; 1650 1651 IsNot(char match) { 1652 this.match = match; 1653 } 1654 1655 @Override 1656 public boolean matches(char c) { 1657 return c != match; 1658 } 1659 1660 @Override 1661 public CharMatcher and(CharMatcher other) { 1662 return other.matches(match) ? super.and(other) : other; 1663 } 1664 1665 @Override 1666 public CharMatcher or(CharMatcher other) { 1667 return other.matches(match) ? any() : this; 1668 } 1669 1670 @GwtIncompatible // used only from other GwtIncompatible code 1671 @Override 1672 void setBits(BitSet table) { 1673 table.set(0, match); 1674 table.set(match + 1, Character.MAX_VALUE + 1); 1675 } 1676 1677 @Override 1678 public CharMatcher negate() { 1679 return is(match); 1680 } 1681 1682 @Override 1683 public String toString() { 1684 return "CharMatcher.isNot('" + showCharacter(match) + "')"; 1685 } 1686 } 1687 1688 private static CharMatcher.IsEither isEither(char c1, char c2) { 1689 return new CharMatcher.IsEither(c1, c2); 1690 } 1691 1692 /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */ 1693 private static final class IsEither extends FastMatcher { 1694 1695 private final char match1; 1696 private final char match2; 1697 1698 IsEither(char match1, char match2) { 1699 this.match1 = match1; 1700 this.match2 = match2; 1701 } 1702 1703 @Override 1704 public boolean matches(char c) { 1705 return c == match1 || c == match2; 1706 } 1707 1708 @GwtIncompatible // used only from other GwtIncompatible code 1709 @Override 1710 void setBits(BitSet table) { 1711 table.set(match1); 1712 table.set(match2); 1713 } 1714 1715 @Override 1716 public String toString() { 1717 return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")"; 1718 } 1719 } 1720 1721 /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. */ 1722 private static final class AnyOf extends CharMatcher { 1723 1724 private final char[] chars; 1725 1726 public AnyOf(CharSequence chars) { 1727 this.chars = chars.toString().toCharArray(); 1728 Arrays.sort(this.chars); 1729 } 1730 1731 @Override 1732 public boolean matches(char c) { 1733 return Arrays.binarySearch(chars, c) >= 0; 1734 } 1735 1736 @Override 1737 @GwtIncompatible // used only from other GwtIncompatible code 1738 void setBits(BitSet table) { 1739 for (char c : chars) { 1740 table.set(c); 1741 } 1742 } 1743 1744 @Override 1745 public String toString() { 1746 StringBuilder description = new StringBuilder("CharMatcher.anyOf(\""); 1747 for (char c : chars) { 1748 description.append(showCharacter(c)); 1749 } 1750 description.append("\")"); 1751 return description.toString(); 1752 } 1753 } 1754 1755 /** Implementation of {@link #inRange(char, char)}. */ 1756 private static final class InRange extends FastMatcher { 1757 1758 private final char startInclusive; 1759 private final char endInclusive; 1760 1761 InRange(char startInclusive, char endInclusive) { 1762 checkArgument(endInclusive >= startInclusive); 1763 this.startInclusive = startInclusive; 1764 this.endInclusive = endInclusive; 1765 } 1766 1767 @Override 1768 public boolean matches(char c) { 1769 return startInclusive <= c && c <= endInclusive; 1770 } 1771 1772 @GwtIncompatible // used only from other GwtIncompatible code 1773 @Override 1774 void setBits(BitSet table) { 1775 table.set(startInclusive, endInclusive + 1); 1776 } 1777 1778 @Override 1779 public String toString() { 1780 return "CharMatcher.inRange('" 1781 + showCharacter(startInclusive) 1782 + "', '" 1783 + showCharacter(endInclusive) 1784 + "')"; 1785 } 1786 } 1787 1788 /** Implementation of {@link #forPredicate(Predicate)}. */ 1789 private static final class ForPredicate extends CharMatcher { 1790 1791 private final Predicate<? super Character> predicate; 1792 1793 ForPredicate(Predicate<? super Character> predicate) { 1794 this.predicate = checkNotNull(predicate); 1795 } 1796 1797 @Override 1798 public boolean matches(char c) { 1799 return predicate.apply(c); 1800 } 1801 1802 @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily 1803 @Override 1804 public boolean apply(Character character) { 1805 return predicate.apply(checkNotNull(character)); 1806 } 1807 1808 @Override 1809 public String toString() { 1810 return "CharMatcher.forPredicate(" + predicate + ")"; 1811 } 1812 } 1813}