BreakIterDemo.java

Download BreakIterDemo.java

// BreakIterator demo
// Shows a correct way to process a String one Unicode character
// at a time.  This is difficult, as multi-char characters may
// be in the input either by accident, by non-English users,
// or by attackers attempting to break your software.
//
// Written 6/2012 by Wayne Pollock, Tampa Florida USA
// Inspired by the BreakIterator Java docs, and the incorrect code
// at <https://www.securecoding.cert.org/confluence/display/java/
// IDS10-J.+Do+not+split+characters+between+two+data+structures#IDS10-JDonotsplit
// charactersbetweentwodatastructures-CompliantSolutionSubstring>
12: 
import java.text.*;
14: 
/**
 * Command-line demo that extracts the leading run of letters from each
 * argument, walking the text one user-perceived character at a time with a
 * {@link BreakIterator} so that a character made of several {@code char}
 * values is never split.
 */
class BreakIterDemo
{
  /**
   * Prints every argument followed by {@code " --> "} and the leading
   * letters-only prefix of that argument, one line per argument.
   *
   * @param args the strings to examine
   */
  public static void main ( String [] args ) {
    for ( String source : args ) {
      System.out.println( source + " --> " + getLeadingText( source ) );
    }
  }

  /**
   * Returns the longest prefix of {@code source} in which every character
   * (as delimited by the character break iterator) starts with a letter.
   *
   * @param source the text to scan
   * @return {@code source} itself when no non-letter character is found
   *         (including the empty string); otherwise the text that precedes
   *         the first non-letter character
   */
  private static String getLeadingText( String source ) {
    final BreakIterator iter = BreakIterator.getCharacterInstance();
    iter.setText( source );

    // "start" is the boundary where the current character begins and "end"
    // is the boundary just past it; a character may span several chars.
    int start = iter.first();
    for ( int end = iter.next();
          end != BreakIterator.DONE;
          end = iter.next() )
    {
      // Only the first code point of the character is classified; any
      // further code points inside the same boundary pair are not inspected.
      if ( ! Character.isLetter( source.codePointAt( start ) ) ) {
        return source.substring( 0, start );  // end of leading good text
      }
      start = end;
    }

    // Every character up to the end of the text was a letter.
    return source;
  }
}