BreakIterDemo.java
Download BreakIterDemo.java
// BreakIterator demo
// Shows a correct way to process a String one Unicode character
// at a time. This is difficult, because characters that occupy more
// than one Java char may appear in the input by accident, because the
// users write in a language other than English, or because attackers
// are attempting to break your software.
//
// Written 6/2012 by Wayne Pollock, Tampa, Florida, USA
// Inspired by the BreakIterator Java docs, and the incorrect code
// at <https://www.securecoding.cert.org/confluence/display/java/
// IDS10-J.+Do+not+split+characters+between+two+data+structures#IDS10-JDonotsplit
// charactersbetweentwodatastructures-CompliantSolutionSubstring>
12:
13: import java.text.*;
14:
/**
 * Demonstrates walking a String one user-perceived character at a time
 * with {@link BreakIterator}, so that a character stored in several Java
 * {@code char}s is never cut in half.
 */
class BreakIterDemo
{
    /**
     * Prints one line per command-line argument: the argument itself,
     * the text {@code " --> "}, and the argument's leading run of letters.
     *
     * @param args the strings to examine
     */
    public static void main ( String [] args ) {
        for ( String source : args ) {
            System.out.println( source + " --> " + getLeadingText( source ) );
        }
    }

    /**
     * Returns the longest prefix of {@code source} in which every
     * character, as delimited by a character-instance BreakIterator,
     * starts with a code point that {@link Character#isLetter(int)} accepts.
     * Only the first code point of each character is tested.
     *
     * @param source the text to scan
     * @return the leading letters of {@code source}; the whole string if
     *         every character qualifies, or the empty string if the first
     *         character does not
     */
    private static String getLeadingText( String source ) {
        final BreakIterator iter = BreakIterator.getCharacterInstance();
        iter.setText( source );

        // 'start' is the boundary where the current character begins and
        // 'end' is the boundary where it stops; both advance together.
        int start = iter.first();
        for ( int end = iter.next();
              end != BreakIterator.DONE;
              start = end, end = iter.next() )
        {
            // codePointAt() reads a whole code point, so a surrogate pair
            // is classified as one value rather than as two halves.
            if ( ! Character.isLetter( source.codePointAt( start ) ) ) {
                break;  // end of leading good text
            }
        }

        // After a break, 'start' is the boundary of the first rejected
        // character. If the loop ran to completion, 'start' has reached
        // the last boundary (the end of the text), so the same substring
        // call yields the entire input.
        return source.substring( 0, start );
    }
}