// File: /home/wpollock1/public_html/AJava/BreakIterDemo.java
// BreakIterator demo
// Shows a correct way to process a String one Unicode character
// at a time. This is difficult, as multi-char characters may
// be in the input either by accident, by non-English users,
// or by attackers attempting to break your software.
//
// Written 6/2012 by Wayne Pollock, Tampa Florida USA
// Inspired by the BreakIterator Java docs, and the incorrect code
// at <https://www.securecoding.cert.org/confluence/display/java/
// IDS10-J.+Do+not+split+characters+between+two+data+structures#IDS10-JDonotsplit
// charactersbetweentwodatastructures-CompliantSolutionSubstring>
import java.text.*;
// Command-line demo: for each argument, prints the argument followed by
// its leading run of letters, stepping through the text one BreakIterator
// "character" at a time rather than one Java char at a time.
class BreakIterDemo
{
    // Prints one line per argument in the form:
    //   <argument> --> <leading letters of argument>
    public static void main ( String [] args ) {
        for ( String arg : args ) {
            String leading = getLeadingText( arg );
            System.out.println( arg + " --> " + leading );
        }
    }

    // Returns the longest prefix of source in which every character (as
    // delimited by BreakIterator's character instance for the default
    // locale) begins with a code point accepted by Character.isLetter.
    // Only the first code point of each character is tested, so trailing
    // code points in the same character (e.g. combining marks) are kept
    // with it. If no character fails the test -- including when source
    // is empty -- source itself is returned.
    private static String getLeadingText( String source ) {
        final BreakIterator boundaries = BreakIterator.getCharacterInstance();
        boundaries.setText( source );

        // [charStart, charEnd) brackets the character under examination.
        // charEnd becomes DONE once charStart has reached the end of text.
        int charStart = boundaries.first();
        int charEnd = boundaries.next();

        while ( charEnd != BreakIterator.DONE ) {
            int firstCodePoint = source.codePointAt( charStart );
            if ( ! Character.isLetter( firstCodePoint ) ) {
                // First non-letter found: keep only what came before it.
                return source.substring( 0, charStart );
            }
            charStart = charEnd;
            charEnd = boundaries.next();
        }

        // Ran off the end without meeting a non-letter.
        return source;
    }
}