Download NatOrderComparator.java
1: package com.wpollock.natordercomp; 2: 3: import java.math.BigInteger; 4: import java.util.Comparator; 5: 6: /** 7: * A natural order String Comparator. 8: * Natural order means to compare Strings character for character, 9: * except when there are runs of digits in the String. Each run of 10: * digits is compared numerically, not lexically (so "2" < "10"). 11: * <p> 12: * If two runs of digits compare equal numerically, the shorter number 13: * is less than the longer one (so "3" < "03"). 14: * <p> 15: * The general algorithm is inspired by 16: * <a href="https://en.wikipedia.org/wiki/Natural_sort_order">Natural 17: * Sort Order</a> on Wikipedia. 18: * Compare two Strings codepoint by codepoint, unless a digit is 19: * encountered. If the other String's codepoint is not a digit, 20: * the String with the digit is less. 21: * If both are numeric, for each String read all the digits and convert to 22: * a number. The smaller value is less; if values 23: * are the same, shorter length is less (to account for leading zeroes). If 24: * the numbers are the same, continue with the next codepoint If everything 25: * is the same, the shorter String is less. Otherwise, they are equal. 26: * <p> 27: * For this version, no options are provided (such as ignoring leading zeros 28: * or spaces, case-insensitive matching, or Unicode normalization). All 29: * such steps are assumed to be done by the caller. 30: * 31: * @version 1.0 (12/30/2015) 32: * @author Wayne Pollock, Tampa Florida USA 33: * 34: */ 35: 36: public class NatOrderComparator implements Comparator<String> { 37: 38: /* 39: * While the code strives for a clean, elegant solution, efficiency in a 40: * Comparator is important, so this version avoids using regular expressions 41: * and tries to avoid creating unnecessary objects. 42: */ 43: 44: // Someday, there should be versions with args for case-insensitive, 45: // ignore whitespace, and Unicode normalization & sanitization. For 46: // maximum flexibility, we will use the static factory pattern even 47: // though it isn't warranted yet. 48: 49: // Reusing noc is safe since NatOrderComparator objects are immutable. 50: private static NatOrderComparator noc = new NatOrderComparator(); 51: 52: /** 53: * No-arg constructor. 54: */ 55: private NatOrderComparator() { 56: } 57: 58: /** 59: * Factory method to get a reference to a NatOrderComparator. 60: * @return NatOrderComparator 61: */ 62: public static NatOrderComparator getInstance() { 63: return noc; 64: } 65: 66: /** 67: * Compares two Strings using a "natural" sorting order. 68: * 69: * @param s1 the first String to compare 70: * @param s2 the second String to compare 71: * @return {@literal 0 if s1 equals s2, 72: * <0 if s1 is naturally less than s2, 73: * >0 if s2 is naturally less than s1 74: * } 75: * @throws NullPointerException if either argument is null. This Exception 76: * is required by the Comparator contract, not IllegalArgumentException. 77: */ 78: 79: @Override 80: public int compare(final String s1, final String s2) { 81: if ( s1 == null || s2 == null ) 82: throw new NullPointerException("s1: " + s1 + ", s2: " + s2); 83: if ( s1.equals(s2)) // Includes zero-length Strings 84: return 0; 85: 86: // Walk the length of the shorter String: 87: if ( s1.length() == 0 ) 88: return -1; // s2.length must be greater than 0 89: if ( s2.length() == 0 ) 90: return 1; // s1.length must be greater than 0 91: final int len = Math.min(s1.length(), s2.length()); 92: 93: // Walk through the String, one codepoint at a time: 94: // (Since Java 8, you can do this with Streams: 95: // s.codePoints().forEach(c -> ...); 96: // for(int c : s.codePoints().toArray()) { ... } 97: // But I don't think those methods are as efficient.) 98: int cp1, cp2; 99: int i1 = 0; 100: while ( i1 < len ) { 101: cp1 = s1.codePointAt(i1); 102: cp2 = s2.codePointAt(i1); 103: 104: // Case one: neither is a digit: 105: if ( ! Character.isDigit(cp1) && ! Character.isDigit(cp2) ) { 106: if ( cp1 != cp2 ) 107: return cp1 - cp2; 108: i1 += Character.charCount(cp1); 109: } 110: // Case two: only one is numeric: 111: else if ( Character.isDigit(cp1) != Character.isDigit(cp2) ) { 112: return Character.isDigit(cp1) ? -1: 1; // numbers < letters 113: 114: } 115: // Case three: both are numeric: 116: else { 117: // Find longest run of digits in each String, and extract: 118: int numStart = i1, i2 = i1; 119: while ( Character.isDigit(cp1) && i1 < s1.length() ) { 120: i1 += Character.charCount(cp1); 121: if ( i1 >= s1.length() ) 122: break; 123: cp1 = s1.codePointAt(i1); 124: } 125: while ( Character.isDigit(cp2) && i2 < s2.length() ) { 126: i2 += Character.charCount(cp2); 127: if ( i2 >= s2.length() ) 128: break; 129: cp2 = s2.codePointAt(i2); 130: } 131: final String num1String = s1.substring(numStart, i1); 132: final String num2String = s2.substring(numStart, i2); 133: 134: // Convert to int (or BigInteger) and compare numerically: 135: try { 136: final int num1 = Integer.parseInt(num1String); 137: final int num2 = Integer.parseInt(num2String); 138: if ( num1 != num2 ) 139: return num1 - num2; 140: } catch (NumberFormatException nfe) { 141: 142: // If numbers are too big for type int, use BigIntegers: 143: final BigInteger num1 = new BigInteger(num1String); 144: final BigInteger num2 = new BigInteger(num2String); 145: if ( ! num1.equals(num2)) 146: return num1.subtract(num2).intValue(); 147: } 148: // Handle leading zero when values are equal; shorter is less: 149: if ( num1String.length() != num2String.length() ) 150: return num1String.length() - num2String.length(); 151: } 152: } 153: 154: // If we get to here, the Strings are the same unless one is longer: 155: return s1.length() - s2.length(); 156: } 157: }