NatOrderComparator.java

Download NatOrderComparator.java

  1: package com.wpollock.natordercomp;
  2: 
  3: import java.math.BigInteger;
  4: import java.util.Comparator;
  5: 
  6: /**
  7:  * A natural order String Comparator.
  8:  * Natural order means to compare Strings character for character,
  9:  * except when there are runs of digits in the String.  Each run of
 10:  * digits is compared numerically, not lexically (so "2" < "10").
 11:  * <p>
 12:  * If two runs of digits compare equal numerically, the shorter number
 13:  * is less than the longer one (so "3" < "03").
 14:  * <p>
 15:  * The general algorithm is inspired by
 16:  * <a href="https://en.wikipedia.org/wiki/Natural_sort_order">Natural
 17:  * Sort Order</a> on Wikipedia.
 18:  * Compare two Strings codepoint by codepoint, unless a digit is
 19:  * encountered.  If the other String's codepoint is not a digit,
 20:  * the String with the digit is less.
 21:  * If both are numeric, for each String read all the digits and convert to
 22:  * a number.  The smaller value is less; if values
 23:  * are the same, shorter length is less (to account for leading zeroes). If
 24:  * the numbers are the same, continue with the next codepoint  If everything
 25:  * is the same, the shorter String is less.  Otherwise, they are equal.
 26:  * <p>
 27:  * For this version, no options are provided (such as ignoring leading zeros
 28:  * or spaces, case-insensitive matching, or Unicode normalization).  All
 29:  * such steps are assumed to be done by the caller.
 30:  *
 31:  * @version 1.0 (12/30/2015)
 32:  * @author Wayne Pollock, Tampa Florida USA
 33:  *
 34:  */
 35: 
 36: public class NatOrderComparator implements Comparator<String> {
 37: 
 38:     /*
 39:      * While the code strives for a clean, elegant solution, efficiency in a
 40:      * Comparator is important, so this version avoids using regular expressions
 41:      * and tries to avoid creating unnecessary objects.
 42:      */
 43: 
 44:     // Someday, there should be versions with args for case-insensitive,
 45:     // ignore whitespace, and Unicode normalization & sanitization.  For
 46:     // maximum flexibility, we will use the static factory pattern even
 47:     // though it isn't warranted yet.
 48: 
 49:     // Reusing noc is safe since NatOrderComparator objects are immutable.
 50:     private static NatOrderComparator noc = new NatOrderComparator();
 51: 
 52:     /**
 53:      * No-arg constructor.
 54:      */
 55:     private NatOrderComparator() {
 56:     }
 57: 
 58:     /**
 59:      * Factory method to get a reference to a NatOrderComparator.
 60:      * @return NatOrderComparator
 61:      */
 62:     public static NatOrderComparator getInstance() {
 63:         return noc;
 64:     }
 65: 
 66:     /**
 67:      * Compares two Strings using a "natural" sorting order.
 68:      *
 69:      * @param s1 the first String to compare
 70:      * @param s2 the second String to compare
 71:      * @return {@literal 0 if s1 equals s2,
 72:      *         <0 if s1 is naturally less than s2,
 73:      *         >0 if s2 is naturally less than s1
 74:      *         }
 75:      * @throws NullPointerException if either argument is null.  This Exception
 76:      * is required by the Comparator contract, not IllegalArgumentException.
 77:      */
 78: 
 79:     @Override
 80:     public int compare(final String s1, final String s2) {
 81:         if ( s1 == null || s2 == null )
 82:             throw new NullPointerException("s1: " + s1 + ", s2: " + s2);
 83:         if ( s1.equals(s2))  // Includes zero-length Strings
 84:             return 0;
 85: 
 86:         // Walk the length of the shorter String:
 87:         if ( s1.length() == 0 )
 88:             return -1;  // s2.length must be greater than 0
 89:         if ( s2.length() == 0 )
 90:             return 1;  // s1.length must be greater than 0
 91:         final int len = Math.min(s1.length(), s2.length());
 92: 
 93:         // Walk through the String, one codepoint at a time:
 94:         // (Since Java 8, you can do this with Streams:
 95:         // s.codePoints().forEach(c -> ...);
 96:         // for(int c : s.codePoints().toArray()) { ... }
 97:         // But I don't think those methods are as efficient.)
 98:         int cp1, cp2;
 99:         int i1 = 0;
100:         while ( i1 < len ) {
101:             cp1 = s1.codePointAt(i1);
102:             cp2 = s2.codePointAt(i1);
103: 
104:             // Case one: neither is a digit:
105:             if ( ! Character.isDigit(cp1) && ! Character.isDigit(cp2) ) {
106:                 if ( cp1 != cp2 )
107:                     return cp1 - cp2;
108:                 i1 += Character.charCount(cp1);
109:             }
110:             // Case two: only one is numeric:
111:             else if ( Character.isDigit(cp1) != Character.isDigit(cp2) ) {
112:                 return Character.isDigit(cp1) ? -1: 1;  // numbers < letters
113: 
114:             }
115:             // Case three: both are numeric:
116:             else {
117:                 // Find longest run of digits in each String, and extract:
118:                 int numStart = i1, i2 = i1;
119:                 while ( Character.isDigit(cp1) && i1 < s1.length() ) {
120:                     i1 += Character.charCount(cp1);
121:                     if ( i1 >= s1.length() )
122:                         break;
123:                     cp1 = s1.codePointAt(i1);
124:                 }
125:                 while ( Character.isDigit(cp2) && i2 < s2.length() ) {
126:                     i2 += Character.charCount(cp2);
127:                     if ( i2 >= s2.length() )
128:                         break;
129:                     cp2 = s2.codePointAt(i2);
130:                 }
131:                 final String num1String = s1.substring(numStart, i1);
132:                 final String num2String = s2.substring(numStart, i2);
133: 
134:                 // Convert to int (or BigInteger) and compare numerically:
135:                 try {
136:                     final int num1 = Integer.parseInt(num1String);
137:                     final int num2 = Integer.parseInt(num2String);
138:                     if ( num1 != num2 )
139:                         return num1 - num2;
140:                 } catch (NumberFormatException nfe) {
141: 
142:                     // If numbers are too big for type int, use BigIntegers:
143:                     final BigInteger num1 = new BigInteger(num1String);
144:                     final BigInteger num2 = new BigInteger(num2String);
145:                     if ( ! num1.equals(num2))
146:                         return num1.subtract(num2).intValue();
147:                 }
148:                 // Handle leading zero when values are equal; shorter is less:
149:                 if ( num1String.length() != num2String.length() )
150:                     return num1String.length() - num2String.length();
151:             }
152:         }
153: 
154:         // If we get to here, the Strings are the same unless one is longer:
155:         return s1.length() - s2.length();
156:     }
157: }