001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import java.util.regex.Pattern;
021
022import org.apache.commons.codec.EncoderException;
023import org.apache.commons.codec.StringEncoder;
024
025/**
026 * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
027 * general purpose scheme to find word with similar phonemes.
028 * <p>
029 * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
030 * </p>
031 * <p>
032 * Algorithm description:
033 * </p>
034 * <pre>
035 * 1. Transcode first characters of name
036 *   1a. MAC -&gt;   MCC
037 *   1b. KN  -&gt;   NN
038 *   1c. K   -&gt;   C
039 *   1d. PH  -&gt;   FF
040 *   1e. PF  -&gt;   FF
041 *   1f. SCH -&gt;   SSS
042 * 2. Transcode last characters of name
043 *   2a. EE, IE          -&gt;   Y
044 *   2b. DT,RT,RD,NT,ND  -&gt;   D
045 * 3. First character of key = first character of name
046 * 4. Transcode remaining characters by following these rules, incrementing by one character each time
047 *   4a. EV  -&gt;   AF  else A,E,I,O,U -&gt; A
048 *   4b. Q   -&gt;   G
049 *   4c. Z   -&gt;   S
050 *   4d. M   -&gt;   N
051 *   4e. KN  -&gt;   N   else K -&gt; C
052 *   4f. SCH -&gt;   SSS
053 *   4g. PH  -&gt;   FF
054 *   4h. H   -&gt;   If previous or next is non-vowel, previous
055 *   4i. W   -&gt;   If previous is vowel, previous
056 *   4j. Add current to key if current != last key character
057 * 5. If last character is S, remove it
058 * 6. If last characters are AY, replace with Y
059 * 7. If last character is A, remove it
060 * 8. Collapse all strings of repeated characters
061 * 9. Add original first character of name as first character of key
062 * </pre>
063 * <p>
064 * This class is immutable and thread-safe.
065 * </p>
066 *
067 * @see <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
068 * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
069 * @see Soundex
070 * @since 1.7
071 */
072public class Nysiis implements StringEncoder {
073
074    private static final char[] CHARS_A   = { 'A' };
075    private static final char[] CHARS_AF  = { 'A', 'F' };
076    private static final char[] CHARS_C   = { 'C' };
077    private static final char[] CHARS_FF  = { 'F', 'F' };
078    private static final char[] CHARS_G   = { 'G' };
079    private static final char[] CHARS_N   = { 'N' };
080    private static final char[] CHARS_NN  = { 'N', 'N' };
081    private static final char[] CHARS_S   = { 'S' };
082    private static final char[] CHARS_SSS = { 'S', 'S', 'S' };
083
084    private static final Pattern PAT_MAC    = Pattern.compile("^MAC");
085    private static final Pattern PAT_KN     = Pattern.compile("^KN");
086    private static final Pattern PAT_K      = Pattern.compile("^K");
087    private static final Pattern PAT_PH_PF  = Pattern.compile("^(PH|PF)");
088    private static final Pattern PAT_SCH    = Pattern.compile("^SCH");
089    private static final Pattern PAT_EE_IE  = Pattern.compile("(EE|IE)$");
090    private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
091
092    private static final char SPACE = ' ';
093    private static final int TRUE_LENGTH = 6;
094
095    /** Indicates the strict mode. */
096    private final boolean strict;
097
098    /**
099     * Tests if the given character is a vowel.
100     *
101     * @param c
102     *            the character to test
103     * @return {@code true} if the character is a vowel, {@code false} otherwise
104     */
105    private static boolean isVowel(final char c) {
106        return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
107    }
108
109    /**
110     * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at
111     * a time: [i-1, i, i+1, i+2].
112     *
113     * @param prev
114     *            the previous character
115     * @param curr
116     *            the current character
117     * @param next
118     *            the next character
119     * @param aNext
120     *            the after next character
121     * @return a transcoded array of characters, starting from the current position
122     */
123    private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
124        // 1. EV -> AF
125        if (curr == 'E' && next == 'V') {
126            return CHARS_AF;
127        }
128
129        // A, E, I, O, U -> A
130        if (isVowel(curr)) {
131            return CHARS_A;
132        }
133
134        // 2. Q -> G, Z -> S, M -> N
135
136
137        // 3. KN -> NN else K -> C
138        switch (curr) {
139        case 'Q':
140            return CHARS_G;
141        case 'Z':
142            return CHARS_S;
143        case 'M':
144            return CHARS_N;
145        case 'K':
146            if (next == 'N') {
147                return CHARS_NN;
148            }
149            return CHARS_C;
150        default:
151            break;
152        }
153
154        // 4. SCH -> SSS
155        if (curr == 'S' && next == 'C' && aNext == 'H') {
156            return CHARS_SSS;
157        }
158
159        // PH -> FF
160        if (curr == 'P' && next == 'H') {
161            return CHARS_FF;
162        }
163
164        // 5. H -> If previous or next is a non vowel, previous.
165        if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
166            return new char[] { prev };
167        }
168
169        // 6. W -> If previous is vowel, previous.
170        if (curr == 'W' && isVowel(prev)) {
171            return new char[] { prev };
172        }
173
174        return new char[] { curr };
175    }
176
177    /**
178     * Creates an instance of the {@link Nysiis} encoder with strict mode (original form),
179     * i.e. encoded strings have a maximum length of 6.
180     */
181    public Nysiis() {
182        this(true);
183    }
184
185    /**
186     * Create an instance of the {@link Nysiis} encoder with the specified strict mode:
187     *
188     * <ul>
189     *  <li>{@code true}: encoded strings have a maximum length of 6</li>
190     *  <li>{@code false}: encoded strings may have arbitrary length</li>
191     * </ul>
192     *
193     * @param strict
194     *            the strict mode
195     */
196    public Nysiis(final boolean strict) {
197        this.strict = strict;
198    }
199
200    /**
201     * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the
202     * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type
203     * {@link String}.
204     *
205     * @param obj
206     *            Object to encode
207     * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String.
208     * @throws EncoderException
209     *            if the parameter supplied is not of a {@link String}
210     * @throws IllegalArgumentException
211     *            if a character is not mapped
212     */
213    @Override
214    public Object encode(final Object obj) throws EncoderException {
215        if (!(obj instanceof String)) {
216            throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
217        }
218        return this.nysiis((String) obj);
219    }
220
221    /**
222     * Encodes a String using the NYSIIS algorithm.
223     *
224     * @param str
225     *            A String object to encode
226     * @return A Nysiis code corresponding to the String supplied
227     * @throws IllegalArgumentException
228     *            if a character is not mapped
229     */
230    @Override
231    public String encode(final String str) {
232        return this.nysiis(str);
233    }
234
235    /**
236     * Indicates the strict mode for this {@link Nysiis} encoder.
237     *
238     * @return {@code true} if the encoder is configured for strict mode, {@code false} otherwise
239     */
240    public boolean isStrict() {
241        return this.strict;
242    }
243
244    /**
245     * Retrieves the NYSIIS code for a given String object.
246     *
247     * @param str
248     *            String to encode using the NYSIIS algorithm
249     * @return A NYSIIS code for the String supplied
250     */
251    public String nysiis(String str) {
252        if (str == null) {
253            return null;
254        }
255
256        // Use the same clean rules as Soundex
257        str = SoundexUtils.clean(str);
258
259        if (str.isEmpty()) {
260            return str;
261        }
262
263        // Translate first characters of name:
264        // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
265        str = PAT_MAC.matcher(str).replaceFirst("MCC");
266        str = PAT_KN.matcher(str).replaceFirst("NN");
267        str = PAT_K.matcher(str).replaceFirst("C");
268        str = PAT_PH_PF.matcher(str).replaceFirst("FF");
269        str = PAT_SCH.matcher(str).replaceFirst("SSS");
270
271        // Translate last characters of name:
272        // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
273        str = PAT_EE_IE.matcher(str).replaceFirst("Y");
274        str = PAT_DT_ETC.matcher(str).replaceFirst("D");
275
276        // First character of key = first character of name.
277        final StringBuilder key = new StringBuilder(str.length());
278        key.append(str.charAt(0));
279
280        // Transcode remaining characters, incrementing by one character each time
281        final char[] chars = str.toCharArray();
282        final int len = chars.length;
283
284        for (int i = 1; i < len; i++) {
285            final char next = i < len - 1 ? chars[i + 1] : SPACE;
286            final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
287            final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
288            System.arraycopy(transcoded, 0, chars, i, transcoded.length);
289
290            // only append the current char to the key if it is different from the last one
291            if (chars[i] != chars[i - 1]) {
292                key.append(chars[i]);
293            }
294        }
295
296        if (key.length() > 1) {
297            char lastChar = key.charAt(key.length() - 1);
298
299            // If last character is S, remove it.
300            if (lastChar == 'S') {
301                key.deleteCharAt(key.length() - 1);
302                lastChar = key.charAt(key.length() - 1);
303            }
304
305            if (key.length() > 2) {
306                final char last2Char = key.charAt(key.length() - 2);
307                // If last characters are AY, replace with Y.
308                if (last2Char == 'A' && lastChar == 'Y') {
309                    key.deleteCharAt(key.length() - 2);
310                }
311            }
312
313            // If last character is A, remove it.
314            if (lastChar == 'A') {
315                key.deleteCharAt(key.length() - 1);
316            }
317        }
318
319        final String string = key.toString();
320        return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
321    }
322
323}