001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022import org.apache.commons.codec.binary.StringUtils;
023
024/**
 * Encodes a string into a double metaphone value. This implementation is based on the algorithm by <CITE>Lawrence
026 * Philips</CITE>.
027 * <p>
028 * This class is conditionally thread-safe. The instance field for the maximum code length is mutable
029 * {@link #setMaxCodeLen(int)} but is not volatile, and accesses are not synchronized. If an instance of the class is
030 * shared between threads, the caller needs to ensure that suitable synchronization is used to ensure safe publication
031 * of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} after initial setup.
032 * </p>
033 *
034 * @see <a href="http://drdobbs.com/184401251?pgno=2">Original Article</a>
035 * @see <a href="http://en.wikipedia.org/wiki/Metaphone">http://en.wikipedia.org/wiki/Metaphone</a>
036 */
037public class DoubleMetaphone implements StringEncoder {
038
039    /**
040     * "Vowels" to test for
041     */
042    private static final String VOWELS = "AEIOUY";
043
044    /**
045     * Prefixes when present which are not pronounced
046     */
047    private static final String[] SILENT_START =
048        { "GN", "KN", "PN", "WR", "PS" };
049    private static final String[] L_R_N_M_B_H_F_V_W_SPACE =
050        { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
051    private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
052        { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
053    private static final String[] L_T_K_S_N_M_B_Z =
054        { "L", "T", "K", "S", "N", "M", "B", "Z" };
055
056    /**
057     * Maximum length of an encoding, default is 4
058     */
059    private int maxCodeLen = 4;
060
061    /**
062     * Encode a value with Double Metaphone.
063     *
064     * @param value String to encode
065     * @return an encoded string
066     */
067    public String doubleMetaphone(final String value) {
068        return doubleMetaphone(value, false);
069    }
070
071    /**
072     * Encode a value with Double Metaphone, optionally using the alternate encoding.
073     *
074     * @param value String to encode
075     * @param alternate use alternate encode
076     * @return an encoded string
077     */
078    public String doubleMetaphone(String value, final boolean alternate) {
079        value = cleanInput(value);
080        if (value == null) {
081            return null;
082        }
083
084        final boolean slavoGermanic = isSlavoGermanic(value);
085        int index = isSilentStart(value) ? 1 : 0;
086
087        final DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());
088
089        while (!result.isComplete() && index <= value.length() - 1) {
090            switch (value.charAt(index)) {
091            case 'A':
092            case 'E':
093            case 'I':
094            case 'O':
095            case 'U':
096            case 'Y':
097                index = handleAEIOUY(result, index);
098                break;
099            case 'B':
100                result.append('P');
101                index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
102                break;
103            case '\u00C7':
104                // A C with a Cedilla
105                result.append('S');
106                index++;
107                break;
108            case 'C':
109                index = handleC(value, result, index);
110                break;
111            case 'D':
112                index = handleD(value, result, index);
113                break;
114            case 'F':
115                result.append('F');
116                index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
117                break;
118            case 'G':
119                index = handleG(value, result, index, slavoGermanic);
120                break;
121            case 'H':
122                index = handleH(value, result, index);
123                break;
124            case 'J':
125                index = handleJ(value, result, index, slavoGermanic);
126                break;
127            case 'K':
128                result.append('K');
129                index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
130                break;
131            case 'L':
132                index = handleL(value, result, index);
133                break;
134            case 'M':
135                result.append('M');
136                index = conditionM0(value, index) ? index + 2 : index + 1;
137                break;
138            case 'N':
139                result.append('N');
140                index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
141                break;
142            case '\u00D1':
143                // N with a tilde (spanish ene)
144                result.append('N');
145                index++;
146                break;
147            case 'P':
148                index = handleP(value, result, index);
149                break;
150            case 'Q':
151                result.append('K');
152                index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
153                break;
154            case 'R':
155                index = handleR(value, result, index, slavoGermanic);
156                break;
157            case 'S':
158                index = handleS(value, result, index, slavoGermanic);
159                break;
160            case 'T':
161                index = handleT(value, result, index);
162                break;
163            case 'V':
164                result.append('F');
165                index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
166                break;
167            case 'W':
168                index = handleW(value, result, index);
169                break;
170            case 'X':
171                index = handleX(value, result, index);
172                break;
173            case 'Z':
174                index = handleZ(value, result, index, slavoGermanic);
175                break;
176            default:
177                index++;
178                break;
179            }
180        }
181
182        return alternate ? result.getAlternate() : result.getPrimary();
183    }
184
185    /**
186     * Encode the value using DoubleMetaphone.  It will only work if
187     * {@code obj} is a {@code String} (like {@code Metaphone}).
188     *
189     * @param obj Object to encode (should be of type String)
190     * @return An encoded Object (will be of type String)
191     * @throws EncoderException encode parameter is not of type String
192     */
193    @Override
194    public Object encode(final Object obj) throws EncoderException {
195        if (!(obj instanceof String)) {
196            throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
197        }
198        return doubleMetaphone((String) obj);
199    }
200
201    /**
202     * Encode the value using DoubleMetaphone.
203     *
204     * @param value String to encode
205     * @return An encoded String
206     */
207    @Override
208    public String encode(final String value) {
209        return doubleMetaphone(value);
210    }
211
212    /**
213     * Check if the Double Metaphone values of two {@code String} values
214     * are equal.
215     *
216     * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
217     * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
218     * @return {@code true} if the encoded {@code String}s are equal;
219     *          {@code false} otherwise.
220     * @see #isDoubleMetaphoneEqual(String,String,boolean)
221     */
222    public boolean isDoubleMetaphoneEqual(final String value1, final String value2) {
223        return isDoubleMetaphoneEqual(value1, value2, false);
224    }
225
226    /**
227     * Check if the Double Metaphone values of two {@code String} values
228     * are equal, optionally using the alternate value.
229     *
230     * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
231     * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
232     * @param alternate use the alternate value if {@code true}.
233     * @return {@code true} if the encoded {@code String}s are equal;
234     *          {@code false} otherwise.
235     */
236    public boolean isDoubleMetaphoneEqual(final String value1, final String value2, final boolean alternate) {
237        return StringUtils.equals(doubleMetaphone(value1, alternate), doubleMetaphone(value2, alternate));
238    }
239
240    /**
241     * Returns the maxCodeLen.
242     * @return int
243     */
244    public int getMaxCodeLen() {
245        return this.maxCodeLen;
246    }
247
248    /**
249     * Sets the maxCodeLen.
250     * @param maxCodeLen The maxCodeLen to set
251     */
252    public void setMaxCodeLen(final int maxCodeLen) {
253        this.maxCodeLen = maxCodeLen;
254    }
255
256    //-- BEGIN HANDLERS --//
257
258    /**
259     * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases.
260     */
261    private int handleAEIOUY(final DoubleMetaphoneResult result, final int index) {
262        if (index == 0) {
263            result.append('A');
264        }
265        return index + 1;
266    }
267
268    /**
269     * Handles 'C' cases.
270     */
271    private int handleC(final String value, final DoubleMetaphoneResult result, int index) {
272        if (conditionC0(value, index)) {  // very confusing, moved out
273            result.append('K');
274            index += 2;
275        } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
276            result.append('S');
277            index += 2;
278        } else if (contains(value, index, 2, "CH")) {
279            index = handleCH(value, result, index);
280        } else if (contains(value, index, 2, "CZ") &&
281                   !contains(value, index - 2, 4, "WICZ")) {
282            //-- "Czerny" --//
283            result.append('S', 'X');
284            index += 2;
285        } else if (contains(value, index + 1, 3, "CIA")) {
286            //-- "focaccia" --//
287            result.append('X');
288            index += 3;
289        } else if (contains(value, index, 2, "CC") &&
290                   !(index == 1 && charAt(value, 0) == 'M')) {
291            //-- double "cc" but not "McClelland" --//
292            return handleCC(value, result, index);
293        } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
294            result.append('K');
295            index += 2;
296        } else if (contains(value, index, 2, "CI", "CE", "CY")) {
297            //-- Italian vs. English --//
298            if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
299                result.append('S', 'X');
300            } else {
301                result.append('S');
302            }
303            index += 2;
304        } else {
305            result.append('K');
306            if (contains(value, index + 1, 2, " C", " Q", " G")) {
307                //-- Mac Caffrey, Mac Gregor --//
308                index += 3;
309            } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
310                       !contains(value, index + 1, 2, "CE", "CI")) {
311                index += 2;
312            } else {
313                index++;
314            }
315        }
316
317        return index;
318    }
319
320    /**
321     * Handles 'CC' cases.
322     */
323    private int handleCC(final String value, final DoubleMetaphoneResult result, int index) {
324        if (contains(value, index + 2, 1, "I", "E", "H") &&
325            !contains(value, index + 2, 2, "HU")) {
326            //-- "bellocchio" but not "bacchus" --//
327            if ((index == 1 && charAt(value, index - 1) == 'A') ||
328                contains(value, index - 1, 5, "UCCEE", "UCCES")) {
329                //-- "accident", "accede", "succeed" --//
330                result.append("KS");
331            } else {
332                //-- "bacci", "bertucci", other Italian --//
333                result.append('X');
334            }
335            index += 3;
336        } else {    // Pierce's rule
337            result.append('K');
338            index += 2;
339        }
340
341        return index;
342    }
343
344    /**
345     * Handles 'CH' cases.
346     */
347    private int handleCH(final String value, final DoubleMetaphoneResult result, final int index) {
348        if (index > 0 && contains(value, index, 4, "CHAE")) {   // Michael
349            result.append('K', 'X');
350            return index + 2;
351        }
352        if (conditionCH0(value, index)) {
353            //-- Greek roots ("chemistry", "chorus", etc.) --//
354            result.append('K');
355            return index + 2;
356        }
357        if (conditionCH1(value, index)) {
358            //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
359            result.append('K');
360            return index + 2;
361        }
362        if (index > 0) {
363            if (contains(value, 0, 2, "MC")) {
364                result.append('K');
365            } else {
366                result.append('X', 'K');
367            }
368        } else {
369            result.append('X');
370        }
371        return index + 2;
372    }
373
374    /**
375     * Handles 'D' cases.
376     */
377    private int handleD(final String value, final DoubleMetaphoneResult result, int index) {
378        if (contains(value, index, 2, "DG")) {
379            //-- "Edge" --//
380            if (contains(value, index + 2, 1, "I", "E", "Y")) {
381                result.append('J');
382                index += 3;
383                //-- "Edgar" --//
384            } else {
385                result.append("TK");
386                index += 2;
387            }
388        } else if (contains(value, index, 2, "DT", "DD")) {
389            result.append('T');
390            index += 2;
391        } else {
392            result.append('T');
393            index++;
394        }
395        return index;
396    }
397
398    /**
399     * Handles 'G' cases.
400     */
401    private int handleG(final String value, final DoubleMetaphoneResult result, int index,
402                        final boolean slavoGermanic) {
403        if (charAt(value, index + 1) == 'H') {
404            index = handleGH(value, result, index);
405        } else if (charAt(value, index + 1) == 'N') {
406            if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
407                result.append("KN", "N");
408            } else if (!contains(value, index + 2, 2, "EY") &&
409                       charAt(value, index + 1) != 'Y' && !slavoGermanic) {
410                result.append("N", "KN");
411            } else {
412                result.append("KN");
413            }
414            index = index + 2;
415        } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
416            result.append("KL", "L");
417            index += 2;
418        } else if (index == 0 &&
419                   (charAt(value, index + 1) == 'Y' ||
420                    contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
421            //-- -ges-, -gep-, -gel-, -gie- at beginning --//
422            result.append('K', 'J');
423            index += 2;
424        } else if ((contains(value, index + 1, 2, "ER") ||
425                    charAt(value, index + 1) == 'Y') &&
426                   !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
427                   !contains(value, index - 1, 1, "E", "I") &&
428                   !contains(value, index - 1, 3, "RGY", "OGY")) {
429            //-- -ger-, -gy- --//
430            result.append('K', 'J');
431            index += 2;
432        } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
433                   contains(value, index - 1, 4, "AGGI", "OGGI")) {
434            //-- Italian "biaggi" --//
435            if (contains(value, 0 ,4, "VAN ", "VON ") ||
436                contains(value, 0, 3, "SCH") ||
437                contains(value, index + 1, 2, "ET")) {
438                //-- obvious germanic --//
439                result.append('K');
440            } else if (contains(value, index + 1, 3, "IER")) {
441                result.append('J');
442            } else {
443                result.append('J', 'K');
444            }
445            index += 2;
446        } else if (charAt(value, index + 1) == 'G') {
447            index += 2;
448            result.append('K');
449        } else {
450            index++;
451            result.append('K');
452        }
453        return index;
454    }
455
456    /**
457     * Handles 'GH' cases.
458     */
459    private int handleGH(final String value, final DoubleMetaphoneResult result, int index) {
460        if (index > 0 && !isVowel(charAt(value, index - 1))) {
461            result.append('K');
462            index += 2;
463        } else if (index == 0) {
464            if (charAt(value, index + 2) == 'I') {
465                result.append('J');
466            } else {
467                result.append('K');
468            }
469            index += 2;
470        } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
471                   (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
472                   (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
473            //-- Parker's rule (with some further refinements) - "hugh"
474            index += 2;
475        } else {
476            if (index > 2 && charAt(value, index - 1) == 'U' &&
477                contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
478                //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
479                result.append('F');
480            } else if (index > 0 && charAt(value, index - 1) != 'I') {
481                result.append('K');
482            }
483            index += 2;
484        }
485        return index;
486    }
487
488    /**
489     * Handles 'H' cases.
490     */
491    private int handleH(final String value, final DoubleMetaphoneResult result, int index) {
492        //-- only keep if first & before vowel or between 2 vowels --//
493        if ((index == 0 || isVowel(charAt(value, index - 1))) &&
494            isVowel(charAt(value, index + 1))) {
495            result.append('H');
496            index += 2;
497            //-- also takes car of "HH" --//
498        } else {
499            index++;
500        }
501        return index;
502    }
503
504    /**
505     * Handles 'J' cases.
506     */
507    private int handleJ(final String value, final DoubleMetaphoneResult result, int index,
508                        final boolean slavoGermanic) {
509        if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
510                //-- obvious Spanish, "Jose", "San Jacinto" --//
511                if ((index == 0 && (charAt(value, index + 4) == ' ') ||
512                     value.length() == 4) || contains(value, 0, 4, "SAN ")) {
513                    result.append('H');
514                } else {
515                    result.append('J', 'H');
516                }
517                index++;
518            } else {
519                if (index == 0 && !contains(value, index, 4, "JOSE")) {
520                    result.append('J', 'A');
521                } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
522                           (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
523                    result.append('J', 'H');
524                } else if (index == value.length() - 1) {
525                    result.append('J', ' ');
526                } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) &&
527                           !contains(value, index - 1, 1, "S", "K", "L")) {
528                    result.append('J');
529                }
530
531                if (charAt(value, index + 1) == 'J') {
532                    index += 2;
533                } else {
534                    index++;
535                }
536            }
537        return index;
538    }
539
540    /**
541     * Handles 'L' cases.
542     */
543    private int handleL(final String value, final DoubleMetaphoneResult result, int index) {
544        if (charAt(value, index + 1) == 'L') {
545            if (conditionL0(value, index)) {
546                result.appendPrimary('L');
547            } else {
548                result.append('L');
549            }
550            index += 2;
551        } else {
552            index++;
553            result.append('L');
554        }
555        return index;
556    }
557
558    /**
559     * Handles 'P' cases.
560     */
561    private int handleP(final String value, final DoubleMetaphoneResult result, int index) {
562        if (charAt(value, index + 1) == 'H') {
563            result.append('F');
564            index += 2;
565        } else {
566            result.append('P');
567            index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
568        }
569        return index;
570    }
571
572    /**
573     * Handles 'R' cases.
574     */
575    private int handleR(final String value, final DoubleMetaphoneResult result, final int index,
576                        final boolean slavoGermanic) {
577        if (index == value.length() - 1 && !slavoGermanic &&
578            contains(value, index - 2, 2, "IE") &&
579            !contains(value, index - 4, 2, "ME", "MA")) {
580            result.appendAlternate('R');
581        } else {
582            result.append('R');
583        }
584        return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
585    }
586
587    /**
588     * Handles 'S' cases.
589     */
590    private int handleS(final String value, final DoubleMetaphoneResult result, int index,
591                        final boolean slavoGermanic) {
592        if (contains(value, index - 1, 3, "ISL", "YSL")) {
593            //-- special cases "island", "isle", "carlisle", "carlysle" --//
594            index++;
595        } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
596            //-- special case "sugar-" --//
597            result.append('X', 'S');
598            index++;
599        } else if (contains(value, index, 2, "SH")) {
600            if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) {
601                //-- germanic --//
602                result.append('S');
603            } else {
604                result.append('X');
605            }
606            index += 2;
607        } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
608            //-- Italian and Armenian --//
609            if (slavoGermanic) {
610                result.append('S');
611            } else {
612                result.append('S', 'X');
613            }
614            index += 3;
615        } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) ||
616                   contains(value, index + 1, 1, "Z")) {
617            //-- german & anglicisations, e.g. "smith" match "schmidt" //
618            // "snider" match "schneider" --//
619            //-- also, -sz- in slavic language although in hungarian it //
620            //   is pronounced "s" --//
621            result.append('S', 'X');
622            index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
623        } else if (contains(value, index, 2, "SC")) {
624            index = handleSC(value, result, index);
625        } else {
626            if (index == value.length() - 1 && contains(value, index - 2, 2, "AI", "OI")) {
627                //-- french e.g. "resnais", "artois" --//
628                result.appendAlternate('S');
629            } else {
630                result.append('S');
631            }
632            index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
633        }
634        return index;
635    }
636
637    /**
638     * Handles 'SC' cases.
639     */
640    private int handleSC(final String value, final DoubleMetaphoneResult result, final int index) {
641        if (charAt(value, index + 2) == 'H') {
642            //-- Schlesinger's rule --//
643            if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM")) {
644                //-- Dutch origin, e.g. "school", "schooner" --//
645                if (contains(value, index + 3, 2, "ER", "EN")) {
646                    //-- "schermerhorn", "schenker" --//
647                    result.append("X", "SK");
648                } else {
649                    result.append("SK");
650                }
651            } else if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
652                result.append('X', 'S');
653            } else {
654                result.append('X');
655            }
656        } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
657            result.append('S');
658        } else {
659            result.append("SK");
660        }
661        return index + 3;
662    }
663
664    /**
665     * Handles 'T' cases.
666     */
667    private int handleT(final String value, final DoubleMetaphoneResult result, int index) {
668        if (contains(value, index, 4, "TION") || contains(value, index, 3, "TIA", "TCH")) {
669            result.append('X');
670            index += 3;
671        } else if (contains(value, index, 2, "TH") || contains(value, index, 3, "TTH")) {
672            if (contains(value, index + 2, 2, "OM", "AM") ||
673                //-- special case "thomas", "thames" or germanic --//
674                contains(value, 0, 4, "VAN ", "VON ") ||
675                contains(value, 0, 3, "SCH")) {
676                result.append('T');
677            } else {
678                result.append('0', 'T');
679            }
680            index += 2;
681        } else {
682            result.append('T');
683            index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
684        }
685        return index;
686    }
687
688    /**
689     * Handles 'W' cases.
690     */
691    private int handleW(final String value, final DoubleMetaphoneResult result, int index) {
692        if (contains(value, index, 2, "WR")) {
693            //-- can also be in middle of word --//
694            result.append('R');
695            index += 2;
696        } else if (index == 0 && (isVowel(charAt(value, index + 1)) ||
697                           contains(value, index, 2, "WH"))) {
698            if (isVowel(charAt(value, index + 1))) {
699                //-- Wasserman should match Vasserman --//
700                result.append('A', 'F');
701            } else {
702                //-- need Uomo to match Womo --//
703                result.append('A');
704            }
705            index++;
706        } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
707                   contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
708                   contains(value, 0, 3, "SCH")) {
709            //-- Arnow should match Arnoff --//
710            result.appendAlternate('F');
711            index++;
712        } else if (contains(value, index, 4, "WICZ", "WITZ")) {
713            //-- Polish e.g. "filipowicz" --//
714            result.append("TS", "FX");
715            index += 4;
716        } else {
717            index++;
718        }
719        return index;
720    }
721
722    /**
723     * Handles 'X' cases.
724     */
725    private int handleX(final String value, final DoubleMetaphoneResult result, int index) {
726        if (index == 0) {
727            result.append('S');
728            index++;
729        } else {
730            if (!((index == value.length() - 1) &&
731                  (contains(value, index - 3, 3, "IAU", "EAU") ||
732                   contains(value, index - 2, 2, "AU", "OU")))) {
733                //-- French e.g. breaux --//
734                result.append("KS");
735            }
736            index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
737        }
738        return index;
739    }
740
741    /**
742     * Handles 'Z' cases.
743     */
744    private int handleZ(final String value, final DoubleMetaphoneResult result, int index,
745                        final boolean slavoGermanic) {
746        if (charAt(value, index + 1) == 'H') {
747            //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
748            result.append('J');
749            index += 2;
750        } else {
751            if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") ||
752                (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
753                result.append("S", "TS");
754            } else {
755                result.append('S');
756            }
757            index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
758        }
759        return index;
760    }
761
762    //-- BEGIN CONDITIONS --//
763
764    /**
765     * Complex condition 0 for 'C'.
766     */
767    private boolean conditionC0(final String value, final int index) {
768        if (contains(value, index, 4, "CHIA")) {
769            return true;
770        }
771        if (index <= 1) {
772            return false;
773        }
774        if (isVowel(charAt(value, index - 2))) {
775            return false;
776        }
777        if (!contains(value, index - 1, 3, "ACH")) {
778            return false;
779        }
780        final char c = charAt(value, index + 2);
781        return (c != 'I' && c != 'E') ||
782                contains(value, index - 2, 6, "BACHER", "MACHER");
783    }
784
785    /**
786     * Complex condition 0 for 'CH'.
787     */
788    private boolean conditionCH0(final String value, final int index) {
789        if (index != 0) {
790            return false;
791        }
792        if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
793                   !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
794            return false;
795        }
796        return !contains(value, 0, 5, "CHORE");
797    }
798
799    /**
800     * Complex condition 1 for 'CH'.
801     */
802    private boolean conditionCH1(final String value, final int index) {
803        return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) ||
804                contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
805                contains(value, index + 2, 1, "T", "S") ||
806                ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
807                 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1)));
808    }
809
810    /**
811     * Complex condition 0 for 'L'.
812     */
813    private boolean conditionL0(final String value, final int index) {
814        if (index == value.length() - 3 &&
815            contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
816            return true;
817        }
818        return (contains(value, value.length() - 2, 2, "AS", "OS") ||
819                contains(value, value.length() - 1, 1, "A", "O")) &&
820                contains(value, index - 1, 4, "ALLE");
821    }
822
823    /**
824     * Complex condition 0 for 'M'.
825     */
826    private boolean conditionM0(final String value, final int index) {
827        if (charAt(value, index + 1) == 'M') {
828            return true;
829        }
830        return contains(value, index - 1, 3, "UMB") &&
831               ((index + 1) == value.length() - 1 || contains(value, index + 2, 2, "ER"));
832    }
833
834    //-- BEGIN HELPER FUNCTIONS --//
835
836    /**
837     * Determines whether or not a value is of slavo-germanic origin. A value is
838     * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
839     */
840    private boolean isSlavoGermanic(final String value) {
841        return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
842                value.contains("CZ") || value.contains("WITZ");
843    }
844
845    /**
846     * Determines whether or not a character is a vowel or not
847     */
848    private boolean isVowel(final char ch) {
849        return VOWELS.indexOf(ch) != -1;
850    }
851
852    /**
853     * Determines whether or not the value starts with a silent letter.  It will
854     * return {@code true} if the value starts with any of 'GN', 'KN',
855     * 'PN', 'WR' or 'PS'.
856     */
857    private boolean isSilentStart(final String value) {
858        boolean result = false;
859        for (final String element : SILENT_START) {
860            if (value.startsWith(element)) {
861                result = true;
862                break;
863            }
864        }
865        return result;
866    }
867
868    /**
869     * Cleans the input.
870     */
871    private String cleanInput(String input) {
872        if (input == null) {
873            return null;
874        }
875        input = input.trim();
876        if (input.isEmpty()) {
877            return null;
878        }
879        return input.toUpperCase(java.util.Locale.ENGLISH);
880    }
881
882    /*
883     * Gets the character at index {@code index} if available, otherwise
884     * it returns {@code Character.MIN_VALUE} so that there is some sort
885     * of default.
886     */
887    protected char charAt(final String value, final int index) {
888        if (index < 0 || index >= value.length()) {
889            return Character.MIN_VALUE;
890        }
891        return value.charAt(index);
892    }
893
894    /*
895     * Determines whether {@code value} contains any of the criteria starting at index {@code start} and
896     * matching up to length {@code length}.
897     */
898    protected static boolean contains(final String value, final int start, final int length,
899                                      final String... criteria) {
900        boolean result = false;
901        if (start >= 0 && start + length <= value.length()) {
902            final String target = value.substring(start, start + length);
903
904            for (final String element : criteria) {
905                if (target.equals(element)) {
906                    result = true;
907                    break;
908                }
909            }
910        }
911        return result;
912    }
913
914    //-- BEGIN INNER CLASSES --//
915
916    /**
917     * Inner class for storing results, since there is the optional alternate encoding.
918     */
919    public class DoubleMetaphoneResult {
920
921        private final StringBuilder primary = new StringBuilder(getMaxCodeLen());
922        private final StringBuilder alternate = new StringBuilder(getMaxCodeLen());
923        private final int maxLength;
924
925        public DoubleMetaphoneResult(final int maxLength) {
926            this.maxLength = maxLength;
927        }
928
929        public void append(final char value) {
930            appendPrimary(value);
931            appendAlternate(value);
932        }
933
934        public void append(final char primary, final char alternate) {
935            appendPrimary(primary);
936            appendAlternate(alternate);
937        }
938
939        public void appendPrimary(final char value) {
940            if (this.primary.length() < this.maxLength) {
941                this.primary.append(value);
942            }
943        }
944
945        public void appendAlternate(final char value) {
946            if (this.alternate.length() < this.maxLength) {
947                this.alternate.append(value);
948            }
949        }
950
951        public void append(final String value) {
952            appendPrimary(value);
953            appendAlternate(value);
954        }
955
956        public void append(final String primary, final String alternate) {
957            appendPrimary(primary);
958            appendAlternate(alternate);
959        }
960
961        public void appendPrimary(final String value) {
962            final int addChars = this.maxLength - this.primary.length();
963            if (value.length() <= addChars) {
964                this.primary.append(value);
965            } else {
966                this.primary.append(value, 0, addChars);
967            }
968        }
969
970        public void appendAlternate(final String value) {
971            final int addChars = this.maxLength - this.alternate.length();
972            if (value.length() <= addChars) {
973                this.alternate.append(value);
974            } else {
975                this.alternate.append(value, 0, addChars);
976            }
977        }
978
979        public String getPrimary() {
980            return this.primary.toString();
981        }
982
983        public String getAlternate() {
984            return this.alternate.toString();
985        }
986
987        public boolean isComplete() {
988            return this.primary.length() >= this.maxLength &&
989                   this.alternate.length() >= this.maxLength;
990        }
991    }
992}