001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language.bm; 019 020import java.util.ArrayList; 021import java.util.Arrays; 022import java.util.Collections; 023import java.util.Comparator; 024import java.util.EnumMap; 025import java.util.HashMap; 026import java.util.HashSet; 027import java.util.List; 028import java.util.Map; 029import java.util.Scanner; 030import java.util.Set; 031import java.util.regex.Matcher; 032import java.util.regex.Pattern; 033 034import org.apache.commons.codec.Resources; 035import org.apache.commons.codec.language.bm.Languages.LanguageSet; 036 037/** 038 * A phoneme rule. 039 * <p> 040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply 041 * and a logical flag indicating if all languages must be in play. A rule matches if: 042 * </p> 043 * <ul> 044 * <li>the pattern matches at the current position</li> 045 * <li>the string up until the beginning of the pattern matches the left context</li> 046 * <li>the string from the end of the pattern matches the right context</li> 047 * <li>logical is ALL and all languages are in scope; or</li> 048 * <li>logical is any other value and at least one language is in scope</li> 049 * </ul> 050 * <p> 051 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user 052 * to explicitly construct their own. 053 * </p> 054 * <p> 055 * Rules are immutable and thread-safe. 056 * </p> 057 * <h2>Rules resources</h2> 058 * <p> 059 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically 060 * named following the pattern: 061 * </p> 062 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote> 063 * <p> 064 * The format of these resources is the following: 065 * </p> 066 * <ul> 067 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these 068 * will be interpreted as: 069 * <ol> 070 * <li>pattern</li> 071 * <li>left context</li> 072 * <li>right context</li> 073 * <li>phoneme</li> 074 * </ol> 075 * </li> 076 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded 077 * as a comment.</li> 078 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip 079 * all content until a line ending in '*' and '/' is found.</li> 080 * <li><b>Blank lines:</b> All blank lines will be skipped.</li> 081 * </ul> 082 * 083 * @since 1.6 084 */ 085public class Rule { 086 087 public static final RPattern ALL_STRINGS_RMATCHER = input -> true; 088 089 public static final String ALL = "ALL"; 090 091 private static final String DOUBLE_QUOTE = "\""; 092 093 private static final String HASH_INCLUDE = "#include"; 094 095 private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length(); 096 097 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = 098 new EnumMap<>(NameType.class); 099 100 private final RPattern lContext; 101 102 private final String pattern; 103 104 private final PhonemeExpr phoneme; 105 106 private final RPattern rContext; 107 108 public static final class Phoneme implements PhonemeExpr { 109 110 private final StringBuilder phonemeText; 111 private final Languages.LanguageSet languages; 112 113 public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> { 114 final int o1Length = o1.phonemeText.length(); 115 final int o2Length = o2.phonemeText.length(); 116 for (int i = 0; i < o1Length; i++) { 117 if (i >= o2Length) { 118 return +1; 119 } 120 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); 121 if (c != 0) { 122 return c; 123 } 124 } 125 126 if (o1Length < o2Length) { 127 return -1; 128 } 129 130 return 0; 131 }; 132 133 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) { 134 this.phonemeText = new StringBuilder(phonemeText); 135 this.languages = languages; 136 } 137 138 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) { 139 this(phonemeLeft.phonemeText, phonemeLeft.languages); 140 this.phonemeText.append(phonemeRight.phonemeText); 141 } 142 143 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) { 144 this(phonemeLeft.phonemeText, languages); 145 this.phonemeText.append(phonemeRight.phonemeText); 146 } 147 148 public Phoneme append(final CharSequence str) { 149 this.phonemeText.append(str); 150 return this; 151 } 152 153 public Languages.LanguageSet getLanguages() { 154 return this.languages; 155 } 156 157 @Override 158 public Iterable<Phoneme> getPhonemes() { 159 return Collections.singleton(this); 160 } 161 162 public CharSequence getPhonemeText() { 163 return this.phonemeText; 164 } 165 166 /** 167 * Deprecated since 1.9. 168 * 169 * @param right the Phoneme to join 170 * @return a new Phoneme 171 * @deprecated since 1.9 172 */ 173 @Deprecated 174 public Phoneme join(final Phoneme right) { 175 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), 176 this.languages.restrictTo(right.languages)); 177 } 178 179 /** 180 * Returns a new Phoneme with the same text but a union of its 181 * current language set and the given one. 182 * 183 * @param lang the language set to merge 184 * @return a new Phoneme 185 */ 186 public Phoneme mergeWithLanguage(final LanguageSet lang) { 187 return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang)); 188 } 189 190 @Override 191 public String toString() { 192 return phonemeText.toString() + "[" + languages + "]"; 193 } 194 } 195 196 public interface PhonemeExpr { 197 Iterable<Phoneme> getPhonemes(); 198 } 199 200 public static final class PhonemeList implements PhonemeExpr { 201 202 private final List<Phoneme> phonemes; 203 204 public PhonemeList(final List<Phoneme> phonemes) { 205 this.phonemes = phonemes; 206 } 207 208 @Override 209 public List<Phoneme> getPhonemes() { 210 return this.phonemes; 211 } 212 } 213 214 /** 215 * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. 216 */ 217 public interface RPattern { 218 boolean isMatch(CharSequence input); 219 } 220 221 static { 222 for (final NameType s : NameType.values()) { 223 final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts = 224 new EnumMap<>(RuleType.class); 225 226 for (final RuleType rt : RuleType.values()) { 227 final Map<String, Map<String, List<Rule>>> rs = new HashMap<>(); 228 229 final Languages ls = Languages.getInstance(s); 230 ls.getLanguages().forEach(l -> { 231 try (final Scanner scanner = createScanner(s, rt, l)) { 232 rs.put(l, parseRules(scanner, createResourceName(s, rt, l))); 233 } catch (final IllegalStateException e) { 234 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); 235 } 236 }); 237 if (!rt.equals(RuleType.RULES)) { 238 try (final Scanner scanner = createScanner(s, rt, "common")) { 239 rs.put("common", parseRules(scanner, createResourceName(s, rt, "common"))); 240 } 241 } 242 243 rts.put(rt, Collections.unmodifiableMap(rs)); 244 } 245 246 RULES.put(s, Collections.unmodifiableMap(rts)); 247 } 248 } 249 250 private static boolean contains(final CharSequence chars, final char input) { 251 return chars.chars().anyMatch(c -> c == input); 252 } 253 254 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) { 255 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", 256 nameType.getName(), rt.getName(), lang); 257 } 258 259 @SuppressWarnings("resource") // Closing the Scanner closes the resource 260 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) { 261 final String resName = createResourceName(nameType, rt, lang); 262 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); 263 } 264 265 @SuppressWarnings("resource") // Closing the Scanner closes the resource 266 private static Scanner createScanner(final String lang) { 267 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); 268 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); 269 } 270 271 private static boolean endsWith(final CharSequence input, final CharSequence suffix) { 272 final int suffixLength = suffix.length(); 273 final int inputLength = input.length(); 274 275 if (suffixLength > inputLength) { 276 return false; 277 } 278 for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) { 279 if (input.charAt(i) != suffix.charAt(j)) { 280 return false; 281 } 282 } 283 return true; 284 } 285 286 /** 287 * Gets rules for a combination of name type, rule type and languages. 288 * 289 * @param nameType 290 * the NameType to consider 291 * @param rt 292 * the RuleType to consider 293 * @param langs 294 * the set of languages to consider 295 * @return a list of Rules that apply 296 */ 297 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, 298 final Languages.LanguageSet langs) { 299 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs); 300 final List<Rule> allRules = new ArrayList<>(); 301 ruleMap.values().forEach(rules -> allRules.addAll(rules)); 302 return allRules; 303 } 304 305 /** 306 * Gets rules for a combination of name type, rule type and a single language. 307 * 308 * @param nameType 309 * the NameType to consider 310 * @param rt 311 * the RuleType to consider 312 * @param lang 313 * the language to consider 314 * @return a list of Rules that apply 315 */ 316 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) { 317 return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang)))); 318 } 319 320 /** 321 * Gets rules for a combination of name type, rule type and languages. 322 * 323 * @param nameType 324 * the NameType to consider 325 * @param rt 326 * the RuleType to consider 327 * @param langs 328 * the set of languages to consider 329 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 330 * @since 1.9 331 */ 332 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 333 final Languages.LanguageSet langs) { 334 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : 335 getInstanceMap(nameType, rt, Languages.ANY); 336 } 337 338 /** 339 * Gets rules for a combination of name type, rule type and a single language. 340 * 341 * @param nameType 342 * the NameType to consider 343 * @param rt 344 * the RuleType to consider 345 * @param lang 346 * the language to consider 347 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 348 * @since 1.9 349 */ 350 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 351 final String lang) { 352 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang); 353 354 if (rules == null) { 355 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", 356 nameType.getName(), rt.getName(), lang)); 357 } 358 359 return rules; 360 } 361 362 private static Phoneme parsePhoneme(final String ph) { 363 final int open = ph.indexOf("["); 364 if (open >= 0) { 365 if (!ph.endsWith("]")) { 366 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); 367 } 368 final String before = ph.substring(0, open); 369 final String in = ph.substring(open + 1, ph.length() - 1); 370 final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]"))); 371 372 return new Phoneme(before, Languages.LanguageSet.from(langs)); 373 } 374 return new Phoneme(ph, Languages.ANY_LANGUAGE); 375 } 376 377 private static PhonemeExpr parsePhonemeExpr(final String ph) { 378 if (ph.startsWith("(")) { // we have a bracketed list of options 379 if (!ph.endsWith(")")) { 380 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); 381 } 382 383 final List<Phoneme> phs = new ArrayList<>(); 384 final String body = ph.substring(1, ph.length() - 1); 385 for (final String part : body.split("[|]")) { 386 phs.add(parsePhoneme(part)); 387 } 388 if (body.startsWith("|") || body.endsWith("|")) { 389 phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); 390 } 391 392 return new PhonemeList(phs); 393 } 394 return parsePhoneme(ph); 395 } 396 397 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) { 398 final Map<String, List<Rule>> lines = new HashMap<>(); 399 int currentLine = 0; 400 401 boolean inMultilineComment = false; 402 while (scanner.hasNextLine()) { 403 currentLine++; 404 final String rawLine = scanner.nextLine(); 405 String line = rawLine; 406 407 if (inMultilineComment) { 408 if (line.endsWith(ResourceConstants.EXT_CMT_END)) { 409 inMultilineComment = false; 410 } 411 } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) { 412 inMultilineComment = true; 413 } else { 414 // discard comments 415 final int cmtI = line.indexOf(ResourceConstants.CMT); 416 if (cmtI >= 0) { 417 line = line.substring(0, cmtI); 418 } 419 420 // trim leading-trailing whitespace 421 line = line.trim(); 422 423 if (line.isEmpty()) { 424 continue; // empty lines can be safely skipped 425 } 426 427 if (line.startsWith(HASH_INCLUDE)) { 428 // include statement 429 final String incl = line.substring(HASH_INCLUDE_LENGTH).trim(); 430 if (incl.contains(" ")) { 431 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + 432 location); 433 } 434 try (final Scanner hashIncludeScanner = createScanner(incl)) { 435 lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl)); 436 } 437 } else { 438 // rule 439 final String[] parts = line.split("\\s+"); 440 if (parts.length != 4) { 441 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 442 " parts: " + rawLine + " in " + location); 443 } 444 try { 445 final String pat = stripQuotes(parts[0]); 446 final String lCon = stripQuotes(parts[1]); 447 final String rCon = stripQuotes(parts[2]); 448 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); 449 final int cLine = currentLine; 450 final Rule r = new Rule(pat, lCon, rCon, ph) { 451 private final int myLine = cLine; 452 private final String loc = location; 453 454 @Override 455 public String toString() { 456 final StringBuilder sb = new StringBuilder(); 457 sb.append("Rule"); 458 sb.append("{line=").append(myLine); 459 sb.append(", loc='").append(loc).append('\''); 460 sb.append(", pat='").append(pat).append('\''); 461 sb.append(", lcon='").append(lCon).append('\''); 462 sb.append(", rcon='").append(rCon).append('\''); 463 sb.append('}'); 464 return sb.toString(); 465 } 466 }; 467 final String patternKey = r.pattern.substring(0,1); 468 final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>()); 469 rules.add(r); 470 } catch (final IllegalArgumentException e) { 471 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + 472 location, e); 473 } 474 } 475 } 476 } 477 478 return lines; 479 } 480 481 /** 482 * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. 483 * 484 * @param regex 485 * the regular expression to compile 486 * @return an RPattern that will match this regex 487 */ 488 private static RPattern pattern(final String regex) { 489 final boolean startsWith = regex.startsWith("^"); 490 final boolean endsWith = regex.endsWith("$"); 491 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); 492 final boolean boxes = content.contains("["); 493 494 if (!boxes) { 495 if (startsWith && endsWith) { 496 // exact match 497 if (content.isEmpty()) { 498 // empty 499 return input -> input.length() == 0; 500 } 501 return input -> input.equals(content); 502 } 503 if ((startsWith || endsWith) && content.isEmpty()) { 504 // matches every string 505 return ALL_STRINGS_RMATCHER; 506 } 507 if (startsWith) { 508 // matches from start 509 return input -> startsWith(input, content); 510 } 511 if (endsWith) { 512 // matches from start 513 return input -> endsWith(input, content); 514 } 515 } else { 516 final boolean startsWithBox = content.startsWith("["); 517 final boolean endsWithBox = content.endsWith("]"); 518 519 if (startsWithBox && endsWithBox) { 520 String boxContent = content.substring(1, content.length() - 1); 521 if (!boxContent.contains("[")) { 522 // box containing alternatives 523 final boolean negate = boxContent.startsWith("^"); 524 if (negate) { 525 boxContent = boxContent.substring(1); 526 } 527 final String bContent = boxContent; 528 final boolean shouldMatch = !negate; 529 530 if (startsWith && endsWith) { 531 // exact match 532 return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; 533 } 534 if (startsWith) { 535 // first char 536 return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; 537 } 538 if (endsWith) { 539 // last char 540 return input -> input.length() > 0 && 541 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; 542 } 543 } 544 } 545 } 546 547 return new RPattern() { 548 final Pattern pattern = Pattern.compile(regex); 549 550 @Override 551 public boolean isMatch(final CharSequence input) { 552 final Matcher matcher = pattern.matcher(input); 553 return matcher.find(); 554 } 555 }; 556 } 557 558 private static boolean startsWith(final CharSequence input, final CharSequence prefix) { 559 if (prefix.length() > input.length()) { 560 return false; 561 } 562 for (int i = 0; i < prefix.length(); i++) { 563 if (input.charAt(i) != prefix.charAt(i)) { 564 return false; 565 } 566 } 567 return true; 568 } 569 570 private static String stripQuotes(String str) { 571 if (str.startsWith(DOUBLE_QUOTE)) { 572 str = str.substring(1); 573 } 574 575 if (str.endsWith(DOUBLE_QUOTE)) { 576 str = str.substring(0, str.length() - 1); 577 } 578 579 return str; 580 } 581 582 /** 583 * Creates a new rule. 584 * 585 * @param pattern 586 * the pattern 587 * @param lContext 588 * the left context 589 * @param rContext 590 * the right context 591 * @param phoneme 592 * the resulting phoneme 593 */ 594 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) { 595 this.pattern = pattern; 596 this.lContext = pattern(lContext + "$"); 597 this.rContext = pattern("^" + rContext); 598 this.phoneme = phoneme; 599 } 600 601 /** 602 * Gets the left context. This is a regular expression that must match to the left of the pattern. 603 * 604 * @return the left context Pattern 605 */ 606 public RPattern getLContext() { 607 return this.lContext; 608 } 609 610 /** 611 * Gets the pattern. This is a string-literal that must exactly match. 612 * 613 * @return the pattern 614 */ 615 public String getPattern() { 616 return this.pattern; 617 } 618 619 /** 620 * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. 621 * 622 * @return the phoneme 623 */ 624 public PhonemeExpr getPhoneme() { 625 return this.phoneme; 626 } 627 628 /** 629 * Gets the right context. This is a regular expression that must match to the right of the pattern. 630 * 631 * @return the right context Pattern 632 */ 633 public RPattern getRContext() { 634 return this.rContext; 635 } 636 637 /** 638 * Decides if the pattern and context match the input starting at a position. It is a match if the 639 * {@code lContext} matches {@code input} up to {@code i}, {@code pattern} matches at i and 640 * {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}. 641 * 642 * @param input 643 * the input String 644 * @param i 645 * the int position within the input 646 * @return true if the pattern and left/right context match, false otherwise 647 */ 648 public boolean patternAndContextMatches(final CharSequence input, final int i) { 649 if (i < 0) { 650 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); 651 } 652 653 final int patternLength = this.pattern.length(); 654 final int ipl = i + patternLength; 655 656 if (ipl > input.length()) { 657 // not enough room for the pattern to match 658 return false; 659 } 660 661 // evaluate the pattern, left context and right context 662 // fail early if any of the evaluations is not successful 663 if (!input.subSequence(i, ipl).equals(this.pattern)) { 664 return false; 665 } 666 if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { 667 return false; 668 } 669 return this.lContext.isMatch(input.subSequence(0, i)); 670 } 671}