| | 158 | |
| | 159 | /** |
| | 160 | * Add a regular expression rule. |
| | 161 | * @param regExpr the regular expression to search for |
| | 162 | * @param replacement a string to replace with, which should match the expression. |
| | 163 | */ |
| | 164 | public void addRegExprRule(String regExpr, String replacement) { |
| | 165 | rules_.add(new RegExprRule(regExpr, replacement)); |
| | 166 | } |
| | 167 | |
| | 168 | /** |
| | 169 | * Add a rule with synonym words. |
| | 170 | * @param words words which are synonyms |
| | 171 | */ |
| | 172 | public void addSynonyms(String... words) { |
| | 173 | for (String word : words) { |
| | 174 | rules_.add(new SynonymRule(word, words)); |
| | 175 | } |
| | 176 | } |
| | 177 | |
| | 178 | /** |
| | 179 | * Check if two names are similar, but not identical. First both names will be "normalized". |
| | 180 | * Afterwards the Levenshtein distance will be calculated.<br> |
| | 181 | * Examples for normalization rules:<br> |
| | 182 | * <code>replaceAll("\\d+", "0")</code><br> |
| | 183 | * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true |
| | 184 | * @param name first name to compare |
| | 185 | * @param name2 second name to compare |
| | 186 | * @return true if the normalized names are different but only a "little bit" |
| | 187 | */ |
| | 188 | public boolean similaryName(String name, String name2) { |
| | 189 | // check plain strings |
| | 190 | int distance = getLevenshteinDistance(name, name2); |
| | 191 | boolean similar = distance>0 && distance<=2; |
| | 192 | |
| | 193 | // try all rules |
| | 194 | for (NormalizeRule rule : rules_) { |
| | 195 | int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2)); |
| | 196 | if (levenshteinDistance == 0) |
| | 197 | // one rule results in identical names: identical |
| | 198 | return false; |
| | 199 | else if (levenshteinDistance <= 2) { |
| | 200 | // 0 < distance <= 2 |
| | 201 | similar = true; |
| | 202 | } |
| | 203 | } |
| | 204 | return similar; |
| | 205 | } |
| | 206 | |
/**
 * A rule that rewrites a name into a normalized form.
 */
public interface NormalizeRule {

    /**
     * Normalizes the given name by replacing parts of it.
     *
     * @param name the name to normalize
     * @return the normalized string
     */
    String normalize(String name);
}
| | 217 | |
| | 218 | public class RegExprRule implements NormalizeRule { |
| | 219 | private Pattern regExpr_; |
| | 220 | private String replacement_; |
| | 221 | |
| | 222 | public RegExprRule(String expression, String replacement) { |
| | 223 | regExpr_ = Pattern.compile(expression); |
| | 224 | replacement_ = replacement; |
| | 225 | } |
| | 226 | |
| | 227 | @Override |
| | 228 | public String normalize(String name) { |
| | 229 | return regExpr_.matcher(name).replaceAll(replacement_); |
| | 230 | } |
| | 231 | |
| | 232 | @Override |
| | 233 | public String toString() { |
| | 234 | return "replaceAll(" + regExpr_ + ", " + replacement_ + ")"; |
| | 235 | } |
| | 236 | } |
| | 237 | |
| | 238 | public class SynonymRule implements NormalizeRule { |
| | 239 | |
| | 240 | private String[] words_; |
| | 241 | private Pattern regExpr_; |
| | 242 | private String replacement_; |
| | 243 | |
| | 244 | public SynonymRule(String replacement, String[] words) { |
| | 245 | replacement_ = replacement.toLowerCase(); |
| | 246 | words_ = words; |
| | 247 | |
| | 248 | // build regular expression for other words (for fast match) |
| | 249 | StringBuilder expression = new StringBuilder(); |
| | 250 | int maxLength = 0; |
| | 251 | for (int i = 0; i < words.length; i++) { |
| | 252 | if (words[i].length() > maxLength) { |
| | 253 | maxLength = words[i].length(); |
| | 254 | } |
| | 255 | if (expression.length() > 0) { |
| | 256 | expression.append("|"); |
| | 257 | } |
| | 258 | expression.append(Pattern.quote(words[i])); |
| | 259 | } |
| | 260 | regExpr_ = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE); |
| | 261 | } |
| | 262 | |
| | 263 | @Override |
| | 264 | public String normalize(String name) { |
| | 265 | // find first match |
| | 266 | Matcher matcher = regExpr_.matcher(name); |
| | 267 | if (!matcher.find()) |
| | 268 | return name; |
| | 269 | |
| | 270 | int start = matcher.start(); |
| | 271 | |
| | 272 | // which word matches? |
| | 273 | String part = ""; |
| | 274 | for (int i = 0; i < words_.length; i++) { |
| | 275 | String word = words_[i]; |
| | 276 | part = name.substring(start, start + word.length()); |
| | 277 | if (word.equalsIgnoreCase(part)) { |
| | 278 | break; |
| | 279 | } |
| | 280 | } |
| | 281 | |
| | 282 | // replace the word |
| | 283 | char[] newName = matcher.replaceFirst(replacement_).toCharArray(); |
| | 284 | |
| | 285 | // adjust case (replacement is not shorter than matching word!) |
| | 286 | int minLength = Math.min(replacement_.length(), part.length()); |
| | 287 | for (int i = 0; i < minLength; i++) { |
| | 288 | if (Character.isUpperCase(part.charAt(i))) { |
| | 289 | newName[start + i] = Character.toUpperCase(newName[start + i]); |
| | 290 | } |
| | 291 | } |
| | 292 | |
| | 293 | return new String(newName); |
| | 294 | } |
| | 295 | |
| | 296 | @Override |
| | 297 | public String toString() { |
| | 298 | return "synonyms(" + replacement_ + ", " + Arrays.toString(words_) + ")"; |
| | 299 | } |
| | 300 | |
| | 301 | } |
| | 302 | |