| | 1 | // License: GPL. For details, see LICENSE file. |
| | 2 | package org.openstreetmap.josm.tools; |
| | 3 | |
/**
 * Provides methods for computing similarity between strings.
 *
 * <p>Metrics are obtained by name through {@link #getByName(String)}. The only
 * metric currently registered is {@code "levenshtein"}.
 */
public final class StringMetrics {

    /** Single shared instance; the metric holds no state, so it can be reused freely. */
    private static final LevenshteinStringMetric LEVENSHTEIN = new LevenshteinStringMetric();

    private StringMetrics() {
        // Utility class with static members only; not meant to be instantiated.
    }

    /**
     * Get metric by name.
     *
     * @param name the name of the metric; only {@code "levenshtein"} is recognised
     * @return the string metric registered under {@code name}
     * @throws IllegalArgumentException if {@code name} is {@code null} or does not
     *         denote a known metric
     */
    public static InterfaceStringMetric getByName(String name) {
        if ("levenshtein".equals(name)) {
            return LEVENSHTEIN;
        }
        // Report the rejected value so that a misconfiguration can be diagnosed from the message alone.
        throw new IllegalArgumentException("Not a valid string metric name: " + name);
    }

    /**
     * A measure of how similar two strings are.
     */
    public interface InterfaceStringMetric {
        /**
         * Get string similarity. 1 indicates a perfect match and 0 indicates no match.
         *
         * @param string1 First string
         * @param string2 Second string
         * @return the similarity between the strings normalized
         */
        float getSimilarity(String string1, String string2);

        /**
         * Get unnormalized similarity; the range of possible values may vary
         * between metrics.
         *
         * @param string1 first string
         * @param string2 second string
         * @return the raw, metric-specific value for the two strings
         */
        float getUnNormalisedSimilarity(String string1, String string2);
    }

    /**
     * Compute Levenshtein distance. For algorithm details see:
     * http://en.wikipedia.org/wiki/Levenshtein_distance
     */
    public static class LevenshteinStringMetric implements InterfaceStringMetric {

        /**
         * Get the Levenshtein-based similarity, computed as
         * {@code 1 - distance / max(length1, length2)}.
         *
         * @param string1 First string
         * @param string2 Second string
         * @return 1 for identical strings (including two empty strings), decreasing
         *         towards 0 as more edits are needed
         * @throws NullPointerException if either argument is {@code null}
         */
        @Override
        public float getSimilarity(String string1, String string2) {
            float distance = getUnNormalisedSimilarity(string1, string2);
            int longest = Math.max(string1.length(), string2.length());
            if (longest == 0) {
                // Both strings are empty: identical, and dividing by zero must be avoided.
                return 1;
            }
            return 1 - (distance / (float) longest);
        }

        /**
         * Get the Levenshtein edit distance between the two strings, i.e. the
         * minimum number of single-character insertions, deletions and
         * substitutions that turn one into the other. Note that, unlike
         * {@link #getSimilarity(String, String)}, a <em>smaller</em> value means
         * the strings are closer and 0 means they are equal.
         *
         * @param string1 first string
         * @param string2 second string
         * @return the edit distance, between 0 and the length of the longer string
         * @throws NullPointerException if either argument is {@code null}
         */
        @Override
        public float getUnNormalisedSimilarity(String string1, String string2) {
            final int n = string1.length();
            final int m = string2.length();

            // If one string is empty, the distance is the length of the other.
            if (n == 0) {
                return m;
            }
            if (m == 0) {
                return n;
            }

            // Each cell depends only on the previous row and the current row,
            // so two rows are enough instead of the full (n+1) x (m+1) matrix.
            int[] previous = new int[m + 1];
            int[] current = new int[m + 1];

            // Row 0: turning the empty prefix of string1 into a prefix of string2 takes j insertions.
            for (int j = 0; j <= m; j++) {
                previous[j] = j;
            }

            for (int i = 1; i <= n; i++) {
                final char c1 = string1.charAt(i - 1);

                // Column 0: turning a prefix of string1 into the empty string takes i deletions.
                current[0] = i;

                for (int j = 1; j <= m; j++) {
                    final int cost = (c1 == string2.charAt(j - 1)) ? 0 : 1;

                    final int deletion = previous[j] + 1;
                    final int insertion = current[j - 1] + 1;
                    final int substitution = previous[j - 1] + cost;
                    current[j] = Math.min(Math.min(deletion, insertion), substitution);
                }

                // The row just filled becomes the "previous" row of the next iteration.
                final int[] swap = previous;
                previous = current;
                current = swap;
            }

            // After the final swap the last computed row is in 'previous'.
            return previous[m];
        }
    }
}