diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index dfad1fc8..0b57ce7a 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -2,8 +2,9 @@ [versions] junit5-version = '5.11.3' slf4j-version = '2.0.16' -wiremock-testcontainers-version = '1.0-alpha-14' +string-similarity-version = '1.0.0' testcontainers-version = '1.20.3' +wiremock-testcontainers-version = '1.0-alpha-14' [libraries] commons-validator = 'commons-validator:commons-validator:1.9.0' @@ -12,6 +13,7 @@ junit-vintage = { module = 'org.junit.vintage:junit-vintage-engine', version.ref lombok = 'org.projectlombok:lombok:1.18.34' slf4j-api = { module = 'org.slf4j:slf4j-api', version.ref = 'slf4j-version' } slf4j-nop = { module = 'org.slf4j:slf4j-nop', version.ref = 'slf4j-version' } +string-similarity = {module = 'net.ricecode:string-similarity', version.ref ='string-similarity-version'} spock = 'org.spockframework:spock-bom:2.3-groovy-3.0' testcontainers-junit-jupiter = { module = 'org.testcontainers:junit-jupiter', version.ref = 'testcontainers-version' } wiremock-testcontainers = { module = 'org.wiremock.integrations.testcontainers:wiremock-testcontainers-module', version.ref = 'wiremock-testcontainers-version' } diff --git a/htmlSanityCheck-core/build.gradle b/htmlSanityCheck-core/build.gradle index d788afb9..8acbabc1 100644 --- a/htmlSanityCheck-core/build.gradle +++ b/htmlSanityCheck-core/build.gradle @@ -13,12 +13,11 @@ dependencies { testImplementation libs.slf4j.nop // jsoup is our awesome html parser, see jsoup.org implementation libs.jsoup - + implementation libs.string.similarity compileOnly libs.lombok annotationProcessor libs.lombok testCompileOnly libs.lombok testAnnotationProcessor libs.lombok - testImplementation platform("org.codehaus.groovy:groovy-bom:${GroovySystem.version}") testImplementation 'org.codehaus.groovy:groovy-xml' diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/AscendingSimilarityScoreComparator.java 
b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/AscendingSimilarityScoreComparator.java deleted file mode 100644 index 42d7c5e4..00000000 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/AscendingSimilarityScoreComparator.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2010 Ralph Allan Rice ralph.rice@gmail.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - */ - -package net.ricecode.similarity; - -import java.util.Comparator; - -/** - * A comparator that allows SimilarityScore to be sorted in - * ascending order. - * - * @author Ralph Allan Rice ralph.rice@gmail.com - */ -public class AscendingSimilarityScoreComparator implements Comparator { - /** - * Compares two similarity scores. - * - * @param x The first score to be compared. - * @param y The second score to be compared. 
- * @return a negative integer, zero, or a positive integer as the first score is less than, - * equal to, or greater than the second score. - */ - public int compare(SimilarityScore x, SimilarityScore y) { - double first = x.getScore(); - double second = y.getScore(); - if (first == second) { - return 0; - } - if (first < second) { - return -1; - } - return 1; - } - -} diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/DescendingSimilarityScoreComparator.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/DescendingSimilarityScoreComparator.java deleted file mode 100644 index 45a3af8c..00000000 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/DescendingSimilarityScoreComparator.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2010 Ralph Allan Rice ralph.rice@gmail.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- * - */ - -package net.ricecode.similarity; - -import java.util.Comparator; - -/** - * A comparator that allows SimilarityScore to be sorted in - * descending order. - * - * @author Ralph Allan Rice ralph.rice@gmail.com - */ -public class DescendingSimilarityScoreComparator implements Comparator { - /** - * Compares two similarity scores. - * - * @param x The first score to be compared. - * @param y The second score to be compared. - * @return a negative integer, zero, or a positive integer as the first score is greater than, - * equal to, or less than the second score. - */ - public int compare(SimilarityScore x, SimilarityScore y) { - double first = x.getScore(); - double second = y.getScore(); - if (first == second) { - return 0; - } - if (first < second) { - return 1; - } - return -1; - } - -} - diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroStrategy.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroStrategy.java deleted file mode 100644 index 057c70df..00000000 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroStrategy.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2010 Ralph Allan Rice ralph.rice@gmail.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - */ - -package net.ricecode.similarity; - -/** - * A strategy that uses the Jaro Distance to calculate the similarity of two strings. - * - * @author Ralph Allan Rice ralph.rice@gmail.com - * @see About Jaro Distance - */ -public class JaroStrategy implements SimilarityStrategy { - - /** - * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity - * and 1.0 implies absolute similarity. - * - * @param first The first string to compare. - * @param second The second string to compare. - * @return A number between 0.0 and 1.0. - */ - public double score(String first, String second) { - String shorter; - String longer; - - // Determine which String is longer. - if (first.length() > second.length()) { - longer = first.toLowerCase(); - shorter = second.toLowerCase(); - } else { - longer = second.toLowerCase(); - shorter = first.toLowerCase(); - } - - // Calculate the half length() distance of the shorter String. - int halflength = (shorter.length() / 2) + 1; - - // Find the set of matching characters between the shorter and longer strings. Note that - // the set of matching characters may be different depending on the order of the strings. - String m1 = getSetOfMatchingCharacterWithin(shorter, longer, halflength); - String m2 = getSetOfMatchingCharacterWithin(longer, shorter, halflength); - - - // If one or both of the sets of common characters is empty, then - // there is no similarity between the two strings. 
- if (m1.length() == 0 || m2.length() == 0) return 0.0; - - // If the set of common characters is not the same size, then - // there is no similarity between the two strings, either. - if (m1.length() != m2.length()) return 0.0; - - // Calculate the number of transpositions between the two sets - // of common characters. - int transpositions = transpositions(m1, m2); - - // Calculate the distance. - return (m1.length() / ((double)shorter.length()) + - m2.length() / ((double)longer.length()) + - (m1.length() - transpositions) / ((double)m1.length())) / 3.0; - - - } - - /** - * Gets a set of matching characters between two strings. - * - * @param first The first string. - * @param second The second string. - * @param limit The maximum distance to consider. - * @return A string contain the set of common characters. - * @remarks Two characters from the first string and the second string are considered matching if the character's - * respective positions are no farther than the limit value. - */ - private String getSetOfMatchingCharacterWithin(String first, String second, int limit) { - - StringBuilder common = new StringBuilder(); - StringBuilder copy = new StringBuilder(second); - for (int i = 0; i < first.length(); i++) { - char ch = first.charAt(i); - boolean found = false; - - // See if the character is within the limit positions away from the original position of that character. - for (int j = Math.max(0, i - limit); !found && j < Math.min(i + limit, second.length()); j++) { - if (copy.charAt(j) == ch) { - found = true; - common.append(ch); - copy.setCharAt(j, '*'); - } - } - } - return common.toString(); - } - - /** - * Calculates the number of transpositions between two strings. - * - * @param first The first string. - * @param second The second string. - * @return The number of transpositions between the two strings. 
- */ - private int transpositions(String first, String second) { - int transpositions = 0; - for (int i = 0; i < first.length(); i++) { - if (first.charAt(i) != second.charAt(i)) { - transpositions++; - } - } - transpositions /= 2; - return transpositions; - } - -} diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroWinklerStrategy.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroWinklerStrategy.java deleted file mode 100644 index fbe1c3a9..00000000 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroWinklerStrategy.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2010 Ralph Allan Rice ralph.rice@gmail.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - */ - -package net.ricecode.similarity; - -/** - * A strategy that uses the Jaro-Winkler Distance to calculate the similarity of two strings. 
- * - * @author Ralph Allan Rice ralph.rice@gmail.com - * @see About Jaro-Winkler Distance - */ -public class JaroWinklerStrategy extends JaroStrategy implements SimilarityStrategy { - static final double DEFAULT_SCALING_FACTOR = 0.1; // This is the default scaling factor Winkler used. - - private final double scalingFactor; - - /** - * Constructs a new JaroWinklerStrategy instance. - * - * @param scalingFactor The scaling factor between 0.00 and 0.25. If the scaling factor is greater than 0.25, the scaling factor is set to 0.25. - */ - public JaroWinklerStrategy(double scalingFactor) { - if (scalingFactor > 0.25) { - scalingFactor = 0.25; - } - this.scalingFactor = scalingFactor; - } - - /** - * Constructs a new JaroWinklerStrategy instance. - */ - public JaroWinklerStrategy() { - this.scalingFactor = DEFAULT_SCALING_FACTOR; - } - - /** - * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity - * and 1.0 implies absolute similarity. - * - * @param first The first string to compare. - * @param second The second string to compare. - * @return A number between 0.0 and 1.0. - */ - public double score(String first, String second) { - double jaro = super.score(first, second); - - int cl = commonPrefixLength(first, second); - - // The Jaro-Winkler distance uses a prefix scale which gives more favorable ratings - // to strings that match from the beginning for a set prefix length. - return jaro + (scalingFactor * cl * (1.0 - jaro)); - - } - - /** - * Calculates the number of characters from the beginning of the strings that match exactly one-to-one, - * up to a maximum of four (4) characters. - * - * @param first The first string. - * @param second The second string. - * @return A number between 0 and 4. - */ - private int commonPrefixLength(String first, String second) { - String shorter; - String longer; - - // Determine which string is longer. 
- if (first.length() > second.length()) { - longer = first.toLowerCase(); - shorter = second.toLowerCase(); - } else { - longer = second.toLowerCase(); - shorter = first.toLowerCase(); - } - - int result = 0; - - // Iterate through the shorter string. - for (int i = 0; i < shorter.length(); i++) { - if (shorter.charAt(i) != longer.charAt(i)) { - break; - } - result++; - } - - // Limit the result to 4. - return result > 4 ? 4 : result; - } - - -} diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityScore.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityScore.java deleted file mode 100644 index 7c55af7d..00000000 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityScore.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2010 Ralph Allan Rice ralph.rice@gmail.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- * - */ - -package net.ricecode.similarity; - -/** - * A value object contains a similarity score. - * - * @author Ralph Allan Rice ralph.rice@gmail.com - */ -public class SimilarityScore { - - private final String key; - private final double score; - - /** - * Constructs a similarity score. - * - * @param key The string key. - * @param score The score value. - */ - - public SimilarityScore(String key, double score) { - this.key = key; - this.score = score; - } - - /** - * Gets the key for this score. - * - * @return A string. - */ - public String getKey() { - return this.key; - } - - /** - * Gets the value of the score. - * - * @return A double. - */ - public double getScore() { - return this.score; - } - - - /** - * Returns the hash code for this object. - * - * @return An integer representing the hash code. - */ - public int hashCode() { - int hash = 11; - hash = 23 * hash + key.hashCode(); - hash = 23 * hash + (int) (score * 1000000); - return hash; - } - - /** - * Determines if the supplied object equals this object. - * - * @return True if the keys and scores match between the two objects. Otherwise false. 
- */ - @Override - public boolean equals(Object o) { - if ((o == null) || (o.getClass() != this.getClass())) { - return false; - } - SimilarityScore other = (SimilarityScore) o; - - return this.key.equals(other.key) - && this.score == other.score; - } - - -} diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityStrategy.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityStrategy.java deleted file mode 100644 index 7a34d565..00000000 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityStrategy.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2010 Ralph Allan Rice ralph.rice@gmail.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - */ - -package net.ricecode.similarity; - -/** - * @author Ralph Allan Rice ralph.rice@gmail.com - * An interface that defines methods to perform string similarity calculation. 
- */ -public interface SimilarityStrategy { - - /** - * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity - * and 1.0 implies absolute similarity. - * - * @param first The first string to compare. - * @param second The second string to compare. - * @return A number between 0.0 and 1.0. - */ - double score(String first, String second); -} diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityService.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityService.java deleted file mode 100644 index beefe14f..00000000 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityService.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2010 Ralph Allan Rice ralph.rice@gmail.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- * - */ - -package net.ricecode.similarity; - -import java.util.Comparator; -import java.util.List; - -/** - * A service that performs string similarity calculations. - * - * @author Ralph Allan Rice ralph.rice@gmail.com - */ -public interface StringSimilarityService { - - /** - * Calculates all similarity scores for a given set of features. - * - * @param features The list of features. - * @param target The target string to compare against the features. - * @return A list of similarity scores. - */ - List scoreAll(List features, String target); - - /** - * Calculates the similarity score of a single feature. - * - * @param feature The feature string to compare. - * @param target The target string to compare against the feature. - * @return The similarity score between the feature and target. - */ - double score(String feature, String target); - - - /** - * Finds the feature within a set of given features that best match the target string. - * - * @param features A list of strings containing the features to compare. - * @param target The target string to compare against the features. - * @return A SimilarityScore that has the highest score value amongst the features. - */ - SimilarityScore findTop(List features, String target); - - /** - * Finds the feature within a set of given features that best match the target string. - * - * @param features A list of strings containing the features to compare. - * @param target The target string to compare against the features. - * @param comparator A comparator that is used sort the scores. - * @return A SimilarityScore that has the top value amongst the features, according to the comparator. - */ - SimilarityScore findTop(List features, String target, Comparator comparator); - - - // added by Gernot Starke: - - /** - * Finds the n features within a set of given features that best match the target string. - * - * @param features A list of strings containing the features to compare. 
- * @param target The target string to compare against the features. - * @param n The (maximum) number of hits to be returned. - * @return A list of SimilarityScore instances having the top values amongst the features, - * according to the comparator - */ - List findBestN(List features, String target, int n); - -} diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityServiceImpl.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityServiceImpl.java deleted file mode 100644 index 19bafcb4..00000000 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityServiceImpl.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2010 Ralph Allan Rice ralph.rice@gmail.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- * - */ - -package net.ricecode.similarity; - -import java.util.ArrayList; -import java.util.Comparator; -import java.util.List; - - -/** - * An implementation of StringSimilarityService. - * - * @author Ralph Allan Rice ralph.rice@gmail.com - * @see StringSimilarityService - */ -public class StringSimilarityServiceImpl implements StringSimilarityService { - - private final SimilarityStrategy strategy; - - - /** - * Creates a similarity calculator instance. - * - * @param strategy The similarity strategy to use when calculating similarity scores. - */ - public StringSimilarityServiceImpl(SimilarityStrategy strategy) { - this.strategy = strategy; - } - - /** - * Calculates all similarity scores for a given set of features. - * - * @param features The list of features. - * @param target The target string to compare against the features. - * @return A list of similarity scores. - */ - public List scoreAll(List features, String target) { - List scores = new ArrayList<>(); - - for (String feature : features) { - double score = strategy.score(feature, target); - scores.add(new SimilarityScore(feature, score)); - } - - return scores; - } - - /** - * Calculates the similarity score of a single feature. - * - * @param feature The feature string to compare. - * @param target The target string to compare against the feature. - * @return The similarity score between the feature and target. - */ - public double score(String feature, String target) { - return strategy.score(feature, target); - } - - /** - * Finds the feature within a set of given features that best match the target string. - * - * @param features A list of strings containing the features to compare. - * @param target The target string to compare against the features. - * @return The similarity score with the highest value. 
- */ - public SimilarityScore findTop(List features, String target) { - return findTop(features, target, new DescendingSimilarityScoreComparator()); - } - - /** - * Finds the feature within a set of given features that best match the target string. - * - * @param features A list of strings containing the features to compare. - * @param target The target string to compare against the features. - * @param comparator A comparator that is used sort the scores. - * @return A SimilarityScore that has the top value amongst the features, according to the comparator. - */ - public SimilarityScore findTop(List features, String target, Comparator comparator) { - if (features.isEmpty()) { - return null; - } - List scores = scoreAll(features, target); - scores.sort(comparator); - return scores.get(0); - } - - // added by Gernot Starke: - - /** - * Finds the n features within a set of given features that best match the target string. - * - * @param features A list of strings containing the features to compare. - * @param target The target string to compare against the features. - * @param n The (maximum) number of hits to be returned. 
- * @return A list of SimilarityScore instances having the top values amongst the features, - * according to the comparator - */ - public List findBestN(List features, String target, int n) { - List result = new ArrayList<>(); - - if ((!features.isEmpty()) && (n >= 1)) { - List scores = scoreAll(features, target); - scores.sort(new DescendingSimilarityScoreComparator()); - - // fails if n> scores.size(): result = scores.subList(0, n); //NOSONAR(S125) - result = scores.subList(0, Math.min(scores.size(), n)); - - } - return result; - } - -} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/comparator/SimilarityScoreComparator.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/comparator/SimilarityScoreComparator.java new file mode 100644 index 00000000..ff4e48f9 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/comparator/SimilarityScoreComparator.java @@ -0,0 +1,15 @@ +package org.aim42.htmlsanitycheck.comparator; + +import net.ricecode.similarity.SimilarityScore; + +import java.math.BigDecimal; +import java.util.Comparator; + +public class SimilarityScoreComparator implements Comparator { + @Override + public int compare(SimilarityScore o1, SimilarityScore o2) { + BigDecimal score1 = BigDecimal.valueOf(o1.getScore()); + BigDecimal score2 = BigDecimal.valueOf(o2.getScore()); + return score2.compareTo(score1); + } +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/suggest/Suggester.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/suggest/Suggester.java index c564ae90..85e0234e 100644 --- a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/suggest/Suggester.java +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/suggest/Suggester.java @@ -4,6 +4,7 @@ import net.ricecode.similarity.SimilarityScore; import net.ricecode.similarity.StringSimilarityService; import net.ricecode.similarity.StringSimilarityServiceImpl; +import 
org.aim42.htmlsanitycheck.comparator.SimilarityScoreComparator; import java.util.List; import java.util.stream.Collectors; @@ -46,8 +47,10 @@ public static List determineNSuggestions(String target, List opt // the "*." operator is the coolest thing in groovy: // applies the method to all elements of the collection (usually known as "map") - return service.findBestN(options, target, n) + return service.scoreAll(options, target) .stream() + .sorted(new SimilarityScoreComparator()) + .limit(n) .map(SimilarityScore::getKey) .collect(Collectors.toList()); } diff --git a/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/suggest/StringSimilarityServiceImplTest.java b/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/suggest/StringSimilarityServiceImplTest.java index dc722375..4c0cc984 100644 --- a/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/suggest/StringSimilarityServiceImplTest.java +++ b/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/suggest/StringSimilarityServiceImplTest.java @@ -31,12 +31,14 @@ import net.ricecode.similarity.SimilarityStrategy; import net.ricecode.similarity.StringSimilarityService; import net.ricecode.similarity.StringSimilarityServiceImpl; +import org.aim42.htmlsanitycheck.comparator.SimilarityScoreComparator; import org.junit.Before; import org.junit.Test; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; @@ -51,7 +53,7 @@ public void setup() { strategy = new JaroWinklerStrategy(); service = new StringSimilarityServiceImpl(strategy); - features = new ArrayList(); + features = new ArrayList<>(); } @Test @@ -67,7 +69,7 @@ public void testFindBestTwo() { features.add(c2); features.add(c3); - List bestTwo = service.findBestN(features, target, 2); + List bestTwo = service.scoreAll(features, target).stream().sorted(new SimilarityScoreComparator()).limit(2).collect(Collectors.toList()); 
assertEquals(2, bestTwo.size()); @@ -82,9 +84,9 @@ public void testFindBestTwo() { @Test public void testFindBestZero() { - assert features.size() == 0; + assert features.isEmpty(); - List best = service.findBestN(features, "none", 2); + List best = service.scoreAll(features, "none").stream().sorted(new SimilarityScoreComparator()).collect(Collectors.toList()); assertEquals("empty list of candidates shall yield empty result", 0, best.size()); @@ -96,7 +98,7 @@ public void testFindBestThreeInTooShortList() { features = Arrays.asList("McDommy", "NicClumsy"); - List bestThree = service.findBestN(features, target, 3); + List bestThree = service.scoreAll(features, target).stream().sorted(new SimilarityScoreComparator()).limit(3).collect(Collectors.toList()); assertEquals("even if list is too short, valid result is returned", 2, bestThree.size()); } @@ -116,7 +118,7 @@ public void testFindTopInList() { features.add(c2); features.add(c3); - List scores = service.scoreAll(features, target); + List scores = service.scoreAll(features, target).stream().sorted(new SimilarityScoreComparator()).collect(Collectors.toList()); assertEquals(3, scores.size()); @@ -179,7 +181,7 @@ public void testFindSuggestionInLongList() { "Kadabra", "Kakuna", "Kangaskhan", "Karrablast", "Kecleon", "Kingdra" ); - List scores = service.scoreAll(features, target); + List scores = service.scoreAll(features, target).stream().sorted(new SimilarityScoreComparator()).collect(Collectors.toList()); assertEquals(267, scores.size()); diff --git a/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/AscendingComparatorTest.java b/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/AscendingComparatorTest.java deleted file mode 100644 index a4a6709e..00000000 --- a/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/AscendingComparatorTest.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2010 Ralph Allan Rice ralph.rice@gmail.com - * - * Permission is hereby granted, free of charge, 
to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- * - */ - -package net.ricecode.similarity; - -import org.junit.Test; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -public class AscendingComparatorTest { - - @Test - public void testCompareScoreFirstGreater() { - SimilarityScore first = new SimilarityScore("First", 0.87); - SimilarityScore second = new SimilarityScore("Second", 0.54); - AscendingSimilarityScoreComparator c = new AscendingSimilarityScoreComparator(); - assertTrue(c.compare(first, second)>0); - assertTrue(c.compare(second, first)<0); - } - - @Test - public void testCompareScoreSecondGreater() { - SimilarityScore first = new SimilarityScore("First", 0.37); - SimilarityScore second = new SimilarityScore("Second", 0.65); - AscendingSimilarityScoreComparator c = new AscendingSimilarityScoreComparator(); - assertTrue(c.compare(first, second)<0); - assertTrue(c.compare(second, first)>0); - } - - @Test - public void testCompareScoreEquality() { - SimilarityScore first = new SimilarityScore("First", 0.96); - SimilarityScore second = new SimilarityScore("Second", 0.96); - AscendingSimilarityScoreComparator c = new AscendingSimilarityScoreComparator(); - assertEquals(0, c.compare(first, second)); - assertEquals(0, c.compare(second, first)); - } - -} diff --git a/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/DescendingComparatorTest.java b/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/DescendingComparatorTest.java deleted file mode 100644 index c62d4db2..00000000 --- a/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/DescendingComparatorTest.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2010 Ralph Allan Rice ralph.rice@gmail.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - */ - -package net.ricecode.similarity; - -import org.junit.Test; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -public class DescendingComparatorTest { - - @Test - public void testCompareScoreFirstGreater() { - SimilarityScore first = new SimilarityScore("First", 0.87); - SimilarityScore second = new SimilarityScore("Second", 0.54); - DescendingSimilarityScoreComparator c = new DescendingSimilarityScoreComparator(); - assertTrue(c.compare(first, second)<0); - assertTrue(c.compare(second, first)>0); - } - - @Test - public void testCompareScoreSecondGreater() { - SimilarityScore first = new SimilarityScore("First", 0.37); - SimilarityScore second = new SimilarityScore("Second", 0.65); - DescendingSimilarityScoreComparator c = new DescendingSimilarityScoreComparator(); - assertTrue(c.compare(first, second)>0); - assertTrue(c.compare(second, first)<0); - } - - @Test - public void testCompareScoreEquality() { - SimilarityScore first = new SimilarityScore("First", 0.96); - SimilarityScore second = new SimilarityScore("Second", 0.96); - DescendingSimilarityScoreComparator c = new DescendingSimilarityScoreComparator(); - 
assertEquals(0, c.compare(first, second)); - assertEquals(0, c.compare(second, first)); - } - -} diff --git a/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/JaroStrategyTest.java b/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/JaroStrategyTest.java deleted file mode 100644 index 4950d4cb..00000000 --- a/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/JaroStrategyTest.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2010 Ralph Allan Rice ralph.rice@gmail.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- * - */ - -package net.ricecode.similarity; - -import org.junit.Test; - -import static org.junit.Assert.assertEquals; - -public class JaroStrategyTest { - - @Test - public void testOneTranspostion() { - SimilarityStrategy s = new JaroStrategy(); - String first = "Martha"; - String second = "Marhta"; - double expected = 0.944; - double delta = 0.001; - double actual = s.score(first, second); - assertEquals(expected, actual, delta); - } - - @Test - public void testSoundAlike() { - SimilarityStrategy s = new JaroStrategy(); - String first = "Dwayne"; - String second = "Duane"; - double expected = 0.822; - double delta = 0.001; - double actual = s.score(first, second); - assertEquals(expected, actual, delta); - - } - - @Test - public void testMisspelledSoundAlike() { - SimilarityStrategy s = new JaroStrategy(); - String first = "Dixon"; - String second = "Dicksonx"; - double expected = 0.767; - double delta = 0.001; - double actual = s.score(first, second); - assertEquals(expected, actual, delta); - - } - - @Test - public void testAbsoluteSimilarity() { - SimilarityStrategy s = new JaroStrategy(); - String first = "Mississippi"; - String second = "Mississippi"; - double expected = 1.000; - double delta = 0.000; - double actual = s.score(first, second); - assertEquals(expected, actual, delta); - } - - @Test - public void testAbsoluteDissimilarity() { - SimilarityStrategy s = new JaroStrategy(); - String first = "Mississippi"; - String second = "Oklahoma"; - double expected = 0.000; - double delta = 0.000; - double actual = s.score(first, second); - assertEquals(expected, actual, delta); - } - -} diff --git a/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/JaroWinklerStrategyTest.java b/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/JaroWinklerStrategyTest.java deleted file mode 100644 index 5b8262ba..00000000 --- a/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/JaroWinklerStrategyTest.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * 
Copyright (c) 2010 Ralph Allan Rice ralph.rice@gmail.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- * - */ - -package net.ricecode.similarity; - -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import java.util.Arrays; -import java.util.Collection; -import static org.junit.Assert.assertEquals; - -@RunWith(Parameterized.class) -public class JaroWinklerStrategyTest { - - private String first; - private String second; - private double expected; - private double delta; - - public JaroWinklerStrategyTest(String first, String second, double expected, double delta) { - this.first = first; - this.second = second; - this.expected = expected; - this.delta = delta; - } - - @Parameterized.Parameters - public static Collection data() { - return Arrays.asList(new Object[][]{ - {"Martha", "Marhta", 0.961, 0.001}, - {"Dwayne", "Duane", 0.840, 0.001}, - {"Dixon", "Dicksonx", 0.813, 0.001} - }); - } - - @Test - public void testWithMultipleCases() { - SimilarityStrategy s = new JaroWinklerStrategy(); - double actual = s.score(first, second); - assertEquals(expected, actual, delta); - } - - @Test - public void testAbsoluteSimilarity() { - SimilarityStrategy s = new JaroStrategy(); - String first = "Mississippi"; - String second = "Mississippi"; - double expected = 1.000; - double delta = 0.000; - double actual = s.score(first, second); - assertEquals(expected, actual, delta); - } - - @Test - public void testAbsoluteDissimilarity() { - SimilarityStrategy s = new JaroStrategy(); - String first = "Mississippi"; - String second = "Oklahoma"; - double expected = 0.000; - double delta = 0.000; - double actual = s.score(first, second); - assertEquals(expected, actual, delta); - } -} \ No newline at end of file diff --git a/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/SimilarityScoreTest.java b/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/SimilarityScoreTest.java deleted file mode 100644 index 6bed6f6d..00000000 --- a/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/SimilarityScoreTest.java +++ 
/dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2010 Ralph Allan Rice ralph.rice@gmail.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- * - */ - -package net.ricecode.similarity; - -import org.junit.Test; - -import static org.junit.Assert.assertEquals; - -public class SimilarityScoreTest { - - @Test - public void testGetKey() { - SimilarityScore s = new SimilarityScore("Test", 0.99); - assertEquals("Test", s.getKey()); - } - - @Test - public void testGetScore() { - SimilarityScore s = new SimilarityScore("Test", 0.99); - assertEquals(0.99, s.getScore(), 0.000); - - } - - - -} diff --git a/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/package-info.java b/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/package-info.java deleted file mode 100644 index ba58c381..00000000 --- a/htmlSanityCheck-core/src/test/java/net/ricecode/similarity/package-info.java +++ /dev/null @@ -1,39 +0,0 @@ -/** - Calculates a similarity score between two strings. - A score of 0.0 means that the two strings are absolutely dissimilar, - and 1.0 means that absolutely similar (or equal). - Anything in between indicates how similar each the two strings are. - - This package has been created by Ralph Rice (@rrice), - @see https://github.com/rrice/java-string-similarity - - As Ralph, the original author and copyright holder, - has not published the compiled package on any public OSS server, - we forked it and included the source. - - All credits for this package and its tests go, unless otherwise mentioned, to: - @author - - - **/ - -package net.ricecode.similarity; - - -/*====================================================================== - -Copyright Gernot Starke and aim42 contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an - "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - ======================================================================*/ \ No newline at end of file diff --git a/src/docs/arc42/chapters/chap-09-Decisions.adoc b/src/docs/arc42/chapters/chap-09-Decisions.adoc index 767e0011..0db0ac90 100644 --- a/src/docs/arc42/chapters/chap-09-Decisions.adoc +++ b/src/docs/arc42/chapters/chap-09-Decisions.adoc @@ -43,12 +43,7 @@ Find details on how HSC implements HTML parsing in the {xrefConceptHtmlEncapsula === String Similarity Checking with https://wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance[Jaro-Winkler-Distance] -The small https://github.com/rrice/java-string-similarity[java-string-similarity] library (by Ralph Allen Rice) contains implementations of several similarity-calculation algorithms. As it is *not available* as public binary, -we use the sources instead, primarily: +The small https://github.com/rrice/java-string-similarity[java-string-similarity] library (by Ralph Allan Rice) contains implementations of several similarity-calculation algorithms. +As it is available as a public binary from the https://central.sonatype.com/artifact/net.ricecode/string-similarity[Maven Central repository], we use it as an external library dependency. +Primarily, we use its Jaro-Winkler strategy to determine string similarity. - net.ricecode.similarity.JaroWinklerStrategyTest - net.ricecode.similarity.JaroWinklerStrategy - -[NOTE] -The actual implementation of the similarity comparison has been postponed -to a later release of HSC