From c43de0b137aaf51cf058f044cd540501b1fe4917 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Sun, 10 Jul 2016 23:15:25 +0300 Subject: [PATCH 1/5] Added first version of Levenshtein and Damerau-Levenshtein --- include/boost/algorithm/levenshtein.hpp | 150 ++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 include/boost/algorithm/levenshtein.hpp diff --git a/include/boost/algorithm/levenshtein.hpp b/include/boost/algorithm/levenshtein.hpp new file mode 100644 index 000000000..e2266b4f6 --- /dev/null +++ b/include/boost/algorithm/levenshtein.hpp @@ -0,0 +1,150 @@ +// +// Created by zamazan4ik on 10.07.16. +// + +#ifndef BOOST_ALGORITHM_LEVENSHTEIN_HPP +#define BOOST_ALGORITHM_LEVENSHTEIN_HPP + +#include +#include + +#include +#include + +template +VarType +damerau_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, + ForwardIterator2 beginS2, ForwardIterator2 endS2, + const VarType InsertCost = 1, const VarType DeleteCost = 1, + const VarType ReplaceCost = 1, const VarType TransposeCost = 1) +{ + using Type = typename std::iterator_traits::value_type; + + auto m = std::distance(beginS1, endS1), + n = std::distance(beginS2, endS2); + + using DiffType1 = decltype(m); + using DiffType2 = decltype(n); + + if (m == 0) + { + return n; + } + if (n == 0) + { + return m; + } + + // data init + std::vector< std::vector > d(m+2, std::vector(n+2)); + const auto INF = m + n; + + d[0][0] = INF; + for (DiffType1 i = 0; i <= m; ++i) + { + d[i+1][1] = i; + d[i+1][0] = INF; + } + for (DiffType2 j = 0; j <= n; ++j) + { + d[1][j+1] = j; + d[0][j+1] = INF; + } + + std::unordered_map lastpos; + for (auto i = beginS1; i != endS1; ++i) + { + lastpos[*i] = 0; + } + for (auto j = beginS2; j != endS2; ++j) + { + lastpos[*j] = 0; + } + + // go + DiffType1 i; + DiffType2 j; + ForwardIterator1 ii, jj; + for(i = 1, ii=beginS1; i <= m; ++i, ++ii) + { + DiffType2 last = 0; + for (j = 1, jj=beginS2; j <= n; ++j, ++jj) + { + const auto i0 = lastpos[*jj]; + 
const auto j0 = last; + if (*ii == *jj) + { + d[i+1][j+1] = d[i][j]; + last = j; + } + else + { + d[i+1][j+1] = std::min(d[i][j] + ReplaceCost, + std::min(d[i+1][j] + InsertCost, + d[i][j+1] + DeleteCost)); + } + d[i+1][j+1] = std::min(d[i+1][j+1], + static_cast(d[i0][j0] + (i-i0-1) * DeleteCost + + TransposeCost + (j-j0-1) * InsertCost)); + lastpos[*ii] = i; + } + } + + return d[m+1][n+1]; +} + +template +VarType +levenshtein_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, + ForwardIterator2 beginS2, ForwardIterator2 endS2, + const VarType InsertCost = 1, const VarType DeleteCost = 1, + const VarType ReplaceCost = 1) +{ + + auto m = std::distance(beginS1, endS1), + n = std::distance(beginS2, endS2); + + using DiffType2 = decltype(n); + + if (m == 0) + { + return n; + } + if (n == 0) + { + return m; + } + + + + std::vector vector1(n + 1), vector2(n + 1);//double n+1 is not an error + std::vector& D1(vector1), &D2(vector2); + + for(DiffType2 j = 1; j <= n ; ++j) + D2[j] = D2[j - 1] + InsertCost; + for(;beginS1 != endS1; ++beginS1) + { + std::swap(D1, D2); + D2[0] = D1[0] + DeleteCost; + + DiffType2 j = 1; + for(auto iterS2 = beginS2; iterS2 != endS2; ++iterS2, ++j) + { + if (*beginS1 != *iterS2) + { + D2[j] = std::min(D1[j] + DeleteCost, + std::min(D2[j - 1] + InsertCost, + D1[j - 1] + ReplaceCost)); + } + else + { + D2[j] = D1[j - 1]; + } + } + } + return D2[n]; +} + + + +#endif //BOOST_ALGORITHM_LEVENSHTEIN_HPP From 0ddbc3c193c111a0e59cce05c92d0e91cfbdd736 Mon Sep 17 00:00:00 2001 From: Sergei Shilovsky Date: Mon, 11 Jul 2016 03:16:46 +0300 Subject: [PATCH 2/5] [micro] damerau distance fix --- include/boost/algorithm/levenshtein.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/boost/algorithm/levenshtein.hpp b/include/boost/algorithm/levenshtein.hpp index e2266b4f6..888ed51ba 100644 --- a/include/boost/algorithm/levenshtein.hpp +++ b/include/boost/algorithm/levenshtein.hpp @@ -64,7 +64,8 @@ 
damerau_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, // go DiffType1 i; DiffType2 j; - ForwardIterator1 ii, jj; + ForwardIterator1 ii; + ForwardIterator2 jj; for(i = 1, ii=beginS1; i <= m; ++i, ++ii) { DiffType2 last = 0; From 863de2a3a76ddb714cb81cdc84101befeff96238 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Wed, 21 Sep 2016 02:18:59 +0300 Subject: [PATCH 3/5] Added Hamming distance, added range support for Lev-dist --- include/boost/algorithm/hamming.hpp | 56 +++++++++++++++++ include/boost/algorithm/levenshtein.hpp | 81 ++++++++++++++++--------- 2 files changed, 109 insertions(+), 28 deletions(-) create mode 100644 include/boost/algorithm/hamming.hpp diff --git a/include/boost/algorithm/hamming.hpp b/include/boost/algorithm/hamming.hpp new file mode 100644 index 000000000..464e6b26a --- /dev/null +++ b/include/boost/algorithm/hamming.hpp @@ -0,0 +1,56 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. 
+*/ + +#ifndef BOOST_ALGORITHM_HAMMING_HPP +#define BOOST_ALGORITHM_HAMMING_HPP + +#include + +#include +#include + +namespace boost { namespace algorithm { + + +//TODO: Return value for sequences with different lengths +template +VarType +hamming_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, + ForwardIterator2 beginS2, ForwardIterator2 endS2) +{ + VarType result = 0; + if(std::distance(beginS1, endS1) == std::distance(beginS2, endS2)) + { + while(beginS1 != endS1) + { + if(!(*beginS1 == *beginS2)) + { + ++result; + } + ++beginS1; + ++beginS2; + } + } + else + { + result = -1;//TODO: Return value for sequences with different lengths + } + return result; +} + +template +VarType +hamming_distance(Range1 range1, Range2 range2) +{ + return hamming_distance(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; + +}} + +#endif //BOOST_ALGORITHM_HAMMING_HPP diff --git a/include/boost/algorithm/levenshtein.hpp b/include/boost/algorithm/levenshtein.hpp index 888ed51ba..102c55ae0 100644 --- a/include/boost/algorithm/levenshtein.hpp +++ b/include/boost/algorithm/levenshtein.hpp @@ -1,6 +1,10 @@ -// -// Created by zamazan4ik on 10.07.16. -// +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. 
+*/ #ifndef BOOST_ALGORITHM_LEVENSHTEIN_HPP #define BOOST_ALGORITHM_LEVENSHTEIN_HPP @@ -11,7 +15,9 @@ #include #include -template +namespace boost { namespace algorithm { + +template VarType damerau_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, ForwardIterator2 beginS2, ForwardIterator2 endS2, @@ -21,7 +27,7 @@ damerau_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, using Type = typename std::iterator_traits::value_type; auto m = std::distance(beginS1, endS1), - n = std::distance(beginS2, endS2); + n = std::distance(beginS2, endS2); using DiffType1 = decltype(m); using DiffType2 = decltype(n); @@ -36,19 +42,19 @@ damerau_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, } // data init - std::vector< std::vector > d(m+2, std::vector(n+2)); + std::vector > d(m + 2, std::vector(n + 2)); const auto INF = m + n; d[0][0] = INF; for (DiffType1 i = 0; i <= m; ++i) { - d[i+1][1] = i; - d[i+1][0] = INF; + d[i + 1][1] = i; + d[i + 1][0] = INF; } for (DiffType2 j = 0; j <= n; ++j) { - d[1][j+1] = j; - d[0][j+1] = INF; + d[1][j + 1] = j; + d[0][j + 1] = INF; } std::unordered_map lastpos; @@ -66,35 +72,46 @@ damerau_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, DiffType2 j; ForwardIterator1 ii; ForwardIterator2 jj; - for(i = 1, ii=beginS1; i <= m; ++i, ++ii) + for (i = 1, ii = beginS1; i <= m; ++i, ++ii) { DiffType2 last = 0; - for (j = 1, jj=beginS2; j <= n; ++j, ++jj) + for (j = 1, jj = beginS2; j <= n; ++j, ++jj) { const auto i0 = lastpos[*jj]; const auto j0 = last; if (*ii == *jj) { - d[i+1][j+1] = d[i][j]; + d[i + 1][j + 1] = d[i][j]; last = j; } else { - d[i+1][j+1] = std::min(d[i][j] + ReplaceCost, - std::min(d[i+1][j] + InsertCost, - d[i][j+1] + DeleteCost)); + d[i + 1][j + 1] = std::min(d[i][j] + ReplaceCost, + std::min(d[i + 1][j] + InsertCost, + d[i][j + 1] + DeleteCost)); } - d[i+1][j+1] = std::min(d[i+1][j+1], - static_cast(d[i0][j0] + (i-i0-1) * DeleteCost + - TransposeCost + (j-j0-1) * InsertCost)); + d[i + 1][j + 1] = 
std::min(d[i + 1][j + 1], + static_cast(d[i0][j0] + (i - i0 - 1) * DeleteCost + + TransposeCost + (j - j0 - 1) * InsertCost)); lastpos[*ii] = i; } } - return d[m+1][n+1]; + return d[m + 1][n + 1]; +} + +template +VarType +damerau_distance(Range1 range1, Range2 range2, + const VarType InsertCost = 1, const VarType DeleteCost = 1, + const VarType ReplaceCost = 1, const VarType TransposeCost = 1) +{ + return damerau_distance(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2), + InsertCost, DeleteCost, ReplaceCost, TransposeCost); } -template +template VarType levenshtein_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, ForwardIterator2 beginS2, ForwardIterator2 endS2, @@ -117,19 +134,18 @@ levenshtein_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, } - std::vector vector1(n + 1), vector2(n + 1);//double n+1 is not an error - std::vector& D1(vector1), &D2(vector2); + std::vector &D1(vector1), &D2(vector2); - for(DiffType2 j = 1; j <= n ; ++j) + for (DiffType2 j = 1; j <= n; ++j) D2[j] = D2[j - 1] + InsertCost; - for(;beginS1 != endS1; ++beginS1) + for (; beginS1 != endS1; ++beginS1) { std::swap(D1, D2); D2[0] = D1[0] + DeleteCost; DiffType2 j = 1; - for(auto iterS2 = beginS2; iterS2 != endS2; ++iterS2, ++j) + for (auto iterS2 = beginS2; iterS2 != endS2; ++iterS2, ++j) { if (*beginS1 != *iterS2) { @@ -146,6 +162,15 @@ levenshtein_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, return D2[n]; } - - +template +VarType +levenshtein_distance(Range1 range1, Range2 range2, + const VarType InsertCost = 1, const VarType DeleteCost = 1, + const VarType ReplaceCost = 1) +{ + return levenshtein_distance(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2), + InsertCost, DeleteCost, ReplaceCost); +} +}} #endif //BOOST_ALGORITHM_LEVENSHTEIN_HPP From 618645844b02667e641b14f0788126a4046eac4b Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Wed, 21 Sep 2016 22:32:52 +0300 Subject: 
[PATCH 4/5] Added new coefficients --- include/boost/algorithm/dice.hpp | 66 ++++++++ include/boost/algorithm/hamming.hpp | 3 +- include/boost/algorithm/jaccard.hpp | 69 ++++++++ include/boost/algorithm/jaro.hpp | 158 ++++++++++++++++++ include/boost/algorithm/levenshtein.hpp | 4 +- include/boost/algorithm/overlap.hpp | 65 +++++++ .../boost/algorithm/ratcliff_obershelp.hpp | 65 +++++++ 7 files changed, 426 insertions(+), 4 deletions(-) create mode 100644 include/boost/algorithm/dice.hpp create mode 100644 include/boost/algorithm/jaccard.hpp create mode 100644 include/boost/algorithm/jaro.hpp create mode 100644 include/boost/algorithm/overlap.hpp create mode 100644 include/boost/algorithm/ratcliff_obershelp.hpp diff --git a/include/boost/algorithm/dice.hpp b/include/boost/algorithm/dice.hpp new file mode 100644 index 000000000..1c52319e1 --- /dev/null +++ b/include/boost/algorithm/dice.hpp @@ -0,0 +1,66 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_DICE_HPP +#define BOOST_ALGORITHM_DICE_HPP + +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +VarType dice_coefficient(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + auto length1 = std::distance(begin1, end1); + auto length2 = std::distance(begin2, end2); + //base case + if(length1 == 0 || length2 == 0) + { + return 0.0; + } + + //TODO: Can i avoid copying? 
+ std::vector::value_type> vec1(length1); + std::vector::value_type> vec2(length2); + + std::copy(begin1, end1, vec1.begin()); + std::copy(begin2, end2, vec2.begin()); + + std::sort(vec1.begin(), vec1.end()); + std::sort(vec2.begin(), vec2.end()); + + auto last1 = std::unique(vec1.begin(), vec1.end()); + auto last2 = std::unique(vec2.begin(), vec2.end()); + + vec1.erase(last1, vec1.end()); + vec2.erase(last2, vec2.end()); + + // find the intersection between the two sets + std::vector::value_type> inter; + std::set_intersection(vec1.begin(), vec1.end(), vec2.begin(), vec2.end(), std::back_inserter(inter)); + + // calculate dice coefficient + size_t total = vec1.size() + vec2.size(); + return static_cast(inter.size() * 2) / static_cast(total); +} + + +template +VarType dice_coefficient(Range1 range1, Range2 range2) +{ + return dice_coefficient(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; +}} + +#endif //BOOST_ALGORITHM_DICE_HPP diff --git a/include/boost/algorithm/hamming.hpp b/include/boost/algorithm/hamming.hpp index 464e6b26a..02e8cd260 100644 --- a/include/boost/algorithm/hamming.hpp +++ b/include/boost/algorithm/hamming.hpp @@ -17,7 +17,6 @@ namespace boost { namespace algorithm { -//TODO: Return value for sequences with different lengths template VarType hamming_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, @@ -47,7 +46,7 @@ template VarType hamming_distance(Range1 range1, Range2 range2) { - return hamming_distance(boost::begin(range1), boost::end(range1), + return hamming_distance(boost::begin(range1), boost::end(range1), boost::begin(range2), boost::end(range2)); }; diff --git a/include/boost/algorithm/jaccard.hpp b/include/boost/algorithm/jaccard.hpp new file mode 100644 index 000000000..a6b58e79b --- /dev/null +++ b/include/boost/algorithm/jaccard.hpp @@ -0,0 +1,69 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. 
(See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_JACCARD_HPP +#define BOOST_ALGORITHM_JACCARD_HPP + +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +VarType jaccard_coefficient(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + auto length1 = std::distance(begin1, end1); + auto length2 = std::distance(begin2, end2); + //base case + if(length1 == 0 || length2 == 0) + { + return 0.0; + } + + //TODO: Can i avoid copying? + std::vector::value_type> vec1(length1); + std::vector::value_type> vec2(length2); + + std::copy(begin1, end1, vec1.begin()); + std::copy(begin2, end2, vec2.begin()); + + std::sort(vec1.begin(), vec1.end()); + std::sort(vec2.begin(), vec2.end()); + + auto last1 = std::unique(vec1.begin(), vec1.end()); + auto last2 = std::unique(vec2.begin(), vec2.end()); + + vec1.erase(last1, vec1.end()); + vec2.erase(last2, vec2.end()); + + // find the intersection between the two sets + std::vector::value_type> intersect; + std::set_intersection(vec1.begin(), vec1.end(), vec2.begin(), vec2.end(), std::back_inserter(intersect)); + + // find the union between the two sets + std::vector::value_type> unionSet; + std::set_union(vec1.begin(), vec1.end(), vec2.begin(), vec2.end(), std::back_inserter(unionSet)); + + // calculate Jaccard coefficient + return static_cast(intersect.size()) / static_cast(unionSet.size()); +} + + +template +VarType jaccard_coefficient(Range1 range1, Range2 range2) +{ + return jaccard_coefficient(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; +}} + +#endif //BOOST_ALGORITHM_BOOST_ALGORITHM_JACCARD_HPP_HPP diff --git a/include/boost/algorithm/jaro.hpp b/include/boost/algorithm/jaro.hpp new file mode 100644 index 000000000..23cc17ee8 --- /dev/null +++ b/include/boost/algorithm/jaro.hpp @@ -0,0 +1,158 @@ +/*
+ Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_JARO_HPP +#define BOOST_ALGORITHM_JARO_HPP + +#include +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +VarType jaro_distance(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + auto aLength = std::distance(begin1, end1); + auto bLength = std::distance(begin2, end2); + + // If one string has null length, we return 0. + if (aLength == 0 || bLength == 0) + { + return 0.0; + } + + // Calculate max length range. + int maxRange = std::max(0L, std::max(aLength, bLength) / 2 - 1); + + // Creates 2 vectors of integers. + std::vector aMatch(aLength, false), bMatch(bLength, false); + + // Calculate matching characters. + int matchingCharacters = 0; + for (int aIndex = 0; aIndex < aLength; ++aIndex) + { + // Calculate window test limits (limit inferior to 0 and superior to bLength). + int minIndex = std::max(aIndex - maxRange, 0); + int maxIndex = std::min(aIndex + maxRange + 1, (int)bLength); + + if (minIndex >= maxIndex) + { + // No more common character because we don't have characters in b to test with characters in a. + break; + } + + for (int bIndex = minIndex; bIndex < maxIndex; ++bIndex) + { + if (!bMatch[bIndex] && *(begin1 + aIndex) == *(begin2 + bIndex)) + { + // Found some new match. + aMatch[aIndex] = true; + bMatch[bIndex] = true; + ++matchingCharacters; + break; + } + } + } + + // If no matching characters, we return 0. + if (matchingCharacters == 0) + { + return 0.0; + } + + // Calculate character transpositions. 
+ std::vector aPosition(matchingCharacters, 0), bPosition(matchingCharacters, 0); + for (int aIndex = 0, positionIndex = 0; aIndex < aLength; ++aIndex) + { + if (aMatch[aIndex]) + { + aPosition[positionIndex] = aIndex; + ++positionIndex; + } + } + + for (int bIndex = 0, positionIndex = 0; bIndex < bLength; ++bIndex) + { + if (bMatch[bIndex]) + { + bPosition[positionIndex] = bIndex; + ++positionIndex; + } + } + + // Counting half-transpositions. + int transpositions = 0; + for (int index = 0; index < matchingCharacters; ++index) + { + if (*(begin1 + aPosition[index]) != *(begin2 + bPosition[index])) + { + ++transpositions; + } + } + + // Calculate Jaro distance. + return ( + (1.0/3.0) * matchingCharacters / aLength + + (1.0/3.0) * matchingCharacters / bLength + + (1.0/3.0) * (matchingCharacters - transpositions / 2) / matchingCharacters + ); +} + +template +VarType jaro_distance(Range1 range1, Range2 range2) +{ + return jaro_distance(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; + +template +VarType jaro_winkler_distance(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + // Calculate Jaro distance. + double distance = jaro_distance(begin1, end1, begin2, end2); + + if (distance > 0.7) + { + // Calculate common string prefix. + int commonPrefix = 0; + for (int index = 0, indexEnd = std::min(std::min(std::distance(begin1, end1), std::distance(begin2, end2)), 4L); + index < indexEnd; ++index) + { + if (*(begin1 + index) == *(begin2 + index)) + { + ++commonPrefix; + } + else + { + break; + } + } + + // Calculate Jaro-Winkler distance. 
+ distance += 0.1 * commonPrefix * (1.0 - distance); + } + + return distance; +} + +template +VarType jaro_winkler_distance(Range1 range1, Range2 range2) +{ + return jaro_winkler_distance(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; + +}} + +#endif //BOOST_ALGORITHM_JARO_HPP diff --git a/include/boost/algorithm/levenshtein.hpp b/include/boost/algorithm/levenshtein.hpp index 102c55ae0..2a979d026 100644 --- a/include/boost/algorithm/levenshtein.hpp +++ b/include/boost/algorithm/levenshtein.hpp @@ -106,7 +106,7 @@ damerau_distance(Range1 range1, Range2 range2, const VarType InsertCost = 1, const VarType DeleteCost = 1, const VarType ReplaceCost = 1, const VarType TransposeCost = 1) { - return damerau_distance(boost::begin(range1), boost::end(range1), + return damerau_distance(boost::begin(range1), boost::end(range1), boost::begin(range2), boost::end(range2), InsertCost, DeleteCost, ReplaceCost, TransposeCost); } @@ -168,7 +168,7 @@ levenshtein_distance(Range1 range1, Range2 range2, const VarType InsertCost = 1, const VarType DeleteCost = 1, const VarType ReplaceCost = 1) { - return levenshtein_distance(boost::begin(range1), boost::end(range1), + return levenshtein_distance(boost::begin(range1), boost::end(range1), boost::begin(range2), boost::end(range2), InsertCost, DeleteCost, ReplaceCost); } diff --git a/include/boost/algorithm/overlap.hpp b/include/boost/algorithm/overlap.hpp new file mode 100644 index 000000000..52fc09e50 --- /dev/null +++ b/include/boost/algorithm/overlap.hpp @@ -0,0 +1,65 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. 
+*/ + +#ifndef BOOST_ALGORITHM_OVERLAP_HPP +#define BOOST_ALGORITHM_OVERLAP_HPP + +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +VarType overlap_coefficient(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + auto length1 = std::distance(begin1, end1); + auto length2 = std::distance(begin2, end2); + //base case + if(length1 == 0 || length2 == 0) + { + return 0.0; + } + + //TODO: Can i avoid copying? + std::vector::value_type> vec1(length1); + std::vector::value_type> vec2(length2); + + std::copy(begin1, end1, vec1.begin()); + std::copy(begin2, end2, vec2.begin()); + + std::sort(vec1.begin(), vec1.end()); + std::sort(vec2.begin(), vec2.end()); + + auto last1 = std::unique(vec1.begin(), vec1.end()); + auto last2 = std::unique(vec2.begin(), vec2.end()); + + vec1.erase(last1, vec1.end()); + vec2.erase(last2, vec2.end()); + + // find the intersection between the two sets + std::vector::value_type> inter; + std::set_intersection(vec1.begin(), vec1.end(), vec2.begin(), vec2.end(), std::back_inserter(inter)); + + // calculate overlap coefficient + return static_cast(inter.size()) / static_cast(std::min(vec1.size(), vec2.size())); +} + + +template +VarType overlap_coefficient(Range1 range1, Range2 range2) +{ + return overlap_coefficient(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; +}} + +#endif //BOOST_ALGORITHM_OVERLAP_HPP diff --git a/include/boost/algorithm/ratcliff_obershelp.hpp b/include/boost/algorithm/ratcliff_obershelp.hpp new file mode 100644 index 000000000..46f263fe2 --- /dev/null +++ b/include/boost/algorithm/ratcliff_obershelp.hpp @@ -0,0 +1,65 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version.
+*/ + +#ifndef BOOST_ALGORITHM_RATCLIFF_OBERSHELP_HPP +#define BOOST_ALGORITHM_RATCLIFF_OBERSHELP_HPP + +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +VarType ratcliff_obershelp_coefficient(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + auto length1 = std::distance(begin1, end1); + auto length2 = std::distance(begin2, end2); + //base case + if(length1 == 0 || length2 == 0) + { + return 0.0; + } + + //TODO: Can i avoid copying? + std::vector::value_type> vec1(length1); + std::vector::value_type> vec2(length2); + + std::copy(begin1, end1, vec1.begin()); + std::copy(begin2, end2, vec2.begin()); + + std::sort(vec1.begin(), vec1.end()); + std::sort(vec2.begin(), vec2.end()); + + auto last1 = std::unique(vec1.begin(), vec1.end()); + auto last2 = std::unique(vec2.begin(), vec2.end()); + + vec1.erase(last1, vec1.end()); + vec2.erase(last2, vec2.end()); + + // find the intersection between the two sets + std::vector::value_type> intersect; + std::set_intersection(vec1.begin(), vec1.end(), vec2.begin(), vec2.end(), std::back_inserter(intersect)); + + // calculate the coefficient: 2 * |intersection| / (length1 + length2) + return static_cast(intersect.size() * 2) / static_cast(length1 + length2); +} + + +template +VarType ratcliff_obershelp_coefficient(Range1 range1, Range2 range2) +{ + return ratcliff_obershelp_coefficient(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; +}} + +#endif //BOOST_ALGORITHM_RATCLIFF_OBERSHELP_HPP From 3c62f7d479e71c8d899661615b16c2378e086082 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Thu, 22 Sep 2016 07:39:49 +0300 Subject: [PATCH 5/5] Moved algo to 'fuzzy_search' directory --- .../algorithm/{ => fuzzy_search}/dice.hpp | 0 .../algorithm/{ => fuzzy_search}/hamming.hpp | 0 .../algorithm/{ => fuzzy_search}/jaccard.hpp | 2 +- .../algorithm/{ => fuzzy_search}/jaro.hpp | 0 .../{ => fuzzy_search}/levenshtein.hpp | 0
.../boost/algorithm/fuzzy_search/ochiai.hpp | 66 +++++++++++++++++++ .../algorithm/{ => fuzzy_search}/overlap.hpp | 0 .../{ => fuzzy_search}/ratcliff_obershelp.hpp | 0 8 files changed, 67 insertions(+), 1 deletion(-) rename include/boost/algorithm/{ => fuzzy_search}/dice.hpp (100%) rename include/boost/algorithm/{ => fuzzy_search}/hamming.hpp (100%) rename include/boost/algorithm/{ => fuzzy_search}/jaccard.hpp (97%) rename include/boost/algorithm/{ => fuzzy_search}/jaro.hpp (100%) rename include/boost/algorithm/{ => fuzzy_search}/levenshtein.hpp (100%) create mode 100644 include/boost/algorithm/fuzzy_search/ochiai.hpp rename include/boost/algorithm/{ => fuzzy_search}/overlap.hpp (100%) rename include/boost/algorithm/{ => fuzzy_search}/ratcliff_obershelp.hpp (100%) diff --git a/include/boost/algorithm/dice.hpp b/include/boost/algorithm/fuzzy_search/dice.hpp similarity index 100% rename from include/boost/algorithm/dice.hpp rename to include/boost/algorithm/fuzzy_search/dice.hpp diff --git a/include/boost/algorithm/hamming.hpp b/include/boost/algorithm/fuzzy_search/hamming.hpp similarity index 100% rename from include/boost/algorithm/hamming.hpp rename to include/boost/algorithm/fuzzy_search/hamming.hpp diff --git a/include/boost/algorithm/jaccard.hpp b/include/boost/algorithm/fuzzy_search/jaccard.hpp similarity index 97% rename from include/boost/algorithm/jaccard.hpp rename to include/boost/algorithm/fuzzy_search/jaccard.hpp index a6b58e79b..09de24bf2 100644 --- a/include/boost/algorithm/jaccard.hpp +++ b/include/boost/algorithm/fuzzy_search/jaccard.hpp @@ -66,4 +66,4 @@ VarType jaccard_coefficient(Range1 range1, Range2 range2) }; }} -#endif //BOOST_ALGORITHM_BOOST_ALGORITHM_JACCARD_HPP_HPP +#endif //BOOST_ALGORITHM_BOOST_ALGORITHM_JACCARD_HPP diff --git a/include/boost/algorithm/jaro.hpp b/include/boost/algorithm/fuzzy_search/jaro.hpp similarity index 100% rename from include/boost/algorithm/jaro.hpp rename to include/boost/algorithm/fuzzy_search/jaro.hpp diff 
--git a/include/boost/algorithm/levenshtein.hpp b/include/boost/algorithm/fuzzy_search/levenshtein.hpp similarity index 100% rename from include/boost/algorithm/levenshtein.hpp rename to include/boost/algorithm/fuzzy_search/levenshtein.hpp diff --git a/include/boost/algorithm/fuzzy_search/ochiai.hpp b/include/boost/algorithm/fuzzy_search/ochiai.hpp new file mode 100644 index 000000000..a675e70ad --- /dev/null +++ b/include/boost/algorithm/fuzzy_search/ochiai.hpp @@ -0,0 +1,66 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_OCHIAI_HPP +#define BOOST_ALGORITHM_OCHIAI_HPP + +#include +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +VarType ochiai_coefficient(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + auto length1 = std::distance(begin1, end1); + auto length2 = std::distance(begin2, end2); + //base case + if(length1 == 0 || length2 == 0) + { + return 0.0; + } + + //TODO: Can i avoid copying? 
+ std::vector::value_type> vec1(length1); + std::vector::value_type> vec2(length2); + + std::copy(begin1, end1, vec1.begin()); + std::copy(begin2, end2, vec2.begin()); + + std::sort(vec1.begin(), vec1.end()); + std::sort(vec2.begin(), vec2.end()); + + auto last1 = std::unique(vec1.begin(), vec1.end()); + auto last2 = std::unique(vec2.begin(), vec2.end()); + + vec1.erase(last1, vec1.end()); + vec2.erase(last2, vec2.end()); + + // find the intersection between the two sets + std::vector::value_type> intersect; + std::set_intersection(vec1.begin(), vec1.end(), vec2.begin(), vec2.end(), std::back_inserter(intersect)); + + // calculate Ochiai coefficient + return static_cast(intersect.size()) / sqrt(static_cast(vec1.size() * vec2.size())); +} + + +template +VarType ochiai_coefficient(Range1 range1, Range2 range2) +{ + return ochiai_coefficient(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; +}} + +#endif //BOOST_ALGORITHM_OCHIAI_HPP diff --git a/include/boost/algorithm/overlap.hpp b/include/boost/algorithm/fuzzy_search/overlap.hpp similarity index 100% rename from include/boost/algorithm/overlap.hpp rename to include/boost/algorithm/fuzzy_search/overlap.hpp diff --git a/include/boost/algorithm/ratcliff_obershelp.hpp b/include/boost/algorithm/fuzzy_search/ratcliff_obershelp.hpp similarity index 100% rename from include/boost/algorithm/ratcliff_obershelp.hpp rename to include/boost/algorithm/fuzzy_search/ratcliff_obershelp.hpp