diff --git a/include/boost/algorithm/fuzzy_search/dice.hpp b/include/boost/algorithm/fuzzy_search/dice.hpp new file mode 100644 index 000000000..1c52319e1 --- /dev/null +++ b/include/boost/algorithm/fuzzy_search/dice.hpp @@ -0,0 +1,66 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_DICE_HPP +#define BOOST_ALGORITHM_DICE_HPP + +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +VarType dice_coefficient(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + auto length1 = std::distance(begin1, end1); + auto length2 = std::distance(begin2, end2); + //base case + if(length1 == 0 || length2 == 0) + { + return 0.0; + } + + //TODO: Can i avoid copying? + std::vector::value_type> vec1(length1); + std::vector::value_type> vec2(length2); + + std::copy(begin1, end1, vec1.begin()); + std::copy(begin2, end2, vec2.begin()); + + std::sort(vec1.begin(), vec1.end()); + std::sort(vec2.begin(), vec2.end()); + + auto last1 = std::unique(vec1.begin(), vec1.end()); + auto last2 = std::unique(vec2.begin(), vec2.end()); + + vec1.erase(last1, vec1.end()); + vec2.erase(last2, vec2.end()); + + // find the intersection between the two sets + std::vector::value_type> inter; + std::set_intersection(vec1.begin(), vec1.end(), vec2.begin(), vec2.end(), std::back_inserter(inter)); + + // calculate dice coefficient + size_t total = vec1.size() + vec2.size(); + return static_cast(inter.size() * 2) / static_cast(total); +} + + +template +VarType dice_coefficient(Range1 range1, Range2 range2) +{ + return dice_coefficient(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; +}} + +#endif //BOOST_ALGORITHM_DICE_HPP diff --git a/include/boost/algorithm/fuzzy_search/hamming.hpp b/include/boost/algorithm/fuzzy_search/hamming.hpp new file mode 100644 index 000000000..02e8cd260 --- /dev/null +++ b/include/boost/algorithm/fuzzy_search/hamming.hpp @@ -0,0 +1,55 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_HAMMING_HPP +#define BOOST_ALGORITHM_HAMMING_HPP + +#include + +#include +#include + +namespace boost { namespace algorithm { + + +template +VarType +hamming_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, + ForwardIterator2 beginS2, ForwardIterator2 endS2) +{ + VarType result = 0; + if(std::distance(beginS1, endS1) == std::distance(beginS2, endS2)) + { + while(beginS1 != endS1) + { + if(!(*beginS1 == *beginS2)) + { + ++result; + } + ++beginS1; + ++beginS2; + } + } + else + { + result = -1;//TODO: Return value for sequences with different lengths + } + return result; +} + +template +VarType +hamming_distance(Range1 range1, Range2 range2) +{ + return hamming_distance(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; + +}} + +#endif //BOOST_ALGORITHM_HAMMING_HPP diff --git a/include/boost/algorithm/fuzzy_search/jaccard.hpp b/include/boost/algorithm/fuzzy_search/jaccard.hpp new file mode 100644 index 000000000..09de24bf2 --- /dev/null +++ b/include/boost/algorithm/fuzzy_search/jaccard.hpp @@ -0,0 +1,69 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_JACCARD_HPP +#define BOOST_ALGORITHM_JACCARD_HPP + +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +VarType jaccard_coefficient(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + auto length1 = std::distance(begin1, end1); + auto length2 = std::distance(begin2, end2); + //base case + if(length1 == 0 || length2 == 0) + { + return 0.0; + } + + //TODO: Can i avoid copying? + std::vector::value_type> vec1(length1); + std::vector::value_type> vec2(length2); + + std::copy(begin1, end1, vec1.begin()); + std::copy(begin2, end2, vec2.begin()); + + std::sort(vec1.begin(), vec1.end()); + std::sort(vec2.begin(), vec2.end()); + + auto last1 = std::unique(vec1.begin(), vec1.end()); + auto last2 = std::unique(vec2.begin(), vec2.end()); + + vec1.erase(last1, vec1.end()); + vec2.erase(last2, vec2.end()); + + // find the intersection between the two sets + std::vector::value_type> intersect; + std::set_intersection(vec1.begin(), vec1.end(), vec2.begin(), vec2.end(), std::back_inserter(intersect)); + + // find the union between the two sets + std::vector::value_type> unionSet; + std::set_union(vec1.begin(), vec1.end(), vec2.begin(), vec2.end(), std::back_inserter(unionSet)); + + // calculate dice coefficient + return static_cast(intersect.size()) / static_cast(unionSet.size()); +} + + +template +VarType jaccard_coefficient(Range1 range1, Range2 range2) +{ + return jaccard_coefficient(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; +}} + +#endif //BOOST_ALGORITHM_BOOST_ALGORITHM_JACCARD_HPP diff --git a/include/boost/algorithm/fuzzy_search/jaro.hpp b/include/boost/algorithm/fuzzy_search/jaro.hpp new file mode 100644 index 000000000..23cc17ee8 --- /dev/null +++ b/include/boost/algorithm/fuzzy_search/jaro.hpp @@ -0,0 +1,158 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_JARO_HPP +#define BOOST_ALGORITHM_JARO_HPP + +#include +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +VarType jaro_distance(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + auto aLength = std::distance(begin1, end1); + auto bLength = std::distance(begin2, end2); + + // If one string has null length, we return 0. + if (aLength == 0 || bLength == 0) + { + return 0.0; + } + + // Calculate max length range. + int maxRange = std::max(0L, std::max(aLength, bLength) / 2 - 1); + + // Creates 2 vectors of integers. + std::vector aMatch(aLength, false), bMatch(bLength, false); + + // Calculate matching characters. + int matchingCharacters = 0; + for (int aIndex = 0; aIndex < aLength; ++aIndex) + { + // Calculate window test limits (limit inferior to 0 and superior to bLength). + int minIndex = std::max(aIndex - maxRange, 0); + int maxIndex = std::min(aIndex + maxRange + 1, (int)bLength); + + if (minIndex >= maxIndex) + { + // No more common character because we don't have characters in b to test with characters in a. + break; + } + + for (int bIndex = minIndex; bIndex < maxIndex; ++bIndex) + { + if (!bMatch[bIndex] && *(begin1 + aIndex) == *(begin2 + bIndex)) + { + // Found some new match. + aMatch[aIndex] = true; + bMatch[bIndex] = true; + ++matchingCharacters; + break; + } + } + } + + // If no matching characters, we return 0. + if (matchingCharacters == 0) + { + return 0.0; + } + + // Calculate character transpositions. + std::vector aPosition(matchingCharacters, 0), bPosition(matchingCharacters, 0); + for (int aIndex = 0, positionIndex = 0; aIndex < aLength; ++aIndex) + { + if (aMatch[aIndex]) + { + aPosition[positionIndex] = aIndex; + ++positionIndex; + } + } + + for (int bIndex = 0, positionIndex = 0; bIndex < bLength; ++bIndex) + { + if (bMatch[bIndex]) + { + bPosition[positionIndex] = bIndex; + ++positionIndex; + } + } + + // Counting half-transpositions. + int transpositions = 0; + for (int index = 0; index < matchingCharacters; ++index) + { + if (*(begin1 + aPosition[index]) != *(begin2 + bPosition[index])) + { + ++transpositions; + } + } + + // Calculate Jaro distance. + return ( + (1.0/3.0) * matchingCharacters / aLength + + (1.0/3.0) * matchingCharacters / bLength + + (1.0/3.0) * (matchingCharacters - transpositions / 2) / matchingCharacters + ); +} + +template +VarType jaro_distance(Range1 range1, Range2 range2) +{ + return jaro_distance(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; + +template +VarType jaro_winkler_distance(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + // Calculate Jaro distance. + double distance = jaro_distance(begin1, end1, begin2, end2); + + if (distance > 0.7) + { + // Calculate common string prefix. + int commonPrefix = 0; + for (int index = 0, indexEnd = std::min(std::min(std::distance(begin1, end1), std::distance(begin2, end2)), 4L); + index < indexEnd; ++index) + { + if (*(begin1 + index) == *(begin2 + index)) + { + ++commonPrefix; + } + else + { + break; + } + } + + // Calculate Jaro-Winkler distance. + distance += 0.1 * commonPrefix * (1.0 - distance); + } + + return distance; +} + +template +VarType jaro_winkler_distance(Range1 range1, Range2 range2) +{ + return jaro_winkler_distance(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; + +}} + +#endif //BOOST_ALGORITHM_JARO_HPP diff --git a/include/boost/algorithm/fuzzy_search/levenshtein.hpp b/include/boost/algorithm/fuzzy_search/levenshtein.hpp new file mode 100644 index 000000000..2a979d026 --- /dev/null +++ b/include/boost/algorithm/fuzzy_search/levenshtein.hpp @@ -0,0 +1,176 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_LEVENSHTEIN_HPP +#define BOOST_ALGORITHM_LEVENSHTEIN_HPP + +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +VarType +damerau_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, + ForwardIterator2 beginS2, ForwardIterator2 endS2, + const VarType InsertCost = 1, const VarType DeleteCost = 1, + const VarType ReplaceCost = 1, const VarType TransposeCost = 1) +{ + using Type = typename std::iterator_traits::value_type; + + auto m = std::distance(beginS1, endS1), + n = std::distance(beginS2, endS2); + + using DiffType1 = decltype(m); + using DiffType2 = decltype(n); + + if (m == 0) + { + return n; + } + if (n == 0) + { + return m; + } + + // data init + std::vector > d(m + 2, std::vector(n + 2)); + const auto INF = m + n; + + d[0][0] = INF; + for (DiffType1 i = 0; i <= m; ++i) + { + d[i + 1][1] = i; + d[i + 1][0] = INF; + } + for (DiffType2 j = 0; j <= n; ++j) + { + d[1][j + 1] = j; + d[0][j + 1] = INF; + } + + std::unordered_map lastpos; + for (auto i = beginS1; i != endS1; ++i) + { + lastpos[*i] = 0; + } + for (auto j = beginS2; j != endS2; ++j) + { + lastpos[*j] = 0; + } + + // go + DiffType1 i; + DiffType2 j; + ForwardIterator1 ii; + ForwardIterator2 jj; + for (i = 1, ii = beginS1; i <= m; ++i, ++ii) + { + DiffType2 last = 0; + for (j = 1, jj = beginS2; j <= n; ++j, ++jj) + { + const auto i0 = lastpos[*jj]; + const auto j0 = last; + if (*ii == *jj) + { + d[i + 1][j + 1] = d[i][j]; + last = j; + } + else + { + d[i + 1][j + 1] = std::min(d[i][j] + ReplaceCost, + std::min(d[i + 1][j] + InsertCost, + d[i][j + 1] + DeleteCost)); + } + d[i + 1][j + 1] = std::min(d[i + 1][j + 1], + static_cast(d[i0][j0] + (i - i0 - 1) * DeleteCost + + TransposeCost + (j - j0 - 1) * InsertCost)); + lastpos[*ii] = i; + } + } + + return d[m + 1][n + 1]; +} + +template +VarType +damerau_distance(Range1 range1, Range2 range2, + const VarType InsertCost = 1, const VarType DeleteCost = 1, + const VarType ReplaceCost = 1, const VarType TransposeCost = 1) +{ + return damerau_distance(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2), + InsertCost, DeleteCost, ReplaceCost, TransposeCost); +} + +template +VarType +levenshtein_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1, + ForwardIterator2 beginS2, ForwardIterator2 endS2, + const VarType InsertCost = 1, const VarType DeleteCost = 1, + const VarType ReplaceCost = 1) +{ + + auto m = std::distance(beginS1, endS1), + n = std::distance(beginS2, endS2); + + using DiffType2 = decltype(n); + + if (m == 0) + { + return n; + } + if (n == 0) + { + return m; + } + + + std::vector vector1(n + 1), vector2(n + 1);//double n+1 is not an error + std::vector &D1(vector1), &D2(vector2); + + for (DiffType2 j = 1; j <= n; ++j) + D2[j] = D2[j - 1] + InsertCost; + for (; beginS1 != endS1; ++beginS1) + { + std::swap(D1, D2); + D2[0] = D1[0] + DeleteCost; + + DiffType2 j = 1; + for (auto iterS2 = beginS2; iterS2 != endS2; ++iterS2, ++j) + { + if (*beginS1 != *iterS2) + { + D2[j] = std::min(D1[j] + DeleteCost, + std::min(D2[j - 1] + InsertCost, + D1[j - 1] + ReplaceCost)); + } + else + { + D2[j] = D1[j - 1]; + } + } + } + return D2[n]; +} + +template +VarType +levenshtein_distance(Range1 range1, Range2 range2, + const VarType InsertCost = 1, const VarType DeleteCost = 1, + const VarType ReplaceCost = 1) +{ + return levenshtein_distance(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2), + InsertCost, DeleteCost, ReplaceCost); +} +}} +#endif //BOOST_ALGORITHM_LEVENSHTEIN_HPP diff --git a/include/boost/algorithm/fuzzy_search/ochiai.hpp b/include/boost/algorithm/fuzzy_search/ochiai.hpp new file mode 100644 index 000000000..a675e70ad --- /dev/null +++ b/include/boost/algorithm/fuzzy_search/ochiai.hpp @@ -0,0 +1,66 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_OCHIAI_HPP +#define BOOST_ALGORITHM_OCHIAI_HPP + +#include +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +VarType ochiai_coefficient(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + auto length1 = std::distance(begin1, end1); + auto length2 = std::distance(begin2, end2); + //base case + if(length1 == 0 || length2 == 0) + { + return 0.0; + } + + //TODO: Can i avoid copying? + std::vector::value_type> vec1(length1); + std::vector::value_type> vec2(length2); + + std::copy(begin1, end1, vec1.begin()); + std::copy(begin2, end2, vec2.begin()); + + std::sort(vec1.begin(), vec1.end()); + std::sort(vec2.begin(), vec2.end()); + + auto last1 = std::unique(vec1.begin(), vec1.end()); + auto last2 = std::unique(vec2.begin(), vec2.end()); + + vec1.erase(last1, vec1.end()); + vec2.erase(last2, vec2.end()); + + // find the intersection between the two sets + std::vector::value_type> intersect; + std::set_intersection(vec1.begin(), vec1.end(), vec2.begin(), vec2.end(), std::back_inserter(intersect)); + + // calculate dice coefficient + return static_cast(intersect.size()) / sqrt(static_cast(vec1.size() * vec2.size())); +} + + +template +VarType ochiai_coefficient(Range1 range1, Range2 range2) +{ + return ochiai_coefficient(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; +}} + +#endif //BOOST_ALGORITHM_OCHIAI_HPP diff --git a/include/boost/algorithm/fuzzy_search/overlap.hpp b/include/boost/algorithm/fuzzy_search/overlap.hpp new file mode 100644 index 000000000..52fc09e50 --- /dev/null +++ b/include/boost/algorithm/fuzzy_search/overlap.hpp @@ -0,0 +1,65 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_OVERLAP_HPP +#define BOOST_ALGORITHM_OVERLAP_HPP + +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +VarType overlap_coefficient(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + auto length1 = std::distance(begin1, end1); + auto length2 = std::distance(begin2, end2); + //base case + if(length1 == 0 || length2 == 0) + { + return 0.0; + } + + //TODO: Can i avoid copying? + std::vector::value_type> vec1(length1); + std::vector::value_type> vec2(length2); + + std::copy(begin1, end1, vec1.begin()); + std::copy(begin2, end2, vec2.begin()); + + std::sort(vec1.begin(), vec1.end()); + std::sort(vec2.begin(), vec2.end()); + + auto last1 = std::unique(vec1.begin(), vec1.end()); + auto last2 = std::unique(vec2.begin(), vec2.end()); + + vec1.erase(last1, vec1.end()); + vec2.erase(last2, vec2.end()); + + // find the intersection between the two sets + std::vector::value_type> inter; + std::set_intersection(vec1.begin(), vec1.end(), vec2.begin(), vec2.end(), std::back_inserter(inter)); + + // calculate dice coefficient + return static_cast(inter.size()) / static_cast(std::min(vec1.size(), vec2.size())); +} + + +template +VarType overlap_coefficient(Range1 range1, Range2 range2) +{ + return overlap_coefficient(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; +}} + +#endif //BOOST_ALGORITHM_OVERLAP_HPP diff --git a/include/boost/algorithm/fuzzy_search/ratcliff_obershelp.hpp b/include/boost/algorithm/fuzzy_search/ratcliff_obershelp.hpp new file mode 100644 index 000000000..46f263fe2 --- /dev/null +++ b/include/boost/algorithm/fuzzy_search/ratcliff_obershelp.hpp @@ -0,0 +1,65 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_RATCLIFF_OBERSHELP_HPP +#define BOOST_ALGORITHM_RATCLIFF_OBERSHELP_HPP + +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +VarType ratcliff_obershelp_coefficient(RAIterator1 begin1, RAIterator1 end1, + RAIterator2 begin2, RAIterator2 end2) +{ + auto length1 = std::distance(begin1, end1); + auto length2 = std::distance(begin2, end2); + //base case + if(length1 == 0 || length2 == 0) + { + return 0.0; + } + + //TODO: Can i avoid copying? + std::vector::value_type> vec1(length1); + std::vector::value_type> vec2(length2); + + std::copy(begin1, end1, vec1.begin()); + std::copy(begin2, end2, vec2.begin()); + + std::sort(vec1.begin(), vec1.end()); + std::sort(vec2.begin(), vec2.end()); + + auto last1 = std::unique(vec1.begin(), vec1.end()); + auto last2 = std::unique(vec2.begin(), vec2.end()); + + vec1.erase(last1, vec1.end()); + vec2.erase(last2, vec2.end()); + + // find the intersection between the two sets + std::vector::value_type> intersect; + std::set_intersection(vec1.begin(), vec1.end(), vec2.begin(), vec2.end(), std::back_inserter(intersect)); + + // calculate dice coefficient + return static_cast(intersect.size() * 2) / static_cast(length1 + length2); +} + + +template +VarType ratcliff_obershelp_coefficient(Range1 range1, Range2 range2) +{ + return ratcliff_obershelp_coefficient(boost::begin(range1), boost::end(range1), + boost::begin(range2), boost::end(range2)); +}; +}} + +#endif //BOOST_ALGORITHM_RATCLIFF_OBERSHELP_HPP