Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fuzzy search. #26

Open
wants to merge 5 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions include/boost/algorithm/fuzzy_search/dice.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
Copyright (c) Alexander Zaitsev <[email protected]>, 2016
Distributed under the Boost Software License, Version 1.0. (See
accompanying file LICENSE_1_0.txt or copy at
http://www.boost.org/LICENSE_1_0.txt)
See http://www.boost.org/ for latest version.
*/

#ifndef BOOST_ALGORITHM_DICE_HPP
#define BOOST_ALGORITHM_DICE_HPP

#include <algorithm>
#include <vector>

#include <boost/range/begin.hpp>
#include <boost/range/end.hpp>

namespace boost { namespace algorithm {

/// \brief Dice coefficient of the sets of distinct elements of two sequences.
///
/// Each input sequence is reduced to its set of distinct elements (repeated
/// elements count once). With A and B being those sets, the result is
/// 2 * |A n B| / (|A| + |B|).
///
/// \tparam VarType   type of the returned coefficient (defaults to double);
///                   the ratio itself is computed in double
/// \param begin1     start of the first sequence
/// \param end1       one past the end of the first sequence
/// \param begin2     start of the second sequence
/// \param end2       one past the end of the second sequence
/// \return           a value in [0, 1]; 0 when either sequence is empty
///
/// \note Elements are copied into temporary storage, so they must be copyable
///       and comparable with operator< and operator==. The inputs themselves
///       are never modified.
template<typename VarType = double, typename RAIterator1, typename RAIterator2>
VarType dice_coefficient(RAIterator1 begin1, RAIterator1 end1,
                         RAIterator2 begin2, RAIterator2 end2)
{
    typedef typename std::iterator_traits<RAIterator1>::value_type value_type1;
    typedef typename std::iterator_traits<RAIterator2>::value_type value_type2;

    // Base case: an empty sequence shares nothing with anything.
    if (begin1 == end1 || begin2 == end2)
    {
        return static_cast<VarType>(0.0);
    }

    // Build the working copies straight from the input ranges; this avoids
    // default-constructing every element only to overwrite it afterwards.
    std::vector<value_type1> set1(begin1, end1);
    std::vector<value_type2> set2(begin2, end2);

    // Sort + unique turns each copy into a sorted set of distinct elements.
    std::sort(set1.begin(), set1.end());
    set1.erase(std::unique(set1.begin(), set1.end()), set1.end());

    std::sort(set2.begin(), set2.end());
    set2.erase(std::unique(set2.begin(), set2.end()), set2.end());

    // Count the common elements with a linear merge. Only the size of the
    // intersection is needed, so nothing is stored.
    typename std::vector<value_type1>::size_type common = 0;
    auto it1 = set1.begin();
    auto it2 = set2.begin();
    while (it1 != set1.end() && it2 != set2.end())
    {
        if (*it1 < *it2)
        {
            ++it1;
        }
        else if (*it2 < *it1)
        {
            ++it2;
        }
        else
        {
            ++common;
            ++it1;
            ++it2;
        }
    }

    // Both sets are non-empty here, so the denominator is never zero.
    const double total = static_cast<double>(set1.size() + set2.size());
    return static_cast<VarType>(static_cast<double>(common * 2) / total);
}


/// \brief Range overload of dice_coefficient.
///
/// Forwards boost::begin / boost::end of both ranges to the iterator
/// overload. The ranges are taken by const reference so that calling this
/// overload does not copy the underlying containers.
///
/// \tparam VarType  type of the returned coefficient (defaults to double)
/// \param range1    first range
/// \param range2    second range
/// \return          whatever the iterator overload returns for these ranges
template<typename VarType = double, typename Range1, typename Range2>
VarType dice_coefficient(const Range1& range1, const Range2& range2)
{
    return dice_coefficient<VarType>(boost::begin(range1), boost::end(range1),
                                     boost::begin(range2), boost::end(range2));
}
}}

#endif //BOOST_ALGORITHM_DICE_HPP
55 changes: 55 additions & 0 deletions include/boost/algorithm/fuzzy_search/hamming.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
Copyright (c) Alexander Zaitsev <[email protected]>, 2016
Distributed under the Boost Software License, Version 1.0. (See
accompanying file LICENSE_1_0.txt or copy at
http://www.boost.org/LICENSE_1_0.txt)
See http://www.boost.org/ for latest version.
*/

#ifndef BOOST_ALGORITHM_HAMMING_HPP
#define BOOST_ALGORITHM_HAMMING_HPP

#include <iterator>

#include <boost/range/begin.hpp>
#include <boost/range/end.hpp>

namespace boost { namespace algorithm {


/// \brief Hamming distance between two sequences of equal length.
///
/// Counts the positions at which the two sequences hold elements that do not
/// compare equal with operator==.
///
/// \tparam VarType  type of the returned counter (defaults to int)
/// \param beginS1   start of the first sequence
/// \param endS1     one past the end of the first sequence
/// \param beginS2   start of the second sequence
/// \param endS2     one past the end of the second sequence
/// \return          number of differing positions, or -1 when the sequences
///                  have different lengths
///
/// \note TODO (carried over from the original): the return value for
///       sequences of different lengths is still to be decided.
template<typename VarType = int, typename ForwardIterator1, typename ForwardIterator2>
VarType
hamming_distance(ForwardIterator1 beginS1, ForwardIterator1 endS1,
                 ForwardIterator2 beginS2, ForwardIterator2 endS2)
{
    VarType mismatches = 0;

    // Guard clause: the distance is only defined for equally long sequences.
    if (std::distance(beginS1, endS1) != std::distance(beginS2, endS2))
    {
        mismatches = -1;
        return mismatches;
    }

    // Lengths are equal, so checking the first sequence's end is sufficient.
    for (; beginS1 != endS1; ++beginS1, (void)++beginS2)
    {
        if (*beginS1 == *beginS2)
        {
            continue;
        }
        ++mismatches;
    }
    return mismatches;
}

/// \brief Range overload of hamming_distance.
///
/// Forwards boost::begin / boost::end of both ranges to the iterator
/// overload. The ranges are taken by const reference so that calling this
/// overload does not copy the underlying containers.
///
/// \tparam VarType  type of the returned counter (defaults to int)
/// \param range1    first range
/// \param range2    second range
/// \return          whatever the iterator overload returns for these ranges
template<typename VarType = int, typename Range1, typename Range2>
VarType
hamming_distance(const Range1& range1, const Range2& range2)
{
    return hamming_distance<VarType>(boost::begin(range1), boost::end(range1),
                                     boost::begin(range2), boost::end(range2));
}

}}

#endif //BOOST_ALGORITHM_HAMMING_HPP
69 changes: 69 additions & 0 deletions include/boost/algorithm/fuzzy_search/jaccard.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
Copyright (c) Alexander Zaitsev <[email protected]>, 2016
Distributed under the Boost Software License, Version 1.0. (See
accompanying file LICENSE_1_0.txt or copy at
http://www.boost.org/LICENSE_1_0.txt)
See http://www.boost.org/ for latest version.
*/

#ifndef BOOST_ALGORITHM_JACCARD_HPP
#define BOOST_ALGORITHM_JACCARD_HPP

#include <algorithm>
#include <vector>

#include <boost/range/begin.hpp>
#include <boost/range/end.hpp>

namespace boost { namespace algorithm {

/// \brief Jaccard coefficient of the sets of distinct elements of two sequences.
///
/// Each input sequence is reduced to its set of distinct elements (repeated
/// elements count once). With A and B being those sets, the result is
/// |A n B| / |A u B|.
///
/// \tparam VarType   type of the returned coefficient (defaults to double);
///                   the ratio itself is computed in double
/// \param begin1     start of the first sequence
/// \param end1       one past the end of the first sequence
/// \param begin2     start of the second sequence
/// \param end2       one past the end of the second sequence
/// \return           a value in [0, 1]; 0 when either sequence is empty
///
/// \note Elements are copied into temporary storage, so they must be copyable
///       and comparable with operator< and operator==. The inputs themselves
///       are never modified.
template<typename VarType = double, typename RAIterator1, typename RAIterator2>
VarType jaccard_coefficient(RAIterator1 begin1, RAIterator1 end1,
                            RAIterator2 begin2, RAIterator2 end2)
{
    typedef typename std::iterator_traits<RAIterator1>::value_type value_type1;
    typedef typename std::iterator_traits<RAIterator2>::value_type value_type2;

    // Base case: an empty sequence shares nothing with anything.
    if (begin1 == end1 || begin2 == end2)
    {
        return static_cast<VarType>(0.0);
    }

    // Build the working copies straight from the input ranges; this avoids
    // default-constructing every element only to overwrite it afterwards.
    std::vector<value_type1> set1(begin1, end1);
    std::vector<value_type2> set2(begin2, end2);

    // Sort + unique turns each copy into a sorted set of distinct elements.
    std::sort(set1.begin(), set1.end());
    set1.erase(std::unique(set1.begin(), set1.end()), set1.end());

    std::sort(set2.begin(), set2.end());
    set2.erase(std::unique(set2.begin(), set2.end()), set2.end());

    // Count the common elements with a linear merge. Only the sizes of the
    // intersection and the union are needed, so neither set is stored.
    typename std::vector<value_type1>::size_type common = 0;
    auto it1 = set1.begin();
    auto it2 = set2.begin();
    while (it1 != set1.end() && it2 != set2.end())
    {
        if (*it1 < *it2)
        {
            ++it1;
        }
        else if (*it2 < *it1)
        {
            ++it2;
        }
        else
        {
            ++common;
            ++it1;
            ++it2;
        }
    }

    // |A u B| = |A| + |B| - |A n B|; both sets are non-empty, so this is > 0.
    const double unionSize = static_cast<double>(set1.size() + set2.size() - common);

    // Calculate the Jaccard coefficient.
    return static_cast<VarType>(static_cast<double>(common) / unionSize);
}


/// \brief Range overload of jaccard_coefficient.
///
/// Forwards boost::begin / boost::end of both ranges to the iterator
/// overload. The ranges are taken by const reference so that calling this
/// overload does not copy the underlying containers.
///
/// \tparam VarType  type of the returned coefficient (defaults to double)
/// \param range1    first range
/// \param range2    second range
/// \return          whatever the iterator overload returns for these ranges
template<typename VarType = double, typename Range1, typename Range2>
VarType jaccard_coefficient(const Range1& range1, const Range2& range2)
{
    return jaccard_coefficient<VarType>(boost::begin(range1), boost::end(range1),
                                        boost::begin(range2), boost::end(range2));
}
}}

#endif //BOOST_ALGORITHM_JACCARD_HPP
158 changes: 158 additions & 0 deletions include/boost/algorithm/fuzzy_search/jaro.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
/*
Copyright (c) Alexander Zaitsev <[email protected]>, 2016
Distributed under the Boost Software License, Version 1.0. (See
accompanying file LICENSE_1_0.txt or copy at
http://www.boost.org/LICENSE_1_0.txt)
See http://www.boost.org/ for latest version.
*/

#ifndef BOOST_ALGORITHM_JARO_HPP
#define BOOST_ALGORITHM_JARO_HPP

#include <vector>
#include <iterator>
#include <algorithm>

#include <boost/range/begin.hpp>
#include <boost/range/end.hpp>

namespace boost { namespace algorithm {

/// \brief Jaro similarity of two random-access sequences.
///
/// Despite the name, larger results mean the sequences are more alike.
/// Two elements "match" when they compare equal with operator== and their
/// positions differ by at most max(len1, len2) / 2 - 1. With m matches and
/// t = (matches that are out of order) / 2 (integer division), the result is
/// (m / len1 + m / len2 + (m - t) / m) / 3.
///
/// \tparam VarType   type of the returned value (defaults to double);
///                   the score itself is computed in double
/// \param begin1     start of the first sequence
/// \param end1       one past the end of the first sequence
/// \param begin2     start of the second sequence
/// \param end2       one past the end of the second sequence
/// \return           0 when either sequence is empty or nothing matches,
///                   otherwise the score described above
template<typename VarType = double, typename RAIterator1, typename RAIterator2>
VarType jaro_distance(RAIterator1 begin1, RAIterator1 end1,
                      RAIterator2 begin2, RAIterator2 end2)
{
    // One wide signed type for every length and index. This keeps the
    // std::min / std::max calls well-formed on every platform (the iterator
    // difference_type is not always `long`) and avoids truncation to int.
    typedef long long index_type;
    typedef std::vector<char>::size_type size_type;

    const index_type aLength = std::distance(begin1, end1);
    const index_type bLength = std::distance(begin2, end2);

    // If one sequence has zero length, we return 0.
    if (aLength == 0 || bLength == 0)
    {
        return static_cast<VarType>(0.0);
    }

    // Half-width of the window in which elements may match (never negative).
    const index_type maxRange =
        std::max<index_type>(0, std::max(aLength, bLength) / 2 - 1);

    // Per-position "already matched" flags for both sequences
    // (vector<char> rather than the bit-packed vector<bool>).
    std::vector<char> aMatch(static_cast<size_type>(aLength), 0);
    std::vector<char> bMatch(static_cast<size_type>(bLength), 0);

    // Find matching elements: each element of the first sequence claims the
    // first still-unclaimed equal element inside its window.
    index_type matchingCharacters = 0;
    for (index_type aIndex = 0; aIndex < aLength; ++aIndex)
    {
        // Window limits, clamped to [0, bLength).
        const index_type minIndex = std::max<index_type>(aIndex - maxRange, 0);
        const index_type maxIndex = std::min(aIndex + maxRange + 1, bLength);

        if (minIndex >= maxIndex)
        {
            // The window has moved past the end of the second sequence, and
            // it only moves further right from here: no more matches.
            break;
        }

        for (index_type bIndex = minIndex; bIndex < maxIndex; ++bIndex)
        {
            if (!bMatch[bIndex] && *(begin1 + aIndex) == *(begin2 + bIndex))
            {
                // Found a new match.
                aMatch[aIndex] = 1;
                bMatch[bIndex] = 1;
                ++matchingCharacters;
                break;
            }
        }
    }

    // If there are no matching elements, we return 0.
    if (matchingCharacters == 0)
    {
        return static_cast<VarType>(0.0);
    }

    // Count half-transpositions: walk the matched elements of both sequences
    // in order and count the pairs that differ.
    index_type halfTranspositions = 0;
    index_type bIndex = 0;
    for (index_type aIndex = 0; aIndex < aLength; ++aIndex)
    {
        if (!aMatch[aIndex])
        {
            continue;
        }

        // Both flag arrays contain exactly matchingCharacters set entries,
        // so this scan always stops inside bMatch.
        while (!bMatch[bIndex])
        {
            ++bIndex;
        }

        if (!(*(begin1 + aIndex) == *(begin2 + bIndex)))
        {
            ++halfTranspositions;
        }
        ++bIndex;
    }

    // Calculate the Jaro score. halfTranspositions / 2 is deliberately an
    // integer division.
    return static_cast<VarType>(
        (1.0/3.0) * matchingCharacters / aLength +
        (1.0/3.0) * matchingCharacters / bLength +
        (1.0/3.0) * (matchingCharacters - halfTranspositions / 2) / matchingCharacters
    );
}

/// \brief Range overload of jaro_distance.
///
/// Forwards boost::begin / boost::end of both ranges to the iterator
/// overload. The ranges are taken by const reference so that calling this
/// overload does not copy the underlying containers.
///
/// \tparam VarType  type of the returned value (defaults to double)
/// \param range1    first range
/// \param range2    second range
/// \return          whatever the iterator overload returns for these ranges
template<typename VarType = double, typename Range1, typename Range2>
VarType jaro_distance(const Range1& range1, const Range2& range2)
{
    return jaro_distance<VarType>(boost::begin(range1), boost::end(range1),
                                  boost::begin(range2), boost::end(range2));
}

template<typename VarType = double, typename RAIterator1, typename RAIterator2>
VarType jaro_winkler_distance(RAIterator1 begin1, RAIterator1 end1,
RAIterator2 begin2, RAIterator2 end2)
{
// Calculate Jaro distance.
double distance = jaro_distance(begin1, end1, begin2, end2);

if (distance > 0.7)
{
// Calculate common string prefix.
int commonPrefix = 0;
for (int index = 0, indexEnd = std::min(std::min(std::distance(begin1, end1), std::distance(begin2, end2)), 4L);
index < indexEnd; ++index)
{
if (*(begin1 + index) == *(begin2 + index))
{
++commonPrefix;
}
else
{
break;
}
}

// Calculate Jaro-Winkler distance.
distance += 0.1 * commonPrefix * (1.0 - distance);
}

return distance;
}

/// \brief Range overload of jaro_winkler_distance.
///
/// Forwards boost::begin / boost::end of both ranges to the iterator
/// overload. The ranges are taken by const reference so that calling this
/// overload does not copy the underlying containers.
///
/// \tparam VarType  type of the returned value (defaults to double)
/// \param range1    first range
/// \param range2    second range
/// \return          whatever the iterator overload returns for these ranges
template<typename VarType = double, typename Range1, typename Range2>
VarType jaro_winkler_distance(const Range1& range1, const Range2& range2)
{
    return jaro_winkler_distance<VarType>(boost::begin(range1), boost::end(range1),
                                          boost::begin(range2), boost::end(range2));
}

}}

#endif //BOOST_ALGORITHM_JARO_HPP
Loading