ArcdpsExtension
 
Loading...
Searching...
No Matches
rapidfuzz Namespace Reference

Namespaces

namespace  common
 
namespace  detail
 
namespace  fuzz
 

Classes

struct  CachedLevenshtein
 
struct  EditOp
 Edit operations used by the Levenshtein distance. More...
 
class  Editops
 
class  IteratorView
 
struct  LevenshteinWeightTable
 
struct  Opcode
 Edit operations used by the Levenshtein distance. More...
 
class  Opcodes
 
class  SplittedSentenceView
 
struct  StringAffix
 

Typedefs

template<typename InputIt >
using IteratorViewVec = std::vector< IteratorView< InputIt > >
 

Enumerations

enum class  EditType { None = 0 , Replace = 1 , Insert = 2 , Delete = 3 }
 Edit operation types used by the Levenshtein distance. More...
 

Functions

template<typename InputIt1 , typename InputIt2 >
bool operator== (const IteratorView< InputIt1 > &a, const IteratorView< InputIt2 > &b)
 
template<typename InputIt1 , typename InputIt2 >
bool operator!= (const IteratorView< InputIt1 > &a, const IteratorView< InputIt2 > &b)
 
template<typename InputIt1 , typename InputIt2 >
bool operator< (const IteratorView< InputIt1 > &a, const IteratorView< InputIt2 > &b)
 
template<typename InputIt1 , typename InputIt2 >
bool operator> (const IteratorView< InputIt1 > &a, const IteratorView< InputIt2 > &b)
 
template<typename InputIt1 , typename InputIt2 >
bool operator<= (const IteratorView< InputIt1 > &a, const IteratorView< InputIt2 > &b)
 
template<typename InputIt1 , typename InputIt2 >
bool operator>= (const IteratorView< InputIt1 > &a, const IteratorView< InputIt2 > &b)
 
bool operator== (EditOp a, EditOp b)
 
bool operator== (Opcode a, Opcode b)
 
bool operator== (const Editops &lhs, const Editops &rhs)
 
bool operator!= (const Editops &lhs, const Editops &rhs)
 
void swap (Editops &lhs, Editops &rhs)
 
bool operator== (const Opcodes &lhs, const Opcodes &rhs)
 
bool operator!= (const Opcodes &lhs, const Opcodes &rhs)
 
void swap (Opcodes &lhs, Opcodes &rhs)
 
template<typename InputIt1 , typename InputIt2 >
int64_t levenshtein_distance (InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights={1, 1, 1}, int64_t max=std::numeric_limits< int64_t >::max())
 Calculates the minimum number of insertions, deletions, and substitutions required to change one sequence into the other according to Levenshtein with custom costs for insertion, deletion and substitution.
 
template<typename Sentence1 , typename Sentence2 >
int64_t levenshtein_distance (const Sentence1 &s1, const Sentence2 &s2, LevenshteinWeightTable weights={1, 1, 1}, int64_t max=std::numeric_limits< int64_t >::max())
 
template<typename InputIt1 , typename InputIt2 >
int64_t levenshtein_similarity (InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights={1, 1, 1}, int64_t score_cutoff=0.0)
 
template<typename Sentence1 , typename Sentence2 >
int64_t levenshtein_similarity (const Sentence1 &s1, const Sentence2 &s2, LevenshteinWeightTable weights={1, 1, 1}, int64_t score_cutoff=0.0)
 
template<typename InputIt1 , typename InputIt2 >
double levenshtein_normalized_distance (InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights={1, 1, 1}, double score_cutoff=1.0)
 
template<typename Sentence1 , typename Sentence2 >
double levenshtein_normalized_distance (const Sentence1 &s1, const Sentence2 &s2, LevenshteinWeightTable weights={1, 1, 1}, double score_cutoff=1.0)
 
template<typename InputIt1 , typename InputIt2 >
double levenshtein_normalized_similarity (InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights={1, 1, 1}, double score_cutoff=0.0)
 Calculates a normalized levenshtein distance using custom costs for insertion, deletion and substitution.
 
template<typename Sentence1 , typename Sentence2 >
double levenshtein_normalized_similarity (const Sentence1 &s1, const Sentence2 &s2, LevenshteinWeightTable weights={1, 1, 1}, double score_cutoff=0.0)
 
template<typename InputIt1 , typename InputIt2 >
Editops levenshtein_editops (InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2)
 Return list of EditOp describing how to turn s1 into s2.
 
template<typename Sentence1 , typename Sentence2 >
Editops levenshtein_editops (const Sentence1 &s1, const Sentence2 &s2)
 

Typedef Documentation

◆ IteratorViewVec

template<typename InputIt >
using rapidfuzz::IteratorViewVec = typedef std::vector<IteratorView<InputIt> >

Enumeration Type Documentation

◆ EditType

enum class rapidfuzz::EditType
strong

Edit operation types used by the Levenshtein distance.

Enumerator
None 

No Operation required

Replace 

Replace a character if a string by another character

Insert 

Insert a character into a string

Delete 

Delete a character from a string

Function Documentation

◆ levenshtein_distance() [1/2]

template<typename Sentence1 , typename Sentence2 >
int64_t rapidfuzz::levenshtein_distance ( const Sentence1 &  s1,
const Sentence2 &  s2,
LevenshteinWeightTable  weights = {1, 1, 1},
int64_t  max = std::numeric_limits<int64_t>::max() 
)

◆ levenshtein_distance() [2/2]

template<typename InputIt1 , typename InputIt2 >
int64_t rapidfuzz::levenshtein_distance ( InputIt1  first1,
InputIt1  last1,
InputIt2  first2,
InputIt2  last2,
LevenshteinWeightTable  weights = {1, 1, 1},
int64_t  max = std::numeric_limits<int64_t>::max() 
)

Calculates the minimum number of insertions, deletions, and substitutions required to change one sequence into the other according to Levenshtein with custom costs for insertion, deletion and substitution.

Template Parameters
Sentence1This is a string that can be converted to basic_string_view<char_type>
Sentence2This is a string that can be converted to basic_string_view<char_type>
Parameters
s1string to compare with s2 (for type info check Template parameters above)
s2string to compare with s1 (for type info check Template parameters above)
weightsThe weights for the three operations in the form (insertion, deletion, substitution). Default is {1, 1, 1}, which gives all three operations a weight of 1.
maxMaximum Levenshtein distance between s1 and s2, that is considered as a result. If the distance is bigger than max, max + 1 is returned instead. Default is std::numeric_limits<int64_t>::max(), which deactivates this behaviour.
Returns
returns the levenshtein distance between s1 and s2
Remarks

Depending on the input parameters different optimized implementation are used to improve the performance. Worst-case performance is O(m * n).

Insertion = Deletion = Substitution:

This is known as uniform Levenshtein distance and is the distance most commonly referred to as Levenshtein distance. The following implementation is used with a worst-case performance of O([N/64]M).

  • if max is 0 the similarity can be calculated using a direct comparision, since no difference between the strings is allowed. The time complexity of this algorithm is O(N).
  • A common prefix/suffix of the two compared strings does not affect the Levenshtein distance, so the affix is removed before calculating the similarity.
  • If max is ≤ 3 the mbleven algorithm is used. This algorithm checks all possible edit operations that are possible under the threshold max. The time complexity of this algorithm is O(N).
  • If the length of the shorter string is ≤ 64 after removing the common affix Hyyrös' algorithm is used, which calculates the Levenshtein distance in parallel. The algorithm is described by [hyrro_2002]. The time complexity of this algorithm is O(N).
  • If the length of the shorter string is ≥ 64 after removing the common affix a blockwise implementation of Myers' algorithm is used, which calculates the Levenshtein distance in parallel (64 characters at a time). The algorithm is described by [myers_1999]. The time complexity of this algorithm is O([N/64]M).

Insertion = Deletion, Substitution >= Insertion + Deletion:

Since every Substitution can be performed as Insertion + Deletion, this variant of the Levenshtein distance only uses Insertions and Deletions. Therefore this variant is often referred to as InDel-Distance. The following implementation is used with a worst-case performance of O([N/64]M).

  • if max is 0 the similarity can be calculated using a direct comparision, since no difference between the strings is allowed. The time complexity of this algorithm is O(N).
  • if max is 1 and the two strings have a similar length, the similarity can be calculated using a direct comparision aswell, since a substitution would cause a edit distance higher than max. The time complexity of this algorithm is O(N).
  • A common prefix/suffix of the two compared strings does not affect the Levenshtein distance, so the affix is removed before calculating the similarity.
  • If max is ≤ 4 the mbleven algorithm is used. This algorithm checks all possible edit operations that are possible under the threshold max. As a difference to the normal Levenshtein distance this algorithm can even be used up to a threshold of 4 here, since the higher weight of substitutions decreases the amount of possible edit operations. The time complexity of this algorithm is O(N).
  • If the length of the shorter string is ≤ 64 after removing the common affix Hyyrös' lcs algorithm is used, which calculates the InDel distance in parallel. The algorithm is described by [hyrro_lcs_2004] and is extended with support for UTF32 in this implementation. The time complexity of this algorithm is O(N).
  • If the length of the shorter string is ≥ 64 after removing the common affix a blockwise implementation of Hyyrös' lcs algorithm is used, which calculates the Levenshtein distance in parallel (64 characters at a time). The algorithm is described by [hyrro_lcs_2004]. The time complexity of this algorithm is O([N/64]M).

Other weights:

The implementation for other weights is based on Wagner-Fischer. It has a performance of O(N * M) and has a memory usage of O(N). Further details can be found in [wagner_fischer_1974].

Examples

Find the Levenshtein distance between two strings:

// dist is 2
int64_t dist = levenshtein_distance("lewenstein", "levenshtein");
int64_t levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights={1, 1, 1}, int64_t max=std::numeric_limits< int64_t >::max())
Calculates the minimum number of insertions, deletions, and substitutions required to change one sequ...
Definition rapidfuzz_amalgamated.hpp:3374

Setting a maximum distance allows the implementation to select a more efficient implementation:

// dist is 2
int64_t dist = levenshtein_distance("lewenstein", "levenshtein", {1, 1, 1}, 1);

It is possible to select different weights by passing a weight struct.

// dist is 3
int64_t dist = levenshtein_distance("lewenstein", "levenshtein", {1, 1, 2});

◆ levenshtein_editops() [1/2]

template<typename Sentence1 , typename Sentence2 >
Editops rapidfuzz::levenshtein_editops ( const Sentence1 &  s1,
const Sentence2 &  s2 
)

◆ levenshtein_editops() [2/2]

template<typename InputIt1 , typename InputIt2 >
Editops rapidfuzz::levenshtein_editops ( InputIt1  first1,
InputIt1  last1,
InputIt2  first2,
InputIt2  last2 
)

Return list of EditOp describing how to turn s1 into s2.

Template Parameters
Sentence1This is a string that can be converted to basic_string_view<char_type>
Sentence2This is a string that can be converted to basic_string_view<char_type>
Parameters
s1string to compare with s2 (for type info check Template parameters above)
s2string to compare with s1 (for type info check Template parameters above)
Returns
Edit operations required to turn s1 into s2

◆ levenshtein_normalized_distance() [1/2]

template<typename Sentence1 , typename Sentence2 >
double rapidfuzz::levenshtein_normalized_distance ( const Sentence1 &  s1,
const Sentence2 &  s2,
LevenshteinWeightTable  weights = {1, 1, 1},
double  score_cutoff = 1.0 
)

◆ levenshtein_normalized_distance() [2/2]

template<typename InputIt1 , typename InputIt2 >
double rapidfuzz::levenshtein_normalized_distance ( InputIt1  first1,
InputIt1  last1,
InputIt2  first2,
InputIt2  last2,
LevenshteinWeightTable  weights = {1, 1, 1},
double  score_cutoff = 1.0 
)

◆ levenshtein_normalized_similarity() [1/2]

template<typename Sentence1 , typename Sentence2 >
double rapidfuzz::levenshtein_normalized_similarity ( const Sentence1 &  s1,
const Sentence2 &  s2,
LevenshteinWeightTable  weights = {1, 1, 1},
double  score_cutoff = 0.0 
)

◆ levenshtein_normalized_similarity() [2/2]

template<typename InputIt1 , typename InputIt2 >
double rapidfuzz::levenshtein_normalized_similarity ( InputIt1  first1,
InputIt1  last1,
InputIt2  first2,
InputIt2  last2,
LevenshteinWeightTable  weights = {1, 1, 1},
double  score_cutoff = 0.0 
)

Calculates a normalized levenshtein distance using custom costs for insertion, deletion and substitution.

Template Parameters
Sentence1This is a string that can be converted to basic_string_view<char_type>
Sentence2This is a string that can be converted to basic_string_view<char_type>
Parameters
s1string to compare with s2 (for type info check Template parameters above)
s2string to compare with s1 (for type info check Template parameters above)
weightsThe weights for the three operations in the form (insertion, deletion, substitution). Default is {1, 1, 1}, which gives all three operations a weight of 1.
score_cutoffOptional argument for a score threshold as a float between 0 and 1.0. For ratio < score_cutoff 0 is returned instead. Default is 0, which deactivates this behaviour.
Returns
Normalized weighted levenshtein distance between s1 and s2 as a double between 0 and 1.0
See also
levenshtein()
Remarks

The normalization of the Levenshtein distance is performed in the following way:

\begin{align*}
  ratio &= \frac{distance(s1, s2)}{max_dist}
\end{align*}

Examples

Find the normalized Levenshtein distance between two strings:

// ratio is 81.81818181818181
double ratio = normalized_levenshtein("lewenstein", "levenshtein");

Setting a score_cutoff allows the implementation to select a more efficient implementation:

// ratio is 0.0
double ratio = normalized_levenshtein("lewenstein", "levenshtein", {1, 1, 1}, 85.0);

It is possible to select different weights by passing a weight struct

// ratio is 85.71428571428571
double ratio = normalized_levenshtein("lewenstein", "levenshtein", {1, 1, 2});

◆ levenshtein_similarity() [1/2]

template<typename Sentence1 , typename Sentence2 >
int64_t rapidfuzz::levenshtein_similarity ( const Sentence1 &  s1,
const Sentence2 &  s2,
LevenshteinWeightTable  weights = {1, 1, 1},
int64_t  score_cutoff = 0.0 
)

◆ levenshtein_similarity() [2/2]

template<typename InputIt1 , typename InputIt2 >
int64_t rapidfuzz::levenshtein_similarity ( InputIt1  first1,
InputIt1  last1,
InputIt2  first2,
InputIt2  last2,
LevenshteinWeightTable  weights = {1, 1, 1},
int64_t  score_cutoff = 0.0 
)

◆ operator!=() [1/3]

bool rapidfuzz::operator!= ( const Editops lhs,
const Editops rhs 
)
inline

◆ operator!=() [2/3]

template<typename InputIt1 , typename InputIt2 >
bool rapidfuzz::operator!= ( const IteratorView< InputIt1 > &  a,
const IteratorView< InputIt2 > &  b 
)
inline

◆ operator!=() [3/3]

bool rapidfuzz::operator!= ( const Opcodes lhs,
const Opcodes rhs 
)
inline

◆ operator<()

template<typename InputIt1 , typename InputIt2 >
bool rapidfuzz::operator< ( const IteratorView< InputIt1 > &  a,
const IteratorView< InputIt2 > &  b 
)
inline

◆ operator<=()

template<typename InputIt1 , typename InputIt2 >
bool rapidfuzz::operator<= ( const IteratorView< InputIt1 > &  a,
const IteratorView< InputIt2 > &  b 
)
inline

◆ operator==() [1/5]

bool rapidfuzz::operator== ( const Editops lhs,
const Editops rhs 
)
inline

◆ operator==() [2/5]

template<typename InputIt1 , typename InputIt2 >
bool rapidfuzz::operator== ( const IteratorView< InputIt1 > &  a,
const IteratorView< InputIt2 > &  b 
)
inline

◆ operator==() [3/5]

bool rapidfuzz::operator== ( const Opcodes lhs,
const Opcodes rhs 
)
inline

◆ operator==() [4/5]

bool rapidfuzz::operator== ( EditOp  a,
EditOp  b 
)
inline

◆ operator==() [5/5]

bool rapidfuzz::operator== ( Opcode  a,
Opcode  b 
)
inline

◆ operator>()

template<typename InputIt1 , typename InputIt2 >
bool rapidfuzz::operator> ( const IteratorView< InputIt1 > &  a,
const IteratorView< InputIt2 > &  b 
)
inline

◆ operator>=()

template<typename InputIt1 , typename InputIt2 >
bool rapidfuzz::operator>= ( const IteratorView< InputIt1 > &  a,
const IteratorView< InputIt2 > &  b 
)
inline

◆ swap() [1/2]

void rapidfuzz::swap ( Editops lhs,
Editops rhs 
)
inline

◆ swap() [2/2]

void rapidfuzz::swap ( Opcodes lhs,
Opcodes rhs 
)
inline