722 return m_sentence.empty();
726 return m_sentence.size();
729 std::basic_string<CharT>
join()
const;
739 template<
typename InputIt>
741 int64_t old_word_count = word_count();
742 m_sentence.erase(std::unique(m_sentence.begin(), m_sentence.end()), m_sentence.end());
743 return old_word_count - word_count();
746 template<
typename InputIt>
748 if (m_sentence.empty())
return 0;
751 int64_t result = m_sentence.size() - 1;
752 for (
const auto& word : m_sentence) {
753 result += std::distance(word.first, word.last);
759 template<
typename InputIt>
761 if (m_sentence.empty()) {
762 return std::basic_string<CharT>();
765 auto sentence_iter = m_sentence.begin();
766 std::basic_string<CharT> joined(sentence_iter->first, sentence_iter->last);
767 const std::basic_string<CharT> whitespace{0x20};
769 for (; sentence_iter != m_sentence.end(); ++sentence_iter) {
770 joined.append(whitespace)
771 .append(std::basic_string<CharT>(sentence_iter->first, sentence_iter->last));
777#include <unordered_map>
782 template<
typename InputIt1,
typename InputIt2,
typename InputIt3>
783 struct DecomposedSet {
784 SplittedSentenceView<InputIt1> difference_ab;
785 SplittedSentenceView<InputIt2> difference_ba;
786 SplittedSentenceView<InputIt3> intersection;
787 DecomposedSet(SplittedSentenceView<InputIt1> diff_ab, SplittedSentenceView<InputIt2> diff_ba, SplittedSentenceView<InputIt3> intersect)
788 : difference_ab(std::move(diff_ab)),
789 difference_ba(std::move(diff_ba)),
790 intersection(std::move(intersect)) {}
801 template<
typename InputIt1,
typename InputIt2>
805 return (result >= score_cutoff) ? result : 0;
808 template<
int Max = 1>
809 constexpr double norm_distance(int64_t dist, int64_t lensum,
double score_cutoff = 0) {
810 double max =
static_cast<double>(Max);
811 return result_cutoff((lensum > 0) ? (max - max * dist / lensum) : max, score_cutoff);
814 template<
int Max = 1>
815 static inline int64_t score_cutoff_to_distance(
double score_cutoff, int64_t lensum) {
816 return static_cast<int64_t
>(
817 std::ceil(
static_cast<double>(lensum) * (1.0 - score_cutoff / Max))
822 constexpr bool is_zero(T a, T tolerance = std::numeric_limits<T>::epsilon()) {
823 return std::fabs(a) <= tolerance;
826 template<
typename Sentence,
typename CharT =
char_type<Sentence>,
typename = std::enable_if_t<is_explicitly_convertible<Sentence, std::basic_
string<CharT>>::value>>
827 std::basic_string<CharT>
to_string(Sentence&& str);
829 template<
typename Sentence,
typename CharT =
char_type<Sentence>,
typename = std::enable_if_t<!is_explicitly_convertible<Sentence, std::basic_
string<CharT>>::value && has_data_and_size<Sentence>::value>>
830 std::basic_string<CharT>
to_string(
const Sentence& str);
832 template<
typename CharT>
843 template<
typename CharT>
867 template<
typename InputIterator1,
typename InputIterator2>
868 std::pair<InputIterator1, InputIterator2>
mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2);
870 template<
typename InputIt1,
typename InputIt2>
873 template<
typename InputIt1,
typename InputIt2>
874 int64_t
remove_common_prefix(InputIt1& first1, InputIt1 last1, InputIt2& first2, InputIt2 last2);
876 template<
typename InputIt1,
typename InputIt2>
877 int64_t
remove_common_suffix(InputIt1 first1, InputIt1& last1, InputIt2 first2, InputIt2& last2);
879 template<
typename InputIt,
typename CharT = iterator_type<InputIt>>
883 constexpr auto to_unsigned(T value) ->
typename std::make_unsigned<T>::type {
884 return typename std::make_unsigned<T>::type(value);
888 constexpr auto to_signed(T value) ->
typename std::make_unsigned<T>::type {
889 return typename std::make_signed<T>::type(value);
895 template<
typename T,
typename U>
897 const intmax_t botT = intmax_t(std::numeric_limits<T>::min());
898 const intmax_t botU = intmax_t(std::numeric_limits<U>::min());
899 const uintmax_t topT = uintmax_t(std::numeric_limits<T>::max());
900 const uintmax_t topU = uintmax_t(std::numeric_limits<U>::max());
901 return !((botT > botU && value < static_cast<U>(botT)) || (topT < topU && value >
static_cast<U
>(topT)));
914 template<
typename InputIt>
919 template<
typename InputIt>
920 void insert(InputIt first, InputIt last) {
922 for (; first != last; ++first) {
923 insert_mask(*first, mask);
928 template<
typename CharT>
930 insert_mask(key, 1ull << pos);
933 template<
typename CharT>
934 uint64_t
get(CharT key)
const {
935 if (key >= 0 && key <= 255) {
938 return m_map[lookup((uint64_t) key)].value;
942 template<
typename CharT>
943 uint64_t
get(int64_t block, CharT key)
const {
950 template<
typename CharT>
951 void insert_mask(CharT key, uint64_t mask) {
952 if (key >= 0 && key <= 255) {
955 int64_t i = lookup((uint64_t) key);
957 m_map[i].value |= mask;
965 int64_t lookup(uint64_t key)
const {
966 int64_t i = key % 128;
972 int64_t perturb = key;
974 i = ((i * 5) + perturb + 1) % 128;
985 std::vector<PatternMatchVector>
m_val;
989 template<
typename InputIt>
994 template<
typename CharT>
995 void insert(int64_t block, CharT ch,
int pos) {
996 auto* be = &
m_val[block];
1000 template<
typename InputIt>
1002 int64_t len = std::distance(first, last);
1003 int64_t block_count = (len / 64) +
bool(len % 64);
1004 m_val.resize(block_count);
1006 for (int64_t block = 0; block < block_count; ++block) {
1007 if (std::distance(first + block * 64, last) > 64) {
1008 m_val[block].insert(first + block * 64, first + (block + 1) * 64);
1010 m_val[block].insert(first + block * 64, last);
1015 template<
typename CharT>
1016 uint64_t
get(int64_t block, CharT ch)
const {
1017 auto* be = &
m_val[block];
1022 template<
typename CharT1,
typename ValueType,
int64_t size = sizeof(CharT1)>
1023 struct CharHashTable;
1025 template<
typename CharT1,
typename ValueType>
1027 using UCharT1 =
typename std::make_unsigned<CharT1>::type;
1029 std::array<ValueType, std::numeric_limits<UCharT1>::max() + 1>
m_val;
1038 template<
typename CharT2>
1040 if (!CanTypeFitValue<CharT1>(ch)) {
1048 template<
typename CharT1,
typename ValueType,
int64_t size>
1050 std::unordered_map<CharT1, ValueType>
m_val;
1059 template<
typename CharT2>
1061 if (!CanTypeFitValue<CharT1>(ch)) {
1065 auto search =
m_val.find(CharT1(ch));
1066 if (search ==
m_val.end()) {
1070 return search->second;
1074 template<
typename T>
1079 assert(col < m_cols);
1080 return m_vector[col];
1088 template<
typename T>
1095 assert(col < m_cols);
1096 return m_vector[col];
1104 template<
typename T>
1106 Matrix(uint64_t rows, uint64_t cols, uint64_t val) {
1109 if (rows * cols > 0) {
1110 m_matrix =
new T[rows * cols];
1111 std::fill_n(m_matrix, rows * cols, val);
1122 assert(row < m_rows);
1127 assert(row < m_rows);
1150 template<
typename InputIt1,
typename InputIt2>
1151 DecomposedSet<InputIt1, InputIt2, InputIt1>
1160 for (
const auto& current_a : a.
words()) {
1161 auto element_b = std::find(difference_ba.begin(), difference_ba.end(), current_a);
1163 if (element_b != difference_ba.end()) {
1164 difference_ba.erase(element_b);
1165 intersection.push_back(current_a);
1167 difference_ab.push_back(current_a);
1171 return {difference_ab, difference_ba, intersection};
1174 template<
typename Sentence,
typename CharT,
typename>
1176 return std::basic_string<CharT>(std::forward<Sentence>(str));
1179 template<
typename Sentence,
typename CharT,
typename>
1181 return std::basic_string<CharT>(str.data(), str.size());
1184 template<
typename InputIterator1,
typename InputIterator2>
1185 std::pair<InputIterator1, InputIterator2>
1186 common::mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2) {
1187 while (first1 != last1 && first2 != last2 && *first1 == *first2) {
1191 return std::pair<InputIterator1, InputIterator2>(first1, first2);
1197 template<
typename InputIt1,
typename InputIt2>
1199 int64_t prefix = std::distance(first1,
common::mismatch(first1, last1, first2, last2).first);
1208 template<
typename InputIt1,
typename InputIt2>
1210 auto rfirst1 = std::make_reverse_iterator(last1);
1211 auto rlast1 = std::make_reverse_iterator(first1);
1212 auto rfirst2 = std::make_reverse_iterator(last2);
1213 auto rlast2 = std::make_reverse_iterator(first2);
1216 std::distance(rfirst1,
common::mismatch(rfirst1, rlast1, rfirst2, rlast2).first);
1225 template<
typename InputIt1,
typename InputIt2>
1230 template<
typename,
typename =
void>
1231 struct is_space_dispatch_tag : std::integral_constant<int, 0> {};
1233 template<
typename CharT>
1234 struct is_space_dispatch_tag<CharT, typename std::enable_if<sizeof(CharT) == 1>::type>
1235 : std::integral_constant<int, 1> {};
1240 template<
typename CharT>
1241 bool is_space_impl(
const CharT ch, std::integral_constant<int, 0>) {
1280 template<
typename CharT>
1281 bool is_space_impl(
const CharT ch, std::integral_constant<int, 1>) {
1302 template<
typename CharT>
1303 bool is_space(
const CharT ch) {
1304 return is_space_impl(ch, is_space_dispatch_tag<CharT>{});
1307 template<
typename InputIt,
typename CharT>
1310 auto second = first;
1312 for (; second != last && first != last; first = second + 1) {
1313 second = std::find_if(first, last, is_space<CharT>);
1315 if (first != second) {
1316 splitted.emplace_back(first, second);
1320 std::sort(splitted.begin(), splitted.end());
1354 template<
typename InputIt1,
typename InputIt2>
1355 int64_t hamming_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff = std::numeric_limits<int64_t>::max());
1357 template<
typename Sentence1,
typename Sentence2>
1358 int64_t hamming_distance(
const Sentence1& s1,
const Sentence2& s2, int64_t score_cutoff = std::numeric_limits<int64_t>::max());
1360 template<
typename InputIt1,
typename InputIt2>
1361 int64_t hamming_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff = 0);
1363 template<
typename Sentence1,
typename Sentence2>
1364 int64_t hamming_similarity(
const Sentence1& s1,
const Sentence2& s2, int64_t score_cutoff = 0);
1366 template<
typename InputIt1,
typename InputIt2>
1367 double hamming_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 1.0);
1369 template<
typename Sentence1,
typename Sentence2>
1370 double hamming_normalized_distance(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff = 1.0);
1396 template<
typename InputIt1,
typename InputIt2>
1397 double hamming_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0.0);
1399 template<
typename Sentence1,
typename Sentence2>
1400 double hamming_normalized_similarity(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff = 0.0);
1402 template<
typename CharT1>
1403 struct CachedHamming {
1404 template<
typename Sentence1>
1405 CachedHamming(
const Sentence1& s1_) : s1(common::
to_string(s1_)) {}
1407 template<
typename InputIt1>
1408 CachedHamming(InputIt1 first1, InputIt1 last1) : s1(first1, last1) {}
1410 template<
typename InputIt2>
1411 int64_t distance(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = std::numeric_limits<int64_t>::max())
const;
1413 template<
typename Sentence2>
1414 int64_t distance(
const Sentence2& s2, int64_t score_cutoff = std::numeric_limits<int64_t>::max())
const;
1416 template<
typename InputIt2>
1417 int64_t similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = 0)
const;
1419 template<
typename Sentence2>
1420 int64_t similarity(
const Sentence2& s2, int64_t score_cutoff = 0)
const;
1422 template<
typename InputIt2>
1423 double normalized_distance(InputIt2 first2, InputIt2 last2,
double score_cutoff = 1.0)
const;
1425 template<
typename Sentence2>
1426 double normalized_distance(
const Sentence2& s2,
double score_cutoff = 1.0)
const;
1428 template<
typename InputIt2>
1429 double normalized_similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff = 0.0)
const;
1431 template<
typename Sentence2>
1432 double normalized_similarity(
const Sentence2& s2,
double score_cutoff = 0.0)
const;
1435 std::basic_string<CharT1> s1;
1438#if __cplusplus >= 201703L
1439 template<
typename Sentence1>
1440 CachedHamming(
const Sentence1& s1_) -> CachedHamming<char_type<Sentence1>>;
1442 template<
typename InputIt1>
1443 CachedHamming(InputIt1 first1, InputIt1 last1) -> CachedHamming<iterator_type<InputIt1>>;
1455 template<
typename InputIt1,
typename InputIt2>
1456 int64_t hamming_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
1457 if (std::distance(first1, last1) != std::distance(first2, last2)) {
1458 throw std::invalid_argument(
"Sequences are not the same length.");
1462 for (; first1 != last1; first1++, first2++) {
1463 dist += bool(*first1 != *first2);
1466 return (dist <= score_cutoff) ? dist : score_cutoff + 1;
1469 template<
typename Sentence1,
typename Sentence2>
1470 int64_t hamming_distance(
const Sentence1& s1,
const Sentence2& s2, int64_t score_cutoff) {
1474 template<
typename InputIt1,
typename InputIt2>
1475 int64_t hamming_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
1476 int64_t maximum = std::distance(first1, last1);
1477 int64_t cutoff_distance = maximum - score_cutoff;
1478 int64_t dist = hamming_distance(first1, last1, first2, last2, cutoff_distance);
1479 int64_t sim = maximum - dist;
1480 return (sim >= score_cutoff) ? sim : 0;
1483 template<
typename Sentence1,
typename Sentence2>
1484 int64_t hamming_similarity(
const Sentence1& s1,
const Sentence2& s2, int64_t score_cutoff) {
1488 template<
typename InputIt1,
typename InputIt2>
1489 double hamming_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
1490 int64_t maximum = std::distance(first1, last1);
1491 int64_t cutoff_distance =
static_cast<int64_t
>(std::ceil(maximum * score_cutoff));
1492 int64_t dist = hamming_distance(first1, last1, first2, last2, cutoff_distance);
1493 double norm_dist = (maximum) ? (
double) dist / (double) maximum : 0.0;
1494 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
1497 template<
typename Sentence1,
typename Sentence2>
1498 double hamming_normalized_distance(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff) {
1502 template<
typename InputIt1,
typename InputIt2>
1503 double hamming_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
1504 int64_t maximum = std::distance(first1, last1);
1505 int64_t cutoff_distance = maximum - score_cutoff;
1506 int64_t dist = hamming_distance(first1, last1, first2, last2, cutoff_distance);
1507 int64_t sim = maximum - dist;
1508 return (sim >= score_cutoff) ? sim : 0;
1511 template<
typename Sentence1,
typename Sentence2>
1512 double hamming_normalized_similarity(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff) {
1516 template<
typename CharT1>
1517 template<
typename InputIt2>
1518 int64_t CachedHamming<CharT1>::distance(InputIt2 first2, InputIt2 last2, int64_t score_cutoff)
const {
1522 template<
typename CharT1>
1523 template<
typename Sentence2>
1524 int64_t CachedHamming<CharT1>::distance(
const Sentence2& s2, int64_t score_cutoff)
const {
1525 return hamming_distance(s1, s2, score_cutoff);
1528 template<
typename CharT1>
1529 template<
typename InputIt2>
1530 int64_t CachedHamming<CharT1>::similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff)
const {
1534 template<
typename CharT1>
1535 template<
typename Sentence2>
1536 int64_t CachedHamming<CharT1>::similarity(
const Sentence2& s2, int64_t score_cutoff)
const {
1537 return hamming_similarity(s1, s2, score_cutoff);
1540 template<
typename CharT1>
1541 template<
typename InputIt2>
1542 double CachedHamming<CharT1>::normalized_distance(InputIt2 first2, InputIt2 last2,
double score_cutoff)
const {
1546 template<
typename CharT1>
1547 template<
typename Sentence2>
1548 double CachedHamming<CharT1>::normalized_distance(
const Sentence2& s2,
double score_cutoff)
const {
1549 return hamming_normalized_distance(s1, s2, score_cutoff);
1552 template<
typename CharT1>
1553 template<
typename InputIt2>
1554 double CachedHamming<CharT1>::normalized_similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff)
const {
1558 template<
typename CharT1>
1559 template<
typename Sentence2>
1560 double CachedHamming<CharT1>::normalized_similarity(
const Sentence2& s2,
double score_cutoff)
const {
1561 return hamming_normalized_similarity(s1, s2, score_cutoff);
1574 template<
typename InputIt1,
typename InputIt2>
1575 int64_t indel_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max = std::numeric_limits<int64_t>::max());
1577 template<
typename Sentence1,
typename Sentence2>
1578 int64_t indel_distance(
const Sentence1& s1,
const Sentence2& s2, int64_t max = std::numeric_limits<int64_t>::max());
1580 template<
typename InputIt1,
typename InputIt2>
1581 int64_t indel_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff = 0.0);
1583 template<
typename Sentence1,
typename Sentence2>
1584 int64_t indel_similarity(
const Sentence1& s1,
const Sentence2& s2, int64_t score_cutoff = 0.0);
1586 template<
typename InputIt1,
typename InputIt2>
1587 double indel_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 1.0);
1589 template<
typename Sentence1,
typename Sentence2>
1590 double indel_normalized_distance(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff = 1.0);
1592 template<
typename InputIt1,
typename InputIt2>
1593 double indel_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0.0);
1595 template<
typename Sentence1,
typename Sentence2>
1596 double indel_normalized_similarity(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff = 0.0);
1598 template<
typename InputIt1,
typename InputIt2>
1599 Editops indel_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2);
1601 template<
typename Sentence1,
typename Sentence2>
1602 Editops indel_editops(
const Sentence1& s1,
const Sentence2& s2);
1604 template<
typename CharT1>
1605 struct CachedIndel {
1606 template<
typename Sentence1>
1607 CachedIndel(
const Sentence1& s1_)
1610 template<
typename InputIt1>
1611 CachedIndel(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(first1, last1) {}
1613 template<
typename InputIt2>
1614 int64_t distance(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = std::numeric_limits<int64_t>::max())
const;
1616 template<
typename Sentence2>
1617 int64_t distance(
const Sentence2& s2, int64_t score_cutoff = std::numeric_limits<int64_t>::max())
const;
1619 template<
typename InputIt2>
1620 int64_t similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = 0)
const;
1622 template<
typename Sentence2>
1623 int64_t similarity(
const Sentence2& s2, int64_t score_cutoff = 0)
const;
1625 template<
typename InputIt2>
1626 double normalized_distance(InputIt2 first2, InputIt2 last2,
double score_cutoff = 1.0)
const;
1628 template<
typename Sentence2>
1629 double normalized_distance(
const Sentence2& s2,
double score_cutoff = 1.0)
const;
1631 template<
typename InputIt2>
1632 double normalized_similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff = 0.0)
const;
1634 template<
typename Sentence2>
1635 double normalized_similarity(
const Sentence2& s2,
double score_cutoff = 0.0)
const;
1638 std::basic_string<CharT1> s1;
1639 common::BlockPatternMatchVector PM;
1642#if __cplusplus >= 201703L
1643 template<
typename Sentence1>
1644 CachedIndel(
const Sentence1& s1_) -> CachedIndel<char_type<Sentence1>>;
1646 template<
typename InputIt1>
1647 CachedIndel(InputIt1 first1, InputIt1 last1) -> CachedIndel<iterator_type<InputIt1>>;
1656#if defined(_MSC_VER) && !defined(__clang__)
1663 static inline uint64_t addc64(uint64_t a, uint64_t b, uint64_t carryin, uint64_t* carryout) {
1666 *carryout = a < carryin;
1672 template<
typename T,
typename U>
1673 T ceil_div(T a, U divisor) {
1674 return a / divisor +
static_cast<T
>(a % divisor != 0);
1677 static inline int64_t popcount64(uint64_t x) {
1678 const uint64_t m1 = 0x5555555555555555;
1679 const uint64_t m2 = 0x3333333333333333;
1680 const uint64_t m4 = 0x0f0f0f0f0f0f0f0f;
1681 const uint64_t h01 = 0x0101010101010101;
1684 x = (x & m2) + ((x >> 2) & m2);
1685 x = (x + (x >> 4)) & m4;
1686 return static_cast<int64_t
>((x * h01) >> 56);
1692 template<
typename T>
1700 template<
typename T>
1709 template<
typename T>
1714#if defined(_MSC_VER) && !defined(__clang__)
1715 static inline int tzcnt(uint32_t x) {
1716 unsigned long trailing_zero = 0;
1717 _BitScanForward(&trailing_zero, x);
1718 return trailing_zero;
1721#if defined(_M_ARM) || defined(_M_X64)
1722 static inline int tzcnt(uint64_t x) {
1723 unsigned long trailing_zero = 0;
1724 _BitScanForward64(&trailing_zero, x);
1725 return trailing_zero;
1728 static inline int tzcnt(uint64_t x) {
1729 uint32_t msh = (uint32_t) (x >> 32);
1730 uint32_t lsh = (uint32_t) (x & 0xFFFFFFFF);
1734 return 32 + tzcnt(msh);
1739 static inline int tzcnt(uint32_t x) {
1740 return __builtin_ctz(x);
1743 static inline int tzcnt(uint64_t x) {
1744 return __builtin_ctzll(x);
1778 static constexpr uint8_t indel_mbleven2018_matrix[14][7] = {
1793 {0x96, 0x66, 0x5A, 0x99, 0x69, 0xA5},
1795 {0x65, 0x56, 0x95, 0x59},
1800 template<
typename InputIt1,
typename InputIt2>
1801 int64_t indel_mbleven2018(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
1802 int64_t len1 = std::distance(first1, last1);
1803 int64_t len2 = std::distance(first2, last2);
1806 return indel_mbleven2018(first2, last2, first1, last1, max);
1809 int64_t len_diff = len1 - len2;
1810 auto possible_ops = indel_mbleven2018_matrix[(max + max * max) / 2 + len_diff - 1];
1811 int64_t dist = max + 1;
1813 for (
int pos = 0; possible_ops[pos] != 0; ++pos) {
1814 uint8_t ops = possible_ops[pos];
1817 int64_t cur_dist = 0;
1819 while (s1_pos < len1 && s2_pos < len2) {
1820 if (first1[s1_pos] != first2[s2_pos]) {
1824 if (ops & 1) s1_pos++;
1825 if (ops & 2) s2_pos++;
1833 cur_dist += (len1 - s1_pos) + (len2 - s2_pos);
1834 dist = std::min(dist, cur_dist);
1837 return (dist <= max) ? dist : max + 1;
1840 template<
int64_t N,
typename PMV,
typename InputIt1,
typename InputIt2>
1841 static inline int64_t longest_common_subsequence_unroll(
const PMV& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
1842 int64_t len1 = std::distance(first1, last1);
1843 int64_t len2 = std::distance(first2, last2);
1846 for (int64_t i = 0; i < N; ++i) {
1850 for (; first2 != last2; ++first2) {
1852 uint64_t Matches[N];
1855 for (int64_t i = 0; i < N; ++i) {
1856 Matches[i] = block.get(i);
1857 u[i] = S[i] & Matches[i];
1858 x[i] = addc64(S[i], u[i], carry, &carry);
1859 S[i] = x[i] | (S[i] - u[i]);
1864 for (int64_t i = 0; i < N; ++i) {
1865 res += popcount64(~S[i]);
1868 int64_t dist = len1 + len2 - 2 * res;
1869 return (dist <= max) ? dist : max + 1;
1872 template<
typename InputIt1,
typename InputIt2>
1873 static inline int64_t
1874 longest_common_subsequence_blockwise(
const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
1875 int64_t len1 = std::distance(first1, last1);
1876 int64_t len2 = std::distance(first2, last2);
1878 int64_t words = block.m_val.size();
1879 std::vector<uint64_t> S(words, ~0x0ull);
1881 for (; first2 != last2; ++first2) {
1883 for (int64_t word = 0; word < words; ++word) {
1884 const uint64_t Matches = block.get(word, *first2);
1885 uint64_t Stemp = S[word];
1887 uint64_t u = Stemp & Matches;
1889 uint64_t x = addc64(Stemp, u, carry, &carry);
1890 S[word] = x | (Stemp - u);
1895 for (uint64_t Stemp : S) {
1896 res += popcount64(~Stemp);
1899 int64_t dist = len1 + len2 - 2 * res;
1900 return (dist <= max) ? dist : max + 1;
1903 template<
typename InputIt1,
typename InputIt2>
1904 int64_t longest_common_subsequence(
const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
1905 int64_t len1 = std::distance(first1, last1);
1906 int64_t len2 = std::distance(first2, last2);
1907 int64_t nr = ceil_div(len1, 64);
1910 return (len2 <= max) ? len2 : max + 1;
1912 return longest_common_subsequence_unroll<1>(block, first1, last1, first2, last2, max);
1914 return longest_common_subsequence_unroll<2>(block, first1, last1, first2, last2, max);
1916 return longest_common_subsequence_unroll<3>(block, first1, last1, first2, last2, max);
1918 return longest_common_subsequence_unroll<4>(block, first1, last1, first2, last2, max);
1920 return longest_common_subsequence_unroll<5>(block, first1, last1, first2, last2, max);
1922 return longest_common_subsequence_unroll<6>(block, first1, last1, first2, last2, max);
1924 return longest_common_subsequence_unroll<7>(block, first1, last1, first2, last2, max);
1926 return longest_common_subsequence_unroll<8>(block, first1, last1, first2, last2, max);
1928 return longest_common_subsequence_blockwise(block, first1, last1, first2, last2, max);
1932 template<
typename InputIt1,
typename InputIt2>
1933 int64_t longest_common_subsequence(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
1934 int64_t len1 = std::distance(first1, last1);
1935 int64_t nr = ceil_div(len1, 64);
1938 auto block = common::PatternMatchVector(first1, last1);
1939 return longest_common_subsequence_unroll<1>(block, first1, last1, first2, last2, max);
1942 auto block = common::BlockPatternMatchVector(first1, last1);
1943 return longest_common_subsequence_unroll<2>(block, first1, last1, first2, last2, max);
1946 auto block = common::BlockPatternMatchVector(first1, last1);
1947 return longest_common_subsequence_unroll<3>(block, first1, last1, first2, last2, max);
1950 auto block = common::BlockPatternMatchVector(first1, last1);
1951 return longest_common_subsequence_unroll<4>(block, first1, last1, first2, last2, max);
1954 auto block = common::BlockPatternMatchVector(first1, last1);
1955 return longest_common_subsequence_unroll<5>(block, first1, last1, first2, last2, max);
1958 auto block = common::BlockPatternMatchVector(first1, last1);
1959 return longest_common_subsequence_unroll<6>(block, first1, last1, first2, last2, max);
1962 auto block = common::BlockPatternMatchVector(first1, last1);
1963 return longest_common_subsequence_unroll<7>(block, first1, last1, first2, last2, max);
1966 auto block = common::BlockPatternMatchVector(first1, last1);
1967 return longest_common_subsequence_unroll<8>(block, first1, last1, first2, last2, max);
1970 auto block = common::BlockPatternMatchVector(first1, last1);
1971 return longest_common_subsequence_blockwise(block, first1, last1, first2, last2, max);
1976 template<
typename InputIt1,
typename InputIt2>
1977 int64_t indel_distance(
const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
1978 int64_t len1 = std::distance(first1, last1);
1979 int64_t len2 = std::distance(first2, last2);
1982 if (max == 0 || (max == 1 && len1 == len2)) {
1983 return !std::equal(first1, last1, first2, last2);
1986 if (max < std::abs(len1 - len2)) {
1992 return longest_common_subsequence(block, first1, last1, first2, last2, max);
1997 len1 = std::distance(first1, last1);
1998 len2 = std::distance(first2, last2);
1999 if (!len1 || !len2) {
2003 return indel_mbleven2018(first1, last1, first2, last2, max);
2006 template<
typename InputIt1,
typename InputIt2>
2007 int64_t indel_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
2008 int64_t len1 = std::distance(first1, last1);
2009 int64_t len2 = std::distance(first2, last2);
2013 return indel_distance(first2, last2, first1, last1, max);
2017 if (max == 0 || (max == 1 && len1 == len2)) {
2018 return !std::equal(first1, last1, first2, last2);
2021 if (max < std::abs(len1 - len2)) {
2027 len1 = std::distance(first1, last1);
2028 len2 = std::distance(first2, last2);
2029 if (!len1 || !len2) {
2034 return indel_mbleven2018(first1, last1, first2, last2, max);
2037 return longest_common_subsequence(first1, last1, first2, last2, max);
2040 template<
typename InputIt1,
typename InputIt2>
2041 double indel_normalized_distance(
const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
2042 int64_t len1 = std::distance(first1, last1);
2043 int64_t len2 = std::distance(first2, last2);
2044 int64_t maximum = len1 + len2;
2045 int64_t cutoff_distance =
static_cast<int64_t
>(std::ceil(maximum * score_cutoff));
2046 int64_t dist = indel_distance(block, first1, last1, first2, last2, cutoff_distance);
2047 double norm_dist = (maximum) ? (
double) dist / (double) maximum : 0.0;
2048 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
2051 template<
typename InputIt1,
typename InputIt2>
2052 double indel_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
2053 int64_t len1 = std::distance(first1, last1);
2054 int64_t len2 = std::distance(first2, last2);
2055 int64_t maximum = len1 + len2;
2056 int64_t cutoff_distance =
static_cast<int64_t
>(std::ceil(maximum * score_cutoff));
2057 int64_t dist = indel_distance(first1, last1, first2, last2, cutoff_distance);
2058 double norm_dist = (maximum) ? (
double) dist / (double) maximum : 0.0;
2059 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
2062 template<
typename InputIt1,
typename InputIt2>
2063 int64_t indel_similarity(
const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
2064 int64_t len1 = std::distance(first1, last1);
2065 int64_t len2 = std::distance(first2, last2);
2066 int64_t maximum = len1 + len2;
2067 int64_t cutoff_distance = maximum - score_cutoff;
2068 int64_t dist = indel_distance(block, first1, last1, first2, last2, cutoff_distance);
2069 int64_t sim = maximum - dist;
2070 return (sim >= score_cutoff) ? sim : 0;
2073 template<
typename InputIt1,
typename InputIt2>
2074 int64_t indel_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
2075 int64_t len1 = std::distance(first1, last1);
2076 int64_t len2 = std::distance(first2, last2);
2077 int64_t maximum = len1 + len2;
2078 int64_t cutoff_distance = maximum - score_cutoff;
2079 int64_t dist = indel_distance(first1, last1, first2, last2, cutoff_distance);
2080 int64_t sim = maximum - dist;
2081 return (sim >= score_cutoff) ? sim : 0;
2084 template<
typename InputIt1,
typename InputIt2>
2085 double indel_normalized_similarity(
const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
2087 indel_normalized_distance(block, first1, last1, first2, last2, 1.0 - score_cutoff);
2088 double norm_sim = 1.0 - norm_dist;
2089 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
2092 template<
typename InputIt1,
typename InputIt2>
2093 double indel_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
2094 double norm_dist = indel_normalized_distance(first1, last1, first2, last2, 1.0 - score_cutoff);
2095 double norm_sim = 1.0 - norm_dist;
2096 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
2099 struct LLCSBitMatrix {
2100 LLCSBitMatrix(uint64_t rows, uint64_t cols) : S(rows, cols, (uint64_t) -1), dist(0) {}
2102 common::Matrix<uint64_t> S;
2110 template<
typename InputIt1,
typename InputIt2>
2111 Editops recover_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
const LLCSBitMatrix& matrix, StringAffix affix) {
2112 int64_t len1 = std::distance(first1, last1);
2113 int64_t len2 = std::distance(first2, last2);
2114 int64_t dist = matrix.dist;
2115 Editops editops(dist);
2116 editops.set_src_len(len1 + affix.prefix_len + affix.suffix_len);
2117 editops.set_dest_len(len2 + affix.prefix_len + affix.suffix_len);
2126 while (row && col) {
2127 uint64_t col_pos = col - 1;
2128 uint64_t col_word = col_pos / 64;
2129 col_pos = col_pos % 64;
2130 uint64_t mask = 1ull << col_pos;
2133 if (matrix.S[row - 1][col_word] & mask) {
2137 editops[dist].src_pos = row + affix.prefix_len;
2138 editops[dist].dest_pos = col + affix.prefix_len;
2143 if (row && (~matrix.S[row - 1][col_word]) & mask) {
2146 editops[dist].src_pos = row + affix.prefix_len;
2147 editops[dist].dest_pos = col + affix.prefix_len;
2152 assert(first1[row] == first2[col]);
2161 editops[dist].src_pos = row + affix.prefix_len;
2162 editops[dist].dest_pos = col + affix.prefix_len;
2169 editops[dist].src_pos = row + affix.prefix_len;
2170 editops[dist].dest_pos = col + affix.prefix_len;
2176 template<
int64_t N,
typename PMV,
typename InputIt1,
typename InputIt2>
2177 LLCSBitMatrix llcs_matrix_unroll(
const PMV& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
2178 int64_t len1 = std::distance(first1, last1);
2179 int64_t len2 = std::distance(first2, last2);
2181 for (int64_t i = 0; i < N; ++i) {
2185 LLCSBitMatrix matrix(len2, N);
2187 for (int64_t i = 0; i < len2; ++i) {
2189 uint64_t Matches[N];
2192 for (int64_t word = 0; word < N; ++word) {
2193 Matches[word] = block.get(word);
2194 u[word] = S[word] & Matches[word];
2195 x[word] = addc64(S[word], u[word], carry, &carry);
2196 S[word] = matrix.S[i][word] = x[word] | (S[word] - u[word]);
2201 for (int64_t i = 0; i < N; ++i) {
2202 res += popcount64(~S[i]);
2205 matrix.dist = len1 + len2 - 2 * res;
2210 template<
typename InputIt1,
typename InputIt2>
2211 LLCSBitMatrix llcs_matrix_blockwise(
const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
2212 int64_t len1 = std::distance(first1, last1);
2213 int64_t len2 = std::distance(first2, last2);
2214 int64_t words = block.m_val.size();
2217 std::vector<uint64_t> S(words, ~0x0ull);
2218 LLCSBitMatrix matrix(len2, words);
2220 for (int64_t i = 0; i < len2; ++i) {
2222 for (int64_t word = 0; word < words; ++word) {
2223 const uint64_t Matches = block.get(word, first2[i]);
2224 uint64_t Stemp = S[word];
2226 uint64_t u = Stemp & Matches;
2228 uint64_t x = addc64(Stemp, u, carry, &carry);
2229 S[word] = matrix.S[i][word] = x | (Stemp - u);
2234 for (uint64_t Stemp : S) {
2235 res += popcount64(~Stemp);
2238 matrix.dist = len1 + len2 - 2 * res;
2243 template<
typename InputIt1,
typename InputIt2>
2244 LLCSBitMatrix llcs_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
2245 int64_t len1 = std::distance(first1, last1);
2246 int64_t len2 = std::distance(first2, last2);
2247 if (!len1 || !len2) {
2248 LLCSBitMatrix matrix(0, 0);
2249 matrix.dist = len1 + len2;
2251 }
else if (len2 <= 64) {
2252 common::PatternMatchVector block(first2, last2);
2253 return llcs_matrix_unroll<1>(block, first2, last2, first1, last1);
2255 common::BlockPatternMatchVector block(first2, last2);
2256 switch (block.m_val.size()) {
2258 return llcs_matrix_unroll<1>(block, first2, last2, first1, last1);
2260 return llcs_matrix_unroll<2>(block, first2, last2, first1, last1);
2262 return llcs_matrix_unroll<3>(block, first2, last2, first1, last1);
2264 return llcs_matrix_unroll<4>(block, first2, last2, first1, last1);
2266 return llcs_matrix_unroll<5>(block, first2, last2, first1, last1);
2268 return llcs_matrix_unroll<6>(block, first2, last2, first1, last1);
2270 return llcs_matrix_unroll<7>(block, first2, last2, first1, last1);
2272 return llcs_matrix_unroll<8>(block, first2, last2, first1, last1);
2274 return llcs_matrix_blockwise(block, first2, last2, first1, last1);
2281 template<
typename InputIt1,
typename InputIt2>
2282 int64_t indel_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
2283 return detail::indel_distance(first1, last1, first2, last2, score_cutoff);
2286 template<
typename Sentence1,
typename Sentence2>
2287 int64_t indel_distance(
const Sentence1& s1,
const Sentence2& s2, int64_t max) {
2291 template<
typename InputIt1,
typename InputIt2>
2292 double indel_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
2293 int64_t maximum = std::distance(first1, last1) + std::distance(first2, last2);
2294 int64_t cutoff_distance =
static_cast<int64_t
>(std::ceil(maximum * score_cutoff));
2295 int64_t dist = indel_distance(first1, last1, first2, last2, cutoff_distance);
2296 double norm_dist = (maximum) ? (
double) dist / (double) maximum : 0.0;
2297 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
2300 template<
typename Sentence1,
typename Sentence2>
2301 double indel_normalized_distance(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff) {
2305 template<
typename InputIt1,
typename InputIt2>
2306 int64_t indel_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
2307 int64_t maximum = std::distance(first1, last1) + std::distance(first2, last2);
2308 int64_t cutoff_distance = maximum - score_cutoff;
2309 int64_t dist = indel_distance(first1, last1, first2, last2, cutoff_distance);
2310 int64_t sim = maximum - dist;
2311 return (sim >= score_cutoff) ? sim : 0;
2314 template<
typename Sentence1,
typename Sentence2>
2315 int64_t indel_similarity(
const Sentence1& s1,
const Sentence2& s2, int64_t score_cutoff) {
2319 template<
typename InputIt1,
typename InputIt2>
2320 double indel_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
2321 double norm_dist = indel_normalized_distance(first1, last1, first2, last2, 1.0 - score_cutoff);
2322 double norm_sim = 1.0 - norm_dist;
2323 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
2326 template<
typename Sentence1,
typename Sentence2>
2327 double indel_normalized_similarity(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff) {
2331 template<
typename InputIt1,
typename InputIt2>
2332 Editops indel_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
2336 return detail::recover_alignment(first1, last1, first2, last2, detail::llcs_matrix(first1, last1, first2, last2), affix);
2339 template<
typename Sentence1,
typename Sentence2>
2340 Editops indel_editops(
const Sentence1& s1,
const Sentence2& s2) {
2344 template<
typename CharT1>
2345 template<
typename InputIt2>
2346 int64_t CachedIndel<CharT1>::distance(InputIt2 first2, InputIt2 last2, int64_t score_cutoff)
const {
2350 template<
typename CharT1>
2351 template<
typename Sentence2>
2352 int64_t CachedIndel<CharT1>::distance(
const Sentence2& s2, int64_t score_cutoff)
const {
2356 template<
typename CharT1>
2357 template<
typename InputIt2>
2358 double CachedIndel<CharT1>::normalized_distance(InputIt2 first2, InputIt2 last2,
double score_cutoff)
const {
2359 int64_t maximum = s1.size() + std::distance(first2, last2);
2360 int64_t cutoff_distance =
static_cast<int64_t
>(std::ceil(maximum * score_cutoff));
2361 int64_t dist = distance(first2, last2, cutoff_distance);
2362 double norm_dist = (maximum) ? (
double) dist / (double) maximum : 0.0;
2363 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
2366 template<
typename CharT1>
2367 template<
typename Sentence2>
2368 double CachedIndel<CharT1>::normalized_distance(
const Sentence2& s2,
double score_cutoff)
const {
2372 template<
typename CharT1>
2373 template<
typename InputIt2>
2374 int64_t CachedIndel<CharT1>::similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff)
const {
2375 int64_t maximum = s1.size() + std::distance(first2, last2);
2376 int64_t cutoff_distance = maximum - score_cutoff;
2377 int64_t dist = distance(first2, last2, cutoff_distance);
2378 int64_t sim = maximum - dist;
2379 return (sim >= score_cutoff) ? sim : 0;
2382 template<
typename CharT1>
2383 template<
typename Sentence2>
2384 int64_t CachedIndel<CharT1>::similarity(
const Sentence2& s2, int64_t score_cutoff)
const {
2388 template<
typename CharT1>
2389 template<
typename InputIt2>
2390 double CachedIndel<CharT1>::normalized_similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff)
const {
2391 double norm_dist = normalized_distance(first2, last2, 1.0 - score_cutoff);
2392 double norm_sim = 1.0 - norm_dist;
2393 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
2396 template<
typename CharT1>
2397 template<
typename Sentence2>
2398 double CachedIndel<CharT1>::normalized_similarity(
const Sentence2& s2,
double score_cutoff)
const {
2537 template<
typename InputIt1,
typename InputIt2>
2538 int64_t
levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights = {1, 1, 1}, int64_t max = std::numeric_limits<int64_t>::max());
2540 template<
typename Sentence1,
typename Sentence2>
2541 int64_t
levenshtein_distance(
const Sentence1& s1,
const Sentence2& s2, LevenshteinWeightTable weights = {1, 1, 1}, int64_t max = std::numeric_limits<int64_t>::max());
2543 template<
typename InputIt1,
typename InputIt2>
2544 int64_t
levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights = {1, 1, 1}, int64_t score_cutoff = 0.0);
2546 template<
typename Sentence1,
typename Sentence2>
2547 int64_t
levenshtein_similarity(
const Sentence1& s1,
const Sentence2& s2, LevenshteinWeightTable weights = {1, 1, 1}, int64_t score_cutoff = 0.0);
2549 template<
typename InputIt1,
typename InputIt2>
2550 double levenshtein_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights = {1, 1, 1},
double score_cutoff = 1.0);
2552 template<
typename Sentence1,
typename Sentence2>
2553 double levenshtein_normalized_distance(
const Sentence1& s1,
const Sentence2& s2, LevenshteinWeightTable weights = {1, 1, 1},
double score_cutoff = 1.0);
2614 template<
typename InputIt1,
typename InputIt2>
2615 double levenshtein_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights = {1, 1, 1},
double score_cutoff = 0.0);
2617 template<
typename Sentence1,
typename Sentence2>
2635 template<
typename InputIt1,
typename InputIt2>
2636 Editops
levenshtein_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2);
2638 template<
typename Sentence1,
typename Sentence2>
2641 template<
typename CharT1>
2643 template<
typename Sentence1>
2646 PM(common::to_begin(s1), common::to_end(s1)),
2647 weights(aWeights) {}
2649 template<
typename InputIt1>
2651 : s1(first1, last1), PM(first1, last1), weights(aWeights) {}
2653 template<
typename InputIt2>
2654 int64_t
distance(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = std::numeric_limits<int64_t>::max())
const;
2656 template<
typename Sentence2>
2657 int64_t
distance(
const Sentence2& s2, int64_t score_cutoff = std::numeric_limits<int64_t>::max())
const;
2659 template<
typename InputIt2>
2660 int64_t
similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = 0)
const;
2662 template<
typename Sentence2>
2663 int64_t
similarity(
const Sentence2& s2, int64_t score_cutoff = 0)
const;
2665 template<
typename InputIt2>
2666 double normalized_distance(InputIt2 first2, InputIt2 last2,
double score_cutoff = 1.0)
const;
2668 template<
typename Sentence2>
2671 template<
typename InputIt2>
2674 template<
typename Sentence2>
2678 std::basic_string<CharT1> s1;
2679 common::BlockPatternMatchVector PM;
2680 LevenshteinWeightTable weights;
2683#if __cplusplus >= 201703L
2684 template<
typename Sentence1>
2685 CachedLevenshtein(
const Sentence1& s1_, LevenshteinWeightTable aWeights)
2686 -> CachedLevenshtein<char_type<Sentence1>>;
2688 template<
typename InputIt1>
2689 CachedLevenshtein(InputIt1 first1, InputIt1 last1, LevenshteinWeightTable aWeights)
2690 -> CachedLevenshtein<iterator_type<InputIt1>>;
2700 template<
typename InputIt1,
typename InputIt2>
2701 int64_t generalized_levenshtein_wagner_fischer(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights, int64_t max) {
2702 int64_t len1 = std::distance(first1, last1);
2703 int64_t cache_size = len1 + 1;
2704 std::vector<int64_t> cache(cache_size);
2707 for (int64_t i = 1; i < cache_size; ++i) {
2708 cache[i] = cache[i - 1] + (int64_t) weights.delete_cost;
2711 for (; first2 != last2; ++first2) {
2712 auto cache_iter = cache.begin();
2713 int64_t temp = *cache_iter;
2714 *cache_iter += (int64_t) weights.insert_cost;
2716 auto _first1 = first1;
2717 for (; _first1 != last1; ++_first1) {
2718 if (*_first1 != *first2) {
2719 temp = std::min({*cache_iter + (int64_t) weights.delete_cost, *(cache_iter + 1) + (int64_t) weights.insert_cost, temp + (int64_t) weights.replace_cost});
2722 std::swap(*cache_iter, temp);
2726 int64_t dist = cache.back();
2727 return (dist <= max) ? dist : max + 1;
2734 template<
typename InputIt1,
typename InputIt2>
2735 int64_t levenshtein_maximum(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights) {
2736 int64_t len1 = std::distance(first1, last1);
2737 int64_t len2 = std::distance(first2, last2);
2739 int64_t max_dist = len1 * (int64_t) weights.delete_cost + len2 * (int64_t) weights.insert_cost;
2742 max_dist = std::min(max_dist, len2 * (int64_t) weights.replace_cost + (len1 - len2) * (int64_t) weights.delete_cost);
2744 max_dist = std::min(max_dist, len1 * (int64_t) weights.replace_cost + (len2 - len1) * (int64_t) weights.insert_cost);
2754 template<
typename InputIt1,
typename InputIt2>
2755 int64_t levenshtein_min_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights) {
2756 int64_t len1 = std::distance(first1, last1);
2757 int64_t len2 = std::distance(first2, last2);
2758 return std::max((len1 - len2) * (int64_t) weights.delete_cost, (len2 - len1) * (int64_t) weights.insert_cost);
2761 template<
typename InputIt1,
typename InputIt2>
2762 int64_t generalized_levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights, int64_t max) {
2763 int64_t min_edits = levenshtein_min_distance(first1, last1, first2, last2, weights);
2764 if (min_edits > max) {
2771 return generalized_levenshtein_wagner_fischer(first1, last1, first2, last2, weights, max);
2789 static constexpr uint8_t levenshtein_mbleven2018_matrix[9][8] = {
2798 {0x3F, 0x27, 0x2D, 0x39, 0x36, 0x1E, 0x1B},
2799 {0x3D, 0x37, 0x1F, 0x25, 0x19, 0x16},
2804 template<
typename InputIt1,
typename InputIt2>
2805 int64_t levenshtein_mbleven2018(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
2806 int64_t len1 = std::distance(first1, last1);
2807 int64_t len2 = std::distance(first2, last2);
2810 return levenshtein_mbleven2018(first2, last2, first1, last1, max);
2813 int64_t len_diff = len1 - len2;
2814 auto possible_ops = levenshtein_mbleven2018_matrix[(max + max * max) / 2 + len_diff - 1];
2815 int64_t dist = max + 1;
2817 for (
int pos = 0; possible_ops[pos] != 0; ++pos) {
2818 uint8_t ops = possible_ops[pos];
2821 int64_t cur_dist = 0;
2822 while (s1_pos < len1 && s2_pos < len2) {
2823 if (first1[s1_pos] != first2[s2_pos]) {
2826 if (ops & 1) s1_pos++;
2827 if (ops & 2) s2_pos++;
2834 cur_dist += (len1 - s1_pos) + (len2 - s2_pos);
2835 dist = std::min(dist, cur_dist);
2838 return (dist <= max) ? dist : max + 1;
2859 template<
typename InputIt1,
typename InputIt2>
2860 int64_t levenshtein_hyrroe2003(
const common::PatternMatchVector& PM, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
2861 int64_t len1 = std::distance(first1, last1);
2864 uint64_t VP = (uint64_t) -1;
2866 int64_t currDist = len1;
2869 uint64_t mask = (uint64_t) 1 << (len1 - 1);
2872 for (; first2 != last2; ++first2) {
2874 uint64_t PM_j = PM.get(*first2);
2876 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
2879 uint64_t HP = VN | ~(D0 | VP);
2880 uint64_t HN = D0 & VP;
2883 currDist += bool(HP & mask);
2884 currDist -= bool(HN & mask);
2890 VP = HN | ~(D0 | HP);
2894 return (currDist <= max) ? currDist : max + 1;
2897 template<
typename InputIt1,
typename InputIt2>
2898 int64_t levenshtein_hyrroe2003_small_band(
const common::BlockPatternMatchVector& PM, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
2899 int64_t len1 = std::distance(first1, last1);
2900 int64_t len2 = std::distance(first2, last2);
2903 uint64_t VP = (uint64_t) -1;
2906 int64_t currDist = len1;
2909 uint64_t mask = 1ull << 63;
2911 const int64_t words = PM.m_val.size();
2914 for (int64_t i = 0; i < len2; ++i) {
2916 int64_t word = i / 64;
2917 int64_t word_pos = i % 64;
2919 uint64_t PM_j = PM.get(word, first2[i]) >> word_pos;
2921 if (word + 1 < words && word_pos != 0) {
2922 PM_j |= PM.get(word + 1, first2[i]) << (64 - word_pos);
2927 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
2930 uint64_t HP = VN | ~(D0 | VP);
2931 uint64_t HN = D0 & VP;
2934 currDist += bool(HP & mask);
2935 currDist -= bool(HN & mask);
2938 VP = HN | ~((D0 >> 1) | HP);
2939 VN = (D0 >> 1) & HP;
2942 return (currDist <= max) ? currDist : max + 1;
2945 template<
typename InputIt1,
typename InputIt2>
2946 int64_t levenshtein_myers1999_block(
const common::BlockPatternMatchVector& PM, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
2951 Vectors() : VP(~0x0ull), VN(0) {}
2954 int64_t len1 = std::distance(first1, last1);
2955 int64_t len2 = std::distance(first2, last2);
2956 int64_t words = PM.m_val.size();
2957 int64_t currDist = len1;
2960 max = std::min(max, std::max(len1, len2));
2963 int64_t full_band = std::min(len1, 2 * max + 1);
2965 if (full_band <= 64) {
2966 return levenshtein_hyrroe2003_small_band(PM, first1, last1, first2, last2, max);
2969 std::vector<Vectors> vecs(words);
2970 uint64_t Last = (uint64_t) 1 << ((len1 - 1) % 64);
2973 for (int64_t i = 0; i < len2; i++) {
2974 uint64_t HP_carry = 1;
2975 uint64_t HN_carry = 0;
2977 for (int64_t word = 0; word < words - 1; word++) {
2979 uint64_t PM_j = PM.get(word, first2[i]);
2980 uint64_t VN = vecs[word].VN;
2981 uint64_t VP = vecs[word].VP;
2983 uint64_t X = PM_j | HN_carry;
2984 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
2987 uint64_t HP = VN | ~(D0 | VP);
2988 uint64_t HN = D0 & VP;
2994 uint64_t HP_carry_temp = HP_carry;
2995 HP_carry = HP >> 63;
2996 HP = (HP << 1) | HP_carry_temp;
2998 uint64_t HN_carry_temp = HN_carry;
2999 HN_carry = HN >> 63;
3000 HN = (HN << 1) | HN_carry_temp;
3002 vecs[word].VP = HN | ~(D0 | HP);
3003 vecs[word].VN = HP & D0;
3008 uint64_t PM_j = PM.get(words - 1, first2[i]);
3009 uint64_t VN = vecs[words - 1].VN;
3010 uint64_t VP = vecs[words - 1].VP;
3012 uint64_t X = PM_j | HN_carry;
3013 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
3016 uint64_t HP = VN | ~(D0 | VP);
3017 uint64_t HN = D0 & VP;
3020 currDist += bool(HP & Last);
3021 currDist -= bool(HN & Last);
3024 HP = (HP << 1) | HP_carry;
3025 HN = (HN << 1) | HN_carry;
3027 vecs[words - 1].VP = HN | ~(D0 | HP);
3028 vecs[words - 1].VN = HP & D0;
3032 return (currDist <= max) ? currDist : max + 1;
3035 template<
typename InputIt1,
typename InputIt2>
3036 int64_t uniform_levenshtein_distance(
const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
3037 int64_t len1 = std::distance(first1, last1);
3038 int64_t len2 = std::distance(first2, last2);
3042 return !std::equal(first1, last1, first2, last2);
3045 if (max < std::abs(len1 - len2)) {
3051 return (len2 <= max) ? len2 : max + 1;
3059 return levenshtein_hyrroe2003(block.m_val[0], first1, last1, first2, last2, max);
3061 return levenshtein_myers1999_block(block, first1, last1, first2, last2, max);
3067 len1 = std::distance(first1, last1);
3068 len2 = std::distance(first2, last2);
3069 if (!len1 || !len2) {
3073 return levenshtein_mbleven2018(first1, last1, first2, last2, max);
3076 template<
typename InputIt1,
typename InputIt2>
3077 int64_t uniform_levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
3078 int64_t len1 = std::distance(first1, last1);
3079 int64_t len2 = std::distance(first2, last2);
3083 return uniform_levenshtein_distance(first2, last2, first1, last1, max);
3088 return !std::equal(first1, last1, first2, last2);
3092 if (max < (len1 - len2)) {
3098 len1 = std::distance(first1, last1);
3099 len2 = std::distance(first2, last2);
3100 if (!len1 || !len2) {
3105 return levenshtein_mbleven2018(first1, last1, first2, last2, max);
3110 return levenshtein_hyrroe2003(common::PatternMatchVector(first1, last1), first1, last1, first2, last2, max);
3112 return levenshtein_myers1999_block(common::BlockPatternMatchVector(first1, last1), first1, last1, first2, last2, max);
3116 template<
typename InputIt1,
typename InputIt2>
3117 double uniform_levenshtein_normalized_distance(
const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
3118 int64_t len1 = std::distance(first1, last1);
3119 int64_t len2 = std::distance(first2, last2);
3120 int64_t maximum = std::max(len1, len2);
3121 int64_t cutoff_distance =
static_cast<int64_t
>(std::ceil(maximum * score_cutoff));
3123 uniform_levenshtein_distance(block, first1, last1, first2, last2, cutoff_distance);
3124 double norm_dist = (maximum) ? (
double) dist / (double) maximum : 0.0;
3125 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
3128 template<
typename InputIt1,
typename InputIt2>
3129 int64_t uniform_levenshtein_similarity(
const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
3130 int64_t len1 = std::distance(first1, last1);
3131 int64_t len2 = std::distance(first2, last2);
3132 int64_t maximum = std::max(len1, len2);
3133 int64_t cutoff_distance = maximum - score_cutoff;
3135 uniform_levenshtein_distance(block, first1, last1, first2, last2, cutoff_distance);
3136 int64_t sim = maximum - dist;
3137 return (sim >= score_cutoff) ? sim : 0;
3140 template<
typename InputIt1,
typename InputIt2>
3141 double uniform_levenshtein_normalized_similarity(
const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
3142 double norm_dist = uniform_levenshtein_normalized_distance(block, first1, last1, first2, last2, 1.0 - score_cutoff);
3143 double norm_sim = 1.0 - norm_dist;
3144 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
3147 struct LevenshteinBitMatrix {
3148 LevenshteinBitMatrix(uint64_t rows, uint64_t cols)
3149 : VP(rows, cols, (uint64_t) -1), VN(rows, cols, 0), dist(0) {}
3151 common::Matrix<uint64_t> VP;
3152 common::Matrix<uint64_t> VN;
3160 template<
typename InputIt1,
typename InputIt2>
3161 Editops recover_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
const LevenshteinBitMatrix& matrix, StringAffix affix) {
3162 int64_t len1 = std::distance(first1, last1);
3163 int64_t len2 = std::distance(first2, last2);
3164 int64_t dist = matrix.dist;
3165 Editops editops(dist);
3166 editops.set_src_len(len1 + affix.prefix_len + affix.suffix_len);
3167 editops.set_dest_len(len2 + affix.prefix_len + affix.suffix_len);
3176 while (row && col) {
3177 uint64_t col_pos = col - 1;
3178 uint64_t col_word = col_pos / 64;
3179 col_pos = col_pos % 64;
3180 uint64_t mask = 1ull << col_pos;
3183 if (matrix.VP[row - 1][col_word] & mask) {
3187 editops[dist].src_pos = row + affix.prefix_len;
3188 editops[dist].dest_pos = col + affix.prefix_len;
3193 if (row && matrix.VN[row - 1][col_word] & mask) {
3196 editops[dist].src_pos = row + affix.prefix_len;
3197 editops[dist].dest_pos = col + affix.prefix_len;
3204 if (first1[row] != first2[col]) {
3207 editops[dist].src_pos = row + affix.prefix_len;
3208 editops[dist].dest_pos = col + affix.prefix_len;
3218 editops[dist].src_pos = row + affix.prefix_len;
3219 editops[dist].dest_pos = col + affix.prefix_len;
3226 editops[dist].src_pos = row + affix.prefix_len;
3227 editops[dist].dest_pos = col + affix.prefix_len;
3233 template<
typename InputIt1,
typename InputIt2>
3234 LevenshteinBitMatrix levenshtein_matrix_hyrroe2003(
const common::PatternMatchVector& PM, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
3235 int64_t len1 = std::distance(first1, last1);
3236 int64_t len2 = std::distance(first2, last2);
3237 uint64_t VP = ~0x0ull;
3240 LevenshteinBitMatrix matrix(len2, 1);
3244 uint64_t mask = (uint64_t) 1 << (len1 - 1);
3247 for (int64_t i = 0; i < len2; ++i) {
3249 uint64_t PM_j = PM.get(first2[i]);
3251 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
3254 uint64_t HP = VN | ~(D0 | VP);
3255 uint64_t HN = D0 & VP;
3258 matrix.dist += bool(HP & mask);
3259 matrix.dist -= bool(HN & mask);
3265 VP = matrix.VP[i][0] = HN | ~(D0 | HP);
3266 VN = matrix.VN[i][0] = HP & D0;
3272 template<
typename InputIt1,
typename InputIt2>
3273 LevenshteinBitMatrix levenshtein_matrix_hyrroe2003_block(
const common::BlockPatternMatchVector& PM, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
3274 int64_t len1 = std::distance(first1, last1);
3275 int64_t len2 = std::distance(first2, last2);
3282 Vectors() : VP(~0x0ull), VN(0) {}
3285 int64_t words = PM.m_val.size();
3286 LevenshteinBitMatrix matrix(len2, words);
3289 std::vector<Vectors> vecs(words);
3290 uint64_t Last = (uint64_t) 1 << ((len1 - 1) % 64);
3293 for (int64_t i = 0; i < len2; i++) {
3294 uint64_t HP_carry = 1;
3295 uint64_t HN_carry = 0;
3297 for (int64_t word = 0; word < words - 1; word++) {
3299 uint64_t PM_j = PM.get(word, first2[i]);
3300 uint64_t VN = vecs[word].VN;
3301 uint64_t VP = vecs[word].VP;
3303 uint64_t X = PM_j | HN_carry;
3304 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
3307 uint64_t HP = VN | ~(D0 | VP);
3308 uint64_t HN = D0 & VP;
3314 uint64_t HP_carry_temp = HP_carry;
3315 HP_carry = HP >> 63;
3316 HP = (HP << 1) | HP_carry_temp;
3318 uint64_t HN_carry_temp = HN_carry;
3319 HN_carry = HN >> 63;
3320 HN = (HN << 1) | HN_carry_temp;
3322 vecs[word].VP = matrix.VP[i][word] = HN | ~(D0 | HP);
3323 vecs[word].VN = matrix.VN[i][word] = HP & D0;
3328 uint64_t PM_j = PM.get(words - 1, first2[i]);
3329 uint64_t VN = vecs[words - 1].VN;
3330 uint64_t VP = vecs[words - 1].VP;
3332 uint64_t X = PM_j | HN_carry;
3333 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
3336 uint64_t HP = VN | ~(D0 | VP);
3337 uint64_t HN = D0 & VP;
3340 matrix.dist += bool(HP & Last);
3341 matrix.dist -= bool(HN & Last);
3344 HP = (HP << 1) | HP_carry;
3345 HN = (HN << 1) | HN_carry;
3347 vecs[words - 1].VP = matrix.VP[i][words - 1] = HN | ~(D0 | HP);
3348 vecs[words - 1].VN = matrix.VN[i][words - 1] = HP & D0;
3355 template<
typename InputIt1,
typename InputIt2>
3356 LevenshteinBitMatrix levenshtein_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
3357 int64_t len1 = std::distance(first1, last1);
3358 int64_t len2 = std::distance(first2, last2);
3360 if (!len1 || !len2) {
3361 LevenshteinBitMatrix matrix(0, 0);
3362 matrix.dist = len1 + len2;
3364 }
else if (len1 <= 64) {
3365 return levenshtein_matrix_hyrroe2003(common::PatternMatchVector(first2, last2), first2, last2, first1, last1);
3367 return levenshtein_matrix_hyrroe2003_block(common::BlockPatternMatchVector(first2, last2), first2, last2, first1, last1);
3373 template<
typename InputIt1,
typename InputIt2>
3384 int64_t new_max = detail::ceil_div(max, weights.
insert_cost);
3386 detail::uniform_levenshtein_distance(first1, last1, first2, last2, new_max);
3388 return (distance <= max) ? distance : max + 1;
3396 int64_t new_max = detail::ceil_div(max, weights.
insert_cost);
3397 int64_t distance = indel_distance(first1, last1, first2, last2, new_max);
3399 return (distance <= max) ? distance : max + 1;
3403 return detail::generalized_levenshtein_wagner_fischer(first1, last1, first2, last2, weights, max);
3406 template<
typename Sentence1,
typename Sentence2>
3411 template<
typename InputIt1,
typename InputIt2>
3413 int64_t maximum = detail::levenshtein_maximum(first1, last1, first2, last2, weights);
3414 int64_t cutoff_distance =
static_cast<int64_t
>(std::ceil(maximum * score_cutoff));
3416 double norm_dist = (maximum) ? (
double) dist / (double) maximum : 0.0;
3417 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
3420 template<
typename Sentence1,
typename Sentence2>
3425 template<
typename InputIt1,
typename InputIt2>
3427 int64_t maximum = detail::levenshtein_maximum(first1, last1, first2, last2, weights);
3428 int64_t cutoff_distance = maximum - score_cutoff;
3430 int64_t sim = maximum - dist;
3431 return (sim >= score_cutoff) ? sim : 0;
3434 template<
typename Sentence1,
typename Sentence2>
3439 template<
typename InputIt1,
typename InputIt2>
3443 double norm_sim = 1.0 - norm_dist;
3444 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
3447 template<
typename Sentence1,
typename Sentence2>
3452 template<
typename InputIt1,
typename InputIt2>
3457 return detail::recover_alignment(first1, last1, first2, last2, detail::levenshtein_matrix(first1, last1, first2, last2), affix);
3460 template<
typename Sentence1,
typename Sentence2>
3465 template<
typename CharT1>
3466 template<
typename InputIt2>
3471 if (weights.insert_cost == weights.delete_cost) {
3473 if (weights.insert_cost == 0) {
3478 if (weights.insert_cost == weights.replace_cost) {
3480 int64_t new_max = detail::ceil_div(score_cutoff, weights.insert_cost);
3482 detail::uniform_levenshtein_distance(PM, first1, last1, first2, last2, new_max);
3483 dist *= weights.insert_cost;
3485 return (dist <= score_cutoff) ? dist : score_cutoff + 1;
3491 else if (weights.replace_cost >= weights.insert_cost + weights.delete_cost) {
3493 int64_t new_max = detail::ceil_div(score_cutoff, weights.insert_cost);
3494 int64_t dist = detail::indel_distance(PM, first1, last1, first2, last2, new_max);
3495 dist *= weights.insert_cost;
3496 return (dist <= score_cutoff) ? dist : score_cutoff + 1;
3500 return detail::generalized_levenshtein_distance(first1, last1, first2, last2, weights, score_cutoff);
3503 template<
typename CharT1>
3504 template<
typename Sentence2>
3509 template<
typename CharT1>
3510 template<
typename InputIt2>
3514 int64_t maximum = detail::levenshtein_maximum(first1, last1, first2, last2, weights);
3515 int64_t cutoff_distance =
static_cast<int64_t
>(std::ceil(maximum * score_cutoff));
3516 int64_t dist = distance(first2, last2, cutoff_distance);
3517 double norm_dist = (maximum) ? (
double) dist / (double) maximum : 0.0;
3518 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
3521 template<
typename CharT1>
3522 template<
typename Sentence2>
3527 template<
typename CharT1>
3528 template<
typename InputIt2>
3532 int64_t maximum = detail::levenshtein_maximum(first1, last1, first2, last2, weights);
3533 int64_t cutoff_distance = maximum - score_cutoff;
3534 int64_t dist = distance(first2, last2, cutoff_distance);
3535 int64_t sim = maximum - dist;
3536 return (sim >= score_cutoff) ? sim : 0;
3539 template<
typename CharT1>
3540 template<
typename Sentence2>
3545 template<
typename CharT1>
3546 template<
typename InputIt2>
3548 double norm_dist = normalized_distance(first2, last2, 1.0 - score_cutoff);
3549 double norm_sim = 1.0 - norm_dist;
3550 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
3553 template<
typename CharT1>
3554 template<
typename Sentence2>
3561#include <type_traits>
3596 template<
typename InputIt1,
typename InputIt2>
3597 double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0);
3599 template<
typename Sentence1,
typename Sentence2>
3600 double ratio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff = 0);
3603 template<
typename CharT1>
3605 template<
typename InputIt1>
3606 CachedRatio(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(first1, last1) {}
3608 template<
typename Sentence1>
3611 template<
typename InputIt2>
3612 double similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff = 0.0)
const;
3614 template<
typename Sentence2>
3615 double similarity(
const Sentence2& s2,
double score_cutoff = 0)
const;
3618 std::basic_string<CharT1> s1;
3622#if __cplusplus >= 201703L
3623 template<
typename Sentence1>
3624 CachedRatio(
const Sentence1& s1) -> CachedRatio<char_type<Sentence1>>;
3626 template<
typename InputIt1>
3627 CachedRatio(InputIt1 first1, InputIt1 last1) -> CachedRatio<iterator_type<InputIt1>>;
3655 template<
typename InputIt1,
typename InputIt2>
3656 double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0);
3658 template<
typename Sentence1,
typename Sentence2>
3659 double partial_ratio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff = 0);
3662 template<
typename CharT1>
3667 template<
typename InputIt1>
3670 template<
typename Sentence1>
3674 template<
typename InputIt2>
3675 double similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff = 0.0)
const;
3677 template<
typename Sentence2>
3678 double similarity(
const Sentence2& s2,
double score_cutoff = 0.0)
const;
3681 std::basic_string<CharT1> s1;
3686#if __cplusplus >= 201703L
3687 template<
typename Sentence1>
3688 CachedPartialRatio(
const Sentence1& s1) -> CachedPartialRatio<char_type<Sentence1>>;
3690 template<
typename InputIt1>
3691 CachedPartialRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialRatio<iterator_type<InputIt1>>;
3720 template<
typename InputIt1,
typename InputIt2>
3721 double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0);
3723 template<
typename Sentence1,
typename Sentence2,
typename CharT1 =
char_type<Sentence1>,
typename CharT2 =
char_type<Sentence2>>
3724 double token_sort_ratio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff = 0);
3728 template<
typename CharT1>
3730 template<
typename InputIt1>
3732 : s1_sorted(common::sorted_split(first1, last1).join()), cached_ratio(s1_sorted) {}
3734 template<
typename Sentence1>
3738 template<
typename InputIt2>
3739 double similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff = 0)
const;
3741 template<
typename Sentence2>
3742 double similarity(
const Sentence2& s2,
double score_cutoff = 0)
const;
3745 std::basic_string<CharT1> s1_sorted;
3749#if __cplusplus >= 201703L
3750 template<
typename Sentence1>
3751 CachedTokenSortRatio(
const Sentence1& s1) -> CachedTokenSortRatio<char_type<Sentence1>>;
3753 template<
typename InputIt1>
3754 CachedTokenSortRatio(InputIt1 first1, InputIt1 last1)
3755 -> CachedTokenSortRatio<iterator_type<InputIt1>>;
3778 template<
typename InputIt1,
typename InputIt2>
3779 double partial_token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0);
3781 template<
typename Sentence1,
typename Sentence2>
3785 template<
typename CharT1>
3787 template<
typename InputIt1>
3789 : s1_sorted(common::sorted_split(first1, last1).join()), cached_partial_ratio(s1_sorted) {}
3791 template<
typename Sentence1>
3795 template<
typename InputIt2>
3796 double similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff = 0)
const;
3798 template<
typename Sentence2>
3799 double similarity(
const Sentence2& s2,
double score_cutoff = 0)
const;
3802 std::basic_string<CharT1> s1_sorted;
3806#if __cplusplus >= 201703L
3807 template<
typename Sentence1>
3808 CachedPartialTokenSortRatio(
const Sentence1& s1)
3809 -> CachedPartialTokenSortRatio<char_type<Sentence1>>;
3811 template<
typename InputIt1>
3812 CachedPartialTokenSortRatio(InputIt1 first1, InputIt1 last1)
3813 -> CachedPartialTokenSortRatio<iterator_type<InputIt1>>;
3844 template<
typename InputIt1,
typename InputIt2>
3845 double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0);
3847 template<
typename Sentence1,
typename Sentence2>
3848 double token_set_ratio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff = 0);
3851 template<
typename CharT1>
3853 template<
typename InputIt1>
3855 : s1(first1, last1), tokens_s1(common::sorted_split(std::begin(s1), std::end(s1))) {}
3857 template<
typename Sentence1>
3861 template<
typename InputIt2>
3862 double similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff = 0)
const;
3864 template<
typename Sentence2>
3865 double similarity(
const Sentence2& s2,
double score_cutoff = 0)
const;
3868 std::basic_string<CharT1> s1;
3872#if __cplusplus >= 201703L
3873 template<
typename Sentence1>
3874 CachedTokenSetRatio(
const Sentence1& s1) -> CachedTokenSetRatio<char_type<Sentence1>>;
3876 template<
typename InputIt1>
3877 CachedTokenSetRatio(InputIt1 first1, InputIt1 last1)
3878 -> CachedTokenSetRatio<iterator_type<InputIt1>>;
3900 template<
typename InputIt1,
typename InputIt2>
3901 double partial_token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0);
3903 template<
typename Sentence1,
typename Sentence2>
3907 template<
typename CharT1>
3909 template<
typename InputIt1>
3911 : s1(first1, last1), tokens_s1(common::sorted_split(std::begin(s1), std::end(s1))) {}
3913 template<
typename Sentence1>
3917 template<
typename InputIt2>
3918 double similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff = 0)
const;
3920 template<
typename Sentence2>
3921 double similarity(
const Sentence2& s2,
double score_cutoff = 0)
const;
3924 std::basic_string<CharT1> s1;
3928#if __cplusplus >= 201703L
3929 template<
typename Sentence1>
3930 CachedPartialTokenSetRatio(
const Sentence1& s1) -> CachedPartialTokenSetRatio<char_type<Sentence1>>;
3932 template<
typename InputIt1>
3933 CachedPartialTokenSetRatio(InputIt1 first1, InputIt1 last1)
3934 -> CachedPartialTokenSetRatio<iterator_type<InputIt1>>;
3956 template<
typename InputIt1,
typename InputIt2>
3957 double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0);
3959 template<
typename Sentence1,
typename Sentence2>
3960 double token_ratio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff = 0);
3963 template<
typename CharT1>
3965 template<
typename InputIt1>
3967 : s1(first1, last1),
3968 s1_tokens(common::sorted_split(std::begin(s1), std::end(s1))),
3969 s1_sorted(s1_tokens.join()),
3970 cached_ratio_s1_sorted(s1_sorted) {}
3972 template<
typename Sentence1>
3976 template<
typename InputIt2>
3977 double similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff = 0)
const;
3979 template<
typename Sentence2>
3980 double similarity(
const Sentence2& s2,
double score_cutoff = 0)
const;
3983 std::basic_string<CharT1> s1;
3985 std::basic_string<CharT1> s1_sorted;
3989#if __cplusplus >= 201703L
3990 template<
typename Sentence1>
3991 CachedTokenRatio(
const Sentence1& s1) -> CachedTokenRatio<char_type<Sentence1>>;
3993 template<
typename InputIt1>
3994 CachedTokenRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenRatio<iterator_type<InputIt1>>;
4017 template<
typename InputIt1,
typename InputIt2>
4018 double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0);
4020 template<
typename Sentence1,
typename Sentence2>
4021 double partial_token_ratio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff = 0);
4024 template<
typename CharT1>
4026 template<
typename InputIt1>
4028 : s1(first1, last1),
4029 tokens_s1(common::sorted_split(std::begin(s1), std::end(s1))),
4030 s1_sorted(tokens_s1.join()) {}
4032 template<
typename Sentence1>
4036 template<
typename InputIt2>
4037 double similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff = 0)
const;
4039 template<
typename Sentence2>
4040 double similarity(
const Sentence2& s2,
double score_cutoff = 0)
const;
4043 std::basic_string<CharT1> s1;
4045 std::basic_string<CharT1> s1_sorted;
4048#if __cplusplus >= 201703L
4049 template<
typename Sentence1>
4050 CachedPartialTokenRatio(
const Sentence1& s1) -> CachedPartialTokenRatio<char_type<Sentence1>>;
4052 template<
typename InputIt1>
4053 CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1)
4054 -> CachedPartialTokenRatio<iterator_type<InputIt1>>;
4078 template<
typename InputIt1,
typename InputIt2>
4079 double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0);
4081 template<
typename Sentence1,
typename Sentence2>
4082 double WRatio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff = 0);
4085 template<
typename CharT1>
4087 template<
typename InputIt1>
4090 template<
typename Sentence1>
4093 template<
typename InputIt2>
4094 double similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff = 0)
const;
4096 template<
typename Sentence2>
4097 double similarity(
const Sentence2& s2,
double score_cutoff = 0)
const;
4102 std::basic_string<CharT1> s1;
4105 std::basic_string<CharT1> s1_sorted;
4109#if __cplusplus >= 201703L
4110 template<
typename Sentence1>
4111 CachedWRatio(
const Sentence1& s1) -> CachedWRatio<char_type<Sentence1>>;
4113 template<
typename InputIt1>
4114 CachedWRatio(InputIt1 first1, InputIt1 last1) -> CachedWRatio<iterator_type<InputIt1>>;
4138 template<
typename InputIt1,
typename InputIt2>
4139 double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0);
4141 template<
typename Sentence1,
typename Sentence2>
4142 double QRatio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff = 0);
4144 template<
typename CharT1>
4146 template<
typename InputIt1>
4147 CachedQRatio(InputIt1 first1, InputIt1 last1) : s1(first1, last1), cached_ratio(first1, last1) {}
4149 template<
typename Sentence1>
4152 template<
typename InputIt2>
4153 double similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff = 0)
const;
4155 template<
typename Sentence2>
4156 double similarity(
const Sentence2& s2,
double score_cutoff = 0)
const;
4159 std::basic_string<CharT1> s1;
4163#if __cplusplus >= 201703L
4164 template<
typename Sentence1>
4165 CachedQRatio(
const Sentence1& s1) -> CachedQRatio<char_type<Sentence1>>;
4167 template<
typename InputIt1>
4168 CachedQRatio(InputIt1 first1, InputIt1 last1) -> CachedQRatio<iterator_type<InputIt1>>;
4203#include <unordered_map>
4208 struct MatchingBlock {
4212 MatchingBlock(int64_t aSPos, int64_t aDPos, int64_t aLength)
4213 : spos(aSPos), dpos(aDPos), length(aLength) {}
4218 template<
typename InputIt1,
typename InputIt2>
4221 using match_t = std::tuple<int64_t, int64_t, int64_t>;
4226 j2len_.resize(b_len + 1);
4227 for (int64_t i = 0; i < b_len; ++i) {
4233 int64_t best_i = a_low;
4234 int64_t best_j = b_low;
4235 int64_t best_size = 0;
4239 for (int64_t i = a_low; i < a_high; ++i) {
4240 const auto& indexes = b2j_[
a_first[i]];
4242 int64_t next_val = 0;
4244 for (; pos < indexes.size(); pos++) {
4245 int64_t j = indexes[pos];
4246 if (j < b_low)
continue;
4248 next_val = j2len_[j];
4252 for (; pos < indexes.size(); pos++) {
4253 int64_t j = indexes[pos];
4254 if (j >= b_high)
break;
4257 int64_t k = next_val + 1;
4261 if (pos + 1 < indexes.size()) {
4262 next_val = j2len_[indexes[pos + 1]];
4266 if (k > best_size) {
4274 std::fill(j2len_.begin() + b_low, j2len_.begin() + b_high, 0);
4278 std::fill(j2len_.begin() + b_low, j2len_.begin() + b_high, 0);
4281 while (best_i > a_low && best_j > b_low &&
a_first[best_i - 1] ==
b_first[best_j - 1]) {
4287 while ((best_i + best_size) < a_high && (best_j + best_size) < b_high &&
a_first[best_i + best_size] ==
b_first[best_j + best_size]) {
4291 return std::make_tuple(best_i, best_j, best_size);
4298 std::vector<std::tuple<int64_t, int64_t, int64_t, int64_t>> queue;
4299 std::vector<match_t> matching_blocks_pass1;
4301 size_t queue_head = 0;
4302 queue.reserve(std::min(a_len, b_len));
4303 queue.emplace_back(0, a_len, 0, b_len);
4305 while (queue_head < queue.size()) {
4306 int64_t a_low, a_high, b_low, b_high;
4307 std::tie(a_low, a_high, b_low, b_high) = queue[queue_head++];
4308 int64_t spos, dpos, length;
4311 if (a_low < spos && b_low < dpos) {
4312 queue.emplace_back(a_low, spos, b_low, dpos);
4314 if ((spos + length) < a_high && (dpos + length) < b_high) {
4315 queue.emplace_back(spos + length, a_high, dpos + length, b_high);
4317 matching_blocks_pass1.emplace_back(spos, dpos, length);
4322 std::vector<MatchingBlock> matching_blocks;
4324 matching_blocks.reserve(matching_blocks_pass1.size());
4329 for (
match_t const& m : matching_blocks_pass1) {
4330 if (i1 + k1 == std::get<0>(m) && j1 + k1 == std::get<1>(m)) {
4331 k1 += std::get<2>(m);
4334 matching_blocks.emplace_back(i1, j1, k1);
4336 std::tie(i1, j1, k1) = m;
4340 matching_blocks.emplace_back(i1, j1, k1);
4342 matching_blocks.emplace_back(a_len, b_len, 0);
4344 return matching_blocks;
4355 std::vector<int64_t> j2len_;
4360 template<
typename InputIt1,
typename InputIt2>
4361 std::vector<MatchingBlock> get_matching_blocks(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
4372#include <unordered_map>
4382 template<
typename InputIt1,
typename InputIt2>
4383 double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
4384 return indel_normalized_similarity(first1, last1, first2, last2, score_cutoff / 100) * 100;
4387 template<
typename Sentence1,
typename Sentence2>
4388 double ratio(
const Sentence1& s1,
const Sentence2& s2,
const double score_cutoff) {
4392 template<
typename CharT1>
4393 template<
typename InputIt2>
4395 double norm_sim = rapidfuzz::detail::indel_normalized_similarity(
4398 return norm_sim * 100;
4401 template<
typename CharT1>
4402 template<
typename Sentence2>
4413 template<
typename InputIt1,
typename InputIt2,
typename CachedCharT1>
4416 double max_ratio = 0;
4417 int64_t len1 = std::distance(first1, last1);
4418 int64_t len2 = std::distance(first2, last2);
4419 assert(len2 >= len1);
4421 for (int64_t i = 1; i < len1; ++i) {
4422 auto substr_last = first2 + i;
4424 if (!s1_char_map[*(substr_last - 1)]) {
4428 double ls_ratio = cached_ratio.
similarity(first2, substr_last, score_cutoff);
4429 if (ls_ratio > max_ratio) {
4430 score_cutoff = max_ratio = ls_ratio;
4431 if (ls_ratio == 100.0) {
4437 for (int64_t i = 0; i < len2 - len1; ++i) {
4438 auto substr_first = first2 + i;
4439 auto substr_last = substr_first + len1;
4441 if (!s1_char_map[*(substr_last - 1)]) {
4445 double ls_ratio = cached_ratio.
similarity(substr_first, substr_last, score_cutoff);
4446 if (ls_ratio > max_ratio) {
4447 score_cutoff = max_ratio = ls_ratio;
4448 if (ls_ratio == 100.0) {
4454 for (int64_t i = len2 - len1; i < len2; ++i) {
4455 auto substr_first = first2 + i;
4457 if (!s1_char_map[*substr_first]) {
4461 double ls_ratio = cached_ratio.
similarity(substr_first, last2, score_cutoff);
4462 if (ls_ratio > max_ratio) {
4463 score_cutoff = max_ratio = ls_ratio;
4464 if (ls_ratio == 100.0) {
4473 template<
typename InputIt1,
typename InputIt2,
typename CharT1 = iterator_type<InputIt1>>
4478 int64_t len1 = std::distance(first1, last1);
4479 for (int64_t i = 0; i < len1; ++i) {
4480 s1_char_map.
create(first1[i]) =
true;
4486 template<
typename InputIt1,
typename InputIt2,
typename CachedCharT1>
4488 int64_t len1 = std::distance(first1, last1);
4489 int64_t len2 = std::distance(first2, last2);
4490 assert(len2 >= len1);
4492 double max_ratio = 0;
4493 if (score_cutoff > 100) {
4497 if (!len1 || !len2) {
4498 return static_cast<double>(len1 == len2) * 100.0;
4501 auto blocks = rapidfuzz::detail::get_matching_blocks(first1, last1, first2, last2);
4504 for (
const auto& block : blocks) {
4505 if (block.length == len1) {
4510 for (
const auto& block : blocks) {
4511 int64_t long_start = (block.dpos > block.spos) ? block.dpos - block.spos : 0;
4512 auto substr_first = first2 + long_start;
4514 (std::distance(substr_first, last2) < len1) ? last2 : substr_first + len1;
4516 double ls_ratio = cached_ratio.
similarity(substr_first, substr_last, score_cutoff);
4517 if (ls_ratio > max_ratio) {
4518 score_cutoff = max_ratio = ls_ratio;
4525 template<
typename InputIt1,
typename InputIt2,
typename CharT1 = iterator_type<InputIt1>>
4534 template<
typename InputIt1,
typename InputIt2>
4535 double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
4536 int64_t len1 = std::distance(first1, last1);
4537 int64_t len2 = std::distance(first2, last2);
4539 if (score_cutoff > 100) {
4543 if (!len1 || !len2) {
4544 return static_cast<double>(len1 == len2) * 100.0;
4548 return partial_ratio(first2, last2, first1, last1, score_cutoff);
4558 template<
typename Sentence1,
typename Sentence2>
4559 double partial_ratio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff) {
4563 template<
typename CharT1>
4564 template<
typename InputIt1>
4566 : s1(first1, last1), cached_ratio(first1, last1) {
4567 for (
const auto& ch : s1) {
4568 s1_char_map.
create(ch) =
true;
4572 template<
typename CharT1>
4573 template<
typename InputIt2>
4575 int64_t len1 = (int64_t) s1.size();
4576 int64_t len2 = std::distance(first2, last2);
4582 if (!len1 || !len2) {
4583 return static_cast<double>(len1 == len2) * 100.0;
4593 template<
typename CharT1>
4594 template<
typename Sentence2>
4602 template<
typename InputIt1,
typename InputIt2>
4603 double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
4604 if (score_cutoff > 100)
return 0;
4609 template<
typename Sentence1,
typename Sentence2,
typename CharT1,
typename CharT2>
4614 template<
typename CharT1>
4615 template<
typename InputIt2>
4617 if (score_cutoff > 100)
return 0;
4622 template<
typename CharT1>
4623 template<
typename Sentence2>
4632 template<
typename InputIt1,
typename InputIt2>
4634 if (score_cutoff > 100)
return 0;
4639 template<
typename Sentence1,
typename Sentence2>
4644 template<
typename CharT1>
4645 template<
typename InputIt2>
4647 if (score_cutoff > 100)
return 0;
4649 return cached_partial_ratio.similarity(
common::sorted_split(first2, last2).join(), score_cutoff);
4652 template<
typename CharT1>
4653 template<
typename Sentence2>
4663 template<
typename InputIt1,
typename InputIt2>
4672 auto intersect = decomposition.intersection;
4673 auto diff_ab = decomposition.difference_ab;
4674 auto diff_ba = decomposition.difference_ba;
4677 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
4681 auto diff_ab_joined = diff_ab.join();
4682 auto diff_ba_joined = diff_ba.join();
4684 int64_t ab_len = diff_ab_joined.length();
4685 int64_t ba_len = diff_ba_joined.length();
4686 int64_t sect_len = intersect.length();
4689 int64_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
4690 int64_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
4693 auto cutoff_distance = common::score_cutoff_to_distance<100>(score_cutoff, ab_len + ba_len);
4694 int64_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
4696 if (dist <= cutoff_distance) {
4697 result = common::norm_distance<100>(dist, sect_ab_len + sect_ba_len, score_cutoff);
4708 int64_t sect_ab_dist = bool(sect_len) + ab_len;
4709 double sect_ab_ratio =
4710 common::norm_distance<100>(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
4712 int64_t sect_ba_dist = bool(sect_len) + ba_len;
4713 double sect_ba_ratio =
4714 common::norm_distance<100>(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
4716 return std::max({result, sect_ab_ratio, sect_ba_ratio});
4720 template<
typename InputIt1,
typename InputIt2>
4721 double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
4722 if (score_cutoff > 100)
return 0;
4727 template<
typename Sentence1,
typename Sentence2>
4732 template<
typename CharT1>
4733 template<
typename InputIt2>
4735 if (score_cutoff > 100)
return 0;
4740 template<
typename CharT1>
4741 template<
typename Sentence2>
4751 template<
typename InputIt1,
typename InputIt2>
4762 if (!decomposition.intersection.empty())
return 100;
4764 return partial_ratio(decomposition.difference_ab.join(), decomposition.difference_ba.join(), score_cutoff);
4768 template<
typename InputIt1,
typename InputIt2>
4770 if (score_cutoff > 100)
return 0;
4775 template<
typename Sentence1,
typename Sentence2>
4780 template<
typename CharT1>
4781 template<
typename InputIt2>
4783 if (score_cutoff > 100)
return 0;
4788 template<
typename CharT1>
4789 template<
typename Sentence2>
4798 template<
typename InputIt1,
typename InputIt2>
4799 double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
4800 if (score_cutoff > 100)
return 0;
4806 auto intersect = decomposition.intersection;
4807 auto diff_ab = decomposition.difference_ab;
4808 auto diff_ba = decomposition.difference_ba;
4810 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
4814 auto diff_ab_joined = diff_ab.join();
4815 auto diff_ba_joined = diff_ba.join();
4817 int64_t ab_len = diff_ab_joined.length();
4818 int64_t ba_len = diff_ba_joined.length();
4819 int64_t sect_len = intersect.length();
4821 double result =
ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
4824 int64_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
4825 int64_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
4827 auto cutoff_distance = common::score_cutoff_to_distance<100>(score_cutoff, ab_len + ba_len);
4828 int64_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
4829 if (dist <= cutoff_distance) {
4831 result, common::norm_distance<100>(dist, sect_ab_len + sect_ba_len, score_cutoff)
4843 int64_t sect_ab_dist = bool(sect_len) + ab_len;
4844 double sect_ab_ratio =
4845 common::norm_distance<100>(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
4847 int64_t sect_ba_dist = bool(sect_len) + ba_len;
4848 double sect_ba_ratio =
4849 common::norm_distance<100>(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
4851 return std::max({result, sect_ab_ratio, sect_ba_ratio});
4854 template<
typename Sentence1,
typename Sentence2>
4855 double token_ratio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff) {
4860 template<
typename CharT1,
typename CachedCharT1,
typename InputIt2>
4862 if (score_cutoff > 100)
return 0;
4867 auto intersect = decomposition.intersection;
4868 auto diff_ab = decomposition.difference_ab;
4869 auto diff_ba = decomposition.difference_ba;
4871 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
4875 auto diff_ab_joined = diff_ab.join();
4876 auto diff_ba_joined = diff_ba.join();
4878 int64_t ab_len = diff_ab_joined.length();
4879 int64_t ba_len = diff_ba_joined.length();
4880 int64_t sect_len = intersect.length();
4882 double result = cached_ratio_s1_sorted.
similarity(s2_tokens.join(), score_cutoff);
4885 int64_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
4886 int64_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
4888 auto cutoff_distance = common::score_cutoff_to_distance<100>(score_cutoff, ab_len + ba_len);
4889 int64_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
4890 if (dist <= cutoff_distance) {
4892 result, common::norm_distance<100>(dist, sect_ab_len + sect_ba_len, score_cutoff)
4904 int64_t sect_ab_dist = bool(sect_len) + ab_len;
4905 double sect_ab_ratio =
4906 common::norm_distance<100>(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
4908 int64_t sect_ba_dist = bool(sect_len) + ba_len;
4909 double sect_ba_ratio =
4910 common::norm_distance<100>(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
4912 return std::max({result, sect_ab_ratio, sect_ba_ratio});
4916 template<
typename CharT1,
typename InputIt1,
typename InputIt2>
4918 if (score_cutoff > 100)
return 0;
4923 auto intersect = decomposition.intersection;
4924 auto diff_ab = decomposition.difference_ab;
4925 auto diff_ba = decomposition.difference_ba;
4927 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
4931 auto diff_ab_joined = diff_ab.join();
4932 auto diff_ba_joined = diff_ba.join();
4934 int64_t ab_len = diff_ab_joined.length();
4935 int64_t ba_len = diff_ba_joined.length();
4936 int64_t sect_len = intersect.length();
4939 auto s2_sorted = tokens_b.join();
4940 if (s1_sorted.size() < 65) {
4941 double norm_sim = rapidfuzz::detail::indel_normalized_similarity(
4945 result = norm_sim * 100;
4947 result =
fuzz::ratio(s1_sorted, s2_sorted, score_cutoff);
4951 int64_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
4952 int64_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
4954 auto cutoff_distance = common::score_cutoff_to_distance<100>(score_cutoff, ab_len + ba_len);
4955 int64_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
4956 if (dist <= cutoff_distance) {
4958 result, common::norm_distance<100>(dist, sect_ab_len + sect_ba_len, score_cutoff)
4970 int64_t sect_ab_dist = bool(sect_len) + ab_len;
4971 double sect_ab_ratio =
4972 common::norm_distance<100>(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
4974 int64_t sect_ba_dist = bool(sect_len) + ba_len;
4975 double sect_ba_ratio =
4976 common::norm_distance<100>(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
4978 return std::max({result, sect_ab_ratio, sect_ba_ratio});
4982 template<
typename CharT1>
4983 template<
typename InputIt2>
4985 return detail::token_ratio(s1_tokens, cached_ratio_s1_sorted, first2, last2, score_cutoff);
4988 template<
typename CharT1>
4989 template<
typename Sentence2>
4998 template<
typename InputIt1,
typename InputIt2>
4999 double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
5000 if (score_cutoff > 100)
return 0;
5008 if (!decomposition.intersection.empty())
return 100;
5010 auto diff_ab = decomposition.difference_ab;
5011 auto diff_ba = decomposition.difference_ba;
5013 double result =
partial_ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
5016 if (tokens_a.word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
5020 score_cutoff = std::max(score_cutoff, result);
5021 return std::max(result,
partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
5024 template<
typename Sentence1,
typename Sentence2>
5030 template<
typename CharT1,
typename InputIt1,
typename InputIt2>
5032 if (score_cutoff > 100)
return 0;
5039 if (!decomposition.intersection.empty())
return 100;
5041 auto diff_ab = decomposition.difference_ab;
5042 auto diff_ba = decomposition.difference_ba;
5044 double result =
partial_ratio(s1_sorted, tokens_b.join(), score_cutoff);
5047 if (tokens_s1.
word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
5051 score_cutoff = std::max(score_cutoff, result);
5052 return std::max(result,
partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
5057 template<
typename CharT1>
5058 template<
typename InputIt2>
5063 template<
typename CharT1>
5064 template<
typename Sentence2>
5073 template<
typename InputIt1,
typename InputIt2>
5074 double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
5075 if (score_cutoff > 100)
return 0;
5077 constexpr double UNBASE_SCALE = 0.95;
5079 int64_t len1 = std::distance(first1, last1);
5080 int64_t len2 = std::distance(first2, last2);
5084 if (!len1 || !len2) {
5088 double len_ratio = (len1 > len2) ?
static_cast<double>(len1) /
static_cast<double>(len2)
5089 :
static_cast<double>(len2) /
static_cast<double>(len1);
5091 double end_ratio =
ratio(first1, last1, first2, last2, score_cutoff);
5093 if (len_ratio < 1.5) {
5094 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
5095 return std::max(end_ratio,
token_ratio(first1, last1, first2, last2, score_cutoff) * UNBASE_SCALE);
5098 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
5100 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
5101 end_ratio = std::max(end_ratio,
partial_ratio(first1, last1, first2, last2, score_cutoff) * PARTIAL_SCALE);
5103 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
5104 return std::max(end_ratio,
partial_token_ratio(first1, last1, first2, last2, score_cutoff) * UNBASE_SCALE * PARTIAL_SCALE);
5107 template<
typename Sentence1,
typename Sentence2>
5108 double WRatio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff) {
5112 template<
typename Sentence1>
5113 template<
typename InputIt1>
5115 : s1(first1, last1),
5116 cached_partial_ratio(first1, last1),
5117 tokens_s1(common::sorted_split(std::begin(s1), std::end(s1))),
5118 s1_sorted(tokens_s1.join()) {
5119 blockmap_s1_sorted.
insert(std::begin(s1_sorted), std::end(s1_sorted));
5122 template<
typename CharT1>
5123 template<
typename InputIt2>
5125 if (score_cutoff > 100)
return 0;
5127 constexpr double UNBASE_SCALE = 0.95;
5129 int64_t len1 = s1.size();
5130 int64_t len2 = std::distance(first2, last2);
5134 if (!len1 || !len2) {
5138 double len_ratio = (len1 > len2) ?
static_cast<double>(len1) /
static_cast<double>(len2)
5139 :
static_cast<double>(len2) /
static_cast<double>(len1);
5141 double end_ratio = cached_partial_ratio.cached_ratio.similarity(first2, last2, score_cutoff);
5143 if (len_ratio < 1.5) {
5144 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
5146 auto r =
detail::token_ratio(s1_sorted, tokens_s1, blockmap_s1_sorted, first2, last2, score_cutoff);
5147 return std::max(end_ratio, r * UNBASE_SCALE);
5150 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
5152 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
5153 end_ratio = std::max(end_ratio, cached_partial_ratio.similarity(first2, last2, score_cutoff) * PARTIAL_SCALE);
5155 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
5157 return std::max(end_ratio, r * UNBASE_SCALE * PARTIAL_SCALE);
5160 template<
typename CharT1>
5161 template<
typename Sentence2>
5170 template<
typename InputIt1,
typename InputIt2>
5171 double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff) {
5172 int64_t len1 = std::distance(first1, last1);
5173 int64_t len2 = std::distance(first2, last2);
5177 if (!len1 || !len2) {
5181 return ratio(first1, last1, first2, last2, score_cutoff);
5184 template<
typename Sentence1,
typename Sentence2>
5185 double QRatio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff) {
5189 template<
typename CharT1>
5190 template<
typename InputIt2>
5192 int64_t len2 = std::distance(first2, last2);
5196 if (s1.empty() || !len2) {
5200 return cached_ratio.similarity(first2, last2, score_cutoff);
5203 template<
typename CharT1>
5204 template<
typename Sentence2>