ArcdpsExtension
 
Loading...
Searching...
No Matches
rapidfuzz_amalgamated.hpp
Go to the documentation of this file.
1// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
2// SPDX-License-Identifier: MIT
3// RapidFuzz v0.0.1
4// Generated: 2022-01-24 22:53:25.415558
5// ----------------------------------------------------------
6// This file is an amalgamation of multiple different files.
7// You probably shouldn't edit it directly.
8// ----------------------------------------------------------
9#ifndef RAPIDFUZZ_AMALGAMATED_HPP_INCLUDED
10#define RAPIDFUZZ_AMALGAMATED_HPP_INCLUDED
11
12
13#include <cmath>
14#include <numeric>
15
16#include <algorithm>
17#include <array>
18#include <cmath>
19#include <cstring>
20
21
22#include <algorithm>
23#include <cassert>
24#include <stddef.h>
25#include <stdexcept>
26#include <stdint.h>
27#include <type_traits>
28#include <vector>
29
30namespace rapidfuzz {
31
32 template<typename InputIt>
34 public:
35 IteratorView(InputIt first_, InputIt last_) : first(first_), last(last_) {}
36
37 InputIt first;
38 InputIt last;
39 };
40
41 template<typename InputIt1, typename InputIt2>
43 return std::equal(a.first, a.last, b.first, b.last);
44 }
45
46 template<typename InputIt1, typename InputIt2>
48 return !(a == b);
49 }
50
51 template<typename InputIt1, typename InputIt2>
53 return (std::lexicographical_compare(a.first, a.last, b.first, b.last));
54 }
55
56 template<typename InputIt1, typename InputIt2>
58 return b < a;
59 }
60
61 template<typename InputIt1, typename InputIt2>
63 return !(b < a);
64 }
65
66 template<typename InputIt1, typename InputIt2>
68 return !(a < b);
69 }
70
71 template<typename InputIt>
72 using IteratorViewVec = std::vector<IteratorView<InputIt>>;
73
74 struct StringAffix {
75 int64_t prefix_len;
76 int64_t suffix_len;
77 };
78
80 int64_t insert_cost;
81 int64_t delete_cost;
82 int64_t replace_cost;
83 };
84
88 enum class EditType {
89 None = 0,
90 Replace = 1,
91 Insert = 2,
92 Delete = 3
93 };
94
105 struct EditOp {
107 int64_t src_pos;
108 int64_t dest_pos;
111
112 EditOp(EditType type_, int64_t src_pos_, int64_t dest_pos_)
113 : type(type_), src_pos(src_pos_), dest_pos(dest_pos_) {}
114 };
115
116 inline bool operator==(EditOp a, EditOp b) {
117 return (a.type == b.type) && (a.src_pos == b.src_pos) && (a.dest_pos == b.dest_pos);
118 }
119
133 struct Opcode {
135 int64_t src_begin;
136 int64_t src_end;
137 int64_t dest_begin;
138 int64_t dest_end;
141
142 Opcode(EditType type_, int64_t src_begin_, int64_t src_end_, int64_t dest_begin_,
143 int64_t dest_end_)
144 : type(type_),
145 src_begin(src_begin_),
146 src_end(src_end_),
147 dest_begin(dest_begin_),
148 dest_end(dest_end_) {}
149 };
150
151 inline bool operator==(Opcode a, Opcode b) {
152 return (a.type == b.type) && (a.src_begin == b.src_begin) && (a.src_end == b.src_end) && (a.dest_begin == b.dest_begin) && (a.dest_end == b.dest_end);
153 }
154
155 namespace detail {
156 template<typename T>
157 void vector_slice(std::vector<T>& new_vec, const std::vector<T>& vec, int start, int stop, int step) {
158 if (step > 0) {
159 if (start < 0) {
160 start = std::max((int) (start + (int) vec.size()), 0);
161 } else if (start > (int) vec.size()) {
162 start = (int) vec.size();
163 }
164
165 if (stop < 0) {
166 stop = std::max((int) (stop + (int) vec.size()), 0);
167 } else if (stop > (int) vec.size()) {
168 stop = (int) vec.size();
169 }
170
171 if (start >= stop) {
172 return;
173 }
174
175 int count = (stop - 1 - start) / step + 1;
176 new_vec.reserve(count);
177
178 for (int i = start; i < stop; i += step) {
179 new_vec.push_back(vec[i]);
180 }
181 } else if (step < 0) {
182 if (start < 0) {
183 start = std::max((int) (start + (int) vec.size()), -1);
184 } else if (start >= (int) vec.size()) {
185 start = (int) vec.size() - 1;
186 }
187
188 if (stop < 0) {
189 stop = std::max((int) (stop + (int) vec.size()), -1);
190 } else if (stop >= (int) vec.size()) {
191 stop = (int) vec.size() - 1;
192 }
193
194 if (start <= stop) {
195 return;
196 }
197
198 int count = (stop + 1 - start) / step + 1;
199 new_vec.reserve(count);
200
201 for (int i = start; i > stop; i += step) {
202 new_vec.push_back(vec[i]);
203 }
204 } else {
205 throw std::invalid_argument("slice step cannot be zero");
206 }
207 }
208 } // namespace detail
209
210 class Opcodes;
211
212 class Editops : private std::vector<EditOp> {
213 public:
214 using std::vector<EditOp>::size_type;
215
216 Editops() : src_len(0), dest_len(0) {}
217
218 Editops(size_type count, const EditOp& value)
219 : std::vector<EditOp>(count, value), src_len(0), dest_len(0) {}
220
221 explicit Editops(size_type count) : std::vector<EditOp>(count), src_len(0), dest_len(0) {}
222
223 Editops(const Editops& other)
224 : std::vector<EditOp>(other), src_len(other.src_len), dest_len(other.dest_len) {}
225
226 Editops(const Opcodes& other);
227
228 Editops(Editops&& other) noexcept {
229 swap(other);
230 }
231
233 swap(other);
234 return *this;
235 }
236
237 /* Element access */
238 using std::vector<EditOp>::at;
239 using std::vector<EditOp>::operator[];
240 using std::vector<EditOp>::front;
241 using std::vector<EditOp>::back;
242 using std::vector<EditOp>::data;
243
244 /* Iterators */
245 using std::vector<EditOp>::begin;
246 using std::vector<EditOp>::cbegin;
247 using std::vector<EditOp>::end;
248 using std::vector<EditOp>::cend;
249 using std::vector<EditOp>::rbegin;
250 using std::vector<EditOp>::crbegin;
251 using std::vector<EditOp>::rend;
252 using std::vector<EditOp>::crend;
253
254 /* Capacity */
255 using std::vector<EditOp>::empty;
256 using std::vector<EditOp>::size;
257 using std::vector<EditOp>::max_size;
258 using std::vector<EditOp>::reserve;
259 using std::vector<EditOp>::capacity;
260 using std::vector<EditOp>::shrink_to_fit;
261
262 /* Modifiers */
263 using std::vector<EditOp>::clear;
264 using std::vector<EditOp>::insert;
265 using std::vector<EditOp>::emplace;
266 using std::vector<EditOp>::erase;
267 using std::vector<EditOp>::push_back;
268 using std::vector<EditOp>::emplace_back;
269 using std::vector<EditOp>::pop_back;
270
271 void swap(Editops& rhs) noexcept {
272 std::swap(src_len, rhs.src_len);
273 std::swap(dest_len, rhs.dest_len);
274 std::vector<EditOp>::swap(rhs);
275 }
276
277 Editops slice(int start, int stop, int step = 1) const {
278 Editops ed_slice;
279 detail::vector_slice(ed_slice, *this, start, stop, step);
280 ed_slice.src_len = src_len;
281 ed_slice.dest_len = dest_len;
282 return ed_slice;
283 }
284
285 Editops reverse() const {
286 Editops reversed = *this;
287 std::reverse(reversed.begin(), reversed.end());
288 return reversed;
289 }
290
291 int64_t get_src_len() const {
292 return src_len;
293 }
294 void set_src_len(int64_t len) {
295 src_len = len;
296 }
297 int64_t get_dest_len() const {
298 return dest_len;
299 }
300 void set_dest_len(int64_t len) {
301 dest_len = len;
302 }
303
304 Editops inverse() const {
305 Editops inv_ops = *this;
306 std::swap(inv_ops.src_len, inv_ops.dest_len);
307 for (auto& op : inv_ops) {
308 std::swap(op.src_pos, op.dest_pos);
309 if (op.type == EditType::Delete) {
310 op.type = EditType::Insert;
311 } else if (op.type == EditType::Insert) {
312 op.type = EditType::Delete;
313 }
314 }
315 return inv_ops;
316 }
317
318 private:
319 int64_t src_len;
320 int64_t dest_len;
321 };
322
323 inline bool operator==(const Editops& lhs, const Editops& rhs) {
324 if (lhs.get_src_len() != rhs.get_src_len() || lhs.get_dest_len() != rhs.get_dest_len()) {
325 return false;
326 }
327
328 if (lhs.size() != rhs.size()) {
329 return false;
330 }
331 return std::equal(lhs.begin(), lhs.end(), rhs.begin());
332 }
333
334 inline bool operator!=(const Editops& lhs, const Editops& rhs) {
335 return !(lhs == rhs);
336 }
337
338 inline void swap(Editops& lhs, Editops& rhs) {
339 lhs.swap(rhs);
340 }
341
342 class Opcodes : private std::vector<Opcode> {
343 public:
344 using std::vector<Opcode>::size_type;
345
346 Opcodes() : src_len(0), dest_len(0) {}
347
348 Opcodes(size_type count, const Opcode& value)
349 : std::vector<Opcode>(count, value), src_len(0), dest_len(0) {}
350
351 explicit Opcodes(size_type count) : std::vector<Opcode>(count), src_len(0), dest_len(0) {}
352
353 Opcodes(const Opcodes& other)
354 : std::vector<Opcode>(other), src_len(other.src_len), dest_len(other.dest_len) {}
355
356 Opcodes(const Editops& other);
357
358 Opcodes(Opcodes&& other) noexcept {
359 swap(other);
360 }
361
363 swap(other);
364 return *this;
365 }
366
367 /* Element access */
368 using std::vector<Opcode>::at;
369 using std::vector<Opcode>::operator[];
370 using std::vector<Opcode>::front;
371 using std::vector<Opcode>::back;
372 using std::vector<Opcode>::data;
373
374 /* Iterators */
375 using std::vector<Opcode>::begin;
376 using std::vector<Opcode>::cbegin;
377 using std::vector<Opcode>::end;
378 using std::vector<Opcode>::cend;
379 using std::vector<Opcode>::rbegin;
380 using std::vector<Opcode>::crbegin;
381 using std::vector<Opcode>::rend;
382 using std::vector<Opcode>::crend;
383
384 /* Capacity */
385 using std::vector<Opcode>::empty;
386 using std::vector<Opcode>::size;
387 using std::vector<Opcode>::max_size;
388 using std::vector<Opcode>::reserve;
389 using std::vector<Opcode>::capacity;
390 using std::vector<Opcode>::shrink_to_fit;
391
392 /* Modifiers */
393 using std::vector<Opcode>::clear;
394 using std::vector<Opcode>::insert;
395 using std::vector<Opcode>::emplace;
396 using std::vector<Opcode>::erase;
397 using std::vector<Opcode>::push_back;
398 using std::vector<Opcode>::emplace_back;
399 using std::vector<Opcode>::pop_back;
400
401 void swap(Opcodes& rhs) noexcept {
402 std::swap(src_len, rhs.src_len);
403 std::swap(dest_len, rhs.dest_len);
404 std::vector<Opcode>::swap(rhs);
405 }
406
407 Opcodes slice(int start, int stop, int step = 1) const {
408 Opcodes ed_slice;
409 detail::vector_slice(ed_slice, *this, start, stop, step);
410 ed_slice.src_len = src_len;
411 ed_slice.dest_len = dest_len;
412 return ed_slice;
413 }
414
415 Opcodes reverse() const {
416 Opcodes reversed = *this;
417 std::reverse(reversed.begin(), reversed.end());
418 return reversed;
419 }
420
421 int64_t get_src_len() const {
422 return src_len;
423 }
424 void set_src_len(int64_t len) {
425 src_len = len;
426 }
427 int64_t get_dest_len() const {
428 return dest_len;
429 }
430 void set_dest_len(int64_t len) {
431 dest_len = len;
432 }
433
434 Opcodes inverse() const {
435 Opcodes inv_ops = *this;
436 std::swap(inv_ops.src_len, inv_ops.dest_len);
437 for (auto& op : inv_ops) {
438 std::swap(op.src_begin, op.dest_begin);
439 std::swap(op.src_end, op.dest_end);
440 if (op.type == EditType::Delete) {
441 op.type = EditType::Insert;
442 } else if (op.type == EditType::Insert) {
443 op.type = EditType::Delete;
444 }
445 }
446 return inv_ops;
447 }
448
449 private:
450 int64_t src_len;
451 int64_t dest_len;
452 };
453
454 inline bool operator==(const Opcodes& lhs, const Opcodes& rhs) {
455 if (lhs.get_src_len() != rhs.get_src_len() || lhs.get_dest_len() != rhs.get_dest_len()) {
456 return false;
457 }
458
459 if (lhs.size() != rhs.size()) {
460 return false;
461 }
462 return std::equal(lhs.begin(), lhs.end(), rhs.begin());
463 }
464
465 inline bool operator!=(const Opcodes& lhs, const Opcodes& rhs) {
466 return !(lhs == rhs);
467 }
468
469 inline void swap(Opcodes& lhs, Opcodes& rhs) {
470 lhs.swap(rhs);
471 }
472
473 inline Editops::Editops(const Opcodes& other) {
474 src_len = other.get_src_len();
475 dest_len = other.get_dest_len();
476 for (const auto& op : other) {
477 switch (op.type) {
478 case EditType::None:
479 break;
480
482 for (int64_t j = 0; j < op.src_end - op.src_begin; j++) {
483 push_back({EditType::Replace, op.src_begin + j, op.dest_begin + j});
484 }
485 break;
486
487 case EditType::Insert:
488 for (int64_t j = 0; j < op.dest_end - op.dest_begin; j++) {
489 push_back({EditType::Insert, op.src_begin, op.dest_begin + j});
490 }
491 break;
492
493 case EditType::Delete:
494 for (int64_t j = 0; j < op.src_end - op.src_begin; j++) {
495 push_back({EditType::Delete, op.src_begin + j, op.dest_begin});
496 }
497 break;
498 }
499 }
500 }
501
502 inline Opcodes::Opcodes(const Editops& other) {
503 src_len = other.get_src_len();
504 dest_len = other.get_dest_len();
505 int64_t src_pos = 0;
506 int64_t dest_pos = 0;
507 for (size_t i = 0; i < other.size();) {
508 if (src_pos < other[i].src_pos || dest_pos < other[i].dest_pos) {
509 push_back({EditType::None, src_pos, other[i].src_pos, dest_pos, other[i].dest_pos});
510 src_pos = other[i].src_pos;
511 dest_pos = other[i].dest_pos;
512 }
513
514 int64_t src_begin = src_pos;
515 int64_t dest_begin = dest_pos;
516 EditType type = other[i].type;
517 do {
518 switch (type) {
519 case EditType::None:
520 break;
521
523 src_pos++;
524 dest_pos++;
525 break;
526
527 case EditType::Insert:
528 dest_pos++;
529 break;
530
531 case EditType::Delete:
532 src_pos++;
533 break;
534 }
535 i++;
536 } while (i < other.size() && other[i].type == type && src_pos && other[i].src_pos && dest_pos == other[i].dest_pos);
537
538 push_back({type, src_begin, src_pos, dest_begin, dest_pos});
539 }
540
541 if (src_pos < other.get_src_len() || dest_pos < other.get_dest_len()) {
542 push_back({EditType::None, src_pos, other.get_src_len(), dest_pos, other.get_dest_len()});
543 }
544 }
545
546} // namespace rapidfuzz
547
548#include <functional>
549#include <iterator>
550#include <type_traits>
551#include <utility>
552
553namespace rapidfuzz {
554
555 namespace detail {
556 template<typename T>
557 auto inner_type(T const*) -> T;
558
559 template<typename T>
560 auto inner_type(T const&) -> typename T::value_type;
561 } // namespace detail
562
563 template<typename T>
564 using char_type = decltype(detail::inner_type(std::declval<T const&>()));
565
566 template<typename T>
567 using plain = std::remove_cv_t<std::remove_reference_t<T>>;
568
569 template<typename T>
570 using iterator_type = plain<decltype(*std::declval<T>())>;
571
572 template<typename... Conds>
573 struct satisfies_all : std::true_type {};
574
575 template<typename Cond, typename... Conds>
576 struct satisfies_all<Cond, Conds...>
577 : std::conditional<Cond::value, satisfies_all<Conds...>, std::false_type>::type {};
578
579 template<typename... Conds>
580 struct satisfies_any : std::false_type {};
581
582 template<typename Cond, typename... Conds>
583 struct satisfies_any<Cond, Conds...>
584 : std::conditional<Cond::value, std::true_type, satisfies_any<Conds...>>::type {};
585
586 // taken from
587 // https://stackoverflow.com/questions/16893992/check-if-type-can-be-explicitly-converted
588 template<typename From, typename To>
589 struct is_explicitly_convertible {
590 template<typename T>
591 static void f(T);
592
593 template<typename F, typename T>
594 static constexpr auto test(int /*unused*/)
595 -> decltype(f(static_cast<T>(std::declval<F>())), true) {
596 return true;
597 }
598
599 template<typename F, typename T>
600 static constexpr auto test(...) -> bool {
601 return false;
602 }
603
604 static bool const value = test<From, To>(0);
605 };
606
607#define GENERATE_HAS_MEMBER(member) \
608 \
609 template<typename T> \
610 struct has_member_##member { \
611 private: \
612 using yes = std::true_type; \
613 using no = std::false_type; \
614 \
615 struct Fallback { \
616 int member; \
617 }; \
618 struct Derived : T, Fallback {}; \
619 \
620 template<class U> \
621 static no test(decltype(U::member)*); \
622 template<typename U> \
623 static yes test(U*); \
624 \
625 template<typename U, typename = std::enable_if_t<std::is_class<U>::value>> \
626 static constexpr bool class_test(U*) { \
627 return std::is_same<decltype(test<Derived>(nullptr)), yes>::value; \
628 } \
629 \
630 template<typename U, typename = std::enable_if_t<!std::is_class<U>::value>> \
631 static constexpr bool class_test(const U&) { \
632 return false; \
633 } \
634 \
635 public: \
636 static constexpr bool value = class_test(static_cast<T*>(nullptr)); \
637 };
638
639 GENERATE_HAS_MEMBER(data) // Creates 'has_member_data'
640 GENERATE_HAS_MEMBER(size) // Creates 'has_member_size'
641
642 template<typename Sentence>
643 using has_data_and_size = satisfies_all<has_member_data<Sentence>, has_member_size<Sentence>>;
644
645 // This trait checks if a given type is a standard collection of hashable types
646 // SFINAE ftw
647 template<class T>
648 class is_hashable_sequence {
649 is_hashable_sequence() = delete;
650 using hashable = char;
651 struct not_hashable {
652 char t[2];
653 }; // Ensured to work on any platform
654 template<typename C>
655 static hashable matcher(decltype(&std::hash<typename C::value_type>::operator()));
656 template<typename C>
657 static not_hashable matcher(...);
658
659 public:
660 static bool const value = (sizeof(matcher<T>(nullptr)) == sizeof(hashable));
661 };
662
663 template<class T>
664 class is_standard_iterable {
665 is_standard_iterable() = delete;
666 using iterable = char;
667 struct not_iterable {
668 char t[2];
669 }; // Ensured to work on any platform
670 template<typename C>
671 static iterable matcher(typename C::const_iterator*);
672 template<typename C>
673 static not_iterable matcher(...);
674
675 public:
676 static bool const value = (sizeof(matcher<T>(nullptr)) == sizeof(iterable));
677 };
678
679 template<typename C>
680 void* sub_matcher(typename C::value_type const& (C::*) (int64_t) const);
681
682 // TODO: Not a real SFINAE, because of the ambiguity between
683 // value_type const& operator[](int64_t) const;
684 // and value_type& operator[](int64_t);
685 // Not really important
686 template<class T>
687 class has_bracket_operator {
688 has_bracket_operator() = delete;
689 using has_op = char;
690 struct hasnt_op {
691 char t[2];
692 }; // Ensured to work on any platform
693 template<typename C>
694 static has_op matcher(decltype(sub_matcher<T>(&T::at)));
695 template<typename C>
696 static hasnt_op matcher(...);
697
698 public:
699 static bool const value = (sizeof(matcher<T>(nullptr)) == sizeof(has_op));
700 };
701
702} // namespace rapidfuzz
703#include <string>
704
705namespace rapidfuzz {
706
707 template<typename InputIt>
709 public:
710 using CharT = iterator_type<InputIt>;
711
712 SplittedSentenceView(IteratorViewVec<InputIt> sentence) : m_sentence(std::move(sentence)) {}
713
714 int64_t dedupe();
715 int64_t size() const;
716
717 int64_t length() const {
718 return size();
719 }
720
721 bool empty() const {
722 return m_sentence.empty();
723 }
724
725 int64_t word_count() const {
726 return m_sentence.size();
727 }
728
729 std::basic_string<CharT> join() const;
730
732 return m_sentence;
733 }
734
735 private:
736 IteratorViewVec<InputIt> m_sentence;
737 };
738
739 template<typename InputIt>
741 int64_t old_word_count = word_count();
742 m_sentence.erase(std::unique(m_sentence.begin(), m_sentence.end()), m_sentence.end());
743 return old_word_count - word_count();
744 }
745
746 template<typename InputIt>
748 if (m_sentence.empty()) return 0;
749
750 // there is a whitespace between each word
751 int64_t result = m_sentence.size() - 1;
752 for (const auto& word : m_sentence) {
753 result += std::distance(word.first, word.last);
754 }
755
756 return result;
757 }
758
759 template<typename InputIt>
760 auto SplittedSentenceView<InputIt>::join() const -> std::basic_string<CharT> {
761 if (m_sentence.empty()) {
762 return std::basic_string<CharT>();
763 }
764
765 auto sentence_iter = m_sentence.begin();
766 std::basic_string<CharT> joined(sentence_iter->first, sentence_iter->last);
767 const std::basic_string<CharT> whitespace{0x20};
768 ++sentence_iter;
769 for (; sentence_iter != m_sentence.end(); ++sentence_iter) {
770 joined.append(whitespace)
771 .append(std::basic_string<CharT>(sentence_iter->first, sentence_iter->last));
772 }
773 return joined;
774 }
775
776} // namespace rapidfuzz
777#include <unordered_map>
778#include <vector>
779
780namespace rapidfuzz {
781
782 template<typename InputIt1, typename InputIt2, typename InputIt3>
783 struct DecomposedSet {
784 SplittedSentenceView<InputIt1> difference_ab;
785 SplittedSentenceView<InputIt2> difference_ba;
786 SplittedSentenceView<InputIt3> intersection;
787 DecomposedSet(SplittedSentenceView<InputIt1> diff_ab, SplittedSentenceView<InputIt2> diff_ba, SplittedSentenceView<InputIt3> intersect)
788 : difference_ab(std::move(diff_ab)),
789 difference_ba(std::move(diff_ba)),
790 intersection(std::move(intersect)) {}
791 };
792
793 namespace common {
794
801 template<typename InputIt1, typename InputIt2>
802 DecomposedSet<InputIt1, InputIt2, InputIt1> set_decomposition(SplittedSentenceView<InputIt1> a, SplittedSentenceView<InputIt2> b);
803
804 constexpr double result_cutoff(double result, double score_cutoff) {
805 return (result >= score_cutoff) ? result : 0;
806 }
807
808 template<int Max = 1>
809 constexpr double norm_distance(int64_t dist, int64_t lensum, double score_cutoff = 0) {
810 double max = static_cast<double>(Max);
811 return result_cutoff((lensum > 0) ? (max - max * dist / lensum) : max, score_cutoff);
812 }
813
814 template<int Max = 1>
815 static inline int64_t score_cutoff_to_distance(double score_cutoff, int64_t lensum) {
816 return static_cast<int64_t>(
817 std::ceil(static_cast<double>(lensum) * (1.0 - score_cutoff / Max))
818 );
819 }
820
821 template<typename T>
822 constexpr bool is_zero(T a, T tolerance = std::numeric_limits<T>::epsilon()) {
823 return std::fabs(a) <= tolerance;
824 }
825
826 template<typename Sentence, typename CharT = char_type<Sentence>, typename = std::enable_if_t<is_explicitly_convertible<Sentence, std::basic_string<CharT>>::value>>
827 std::basic_string<CharT> to_string(Sentence&& str);
828
829 template<typename Sentence, typename CharT = char_type<Sentence>, typename = std::enable_if_t<!is_explicitly_convertible<Sentence, std::basic_string<CharT>>::value && has_data_and_size<Sentence>::value>>
830 std::basic_string<CharT> to_string(const Sentence& str);
831
832 template<typename CharT>
833 CharT* to_begin(CharT* s) {
834 return s;
835 }
836
837 template<typename T>
838 auto to_begin(T& x) {
839 using std::begin;
840 return begin(x);
841 }
842
843 template<typename CharT>
844 CharT* to_end(CharT* s) {
845 while (*s != 0) {
846 ++s;
847 }
848 return s;
849 }
850
851 template<typename T>
852 auto to_end(T& x) {
853 using std::end;
854 return end(x);
855 }
856
867 template<typename InputIterator1, typename InputIterator2>
868 std::pair<InputIterator1, InputIterator2> mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2);
869
870 template<typename InputIt1, typename InputIt2>
871 StringAffix remove_common_affix(InputIt1& first1, InputIt1& last1, InputIt2& first2, InputIt2& last2);
872
873 template<typename InputIt1, typename InputIt2>
874 int64_t remove_common_prefix(InputIt1& first1, InputIt1 last1, InputIt2& first2, InputIt2 last2);
875
876 template<typename InputIt1, typename InputIt2>
877 int64_t remove_common_suffix(InputIt1 first1, InputIt1& last1, InputIt2 first2, InputIt2& last2);
878
879 template<typename InputIt, typename CharT = iterator_type<InputIt>>
880 SplittedSentenceView<InputIt> sorted_split(InputIt first, InputIt last);
881
882 template<typename T>
883 constexpr auto to_unsigned(T value) -> typename std::make_unsigned<T>::type {
884 return typename std::make_unsigned<T>::type(value);
885 }
886
887 template<typename T>
888 constexpr auto to_signed(T value) -> typename std::make_unsigned<T>::type {
889 return typename std::make_signed<T>::type(value);
890 }
891
892 /*
893 * taken from https://stackoverflow.com/a/17251989/11335032
894 */
895 template<typename T, typename U>
896 bool CanTypeFitValue(const U value) {
897 const intmax_t botT = intmax_t(std::numeric_limits<T>::min());
898 const intmax_t botU = intmax_t(std::numeric_limits<U>::min());
899 const uintmax_t topT = uintmax_t(std::numeric_limits<T>::max());
900 const uintmax_t topU = uintmax_t(std::numeric_limits<U>::max());
901 return !((botT > botU && value < static_cast<U>(botT)) || (topT < topU && value > static_cast<U>(topT)));
902 }
903
905 struct MapElem {
906 uint64_t key = 0;
907 uint64_t value = 0;
908 };
909 std::array<MapElem, 128> m_map;
910 std::array<uint64_t, 256> m_extendedAscii;
911
913
914 template<typename InputIt>
915 PatternMatchVector(InputIt first, InputIt last) : m_map(), m_extendedAscii() {
916 insert(first, last);
917 }
918
919 template<typename InputIt>
920 void insert(InputIt first, InputIt last) {
921 uint64_t mask = 1;
922 for (; first != last; ++first) {
923 insert_mask(*first, mask);
924 mask <<= 1;
925 }
926 }
927
928 template<typename CharT>
929 void insert(CharT key, int64_t pos) {
930 insert_mask(key, 1ull << pos);
931 }
932
933 template<typename CharT>
934 uint64_t get(CharT key) const {
935 if (key >= 0 && key <= 255) {
936 return m_extendedAscii[(uint8_t) key];
937 } else {
938 return m_map[lookup((uint64_t) key)].value;
939 }
940 }
941
942 template<typename CharT>
943 uint64_t get(int64_t block, CharT key) const {
944 assert(block == 0);
945 (void) block;
946 return get(key);
947 }
948
949 private:
950 template<typename CharT>
951 void insert_mask(CharT key, uint64_t mask) {
952 if (key >= 0 && key <= 255) {
953 m_extendedAscii[(uint8_t) key] |= mask;
954 } else {
955 int64_t i = lookup((uint64_t) key);
956 m_map[i].key = key;
957 m_map[i].value |= mask;
958 }
959 }
960
965 int64_t lookup(uint64_t key) const {
966 int64_t i = key % 128;
967
968 if (!m_map[i].value || m_map[i].key == key) {
969 return i;
970 }
971
972 int64_t perturb = key;
973 while (true) {
974 i = ((i * 5) + perturb + 1) % 128;
975 if (!m_map[i].value || m_map[i].key == key) {
976 return i;
977 }
978
979 perturb >>= 5;
980 }
981 }
982 };
983
985 std::vector<PatternMatchVector> m_val;
986
988
989 template<typename InputIt>
990 BlockPatternMatchVector(InputIt first, InputIt last) {
991 insert(first, last);
992 }
993
994 template<typename CharT>
995 void insert(int64_t block, CharT ch, int pos) {
996 auto* be = &m_val[block];
997 be->insert(ch, pos);
998 }
999
1000 template<typename InputIt>
1001 void insert(InputIt first, InputIt last) {
1002 int64_t len = std::distance(first, last);
1003 int64_t block_count = (len / 64) + bool(len % 64);
1004 m_val.resize(block_count);
1005
1006 for (int64_t block = 0; block < block_count; ++block) {
1007 if (std::distance(first + block * 64, last) > 64) {
1008 m_val[block].insert(first + block * 64, first + (block + 1) * 64);
1009 } else {
1010 m_val[block].insert(first + block * 64, last);
1011 }
1012 }
1013 }
1014
1015 template<typename CharT>
1016 uint64_t get(int64_t block, CharT ch) const {
1017 auto* be = &m_val[block];
1018 return be->get(ch);
1019 }
1020 };
1021
1022 template<typename CharT1, typename ValueType, int64_t size = sizeof(CharT1)>
1023 struct CharHashTable;
1024
1025 template<typename CharT1, typename ValueType>
1026 struct CharHashTable<CharT1, ValueType, 1> {
1027 using UCharT1 = typename std::make_unsigned<CharT1>::type;
1028
1029 std::array<ValueType, std::numeric_limits<UCharT1>::max() + 1> m_val;
1030 ValueType m_default;
1031
1033
1034 ValueType& create(CharT1 ch) {
1035 return m_val[UCharT1(ch)];
1036 }
1037
1038 template<typename CharT2>
1039 const ValueType& operator[](CharT2 ch) const {
1040 if (!CanTypeFitValue<CharT1>(ch)) {
1041 return m_default;
1042 }
1043
1044 return m_val[UCharT1(ch)];
1045 }
1046 };
1047
1048 template<typename CharT1, typename ValueType, int64_t size>
1050 std::unordered_map<CharT1, ValueType> m_val;
1051 ValueType m_default;
1052
1054
1055 ValueType& create(CharT1 ch) {
1056 return m_val[ch];
1057 }
1058
1059 template<typename CharT2>
1060 const ValueType& operator[](CharT2 ch) const {
1061 if (!CanTypeFitValue<CharT1>(ch)) {
1062 return m_default;
1063 }
1064
1065 auto search = m_val.find(CharT1(ch));
1066 if (search == m_val.end()) {
1067 return m_default;
1068 }
1069
1070 return search->second;
1071 }
1072 };
1073
1074 template<typename T>
1076 explicit MatrixVectorView(T* vector, int64_t cols) : m_vector(vector), m_cols(cols) {}
1077
1078 T& operator[](uint64_t col) {
1079 assert(col < m_cols);
1080 return m_vector[col];
1081 }
1082
1083 private:
1084 T* m_vector;
1085 int64_t m_cols;
1086 };
1087
1088 template<typename T>
1090 explicit ConstMatrixVectorView(const T* vector, int64_t cols) : m_vector(vector), m_cols(cols) {}
1091
1092 ConstMatrixVectorView(const MatrixVectorView<T>& other) : m_vector(other.m_vector) {}
1093
1094 const T& operator[](uint64_t col) {
1095 assert(col < m_cols);
1096 return m_vector[col];
1097 }
1098
1099 private:
1100 const T* m_vector;
1101 int64_t m_cols;
1102 };
1103
1104 template<typename T>
1105 struct Matrix {
1106 Matrix(uint64_t rows, uint64_t cols, uint64_t val) {
1107 m_rows = rows;
1108 m_cols = cols;
1109 if (rows * cols > 0) {
1110 m_matrix = new T[rows * cols];
1111 std::fill_n(m_matrix, rows * cols, val);
1112 } else {
1113 m_matrix = nullptr;
1114 }
1115 }
1116
1118 delete[] m_matrix;
1119 }
1120
1122 assert(row < m_rows);
1123 return MatrixVectorView<uint64_t>(&m_matrix[row * m_cols], m_rows);
1124 }
1125
1127 assert(row < m_rows);
1128 return ConstMatrixVectorView<uint64_t>(&m_matrix[row * m_cols], m_rows);
1129 }
1130
1131 private:
1132 uint64_t m_rows;
1133 uint64_t m_cols;
1134 T* m_matrix;
1135 };
1136
1139 } // namespace common
1140} // namespace rapidfuzz
1141
1142
1143#include <algorithm>
1144#include <array>
1145#include <cctype>
1146#include <cwctype>
1147
1148namespace rapidfuzz {
1149
1150 template<typename InputIt1, typename InputIt2>
1151 DecomposedSet<InputIt1, InputIt2, InputIt1>
1153 a.dedupe();
1154 b.dedupe();
1155
1156 IteratorViewVec<InputIt1> intersection;
1157 IteratorViewVec<InputIt1> difference_ab;
1158 IteratorViewVec<InputIt2> difference_ba = b.words();
1159
1160 for (const auto& current_a : a.words()) {
1161 auto element_b = std::find(difference_ba.begin(), difference_ba.end(), current_a);
1162
1163 if (element_b != difference_ba.end()) {
1164 difference_ba.erase(element_b);
1165 intersection.push_back(current_a);
1166 } else {
1167 difference_ab.push_back(current_a);
1168 }
1169 }
1170
1171 return {difference_ab, difference_ba, intersection};
1172 }
1173
1174 template<typename Sentence, typename CharT, typename>
1175 std::basic_string<CharT> common::to_string(Sentence&& str) {
1176 return std::basic_string<CharT>(std::forward<Sentence>(str));
1177 }
1178
1179 template<typename Sentence, typename CharT, typename>
1180 std::basic_string<CharT> common::to_string(const Sentence& str) {
1181 return std::basic_string<CharT>(str.data(), str.size());
1182 }
1183
1184 template<typename InputIterator1, typename InputIterator2>
1185 std::pair<InputIterator1, InputIterator2>
1186 common::mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2) {
1187 while (first1 != last1 && first2 != last2 && *first1 == *first2) {
1188 ++first1;
1189 ++first2;
1190 }
1191 return std::pair<InputIterator1, InputIterator2>(first1, first2);
1192 }
1193
1197 template<typename InputIt1, typename InputIt2>
1198 int64_t common::remove_common_prefix(InputIt1& first1, InputIt1 last1, InputIt2& first2, InputIt2 last2) {
1199 int64_t prefix = std::distance(first1, common::mismatch(first1, last1, first2, last2).first);
1200 first1 += prefix;
1201 first2 += prefix;
1202 return prefix;
1203 }
1204
1208 template<typename InputIt1, typename InputIt2>
1209 int64_t common::remove_common_suffix(InputIt1 first1, InputIt1& last1, InputIt2 first2, InputIt2& last2) {
1210 auto rfirst1 = std::make_reverse_iterator(last1);
1211 auto rlast1 = std::make_reverse_iterator(first1);
1212 auto rfirst2 = std::make_reverse_iterator(last2);
1213 auto rlast2 = std::make_reverse_iterator(first2);
1214
1215 int64_t suffix =
1216 std::distance(rfirst1, common::mismatch(rfirst1, rlast1, rfirst2, rlast2).first);
1217 last1 -= suffix;
1218 last2 -= suffix;
1219 return suffix;
1220 }
1221
1225 template<typename InputIt1, typename InputIt2>
1226 StringAffix common::remove_common_affix(InputIt1& first1, InputIt1& last1, InputIt2& first2, InputIt2& last2) {
1227 return StringAffix{(int64_t) remove_common_prefix(first1, last1, first2, last2), (int64_t) remove_common_suffix(first1, last1, first2, last2)};
1228 }
1229
1230 template<typename, typename = void>
1231 struct is_space_dispatch_tag : std::integral_constant<int, 0> {};
1232
1233 template<typename CharT>
1234 struct is_space_dispatch_tag<CharT, typename std::enable_if<sizeof(CharT) == 1>::type>
1235 : std::integral_constant<int, 1> {};
1236
1237 /*
1238 * Implementation of is_space for char types that are at least 2 Byte in size
1239 */
1240 template<typename CharT>
1241 bool is_space_impl(const CharT ch, std::integral_constant<int, 0>) {
1242 switch (ch) {
1243 case 0x0009:
1244 case 0x000A:
1245 case 0x000B:
1246 case 0x000C:
1247 case 0x000D:
1248 case 0x001C:
1249 case 0x001D:
1250 case 0x001E:
1251 case 0x001F:
1252 case 0x0020:
1253 case 0x0085:
1254 case 0x00A0:
1255 case 0x1680:
1256 case 0x2000:
1257 case 0x2001:
1258 case 0x2002:
1259 case 0x2003:
1260 case 0x2004:
1261 case 0x2005:
1262 case 0x2006:
1263 case 0x2007:
1264 case 0x2008:
1265 case 0x2009:
1266 case 0x200A:
1267 case 0x2028:
1268 case 0x2029:
1269 case 0x202F:
1270 case 0x205F:
1271 case 0x3000:
1272 return true;
1273 }
1274 return false;
1275 }
1276
1277 /*
1278 * Implementation of is_space for char types that are 1 Byte in size
1279 */
1280 template<typename CharT>
1281 bool is_space_impl(const CharT ch, std::integral_constant<int, 1>) {
1282 switch (ch) {
1283 case 0x0009:
1284 case 0x000A:
1285 case 0x000B:
1286 case 0x000C:
1287 case 0x000D:
1288 case 0x001C:
1289 case 0x001D:
1290 case 0x001E:
1291 case 0x001F:
1292 case 0x0020:
1293 return true;
1294 }
1295 return false;
1296 }
1297
1298 /*
1299 * checks whether unicode characters have the bidirectional
1300 * type 'WS', 'B' or 'S' or the category 'Zs'
1301 */
1302 template<typename CharT>
1303 bool is_space(const CharT ch) {
1304 return is_space_impl(ch, is_space_dispatch_tag<CharT>{});
1305 }
1306
1307 template<typename InputIt, typename CharT>
1309 IteratorViewVec<InputIt> splitted;
1310 auto second = first;
1311
1312 for (; second != last && first != last; first = second + 1) {
1313 second = std::find_if(first, last, is_space<CharT>);
1314
1315 if (first != second) {
1316 splitted.emplace_back(first, second);
1317 }
1318 }
1319
1320 std::sort(splitted.begin(), splitted.end());
1321
1322 return SplittedSentenceView<InputIt>(splitted);
1323 }
1324
1325} // namespace rapidfuzz
1326#include <stdexcept>
1327
1328namespace rapidfuzz {
1329
1354 template<typename InputIt1, typename InputIt2>
1355 int64_t hamming_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff = std::numeric_limits<int64_t>::max());
1356
1357 template<typename Sentence1, typename Sentence2>
1358 int64_t hamming_distance(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = std::numeric_limits<int64_t>::max());
1359
1360 template<typename InputIt1, typename InputIt2>
1361 int64_t hamming_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff = 0);
1362
1363 template<typename Sentence1, typename Sentence2>
1364 int64_t hamming_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0);
1365
1366 template<typename InputIt1, typename InputIt2>
1367 double hamming_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 1.0);
1368
1369 template<typename Sentence1, typename Sentence2>
1370 double hamming_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0);
1371
1396 template<typename InputIt1, typename InputIt2>
1397 double hamming_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0);
1398
1399 template<typename Sentence1, typename Sentence2>
1400 double hamming_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0);
1401
1402 template<typename CharT1>
1403 struct CachedHamming {
1404 template<typename Sentence1>
1405 CachedHamming(const Sentence1& s1_) : s1(common::to_string(s1_)) {}
1406
1407 template<typename InputIt1>
1408 CachedHamming(InputIt1 first1, InputIt1 last1) : s1(first1, last1) {}
1409
1410 template<typename InputIt2>
1411 int64_t distance(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = std::numeric_limits<int64_t>::max()) const;
1412
1413 template<typename Sentence2>
1414 int64_t distance(const Sentence2& s2, int64_t score_cutoff = std::numeric_limits<int64_t>::max()) const;
1415
1416 template<typename InputIt2>
1417 int64_t similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = 0) const;
1418
1419 template<typename Sentence2>
1420 int64_t similarity(const Sentence2& s2, int64_t score_cutoff = 0) const;
1421
1422 template<typename InputIt2>
1423 double normalized_distance(InputIt2 first2, InputIt2 last2, double score_cutoff = 1.0) const;
1424
1425 template<typename Sentence2>
1426 double normalized_distance(const Sentence2& s2, double score_cutoff = 1.0) const;
1427
1428 template<typename InputIt2>
1429 double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0) const;
1430
1431 template<typename Sentence2>
1432 double normalized_similarity(const Sentence2& s2, double score_cutoff = 0.0) const;
1433
1434 private:
1435 std::basic_string<CharT1> s1;
1436 };
1437
1438#if __cplusplus >= 201703L
1439 template<typename Sentence1>
1440 CachedHamming(const Sentence1& s1_) -> CachedHamming<char_type<Sentence1>>;
1441
1442 template<typename InputIt1>
1443 CachedHamming(InputIt1 first1, InputIt1 last1) -> CachedHamming<iterator_type<InputIt1>>;
1444#endif
1445
1448} // namespace rapidfuzz
1449
1450
1451#include <cmath>
1452
1453namespace rapidfuzz {
1454
1455 template<typename InputIt1, typename InputIt2>
1456 int64_t hamming_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
1457 if (std::distance(first1, last1) != std::distance(first2, last2)) {
1458 throw std::invalid_argument("Sequences are not the same length.");
1459 }
1460
1461 int64_t dist = 0;
1462 for (; first1 != last1; first1++, first2++) {
1463 dist += bool(*first1 != *first2);
1464 }
1465
1466 return (dist <= score_cutoff) ? dist : score_cutoff + 1;
1467 }
1468
1469 template<typename Sentence1, typename Sentence2>
1470 int64_t hamming_distance(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff) {
1471 return hamming_distance(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), score_cutoff);
1472 }
1473
1474 template<typename InputIt1, typename InputIt2>
1475 int64_t hamming_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
1476 int64_t maximum = std::distance(first1, last1);
1477 int64_t cutoff_distance = maximum - score_cutoff;
1478 int64_t dist = hamming_distance(first1, last1, first2, last2, cutoff_distance);
1479 int64_t sim = maximum - dist;
1480 return (sim >= score_cutoff) ? sim : 0;
1481 }
1482
1483 template<typename Sentence1, typename Sentence2>
1484 int64_t hamming_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff) {
1485 return hamming_similarity(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), score_cutoff);
1486 }
1487
1488 template<typename InputIt1, typename InputIt2>
1489 double hamming_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
1490 int64_t maximum = std::distance(first1, last1);
1491 int64_t cutoff_distance = static_cast<int64_t>(std::ceil(maximum * score_cutoff));
1492 int64_t dist = hamming_distance(first1, last1, first2, last2, cutoff_distance);
1493 double norm_dist = (maximum) ? (double) dist / (double) maximum : 0.0;
1494 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
1495 }
1496
1497 template<typename Sentence1, typename Sentence2>
1498 double hamming_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff) {
1499 return hamming_normalized_distance(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), score_cutoff);
1500 }
1501
1502 template<typename InputIt1, typename InputIt2>
1503 double hamming_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
1504 int64_t maximum = std::distance(first1, last1);
1505 int64_t cutoff_distance = maximum - score_cutoff;
1506 int64_t dist = hamming_distance(first1, last1, first2, last2, cutoff_distance);
1507 int64_t sim = maximum - dist;
1508 return (sim >= score_cutoff) ? sim : 0;
1509 }
1510
1511 template<typename Sentence1, typename Sentence2>
1512 double hamming_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff) {
1513 return hamming_normalized_similarity(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), score_cutoff);
1514 }
1515
1516 template<typename CharT1>
1517 template<typename InputIt2>
1518 int64_t CachedHamming<CharT1>::distance(InputIt2 first2, InputIt2 last2, int64_t score_cutoff) const {
1519 return hamming_distance(common::to_begin(s1), common::to_end(s1), first2, last2, score_cutoff);
1520 }
1521
1522 template<typename CharT1>
1523 template<typename Sentence2>
1524 int64_t CachedHamming<CharT1>::distance(const Sentence2& s2, int64_t score_cutoff) const {
1525 return hamming_distance(s1, s2, score_cutoff);
1526 }
1527
1528 template<typename CharT1>
1529 template<typename InputIt2>
1530 int64_t CachedHamming<CharT1>::similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff) const {
1531 return hamming_similarity(common::to_begin(s1), common::to_end(s1), first2, last2, score_cutoff);
1532 }
1533
1534 template<typename CharT1>
1535 template<typename Sentence2>
1536 int64_t CachedHamming<CharT1>::similarity(const Sentence2& s2, int64_t score_cutoff) const {
1537 return hamming_similarity(s1, s2, score_cutoff);
1538 }
1539
1540 template<typename CharT1>
1541 template<typename InputIt2>
1542 double CachedHamming<CharT1>::normalized_distance(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
1543 return hamming_normalized_distance(common::to_begin(s1), common::to_end(s1), first2, last2, score_cutoff);
1544 }
1545
1546 template<typename CharT1>
1547 template<typename Sentence2>
1548 double CachedHamming<CharT1>::normalized_distance(const Sentence2& s2, double score_cutoff) const {
1549 return hamming_normalized_distance(s1, s2, score_cutoff);
1550 }
1551
1552 template<typename CharT1>
1553 template<typename InputIt2>
1554 double CachedHamming<CharT1>::normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
1555 return hamming_normalized_similarity(common::to_begin(s1), common::to_end(s1), first2, last2, score_cutoff);
1556 }
1557
1558 template<typename CharT1>
1559 template<typename Sentence2>
1560 double CachedHamming<CharT1>::normalized_similarity(const Sentence2& s2, double score_cutoff) const {
1561 return hamming_normalized_similarity(s1, s2, score_cutoff);
1562 }
1563
1566} // namespace rapidfuzz
1567
1568
1569#include <cmath>
1570#include <limits>
1571
1572namespace rapidfuzz {
1573
1574 template<typename InputIt1, typename InputIt2>
1575 int64_t indel_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max = std::numeric_limits<int64_t>::max());
1576
1577 template<typename Sentence1, typename Sentence2>
1578 int64_t indel_distance(const Sentence1& s1, const Sentence2& s2, int64_t max = std::numeric_limits<int64_t>::max());
1579
1580 template<typename InputIt1, typename InputIt2>
1581 int64_t indel_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff = 0.0);
1582
1583 template<typename Sentence1, typename Sentence2>
1584 int64_t indel_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0.0);
1585
1586 template<typename InputIt1, typename InputIt2>
1587 double indel_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 1.0);
1588
1589 template<typename Sentence1, typename Sentence2>
1590 double indel_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0);
1591
1592 template<typename InputIt1, typename InputIt2>
1593 double indel_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0);
1594
1595 template<typename Sentence1, typename Sentence2>
1596 double indel_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0);
1597
1598 template<typename InputIt1, typename InputIt2>
1599 Editops indel_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2);
1600
1601 template<typename Sentence1, typename Sentence2>
1602 Editops indel_editops(const Sentence1& s1, const Sentence2& s2);
1603
1604 template<typename CharT1>
1605 struct CachedIndel {
1606 template<typename Sentence1>
1607 CachedIndel(const Sentence1& s1_)
1608 : s1(common::to_string(s1_)), PM(common::to_begin(s1), common::to_end(s1)) {}
1609
1610 template<typename InputIt1>
1611 CachedIndel(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(first1, last1) {}
1612
1613 template<typename InputIt2>
1614 int64_t distance(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = std::numeric_limits<int64_t>::max()) const;
1615
1616 template<typename Sentence2>
1617 int64_t distance(const Sentence2& s2, int64_t score_cutoff = std::numeric_limits<int64_t>::max()) const;
1618
1619 template<typename InputIt2>
1620 int64_t similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = 0) const;
1621
1622 template<typename Sentence2>
1623 int64_t similarity(const Sentence2& s2, int64_t score_cutoff = 0) const;
1624
1625 template<typename InputIt2>
1626 double normalized_distance(InputIt2 first2, InputIt2 last2, double score_cutoff = 1.0) const;
1627
1628 template<typename Sentence2>
1629 double normalized_distance(const Sentence2& s2, double score_cutoff = 1.0) const;
1630
1631 template<typename InputIt2>
1632 double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0) const;
1633
1634 template<typename Sentence2>
1635 double normalized_similarity(const Sentence2& s2, double score_cutoff = 0.0) const;
1636
1637 private:
1638 std::basic_string<CharT1> s1;
1639 common::BlockPatternMatchVector PM;
1640 };
1641
1642#if __cplusplus >= 201703L
1643 template<typename Sentence1>
1644 CachedIndel(const Sentence1& s1_) -> CachedIndel<char_type<Sentence1>>;
1645
1646 template<typename InputIt1>
1647 CachedIndel(InputIt1 first1, InputIt1 last1) -> CachedIndel<iterator_type<InputIt1>>;
1648#endif
1649
1650} // namespace rapidfuzz
1651
1652
1653#include <cstddef>
1654#include <cstdint>
1655
1656#if defined(_MSC_VER) && !defined(__clang__)
1657#include <intrin.h>
1658#endif
1659
1660namespace rapidfuzz {
1661 namespace detail {
1662
1663 static inline uint64_t addc64(uint64_t a, uint64_t b, uint64_t carryin, uint64_t* carryout) {
1664 /* todo should use _addcarry_u64 when available */
1665 a += carryin;
1666 *carryout = a < carryin;
1667 a += b;
1668 *carryout |= a < b;
1669 return a;
1670 }
1671
1672 template<typename T, typename U>
1673 T ceil_div(T a, U divisor) {
1674 return a / divisor + static_cast<T>(a % divisor != 0);
1675 }
1676
1677 static inline int64_t popcount64(uint64_t x) {
1678 const uint64_t m1 = 0x5555555555555555;
1679 const uint64_t m2 = 0x3333333333333333;
1680 const uint64_t m4 = 0x0f0f0f0f0f0f0f0f;
1681 const uint64_t h01 = 0x0101010101010101;
1682
1683 x -= (x >> 1) & m1;
1684 x = (x & m2) + ((x >> 2) & m2);
1685 x = (x + (x >> 4)) & m4;
1686 return static_cast<int64_t>((x * h01) >> 56);
1687 }
1688
1692 template<typename T>
1693 T blsi(T a) {
1694 return a & -a;
1695 }
1696
1700 template<typename T>
1701 T blsr(T x) {
1702 return x & (x - 1);
1703 }
1704
1709 template<typename T>
1710 T blsmsk(T a) {
1711 return a ^ (a - 1);
1712 }
1713
1714#if defined(_MSC_VER) && !defined(__clang__)
1715 static inline int tzcnt(uint32_t x) {
1716 unsigned long trailing_zero = 0;
1717 _BitScanForward(&trailing_zero, x);
1718 return trailing_zero;
1719 }
1720
1721#if defined(_M_ARM) || defined(_M_X64)
1722 static inline int tzcnt(uint64_t x) {
1723 unsigned long trailing_zero = 0;
1724 _BitScanForward64(&trailing_zero, x);
1725 return trailing_zero;
1726 }
1727#else
1728 static inline int tzcnt(uint64_t x) {
1729 uint32_t msh = (uint32_t) (x >> 32);
1730 uint32_t lsh = (uint32_t) (x & 0xFFFFFFFF);
1731 if (lsh != 0) {
1732 return tzcnt(lsh);
1733 }
1734 return 32 + tzcnt(msh);
1735 }
1736#endif
1737
1738#else /* gcc / clang */
1739 static inline int tzcnt(uint32_t x) {
1740 return __builtin_ctz(x);
1741 }
1742
1743 static inline int tzcnt(uint64_t x) {
1744 return __builtin_ctzll(x);
1745 }
1746#endif
1747
1748 } // namespace detail
1749} // namespace rapidfuzz
1750
1751#include <algorithm>
1752#include <array>
1753#include <limits>
1754#include <string>
1755
1756namespace rapidfuzz {
1757 namespace detail {
1758
1759 /*
1760 * An encoded mbleven model table.
1761 *
1762 * Each 8-bit integer represents an edit sequence, with using two
1763 * bits for a single operation.
1764 *
1765 * Each Row of 8 integers represent all possible combinations
1766 * of edit sequences for a gived maximum edit distance and length
1767 * difference between the two strings, that is below the maximum
1768 * edit distance
1769 *
1770 * 0x1 = 01 = DELETE,
1771 * 0x2 = 10 = INSERT
1772 *
1773 * 0x5 -> DEL + DEL
1774 * 0x6 -> DEL + INS
1775 * 0x9 -> INS + DEL
1776 * 0xA -> INS + INS
1777 */
1778 static constexpr uint8_t indel_mbleven2018_matrix[14][7] = {
1779 /* max edit distance 1 */
1780 {0},
1781 /* case does not occur */ /* len_diff 0 */
1782 {0x01}, /* len_diff 1 */
1783 /* max edit distance 2 */
1784 {0x09, 0x06}, /* len_diff 0 */
1785 {0x01}, /* len_diff 1 */
1786 {0x05}, /* len_diff 2 */
1787 /* max edit distance 3 */
1788 {0x09, 0x06}, /* len_diff 0 */
1789 {0x25, 0x19, 0x16}, /* len_diff 1 */
1790 {0x05}, /* len_diff 2 */
1791 {0x15}, /* len_diff 3 */
1792 /* max edit distance 4 */
1793 {0x96, 0x66, 0x5A, 0x99, 0x69, 0xA5}, /* len_diff 0 */
1794 {0x25, 0x19, 0x16}, /* len_diff 1 */
1795 {0x65, 0x56, 0x95, 0x59}, /* len_diff 2 */
1796 {0x15}, /* len_diff 3 */
1797 {0x55}, /* len_diff 4 */
1798 };
1799
1800 template<typename InputIt1, typename InputIt2>
1801 int64_t indel_mbleven2018(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
1802 int64_t len1 = std::distance(first1, last1);
1803 int64_t len2 = std::distance(first2, last2);
1804
1805 if (len1 < len2) {
1806 return indel_mbleven2018(first2, last2, first1, last1, max);
1807 }
1808
1809 int64_t len_diff = len1 - len2;
1810 auto possible_ops = indel_mbleven2018_matrix[(max + max * max) / 2 + len_diff - 1];
1811 int64_t dist = max + 1;
1812
1813 for (int pos = 0; possible_ops[pos] != 0; ++pos) {
1814 uint8_t ops = possible_ops[pos];
1815 int64_t s1_pos = 0;
1816 int64_t s2_pos = 0;
1817 int64_t cur_dist = 0;
1818
1819 while (s1_pos < len1 && s2_pos < len2) {
1820 if (first1[s1_pos] != first2[s2_pos]) {
1821 cur_dist++;
1822
1823 if (!ops) break;
1824 if (ops & 1) s1_pos++;
1825 if (ops & 2) s2_pos++;
1826 ops >>= 2;
1827 } else {
1828 s1_pos++;
1829 s2_pos++;
1830 }
1831 }
1832
1833 cur_dist += (len1 - s1_pos) + (len2 - s2_pos);
1834 dist = std::min(dist, cur_dist);
1835 }
1836
1837 return (dist <= max) ? dist : max + 1;
1838 }
1839
1840 template<int64_t N, typename PMV, typename InputIt1, typename InputIt2>
1841 static inline int64_t longest_common_subsequence_unroll(const PMV& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
1842 int64_t len1 = std::distance(first1, last1);
1843 int64_t len2 = std::distance(first2, last2);
1844
1845 uint64_t S[N];
1846 for (int64_t i = 0; i < N; ++i) {
1847 S[i] = ~0x0ull;
1848 }
1849
1850 for (; first2 != last2; ++first2) {
1851 uint64_t carry = 0;
1852 uint64_t Matches[N];
1853 uint64_t u[N];
1854 uint64_t x[N];
1855 for (int64_t i = 0; i < N; ++i) {
1856 Matches[i] = block.get(i);
1857 u[i] = S[i] & Matches[i];
1858 x[i] = addc64(S[i], u[i], carry, &carry);
1859 S[i] = x[i] | (S[i] - u[i]);
1860 }
1861 }
1862
1863 int64_t res = 0;
1864 for (int64_t i = 0; i < N; ++i) {
1865 res += popcount64(~S[i]);
1866 }
1867
1868 int64_t dist = len1 + len2 - 2 * res;
1869 return (dist <= max) ? dist : max + 1;
1870 }
1871
1872 template<typename InputIt1, typename InputIt2>
1873 static inline int64_t
1874 longest_common_subsequence_blockwise(const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
1875 int64_t len1 = std::distance(first1, last1);
1876 int64_t len2 = std::distance(first2, last2);
1877
1878 int64_t words = block.m_val.size();
1879 std::vector<uint64_t> S(words, ~0x0ull);
1880
1881 for (; first2 != last2; ++first2) {
1882 uint64_t carry = 0;
1883 for (int64_t word = 0; word < words; ++word) {
1884 const uint64_t Matches = block.get(word, *first2);
1885 uint64_t Stemp = S[word];
1886
1887 uint64_t u = Stemp & Matches;
1888
1889 uint64_t x = addc64(Stemp, u, carry, &carry);
1890 S[word] = x | (Stemp - u);
1891 }
1892 }
1893
1894 int64_t res = 0;
1895 for (uint64_t Stemp : S) {
1896 res += popcount64(~Stemp);
1897 }
1898
1899 int64_t dist = len1 + len2 - 2 * res;
1900 return (dist <= max) ? dist : max + 1;
1901 }
1902
1903 template<typename InputIt1, typename InputIt2>
1904 int64_t longest_common_subsequence(const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
1905 int64_t len1 = std::distance(first1, last1);
1906 int64_t len2 = std::distance(first2, last2);
1907 int64_t nr = ceil_div(len1, 64);
1908 switch (nr) {
1909 case 0:
1910 return (len2 <= max) ? len2 : max + 1;
1911 case 1:
1912 return longest_common_subsequence_unroll<1>(block, first1, last1, first2, last2, max);
1913 case 2:
1914 return longest_common_subsequence_unroll<2>(block, first1, last1, first2, last2, max);
1915 case 3:
1916 return longest_common_subsequence_unroll<3>(block, first1, last1, first2, last2, max);
1917 case 4:
1918 return longest_common_subsequence_unroll<4>(block, first1, last1, first2, last2, max);
1919 case 5:
1920 return longest_common_subsequence_unroll<5>(block, first1, last1, first2, last2, max);
1921 case 6:
1922 return longest_common_subsequence_unroll<6>(block, first1, last1, first2, last2, max);
1923 case 7:
1924 return longest_common_subsequence_unroll<7>(block, first1, last1, first2, last2, max);
1925 case 8:
1926 return longest_common_subsequence_unroll<8>(block, first1, last1, first2, last2, max);
1927 default:
1928 return longest_common_subsequence_blockwise(block, first1, last1, first2, last2, max);
1929 }
1930 }
1931
1932 template<typename InputIt1, typename InputIt2>
1933 int64_t longest_common_subsequence(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
1934 int64_t len1 = std::distance(first1, last1);
1935 int64_t nr = ceil_div(len1, 64);
1936 switch (nr) {
1937 case 1: {
1938 auto block = common::PatternMatchVector(first1, last1);
1939 return longest_common_subsequence_unroll<1>(block, first1, last1, first2, last2, max);
1940 }
1941 case 2: {
1942 auto block = common::BlockPatternMatchVector(first1, last1);
1943 return longest_common_subsequence_unroll<2>(block, first1, last1, first2, last2, max);
1944 }
1945 case 3: {
1946 auto block = common::BlockPatternMatchVector(first1, last1);
1947 return longest_common_subsequence_unroll<3>(block, first1, last1, first2, last2, max);
1948 }
1949 case 4: {
1950 auto block = common::BlockPatternMatchVector(first1, last1);
1951 return longest_common_subsequence_unroll<4>(block, first1, last1, first2, last2, max);
1952 }
1953 case 5: {
1954 auto block = common::BlockPatternMatchVector(first1, last1);
1955 return longest_common_subsequence_unroll<5>(block, first1, last1, first2, last2, max);
1956 }
1957 case 6: {
1958 auto block = common::BlockPatternMatchVector(first1, last1);
1959 return longest_common_subsequence_unroll<6>(block, first1, last1, first2, last2, max);
1960 }
1961 case 7: {
1962 auto block = common::BlockPatternMatchVector(first1, last1);
1963 return longest_common_subsequence_unroll<7>(block, first1, last1, first2, last2, max);
1964 }
1965 case 8: {
1966 auto block = common::BlockPatternMatchVector(first1, last1);
1967 return longest_common_subsequence_unroll<8>(block, first1, last1, first2, last2, max);
1968 }
1969 default: {
1970 auto block = common::BlockPatternMatchVector(first1, last1);
1971 return longest_common_subsequence_blockwise(block, first1, last1, first2, last2, max);
1972 }
1973 }
1974 }
1975
1976 template<typename InputIt1, typename InputIt2>
1977 int64_t indel_distance(const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
1978 int64_t len1 = std::distance(first1, last1);
1979 int64_t len2 = std::distance(first2, last2);
1980
1981 /* no edits are allowed */
1982 if (max == 0 || (max == 1 && len1 == len2)) {
1983 return !std::equal(first1, last1, first2, last2);
1984 }
1985
1986 if (max < std::abs(len1 - len2)) {
1987 return max + 1;
1988 }
1989
1990 // do this first, since we can not remove any affix in encoded form
1991 if (max >= 5) {
1992 return longest_common_subsequence(block, first1, last1, first2, last2, max);
1993 }
1994
1995 /* common affix does not effect Levenshtein distance */
1996 common::remove_common_affix(first1, last1, first2, last2);
1997 len1 = std::distance(first1, last1);
1998 len2 = std::distance(first2, last2);
1999 if (!len1 || !len2) {
2000 return len1 + len2;
2001 }
2002
2003 return indel_mbleven2018(first1, last1, first2, last2, max);
2004 }
2005
2006 template<typename InputIt1, typename InputIt2>
2007 int64_t indel_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
2008 int64_t len1 = std::distance(first1, last1);
2009 int64_t len2 = std::distance(first2, last2);
2010
2011 // Swapping the strings so the second string is shorter
2012 if (len1 < len2) {
2013 return indel_distance(first2, last2, first1, last1, max);
2014 }
2015
2016 /* no edits are allowed */
2017 if (max == 0 || (max == 1 && len1 == len2)) {
2018 return !std::equal(first1, last1, first2, last2);
2019 }
2020
2021 if (max < std::abs(len1 - len2)) {
2022 return max + 1;
2023 }
2024
2025 /* common affix does not effect Levenshtein distance */
2026 common::remove_common_affix(first1, last1, first2, last2);
2027 len1 = std::distance(first1, last1);
2028 len2 = std::distance(first2, last2);
2029 if (!len1 || !len2) {
2030 return len1 + len2;
2031 }
2032
2033 if (max < 5) {
2034 return indel_mbleven2018(first1, last1, first2, last2, max);
2035 }
2036
2037 return longest_common_subsequence(first1, last1, first2, last2, max);
2038 }
2039
2040 template<typename InputIt1, typename InputIt2>
2041 double indel_normalized_distance(const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
2042 int64_t len1 = std::distance(first1, last1);
2043 int64_t len2 = std::distance(first2, last2);
2044 int64_t maximum = len1 + len2;
2045 int64_t cutoff_distance = static_cast<int64_t>(std::ceil(maximum * score_cutoff));
2046 int64_t dist = indel_distance(block, first1, last1, first2, last2, cutoff_distance);
2047 double norm_dist = (maximum) ? (double) dist / (double) maximum : 0.0;
2048 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
2049 }
2050
2051 template<typename InputIt1, typename InputIt2>
2052 double indel_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
2053 int64_t len1 = std::distance(first1, last1);
2054 int64_t len2 = std::distance(first2, last2);
2055 int64_t maximum = len1 + len2;
2056 int64_t cutoff_distance = static_cast<int64_t>(std::ceil(maximum * score_cutoff));
2057 int64_t dist = indel_distance(first1, last1, first2, last2, cutoff_distance);
2058 double norm_dist = (maximum) ? (double) dist / (double) maximum : 0.0;
2059 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
2060 }
2061
2062 template<typename InputIt1, typename InputIt2>
2063 int64_t indel_similarity(const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
2064 int64_t len1 = std::distance(first1, last1);
2065 int64_t len2 = std::distance(first2, last2);
2066 int64_t maximum = len1 + len2;
2067 int64_t cutoff_distance = maximum - score_cutoff;
2068 int64_t dist = indel_distance(block, first1, last1, first2, last2, cutoff_distance);
2069 int64_t sim = maximum - dist;
2070 return (sim >= score_cutoff) ? sim : 0;
2071 }
2072
2073 template<typename InputIt1, typename InputIt2>
2074 int64_t indel_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
2075 int64_t len1 = std::distance(first1, last1);
2076 int64_t len2 = std::distance(first2, last2);
2077 int64_t maximum = len1 + len2;
2078 int64_t cutoff_distance = maximum - score_cutoff;
2079 int64_t dist = indel_distance(first1, last1, first2, last2, cutoff_distance);
2080 int64_t sim = maximum - dist;
2081 return (sim >= score_cutoff) ? sim : 0;
2082 }
2083
2084 template<typename InputIt1, typename InputIt2>
2085 double indel_normalized_similarity(const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
2086 double norm_dist =
2087 indel_normalized_distance(block, first1, last1, first2, last2, 1.0 - score_cutoff);
2088 double norm_sim = 1.0 - norm_dist;
2089 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
2090 }
2091
2092 template<typename InputIt1, typename InputIt2>
2093 double indel_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
2094 double norm_dist = indel_normalized_distance(first1, last1, first2, last2, 1.0 - score_cutoff);
2095 double norm_sim = 1.0 - norm_dist;
2096 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
2097 }
2098
2099 struct LLCSBitMatrix {
2100 LLCSBitMatrix(uint64_t rows, uint64_t cols) : S(rows, cols, (uint64_t) -1), dist(0) {}
2101
2102 common::Matrix<uint64_t> S;
2103
2104 int64_t dist;
2105 };
2106
2110 template<typename InputIt1, typename InputIt2>
2111 Editops recover_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, const LLCSBitMatrix& matrix, StringAffix affix) {
2112 int64_t len1 = std::distance(first1, last1);
2113 int64_t len2 = std::distance(first2, last2);
2114 int64_t dist = matrix.dist;
2115 Editops editops(dist);
2116 editops.set_src_len(len1 + affix.prefix_len + affix.suffix_len);
2117 editops.set_dest_len(len2 + affix.prefix_len + affix.suffix_len);
2118
2119 if (dist == 0) {
2120 return editops;
2121 }
2122
2123 int64_t row = len1;
2124 int64_t col = len2;
2125
2126 while (row && col) {
2127 uint64_t col_pos = col - 1;
2128 uint64_t col_word = col_pos / 64;
2129 col_pos = col_pos % 64;
2130 uint64_t mask = 1ull << col_pos;
2131
2132 /* Insertion */
2133 if (matrix.S[row - 1][col_word] & mask) {
2134 dist--;
2135 col--;
2136 editops[dist].type = EditType::Insert;
2137 editops[dist].src_pos = row + affix.prefix_len;
2138 editops[dist].dest_pos = col + affix.prefix_len;
2139 } else {
2140 row--;
2141
2142 /* Deletion */
2143 if (row && (~matrix.S[row - 1][col_word]) & mask) {
2144 dist--;
2145 editops[dist].type = EditType::Delete;
2146 editops[dist].src_pos = row + affix.prefix_len;
2147 editops[dist].dest_pos = col + affix.prefix_len;
2148 }
2149 /* Match */
2150 else {
2151 col--;
2152 assert(first1[row] == first2[col]);
2153 }
2154 }
2155 }
2156
2157 while (col) {
2158 dist--;
2159 col--;
2160 editops[dist].type = EditType::Insert;
2161 editops[dist].src_pos = row + affix.prefix_len;
2162 editops[dist].dest_pos = col + affix.prefix_len;
2163 }
2164
2165 while (row) {
2166 dist--;
2167 row--;
2168 editops[dist].type = EditType::Delete;
2169 editops[dist].src_pos = row + affix.prefix_len;
2170 editops[dist].dest_pos = col + affix.prefix_len;
2171 }
2172
2173 return editops;
2174 }
2175
2176 template<int64_t N, typename PMV, typename InputIt1, typename InputIt2>
2177 LLCSBitMatrix llcs_matrix_unroll(const PMV& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
2178 int64_t len1 = std::distance(first1, last1);
2179 int64_t len2 = std::distance(first2, last2);
2180 uint64_t S[N];
2181 for (int64_t i = 0; i < N; ++i) {
2182 S[i] = ~0x0ull;
2183 }
2184
2185 LLCSBitMatrix matrix(len2, N);
2186
2187 for (int64_t i = 0; i < len2; ++i) {
2188 uint64_t carry = 0;
2189 uint64_t Matches[N];
2190 uint64_t u[N];
2191 uint64_t x[N];
2192 for (int64_t word = 0; word < N; ++word) {
2193 Matches[word] = block.get(word);
2194 u[word] = S[word] & Matches[word];
2195 x[word] = addc64(S[word], u[word], carry, &carry);
2196 S[word] = matrix.S[i][word] = x[word] | (S[word] - u[word]);
2197 }
2198 }
2199
2200 int64_t res = 0;
2201 for (int64_t i = 0; i < N; ++i) {
2202 res += popcount64(~S[i]);
2203 }
2204
2205 matrix.dist = len1 + len2 - 2 * res;
2206
2207 return matrix;
2208 }
2209
2210 template<typename InputIt1, typename InputIt2>
2211 LLCSBitMatrix llcs_matrix_blockwise(const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
2212 int64_t len1 = std::distance(first1, last1);
2213 int64_t len2 = std::distance(first2, last2);
2214 int64_t words = block.m_val.size();
2215 /* todo could be replaced with access to matrix which would slightly
2216 * reduce memory usage */
2217 std::vector<uint64_t> S(words, ~0x0ull);
2218 LLCSBitMatrix matrix(len2, words);
2219
2220 for (int64_t i = 0; i < len2; ++i) {
2221 uint64_t carry = 0;
2222 for (int64_t word = 0; word < words; ++word) {
2223 const uint64_t Matches = block.get(word, first2[i]);
2224 uint64_t Stemp = S[word];
2225
2226 uint64_t u = Stemp & Matches;
2227
2228 uint64_t x = addc64(Stemp, u, carry, &carry);
2229 S[word] = matrix.S[i][word] = x | (Stemp - u);
2230 }
2231 }
2232
2233 int64_t res = 0;
2234 for (uint64_t Stemp : S) {
2235 res += popcount64(~Stemp);
2236 }
2237
2238 matrix.dist = len1 + len2 - 2 * res;
2239
2240 return matrix;
2241 }
2242
2243 template<typename InputIt1, typename InputIt2>
2244 LLCSBitMatrix llcs_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
2245 int64_t len1 = std::distance(first1, last1);
2246 int64_t len2 = std::distance(first2, last2);
2247 if (!len1 || !len2) {
2248 LLCSBitMatrix matrix(0, 0);
2249 matrix.dist = len1 + len2;
2250 return matrix;
2251 } else if (len2 <= 64) {
2252 common::PatternMatchVector block(first2, last2);
2253 return llcs_matrix_unroll<1>(block, first2, last2, first1, last1);
2254 } else {
2255 common::BlockPatternMatchVector block(first2, last2);
2256 switch (block.m_val.size()) {
2257 case 1:
2258 return llcs_matrix_unroll<1>(block, first2, last2, first1, last1);
2259 case 2:
2260 return llcs_matrix_unroll<2>(block, first2, last2, first1, last1);
2261 case 3:
2262 return llcs_matrix_unroll<3>(block, first2, last2, first1, last1);
2263 case 4:
2264 return llcs_matrix_unroll<4>(block, first2, last2, first1, last1);
2265 case 5:
2266 return llcs_matrix_unroll<5>(block, first2, last2, first1, last1);
2267 case 6:
2268 return llcs_matrix_unroll<6>(block, first2, last2, first1, last1);
2269 case 7:
2270 return llcs_matrix_unroll<7>(block, first2, last2, first1, last1);
2271 case 8:
2272 return llcs_matrix_unroll<8>(block, first2, last2, first1, last1);
2273 default:
2274 return llcs_matrix_blockwise(block, first2, last2, first1, last1);
2275 }
2276 }
2277 }
2278
2279 } // namespace detail
2280
2281 template<typename InputIt1, typename InputIt2>
2282 int64_t indel_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
2283 return detail::indel_distance(first1, last1, first2, last2, score_cutoff);
2284 }
2285
2286 template<typename Sentence1, typename Sentence2>
2287 int64_t indel_distance(const Sentence1& s1, const Sentence2& s2, int64_t max) {
2288 return indel_distance(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), max);
2289 }
2290
2291 template<typename InputIt1, typename InputIt2>
2292 double indel_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
2293 int64_t maximum = std::distance(first1, last1) + std::distance(first2, last2);
2294 int64_t cutoff_distance = static_cast<int64_t>(std::ceil(maximum * score_cutoff));
2295 int64_t dist = indel_distance(first1, last1, first2, last2, cutoff_distance);
2296 double norm_dist = (maximum) ? (double) dist / (double) maximum : 0.0;
2297 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
2298 }
2299
2300 template<typename Sentence1, typename Sentence2>
2301 double indel_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff) {
2302 return indel_normalized_distance(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), score_cutoff);
2303 }
2304
2305 template<typename InputIt1, typename InputIt2>
2306 int64_t indel_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
2307 int64_t maximum = std::distance(first1, last1) + std::distance(first2, last2);
2308 int64_t cutoff_distance = maximum - score_cutoff;
2309 int64_t dist = indel_distance(first1, last1, first2, last2, cutoff_distance);
2310 int64_t sim = maximum - dist;
2311 return (sim >= score_cutoff) ? sim : 0;
2312 }
2313
2314 template<typename Sentence1, typename Sentence2>
2315 int64_t indel_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff) {
2317 }
2318
2319 template<typename InputIt1, typename InputIt2>
2320 double indel_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
2321 double norm_dist = indel_normalized_distance(first1, last1, first2, last2, 1.0 - score_cutoff);
2322 double norm_sim = 1.0 - norm_dist;
2323 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
2324 }
2325
2326 template<typename Sentence1, typename Sentence2>
2327 double indel_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff) {
2328 return indel_normalized_similarity(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), score_cutoff);
2329 }
2330
2331 template<typename InputIt1, typename InputIt2>
2332 Editops indel_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
2333 /* prefix and suffix are no-ops, which do not need to be added to the editops */
2334 StringAffix affix = common::remove_common_affix(first1, last1, first2, last2);
2335
2336 return detail::recover_alignment(first1, last1, first2, last2, detail::llcs_matrix(first1, last1, first2, last2), affix);
2337 }
2338
2339 template<typename Sentence1, typename Sentence2>
2340 Editops indel_editops(const Sentence1& s1, const Sentence2& s2) {
2341 return indel_editops(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2));
2342 }
2343
2344 template<typename CharT1>
2345 template<typename InputIt2>
2346 int64_t CachedIndel<CharT1>::distance(InputIt2 first2, InputIt2 last2, int64_t score_cutoff) const {
2347 return detail::indel_distance(PM, common::to_begin(s1), common::to_end(s1), first2, last2, score_cutoff);
2348 }
2349
2350 template<typename CharT1>
2351 template<typename Sentence2>
2352 int64_t CachedIndel<CharT1>::distance(const Sentence2& s2, int64_t score_cutoff) const {
2353 return distance(common::to_begin(s2), common::to_end(s2), score_cutoff);
2354 }
2355
2356 template<typename CharT1>
2357 template<typename InputIt2>
2358 double CachedIndel<CharT1>::normalized_distance(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
2359 int64_t maximum = s1.size() + std::distance(first2, last2);
2360 int64_t cutoff_distance = static_cast<int64_t>(std::ceil(maximum * score_cutoff));
2361 int64_t dist = distance(first2, last2, cutoff_distance);
2362 double norm_dist = (maximum) ? (double) dist / (double) maximum : 0.0;
2363 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
2364 }
2365
2366 template<typename CharT1>
2367 template<typename Sentence2>
2368 double CachedIndel<CharT1>::normalized_distance(const Sentence2& s2, double score_cutoff) const {
2369 return normalized_distance(common::to_begin(s2), common::to_end(s2), score_cutoff);
2370 }
2371
2372 template<typename CharT1>
2373 template<typename InputIt2>
2374 int64_t CachedIndel<CharT1>::similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff) const {
2375 int64_t maximum = s1.size() + std::distance(first2, last2);
2376 int64_t cutoff_distance = maximum - score_cutoff;
2377 int64_t dist = distance(first2, last2, cutoff_distance);
2378 int64_t sim = maximum - dist;
2379 return (sim >= score_cutoff) ? sim : 0;
2380 }
2381
2382 template<typename CharT1>
2383 template<typename Sentence2>
2384 int64_t CachedIndel<CharT1>::similarity(const Sentence2& s2, int64_t score_cutoff) const {
2385 return similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
2386 }
2387
2388 template<typename CharT1>
2389 template<typename InputIt2>
2390 double CachedIndel<CharT1>::normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
2391 double norm_dist = normalized_distance(first2, last2, 1.0 - score_cutoff);
2392 double norm_sim = 1.0 - norm_dist;
2393 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
2394 }
2395
2396 template<typename CharT1>
2397 template<typename Sentence2>
2398 double CachedIndel<CharT1>::normalized_similarity(const Sentence2& s2, double score_cutoff) const {
2399 return normalized_similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
2400 }
2401
2402} // namespace rapidfuzz
2403
2404#include <limits>
2405
2406namespace rapidfuzz {
2407
2537 template<typename InputIt1, typename InputIt2>
2538 int64_t levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights = {1, 1, 1}, int64_t max = std::numeric_limits<int64_t>::max());
2539
2540 template<typename Sentence1, typename Sentence2>
2541 int64_t levenshtein_distance(const Sentence1& s1, const Sentence2& s2, LevenshteinWeightTable weights = {1, 1, 1}, int64_t max = std::numeric_limits<int64_t>::max());
2542
2543 template<typename InputIt1, typename InputIt2>
2544 int64_t levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights = {1, 1, 1}, int64_t score_cutoff = 0.0);
2545
2546 template<typename Sentence1, typename Sentence2>
2547 int64_t levenshtein_similarity(const Sentence1& s1, const Sentence2& s2, LevenshteinWeightTable weights = {1, 1, 1}, int64_t score_cutoff = 0.0);
2548
2549 template<typename InputIt1, typename InputIt2>
2550 double levenshtein_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights = {1, 1, 1}, double score_cutoff = 1.0);
2551
2552 template<typename Sentence1, typename Sentence2>
2553 double levenshtein_normalized_distance(const Sentence1& s1, const Sentence2& s2, LevenshteinWeightTable weights = {1, 1, 1}, double score_cutoff = 1.0);
2554
2614 template<typename InputIt1, typename InputIt2>
2615 double levenshtein_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights = {1, 1, 1}, double score_cutoff = 0.0);
2616
2617 template<typename Sentence1, typename Sentence2>
2618 double levenshtein_normalized_similarity(const Sentence1& s1, const Sentence2& s2, LevenshteinWeightTable weights = {1, 1, 1}, double score_cutoff = 0.0);
2619
2635 template<typename InputIt1, typename InputIt2>
2636 Editops levenshtein_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2);
2637
2638 template<typename Sentence1, typename Sentence2>
2639 Editops levenshtein_editops(const Sentence1& s1, const Sentence2& s2);
2640
2641 template<typename CharT1>
2643 template<typename Sentence1>
2644 CachedLevenshtein(const Sentence1& s1_, LevenshteinWeightTable aWeights = {1, 1, 1})
2645 : s1(common::to_string(s1_)),
2646 PM(common::to_begin(s1), common::to_end(s1)),
2647 weights(aWeights) {}
2648
2649 template<typename InputIt1>
2650 CachedLevenshtein(InputIt1 first1, InputIt1 last1, LevenshteinWeightTable aWeights = {1, 1, 1})
2651 : s1(first1, last1), PM(first1, last1), weights(aWeights) {}
2652
2653 template<typename InputIt2>
2654 int64_t distance(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = std::numeric_limits<int64_t>::max()) const;
2655
2656 template<typename Sentence2>
2657 int64_t distance(const Sentence2& s2, int64_t score_cutoff = std::numeric_limits<int64_t>::max()) const;
2658
2659 template<typename InputIt2>
2660 int64_t similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = 0) const;
2661
2662 template<typename Sentence2>
2663 int64_t similarity(const Sentence2& s2, int64_t score_cutoff = 0) const;
2664
2665 template<typename InputIt2>
2666 double normalized_distance(InputIt2 first2, InputIt2 last2, double score_cutoff = 1.0) const;
2667
2668 template<typename Sentence2>
2669 double normalized_distance(const Sentence2& s2, double score_cutoff = 1.0) const;
2670
2671 template<typename InputIt2>
2672 double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0) const;
2673
2674 template<typename Sentence2>
2675 double normalized_similarity(const Sentence2& s2, double score_cutoff = 0.0) const;
2676
2677 private:
2678 std::basic_string<CharT1> s1;
2679 common::BlockPatternMatchVector PM;
2680 LevenshteinWeightTable weights;
2681 };
2682
2683#if __cplusplus >= 201703L
2684 template<typename Sentence1>
2685 CachedLevenshtein(const Sentence1& s1_, LevenshteinWeightTable aWeights)
2686 -> CachedLevenshtein<char_type<Sentence1>>;
2687
2688 template<typename InputIt1>
2689 CachedLevenshtein(InputIt1 first1, InputIt1 last1, LevenshteinWeightTable aWeights)
2690 -> CachedLevenshtein<iterator_type<InputIt1>>;
2691#endif
2692
2693} // namespace rapidfuzz
2694
2695
2696#include <cmath>
2697
2698namespace rapidfuzz {
2699 namespace detail {
2700 template<typename InputIt1, typename InputIt2>
2701 int64_t generalized_levenshtein_wagner_fischer(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights, int64_t max) {
2702 int64_t len1 = std::distance(first1, last1);
2703 int64_t cache_size = len1 + 1;
2704 std::vector<int64_t> cache(cache_size);
2705
2706 cache[0] = 0;
2707 for (int64_t i = 1; i < cache_size; ++i) {
2708 cache[i] = cache[i - 1] + (int64_t) weights.delete_cost;
2709 }
2710
2711 for (; first2 != last2; ++first2) {
2712 auto cache_iter = cache.begin();
2713 int64_t temp = *cache_iter;
2714 *cache_iter += (int64_t) weights.insert_cost;
2715
2716 auto _first1 = first1;
2717 for (; _first1 != last1; ++_first1) {
2718 if (*_first1 != *first2) {
2719 temp = std::min({*cache_iter + (int64_t) weights.delete_cost, *(cache_iter + 1) + (int64_t) weights.insert_cost, temp + (int64_t) weights.replace_cost});
2720 }
2721 ++cache_iter;
2722 std::swap(*cache_iter, temp);
2723 }
2724 }
2725
2726 int64_t dist = cache.back();
2727 return (dist <= max) ? dist : max + 1;
2728 }
2729
2734 template<typename InputIt1, typename InputIt2>
2735 int64_t levenshtein_maximum(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights) {
2736 int64_t len1 = std::distance(first1, last1);
2737 int64_t len2 = std::distance(first2, last2);
2738
2739 int64_t max_dist = len1 * (int64_t) weights.delete_cost + len2 * (int64_t) weights.insert_cost;
2740
2741 if (len1 >= len2) {
2742 max_dist = std::min(max_dist, len2 * (int64_t) weights.replace_cost + (len1 - len2) * (int64_t) weights.delete_cost);
2743 } else {
2744 max_dist = std::min(max_dist, len1 * (int64_t) weights.replace_cost + (len2 - len1) * (int64_t) weights.insert_cost);
2745 }
2746
2747 return max_dist;
2748 }
2749
2754 template<typename InputIt1, typename InputIt2>
2755 int64_t levenshtein_min_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights) {
2756 int64_t len1 = std::distance(first1, last1);
2757 int64_t len2 = std::distance(first2, last2);
2758 return std::max((len1 - len2) * (int64_t) weights.delete_cost, (len2 - len1) * (int64_t) weights.insert_cost);
2759 }
2760
2761 template<typename InputIt1, typename InputIt2>
2762 int64_t generalized_levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights, int64_t max) {
2763 int64_t min_edits = levenshtein_min_distance(first1, last1, first2, last2, weights);
2764 if (min_edits > max) {
2765 return max + 1;
2766 }
2767
2768 /* common affix does not effect Levenshtein distance */
2769 common::remove_common_affix(first1, last1, first2, last2);
2770
2771 return generalized_levenshtein_wagner_fischer(first1, last1, first2, last2, weights, max);
2772 }
2773
2774 /*
2775 * An encoded mbleven model table.
2776 *
2777 * Each 8-bit integer represents an edit sequence, with using two
2778 * bits for a single operation.
2779 *
2780 * Each Row of 8 integers represent all possible combinations
2781 * of edit sequences for a gived maximum edit distance and length
2782 * difference between the two strings, that is below the maximum
2783 * edit distance
2784 *
2785 * 01 = DELETE, 10 = INSERT, 11 = SUBSTITUTE
2786 *
2787 * For example, 3F -> 0b111111 means three substitutions
2788 */
2789 static constexpr uint8_t levenshtein_mbleven2018_matrix[9][8] = {
2790 /* max edit distance 1 */
2791 {0x03}, /* len_diff 0 */
2792 {0x01}, /* len_diff 1 */
2793 /* max edit distance 2 */
2794 {0x0F, 0x09, 0x06}, /* len_diff 0 */
2795 {0x0D, 0x07}, /* len_diff 1 */
2796 {0x05}, /* len_diff 2 */
2797 /* max edit distance 3 */
2798 {0x3F, 0x27, 0x2D, 0x39, 0x36, 0x1E, 0x1B}, /* len_diff 0 */
2799 {0x3D, 0x37, 0x1F, 0x25, 0x19, 0x16}, /* len_diff 1 */
2800 {0x35, 0x1D, 0x17}, /* len_diff 2 */
2801 {0x15}, /* len_diff 3 */
2802 };
2803
2804 template<typename InputIt1, typename InputIt2>
2805 int64_t levenshtein_mbleven2018(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
2806 int64_t len1 = std::distance(first1, last1);
2807 int64_t len2 = std::distance(first2, last2);
2808
2809 if (len1 < len2) {
2810 return levenshtein_mbleven2018(first2, last2, first1, last1, max);
2811 }
2812
2813 int64_t len_diff = len1 - len2;
2814 auto possible_ops = levenshtein_mbleven2018_matrix[(max + max * max) / 2 + len_diff - 1];
2815 int64_t dist = max + 1;
2816
2817 for (int pos = 0; possible_ops[pos] != 0; ++pos) {
2818 uint8_t ops = possible_ops[pos];
2819 int64_t s1_pos = 0;
2820 int64_t s2_pos = 0;
2821 int64_t cur_dist = 0;
2822 while (s1_pos < len1 && s2_pos < len2) {
2823 if (first1[s1_pos] != first2[s2_pos]) {
2824 cur_dist++;
2825 if (!ops) break;
2826 if (ops & 1) s1_pos++;
2827 if (ops & 2) s2_pos++;
2828 ops >>= 2;
2829 } else {
2830 s1_pos++;
2831 s2_pos++;
2832 }
2833 }
2834 cur_dist += (len1 - s1_pos) + (len2 - s2_pos);
2835 dist = std::min(dist, cur_dist);
2836 }
2837
2838 return (dist <= max) ? dist : max + 1;
2839 }
2840
2859 template<typename InputIt1, typename InputIt2>
2860 int64_t levenshtein_hyrroe2003(const common::PatternMatchVector& PM, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
2861 int64_t len1 = std::distance(first1, last1);
2862
2863 /* VP is set to 1^m. Shifting by bitwidth would be undefined behavior */
2864 uint64_t VP = (uint64_t) -1;
2865 uint64_t VN = 0;
2866 int64_t currDist = len1;
2867
2868 /* mask used when computing D[m,j] in the paper 10^(m-1) */
2869 uint64_t mask = (uint64_t) 1 << (len1 - 1);
2870
2871 /* Searching */
2872 for (; first2 != last2; ++first2) {
2873 /* Step 1: Computing D0 */
2874 uint64_t PM_j = PM.get(*first2);
2875 uint64_t X = PM_j;
2876 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
2877
2878 /* Step 2: Computing HP and HN */
2879 uint64_t HP = VN | ~(D0 | VP);
2880 uint64_t HN = D0 & VP;
2881
2882 /* Step 3: Computing the value D[m,j] */
2883 currDist += bool(HP & mask);
2884 currDist -= bool(HN & mask);
2885
2886 /* Step 4: Computing Vp and VN */
2887 HP = (HP << 1) | 1;
2888 HN = (HN << 1);
2889
2890 VP = HN | ~(D0 | HP);
2891 VN = HP & D0;
2892 }
2893
2894 return (currDist <= max) ? currDist : max + 1;
2895 }
2896
2897 template<typename InputIt1, typename InputIt2>
2898 int64_t levenshtein_hyrroe2003_small_band(const common::BlockPatternMatchVector& PM, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
2899 int64_t len1 = std::distance(first1, last1);
2900 int64_t len2 = std::distance(first2, last2);
2901
2902 /* VP is set to 1^m. Shifting by bitwidth would be undefined behavior */
2903 uint64_t VP = (uint64_t) -1;
2904 uint64_t VN = 0;
2905
2906 int64_t currDist = len1;
2907
2908 /* mask used when computing D[m,j] in the paper 10^(m-1) */
2909 uint64_t mask = 1ull << 63;
2910
2911 const int64_t words = PM.m_val.size();
2912
2913 /* Searching */
2914 for (int64_t i = 0; i < len2; ++i) {
2915 /* Step 1: Computing D0 */
2916 int64_t word = i / 64;
2917 int64_t word_pos = i % 64;
2918
2919 uint64_t PM_j = PM.get(word, first2[i]) >> word_pos;
2920
2921 if (word + 1 < words && word_pos != 0) {
2922 PM_j |= PM.get(word + 1, first2[i]) << (64 - word_pos);
2923 }
2924
2925 /* Step 1: Computing D0 */
2926 uint64_t X = PM_j;
2927 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
2928
2929 /* Step 2: Computing HP and HN */
2930 uint64_t HP = VN | ~(D0 | VP);
2931 uint64_t HN = D0 & VP;
2932
2933 /* Step 3: Computing the value D[m,j] */
2934 currDist += bool(HP & mask);
2935 currDist -= bool(HN & mask);
2936
2937 /* Step 4: Computing Vp and VN */
2938 VP = HN | ~((D0 >> 1) | HP);
2939 VN = (D0 >> 1) & HP;
2940 }
2941
2942 return (currDist <= max) ? currDist : max + 1;
2943 }
2944
2945 template<typename InputIt1, typename InputIt2>
2946 int64_t levenshtein_myers1999_block(const common::BlockPatternMatchVector& PM, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
2947 struct Vectors {
2948 uint64_t VP;
2949 uint64_t VN;
2950
2951 Vectors() : VP(~0x0ull), VN(0) {}
2952 };
2953
2954 int64_t len1 = std::distance(first1, last1);
2955 int64_t len2 = std::distance(first2, last2);
2956 int64_t words = PM.m_val.size();
2957 int64_t currDist = len1;
2958
2959 /* upper bound */
2960 max = std::min(max, std::max(len1, len2));
2961
2962 // todo could safe up to 25% even without max when ignoring irrelevant paths
2963 int64_t full_band = std::min(len1, 2 * max + 1);
2964
2965 if (full_band <= 64) {
2966 return levenshtein_hyrroe2003_small_band(PM, first1, last1, first2, last2, max);
2967 }
2968
2969 std::vector<Vectors> vecs(words);
2970 uint64_t Last = (uint64_t) 1 << ((len1 - 1) % 64);
2971
2972 /* Searching */
2973 for (int64_t i = 0; i < len2; i++) {
2974 uint64_t HP_carry = 1;
2975 uint64_t HN_carry = 0;
2976
2977 for (int64_t word = 0; word < words - 1; word++) {
2978 /* Step 1: Computing D0 */
2979 uint64_t PM_j = PM.get(word, first2[i]);
2980 uint64_t VN = vecs[word].VN;
2981 uint64_t VP = vecs[word].VP;
2982
2983 uint64_t X = PM_j | HN_carry;
2984 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
2985
2986 /* Step 2: Computing HP and HN */
2987 uint64_t HP = VN | ~(D0 | VP);
2988 uint64_t HN = D0 & VP;
2989
2990 /* Step 3: Computing the value D[m,j] */
2991 // only required for last vector
2992
2993 /* Step 4: Computing Vp and VN */
2994 uint64_t HP_carry_temp = HP_carry;
2995 HP_carry = HP >> 63;
2996 HP = (HP << 1) | HP_carry_temp;
2997
2998 uint64_t HN_carry_temp = HN_carry;
2999 HN_carry = HN >> 63;
3000 HN = (HN << 1) | HN_carry_temp;
3001
3002 vecs[word].VP = HN | ~(D0 | HP);
3003 vecs[word].VN = HP & D0;
3004 }
3005
3006 {
3007 /* Step 1: Computing D0 */
3008 uint64_t PM_j = PM.get(words - 1, first2[i]);
3009 uint64_t VN = vecs[words - 1].VN;
3010 uint64_t VP = vecs[words - 1].VP;
3011
3012 uint64_t X = PM_j | HN_carry;
3013 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
3014
3015 /* Step 2: Computing HP and HN */
3016 uint64_t HP = VN | ~(D0 | VP);
3017 uint64_t HN = D0 & VP;
3018
3019 /* Step 3: Computing the value D[m,j] */
3020 currDist += bool(HP & Last);
3021 currDist -= bool(HN & Last);
3022
3023 /* Step 4: Computing Vp and VN */
3024 HP = (HP << 1) | HP_carry;
3025 HN = (HN << 1) | HN_carry;
3026
3027 vecs[words - 1].VP = HN | ~(D0 | HP);
3028 vecs[words - 1].VN = HP & D0;
3029 }
3030 }
3031
3032 return (currDist <= max) ? currDist : max + 1;
3033 }
3034
3035 template<typename InputIt1, typename InputIt2>
3036 int64_t uniform_levenshtein_distance(const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
3037 int64_t len1 = std::distance(first1, last1);
3038 int64_t len2 = std::distance(first2, last2);
3039
3040 // when no differences are allowed a direct comparision is sufficient
3041 if (max == 0) {
3042 return !std::equal(first1, last1, first2, last2);
3043 }
3044
3045 if (max < std::abs(len1 - len2)) {
3046 return max + 1;
3047 }
3048
3049 // important to catch, since this causes block.m_val to be empty -> raises exception on access
3050 if (!len1) {
3051 return (len2 <= max) ? len2 : max + 1;
3052 }
3053
3054 /* do this first, since we can not remove any affix in encoded form
3055 * todo actually we could at least remove the common prefix and just shift the band
3056 */
3057 if (max >= 4) {
3058 if (len1 < 65) {
3059 return levenshtein_hyrroe2003(block.m_val[0], first1, last1, first2, last2, max);
3060 } else {
3061 return levenshtein_myers1999_block(block, first1, last1, first2, last2, max);
3062 }
3063 }
3064
3065 /* common affix does not effect Levenshtein distance */
3066 common::remove_common_affix(first1, last1, first2, last2);
3067 len1 = std::distance(first1, last1);
3068 len2 = std::distance(first2, last2);
3069 if (!len1 || !len2) {
3070 return len1 + len2;
3071 }
3072
3073 return levenshtein_mbleven2018(first1, last1, first2, last2, max);
3074 }
3075
3076 template<typename InputIt1, typename InputIt2>
3077 int64_t uniform_levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t max) {
3078 int64_t len1 = std::distance(first1, last1);
3079 int64_t len2 = std::distance(first2, last2);
3080
3081 /* Swapping the strings so the second string is shorter */
3082 if (len1 < len2) {
3083 return uniform_levenshtein_distance(first2, last2, first1, last1, max);
3084 }
3085
3086 // when no differences are allowed a direct comparision is sufficient
3087 if (max == 0) {
3088 return !std::equal(first1, last1, first2, last2);
3089 }
3090
3091 // at least length difference insertions/deletions required
3092 if (max < (len1 - len2)) {
3093 return max + 1;
3094 }
3095
3096 /* common affix does not effect Levenshtein distance */
3097 common::remove_common_affix(first1, last1, first2, last2);
3098 len1 = std::distance(first1, last1);
3099 len2 = std::distance(first2, last2);
3100 if (!len1 || !len2) {
3101 return len1 + len2;
3102 }
3103
3104 if (max < 4) {
3105 return levenshtein_mbleven2018(first1, last1, first2, last2, max);
3106 }
3107
3108 /* when the short strings has less then 65 elements Hyyrös' algorithm can be used */
3109 if (len1 < 65) {
3110 return levenshtein_hyrroe2003(common::PatternMatchVector(first1, last1), first1, last1, first2, last2, max);
3111 } else {
3112 return levenshtein_myers1999_block(common::BlockPatternMatchVector(first1, last1), first1, last1, first2, last2, max);
3113 }
3114 }
3115
3116 template<typename InputIt1, typename InputIt2>
3117 double uniform_levenshtein_normalized_distance(const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
3118 int64_t len1 = std::distance(first1, last1);
3119 int64_t len2 = std::distance(first2, last2);
3120 int64_t maximum = std::max(len1, len2);
3121 int64_t cutoff_distance = static_cast<int64_t>(std::ceil(maximum * score_cutoff));
3122 int64_t dist =
3123 uniform_levenshtein_distance(block, first1, last1, first2, last2, cutoff_distance);
3124 double norm_dist = (maximum) ? (double) dist / (double) maximum : 0.0;
3125 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
3126 }
3127
3128 template<typename InputIt1, typename InputIt2>
3129 int64_t uniform_levenshtein_similarity(const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, int64_t score_cutoff) {
3130 int64_t len1 = std::distance(first1, last1);
3131 int64_t len2 = std::distance(first2, last2);
3132 int64_t maximum = std::max(len1, len2);
3133 int64_t cutoff_distance = maximum - score_cutoff;
3134 int64_t dist =
3135 uniform_levenshtein_distance(block, first1, last1, first2, last2, cutoff_distance);
3136 int64_t sim = maximum - dist;
3137 return (sim >= score_cutoff) ? sim : 0;
3138 }
3139
3140 template<typename InputIt1, typename InputIt2>
3141 double uniform_levenshtein_normalized_similarity(const common::BlockPatternMatchVector& block, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
3142 double norm_dist = uniform_levenshtein_normalized_distance(block, first1, last1, first2, last2, 1.0 - score_cutoff);
3143 double norm_sim = 1.0 - norm_dist;
3144 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
3145 }
3146
3147 struct LevenshteinBitMatrix {
3148 LevenshteinBitMatrix(uint64_t rows, uint64_t cols)
3149 : VP(rows, cols, (uint64_t) -1), VN(rows, cols, 0), dist(0) {}
3150
3151 common::Matrix<uint64_t> VP;
3152 common::Matrix<uint64_t> VN;
3153
3154 int64_t dist;
3155 };
3156
3160 template<typename InputIt1, typename InputIt2>
3161 Editops recover_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, const LevenshteinBitMatrix& matrix, StringAffix affix) {
3162 int64_t len1 = std::distance(first1, last1);
3163 int64_t len2 = std::distance(first2, last2);
3164 int64_t dist = matrix.dist;
3165 Editops editops(dist);
3166 editops.set_src_len(len1 + affix.prefix_len + affix.suffix_len);
3167 editops.set_dest_len(len2 + affix.prefix_len + affix.suffix_len);
3168
3169 if (dist == 0) {
3170 return editops;
3171 }
3172
3173 int64_t row = len1;
3174 int64_t col = len2;
3175
3176 while (row && col) {
3177 uint64_t col_pos = col - 1;
3178 uint64_t col_word = col_pos / 64;
3179 col_pos = col_pos % 64;
3180 uint64_t mask = 1ull << col_pos;
3181
3182 /* Insertion */
3183 if (matrix.VP[row - 1][col_word] & mask) {
3184 dist--;
3185 col--;
3186 editops[dist].type = EditType::Insert;
3187 editops[dist].src_pos = row + affix.prefix_len;
3188 editops[dist].dest_pos = col + affix.prefix_len;
3189 } else {
3190 row--;
3191
3192 /* Deletion */
3193 if (row && matrix.VN[row - 1][col_word] & mask) {
3194 dist--;
3195 editops[dist].type = EditType::Delete;
3196 editops[dist].src_pos = row + affix.prefix_len;
3197 editops[dist].dest_pos = col + affix.prefix_len;
3198 }
3199 /* Match/Mismatch */
3200 else {
3201 col--;
3202
3203 /* Replace (Matches are not recorded) */
3204 if (first1[row] != first2[col]) {
3205 dist--;
3206 editops[dist].type = EditType::Replace;
3207 editops[dist].src_pos = row + affix.prefix_len;
3208 editops[dist].dest_pos = col + affix.prefix_len;
3209 }
3210 }
3211 }
3212 }
3213
3214 while (col) {
3215 dist--;
3216 col--;
3217 editops[dist].type = EditType::Insert;
3218 editops[dist].src_pos = row + affix.prefix_len;
3219 editops[dist].dest_pos = col + affix.prefix_len;
3220 }
3221
3222 while (row) {
3223 dist--;
3224 row--;
3225 editops[dist].type = EditType::Delete;
3226 editops[dist].src_pos = row + affix.prefix_len;
3227 editops[dist].dest_pos = col + affix.prefix_len;
3228 }
3229
3230 return editops;
3231 }
3232
3233 template<typename InputIt1, typename InputIt2>
3234 LevenshteinBitMatrix levenshtein_matrix_hyrroe2003(const common::PatternMatchVector& PM, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
3235 int64_t len1 = std::distance(first1, last1);
3236 int64_t len2 = std::distance(first2, last2);
3237 uint64_t VP = ~0x0ull;
3238 uint64_t VN = 0;
3239
3240 LevenshteinBitMatrix matrix(len2, 1);
3241 matrix.dist = len1;
3242
3243 /* mask used when computing D[m,j] in the paper 10^(m-1) */
3244 uint64_t mask = (uint64_t) 1 << (len1 - 1);
3245
3246 /* Searching */
3247 for (int64_t i = 0; i < len2; ++i) {
3248 /* Step 1: Computing D0 */
3249 uint64_t PM_j = PM.get(first2[i]);
3250 uint64_t X = PM_j;
3251 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
3252
3253 /* Step 2: Computing HP and HN */
3254 uint64_t HP = VN | ~(D0 | VP);
3255 uint64_t HN = D0 & VP;
3256
3257 /* Step 3: Computing the value D[m,j] */
3258 matrix.dist += bool(HP & mask);
3259 matrix.dist -= bool(HN & mask);
3260
3261 /* Step 4: Computing Vp and VN */
3262 HP = (HP << 1) | 1;
3263 HN = (HN << 1);
3264
3265 VP = matrix.VP[i][0] = HN | ~(D0 | HP);
3266 VN = matrix.VN[i][0] = HP & D0;
3267 }
3268
3269 return matrix;
3270 }
3271
3272 template<typename InputIt1, typename InputIt2>
3273 LevenshteinBitMatrix levenshtein_matrix_hyrroe2003_block(const common::BlockPatternMatchVector& PM, InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
3274 int64_t len1 = std::distance(first1, last1);
3275 int64_t len2 = std::distance(first2, last2);
3276 /* todo could be replaced with access to matrix which would slightly
3277 * reduce memory usage */
3278 struct Vectors {
3279 uint64_t VP;
3280 uint64_t VN;
3281
3282 Vectors() : VP(~0x0ull), VN(0) {}
3283 };
3284
3285 int64_t words = PM.m_val.size();
3286 LevenshteinBitMatrix matrix(len2, words);
3287 matrix.dist = len1;
3288
3289 std::vector<Vectors> vecs(words);
3290 uint64_t Last = (uint64_t) 1 << ((len1 - 1) % 64);
3291
3292 /* Searching */
3293 for (int64_t i = 0; i < len2; i++) {
3294 uint64_t HP_carry = 1;
3295 uint64_t HN_carry = 0;
3296
3297 for (int64_t word = 0; word < words - 1; word++) {
3298 /* Step 1: Computing D0 */
3299 uint64_t PM_j = PM.get(word, first2[i]);
3300 uint64_t VN = vecs[word].VN;
3301 uint64_t VP = vecs[word].VP;
3302
3303 uint64_t X = PM_j | HN_carry;
3304 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
3305
3306 /* Step 2: Computing HP and HN */
3307 uint64_t HP = VN | ~(D0 | VP);
3308 uint64_t HN = D0 & VP;
3309
3310 /* Step 3: Computing the value D[m,j] */
3311 // only required for last vector
3312
3313 /* Step 4: Computing Vp and VN */
3314 uint64_t HP_carry_temp = HP_carry;
3315 HP_carry = HP >> 63;
3316 HP = (HP << 1) | HP_carry_temp;
3317
3318 uint64_t HN_carry_temp = HN_carry;
3319 HN_carry = HN >> 63;
3320 HN = (HN << 1) | HN_carry_temp;
3321
3322 vecs[word].VP = matrix.VP[i][word] = HN | ~(D0 | HP);
3323 vecs[word].VN = matrix.VN[i][word] = HP & D0;
3324 }
3325
3326 {
3327 /* Step 1: Computing D0 */
3328 uint64_t PM_j = PM.get(words - 1, first2[i]);
3329 uint64_t VN = vecs[words - 1].VN;
3330 uint64_t VP = vecs[words - 1].VP;
3331
3332 uint64_t X = PM_j | HN_carry;
3333 uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN;
3334
3335 /* Step 2: Computing HP and HN */
3336 uint64_t HP = VN | ~(D0 | VP);
3337 uint64_t HN = D0 & VP;
3338
3339 /* Step 3: Computing the value D[m,j] */
3340 matrix.dist += bool(HP & Last);
3341 matrix.dist -= bool(HN & Last);
3342
3343 /* Step 4: Computing Vp and VN */
3344 HP = (HP << 1) | HP_carry;
3345 HN = (HN << 1) | HN_carry;
3346
3347 vecs[words - 1].VP = matrix.VP[i][words - 1] = HN | ~(D0 | HP);
3348 vecs[words - 1].VN = matrix.VN[i][words - 1] = HP & D0;
3349 }
3350 }
3351
3352 return matrix;
3353 }
3354
3355 template<typename InputIt1, typename InputIt2>
3356 LevenshteinBitMatrix levenshtein_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
3357 int64_t len1 = std::distance(first1, last1);
3358 int64_t len2 = std::distance(first2, last2);
3359
3360 if (!len1 || !len2) {
3361 LevenshteinBitMatrix matrix(0, 0);
3362 matrix.dist = len1 + len2;
3363 return matrix;
3364 } else if (len1 <= 64) {
3365 return levenshtein_matrix_hyrroe2003(common::PatternMatchVector(first2, last2), first2, last2, first1, last1);
3366 } else {
3367 return levenshtein_matrix_hyrroe2003_block(common::BlockPatternMatchVector(first2, last2), first2, last2, first1, last1);
3368 }
3369 }
3370
3371 } // namespace detail
3372
3373 template<typename InputIt1, typename InputIt2>
3374 int64_t levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights, int64_t max) {
3375 if (weights.insert_cost == weights.delete_cost) {
3376 /* when insertions + deletions operations are free there can not be any edit distance */
3377 if (weights.insert_cost == 0) {
3378 return 0;
3379 }
3380
3381 /* uniform Levenshtein multiplied with the common factor */
3382 if (weights.insert_cost == weights.replace_cost) {
3383 // max can make use of the common divisor of the three weights
3384 int64_t new_max = detail::ceil_div(max, weights.insert_cost);
3385 int64_t distance =
3386 detail::uniform_levenshtein_distance(first1, last1, first2, last2, new_max);
3387 distance *= weights.insert_cost;
3388 return (distance <= max) ? distance : max + 1;
3389 }
3390 /*
3391 * when replace_cost >= insert_cost + delete_cost no substitutions are performed
3392 * therefore this can be implemented as InDel distance multiplied with the common factor
3393 */
3394 else if (weights.replace_cost >= weights.insert_cost + weights.delete_cost) {
3395 // max can make use of the common divisor of the three weights
3396 int64_t new_max = detail::ceil_div(max, weights.insert_cost);
3397 int64_t distance = indel_distance(first1, last1, first2, last2, new_max);
3398 distance *= weights.insert_cost;
3399 return (distance <= max) ? distance : max + 1;
3400 }
3401 }
3402
3403 return detail::generalized_levenshtein_wagner_fischer(first1, last1, first2, last2, weights, max);
3404 }
3405
3406 template<typename Sentence1, typename Sentence2>
3407 int64_t levenshtein_distance(const Sentence1& s1, const Sentence2& s2, LevenshteinWeightTable weights, int64_t max) {
3409 }
3410
3411 template<typename InputIt1, typename InputIt2>
3412 double levenshtein_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights, double score_cutoff) {
3413 int64_t maximum = detail::levenshtein_maximum(first1, last1, first2, last2, weights);
3414 int64_t cutoff_distance = static_cast<int64_t>(std::ceil(maximum * score_cutoff));
3415 int64_t dist = levenshtein_distance(first1, last1, first2, last2, weights, cutoff_distance);
3416 double norm_dist = (maximum) ? (double) dist / (double) maximum : 0.0;
3417 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
3418 }
3419
3420 template<typename Sentence1, typename Sentence2>
3421 double levenshtein_normalized_distance(const Sentence1& s1, const Sentence2& s2, LevenshteinWeightTable weights, double score_cutoff) {
3422 return levenshtein_normalized_distance(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), weights, score_cutoff);
3423 }
3424
3425 template<typename InputIt1, typename InputIt2>
3426 int64_t levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights, int64_t score_cutoff) {
3427 int64_t maximum = detail::levenshtein_maximum(first1, last1, first2, last2, weights);
3428 int64_t cutoff_distance = maximum - score_cutoff;
3429 int64_t dist = levenshtein_distance(first1, last1, first2, last2, weights, cutoff_distance);
3430 int64_t sim = maximum - dist;
3431 return (sim >= score_cutoff) ? sim : 0;
3432 }
3433
3434 template<typename Sentence1, typename Sentence2>
3435 int64_t levenshtein_similarity(const Sentence1& s1, const Sentence2& s2, LevenshteinWeightTable weights, int64_t score_cutoff) {
3436 return levenshtein_similarity(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), weights, score_cutoff);
3437 }
3438
3439 template<typename InputIt1, typename InputIt2>
3440 double levenshtein_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights, double score_cutoff) {
3441 double norm_dist =
3442 levenshtein_normalized_distance(first1, last1, first2, last2, weights, 1.0 - score_cutoff);
3443 double norm_sim = 1.0 - norm_dist;
3444 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
3445 }
3446
3447 template<typename Sentence1, typename Sentence2>
3448 double levenshtein_normalized_similarity(const Sentence1& s1, const Sentence2& s2, LevenshteinWeightTable weights, double score_cutoff) {
3450 }
3451
3452 template<typename InputIt1, typename InputIt2>
3453 Editops levenshtein_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
3454 /* prefix and suffix are no-ops, which do not need to be added to the editops */
3455 StringAffix affix = common::remove_common_affix(first1, last1, first2, last2);
3456
3457 return detail::recover_alignment(first1, last1, first2, last2, detail::levenshtein_matrix(first1, last1, first2, last2), affix);
3458 }
3459
3460 template<typename Sentence1, typename Sentence2>
3461 Editops levenshtein_editops(const Sentence1& s1, const Sentence2& s2) {
3463 }
3464
3465 template<typename CharT1>
3466 template<typename InputIt2>
3467 int64_t CachedLevenshtein<CharT1>::distance(InputIt2 first2, InputIt2 last2, int64_t score_cutoff) const {
3468 auto first1 = common::to_begin(s1);
3469 auto last1 = common::to_end(s1);
3470
3471 if (weights.insert_cost == weights.delete_cost) {
3472 /* when insertions + deletions operations are free there can not be any edit distance */
3473 if (weights.insert_cost == 0) {
3474 return 0;
3475 }
3476
3477 /* uniform Levenshtein multiplied with the common factor */
3478 if (weights.insert_cost == weights.replace_cost) {
3479 // max can make use of the common divisor of the three weights
3480 int64_t new_max = detail::ceil_div(score_cutoff, weights.insert_cost);
3481 int64_t dist =
3482 detail::uniform_levenshtein_distance(PM, first1, last1, first2, last2, new_max);
3483 dist *= weights.insert_cost;
3484
3485 return (dist <= score_cutoff) ? dist : score_cutoff + 1;
3486 }
3487 /*
3488 * when replace_cost >= insert_cost + delete_cost no substitutions are performed
3489 * therefore this can be implemented as InDel distance multiplied with the common factor
3490 */
3491 else if (weights.replace_cost >= weights.insert_cost + weights.delete_cost) {
3492 // max can make use of the common divisor of the three weights
3493 int64_t new_max = detail::ceil_div(score_cutoff, weights.insert_cost);
3494 int64_t dist = detail::indel_distance(PM, first1, last1, first2, last2, new_max);
3495 dist *= weights.insert_cost;
3496 return (dist <= score_cutoff) ? dist : score_cutoff + 1;
3497 }
3498 }
3499
3500 return detail::generalized_levenshtein_distance(first1, last1, first2, last2, weights, score_cutoff);
3501 }
3502
3503 template<typename CharT1>
3504 template<typename Sentence2>
3505 int64_t CachedLevenshtein<CharT1>::distance(const Sentence2& s2, int64_t score_cutoff) const {
3506 return distance(common::to_begin(s2), common::to_end(s2), score_cutoff);
3507 }
3508
3509 template<typename CharT1>
3510 template<typename InputIt2>
3511 double CachedLevenshtein<CharT1>::normalized_distance(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
3512 auto first1 = common::to_begin(s1);
3513 auto last1 = common::to_end(s1);
3514 int64_t maximum = detail::levenshtein_maximum(first1, last1, first2, last2, weights);
3515 int64_t cutoff_distance = static_cast<int64_t>(std::ceil(maximum * score_cutoff));
3516 int64_t dist = distance(first2, last2, cutoff_distance);
3517 double norm_dist = (maximum) ? (double) dist / (double) maximum : 0.0;
3518 return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
3519 }
3520
3521 template<typename CharT1>
3522 template<typename Sentence2>
3523 double CachedLevenshtein<CharT1>::normalized_distance(const Sentence2& s2, double score_cutoff) const {
3524 return normalized_distance(common::to_begin(s2), common::to_end(s2), score_cutoff);
3525 }
3526
3527 template<typename CharT1>
3528 template<typename InputIt2>
3529 int64_t CachedLevenshtein<CharT1>::similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff) const {
3530 auto first1 = common::to_begin(s1);
3531 auto last1 = common::to_end(s1);
3532 int64_t maximum = detail::levenshtein_maximum(first1, last1, first2, last2, weights);
3533 int64_t cutoff_distance = maximum - score_cutoff;
3534 int64_t dist = distance(first2, last2, cutoff_distance);
3535 int64_t sim = maximum - dist;
3536 return (sim >= score_cutoff) ? sim : 0;
3537 }
3538
3539 template<typename CharT1>
3540 template<typename Sentence2>
3541 int64_t CachedLevenshtein<CharT1>::similarity(const Sentence2& s2, int64_t score_cutoff) const {
3542 return similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
3543 }
3544
3545 template<typename CharT1>
3546 template<typename InputIt2>
3547 double CachedLevenshtein<CharT1>::normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
3548 double norm_dist = normalized_distance(first2, last2, 1.0 - score_cutoff);
3549 double norm_sim = 1.0 - norm_dist;
3550 return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
3551 }
3552
3553 template<typename CharT1>
3554 template<typename Sentence2>
3555 double CachedLevenshtein<CharT1>::normalized_similarity(const Sentence2& s2, double score_cutoff) const {
3556 return normalized_similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
3557 }
3558
3559} // namespace rapidfuzz
3560
3561#include <type_traits>
3562
3563namespace rapidfuzz {
3564 namespace fuzz {
3565
3596 template<typename InputIt1, typename InputIt2>
3597 double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
3598
3599 template<typename Sentence1, typename Sentence2>
3600 double ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
3601
3602 // TODO documentation
3603 template<typename CharT1>
3605 template<typename InputIt1>
3606 CachedRatio(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(first1, last1) {}
3607
3608 template<typename Sentence1>
3609 CachedRatio(const Sentence1& s1) : CachedRatio(common::to_begin(s1), common::to_end(s1)) {}
3610
3611 template<typename InputIt2>
3612 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0) const;
3613
3614 template<typename Sentence2>
3615 double similarity(const Sentence2& s2, double score_cutoff = 0) const;
3616
3617 private:
3618 std::basic_string<CharT1> s1;
3620 };
3621
3622#if __cplusplus >= 201703L
3623 template<typename Sentence1>
3624 CachedRatio(const Sentence1& s1) -> CachedRatio<char_type<Sentence1>>;
3625
3626 template<typename InputIt1>
3627 CachedRatio(InputIt1 first1, InputIt1 last1) -> CachedRatio<iterator_type<InputIt1>>;
3628#endif
3629
3655 template<typename InputIt1, typename InputIt2>
3656 double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
3657
3658 template<typename Sentence1, typename Sentence2>
3659 double partial_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
3660
3661 // todo add real implementation
3662 template<typename CharT1>
3664 template<typename>
3665 friend struct CachedWRatio;
3666
3667 template<typename InputIt1>
3668 CachedPartialRatio(InputIt1 first1, InputIt1 last1);
3669
3670 template<typename Sentence1>
3671 CachedPartialRatio(const Sentence1& s1)
3672 : CachedPartialRatio(common::to_begin(s1), common::to_end(s1)) {}
3673
3674 template<typename InputIt2>
3675 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0) const;
3676
3677 template<typename Sentence2>
3678 double similarity(const Sentence2& s2, double score_cutoff = 0.0) const;
3679
3680 private:
3681 std::basic_string<CharT1> s1;
3683 CachedRatio<CharT1> cached_ratio;
3684 };
3685
3686#if __cplusplus >= 201703L
3687 template<typename Sentence1>
3688 CachedPartialRatio(const Sentence1& s1) -> CachedPartialRatio<char_type<Sentence1>>;
3689
3690 template<typename InputIt1>
3691 CachedPartialRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialRatio<iterator_type<InputIt1>>;
3692#endif
3693
3720 template<typename InputIt1, typename InputIt2>
3721 double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
3722
3723 template<typename Sentence1, typename Sentence2, typename CharT1 = char_type<Sentence1>, typename CharT2 = char_type<Sentence2>>
3724 double token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
3725
3726 // todo CachedRatio speed for equal strings vs original implementation
3727 // TODO documentation
3728 template<typename CharT1>
3730 template<typename InputIt1>
3731 CachedTokenSortRatio(InputIt1 first1, InputIt1 last1)
3732 : s1_sorted(common::sorted_split(first1, last1).join()), cached_ratio(s1_sorted) {}
3733
3734 template<typename Sentence1>
3735 CachedTokenSortRatio(const Sentence1& s1)
3736 : CachedTokenSortRatio(common::to_begin(s1), common::to_end(s1)) {}
3737
3738 template<typename InputIt2>
3739 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const;
3740
3741 template<typename Sentence2>
3742 double similarity(const Sentence2& s2, double score_cutoff = 0) const;
3743
3744 private:
3745 std::basic_string<CharT1> s1_sorted;
3746 CachedRatio<CharT1> cached_ratio;
3747 };
3748
3749#if __cplusplus >= 201703L
3750 template<typename Sentence1>
3751 CachedTokenSortRatio(const Sentence1& s1) -> CachedTokenSortRatio<char_type<Sentence1>>;
3752
3753 template<typename InputIt1>
3754 CachedTokenSortRatio(InputIt1 first1, InputIt1 last1)
3755 -> CachedTokenSortRatio<iterator_type<InputIt1>>;
3756#endif
3757
3778 template<typename InputIt1, typename InputIt2>
3779 double partial_token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
3780
3781 template<typename Sentence1, typename Sentence2>
3782 double partial_token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
3783
3784 // TODO documentation
3785 template<typename CharT1>
3787 template<typename InputIt1>
3788 CachedPartialTokenSortRatio(InputIt1 first1, InputIt1 last1)
3789 : s1_sorted(common::sorted_split(first1, last1).join()), cached_partial_ratio(s1_sorted) {}
3790
3791 template<typename Sentence1>
3792 CachedPartialTokenSortRatio(const Sentence1& s1)
3793 : CachedPartialTokenSortRatio(common::to_begin(s1), common::to_end(s1)) {}
3794
3795 template<typename InputIt2>
3796 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const;
3797
3798 template<typename Sentence2>
3799 double similarity(const Sentence2& s2, double score_cutoff = 0) const;
3800
3801 private:
3802 std::basic_string<CharT1> s1_sorted;
3803 CachedPartialRatio<CharT1> cached_partial_ratio;
3804 };
3805
3806#if __cplusplus >= 201703L
3807 template<typename Sentence1>
3808 CachedPartialTokenSortRatio(const Sentence1& s1)
3809 -> CachedPartialTokenSortRatio<char_type<Sentence1>>;
3810
3811 template<typename InputIt1>
3812 CachedPartialTokenSortRatio(InputIt1 first1, InputIt1 last1)
3813 -> CachedPartialTokenSortRatio<iterator_type<InputIt1>>;
3814#endif
3815
3844 template<typename InputIt1, typename InputIt2>
3845 double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
3846
3847 template<typename Sentence1, typename Sentence2>
3848 double token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
3849
3850 // TODO documentation
3851 template<typename CharT1>
3853 template<typename InputIt1>
3854 CachedTokenSetRatio(InputIt1 first1, InputIt1 last1)
3855 : s1(first1, last1), tokens_s1(common::sorted_split(std::begin(s1), std::end(s1))) {}
3856
3857 template<typename Sentence1>
3858 CachedTokenSetRatio(const Sentence1& s1)
3859 : CachedTokenSetRatio(common::to_begin(s1), common::to_end(s1)) {}
3860
3861 template<typename InputIt2>
3862 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const;
3863
3864 template<typename Sentence2>
3865 double similarity(const Sentence2& s2, double score_cutoff = 0) const;
3866
3867 private:
3868 std::basic_string<CharT1> s1;
3870 };
3871
3872#if __cplusplus >= 201703L
3873 template<typename Sentence1>
3874 CachedTokenSetRatio(const Sentence1& s1) -> CachedTokenSetRatio<char_type<Sentence1>>;
3875
3876 template<typename InputIt1>
3877 CachedTokenSetRatio(InputIt1 first1, InputIt1 last1)
3878 -> CachedTokenSetRatio<iterator_type<InputIt1>>;
3879#endif
3880
3900 template<typename InputIt1, typename InputIt2>
3901 double partial_token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
3902
3903 template<typename Sentence1, typename Sentence2>
3904 double partial_token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
3905
3906 // TODO documentation
3907 template<typename CharT1>
3909 template<typename InputIt1>
3910 CachedPartialTokenSetRatio(InputIt1 first1, InputIt1 last1)
3911 : s1(first1, last1), tokens_s1(common::sorted_split(std::begin(s1), std::end(s1))) {}
3912
3913 template<typename Sentence1>
3914 CachedPartialTokenSetRatio(const Sentence1& s1)
3915 : CachedPartialTokenSetRatio(common::to_begin(s1), common::to_end(s1)) {}
3916
3917 template<typename InputIt2>
3918 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const;
3919
3920 template<typename Sentence2>
3921 double similarity(const Sentence2& s2, double score_cutoff = 0) const;
3922
3923 private:
3924 std::basic_string<CharT1> s1;
3926 };
3927
3928#if __cplusplus >= 201703L
3929 template<typename Sentence1>
3930 CachedPartialTokenSetRatio(const Sentence1& s1) -> CachedPartialTokenSetRatio<char_type<Sentence1>>;
3931
3932 template<typename InputIt1>
3933 CachedPartialTokenSetRatio(InputIt1 first1, InputIt1 last1)
3934 -> CachedPartialTokenSetRatio<iterator_type<InputIt1>>;
3935#endif
3936
3956 template<typename InputIt1, typename InputIt2>
3957 double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
3958
3959 template<typename Sentence1, typename Sentence2>
3960 double token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
3961
3962 // todo add real implementation
3963 template<typename CharT1>
3965 template<typename InputIt1>
3966 CachedTokenRatio(InputIt1 first1, InputIt1 last1)
3967 : s1(first1, last1),
3968 s1_tokens(common::sorted_split(std::begin(s1), std::end(s1))),
3969 s1_sorted(s1_tokens.join()),
3970 cached_ratio_s1_sorted(s1_sorted) {}
3971
3972 template<typename Sentence1>
3973 CachedTokenRatio(const Sentence1& s1)
3974 : CachedTokenRatio(common::to_begin(s1), common::to_end(s1)) {}
3975
3976 template<typename InputIt2>
3977 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const;
3978
3979 template<typename Sentence2>
3980 double similarity(const Sentence2& s2, double score_cutoff = 0) const;
3981
3982 private:
3983 std::basic_string<CharT1> s1;
3985 std::basic_string<CharT1> s1_sorted;
3986 CachedRatio<CharT1> cached_ratio_s1_sorted;
3987 };
3988
3989#if __cplusplus >= 201703L
3990 template<typename Sentence1>
3991 CachedTokenRatio(const Sentence1& s1) -> CachedTokenRatio<char_type<Sentence1>>;
3992
3993 template<typename InputIt1>
3994 CachedTokenRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenRatio<iterator_type<InputIt1>>;
3995#endif
3996
4017 template<typename InputIt1, typename InputIt2>
4018 double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
4019
4020 template<typename Sentence1, typename Sentence2>
4021 double partial_token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
4022
4023 // todo add real implementation
4024 template<typename CharT1>
4026 template<typename InputIt1>
4027 CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1)
4028 : s1(first1, last1),
4029 tokens_s1(common::sorted_split(std::begin(s1), std::end(s1))),
4030 s1_sorted(tokens_s1.join()) {}
4031
4032 template<typename Sentence1>
4033 CachedPartialTokenRatio(const Sentence1& s1)
4034 : CachedPartialTokenRatio(common::to_begin(s1), common::to_end(s1)) {}
4035
4036 template<typename InputIt2>
4037 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const;
4038
4039 template<typename Sentence2>
4040 double similarity(const Sentence2& s2, double score_cutoff = 0) const;
4041
4042 private:
4043 std::basic_string<CharT1> s1;
4045 std::basic_string<CharT1> s1_sorted;
4046 };
4047
4048#if __cplusplus >= 201703L
4049 template<typename Sentence1>
4050 CachedPartialTokenRatio(const Sentence1& s1) -> CachedPartialTokenRatio<char_type<Sentence1>>;
4051
4052 template<typename InputIt1>
4053 CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1)
4054 -> CachedPartialTokenRatio<iterator_type<InputIt1>>;
4055#endif
4056
4078 template<typename InputIt1, typename InputIt2>
4079 double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
4080
4081 template<typename Sentence1, typename Sentence2>
4082 double WRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
4083
4084 // todo add real implementation
4085 template<typename CharT1>
4087 template<typename InputIt1>
4088 CachedWRatio(InputIt1 first1, InputIt1 last1);
4089
4090 template<typename Sentence1>
4091 CachedWRatio(const Sentence1& s1) : CachedWRatio(common::to_begin(s1), common::to_end(s1)) {}
4092
4093 template<typename InputIt2>
4094 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const;
4095
4096 template<typename Sentence2>
4097 double similarity(const Sentence2& s2, double score_cutoff = 0) const;
4098
4099 private:
4100 // todo somehow implement this using other ratios with creating PatternMatchVector
4101 // multiple times
4102 std::basic_string<CharT1> s1;
4103 CachedPartialRatio<CharT1> cached_partial_ratio;
4105 std::basic_string<CharT1> s1_sorted;
4106 common::BlockPatternMatchVector blockmap_s1_sorted;
4107 };
4108
4109#if __cplusplus >= 201703L
4110 template<typename Sentence1>
4111 CachedWRatio(const Sentence1& s1) -> CachedWRatio<char_type<Sentence1>>;
4112
4113 template<typename InputIt1>
4114 CachedWRatio(InputIt1 first1, InputIt1 last1) -> CachedWRatio<iterator_type<InputIt1>>;
4115#endif
4116
4138 template<typename InputIt1, typename InputIt2>
4139 double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
4140
4141 template<typename Sentence1, typename Sentence2>
4142 double QRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
4143
4144 template<typename CharT1>
4146 template<typename InputIt1>
4147 CachedQRatio(InputIt1 first1, InputIt1 last1) : s1(first1, last1), cached_ratio(first1, last1) {}
4148
4149 template<typename Sentence1>
4150 CachedQRatio(const Sentence1& s1) : CachedQRatio(common::to_begin(s1), common::to_end(s1)) {}
4151
4152 template<typename InputIt2>
4153 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const;
4154
4155 template<typename Sentence2>
4156 double similarity(const Sentence2& s2, double score_cutoff = 0) const;
4157
4158 private:
4159 std::basic_string<CharT1> s1;
4160 CachedRatio<CharT1> cached_ratio;
4161 };
4162
4163#if __cplusplus >= 201703L
4164 template<typename Sentence1>
4165 CachedQRatio(const Sentence1& s1) -> CachedQRatio<char_type<Sentence1>>;
4166
4167 template<typename InputIt1>
4168 CachedQRatio(InputIt1 first1, InputIt1 last1) -> CachedQRatio<iterator_type<InputIt1>>;
4169#endif
4170
4173 } // namespace fuzz
4174} // namespace rapidfuzz
4175
4176
4177// The MIT License (MIT)
4178//
4179// Copyright (c) 2020 Max Bachmann
4180// Copyright (c) 2014 Jean-Bernard Jansen
4181//
4182// Permission is hereby granted, free of charge, to any person obtaining a copy
4183// of this software and associated documentation files (the "Software"), to deal
4184// in the Software without restriction, including without limitation the rights
4185// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
4186// copies of the Software, and to permit persons to whom the Software is
4187// furnished to do so, subject to the following conditions:
4188//
4189// The above copyright notice and this permission notice shall be included in
4190// all copies or substantial portions of the Software.
4191//
4192// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4193// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4194// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4195// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4196// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4197// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
4198// THE SOFTWARE.
4199
4200
4201#include <algorithm>
4202#include <tuple>
4203#include <unordered_map>
4204#include <vector>
4205
4206namespace rapidfuzz {
4207 namespace detail {
4208 struct MatchingBlock {
4209 int64_t spos;
4210 int64_t dpos;
4211 int64_t length;
4212 MatchingBlock(int64_t aSPos, int64_t aDPos, int64_t aLength)
4213 : spos(aSPos), dpos(aDPos), length(aLength) {}
4214 };
4215
4216 namespace difflib {
4217
4218 template<typename InputIt1, typename InputIt2>
4220 public:
4221 using match_t = std::tuple<int64_t, int64_t, int64_t>;
4222
4223 SequenceMatcher(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2)
4224 : a_first(first1), a_last(last1), b_first(first2), b_last(last2) {
4225 int64_t b_len = std::distance(b_first, b_last);
4226 j2len_.resize(b_len + 1);
4227 for (int64_t i = 0; i < b_len; ++i) {
4228 b2j_.create(b_first[i]).push_back(i);
4229 }
4230 }
4231
4232 match_t find_longest_match(int64_t a_low, int64_t a_high, int64_t b_low, int64_t b_high) {
4233 int64_t best_i = a_low;
4234 int64_t best_j = b_low;
4235 int64_t best_size = 0;
4236
4237 // Find longest junk free match
4238 {
4239 for (int64_t i = a_low; i < a_high; ++i) {
4240 const auto& indexes = b2j_[a_first[i]];
4241 size_t pos = 0;
4242 int64_t next_val = 0;
4243 bool found = false;
4244 for (; pos < indexes.size(); pos++) {
4245 int64_t j = indexes[pos];
4246 if (j < b_low) continue;
4247
4248 next_val = j2len_[j];
4249 break;
4250 }
4251
4252 for (; pos < indexes.size(); pos++) {
4253 int64_t j = indexes[pos];
4254 if (j >= b_high) break;
4255
4256 found = true;
4257 int64_t k = next_val + 1;
4258
4259 /* the next value might be overwritten below
4260 * so cache it */
4261 if (pos + 1 < indexes.size()) {
4262 next_val = j2len_[indexes[pos + 1]];
4263 }
4264
4265 j2len_[j + 1] = k;
4266 if (k > best_size) {
4267 best_i = i - k + 1;
4268 best_j = j - k + 1;
4269 best_size = k;
4270 }
4271 }
4272
4273 if (!found) {
4274 std::fill(j2len_.begin() + b_low, j2len_.begin() + b_high, 0);
4275 }
4276 }
4277
4278 std::fill(j2len_.begin() + b_low, j2len_.begin() + b_high, 0);
4279 }
4280
4281 while (best_i > a_low && best_j > b_low && a_first[best_i - 1] == b_first[best_j - 1]) {
4282 --best_i;
4283 --best_j;
4284 ++best_size;
4285 }
4286
4287 while ((best_i + best_size) < a_high && (best_j + best_size) < b_high && a_first[best_i + best_size] == b_first[best_j + best_size]) {
4288 ++best_size;
4289 }
4290
4291 return std::make_tuple(best_i, best_j, best_size);
4292 }
4293
4294 std::vector<MatchingBlock> get_matching_blocks() {
4295 int64_t a_len = std::distance(a_first, a_last);
4296 int64_t b_len = std::distance(b_first, b_last);
4297 // The following are tuple extracting aliases
4298 std::vector<std::tuple<int64_t, int64_t, int64_t, int64_t>> queue;
4299 std::vector<match_t> matching_blocks_pass1;
4300
4301 size_t queue_head = 0;
4302 queue.reserve(std::min(a_len, b_len));
4303 queue.emplace_back(0, a_len, 0, b_len);
4304
4305 while (queue_head < queue.size()) {
4306 int64_t a_low, a_high, b_low, b_high;
4307 std::tie(a_low, a_high, b_low, b_high) = queue[queue_head++];
4308 int64_t spos, dpos, length;
4309 std::tie(spos, dpos, length) = find_longest_match(a_low, a_high, b_low, b_high);
4310 if (length) {
4311 if (a_low < spos && b_low < dpos) {
4312 queue.emplace_back(a_low, spos, b_low, dpos);
4313 }
4314 if ((spos + length) < a_high && (dpos + length) < b_high) {
4315 queue.emplace_back(spos + length, a_high, dpos + length, b_high);
4316 }
4317 matching_blocks_pass1.emplace_back(spos, dpos, length);
4318 }
4319 }
4320 std::sort(common::to_begin(matching_blocks_pass1), common::to_end(matching_blocks_pass1));
4321
4322 std::vector<MatchingBlock> matching_blocks;
4323
4324 matching_blocks.reserve(matching_blocks_pass1.size());
4325
4326 int64_t i1, j1, k1;
4327 i1 = j1 = k1 = 0;
4328
4329 for (match_t const& m : matching_blocks_pass1) {
4330 if (i1 + k1 == std::get<0>(m) && j1 + k1 == std::get<1>(m)) {
4331 k1 += std::get<2>(m);
4332 } else {
4333 if (k1) {
4334 matching_blocks.emplace_back(i1, j1, k1);
4335 }
4336 std::tie(i1, j1, k1) = m;
4337 }
4338 }
4339 if (k1) {
4340 matching_blocks.emplace_back(i1, j1, k1);
4341 }
4342 matching_blocks.emplace_back(a_len, b_len, 0);
4343
4344 return matching_blocks;
4345 }
4346
4347 protected:
4348 InputIt1 a_first;
4349 InputIt1 a_last;
4350 InputIt2 b_first;
4351 InputIt2 b_last;
4352
4353 private:
4354 // Cache to avoid reallocations
4355 std::vector<int64_t> j2len_;
4356 common::CharHashTable<iterator_type<InputIt2>, std::vector<int64_t>> b2j_;
4357 };
4358 } // namespace difflib
4359
4360 template<typename InputIt1, typename InputIt2>
4361 std::vector<MatchingBlock> get_matching_blocks(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) {
4362 return difflib::SequenceMatcher<InputIt1, InputIt2>(first1, last1, first2, last2)
4364 }
4365
4366 } /* namespace detail */
4367} /* namespace rapidfuzz */
4368
4369#include <algorithm>
4370#include <cmath>
4371#include <iterator>
4372#include <unordered_map>
4373#include <vector>
4374
4375namespace rapidfuzz {
4376 namespace fuzz {
4377
4378 /**********************************************
4379 * ratio
4380 *********************************************/
4381
4382 template<typename InputIt1, typename InputIt2>
4383 double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
4384 return indel_normalized_similarity(first1, last1, first2, last2, score_cutoff / 100) * 100;
4385 }
4386
4387 template<typename Sentence1, typename Sentence2>
4388 double ratio(const Sentence1& s1, const Sentence2& s2, const double score_cutoff) {
4389 return ratio(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), score_cutoff);
4390 }
4391
4392 template<typename CharT1>
4393 template<typename InputIt2>
4394 double CachedRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
4395 double norm_sim = rapidfuzz::detail::indel_normalized_similarity(
4396 PM, common::to_begin(s1), common::to_end(s1), first2, last2, score_cutoff / 100
4397 );
4398 return norm_sim * 100;
4399 }
4400
4401 template<typename CharT1>
4402 template<typename Sentence2>
4403 double CachedRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff) const {
4404 return similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
4405 }
4406
4407 /**********************************************
4408 * partial_ratio
4409 *********************************************/
4410
4411 namespace detail {
4412
4413 template<typename InputIt1, typename InputIt2, typename CachedCharT1>
4414 double
4415 partial_ratio_short_needle(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, const CachedRatio<CachedCharT1>& cached_ratio, const common::CharHashTable<iterator_type<InputIt1>, bool>& s1_char_map, double score_cutoff) {
4416 double max_ratio = 0;
4417 int64_t len1 = std::distance(first1, last1);
4418 int64_t len2 = std::distance(first2, last2);
4419 assert(len2 >= len1);
4420
4421 for (int64_t i = 1; i < len1; ++i) {
4422 auto substr_last = first2 + i;
4423
4424 if (!s1_char_map[*(substr_last - 1)]) {
4425 continue;
4426 }
4427
4428 double ls_ratio = cached_ratio.similarity(first2, substr_last, score_cutoff);
4429 if (ls_ratio > max_ratio) {
4430 score_cutoff = max_ratio = ls_ratio;
4431 if (ls_ratio == 100.0) {
4432 return 100.0;
4433 }
4434 }
4435 }
4436
4437 for (int64_t i = 0; i < len2 - len1; ++i) {
4438 auto substr_first = first2 + i;
4439 auto substr_last = substr_first + len1;
4440
4441 if (!s1_char_map[*(substr_last - 1)]) {
4442 continue;
4443 }
4444
4445 double ls_ratio = cached_ratio.similarity(substr_first, substr_last, score_cutoff);
4446 if (ls_ratio > max_ratio) {
4447 score_cutoff = max_ratio = ls_ratio;
4448 if (ls_ratio == 100.0) {
4449 return 100.0;
4450 }
4451 }
4452 }
4453
4454 for (int64_t i = len2 - len1; i < len2; ++i) {
4455 auto substr_first = first2 + i;
4456
4457 if (!s1_char_map[*substr_first]) {
4458 continue;
4459 }
4460
4461 double ls_ratio = cached_ratio.similarity(substr_first, last2, score_cutoff);
4462 if (ls_ratio > max_ratio) {
4463 score_cutoff = max_ratio = ls_ratio;
4464 if (ls_ratio == 100.0) {
4465 return 100.0;
4466 }
4467 }
4468 }
4469
4470 return max_ratio;
4471 }
4472
4473 template<typename InputIt1, typename InputIt2, typename CharT1 = iterator_type<InputIt1>>
4474 double partial_ratio_short_needle(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
4475 CachedRatio<CharT1> cached_ratio(first1, last1);
4476
4478 int64_t len1 = std::distance(first1, last1);
4479 for (int64_t i = 0; i < len1; ++i) {
4480 s1_char_map.create(first1[i]) = true;
4481 }
4482
4483 return partial_ratio_short_needle(first1, last1, first2, last2, cached_ratio, s1_char_map, score_cutoff);
4484 }
4485
4486 template<typename InputIt1, typename InputIt2, typename CachedCharT1>
4487 double partial_ratio_long_needle(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, const CachedRatio<CachedCharT1>& cached_ratio, double score_cutoff) {
4488 int64_t len1 = std::distance(first1, last1);
4489 int64_t len2 = std::distance(first2, last2);
4490 assert(len2 >= len1);
4491
4492 double max_ratio = 0;
4493 if (score_cutoff > 100) {
4494 return 0;
4495 }
4496
4497 if (!len1 || !len2) {
4498 return static_cast<double>(len1 == len2) * 100.0;
4499 }
4500
4501 auto blocks = rapidfuzz::detail::get_matching_blocks(first1, last1, first2, last2);
4502
4503 // when there is a full match exit early
4504 for (const auto& block : blocks) {
4505 if (block.length == len1) {
4506 return 100;
4507 }
4508 }
4509
4510 for (const auto& block : blocks) {
4511 int64_t long_start = (block.dpos > block.spos) ? block.dpos - block.spos : 0;
4512 auto substr_first = first2 + long_start;
4513 auto substr_last =
4514 (std::distance(substr_first, last2) < len1) ? last2 : substr_first + len1;
4515
4516 double ls_ratio = cached_ratio.similarity(substr_first, substr_last, score_cutoff);
4517 if (ls_ratio > max_ratio) {
4518 score_cutoff = max_ratio = ls_ratio;
4519 }
4520 }
4521
4522 return max_ratio;
4523 }
4524
4525 template<typename InputIt1, typename InputIt2, typename CharT1 = iterator_type<InputIt1>>
4526 double partial_ratio_long_needle(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
4527 CachedRatio<CharT1> cached_ratio(first1, last1);
4528
4529 return partial_ratio_long_needle(first1, last1, first2, last2, cached_ratio, score_cutoff);
4530 }
4531
4532 } // namespace detail
4533
4534 template<typename InputIt1, typename InputIt2>
4535 double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
4536 int64_t len1 = std::distance(first1, last1);
4537 int64_t len2 = std::distance(first2, last2);
4538
4539 if (score_cutoff > 100) {
4540 return 0;
4541 }
4542
4543 if (!len1 || !len2) {
4544 return static_cast<double>(len1 == len2) * 100.0;
4545 }
4546
4547 if (len1 > len2) {
4548 return partial_ratio(first2, last2, first1, last1, score_cutoff);
4549 }
4550
4551 if (len1 <= 64) {
4552 return detail::partial_ratio_short_needle(first1, last1, first2, last2, score_cutoff);
4553 } else {
4554 return detail::partial_ratio_long_needle(first1, last1, first2, last2, score_cutoff);
4555 }
4556 }
4557
4558 template<typename Sentence1, typename Sentence2>
4559 double partial_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff) {
4560 return partial_ratio(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), score_cutoff);
4561 }
4562
4563 template<typename CharT1>
4564 template<typename InputIt1>
4565 CachedPartialRatio<CharT1>::CachedPartialRatio(InputIt1 first1, InputIt1 last1)
4566 : s1(first1, last1), cached_ratio(first1, last1) {
4567 for (const auto& ch : s1) {
4568 s1_char_map.create(ch) = true;
4569 }
4570 }
4571
4572 template<typename CharT1>
4573 template<typename InputIt2>
4574 double CachedPartialRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
4575 int64_t len1 = (int64_t) s1.size();
4576 int64_t len2 = std::distance(first2, last2);
4577
4578 if (len1 > len2) {
4579 return partial_ratio(common::to_begin(s1), common::to_end(s1), first2, last2, score_cutoff);
4580 }
4581
4582 if (!len1 || !len2) {
4583 return static_cast<double>(len1 == len2) * 100.0;
4584 }
4585
4586 if (len1 <= 64) {
4587 return detail::partial_ratio_short_needle(common::to_begin(s1), common::to_end(s1), first2, last2, cached_ratio, s1_char_map, score_cutoff);
4588 } else {
4589 return detail::partial_ratio_long_needle(common::to_begin(s1), common::to_end(s1), first2, last2, cached_ratio, score_cutoff);
4590 }
4591 }
4592
4593 template<typename CharT1>
4594 template<typename Sentence2>
4595 double CachedPartialRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff) const {
4596 return similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
4597 }
4598
4599 /**********************************************
4600 * token_sort_ratio
4601 *********************************************/
4602 template<typename InputIt1, typename InputIt2>
4603 double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
4604 if (score_cutoff > 100) return 0;
4605
4606 return ratio(common::sorted_split(first1, last1).join(), common::sorted_split(first2, last2).join(), score_cutoff);
4607 }
4608
4609 template<typename Sentence1, typename Sentence2, typename CharT1, typename CharT2>
4610 double token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff) {
4611 return token_sort_ratio(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), score_cutoff);
4612 }
4613
4614 template<typename CharT1>
4615 template<typename InputIt2>
4616 double CachedTokenSortRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
4617 if (score_cutoff > 100) return 0;
4618
4619 return cached_ratio.similarity(common::sorted_split(first2, last2).join(), score_cutoff);
4620 }
4621
4622 template<typename CharT1>
4623 template<typename Sentence2>
4624 double CachedTokenSortRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff) const {
4625 return similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
4626 }
4627
4628 /**********************************************
4629 * partial_token_sort_ratio
4630 *********************************************/
4631
4632 template<typename InputIt1, typename InputIt2>
4633 double partial_token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
4634 if (score_cutoff > 100) return 0;
4635
4636 return partial_ratio(common::sorted_split(first1, last1).join(), common::sorted_split(first2, last2).join(), score_cutoff);
4637 }
4638
4639 template<typename Sentence1, typename Sentence2>
4640 double partial_token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff) {
4641 return partial_token_sort_ratio(common::to_begin(s2), common::to_end(s2), score_cutoff);
4642 }
4643
4644 template<typename CharT1>
4645 template<typename InputIt2>
4646 double CachedPartialTokenSortRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
4647 if (score_cutoff > 100) return 0;
4648
4649 return cached_partial_ratio.similarity(common::sorted_split(first2, last2).join(), score_cutoff);
4650 }
4651
4652 template<typename CharT1>
4653 template<typename Sentence2>
4654 double CachedPartialTokenSortRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff) const {
4655 return similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
4656 }
4657
4658 /**********************************************
4659 * token_set_ratio
4660 *********************************************/
4661
4662 namespace detail {
4663 template<typename InputIt1, typename InputIt2>
4664 double token_set_ratio(const SplittedSentenceView<InputIt1>& tokens_a, const SplittedSentenceView<InputIt2>& tokens_b, const double score_cutoff) {
4665 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
4666 * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
4667 if (tokens_a.empty() || tokens_a.empty()) {
4668 return 0;
4669 }
4670
4671 auto decomposition = common::set_decomposition(tokens_a, tokens_b);
4672 auto intersect = decomposition.intersection;
4673 auto diff_ab = decomposition.difference_ab;
4674 auto diff_ba = decomposition.difference_ba;
4675
4676 // one sentence is part of the other one
4677 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
4678 return 100;
4679 }
4680
4681 auto diff_ab_joined = diff_ab.join();
4682 auto diff_ba_joined = diff_ba.join();
4683
4684 int64_t ab_len = diff_ab_joined.length();
4685 int64_t ba_len = diff_ba_joined.length();
4686 int64_t sect_len = intersect.length();
4687
4688 // string length sect+ab <-> sect and sect+ba <-> sect
4689 int64_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
4690 int64_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
4691
4692 double result = 0;
4693 auto cutoff_distance = common::score_cutoff_to_distance<100>(score_cutoff, ab_len + ba_len);
4694 int64_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
4695
4696 if (dist <= cutoff_distance) {
4697 result = common::norm_distance<100>(dist, sect_ab_len + sect_ba_len, score_cutoff);
4698 }
4699
4700 // exit early since the other ratios are 0
4701 if (!sect_len) {
4702 return result;
4703 }
4704
4705 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
4706 // since only sect is similar in them the distance can be calculated based on
4707 // the length difference
4708 int64_t sect_ab_dist = bool(sect_len) + ab_len;
4709 double sect_ab_ratio =
4710 common::norm_distance<100>(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
4711
4712 int64_t sect_ba_dist = bool(sect_len) + ba_len;
4713 double sect_ba_ratio =
4714 common::norm_distance<100>(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
4715
4716 return std::max({result, sect_ab_ratio, sect_ba_ratio});
4717 }
4718 } // namespace detail
4719
4720 template<typename InputIt1, typename InputIt2>
4721 double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
4722 if (score_cutoff > 100) return 0;
4723
4724 return detail::token_set_ratio(common::sorted_split(first1, last1), common::sorted_split(first2, last2), score_cutoff);
4725 }
4726
4727 template<typename Sentence1, typename Sentence2>
4728 double token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff) {
4729 return token_set_ratio(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), score_cutoff);
4730 }
4731
4732 template<typename CharT1>
4733 template<typename InputIt2>
4734 double CachedTokenSetRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
4735 if (score_cutoff > 100) return 0;
4736
4737 return detail::token_set_ratio(tokens_s1, common::sorted_split(first2, last2), score_cutoff);
4738 }
4739
4740 template<typename CharT1>
4741 template<typename Sentence2>
4742 double CachedTokenSetRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff) const {
4743 return similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
4744 }
4745
4746 /**********************************************
4747 * partial_token_set_ratio
4748 *********************************************/
4749
4750 namespace detail {
4751 template<typename InputIt1, typename InputIt2>
4752 double partial_token_set_ratio(const SplittedSentenceView<InputIt1>& tokens_a, const SplittedSentenceView<InputIt2>& tokens_b, const double score_cutoff) {
4753 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
4754 * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
4755 if (tokens_a.empty() || tokens_a.empty()) {
4756 return 0;
4757 }
4758
4759 auto decomposition = common::set_decomposition(tokens_a, tokens_b);
4760
4761 // exit early when there is a common word in both sequences
4762 if (!decomposition.intersection.empty()) return 100;
4763
4764 return partial_ratio(decomposition.difference_ab.join(), decomposition.difference_ba.join(), score_cutoff);
4765 }
4766 } // namespace detail
4767
4768 template<typename InputIt1, typename InputIt2>
4769 double partial_token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
4770 if (score_cutoff > 100) return 0;
4771
4772 return detail::partial_token_set_ratio(common::sorted_split(first1, last1), common::sorted_split(first2, last2), score_cutoff);
4773 }
4774
4775 template<typename Sentence1, typename Sentence2>
4776 double partial_token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff) {
4778 }
4779
4780 template<typename CharT1>
4781 template<typename InputIt2>
4782 double CachedPartialTokenSetRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
4783 if (score_cutoff > 100) return 0;
4784
4785 return detail::partial_token_set_ratio(tokens_s1, common::sorted_split(first2, last2), score_cutoff);
4786 }
4787
4788 template<typename CharT1>
4789 template<typename Sentence2>
4790 double CachedPartialTokenSetRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff) const {
4791 return similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
4792 }
4793
4794 /**********************************************
4795 * token_ratio
4796 *********************************************/
4797
4798 template<typename InputIt1, typename InputIt2>
4799 double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
4800 if (score_cutoff > 100) return 0;
4801
4802 auto tokens_a = common::sorted_split(first1, last1);
4803 auto tokens_b = common::sorted_split(first2, last2);
4804
4805 auto decomposition = common::set_decomposition(tokens_a, tokens_b);
4806 auto intersect = decomposition.intersection;
4807 auto diff_ab = decomposition.difference_ab;
4808 auto diff_ba = decomposition.difference_ba;
4809
4810 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
4811 return 100;
4812 }
4813
4814 auto diff_ab_joined = diff_ab.join();
4815 auto diff_ba_joined = diff_ba.join();
4816
4817 int64_t ab_len = diff_ab_joined.length();
4818 int64_t ba_len = diff_ba_joined.length();
4819 int64_t sect_len = intersect.length();
4820
4821 double result = ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
4822
4823 // string length sect+ab <-> sect and sect+ba <-> sect
4824 int64_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
4825 int64_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
4826
4827 auto cutoff_distance = common::score_cutoff_to_distance<100>(score_cutoff, ab_len + ba_len);
4828 int64_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
4829 if (dist <= cutoff_distance) {
4830 result = std::max(
4831 result, common::norm_distance<100>(dist, sect_ab_len + sect_ba_len, score_cutoff)
4832 );
4833 }
4834
4835 // exit early since the other ratios are 0
4836 if (!sect_len) {
4837 return result;
4838 }
4839
4840 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
4841 // since only sect is similar in them the distance can be calculated based on
4842 // the length difference
4843 int64_t sect_ab_dist = bool(sect_len) + ab_len;
4844 double sect_ab_ratio =
4845 common::norm_distance<100>(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
4846
4847 int64_t sect_ba_dist = bool(sect_len) + ba_len;
4848 double sect_ba_ratio =
4849 common::norm_distance<100>(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
4850
4851 return std::max({result, sect_ab_ratio, sect_ba_ratio});
4852 }
4853
4854 template<typename Sentence1, typename Sentence2>
4855 double token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff) {
4856 return token_ratio(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), score_cutoff);
4857 }
4858
4859 namespace detail {
4860 template<typename CharT1, typename CachedCharT1, typename InputIt2>
4861 double token_ratio(const SplittedSentenceView<CharT1>& s1_tokens, const CachedRatio<CachedCharT1>& cached_ratio_s1_sorted, InputIt2 first2, InputIt2 last2, double score_cutoff) {
4862 if (score_cutoff > 100) return 0;
4863
4864 auto s2_tokens = common::sorted_split(first2, last2);
4865
4866 auto decomposition = common::set_decomposition(s1_tokens, s2_tokens);
4867 auto intersect = decomposition.intersection;
4868 auto diff_ab = decomposition.difference_ab;
4869 auto diff_ba = decomposition.difference_ba;
4870
4871 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
4872 return 100;
4873 }
4874
4875 auto diff_ab_joined = diff_ab.join();
4876 auto diff_ba_joined = diff_ba.join();
4877
4878 int64_t ab_len = diff_ab_joined.length();
4879 int64_t ba_len = diff_ba_joined.length();
4880 int64_t sect_len = intersect.length();
4881
4882 double result = cached_ratio_s1_sorted.similarity(s2_tokens.join(), score_cutoff);
4883
4884 // string length sect+ab <-> sect and sect+ba <-> sect
4885 int64_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
4886 int64_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
4887
4888 auto cutoff_distance = common::score_cutoff_to_distance<100>(score_cutoff, ab_len + ba_len);
4889 int64_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
4890 if (dist <= cutoff_distance) {
4891 result = std::max(
4892 result, common::norm_distance<100>(dist, sect_ab_len + sect_ba_len, score_cutoff)
4893 );
4894 }
4895
4896 // exit early since the other ratios are 0
4897 if (!sect_len) {
4898 return result;
4899 }
4900
4901 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
4902 // since only sect is similar in them the distance can be calculated based on
4903 // the length difference
4904 int64_t sect_ab_dist = bool(sect_len) + ab_len;
4905 double sect_ab_ratio =
4906 common::norm_distance<100>(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
4907
4908 int64_t sect_ba_dist = bool(sect_len) + ba_len;
4909 double sect_ba_ratio =
4910 common::norm_distance<100>(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
4911
4912 return std::max({result, sect_ab_ratio, sect_ba_ratio});
4913 }
4914
4915 // todo this is a temporary solution until WRatio is properly implemented using other scorers
4916 template<typename CharT1, typename InputIt1, typename InputIt2>
4917 double token_ratio(const std::basic_string<CharT1>& s1_sorted, const SplittedSentenceView<InputIt1>& tokens_s1, const common::BlockPatternMatchVector& blockmap_s1_sorted, InputIt2 first2, InputIt2 last2, double score_cutoff) {
4918 if (score_cutoff > 100) return 0;
4919
4920 auto tokens_b = common::sorted_split(first2, last2);
4921
4922 auto decomposition = common::set_decomposition(tokens_s1, tokens_b);
4923 auto intersect = decomposition.intersection;
4924 auto diff_ab = decomposition.difference_ab;
4925 auto diff_ba = decomposition.difference_ba;
4926
4927 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
4928 return 100;
4929 }
4930
4931 auto diff_ab_joined = diff_ab.join();
4932 auto diff_ba_joined = diff_ba.join();
4933
4934 int64_t ab_len = diff_ab_joined.length();
4935 int64_t ba_len = diff_ba_joined.length();
4936 int64_t sect_len = intersect.length();
4937
4938 double result = 0;
4939 auto s2_sorted = tokens_b.join();
4940 if (s1_sorted.size() < 65) {
4941 double norm_sim = rapidfuzz::detail::indel_normalized_similarity(
4942 blockmap_s1_sorted, common::to_begin(s1_sorted), common::to_end(s1_sorted),
4943 common::to_begin(s2_sorted), common::to_end(s2_sorted), score_cutoff / 100
4944 );
4945 result = norm_sim * 100;
4946 } else {
4947 result = fuzz::ratio(s1_sorted, s2_sorted, score_cutoff);
4948 }
4949
4950 // string length sect+ab <-> sect and sect+ba <-> sect
4951 int64_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
4952 int64_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
4953
4954 auto cutoff_distance = common::score_cutoff_to_distance<100>(score_cutoff, ab_len + ba_len);
4955 int64_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
4956 if (dist <= cutoff_distance) {
4957 result = std::max(
4958 result, common::norm_distance<100>(dist, sect_ab_len + sect_ba_len, score_cutoff)
4959 );
4960 }
4961
4962 // exit early since the other ratios are 0
4963 if (!sect_len) {
4964 return result;
4965 }
4966
4967 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
4968 // since only sect is similar in them the distance can be calculated based on
4969 // the length difference
4970 int64_t sect_ab_dist = bool(sect_len) + ab_len;
4971 double sect_ab_ratio =
4972 common::norm_distance<100>(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
4973
4974 int64_t sect_ba_dist = bool(sect_len) + ba_len;
4975 double sect_ba_ratio =
4976 common::norm_distance<100>(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
4977
4978 return std::max({result, sect_ab_ratio, sect_ba_ratio});
4979 }
4980 } // namespace detail
4981
4982 template<typename CharT1>
4983 template<typename InputIt2>
4984 double CachedTokenRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
4985 return detail::token_ratio(s1_tokens, cached_ratio_s1_sorted, first2, last2, score_cutoff);
4986 }
4987
4988 template<typename CharT1>
4989 template<typename Sentence2>
4990 double CachedTokenRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff) const {
4991 return similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
4992 }
4993
4994 /**********************************************
4995 * partial_token_ratio
4996 *********************************************/
4997
4998 template<typename InputIt1, typename InputIt2>
4999 double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
5000 if (score_cutoff > 100) return 0;
5001
5002 auto tokens_a = common::sorted_split(first1, last1);
5003 auto tokens_b = common::sorted_split(first2, last2);
5004
5005 auto decomposition = common::set_decomposition(tokens_a, tokens_b);
5006
5007 // exit early when there is a common word in both sequences
5008 if (!decomposition.intersection.empty()) return 100;
5009
5010 auto diff_ab = decomposition.difference_ab;
5011 auto diff_ba = decomposition.difference_ba;
5012
5013 double result = partial_ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
5014
5015 // do not calculate the same partial_ratio twice
5016 if (tokens_a.word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
5017 return result;
5018 }
5019
5020 score_cutoff = std::max(score_cutoff, result);
5021 return std::max(result, partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
5022 }
5023
5024 template<typename Sentence1, typename Sentence2>
5025 double partial_token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff) {
5027 }
5028
5029 namespace detail {
5030 template<typename CharT1, typename InputIt1, typename InputIt2>
5031 double partial_token_ratio(const std::basic_string<CharT1>& s1_sorted, const SplittedSentenceView<InputIt1>& tokens_s1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
5032 if (score_cutoff > 100) return 0;
5033
5034 auto tokens_b = common::sorted_split(first2, last2);
5035
5036 auto decomposition = common::set_decomposition(tokens_s1, tokens_b);
5037
5038 // exit early when there is a common word in both sequences
5039 if (!decomposition.intersection.empty()) return 100;
5040
5041 auto diff_ab = decomposition.difference_ab;
5042 auto diff_ba = decomposition.difference_ba;
5043
5044 double result = partial_ratio(s1_sorted, tokens_b.join(), score_cutoff);
5045
5046 // do not calculate the same partial_ratio twice
5047 if (tokens_s1.word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
5048 return result;
5049 }
5050
5051 score_cutoff = std::max(score_cutoff, result);
5052 return std::max(result, partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
5053 }
5054
5055 } // namespace detail
5056
5057 template<typename CharT1>
5058 template<typename InputIt2>
5059 double CachedPartialTokenRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
5060 return detail::partial_token_ratio(s1_sorted, tokens_s1, first2, last2, score_cutoff);
5061 }
5062
5063 template<typename CharT1>
5064 template<typename Sentence2>
5065 double CachedPartialTokenRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff) const {
5066 return similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
5067 }
5068
5069 /**********************************************
5070 * WRatio
5071 *********************************************/
5072
5073 template<typename InputIt1, typename InputIt2>
5074 double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
5075 if (score_cutoff > 100) return 0;
5076
5077 constexpr double UNBASE_SCALE = 0.95;
5078
5079 int64_t len1 = std::distance(first1, last1);
5080 int64_t len2 = std::distance(first2, last2);
5081
5082 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
5083 * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
5084 if (!len1 || !len2) {
5085 return 0;
5086 }
5087
5088 double len_ratio = (len1 > len2) ? static_cast<double>(len1) / static_cast<double>(len2)
5089 : static_cast<double>(len2) / static_cast<double>(len1);
5090
5091 double end_ratio = ratio(first1, last1, first2, last2, score_cutoff);
5092
5093 if (len_ratio < 1.5) {
5094 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
5095 return std::max(end_ratio, token_ratio(first1, last1, first2, last2, score_cutoff) * UNBASE_SCALE);
5096 }
5097
5098 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
5099
5100 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
5101 end_ratio = std::max(end_ratio, partial_ratio(first1, last1, first2, last2, score_cutoff) * PARTIAL_SCALE);
5102
5103 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
5104 return std::max(end_ratio, partial_token_ratio(first1, last1, first2, last2, score_cutoff) * UNBASE_SCALE * PARTIAL_SCALE);
5105 }
5106
5107 template<typename Sentence1, typename Sentence2>
5108 double WRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff) {
5109 return WRatio(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), score_cutoff);
5110 }
5111
5112 template<typename Sentence1>
5113 template<typename InputIt1>
5114 CachedWRatio<Sentence1>::CachedWRatio(InputIt1 first1, InputIt1 last1)
5115 : s1(first1, last1),
5116 cached_partial_ratio(first1, last1),
5117 tokens_s1(common::sorted_split(std::begin(s1), std::end(s1))),
5118 s1_sorted(tokens_s1.join()) {
5119 blockmap_s1_sorted.insert(std::begin(s1_sorted), std::end(s1_sorted));
5120 }
5121
5122 template<typename CharT1>
5123 template<typename InputIt2>
5124 double CachedWRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
5125 if (score_cutoff > 100) return 0;
5126
5127 constexpr double UNBASE_SCALE = 0.95;
5128
5129 int64_t len1 = s1.size();
5130 int64_t len2 = std::distance(first2, last2);
5131
5132 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
5133 * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
5134 if (!len1 || !len2) {
5135 return 0;
5136 }
5137
5138 double len_ratio = (len1 > len2) ? static_cast<double>(len1) / static_cast<double>(len2)
5139 : static_cast<double>(len2) / static_cast<double>(len1);
5140
5141 double end_ratio = cached_partial_ratio.cached_ratio.similarity(first2, last2, score_cutoff);
5142
5143 if (len_ratio < 1.5) {
5144 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
5145 // use pre calculated values
5146 auto r = detail::token_ratio(s1_sorted, tokens_s1, blockmap_s1_sorted, first2, last2, score_cutoff);
5147 return std::max(end_ratio, r * UNBASE_SCALE);
5148 }
5149
5150 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
5151
5152 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
5153 end_ratio = std::max(end_ratio, cached_partial_ratio.similarity(first2, last2, score_cutoff) * PARTIAL_SCALE);
5154
5155 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
5156 auto r = detail::partial_token_ratio(s1_sorted, tokens_s1, first2, last2, score_cutoff);
5157 return std::max(end_ratio, r * UNBASE_SCALE * PARTIAL_SCALE);
5158 }
5159
5160 template<typename CharT1>
5161 template<typename Sentence2>
5162 double CachedWRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff) const {
5163 return similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
5164 }
5165
5166 /**********************************************
5167 * QRatio
5168 *********************************************/
5169
5170 template<typename InputIt1, typename InputIt2>
5171 double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff) {
5172 int64_t len1 = std::distance(first1, last1);
5173 int64_t len2 = std::distance(first2, last2);
5174
5175 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
5176 * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
5177 if (!len1 || !len2) {
5178 return 0;
5179 }
5180
5181 return ratio(first1, last1, first2, last2, score_cutoff);
5182 }
5183
5184 template<typename Sentence1, typename Sentence2>
5185 double QRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff) {
5186 return QRatio(common::to_begin(s1), common::to_end(s1), common::to_begin(s2), common::to_end(s2), score_cutoff);
5187 }
5188
5189 template<typename CharT1>
5190 template<typename InputIt2>
5191 double CachedQRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff) const {
5192 int64_t len2 = std::distance(first2, last2);
5193
5194 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
5195 * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
5196 if (s1.empty() || !len2) {
5197 return 0;
5198 }
5199
5200 return cached_ratio.similarity(first2, last2, score_cutoff);
5201 }
5202
5203 template<typename CharT1>
5204 template<typename Sentence2>
5205 double CachedQRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff) const {
5206 return similarity(common::to_begin(s2), common::to_end(s2), score_cutoff);
5207 }
5208
5209 } // namespace fuzz
5210} // namespace rapidfuzz
5211#endif // RAPIDFUZZ_AMALGAMATED_HPP_INCLUDED
std::string to_string(Alignment alignment)
Definition arcdps_structs.cpp:6
Definition rapidfuzz_amalgamated.hpp:212
Editops inverse() const
Definition rapidfuzz_amalgamated.hpp:304
Editops(Editops &&other) noexcept
Definition rapidfuzz_amalgamated.hpp:228
Editops reverse() const
Definition rapidfuzz_amalgamated.hpp:285
Editops(size_type count)
Definition rapidfuzz_amalgamated.hpp:221
void swap(Editops &rhs) noexcept
Definition rapidfuzz_amalgamated.hpp:271
int64_t get_src_len() const
Definition rapidfuzz_amalgamated.hpp:291
Editops(const Editops &other)
Definition rapidfuzz_amalgamated.hpp:223
void set_dest_len(int64_t len)
Definition rapidfuzz_amalgamated.hpp:300
Editops slice(int start, int stop, int step=1) const
Definition rapidfuzz_amalgamated.hpp:277
Editops()
Definition rapidfuzz_amalgamated.hpp:216
Editops(size_type count, const EditOp &value)
Definition rapidfuzz_amalgamated.hpp:218
Editops & operator=(Editops other)
Definition rapidfuzz_amalgamated.hpp:232
void set_src_len(int64_t len)
Definition rapidfuzz_amalgamated.hpp:294
int64_t get_dest_len() const
Definition rapidfuzz_amalgamated.hpp:297
Definition rapidfuzz_amalgamated.hpp:33
IteratorView(InputIt first_, InputIt last_)
Definition rapidfuzz_amalgamated.hpp:35
InputIt first
Definition rapidfuzz_amalgamated.hpp:37
InputIt last
Definition rapidfuzz_amalgamated.hpp:38
Definition rapidfuzz_amalgamated.hpp:342
int64_t get_dest_len() const
Definition rapidfuzz_amalgamated.hpp:427
Opcodes(Opcodes &&other) noexcept
Definition rapidfuzz_amalgamated.hpp:358
Opcodes()
Definition rapidfuzz_amalgamated.hpp:346
int64_t get_src_len() const
Definition rapidfuzz_amalgamated.hpp:421
void swap(Opcodes &rhs) noexcept
Definition rapidfuzz_amalgamated.hpp:401
Opcodes reverse() const
Definition rapidfuzz_amalgamated.hpp:415
Opcodes(size_type count, const Opcode &value)
Definition rapidfuzz_amalgamated.hpp:348
Opcodes & operator=(Opcodes other)
Definition rapidfuzz_amalgamated.hpp:362
Opcodes slice(int start, int stop, int step=1) const
Definition rapidfuzz_amalgamated.hpp:407
Opcodes(size_type count)
Definition rapidfuzz_amalgamated.hpp:351
Opcodes(const Opcodes &other)
Definition rapidfuzz_amalgamated.hpp:353
Opcodes inverse() const
Definition rapidfuzz_amalgamated.hpp:434
void set_dest_len(int64_t len)
Definition rapidfuzz_amalgamated.hpp:430
void set_src_len(int64_t len)
Definition rapidfuzz_amalgamated.hpp:424
Definition rapidfuzz_amalgamated.hpp:708
SplittedSentenceView(IteratorViewVec< InputIt > sentence)
Definition rapidfuzz_amalgamated.hpp:712
int64_t length() const
Definition rapidfuzz_amalgamated.hpp:717
int64_t size() const
Definition rapidfuzz_amalgamated.hpp:747
iterator_type< InputIt > CharT
Definition rapidfuzz_amalgamated.hpp:710
const IteratorViewVec< InputIt > & words() const
Definition rapidfuzz_amalgamated.hpp:731
std::basic_string< CharT > join() const
Definition rapidfuzz_amalgamated.hpp:760
bool empty() const
Definition rapidfuzz_amalgamated.hpp:721
int64_t dedupe()
Definition rapidfuzz_amalgamated.hpp:740
int64_t word_count() const
Definition rapidfuzz_amalgamated.hpp:725
Definition rapidfuzz_amalgamated.hpp:4219
std::vector< MatchingBlock > get_matching_blocks()
Definition rapidfuzz_amalgamated.hpp:4294
std::tuple< int64_t, int64_t, int64_t > match_t
Definition rapidfuzz_amalgamated.hpp:4221
InputIt1 a_last
Definition rapidfuzz_amalgamated.hpp:4349
InputIt2 b_first
Definition rapidfuzz_amalgamated.hpp:4350
InputIt1 a_first
Definition rapidfuzz_amalgamated.hpp:4348
InputIt2 b_last
Definition rapidfuzz_amalgamated.hpp:4351
match_t find_longest_match(int64_t a_low, int64_t a_high, int64_t b_low, int64_t b_high)
Definition rapidfuzz_amalgamated.hpp:4232
SequenceMatcher(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2)
Definition rapidfuzz_amalgamated.hpp:4223
constexpr bool is_zero(T a, T tolerance=std::numeric_limits< T >::epsilon())
Definition rapidfuzz_amalgamated.hpp:822
constexpr auto to_signed(T value) -> typename std::make_unsigned< T >::type
Definition rapidfuzz_amalgamated.hpp:888
std::pair< InputIterator1, InputIterator2 > mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2)
Finds the first mismatching pair of elements from two ranges: one defined by [first1,...
Definition rapidfuzz_amalgamated.hpp:1186
DecomposedSet< InputIt1, InputIt2, InputIt1 > set_decomposition(SplittedSentenceView< InputIt1 > a, SplittedSentenceView< InputIt2 > b)
Definition rapidfuzz_amalgamated.hpp:1152
int64_t remove_common_prefix(InputIt1 &first1, InputIt1 last1, InputIt2 &first2, InputIt2 last2)
Definition rapidfuzz_amalgamated.hpp:1198
bool CanTypeFitValue(const U value)
Definition rapidfuzz_amalgamated.hpp:896
constexpr auto to_unsigned(T value) -> typename std::make_unsigned< T >::type
Definition rapidfuzz_amalgamated.hpp:883
std::basic_string< CharT > to_string(Sentence &&str)
Definition rapidfuzz_amalgamated.hpp:1175
constexpr double norm_distance(int64_t dist, int64_t lensum, double score_cutoff=0)
Definition rapidfuzz_amalgamated.hpp:809
StringAffix remove_common_affix(InputIt1 &first1, InputIt1 &last1, InputIt2 &first2, InputIt2 &last2)
Definition rapidfuzz_amalgamated.hpp:1226
constexpr double result_cutoff(double result, double score_cutoff)
Definition rapidfuzz_amalgamated.hpp:804
CharT * to_end(CharT *s)
Definition rapidfuzz_amalgamated.hpp:844
CharT * to_begin(CharT *s)
Definition rapidfuzz_amalgamated.hpp:833
SplittedSentenceView< InputIt > sorted_split(InputIt first, InputIt last)
Definition rapidfuzz_amalgamated.hpp:1308
int64_t remove_common_suffix(InputIt1 first1, InputIt1 &last1, InputIt2 first2, InputIt2 &last2)
Definition rapidfuzz_amalgamated.hpp:1209
double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::ratio between them.
Definition rapidfuzz_amalgamated.hpp:4603
double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
calculates the fuzz::ratio of the optimal string alignment
Definition rapidfuzz_amalgamated.hpp:4535
double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::token_set_ratio and fuzz::token_sort_ratio (faster th...
Definition rapidfuzz_amalgamated.hpp:4799
double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
calculates a simple ratio between two strings
Definition rapidfuzz_amalgamated.hpp:4383
double partial_token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::partial_ratio between them.
Definition rapidfuzz_amalgamated.hpp:4633
double partial_token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::partial_r...
Definition rapidfuzz_amalgamated.hpp:4769
double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Calculates a weighted ratio based on the other ratio algorithms.
Definition rapidfuzz_amalgamated.hpp:5074
double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Calculates a quick ratio between two strings using fuzz.ratio.
Definition rapidfuzz_amalgamated.hpp:5171
double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::partial_token_set_ratio and fuzz::partial_token_sort_...
Definition rapidfuzz_amalgamated.hpp:4999
double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::ratio.
Definition rapidfuzz_amalgamated.hpp:4721
void vector_slice(std::vector< T > &new_vec, const std::vector< T > &vec, int start, int stop, int step)
Definition rapidfuzz_amalgamated.hpp:157
double token_ratio(const SplittedSentenceView< CharT1 > &s1_tokens, const CachedRatio< CachedCharT1 > &cached_ratio_s1_sorted, InputIt2 first2, InputIt2 last2, double score_cutoff)
Definition rapidfuzz_amalgamated.hpp:4861
double partial_token_ratio(const std::basic_string< CharT1 > &s1_sorted, const SplittedSentenceView< InputIt1 > &tokens_s1, InputIt2 first2, InputIt2 last2, double score_cutoff)
Definition rapidfuzz_amalgamated.hpp:5031
double partial_ratio_short_needle(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, const CachedRatio< CachedCharT1 > &cached_ratio, const common::CharHashTable< iterator_type< InputIt1 >, bool > &s1_char_map, double score_cutoff)
Definition rapidfuzz_amalgamated.hpp:4415
double token_set_ratio(const SplittedSentenceView< InputIt1 > &tokens_a, const SplittedSentenceView< InputIt2 > &tokens_b, const double score_cutoff)
Definition rapidfuzz_amalgamated.hpp:4664
double partial_ratio_long_needle(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, const CachedRatio< CachedCharT1 > &cached_ratio, double score_cutoff)
Definition rapidfuzz_amalgamated.hpp:4487
double partial_token_set_ratio(const SplittedSentenceView< InputIt1 > &tokens_a, const SplittedSentenceView< InputIt2 > &tokens_b, const double score_cutoff)
Definition rapidfuzz_amalgamated.hpp:4752
Definition rapidfuzz_amalgamated.hpp:30
bool operator>(const IteratorView< InputIt1 > &a, const IteratorView< InputIt2 > &b)
Definition rapidfuzz_amalgamated.hpp:57
int64_t levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights={1, 1, 1}, int64_t max=std::numeric_limits< int64_t >::max())
Calculates the minimum number of insertions, deletions, and substitutions required to change one sequ...
Definition rapidfuzz_amalgamated.hpp:3374
Editops levenshtein_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2)
Return list of EditOp describing how to turn s1 into s2.
Definition rapidfuzz_amalgamated.hpp:3453
int64_t levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights={1, 1, 1}, int64_t score_cutoff=0.0)
Definition rapidfuzz_amalgamated.hpp:3426
bool operator!=(const IteratorView< InputIt1 > &a, const IteratorView< InputIt2 > &b)
Definition rapidfuzz_amalgamated.hpp:47
void swap(Editops &lhs, Editops &rhs)
Definition rapidfuzz_amalgamated.hpp:338
std::vector< IteratorView< InputIt > > IteratorViewVec
Definition rapidfuzz_amalgamated.hpp:72
EditType
Edit operation types used by the Levenshtein distance.
Definition rapidfuzz_amalgamated.hpp:88
double levenshtein_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights={1, 1, 1}, double score_cutoff=0.0)
Calculates a normalized levenshtein distance using custom costs for insertion, deletion and substitut...
Definition rapidfuzz_amalgamated.hpp:3440
bool operator<(const IteratorView< InputIt1 > &a, const IteratorView< InputIt2 > &b)
Definition rapidfuzz_amalgamated.hpp:52
bool operator==(const IteratorView< InputIt1 > &a, const IteratorView< InputIt2 > &b)
Definition rapidfuzz_amalgamated.hpp:42
bool operator>=(const IteratorView< InputIt1 > &a, const IteratorView< InputIt2 > &b)
Definition rapidfuzz_amalgamated.hpp:67
double levenshtein_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, LevenshteinWeightTable weights={1, 1, 1}, double score_cutoff=1.0)
Definition rapidfuzz_amalgamated.hpp:3412
bool operator<=(const IteratorView< InputIt1 > &a, const IteratorView< InputIt2 > &b)
Definition rapidfuzz_amalgamated.hpp:62
#define GENERATE_HAS_MEMBER(member)
Definition rapidfuzz_amalgamated.hpp:607
Definition rapidfuzz_amalgamated.hpp:2642
int64_t similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff=0) const
Definition rapidfuzz_amalgamated.hpp:3529
CachedLevenshtein(InputIt1 first1, InputIt1 last1, LevenshteinWeightTable aWeights={1, 1, 1})
Definition rapidfuzz_amalgamated.hpp:2650
CachedLevenshtein(const Sentence1 &s1_, LevenshteinWeightTable aWeights={1, 1, 1})
Definition rapidfuzz_amalgamated.hpp:2644
double normalized_distance(InputIt2 first2, InputIt2 last2, double score_cutoff=1.0) const
Definition rapidfuzz_amalgamated.hpp:3511
int64_t distance(InputIt2 first2, InputIt2 last2, int64_t score_cutoff=std::numeric_limits< int64_t >::max()) const
Definition rapidfuzz_amalgamated.hpp:3467
double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff=0.0) const
Definition rapidfuzz_amalgamated.hpp:3547
Edit operations used by the Levenshtein distance.
Definition rapidfuzz_amalgamated.hpp:105
EditType type
Definition rapidfuzz_amalgamated.hpp:106
int64_t dest_pos
Definition rapidfuzz_amalgamated.hpp:108
EditOp()
Definition rapidfuzz_amalgamated.hpp:110
int64_t src_pos
Definition rapidfuzz_amalgamated.hpp:107
EditOp(EditType type_, int64_t src_pos_, int64_t dest_pos_)
Definition rapidfuzz_amalgamated.hpp:112
Definition rapidfuzz_amalgamated.hpp:79
int64_t insert_cost
Definition rapidfuzz_amalgamated.hpp:80
int64_t replace_cost
Definition rapidfuzz_amalgamated.hpp:82
int64_t delete_cost
Definition rapidfuzz_amalgamated.hpp:81
Edit operations used by the Levenshtein distance.
Definition rapidfuzz_amalgamated.hpp:133
EditType type
Definition rapidfuzz_amalgamated.hpp:134
Opcode(EditType type_, int64_t src_begin_, int64_t src_end_, int64_t dest_begin_, int64_t dest_end_)
Definition rapidfuzz_amalgamated.hpp:142
int64_t src_end
Definition rapidfuzz_amalgamated.hpp:136
int64_t dest_begin
Definition rapidfuzz_amalgamated.hpp:137
int64_t dest_end
Definition rapidfuzz_amalgamated.hpp:138
int64_t src_begin
Definition rapidfuzz_amalgamated.hpp:135
Opcode()
Definition rapidfuzz_amalgamated.hpp:140
Definition rapidfuzz_amalgamated.hpp:74
int64_t suffix_len
Definition rapidfuzz_amalgamated.hpp:76
int64_t prefix_len
Definition rapidfuzz_amalgamated.hpp:75
Definition rapidfuzz_amalgamated.hpp:984
void insert(int64_t block, CharT ch, int pos)
Definition rapidfuzz_amalgamated.hpp:995
std::vector< PatternMatchVector > m_val
Definition rapidfuzz_amalgamated.hpp:985
uint64_t get(int64_t block, CharT ch) const
Definition rapidfuzz_amalgamated.hpp:1016
void insert(InputIt first, InputIt last)
Definition rapidfuzz_amalgamated.hpp:1001
BlockPatternMatchVector()
Definition rapidfuzz_amalgamated.hpp:987
BlockPatternMatchVector(InputIt first, InputIt last)
Definition rapidfuzz_amalgamated.hpp:990
const ValueType & operator[](CharT2 ch) const
Definition rapidfuzz_amalgamated.hpp:1039
ValueType & create(CharT1 ch)
Definition rapidfuzz_amalgamated.hpp:1034
std::array< ValueType, std::numeric_limits< UCharT1 >::max()+1 > m_val
Definition rapidfuzz_amalgamated.hpp:1029
typename std::make_unsigned< CharT1 >::type UCharT1
Definition rapidfuzz_amalgamated.hpp:1027
ValueType m_default
Definition rapidfuzz_amalgamated.hpp:1030
CharHashTable()
Definition rapidfuzz_amalgamated.hpp:1032
Definition rapidfuzz_amalgamated.hpp:1049
ValueType m_default
Definition rapidfuzz_amalgamated.hpp:1051
const ValueType & operator[](CharT2 ch) const
Definition rapidfuzz_amalgamated.hpp:1060
ValueType & create(CharT1 ch)
Definition rapidfuzz_amalgamated.hpp:1055
CharHashTable()
Definition rapidfuzz_amalgamated.hpp:1053
std::unordered_map< CharT1, ValueType > m_val
Definition rapidfuzz_amalgamated.hpp:1050
Definition rapidfuzz_amalgamated.hpp:1089
const T & operator[](uint64_t col)
Definition rapidfuzz_amalgamated.hpp:1094
ConstMatrixVectorView(const MatrixVectorView< T > &other)
Definition rapidfuzz_amalgamated.hpp:1092
ConstMatrixVectorView(const T *vector, int64_t cols)
Definition rapidfuzz_amalgamated.hpp:1090
Definition rapidfuzz_amalgamated.hpp:1075
T & operator[](uint64_t col)
Definition rapidfuzz_amalgamated.hpp:1078
MatrixVectorView(T *vector, int64_t cols)
Definition rapidfuzz_amalgamated.hpp:1076
Definition rapidfuzz_amalgamated.hpp:1105
ConstMatrixVectorView< uint64_t > operator[](uint64_t row) const
Definition rapidfuzz_amalgamated.hpp:1126
~Matrix()
Definition rapidfuzz_amalgamated.hpp:1117
Matrix(uint64_t rows, uint64_t cols, uint64_t val)
Definition rapidfuzz_amalgamated.hpp:1106
MatrixVectorView< uint64_t > operator[](uint64_t row)
Definition rapidfuzz_amalgamated.hpp:1121
Definition rapidfuzz_amalgamated.hpp:905
uint64_t value
Definition rapidfuzz_amalgamated.hpp:907
uint64_t key
Definition rapidfuzz_amalgamated.hpp:906
Definition rapidfuzz_amalgamated.hpp:904
std::array< MapElem, 128 > m_map
Definition rapidfuzz_amalgamated.hpp:909
PatternMatchVector()
Definition rapidfuzz_amalgamated.hpp:912
uint64_t get(CharT key) const
Definition rapidfuzz_amalgamated.hpp:934
void insert(InputIt first, InputIt last)
Definition rapidfuzz_amalgamated.hpp:920
PatternMatchVector(InputIt first, InputIt last)
Definition rapidfuzz_amalgamated.hpp:915
void insert(CharT key, int64_t pos)
Definition rapidfuzz_amalgamated.hpp:929
std::array< uint64_t, 256 > m_extendedAscii
Definition rapidfuzz_amalgamated.hpp:910
uint64_t get(int64_t block, CharT key) const
Definition rapidfuzz_amalgamated.hpp:943
Definition rapidfuzz_amalgamated.hpp:3663
CachedPartialRatio(InputIt1 first1, InputIt1 last1)
Definition rapidfuzz_amalgamated.hpp:4565
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff=0.0) const
Definition rapidfuzz_amalgamated.hpp:4574
CachedPartialRatio(const Sentence1 &s1)
Definition rapidfuzz_amalgamated.hpp:3671
Definition rapidfuzz_amalgamated.hpp:4025
CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1)
Definition rapidfuzz_amalgamated.hpp:4027
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff=0) const
Definition rapidfuzz_amalgamated.hpp:5059
CachedPartialTokenRatio(const Sentence1 &s1)
Definition rapidfuzz_amalgamated.hpp:4033
Definition rapidfuzz_amalgamated.hpp:3908
CachedPartialTokenSetRatio(InputIt1 first1, InputIt1 last1)
Definition rapidfuzz_amalgamated.hpp:3910
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff=0) const
Definition rapidfuzz_amalgamated.hpp:4782
CachedPartialTokenSetRatio(const Sentence1 &s1)
Definition rapidfuzz_amalgamated.hpp:3914
Definition rapidfuzz_amalgamated.hpp:3786
CachedPartialTokenSortRatio(InputIt1 first1, InputIt1 last1)
Definition rapidfuzz_amalgamated.hpp:3788
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff=0) const
Definition rapidfuzz_amalgamated.hpp:4646
CachedPartialTokenSortRatio(const Sentence1 &s1)
Definition rapidfuzz_amalgamated.hpp:3792
Definition rapidfuzz_amalgamated.hpp:4145
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff=0) const
Definition rapidfuzz_amalgamated.hpp:5191
CachedQRatio(InputIt1 first1, InputIt1 last1)
Definition rapidfuzz_amalgamated.hpp:4147
CachedQRatio(const Sentence1 &s1)
Definition rapidfuzz_amalgamated.hpp:4150
Definition rapidfuzz_amalgamated.hpp:3604
CachedRatio(InputIt1 first1, InputIt1 last1)
Definition rapidfuzz_amalgamated.hpp:3606
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff=0.0) const
Definition rapidfuzz_amalgamated.hpp:4394
CachedRatio(const Sentence1 &s1)
Definition rapidfuzz_amalgamated.hpp:3609
Definition rapidfuzz_amalgamated.hpp:3964
CachedTokenRatio(const Sentence1 &s1)
Definition rapidfuzz_amalgamated.hpp:3973
CachedTokenRatio(InputIt1 first1, InputIt1 last1)
Definition rapidfuzz_amalgamated.hpp:3966
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff=0) const
Definition rapidfuzz_amalgamated.hpp:4984
Definition rapidfuzz_amalgamated.hpp:3852
CachedTokenSetRatio(const Sentence1 &s1)
Definition rapidfuzz_amalgamated.hpp:3858
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff=0) const
Definition rapidfuzz_amalgamated.hpp:4734
CachedTokenSetRatio(InputIt1 first1, InputIt1 last1)
Definition rapidfuzz_amalgamated.hpp:3854
Definition rapidfuzz_amalgamated.hpp:3729
CachedTokenSortRatio(InputIt1 first1, InputIt1 last1)
Definition rapidfuzz_amalgamated.hpp:3731
CachedTokenSortRatio(const Sentence1 &s1)
Definition rapidfuzz_amalgamated.hpp:3735
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff=0) const
Definition rapidfuzz_amalgamated.hpp:4616
Definition rapidfuzz_amalgamated.hpp:4086
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff=0) const
Definition rapidfuzz_amalgamated.hpp:5124
CachedWRatio(InputIt1 first1, InputIt1 last1)
Definition rapidfuzz_amalgamated.hpp:5114
CachedWRatio(const Sentence1 &s1)
Definition rapidfuzz_amalgamated.hpp:4091