From 82d5f3bc91e896d67c6fc1bc6ce8cce0a4cf546b Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Thu, 12 Dec 2019 14:48:52 -0500 Subject: [PATCH] remove Hamming, create StringDistance --- README.md | 59 +++++++++++++----------------- benchmark/.sublime2Terminal.jl | 19 +++++++++- benchmark/benchmark.jl | 25 ++++++------- src/StringDistances.jl | 44 +++++++++++------------ src/compare.jl | 65 ++++++++++++++-------------------- src/edit.jl | 26 +++----------- src/find.jl | 18 +++++----- src/qgram.jl | 14 ++++---- test/distances.jl | 14 -------- test/modifiers.jl | 20 ++++------- 10 files changed, 128 insertions(+), 176 deletions(-) diff --git a/README.md b/README.md index 5804369..ccb4164 100644 --- a/README.md +++ b/README.md @@ -6,16 +6,14 @@ This Julia package computes various distances between `AbstractString`s ## Installation The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`. -## Syntax +## Compare The function `compare` returns a similarity score between two strings. The function always returns a score between 0 and 1, with a value of 0 being completely different and a value of 1 being completely similar. 
Its syntax is: ```julia -compare(::AbstractString, ::AbstractString, ::PreMetric = TokenMax(Levenshtein())) +compare(::AbstractString, ::AbstractString, ::StringDistance) ``` -## Distances - Edit Distances - - [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) `Hamming()` - [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` - [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()` - [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()` @@ -35,38 +33,35 @@ compare(::AbstractString, ::AbstractString, ::PreMetric = TokenMax(Levenshtein() - [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string. - [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) combines scores using the base distance, the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. - ```julia - compare("martha", "marhta", Jaro()) - compare("martha", "marhta", Winkler(Jaro())) - compare("william", "williams", QGram(2)) - compare("william", "williams", Winkler(QGram(2))) - compare("New York Yankees", "Yankees", Levenshtein()) - compare("New York Yankees", "Yankees", Partial(Levenshtein())) - compare("mariners vs angels", "los angeles angels at seattle mariners", Jaro()) - compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Jaro())) - compare("mariners vs angels", "los angeles angels at seattle mariners", TokenMax(RatcliffObershelp())) - ``` - -## Find -`find_best` returns the index of the element with the highest similarity score. 
-It returns nothing if all elements have a similarity score below `min_score` (default to 0.0) +Some examples: ```julia -find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) -#> 1 +compare("martha", "marhta", Jaro()) +compare("martha", "marhta", Winkler(Jaro())) +compare("william", "williams", QGram(2)) +compare("william", "williams", Winkler(QGram(2))) +compare("New York Yankees", "Yankees", Levenshtein()) +compare("New York Yankees", "Yankees", Partial(Levenshtein())) +compare("mariners vs angels", "los angeles angels at seattle mariners", Jaro()) +compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Jaro())) +compare("mariners vs angels", "los angeles angels at seattle mariners", TokenMax(RatcliffObershelp())) ``` -`find_all` returns the indices of the elements with a similarity score higher than a minimum value (default to 0.8) +A good distance to link addresses, etc. (where the word order does not matter) is `TokenMax(Levenshtein())`. -```julia -find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.8) -#> 1-element Array{String,1}: -#> [1] -``` +## Find +- `findmax` returns the value and index of the element in `iter` with the highest similarity score with `x`. Its syntax is: + ```julia + findmax(x::AbstractString, iter::AbstractVector, dist::StringDistance) + ``` -While these functions are defined for any distance, they are particularly optimized for `Levenshtein` and `DamerauLevenshtein` distances (as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`) +- `findall` returns the indices of all elements in `iter` with a similarity score with `x` higher than a minimum value (default to 0.8). 
Its syntax is: + ```julia + findall(x::AbstractString, iter::AbstractVector, dist::StringDistance) + ``` -## Evaluate +The functions `findmax` and `findall` are particularly optimized for `Levenshtein` and `DamerauLevenshtein` distances (as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`). +## Evaluate The function `compare` returns a similarity score: a value of 0 means completely different and a value of 1 means completely similar. In contrast, the function `evaluate` returns the litteral distance between two strings, with a value of 0 being completely similar. Some distances are between 0 and 1, while others are unbouded. ```julia @@ -76,12 +71,6 @@ evaluate(Levenshtein(), "New York", "New York") #> 0 ``` -## Which distance should I use? - -As a rule of thumb, -- Standardize strings before comparing them (cases, whitespaces, accents, abbreviations...) -- The distance `TokenMax(Levenshtein())` (the default for `compare`) is a good choice to link sequence of words (adresses, names) across datasets (see [`fuzzywuzzy`](https://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)) - ## References - [The stringdist Package for Approximate String Matching](https://journal.r-project.org/archive/2014-1/loo.pdf) Mark P.J. 
van der Loo - [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) diff --git a/benchmark/.sublime2Terminal.jl b/benchmark/.sublime2Terminal.jl index d6793c8..9757aee 100644 --- a/benchmark/.sublime2Terminal.jl +++ b/benchmark/.sublime2Terminal.jl @@ -1 +1,18 @@ -@time find_all(x[1], y, TokenMax(DamerauLevenshtein())) + + + + +# check +function h(t, x, y; min_score = 1/3) + out = fill(false, length(x)) + for i in eachindex(x) + if compare(x[i], y[i], t) < min_score + out[i] = compare(x[i], y[i], t ; min_score = min_score) ≈ 0.0 + else + out[i] = compare(x[i], y[i], t ; min_score = min_score) ≈ compare(x[i], y[i], t) + end + end + all(out) +end +h(Levenshtein(), x, y) +h(DamerauLevenshtein(), x, y) diff --git a/benchmark/benchmark.jl b/benchmark/benchmark.jl index 477a114..1d90439 100644 --- a/benchmark/benchmark.jl +++ b/benchmark/benchmark.jl @@ -8,8 +8,6 @@ function f(t, x, y; min_score = 0.0) [compare(x[i], y[i], t; min_score = min_score) for i in 1:length(x)] end -@time f(Hamming(), x, y) -#0.05s @time f(Jaro(), x, y) #0.3s @time f(Levenshtein(), x, y) @@ -23,27 +21,26 @@ end -@time find_best(x[1], y, Levenshtein()) -# 0.41 -@time find_best(x[1], y, DamerauLevenshtein()) -# 0.41 +@time findmax(x[1], y, Levenshtein()) +# 0.14 +@time findmax(x[1], y, DamerauLevenshtein()) +# 0.15 -@time find_all(x[1], y, Levenshtein()) +@time findall(x[1], y, Levenshtein()) # 0.06 -@time find_all(x[1], y, DamerauLevenshtein()) +@time findall(x[1], y, DamerauLevenshtein()) # 0.05 -@time find_all(x[1], y, Partial(DamerauLevenshtein())) +@time findall(x[1], y, Partial(DamerauLevenshtein())) # 0.9 -@time find_all(x[1], y, TokenSort(DamerauLevenshtein())) +@time findall(x[1], y, TokenSort(DamerauLevenshtein())) # 0.27 -@time find_all(x[1], y, TokenSet(DamerauLevenshtein())) -# 0.8 -@time find_all(x[1], y, TokenMax(DamerauLevenshtein())) +@time findall(x[1], y, TokenSet(DamerauLevenshtein())) +# 0.74 +@time findall(x[1], y, 
TokenMax(DamerauLevenshtein())) # 2.25 -# 1.6s slower compared to StringDist diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 40b72d4..f098cc7 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -6,6 +6,22 @@ using Distances import Distances: evaluate, result_type using DataStructures # for SortedSet in TokenSort +############################################################################## +## +## include +## +############################################################################## +abstract type StringDistance <: SemiMetric end +include("utils.jl") +include("edit.jl") +include("qgram.jl") +include("compare.jl") +include("find.jl") + +function result_type(m::StringDistance, a::AbstractString, b::AbstractString) + typeof(evaluate(m, oneunit(a), oneunit(b))) +end + ############################################################################## ## ## Export @@ -13,10 +29,7 @@ using DataStructures # for SortedSet in TokenSort ############################################################################## export -evaluate, -compare, -result_type, -Hamming, +StringDistance, Levenshtein, DamerauLevenshtein, Jaro, @@ -31,25 +44,10 @@ Partial, TokenSort, TokenSet, TokenMax, -qgram, -find_best, -find_all - -############################################################################## -## -## include -## -############################################################################## -include("utils.jl") -include("edit.jl") -include("qgram.jl") -include("compare.jl") -include("find.jl") - -function result_type(m::Union{Hamming, Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax}, a::AbstractString, b::AbstractString) - typeof(evaluate(m, oneunit(a), oneunit(b))) -end - +evaluate, +compare, +result_type, +qgram end ############################################################################## diff --git a/src/compare.jl b/src/compare.jl index 26c6000..bc6c1be 
100755 --- a/src/compare.jl +++ b/src/compare.jl @@ -6,36 +6,16 @@ ## ############################################################################## """ - compare(s1::AbstractString, s2::AbstractString, dist::PreMetric = TokenMax(Levenshtein())) + compare(s1::AbstractString, s2::AbstractString, dist::StringDistance) compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist` """ -function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Hamming; min_score = 0.0) - (ismissing(s1) | ismissing(s2)) && return missing - s1, s2 = reorder(s1, s2) - len1, len2 = length(s1), length(s2) - len2 == 0 && return 1.0 - 1.0 - evaluate(dist, s1, s2) / len2 -end function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0) (ismissing(s1) | ismissing(s2)) && return missing 1.0 - evaluate(dist, s1, s2) end -function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::AbstractQGramDistance; min_score = 0.0) - (ismissing(s1) | ismissing(s2)) && return missing - # When string length < q for qgram distance, returns s1 == s2 - s1, s2 = reorder(s1, s2) - len1, len2 = length(s1), length(s2) - len1 <= dist.q - 1 && return convert(Float64, s1 == s2) - if typeof(dist) <: QGram - 1.0 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2) - else - 1.0 - evaluate(dist, s1, s2) - end -end - function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0) (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) @@ -51,8 +31,17 @@ function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, M end end -function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) - compare(s1, s2, TokenMax(Levenshtein())) +function compare(s1::Union{AbstractString, Missing}, 
s2::Union{AbstractString, Missing}, dist::QGramDistance; min_score = 0.0) + (ismissing(s1) | ismissing(s2)) && return missing + # When string length < q for qgram distance, returns s1 == s2 + s1, s2 = reorder(s1, s2) + len1, len2 = length(s1), length(s2) + len1 <= dist.q - 1 && return convert(Float64, s1 == s2) + if typeof(dist) <: QGram + 1.0 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2) + else + 1.0 - evaluate(dist, s1, s2) + end end ############################################################################## @@ -61,11 +50,11 @@ end ## ############################################################################## """ - Winkler(dist::Premetric, p::Real = 0.1, boosting_threshold::Real = 0.7, l::Integer = 4) + Winkler(dist::StringDistance, p::Real = 0.1, boosting_threshold::Real = 0.7, l::Integer = 4) -Winkler is a `PreMetric` modifier that boosts the similarity score between two strings by a scale `p` when the strings share a common prefix with lenth lower than `l` (the boost is only applied the similarity score above `boosting_threshold`) +Winkler is a `StringDistance` modifier that boosts the similarity score between two strings by a scale `p` when the strings share a common prefix with length lower than `l` (the boost is only applied when the similarity score is above `boosting_threshold`) """ -struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real, T4 <: Integer} <: PreMetric +struct Winkler{T1 <: StringDistance, T2 <: Real, T3 <: Real, T4 <: Integer} <: StringDistance dist::T1 p::T2 # scaling factor. Default to 0.1 boosting_threshold::T3 # boost threshold. 
Default to 0.7 @@ -98,11 +87,11 @@ JaroWinkler() = Winkler(Jaro(), 0.1, 0.7) ## ############################################################################## """ - Partial(dist::Premetric) + Partial(dist::StringDistance) -Partial is a `PreMetric` modifier that returns the maximal similarity score between the shorter string and substrings of the longer string +Partial is a `StringDistance` modifier that returns the maximal similarity score between the shorter string and substrings of the longer string """ -struct Partial{T <: PreMetric} <: PreMetric +struct Partial{T <: StringDistance} <: StringDistance dist::T end @@ -153,11 +142,11 @@ end ## ############################################################################## """ - TokenSort(dist::Premetric) + TokenSort(dist::StringDistance) -TokenSort is a `PreMetric` modifier that adjusts for differences in word orders by reording words alphabetically. +TokenSort is a `StringDistance` modifier that adjusts for differences in word orders by reordering words alphabetically. """ -struct TokenSort{T <: PreMetric} <: PreMetric +struct TokenSort{T <: StringDistance} <: StringDistance dist::T end @@ -175,11 +164,11 @@ end ## ############################################################################## """ - TokenSet(dist::Premetric) + TokenSet(dist::StringDistance) -TokenSort is a `PreMetric` modifier that adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string. +TokenSet is a `StringDistance` modifier that adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string. 
""" -struct TokenSet{T <: PreMetric} <: PreMetric +struct TokenSet{T <: StringDistance} <: StringDistance dist::T end @@ -207,11 +196,11 @@ end ## ############################################################################## """ - TokenMax(dist::Premetric) + TokenMax(dist::StringDistance) -TokenSort is a `PreMetric` modifier that combines similarlity scores using the base distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on string lengths. +TokenMax is a `StringDistance` modifier that combines similarity scores using the base distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on string lengths. """ -struct TokenMax{T <: PreMetric} <: PreMetric +struct TokenMax{T <: StringDistance} <: StringDistance dist::T end diff --git a/src/edit.jl b/src/edit.jl index 1ceb7b3..e2fd3b8 100755 --- a/src/edit.jl +++ b/src/edit.jl @@ -1,22 +1,4 @@ -############################################################################## -## -## Hamming -## -############################################################################## -function evaluate(dist::Hamming, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) - current = abs(length(s2) - length(s1)) - for (ch1, ch2) in zip(s1, s2) - current += ch1 != ch2 - end - return current -end - -evaluate(dist::Hamming, s1::Missing, s2::AbstractString) = missing -evaluate(dist::Hamming, s1::AbstractString, s2::Missing) = missing - - - ############################################################################## ## ## Jaro @@ -35,7 +17,7 @@ The Jaro distance is defined as where ``m`` is the number of matching characters and ``t`` is half the number of transpositions. 
""" -struct Jaro <: SemiMetric end +struct Jaro <: StringDistance end ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html function evaluate(dist::Jaro, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) @@ -110,7 +92,7 @@ Creates the Levenshtein metric The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, substitutions of a single character) required to change one string into the other. """ -struct Levenshtein <: SemiMetric end +struct Levenshtein <: StringDistance end ## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html function evaluate(dist::Levenshtein, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing) @@ -165,7 +147,7 @@ Creates the DamerauLevenshtein metric The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions, deletions or substitutions of a single character, or transposition of two adjacent characters) required to change one string into the other. """ -struct DamerauLevenshtein <: SemiMetric end +struct DamerauLevenshtein <: StringDistance end ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html function evaluate(dist::DamerauLevenshtein, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing) @@ -253,7 +235,7 @@ Creates the RatcliffObershelp metric The distance between two strings is defined as one minus the number of matching characters divided by the total number of characters in the two strings. Matching characters are those in the longest common subsequence plus, recursively, matching characters in the unmatched region on either side of the longest common subsequence. 
""" -struct RatcliffObershelp <: PreMetric end +struct RatcliffObershelp <: StringDistance end function evaluate(dist::RatcliffObershelp, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) (ismissing(s1) | ismissing(s2)) && return missing diff --git a/src/find.jl b/src/find.jl index 0d01e15..bb67b33 100755 --- a/src/find.jl +++ b/src/find.jl @@ -1,13 +1,14 @@ """ - find_best(s::AbstractString, iter::AbstractVector, dist::PreMetric; min_score = 0.0) + findmax(s::AbstractString, iter::AbstractVector, dist::StringDistance; min_score = 0.0) -`find_best` returns the index of the element of `iter` that has the highest similarity score with `s` according to the distance `dist`. -It returns nothing if all elements have a similarity score below `min_score` (default to 0.0) +`findmax` returns the value and index of the element of `iter` that has the highest similarity score with `s` according to the distance `dist`. +It returns `(nothing, nothing)` if none of the elements has a similarity score higher or equal to `min_score` (default to 0.0) The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`) """ -function find_best(s::AbstractString, iter::AbstractVector, dist::PreMetric; min_score = 0.0) +function Base.findmax(s::AbstractString, iter::AbstractVector, dist::StringDistance; min_score = 0.0) min_score >= 0 || throw("min_score should be positive") is = [0 for _ in 1:Threads.nthreads()] + xs = ["" for _ in 1:Threads.nthreads()] scores = [-1.0 for _ in 1:Threads.nthreads()] min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score) Threads.@threads for i in 1:length(iter) @@ -16,20 +17,21 @@ function find_best(s::AbstractString, iter::AbstractVector, dist::PreMetric; min if score >= min_score_atomic_old score == 1.0 && return i is[Threads.threadid()] = i + xs[Threads.threadid()] = iter[i] scores[Threads.threadid()] = score end end i = argmax(scores) - 
is[i] == 0 ? nothing : is[i] + is[i] == 0 ? (nothing, nothing) : (xs[i], is[i]) end """ - find_all(s::AbstractString, iter::AbstractVector, dist::PreMetric; min_score = 0.8) -`find_all` returns the vector of indices for elements of `iter` that have a similarity score higher or equal than `min_score` according to the distance `dist`. + findall(s::AbstractString, iter::AbstractVector, dist::StringDistance; min_score = 0.8) +`findall` returns the vector of indices for elements of `iter` that have a similarity score higher than or equal to `min_score` according to the distance `dist`. The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`) """ -function find_all(s::AbstractString, iter::AbstractVector, dist::PreMetric; min_score = 0.8) +function Base.findall(s::AbstractString, iter::AbstractVector, dist::StringDistance; min_score = 0.8) out = [Int[] for _ in 1:Threads.nthreads()] Threads.@threads for i in 1:length(iter) score = compare(s, iter[i], dist; min_score = min_score) diff --git a/src/qgram.jl b/src/qgram.jl index 8e47dbc..f155190 100755 --- a/src/qgram.jl +++ b/src/qgram.jl @@ -129,9 +129,9 @@ end ## Distance on strings is computed by set distance on qgram sets ## ############################################################################## -abstract type AbstractQGramDistance <: SemiMetric end +abstract type QGramDistance <: StringDistance end -function evaluate(dist::AbstractQGramDistance, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) +function evaluate(dist::QGramDistance, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) (ismissing(s1) | ismissing(s2)) && return missing x = count_map(qgram(s1, dist.q), qgram(s2, dist.q)) evaluate(dist, x) @@ -153,7 +153,7 @@ The distance corresponds to where ``v(s, q)`` denotes the vector on the space of q-grams of length q, that contains the number of times a q-gram appears for 
the string s """ -struct QGram <: AbstractQGramDistance +struct QGram <: QGramDistance q::Int end @@ -182,7 +182,7 @@ The distance corresponds to where ``v(s, q)`` denotes the vector on the space of q-grams of length q, that contains the number of times a q-gram appears for the string s """ -struct Cosine <: AbstractQGramDistance +struct Cosine <: QGramDistance q::Int end @@ -212,7 +212,7 @@ The distance corresponds to where ``Q(s, q)`` denotes the set of q-grams of length n for the string s """ -struct Jaccard <: AbstractQGramDistance +struct Jaccard <: QGramDistance q::Int end @@ -242,7 +242,7 @@ The distance corresponds to where ``Q(s, q)`` denotes the set of q-grams of length n for the string s """ -struct SorensenDice <: AbstractQGramDistance +struct SorensenDice <: QGramDistance q::Int end @@ -273,7 +273,7 @@ The distance corresponds to where ``Q(s, q)`` denotes the set of q-grams of length n for the string s """ -struct Overlap <: AbstractQGramDistance +struct Overlap <: QGramDistance q::Int end diff --git a/test/distances.jl b/test/distances.jl index 58b0d23..9c0a8cb 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -31,20 +31,6 @@ using StringDistances, Test @inferred evaluate(DamerauLevenshtein(), "", "") end - @testset "Hamming" begin - @test evaluate(Hamming(), "", "") == 0 - @test evaluate(Hamming(), "", "abc") == 3 - @test evaluate(Hamming(), "abc", "abc") == 0 - @test evaluate(Hamming(), "acc", "abc") == 1 - @test evaluate(Hamming(), "abcd", "abc") == 1 - @test evaluate(Hamming(), "abc", "abcd") == 1 - @test evaluate(Hamming(), "testing", "this is a test") == 13 - @test evaluate(Hamming(), "saturday", "sunday") == 7 - @test result_type(Hamming(), "hello", "world") == Int - @test ismissing(evaluate(Hamming(), "", missing)) - @inferred evaluate(Hamming(), "", "") - end - @testset "QGram" begin @test evaluate(QGram(1), "abc", "abc") == 0 @test evaluate(QGram(1), "", "abc") == 3 diff --git a/test/modifiers.jl b/test/modifiers.jl index 
044b7bc..ccfd200 100644 --- a/test/modifiers.jl +++ b/test/modifiers.jl @@ -3,14 +3,6 @@ using StringDistances, Test @testset "Modifiers" begin - # Hamming - @test compare("", "abc", Hamming()) ≈ 0.0 atol = 1e-4 - @test compare("acc", "abc", Hamming()) ≈ 2/3 atol = 1e-4 - @test compare("saturday", "sunday", Hamming()) ≈ 1/8 atol = 1e-4 - @test compare("New York Yankees", "Yankees", Partial(Hamming())) ≈ 1 - @test compare("New York Yankees", "", Partial(Hamming())) ≈ 1 - compare("aüa", "aua", Hamming()) - # Qgram @test compare("", "abc", QGram(1)) ≈ 0.0 atol = 1e-4 @test compare("abc", "cba", QGram(1)) ≈ 1.0 atol = 1e-4 @@ -104,12 +96,12 @@ using StringDistances, Test end # check find_best and find_all - @test find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == 1 - @test find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.99) == nothing - @test find_best("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == 1 - @test find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1] - @test find_all("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == [1, 2] - @test find_all("New York", ["NewYork", "Newark", "San Francisco"], Jaro(); min_score = 0.99) == Int[] + @test findmax("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ("NewYork", 1) + @test findmax("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.99) == (nothing, nothing) + @test findmax("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1) + @test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1] + @test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == [1, 2] + @test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro(); min_score = 0.99) == Int[] end