diff --git a/README.md b/README.md index 8acbd21..5804369 100644 --- a/README.md +++ b/README.md @@ -48,18 +48,19 @@ compare(::AbstractString, ::AbstractString, ::PreMetric = TokenMax(Levenshtein() ``` ## Find -`find_best` returns the element of an iterator with the highest similarity score +`find_best` returns the index of the element with the highest similarity score. +It returns `nothing` if all elements have a similarity score below `min_score` (defaults to 0.0). ```julia find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) -#> "NewYork" +#> 1 ``` -`find_all` returns all the elements of an iterator with a similarity score higher than a minimum value (default to 0.8) +`find_all` returns the indices of the elements with a similarity score higher than or equal to a minimum value (defaults to 0.8) ```julia find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.8) #> 1-element Array{String,1}: -#> "NewYork" +#> [1] ``` While these functions are defined for any distance, they are particularly optimized for `Levenshtein` and `DamerauLevenshtein` distances (as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`) diff --git a/benchmark/.sublime2Terminal.jl b/benchmark/.sublime2Terminal.jl index dfe3b3a..d6793c8 100644 --- a/benchmark/.sublime2Terminal.jl +++ b/benchmark/.sublime2Terminal.jl @@ -1 +1 @@ -@time find_best(x[1], y, Levenshtein()) +@time find_all(x[1], y, TokenMax(DamerauLevenshtein())) diff --git a/src/find.jl b/src/find.jl index 2261797..0d01e15 100755 --- a/src/find.jl +++ b/src/find.jl @@ -1,44 +1,41 @@ """ - find_best(s1::AbstractString, iter, dist::PreMetric; min_score = 0.0) + find_best(s::AbstractString, iter::AbstractVector, dist::PreMetric; min_score = 0.0) -`find_best` returns the element of the iterator `iter` that has the highest similarity score with `s1` according to the distance `dist`. Return nothing if all elements have a similarity score below `min_score`. 
+`find_best` returns the index of the element of `iter` that has the highest similarity score with `s` according to the distance `dist`. +It returns `nothing` if all elements have a similarity score below `min_score` (defaults to 0.0). The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`) """ -function find_best(s1::AbstractString, iter_s2, dist::PreMetric; min_score = 0.0) +function find_best(s::AbstractString, iter::AbstractVector, dist::PreMetric; min_score = 0.0) min_score >= 0 || throw("min_score should be positive") - best_s2s = AbstractString["" for _ in 1:Threads.nthreads()] - best_scores = [-1.0 for _ in 1:Threads.nthreads()] + is = [0 for _ in 1:Threads.nthreads()] + scores = [-1.0 for _ in 1:Threads.nthreads()] min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score) - Threads.@threads for s2 in iter_s2 - score = compare(s1, s2, dist; min_score = min_score_atomic[]) + Threads.@threads for i in 1:length(iter) + score = compare(s, iter[i], dist; min_score = min_score_atomic[]) min_score_atomic_old = Threads.atomic_max!(min_score_atomic, score) if score >= min_score_atomic_old - best_s2s[Threads.threadid()] = s2 - best_scores[Threads.threadid()] = score - score == 1.0 && return s2 + score == 1.0 && return i + is[Threads.threadid()] = i + scores[Threads.threadid()] = score end end - i = argmax(best_scores) - if best_scores[i] < 0 - return nothing - else - return best_s2s[i] - end + i = argmax(scores) + is[i] == 0 ? nothing : is[i] end """ - find_all(s1::AbstractString, iter, dist::PreMetric; min_score = 0.8) -`find_all` returns the vector with all the elements of `iter` that have a similarity score higher or equal than `min_score` according to the distance `dist`. 
+ find_all(s::AbstractString, iter::AbstractVector, dist::PreMetric; min_score = 0.8) +`find_all` returns the vector of indices for elements of `iter` that have a similarity score higher than or equal to `min_score` according to the distance `dist`. The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`) """ -function find_all(s1::AbstractString, iter_s2, dist::PreMetric; min_score = 0.8) - best_s2s = [eltype(iter_s2)[] for _ in 1:Threads.nthreads()] - Threads.@threads for s2 in iter_s2 - score = compare(s1, s2, dist; min_score = min_score) +function find_all(s::AbstractString, iter::AbstractVector, dist::PreMetric; min_score = 0.8) + out = [Int[] for _ in 1:Threads.nthreads()] + Threads.@threads for i in 1:length(iter) + score = compare(s, iter[i], dist; min_score = min_score) if score >= min_score - push!(best_s2s[Threads.threadid()], s2) + push!(out[Threads.threadid()], i) end end - vcat(best_s2s...) + vcat(out...) 
end diff --git a/test/modifiers.jl b/test/modifiers.jl index f75c4a6..044b7bc 100644 --- a/test/modifiers.jl +++ b/test/modifiers.jl @@ -104,10 +104,12 @@ using StringDistances, Test end # check find_best and find_all - @test find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork" - @test find_best("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == "NewYork" - @test find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ["NewYork"] - @test find_all("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ["NewYork", "Newark"] + @test find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == 1 + @test find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.99) == nothing + @test find_best("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == 1 + @test find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1] + @test find_all("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == [1, 2] + @test find_all("New York", ["NewYork", "Newark", "San Francisco"], Jaro(); min_score = 0.99) == Int[] end