From a575eeab6a2e7a1a827c47a93202022215103fce Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Fri, 13 Dec 2019 10:33:06 -0500 Subject: [PATCH] rmv datastructures + add docs --- Project.toml | 6 +- README.md | 4 +- src/StringDistances.jl | 1 - src/compare.jl | 121 ++++++++++++++++++++++++++++------------- src/edit.jl | 4 +- src/find.jl | 37 +++++++++++-- src/qgram.jl | 22 ++++---- src/utils.jl | 15 ++--- test/modifiers.jl | 3 + 9 files changed, 141 insertions(+), 72 deletions(-) diff --git a/Project.toml b/Project.toml index bdc3cdc..f620db1 100644 --- a/Project.toml +++ b/Project.toml @@ -1,15 +1,13 @@ name = "StringDistances" uuid = "88034a9c-02f8-509d-84a9-84ec65e18404" -version = "0.5.1" +version = "0.5.2" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" -DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" [compat] julia = "1" -DataStructures = "0.14, 0.15, 0.16, 0.17" -Distances = "0.2, 0.3, 0.4, 0.4, 0.6, 0.7, 0.8" +Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/README.md b/README.md index ffddda1..3e109cc 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl) [![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master) -This Julia package computes various distances between `AbstractString`s +This Julia package computes various distances between AbstractStrings ## Installation The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`. 
@@ -46,7 +46,7 @@ compare("martha", "marhta", TokenSet(Jaro())) compare("martha", "marhta", TokenMax(RatcliffObershelp())) ``` -In case the word order does not matter, a good distance is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)). +A good distance to match strings composed of multiple words (like addresses) is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)). ## Find - `findmax` returns the value and index of the element in `itr` with the highest similarity score with `s`. Its syntax is: diff --git a/src/StringDistances.jl b/src/StringDistances.jl index cc128fd..908ff67 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -2,7 +2,6 @@ module StringDistances using Distances import Distances: evaluate, result_type -using DataStructures # for SortedSet in TokenSort ############################################################################## ## diff --git a/src/compare.jl b/src/compare.jl index 298b6ef..a790e7d 100755 --- a/src/compare.jl +++ b/src/compare.jl @@ -1,8 +1,14 @@ """ compare(s1::AbstractString, s2::AbstractString, dist::StringDistance) -compare returns a similarity score between 0 and 1 for the strings `s1` and -`s2` based on the distance `dist` +return a similarity score between 0 and 1 for the strings `s1` and +`s2` based on the `StringDistance` `dist` + +### Examples +```julia-repl +julia> compare("martha", "marhta", Levenshtein()) +0.6666666666666667 +``` """ function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0) @@ -38,46 +44,56 @@ function compare(s1::AbstractString, s2::AbstractString, end """ - Winkler(dist::StringDistance, p::Real = 0.1, boosting_threshold::Real = 0.7, l::Integer = 4) + Winkler(dist::StringDistance; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4) -Winkler is a `StringDistance` modifier that boosts 
the similarity score between -two strings by a scale `p` when the strings share a common prefix with lenth lower -than `l` (the boost is only applied the similarity score above `boosting_threshold`) +Creates the `Winkler{dist, p, threshold, maxlength}` distance + +`Winkler{dist, p, threshold, maxlength}` modifies the string distance `dist` to boost the +similarity score between two strings, when their original similarity score is above some `threshold`. +The boost is equal to `min(l, maxlength) * p * (1 - score)` where `l` denotes the +length of their common prefix and `score` denotes the original score """ -struct Winkler{T1 <: StringDistance, T2 <: Real, T3 <: Real, T4 <: Integer} <: StringDistance - dist::T1 - p::T2 # scaling factor. Default to 0.1 - boosting_threshold::T3 # boost threshold. Default to 0.7 - l::Integer # length of common prefix. Default to 4 - function Winkler(dist::T1, p::T2, boosting_threshold::T3, l::T4) where {T1, T2, T3, T4} - p * l >= 1 && throw("scaling factor times length of common prefix must be lower than one") - new{T1, T2, T3, T4}(dist, p, boosting_threshold, l) - end +struct Winkler{S <: StringDistance} <: StringDistance + dist::S + p::Float64 # scaling factor. Default to 0.1 + threshold::Float64 # boost threshold. Default to 0.7 + maxlength::Integer # max length of common prefix. 
Default to 4 +end + +function Winkler(dist::StringDistance; p = 0.1, threshold = 0.7, maxlength = 4) + p * maxlength <= 1 || throw(ArgumentError("scaling factor times maxlength of common prefix must be at most one")) + Winkler(dist, p, threshold, maxlength) +end -Winkler(x) = Winkler(x, 0.1, 0.7, 4) -# hard to use min_score because of whether there is boost or not in the end function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_score = 0.0) - l = remove_prefix(s1, s2, dist.l)[1] # cannot do min_score because of boosting threshold score = compare(s1, s2, dist.dist) - if score >= dist.boosting_threshold - score += l * dist.p * (1 - score) + if score >= dist.threshold + l = common_prefix(s1, s2)[1] + score += min(l, dist.maxlength) * dist.p * (1 - score) end return score end -JaroWinkler() = Winkler(Jaro(), 0.1, 0.7) - """ Partial(dist::StringDistance) -Partial is a `StringDistance` modifier that returns the maximal similarity score -between the shorter string and substrings of the longer string +Creates the `Partial{dist}` distance + +`Partial{dist}` modifies the string distance `dist` to return the +maximal similarity score between the shorter string and substrings of the longer string + +### Examples +```julia-repl +julia> s1 = "New York Mets vs Atlanta Braves" +julia> s2 = "Atlanta Braves vs New York Mets" +julia> compare(s1, s2, Partial(RatcliffObershelp())) +0.4516129032258065 +``` """ -struct Partial{T <: StringDistance} <: StringDistance - dist::T +struct Partial{S <: StringDistance} <: StringDistance + dist::S end function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_score = 0.0) @@ -121,8 +137,19 @@ end """ TokenSort(dist::StringDistance) -TokenSort is a `StringDistance` modifier that adjusts for differences in word orders -by reording words alphabetically. +Creates the `TokenSort{dist}` distance + +`TokenSort{dist}` modifies the string distance `dist` to adjust for differences +in word orders by reordering words alphabetically. 
+ +### Examples +```julia-repl +julia> using StringDistances +julia> s1 = "New York Mets vs Atlanta Braves" +julia> s2 = "Atlanta Braves vs New York Mets" +julia> compare(s1, s2, TokenSort(RatcliffObershelp())) +1.0 +``` """ struct TokenSort{T <: StringDistance} <: StringDistance dist::T @@ -139,8 +166,18 @@ end """ TokenSet(dist::StringDistance) -TokenSort is a `StringDistance` modifier that adjusts for differences in word orders -and word numbers by comparing the intersection of two strings with each string. +Creates the `TokenSet{dist}` distance + +`TokenSet{dist}` modifies the string distance `dist` to adjust for differences +in word orders and word numbers, by comparing the intersection of two strings with each string. + +### Examples +```julia-repl +julia> s1 = "New York Mets vs Atlanta" +julia> s2 = "Atlanta Braves vs New York Mets" +julia> compare(s1, s2, TokenSet(RatcliffObershelp())) +1.0 +``` """ struct TokenSet{T <: StringDistance} <: StringDistance dist::T @@ -148,8 +185,8 @@ end # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_score = 0.0) - v1 = SortedSet(split(s1)) - v2 = SortedSet(split(s2)) + v1 = unique!(sort!(split(s1))) + v2 = unique!(sort!(split(s2))) v0 = intersect(v1, v2) s0 = join(v0, " ") s1 = join(v1, " ") @@ -167,12 +204,22 @@ end """ TokenMax(dist::StringDistance) -TokenSort is a `StringDistance` modifier that combines similarlity scores using the base -distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on -string lengths. +Creates the `TokenMax{dist}` distance + +`TokenMax{dist}` combines similarity scores of the base distance `dist`, +its [`Partial`](@ref) modifier, its [`TokenSort`](@ref) modifier, and its +[`TokenSet`](@ref) modifier, with penalty terms depending on string lengths. 
+ +### Examples +```julia-repl +julia> s1 = "New York Mets vs Atlanta" +julia> s2 = "Atlanta Braves vs New York Mets" +julia> compare(s1, s2, TokenMax(RatcliffObershelp())) +0.95 +``` """ -struct TokenMax{T <: StringDistance} <: StringDistance - dist::T +struct TokenMax{S <: StringDistance} <: StringDistance + dist::S end function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0) diff --git a/src/edit.jl b/src/edit.jl index eb53a11..b6856e3 100755 --- a/src/edit.jl +++ b/src/edit.jl @@ -89,7 +89,7 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max len1, len2 = length(s1), length(s2) max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 # prefix common to both strings can be ignored - k, x1, x2start = remove_prefix(s1, s2) + k, x1, x2start = common_prefix(s1, s2) x1 == nothing && return len2 - k # distance initialized to first row of matrix # => distance between "" and s2[1:i} @@ -141,7 +141,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri len1, len2 = length(s1), length(s2) max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 # prefix common to both strings can be ignored - k, x1, x2start = remove_prefix(s1, s2) + k, x1, x2start = common_prefix(s1, s2) (x1 == nothing) && return len2 - k v0 = collect(1:(len2 - k)) v2 = similar(v0) diff --git a/src/find.jl b/src/find.jl index 79a5d3a..300f102 100755 --- a/src/find.jl +++ b/src/find.jl @@ -5,16 +5,28 @@ highest similarity score with `s` according to the distance `dist`. It returns `(nothing, nothing)` if none of the elements has a similarity score higher or equal to `min_score` (default to 0.0). -The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances -(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`). 
+ +It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances +(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)). + +### Examples +```julia-repl +julia> using StringDistances +julia> s = "Newark" +julia> iter = ["NewYork", "Princeton", "San Francisco"] +julia> findmax(s, iter, Levenshtein()) +("NewYork", 1) +julia> findmax(s, iter, Levenshtein(); min_score = 0.9) +(nothing, nothing) +``` """ function Base.findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0) - min_score = Threads.Atomic{typeof(min_score)}(min_score) + min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score) scores = [0.0 for _ in 1:Threads.nthreads()] is = [0 for _ in 1:Threads.nthreads()] Threads.@threads for i in collect(keys(itr)) - score = compare(s, itr[i], dist; min_score = min_score[]) - score_old = Threads.atomic_max!(min_score, score) + score = compare(s, itr[i], dist; min_score = min_score_atomic[]) + score_old = Threads.atomic_max!(min_score_atomic, score) if score >= score_old scores[Threads.threadid()] = score is[Threads.threadid()] = i @@ -30,8 +42,21 @@ end `findall` returns the vector of indices for elements of `itr` that have a similarity score higher or equal than `min_score` according to the distance `dist`. If there are no such elements, return an empty array. + +It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances (as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`). 
+ +### Examples +```julia-repl +julia> using StringDistances +julia> s = "Newark" +julia> iter = ["Newwark", "Princeton", "San Francisco"] +julia> findall(s, iter, Levenshtein()) +1-element Array{Int64,1}: + 1 +julia> findall(s, iter, Levenshtein(); min_score = 0.9) +0-element Array{Int64,1} +``` """ function Base.findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8) out = [Int[] for _ in 1:Threads.nthreads()] diff --git a/src/qgram.jl b/src/qgram.jl index 71dd72f..a2045b2 100755 --- a/src/qgram.jl +++ b/src/qgram.jl @@ -48,7 +48,7 @@ abstract type QGramDistance <: StringDistance end function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString) x = count_map(qgram(s1, dist.q), qgram(s2, dist.q)) - evaluate(dist, x) + evaluate(dist, values(x)) end # For two iterators x1 and x2, this returns a dictionary which, for each element in x1 or x2, @@ -98,9 +98,9 @@ struct QGram <: QGramDistance q::Int end -function evaluate(dist::QGram, count_dict) +function evaluate(dist::QGram, itr) n = 0 - for (n1, n2) in values(count_dict) + for (n1, n2) in itr n += abs(n1 - n2) end n @@ -122,9 +122,9 @@ struct Cosine <: QGramDistance q::Int end -function evaluate(dist::Cosine, count_dict) +function evaluate(dist::Cosine, itr) norm1, norm2, prodnorm = 0, 0, 0 - for (n1, n2) in values(count_dict) + for (n1, n2) in itr norm1 += n1^2 norm2 += n2^2 prodnorm += n1 * n2 @@ -147,9 +147,9 @@ struct Jaccard <: QGramDistance q::Int end -function evaluate(dist::Jaccard, count_dict) +function evaluate(dist::Jaccard, itr) ndistinct1, ndistinct2, nintersect = 0, 0, 0 - for (n1, n2) in values(count_dict) + for (n1, n2) in itr ndistinct1 += n1 > 0 ndistinct2 += n2 > 0 nintersect += (n1 > 0) & (n2 > 0) @@ -172,9 +172,9 @@ struct SorensenDice <: QGramDistance q::Int end -function evaluate(dist::SorensenDice, count_dict) +function evaluate(dist::SorensenDice, itr) ndistinct1, ndistinct2, nintersect = 0, 0, 0 - for (n1, n2) in values(count_dict) + for (n1, n2) in itr 
ndistinct1 += n1 > 0 ndistinct2 += n2 > 0 nintersect += (n1 > 0) & (n2 > 0) @@ -197,9 +197,9 @@ struct Overlap <: QGramDistance q::Int end -function evaluate(dist::Overlap, count_dict) +function evaluate(dist::Overlap, itr) ndistinct1, ndistinct2, nintersect = 0, 0, 0 - for (n1, n2) in values(count_dict) + for (n1, n2) in itr ndistinct1 += n1 > 0 ndistinct2 += n2 > 0 nintersect += (n1 > 0) & (n2 > 0) diff --git a/src/utils.jl b/src/utils.jl index b3c232e..c6000c9 100755 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,6 +1,5 @@ -# String with Length -# This allows to compute length once and only once -struct StringWithLength{T<:AbstractString} <: AbstractString +# This type allows to compute length once and for all +struct StringWithLength{T <: AbstractString} <: AbstractString s::T l::Int end @@ -21,19 +20,17 @@ function reorder(s1::AbstractString, s2::AbstractString) end end - -## Find common prefixes (up to lim. -1 means Inf) -function remove_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1) - l = 0 +function common_prefix(s1::AbstractString, s2::AbstractString) x1 = iterate(s1) x2 = iterate(s2) - while (x1 !== nothing) & (x2 !== nothing) & (l < lim || lim < 0) + l = 0 + while (x1 !== nothing) & (x2 !== nothing) ch1, state1 = x1 ch2, state2 = x2 ch1 != ch2 && break + l += 1 x1 = iterate(s1, state1) x2 = iterate(s2, state2) - l += 1 end return l, x1, x2 end \ No newline at end of file diff --git a/test/modifiers.jl b/test/modifiers.jl index a670644..fc62919 100644 --- a/test/modifiers.jl +++ b/test/modifiers.jl @@ -97,6 +97,9 @@ using StringDistances, Test # check find_best and find_all @test findmax("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ("NewYork", 1) + @test findmax("New York", ["San Francisco", "NewYork", "Newark"], Levenshtein()) == ("NewYork", 2) + @test findmax("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3) + @test findmax("New York", ["NewYork", "Newark", "San Francisco"], 
Levenshtein(); min_score = 0.99) == (nothing, nothing) @test findmax("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1) @test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]