Skip to content

Commit

Permalink
rmv datastructures + add docs
Browse files Browse the repository at this point in the history
  • Loading branch information
matthieugomez committed Dec 13, 2019
1 parent 8f9ab74 commit a575eea
Show file tree
Hide file tree
Showing 9 changed files with 141 additions and 72 deletions.
6 changes: 2 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
name = "StringDistances"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.5.1"
version = "0.5.2"

[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"

[compat]
julia = "1"
DataStructures = "0.14, 0.15, 0.16, 0.17"
Distances = "0.2, 0.3, 0.4, 0.4, 0.6, 0.7, 0.8"
Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)

This Julia package computes various distances between `AbstractString`s
This Julia package computes various distances between AbstractStrings

## Installation
The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`.
Expand Down Expand Up @@ -46,7 +46,7 @@ compare("martha", "marhta", TokenSet(Jaro()))
compare("martha", "marhta", TokenMax(RatcliffObershelp()))
```

In case the word order does not matter, a good distance is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)).
A good distance to match strings composed of multiple words (like addresses) is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)).

## Find
- `findmax` returns the value and index of the element in `itr` with the highest similarity score with `s`. Its syntax is:
Expand Down
1 change: 0 additions & 1 deletion src/StringDistances.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ module StringDistances

using Distances
import Distances: evaluate, result_type
using DataStructures # for SortedSet in TokenSort

##############################################################################
##
Expand Down
121 changes: 84 additions & 37 deletions src/compare.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
"""
compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
compare returns a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the distance `dist`
return a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the `StringDistance` `dist`
### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1::AbstractString, s2::AbstractString,
dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
Expand Down Expand Up @@ -38,46 +44,56 @@ function compare(s1::AbstractString, s2::AbstractString,
end

"""
Winkler(dist::StringDistance, p::Real = 0.1, boosting_threshold::Real = 0.7, l::Integer = 4)
Winkler(dist::StringDistance; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)
Winkler is a `StringDistance` modifier that boosts the similarity score between
two strings by a scale `p` when the strings share a common prefix with length lower
than `l` (the boost is only applied when the similarity score is above `boosting_threshold`)
Creates the `Winkler{dist, p, threshold, maxlength}` distance
`Winkler{dist, p, threshold, maxlength}` modifies the string distance `dist` to boost the
similarity score between two strings, when their original similarity score is above some `threshold`.
The boost is equal to `min(l, maxlength) * p * (1 - score)` where `l` denotes the
length of their common prefix and `score` denotes the original score
"""
struct Winkler{T1 <: StringDistance, T2 <: Real, T3 <: Real, T4 <: Integer} <: StringDistance
dist::T1
p::T2 # scaling factor. Default to 0.1
boosting_threshold::T3 # boost threshold. Default to 0.7
l::Integer # length of common prefix. Default to 4
function Winkler(dist::T1, p::T2, boosting_threshold::T3, l::T4) where {T1, T2, T3, T4}
p * l >= 1 && throw("scaling factor times length of common prefix must be lower than one")
new{T1, T2, T3, T4}(dist, p, boosting_threshold, l)
end
# Winkler modifier: wraps a base distance and boosts the similarity score of
# string pairs that share a common prefix (Jaro-Winkler generalized to any distance).
struct Winkler{S <: StringDistance} <: StringDistance
    dist::S            # wrapped base distance
    p::Float64         # scaling factor for the prefix boost. Default to 0.1
    threshold::Float64 # boost applied only when base score >= threshold. Default to 0.7
    maxlength::Int     # max length of common prefix counted in the boost. Default to 4
    # NOTE: field is concrete `Int` (not abstract `Integer`) so instances are
    # isbits-friendly and field access specializes; integer args still convert.
end

# Keyword-argument convenience constructor with the documented defaults.
function Winkler(dist::StringDistance; p = 0.1, threshold = 0.7, maxlength = 4)
    # The boost is min(l, maxlength) * p * (1 - score), so requiring
    # p * maxlength <= 1 guarantees the boosted score stays within [0, 1].
    # Throw a proper Exception (not a raw String) on invalid parameters.
    p * maxlength <= 1 || throw(ArgumentError("scaling factor times maxlength of common prefix must not exceed one"))
    # BUGFIX: forward the user-supplied keyword values instead of hard-coding
    # the defaults (the previous body always built Winkler(dist, 0.1, 0.7, 4)).
    Winkler(dist, p, threshold, maxlength)
end
Winkler(x) = Winkler(x, 0.1, 0.7, 4)

# hard to use min_score because of whether there is boost or not in the end
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_score = 0.0)
l = remove_prefix(s1, s2, dist.l)[1]
# cannot do min_score because of boosting threshold
score = compare(s1, s2, dist.dist)
if score >= dist.boosting_threshold
score += l * dist.p * (1 - score)
if score >= dist.threshold
l = common_prefix(s1, s2)[1]
score += min(l, dist.maxlength) * dist.p * (1 - score)
end
return score
end

JaroWinkler() = Winkler(Jaro(), 0.1, 0.7)


"""
Partial(dist::StringDistance)
Partial is a `StringDistance` modifier that returns the maximal similarity score
between the shorter string and substrings of the longer string
Creates the `Partial{dist}` distance
`Partial{dist}` modifies the string distance `dist` to return the
maximal similarity score between the shorter string and substrings of the longer string
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, Partial(RatcliffObershelp()))
0.4516129032258065
```
"""
struct Partial{T <: StringDistance} <: StringDistance
dist::T
# Partial modifier: wraps a base distance; its `compare` method (defined below,
# truncated in this view) scores the shorter string against substrings of the
# longer string and keeps the best match.
struct Partial{S <: StringDistance} <: StringDistance
dist::S # wrapped base distance applied to each candidate substring
end

function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_score = 0.0)
Expand Down Expand Up @@ -121,8 +137,19 @@ end
"""
TokenSort(dist::StringDistance)
TokenSort is a `StringDistance` modifier that adjusts for differences in word orders
by reordering words alphabetically.
Creates the `TokenSort{dist}` distance
`TokenSort{dist}` modifies the string distance `dist` to adjust for differences
in word orders by reordering words alphabetically.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
1.0
```
"""
struct TokenSort{T <: StringDistance} <: StringDistance
dist::T
Expand All @@ -139,17 +166,27 @@ end
"""
TokenSet(dist::StringDistance)
TokenSet is a `StringDistance` modifier that adjusts for differences in word orders
and word numbers by comparing the intersection of two strings with each string.
Creates the `TokenSet{dist}` distance
`TokenSet{dist}` modifies the string distance `dist` to adjust for differences
in word orders and word numbers, by comparing the intersection of two strings with each string.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
1.0
```
"""
struct TokenSet{T <: StringDistance} <: StringDistance
dist::T
end

# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_score = 0.0)
v1 = SortedSet(split(s1))
v2 = SortedSet(split(s2))
v1 = unique!(sort!(split(s1)))
v2 = unique!(sort!(split(s2)))
v0 = intersect(v1, v2)
s0 = join(v0, " ")
s1 = join(v1, " ")
Expand All @@ -167,12 +204,22 @@ end
"""
TokenMax(dist::StringDistance)
TokenMax is a `StringDistance` modifier that combines similarity scores using the base
distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on
string lengths.
Creates the `TokenMax{dist}` distance
`TokenMax{dist}` combines similarity scores of the base distance `dist`,
its [`Partial`](@ref) modifier, its [`TokenSort`](@ref) modifier, and its
[`TokenSet`](@ref) modifier, with penalty terms depending on string lengths.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
0.95
```
"""
struct TokenMax{T <: StringDistance} <: StringDistance
dist::T
# TokenMax modifier: wraps a base distance; its `compare` method (defined below,
# truncated in this view) takes the maximum of the base score and its Partial,
# TokenSort and TokenSet variants, with length-dependent penalty weights.
struct TokenMax{S <: StringDistance} <: StringDistance
dist::S # wrapped base distance combined across the token-based strategies
end

function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0)
Expand Down
4 changes: 2 additions & 2 deletions src/edit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
k, x1, x2start = common_prefix(s1, s2)
x1 == nothing && return len2 - k
# distance initialized to first row of matrix
# => distance between "" and s2[1:i}
Expand Down Expand Up @@ -141,7 +141,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
k, x1, x2start = common_prefix(s1, s2)
(x1 == nothing) && return len2 - k
v0 = collect(1:(len2 - k))
v2 = similar(v0)
Expand Down
37 changes: 31 additions & 6 deletions src/find.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,28 @@
highest similarity score with `s` according to the distance `dist`.
It returns `(nothing, nothing)` if none of the elements has a similarity score
higher or equal to `min_score` (default to 0.0).
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).
### Examples
```julia-repl
julia> using StringDistances
julia> s = ""Newark"
julia> iter = ["New York", "Princeton", "San Francisco"]
julia> findmax(s, iter, Levenshtein())
("NewYork", 1)
julia> findmax(s, iter, Levenshtein(); min_score = 0.9)
(nothing, nothing)
```
"""
function Base.findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0)
min_score = Threads.Atomic{typeof(min_score)}(min_score)
min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score)
scores = [0.0 for _ in 1:Threads.nthreads()]
is = [0 for _ in 1:Threads.nthreads()]
Threads.@threads for i in collect(keys(itr))
score = compare(s, itr[i], dist; min_score = min_score[])
score_old = Threads.atomic_max!(min_score, score)
score = compare(s, itr[i], dist; min_score = min_score_atomic[])
score_old = Threads.atomic_max!(min_score_atomic, score)
if score >= score_old
scores[Threads.threadid()] = score
is[Threads.threadid()] = i
Expand All @@ -30,8 +42,21 @@ end
`findall` returns the vector of indices for elements of `itr` that have a
similarity score higher or equal than `min_score` according to the distance `dist`.
If there are no such elements, return an empty array.
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
### Examples
```julia-repl
julia> using StringDistances
julia> s = "Newark"
julia> iter = ["Newwark", "Princeton", "San Francisco"]
julia> findall(s, iter, Levenshtein())
1-element Array{Int64,1}:
1
julia> findall(s, iter, Levenshtein(); min_score = 0.9)
0-element Array{Int64,1}
```
"""
function Base.findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8)
out = [Int[] for _ in 1:Threads.nthreads()]
Expand Down
22 changes: 11 additions & 11 deletions src/qgram.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ abstract type QGramDistance <: StringDistance end

function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString)
x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
evaluate(dist, x)
evaluate(dist, values(x))
end

# For two iterators x1 and x2, this returns a dictionary which, for each element in x1 or x2,
Expand Down Expand Up @@ -98,9 +98,9 @@ struct QGram <: QGramDistance
q::Int
end

function evaluate(dist::QGram, count_dict)
function evaluate(dist::QGram, itr)
n = 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
n += abs(n1 - n2)
end
n
Expand All @@ -122,9 +122,9 @@ struct Cosine <: QGramDistance
q::Int
end

function evaluate(dist::Cosine, count_dict)
function evaluate(dist::Cosine, itr)
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
norm1 += n1^2
norm2 += n2^2
prodnorm += n1 * n2
Expand All @@ -147,9 +147,9 @@ struct Jaccard <: QGramDistance
q::Int
end

function evaluate(dist::Jaccard, count_dict)
function evaluate(dist::Jaccard, itr)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
Expand All @@ -172,9 +172,9 @@ struct SorensenDice <: QGramDistance
q::Int
end

function evaluate(dist::SorensenDice, count_dict)
function evaluate(dist::SorensenDice, itr)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
Expand All @@ -197,9 +197,9 @@ struct Overlap <: QGramDistance
q::Int
end

function evaluate(dist::Overlap, count_dict)
function evaluate(dist::Overlap, itr)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
Expand Down
Loading

2 comments on commit a575eea

@matthieugomez
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/6694

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if Julia TagBot is installed, or can be done manually through the github interface, or via:

git tag -a v0.5.2 -m "<description of version>" a575eeab6a2e7a1a827c47a93202022215103fce
git push origin v0.5.2

Please sign in to comment.