Skip to content

Commit

Permalink
rmv datastructures + add docs
Browse files Browse the repository at this point in the history
  • Loading branch information
matthieugomez committed Dec 13, 2019
1 parent 8f9ab74 commit a575eea
Show file tree
Hide file tree
Showing 9 changed files with 141 additions and 72 deletions.
6 changes: 2 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
name = "StringDistances"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.5.1"
version = "0.5.2"

[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"

[compat]
julia = "1"
DataStructures = "0.14, 0.15, 0.16, 0.17"
Distances = "0.2, 0.3, 0.4, 0.4, 0.6, 0.7, 0.8"
Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)

This Julia package computes various distances between `AbstractString`s
This Julia package computes various distances between AbstractStrings

## Installation
The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`.
Expand Down Expand Up @@ -46,7 +46,7 @@ compare("martha", "marhta", TokenSet(Jaro()))
compare("martha", "marhta", TokenMax(RatcliffObershelp()))
```

In case the word order does not matter, a good distance is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)).
A good distance to match strings composed of multiple words (like addresses) is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)).

## Find
- `findmax` returns the value and index of the element in `itr` with the highest similarity score with `s`. Its syntax is:
Expand Down
1 change: 0 additions & 1 deletion src/StringDistances.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ module StringDistances

using Distances
import Distances: evaluate, result_type
using DataStructures # for SortedSet in TokenSort

##############################################################################
##
Expand Down
121 changes: 84 additions & 37 deletions src/compare.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
"""
compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
compare returns a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the distance `dist`
return a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the `StringDistance` `dist`
### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1::AbstractString, s2::AbstractString,
dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
Expand Down Expand Up @@ -38,46 +44,56 @@ function compare(s1::AbstractString, s2::AbstractString,
end

"""
Winkler(dist::StringDistance, p::Real = 0.1, boosting_threshold::Real = 0.7, l::Integer = 4)
Winkler(dist::StringDistance; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)
Winkler is a `StringDistance` modifier that boosts the similarity score between
two strings by a scale `p` when the strings share a common prefix with length lower
than `l` (the boost is only applied when the similarity score is above `boosting_threshold`)
Creates the `Winkler{dist, p, threshold, maxlength}` distance
`Winkler{dist, p, threshold, maxlength}` modifies the string distance `dist` to boost the
similarity score between two strings, when their original similarity score is above some `threshold`.
The boost is equal to `min(l, maxlength) * p * (1 - score)` where `l` denotes the
length of their common prefix and `score` denotes the original score
"""
struct Winkler{T1 <: StringDistance, T2 <: Real, T3 <: Real, T4 <: Integer} <: StringDistance
dist::T1
p::T2 # scaling factor. Default to 0.1
boosting_threshold::T3 # boost threshold. Default to 0.7
l::Integer # length of common prefix. Default to 4
function Winkler(dist::T1, p::T2, boosting_threshold::T3, l::T4) where {T1, T2, T3, T4}
p * l >= 1 && throw("scaling factor times length of common prefix must be lower than one")
new{T1, T2, T3, T4}(dist, p, boosting_threshold, l)
end
# Winkler modifier: wraps a base distance and boosts the similarity score of
# string pairs that share a common prefix (Jaro-Winkler generalized to any distance).
struct Winkler{S <: StringDistance} <: StringDistance
    dist::S            # wrapped base distance
    p::Float64         # scaling factor for the prefix boost. Default to 0.1
    threshold::Float64 # boost applied only when base score >= threshold. Default to 0.7
    maxlength::Int     # max length of common prefix counted in the boost. Default to 4
    # NOTE: field is concrete `Int` (not abstract `Integer`) so instances are
    # isbits-friendly and field access specializes; integer args still convert.
end

# Keyword-argument convenience constructor with the documented defaults.
function Winkler(dist::StringDistance; p = 0.1, threshold = 0.7, maxlength = 4)
    # The boost is min(l, maxlength) * p * (1 - score), so requiring
    # p * maxlength <= 1 guarantees the boosted score stays within [0, 1].
    # Throw a proper Exception (not a raw String) on invalid parameters.
    p * maxlength <= 1 || throw(ArgumentError("scaling factor times maxlength of common prefix must not exceed one"))
    # BUGFIX: forward the user-supplied keyword values instead of hard-coding
    # the defaults (the previous body always built Winkler(dist, 0.1, 0.7, 4)).
    Winkler(dist, p, threshold, maxlength)
end
Winkler(x) = Winkler(x, 0.1, 0.7, 4)

# hard to use min_score because of whether there is boost or not in the end
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_score = 0.0)
l = remove_prefix(s1, s2, dist.l)[1]
# cannot do min_score because of boosting threshold
score = compare(s1, s2, dist.dist)
if score >= dist.boosting_threshold
score += l * dist.p * (1 - score)
if score >= dist.threshold
l = common_prefix(s1, s2)[1]
score += min(l, dist.maxlength) * dist.p * (1 - score)
end
return score
end

JaroWinkler() = Winkler(Jaro(), 0.1, 0.7)


"""
Partial(dist::StringDistance)
Partial is a `StringDistance` modifier that returns the maximal similarity score
between the shorter string and substrings of the longer string
Creates the `Partial{dist}` distance
`Partial{dist}` modifies the string distance `dist` to return the
maximal similarity score between the shorter string and substrings of the longer string
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, Partial(RatcliffObershelp()))
0.4516129032258065
```
"""
struct Partial{T <: StringDistance} <: StringDistance
dist::T
# Partial modifier: wraps a base distance; its `compare` method (defined below,
# truncated in this view) scores the shorter string against substrings of the
# longer string and keeps the best match.
struct Partial{S <: StringDistance} <: StringDistance
dist::S # wrapped base distance applied to each candidate substring
end

function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_score = 0.0)
Expand Down Expand Up @@ -121,8 +137,19 @@ end
"""
TokenSort(dist::StringDistance)
TokenSort is a `StringDistance` modifier that adjusts for differences in word orders
by reordering words alphabetically.
Creates the `TokenSort{dist}` distance
`TokenSort{dist}` modifies the string distance `dist` to adjust for differences
in word orders by reordering words alphabetically.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
1.0
```
"""
struct TokenSort{T <: StringDistance} <: StringDistance
dist::T
Expand All @@ -139,17 +166,27 @@ end
"""
TokenSet(dist::StringDistance)
TokenSet is a `StringDistance` modifier that adjusts for differences in word orders
and word numbers by comparing the intersection of two strings with each string.
Creates the `TokenSet{dist}` distance
`TokenSet{dist}` modifies the string distance `dist` to adjust for differences
in word orders and word numbers, by comparing the intersection of two strings with each string.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
1.0
```
"""
struct TokenSet{T <: StringDistance} <: StringDistance
dist::T
end

# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_score = 0.0)
v1 = SortedSet(split(s1))
v2 = SortedSet(split(s2))
v1 = unique!(sort!(split(s1)))
v2 = unique!(sort!(split(s2)))
v0 = intersect(v1, v2)
s0 = join(v0, " ")
s1 = join(v1, " ")
Expand All @@ -167,12 +204,22 @@ end
"""
TokenMax(dist::StringDistance)
TokenMax is a `StringDistance` modifier that combines similarity scores using the base
distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on
string lengths.
Creates the `TokenMax{dist}` distance
`TokenMax{dist}` combines similarity scores of the base distance `dist`,
its [`Partial`](@ref) modifier, its [`TokenSort`](@ref) modifier, and its
[`TokenSet`](@ref) modifier, with penalty terms depending on string lengths.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
0.95
```
"""
struct TokenMax{T <: StringDistance} <: StringDistance
dist::T
# TokenMax modifier: wraps a base distance; its `compare` method (defined below,
# truncated in this view) takes the maximum of the base score and its Partial,
# TokenSort and TokenSet variants, with length-dependent penalty weights.
struct TokenMax{S <: StringDistance} <: StringDistance
dist::S # wrapped base distance combined across the token-based strategies
end

function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0)
Expand Down
4 changes: 2 additions & 2 deletions src/edit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
k, x1, x2start = common_prefix(s1, s2)
x1 == nothing && return len2 - k
# distance initialized to first row of matrix
# => distance between "" and s2[1:i}
Expand Down Expand Up @@ -141,7 +141,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
k, x1, x2start = common_prefix(s1, s2)
(x1 == nothing) && return len2 - k
v0 = collect(1:(len2 - k))
v2 = similar(v0)
Expand Down
37 changes: 31 additions & 6 deletions src/find.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,28 @@
highest similarity score with `s` according to the distance `dist`.
It returns `(nothing, nothing)` if none of the elements has a similarity score
higher or equal to `min_score` (default to 0.0).
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).
### Examples
```julia-repl
julia> using StringDistances
julia> s = ""Newark"
julia> iter = ["New York", "Princeton", "San Francisco"]
julia> findmax(s, iter, Levenshtein())
("NewYork", 1)
julia> findmax(s, iter, Levenshtein(); min_score = 0.9)
(nothing, nothing)
```
"""
function Base.findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0)
min_score = Threads.Atomic{typeof(min_score)}(min_score)
min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score)
scores = [0.0 for _ in 1:Threads.nthreads()]
is = [0 for _ in 1:Threads.nthreads()]
Threads.@threads for i in collect(keys(itr))
score = compare(s, itr[i], dist; min_score = min_score[])
score_old = Threads.atomic_max!(min_score, score)
score = compare(s, itr[i], dist; min_score = min_score_atomic[])
score_old = Threads.atomic_max!(min_score_atomic, score)
if score >= score_old
scores[Threads.threadid()] = score
is[Threads.threadid()] = i
Expand All @@ -30,8 +42,21 @@ end
`findall` returns the vector of indices for elements of `itr` that have a
similarity score higher or equal than `min_score` according to the distance `dist`.
If there are no such elements, return an empty array.
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
### Examples
```julia-repl
julia> using StringDistances
julia> s = "Newark"
julia> iter = ["Newwark", "Princeton", "San Francisco"]
julia> findall(s, iter, Levenshtein())
1-element Array{Int64,1}:
1
julia> findall(s, iter, Levenshtein(); min_score = 0.9)
0-element Array{Int64,1}
```
"""
function Base.findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8)
out = [Int[] for _ in 1:Threads.nthreads()]
Expand Down
22 changes: 11 additions & 11 deletions src/qgram.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ abstract type QGramDistance <: StringDistance end

function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString)
x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
evaluate(dist, x)
evaluate(dist, values(x))
end

# For two iterators x1 and x2, this returns a dictionary which, for each element in x1 or x2,
Expand Down Expand Up @@ -98,9 +98,9 @@ struct QGram <: QGramDistance
q::Int
end

function evaluate(dist::QGram, count_dict)
function evaluate(dist::QGram, itr)
n = 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
n += abs(n1 - n2)
end
n
Expand All @@ -122,9 +122,9 @@ struct Cosine <: QGramDistance
q::Int
end

function evaluate(dist::Cosine, count_dict)
function evaluate(dist::Cosine, itr)
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
norm1 += n1^2
norm2 += n2^2
prodnorm += n1 * n2
Expand All @@ -147,9 +147,9 @@ struct Jaccard <: QGramDistance
q::Int
end

function evaluate(dist::Jaccard, count_dict)
function evaluate(dist::Jaccard, itr)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
Expand All @@ -172,9 +172,9 @@ struct SorensenDice <: QGramDistance
q::Int
end

function evaluate(dist::SorensenDice, count_dict)
function evaluate(dist::SorensenDice, itr)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
Expand All @@ -197,9 +197,9 @@ struct Overlap <: QGramDistance
q::Int
end

function evaluate(dist::Overlap, count_dict)
function evaluate(dist::Overlap, itr)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
Expand Down
Loading

2 comments on commit a575eea

@matthieugomez
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/6694

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if Julia TagBot is installed, or can be done manually through the github interface, or via:

git tag -a v0.5.2 -m "<description of version>" a575eeab6a2e7a1a827c47a93202022215103fce
git push origin v0.5.2

Please sign in to comment.