Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
paulirwin committed Sep 13, 2017
2 parents b7c19ec + 488b075 commit 3d9ddb8
Show file tree
Hide file tree
Showing 23 changed files with 171 additions and 855 deletions.
42 changes: 5 additions & 37 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,39 +1,7 @@
language: csharp
solution: F23.StringSimilarity.sln
install:
- nuget restore F23.StringSimilarity.sln
- nuget install xunit.runner.console -Version 2.1.0 -OutputDirectory testrunner
# - nuget install coveralls.io -Version 1.3.4 -OutputDirectory coveralls

# - CURRENT_WORKING_DIR=$(pwd)

# - sudo apt-get install gtk-sharp2
# - curl -sS https://api.nuget.org/packages/mono.cecil.0.9.5.4.nupkg > /tmp/mono.cecil.0.9.5.4.nupkg.zip
# - unzip /tmp/mono.cecil.0.9.5.4.nupkg.zip -d /tmp/cecil
# - cp /tmp/cecil/lib/net40/Mono.Cecil.dll .
# - cp /tmp/cecil/lib/net40/Mono.Cecil.dll /tmp/cecil/
# - git clone --depth=50 git://github.com/csMACnz/monocov.git ../../csMACnz/monocov
# - cd ../../csMACnz/monocov
# - cp /tmp/cecil/Mono.Cecil.dll .
# - ./configure
# - make
# - sudo make install
# - cd $CURRENT_WORKING_DIR

mono: none
dotnet: 2.0.0
dist: trusty
script:
- xbuild /p:Configuration=Release F23.StringSimilarity.sln
- mono ./testrunner/xunit.runner.console.2.1.0/tools/xunit.console.exe ./test/F23.StringSimilarity.Tests/bin/Release/F23.StringSimilarity.Tests.dll

# - export LD_LIBRARY_PATH=/usr/local/lib
# - mono --debug --profile=monocov:outfile=monocovCoverage.cov,+[F23.StringSimilarity],+[F23.StringSimilarity.Tests] ./testrunner/xunit.runner.console.2.1.0/tools/xunit.console.exe ./test/F23.StringSimilarity.Tests/bin/Release/F23.StringSimilarity.Tests.dll
# - monocov --export-xml=monocovCoverage monocovCoverage.cov
# - REPO_COMMIT_AUTHOR=$(git show -s --pretty=format:"%cn")
# - REPO_COMMIT_AUTHOR_EMAIL=$(git show -s --pretty=format:"%ce")
# - REPO_COMMIT_MESSAGE=$(git show -s --pretty=format:"%s")
# - echo $TRAVIS_COMMIT
# - echo $TRAVIS_BRANCH
# - echo $REPO_COMMIT_AUTHOR
# - echo $REPO_COMMIT_AUTHOR_EMAIL
# - echo $REPO_COMMIT_MESSAGE
# - echo $TRAVIS_JOB_ID
# - mono ./coveralls/coveralls.io.1.3.4/tools/coveralls.net.exe --monocov -i ./monocovCoverage --commitId $TRAVIS_COMMIT --commitBranch $TRAVIS_BRANCH --commitAuthor "$REPO_COMMIT_AUTHOR" --commitEmail "$REPO_COMMIT_AUTHOR_EMAIL" --commitMessage "$REPO_COMMIT_MESSAGE" --jobId $TRAVIS_JOB_ID --serviceName "travis-ci" --useRelativePaths
- dotnet build -c Release src/F23.StringSimilarity/F23.StringSimilarity.csproj
- dotnet test test/F23.StringSimilarity.Tests/F23.StringSimilarity.Tests.csproj
22 changes: 22 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,25 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Portions of this code are licensed and copyright as follows:

Copyright 2015 Thibault Debatty.

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 changes: 0 additions & 22 deletions build/F23.StringSimilarity.nuspec

This file was deleted.

16 changes: 10 additions & 6 deletions src/F23.StringSimilarity/Cosine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,14 @@ public Cosine(int k) : base(k) { }
/// Default k is 3.
/// </summary>
public Cosine() { }

/// <summary>
/// Compute the cosine similarity between strings.
/// </summary>
/// <param name="s1">The first string to compare.</param>
/// <param name="s2">The second string to compare.</param>
/// <returns>The cosine similarity in the range [0, 1]</returns>
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
/// <exception cref="T:System.ArgumentNullException">If s1 or s2 is null.</exception>
public double Similarity(string s1, string s2)
{
if (s1 == null)
Expand Down Expand Up @@ -118,9 +118,7 @@ private static double DotProduct(IDictionary<string, int> profile1,
double agg = 0;
foreach (var entry in small_profile)
{
int i;

if (!large_profile.TryGetValue(entry.Key, out i)) continue;
if (!large_profile.TryGetValue(entry.Key, out var i)) continue;

agg += 1.0 * entry.Value * i;
}
Expand All @@ -137,7 +135,13 @@ private static double DotProduct(IDictionary<string, int> profile1,
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
public double Distance(string s1, string s2)
=> 1.0 - Similarity(s1, s2);


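/// <summary>
/// Compute the cosine similarity between two precomputed profiles, i.e. their
/// dot product divided by the product of their norms.
/// </summary>
/// <param name="profile1">The first profile.</param>
/// <param name="profile2">The second profile.</param>
/// <returns>DotProduct(profile1, profile2) / (Norm(profile1) * Norm(profile2)).</returns>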
public double Similarity(IDictionary<string, int> profile1, IDictionary<string, int> profile2)
=> DotProduct(profile1, profile2)
/ (Norm(profile1) * Norm(profile2));
Expand Down
1 change: 0 additions & 1 deletion src/F23.StringSimilarity/Damerau.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ namespace F23.StringSimilarity
/// substitution of a single character, or a transposition of two adjacent
/// characters.
/// It does respect triangle inequality, and is thus a metric distance.
///
/// This is not to be confused with the optimal string alignment distance, which
/// is an extension where no substring can be edited more than once.
/// </summary>
Expand Down
94 changes: 20 additions & 74 deletions src/F23.StringSimilarity/F23.StringSimilarity.csproj
Original file line number Diff line number Diff line change
@@ -1,77 +1,23 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProjectGuid>{FA27327B-BCCC-46C7-8EED-BBD1ECF4BF53}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>F23.StringSimilarity</RootNamespace>
<AssemblyName>F23.StringSimilarity</AssemblyName>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<TargetFramework>netstandard1.0</TargetFramework>
<PackageId>F23.StringSimilarity</PackageId>
<PackageVersion>3.0.0</PackageVersion>
<Title>StringSimilarity.NET</Title>
<Authors>James Blair, Paul Irwin</Authors>
<Copyright>Copyright 2016 feature[23]</Copyright>
<Description>A .NET port of java-string-similarity.</Description>
<Summary>A .NET port of java-string-similarity (https://github.com/tdebatty/java-string-similarity). A library implementing different string similarity and distance measures. Several algorithms (including Levenshtein edit distance and siblings, Jaro-Winkler, Longest Common Subsequence, cosine similarity etc.) are currently implemented.</Summary>
<PackageProjectUrl>https://github.com/feature23/StringSimilarity.NET</PackageProjectUrl>
<PackageLicenseUrl>https://raw.githubusercontent.com/feature23/StringSimilarity.NET/master/LICENSE</PackageLicenseUrl>
<PackageIconUrl>https://raw.githubusercontent.com/feature23/StringSimilarity.NET/master/logo.png</PackageIconUrl>
<PackageRequireLicenseAcceptance>false</PackageRequireLicenseAcceptance>
<PackageTags>string similarity distance cosine damerau jaccard jaro-winkler levenshtein ngram qgram shingle sift4</PackageTags>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>

<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|AnyCPU'">
<DocumentationFile>bin\Release\netstandard1.0\F23.StringSimilarity.xml</DocumentationFile>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="System.Xml.Linq" />
<Reference Include="System.Data.DataSetExtensions" />
<Reference Include="Microsoft.CSharp" />
<Reference Include="System.Data" />
<Reference Include="System.Net.Http" />
<Reference Include="System.Xml" />
</ItemGroup>
<ItemGroup>
<Compile Include="Cosine.cs" />
<Compile Include="Damerau.cs" />
<Compile Include="Experimental\Sift4.cs" />
<Compile Include="ICharacterSubstitution.cs" />
<Compile Include="Interfaces\IMetricStringDistance.cs" />
<Compile Include="Interfaces\INormalizedStringDistance.cs" />
<Compile Include="Interfaces\INormalizedStringSimilarity.cs" />
<Compile Include="Interfaces\IStringDistance.cs" />
<Compile Include="Interfaces\IStringSimilarity.cs" />
<Compile Include="Jaccard.cs" />
<Compile Include="JaroWinkler.cs" />
<Compile Include="Levenshtein.cs" />
<Compile Include="LongestCommonSubsequence.cs" />
<Compile Include="MetricLCS.cs" />
<Compile Include="NGram.cs" />
<Compile Include="NormalizedLevenshtein.cs" />
<Compile Include="OptimalStringAlignment.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="QGram.cs" />
<Compile Include="ShingleBased.cs" />
<Compile Include="SorensenDice.cs" />
<Compile Include="Support\ArrayExtensions.cs" />
<Compile Include="Utils\SparseBooleanVector.cs" />
<Compile Include="Utils\SparseIntegerVector.cs" />
<Compile Include="WeightedLevenshtein.cs" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

</Project>
6 changes: 6 additions & 0 deletions src/F23.StringSimilarity/Interfaces/IMetricStringDistance.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ namespace F23.StringSimilarity.Interfaces
/// </summary>
public interface IMetricStringDistance : IStringDistance
{
/// <summary>
/// Compute and return the metric distance.
/// </summary>
/// <param name="s1">The first string to compare.</param>
/// <param name="s2">The second string to compare.</param>
/// <returns>The metric distance between s1 and s2.</returns>
// Declared with `new`, so this member hides the Distance member inherited
// from IStringDistance rather than adding an overload.
new double Distance(string s1, string s2);
}
}
7 changes: 7 additions & 0 deletions src/F23.StringSimilarity/Interfaces/IStringDistance.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@ namespace F23.StringSimilarity.Interfaces
{
/// <summary>
/// Contract for algorithms that compute a distance between two strings.
/// </summary>
public interface IStringDistance
{
/// <summary>
/// Compute and return a measure of distance.
/// Must be >= 0.
/// </summary>
/// <param name="s1">The first string to compare.</param>
/// <param name="s2">The second string to compare.</param>
/// <returns>The distance between s1 and s2; implementations are expected to return a value >= 0.</returns>
double Distance(string s1, string s2);
}
}
29 changes: 22 additions & 7 deletions src/F23.StringSimilarity/Jaccard.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,30 @@

namespace F23.StringSimilarity
{
/// <summary>
/// Each input string is converted into a set of n-grams, the Jaccard index is
/// then computed as |V1 inter V2| / |V1 union V2|.
/// Like Q-Gram distance, the input strings are first converted into sets of
/// n-grams (sequences of n characters, also called k-shingles), but this time
/// the cardinality of each n-gram is not taken into account.
/// Distance is computed as 1 - Jaccard index (similarity).
/// Jaccard distance is a metric distance.
/// </summary>
public class Jaccard : ShingleBased, IMetricStringDistance, INormalizedStringDistance, INormalizedStringSimilarity
{
/// <summary>
/// The strings are first transformed into sets of k-shingles (sequences of k
/// characters), then Jaccard index is computed as |A inter B| / |A union B|.
/// The default value of k is 3.
/// </summary>
/// <param name="k">The shingle size, i.e. the number of characters in each k-shingle.</param>
public Jaccard(int k) : base(k) { }

/// <summary>
/// The strings are first transformed into sets of k-shingles (sequences of k
/// characters), then Jaccard index is computed as |A inter B| / |A union B|.
/// The default value of k is 3.
/// </summary>
public Jaccard() { }

/// <summary>
Expand Down Expand Up @@ -67,13 +87,8 @@ public double Similarity(string s1, string s2)
union.UnionWith(profile1.Keys);
union.UnionWith(profile2.Keys);

int inter = 0;

foreach (var key in union)
{
if (profile1.ContainsKey(key) && profile2.ContainsKey(key))
inter++;
}
int inter = profile1.Keys.Count + profile2.Keys.Count
- union.Count;

return 1.0 * inter / union.Count;
}
Expand Down
18 changes: 9 additions & 9 deletions src/F23.StringSimilarity/JaroWinkler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ public double Similarity(string s1, string s2)
public double Distance(string s1, string s2)
=> 1.0 - Similarity(s1, s2);

private int[] Matches(string s1, string s2)
private static int[] Matches(string s1, string s2)
{
string max, min;
if (s1.Length > s2.Length)
Expand All @@ -135,20 +135,20 @@ private int[] Matches(string s1, string s2)

//int[] matchIndexes = new int[min.Length];
//Arrays.fill(matchIndexes, -1);
int[] matchIndexes = Enumerable.Repeat(-1, min.Length).ToArray();
int[] match_indexes = Enumerable.Repeat(-1, min.Length).ToArray();

bool[] matchFlags = new bool[max.Length];
bool[] match_flags = new bool[max.Length];
int matches = 0;
for (int mi = 0; mi < min.Length; mi++)
{
char c1 = min[mi];
for (int xi = Math.Max(mi - range, 0),
xn = Math.Min(mi + range + 1, max.Length); xi < xn; xi++)
{
if (!matchFlags[xi] && c1 == max[xi])
if (!match_flags[xi] && c1 == max[xi])
{
matchIndexes[mi] = xi;
matchFlags[xi] = true;
match_indexes[mi] = xi;
match_flags[xi] = true;
matches++;
break;
}
Expand All @@ -158,15 +158,15 @@ private int[] Matches(string s1, string s2)
char[] ms2 = new char[matches];
for (int i = 0, si = 0; i < min.Length; i++)
{
if (matchIndexes[i] != -1)
if (match_indexes[i] != -1)
{
ms1[si] = min[i];
si++;
}
}
for (int i = 0, si = 0; i < max.Length; i++)
{
if (matchFlags[i])
if (match_flags[i])
{
ms2[si] = max[i];
si++;
Expand All @@ -192,7 +192,7 @@ private int[] Matches(string s1, string s2)
break;
}
}
return new int[] { matches, transpositions / 2, prefix, max.Length };
return new[] { matches, transpositions / 2, prefix, max.Length };
}
}
}
Loading

0 comments on commit 3d9ddb8

Please sign in to comment.