-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsearch.py
39 lines (36 loc) · 1.76 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
# fuzzy finder class
class Search():
    """Rank the rows of a DataFrame by TF-IDF similarity to a free-text query.

    A ``TfidfVectorizer`` is fitted on one text column of the frame at
    construction time; ``search`` then scores every row against a query
    string and returns the matching rows ordered by relevance.

    Attributes:
        df: The DataFrame passed to the constructor (kept by reference).
        vectorizer: The fitted ``TfidfVectorizer``.
        matrix: Sparse document/feature matrix, one row per row of ``df``.
        features: Set of the distinct terms (words or n-grams) that were
            learned from the corpus.
    """

    # expects DataFrame and a name of column which becomes a learning corpus
    def __init__(self, df, column, analyzer='word', ngram_range=(1, 1),
                 max_df=0.28):
        """Fit the TF-IDF model on ``df[column]``.

        Args:
            df: DataFrame holding the documents to search.
            column: Name of the text column used as the learning corpus.
            analyzer: Forwarded to ``TfidfVectorizer`` (``analyzer``).
            ngram_range: Forwarded to ``TfidfVectorizer`` (``ngram_range``).
            max_df: Forwarded to ``TfidfVectorizer`` (``max_df``). The
                default of 0.28 was picked by the original author to filter
                out very common words such as "the".
        """
        self.df = df
        self.vectorizer = TfidfVectorizer(
            analyzer=analyzer, ngram_range=ngram_range, max_df=max_df)
        # creates feature/document matrix where features are distinct words
        # or ngrams
        self.matrix = self.vectorizer.fit_transform(df[column].values)
        # ``vocabulary_`` maps every learned term to its column index, so its
        # keys are the feature names; unlike ``get_feature_names()`` (removed
        # in scikit-learn 1.2) it is available in every release.
        self.features = set(self.vectorizer.vocabulary_)

    # performs query matching against the matrix
    def search(self, string):
        """Return the rows of ``df`` relevant to ``string``, best match first.

        Args:
            string: The query text.

        Returns:
            A new DataFrame with the columns of ``df`` plus a trailing
            ``score`` column, restricted to rows whose score is non-zero,
            keeping their original index labels and sorted by ``score`` in
            descending order. Empty when nothing matches.

        Raises:
            ValueError: If ``df`` already has a column named ``score``.
        """
        if 'score' in self.df.columns:
            raise ValueError(
                "columns overlap: DataFrame already has a 'score' column")
        # scores features of query in accordance with corpus
        query = self.vectorizer.transform([string])
        # Dot product of every document row with the query row. Features
        # absent from the query contribute zero, so no column filtering is
        # needed beforehand.
        scores = (self.matrix @ query.T).toarray().ravel()
        # Positions of the relevant rows; selecting by position (not label)
        # stays correct even when the index of ``df`` has duplicates.
        hits = np.flatnonzero(scores)
        result = self.df.iloc[hits].assign(score=scores[hits])
        # ranks results by relevance
        return result.sort_values(by='score', ascending=False)