From cd7902912bde6562f9a3f09cd0a5fe5e9bcf9cfd Mon Sep 17 00:00:00 2001 From: Jeremy Grifski Date: Mon, 21 Mar 2022 19:36:29 -0400 Subject: [PATCH] Improved the Search Algorithm (#5) * Testing search method * Fixed the search algorithm * Incremented the patch * Added more test cases * Added a comment --- code_bot.py | 72 ++-------------------------------------------- code_bot_utils.py | 73 +++++++++++++++++++++++++++++++++++++++++++++++ test_code_bot.py | 36 +++++++++++++++++++++++ 3 files changed, 112 insertions(+), 69 deletions(-) create mode 100644 code_bot_utils.py create mode 100644 test_code_bot.py diff --git a/code_bot.py b/code_bot.py index c010d7d..6ef3445 100644 --- a/code_bot.py +++ b/code_bot.py @@ -1,6 +1,7 @@ import json import os -import string + +from code_bot_utils import * import discord from discord import Message @@ -11,74 +12,7 @@ from dotenv import load_dotenv -__version__ = "0.1.0" - - -# Helper methods -def generate_keyword_mapping(queries: list) -> dict: - """ - Creates a mapping of keywords to queries. - - :param queries: a list of queries with responses - :return: a dictionary of keywords to query indices - """ - keyword_to_queries = dict() - for i, question in enumerate(queries): - if question.get('query'): - keywords = generate_keywords(question.get("query")) - keywords.extend(generate_keywords(question.get("response"))) - for keyword in keywords: - keyword_to_queries.setdefault(keyword, []).append(i) - return keyword_to_queries - - -def generate_keywords(query: string) -> list: - """ - Create a list of keywords from a query. - - :param query: a search query - :return: the list of keywords from that query - """ - stop_words = ["", "is", "a", "the", "can", - "i", "to", "in", "by", "from", "be", "of"] - keywords = query \ - .translate(str.maketrans('', '', string.punctuation)) \ - .lower() \ - .split(" ") - keywords = [word for word in keywords if word not in stop_words] - return keywords - - -def search(keyword_to_queries: dict, keywords: list) -> list: - """ - Looks up the list of queries that satisfy a keyword. - - :param keyword_to_queries: a mapping of keywords to query indices - :param keywords: a list of keywords to lookup - :return: a list of query indices - """ - query_count = dict() - for keyword in keywords: - query_indices = keyword_to_queries.get(keyword, []) - for i in query_indices: - query_count.setdefault(i, 0) - query_count[i] += 1 - best_matches = list( - dict(sorted(query_count.items(), key=lambda item: item[1])).keys()) - return best_matches - - -def create_md_link(url: string, text: string) -> string: - """ - Creates a markdown link. - - :param url: the url to link to - :param text: the text to display - :return: the markdown link - """ - if url: - return f"[{text}]({url})" - return text +__version__ = "0.1.1" # Global variables diff --git a/code_bot_utils.py b/code_bot_utils.py new file mode 100644 index 0000000..b62256c --- /dev/null +++ b/code_bot_utils.py @@ -0,0 +1,73 @@ +import string + +def generate_keyword_mapping(queries: list) -> dict: + """ + Creates a mapping of keywords to queries. + + :param queries: a list of queries with responses + :return: a dictionary of keywords to query indices + """ + keyword_to_queries = dict() + for i, question in enumerate(queries): + if question.get('query'): + keywords = generate_keywords(question.get("query")) + for keyword in keywords: + keyword_to_queries.setdefault(keyword, {}) + keyword_to_queries[keyword].setdefault(i, 0) + keyword_to_queries[keyword][i] += 10 + keywords = generate_keywords(question.get("response")) + for keyword in keywords: + keyword_to_queries.setdefault(keyword, {}) + keyword_to_queries[keyword].setdefault(i, 0) + keyword_to_queries[keyword][i] += 1 + return keyword_to_queries + + +def generate_keywords(query: string) -> list: + """ + Create a list of keywords from a query. + + :param query: a search query + :return: the list of keywords from that query + """ + stop_words = ["", "is", "a", "the", "can", + "i", "to", "in", "by", "from", "be", "of", + "what", "where", "when", "why", "how", "which"] + keywords = query \ + .translate(str.maketrans('', '', string.punctuation)) \ + .lower() \ + .split(" ") + keywords = [word for word in keywords if word not in stop_words] + return keywords + + +def search(keyword_to_queries: dict, keywords: list) -> list: + """ + Looks up the list of queries that satisfy a keyword. + + :param keyword_to_queries: a mapping of keywords to query indices + :param keywords: a list of keywords to lookup + :return: a list of query indices + """ + query_count = dict() + for keyword in keywords: + query_indices = keyword_to_queries.get(keyword, {}) + for i, weight in query_indices.items(): + query_count.setdefault(i, 0) + query_count[i] += weight + best_matches = list( + dict(sorted(query_count.items(), key=lambda item: item[1], reverse=True)).keys()) + return best_matches + + +def create_md_link(url: string, text: string) -> string: + """ + Creates a markdown link. + + :param url: the url to link to + :param text: the text to display + :return: the markdown link + """ + if url: + return f"[{text}]({url})" + return text diff --git a/test_code_bot.py b/test_code_bot.py new file mode 100644 index 0000000..0fba8b4 --- /dev/null +++ b/test_code_bot.py @@ -0,0 +1,36 @@ +import json +from code_bot_utils import * + +queries = json.load(open("queries.json")) +keyword_mapping = generate_keyword_mapping(queries) + +def top_queries(top_ids): + return [queries[i]["query"] for i in top_ids] + +def test_search_exact_match_first(): + top_ids = search(keyword_mapping, ["what", "is", "a", "magic", "number"])[:1] + assert 1 in top_ids + assert "What is a magic number?" in top_queries(top_ids) + +def test_search_exact_match_middle(): + top_ids = search(keyword_mapping, ["what", "is", "method", "overriding"])[:1] + assert 12 in top_ids + assert "What is method overriding?" in top_queries(top_ids) + +def test_search_routine_match(): + top_ids = search(keyword_mapping, ["what", "does", "implements", "mean"])[:3] + assert 14 in top_ids + assert "What is the implements relationship?" in top_queries(top_ids) + +def test_generate_keyword_mapping(): + test_query = { + "query": "How now brown cow?", + "response": "The cow is brown." + } + expected_mapping = { + "brown": {0: 11}, # index: weight + "cow": {0: 11}, + "now": {0: 10} + } + keyword_mapping = generate_keyword_mapping([test_query]) + assert keyword_mapping == expected_mapping