From 31c42446f442e1c1e2f7f81c3044bebdbc9f9293 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Thu, 27 Apr 2023 23:34:07 +0530 Subject: [PATCH 01/25] added support for the question answering task to be applied on DataFrames --- hugging_py_face/nlp.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py index 83aebde..96fc52d 100644 --- a/hugging_py_face/nlp.py +++ b/hugging_py_face/nlp.py @@ -111,6 +111,31 @@ def question_answering(self, question: Text, context: Text, model: Optional[Text task='question-answering' ) + def question_answering_in_df(self, df: DataFrame, question_column: Text, context_column: Text, model: Optional[Text] = None) -> DataFrame: + """ + Generate answers for a column of questions based on a provided column of context. + + :param df: a pandas DataFrame containing the questions to be answered along with the relevant context. + :param question_column: the column containing the questions to be answered. + :param context_column: the column containing the relevant context for each question. + :param model: the model to use for the question answering task. If not provided, the recommended model from Hugging Face will be used. + :return: a pandas DataFrame with the answers for the questions. The answers will be added as a new column called 'predictions' to the original DataFrame. + """ + answers = [] + for index, row in df.iterrows(): + answer = self._query( + { + "question": row[question_column], + "context": row[context_column] + }, + model=model, + task='question-answering' + ) + answers.append(answer['answer']) + + df['predictions'] = answers + return df + def sentence_similarity(self, source_sentence: Text, sentences: List, options: Optional[Dict] = None, model: Optional[Text] = None) -> List: """ Calculate the semantic similarity between one text and a list of other sentences by comparing their embeddings. From 1fcb44f88acf426cd22cd5bcfbfa828ca679ff82 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Thu, 27 Apr 2023 23:50:36 +0530 Subject: [PATCH 02/25] added support for the sentence similarity task to be applied on DataFrames --- hugging_py_face/nlp.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py index 96fc52d..51b90d3 100644 --- a/hugging_py_face/nlp.py +++ b/hugging_py_face/nlp.py @@ -156,6 +156,31 @@ def sentence_similarity(self, source_sentence: Text, sentences: List, options: O task='sentence-similarity' ) + def sentence_similarity_in_df(self, df: DataFrame, source_sentence_column: Text, sentence_column: Text, options: Optional[Dict] = None, model: Optional[Text] = None) -> DataFrame: + """ + + :param df: a pandas DataFrame containing the source sentences and the sentences to be compared against. + :param source_sentence_column: the column containing the strings that you wish to compare the other strings with. + :param sentence_column: the column containing the strings which will be compared against the source_sentence. + :param options: a dict of options. For more information, see the `detailed parameters for the sentence similarity task `_. + :param model: the model to use for the sentence similarity task. If not provided, the recommended model from Hugging Face will be used. + :return: a pandas DataFrame with the similarity scores for the sentences. The scores will be added as a new column called 'predictions' to the original DataFrame. + """ + scores = [] + for index, row in df.iterrows(): + score = self._query( + { + "source_sentence": row[source_sentence_column], + "sentences": [row[sentence_column]] + }, + model=model, + task='sentence-similarity' + ) + scores.append(score[0]) + + df['predictions'] = scores + return df + def text_classification(self, text: Union[Text, List], options: Optional[Dict] = None, model: Optional[Text] = None) -> Union[Dict, List]: """ Analyze the sentiment of a string or a list of strings. From cf93c4c9c87f51e9aecd0a096f0007328985273f Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 00:11:59 +0530 Subject: [PATCH 03/25] added support for the zero shot classification task to be applied on DataFrames --- hugging_py_face/nlp.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py index 51b90d3..cd33088 100644 --- a/hugging_py_face/nlp.py +++ b/hugging_py_face/nlp.py @@ -158,6 +158,7 @@ def sentence_similarity(self, source_sentence: Text, sentences: List, options: O def sentence_similarity_in_df(self, df: DataFrame, source_sentence_column: Text, sentence_column: Text, options: Optional[Dict] = None, model: Optional[Text] = None) -> DataFrame: """ + Calculate the semantic similarity between sentences in two columns by comparing their embeddings. :param df: a pandas DataFrame containing the source sentences and the sentences to be compared against. :param source_sentence_column: the column containing the strings that you wish to compare the other strings with. @@ -254,6 +255,22 @@ def zero_shot_classification(self, text: Union[Text, List], candidate_labels: Li task='zero-shot-classification' ) + def zero_shot_classification_in_df(self, df: DataFrame, column: Text, candidate_labels: List, parameters: Optional[Dict] = {}, options: Optional[Dict] = None, model: Optional[Text] = None): + """ + + :param df: a pandas DataFrame containing the strings to be classified. + :param column: the column containing the strings to be classified. + :param candidate_labels: a list of strings that are potential classes for inputs. + :param parameters: a dict of parameters excluding candidate_labels which is passed in as a separate argument. For more information, see the `detailed parameters for the zero shot classification task `_. + :param options: a dict of options. For more information, see the `detailed parameters for the zero shot classification task `_. + :param model: the model to use for the zero shot classification task. If not provided, the recommended model from Hugging Face will be used. + :return: a pandas DataFrame with the classifications. The classifications will be added as a new column called 'predictions' to the original DataFrame. + """ + parameters['candidate_labels'] = candidate_labels + predictions = self._query_in_df(df, column, parameters=parameters, options=options, model=model, task='zero-shot-classification') + df['predictions'] = [prediction['labels'][0] for prediction in predictions] + return df + def conversational(self, text: Union[Text, List], past_user_inputs: Optional[List] = None, generated_responses: Optional[List] = None, parameters: Optional[Dict] = None, options: Optional[Dict] = None, model: Optional[Text] = None) -> Union[Dict, List]: """ Corresponds to any chatbot like structure: pass in some text along with the past_user_inputs and generated_responses to receive a response. From bb7a7457900a6b2b139b443001cbaa2165220c2f Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 00:27:28 +0530 Subject: [PATCH 04/25] added tests for question_answering_in_df, sentence_similarity_in_df and zero_shot_classification_in_df --- tests/nlp/test_nlp_in_df.py | 57 ++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/tests/nlp/test_nlp_in_df.py b/tests/nlp/test_nlp_in_df.py index 528b4d0..5429778 100644 --- a/tests/nlp/test_nlp_in_df.py +++ b/tests/nlp/test_nlp_in_df.py @@ -42,6 +42,45 @@ def test_summarize_in_df(self): ), ) + def test_question_answering_in_df(self): + questions = ["What's my name?"] + contexts = ["My name is Clara and I live in Berkeley"] + df = pd.DataFrame({ + "questions": questions, + "contexts": contexts + }) + + assert_frame_equal( + self.nlp.question_answering_in_df(df, 'questions', 'contexts'), + pd.DataFrame( + { + "questions": questions, + "contexts": contexts, + "predictions": ["Clara"], + } + ), + ) + + def test_sentence_similarity_in_df(self): + source_sentences = ["That is a happy person"] + sentences = ["That is a happy dog"] + + df = pd.DataFrame({ + "source_sentences": source_sentences, + "sentences": sentences + }) + + assert_frame_equal( + self.nlp.sentence_similarity_in_df(df, 'source_sentences', 'sentences'), + pd.DataFrame( + { + "source_sentences": source_sentences, + "sentences": sentences, + "predictions": [0.6945773363113403], + } + ), + ) + def test_text_classification_in_df(self): texts = ["I like you. I love you", "I don't like you. I hate you"] df = pd.DataFrame(texts, columns=['texts']) @@ -68,4 +107,20 @@ def test_text_generation_in_df(self): "predictions": ["The answer to the universe is that we find the Universe, a very large, unchanging, infinitely intricate, incredibly complex place that could not have been created by God in the first place. We'll explore this in more detail at the end of this"], } ), - ) \ No newline at end of file + ) + + def test_zero_shot_classification_in_df(self): + texts = ["Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!"] + df = pd.DataFrame(texts, columns=['texts']) + + candidate_labels = ["refund", "legal", "faq"] + + assert_frame_equal( + self.nlp.zero_shot_classification_in_df(df, 'texts', candidate_labels), + pd.DataFrame( + { + "texts": texts, + "predictions": ["refund"], + } + ), + ) From d30399a2f615eae2ef4912fd3617dcbbc69ac4dc Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 00:54:26 +0530 Subject: [PATCH 05/25] added support for the table question answering task --- hugging_py_face/nlp.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py index cd33088..ee0e46c 100644 --- a/hugging_py_face/nlp.py +++ b/hugging_py_face/nlp.py @@ -136,6 +136,25 @@ def question_answering_in_df(self, df: DataFrame, question_column: Text, context df['predictions'] = answers return df + def table_question_answering(self, question: Union[Text, List], table: List[Dict], options: Optional[Dict] = None, model: Optional[Text] = None) -> List: + """ + + :param question: a string or a list of strings of the question(s) to be answered. + :param table: a list of dicts representing a table of data. + :param options: a dict of options. For more information, see the `detailed parameters for the table question answering task `_. + :param model: the model to use for the table question answering task. If not provided, the recommended model from Hugging Face will be used. + :return: a dict or a list of dicts of the answers. + """ + return self._query( + { + "query": question, + "table": table + }, + options=options, + model=model, + task='question-answering' + ) + def sentence_similarity(self, source_sentence: Text, sentences: List, options: Optional[Dict] = None, model: Optional[Text] = None) -> List: """ Calculate the semantic similarity between one text and a list of other sentences by comparing their embeddings. From 54a86af181402a8fe06041ea594459dcdd027cb7 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 01:03:46 +0530 Subject: [PATCH 06/25] added support for the table question answering task to be applied on DataFrames --- hugging_py_face/nlp.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py index ee0e46c..a25c6e3 100644 --- a/hugging_py_face/nlp.py +++ b/hugging_py_face/nlp.py @@ -1,4 +1,6 @@ import json + +import pandas as pd import requests from pandas import DataFrame from typing import Text, List, Dict, Optional, Union @@ -152,9 +154,25 @@ def table_question_answering(self, question: Union[Text, List], table: List[Dict }, options=options, model=model, - task='question-answering' + task='table-question-answering' + ) + + def table_question_answering_task_in_df(self, df: DataFrame, question: Union[Text, List], options: Optional[Dict] = None, model: Optional[Text] = None) -> DataFrame: + answers = self._query( + { + "query": question, + "table": df.to_dict('list') + }, + options=options, + model=model, + task='table-question-answering' ) + return pd.DataFrame({ + "question": question, + "predictions": [answer['answer'] for answer in answers] + }) + def sentence_similarity(self, source_sentence: Text, sentences: List, options: Optional[Dict] = None, model: Optional[Text] = None) -> List: """ Calculate the semantic similarity between one text and a list of other sentences by comparing their embeddings. From 21f7399a05f736f31523944cb8539169e13bfed0 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 01:08:01 +0530 Subject: [PATCH 07/25] updated the type hint and the docstring of the table parameter in table_question_answering --- hugging_py_face/nlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py index a25c6e3..cea415d 100644 --- a/hugging_py_face/nlp.py +++ b/hugging_py_face/nlp.py @@ -138,11 +138,11 @@ def question_answering_in_df(self, df: DataFrame, question_column: Text, context df['predictions'] = answers return df - def table_question_answering(self, question: Union[Text, List], table: List[Dict], options: Optional[Dict] = None, model: Optional[Text] = None) -> List: + def table_question_answering(self, question: Union[Text, List], table: Dict[List], options: Optional[Dict] = None, model: Optional[Text] = None) -> List: """ :param question: a string or a list of strings of the question(s) to be answered. - :param table: a list of dicts representing a table of data. + :param table: a dict of lists representing a table of data. :param options: a dict of options. For more information, see the `detailed parameters for the table question answering task `_. :param model: the model to use for the table question answering task. If not provided, the recommended model from Hugging Face will be used. :return: a dict or a list of dicts of the answers. From a494bd5eea8c1caab9d81731a3d1329b588a91b9 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 01:09:06 +0530 Subject: [PATCH 08/25] added the recommended model for the table question answering task --- hugging_py_face/config/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/hugging_py_face/config/config.yaml b/hugging_py_face/config/config.yaml index 8d21cb3..75d2515 100644 --- a/hugging_py_face/config/config.yaml +++ b/hugging_py_face/config/config.yaml @@ -3,6 +3,7 @@ TASK_MODEL_MAP: fill-mask: bert-base-uncased summarization: facebook/bart-large-cnn question-answering: deepset/roberta-base-squad2 + table-question-answering: google/tapas-base-finetuned-wtq sentence-similarity: sentence-transformers/all-MiniLM-L6-v2 text-classification: distilbert-base-uncased-finetuned-sst-2-english text-generation: gpt2 From 7b5528ff671aeb9ec151679fc48400563b697bd7 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 01:11:50 +0530 Subject: [PATCH 09/25] updated the type hint of the table parameter in table_question_answering --- hugging_py_face/nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py index cea415d..0b498ed 100644 --- a/hugging_py_face/nlp.py +++ b/hugging_py_face/nlp.py @@ -138,7 +138,7 @@ def question_answering_in_df(self, df: DataFrame, question_column: Text, context df['predictions'] = answers return df - def table_question_answering(self, question: Union[Text, List], table: Dict[List], options: Optional[Dict] = None, model: Optional[Text] = None) -> List: + def table_question_answering(self, question: Union[Text, List], table: Dict[Text, List], options: Optional[Dict] = None, model: Optional[Text] = None) -> List: """ :param question: a string or a list of strings of the question(s) to be answered. From d0fe3d02aad3dfdde761f1e90f07fb436a55eccb Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 01:12:20 +0530 Subject: [PATCH 10/25] added a unit test for the table question answering task --- tests/nlp/test_nlp.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/nlp/test_nlp.py b/tests/nlp/test_nlp.py index afb8034..bc50145 100644 --- a/tests/nlp/test_nlp.py +++ b/tests/nlp/test_nlp.py @@ -77,6 +77,29 @@ def test_question_answering(self): } ) + def test_table_question_answering(self): + question = "How many stars does the transformers repository have?" + table = { + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": [ + "Python", + "Python", + "Rust, Python and NodeJS", + ], + } + + self.assertEqual( + self.nlp.table_question_answering(question, table), + { + "answer": "AVERAGE > 36542", + "coordinates": [[0, 1]], + "cells": ["36542"], + "aggregator": "AVERAGE", + }, + ) + def test_sentence_similarity(self): source_sentence = "That is a happy person" sentences = ["That is a happy dog", "That is a very happy person", "Today is a sunny day"] From 320a1486de4d4a431c02c91abcf89b2712206baa Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 12:46:36 +0530 Subject: [PATCH 11/25] added MAX_RETRIES and HTTP_SERVICE_UNAVAILABLE config variables --- hugging_py_face/config/config.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hugging_py_face/config/config.yaml b/hugging_py_face/config/config.yaml index 75d2515..805558d 100644 --- a/hugging_py_face/config/config.yaml +++ b/hugging_py_face/config/config.yaml @@ -13,4 +13,6 @@ TASK_MODEL_MAP: image-classification: google/vit-base-patch16-224 object-detection: facebook/detr-resnet-50 speech-recognition: facebook/wav2vec2-base-960h - audio-classification: superb/hubert-large-superb-er \ No newline at end of file + audio-classification: superb/hubert-large-superb-er +MAX_RETRIES: 5 +HTTP_SERVICE_UNAVAILABLE: 503 \ No newline at end of file From dcf3e1b22dabf339bb5177a589f1cff4a0289e0e Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 12:47:11 +0530 Subject: [PATCH 12/25] added the HTTPServiceUnavailableException exception in the exceptions module --- hugging_py_face/exceptions.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 hugging_py_face/exceptions.py diff --git a/hugging_py_face/exceptions.py b/hugging_py_face/exceptions.py new file mode 100644 index 0000000..b873290 --- /dev/null +++ b/hugging_py_face/exceptions.py @@ -0,0 +1,3 @@ + +class HTTPServiceUnavailableException(Exception): + pass From 516ff53da13756fe3adcca937def898c5158cf31 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 12:48:27 +0530 Subject: [PATCH 13/25] added a logger and a retry mechanism to NLP --- hugging_py_face/nlp.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py index 0b498ed..9e91ea5 100644 --- a/hugging_py_face/nlp.py +++ b/hugging_py_face/nlp.py @@ -1,11 +1,13 @@ import json - -import pandas as pd +import time +import logging import requests +import pandas as pd from pandas import DataFrame from typing import Text, List, Dict, Optional, Union from .config_parser import ConfigParser +from .exceptions import HTTPServiceUnavailableException class NLP: @@ -15,6 +17,11 @@ def __init__(self, api_token): config_parser = ConfigParser() self.config = config_parser.get_config_dict() + @property + def _logger(self): + logging.basicConfig(level=logging.INFO) + return logging.getLogger(__name__) + def _query(self, inputs: Union[Text, List, Dict], parameters: Optional[Dict] = None, options: Optional[Dict] = None, model: Optional[Text] = None, task: Optional[Text] = None) -> Union[Dict, List]: api_url = f"{self.config['BASE_URL']}/{model if model is not None else self.config['TASK_MODEL_MAP'][task]}" @@ -32,8 +39,21 @@ def _query(self, inputs: Union[Text, List, Dict], parameters: Optional[Dict] = N if options is not None: data['options'] = options - response = requests.request("POST", api_url, headers=headers, data=json.dumps(data)) - return json.loads(response.content.decode("utf-8")) + retries = 0 + + while retries < self.config['MAX_RETRIES']: + retries += 1 + + response = requests.request("POST", api_url, headers=headers, data=json.dumps(data)) + if response.status_code == self.config['HTTP_SERVICE_UNAVAILABLE']: + self._logger.info(f"Status code: {response.status_code}.") + self._logger.info("Retrying..") + time.sleep(1) + else: + return json.loads(response.content.decode("utf-8")) + + self._logger.debug(f"Status code: {json.loads(response.content.decode('utf-8'))}.") + raise HTTPServiceUnavailableException("The HTTP service is unavailable.") def _query_in_df(self, df: DataFrame, column: Text, parameters: Optional[Dict] = None, options: Optional[Dict] = None, model: Optional[Text] = None, task: Optional[Text] = None) -> Union[Dict, List]: return self._query(df[column].tolist(), parameters, options, model, task) From 2d74f6c3e6e87fa0586578f3a7e8dfe2f10622ec Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 12:57:37 +0530 Subject: [PATCH 14/25] fixed bug in the query method of NLP --- hugging_py_face/nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py index 9e91ea5..8d92eba 100644 --- a/hugging_py_face/nlp.py +++ b/hugging_py_face/nlp.py @@ -45,7 +45,7 @@ def _query(self, inputs: Union[Text, List, Dict], parameters: Optional[Dict] = N retries += 1 response = requests.request("POST", api_url, headers=headers, data=json.dumps(data)) - if response.status_code == self.config['HTTP_SERVICE_UNAVAILABLE']: + if response.status_code == int(self.config['HTTP_SERVICE_UNAVAILABLE']): self._logger.info(f"Status code: {response.status_code}.") self._logger.info("Retrying..") time.sleep(1) From 26e46e0101c247d57e44cb7014c6f52bf83c7044 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 12:59:01 +0530 Subject: [PATCH 15/25] updated the unit test for the question answering task to handle exceptions where the service is not available --- tests/nlp/test_nlp.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tests/nlp/test_nlp.py b/tests/nlp/test_nlp.py index bc50145..2edaba4 100644 --- a/tests/nlp/test_nlp.py +++ b/tests/nlp/test_nlp.py @@ -3,6 +3,7 @@ from dotenv import load_dotenv from hugging_py_face.nlp import NLP +from hugging_py_face.exceptions import HTTPServiceUnavailableException load_dotenv() @@ -66,16 +67,18 @@ def test_summarization(self): def test_question_answering(self): question = "What's my name?" context = "My name is Clara and I live in Berkeley" - - self.assertEqual( - self.nlp.question_answering(question, context), - { - "score": 0.7940344214439392, - "start": 11, - "end": 16, - "answer": "Clara" - } - ) + try: + self.assertEqual( + self.nlp.question_answering(question, context), + { + "score": 0.7940344214439392, + "start": 11, + "end": 16, + "answer": "Clara" + } + ) + except HTTPServiceUnavailableException: + self.assertRaises(HTTPServiceUnavailableException, lambda: self.nlp.question_answering(question, context)) def test_table_question_answering(self): question = "How many stars does the transformers repository have?" From 239bb44626974baac5865844c2861ffc4f04f769 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 13:02:33 +0530 Subject: [PATCH 16/25] updated a log message in the query method of NLP --- hugging_py_face/nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py index 8d92eba..47a2d06 100644 --- a/hugging_py_face/nlp.py +++ b/hugging_py_face/nlp.py @@ -52,7 +52,7 @@ def _query(self, inputs: Union[Text, List, Dict], parameters: Optional[Dict] = N else: return json.loads(response.content.decode("utf-8")) - self._logger.debug(f"Status code: {json.loads(response.content.decode('utf-8'))}.") + self._logger.debug(f"Response: {json.loads(response.content.decode('utf-8'))}.") raise HTTPServiceUnavailableException("The HTTP service is unavailable.") def _query_in_df(self, df: DataFrame, column: Text, parameters: Optional[Dict] = None, options: Optional[Dict] = None, model: Optional[Text] = None, task: Optional[Text] = None) -> Union[Dict, List]: From 5bae1d369c2c14ffd0aba661a150e23d6e37805c Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 13:09:14 +0530 Subject: [PATCH 17/25] added some extra log messages --- hugging_py_face/nlp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py index 47a2d06..e1687e8 100644 --- a/hugging_py_face/nlp.py +++ b/hugging_py_face/nlp.py @@ -52,6 +52,8 @@ def _query(self, inputs: Union[Text, List, Dict], parameters: Optional[Dict] = N else: return json.loads(response.content.decode("utf-8")) + self._logger.info(f"Status code: {response.status_code}.") + self._logger.info("Connection to the server failed after reaching maximum retry attempts.") self._logger.debug(f"Response: {json.loads(response.content.decode('utf-8'))}.") raise HTTPServiceUnavailableException("The HTTP service is unavailable.") From e7419c4e66016746d9433018bcf51232ed843384 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 13:16:16 +0530 Subject: [PATCH 18/25] updated all of the NLP unit tests to account for when the service is not available --- tests/nlp/test_nlp.py | 241 +++++++++++++++++++++++------------------- 1 file changed, 134 insertions(+), 107 deletions(-) diff --git a/tests/nlp/test_nlp.py b/tests/nlp/test_nlp.py index 2edaba4..ba9fc77 100644 --- a/tests/nlp/test_nlp.py +++ b/tests/nlp/test_nlp.py @@ -16,53 +16,59 @@ def setUpClass(cls): def test_fill_mask(self): text = "The answer to the universe is [MASK]." - self.assertEqual( - self.nlp.fill_mask(text), - [ - { - "sequence": "the answer to the universe is no.", - "score": 0.16963981091976166, - "token": 2053, - "token_str": "no", - }, - { - "sequence": "the answer to the universe is nothing.", - "score": 0.07344783842563629, - "token": 2498, - "token_str": "nothing", - }, - { - "sequence": "the answer to the universe is yes.", - "score": 0.05803249776363373, - "token": 2748, - "token_str": "yes", - }, - { - "sequence": "the answer to the universe is unknown.", - "score": 0.043957870453596115, - "token": 4242, - "token_str": "unknown", - }, - { - "sequence": "the answer to the universe is simple.", - "score": 0.040157340466976166, - "token": 3722, - "token_str": "simple", - }, - ], - ) + try: + self.assertEqual( + self.nlp.fill_mask(text), + [ + { + "sequence": "the answer to the universe is no.", + "score": 0.16963981091976166, + "token": 2053, + "token_str": "no", + }, + { + "sequence": "the answer to the universe is nothing.", + "score": 0.07344783842563629, + "token": 2498, + "token_str": "nothing", + }, + { + "sequence": "the answer to the universe is yes.", + "score": 0.05803249776363373, + "token": 2748, + "token_str": "yes", + }, + { + "sequence": "the answer to the universe is unknown.", + "score": 0.043957870453596115, + "token": 4242, + "token_str": "unknown", + }, + { + "sequence": "the answer to the universe is simple.", + "score": 0.040157340466976166, + "token": 3722, + "token_str": "simple", + }, + ], + ) + except HTTPServiceUnavailableException: + self.assertRaises(HTTPServiceUnavailableException, lambda: self.nlp.fill_mask(text)) def test_summarization(self): text = "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct." - self.assertEqual( - self.nlp.summarization(text), - [ - { - "summary_text": "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world.", - }, - ], - ) + try: + self.assertEqual( + self.nlp.summarization(text), + [ + { + "summary_text": "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world.", + }, + ], + ) + except HTTPServiceUnavailableException: + self.assertRaises(HTTPServiceUnavailableException, lambda: self.nlp.summarization(text)) def test_question_answering(self): question = "What's my name?" @@ -93,98 +99,119 @@ def test_table_question_answering(self): ], } - self.assertEqual( - self.nlp.table_question_answering(question, table), - { - "answer": "AVERAGE > 36542", - "coordinates": [[0, 1]], - "cells": ["36542"], - "aggregator": "AVERAGE", - }, - ) + try: + self.assertEqual( + self.nlp.table_question_answering(question, table), + { + "answer": "AVERAGE > 36542", + "coordinates": [[0, 1]], + "cells": ["36542"], + "aggregator": "AVERAGE", + }, + ) + except HTTPServiceUnavailableException: + self.assertRaises(HTTPServiceUnavailableException, lambda: self.nlp.table_question_answering(question, table)) def test_sentence_similarity(self): source_sentence = "That is a happy person" sentences = ["That is a happy dog", "That is a very happy person", "Today is a sunny day"] - self.assertEqual( - self.nlp.sentence_similarity(source_sentence, sentences), - [0.6945773363113403, 0.9429150819778442, 0.2568760812282562], - ) + try: + self.assertEqual( + self.nlp.sentence_similarity(source_sentence, sentences), + [0.6945773363113403, 0.9429150819778442, 0.2568760812282562], + ) + except HTTPServiceUnavailableException: + self.assertRaises(HTTPServiceUnavailableException, lambda: self.nlp.sentence_similarity(source_sentence, sentences)) def test_text_classification(self): text = "I like you. I love you" - self.assertEqual( - self.nlp.text_classification(text), - [ + try: + self.assertEqual( + self.nlp.text_classification(text), [ - {"label": "POSITIVE", "score": 0.9998738765716553}, - {"label": "NEGATIVE", "score": 0.00012611244164872915}, - ] - ], - ) + [ + {"label": "POSITIVE", "score": 0.9998738765716553}, + {"label": "NEGATIVE", "score": 0.00012611244164872915}, + ] + ], + ) + except HTTPServiceUnavailableException: + self.assertRaises(HTTPServiceUnavailableException, lambda: self.nlp.text_classification(text)) def test_text_generation(self): text = "The answer to the universe is" - self.assertEqual( - self.nlp.text_generation(text), - [ - {'generated_text': "The answer to the universe is in one's own minds and your " - 'thoughts.\n' - '\n' - 'When the universe is created to bring forth new creation, ' - 'our minds are born anew. The universe is a process ' - 'wherein all things are possible and the universe is one ' - 'created' - } - ] - ) + try: + self.assertEqual( + self.nlp.text_generation(text), + [ + {'generated_text': "The answer to the universe is in one's own minds and your " + 'thoughts.\n' + '\n' + 'When the universe is created to bring forth new creation, ' + 'our minds are born anew. The universe is a process ' + 'wherein all things are possible and the universe is one ' + 'created' + } + ] + ) + except HTTPServiceUnavailableException: + self.assertRaises(HTTPServiceUnavailableException, lambda: self.nlp.text_generation(text)) def test_zero_shot_classification(self): text = "Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!" candidate_labels = ["refund", "legal", "faq"] - self.assertEqual( - self.nlp.zero_shot_classification(text, candidate_labels), - { - "sequence": "Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!", - "labels": ["refund", "faq", "legal"], - "scores": [ - # 88% refund - 0.8777875304222107, - 0.10522652417421341, - 0.01698593609035015, - ], - }, - ) + try: + self.assertEqual( + self.nlp.zero_shot_classification(text, candidate_labels), + { + "sequence": "Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!", + "labels": ["refund", "faq", "legal"], + "scores": [ + # 88% refund + 0.8777875304222107, + 0.10522652417421341, + 0.01698593609035015, + ], + }, + ) + except HTTPServiceUnavailableException: + self.assertRaises(HTTPServiceUnavailableException, lambda: self.nlp.zero_shot_classification(text, candidate_labels)) def test_conversational(self): past_user_inputs = ["Which movie is the best ?"] generated_responses = ["It's Die Hard for sure."] text = "Can you explain why ?" - self.assertEqual( - self.nlp.conversational(text, past_user_inputs, generated_responses), - { - "generated_text": "It's the best movie ever.", - "conversation": { - "past_user_inputs": [ - "Which movie is the best ?", - "Can you explain why ?", - ], - "generated_responses": [ - "It's Die Hard for sure.", - "It's the best movie ever.", - ], + try: + self.assertEqual( + self.nlp.conversational(text, past_user_inputs, generated_responses), + { + "generated_text": "It's the best movie ever.", + "conversation": { + "past_user_inputs": [ + "Which movie is the best ?", + "Can you explain why ?", + ], + "generated_responses": [ + "It's Die Hard for sure.", + "It's the best movie ever.", + ], + }, + "warnings": ["Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation."], }, - "warnings": ["Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation."], - }, - ) + ) + except HTTPServiceUnavailableException: + self.assertRaises(HTTPServiceUnavailableException, lambda: self.nlp.conversational(text, past_user_inputs, generated_responses)) def test_feature_extraction(self): text = "Transformers is an awesome library!" - self.assertEqual(type(self.nlp.feature_extraction(text)), list) + try: + self.assertEqual(type(self.nlp.feature_extraction(text)), list) + except HTTPServiceUnavailableException: + self.assertRaises(HTTPServiceUnavailableException, lambda: self.nlp.feature_extraction(text)) From 0266bf5b1d3159980020964897fc82e2b6dcb822 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 13:22:09 +0530 Subject: [PATCH 19/25] updated the unit test for text generation --- tests/nlp/test_nlp.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/nlp/test_nlp.py b/tests/nlp/test_nlp.py index ba9fc77..15f72c1 100644 --- a/tests/nlp/test_nlp.py +++ b/tests/nlp/test_nlp.py @@ -147,13 +147,10 @@ def test_text_generation(self): self.assertEqual( self.nlp.text_generation(text), [ - {'generated_text': "The answer to the universe is in one's own minds and your " - 'thoughts.\n' - '\n' - 'When the universe is created to bring forth new creation, ' - 'our minds are born anew. The universe is a process ' - 'wherein all things are possible and the universe is one ' - 'created' + {'generated_text': 'The answer to the universe is simple: The answer can be ' + 'proved. The first thing you should do is make an ' + 'appointment with a psychic called S.K. Gupta, a.k.a. ' + 'Shiva (his real name) who has the most' } ] ) From 0defc275d8111d37f7ee14587eb9240e84eda733 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 15:45:57 +0530 Subject: [PATCH 20/25] added a config file for logging --- hugging_py_face/config/logging.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 hugging_py_face/config/logging.yaml diff --git a/hugging_py_face/config/logging.yaml b/hugging_py_face/config/logging.yaml new file mode 100644 index 0000000..dbf6258 --- /dev/null +++ b/hugging_py_face/config/logging.yaml @@ -0,0 +1,13 @@ +version: 1 +formatters: + simple: + format: '%(asctime)s - %(levelname)s - %(message)s' +handlers: + console: + class: logging.StreamHandler + level: INFO + formatter: simple + stream: ext://sys.stdout +root: + level: INFO + handlers: [console] From 663fc1edcf9b1b3905be161c2514ec6c4199980c Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 15:47:30 +0530 Subject: [PATCH 21/25] removed the logger as a property and added a module-level logger for NLP --- hugging_py_face/nlp.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py index e1687e8..8154ad5 100644 --- a/hugging_py_face/nlp.py +++ b/hugging_py_face/nlp.py @@ -3,12 +3,17 @@ import logging import requests import pandas as pd +import logging.config from pandas import DataFrame from typing import Text, List, Dict, Optional, Union from .config_parser import ConfigParser from .exceptions import HTTPServiceUnavailableException +logging_config_parser = ConfigParser('config/logging.yaml') +logging.config.dictConfig(logging_config_parser.get_config_dict()) +logger = logging.getLogger(__name__) + class NLP: def __init__(self, api_token): @@ -17,11 +22,6 @@ def __init__(self, api_token): config_parser = ConfigParser() self.config = config_parser.get_config_dict() - @property - def _logger(self): - logging.basicConfig(level=logging.INFO) - return logging.getLogger(__name__) - def _query(self, inputs: Union[Text, List, Dict], parameters: Optional[Dict] = None, options: Optional[Dict] = None, model: Optional[Text] = None, task: Optional[Text] = None) -> Union[Dict, List]: api_url = f"{self.config['BASE_URL']}/{model if model is not None else self.config['TASK_MODEL_MAP'][task]}" @@ -46,15 +46,15 @@ def _query(self, inputs: Union[Text, List, Dict], parameters: Optional[Dict] = N response = requests.request("POST", api_url, headers=headers, data=json.dumps(data)) if response.status_code == int(self.config['HTTP_SERVICE_UNAVAILABLE']): - self._logger.info(f"Status code: {response.status_code}.") - self._logger.info("Retrying..") + logger.info(f"Status code: {response.status_code}.") + logger.info("Retrying..") time.sleep(1) else: return json.loads(response.content.decode("utf-8")) - self._logger.info(f"Status code: {response.status_code}.") - self._logger.info("Connection to the server failed after reaching maximum retry attempts.") - self._logger.debug(f"Response: {json.loads(response.content.decode('utf-8'))}.") + logger.info(f"Status code: {response.status_code}.") + logger.info("Connection to the server failed after reaching maximum retry attempts.") + logger.debug(f"Response: {json.loads(response.content.decode('utf-8'))}.") raise HTTPServiceUnavailableException("The HTTP service is unavailable.") def _query_in_df(self, df: DataFrame, column: Text, parameters: Optional[Dict] = None, options: Optional[Dict] = None, model: Optional[Text] = None, task: Optional[Text] = None) -> Union[Dict, List]: From 7c626f125cda35b9485642d920c41d93749986bf Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 16:10:24 +0530 Subject: [PATCH 22/25] updated how the logger is used in NLP --- hugging_py_face/nlp.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py index 8154ad5..243bce3 100644 --- a/hugging_py_face/nlp.py +++ b/hugging_py_face/nlp.py @@ -12,7 +12,7 @@ logging_config_parser = ConfigParser('config/logging.yaml') logging.config.dictConfig(logging_config_parser.get_config_dict()) -logger = logging.getLogger(__name__) +logger = logging.getLogger() class NLP: @@ -22,6 +22,8 @@ def __init__(self, api_token): config_parser = ConfigParser() self.config = config_parser.get_config_dict() + self.logger = logger + def _query(self, inputs: Union[Text, List, Dict], parameters: Optional[Dict] = None, options: Optional[Dict] = None, model: Optional[Text] = None, task: Optional[Text] = None) -> Union[Dict, List]: api_url = f"{self.config['BASE_URL']}/{model if model is not None else self.config['TASK_MODEL_MAP'][task]}" @@ -46,15 +48,15 @@ def _query(self, inputs: Union[Text, List, Dict], parameters: Optional[Dict] = N response = requests.request("POST", api_url, headers=headers, data=json.dumps(data)) if response.status_code == int(self.config['HTTP_SERVICE_UNAVAILABLE']): - logger.info(f"Status code: {response.status_code}.") - logger.info("Retrying..") + self.logger.info(f"Status code: {response.status_code}.") + self.logger.info("Retrying..") time.sleep(1) else: return json.loads(response.content.decode("utf-8")) - logger.info(f"Status code: {response.status_code}.") - logger.info("Connection to the server failed after reaching maximum retry attempts.") - logger.debug(f"Response: {json.loads(response.content.decode('utf-8'))}.") + self.logger.info(f"Status code: {response.status_code}.") + self.logger.info("Connection to the server failed after reaching maximum retry attempts.") + self.logger.debug(f"Response: {json.loads(response.content.decode('utf-8'))}.") raise HTTPServiceUnavailableException("The HTTP service is unavailable.") def _query_in_df(self, df: DataFrame, column: Text, parameters: Optional[Dict] = None, options: Optional[Dict] = None, model: Optional[Text] = None, task: Optional[Text] = None) -> Union[Dict, List]: From d5cb3154de89e4de43cbde0530c66e20c29f6c6b Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 16:11:03 +0530 Subject: [PATCH 23/25] added logging and a retry mechanism to MultimediaProcessing --- hugging_py_face/multimedia_processing.py | 28 +++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/hugging_py_face/multimedia_processing.py b/hugging_py_face/multimedia_processing.py index 6e11d61..889b0c2 100644 --- a/hugging_py_face/multimedia_processing.py +++ b/hugging_py_face/multimedia_processing.py @@ -1,7 +1,16 @@ import json +import time +import logging import requests +import logging.config from typing import Text, Dict, List, Optional, Union + from .config_parser import ConfigParser +from .exceptions import HTTPServiceUnavailableException + +logging_config_parser = ConfigParser('config/logging.yaml') +logging.config.dictConfig(logging_config_parser.get_config_dict()) +logger = logging.getLogger() class MultimediaProcessing: @@ -11,6 +20,8 @@ def __init__(self, api_token): config_parser = ConfigParser() self.config = config_parser.get_config_dict() + self.logger = logger + def _query(self, input: Text, model: Optional[Text] = None, task: Optional[Text] = None) -> Union[Dict, List]: api_url = f"{self.config['BASE_URL']}/{model if model is not None else self.config['TASK_MODEL_MAP'][task]}" @@ -28,8 +39,23 @@ def _query(self, input: Text, model: Optional[Text] = None, task: Optional[Text] with open(input, "rb") as f: data = f.read() + retries = 0 + + while retries < self.config['MAX_RETRIES']: + retries += 1 + response = requests.request("POST", api_url, headers=headers, data=data) - return json.loads(response.content.decode("utf-8")) + if response.status_code == int(self.config['HTTP_SERVICE_UNAVAILABLE']): + self.logger.info(f"Status code: {response.status_code}.") + self.logger.info("Retrying..") + time.sleep(1) + else: + return json.loads(response.content.decode("utf-8")) + + self.logger.info(f"Status code: {response.status_code}.") + self.logger.info("Connection to the server failed after reaching maximum retry attempts.") + self.logger.debug(f"Response: {json.loads(response.content.decode('utf-8'))}.") + raise HTTPServiceUnavailableException("The HTTP service is unavailable.") def _query_in_list(self, inputs: List[Text], model: Optional[Text] = None, task: Optional[Text] = None) -> List[Union[Dict, List]]: return [self._query(input, model, task) for input in inputs] From 87fbaeeb00db912d11e589ff9115ef0818a8af55 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 16:17:05 +0530 Subject: [PATCH 24/25] updated the test for the audio classification task --- .../audio_processing/test_audio_processing.py | 20 ++++--------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/tests/audio_processing/test_audio_processing.py b/tests/audio_processing/test_audio_processing.py index ab4ceac..816f35d 100644 --- a/tests/audio_processing/test_audio_processing.py +++ b/tests/audio_processing/test_audio_processing.py @@ -25,21 +25,9 @@ def test_audio_classification(self): self.assertEqual( self.ap.audio_classification(self.inputs), [ - { - 'score': 0.996896505355835, - 'label': 'hap' - }, - { - 'score': 0.0029580998234450817, - 'label': 'sad' - }, - { - 'score': 9.905469050863758e-05, - 'label': 'neu' - }, - { - 'score': 4.624614666681737e-05, - 'label': 'ang' - } + {'label': 'hap', 'score': 0.996896505355835}, + {'label': 'sad', 'score': 0.002958094235509634}, + {'label': 'neu', 'score': 9.905487240757793e-05}, + {'label': 'ang', 'score': 4.624627763405442e-05} ], ) \ No newline at end of file From 3eea4e46ba91bd9b10d188b3b369d41052eb3f01 Mon Sep 17 00:00:00 2001 From: Minura Punchihewa Date: Fri, 28 Apr 2023 18:00:03 +0530 Subject: [PATCH 25/25] updated the package version --- hugging_py_face/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hugging_py_face/__about__.py b/hugging_py_face/__about__.py index cabb2d7..eb0939c 100644 --- a/hugging_py_face/__about__.py +++ b/hugging_py_face/__about__.py @@ -1,6 +1,6 @@ __title__ = 'hugging_py_face' __package_name__ = 'hugging_py_face' -__version__ = '0.1.1' +__version__ = '0.2.0' __description__ = "Hugging-Py-Face, the Python client for the Hugging Face Inference API." __email__ = "minurapunchihewa17@gmail.com" __author__ = 'Minura Punchihewa'