Merge pull request #6 from MinuraPunchihewa/development

[PR] dev to main: supported additional NLP tasks, added logging and improved tests
MinuraPunchihewa · Apr 28, 2023 · ab38920 · ab38920
2 parents e09ade8 + 3eea4e4
commit ab38920
Show file tree

Hide file tree

Showing 9 changed files with 395 additions and 129 deletions.
diff --git a/hugging_py_face/__about__.py b/hugging_py_face/__about__.py
@@ -1,6 +1,6 @@
 __title__ = 'hugging_py_face'
 __package_name__ = 'hugging_py_face'
-__version__ = '0.1.1'
+__version__ = '0.2.0'
 __description__ = "Hugging-Py-Face, the Python client for the Hugging Face Inference API."
 __email__ = "[email protected]"
 __author__ = 'Minura Punchihewa'

diff --git a/hugging_py_face/config/config.yaml b/hugging_py_face/config/config.yaml
@@ -3,6 +3,7 @@ TASK_MODEL_MAP:
   fill-mask: bert-base-uncased
   summarization: facebook/bart-large-cnn
   question-answering: deepset/roberta-base-squad2
+  table-question-answering: google/tapas-base-finetuned-wtq
   sentence-similarity: sentence-transformers/all-MiniLM-L6-v2
   text-classification: distilbert-base-uncased-finetuned-sst-2-english
   text-generation: gpt2
@@ -12,4 +13,6 @@ TASK_MODEL_MAP:
   image-classification: google/vit-base-patch16-224
   object-detection: facebook/detr-resnet-50
   speech-recognition: facebook/wav2vec2-base-960h
-  audio-classification: superb/hubert-large-superb-er
+  audio-classification: superb/hubert-large-superb-er
+MAX_RETRIES: 5
+HTTP_SERVICE_UNAVAILABLE: 503
diff --git a/hugging_py_face/config/logging.yaml b/hugging_py_face/config/logging.yaml
@@ -0,0 +1,13 @@
+# Logging setup in the dictionary schema consumed by logging.config.dictConfig.
+version: 1
+formatters:
+  simple:
+    # Rendered as: timestamp - level name - message.
+    format: '%(asctime)s - %(levelname)s - %(message)s'
+handlers:
+  console:
+    class: logging.StreamHandler
+    level: INFO
+    formatter: simple
+    # The ext:// prefix tells dictConfig to resolve the name to the sys.stdout object.
+    stream: ext://sys.stdout
+root:
+  # The root logger and the console handler are both at INFO, so logger.debug(...) calls emit nothing.
+  level: INFO
+  handlers: [console]
diff --git a/hugging_py_face/exceptions.py b/hugging_py_face/exceptions.py
@@ -0,0 +1,3 @@
+
+class HTTPServiceUnavailableException(Exception):
+    """Signals that the Inference API answered with the service-unavailable HTTP status."""
diff --git a/hugging_py_face/multimedia_processing.py b/hugging_py_face/multimedia_processing.py
@@ -1,7 +1,16 @@
 import json
+import time
+import logging
 import requests
+import logging.config
 from typing import Text, Dict, List, Optional, Union
+
 from .config_parser import ConfigParser
+from .exceptions import HTTPServiceUnavailableException
+
+logging_config_parser = ConfigParser('config/logging.yaml')
+logging.config.dictConfig(logging_config_parser.get_config_dict())
+logger = logging.getLogger()
 
 
 class MultimediaProcessing:
@@ -11,6 +20,8 @@ def __init__(self, api_token):
         config_parser = ConfigParser()
         self.config = config_parser.get_config_dict()
 
+        self.logger = logger
+
     def _query(self, input: Text, model: Optional[Text] = None, task: Optional[Text] = None) -> Union[Dict, List]:
         api_url = f"{self.config['BASE_URL']}/{model if model is not None else self.config['TASK_MODEL_MAP'][task]}"
 
@@ -28,8 +39,23 @@ def _query(self, input: Text, model: Optional[Text] = None, task: Optional[Text]
             with open(input, "rb") as f:
                 data = f.read()
 
+        retries = 0
+
+        while retries < self.config['MAX_RETRIES']:
+            retries += 1
+
         response = requests.request("POST", api_url, headers=headers, data=data)
-        return json.loads(response.content.decode("utf-8"))
+        if response.status_code == int(self.config['HTTP_SERVICE_UNAVAILABLE']):
+            self.logger.info(f"Status code: {response.status_code}.")
+            self.logger.info("Retrying..")
+            time.sleep(1)
+        else:
+            return json.loads(response.content.decode("utf-8"))
+
+        self.logger.info(f"Status code: {response.status_code}.")
+        self.logger.info("Connection to the server failed after reaching maximum retry attempts.")
+        self.logger.debug(f"Response: {json.loads(response.content.decode('utf-8'))}.")
+        raise HTTPServiceUnavailableException("The HTTP service is unavailable.")
 
     def _query_in_list(self, inputs: List[Text], model: Optional[Text] = None, task: Optional[Text] = None) -> List[Union[Dict, List]]:
         return [self._query(input, model, task) for input in inputs]

diff --git a/hugging_py_face/nlp.py b/hugging_py_face/nlp.py
@@ -1,9 +1,18 @@
 import json
+import time
+import logging
 import requests
+import pandas as pd
+import logging.config
 from pandas import DataFrame
 from typing import Text, List, Dict, Optional, Union
 
 from .config_parser import ConfigParser
+from .exceptions import HTTPServiceUnavailableException
+
+logging_config_parser = ConfigParser('config/logging.yaml')
+logging.config.dictConfig(logging_config_parser.get_config_dict())
+logger = logging.getLogger()
 
 
 class NLP:
@@ -13,6 +22,8 @@ def __init__(self, api_token):
         config_parser = ConfigParser()
         self.config = config_parser.get_config_dict()
 
+        self.logger = logger
+
     def _query(self, inputs: Union[Text, List, Dict], parameters: Optional[Dict] = None, options: Optional[Dict] = None, model: Optional[Text] = None, task: Optional[Text] = None) -> Union[Dict, List]:
         api_url = f"{self.config['BASE_URL']}/{model if model is not None else self.config['TASK_MODEL_MAP'][task]}"
 
@@ -30,8 +41,23 @@ def _query(self, inputs: Union[Text, List, Dict], parameters: Optional[Dict] = N
         if options is not None:
             data['options'] = options
 
-        response = requests.request("POST", api_url, headers=headers, data=json.dumps(data))
-        return json.loads(response.content.decode("utf-8"))
+        retries = 0
+
+        while retries < self.config['MAX_RETRIES']:
+            retries += 1
+
+            response = requests.request("POST", api_url, headers=headers, data=json.dumps(data))
+            if response.status_code == int(self.config['HTTP_SERVICE_UNAVAILABLE']):
+                self.logger.info(f"Status code: {response.status_code}.")
+                self.logger.info("Retrying..")
+                time.sleep(1)
+            else:
+                return json.loads(response.content.decode("utf-8"))
+
+        self.logger.info(f"Status code: {response.status_code}.")
+        self.logger.info("Connection to the server failed after reaching maximum retry attempts.")
+        self.logger.debug(f"Response: {json.loads(response.content.decode('utf-8'))}.")
+        raise HTTPServiceUnavailableException("The HTTP service is unavailable.")
 
     def _query_in_df(self, df: DataFrame, column: Text, parameters: Optional[Dict] = None, options: Optional[Dict] = None, model: Optional[Text] = None, task: Optional[Text] = None) -> Union[Dict, List]:
         return self._query(df[column].tolist(), parameters, options, model, task)
@@ -111,6 +137,66 @@ def question_answering(self, question: Text, context: Text, model: Optional[Text
             task='question-answering'
         )
 
+    def question_answering_in_df(self, df: DataFrame, question_column: Text, context_column: Text, model: Optional[Text] = None) -> DataFrame:
+        """
+        Answer every question in a DataFrame using the context found on the same row.
+
+        One request is issued per row, so the number of API calls equals the number of rows.
+
+        :param df: a pandas DataFrame holding the questions and their contexts. It is modified in place.
+        :param question_column: the name of the column holding the questions.
+        :param context_column: the name of the column holding the context for each question.
+        :param model: the model to use for the question answering task. If not provided, the model mapped to 'question-answering' in the package configuration is used.
+        :return: the same DataFrame, with the answers stored in a new column called 'predictions'.
+        """
+        def ask(record):
+            # Only the 'answer' field of each response is kept.
+            reply = self._query(
+                {"question": record[question_column], "context": record[context_column]},
+                model=model,
+                task='question-answering',
+            )
+            return reply['answer']
+
+        df['predictions'] = [ask(record) for _, record in df.iterrows()]
+        return df
+
+    def table_question_answering(self, question: Union[Text, List], table: Dict[Text, List], options: Optional[Dict] = None, model: Optional[Text] = None) -> List:
+        """
+        Ask one or more questions about a table of data.
+
+        :param question: a string or a list of strings of the question(s) to be answered.
+        :param table: a dict of lists representing a table of data (presumably column name -> cell values; confirm against the API documentation).
+        :param options: a dict of options. For more information, see the `detailed parameters for the table question answering task <https://huggingface.co/docs/api-inference/detailed_parameters#table-question-answering-task>`_.
+        :param model: the model to use for the table question answering task. If not provided, the model mapped to 'table-question-answering' in the package configuration is used.
+        :return: a dict or a list of dicts of the answers.
+        """
+        payload = {"query": question, "table": table}
+        return self._query(payload, options=options, model=model, task='table-question-answering')
+
+    def table_question_answering_task_in_df(self, df: DataFrame, question: Union[Text, List], options: Optional[Dict] = None, model: Optional[Text] = None) -> DataFrame:
+        """
+        Ask one or more questions about the data held in a DataFrame.
+
+        The DataFrame is sent as the table (converted with ``df.to_dict('list')``) and is not modified.
+
+        :param df: a pandas DataFrame holding the table to be queried.
+        :param question: a string or a list of strings of the question(s) to be answered.
+        :param options: a dict of options. For more information, see the `detailed parameters for the table question answering task <https://huggingface.co/docs/api-inference/detailed_parameters#table-question-answering-task>`_.
+        :param model: the model to use for the table question answering task. If not provided, the model mapped to 'table-question-answering' in the package configuration is used.
+        :return: a new pandas DataFrame with one row per question: the question in 'question' and its answer in 'predictions'.
+        """
+        answers = self._query(
+            {
+                "query": question,
+                "table": df.to_dict('list')
+            },
+            options=options,
+            model=model,
+            task='table-question-answering'
+        )
+
+        # A single question is answered with one dict instead of a list of dicts;
+        # iterating that dict would walk its keys and fail on answer['answer'].
+        if isinstance(answers, dict):
+            answers = [answers]
+        questions = [question] if isinstance(question, str) else list(question)
+
+        return pd.DataFrame({
+            "question": questions,
+            "predictions": [answer['answer'] for answer in answers]
+        })
+
     def sentence_similarity(self, source_sentence: Text, sentences: List, options: Optional[Dict] = None, model: Optional[Text] = None) -> List:
         """
         Calculate the semantic similarity between one text and a list of other sentences by comparing their embeddings.
@@ -131,6 +217,32 @@ def sentence_similarity(self, source_sentence: Text, sentences: List, options: O
             task='sentence-similarity'
         )
 
+    def sentence_similarity_in_df(self, df: DataFrame, source_sentence_column: Text, sentence_column: Text, options: Optional[Dict] = None, model: Optional[Text] = None) -> DataFrame:
+        """
+        Calculate the semantic similarity between sentences in two columns by comparing their embeddings.
+
+        One request is issued per row, comparing that row's source sentence with that row's sentence.
+
+        :param df: a pandas DataFrame containing the source sentences and the sentences to be compared against. It is modified in place.
+        :param source_sentence_column: the column containing the strings that you wish to compare the other strings with.
+        :param sentence_column: the column containing the strings which will be compared against the source_sentence.
+        :param options: a dict of options. For more information, see the `detailed parameters for the sentence similarity task <https://huggingface.co/docs/api-inference/detailed_parameters#sentence-similarity-task>`_.
+        :param model: the model to use for the sentence similarity task. If not provided, the recommended model from Hugging Face will be used.
+        :return: a pandas DataFrame with the similarity scores for the sentences. The scores will be added as a new column called 'predictions' to the original DataFrame.
+        """
+        scores = []
+        for index, row in df.iterrows():
+            score = self._query(
+                {
+                    "source_sentence": row[source_sentence_column],
+                    "sentences": [row[sentence_column]]
+                },
+                # Forward the caller's options; they were previously accepted but silently dropped.
+                options=options,
+                model=model,
+                task='sentence-similarity'
+            )
+            # Exactly one sentence is sent per request, so the first score is the only one.
+            scores.append(score[0])
+
+        df['predictions'] = scores
+        return df
+
     def text_classification(self, text: Union[Text, List], options: Optional[Dict] = None, model: Optional[Text] = None) -> Union[Dict, List]:
         """
         Analyze the sentiment of a string or a list of strings.
@@ -204,6 +316,22 @@ def zero_shot_classification(self, text: Union[Text, List], candidate_labels: Li
             task='zero-shot-classification'
         )
 
+    def zero_shot_classification_in_df(self, df: DataFrame, column: Text, candidate_labels: List, parameters: Optional[Dict] = None, options: Optional[Dict] = None, model: Optional[Text] = None) -> DataFrame:
+        """
+        Classify the strings in a DataFrame column against a list of candidate labels.
+
+        :param df: a pandas DataFrame containing the strings to be classified. It is modified in place.
+        :param column: the column containing the strings to be classified.
+        :param candidate_labels: a list of strings that are potential classes for inputs.
+        :param parameters: a dict of parameters excluding candidate_labels which is passed in as a separate argument. The dict is copied, never modified. For more information, see the `detailed parameters for the zero shot classification task <https://huggingface.co/docs/api-inference/detailed_parameters#zeroshot-classification-task>`_.
+        :param options: a dict of options. For more information, see the `detailed parameters for the zero shot classification task <https://huggingface.co/docs/api-inference/detailed_parameters#zeroshot-classification-task>`_.
+        :param model: the model to use for the zero shot classification task. If not provided, the recommended model from Hugging Face will be used.
+        :return: a pandas DataFrame with the classifications. The first label of each prediction is added as a new column called 'predictions' to the original DataFrame.
+        """
+        # Work on a private copy: a shared default dict or the caller's own dict
+        # must not pick up the candidate labels of this call.
+        parameters = dict(parameters) if parameters is not None else {}
+        parameters['candidate_labels'] = candidate_labels
+        predictions = self._query_in_df(df, column, parameters=parameters, options=options, model=model, task='zero-shot-classification')
+        df['predictions'] = [prediction['labels'][0] for prediction in predictions]
+        return df
+
     def conversational(self, text: Union[Text, List], past_user_inputs: Optional[List] = None, generated_responses: Optional[List] = None, parameters: Optional[Dict] = None, options: Optional[Dict] = None, model: Optional[Text] = None) -> Union[Dict, List]:
         """
         Corresponds to any chatbot like structure: pass in some text along with the past_user_inputs and generated_responses to receive a response.

diff --git a/tests/audio_processing/test_audio_processing.py b/tests/audio_processing/test_audio_processing.py
@@ -25,21 +25,9 @@ def test_audio_classification(self):
         self.assertEqual(
             self.ap.audio_classification(self.inputs),
             [
-                {
-                    'score': 0.996896505355835,
-                    'label': 'hap'
-                },
-                {
-                    'score': 0.0029580998234450817,
-                    'label': 'sad'
-                },
-                {
-                    'score': 9.905469050863758e-05,
-                    'label': 'neu'
-                },
-                {
-                    'score': 4.624614666681737e-05,
-                    'label': 'ang'
-                }
+                {'label': 'hap', 'score': 0.996896505355835},
+                {'label': 'sad', 'score': 0.002958094235509634},
+                {'label': 'neu', 'score': 9.905487240757793e-05},
+                {'label': 'ang', 'score': 4.624627763405442e-05}
             ],
         )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@

		class HTTPServiceUnavailableException(Exception):
		pass