From 3889948944e6d62871c1a2032f531bb8ae149cd4 Mon Sep 17 00:00:00 2001 From: Salt0095 <2022210095@bupt.cn> Date: Fri, 20 Dec 2024 00:53:51 +0800 Subject: [PATCH] Add llmrec prompt construction files --- .../gpt_i_attribute_generate_aug.py | 547 ++++++++++++++++++ .../gpt_ui_aug.py | 305 ++++++++++ .../gpt_user_profiling.py | 458 +++++++++++++++ 3 files changed, 1310 insertions(+) create mode 100644 examples/llmrec/LLM_augmentation_construct_prompt/gpt_i_attribute_generate_aug.py create mode 100644 examples/llmrec/LLM_augmentation_construct_prompt/gpt_ui_aug.py create mode 100644 examples/llmrec/LLM_augmentation_construct_prompt/gpt_user_profiling.py diff --git a/examples/llmrec/LLM_augmentation_construct_prompt/gpt_i_attribute_generate_aug.py b/examples/llmrec/LLM_augmentation_construct_prompt/gpt_i_attribute_generate_aug.py new file mode 100644 index 00000000..607f7b2b --- /dev/null +++ b/examples/llmrec/LLM_augmentation_construct_prompt/gpt_i_attribute_generate_aug.py @@ -0,0 +1,547 @@ + +import threading +import openai +import time +import pandas as pd +import pickle +import os +import numpy as np +import torch + +# openai.api_key = "" +openai.api_key = "" + +import requests + +file_path = "" + + +# # MovieLens +# def construct_prompting(item_attribute, indices): +# # pre string +# pre_string = "You are now a search engines, and required to provide the inquired information of the given movies bellow:\n" +# # make item list +# item_list_string = "" +# for index in indices: +# title = item_attribute['title'][index] +# genre = item_attribute['genre'][index] +# item_list_string += "[" +# item_list_string += str(index) +# item_list_string += "] " +# item_list_string += title + ", " +# item_list_string += genre + "\n" +# # output format +# output_format = "The inquired information is : director, country, language.\nAnd please output them in form of: \ndirector::country::language\nplease output only the content in the form above, i.e., director::country::language\n, 
but no other thing else, no reasoning, no index.\n\n" +# # make prompt +# prompt = pre_string + item_list_string + output_format +# return prompt + +# Netflix +def construct_prompting(item_attribute, indices): + # pre string + pre_string = "You are now a search engines, and required to provide the inquired information of the given movies bellow:\n" + # make item list + item_list_string = "" + for index in indices: + year = item_attribute['year'][index] + title = item_attribute['title'][index] + item_list_string += "[" + item_list_string += str(index) + item_list_string += "] " + item_list_string += str(year) + ", " + item_list_string += title + "\n" + # output format + output_format = "The inquired information is : director, country, language.\nAnd please output them in form of: \ndirector::country::language\nplease output only the content in the form above, i.e., director::country::language\n, but no other thing else, no reasoning, no index.\n\n" + # make prompt + prompt = pre_string + item_list_string + output_format + return prompt + + +# def file_reading(): +# augmented_attribute_dict = pickle.load(open(file_path + 'augmented_attribute_dict','rb')) +# return augmented_attribute_dict + +# ## baidu attribute generate +# # error_cnt = 0 +# global error_cnt +# error_cnt = 0 +# def LLM_request(toy_item_attribute, indices, model_type, augmented_attribute_dict, error_cnt): + +# # try: +# # augmented_attribute_dict = file_reading() +# # except pickle.UnpicklingError as e: +# # # Handle the unpickling error +# # # time.sleep(0.001) +# # # augmented_attribute_dict = file_reading() +# # print("Error occurred while unpickling:", e) +# # # return +# # LLM_request(toy_item_attribute, indices, model_type, augmented_attribute_dict, error_cnt) + +# if indices[0] in augmented_attribute_dict: +# return 0 +# else: +# try: +# print(f"{indices}") +# prompt = construct_prompting(toy_item_attribute, indices) +# url = "http://llms-se.baidu-int.com:8200/chat/completions" +# headers={ +# 
# "Content-Type": "application/json", +# # "Authorization": "Bearer your key" +# +# } +# params={ +# "model": model_type, +# "messages": [{"role": "user", "content": prompt}], +# "temperature":1, +# "max_tokens": 1000, +# "stream": False, +# "top_p": 0.1 +# } + +# response = requests.post(url=url, headers=headers,json=params) +# message = response.json() + +# content = message['choices'][0]['message']['content'] +# print(f"content: {content}, model_type: {model_type}") + +# rows = content.strip().split("\n") # Split the content into rows +# for i,row in enumerate(rows): +# elements = row.split("::") # Split each row into elements using "::" as the delimiter +# director = elements[0] +# country = elements[1] +# language = elements[2] +# augmented_attribute_dict[indices[i]] = {} +# augmented_attribute_dict[indices[i]][0] = director +# augmented_attribute_dict[indices[i]][1] = country +# augmented_attribute_dict[indices[i]][2] = language +# # pickle.dump(augmented_sample_dict, open('augmented_sample_dict','wb')) +# pickle.dump(augmented_attribute_dict, open(file_path + 'augmented_attribute_dict','wb')) +# # time.sleep(5) + +# error_cnt = 0 +# # except ValueError as e: +# except requests.exceptions.RequestException as e: +# print("An HTTP error occurred:", str(e)) +# time.sleep(25) +# # print(content) +# error_cnt += 1 +# if error_cnt==5: +# return 1 +# LLM_request(toy_item_attribute, indices, model_type, augmented_attribute_dict, error_cnt) +# except ValueError as ve: +# print("ValueError error occurred while parsing the response:", str(ve)) +# time.sleep(25) +# error_cnt += 1 +# if error_cnt==5: +# return 1 +# # print(content) +# LLM_request(toy_item_attribute, indices, model_type, augmented_attribute_dict, error_cnt) +# except KeyError as ke: +# print("KeyError error occurred while accessing the response:", str(ke)) +# time.sleep(25) +# error_cnt += 1 +# if error_cnt==5: +# return 1 +# # print(content) +# LLM_request(toy_item_attribute, indices, model_type, 
augmented_attribute_dict, error_cnt) +# except IndexError as ke: +# print("IndexError error occurred while accessing the response:", str(ke)) +# time.sleep(25) +# # error_cnt += 1 +# # if error_cnt==5: +# # return 1 +# # # print(content) +# LLM_request(toy_item_attribute, indices, model_type, augmented_attribute_dict, error_cnt) +# return 1 +# except Exception as ex: +# print("An unknown error occurred:", str(ex)) +# time.sleep(25) +# error_cnt += 1 +# if error_cnt==5: +# return 1 +# # print(content) +# LLM_request(toy_item_attribute, indices, model_type, augmented_attribute_dict, error_cnt) +# return 1 + + +### chatgpt attribute generate +def LLM_request(toy_item_attribute, indices, model_type, augmented_attribute_dict, error_cnt): + if indices[0] in augmented_attribute_dict: + return 0 + else: + try: + print(f"{indices}") + prompt = construct_prompting(toy_item_attribute, indices) + url = "https://api.openai.com/v1/completions" + headers={ + # "Content-Type": "application/json", + "Authorization": "Bearer your key" + } + + params={ + "model": "text-davinci-003", + "prompt": prompt, + "max_tokens": 1024, + "temperature": 0.6, + "stream": False, + } + + response = requests.post(url=url, headers=headers,json=params) + message = response.json() + + content = message['choices'][0]['text'] + print(f"content: {content}, model_type: {model_type}") + + rows = content.strip().split("\n") # Split the content into rows + for i,row in enumerate(rows): + elements = row.split("::") # Split each row into elements using "::" as the delimiter + director = elements[0] + country = elements[1] + language = elements[2] + augmented_attribute_dict[indices[i]] = {} + augmented_attribute_dict[indices[i]][0] = director + augmented_attribute_dict[indices[i]][1] = country + augmented_attribute_dict[indices[i]][2] = language + # pickle.dump(augmented_sample_dict, open('augmented_sample_dict','wb')) + pickle.dump(augmented_attribute_dict, open(file_path + 'augmented_attribute_dict','wb')) + + 
# except ValueError as e: + except requests.exceptions.RequestException as e: + print("An HTTP error occurred:", str(e)) + # time.sleep(25) + # print(content) + error_cnt += 1 + if error_cnt==5: + return 1 + LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) + except ValueError as ve: + print("ValueError error occurred while parsing the response:", str(ve)) + # time.sleep(25) + error_cnt += 1 + if error_cnt==5: + return 1 + # print(content) + LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) + except KeyError as ke: + print("KeyError error occurred while accessing the response:", str(ke)) + # time.sleep(25) + error_cnt += 1 + if error_cnt==5: + return 1 + # print(content) + LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) + except IndexError as ke: + print("IndexError error occurred while accessing the response:", str(ke)) + # time.sleep(25) + # error_cnt += 1 + # if error_cnt==5: + # return 1 + # # print(content) + # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict) + return 1 + except Exception as ex: + print("An unknown error occurred:", str(ex)) + # time.sleep(25) + error_cnt += 1 + if error_cnt==5: + return 1 + # print(content) + LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) + return 1 + + + + + + + +### chatgpt attribute embedding +def LLM_request(toy_augmented_item_attribute, indices, model_type, augmented_atttribute_embedding_dict, error_cnt): + for value in augmented_atttribute_embedding_dict.keys(): + print(value) + if indices[0] in augmented_atttribute_embedding_dict[value]: + # return 0 + continue + else: + try: + print(f"{indices}") + # prompt = construct_prompting(toy_item_attribute, indices) + url = "https://api.openai.com/v1/embeddings" + headers={ + # "Content-Type": "application/json", + "Authorization": 
"Bearer your key" + } + params={ + "model": "text-embedding-ada-002", + "input": toy_augmented_item_attribute[value][indices].values[0] + } + + response = requests.post(url=url, headers=headers,json=params) + message = response.json() + + content = message['data'][0]['embedding'] + + augmented_atttribute_embedding_dict[value][indices[0]] = content + # pickle.dump(augmented_sample_dict, open('augmented_sample_dict','wb')) + pickle.dump(augmented_atttribute_embedding_dict, open(file_path + 'augmented_atttribute_embedding_dict','wb')) + + # except ValueError as e: + except requests.exceptions.RequestException as e: + print("An HTTP error occurred:", str(e)) + time.sleep(5) + # print(content) + LLM_request(toy_augmented_item_attribute, indices, "text-embedding-ada-002", augmented_atttribute_embedding_dict, error_cnt) + except ValueError as ve: + print("An error occurred while parsing the response:", str(ve)) + time.sleep(5) + # print(content) + LLM_request(toy_augmented_item_attribute, indices, "text-embedding-ada-002", augmented_atttribute_embedding_dict, error_cnt) + except KeyError as ke: + print("An error occurred while accessing the response:", str(ke)) + time.sleep(5) + # print(content) + LLM_request(toy_augmented_item_attribute, indices, "text-embedding-ada-002", augmented_atttribute_embedding_dict, error_cnt) + except Exception as ex: + print("An unknown error occurred:", str(ex)) + time.sleep(5) + # print(content) + LLM_request(toy_augmented_item_attribute, indices, "text-embedding-ada-002", augmented_atttribute_embedding_dict, error_cnt) + # return 1 + + + + + + + + + + +def file_reading(): + augmented_atttribute_embedding_dict = pickle.load(open(file_path + 'augmented_atttribute_embedding_dict','rb')) + return augmented_atttribute_embedding_dict + +### baidu attribute embedding +def LLM_request(toy_augmented_item_attribute, indices, model_type, augmented_atttribute_embedding_dict, error_cnt, key, file_name): + for value in 
augmented_atttribute_embedding_dict.keys(): + if indices[0] in augmented_atttribute_embedding_dict[value]: + # return 0 + continue + else: + try: + print(f"{indices}") + print(value) + + ### chatgpt ############################################################################################################################# + # prompt = construct_prompting(toy_item_attribute, indices) + url = "https://api.openai.com/v1/embeddings" + headers={ + # "Content-Type": "application/json", + "Authorization": "Bearer your key" + } + ### chatgpt ############################################################################################################################# + + + params={ + "model": "text-embedding-ada-002", + "input": str(toy_augmented_item_attribute[value][indices].values[0]) + } + response = requests.post(url=url, headers=headers,json=params) + message = response.json() + + content = message['data'][0]['embedding'] + + augmented_atttribute_embedding_dict[value][indices[0]] = content + pickle.dump(augmented_atttribute_embedding_dict, open(file_path + file_name,'wb')) + + # except ValueError as e: + except requests.exceptions.RequestException as e: + print("An HTTP error occurred:", str(e)) + time.sleep(5) + # print(content) + LLM_request(toy_augmented_item_attribute, indices, "text-embedding-ada-002", augmented_atttribute_embedding_dict, error_cnt, key, file_name) + except ValueError as ve: + print("An error occurred while parsing the response:", str(ve)) + time.sleep(5) + # print(content) + LLM_request(toy_augmented_item_attribute, indices, "text-embedding-ada-002", augmented_atttribute_embedding_dict, error_cnt, key, file_name) + except KeyError as ke: + print("An error occurred while accessing the response:", str(ke)) + time.sleep(5) + # print(content) + LLM_request(toy_augmented_item_attribute, indices, "text-embedding-ada-002", augmented_atttribute_embedding_dict, error_cnt, key, file_name) + except Exception as ex: + print("An unknown error occurred:", 
str(ex)) + time.sleep(5) + # print(content) + LLM_request(toy_augmented_item_attribute, indices, "text-embedding-ada-002", augmented_atttribute_embedding_dict, error_cnt, key, file_name) + # return 1 + + + + +# error_cnt = 0 +# ############################# step 1: built item attribute ########################################################## +# ### write augmented dict +# augmented_attribute_dict = {} +# if os.path.exists(file_path + "augmented_attribute_dict"): +# print(f"The file augmented_attribute_dict exists.") +# augmented_attribute_dict = pickle.load(open(file_path + 'augmented_attribute_dict','rb')) +# else: +# print(f"The file augmented_attribute_dict does not exist.") +# pickle.dump(augmented_attribute_dict, open(file_path + 'augmented_attribute_dict','wb')) + +# ### read item attribute file +# # toy_item_attribute = pd.read_csv(file_path + 'item_attribute.csv', names=['id','title', 'genre']) +# toy_item_attribute = pd.read_csv(file_path + 'item_attribute.csv', names=['id','year', 'title']) + +# for i in range(0, toy_item_attribute.shape[0], 1): +# batch_start = i +# batch_end = min(i + 1, toy_item_attribute.shape[0]) +# indices = list(range(batch_start, batch_end)) +# print(f"###i###: {i}") +# print(f"#######: {indices}") +# re = LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-16k", augmented_attribute_dict, error_cnt) +# # if re: +# # time.sleep(0.3) + +# # # +# # for i in range(4189, toy_item_attribute.shape[0], 1): +# # batch_start = i +# # batch_end = min(i + 1, toy_item_attribute.shape[0]) +# # indices = list(range(batch_start, batch_end)) +# # print(f"###i###: {i}") +# # print(f"#######: {indices}") +# # re = LLM_request(toy_item_attribute, indices, "gpt-4", augmented_attribute_dict, error_cnt) +# # # # if re: +# # time.sleep(1) +# # "gpt-3.5-turbo", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k" +# ############################# step 1: built item attribute ########################################################## + + + +# 
############################# step 2: generate new csv ########################################################## +# import pandas as pd +# # raw_item_attribute = pd.read_csv(file_path + 'item_attribute.csv', names=['id','title','genre']) +# raw_item_attribute = pd.read_csv(file_path + 'item_attribute_filter.csv', names=['id','year','title']) +# augmented_attribute_dict = pickle.load(open(file_path + 'augmented_attribute_dict','rb')) +# director_list, country_list, language_list = [], [], [] +# for i in range(len(augmented_attribute_dict)): +# director_list.append(augmented_attribute_dict[i][0]) +# country_list.append(augmented_attribute_dict[i][1]) +# language_list.append(augmented_attribute_dict[i][2]) +# director_series = pd.Series(director_list) +# country_series = pd.Series(country_list) +# language_series = pd.Series(language_list) +# raw_item_attribute['director'] = director_series +# raw_item_attribute['country'] = country_series +# raw_item_attribute['language'] = language_series +# raw_item_attribute.to_csv(file_path + 'augmented_item_attribute_agg.csv', index=False, header=None) +# ############################# step 2: generate new csv ########################################################## + + +# ############################# step 3: generate item atttribute embedding ########################################################## +# ### write augmented dict +# # emb_dict_name = ['title_embedding_dict', 'genre_embedding_dict', 'director_embedding_dict', 'country_embedding_dict', 'language_embedding_dict'] # TODO: add total +# emb_dict_name = ['year_embedding_dict', 'title_embedding_dict', 'director_embedding_dict', 'country_embedding_dict', 'language_embedding_dict'] # TODO: add total +# title_embedding_dict, genre_embedding_dict, director_embedding_dict, country_embedding_dict, language_embedding_dict = {}, {}, {}, {}, {} +# # augmented_atttribute_embedding_dict = {'title':title_embedding_dict, 'genre':genre_embedding_dict, 
'director':director_embedding_dict, 'country':country_embedding_dict, 'language':language_embedding_dict} +# augmented_atttribute_embedding_dict = {'year':genre_embedding_dict, 'title':title_embedding_dict, 'director':director_embedding_dict, 'country':country_embedding_dict, 'language':language_embedding_dict} + +# augmented_atttribute_embedding_dict1 = augmented_atttribute_embedding_dict2 = augmented_atttribute_embedding_dict3 = augmented_atttribute_embedding_dict4 = augmented_atttribute_embedding_dict5 = augmented_atttribute_embedding_dict6 = augmented_atttribute_embedding_dict7 = augmented_atttribute_embedding_dict8 = augmented_atttribute_embedding_dict9 = augmented_atttribute_embedding_dict10 = augmented_atttribute_embedding_dict11 = augmented_atttribute_embedding_dict12 = augmented_atttribute_embedding_dict13 = augmented_atttribute_embedding_dict14 = augmented_atttribute_embedding_dict15 = augmented_atttribute_embedding_dict16 = augmented_atttribute_embedding_dict17 = augmented_atttribute_embedding_dict18 = augmented_atttribute_embedding_dict19 = augmented_atttribute_embedding_dict20 = augmented_atttribute_embedding_dict21 = augmented_atttribute_embedding_dict22 = augmented_atttribute_embedding_dict23 = augmented_atttribute_embedding_dict24 = augmented_atttribute_embedding_dict25 = augmented_atttribute_embedding_dict26 = augmented_atttribute_embedding_dict27 = augmented_atttribute_embedding_dict28 = augmented_atttribute_embedding_dict29 = augmented_atttribute_embedding_dict30 = augmented_atttribute_embedding_dict31 = augmented_atttribute_embedding_dict32 = augmented_atttribute_embedding_dict33 = augmented_atttribute_embedding_dict34 = augmented_atttribute_embedding_dict35 = augmented_atttribute_embedding_dict +# # if os.path.exists(file_path + "augmented_atttribute_embedding_dict"): +# # print(f"The file augmented_atttribute_embedding_dict exists.") +# # augmented_atttribute_embedding_dict = pickle.load(open(file_path + 
'augmented_atttribute_embedding_dict','rb')) +# # else: +# # print(f"The file augmented_atttribute_embedding_dict does not exist.") +# # pickle.dump(augmented_atttribute_embedding_dict, open(file_path + 'augmented_atttribute_embedding_dict','wb')) + +# file_name = "augmented_atttribute_embedding_dict12" +# if os.path.exists(file_path + file_name): +# print(f"The file augmented_atttribute_embedding_dict exists.") +# augmented_atttribute_embedding_dict = pickle.load(open(file_path + file_name,'rb')) +# else: +# print(f"The file augmented_atttribute_embedding_dict does not exist.") +# pickle.dump(augmented_atttribute_embedding_dict, open(file_path + file_name,'wb')) + + + +# error_cnt=0 +# ### read augmented item attribute file +# # toy_augmented_item_attribute = pd.read_csv(file_path + 'augmented_item_attribute_agg.csv', names=['id','title', 'genre', 'director', 'country', 'language']) +# toy_augmented_item_attribute = pd.read_csv(file_path + 'augmented_item_attribute_agg.csv', names=['id', 'year','title', 'director', 'country', 'language']) + + +# g_key = "" + +# for i in range(5500, 6000, 1): +# batch_start = i +# batch_end = min(i + 1, toy_augmented_item_attribute.shape[0]) +# indices = list(range(batch_start, batch_end)) +# # print(f"###i###: {i}") +# print(f"#######: {indices}") +# LLM_request(toy_augmented_item_attribute, indices, "text-embedding-ada-002", augmented_atttribute_embedding_dict, error_cnt, g_key, file_name) + + +# # ### get separate embedding matrix +# # import pandas as pd +# # augmented_atttribute_embedding_dict = pickle.load(open(file_path + 'augmented_atttribute_embedding_dict','rb')) +# # for value in augmented_atttribute_embedding_dict.keys(): +# # augmented_atttribute_embedding_dict[value] = np.array(augmented_atttribute_embedding_dict[value]) + + +# # raw_item_attribute = pd.read_csv(file_path + 'toy_item_attribute.csv', names=['id','title','genre']) + + +# # augmented_attribute_dict = pickle.load(open('augmented_attribute_dict','rb')) +# 
# director_list, country_list, language_list = [], [], [] +# # for i in range(len(augmented_attribute_dict)): +# # director_list.append(augmented_attribute_dict[i][0]) +# # country_list.append(augmented_attribute_dict[i][1]) +# # language_list.append(augmented_attribute_dict[i][2]) +# # director_series = pd.Series(director_list) +# # country_series = pd.Series(country_list) +# # language_series = pd.Series(language_list) +# # raw_item_attribute['director'] = director_series +# # raw_item_attribute['country'] = country_series +# # raw_item_attribute['language'] = language_series +# # raw_item_attribute.to_csv(file_path + 'toy_augmented_item_attribute.csv', index=False, header=None) +# ############################# step 3: generate item atttribute embedding ########################################################## + + + +# ############################# step 4: get separate embedding matrix ########################################################## +# augmented_total_embed_dict = {'title':[] , 'genre':[], 'director':[], 'country':[], 'language':[]} +# augmented_atttribute_embedding_dict = pickle.load(open('augmented_atttribute_embedding_dict','rb')) +# for value in augmented_atttribute_embedding_dict.keys(): +# for i in range(len(augmented_atttribute_embedding_dict[value])): +# augmented_total_embed_dict[value].append(augmented_atttribute_embedding_dict[value][i]) +# augmented_total_embed_dict[value] = np.array(augmented_total_embed_dict[value]) +# pickle.dump(augmented_total_embed_dict, open(file_path + 'augmented_total_embed_dict','wb')) +# ############################# step 4: get separate embedding matrix ########################################################## + + + +# ############################# step 5: i-i relation struction: (constructured when start) ########################################################## +# # augmented_total_embed_dict = pickle.load(open(file_path + 'augmented_total_embed_dict','rb')) +# # for value in 
augmented_atttribute_embedding_dict.keys(): +# # augmented_atttribute_embedding_dict[value] = torch.tensor(augmented_atttribute_embedding_dict[value]) +# pass +# ############################# step 5: i-i relation struction: (constructured when start) ########################################################## + + + +# # ############################# step 6: agg file ########################################################## +# dict_list = [] +# for i in range(1,36): +# tmp_dict = pickle.load(open('augmented_atttribute_embedding_dict'+str(i),'rb')) +# dict_list.append(tmp_dict) +# total_dict = {'year':{}, 'title':{}, 'director':{}, 'country':{}, 'language':{}} +# for value in dict_list: +# for key in total_dict.keys(): +# total_dict[key].update(value[key]) +# # ############################# step 6: agg file ########################################################## + + + + diff --git a/examples/llmrec/LLM_augmentation_construct_prompt/gpt_ui_aug.py b/examples/llmrec/LLM_augmentation_construct_prompt/gpt_ui_aug.py new file mode 100644 index 00000000..1a596a34 --- /dev/null +++ b/examples/llmrec/LLM_augmentation_construct_prompt/gpt_ui_aug.py @@ -0,0 +1,305 @@ + +import threading +import openai +import time +import pandas as pd +import csv +import requests +import concurrent.futures +import pickle +import torch +import os +import threading +import time +import tqdm +import requests + +file_path = "" +max_threads = 5 +cnt = 0 + +# MovieLens +def construct_prompting(item_attribute, item_list, candidate_list): + # make history string + history_string = "User history:\n" + for index in item_list: + title = item_attribute['title'][index] + genre = item_attribute['genre'][index] + history_string += "[" + history_string += str(index) + history_string += "] " + history_string += title + ", " + history_string += genre + "\n" + # make candidates + candidate_string = "Candidates:\n" + for index in candidate_list: + title = item_attribute['title'][index.item()] + genre = 
item_attribute['genre'][index.item()] + candidate_string += "[" + candidate_string += str(index.item()) + candidate_string += "] " + candidate_string += title + ", " + candidate_string += genre + "\n" + # output format + output_format = "Please output the index of user\'s favorite and least favorite movie only from candidate, but not user history. Please get the index from candidate, at the beginning of each line.\nOutput format:\nTwo numbers separated by '::'. Nothing else.Plese just give the index of candicates, remove [] (just output the digital value), please do not output other thing else, do not give reasoning.\n\n" + # make prompt + prompt = "You are a movie recommendation system and required to recommend user with movies based on user history that each movie with title(same topic/doctor), year(similar years), genre(similar genre).\n" + prompt += history_string + prompt += candidate_string + prompt += output_format + return prompt + +# # Netflix +# def construct_prompting(item_attribute, item_list, candidate_list): +# # make history string +# history_string = "User history:\n" +# for index in item_list: +# year = item_attribute['year'][index] +# title = item_attribute['title'][index] +# history_string += "[" +# history_string += str(index) +# history_string += "] " +# history_string += str(year) + ", " +# history_string += title + "\n" +# # make candidates +# candidate_string = "Candidates:\n" +# for index in candidate_list: +# year = item_attribute['year'][index.item()] +# title = item_attribute['title'][index.item()] +# candidate_string += "[" +# candidate_string += str(index.item()) +# candidate_string += "] " +# candidate_string += str(year) + ", " +# candidate_string += title + "\n" +# # output format +# output_format = "Please output the index of user\'s favorite and least favorite movie only from candidate, but not user history. Please get the index from candidate, at the beginning of each line.\nOutput format:\nTwo numbers separated by '::'. 
Nothing else.Plese just give the index of candicates, remove [] (just output the digital value), please do not output other thing else, do not give reasoning.\n\n" +# # make prompt +# # prompt = "You are a movie recommendation system and required to recommend user with movies based on user history that each movie with title(same topic/doctor), year(similar years), genre(similar genre).\n" +# prompt = "" +# prompt += history_string +# prompt += candidate_string +# prompt += output_format +# return prompt + +### read candidate +candidate_indices = pickle.load(open(file_path + 'candidate_indices','rb')) +candidate_indices_dict = {} +for index in range(candidate_indices.shape[0]): + candidate_indices_dict[index] = candidate_indices[index] +### read adjacency_list +adjacency_list_dict = {} +train_mat = pickle.load(open(file_path + 'train_mat','rb')) +for index in range(train_mat.shape[0]): + data_x, data_y = train_mat[index].nonzero() + adjacency_list_dict[index] = data_y +### read item_attribute +toy_item_attribute = pd.read_csv(file_path + 'item_attribute.csv', names=['id','title', 'genre']) +### write augmented dict +augmented_sample_dict = {} +if os.path.exists(file_path + "augmented_sample_dict"): + print(f"The file augmented_sample_dict exists.") + augmented_sample_dict = pickle.load(open(file_path + 'augmented_sample_dict','rb')) +else: + print(f"The file augmented_sample_dict does not exist.") + pickle.dump(augmented_sample_dict, open(file_path + 'augmented_sample_dict','wb')) + +def file_reading(): + augmented_attribute_dict = pickle.load(open(file_path + 'augmented_sample_dict','rb')) + return augmented_attribute_dict + +# baidu +def LLM_request(toy_item_attribute, adjacency_list_dict, candidate_indices_dict, index, model_type, augmented_sample_dict): + + try: + augmented_sample_dict = file_reading() + except pickle.UnpicklingError as e: + print("Error occurred while unpickling:", e) + LLM_request(toy_item_attribute, adjacency_list_dict, 
candidate_indices_dict, index, model_type, augmented_sample_dict) + if index in augmented_sample_dict: + return 0 + else: + try: + print(f"{index}") + prompt = construct_prompting(toy_item_attribute, adjacency_list_dict[index], candidate_indices_dict[index]) + url = "http://llms-se.baidu-int.com:8200/chat/completions" + headers={ + # "Content-Type": "application/json", + "Authorization": "Bearer your key" + + } + params={ + "model": model_type, + "messages": [{"role": "user", "content": prompt}], + "temperature":0.6, + "max_tokens": 1000, + "stream": False, + "top_p": 0.1 + } + + response = requests.post(url=url, headers=headers,json=params) + message = response.json() + + content = message['choices'][0]['message']['content'] + print(f"content: {content}, model_type: {model_type}") + samples = content.split("::") + pos_sample = int(samples[0]) + neg_sample = int(samples[1]) + augmented_sample_dict[index] = {} + augmented_sample_dict[index][0] = pos_sample + augmented_sample_dict[index][1] = neg_sample + pickle.dump(augmented_sample_dict, open(file_path + 'augmented_sample_dict','wb')) + + # except ValueError as e: + except requests.exceptions.RequestException as e: + print("An HTTP error occurred:", str(e)) + time.sleep(10) + except ValueError as ve: + print("An error occurred while parsing the response:", str(ve)) + time.sleep(10) + LLM_request(toy_item_attribute, adjacency_list_dict, candidate_indices_dict, index, "gpt-3.5-turbo-0613", augmented_sample_dict) + except KeyError as ke: + print("An error occurred while accessing the response:", str(ke)) + time.sleep(10) + LLM_request(toy_item_attribute, adjacency_list_dict, candidate_indices_dict, index, "gpt-3.5-turbo-0613", augmented_sample_dict) + except Exception as ex: + print("An unknown error occurred:", str(ex)) + time.sleep(10) + + return 1 + + + + + +# # chatgpt +# def LLM_request(toy_item_attribute, adjacency_list_dict, candidate_indices_dict, index, model_type, augmented_sample_dict): + +# if index in 
augmented_sample_dict: +# print(f"g:{index}") +# return 0 +# else: +# try: +# print(f"{index}") +# prompt = construct_prompting(toy_item_attribute, adjacency_list_dict[index], candidate_indices_dict[index]) +# # url = "http://llms-se.baidu-int.com:8200/chat/completions" +# # url = "https://api.openai.com/v1/completions" +# url = "https://api.openai.com/v1/chat/completions" + +# headers={ +# # "Content-Type": "application/json", +# # "Authorization": "Bearer your key" +# +# } +# # params={ +# # "model": model_type, +# # "prompt": prompt, +# # "max_tokens": 1024, +# # "temperature": 0.6, +# # "stream": False, +# # } + +# params = { +# "model": "gpt-3.5-turbo", +# "messages": [{"role": "system", "content": "You are a movie recommendation system and required to recommend user with movies based on user history that each movie with title(same topic/doctor), year(similar years), genre(similar genre).\n"}, {"role": "user", "content": prompt}] +# } + +# response = requests.post(url=url, headers=headers,json=params) +# message = response.json() + +# content = message['choices'][0]['message']['content'] +# # content = message['choices'][0]['text'] +# print(f"content: {content}, model_type: {model_type}") +# samples = content.split("::") +# pos_sample = int(samples[0]) +# neg_sample = int(samples[1]) +# augmented_sample_dict[index] = {} +# augmented_sample_dict[index][0] = pos_sample +# augmented_sample_dict[index][1] = neg_sample +# # pickle.dump(augmented_sample_dict, open('augmented_sample_dict','wb')) +# # pickle.dump(augmented_sample_dict, open('/Users/weiwei/Documents/Datasets/ml-10m/ml-10M100K/preprocessed_raw_MovieLens/toy_MovieLens1000/augmented_sample_dict','wb')) +# pickle.dump(augmented_sample_dict, open(file_path + 'augmented_sample_dict','wb')) + +# # # except ValueError as e: +# # except requests.exceptions.RequestException as e: +# # print("An HTTP error occurred:", str(e)) +# # # time.sleep(40) +# # except ValueError as ve: +# # print("An error occurred while 
parsing the response:", str(ve)) +# # # time.sleep(40) +# # LLM_request(toy_item_attribute, adjacency_list_dict, candidate_indices_dict, index, "gpt-3.5-turbo-0613", augmented_sample_dict) +# # except KeyError as ke: +# # print("An error occurred while accessing the response:", str(ke)) +# # # time.sleep(40) +# # LLM_request(toy_item_attribute, adjacency_list_dict, candidate_indices_dict, index, "gpt-3.5-turbo-0613", augmented_sample_dict) +# # except Exception as ex: +# # print("An unknown error occurred:", str(ex)) +# # # time.sleep(40) + +# # return 1 + +# # except ValueError as e: +# except requests.exceptions.RequestException as e: +# print("An HTTP error occurred:", str(e)) +# time.sleep(8) +# # print(content) +# # error_cnt += 1 +# # if error_cnt==5: +# # return 1 +# # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) +# LLM_request(toy_item_attribute, adjacency_list_dict, candidate_indices_dict, index, model_type, augmented_sample_dict) +# except ValueError as ve: +# print("ValueError error occurred while parsing the response:", str(ve)) +# time.sleep(10) +# # error_cnt += 1 +# # if error_cnt==5: +# # return 1 +# # print(content) +# # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) +# LLM_request(toy_item_attribute, adjacency_list_dict, candidate_indices_dict, index, model_type, augmented_sample_dict) +# except KeyError as ke: +# print("KeyError error occurred while accessing the response:", str(ke)) +# time.sleep(10) +# # error_cnt += 1 +# # if error_cnt==5: +# # return 1 +# # print(content) +# # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) +# LLM_request(toy_item_attribute, adjacency_list_dict, candidate_indices_dict, index, model_type, augmented_sample_dict) +# except IndexError as ke: +# print("IndexError error occurred while accessing the response:", str(ke)) +# time.sleep(10) +# # error_cnt += 1 
+# # if error_cnt==5: +# # return 1 +# # # print(content) +# # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict) +# # return 1 +# LLM_request(toy_item_attribute, adjacency_list_dict, candidate_indices_dict, index, model_type, augmented_sample_dict) +# except EOFError as ke: +# print("EOFError: : Ran out of input error occurred while accessing the response:", str(ke)) +# time.sleep(10) +# # error_cnt += 1 +# # if error_cnt==5: +# # return 1 +# # print(content) +# # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) +# LLM_request(toy_item_attribute, adjacency_list_dict, candidate_indices_dict, index, model_type, augmented_sample_dict) +# except Exception as ex: +# print("An unknown error occurred:", str(ex)) +# time.sleep(10) +# # error_cnt += 1 +# # if error_cnt==5: +# # return 1 +# # print(content) +# # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) +# LLM_request(toy_item_attribute, adjacency_list_dict, candidate_indices_dict, index, model_type, augmented_sample_dict) +# return 1 + + +for index in range(0, len(adjacency_list_dict)): + # # make prompting + re = LLM_request(toy_item_attribute, adjacency_list_dict, candidate_indices_dict, index, "gpt-3.5-turbo", augmented_sample_dict) + + + + diff --git a/examples/llmrec/LLM_augmentation_construct_prompt/gpt_user_profiling.py b/examples/llmrec/LLM_augmentation_construct_prompt/gpt_user_profiling.py new file mode 100644 index 00000000..b6f6fd20 --- /dev/null +++ b/examples/llmrec/LLM_augmentation_construct_prompt/gpt_user_profiling.py @@ -0,0 +1,458 @@ + +import threading +import openai +import time +import pandas as pd +import csv +import requests +import concurrent.futures +import pickle +import torch +import os +import threading +import time +import numpy as np + + +openai.api_base = "http://llms-se.baidu-int.com:8200" + + +import requests + + +file_path = "" +max_threads = 5 
cnt = 0


# MovieLens
def construct_prompting(item_attribute, item_list):
    """Build the user-profiling prompt for one MovieLens user.

    Args:
        item_attribute: Column-indexable table exposing ``'title'`` and
            ``'genre'`` columns, each indexable by item id (the driver
            script in this file passes a ``pandas.DataFrame``; a dict of
            sequences works as well).
        item_list: Iterable of item ids from the user's history.

    Returns:
        str: the task instruction, followed by one ``[id] title, genre``
        line per history item, followed by the output-format instructions.
    """
    # Build the history in one join instead of growing a string with "+=".
    history_string = "User history:\n" + "".join(
        f"[{index}] {item_attribute['title'][index]}, "
        f"{item_attribute['genre'][index]}\n"
        for index in item_list
    )

    # The original literal contained "country\, " -- an invalid escape
    # sequence (SyntaxWarning on recent Python) that also leaked a stray
    # backslash into the template shown to the model. The backslash is
    # removed; every other character of the prompt is unchanged.
    output_format = (
        "Please output the following infomation of user, output format:\n"
        "{'age':age, 'gender':gender, 'liked genre':liked genre, "
        "'disliked genre':disliked genre, 'liked directors':liked directors, "
        "'country':country, 'language':language}\n"
        "Please do not fill in 'unknown', but make an educated guess based "
        "on the available information and fill in the specific content.\n"
        "please output only the content in format above, but no other thing "
        "else, no reasoning, no analysis, no Chinese. Reiterating once "
        "again!! Please only output the content after \"output format: \", "
        "and do not include any other content such as introduction or "
        "acknowledgments.\n\n"
    )

    instruction = (
        "You are required to generate user profile based on the history of "
        "user, that each movie with title, year, genre.\n"
    )
    return instruction + history_string + output_format


# # Netflix
# def construct_prompting(item_attribute, item_list):
#     # make history string
#     history_string = "User history:\n"
#     for index in item_list:
#         year = item_attribute['year'][index]
#         title = item_attribute['title'][index]
#         history_string += "["
#         history_string += str(index)
#         history_string += "] "
#         history_string += str(year) + ", "
#         history_string += title + "\n"
#     # output format
#     output_format = "Please output the following infomation of user, output format:\n{\'age\':age, \'gender\':gender, \'liked genre\':liked genre, \'disliked genre\':disliked genre, \'liked directors\':liked directors, \'country\':country\, 'language\':language}\nPlease do not fill in
\'unknown\', but make an educated guess based on the available information and fill in the specific content.\nplease output only the content in format above, but no other thing else, no reasoning, no analysis, no Chinese. Reiterating once again!! Please only output the content after \"output format: \", and do not include any other content such as introduction or acknowledgments.\n\n" +# # make prompt +# prompt = "You are required to generate user profile based on the history of user, that each movie with title, year, genre.\n" +# prompt += history_string +# prompt += output_format +# return prompt + + +# # embedding +# def LLM_request(augmented_user_profiling_dict, index, model_type, augmented_user_init_embedding): + +# if index in augmented_user_init_embedding: +# return 0 +# else: +# try: +# # print(f"{index}") +# # prompt = construct_prompting(augmented_user_init_embedding, index) +# url = "https://api.openai.com/v1/embeddings" +# headers={ +# # "Content-Type": "application/json", +# "Authorization": "Bearer your key" + +# } +# params={ +# "model": "text-embedding-ada-002", +# "input": augmented_user_profiling_dict[index] +# } + +# response = requests.post(url=url, headers=headers,json=params) +# message = response.json() +# content = message['data'][0]['embedding'] +# # print(content) +# print(index) + +# augmented_user_init_embedding[index] = np.array(content) +# # pickle.dump(augmented_sample_dict, open('augmented_sample_dict','wb')) +# pickle.dump(augmented_user_init_embedding, open(file_path + 'augmented_user_init_embedding','wb')) + + +# # except ValueError as e: +# except requests.exceptions.RequestException as e: +# print("An HTTP error occurred:", str(e)) +# time.sleep(5) +# except ValueError as ve: +# print("An error occurred while parsing the response:", str(ve)) +# time.sleep(5) +# LLM_request(augmented_user_profiling_dict, index, "text-embedding-ada-002", augmented_user_init_embedding) +# except KeyError as ke: +# print("An error occurred while 
# accessing the response:", str(ke))
#             time.sleep(5)
#             LLM_request(augmented_user_profiling_dict, index, "text-embedding-ada-002", augmented_user_init_embedding)
#         except Exception as ex:
#             print("An unknown error occurred:", str(ex))
#             time.sleep(5)

#         return 1


### user profile #################################################################################################################################
def get_gpt_response_w_system(model_type, prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        model_type: Model name passed to ``openai.ChatCompletion.create``.
        prompt: Text of the user message.

    Returns:
        The ``content`` of the first choice's message.
    """
    # global system_prompt
    completion = openai.ChatCompletion.create(
        model=model_type,
        messages=[
            # {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]
    )
    response = completion.choices[0].message.content
    # print(response)
    return response


start_id = 0
g_model_type = "gpt-3.5-turbo-0613"
# # "claude", "chatglm-6b", "hambuger-13b", "baichuan-7B", "gpt-4", "gpt-4-0613"


# toy_item_attribute, adjacency_list_dict, index, "gpt-4", augmented_user_profiling_dict
def file_reading():
    """Load and return the pickled ``augmented_user_profiling_dict``.

    The file is looked up at ``file_path + 'augmented_user_profiling_dict'``.
    """
    # Context manager so the handle is closed even if unpickling fails
    # (the original passed a bare ``open(...)`` to ``pickle.load``).
    with open(file_path + 'augmented_user_profiling_dict', 'rb') as fh:
        augmented_user_profiling_dict = pickle.load(fh)
    return augmented_user_profiling_dict


## baidu user profile generate
def LLM_request(toy_item_attribute, adjacency_list_dict, index, model_type, _, error_cnt):
    """Query the chat-completions endpoint for the profile of user ``index``.

    Builds the prompt from the user's history with ``construct_prompting``,
    POSTs it to the endpoint and prints the returned text. Caching and
    persisting of the result are currently disabled (see the commented-out
    code in this function), so nothing is stored or returned.

    Args:
        toy_item_attribute: Item attribute table given to
            ``construct_prompting``.
        adjacency_list_dict: Mapping from user index to that user's item ids.
        index: User index to profile.
        model_type: Model name sent in the request body.
        _: Unused by the active code (the call site passes the profile dict).
        error_cnt: Unused by the active code.

    Raises:
        KeyError, IndexError: if the JSON reply lacks
            ``choices[0].message.content``.
    """
    # try:
    #     augmented_user_profiling_dict = file_reading()
    # except pickle.UnpicklingError as e:
    #     # Handle the unpickling error
    #     # time.sleep(0.001)
    #     augmented_user_profiling_dict = file_reading()
    #     print("Error occurred while unpickling:", e)

    # if index in augmented_user_profiling_dict:
    #     return 0
    # else:
    # try:
    print(f"{index}")
    # Fix: a hard-coded two-movie sample prompt used to be assigned to
    # ``prompt`` right after this call, discarding the per-user prompt, so
    # every user was profiled from the same fixed text.
    prompt = construct_prompting(toy_item_attribute, adjacency_list_dict[index])
    url = "http://llms-se.baidu-int.com:8200/chat/completions"
    headers = {
        # "Authorization": "Bearer your key"
    }
    params = {
        "model": model_type,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.8,
        "max_tokens": 1000,
        "stream": False,
        "top_p": 0.1
    }
    response = requests.post(url=url, headers=headers, json=params)
    message = response.json()
    content = message['choices'][0]['message']['content']
    # content = get_gpt_response_w_system(model_type, prompt)

    print(f"content: {content}, model_type: {model_type}")

    # augmented_user_profiling_dict[index] = content
    # pickle.dump(augmented_user_profiling_dict, open(file_path + 'augmented_user_profiling_dict','wb'))
    # error_cnt = 0
    # time.sleep(8)
    # # # except ValueError as e:
    # # except requests.exceptions.RequestException as e:
    # #     print("An HTTP error occurred:", str(e))
    # #     time.sleep(25)
    # #     # print(content)
    # #     LLM_request(toy_item_attribute, adjacency_list_dict, index, "gpt-4", augmented_user_profiling_dict)
    # # except ValueError as ve:
    # #     print("An error occurred while parsing the response:", str(ve))
    # #     time.sleep(25)
    # #     # print(content)
    # #     LLM_request(toy_item_attribute, adjacency_list_dict, index, "gpt-4", augmented_user_profiling_dict)
    # # except KeyError as ke:
    # #
print("An error occurred while accessing the response:", str(ke)) + # # time.sleep(25) + # # # print(content) + # # LLM_request(toy_item_attribute, adjacency_list_dict, index, "gpt-4", augmented_user_profiling_dict) + # # except Exception as ex: + # # print("An unknown error occurred:", str(ex)) + # # time.sleep(25) + # # # print(content) + # # LLM_request(toy_item_attribute, adjacency_list_dict, index, "gpt-4", augmented_user_profiling_dict) + # # return 1 + + # # except ValueError as e: + # except requests.exceptions.RequestException as e: + # print("An HTTP error occurred:", str(e)) + # time.sleep(5) + # # print(content) + # # error_cnt += 1 + # # if error_cnt==5: + # # return 1 + # # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) + # LLM_request(toy_item_attribute, adjacency_list_dict, index, model_type, augmented_user_profiling_dict, error_cnt) + # except ValueError as ve: + # print("ValueError error occurred while parsing the response:", str(ve)) + # time.sleep(5) + # # error_cnt += 1 + # # if error_cnt==5: + # # return 1 + # # print(content) + # # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) + # LLM_request(toy_item_attribute, adjacency_list_dict, index, model_type, augmented_user_profiling_dict, error_cnt) + # except KeyError as ke: + # print("KeyError error occurred while accessing the response:", str(ke)) + # time.sleep(5) + # # error_cnt += 1 + # # if error_cnt==5: + # # return 1 + # # print(content) + # # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) + # LLM_request(toy_item_attribute, adjacency_list_dict, index, model_type, augmented_user_profiling_dict, error_cnt) + # except IndexError as ke: + # print("IndexError error occurred while accessing the response:", str(ke)) + # time.sleep(5) + # # error_cnt += 1 + # # if error_cnt==5: + # # return 1 + # # # print(content) + # # 
LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict) + # # return 1 + # LLM_request(toy_item_attribute, adjacency_list_dict, index, model_type, augmented_user_profiling_dict, error_cnt) + # except EOFError as ke: + # print("EOFError: : Ran out of input error occurred while accessing the response:", str(ke)) + # time.sleep(5) + # # error_cnt += 1 + # # if error_cnt==5: + # # return 1 + # # print(content) + # # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) + # LLM_request(toy_item_attribute, adjacency_list_dict, index, model_type, augmented_user_profiling_dict, error_cnt) + # except Exception as ex: + # print("An unknown error occurred:", str(ex)) + # time.sleep(5) + # # error_cnt += 1 + # # if error_cnt==5: + # # return 1 + # # print(content) + # # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) + # LLM_request(toy_item_attribute, adjacency_list_dict, index, model_type, augmented_user_profiling_dict, error_cnt) + # return 1 +### user profile ################################################################################################################################# + + + + +# ## chatgpt user profile generate +# def LLM_request(toy_item_attribute, adjacency_list_dict, index, model_type, augmented_user_profiling_dict, error_cnt): +# if index in augmented_user_profiling_dict: +# return 0 +# else: +# # try: +# print(f"{index}") +# prompt = construct_prompting(toy_item_attribute, adjacency_list_dict[index]) +# url = "http://llms-se.baidu-int.com:8200/chat/completions" +# headers={ +# # "Content-Type": "application/json", +# # "Authorization": "Bearer your key" +# +# } +# params={ +# "model": model_type, +# "messages": [{"role": "user", "content": prompt}], +# "temperature":0.6, +# "max_tokens": 1000, +# "stream": False, +# "top_p": 0.1 +# } + +# response = requests.post(url=url, headers=headers,json=params) +# message = 
response.json() + +# content = message['choices'][0]['message']['content'] +# print(f"content: {content}, model_type: {model_type}") + +# augmented_user_profiling_dict[index] = content +# pickle.dump(augmented_user_profiling_dict, open(file_path + 'augmented_user_profiling_dict','wb')) +# error_cnt = 0 +# # # except ValueError as e: +# # except requests.exceptions.RequestException as e: +# # print("An HTTP error occurred:", str(e)) +# # time.sleep(25) +# # # print(content) +# # LLM_request(toy_item_attribute, adjacency_list_dict, index, "gpt-4", augmented_user_profiling_dict) +# # except ValueError as ve: +# # print("An error occurred while parsing the response:", str(ve)) +# # time.sleep(25) +# # # print(content) +# # LLM_request(toy_item_attribute, adjacency_list_dict, index, "gpt-4", augmented_user_profiling_dict) +# # except KeyError as ke: +# # print("An error occurred while accessing the response:", str(ke)) +# # time.sleep(25) +# # # print(content) +# # LLM_request(toy_item_attribute, adjacency_list_dict, index, "gpt-4", augmented_user_profiling_dict) +# # except Exception as ex: +# # print("An unknown error occurred:", str(ex)) +# # time.sleep(25) +# # # print(content) +# # LLM_request(toy_item_attribute, adjacency_list_dict, index, "gpt-4", augmented_user_profiling_dict) +# # return 1 + +# # # except ValueError as e: +# # except requests.exceptions.RequestException as e: +# # print("An HTTP error occurred:", str(e)) +# # # time.sleep(25) +# # # print(content) +# # error_cnt += 1 +# # if error_cnt==5: +# # return 1 +# # # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt) +# # LLM_request(toy_item_attribute, adjacency_list_dict, index, "gpt-4", augmented_user_profiling_dict, error_cnt) +# # except ValueError as ve: +# # print("ValueError error occurred while parsing the response:", str(ve)) +# # # time.sleep(25) +# # error_cnt += 1 +# # if error_cnt==5: +# # return 1 +# # # print(content) +# # # 
# LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt)
# #         LLM_request(toy_item_attribute, adjacency_list_dict, index, "gpt-4", augmented_user_profiling_dict, error_cnt)
# #     except KeyError as ke:
# #         print("KeyError error occurred while accessing the response:", str(ke))
# #         # time.sleep(25)
# #         error_cnt += 1
# #         if error_cnt==5:
# #             return 1
# #         # print(content)
# #         # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt)
# #         LLM_request(toy_item_attribute, adjacency_list_dict, index, "gpt-4", augmented_user_profiling_dict, error_cnt)
# #     except IndexError as ke:
# #         print("IndexError error occurred while accessing the response:", str(ke))
# #         # time.sleep(25)
# #         error_cnt += 1
# #         if error_cnt==5:
# #             return 1
# #         # # print(content)
# #         # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict)
# #         # return 1
# #     except Exception as ex:
# #         print("An unknown error occurred:", str(ex))
# #         # time.sleep(25)
# #         # error_cnt += 1
# #         # if error_cnt==5:
# #         #     return 1
# #         # print(content)
# #         # LLM_request(toy_item_attribute, indices, "gpt-3.5-turbo-0613", augmented_attribute_dict, error_cnt)
# #         LLM_request(toy_item_attribute, adjacency_list_dict, index, "gpt-4", augmented_user_profiling_dict, error_cnt)
# #     return 1




error_cnt = 0


def _data_path(name):
    """Return the location of data file ``name`` under ``file_path``."""
    # os.path.join gives the same result with or without a trailing
    # separator on file_path, and yields a relative path when it is "".
    return os.path.join(file_path, name)


### step1: generate user profiling ##################################################################################
### read item_attribute
# Fix: this used ``file_path + '/item_attribute.csv'`` while the pickles
# below used ``file_path + 'name'``; with ``file_path = ""`` the CSV was
# looked up at the filesystem root.
toy_item_attribute = pd.read_csv(_data_path('item_attribute.csv'), names=['id', 'title', 'genre'])

### write augmented dict
# NOTE(review): pickle executes arbitrary code on load -- only point
# file_path at data you produced yourself.
augmented_user_profiling_dict = {}
if os.path.exists(_data_path('augmented_user_profiling_dict')):
    print("The file augmented_user_profiling_dict exists.")
    with open(_data_path('augmented_user_profiling_dict'), 'rb') as fh:
        augmented_user_profiling_dict = pickle.load(fh)
else:
    print("The file augmented_user_profiling_dict does not exist.")
    # Create the file with an empty dict so later runs find it.
    with open(_data_path('augmented_user_profiling_dict'), 'wb') as fh:
        pickle.dump(augmented_user_profiling_dict, fh)

### read adjacency_list
adjacency_list_dict = {}
with open(_data_path('train_mat'), 'rb') as fh:
    train_mat = pickle.load(fh)
for index in range(train_mat.shape[0]):
    # Second array returned by nonzero() for this row is stored as the
    # user's item list.
    data_x, data_y = train_mat[index].nonzero()
    adjacency_list_dict[index] = data_y

for index in range(start_id, len(adjacency_list_dict)):
    print(index)
    # # make prompting
    re = LLM_request(toy_item_attribute, adjacency_list_dict, index, g_model_type, augmented_user_profiling_dict, error_cnt)
# "claude", "chatglm-6b", "hambuger-13b", "baichuan-7B", "gpt-4", "gpt-4-0613"
### step1: generate user profiling ##################################################################################





# ### step2: generate user embedding ##################################################################################

# ### read user_profile
# augmented_user_profiling_dict = pickle.load(open(file_path + 'augmented_user_profiling_dict','rb'))
# ### write augmented_user_init_embedding
# augmented_user_init_embedding = {}
# if os.path.exists(file_path + "augmented_user_init_embedding"):
#     print(f"The file augmented_user_init_embedding exists.")
#     augmented_user_init_embedding = pickle.load(open(file_path + 'augmented_user_init_embedding','rb'))
# else:
#     print(f"The file augmented_user_init_embedding does not exist.")
#     pickle.dump(augmented_user_init_embedding, open(file_path + 'augmented_user_init_embedding','wb'))

# for index,value in enumerate(augmented_user_profiling_dict.keys()):
#     # # make prompting
#     # prompt = construct_prompting(toy_item_attribute, adjacency_list_dict[index], candidate_indices_dict[index])
#     re = LLM_request(augmented_user_profiling_dict, index, "text-embedding-ada-002", augmented_user_init_embedding)
#     # print(f"{index}")
#     # if re:
#     #     time.sleep(0.5)
# ### step2: generate user embedding
################################################################################## + + + + +# # ### step3: get user embedding ################################################################################## +# augmented_user_init_embedding = pickle.load(open('/Users/weiwei/Documents/Datasets/ml-10m/ml-10M100K/preprocessed_raw_MovieLens/toy_MovieLens1000/augmented_user_init_embedding','rb')) +# augmented_user_init_embedding_list = [] +# for i in range(len(augmented_user_init_embedding)): +# augmented_user_init_embedding_list.append(augmented_user_init_embedding[i]) +# augmented_user_init_embedding_final = np.array(augmented_user_init_embedding_list) +# pickle.dump(augmented_user_init_embedding_final, open('/Users/weiwei/Documents/Datasets/ml-10m/ml-10M100K/preprocessed_raw_MovieLens/toy_MovieLens1000/augmented_user_init_embedding_final','wb')) +# # ### step3: get user embedding ################################################################################## + + + + +# ### clean keys ######################################################################################################## +# In [196]: new_augmented_user_profiling_dict = {} +# ...: for index,value in enumerate(augmented_user_profiling_dict.keys()): +# ...: if type(value) == str: +# ...: if int(value.strip("'")) in augmented_user_profiling_dict: +# ...: continue +# ...: else: +# ...: new_augmented_user_profiling_dict[int(value.strip("'"))] = augmented_user_profiling_dict[value] +# ...: else: +# ...: new_augmented_user_profiling_dict[value] = augmented_user_profiling_dict[value] +# ### clean keys ########################################################################################################