diff --git a/src_python/main.py b/src_python/main.py
index a089e04..bf12312 100644
--- a/src_python/main.py
+++ b/src_python/main.py
@@ -1,6 +1,4 @@
-import asyncio
 import logging
-import os
 from pathlib import Path
 
 import discord
@@ -11,20 +9,13 @@
 from src_python.src.scrape_server.save_to_disk import save_server_data_to_disk
 from src_python.src.scrape_server.scrape_server import process_server
+from src_python.src.utilities.load_env_variables import DISCORD_DEV_BOT_ID, OUTPUT_DIRECTORY, TARGET_SERVER_ID, \
+    STUDENT_IDENTIFIERS_CSV_PATH, DISCORD_DEV_BOT_TOKEN
 
 configure_logging()
 logger = logging.getLogger(__name__)
 
-# Load environment variables
-load_dotenv("../.env.analysis")
-DISCORD_DEV_BOT_TOKEN = os.getenv('DISCORD_DEV_BOT_TOKEN')
-TARGET_SERVER_ID = os.getenv('TARGET_SERVER_ID')
-OUTPUT_DIRECTORY = os.getenv('OUTPUT_DIRECTORY')
-STUDENT_IDENTIFIERS_CSV_PATH = os.getenv('STUDENT_IDENTIFIERS_CSV_PATH')
-
-# Ensure the environment variables are set
-if not DISCORD_DEV_BOT_TOKEN or not OUTPUT_DIRECTORY or not OUTPUT_DIRECTORY:
-    raise ValueError("Please set DISCORD_DEV_BOT_TOKEN and OUTPUT_DIRECTORY in your .env file")
 
 # Initialize the Discord client
 client = commands.Bot(command_prefix='!', intents=discord.Intents.all())
@@ -33,6 +24,8 @@
 @client.event
 async def on_ready():
     logger.info(f'Logged in as {client.user.name} (ID: {client.user.id})')
+    if int(DISCORD_DEV_BOT_ID) != client.user.id:
+        raise ValueError("Discord bot ID does not match expected ID")
     await main_server_scraper(client=client,
                               target_server_id=TARGET_SERVER_ID,
                               output_directory=str(Path(OUTPUT_DIRECTORY)),
@@ -52,26 +45,13 @@ async def main_server_scraper(client: commands.Bot,
         # class_roster = ClassRosterModel.from_csv(student_identifiers_path)
         # save_student_data_to_disk(output_directory=output_directory, server_data=server_data, class_roster=class_roster)
+    else:
+        logger.error(f"Could not find server with ID: {target_server_id}")
 
 
 client.run(DISCORD_DEV_BOT_TOKEN)
 
 if __name__ == "__main__":
-    from src_python.src.ai.analyze_directory import analyze_directory
-    from src_python.src.models.extract_text_data import ExtractedTextData
-    in_server_name = "jonmatthiss_server"
-    input_directory_out = rf"C:\Users\jonma\Sync\skellybot-data\markdown\{in_server_name}"
-    output_directory_out = rf"C:\Users\jonma\Sync\skellybot-data\markdown\{in_server_name}_AI_Processed"
-    classbot_prompt_file = rf"C:\Users\jonma\Sync\skellybot-data\markdown\{in_server_name}\{in_server_name}_classbot_prompt.txt"
-
-    with open(classbot_prompt_file, 'r', encoding='utf-8') as f:
-        classbot_prompt = f.read()
-
-    asyncio.run(analyze_directory(input_directory=input_directory_out,
-                                  output_directory=output_directory_out,
-                                  json_schema_model=ExtractedTextData,
-                                  base_prompt_text=classbot_prompt))
-
-    logger.info(f"Analysis complete for directory: {input_directory_out}")
-
-    print("Done!")
+    # Run this script and the bot will scrape the server on startup.
+    # Run the `ai/analyze_directory.py` script to analyze the scraped server data.
+    pass
\ No newline at end of file
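Note on the new `on_ready` guard: `DISCORD_DEV_BOT_ID` arrives as a string from the environment, so a missing or malformed value will crash `int(...)` before the identity comparison even runs. A minimal sketch of a more defensive version of the same check — `client` and the env names are assumed from the hunk above; this is not part of the diff:

```python
# Hypothetical hardening of the on_ready identity check.
try:
    expected_id = int(DISCORD_DEV_BOT_ID)
except (TypeError, ValueError):
    # os.getenv returns None or an arbitrary string, so parse explicitly.
    raise ValueError(f"DISCORD_DEV_BOT_ID is not a valid integer: {DISCORD_DEV_BOT_ID!r}")

if client.user.id != expected_id:
    raise ValueError(f"Logged in as bot ID {client.user.id}, expected {expected_id} - "
                     "is DISCORD_DEV_BOT_TOKEN the right token for this .env.analysis?")
```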
diff --git a/src_python/src/ai/analyze_directory.py b/src_python/src/ai/analyze_directory.py
index 6e10247..a4cd818 100644
--- a/src_python/src/ai/analyze_directory.py
+++ b/src_python/src/ai/analyze_directory.py
@@ -10,15 +10,17 @@
 configure_logging()
 import logging
+
 logger = logging.getLogger(__name__)
 
 
-async def analyze_directory(input_directory: str,
+
+async def analyze_directory(base_directory: str,
                             output_directory: str,
                             json_schema_model: Type[BaseModel],
                             base_prompt_text: str,
                             max_file_count: int = None,
                             llm_model: str = "gpt-3.5-turbo"):
-    input_directory_path = Path(input_directory)
+    input_directory_path = Path(base_directory)
     output_directory_path = Path(output_directory)
     output_directory_path.mkdir(parents=True, exist_ok=True)
@@ -27,7 +29,6 @@ async def analyze_directory(base_directory: str,
     logger.info(f"Analyzing directory: {input_directory_path}")
     tasks = []
-
     for file_number, file in enumerate(input_directory_path.rglob('*.md')):
         if max_file_count and file_number >= max_file_count:
             break
@@ -53,9 +54,9 @@ async def analyze_markdown_file(base_prompt_text: str,
     output_parent_path.mkdir(parents=True, exist_ok=True)
     try:
         constructed_pydantic_model = await analyze_text(input_text=input_file_text,
-                                                            json_schema_model=json_schema_model,
-                                                            base_prompt_text=base_prompt_text,
-                                                            llm_model=llm_model)
+                                                        json_schema_model=json_schema_model,
+                                                        base_prompt_text=base_prompt_text,
+                                                        llm_model=llm_model)
     except Exception as e:
         logger.error(f"Error analyzing file: {file_path}")
         logger.error(e)
@@ -66,7 +67,7 @@
     logger.info(f"Constructed Pydantic model:\n\n{constructed_pydantic_model}")
 
     output_markdown_string = str(constructed_pydantic_model)
-    full_output_string = output_markdown_string + "\n\nOriginal text:\n\n```\n\n" + input_file_text + "\n\n``` \n\n"
+    full_output_string = output_markdown_string + "\n\n___\n\n___\n\nOriginal text:\n\n" + input_file_text
 
     output_file_name = constructed_pydantic_model.filename
     save_path = output_parent_path / output_file_name
@@ -79,21 +80,20 @@
 if __name__ == "__main__":
+    from src_python.src.utilities.load_env_variables import OUTPUT_DIRECTORY
+    in_server_name = "HMN_Fall24"
+    classbot_prompt_file_name = f"{in_server_name}-prompt.txt"
+    classbot_prompt_file_path = str(Path(OUTPUT_DIRECTORY) / classbot_prompt_file_name)
 
-    in_server_name = "jonmatthiss_server"
-    input_directory_out = rf"C:\Users\jonma\Sync\skellybot-data\markdown\{in_server_name}"
-    output_directory_out = rf"C:\Users\jonma\Sync\skellybot-data\markdown\{in_server_name}_AI_Processed"
-    classbot_prompt_file = rf"C:\Users\jonma\Sync\skellybot-data\markdown\{in_server_name}_prompt.txt"
-
-    with open(classbot_prompt_file, 'r', encoding='utf-8') as f:
+    with open(classbot_prompt_file_path, 'r', encoding='utf-8') as f:
         classbot_prompt = f.read()
 
-    asyncio.run(analyze_directory(input_directory=input_directory_out,
-                                  output_directory=output_directory_out,
+    asyncio.run(analyze_directory(base_directory=OUTPUT_DIRECTORY,
+                                  output_directory=str(Path(OUTPUT_DIRECTORY) / f"{in_server_name}-ai-processed"),
                                   json_schema_model=ExtractedTextData,
                                   base_prompt_text=classbot_prompt))
 
-    logger.info(f"Analysis complete for directory: {input_directory_out}")
+    logger.info(f"Analysis complete for directory: {OUTPUT_DIRECTORY}")
 
-    print("Done!")
\ No newline at end of file
+    print("Done!")
diff --git a/src_python/src/ai/analyze_text.py b/src_python/src/ai/analyze_text.py
index 3143d0f..c6a3869 100644
--- a/src_python/src/ai/analyze_text.py
+++ b/src_python/src/ai/analyze_text.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import pprint
+from typing import Type
 
 import tiktoken
 from dotenv import load_dotenv
@@ -23,7 +24,7 @@
 
 async def analyze_text(input_text: str,
-                       json_schema_model: ExtractedTextData,
+                       json_schema_model: Type[ExtractedTextData],
                        base_prompt_text: str = "",
                        max_input_tokens: int = 1.6e4,
                        llm_model: str = "gpt-4o-mini") -> BaseModel:
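`analyze_directory` fans out one task per markdown file under `base_directory` before awaiting them, so a large scrape will hit the LLM API with every file at once. A sketch of one way to bound that fan-out with a semaphore — the `analyze` coroutine below is a placeholder standing in for `analyze_markdown_file`, and the limit value is an assumption:

```python
import asyncio
from pathlib import Path

MAX_IN_FLIGHT = 8  # assumed limit; tune to the API's rate limits

async def analyze(file_path: Path) -> None:
    await asyncio.sleep(0)  # placeholder for the real per-file LLM call

async def analyze_all(base_directory: str) -> None:
    semaphore = asyncio.Semaphore(MAX_IN_FLIGHT)

    async def bounded(file_path: Path) -> None:
        async with semaphore:  # at most MAX_IN_FLIGHT coroutines run the call at once
            await analyze(file_path)

    await asyncio.gather(*(bounded(f) for f in Path(base_directory).rglob('*.md')))
```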
diff --git a/src_python/src/ai/construct_prompt.py b/src_python/src/ai/construct_prompt.py
index 1596f75..42b3d6c 100644
--- a/src_python/src/ai/construct_prompt.py
+++ b/src_python/src/ai/construct_prompt.py
@@ -11,7 +11,7 @@
 SANDWICH_CAPPER = "Remember! You instructions are to: \n\n"
 
 
-def construct_analyzer_prompt(json_schema_model: ExtractedTextData,
+def construct_analyzer_prompt(json_schema_model: Type[ExtractedTextData],
                               input_text: str,
                               base_prompt_text: str = "",
                               ) -> str:
@@ -29,7 +29,7 @@ def construct_analyzer_prompt(json_schema_model: ExtractedTextData,
     input_text_prompt_string = f"BEGIN INPUT TEXT: \n\n{input_text}\n\n END INPUT TEXT\n\n"
 
-    sandwich_cap_prompt = f"{SANDWICH_CAPPER} \n\n {instruction_prompt} \n\n {json_schema_prompt}"
+    sandwich_cap_prompt = f"{SANDWICH_CAPPER} \n\n {BASE_JSON_PROMPT} \n\n {json_schema_prompt}"
 
     output_prompt = instruction_prompt + "\n\n" + input_text_prompt_string + "\n\n" + sandwich_cap_prompt + "\n"
@@ -54,7 +54,7 @@ def construct_json_prompt(pydantic_model: Type[BaseModel]) -> str:
     json_prompt = ['{\n']
 
     for name, field in fields.items():
-        field_info = pydantic_model.__fields__[name]
+        field_info = pydantic_model.model_fields[name]
         description = field_info.description or ""
         json_prompt.append(f'"{name}": ({field_info.annotation}) // {description},')
diff --git a/src_python/src/configure_logging.py b/src_python/src/configure_logging.py
index c73498e..56a9f99 100644
--- a/src_python/src/configure_logging.py
+++ b/src_python/src/configure_logging.py
@@ -98,7 +98,7 @@ class ColoredConsoleHandler(logging.StreamHandler):
         "INFO": "\033[96m",  # Cyan
         "SUCCESS": "\033[95m",  # Magenta
         "WARNING": "\033[33m",  # Yellow
-        "ERROR": "\033[101m",  # Background Dark Red
+        "ERROR": "\033[30;41m",  # Black text on Red background
     }
 
     def emit(self, record):
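The `__fields__` to `model_fields` switch in `construct_json_prompt` is the Pydantic v2 spelling; v2 still accepts `__fields__` but emits a deprecation warning. A self-contained sketch of the iteration pattern the hunk relies on — the `Demo` model is illustrative only, not from this repo:

```python
from pydantic import BaseModel, Field

class Demo(BaseModel):
    title: str = Field("", description="A short title")
    summary: str = Field("", description="A 2-3 sentence summary")

# model_fields maps field name -> FieldInfo, preserving declaration order;
# in Pydantic v2 each FieldInfo exposes .annotation and .description directly.
for name, field_info in Demo.model_fields.items():
    description = field_info.description or ""
    print(f'"{name}": ({field_info.annotation}) // {description},')
```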
diff --git a/src_python/src/models/extract_text_data.py b/src_python/src/models/extract_text_data.py
index 351c37b..8bddaed 100644
--- a/src_python/src/models/extract_text_data.py
+++ b/src_python/src/models/extract_text_data.py
@@ -4,20 +4,20 @@
 
 class ExtractedTextData(BaseModel):
-    detailed_summary: str = Field("",
-                                  description="An exhaustively thorough and detailed summary of the major points of this text in markdown bulleted outline format, like `* point 1\n* point 2\n* point 3` etc")
-    highlights: str = Field("",
-                            description="A list of the most important points of the text, formatted as a bulleted list")
-    short_summary: str = Field("", description="A short (2-3 sentence) summary of the text")
-    very_short_summary: str = Field("", description="A very short one sentence summary of the text")
-    extremely_short_summary: str = Field("", description="An extremely short 6-10 word summary of the text")
     title_slug: str = Field("", description="The a descriptive title of the text, will be used as the H1 header, the filename slug, and the URL slug. It should be short (only a few words) and provide a terse preview of the basic content of the full text, it should include NO colons")
     tags: str = Field("",
-                      description="A list of tags that describe the content of the text, formatted as comma separated #lower-kabob-case. These should be like topic tags that can be used to categorize the text within a larger collection of texts")
+                      description="A list of tags that describe the content of the text, formatted as comma separated #lower-kabob-case. These should be like topic tags that can be used to categorize the text within a larger collection of texts. Ignore conversational aspects (such as 'greetings', 'farewells', 'thanks', etc.)")
+    extremely_short_summary: str = Field("", description="An extremely short 6-10 word summary of the text")
+    very_short_summary: str = Field("", description="A very short one sentence summary of the text")
+    short_summary: str = Field("", description="A short (2-3 sentence) summary of the text")
+    highlights: str = Field("",
+                            description="A list of the most important points of the text, formatted as a bulleted list")
+    detailed_summary: str = Field("",
+                                  description="An exhaustively thorough and detailed summary of the major points of this text in markdown bulleted outline format, like `* point 1\n* point 2\n* point 3` etc")
     backlinks: str = Field("",
-                           description="A list of key concepts and terms that will be used as backlinks in the text, formatted as comma separated wiki style links like `[[backlink 1]], [[backlink 2]], [[backlink 3]]` etc. These shoud be the kinds of things you would expect to find a Wikipedia article about")
-
+                           description="A list of key words and phrases in the text which will be highlighted as [[backlinks]] within the text. These should be the kinds of things you would expect to find a Wikipedia article about. Format this section as comma separated wiki style links like `[[backlink 1]], [[backlink 2]], [[backlink 3]]` etc.")
+    pull_quotes: str = Field("", description="A list of the most important quotes from the text which capture the key points of the contentful aspects of the text, formatted as a bulleted list")
 
     @property
     def title(self):
         return self.title_slug.replace("-", " ").title()
@@ -26,7 +26,7 @@ def title(self):
     def filename(self, extension="md"):
         if not extension.startswith("."):
             extension = "." + extension
-        return sanitize_name(self.title_slug) + f"{extension}"
+        return sanitize_name(self.title_slug.lower()) + f"{extension}"
 
     def __str__(self):
         tags = "\n".join(self.tags.split(","))
@@ -34,6 +34,10 @@
 # {self.title}\n\n
 ## Extremely Short Summary\n\n
 {self.extremely_short_summary}\n\n
+## Highlights\n
+{self.highlights}\n\n
+## Pull Quotes\n
+{self.pull_quotes}\n\n
 ## Very Short Summary\n
 {self.very_short_summary}\n\n
 ## Short Summary\n
@@ -42,6 +46,8 @@
 {self.detailed_summary}\n\n
 ## Tags\n
 {tags}\n\n
+## Backlinks\n
+{self.backlinks}\n\n
 """
 
 
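Since `construct_json_prompt` walks `model_fields` in declaration order, the field reshuffle above also changes the order the LLM is asked to produce values: slug and tags first, then summaries from shortest to most detailed. A quick illustrative use of the reshaped model — the printed results in comments are approximate, assuming `sanitize_name` leaves a lowercase slug intact:

```python
from src_python.src.models.extract_text_data import ExtractedTextData

# Illustrative only - exercises the reordered model; all fields default to "".
data = ExtractedTextData(
    title_slug="Example-Title",
    tags="#topic-one, #topic-two",
    extremely_short_summary="A short example summary of the text",
)
print(data.title)     # "Example Title"
print(data.filename)  # roughly "example-title.md" (slug lowered, then sanitized)
print(str(data))      # markdown doc now including Highlights, Pull Quotes, Backlinks
```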
diff --git a/src_python/src/scrape_server/save_to_disk.py b/src_python/src/scrape_server/save_to_disk.py
index 1ad107e..a3b0db4 100644
--- a/src_python/src/scrape_server/save_to_disk.py
+++ b/src_python/src/scrape_server/save_to_disk.py
@@ -5,11 +5,13 @@
 from src_python.src.models.server_data_model import ServerData, save_as_markdown_directory, save_as_json
 from src_python.src.models.student_info import ClassRosterModel
+from src_python.src.utilities.sanitize_filename import sanitize_name
 
 logger = logging.getLogger(__name__)
 
 
 def save_server_data_to_disk(output_directory: str, server_data: ServerData):
+
     json_save_path = save_as_json(server_data=server_data, output_directory=output_directory)
     logger.info(f"Saved server data to disk: {json_save_path}")
 
@@ -18,12 +20,13 @@ def save_server_data_to_disk(output_directory: str, server_data: ServerData):
         pickle.dump(server_data, open(pickle_save_path, 'wb'))
         logger.info(f"Saved server data to disk: {pickle_save_path}")
     except Exception as e:
-        logger.error(f"Error saving server data as pickle: {e}")
+        raise ValueError(f"Error saving server data as pickle: {e}") from e
+
     try:
         markdown_save_path = save_as_markdown_directory(server_data=server_data,
                                                         output_directory=output_directory)
         logger.info(f"Saved server data to disk: {markdown_save_path}")
     except Exception as e:
-        logger.error(f"Error saving server data as markdown: {e}")
+        raise ValueError(f"Error saving server data as markdown: {e}") from e
 
 
 def save_student_data_to_disk(output_directory: str,
diff --git a/src_python/src/utilities/load_env_variables.py b/src_python/src/utilities/load_env_variables.py
new file mode 100644
index 0000000..7085e6c
--- /dev/null
+++ b/src_python/src/utilities/load_env_variables.py
@@ -0,0 +1,20 @@
+import os
+from pathlib import Path
+
+from dotenv import load_dotenv
+# Load environment variables
+env_analysis_path = Path(__file__).parent.parent.parent.parent / ".env.analysis"
+if not env_analysis_path.exists():
+    raise FileNotFoundError(f".env.analysis file not found at: {env_analysis_path}")
+load_dotenv(str(env_analysis_path))
+
+
+DISCORD_DEV_BOT_TOKEN = os.getenv('DISCORD_DEV_BOT_TOKEN')
+DISCORD_DEV_BOT_ID = os.getenv('DISCORD_DEV_BOT_ID')
+TARGET_SERVER_ID = os.getenv('TARGET_SERVER_ID')
+OUTPUT_DIRECTORY = os.getenv('OUTPUT_DIRECTORY')
+STUDENT_IDENTIFIERS_CSV_PATH = os.getenv('STUDENT_IDENTIFIERS_CSV_PATH')
+
+# Ensure the required environment variables are set
+if not DISCORD_DEV_BOT_TOKEN or not OUTPUT_DIRECTORY:
+    raise ValueError("Please set DISCORD_DEV_BOT_TOKEN and OUTPUT_DIRECTORY in your .env.analysis file")
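The new module validates only `DISCORD_DEV_BOT_TOKEN` and `OUTPUT_DIRECTORY` even though it exports five values. A hedged sketch of a stricter variant that reports every missing name at once — whether all five are truly required is an assumption, not something the diff states:

```python
import os

# Hypothetical stricter check - not part of this diff.
REQUIRED_VARS = ("DISCORD_DEV_BOT_TOKEN", "DISCORD_DEV_BOT_ID",
                 "TARGET_SERVER_ID", "OUTPUT_DIRECTORY",
                 "STUDENT_IDENTIFIERS_CSV_PATH")

# Collect every unset/empty variable so one run reports all problems together.
missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
if missing:
    raise ValueError(f"Missing variables in .env.analysis: {', '.join(missing)}")
```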