Skip to content

Commit

Permalink
scrape runs
Browse files Browse the repository at this point in the history
  • Loading branch information
jonmatthis committed Oct 15, 2024
1 parent 9d67bfc commit 216ccfa
Show file tree
Hide file tree
Showing 5 changed files with 1,030 additions and 82 deletions.
16 changes: 16 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[project]
name = "skellybot"
version = "0.1.0"
description = "Python code for Skellybot"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "discord>=2.3.2",
    "openai>=1.51.2",
    "pandas>=2.2.3",
    "pydantic[email]>=2.9.2",
    "python-dotenv>=1.0.1",
    "tiktoken>=0.8.0",
]

# Console entry points are declared as a `[project.scripts]` table that maps the
# command name to "module:callable"; a `scripts = [...]` array under `[project]`
# does not match the pyproject metadata schema and is rejected by build backends.
# NOTE(review): confirm that `skellybot.__main__` is importable under that name
# and actually exposes a `main` callable.
[project.scripts]
skellybot = "skellybot.__main__:main"
33 changes: 15 additions & 18 deletions src_python/__main__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from datetime import datetime
from pathlib import Path

import discord
Expand All @@ -11,45 +12,41 @@
from src.scrape_server.scrape_server import process_server
from src.utilities.load_env_variables import DISCORD_DEV_BOT_ID, OUTPUT_DIRECTORY, TARGET_SERVER_ID, \
STUDENT_IDENTIFIERS_CSV_PATH, DISCORD_DEV_BOT_TOKEN
from src_python.src.utilities.sanitize_filename import sanitize_name

configure_logging()
logger = logging.getLogger(__name__)



# Initialize the Discord client
client = commands.Bot(command_prefix='!', intents=discord.Intents.all())
DISCORD_CLIENT = commands.Bot(command_prefix='!', intents=discord.Intents.all())


# NOTE(review): this span appears to be a rendered diff hunk whose +/- markers were lost:
# each line that references `client` is immediately followed by the line that references
# `DISCORD_CLIENT` and replaces it. Confirm against the repository before editing the code.
#
# on_ready: event handler registered on the bot.
#   1. Logs the name and id of the logged-in bot user.
#   2. Raises ValueError when that id differs from int(DISCORD_DEV_BOT_ID).
#   3. Awaits main_server_scraper() (the `client` variant passes the client, target server id,
#      output directory and student-identifiers path explicitly; the `DISCORD_CLIENT` variant
#      passes nothing).
#   4. Logs '------Done!------' and awaits DISCORD_CLIENT.close() — presumably so the
#      one-shot scrape process exits once the scrape has finished.
@client.event
@DISCORD_CLIENT.event
async def on_ready():
logger.info(f'Logged in as {client.user.name} (ID: {client.user.id})')
if not int(DISCORD_DEV_BOT_ID) == client.user.id:
logger.info(f'Logged in as {DISCORD_CLIENT.user.name} (ID: {DISCORD_CLIENT.user.id})')
if not int(DISCORD_DEV_BOT_ID) == DISCORD_CLIENT.user.id:
raise ValueError("Discord bot ID does not match expected ID")
await main_server_scraper(client=client,
target_server_id=TARGET_SERVER_ID,
output_directory=str(Path(OUTPUT_DIRECTORY)),
student_identifiers_path=STUDENT_IDENTIFIERS_CSV_PATH)
await main_server_scraper()
logger.info('------Done!------')
await DISCORD_CLIENT.close()


# NOTE(review): this span appears to be a rendered diff hunk whose +/- markers were lost:
# the four-parameter `main_server_scraper(client, ...)` header and the lines using its
# parameters look like the replaced version; the zero-argument header and the lines using
# the module-level constants look like the replacement. Confirm against the repository.
#
# main_server_scraper: looks up the guild whose id equals int(TARGET_SERVER_ID) among the
# guilds visible to the bot. When it is found, the server is scraped with process_server()
# and the result is handed to save_server_data_to_disk(); otherwise an error is logged.
async def main_server_scraper(client: commands.Bot,
target_server_id: str,
output_directory: str,
student_identifiers_path: str):
target_server = discord.utils.get(client.guilds, id=int(target_server_id))

async def main_server_scraper():
target_server = discord.utils.get(DISCORD_CLIENT.guilds, id=int(TARGET_SERVER_ID))
# Output goes into a sub-directory of OUTPUT_DIRECTORY named after the current local time
# (ISO format, minute resolution) passed through sanitize_name.
dated_output_directory = str(Path(OUTPUT_DIRECTORY) / Path(f"{sanitize_name(datetime.now().isoformat(timespec='minutes'))}"))
if target_server:
server_data = await process_server(target_server)
save_server_data_to_disk(output_directory=output_directory, server_data=server_data)
save_server_data_to_disk(output_directory=dated_output_directory, server_data=server_data)

# Per-student export is currently disabled (commented-out calls below).
# class_roster = ClassRosterModel.from_csv(student_identifiers_path)
# save_student_data_to_disk(output_directory=output_directory, server_data=server_data, class_roster=class_roster)
# save_student_data_to_disk(output_directory=OUTPUT_DIRECTORY)
else:
logger.error(f"Could not find server with ID: {target_server_id}")
logger.error(f"Could not find server with ID: {TARGET_SERVER_ID}")


client.run(DISCORD_DEV_BOT_TOKEN)
DISCORD_CLIENT.run(DISCORD_DEV_BOT_TOKEN)

if __name__ == "__main__":
# run this script and botto will scrape the server on startup
Expand Down
63 changes: 0 additions & 63 deletions src_python/src/models/server_data_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,69 +47,6 @@ class ServerData(BaseModel):
bot_prompt_messages: List[ContentMessage] = []
categories: Dict[str, CategoryData] = {}

def save_as_json(server_data: ServerData, output_directory: str) -> str:
    """
    Serialize `server_data` to a timestamped JSON file inside `output_directory`.

    The directory (and any missing parents) is created first. The file is named
    `[sanitized server name]_[YYYY-MM-DD_HH-MM-SS].json`.

    Returns the path of the written file as a string.
    """
    directory_path = Path(output_directory)
    directory_path.mkdir(parents=True, exist_ok=True)
    date_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # The server name is user-controlled text, so it is passed through sanitize_name
    # before it becomes part of a filename. The path is built exactly once so the file
    # that is written and the path that is returned can never drift apart.
    full_path = directory_path / f"{sanitize_name(server_data.name)}_{date_string}.json"
    # encoding='utf-8' is necessary to avoid UnicodeEncodeError
    with open(full_path, 'w', encoding='utf-8') as f:
        f.write(server_data.json())
    return str(full_path)

def save_as_markdown_directory(server_data: ServerData, output_directory: str) -> str:
    """
    Write every chat thread in `server_data` to its own markdown file.

    Creates a directory with structure like
    [output_directory]/markdown/[server]/[category]/[channel]/[category]_[channel]_thread-[thread_id].md
    where the markdown files contain the chat data, formatted like this:
    ```
    # [thread_name]
    ## [Starting ContentMessage | AI MESSAGE | HUMAN MESSAGE]
    > userid: [user_id]
    > [message_url]
    [message_content]
    ### Attachments:
    [attachments]
    ## (etc for each message in the thread)
    ...
    ```

    Returns the path of the server directory as a string.
    """
    server_directory = Path(output_directory) / "markdown" / sanitize_name(server_data.name)
    server_directory.mkdir(parents=True, exist_ok=True)
    for category_name, category_data in server_data.categories.items():
        clean_category_name = sanitize_name(category_name)
        category_directory = server_directory / clean_category_name
        category_directory.mkdir(exist_ok=True)
        for channel_name, channel_data in category_data.channels.items():
            clean_channel_name = sanitize_name(channel_name)
            channel_directory = category_directory / clean_channel_name
            channel_directory.mkdir(exist_ok=True)
            for thread_name, thread_data in channel_data.chat_threads.items():
                thread_file_name = f"{clean_category_name}_{clean_channel_name}_thread-{thread_data.id}.md"
                thread_file_path = channel_directory / thread_file_name
                # Thread keys appear to look like "name:<thread name>,id:<thread id>";
                # only the name part is used for the heading.
                clean_thread_name = thread_name.replace('name:', '').split(',id:')[0]
                with open(thread_file_path, 'w', encoding='utf-8') as f:
                    f.write(f"# {clean_thread_name}\n\n")
                    for message_number, message in enumerate(thread_data.messages):
                        if message_number == 0:
                            f.write("## Starting ContentMessage\n\n")
                        elif message.is_bot:
                            f.write("## AI MESSAGE\n\n")
                        else:
                            f.write("## HUMAN MESSAGE\n\n")
                        # Each quote line ends with a newline; without it the jump URL
                        # was glued onto the end of the userid line.
                        # NOTE(review): indentation was lost in this view — the userid line
                        # is assumed to be written for every message, not only human ones.
                        f.write(f"> userid: {message.user_id}\n")
                        f.write(f"> {message.jump_url}\n\n")
                        f.write(f"{message.content}\n\n")
                        if message.attachments:
                            f.write("### Attachments:\n\n")
                            for attachment in message.attachments:
                                f.write(f"{attachment}\n\n")
                        f.write("\n\n")
    return str(server_directory)


if __name__ == '__main__':
pickle_path = r"C:\Users\jonma\Sync\skellybot-data\2024 NEU Capstone_2024-04-07_12-45-43.pkl"
Expand Down
70 changes: 69 additions & 1 deletion src_python/src/scrape_server/save_to_disk.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,81 @@
from datetime import datetime
from pathlib import Path

from src.models.server_data_model import ServerData, save_as_markdown_directory, save_as_json
from src.models.server_data_model import ServerData
from src.models.student_info import ClassRosterModel
from src.utilities.sanitize_filename import sanitize_name

logger = logging.getLogger(__name__)


def save_as_json(server_data: ServerData, output_directory: str) -> str:
    """
    Serialize `server_data` to a timestamped JSON file inside `output_directory`.

    The directory (and any missing parents) is created first. The file is named
    `[sanitized server name]_[YYYY-MM-DD_HH-MM-SS].json`.

    Returns the path of the written file as a string.
    """
    directory_path = Path(output_directory)
    directory_path.mkdir(parents=True, exist_ok=True)
    # NOTE(review): `.json()` is the pydantic v1 spelling; with pydantic>=2 it still works
    # but emits a deprecation warning — consider `model_dump_json()` once confirmed.
    server_data_json = server_data.json()
    # `datetime.isoformat()` contains ':' characters, which are not allowed in Windows
    # filenames, so the timestamp is formatted with filesystem-safe separators instead.
    date_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    sanitized_server_name = sanitize_name(server_data.name)
    full_json_path = directory_path / f"{sanitized_server_name}_{date_string}.json"
    # encoding='utf-8' is necessary to avoid UnicodeEncodeError
    with open(full_json_path, 'w', encoding='utf-8') as f:
        f.write(server_data_json)
    return str(full_json_path)

def save_as_markdown_directory(server_data: ServerData, output_directory: str) -> str:
    """
    Write every chat thread in `server_data` to its own markdown file.

    Creates a directory with structure like
    [output_directory]/raw-markdown/[server]/[category]/[channel]/[category]__[channel]__thread-[thread_id].md
    where the markdown files contain the chat data, formatted like this:
    ```
    # [thread_name]
    ## [Starting ContentMessage | AI MESSAGE | HUMAN MESSAGE]
    > userid: [user_id]
    > [message_url]
    [message_content]
    ### Attachments:
    [attachments]
    ## (etc for each message in the thread)
    ...
    ```

    Returns the path of the server directory as a string.

    Raises:
        ValueError: if anything fails while creating directories or writing files;
            the original exception is attached as `__cause__`.
    """
    try:
        server_directory = Path(output_directory) / "raw-markdown" / sanitize_name(server_data.name)
        server_directory.mkdir(parents=True, exist_ok=True)
        for category_data in server_data.categories.values():
            clean_category_name = sanitize_name(category_data.name)
            category_directory = server_directory / clean_category_name
            category_directory.mkdir(exist_ok=True, parents=True)
            for channel_data in category_data.channels.values():
                clean_channel_name = sanitize_name(channel_data.name)
                channel_directory = category_directory / clean_channel_name
                channel_directory.mkdir(exist_ok=True, parents=True)
                for thread_key, thread_data in channel_data.chat_threads.items():
                    thread_file_name = f"{clean_category_name}__{clean_channel_name}__thread-{thread_data.id}.md"
                    thread_file_path = channel_directory / thread_file_name
                    # encoding='utf-8' is necessary to avoid UnicodeEncodeError
                    with open(thread_file_path, 'w', encoding='utf-8') as f:
                        f.write(_render_thread_markdown(thread_key, thread_data))
    except Exception as e:
        # Keep the ValueError contract for callers, but chain the root cause explicitly.
        raise ValueError(f"Error saving server data as markdown: {e}") from e
    return str(server_directory)


def _render_thread_markdown(thread_key: str, thread_data) -> str:
    """Render one chat thread (heading plus every message) as a single markdown string."""
    # Thread keys appear to look like "name:<thread name>,id:<thread id>";
    # only the name part is used for the heading.
    clean_thread_name = thread_key.replace('name:', '').split(',id:')[0]
    parts = [f"# {clean_thread_name}\n\n"]
    for message_number, message in enumerate(thread_data.messages):
        if message_number == 0:
            parts.append("## Starting ContentMessage\n\n")
        elif message.is_bot:
            parts.append("## AI MESSAGE\n\n")
        else:
            parts.append("## HUMAN MESSAGE\n\n")
        # Each quote line ends with a newline; without it the jump URL was glued onto
        # the end of the userid line.
        # NOTE(review): indentation was lost in this view — the userid line is assumed
        # to be written for every message, not only human ones.
        parts.append(f"> userid: {message.user_id}\n")
        parts.append(f"> {message.jump_url}\n\n")
        parts.append(f"{message.content}\n\n")
        if message.attachments:
            parts.append("### Attachments:\n\n")
            parts.extend(f"{attachment}\n\n" for attachment in message.attachments)
        parts.append("\n\n")
    return "".join(parts)

def save_server_data_to_disk(output_directory: str, server_data: ServerData):

json_save_path = save_as_json(server_data=server_data, output_directory=output_directory)
Expand Down
Loading

0 comments on commit 216ccfa

Please sign in to comment.