Skip to content

Commit

Permalink
Merge pull request #253 from MLSysOps/feat/enhance-dir-preview-func
Browse files Browse the repository at this point in the history
[MRG] Improve data operation functions
  • Loading branch information
huangyz0918 authored Oct 21, 2024
2 parents 3ceb69d + 9268cd1 commit 915bd3f
Show file tree
Hide file tree
Showing 5 changed files with 273 additions and 104 deletions.
14 changes: 8 additions & 6 deletions mle/agents/coder.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def __init__(self, model, working_dir='.', console=None, single_file=False):
- Writing clean, efficient, and well-documented code using function `create_file` and `write_file`.
- Exam the project to re-use the existing code snippets as much as possible, you may need to use
functions like `list_files`, `read_file` and `write_file`.
- Use function `preview_zip_structure` to preview the structure of the file if the task include zip file processing.
- Use function `unzip_data` to extract the compressed file if the task include compressed file processing.
- Writing the code into the file when creating new files, do not create empty files.
- Use function `preview_csv_data` to preview the CSV data if the task include CSV data processing.
- Decide whether the task requires execution and debugging before moving to the next or not.
Expand All @@ -73,6 +75,8 @@ def __init__(self, model, working_dir='.', console=None, single_file=False):
- You should create a single script first, with the complete code inside. You can have multiple functions and classes.
- Writing clean, efficient, and well-documented code to a script using functions `create_file`.
- Use function `preview_csv_data` to preview the CSV data if the task include CSV dataset or examples.
- Use function `preview_zip_structure` to preview the structure of the file if the task include zip file processing.
- Use function `unzip_data` to extract the compressed file if the task include compressed file processing.
- Generate the commands to run and test the current script, and the dependencies list required for this script.
- You only write Python scripts, don't write Jupiter notebooks which require interactive execution.
- Make sure the code has met the task description, and the suggested methods.
Expand Down Expand Up @@ -134,7 +138,9 @@ def __init__(self, model, working_dir='.', console=None, single_file=False):
schema_write_file,
schema_list_files,
schema_create_directory,
schema_preview_csv_data
schema_preview_csv_data,
schema_preview_zip_structure,
schema_unzip_data
]

if config_data.get('search_key'):
Expand All @@ -150,11 +156,7 @@ def read_requirement(self, advisor_report: str):
:param advisor_report:
:return:
"""
req_details = f"""
The overall project information:\n
{advisor_report}
"""
self.chat_history.append({"role": "system", "content": req_details})
self.chat_history.append({"role": "system", "content": advisor_report})

def code(self, task_dict: dict):
"""
Expand Down
70 changes: 68 additions & 2 deletions mle/function/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@
'path': {
'type': 'string',
'description': 'The file system path to check and list contents from'
},
'limit': {
'type': 'integer',
'description': 'The maximum number of items to list, default is 50'
}
}
}
Expand Down Expand Up @@ -174,6 +178,10 @@
'command': {
'type': 'string',
'description': 'The command to execute in the system shell'
},
'max_lines': {
'type': 'integer',
'description': 'The maximum number of output lines to keep, default is 30'
}
}
}
Expand Down Expand Up @@ -247,6 +255,60 @@
'limit_rows': {
'type': 'integer',
'description': 'The number of rows to preview, should not be a very large number. Default is 3.'
},
'limit_columns': {
'type': 'integer',
'description': 'The number of columns to preview, should not be a very large number. Default is None.'
}
}
}
}

# Function-calling schema describing `preview_zip_structure` and its arguments.
schema_preview_zip_structure = {
    "name": "preview_zip_structure",
    "description": (
        "Preview the structure of a zip file with limits on output and option to show hidden files. "
        "Use this function when there is a need to preview the contents of a zip file."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            # Location of the archive to inspect.
            "zip_path": {
                "type": "string",
                "description": "The path to the zip file",
            },
            # Caps on how many entries of each kind are listed.
            "max_files": {
                "type": "integer",
                "description": "The maximum number of files to display, default is 50.",
            },
            "max_dirs": {
                "type": "integer",
                "description": "The maximum number of directories to display, default is 20.",
            },
            # Cap on the size of the returned text.
            "max_output_length": {
                "type": "integer",
                "description": "The maximum length of the output string, default is 1000.",
            },
            # Whether dot-prefixed entries are included.
            "show_hidden": {
                "type": "boolean",
                "description": "If True, show hidden files and directories (starting with a dot), default is False.",
            },
        },
    },
}

schema_unzip_data = {
'name': 'unzip_data',
'description': 'Unzip a compressed file, supporting various formats (.zip, .7z, .tar, .gz, .bz2, .xz) to a specified directory. '
'Use this function when there is a need to extract a compressed file.',
'parameters': {
'type': 'object',
'properties': {
'compressed_file_path': {
'type': 'string',
'description': 'The path to the compressed file to extract'
},
'extract_path': {
'type': 'string',
'description': 'The directory to extract the compressed file to, default is the current directory'
}
}
}
Expand All @@ -267,7 +329,9 @@
'ask_question',
'ask_yes_no',
'ask_choices',
'preview_csv_data'
'preview_csv_data',
'preview_zip_structure',
'unzip_data'
]

FUNCTIONS = [
Expand All @@ -284,7 +348,9 @@
ask_question,
ask_yes_no,
ask_choices,
preview_csv_data
preview_csv_data,
preview_zip_structure,
unzip_data
]

SEARCH_FUNCTIONS = [
Expand Down
153 changes: 149 additions & 4 deletions mle/function/data.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,166 @@
import os
import py7zr
import gzip
import bz2
import lzma
import shutil
import tarfile
import zipfile
import textwrap
import tempfile
import pandas as pd
from pandas.api.types import is_numeric_dtype


def preview_csv_data(path: str, limit_rows: int = 5) -> str:
def unzip_data(compressed_file_path, extract_path=None):
    """
    Unzip a compressed file, supporting various formats (.zip, .7z, .tar, .gz, .bz2, .xz).
    The contents are placed in a subdirectory of `extract_path` named after the
    compressed file (its name without the last extension).
    Tar archives are detected by content, so `.tar.gz`, `.tar.bz2` and `.tar.xz` are
    unpacked as archives, while a plain `.gz`, `.bz2` or `.xz` file is decompressed
    into a single file inside that subdirectory.
    If no extract_path is provided, it creates a temporary directory.
    :param compressed_file_path: Path to the compressed file
    :param extract_path: Path where the contents will be extracted. If None, a temp directory is used.
    :return: String with the path to the unzipped contents
    :raises FileNotFoundError: if `compressed_file_path` does not exist.
    :raises ValueError: if the file extension is not one of the supported formats.
    """
    if not os.path.exists(compressed_file_path):
        raise FileNotFoundError(f"The file {compressed_file_path} does not exist.")

    file_name, file_extension = os.path.splitext(os.path.basename(compressed_file_path))
    file_extension = file_extension.lower()

    # Openers used when the input is a single compressed file rather than a tar archive.
    stream_openers = {
        '.tar': open,
        '.gz': gzip.open,
        '.bz2': bz2.open,
        '.xz': lzma.open,
    }

    # Reject unsupported formats up front so no stray directories are left behind.
    if file_extension not in ('.zip', '.7z') and file_extension not in stream_openers:
        raise ValueError(f"Unsupported file format: {file_extension}")

    # If no extract_path is provided, create a temporary directory
    if extract_path is None:
        extract_path = tempfile.mkdtemp()
        print(f"No extract path provided. Using temporary directory: {extract_path}")
    else:
        # Create the extraction directory if it doesn't exist
        os.makedirs(extract_path, exist_ok=True)

    # Create a subdirectory with the name of the compressed file
    specific_extract_path = os.path.join(extract_path, file_name)
    os.makedirs(specific_extract_path, exist_ok=True)

    try:
        if file_extension == '.zip':
            with zipfile.ZipFile(compressed_file_path, 'r') as zip_ref:
                zip_ref.extractall(specific_extract_path)

        elif file_extension == '.7z':
            with py7zr.SevenZipFile(compressed_file_path, mode='r') as z:
                z.extractall(specific_extract_path)

        elif tarfile.is_tarfile(compressed_file_path):
            # tarfile detects gzip/bz2/xz compression transparently in read mode.
            with tarfile.open(compressed_file_path) as tar:
                if hasattr(tarfile, 'data_filter'):
                    # Refuse members that would escape the destination directory.
                    tar.extractall(path=specific_extract_path, filter='data')
                else:
                    tar.extractall(path=specific_extract_path)

        else:
            # For single file compression (non-tar): stream the decompressed
            # bytes into a file named after the input without its extension.
            output_path = os.path.join(specific_extract_path, file_name)
            open_func = stream_openers[file_extension]
            with open_func(compressed_file_path, 'rb') as f, open(output_path, 'wb') as out_f:
                shutil.copyfileobj(f, out_f)

        print(f"Successfully extracted {compressed_file_path} to {specific_extract_path}")
        return specific_extract_path

    except Exception as e:
        print(f"Error extracting {compressed_file_path}: {str(e)}")
        raise


def preview_zip_structure(zip_path, max_files=50, max_dirs=20, max_output_length=1000, show_hidden=False):
    """
    Preview the structure of a zip file with limits on output and option to show hidden files.
    Entries are listed in archive order as `File: <name>` or `Directory: <name>` lines,
    followed by short notes describing anything that was left out.
    :param zip_path: the path to the zip file.
    :param max_files: maximum number of files to display.
    :param max_dirs: maximum number of directories to display.
    :param max_output_length: maximum length of the entry listing; the trailing
        notes about omitted entries are appended on top of this limit.
    :param show_hidden: if True, show hidden files and directories (any path
        component starting with a dot).
    :return: the limited structure of the zip file as a string, or an error
        message if the path does not exist or is not a zip file.
    """
    if not os.path.exists(zip_path):
        return f"Error: The file '{zip_path}' does not exist."

    if not zipfile.is_zipfile(zip_path):
        return f"Error: '{zip_path}' is not a valid zip file."

    structure = []
    listing_length = 0
    truncated = False
    files_total = 0
    files_shown = 0
    dirs_total = 0
    dirs_shown = 0
    hidden_count = 0

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            file_path = file_info.filename

            # An entry is hidden when any component of its path is dot-prefixed,
            # so `.git/` and `.git/config` are both treated as hidden.
            parts = [part for part in file_path.split('/') if part not in ('', '.', '..')]
            if not show_hidden and any(part.startswith('.') for part in parts):
                hidden_count += 1
                continue

            # Keep counting every entry so the "more ..." notes are accurate,
            # even after a display limit has been reached.
            is_dir = file_info.is_dir()
            if is_dir:
                dirs_total += 1
                if dirs_shown >= max_dirs:
                    continue
                line = f"Directory: {file_path}"
            else:
                files_total += 1
                if files_shown >= max_files:
                    continue
                line = f"File: {file_path}"

            if truncated:
                continue

            # Account for the newline that joins this line to the previous one.
            added_length = len(line) + (1 if structure else 0)
            if listing_length + added_length > max_output_length:
                truncated = True
                continue

            structure.append(line)
            listing_length += added_length
            if is_dir:
                dirs_shown += 1
            else:
                files_shown += 1

    if truncated:
        structure.append("... (output truncated due to length)")
    if files_total > files_shown:
        structure.append(f"... (and {files_total - files_shown} more files)")
    if dirs_total > dirs_shown:
        structure.append(f"... (and {dirs_total - dirs_shown} more directories)")
    if hidden_count > 0:
        structure.append(f"... ({hidden_count} hidden items not shown)")

    return "\n".join(structure)


def preview_csv_data(path: str, limit_rows: int = 5, limit_columns: int = None) -> str:
"""
Preview the sample dataset from the project data path and include metadata.
Refer to: https://github.com/WecoAI/aideml/blob/main/aide/utils/data_preview.py
:param path: the path to a local CSV file.
:param limit_rows: the number of rows to preview.
:param limit_columns: the number of columns to preview. If None, all columns are previewed.
:return: the sample dataset with metadata as a string.
"""
try:
df = pd.read_csv(path)
num_rows, num_cols = df.shape
summary = [f"CSV file in `{path}` has {num_rows} rows and {num_cols} columns."]

if limit_columns is not None and limit_columns < num_cols:
columns_to_preview = sorted(df.columns)[:limit_columns]
summary.append(f"Previewing {limit_columns} out of {num_cols} columns.")
else:
columns_to_preview = sorted(df.columns)

summary.append("Here is some information about the columns:")
for col in sorted(df.columns):

for col in columns_to_preview:
dtype = df[col].dtype
name = f"{col} ({dtype})"
nan_count = df[col].isnull().sum()
Expand All @@ -33,6 +177,7 @@ def preview_csv_data(path: str, limit_rows: int = 5) -> str:
unique_count = df[col].nunique()
example_values = df[col].value_counts().head(limit_rows).index.tolist()
summary.append(f"{name} has {unique_count} unique values. Some example values: {example_values}")

return textwrap.dedent("\n".join(summary)).strip()
except Exception as e:
return f"cannot read csv data: {e}"
return f"Cannot read CSV data: {e}"
25 changes: 18 additions & 7 deletions mle/function/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,38 @@
"""

import subprocess
from collections import deque


def execute_command(command: str, max_lines: int = 30):
    """
    Run a command in the shell and return the outputs, errors, and exit status,
    limiting the output to a specified number of most recent lines.
    Each output line is also echoed to stdout as it is produced; stderr is
    merged into the same stream.
    Args:
        command (str): The input command to run.
        max_lines (int): Maximum number of output lines to keep. Defaults to 30.
    Return: A string of the exit status and the limited output (most recent lines),
        or an error message if the command could not be run.
    """
    try:
        output_buffer = deque(maxlen=max_lines)
        total_lines = 0

        # NOTE: shell=True runs `command` through the system shell as-is;
        # callers must only pass commands they trust.
        with subprocess.Popen(
            command,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        ) as process:
            # Iterating the pipe stops at EOF, so an empty read never gets
            # recorded as a blank output line while the process is exiting.
            for line in process.stdout:
                output_buffer.append(line.rstrip())
                print(line, end='')
                total_lines += 1

            exit_code = process.wait()

        limited_output = "\n".join(output_buffer)
        # Only label the output as limited when lines were actually dropped.
        if total_lines > len(output_buffer):
            return f"Exit code: {exit_code}\nOutput (last {max_lines} lines):\n{limited_output}"
        else:
            return f"Exit code: {exit_code}\nOutput:\n{limited_output}"

    except Exception as e:
        return f"Error running command: {str(e)}"
Loading

0 comments on commit 915bd3f

Please sign in to comment.