Merge pull request #253 from MLSysOps/feat/enhance-dir-preview-func

[MRG] Improve data operation functions
MLSysOps · Oct 21, 2024 · 915bd3f · 915bd3f
2 parents 3ceb69d + 9268cd1
commit 915bd3f
Show file tree

Hide file tree

Showing 5 changed files with 273 additions and 104 deletions.
diff --git a/mle/agents/coder.py b/mle/agents/coder.py
@@ -56,6 +56,8 @@ def __init__(self, model, working_dir='.', console=None, single_file=False):
         - Writing clean, efficient, and well-documented code using function `create_file` and `write_file`.
         - Exam the project to re-use the existing code snippets as much as possible, you may need to use
          functions like `list_files`, `read_file` and `write_file`.
+        - Use function `preview_zip_structure` to preview the structure of the file if the task include zip file processing.
+        - Use function `unzip_data` to extract the compressed file if the task include compressed file processing.
         - Writing the code into the file when creating new files, do not create empty files.
         - Use function `preview_csv_data` to preview the CSV data if the task include CSV data processing.
         - Decide whether the task requires execution and debugging before moving to the next or not.
@@ -73,6 +75,8 @@ def __init__(self, model, working_dir='.', console=None, single_file=False):
             - You should create a single script first, with the complete code inside. You can have multiple functions and classes.
             - Writing clean, efficient, and well-documented code to a script using functions `create_file`.
             - Use function `preview_csv_data` to preview the CSV data if the task include CSV dataset or examples.
+            - Use function `preview_zip_structure` to preview the structure of the file if the task include zip file processing.
+            - Use function `unzip_data` to extract the compressed file if the task include compressed file processing.
             - Generate the commands to run and test the current script, and the dependencies list required for this script.
             - You only write Python scripts, don't write Jupiter notebooks which require interactive execution.
             - Make sure the code has met the task description, and the suggested methods.
@@ -134,7 +138,9 @@ def __init__(self, model, working_dir='.', console=None, single_file=False):
             schema_write_file,
             schema_list_files,
             schema_create_directory,
-            schema_preview_csv_data
+            schema_preview_csv_data,
+            schema_preview_zip_structure,
+            schema_unzip_data
         ]
 
         if config_data.get('search_key'):
@@ -150,11 +156,7 @@ def read_requirement(self, advisor_report: str):
         :param advisor_report:
         :return:
         """
-        req_details = f"""
-        The overall project information:\n
-        {advisor_report}
-        """
-        self.chat_history.append({"role": "system", "content": req_details})
+        self.chat_history.append({"role": "system", "content": advisor_report})
 
     def code(self, task_dict: dict):
         """

diff --git a/mle/function/__init__.py b/mle/function/__init__.py
@@ -68,6 +68,10 @@
             'path': {
                 'type': 'string',
                 'description': 'The file system path to check and list contents from'
+            },
+            'limit': {
+                'type': 'integer',
+                'description': 'The maximum number of items to list, default is 50'
             }
         }
     }
@@ -174,6 +178,10 @@
             'command': {
                 'type': 'string',
                 'description': 'The command to execute in the system shell'
+            },
+            'max_lines': {
+                'type': 'integer',
+                'description': 'The maximum number of output lines to keep, default is 30'
             }
         }
     }
@@ -247,6 +255,60 @@
             'limit_rows': {
                 'type': 'integer',
                 'description': 'The number of rows to preview, should not be a very large number. Default is 3.'
+            },
+            'limit_columns': {
+                'type': 'integer',
+                'description': 'The number of columns to preview, should not be a very large number. Default is None.'
+            }
+        }
+    }
+}
+
+# Schema describing the `preview_zip_structure` function: its name, a usage
+# description, and the type/description of each accepted parameter.
+# No `required` list is declared here; the defaults are only stated in the
+# per-parameter descriptions (50 files, 20 directories, 1000 characters,
+# hidden entries not shown).
+schema_preview_zip_structure = {
+    'name': 'preview_zip_structure',
+    'description': 'Preview the structure of a zip file with limits on output and option to show hidden files. '
+                   'Use this function when there is a need to preview the contents of a zip file.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'zip_path': {
+                'type': 'string',
+                'description': 'The path to the zip file'
+            },
+            'max_files': {
+                'type': 'integer',
+                'description': 'The maximum number of files to display, default is 50.'
+            },
+            'max_dirs': {
+                'type': 'integer',
+                'description': 'The maximum number of directories to display, default is 20.'
+            },
+            'max_output_length': {
+                'type': 'integer',
+                'description': 'The maximum length of the output string, default is 1000.'
+            },
+            'show_hidden': {
+                'type': 'boolean',
+                'description': 'If True, show hidden files and directories (starting with a dot), default is False.'
+            }
+        }
+    }
+}
+
+# Schema describing the `unzip_data` function: its name, a usage description,
+# and the type/description of each accepted parameter.
+schema_unzip_data = {
+    'name': 'unzip_data',
+    'description': 'Unzip a compressed file, supporting various formats (.zip, .7z, .tar, .gz, .bz2, .xz) to a specified directory. '
+                   'Use this function when there is a need to extract a compressed file.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'compressed_file_path': {
+                'type': 'string',
+                'description': 'The path to the compressed file to extract'
+            },
+            'extract_path': {
+                'type': 'string',
+                'description': 'The directory to extract the compressed file to. '
+                               'If omitted, a new temporary directory is created and used.'
             }
         }
     }
@@ -267,7 +329,9 @@
     'ask_question',
     'ask_yes_no',
     'ask_choices',
-    'preview_csv_data'
+    'preview_csv_data',
+    'preview_zip_structure',
+    'unzip_data'
 ]
 
 FUNCTIONS = [
@@ -284,7 +348,9 @@
     ask_question,
     ask_yes_no,
     ask_choices,
-    preview_csv_data
+    preview_csv_data,
+    preview_zip_structure,
+    unzip_data
 ]
 
 SEARCH_FUNCTIONS = [

diff --git a/mle/function/data.py b/mle/function/data.py
@@ -1,22 +1,166 @@
+import os
+import py7zr
+import gzip
+import bz2
+import lzma
+import shutil
+import tarfile
+import zipfile
 import textwrap
+import tempfile
 import pandas as pd
 from pandas.api.types import is_numeric_dtype
 
 
-def preview_csv_data(path: str, limit_rows: int = 5) -> str:
+def unzip_data(compressed_file_path, extract_path=None):
+    """
+    Extract a compressed file into a subdirectory named after the archive.
+
+    Supported inputs: .zip, .7z, .tar, compressed tarballs (e.g. .tar.gz, .tar.bz2,
+    .tar.xz) and single files compressed with gzip/bzip2/xz (.gz, .bz2, .xz).
+    The contents end up in `<extract_path>/<file name without its last extension>`.
+
+    :param compressed_file_path: Path to the compressed file
+    :param extract_path: Directory under which the contents are extracted (created if
+        missing). If None, a new temporary directory is created; it is not removed
+        by this function.
+    :return: String with the path to the unzipped contents
+    :raises FileNotFoundError: if `compressed_file_path` does not exist.
+    :raises ValueError: if the file extension is not one of the supported formats.
+    """
+    if not os.path.exists(compressed_file_path):
+        raise FileNotFoundError(f"The file {compressed_file_path} does not exist.")
+
+    # If no extract_path is provided, create a temporary directory
+    if extract_path is None:
+        extract_path = tempfile.mkdtemp()
+        print(f"No extract path provided. Using temporary directory: {extract_path}")
+    else:
+        # Create the extraction directory if it doesn't exist
+        os.makedirs(extract_path, exist_ok=True)
+
+    file_name, file_extension = os.path.splitext(os.path.basename(compressed_file_path))
+    file_extension = file_extension.lower()
+
+    # Create a subdirectory with the name of the compressed file
+    specific_extract_path = os.path.join(extract_path, file_name)
+    os.makedirs(specific_extract_path, exist_ok=True)
+
+    # Openers used when the file is a single compressed stream rather than a tarball.
+    # A ".tar" file that is not a real tar archive is copied as-is (plain `open`).
+    stream_openers = {'.tar': open, '.gz': gzip.open, '.bz2': bz2.open, '.xz': lzma.open}
+
+    try:
+        if file_extension == '.zip':
+            with zipfile.ZipFile(compressed_file_path, 'r') as zip_ref:
+                zip_ref.extractall(specific_extract_path)
+
+        elif file_extension == '.7z':
+            with py7zr.SevenZipFile(compressed_file_path, mode='r') as z:
+                z.extractall(specific_extract_path)
+
+        elif file_extension in stream_openers:
+            # is_tarfile() sees through gzip/bz2/xz compression, so it tells a
+            # tarball (foo.tar.gz) apart from a single compressed file (foo.csv.gz).
+            if tarfile.is_tarfile(compressed_file_path):
+                with tarfile.open(compressed_file_path, 'r:*') as tar:
+                    if hasattr(tarfile, 'data_filter'):
+                        # Reject members that would escape the target directory
+                        # (absolute paths, "..", links pointing outside).
+                        tar.extractall(path=specific_extract_path, filter='data')
+                    else:
+                        # NOTE: older Python without extraction filters; members are not vetted.
+                        tar.extractall(path=specific_extract_path)
+            else:
+                # For single file compression (non-tar)
+                output_path = os.path.join(specific_extract_path, file_name)
+                with stream_openers[file_extension](compressed_file_path, 'rb') as f, \
+                        open(output_path, 'wb') as out_f:
+                    shutil.copyfileobj(f, out_f)
+
+        else:
+            raise ValueError(f"Unsupported file format: {file_extension}")
+
+        print(f"Successfully extracted {compressed_file_path} to {specific_extract_path}")
+        return specific_extract_path
+
+    except Exception as e:
+        # Report the failure on the console, then let the caller handle the exception.
+        print(f"Error extracting {compressed_file_path}: {str(e)}")
+        raise
+
+
+def preview_zip_structure(zip_path, max_files=50, max_dirs=20, max_output_length=1000, show_hidden=False):
+    """
+    Preview the structure of a zip file with limits on output and option to show hidden files.
+
+    An entry counts as hidden when any component of its path starts with a dot, so
+    hidden directories and everything stored inside them are skipped together.
+    `max_output_length` bounds the entry listing; short summary notes (truncation,
+    omitted and hidden counts) are appended after it.
+
+    :param zip_path: the path to the zip file.
+    :param max_files: maximum number of files to display.
+    :param max_dirs: maximum number of directories to display.
+    :param max_output_length: maximum length of the listed entries, in characters.
+    :param show_hidden: if True, show hidden files and directories (starting with a dot).
+    :return: the limited structure of the zip file as a string, or an "Error: ..."
+        message when the path does not exist or is not a valid zip file.
+    """
+    if not os.path.exists(zip_path):
+        return f"Error: The file '{zip_path}' does not exist."
+
+    if not zipfile.is_zipfile(zip_path):
+        return f"Error: '{zip_path}' is not a valid zip file."
+
+    structure = []
+    file_count = 0
+    dir_count = 0
+    omitted_files = 0
+    omitted_dirs = 0
+    hidden_count = 0
+    used_length = 0
+    truncated = False
+
+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        for file_info in zip_ref.infolist():
+            file_path = file_info.filename
+            # Zip member names always use "/" as separator; directory names end with
+            # "/", so inspect every component instead of only the basename.
+            is_hidden = any(part.startswith('.') for part in file_path.split('/'))
+
+            if is_hidden and not show_hidden:
+                hidden_count += 1
+                continue
+
+            is_dir = file_info.is_dir()
+            within_limit = dir_count < max_dirs if is_dir else file_count < max_files
+            entry = f"{'Directory' if is_dir else 'File'}: {file_path}"
+            # Length this entry adds to "\n".join(structure), including the separator.
+            entry_cost = len(entry) + (1 if structure else 0)
+
+            if within_limit and not truncated and used_length + entry_cost > max_output_length:
+                truncated = True
+
+            if within_limit and not truncated:
+                structure.append(entry)
+                used_length += entry_cost
+                if is_dir:
+                    dir_count += 1
+                else:
+                    file_count += 1
+            elif is_dir:
+                # Keep scanning after a limit is hit so the omitted counts are exact.
+                omitted_dirs += 1
+            else:
+                omitted_files += 1
+
+    if truncated:
+        structure.append("... (output truncated due to length)")
+    if omitted_files:
+        structure.append(f"... (and {omitted_files} more files)")
+    if omitted_dirs:
+        structure.append(f"... (and {omitted_dirs} more directories)")
+    if not show_hidden and hidden_count > 0:
+        structure.append(f"... ({hidden_count} hidden items not shown)")
+
+    return "\n".join(structure)
+
+
+def preview_csv_data(path: str, limit_rows: int = 5, limit_columns: int = None) -> str:
     """
     Preview the sample dataset from the project data path and include metadata.
-    Refer to: https://github.com/WecoAI/aideml/blob/main/aide/utils/data_preview.py
     :param path: the path to a local CSV file.
     :param limit_rows: the number of rows to preview.
+    :param limit_columns: the number of columns to preview. If None, all columns are previewed.
     :return: the sample dataset with metadata as a string.
     """
     try:
         df = pd.read_csv(path)
         num_rows, num_cols = df.shape
         summary = [f"CSV file in `{path}` has {num_rows} rows and {num_cols} columns."]
+
+        if limit_columns is not None and limit_columns < num_cols:
+            columns_to_preview = sorted(df.columns)[:limit_columns]
+            summary.append(f"Previewing {limit_columns} out of {num_cols} columns.")
+        else:
+            columns_to_preview = sorted(df.columns)
+
         summary.append("Here is some information about the columns:")
-        for col in sorted(df.columns):
+
+        for col in columns_to_preview:
             dtype = df[col].dtype
             name = f"{col} ({dtype})"
             nan_count = df[col].isnull().sum()
@@ -33,6 +177,7 @@ def preview_csv_data(path: str, limit_rows: int = 5) -> str:
                 unique_count = df[col].nunique()
                 example_values = df[col].value_counts().head(limit_rows).index.tolist()
                 summary.append(f"{name} has {unique_count} unique values. Some example values: {example_values}")
+
         return textwrap.dedent("\n".join(summary)).strip()
     except Exception as e:
-        return f"cannot read csv data: {e}"
+        return f"Cannot read CSV data: {e}"
diff --git a/mle/function/execution.py b/mle/function/execution.py
@@ -3,27 +3,38 @@
 """
 
 import subprocess
+from collections import deque
 
 
-def execute_command(command: str):
+def execute_command(command: str, max_lines: int = 30):
     """
-    Run multiple commands in the shell and return the outputs, errors, and exit statuses.
+    Run a command in the shell and return the outputs, errors, and exit status,
+    limiting the output to a specified number of most recent lines.
+
+    stderr is merged into stdout, and every line is echoed to the console as it arrives.
+
     Args:
-        command: the list of input commands to run.
+        command (str): The input command to run.
+        max_lines (int): Maximum number of output lines to keep. Defaults to 30.
 
-    Return: a string of the output, error (if any), and exit status for the command.
+    Return: A string of the exit status and the limited output (most recent lines),
+        or an "Error running command: ..." message if the command could not be run.
     """
     try:
         process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
-        output = ''
-        while True:
-            line = process.stdout.readline()
-            if not line and process.poll() is not None:
-                break
-            output += line
+        output_buffer = deque(maxlen=max_lines)
+        total_lines = 0
+
+        # Iterate until EOF: readline() returns '' only at EOF, so polling it in a
+        # `while True` loop would spin and push empty strings into the bounded
+        # buffer (evicting real output) if stdout closes before the process exits.
+        for line in process.stdout:
+            output_buffer.append(line.rstrip())
+            total_lines += 1
             print(line, end='')
 
         exit_code = process.wait()
-        return f"Exit code: {exit_code} \nOutput: \n{output}"
+
+        limited_output = "\n".join(output_buffer)
+        # Only label the output as truncated when lines were actually dropped.
+        if max_lines is not None and total_lines > max_lines:
+            return f"Exit code: {exit_code}\nOutput (last {max_lines} lines):\n{limited_output}"
+        return f"Exit code: {exit_code}\nOutput:\n{limited_output}"
+
     except Exception as e:
         return f"Error running command: {str(e)}"