Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Transform for Code Profiling #646

Merged
merged 16 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
124 changes: 124 additions & 0 deletions .github/workflows/test-code-code_profiler.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#
# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files
#
name: Test - transforms/code/code_profiler

on:
workflow_dispatch:
push:
branches:
- "dev"
- "releases/**"
tags:
- "*"
paths:
- "transforms/code/code_profiler/**"
- "data-processing-lib/**"
- "!transforms/code/code_profiler/**/kfp_ray/**" # This is/will be tested in separate workflow
- "!data-processing-lib/**/test/**"
- "!data-processing-lib/**/test-data/**"
- "!**.md"
- "!**/doc/**"
- "!**/images/**"
- "!**.gitignore"
pull_request:
branches:
- "dev"
- "releases/**"
paths:
- "transforms/code/code_profiler/**"
- "data-processing-lib/**"
- "!transforms/code/code_profiler/**/kfp_ray/**" # This is/will be tested in separate workflow
- "!data-processing-lib/**/test/**"
- "!data-processing-lib/**/test-data/**"
- "!**.md"
- "!**/doc/**"
- "!**/images/**"
- "!**.gitignore"

jobs:
check_if_push_image:
# Check whether the Docker images should be pushed to the remote repository.
# The result is exposed to dependent jobs as the job output `publish_images` ('true' or 'false').
# Images are pushed when either of the following holds, and only if GITHUB_REPOSITORY is IBM/data-prep-kit:
#   - the ref is refs/heads/dev and the triggering event is not a pull_request (i.e. a merge/push to dev), or
#   - the ref is a tag (refs/tags/*); tags are created as part of the release process.
# The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file.
runs-on: ubuntu-22.04
outputs:
publish_images: ${{ steps.version.outputs.publish_images }}
steps:
# Starts from 'false', flips to 'true' if one of the two conditions matches, then writes the value to $GITHUB_OUTPUT.
- id: version
run: |
publish_images='false'
if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
then
publish_images='true'
fi
if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
then
publish_images='true'
fi
echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT"
test-src:
# Runs the transform's `test-src` make target on a GitHub-hosted runner.
# If transforms/code/code_profiler/Makefile does not exist, the job only prints a message and succeeds.
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free up space in github runner
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
# `df -h` is printed before and after the cleanup so the reclaimed space is visible in the log.
run: |
df -h
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
df -h
- name: Test transform source in transforms/code/code_profiler
# DOCKER=docker is passed to make as a command-line variable.
run: |
if [ -e "transforms/code/code_profiler/Makefile" ]; then
make -C transforms/code/code_profiler DOCKER=docker test-src
else
echo "transforms/code/code_profiler/Makefile not found - source testing disabled for this transform."
fi
test-image:
# Runs the transform's `test-image` make target and, when check_if_push_image says so, publishes the images.
# Depends on check_if_push_image only for its `publish_images` output (used by the last step).
needs: [check_if_push_image]
runs-on: ubuntu-22.04
timeout-minutes: 120
env:
# Registry credentials come from repository secrets and are exported to every step of this job.
DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free up space in github runner
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
# Note: /opt/ghc is listed twice in the script below; the repetition is harmless with `rm -rf`.
run: |
df -h
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
df -h
- name: Test transform image in transforms/code/code_profiler
# When the transform has a spark/ subdirectory, the data-processing-lib/spark image is built first.
run: |
if [ -e "transforms/code/code_profiler/Makefile" ]; then
if [ -d "transforms/code/code_profiler/spark" ]; then
make -C data-processing-lib/spark DOCKER=docker image
fi
make -C transforms/code/code_profiler DOCKER=docker test-image
else
echo "transforms/code/code_profiler/Makefile not found - testing disabled for this transform."
fi
- name: Print space
# Diagnostic only: report the remaining disk space and the Docker images present after the test step.
run: |
df -h
docker images
- name: Publish images
# Runs only when the check_if_push_image job set publish_images to 'true'.
if: needs.check_if_push_image.outputs.publish_images == 'true'
run: |
if [ -e "transforms/code/code_profiler/Makefile" ]; then
make -C transforms/code/code_profiler publish
else
echo "transforms/code/code_profiler/Makefile not found - publishing disabled for this transform."
fi
53 changes: 53 additions & 0 deletions transforms/code/code_profiler/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
REPOROOT=../../..
# Use make help, to see the available rules
include $(REPOROOT)/.make.defaults

# Every target below is a command that fans out to the sub-directories via
# the .recurse rule (expected to come from the included .make.defaults);
# none of them names a file to build. Declaring them phony keeps a stray
# file or directory called e.g. "build" or "test" from shadowing the target.
.PHONY: setup clean build venv publish test-image test test-src set-versions

setup::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

clean::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

build::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

venv::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

publish::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

# Image testing is deliberately stubbed out for this transform: the target
# exists so callers (e.g. the CI workflow) can invoke it, but it only prints
# a message and does not recurse.
test-image:
	@echo "Skipping test-image step as per configuration."

test::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test-src::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

set-versions:
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

# The workflow-* targets are implemented in the kfp_ray sub-directory.
# One multi-target rule forwards whichever of them was requested, under the
# same name ($@), to that directory's Makefile.
.PHONY: workflow-venv workflow-test workflow-upload workflow-build
workflow-venv workflow-test workflow-upload workflow-build:
	$(MAKE) -C kfp_ray $@
63 changes: 63 additions & 0 deletions transforms/code/code_profiler/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Code Profiler Transform

This module extracts the base syntactic concepts from multi-language source code and represents these concepts in a unified, language-agnostic representation that can be further used for multi-language data profiling. While programming languages expose similar syntactic building blocks to represent programming intent, such as importing packages/libraries, functions, classes, loops, conditionals, comments and others, these concepts are expressed through language-specific grammar, defined by distinct keywords and syntactic forms. Our framework abstracts language-specific concepts by transforming them into a unified, language-agnostic representation called universal base syntactic representation (UBSR), referred to as a concept, which is consistently encoded within the proposed schema structure. The current version supports the base syntactic concepts for importing/including packages/libraries, comments, and functions.

Table 1 outlines the fields of the UBSR, which maps AST nodes to a structured schema. This schema captures syntactic nodes (based on AST node types) and the relationships between those nodes (derived from AST edges). The UBSR framework currently supports 21 languages, grouped according to their syntactic paradigms.

**Table 1: UBSR Schema Representation**


| **Key** | **Possible Values** | **Description** |
|-----------------------|----------------------------------------------------|----------------------------------------------------------------|
| **"nodes":** | | |
| `"id"` | Integer (e.g., `0`, `1`) | Unique identifier of the node. |
| `"code_snippet"` | String (e.g., `"ubsr_package math"`) | A snippet of code or a description of the node. |
| `"node_type"` | String (e.g., `"ubsr_root"`, `"ubsr_package"`, etc.)| Type of node representing various syntactic concepts. |
| `"parents"` | Array of Integers (e.g., `[1, 2]`) | List of parent node IDs. |
| `"children"` | Array of Integers (e.g., `[1, 2]`) | List of child node IDs. |
| **"metadata" (within nodes):** | | |
| `"info"` | String | General information about the node. |
| `"language"` | String (`"cpp"`, `"python"`, etc.) | Programming language of the node. |
| `"original_code"` | String (e.g., `"int main() {...}"`) | Original code snippet corresponding to the node. |
| `"loc_original_code"` | Integer | Line of code of the concept. |
| **"edges":** | | |
| `"directed_relation"` | String (`"parent_node"`) | Type of relationship between nodes e.g. parent-child. |
| `"metadata"` | Object | Additional metadata for the edge, which can be empty. |


As shown in Table 2, the framework standardizes code representation by categorizing languages within these paradigms for 21 languages. In cases where certain concepts are absent in a language, they are marked as NA in the table. The base syntactic concepts extracted from the UBSR derived from code can be used to derive syntactic and semantic insights of the code data.

**Table 2: Base Syntactic Concepts Supported by the UBSR across Different Syntactical Paradigms**

| **Syntactical Paradigms** | **Languages Supported (Known\*)** | **Package** | **Function** | **Comment** |
|----------------------------------------------------|---------------------------------------------------------------------------------------------------|-------------|--------------|-------------|
| **C-like Syntax** | **C\***, **Java\***, **C#**, **CPP**, **Objective C**, **Rust**, **Golang**, Kotlin | Yes | Yes | Yes |
| **Scripting and Dynamic Syntax** | **Python\***, **JavaScript\***, **Dart**, **Typescript** | Yes | Yes | Yes |
| | QML | Yes | NA | Yes |
| | **Perl** | Yes | Yes | NA |
| **Functional and Expression-Oriented Syntax** | **Haskell\***, Elm\*, Agda, **D**, **Nim**, **Scala** | Yes | Yes | Yes |
| | **Ocaml** | Yes | NA | Yes |


* [python](python/README.md) - provides the base python-based syntactic concept extractor
implementation.
* [ray](ray/README.md) - provides the base ray-based syntactic concept extractor
implementation.



**Offline Path for Syntactic Rule Generation**

The offline path is critical for expanding and refining the syntactic rule database, enabling the UBSR framework to adapt to new languages and syntactic constructs. This process leverages LLMs to generate syntactic rules for languages that are not yet included in the rule database. To achieve this, we utilize a Few-shot Chain of Thought prompting technique, guiding the LLM through a step-by-step rule generation process. By providing carefully curated training exemplars and detailed instructions, this method ensures the LLM can accurately generalize from these examples to produce effective syntactic rules for a wide range of languages. This structured approach enhances the flexibility of the UBSR framework, allowing it to seamlessly handle evolving language constructs.

The implementation of the UI-based offline customization tool is available [here](python/src/offline-customizations). To run the tool, use the following command from that directory:

`streamlit run LLM_runner_app.py`

The high-level system design is as follows:

![White Background Image](sys-overview.png)

For each new target language, the offline phase is utilized to create deterministic rules by harnessing the capabilities of LLMs and working with exemplar code samples from the target language. In this process, Workflow W1 facilitates the creation of rules around syntactic structures based on exemplar code samples, while Workflow W2 is used to establish semantic dimensions for profiling. Subsequently, we derive rules that connect syntactic constructs to the predefined semantic concepts. These rules are then stored in a rule database, ready to be employed during the online phase.

In the online phase, the system dynamically generates profiling outputs for any incoming code snippets. This is achieved by extracting concepts from the snippets using the rules in the database and storing these extractions in a tabular format. The structured tabular format allows for generating additional concept columns, which are then utilized to create comprehensive profiling reports.
5 changes: 5 additions & 0 deletions transforms/code/code_profiler/input/data_profiler_params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"input": "multi-package.parquet",
"contents": "Contents",
"language": "Language"
}
Binary file not shown.
1,193 changes: 1,193 additions & 0 deletions transforms/code/code_profiler/notebook_example/code-profiler.ipynb

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions transforms/code/code_profiler/python/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
venv/
37 changes: 37 additions & 0 deletions transforms/code/code_profiler/python/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
test-data/output
output/*
/output/
data-processing-lib/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class


# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
htmlcov
.coverage
.cache
nosetests.xml
coverage.xml
45 changes: 45 additions & 0 deletions transforms/code/code_profiler/python/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Image for the pure-Python runtime of the code_profiler transform.
FROM docker.io/python:3.10.14-slim-bullseye

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/
RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

# Copy the transform's own project files and install it in editable mode.
COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root README.md README.md
COPY --chown=dpk:root requirements.txt requirements.txt

RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/code_profiler_transform_python.py .

# copy some of the samples in
COPY ./src/code_profiler_local.py local/

# Copy the tree-sitter bindings (this is the important part)
# Ownership matches every other COPY in this file: dpk is the only user this
# Dockerfile creates, so a --chown naming a "ray" user cannot be resolved here.
# NOTE(review): the source path climbs out of the build context ("../../");
# confirm that the image build stages input/tree-sitter-bindings so it resolves.
COPY --chown=dpk:root ../../input/tree-sitter-bindings/ /home/dpk/input/tree-sitter-bindings/

# copy test
# COPY test/ test/
# COPY test-data/ test-data/

# Set environment
# key=value form; the space-separated "ENV key value" form is legacy syntax.
ENV PYTHONPATH=/home/dpk

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
55 changes: 55 additions & 0 deletions transforms/code/code_profiler/python/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Define the root of the local git clone for the common rules to be able
# know where they are running from.
REPOROOT=../../../..
# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

# Include the common configuration for this transform
include ../transform.config

# values possible mach-arm64, x86_64
export RUNTIME_HOST_ARCH=x86_64

# Targets in this file that carry their own recipe are commands, not files.
.PHONY: set-versions test-image run-local-python-sample

# The targets below delegate to the shared .transforms.* / .defaults.* rules
# pulled in by the include above.
venv::	.transforms.python-venv

test:: .transforms.python-test

clean:: .transforms.clean

image:: .transforms.python-image

test-src:: .transforms.test-src

setup:: .transforms.setup

build:: build-dist image

publish: publish-image

publish-image:: .transforms.publish-image-python

# Re-invoke make so the transform and TOML versions are both set from
# CODE_PROFILER_PYTHON_VERSION (expected to come from ../transform.config).
set-versions:
	$(MAKE) TRANSFORM_PYTHON_VERSION=$(CODE_PROFILER_PYTHON_VERSION) TOML_VERSION=$(CODE_PROFILER_PYTHON_VERSION) .transforms.set-versions

build-dist:: .defaults.build-dist

publish-dist:: .defaults.publish-dist

# Image testing is deliberately stubbed out for this transform; the target
# only prints a message.
test-image:
	@echo "Skipping test-image step as per configuration."

# Ensure RUN_ARGS has a default value: no extra arguments.
# Quotes are not syntax in make, so `RUN_ARGS ?= ""` would store the literal
# two characters `""` instead of an empty value.
RUN_ARGS ?=

# run-cli-sample: .transforms.run-cli-python-sample

run-local-sample: .transforms.run-local-sample

# Run the local Python sample with the column names for content and language.
run-local-python-sample:
	$(MAKE) RUN_FILE=code_profiler_local_python.py \
		RUN_ARGS="--content 'Contents' --language 'Language'" \
		.transforms.run-local-python-sample
Loading