diff --git a/.github/workflows/publish_to_pypi.yml b/.github/workflows/publish_to_pypi.yml index 9495113..e7fdaa9 100644 --- a/.github/workflows/publish_to_pypi.yml +++ b/.github/workflows/publish_to_pypi.yml @@ -1,159 +1,159 @@ -name: CI - -on: - push: - branches: - - main - - master - tags: - - '*' - pull_request: - workflow_dispatch: - -concurrency: - - group: ${{ github.workflow }}-${{ github.ref }} - - cancel-in-progress: true - -permissions: - contents: read - -# Make sure CI fails on all warnings, including Clippy lints -env: - RUSTFLAGS: "-Dwarnings" - -jobs: - linux_tests: - runs-on: ubuntu-latest - strategy: - matrix: - target: [x86_64] - python-version: ["3.8", "3.9", "3.10", "3.11"] - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - - python-version: ${{ matrix.python-version }} - - - - name: Set up Rust - run: rustup show - - uses: mozilla-actions/sccache-action@v0.0.3 - - run: make venv - - run: make pre-commit - - run: make install - - run: make test - - linux: - runs-on: ubuntu-latest - strategy: - matrix: - target: [x86_64, x86] - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.target }} - - args: --release --out dist --find-interpreter - sccache: 'true' - manylinux: auto - - - name: Upload wheels - uses: actions/upload-artifact@v3 - with: - name: wheels - path: dist - windows: - runs-on: windows-latest - strategy: - matrix: - target: [x64] - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - architecture: ${{ matrix.target }} - - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - - target: ${{ matrix.target }} - - args: --release --out dist --find-interpreter - sccache: 'true' - - - name: Upload wheels - uses: actions/upload-artifact@v3 - with: - name: wheels - path: dist - - macos: - runs-on: macos-latest - 
strategy: - matrix: - target: [x86_64, aarch64] - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - - target: ${{ matrix.target }} - - args: --release --out dist --find-interpreter - sccache: 'true' - - name: Upload wheels - uses: actions/upload-artifact@v3 - with: - name: wheels - path: dist - env: - RUSTFLAGS: "-Clink-arg=-undefined -Clink-arg=dynamic_lookup" - - sdist: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Build sdist - uses: PyO3/maturin-action@v1 - with: - command: sdist - args: --out dist - - name: Upload sdist - uses: actions/upload-artifact@v3 - with: - name: wheels - path: dist - - release: - name: Release - if: "startsWith(github.ref, 'refs/tags/')" - needs: [linux, windows, macos, sdist] - runs-on: ubuntu-latest - environment: pypi - permissions: - id-token: write # IMPORTANT: mandatory for trusted publishing - steps: - - uses: actions/download-artifact@v3 - with: - name: wheels - - name: Publish to PyPI - uses: PyO3/maturin-action@v1 - with: - command: upload - args: --non-interactive --skip-existing * - +#name: CI +# +#on: +# push: +# branches: +# - main +# - master +# tags: +# - '*' +# pull_request: +# workflow_dispatch: +# +#concurrency: +# +# group: ${{ github.workflow }}-${{ github.ref }} +# +# cancel-in-progress: true +# +#permissions: +# contents: read +# +## Make sure CI fails on all warnings, including Clippy lints +#env: +# RUSTFLAGS: "-Dwarnings" +# +#jobs: +# linux_tests: +# runs-on: ubuntu-latest +# strategy: +# matrix: +# target: [x86_64] +# python-version: ["3.8", "3.9", "3.10", "3.11"] +# steps: +# - uses: actions/checkout@v3 +# - uses: actions/setup-python@v4 +# with: +# +# python-version: ${{ matrix.python-version }} +# +# +# - name: Set up Rust +# run: rustup show +# - uses: mozilla-actions/sccache-action@v0.0.3 +# - run: make venv +# - run: make pre-commit +# - run: make install +# 
- run: make test +# +# linux: +# runs-on: ubuntu-latest +# strategy: +# matrix: +# target: [x86_64, x86] +# steps: +# - uses: actions/checkout@v3 +# - uses: actions/setup-python@v4 +# with: +# python-version: '3.10' +# - name: Build wheels +# uses: PyO3/maturin-action@v1 +# with: +# target: ${{ matrix.target }} +# +# args: --release --out dist --find-interpreter +# sccache: 'true' +# manylinux: auto +# +# - name: Upload wheels +# uses: actions/upload-artifact@v3 +# with: +# name: wheels +# path: dist +# windows: +# runs-on: windows-latest +# strategy: +# matrix: +# target: [x64] +# steps: +# - uses: actions/checkout@v3 +# - uses: actions/setup-python@v4 +# with: +# python-version: '3.10' +# +# architecture: ${{ matrix.target }} +# +# - name: Build wheels +# uses: PyO3/maturin-action@v1 +# with: +# +# target: ${{ matrix.target }} +# +# args: --release --out dist --find-interpreter +# sccache: 'true' +# +# - name: Upload wheels +# uses: actions/upload-artifact@v3 +# with: +# name: wheels +# path: dist +# +# macos: +# runs-on: macos-latest +# strategy: +# matrix: +# target: [x86_64, aarch64] +# steps: +# - uses: actions/checkout@v3 +# - uses: actions/setup-python@v4 +# with: +# python-version: '3.10' +# - name: Build wheels +# uses: PyO3/maturin-action@v1 +# with: +# +# target: ${{ matrix.target }} +# +# args: --release --out dist --find-interpreter +# sccache: 'true' +# - name: Upload wheels +# uses: actions/upload-artifact@v3 +# with: +# name: wheels +# path: dist +# env: +# RUSTFLAGS: "-Clink-arg=-undefined -Clink-arg=dynamic_lookup" +# +# sdist: +# runs-on: ubuntu-latest +# steps: +# - uses: actions/checkout@v3 +# - name: Build sdist +# uses: PyO3/maturin-action@v1 +# with: +# command: sdist +# args: --out dist +# - name: Upload sdist +# uses: actions/upload-artifact@v3 +# with: +# name: wheels +# path: dist +# +# release: +# name: Release +# if: "startsWith(github.ref, 'refs/tags/')" +# needs: [linux, windows, macos, sdist] +# runs-on: ubuntu-latest +# 
environment: pypi +# permissions: +# id-token: write # IMPORTANT: mandatory for trusted publishing +# steps: +# - uses: actions/download-artifact@v3 +# with: +# name: wheels +# - name: Publish to PyPI +# uses: PyO3/maturin-action@v1 +# with: +# command: upload +# args: --non-interactive --skip-existing * +# diff --git a/benchmark/run_bench.ipynb b/benchmark/run_bench.ipynb index 9242407..2b3bd81 100644 --- a/benchmark/run_bench.ipynb +++ b/benchmark/run_bench.ipynb @@ -8,7 +8,11 @@ }, { "metadata": { - "collapsed": true + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-12-11T13:20:41.055850Z", + "start_time": "2024-12-11T13:20:38.090893Z" + } }, "cell_type": "code", "source": [ @@ -16,8 +20,40 @@ "!pip install --force-reinstall bioframe==0.7.2 pyarrow fastparquet pyranges==0.1.2 pybedtools==0.10.0 git+https://gitlab.com/gtamazian/pygenomics.git" ], "id": "initial_id", - "outputs": [], - "execution_count": null + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mwiewior/.pyenv/versions/miniconda3-latest/lib/python3.12/pty.py:95: RuntimeWarning: Using fork() can cause Polars to deadlock in the child process.\n", + "In addition, using fork() with Python in general is a recipe for mysterious\n", + "deadlocks and crashes.\n", + "\n", + "The most likely reason you are seeing this error is because you are using the\n", + "multiprocessing module on Linux, which uses fork() by default. This will be\n", + "fixed in Python 3.14. Until then, you want to use the \"spawn\" context instead.\n", + "\n", + "See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.\n", + "\n", + " pid, fd = os.forkpty()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mDEPRECATION: Loading egg at /Users/mwiewior/.pyenv/versions/miniconda3-latest/lib/python3.12/site-packages/bllipparser-2021.11.7-py3.12-macosx-11.1-arm64.egg is deprecated. pip 24.3 will enforce this behaviour change. 
A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330\u001B[0m\u001B[33m\r\n", + "\u001B[0m\u001B[33mDEPRECATION: Loading egg at /Users/mwiewior/.pyenv/versions/miniconda3-latest/lib/python3.12/site-packages/owl2vec_star-0.2.1-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330\u001B[0m\u001B[33m\r\n", + "\u001B[0mCollecting git+https://gitlab.com/gtamazian/pygenomics.git\r\n", + " Cloning https://gitlab.com/gtamazian/pygenomics.git to /private/var/folders/l3/8404fxz55gvdfszw4zf7ptp40000gn/T/pip-req-build-zyv8gigp\r\n", + " Running command git clone --filter=blob:none --quiet https://gitlab.com/gtamazian/pygenomics.git /private/var/folders/l3/8404fxz55gvdfszw4zf7ptp40000gn/T/pip-req-build-zyv8gigp\r\n", + "^C\r\n", + "\u001B[31mERROR: Operation cancelled by user\u001B[0m\u001B[31m\r\n", + "\u001B[0m" + ] + } + ], + "execution_count": 1 }, { "metadata": {}, @@ -28,8 +64,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-12-09T18:17:40.183149Z", - "start_time": "2024-12-09T18:17:40.179388Z" + "end_time": "2024-12-11T13:20:43.417613Z", + "start_time": "2024-12-11T13:20:43.414284Z" } }, "cell_type": "code", @@ -48,13 +84,13 @@ ] } ], - "execution_count": 1 + "execution_count": 2 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-12-09T18:17:42.889469Z", - "start_time": "2024-12-09T18:17:42.887721Z" + "end_time": "2024-12-11T13:20:45.124138Z", + "start_time": "2024-12-11T13:20:45.122444Z" } }, "cell_type": "code", @@ -64,7 +100,7 @@ ], "id": "ae490515180f0af4", "outputs": [], - "execution_count": 2 + "execution_count": 3 }, { "metadata": {}, @@ -486,8 +522,8 @@ "source": [ "metrics = {\n", " \"polars-bio-pl-df--pl-lf\" : {\"0-1\": 0.147, \"0-3\": 44.942, \"0-8\": 6.096, \"7-8\": 9.522},\n", - " \"polars-bio-polars\" : {\"0-1\": 
0.145, \"0-3\": 24.668, \"0-8\": 4.210, \"7-8\": 6.698},\n", - " \"polars-bio-pandas\" : {\"0-1\": 0.150, \"0-3\": 41.995, \"0-8\": 6.392, \"7-8\": 10.639},\n", + " \"polars-bio-pd-df--pl-lf\" : {\"0-1\": 0.177, \"0-3\": 43.369, \"0-8\": 6.241, \"7-8\": 9.688},\n", + " \"polars-bio-pd-df--pd-df\" : {\"0-1\": , \"0-3\": , \"0-8\": , \"7-8\": },\n", "}\n", "plot_metrics(\n", " metrics, [\"0-1\", \"0-3\", \"0-8\",\"7-8\"], \"Overlap performance comparison\"\n", diff --git a/polars_bio/overlap.py b/polars_bio/overlap.py index a13daaf..0eea38d 100644 --- a/polars_bio/overlap.py +++ b/polars_bio/overlap.py @@ -6,9 +6,8 @@ import pandas as pd import polars as pl import pyarrow as pa -from jsonschema.benchmarks.subcomponents import schema +import pyarrow.compute as pc from polars.io.plugins import register_io_source -from pygments.styles.dracula import yellow from typing_extensions import TYPE_CHECKING, Union @@ -26,8 +25,8 @@ def overlap(df1 : Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], how="inner", suffixes=("_1", "_2"), on_cols=None, - col1: Union[list[str], None]=None, - col2: Union[list[str], None]=None, + col1: Union[list[str]|None]=None, + col2: Union[list[str]|None]=None, output_type: str ="polars.LazyFrame" ) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]: """ @@ -52,12 +51,15 @@ def overlap(df1 : Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], :return: **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals. 
""" - - # TODO: Add support for on_cols () - assert on_cols is None, "on_cols is not supported yet" # TODO: Add support for col1 and col2 assert col1 is None, "col1 is not supported yet" assert col2 is None, "col2 is not supported yet" + col1 = ["contig", "pos_start", "pos_end"] if col1 is None else col1 + col2 = ["contig", "pos_start", "pos_end"] if col2 is None else col2 + + # TODO: Add support for on_cols () + assert on_cols is None, "on_cols is not supported yet" + assert suffixes == ("_1", "_2"), "Only default suffixes are supported" assert output_type in ["polars.LazyFrame", "polars.DataFrame", "pandas.DataFrame"], "Only polars.LazyFrame, polars.DataFrame, and pandas.DataFrame are supported" @@ -74,7 +76,7 @@ def overlap(df1 : Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], df_schema2 = _get_schema(df2, suffixes[1]) merged_schema = pl.Schema({**df_schema1, **df_schema2}) if output_type == "polars.LazyFrame": - return scan_overlap(df1, df2, merged_schema) + return overlap_lazy_scan(df1, df2, merged_schema) elif output_type == "polars.DataFrame": return overlap_scan(df1, df2).to_polars() elif output_type == "pandas.DataFrame": @@ -86,11 +88,16 @@ def overlap(df1 : Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], isinstance(df1, pd.DataFrame) and isinstance(df2, pd.DataFrame): if output_type == "polars.LazyFrame": merged_schema = pl.Schema({**_rename_columns(df1,suffixes[0]), **_rename_columns(df2, suffixes[1])}) - return scan_overlap(df1, df2, merged_schema) + return overlap_lazy_scan(df1, df2, merged_schema, col1, col2) elif output_type == "polars.DataFrame": - return overlap_scan(df1, df2).to_polars() + if isinstance(df1, pl.DataFrame) and isinstance(df2, pl.DataFrame): + df1 = df1.to_arrow().to_reader() + df2 = df2.to_arrow().to_reader() + return overlap_frame(df1, df2).to_polars() elif output_type == "pandas.DataFrame": - return overlap_scan(df1, df2).to_pandas() + df1 = pa.Table.from_pandas(df1).to_reader() + df2 = 
pa.Table.from_pandas(df2).to_reader() + return overlap_frame(df1, df2).to_pandas() else: raise ValueError("Both dataframes must be of the same type: either polars or pandas or a path to a file") @@ -123,11 +130,27 @@ def _get_schema(path: str, suffix = None ) -> pl.Schema: return df.schema +# Since there is an error when pandas DataFrames are converted to Arrow, we need to use the following function +# to change the type of the given column to large_string (the problem is with the string type for +# larger datasets). +def _string_to_largestring(table: pa.Table, column_name: str) -> pa.Table: + index = _get_column_index(table, column_name) + return table.set_column( + index, # Index of the column to replace + table.schema.field(index).name, # Name of the column + pc.cast(table.column(index), pa.large_string()) # Cast to `largestring` + ) + +def _get_column_index(table, column_name): + try: + return table.schema.names.index(column_name) + except ValueError: + raise KeyError(f"Column '{column_name}' not found in the table.") -def scan_overlap(df_1:Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], - df_2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], - schema: pl.Schema ) -> pl.LazyFrame: +def overlap_lazy_scan(df_1:Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], + df_2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], + schema: pl.Schema, col1: list[str]=None, col2: list[str]=None) -> pl.LazyFrame: overlap_function = None if isinstance(df_1, str) and isinstance(df_2, str): overlap_function = overlap_scan @@ -137,8 +160,10 @@ def scan_overlap(df_1:Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], df_2 = df_2.to_arrow().to_reader() elif isinstance(df_1, pd.DataFrame) and isinstance(df_2, pd.DataFrame): overlap_function = overlap_frame - df_1 = pa.Table.from_pandas(df_1).to_reader() - df_2 = pa.Table.from_pandas(df_2).to_reader() + table_1 = pa.Table.from_pandas(df_1) + df_1 =_string_to_largestring(table_1, col1[0]).to_reader() + table_2 = 
pa.Table.from_pandas(df_2) + df_2 =_string_to_largestring(table_2, col2[0]).to_reader() else: raise ValueError("Only polars and pandas dataframes are supported") def _overlap_source(