Skip to content

Commit

Permalink
Refactoring range operations
Browse files Browse the repository at this point in the history
  • Loading branch information
mwiewior committed Dec 13, 2024
1 parent 750d04c commit 3ae9342
Show file tree
Hide file tree
Showing 7 changed files with 367 additions and 291 deletions.
6 changes: 3 additions & 3 deletions polars_bio/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .overlap import ctx, overlap
from .polars_bio import OverlapFilter
from .polars_bio import FilterOp
from .range_op import ctx, overlap

__all__ = ["overlap", "ctx", "OverlapFilter"]
__all__ = ["overlap", "ctx", "FilterOp"]
258 changes: 0 additions & 258 deletions polars_bio/overlap.py

This file was deleted.

71 changes: 71 additions & 0 deletions polars_bio/range_op.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from __future__ import annotations

import pandas as pd
import polars as pl
from typing_extensions import TYPE_CHECKING, Union

from .polars_bio import FilterOp, RangeOp, RangeOptions
from .range_op_helpers import Context, range_operation

if TYPE_CHECKING:
pass

# Default names of the columns that hold the chromosome, start and end of a
# genomic interval (the same defaults documented for `overlap`'s col1/col2).
DEFAULT_INTERVAL_COLUMNS = ["contig", "pos_start", "pos_end"]

# Shared context handle: a `Context` is instantiated once, at import time, and
# its `ctx` attribute is kept at module level. `overlap` passes it to
# `range_operation`, and the package `__init__` re-exports it.
# NOTE(review): presumably the query-engine session used to run range
# operations -- confirm against `range_op_helpers.Context`.
ctx = Context().ctx


def overlap(
    df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    how="inner",
    overlap_filter: FilterOp = FilterOp.Weak,
    suffixes=("_1", "_2"),
    on_cols=None,
    col1: Union[list[str], None] = None,
    col2: Union[list[str], None] = None,
    output_type: str = "polars.LazyFrame",
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
    """
    Find pairs of overlapping genomic intervals.

    Bioframe inspired API. Several parameters are accepted for API
    compatibility but are not implemented yet; passing anything other than
    their default value raises ``AssertionError`` (see below).

    :param df1: Can be a path to a file, a polars DataFrame or LazyFrame, or a pandas DataFrame. CSV with a header and Parquet are supported.
    :param df2: Can be a path to a file, a polars DataFrame or LazyFrame, or a pandas DataFrame. CSV with a header and Parquet are supported.
    :param how: How to handle the overlaps on the two dataframes. inner: use intersection of the set of intervals from df1 and df2, optional. Only "inner" is currently supported.
    :param overlap_filter: FilterOp, optional. The type of overlap to consider (Weak or Strict). default is FilterOp.Weak.
    :param suffixes: (str, str), optional The suffixes for the columns of the two overlapped sets. Only the default ("_1", "_2") is currently supported.
    :param on_cols: list[str], optional The list of additional column names to join on. default is None. Not supported yet; must be None.
    :param col1: The names of columns containing the chromosome, start and end of the
        genomic intervals in df1. The default values are 'contig', 'pos_start',
        'pos_end'. Not supported yet; must be None.
    :param col2: The names of columns containing the chromosome, start and end of the
        genomic intervals in df2. The default values are 'contig', 'pos_start',
        'pos_end'. Not supported yet; must be None.
    :param output_type: str, optional The type of the output: "polars.LazyFrame", "polars.DataFrame" or "pandas.DataFrame". default is "polars.LazyFrame".
    :return: **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
    :raises AssertionError: if an unsupported value is passed for ``col1``,
        ``col2``, ``on_cols``, ``suffixes``, ``output_type`` or ``how``.
    """
    # The checks below raise AssertionError explicitly instead of using the
    # `assert` statement: `assert` is removed when Python runs with -O, which
    # would let unsupported arguments through silently. The exception type and
    # messages are unchanged, so existing callers are unaffected.

    # TODO: Add support for col1 and col2
    if col1 is not None:
        raise AssertionError("col1 is not supported yet")
    if col2 is not None:
        raise AssertionError("col2 is not supported yet")
    # Fresh copies so downstream code can never mutate the module-level default.
    col1 = list(DEFAULT_INTERVAL_COLUMNS) if col1 is None else col1
    col2 = list(DEFAULT_INTERVAL_COLUMNS) if col2 is None else col2

    # TODO: Add support for on_cols ()
    if on_cols is not None:
        raise AssertionError("on_cols is not supported yet")

    if suffixes != ("_1", "_2"):
        raise AssertionError("Only default suffixes are supported")

    if output_type not in (
        "polars.LazyFrame",
        "polars.DataFrame",
        "pandas.DataFrame",
    ):
        raise AssertionError(
            "Only polars.LazyFrame, polars.DataFrame, and pandas.DataFrame are supported"
        )

    if how not in ("inner",):
        raise AssertionError("Only inner join is supported")

    range_options = RangeOptions(range_op=RangeOp.Overlap, filter_op=overlap_filter)
    return range_operation(
        df1, df2, suffixes, range_options, col1, col2, output_type, ctx
    )
Loading

0 comments on commit 3ae9342

Please sign in to comment.