Skip to content

Commit

Permalink
feat: Bioinformatic formats support
Browse files Browse the repository at this point in the history
  • Loading branch information
mwiewior committed Jan 10, 2025
1 parent cbc374c commit 6cfaa9d
Show file tree
Hide file tree
Showing 20 changed files with 2,376 additions and 222 deletions.
1,895 changes: 1,757 additions & 138 deletions Cargo.lock

Large diffs are not rendered by default.

11 changes: 8 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "polars_bio"
version = "0.4.0"
version = "0.4.1"
edition = "2021"

[lib]
Expand All @@ -12,15 +12,17 @@ crate-type= ["cdylib"]

[dependencies]
datafusion-python = { git = "https://github.com/apache/datafusion-python.git", rev = "5c834934dec89bd96ff70df3b278e9d6fe78f7ec"}
pyo3 = { version = "0.22.4", features = ["extension-module", "abi3-py38", "experimental-async"] }
pyo3 = { version = "0.22"}
pyo3-log = "0.11.0"
sequila-core = { git = "https://github.com/biodatageeks/sequila-native.git", rev = "07a36935177f8ffbfbaa7f63958384108efd7b4f" }

datafusion = { version = "43.0.0"}
arrow = "53.3.0"
arrow-schema = "53.3.0"
arrow-array = { version = "53.3.0", features = ["ffi"] }
tokio = {version = "1.42.0", features = ["full", "tracing"]}
log = "0.4.22"
tracing = { version = "0.1.41", features = ["log"] }
futures-util = "0.3.31"


Expand All @@ -30,4 +32,7 @@ polars-plan = { git = "https://github.com/mwiewior/polars.git" , rev = "9d4fca54
polars-lazy = { git = "https://github.com/mwiewior/polars.git" , rev = "9d4fca54b1d71fce08a51cf00a88f67c67313706", features = ["parquet", "new_streaming", "streaming", "csv", "cse"]}
polars-core = {git = "https://github.com/mwiewior/polars.git" , rev = "9d4fca54b1d71fce08a51cf00a88f67c67313706"}
polars-arrow = { git = "https://github.com/mwiewior/polars.git" , rev = "9d4fca54b1d71fce08a51cf00a88f67c67313706"}
polars-python = { git = "https://github.com/mwiewior/polars.git" , rev = "9d4fca54b1d71fce08a51cf00a88f67c67313706"}
polars-python = { git = "https://github.com/mwiewior/polars.git" , rev = "9d4fca54b1d71fce08a51cf00a88f67c67313706"}

#exon ="0.32.4"
exon = { git = "https://github.com/mwiewior/exon.git", rev="c543c03937ce5c8f249a77e45a28d7138e0a9c0f"}
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
It provides a DataFrame API for genomics data and is designed to be blazing fast, memory efficient and easy to use.

## Key Features
* optimized for [peformance](performance.md#results-summary-) and large-scale genomics datasets
* popular genomics [operations](features.md#genomic-ranges-operations) with a DataFrame API (both [Pandas](https://pandas.pydata.org/) and [polars](https://pola.rs/))
* optimized for [performance](docs/performance.md#results-summary-) and large-scale genomics datasets
* popular genomics [operations](docs/features.md#genomic-ranges-operations) with a DataFrame API (both [Pandas](https://pandas.pydata.org/) and [polars](https://pola.rs/))
* native parallel engine powered by Apache DataFusion and [sequila-native](https://github.com/biodatageeks/sequila-native)
* [out-of-core](features.md#streaming-out-of-core-processing) processing (for data too large to fit into a computer's main memory) with [Apache DataFusion](https://datafusion.apache.org/) and [polars](https://pola.rs/)
* [out-of-core](docs/features.md#streaming-out-of-core-processing-exeprimental) processing (for data too large to fit into a computer's main memory) with [Apache DataFusion](https://datafusion.apache.org/) and [polars](https://pola.rs/)
* zero-copy data exchange with [Apache Arrow](https://arrow.apache.org/)
* pre-built wheel packages for *Linux*, *Windows* and *MacOS* (*arm64* and *x86_64*) available on [PyPI](https://pypi.org/project/polars-bio/#files)

Expand Down
8 changes: 4 additions & 4 deletions benchmark/src/bench_overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,10 +267,10 @@ def genomicranges(df_1, df_2):

# Display the table
benchmark_results = {
"inputs": {
"df_1_num": len(df_1),
"df_2_num": len(df_2),
},
# "inputs": {
# "df_1_num": len(df_1),
# "df_2_num": len(df_2),
# },
# "output_num": pb.overlap(df_1, df_2, col1=columns, col2=columns)
# .collect()
# .count(),
Expand Down
12 changes: 12 additions & 0 deletions docs/cookbook.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,14 @@
:construction:

[//]: # (## Genomic ranges operations)

[//]: # (## How to read bioinformatics formats)


[//]: # ()
[//]: # (## How to work directly with Datafusion DataFrame)

[//]: # (To bypass issue XXX)

[//]: # (## How to set logging level)

4 changes: 2 additions & 2 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@ nav:
- 📚 Tutorial: notebooks/tutorial.ipynb
- 🚀 Performance: performance.md
- ⚙️ API reference: api.md


- Quick start: quickstart.md
- Features: features.md
- Cookbook: cookbook.md
- Tutorial: notebooks/tutorial.ipynb
- Performance: performance.md
- API reference: api.md
- FAQ: faq.md

plugins:
- search
Expand Down
147 changes: 146 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

40 changes: 37 additions & 3 deletions polars_bio/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,45 @@
import logging

from .range_op import FilterOp, ctx, nearest, overlap
from polars_bio.polars_bio import InputFormat

from .context import ctx
from .io import (
read_bam,
read_bed,
read_cram,
read_fasta,
read_fastq,
read_gff,
read_gtf,
read_indexed_bam,
read_indexed_vcf,
read_vcf,
)
from .range_op import FilterOp, nearest, overlap
from .range_viz import visualize_intervals

# Configure the root logger with the default handler (only if it has none)
# and raise the root level to WARN.
logging.basicConfig()
logging.getLogger().setLevel(logging.WARN)
# The package's own "polars_bio" logger is set to INFO, independently of
# the root level set above.
logger = logging.getLogger("polars_bio")
logger.setLevel(logging.INFO)

__version__ = "0.4.0"
__all__ = ["overlap", "nearest", "ctx", "FilterOp", "vizualize_intervals"]

# Version string of the Python package.
__version__ = "0.4.1"
# Names exported by ``from polars_bio import *``: range operations, the
# shared context, the visualization helper, the file-format readers and
# the InputFormat type imported at the top of this module.
__all__ = [
"overlap",
"nearest",
"ctx",
"FilterOp",
"visualize_intervals",
"read_bam",
"read_indexed_bam",
"read_vcf",
"read_cram",
"read_bed",
"read_gff",
"read_gtf",
"read_fasta",
"read_fastq",
"read_indexed_vcf",
"InputFormat",
]
27 changes: 27 additions & 0 deletions polars_bio/context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from polars_bio.polars_bio import BioSessionContext


def singleton(cls):
    """Decorator to make a class a singleton.

    The decorated name is rebound to a factory function. The first call
    constructs ``cls`` with the arguments it receives and caches the
    instance; every later call returns that cached instance and ignores
    its arguments.

    Args:
        cls: The class to wrap.

    Returns:
        A callable returning the single shared instance of ``cls``. It
        carries the class's ``__name__``, ``__qualname__``, ``__doc__`` and
        ``__module__``, and exposes the undecorated class as
        ``__wrapped__``.
    """
    import functools  # local import keeps this block self-contained

    instances = {}

    # updated=() copies only the metadata attributes; the default would
    # also merge the class's __dict__ into the function's attributes.
    @functools.wraps(cls, updated=())
    def get_instance(*args, **kwargs):
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
        return instances[cls]

    return get_instance


@singleton
class Context:
    """Holder for the shared ``BioSessionContext`` with its default options."""

    def __init__(self):
        """Create the session context and apply the default option values."""
        session = BioSessionContext()
        defaults = (
            ("datafusion.execution.target_partitions", "1"),
            ("sequila.interval_join_algorithm", "coitrees"),
        )
        for option, default in defaults:
            session.set_option(option, default)
        self.ctx = session

    def set_option(self, key, value):
        """Forward an option assignment to the wrapped session context."""
        self.ctx.set_option(key, value)


# Module-level handle to the BioSessionContext created by Context.__init__;
# Context is wrapped by @singleton, so repeated Context() calls reuse it.
ctx = Context().ctx
Loading

0 comments on commit 6cfaa9d

Please sign in to comment.