diff --git a/pyproject.toml b/pyproject.toml index 2133055..1d0645c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ ] [project.optional-dependencies] +dev = ["ruff"] plot = ["plotly"] test = ["scikit-learn", "pytest"] demo = ["notebook", "matplotlib"] @@ -31,4 +32,4 @@ requires = ["setuptools"] build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] -where = ["src"] +where = ["src"] diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..42ebdad --- /dev/null +++ b/tests/README.md @@ -0,0 +1,25 @@ + # Unit tests + +We use the `pytest` package to implement and run unit tests for copairs. + +## Getting started + +### Installation + +To install copairs with test dependencies, check out the code locally and install it with: +```bash +pip install -e .[test] +``` + +### Running tests +To execute all tests, run: +```bash +pytest +``` + +Each individual `test_filename.py` file implements tests for particular features in the corresponding `copairs/filename.py`. 
+ +To run tests for a particular source file, specify its test file: +```bash +pytest tests/test_map.py +``` diff --git a/tests/__init__.py b/tests/__init__.py index e69de29..6bd8fb4 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Unit tests for the copairs package.""" \ No newline at end of file diff --git a/tests/helpers.py b/tests/helpers.py index a7a4c25..e7aaaf4 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -1,3 +1,4 @@ +"""Helper functions for testing.""" from itertools import product from typing import Dict @@ -10,7 +11,7 @@ def simulate_plates(n_compounds, n_replicates, plate_size): - """Round robin creation of platemaps""" + """Round robin creation of platemaps.""" total = n_compounds * n_replicates compounds = [] @@ -35,6 +36,7 @@ def simulate_random_plates( sameby=ColumnList, diffby=ColumnList, ): + """Simulate random platemaps.""" rng = np.random.default_rng(SEED) dframe = simulate_plates(n_compounds, n_replicates, plate_size) # Shuffle values @@ -52,6 +54,7 @@ def simulate_random_dframe( diffby: ColumnList, rng: np.random.Generator, ): + """Simulate random dataframe.""" dframe = pd.DataFrame(columns=list(vocab_size.keys()), index=range(length)) for col, size in vocab_size.items(): dframe[col] = rng.integers(1, size + 1, size=length) @@ -64,9 +67,7 @@ def simulate_random_dframe( def create_dframe(n_options, n_rows): - """ - Random permutation of a fix number of elements per column - """ + """Create a dataframe with predefined number of plates, wells, and compounds.""" if isinstance(n_options, int): n_options = [n_options] * 3 colc = list(f"c{i}" for i in range(n_options[0])) diff --git a/tests/test_build_rank_multilabel.py b/tests/test_build_rank_multilabel.py index 49b6f08..c2afdf4 100644 --- a/tests/test_build_rank_multilabel.py +++ b/tests/test_build_rank_multilabel.py @@ -1,9 +1,11 @@ +"""Test the concatenation of ranges.""" import numpy as np from copairs.compute import concat_ranges def naive_concat_ranges(start: 
np.ndarray, end: np.ndarray): + """Concatenate ranges into a mask.""" mask = [] for s, e in zip(start, end): mask.extend(range(s, e)) @@ -11,6 +13,7 @@ def naive_concat_ranges(start: np.ndarray, end: np.ndarray): def test_concat_ranges(): + """Test the concatenation of ranges.""" rng = np.random.default_rng() num_range = 5, 10 start_range = 2, 10 diff --git a/tests/test_compute.py b/tests/test_compute.py index 63444c7..03428c7 100644 --- a/tests/test_compute.py +++ b/tests/test_compute.py @@ -1,3 +1,4 @@ +"""Test pairwise distance calculation functions.""" import numpy as np from copairs import compute @@ -7,6 +8,7 @@ def corrcoef_naive(feats, pairs): + """Compute correlation coefficient between pairs of features.""" corr = np.empty((len(pairs),)) for pos, (i, j) in enumerate(pairs): corr[pos] = np.corrcoef(feats[i], feats[j])[0, 1] @@ -14,6 +16,7 @@ def corrcoef_naive(feats, pairs): def cosine_naive(feats, pairs): + """Compute cosine similarity between pairs of features.""" cosine = np.empty((len(pairs),)) for pos, (i, j) in enumerate(pairs): a, b = feats[i], feats[j] @@ -24,6 +27,7 @@ def cosine_naive(feats, pairs): def euclidean_naive(feats, pairs): + """Compute euclidean similarity between pairs of features.""" euclidean_sim = np.empty((len(pairs),)) for pos, (i, j) in enumerate(pairs): dist = np.linalg.norm(feats[i] - feats[j]) @@ -32,10 +36,12 @@ def euclidean_naive(feats, pairs): def abs_cosine_naive(feats, pairs): + """Compute absolute cosine similarity between pairs of features.""" return np.abs(cosine_naive(feats, pairs)) def test_corrcoef(): + """Test correlation coefficient computation.""" n_samples = 10 n_pairs = 20 n_feats = 5 @@ -50,6 +56,7 @@ def test_corrcoef(): def test_cosine(): + """Test cosine similarity computation.""" n_samples = 10 n_pairs = 20 n_feats = 5 @@ -64,6 +71,7 @@ def test_cosine(): def test_euclidean(): + """Test euclidean similarity computation.""" n_samples = 10 n_pairs = 20 n_feats = 5 @@ -78,6 +86,7 @@ def test_euclidean(): 
def test_abs_cosine(): + """Test absolute cosine similarity computation.""" n_samples = 10 n_pairs = 20 n_feats = 5 diff --git a/tests/test_map.py b/tests/test_map.py index 816d7d8..f2e2379 100644 --- a/tests/test_map.py +++ b/tests/test_map.py @@ -1,3 +1,4 @@ +"""Tests for (mean) Average Precision calculation.""" import numpy as np import pandas as pd import pytest @@ -13,6 +14,7 @@ def test_random_binary_matrix(): + """Test the random binary matrix generation.""" rng = np.random.default_rng(SEED) # Test with n=3, m=4, k=2 A = compute.random_binary_matrix(3, 4, 2, rng) @@ -28,6 +30,7 @@ def test_random_binary_matrix(): def test_compute_ap(): + """Test the average precision computation.""" num_pos, num_neg, num_perm = 5, 6, 100 total = num_pos + num_neg @@ -56,6 +59,7 @@ def test_compute_ap(): def test_compute_ap_contiguous(): + """Test the contiguous average precision computation.""" num_pos_range = [2, 9] num_neg_range = [10, 20] num_samples_range = [5, 30] @@ -88,6 +92,7 @@ def test_compute_ap_contiguous(): def test_pipeline(): + """Check the implementation for mAP calculation.""" length = 10 vocab_size = {"p": 5, "w": 3, "l": 4} n_feats = 5 @@ -103,7 +108,7 @@ def test_pipeline(): def test_pipeline_multilabel(): - """Check the multilabel implementation with for mAP calculation""" + """Check the multilabel implementation for mAP calculation.""" length = 10 vocab_size = {"p": 3, "w": 5, "l": 4} n_feats = 8 @@ -124,6 +129,7 @@ def test_pipeline_multilabel(): def test_raise_no_pairs(): + """Test the exception raised when no pairs are found.""" length = 10 vocab_size = {"p": 3, "w": 3, "l": 10} n_feats = 5 @@ -143,6 +149,7 @@ def test_raise_no_pairs(): def test_raise_nan_error(): + """Test the exception raised when there are null values.""" length = 10 vocab_size = {"p": 5, "w": 3, "l": 4} n_feats = 8 diff --git a/tests/test_map_filter.py b/tests/test_map_filter.py index 9b1b311..4fdfe1f 100644 --- a/tests/test_map_filter.py +++ b/tests/test_map_filter.py 
@@ -1,3 +1,4 @@ +"""Test data filtering by query.""" import numpy as np import pytest @@ -9,6 +10,7 @@ @pytest.fixture def mock_dataframe(): + """Create a mock dataframe.""" length = 10 vocab_size = {"p": 3, "w": 3, "l": 10} pos_sameby = ["l"] @@ -20,6 +22,7 @@ def mock_dataframe(): def test_correct(mock_dataframe): + """Test correct query.""" df, parsed_cols = evaluate_and_filter(mock_dataframe, ["p == 'p1'", "w > 'w2'"]) assert not df.empty assert "p" in parsed_cols and "w" in parsed_cols @@ -27,6 +30,7 @@ def test_correct(mock_dataframe): def test_invalid_query(mock_dataframe): + """Test invalid query.""" with pytest.raises(ValueError) as excinfo: evaluate_and_filter(mock_dataframe, ['l == "lHello"']) assert "Invalid combined query expression" in str(excinfo.value) @@ -34,12 +38,14 @@ def test_invalid_query(mock_dataframe): def test_empty_result(mock_dataframe): + """Test that duplicate queries for the same column raise an error.""" with pytest.raises(ValueError) as excinfo: evaluate_and_filter(mock_dataframe, ['p == "p1"', 'p == "p2"']) assert "Duplicate queries for column" in str(excinfo.value) def test_empty_result_from_valid_query(mock_dataframe): + """Test empty result from valid query.""" with pytest.raises(ValueError) as excinfo: evaluate_and_filter(mock_dataframe, ['p == "p4"']) assert "No data matched the query" in str(excinfo.value) diff --git a/tests/test_matching.py b/tests/test_matching.py index 5bc1132..91c494f 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py 
test_null_sample_large(): - """Test Matcher guarantees elements with different values""" + """Test Matcher guarantees elements with different values.""" dframe = create_dframe(32, 10000) run_stress_sample_null(dframe, 5000) def test_null_sample_small(): - """Test Sample with small set""" + """Test Sample with small set.""" dframe = create_dframe(3, 10) run_stress_sample_null(dframe, 100) def test_null_sample_nan_vals(): - """Test NaN values are ignored""" + """Test NaN values are ignored.""" dframe = create_dframe(4, 15) rng = np.random.default_rng(SEED) nan_mask = rng.random(dframe.shape) < 0.5 @@ -44,7 +44,7 @@ def test_null_sample_nan_vals(): def get_naive_pairs(dframe: pd.DataFrame, sameby, diffby): - """Compute valid pairs using cross product from pandas""" + """Compute valid pairs using cross product from pandas.""" cross = dframe.reset_index().merge( dframe.reset_index(), how="cross", suffixes=("_x", "_y") ) @@ -62,7 +62,7 @@ def get_naive_pairs(dframe: pd.DataFrame, sameby, diffby): def check_naive(dframe, matcher: Matcher, sameby, diffby): - """Check Matcher and naive generate same pairs""" + """Check Matcher and naive generate same pairs.""" gt_pairs = get_naive_pairs(dframe, sameby, diffby) vals = matcher.get_all_pairs(sameby, diffby) vals = sum(vals.values(), []) @@ -74,14 +74,14 @@ def check_naive(dframe, matcher: Matcher, sameby, diffby): def check_simulated_data(length, vocab_size, sameby, diffby, rng): - """Test sample of valid pairs from a simulated dataset""" + """Test sample of valid pairs from a simulated dataset.""" dframe = simulate_random_dframe(length, vocab_size, sameby, diffby, rng) matcher = Matcher(dframe, dframe.columns, seed=SEED) check_naive(dframe, matcher, sameby, diffby) def test_stress_simulated_data(): - """Run multiple tests using simulated data""" + """Run multiple tests using simulated data.""" rng = np.random.default_rng(SEED) num_cols_range = [2, 6] vocab_size_range = [5, 10] @@ -99,7 +99,7 @@ def 
test_stress_simulated_data(): def test_empty_sameby(): - """Test query without sameby""" + """Test query without sameby.""" dframe = create_dframe(3, 10) matcher = Matcher(dframe, dframe.columns, seed=SEED) check_naive(dframe, matcher, sameby=[], diffby=["w", "c"]) @@ -107,7 +107,7 @@ def test_empty_sameby(): def test_empty_diffby(): - """Test query without diffby""" + """Test query without diffby.""" dframe = create_dframe(3, 10) matcher = Matcher(dframe, dframe.columns, seed=SEED) matcher.get_all_pairs(["c"], []) @@ -116,7 +116,7 @@ def test_empty_diffby(): def test_raise_distjoint(): - """Test check for disjoint sameby and diffby""" + """Test check for disjoint sameby and diffby.""" dframe = create_dframe(3, 10) matcher = Matcher(dframe, dframe.columns, seed=SEED) with pytest.raises(ValueError, match="must be disjoint lists"): @@ -124,7 +124,7 @@ def test_raise_distjoint(): def test_raise_no_params(): - """Test check for at least one of sameby and diffby""" + """Test check for at least one of sameby and diffby.""" dframe = create_dframe(3, 10) matcher = Matcher(dframe, dframe.columns, seed=SEED) with pytest.raises(ValueError, match="at least one should be provided"): @@ -132,7 +132,7 @@ def test_raise_no_params(): def assert_sameby_diffby(dframe: pd.DataFrame, pairs_dict: dict, sameby, diffby): - """Assert the pairs are valid""" + """Assert the pairs are valid.""" for _, pairs in pairs_dict.items(): for id1, id2 in pairs: for col in sameby: diff --git a/tests/test_matching_any.py b/tests/test_matching_any.py index 25ccc02..3f18c4f 100644 --- a/tests/test_matching_any.py +++ b/tests/test_matching_any.py @@ -1,3 +1,4 @@ +"""Test matching with `any` conditions using simulated data.""" from string import ascii_letters import numpy as np @@ -10,7 +11,7 @@ def get_naive_pairs(dframe: pd.DataFrame, sameby, diffby): - """Compute valid pairs using cross product from pandas""" + """Compute valid pairs using cross product from pandas.""" cross = dframe.reset_index().merge( 
dframe.reset_index(), how="cross", suffixes=("_x", "_y") ) @@ -39,7 +40,7 @@ def get_naive_pairs(dframe: pd.DataFrame, sameby, diffby): def check_naive(dframe, matcher: Matcher, sameby, diffby): - """Check Matcher and naive generate same pairs""" + """Check Matcher and naive generate same pairs.""" gt_pairs = get_naive_pairs(dframe, sameby, diffby) vals = matcher.get_all_pairs(sameby, diffby) vals = sum(vals.values(), []) @@ -51,7 +52,7 @@ def check_naive(dframe, matcher: Matcher, sameby, diffby): def check_simulated_data(length, vocab_size, sameby, diffby, rng): - """Test sample of valid pairs from a simulated dataset""" + """Test sample of valid pairs from a simulated dataset.""" sameby_cols = sameby["all"] + sameby["any"] diffby_cols = diffby["all"] + diffby["any"] dframe = simulate_random_dframe(length, vocab_size, sameby_cols, diffby_cols, rng) @@ -60,7 +61,7 @@ def check_simulated_data(length, vocab_size, sameby, diffby, rng): def test_stress_simulated_data_any_all(): - """Run multiple tests using simulated data""" + """Run multiple tests using simulated data.""" rng = np.random.default_rng(SEED) num_cols_range = [2, 6] vocab_size_range = [5, 10] @@ -78,7 +79,7 @@ def test_stress_simulated_data_any_all(): def test_stress_simulated_data_all_all(): - """Run multiple tests using simulated data""" + """Run multiple tests using simulated data.""" rng = np.random.default_rng(SEED) num_cols_range = [2, 6] vocab_size_range = [5, 10] @@ -96,7 +97,7 @@ def test_stress_simulated_data_all_all(): def test_stress_simulated_data_all_any(): - """Run multiple tests using simulated data""" + """Run multiple tests using simulated data.""" rng = np.random.default_rng(SEED) num_cols_range = [2, 6] vocab_size_range = [5, 10] @@ -114,7 +115,7 @@ def test_stress_simulated_data_all_any(): def test_stress_simulated_data_any_any(): - """Run multiple tests using simulated data""" + """Run multiple tests using simulated data.""" rng = np.random.default_rng(SEED) num_cols_range = [4, 6] 
vocab_size_range = [5, 10] diff --git a/tests/test_matching_multilabel.py b/tests/test_matching_multilabel.py index 50f978e..dd6e308 100644 --- a/tests/test_matching_multilabel.py +++ b/tests/test_matching_multilabel.py @@ -1,3 +1,4 @@ +"""Tests for the multilabel matching implementation.""" import pandas as pd from copairs.matching import MatcherMultilabel @@ -7,6 +8,7 @@ def get_naive_pairs(dframe: pd.DataFrame, sameby, diffby, multilabel_col: str): + """Get pairs using a naive implementation.""" dframe = dframe.copy() dframe[multilabel_col] = dframe[multilabel_col].apply(set) @@ -45,7 +47,7 @@ def any_equal(row): def check_naive(dframe, matcher: MatcherMultilabel, sameby, diffby, multilabel_col): - """Check Matcher and naive generate same pairs""" + """Check Matcher and naive generate same pairs.""" gt_pairs = get_naive_pairs(dframe, sameby, diffby, multilabel_col) vals = matcher.get_all_pairs(sameby, diffby) vals = sum(vals.values(), []) @@ -57,7 +59,7 @@ def check_naive(dframe, matcher: MatcherMultilabel, sameby, diffby, multilabel_c def test_sameby(): - """Check the multilabel implementation with sameby""" + """Check the multilabel implementation with sameby.""" multilabel_col = "c" sameby = ["c"] diffby = ["p", "w"] @@ -70,7 +72,7 @@ def test_sameby(): def test_diffby(): - """Check the multilabel implementation with sameby""" + """Check the multilabel implementation with diffby.""" multilabel_col = "c" sameby = ["p"] diffby = ["c", "w"] @@ -84,7 +86,7 @@ def test_diffby(): def test_only_diffby(): - """Check the multilabel implementation with only diffby being equal to c""" + """Check the multilabel implementation with only diffby being equal to c.""" multilabel_col = "c" sameby = [] diffby = ["c"] @@ -97,7 +99,7 @@ def test_only_diffby(): def test_only_diffby_many_cols(): - """Check the multilabel implementation with only diffby being equal to c""" + """Check the multilabel implementation with only diffby set to several columns.""" multilabel_col = "c" sameby = [] 
diffby = ["c", "w"] @@ -110,7 +112,7 @@ def test_only_diffby_many_cols(): def test_only_sameby_many_cols(): - """Check the multilabel implementation with only diffby being equal to c""" + """Check the multilabel implementation with only sameby set to several columns.""" multilabel_col = "c" sameby = ["c", "w"] diffby = [] diff --git a/tests/test_replicating.py b/tests/test_replicating.py index a273bbe..79d5661 100644 --- a/tests/test_replicating.py +++ b/tests/test_replicating.py @@ -1,3 +1,4 @@ +"""Tests for the replicating module.""" from numpy.random import default_rng from copairs import Matcher @@ -12,6 +13,7 @@ def test_corr_between_replicates(): + """Test calculating correlation between replicates.""" rng = default_rng(SEED) num_samples = 10 X = rng.normal(size=[num_samples, 6]) @@ -20,6 +22,7 @@ def test_corr_between_replicates(): def test_correlation_test(): + """Test correlation test.""" rng = default_rng(SEED) num_samples = 10 X = rng.normal(size=[num_samples, 6]) @@ -31,6 +34,7 @@ def test_correlation_test(): def test_corr_from_pairs(): + """Test calculating correlation from a list of named pairs.""" num_samples = 10 sameby = ["c"] diffby = ["p", "w"]