V0.4.3 #70

Merged · 3 commits · Dec 16, 2024

496 changes: 248 additions & 248 deletions examples/mAP_demo.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -17,6 +17,7 @@ dependencies = [
]

[project.optional-dependencies]
dev = ["ruff"]
plot = ["plotly"]
test = ["scikit-learn", "pytest"]
demo = ["notebook", "matplotlib"]
@@ -31,4 +32,4 @@ requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
where = ["src"]
where = ["src"]
25 changes: 25 additions & 0 deletions tests/README.md
@@ -0,0 +1,25 @@
# Unit tests

We use the `pytest` package to implement and run unit tests for copairs.

## Getting started

### Installation

To install copairs with test dependencies, check out the code locally and install it as:
```bash
pip install -e .[test]
```

### Running tests
To execute all tests, run:
```bash
pytest
```

Each `test_filename.py` file implements tests for the features in the corresponding `copairs/filename.py` module.

To run tests for a particular source file, specify its test file:
```bash
pytest tests/test_map.py
```
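A single test function can also be selected by its pytest node id; for example, using `test_compute_ap` from `tests/test_map.py` (one of the tests covered by this PR):
```bash
pytest tests/test_map.py::test_compute_ap
```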
1 change: 1 addition & 0 deletions tests/__init__.py
@@ -0,0 +1 @@
"""Unit tests for the copairs package."""
10 changes: 6 additions & 4 deletions tests/helpers.py
@@ -1,3 +1,5 @@
"""Helper functions for testing."""

from itertools import product
from typing import Dict

@@ -10,7 +12,7 @@


def simulate_plates(n_compounds, n_replicates, plate_size):
"""Round robin creation of platemaps"""
"""Round robin creation of platemaps."""
total = n_compounds * n_replicates

compounds = []
@@ -35,6 +37,7 @@ def simulate_random_plates(
sameby=ColumnList,
diffby=ColumnList,
):
"""Simulate random platemaps."""
rng = np.random.default_rng(SEED)
dframe = simulate_plates(n_compounds, n_replicates, plate_size)
# Shuffle values
@@ -52,6 +55,7 @@ def simulate_random_dframe(
diffby: ColumnList,
rng: np.random.Generator,
):
"""Simulate random dataframe."""
dframe = pd.DataFrame(columns=list(vocab_size.keys()), index=range(length))
for col, size in vocab_size.items():
dframe[col] = rng.integers(1, size + 1, size=length)
@@ -64,9 +68,7 @@ def simulate_random_dframe(


def create_dframe(n_options, n_rows):
"""
Random permutation of a fix number of elements per column
"""
"""Create a dataframe with predefined number of plates, wells, and compounds."""
if isinstance(n_options, int):
n_options = [n_options] * 3
colc = list(f"c{i}" for i in range(n_options[0]))
4 changes: 4 additions & 0 deletions tests/test_build_rank_multilabel.py
@@ -1,16 +1,20 @@
"""Test the concatenation of ranges."""

import numpy as np

from copairs.compute import concat_ranges


def naive_concat_ranges(start: np.ndarray, end: np.ndarray):
"""Concatenate ranges into a mask."""
mask = []
for s, e in zip(start, end):
mask.extend(range(s, e))
return np.asarray(mask, dtype=np.int32)


def test_concat_ranges():
"""Test the concatenation of ranges."""
rng = np.random.default_rng()
num_range = 5, 10
start_range = 2, 10
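For intuition, a small worked example of the range concatenation that `naive_concat_ranges` above defines; the concrete values are illustrative only:

```python
import numpy as np

# Two half-open ranges, [0, 3) and [5, 7), as parallel start/end arrays.
start = np.array([0, 5])
end = np.array([3, 7])

# Expanding each [s, e) range and concatenating, as naive_concat_ranges does:
mask = np.asarray(
    [i for s, e in zip(start, end) for i in range(s, e)], dtype=np.int32
)
print(mask)  # [0 1 2 5 6]
```

`test_concat_ranges` checks that `copairs.compute.concat_ranges` matches this reference on randomly generated ranges.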
10 changes: 10 additions & 0 deletions tests/test_compute.py
@@ -1,3 +1,5 @@
"""Test pairwise distance calculation functions."""

import numpy as np

from copairs import compute
@@ -7,13 +9,15 @@


def corrcoef_naive(feats, pairs):
"""Compute correlation coefficient between pairs of features."""
corr = np.empty((len(pairs),))
for pos, (i, j) in enumerate(pairs):
corr[pos] = np.corrcoef(feats[i], feats[j])[0, 1]
return corr


def cosine_naive(feats, pairs):
"""Compute cosine similarity between pairs of features."""
cosine = np.empty((len(pairs),))
for pos, (i, j) in enumerate(pairs):
a, b = feats[i], feats[j]
@@ -24,6 +28,7 @@ def cosine_naive(feats, pairs):


def euclidean_naive(feats, pairs):
"""Compute euclidean similarity between pairs of features."""
euclidean_sim = np.empty((len(pairs),))
for pos, (i, j) in enumerate(pairs):
dist = np.linalg.norm(feats[i] - feats[j])
@@ -32,10 +37,12 @@ def euclidean_naive(feats, pairs):


def abs_cosine_naive(feats, pairs):
"""Compute absolute cosine similarity between pairs of features."""
return np.abs(cosine_naive(feats, pairs))


def test_corrcoef():
"""Test correlation coefficient computation."""
n_samples = 10
n_pairs = 20
n_feats = 5
@@ -50,6 +57,7 @@ def test_corrcoef():


def test_cosine():
"""Test cosine similarity computation."""
n_samples = 10
n_pairs = 20
n_feats = 5
@@ -64,6 +72,7 @@ def test_cosine():


def test_euclidean():
"""Test euclidean similarity computation."""
n_samples = 10
n_pairs = 20
n_feats = 5
@@ -78,6 +87,7 @@ def test_euclidean():


def test_abs_cosine():
"""Test absolute cosine similarity computation."""
n_samples = 10
n_pairs = 20
n_feats = 5
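These tests all follow one pattern: a small looped reference is compared against the library's vectorized routine on random features and index pairs. A self-contained sketch of that pattern using only NumPy (the vectorized correlation below is an illustrative stand-in, not the copairs API):

```python
import numpy as np

rng = np.random.default_rng(0)
n_samples, n_feats, n_pairs = 10, 5, 20
feats = rng.normal(size=(n_samples, n_feats))
pairs = rng.integers(0, n_samples, size=(n_pairs, 2))

# Looped reference, mirroring corrcoef_naive above.
ref = np.array([np.corrcoef(feats[i], feats[j])[0, 1] for i, j in pairs])

# Vectorized equivalent: center and normalize each row, then take row-wise dot products.
centered = feats - feats.mean(axis=1, keepdims=True)
normed = centered / np.linalg.norm(centered, axis=1, keepdims=True)
vec = np.sum(normed[pairs[:, 0]] * normed[pairs[:, 1]], axis=1)

assert np.allclose(ref, vec)
```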
10 changes: 9 additions & 1 deletion tests/test_map.py
@@ -1,3 +1,5 @@
"""Tests for (mean) Average Precision calculation."""

import numpy as np
import pandas as pd
import pytest
@@ -13,6 +15,7 @@


def test_random_binary_matrix():
"""Test the random binary matrix generation."""
rng = np.random.default_rng(SEED)
# Test with n=3, m=4, k=2
A = compute.random_binary_matrix(3, 4, 2, rng)
@@ -28,6 +31,7 @@ def test_random_binary_matrix():


def test_compute_ap():
"""Test the average precision computation."""
num_pos, num_neg, num_perm = 5, 6, 100
total = num_pos + num_neg

@@ -56,6 +60,7 @@ def test_compute_ap():


def test_compute_ap_contiguous():
"""Test the contiguous average precision computation."""
num_pos_range = [2, 9]
num_neg_range = [10, 20]
num_samples_range = [5, 30]
@@ -88,6 +93,7 @@ def test_compute_ap_contiguous():


def test_pipeline():
"""Check the implementation with for mAP calculation."""
length = 10
vocab_size = {"p": 5, "w": 3, "l": 4}
n_feats = 5
@@ -103,7 +109,7 @@


def test_pipeline_multilabel():
"""Check the multilabel implementation with for mAP calculation"""
"""Check the multilabel implementation with for mAP calculation."""
length = 10
vocab_size = {"p": 3, "w": 5, "l": 4}
n_feats = 8
@@ -124,6 +130,7 @@ def test_pipeline_multilabel():


def test_raise_no_pairs():
"""Test the exception raised when no pairs are found."""
length = 10
vocab_size = {"p": 3, "w": 3, "l": 10}
n_feats = 5
@@ -143,6 +150,7 @@ def test_raise_no_pairs():


def test_raise_nan_error():
"""Test the exception raised when there are null values."""
length = 10
vocab_size = {"p": 5, "w": 3, "l": 4}
n_feats = 8
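For intuition about the quantity under test, a self-contained sketch of average precision under its standard definition (the mean of precision@k taken at the rank of each positive); this illustrates what the tests check, not necessarily copairs' exact implementation:

```python
import numpy as np


def average_precision(rel: np.ndarray) -> float:
    """AP of a ranked 0/1 relevance vector: mean precision at each positive's rank."""
    ranks = np.flatnonzero(rel) + 1  # 1-based ranks of the positives
    precision_at_hits = np.arange(1, len(ranks) + 1) / ranks
    return float(precision_at_hits.mean())


# Three positives ranked 1st, 3rd and 6th out of six results.
rel = np.array([1, 0, 1, 0, 0, 1])
print(average_precision(rel))  # (1/1 + 2/3 + 3/6) / 3 ≈ 0.722
```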
7 changes: 7 additions & 0 deletions tests/test_map_filter.py
@@ -1,3 +1,5 @@
"""Tests data filtering by query."""

import numpy as np
import pytest

@@ -9,6 +11,7 @@

@pytest.fixture
def mock_dataframe():
"""Create a mock dataframe."""
length = 10
vocab_size = {"p": 3, "w": 3, "l": 10}
pos_sameby = ["l"]
@@ -20,26 +23,30 @@ def mock_dataframe():


def test_correct(mock_dataframe):
"""Test correct query."""
df, parsed_cols = evaluate_and_filter(mock_dataframe, ["p == 'p1'", "w > 'w2'"])
assert not df.empty
assert "p" in parsed_cols and "w" in parsed_cols
assert all(df["w"].str.extract(r"(\d+)")[0].astype(int) > 2)


def test_invalid_query(mock_dataframe):
"""Test invalid query."""
with pytest.raises(ValueError) as excinfo:
evaluate_and_filter(mock_dataframe, ['l == "lHello"'])
assert "Invalid combined query expression" in str(excinfo.value)
assert "No data matched the query" in str(excinfo.value)


def test_empty_result(mock_dataframe):
"""Test empty result."""
with pytest.raises(ValueError) as excinfo:
evaluate_and_filter(mock_dataframe, ['p == "p1"', 'p == "p2"'])
assert "Duplicate queries for column" in str(excinfo.value)


def test_empty_result_from_valid_query(mock_dataframe):
"""Test empty result from valid query."""
with pytest.raises(ValueError) as excinfo:
evaluate_and_filter(mock_dataframe, ['p == "p4"'])
assert "No data matched the query" in str(excinfo.value)
28 changes: 14 additions & 14 deletions tests/test_matching.py
@@ -1,4 +1,4 @@
"""Test functions for Matcher"""
"""Test functions for Matcher."""

from string import ascii_letters

@@ -13,7 +13,7 @@


def run_stress_sample_null(dframe, num_pairs):
"""Assert every generated null pair does not match any column"""
"""Assert every generated null pair does not match any column."""
matcher = Matcher(dframe, dframe.columns, seed=SEED)
for _ in range(num_pairs):
id1, id2 = matcher.sample_null_pair(dframe.columns)
@@ -23,19 +23,19 @@ def run_stress_sample_null(dframe, num_pairs):


def test_null_sample_large():
"""Test Matcher guarantees elements with different values"""
"""Test Matcher guarantees elements with different values."""
dframe = create_dframe(32, 10000)
run_stress_sample_null(dframe, 5000)


def test_null_sample_small():
"""Test Sample with small set"""
"""Test Sample with small set."""
dframe = create_dframe(3, 10)
run_stress_sample_null(dframe, 100)


def test_null_sample_nan_vals():
"""Test NaN values are ignored"""
"""Test NaN values are ignored."""
dframe = create_dframe(4, 15)
rng = np.random.default_rng(SEED)
nan_mask = rng.random(dframe.shape) < 0.5
@@ -44,7 +44,7 @@ def test_null_sample_nan_vals():


def get_naive_pairs(dframe: pd.DataFrame, sameby, diffby):
"""Compute valid pairs using cross product from pandas"""
"""Compute valid pairs using cross product from pandas."""
cross = dframe.reset_index().merge(
dframe.reset_index(), how="cross", suffixes=("_x", "_y")
)
@@ -62,7 +62,7 @@ def get_naive_pairs(dframe: pd.DataFrame, sameby, diffby):


def check_naive(dframe, matcher: Matcher, sameby, diffby):
"""Check Matcher and naive generate same pairs"""
"""Check Matcher and naive generate same pairs."""
gt_pairs = get_naive_pairs(dframe, sameby, diffby)
vals = matcher.get_all_pairs(sameby, diffby)
vals = sum(vals.values(), [])
@@ -74,14 +74,14 @@ def check_naive(dframe, matcher: Matcher, sameby, diffby):


def check_simulated_data(length, vocab_size, sameby, diffby, rng):
"""Test sample of valid pairs from a simulated dataset"""
"""Test sample of valid pairs from a simulated dataset."""
dframe = simulate_random_dframe(length, vocab_size, sameby, diffby, rng)
matcher = Matcher(dframe, dframe.columns, seed=SEED)
check_naive(dframe, matcher, sameby, diffby)


def test_stress_simulated_data():
"""Run multiple tests using simulated data"""
"""Run multiple tests using simulated data."""
rng = np.random.default_rng(SEED)
num_cols_range = [2, 6]
vocab_size_range = [5, 10]
@@ -99,15 +99,15 @@ def test_stress_simulated_data():


def test_empty_sameby():
"""Test query without sameby"""
"""Test query without sameby."""
dframe = create_dframe(3, 10)
matcher = Matcher(dframe, dframe.columns, seed=SEED)
check_naive(dframe, matcher, sameby=[], diffby=["w", "c"])
check_naive(dframe, matcher, sameby=[], diffby=["w"])


def test_empty_diffby():
"""Test query without diffby"""
"""Test query without diffby."""
dframe = create_dframe(3, 10)
matcher = Matcher(dframe, dframe.columns, seed=SEED)
matcher.get_all_pairs(["c"], [])
@@ -116,23 +116,23 @@ def test_raise_distjoint():


def test_raise_distjoint():
"""Test check for disjoint sameby and diffby"""
"""Test check for disjoint sameby and diffby."""
dframe = create_dframe(3, 10)
matcher = Matcher(dframe, dframe.columns, seed=SEED)
with pytest.raises(ValueError, match="must be disjoint lists"):
matcher.get_all_pairs("c", ["w", "c"])


def test_raise_no_params():
"""Test check for at least one of sameby and diffby"""
"""Test check for at least one of sameby and diffby."""
dframe = create_dframe(3, 10)
matcher = Matcher(dframe, dframe.columns, seed=SEED)
with pytest.raises(ValueError, match="at least one should be provided"):
matcher.get_all_pairs([], [])


def assert_sameby_diffby(dframe: pd.DataFrame, pairs_dict: dict, sameby, diffby):
"""Assert the pairs are valid"""
"""Assert the pairs are valid."""
for _, pairs in pairs_dict.items():
for id1, id2 in pairs:
for col in sameby:
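For intuition about the reference that `check_naive` compares against: `get_naive_pairs` builds every candidate pair with a pandas cross join and keeps those satisfying the sameby/diffby conditions. A self-contained sketch of that reference on a tiny frame (illustrative data; the real helper above works the same way on the simulated frames):

```python
import pandas as pd

dframe = pd.DataFrame({"c": ["c0", "c0", "c1", "c1"], "w": ["w0", "w1", "w0", "w1"]})
sameby, diffby = ["c"], ["w"]

cross = dframe.reset_index().merge(
    dframe.reset_index(), how="cross", suffixes=("_x", "_y")
)

# Keep ordered, distinct index pairs that agree on every sameby column
# and differ on every diffby column.
valid = cross["index_x"] < cross["index_y"]
for col in sameby:
    valid &= cross[f"{col}_x"] == cross[f"{col}_y"]
for col in diffby:
    valid &= cross[f"{col}_x"] != cross[f"{col}_y"]

pairs = cross.loc[valid, ["index_x", "index_y"]].to_numpy()
print(pairs)  # [[0 1], [2 3]]: same compound, different well
```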