Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Harmonizing default Mixpanel properties across Rails & ingest (SCP-5798) #366

Merged
merged 6 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/minify_ontologies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ name: Minify ontologies
on:
pull_request:
types: [opened] # Only trigger on PR "opened" event
# push: # Uncomment, update branches to develop / debug
# branches:
# ew-minify-ontologies
# push: # Uncomment, update branches to develop / debug
# branches:
# jb-anndata-mixpanel-props

jobs:
build:
Expand Down Expand Up @@ -86,7 +86,7 @@ jobs:

# Commit changes
git commit -m "Update minified ontologies via GitHub Actions"
git push origin HEAD
git push origin ${{ github.ref_name }}
else
echo "No changes to commit."
fi
23 changes: 15 additions & 8 deletions ingest/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@
MONGO_CONNECTION = MongoConnection()


def init(study_id, study_file_id, user_metric_uuid=None, action=None):
    """Initialize the module-level Mixpanel metric properties.

    Builds a MetricProperties instance from the given study/file identifiers
    and stores it in the module global read back via get_metric_properties().

    :param study_id: MongoDB ObjectId string of the study being ingested
    :param study_file_id: MongoDB ObjectId string of the study file
    :param user_metric_uuid: optional Mixpanel distinct_id for the user;
        when None, MetricProperties falls back to its anonymous USER_ID
    :param action: optional ingest action name (e.g. 'ingest_anndata')
        reported to Mixpanel as the "action" property
    """
    global __metric_properties

    study = Study(study_id)
    study_file = StudyFile(study_file_id)
    __metric_properties = MetricProperties(study, study_file, user_metric_uuid, action)


def set_parent_event_name(event_name):
Expand All @@ -39,7 +39,7 @@ class MetricProperties:
# This is a generic write-only log token, not a secret
USER_ID = "2f30ec50-a04d-4d43-8fd1-b136a2045079"

def __init__(self, study, study_file, user_uuid=None):
def __init__(self, study, study_file, user_uuid=None, action=None):
distinct_id = user_uuid if user_uuid else MetricProperties.USER_ID
self.__properties = {
"distinct_id": distinct_id,
Expand All @@ -50,7 +50,11 @@ def __init__(self, study, study_file, user_uuid=None):
"trigger": study_file.trigger,
"logger": "ingest-pipeline",
"appId": "single-cell-portal",
"action": action
}
# merge in referenceAnnDataFile if necessary
if study_file.file_type == 'AnnData':
self.__properties["referenceAnnDataFile"] = study_file.is_reference_anndata

def get_properties(self):
    """Return the assembled dict of Mixpanel event properties."""
    properties = self.__properties
    return properties
Expand Down Expand Up @@ -171,12 +175,15 @@ def study_file(self, study_file_id):
self.file_type = self.study_file["file_type"]
self.file_size = self.study_file["upload_file_size"]
self.file_name = self.study_file["name"]
upload_trigger = self.study_file.get("options", {}).get("upload_trigger")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some day we'll get optional chaining in Python!

# when set, remote_location is the name of the file in the bucket
if self.study_file.get("remote_location") is not None:
if self.study_file["remote_location"] == "":
self.trigger = 'upload'
else:
self.trigger = 'sync'
if upload_trigger is not None:
self.trigger = upload_trigger
elif self.study_file["remote_location"] is not None:
self.trigger = 'upload' if self.study_file["remote_location"] == "" else 'sync'
# indicate trigger state for tests/mocks
else:
self.trigger = 'not set'

if self.study_file["file_type"] == 'AnnData':
self.is_reference_anndata = self.study_file.get("ann_data_file_info", {}).get("reference_file")
13 changes: 12 additions & 1 deletion ingest/ingest_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/sparse/sparsemini_matrix.mtx --gene-file ../tests/data/differential_expression/sparse/sparsemini_features.tsv --barcode-file ../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv --matrix-file-type mtx --annotation-file ../tests/data/differential_expression/sparse/sparsemini_metadata.txt --cluster-file ../tests/data/differential_expression/sparse/sparsemini_cluster.txt --cluster-name de_sparse_integration --study-accession SCPsparsemini --differential-expression

# Differential expression analysis (h5ad matrix)
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/de_dense_matrix.tsv --matrix-file-type h5ad --annotation-file ../tests/data/differential_expression/de_dense_metadata.tsv --cluster-file ../tests/data/differential_expression/de_dense_cluster.tsv --cluster-name de_integration --study-accession SCPdev --differential-expression
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name louvain --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --matrix-file-type h5ad --annotation-file ../tests/data/anndata/h5ad_frag.metadata.tsv --cluster-file ../tests/data/anndata/h5ad_frag.cluster.X_umap.tsv --cluster-name umap --study-accession SCPdev --differential-expression

"""
import json
Expand Down Expand Up @@ -124,6 +124,12 @@ class IngestPipeline:
"../schema/alexandria_convention/alexandria_convention_schema.json"
)

# array of actions to use when reporting to Mixpanel
ACTION_NAMES = [
'ingest_cluster', 'ingest_cell_metadata', 'ingest_expression', 'ingest_anndata', 'ingest_subsample',
'ingest_differential_expression', 'differential_expression', 'render_expression_arrays', 'rank_genes'
]

# Logger provides more details for trouble shooting
dev_logger = setup_logger(__name__, "log.txt", format="support_configs")
user_logger = setup_logger(__name__ + ".usr", "user_log.txt", level=logging.ERROR)
Expand Down Expand Up @@ -786,6 +792,10 @@ def exit_pipeline(ingest, status, status_cell_metadata, arguments):
sys.exit(os.EX_DATAERR)
sys.exit(1)

def get_action_from_args(arguments):
    """Get the action from list of arguments denoting which data type is being ingested/extracted.

    Scans the arguments in order and returns the first one that matches a known
    action in IngestPipeline.ACTION_NAMES.  The original set-intersection
    approach (list(set(...) & set(...))[0]) returned an arbitrary element when
    more than one action name was present, since set ordering is unspecified;
    a first-match scan is deterministic and identical in the single-match case.

    :param arguments: iterable of CLI argument strings
    :return: the matched action name, or "" when no action is present
    """
    known_actions = set(IngestPipeline.ACTION_NAMES)  # O(1) membership tests
    return next((arg for arg in arguments if arg in known_actions), "")

def main() -> None:
"""Enables running Ingest Pipeline via CLI
Expand All @@ -811,6 +821,7 @@ def main() -> None:
arguments["study_id"],
arguments["study_file_id"],
arguments["user_metrics_uuid"],
get_action_from_args(arguments)
)
ingest = IngestPipeline(**arguments)
status, status_cell_metadata = run_ingest(ingest, arguments, parsed_args)
Expand Down
Binary file modified ingest/validation/ontologies/efo.min.tsv.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion ingest/validation/ontologies/version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1726164618 # validation cache key
1726600528 # validation cache key
20 changes: 20 additions & 0 deletions tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
IngestPipeline,
exit_pipeline,
run_ingest,
get_action_from_args
)
from expression_files.expression_files import GeneExpression

Expand Down Expand Up @@ -793,6 +794,25 @@ def test_extract_processed_matrix_from_anndata(self):
except:
print(f"Error while deleting file : {file}")

def test_get_action_from_args(self):
    """Verify the Mixpanel action token is extracted from a realistic CLI arg list."""
    cli_args = [
        "--study-id", "5d276a50421aa9117c982845",
        "--study-file-id", "5dd5ae25421aa910a723a337",
        "ingest_subsample",
        "--cluster-file", "../tests/data/good_subsample_cluster.csv",
        "--name", "cluster1",
        "--cell-metadata-file", "../tests/data/test_cell_metadata.csv",
        "--subsample",
    ]
    self.assertEqual("ingest_subsample", get_action_from_args(cli_args))
    # no recognized action present -> empty string
    self.assertEqual("", get_action_from_args(["foo", "bar", "bing"]))


if __name__ == "__main__":
unittest.main()
37 changes: 35 additions & 2 deletions tests/test_metrics_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def mock_post_event(props):
props = json.loads(props)
expected_props = {
"properties": {
"action": "ingest_expression",
"studyAccession": "SCP123",
"fileName": "File_name.txt",
"fileType": "Expression matrix",
Expand All @@ -29,6 +30,7 @@ def mock_post_event(props):
"logger": "ingest-pipeline",
"appId": "single-cell-portal",
"status": "success",
"trigger": "bucket"
}
}
if props["event"] == "file-validation":
Expand Down Expand Up @@ -56,6 +58,8 @@ class MetricsServiceTestCase(unittest.TestCase):
"file_type": "Expression matrix",
"upload_file_size": 400,
"name": "File_name.txt",
"remote_location": "File_name.txt",
"options": { "upload_trigger": "bucket" }
}
]
# Build client mock with functions and return values needed to query
Expand Down Expand Up @@ -94,7 +98,7 @@ def test_log(self, mock_execute_ingest, mock_MONGO_CONNECTION, mock_post_event):
"dense",
]
# Initialize global variables
config.init("5d276a50421aa9117c982845", "5dd5ae25421aa910a723a337")
config.init("5d276a50421aa9117c982845", "5dd5ae25421aa910a723a337", None, 'ingest_expression')
config.set_parent_event_name("ingest-pipeline:expression:ingest")
IngestTestCase.execute_ingest(args)
metrics_model = config.get_metric_properties()
Expand All @@ -108,10 +112,39 @@ def test_log(self, mock_execute_ingest, mock_MONGO_CONNECTION, mock_post_event):
# args.insert(3).append(user_metrics_uuid)
# Initialize global variables
config.init(
"5d276a50421aa9117c982845", "5dd5ae25421aa910a723a337", user_metrics_uuid
"5d276a50421aa9117c982845", "5dd5ae25421aa910a723a337", user_metrics_uuid, 'ingest_expression'
)
config.set_parent_event_name("ingest-pipeline:expression:ingest")
IngestTestCase.execute_ingest(args)
metrics_model = config.get_metric_properties()
# Log Mixpanel events
MetricsService.log(config.get_parent_event_name(), metrics_model)

@patch("config.MONGO_CONNECTION")
def test_init(self, mock_mongo_conn):
"""Verify config.init() derives action, trigger, and referenceAnnDataFile props.

Mocks the Mongo client so StudyFile/Study lookups return a canned AnnData
study file, then checks the resulting Mixpanel properties dict.
"""
# Map collection names to mocks whose .find() returns fixture documents
client_values = {}
client_values["study_accessions"] = MagicMock()
client_values["study_accessions"].find.return_value = [{"accession": "SCP123"}]
client_values["study_files"] = MagicMock()
# AnnData file fixture: empty remote_location and a non-reference
# ann_data_file_info drive the trigger/referenceAnnDataFile assertions below
client_values["study_files"].find.return_value = [
{
"file_type": "AnnData",
"upload_file_size": 400,
"name": "matrix.h5ad",
"remote_location": "",
"ann_data_file_info": {
"reference_file": False
}
}
]
# Build client mock with functions and return values needed to query
client_mock = MagicMock()
client_mock.__getitem__.side_effect = client_values.__getitem__
mock_mongo_conn._client = client_mock
# NOTE(review): user_metrics_uuid is presumably a module-level fixture
# defined elsewhere in this test file (also used by test_log) — confirm
config.init(
"5d276a50421aa9117c982845", "5dd5ae25421aa910a723a337", user_metrics_uuid, "ingest_anndata"
)
props = config.get_metric_properties().get_properties()
# action is passed straight through init() to MetricProperties
self.assertEqual("ingest_anndata", props["action"])
# remote_location == "" with no upload_trigger option -> 'upload' trigger
self.assertEqual("upload", props["trigger"])
# reference_file False in ann_data_file_info -> referenceAnnDataFile False
self.assertFalse(props["referenceAnnDataFile"])