Merge pull request #264 from CanDIG/daisieh/sample-obj

DIG-1252: add submitter_sample_id information to genomic drs objects
CanDIG · Jul 21, 2023 · d3395a7 · d3395a7
2 parents e9fbcfd + 663a333
commit d3395a7
Show file tree

Hide file tree

Showing 5 changed files with 164 additions and 38 deletions.
diff --git a/htsget_server/beacon_openapi.yaml b/htsget_server/beacon_openapi.yaml
@@ -864,7 +864,7 @@ components:
                 description: Reference to the bioinformatics analysis ID (`analysis.id`)
                 type: string
               biosampleId:
-                description: Reference to biosample ID (`biosample.id`)
+                description: Reference to biosample ID (`biosample.id`). For MoH, this will be `{program_id}~{submitter_sample_id}`, delimited with a tilde. If this is not available, it will be the name of the sample as listed in the variant file.
                 type: string
               clinicalInterpretations:
                 items:

diff --git a/htsget_server/drs_openapi.yaml b/htsget_server/drs_openapi.yaml
@@ -349,7 +349,7 @@ components:
             accession numbers or external GUIDs.
     GenomicDrsObject:
       type: object
-      description: A DrsObject that describes a bundled genomic data entity. It usually will consist of a genomic data file, e.g. a variant or read file, and its associated index file.
+      description: A DrsObject that describes a bundled genomic data entity. It usually will consist of a genomic data file, e.g. a variant or read file, and its associated index file. Its contents should also include any associated Samples (as SampleContentsObjects), listed in the order in which they appear in the associated variant/read files.
       required:
         - id
         - contents
@@ -418,12 +418,13 @@ components:
             = f7a29a04
         contents:
           type: array
-          description: The specific genomic contents objects that comprise this genomic DRS entity. Should contain a GenomicDataContentsObject and an optional GenomicIndexContentsObject.
+          description: The specific genomic contents objects that comprise this genomic DRS entity. Should contain a GenomicDataContentsObject, an optional GenomicIndexContentsObject, and SampleContentsObjects corresponding to the samples analyzed in the genomic data object.
           minItems: 1
           items:
             anyOf:
               - $ref: '#/components/schemas/GenomicDataContentsObject'
               - $ref: '#/components/schemas/GenomicIndexContentsObject'
+              - $ref: '#/components/schemas/SampleContentsObject'
         description:
           type: string
           description: A human readable description of the `DrsObject`.
@@ -697,6 +698,23 @@ components:
           description: The DRS uri(s) to the GenomicDrsObject
           items:
             type: string
+    SampleContentsObject:
+      type: object
+      required:
+        - name
+        - id
+      properties:
+        name:
+          type: string
+          description: The submitter_sample_id of the sample in the MoH model
+        id:
+          type: string
+          description: The id of the SampleDrsObject, corresponding to the sample's name in the VCF file
+        drs_uri:
+          type: array
+          description: The DRS uri(s) to the SampleDrsObject
+          items:
+            type: string
     GenomicDataContentsObject:
       type: object
       required:

diff --git a/htsget_server/drs_operations.py b/htsget_server/drs_operations.py
@@ -163,6 +163,8 @@ def _get_genomic_obj(object_id):
         if 'message' in main_result:
             result = main_result
         else:
+            if "samples" in drs_obj:
+                result['samples'] = drs_obj['samples']
             try:
                 result['file_format'] = drs_obj['format']
                 if drs_obj['type'] == 'read':
@@ -182,7 +184,7 @@ def _describe_drs_object(object_id):
     result = {
         "name": object_id
     }
-    # drs_obj should have two contents objects
+    # drs_obj should have a main contents, index contents, and sample contents
     if "contents" in drs_obj:
         for contents in drs_obj["contents"]:
             # get each drs object (should be the genomic file and its index)
@@ -205,6 +207,11 @@ def _describe_drs_object(object_id):
                 result['main'] = contents['name']
             elif index_match is not None:
                 result['index'] = contents['name']
+            else:
+                if "samples" not in result:
+                    result['samples'] = {}
+                result['samples'][contents['id']] = contents['name']
+
     if 'type' not in result:
         return {"message": f"drs object {object_id} does not represent an htsget object", "status_code": 404}
     return result

diff --git a/htsget_server/variants.py b/htsget_server/variants.py
@@ -67,7 +67,10 @@ def parse_vcf_file(drs_object_id, reference_name=None, start=None, end=None):
     for r in records:
         samples = []
         for s in r.samples:
-            samples.append(s)
+            if "samples" in gen_obj and s in gen_obj['samples']:
+                samples.append(gen_obj['samples'][s])
+            else:
+                samples.append(s)
         variant_record = parse_variant_record(str(r), samples, variants_by_file['info'])
         variants_by_file['variants'].append(variant_record)
     return variants_by_file

diff --git a/tests/test_htsget_server.py b/tests/test_htsget_server.py
@@ -184,6 +184,95 @@ def test_install_public_object():
         assert response.status_code == 200
 
 
+def get_ingest_file():
+    return [
+        (
+            {
+                "genomic_id": "multisample_1",
+                "samples": [
+                    {
+                        "sample_registration_id": "SAMPLE_REGISTRATION_3",
+                        "sample_name_in_file": "TUMOR"
+                    },
+                    {
+                        "sample_registration_id": "SAMPLE_REGISTRATION_4",
+                        "sample_name_in_file": "NORMAL"
+                    }
+                ]
+            }, "SYNTHETIC-2"
+        )
+    ]
+
+
+def get_ingest_sample_names(genomic_id):
+    result = {}
+    for item in get_ingest_file():
+        ingest_map, program_id = item
+        if ingest_map["genomic_id"] == genomic_id:
+            for sample in ingest_map["samples"]:
+                result[sample['sample_name_in_file']] = f"{program_id}~{sample['sample_registration_id']}"
+    return result
+
+
+@pytest.mark.parametrize('input, program_id', get_ingest_file())
+def test_add_sample_drs(input, program_id):
+    post_url = f"{HOST}/ga4gh/drs/v1/objects"
+    headers = get_headers()
+
+    # look for the main genomic drs object
+    get_url = f"{HOST}/ga4gh/drs/v1/objects/{input['genomic_id']}"
+    response = requests.request("GET", get_url, headers=headers)
+    if response.status_code == 200:
+        assert response.status_code == 200
+    genomic_drs_obj = response.json()
+
+    drs_url = HOST.replace("http://", "drs://").replace("https://", "drs://")
+    for sample in input['samples']:
+        sample_id = f"{program_id}~{sample['sample_registration_id']}"
+        # remove any existing objects:
+        sample_url = f"{HOST}/ga4gh/drs/v1/objects/{sample_id}"
+        response = requests.request("GET", sample_url, headers=headers)
+        if response.status_code == 200:
+            response = requests.request("DELETE", sample_url, headers=headers)
+            print(f"DELETE {sample_id}: {response.text}")
+            assert response.status_code == 200
+
+        # create a sampledrsobject to correspond to each sample:
+        sample_drs_object = {
+            "id": sample_id,
+            "contents": [
+                {
+                    "drs_uri": [
+                        f"{drs_url}/{input['genomic_id']}"
+                    ],
+                    "name": sample['sample_name_in_file'],
+                    "id": input['genomic_id']
+                }
+            ],
+            "version": "v1"
+        }
+        response = requests.request("POST", post_url, json=sample_drs_object, headers=headers)
+        print(f"POST {sample_drs_object['id']}: {response.text}")
+        assert response.status_code == 200
+
+        # add the sample contents to the genomic_drs_object's contents
+        sample_contents = {
+            "drs_uri": [
+                f"{drs_url}/{sample_id}"
+            ],
+            "name": sample_id,
+            "id": sample['sample_name_in_file']
+        }
+        genomic_drs_obj["contents"].append(sample_contents)
+
+    response = requests.post(post_url, json=genomic_drs_obj, headers=get_headers())
+    print(response.text)
+    response = requests.request("GET", get_url, headers=headers)
+    if response.status_code == 200:
+        assert response.status_code == 200
+    assert len(genomic_drs_obj["contents"]) == 4
+
+
 def invalid_start_end_data():
     return [(17123456, 23588), (9203, 42220938)]
 
@@ -307,41 +396,41 @@ def test_beacon_get_search():
 
 
 def get_beacon_post_search():
-        return [
-            (
-                # 6 variations, corresponding to three variant records in multisample_1 and multisample_2
-                # first variation, corresponding to "NC_000021.8:g.5030551=", should contain two cases
-                {
-                    "query": {
-                        "requestParameters": {
-                            "start": [5030000],
-                            "end": [5030847],
-                            "assemblyId": "hg37",
-                            "referenceName": "21"
-                        }
-                    },
-                    "meta": {
-                        "apiVersion": "v2"
+    return [
+        (
+            # 6 variations, corresponding to three variant records in multisample_1 and multisample_2
+            # first variation, corresponding to "NC_000021.8:g.5030551=", should contain two cases
+            {
+                "query": {
+                    "requestParameters": {
+                        "start": [5030000],
+                        "end": [5030847],
+                        "assemblyId": "hg37",
+                        "referenceName": "21"
                     }
-                }, 6, 2
-            ),
-            (
-                # 5 variations, corresponding to 2 refs and 3 alts in test
-                # first variation has two cases
-                {
-                    "query": {
-                        "requestParameters": {
-                            "start": [16562322],
-                            "end": [16613564],
-                            "referenceName": "1"
-                        }
-                    },
-                    "meta": {
-                        "apiVersion": "v2"
+                },
+                "meta": {
+                    "apiVersion": "v2"
+                }
+            }, 6, 2
+        ),
+        (
+            # 5 variations, corresponding to 2 refs and 3 alts in test
+            # first variation has two cases
+            {
+                "query": {
+                    "requestParameters": {
+                        "start": [16562322],
+                        "end": [16613564],
+                        "referenceName": "1"
                     }
-                }, 5, 2
-            )
-        ]
+                },
+                "meta": {
+                    "apiVersion": "v2"
+                }
+            }, 5, 2
+        )
+    ]
 
 
 @pytest.mark.parametrize('body, count, cases', get_beacon_post_search())
@@ -353,6 +442,15 @@ def test_beacon_post_search(body, count, cases):
     assert len(response.json()['response']) == count
     assert len(response.json()['response'][0]['caseLevelData']) == cases
 
+    # check to see if the sample names got in:
+    samples = get_ingest_sample_names('multisample_1')
+    for cld in response.json()['response'][0]['caseLevelData']:
+        if cld['analysisId'] == 'multisample_1':
+            assert cld['biosampleId'] in samples.values()
+        else:
+            assert cld['biosampleId'] not in samples.values()
+
+
 # if we search for NBPF1, we should find records in test.vcf that contain NBPF1 in their VEP annotations.
 def test_beacon_search_annotations():
     url = f"{HOST}/beacon/v2/g_variants"