Skip to content

Commit

Permalink
Merge pull request #264 from CanDIG/daisieh/sample-obj
Browse files Browse the repository at this point in the history
DIG-1252: add submitter_sample_id information to genomic drs objects
  • Loading branch information
daisieh authored Jul 21, 2023
2 parents e9fbcfd + 663a333 commit d3395a7
Show file tree
Hide file tree
Showing 5 changed files with 164 additions and 38 deletions.
2 changes: 1 addition & 1 deletion htsget_server/beacon_openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -864,7 +864,7 @@ components:
description: Reference to the bioinformatics analysis ID (`analysis.id`)
type: string
biosampleId:
description: Reference to biosample ID (`biosample.id`)
description: Reference to biosample ID (`biosample.id`). For MoH, this will be `{program_id}~{submitter_sample_id}`, delimited with a tilde. If this is not available, it will be the name of the sample as listed in the variant file.
type: string
clinicalInterpretations:
items:
Expand Down
22 changes: 20 additions & 2 deletions htsget_server/drs_openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ components:
accession numbers or external GUIDs.
GenomicDrsObject:
type: object
description: A DrsObject that describes a bundled genomic data entity. It usually will consist of a genomic data file, e.g. a variant or read file, and its associated index file.
description: A DrsObject that describes a bundled genomic data entity. It usually will consist of a genomic data file, e.g. a variant or read file, and its associated index file. Its contents should also include any associated Samples (as SampleContentsObjects), listed in their order of appearance in the associated variant/read files.
required:
- id
- contents
Expand Down Expand Up @@ -418,12 +418,13 @@ components:
= f7a29a04
contents:
type: array
description: The specific genomic contents objects that comprise this genomic DRS entity. Should contain a GenomicDataContentsObject and an optional GenomicIndexContentsObject.
description: The specific genomic contents objects that comprise this genomic DRS entity. Should contain a GenomicDataContentsObject, an optional GenomicIndexContentsObject, and SampleContentsObjects corresponding to the samples analyzed in the genomic data object.
minItems: 1
items:
anyOf:
- $ref: '#/components/schemas/GenomicDataContentsObject'
- $ref: '#/components/schemas/GenomicIndexContentsObject'
- $ref: '#/components/schemas/SampleContentsObject'
description:
type: string
description: A human readable description of the `DrsObject`.
Expand Down Expand Up @@ -697,6 +698,23 @@ components:
description: The DRS uri(s) to the GenomicDrsObject
items:
type: string
# Schema for one sample entry; it is listed among the `anyOf` item types of the
# genomic DRS `contents` array. `name` and `id` are mandatory.
SampleContentsObject:
type: object
required:
- name
- id
properties:
name:
type: string
description: The submitter_sample_id of the sample in the MoH model
id:
type: string
description: The id of the SampleDrsObject, corresponding to the sample's name in the VCF file
# drs_uri is not in the `required` list above, so an entry may omit it.
drs_uri:
type: array
description: The DRS uri(s) to the SampleDrsObject
items:
type: string
GenomicDataContentsObject:
type: object
required:
Expand Down
9 changes: 8 additions & 1 deletion htsget_server/drs_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@ def _get_genomic_obj(object_id):
if 'message' in main_result:
result = main_result
else:
if "samples" in drs_obj:
result['samples'] = drs_obj['samples']
try:
result['file_format'] = drs_obj['format']
if drs_obj['type'] == 'read':
Expand All @@ -182,7 +184,7 @@ def _describe_drs_object(object_id):
result = {
"name": object_id
}
# drs_obj should have two contents objects
# drs_obj should have a main contents, index contents, and sample contents
if "contents" in drs_obj:
for contents in drs_obj["contents"]:
# get each drs object (should be the genomic file and its index)
Expand All @@ -205,6 +207,11 @@ def _describe_drs_object(object_id):
result['main'] = contents['name']
elif index_match is not None:
result['index'] = contents['name']
else:
if "samples" not in result:
result['samples'] = {}
result['samples'][contents['id']] = contents['name']

if 'type' not in result:
return {"message": f"drs object {object_id} does not represent an htsget object", "status_code": 404}
return result
Expand Down
5 changes: 4 additions & 1 deletion htsget_server/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,10 @@ def parse_vcf_file(drs_object_id, reference_name=None, start=None, end=None):
for r in records:
samples = []
for s in r.samples:
samples.append(s)
if "samples" in gen_obj and s in gen_obj['samples']:
samples.append(gen_obj['samples'][s])
else:
samples.append(s)
variant_record = parse_variant_record(str(r), samples, variants_by_file['info'])
variants_by_file['variants'].append(variant_record)
return variants_by_file
Expand Down
164 changes: 131 additions & 33 deletions tests/test_htsget_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,95 @@ def test_install_public_object():
assert response.status_code == 200


def get_ingest_file():
    """Return the sample-ingest fixtures as ``(ingest_map, program_id)`` pairs.

    Each ingest map carries a ``genomic_id`` and a ``samples`` list whose
    entries hold a ``sample_registration_id`` and the matching
    ``sample_name_in_file``. A fresh structure is built on every call.
    """
    tumor_sample = {
        "sample_registration_id": "SAMPLE_REGISTRATION_3",
        "sample_name_in_file": "TUMOR",
    }
    normal_sample = {
        "sample_registration_id": "SAMPLE_REGISTRATION_4",
        "sample_name_in_file": "NORMAL",
    }
    multisample_map = {
        "genomic_id": "multisample_1",
        "samples": [tumor_sample, normal_sample],
    }
    return [(multisample_map, "SYNTHETIC-2")]


def get_ingest_sample_names(genomic_id):
    """Map each in-file sample name to its ``{program_id}~{sample_registration_id}``.

    Only ingest maps from ``get_ingest_file()`` whose ``genomic_id`` equals
    the argument contribute entries; an unknown id yields an empty dict.
    """
    names = {}
    for ingest_map, program_id in get_ingest_file():
        if ingest_map["genomic_id"] != genomic_id:
            continue
        names.update(
            (sample['sample_name_in_file'], f"{program_id}~{sample['sample_registration_id']}")
            for sample in ingest_map["samples"]
        )
    return names


# Parametrized over get_ingest_file(): each case supplies an ingest map
# (`input`) and a `program_id`. The test fetches the genomic DRS object named
# by input['genomic_id'], (re)creates one sample DRS object per listed sample,
# appends a matching sample contents entry to the genomic object, and posts
# the genomic object back.
# NOTE(review): indentation is not visible in this view, so which statements
# are nested under each `if response.status_code == 200:` cannot be
# determined here -- confirm against the real file.
@pytest.mark.parametrize('input, program_id', get_ingest_file())
def test_add_sample_drs(input, program_id):
post_url = f"{HOST}/ga4gh/drs/v1/objects"
headers = get_headers()

# look for the main genomic drs object
get_url = f"{HOST}/ga4gh/drs/v1/objects/{input['genomic_id']}"
response = requests.request("GET", get_url, headers=headers)
# NOTE(review): if the assert below is guarded by this `if`, it can never fail.
if response.status_code == 200:
assert response.status_code == 200
genomic_drs_obj = response.json()

# Build drs:// URIs by swapping the http(s) scheme of HOST.
drs_url = HOST.replace("http://", "drs://").replace("https://", "drs://")
for sample in input['samples']:
# Sample DRS ids are "<program_id>~<sample_registration_id>".
sample_id = f"{program_id}~{sample['sample_registration_id']}"
# remove any existing objects:
sample_url = f"{HOST}/ga4gh/drs/v1/objects/{sample_id}"
response = requests.request("GET", sample_url, headers=headers)
if response.status_code == 200:
response = requests.request("DELETE", sample_url, headers=headers)
print(f"DELETE {sample_id}: {response.text}")
assert response.status_code == 200

# create a sampledrsobject to correspond to each sample:
# its single contents entry points back at the genomic object and is named
# after the sample's name in the file.
sample_drs_object = {
"id": sample_id,
"contents": [
{
"drs_uri": [
f"{drs_url}/{input['genomic_id']}"
],
"name": sample['sample_name_in_file'],
"id": input['genomic_id']
}
],
"version": "v1"
}
response = requests.request("POST", post_url, json=sample_drs_object, headers=headers)
print(f"POST {sample_drs_object['id']}: {response.text}")
assert response.status_code == 200

# add the sample contents to the genomic_drs_object's contents
# (here `name` is the sample DRS id and `id` is the sample's name in the file)
sample_contents = {
"drs_uri": [
f"{drs_url}/{sample_id}"
],
"name": sample_id,
"id": sample['sample_name_in_file']
}
genomic_drs_obj["contents"].append(sample_contents)

# Post the updated genomic object; the status of this response is not asserted.
response = requests.post(post_url, json=genomic_drs_obj, headers=get_headers())
print(response.text)
response = requests.request("GET", get_url, headers=headers)
if response.status_code == 200:
assert response.status_code == 200
# NOTE(review): this checks the locally mutated genomic_drs_obj, not the body of
# the GET just issued. The expected 4 is presumably the data and index entries
# plus the two samples -- confirm.
assert len(genomic_drs_obj["contents"]) == 4


def invalid_start_end_data():
    """Return the two integer pairs that serve as invalid start/end test data.

    The consumer of these pairs is outside this view; going by the function
    name, each tuple is presumably a ``(start, end)`` combination.
    """
    first_pair = (17_123_456, 23_588)
    second_pair = (9_203, 42_220_938)
    return [first_pair, second_pair]

Expand Down Expand Up @@ -307,41 +396,41 @@ def test_beacon_get_search():


def get_beacon_post_search():
    """Return the parametrize cases for ``test_beacon_post_search``.

    Each case is a ``(body, count, cases)`` tuple: ``body`` is the JSON
    request body, ``count`` is the expected number of entries in the
    response, and ``cases`` is the expected length of the first entry's
    ``caseLevelData``.
    """
    return [
        (
            # 6 variations, corresponding to three variant records in multisample_1 and multisample_2
            # first variation, corresponding to "NC_000021.8:g.5030551=", should contain two cases
            {
                "query": {
                    "requestParameters": {
                        "start": [5030000],
                        "end": [5030847],
                        "assemblyId": "hg37",
                        "referenceName": "21"
                    }
                },
                "meta": {
                    "apiVersion": "v2"
                }
            }, 6, 2
        ),
        (
            # 5 variations, corresponding to 2 refs and 3 alts in test
            # first variation has two cases
            {
                "query": {
                    "requestParameters": {
                        "start": [16562322],
                        "end": [16613564],
                        "referenceName": "1"
                    }
                },
                "meta": {
                    "apiVersion": "v2"
                }
            }, 5, 2
        )
    ]


@pytest.mark.parametrize('body, count, cases', get_beacon_post_search())
Expand All @@ -353,6 +442,15 @@ def test_beacon_post_search(body, count, cases):
assert len(response.json()['response']) == count
assert len(response.json()['response'][0]['caseLevelData']) == cases

# check to see if the sample names got in:
samples = get_ingest_sample_names('multisample_1')
for cld in response.json()['response'][0]['caseLevelData']:
if cld['analysisId'] == 'multisample_1':
assert cld['biosampleId'] in samples.values()
else:
assert cld['biosampleId'] not in samples.values()


# if we search for NBPF1, we should find records in test.vcf that contain NBPF1 in their VEP annotations.
def test_beacon_search_annotations():
url = f"{HOST}/beacon/v2/g_variants"
Expand Down

0 comments on commit d3395a7

Please sign in to comment.