From 794de95206e7a6b64684ae269af595819d473edd Mon Sep 17 00:00:00 2001 From: mgcam Date: Mon, 26 Feb 2024 13:48:13 +0000 Subject: [PATCH] Reviewed and updated the documentation --- README.md | 81 +++++++++++++++++------------------- npg_id_generation/pac_bio.py | 76 +++++++++++++++++++++++++-------- 2 files changed, 97 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index f26507f..b273a2c 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,43 @@ # npg_id_generation -An API used to generate product IDs, which are hashes of the JSON representation -of an object. - -For different sequencing platforms different sets of identifiers might be used to -fully describe the origin of data. For reasons of efficiency and interobility -between different systems it is sometimes desirable to be able to use a single -identifier, which will be unique not only within data for a single platform, -but also between different platforms. - +For different sequencing platforms different sets of attributes are used to +fully describe the origin of data. For reasons of efficiency and interoperability +between systems (for example, databases, long term data storage) it is +sometimes desirable to use a single identifier, which is globally unique. + +Historically, the first algorithm for generating this kind of unique data +identifier was implemented for the Illumina sequencing platform, see +[documentation](https://github.com/wtsi-npg/npg_tracking/blob/master/lib/npg_tracking/glossary/composition.pm). In the Sanger Institute run ID, lane number and numerical tag index are used -as identifiers for the Illumina platform. Historically, the first algorithm for -generating unique identifiers was implemented in Perl for the Illumina platform, -see [documentation](https://github.com/wtsi-npg/npg_tracking/blob/master/lib/npg_tracking/glossary/composition.pm -). +to describe the origin of the data. The above Perl API uses these attributes of +the data to produce the unique ID. 
Later a need to have a similar API for other sequencing platforms arose. This -package implements a Python API. The attributes of objects are sequencing -platform specific. The generator for the PacBio platform is implemented by the +package implements a Python API. The rationale for ID generation for an +arbitrary sequencing platform is as follows: + +1. Implement a Python class that encapsulates a data model for the sequencing + platform under consideration. +2. This class should have a constructor, which returns an object representing + an instantiation of the data model for a particular set of attributes. The + nature of these attributes might differ between sequencing platforms. +3. This class should have an instance method that returns a unique ID. +4. This class should have an instance method that returns a human-readable + JSON representation of the attributes that were given to the constructor. + +All ID generators should conform to a few simple rules: + +1. Uniqueness of the ID should be guaranteed. +2. The ID should be a 64 character string containing only hexadecimal digits. +3. The value of the ID should **not** depend on the order of attributes given + to the constructor of the object that is used to generate the ID. +4. If the object, which is used to generate the ID, is instantiated from a JSON + string, the value of the ID should **not** depend on the order of keys or + the amount of whitespace in the input JSON. +5. The value of the ID should **not** depend on whether the undefined values + of attributes are explicitly set. + +The ID generator for the PacBio sequencing platform is implemented by the `PacBioEntity` class. 
Examples of generating IDs for PacBio data from Python code: @@ -39,7 +59,6 @@ print(PacBioEntity( ) # sample-specific indentifier -# for multiple tags a sorted comma-separated list of tagscan be used print(PacBioEntity(run_name="MARATHON", well_label="D1", tags="AAGTACGT").hash_product_id() ``` @@ -49,22 +68,11 @@ ID to the STDOUT stream. Use the `--help` option to find out details. ```perl # Using the script in the Perl code: -my $id = `npg_id_generation --run_name 'MARATHON' --well_label 'D1'`; +my $id = `generate_pac_bio_id --run_name 'MARATHON' --well_label 'D1'`; ``` -All generators should conform to a few simple rules: - -1. Uniqueness of the ID should be guaranteed. -2. The ID should be a 64 character string. -3. It should be possible to generate an ID from a JSON string. -4. The value of the ID should **not** depend on the order of attributes given - to the constructor or the order of keys used in JSON. -5. The value of the ID should **not** depend on the amount of whitespace in - the input JSON. -6. The value of the ID should **not** depend on whether the undefined values - of attributes are explicitly set. - -The examples below clarify the rules. Objects `o1` - `o6` should generate the same ID. +The examples below clarify the rules any ID generator should conform to. +Objects `o1` - `o6` should generate the same ID. ```python o1 = PacBioEntity(run_name="r1", well_label="l1") @@ -74,16 +82,3 @@ o4 = PacBioEntity.parse_raw('{"run_name": "r1","well_label": "l1"}', content_typ o5 = PacBioEntity.parse_raw('{"well_label": "l1", "run_name": "r1"}', content_type="json") o6 = PacBioEntity.parse_raw('{"well_label": "l1","run_name": "r1", "tags": null}', content_type="json") ``` - -In addition, to maintain backwards compatibility for PacBio Revio products, -the following two objects should generate the same ID, meaning that the -value of 1 for the plate number attribute is disregarded. 
- -```python -o1 = PacBioEntity(run_name="r1", well_label="l1") -o2 = PacBioEntity(run_name="r1", well_label="l1", plate_number=1) -``` - -The algorithm used for generation of identifiers can be replicated in Perl; -on identical input data it gives identical results. However, we cannot -guarantee that this parity will always be maintained in future. diff --git a/npg_id_generation/pac_bio.py b/npg_id_generation/pac_bio.py index 33d2b0c..ef30b0e 100644 --- a/npg_id_generation/pac_bio.py +++ b/npg_id_generation/pac_bio.py @@ -28,15 +28,18 @@ def concatenate_tags(tags: list[str]): - """Concatenates a list of tags so that it can be used as an attribute in - the creation of a PacBioEntity. + """Returns a concatenated list of tags. + + A helper method that converts a list of tags into a single + string, which can be used to set the `tags` attribute of + the PacBioEntity class instance. Args: tags: A list of tag sequences. - Returns:A comma separated string of tags or None - + Returns: A comma-separated string of tags or None """ + if not tags: return None else: @@ -44,28 +47,52 @@ def concatenate_tags(tags: list[str]): class PacBioEntity(BaseModel): - """A PacBio class for product ID generation.""" + """A class that wraps together PacBio data product's attributes. + + This class wraps together the attributes that describe an experiment + performed on a sequencer by Pacific Biosciences (PacBio). A single + experiment produces a number of data products. This class models + a single data product. + + The class provides an instance method for generating a unique ID for + the single product this class models. + + The class inherits from `pydantic`'s `BaseModel`, therefore, + a JSON representation of the data product can be generated by + calling `model_dump_json` method on the instance of this class. 
+ + Example: + + from npg_id_generation.pac_bio import PacBioEntity, concatenate_tags + tags = concatenate_tags(['ACGTACGT', 'TACCCGAA']) + o = PacBioEntity(run_name="RUN_1", well_label="A1", tags=tags) + id = o.hash_product_id() + # use any appropriate options of the model_dump_json method + json_string = o.model_dump_json(exclude_none=True) """ - Pydantic's current default is to serialize attributes in the order - they are listed. if this behaviour changes, we can restore it by - using json.dumps() sort_keys argument, see - https://docs.python.org/3/library/json.html#basic-usage - We are not using this explicit sort for now since it adds to the - execution time. + """ + Pydantic's current default is to serialize attributes in the order + they are listed. If this behaviour changes, we can restore it by + using json.dumps() sort_keys argument, see + https://docs.python.org/3/library/json.html#basic-usage + + We are not using this explicit sort for now since it adds to the + execution time. - Order the attributes alphabetically to maintain order in the output - of model_dump_json(). + Order the attributes alphabetically to maintain order in the output + of model_dump_json(). """ + model_config = ConfigDict(extra="forbid") - run_name: str = Field(title="Pac Bio run name as in LIMS") - well_label: str = Field(title="Pac Bio well label") + run_name: str = Field(title="PacBio run name as in LIMS") + well_label: str = Field(title="PacBio well label") plate_number: Optional[int] = Field( default=None, ge=1, - title="Pac Bio plate number", + title="PacBio plate number", description=""" Plate number is a positive integer and is relevant for Revio instruments only, thus it defaults to None. @@ -108,7 +135,22 @@ def tags_have_correct_characters(cls, v): return v def hash_product_id(self): - """Generate a sha256sum for the PacBio Entity""" + """Returns a unique ID for the PacBio data product. + + Generates and returns a unique ID for an instance of the PacBioEntity + class. 
+ + To maintain backwards compatibility for PacBio Revio products, + plate_number value of 1 is disregarded. The method should generate the + same ID for the following two objects: + + o1 = PacBioEntity(run_name="r1", well_label="l1") + o2 = PacBioEntity(run_name="r1", well_label="l1", plate_number=1) + + Args: None + + Returns: A 64 character long string containing only hexadecimal digits. + """ if self.plate_number is not None and self.plate_number > 1: json = self.model_dump_json(exclude_none=True)