From 323af38d0a45a0a728b1d272851c9d319cd35a00 Mon Sep 17 00:00:00 2001 From: Paul STRETENOWICH <31796146+paulstretenowich@users.noreply.github.com> Date: Wed, 11 Sep 2024 13:51:46 -0400 Subject: [PATCH] Dev (#37) New schama and instrumentation to update it * Renaming organism into specimen * Adding "state" enum for readset + new Reference table + debug * Adding INVALID state + considering only VALID readsets to get undelivered * Adding alembic for when database structure gets updated * Reorganizing alembic folders and adding url in env.py * Adding versioning + auto alembic upgrade + upgrade/downgrade debug for latest version (#36) --- .gitignore | 2 + Containerfile | 7 +- _version.py | 16 + alembic.ini | 116 ++ entrypoint.sh | 8 +- migration/alembic/README | 1 + migration/alembic/env.py | 105 ++ migration/alembic/script.py.mako | 26 + .../versions/d1dc98f61aee_major_changes.py | 145 +++ project_tracking/api/__init__.py | 4 +- project_tracking/api/modification.py | 101 ++ project_tracking/api/project.py | 135 ++- project_tracking/db_action.py | 1028 ++++++++++++----- project_tracking/model.py | 123 +- project_tracking/vocabulary.py | 30 +- pyproject.toml | 13 +- tests/conftest.py | 4 +- tests/data/run_processing.json | 41 +- tests/test_ingestion.py | 30 +- tests/test_model.py | 2 +- tests/test_serialization.py | 4 +- 21 files changed, 1518 insertions(+), 423 deletions(-) create mode 100644 _version.py create mode 100644 alembic.ini mode change 100644 => 100755 entrypoint.sh create mode 100644 migration/alembic/README create mode 100644 migration/alembic/env.py create mode 100644 migration/alembic/script.py.mako create mode 100644 migration/alembic/versions/d1dc98f61aee_major_changes.py create mode 100644 project_tracking/api/modification.py diff --git a/.gitignore b/.gitignore index d689ec5..4fe1079 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ build/ *.egg-info/ .DS_Store + +tests/data/real_data diff --git a/Containerfile b/Containerfile index 55e7aaf..62b6172 100644 --- a/Containerfile +++ b/Containerfile @@ -10,9 +10,12 @@ RUN dnf install -y python3-pip.noarch ENV C3G_SQLALCHEMY_DATABASE_URI="sqlite:////sqlite/tracking_db.sql" ADD . $APP -RUN cd $APP && pip install .[postgres] && chmod 755 entrypoint.sh && mv entrypoint.sh .. +# To be remove once the container is built from a release version instead of the tip of a branche +# It is a temporary hack to be able to keep hatch-vcs in the repo. +ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.0.0 +RUN cd $APP && pip install .[postgres] && chmod 755 entrypoint.sh EXPOSE 8000 - +WORKDIR /app/$APP ENTRYPOINT ["./entrypoint.sh", "-b", "0.0.0.0"] CMD ["-w", "4"] diff --git a/_version.py b/_version.py new file mode 100644 index 0000000..19afb95 --- /dev/null +++ b/_version.py @@ -0,0 +1,16 @@ +# file generated by setuptools_scm +# don't change, don't track in version control +TYPE_CHECKING = False +if TYPE_CHECKING: + from typing import Tuple, Union + VERSION_TUPLE = Tuple[Union[int, str], ...] +else: + VERSION_TUPLE = object + +version: str +__version__: str +__version_tuple__: VERSION_TUPLE +version_tuple: VERSION_TUPLE + +__version__ = version = '1.0.7.dev7+gfc28f13.d20240910' +__version_tuple__ = version_tuple = (1, 0, 7, 'dev7', 'gfc28f13.d20240910') diff --git a/alembic.ini b/alembic.ini new file mode 100644 index 0000000..4d5b1ba --- /dev/null +++ b/alembic.ini @@ -0,0 +1,116 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = migration/alembic + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library. +# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/entrypoint.sh b/entrypoint.sh old mode 100644 new mode 100755 index f8e0a3a..91d1d7e --- a/entrypoint.sh +++ b/entrypoint.sh @@ -7,12 +7,16 @@ fi DB_OPS= if [[ -v "${C3G_SQLALCHEMY_DATABASE_URI}" ]]; then - DB_OPS=--db_uri ${C3G_SQLALCHEMY_DATABASE_URI} + DB_OPS=--db-uri ${C3G_SQLALCHEMY_DATABASE_URI} fi -if [[ -v C3G_INIT_DB ]]; then +if [[ -v "${C3G_INIT_DB}" ]]; then echo instanciating data base flask --app $APP init-db $DB_OPS fi +if [[ -v "${C3G_ALEMBIC_UPGRADE}" ]]; then + alembic upgrade head +fi + gunicorn "project_tracking:create_app()" "${@}" diff --git a/migration/alembic/README b/migration/alembic/README new file mode 100644 index 0000000..70677b0 --- /dev/null +++ b/migration/alembic/README @@ -0,0 +1 @@ +# Add commands for migration (script to run) \ No newline at end of file diff --git a/migration/alembic/env.py b/migration/alembic/env.py new file mode 100644 index 0000000..074ad3b --- /dev/null +++ b/migration/alembic/env.py @@ -0,0 +1,105 @@ +import os + +from logging.config import fileConfig + +from sqlalchemy import create_engine +from alembic import context + +from alembic_utils.pg_function import PGFunction +from alembic_utils.replaceable_entity import register_entities +import alembic_postgresql_enum + +from project_tracking.model import Base + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +# Cf. https://github.com/sqlalchemy/alembic/discussions/1149 +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +# Cf. https://github.com/olirice/alembic_utils +to_upper = PGFunction( + schema='public', + signature='to_upper(some_text text)', + definition=""" + RETURNS text as + $$ + SELECT upper(some_text) + $$ language SQL; + """ +) + +register_entities([to_upper]) + +def get_url(): + """Generate a URL from the environment variables. + """ + return f"{os.environ['C3G_SQLALCHEMY_DATABASE_URI']}" + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = get_url() + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. + + """ + url = get_url() + + connectable = create_engine(url) + + with connectable.connect() as connection: + context.configure( + connection=connection, + target_metadata=target_metadata, + compare_type=True, + compare_server_default=True + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/migration/alembic/script.py.mako b/migration/alembic/script.py.mako new file mode 100644 index 0000000..fbc4b07 --- /dev/null +++ b/migration/alembic/script.py.mako @@ -0,0 +1,26 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/migration/alembic/versions/d1dc98f61aee_major_changes.py b/migration/alembic/versions/d1dc98f61aee_major_changes.py new file mode 100644 index 0000000..37cafd3 --- /dev/null +++ b/migration/alembic/versions/d1dc98f61aee_major_changes.py @@ -0,0 +1,145 @@ +"""Major changes + +Revision ID: d1dc98f61aee +Revises: +Create Date: 2024-07-29 11:42:19.723940 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from alembic_postgresql_enum import TableReference +from alembic_utils.pg_function import PGFunction +from sqlalchemy import text as sql_text +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = 'd1dc98f61aee' +down_revision: Union[str, None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + sa.Enum('VALID', 'ON_HOLD', 'INVALID', name='stateenum').create(op.get_bind()) + op.create_table('reference', + sa.Column('name', sa.String(), nullable=True), + sa.Column('alias', sa.String(), nullable=True), + sa.Column('assembly', sa.String(), nullable=True), + sa.Column('version', sa.String(), nullable=True), + sa.Column('taxon_id', sa.String(), nullable=True), + sa.Column('source', sa.String(), nullable=True), + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('deprecated', sa.Boolean(), nullable=False), + sa.Column('deleted', sa.Boolean(), nullable=False), + sa.Column('creation', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=True), + sa.Column('modification', sa.DateTime(timezone=True), nullable=True), + sa.Column('extra_metadata', sa.JSON(), nullable=True), + sa.Column('ext_id', sa.Integer(), nullable=True), + sa.Column('ext_src', sa.String(), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + # Rename table patient to specimen + op.rename_table('patient', 'specimen') + op.add_column('specimen', sa.Column('ext_id', sa.Integer(), nullable=True)) + op.add_column('specimen', sa.Column('ext_src', sa.String(), nullable=True)) + op.add_column('experiment', sa.Column('ext_id', sa.Integer(), nullable=True)) + op.add_column('experiment', sa.Column('ext_src', sa.String(), nullable=True)) + op.add_column('file', sa.Column('ext_id', sa.Integer(), nullable=True)) + op.add_column('file', sa.Column('ext_src', sa.String(), nullable=True)) + op.add_column('job', sa.Column('ext_id', sa.Integer(), nullable=True)) + op.add_column('job', sa.Column('ext_src', sa.String(), nullable=True)) + op.add_column('location', sa.Column('ext_id', sa.Integer(), nullable=True)) + op.add_column('location', sa.Column('ext_src', sa.String(), nullable=True)) + op.add_column('metric', sa.Column('ext_id', sa.Integer(), nullable=True)) + op.add_column('metric', sa.Column('ext_src', sa.String(), nullable=True)) + op.add_column('operation', sa.Column('reference_id', sa.Integer(), nullable=True)) + op.add_column('operation', sa.Column('ext_id', sa.Integer(), nullable=True)) + op.add_column('operation', sa.Column('ext_src', sa.String(), nullable=True)) + op.create_foreign_key(None, 'operation', 'reference', ['reference_id'], ['id']) + op.add_column('operation_config', sa.Column('ext_id', sa.Integer(), nullable=True)) + op.add_column('operation_config', sa.Column('ext_src', sa.String(), nullable=True)) + op.add_column('project', sa.Column('ext_id', sa.Integer(), nullable=True)) + op.add_column('project', sa.Column('ext_src', sa.String(), nullable=True)) + op.drop_column('project', 'fms_id') + op.add_column('readset', sa.Column('state', postgresql.ENUM('VALID', 'ON_HOLD', 'INVALID', name='stateenum', create_type=False), nullable=True)) + op.add_column('readset', sa.Column('ext_id', sa.Integer(), nullable=True)) + op.add_column('readset', sa.Column('ext_src', sa.String(), nullable=True)) + op.drop_column('readset', 'quality_offset') + op.add_column('run', sa.Column('ext_id', sa.Integer(), nullable=True)) + op.add_column('run', sa.Column('ext_src', sa.String(), nullable=True)) + op.drop_column('run', 'fms_id') + # Change sequence name Cf. https://petegraham.co.uk/rename-postgres-table-with-alembic/ + op.execute('ALTER SEQUENCE patient_id_seq RENAME TO specimen_id_seq') + op.add_column('sample', sa.Column('ext_id', sa.Integer(), nullable=True)) + op.add_column('sample', sa.Column('ext_src', sa.String(), nullable=True)) + # Rename column patient_id to specimen_id in sample table + op.alter_column('sample', 'patient_id', new_column_name='specimen_id') + op.drop_column('sample', 'fms_id') + op.drop_column('specimen', 'fms_id') + public_to_upper = PGFunction( + schema="public", + signature="to_upper(some_text text)", + definition='RETURNS text as\n $$\n SELECT upper(some_text)\n $$ language SQL' + ) + op.create_entity(public_to_upper) + + op.sync_enum_values('public', 'flagenum', ['PASS', 'WARNING', 'FAILED', 'MISSING', 'NOT_APPLICABLE'], + [TableReference(table_schema='public', table_name='metric', column_name='flag')], + enum_values_to_rename=[]) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.sync_enum_values('public', 'flagenum', ['PASS', 'WARNING', 'FAILED'], + [TableReference(table_schema='public', table_name='metric', column_name='flag')], + enum_values_to_rename=[]) + public_to_upper = PGFunction( + schema="public", + signature="to_upper(some_text text)", + definition='RETURNS text as\n $$\n SELECT upper(some_text)\n $$ language SQL' + ) + op.drop_entity(public_to_upper) + + op.add_column('sample', sa.Column('fms_id', sa.VARCHAR(), autoincrement=False, nullable=True)) + # Change sequence name Cf. https://petegraham.co.uk/rename-postgres-table-with-alembic/ + op.execute('ALTER SEQUENCE specimen_id_seq RENAME TO patient_id_seq') + op.drop_column('sample', 'ext_src') + op.drop_column('sample', 'ext_id') + # Rename column patient_id to specimen_id in sample table + op.alter_column('sample', 'specimen_id', new_column_name='patient_id') + op.add_column('run', sa.Column('fms_id', sa.VARCHAR(), autoincrement=False, nullable=True)) + op.drop_column('run', 'ext_src') + op.drop_column('run', 'ext_id') + op.add_column('readset', sa.Column('quality_offset', sa.VARCHAR(), autoincrement=False, nullable=True)) + op.drop_column('readset', 'ext_src') + op.drop_column('readset', 'ext_id') + op.drop_column('readset', 'state') + op.add_column('project', sa.Column('fms_id', sa.VARCHAR(), autoincrement=False, nullable=True)) + op.drop_column('project', 'ext_src') + op.drop_column('project', 'ext_id') + op.drop_column('operation_config', 'ext_src') + op.drop_column('operation_config', 'ext_id') + op.drop_column('operation', 'ext_src') + op.drop_column('operation', 'ext_id') + op.drop_column('operation', 'reference_id') + op.drop_column('metric', 'ext_src') + op.drop_column('metric', 'ext_id') + op.drop_column('location', 'ext_src') + op.drop_column('location', 'ext_id') + op.drop_column('job', 'ext_src') + op.drop_column('job', 'ext_id') + op.drop_column('file', 'ext_src') + op.drop_column('file', 'ext_id') + op.drop_column('experiment', 'ext_src') + op.drop_column('experiment', 'ext_id') + op.rename_table('specimen', 'patient') + op.drop_column('patient', 'ext_src') + op.drop_column('patient', 'ext_id') + op.add_column('patient', sa.Column('fms_id', sa.VARCHAR(), autoincrement=False, nullable=True)) + op.drop_table('reference') + sa.Enum('VALID', 'ON_HOLD', 'INVALID', name='stateenum').drop(op.get_bind()) + # ### end Alembic commands ### diff --git a/project_tracking/api/__init__.py b/project_tracking/api/__init__.py index 6ab4ada..53a8baf 100644 --- a/project_tracking/api/__init__.py +++ b/project_tracking/api/__init__.py @@ -1,3 +1,3 @@ -from . import admin, project +from . import admin, project, modification -blueprints = (admin.bp, project.bp) \ No newline at end of file +blueprints = (admin.bp, project.bp, modification.bp) \ No newline at end of file diff --git a/project_tracking/api/modification.py b/project_tracking/api/modification.py new file mode 100644 index 0000000..5168270 --- /dev/null +++ b/project_tracking/api/modification.py @@ -0,0 +1,101 @@ +import logging +import functools + +from flask import Blueprint, jsonify, request, flash, redirect, json, abort + +from .. import db_action +from .. import vocabulary as vc + +logger = logging.getLogger(__name__) + +bp = Blueprint('modification', __name__, url_prefix='/modification') + +@bp.route('/edit', methods=['POST']) +def edit(): + """ + POST: json describing the edit to be made + return: + """ + if request.method == 'POST': + try: + ingest_data = request.get_json(force=True) + except: + flash('Data does not seems to be json') + return redirect(request.url) + + return db_action.edit(ingest_data) + +@bp.route('/delete', methods=['POST']) +def delete(): + """ + POST: json describing the delete to be made + return: + """ + if request.method == 'POST': + try: + ingest_data = request.get_json(force=True) + except: + flash('Data does not seems to be json') + return redirect(request.url) + + return db_action.delete(ingest_data) + +@bp.route('/undelete', methods=['POST']) +def undelete(): + """ + POST: json describing the undelete to be made + return: + """ + if request.method == 'POST': + try: + ingest_data = request.get_json(force=True) + except: + flash('Data does not seems to be json') + return redirect(request.url) + + return db_action.undelete(ingest_data) + +@bp.route('/deprecate', methods=['POST']) +def deprecate(): + """ + POST: json describing the deprecate to be made + return: + """ + if request.method == 'POST': + try: + ingest_data = request.get_json(force=True) + except: + flash('Data does not seems to be json') + return redirect(request.url) + + return db_action.deprecate(ingest_data) + +@bp.route('/undeprecate', methods=['POST']) +def undeprecate(): + """ + POST: json describing the undeprecate to be made + return: + """ + if request.method == 'POST': + try: + ingest_data = request.get_json(force=True) + except: + flash('Data does not seems to be json') + return redirect(request.url) + + return db_action.undeprecate(ingest_data) + +@bp.route('/curate', methods=['POST']) +def curate(): + """ + POST: json describing the curate to be made + return: + """ + if request.method == 'POST': + try: + ingest_data = request.get_json(force=True) + except: + flash('Data does not seems to be json') + return redirect(request.url) + + return db_action.curate(ingest_data) diff --git a/project_tracking/api/project.py b/project_tracking/api/project.py index 66a1e73..7aeb857 100644 --- a/project_tracking/api/project.py +++ b/project_tracking/api/project.py @@ -43,10 +43,13 @@ def wrap(*args, project=None, **kwargs): all_available = [f"id: {project.id}, name: {project.name}" for project in db_action.projects()] project_id = {"DB_ACTION_WARNING": f"Requested Project '{project}' doesn't exist. Please try again with one of the following: {all_available}"} else: - project_id = str(db_action.name_to_id("Project", project.upper())[0]) + project_id = db_action.name_to_id("Project", project.upper()) if not project_id: all_available = [f"id: {project.id}, name: {project.name}" for project in db_action.projects()] project_id = {"DB_ACTION_WARNING": f"Requested Project '{project}' doesn't exist. Please try again with one of the following: {all_available}"} + else: + # Converting list of 1 project to string + project_id = "".join(map(str, project_id)) return func(*args, project_id=project_id, **kwargs) return wrap @@ -77,26 +80,26 @@ def projects(project_id: str = None): return [i.flat_dict for i in db_action.projects(project_id)] -@bp.route('//patients') -@bp.route('//patients/') +@bp.route('//specimens') +@bp.route('//specimens/') @convcheck_project -def patients(project_id: str, patient_id: str = None): +def specimens(project_id: str, specimen_id: str = None): """ GET: - patient_id: uses the form "1,3-8,9", if not provided all patients are returned - return: list all patients or selected patients, belonging to + specimen_id: uses the form "1,3-8,9", if not provided all specimens are returned + return: list all specimens or selected specimens, belonging to Query: (pair, tumor): Default (None, true) The tumor query only have an effect if pair is false (None, true/false): - Return: all or selected patients (Default) + Return: all or selected specimens (Default) (true, true/false): - Return: a subset of patient who have Tumor=False & Tumor=True samples + Return: a subset of specimen who have Tumor=False & Tumor=True samples (false, true): - return: a subset of patient who only have Tumor=True samples + return: a subset of specimen who only have Tumor=True samples (false, false): - return: a subset of patient who only have Tumor=false samples + return: a subset of specimen who only have Tumor=false samples """ query = request.args @@ -112,33 +115,33 @@ def patients(project_id: str, patient_id: str = None): if query.get('tumor','').lower() in ['false', '0']: tumor=False - if patient_id is not None: - patient_id = unroll(patient_id) + if specimen_id is not None: + specimen_id = unroll(specimen_id) if query.get('name'): name = query['name'] if name: - patient_id = [] - for patient_name in name.split(","): - patient_id.extend(db_action.name_to_id("Patient", patient_name)) + specimen_id = [] + for specimen_name in name.split(","): + specimen_id.extend(db_action.name_to_id("Specimen", specimen_name)) if isinstance(project_id, dict) and project_id.get("DB_ACTION_WARNING"): return project_id # pair being either True or False if pair is not None: - action_output = db_action.patient_pair( + action_output = db_action.specimen_pair( project_id, - patient_id=patient_id, + specimen_id=specimen_id, pair=pair, tumor=tumor ) else: - action_output = db_action.patients( + action_output = db_action.specimens( project_id, - patient_id=patient_id + specimen_id=specimen_id ) - return sanity_check("Patient", action_output) + return sanity_check("Specimen", action_output) @bp.route('//samples') @@ -148,7 +151,7 @@ def samples(project_id: str, sample_id: str = None): """ GET: sample_id: uses the form "1,3-8,9", if not provided all samples are returned - return: list all patients or selected samples, belonging to + return: list all specimens or selected samples, belonging to """ query = request.args @@ -179,7 +182,7 @@ def readsets(project_id: str, readset_id: str=None): """ GET: readset_id: uses the form "1,3-8,9", if not provided all readsets are returned - return: list all patients or selected readsets, belonging to + return: list all specimens or selected readsets, belonging to """ query = request.args @@ -205,15 +208,15 @@ def readsets(project_id: str, readset_id: str=None): @bp.route('//files/') -@bp.route('//patients//files') +@bp.route('//specimens//files') @bp.route('//samples//files') @bp.route('//readsets//files') @convcheck_project -def files(project_id: str, patient_id: str=None, sample_id: str=None, readset_id: str=None, file_id: str=None): +def files(project_id: str, specimen_id: str=None, sample_id: str=None, readset_id: str=None, file_id: str=None): """ GET: file_id: uses the form "1,3-8,9". Select file by ids - patient_id: uses the form "1,3-8,9". Select file by patient ids + specimen_id: uses the form "1,3-8,9". Select file by specimen ids sample_id: uses the form "1,3-8,9". Select file by sample ids redeaset_id: uses the form "1,3-8,9". Select file by readset ids return: selected files, belonging to @@ -238,8 +241,8 @@ def files(project_id: str, patient_id: str=None, sample_id: str=None, readset_id elif query['deliverable'].lower() in ['false', '0']: deliverable = False - if patient_id is not None: - patient_id = unroll(patient_id) + if specimen_id is not None: + specimen_id = unroll(specimen_id) elif sample_id is not None: sample_id = unroll(sample_id) elif readset_id is not None: @@ -250,7 +253,7 @@ def files(project_id: str, patient_id: str=None, sample_id: str=None, readset_id if deliverable is not None: action_output = db_action.files_deliverable( project_id=project_id, - patient_id=patient_id, + specimen_id=specimen_id, sample_id=sample_id, readset_id=readset_id, file_id=file_id, @@ -259,7 +262,7 @@ def files(project_id: str, patient_id: str=None, sample_id: str=None, readset_id else: action_output = db_action.files( project_id=project_id, - patient_id=patient_id, + specimen_id=specimen_id, sample_id=sample_id, readset_id=readset_id, file_id=file_id @@ -274,15 +277,15 @@ def files(project_id: str, patient_id: str=None, sample_id: str=None, readset_id @bp.route('//metrics', methods=['GET', 'POST']) @bp.route('//metrics/') -@bp.route('//patients//metrics') +@bp.route('//specimens//metrics') @bp.route('//samples//metrics') @bp.route('//readsets//metrics') @convcheck_project -def metrics(project_id: str, patient_id: str=None, sample_id: str=None, readset_id: str=None, metric_id: str=None): +def metrics(project_id: str, specimen_id: str=None, sample_id: str=None, readset_id: str=None, metric_id: str=None): """ GET: metric_id: uses the form "1,3-8,9". Select metric by ids - patient_id: uses the form "1,3-8,9". Select metric by patient ids + specimen_id: uses the form "1,3-8,9". Select metric by specimen ids sample_id: uses the form "1,3-8,9". Select metric by sample ids redeaset_id: uses the form "1,3-8,9". Select metric by readset ids return: selected metrics, belonging to @@ -291,7 +294,7 @@ def metrics(project_id: str, patient_id: str=None, sample_id: str=None, readset_ metric_name = [,NAME] [...] readset_name = [,NAME] [...] sample_name = [,NAME] [...] - patient_name = [,NAME] [...] + specimen_name = [,NAME] [...] Query: (deliverable): Default (None) @@ -316,7 +319,7 @@ def metrics(project_id: str, patient_id: str=None, sample_id: str=None, readset_ if request.method == 'POST': post_data = request.data.decode() post_input = post_data.split('=') - if post_input[0] in ["metric_name", "readset_name", "sample_name", "patient_name"]: + if post_input[0] in ["metric_name", "readset_name", "sample_name", "specimen_name"]: model_class = post_input[0].split('_')[0] names = post_input[1].split(',') ids = db_action.name_to_id(model_class.capitalize(), names) @@ -326,10 +329,10 @@ def metrics(project_id: str, patient_id: str=None, sample_id: str=None, readset_ readset_id = ids elif post_input[0] == "sample_name": sample_id = ids - elif post_input[0] == "patient_name": - patient_id = ids - elif patient_id is not None: - patient_id = unroll(patient_id) + elif post_input[0] == "specimen_name": + specimen_id = ids + elif specimen_id is not None: + specimen_id = unroll(specimen_id) elif sample_id is not None: sample_id = unroll(sample_id) elif readset_id is not None: @@ -340,7 +343,7 @@ def metrics(project_id: str, patient_id: str=None, sample_id: str=None, readset_ if deliverable is not None: action_output = db_action.metrics_deliverable( project_id=project_id, - patient_id=patient_id, + specimen_id=specimen_id, sample_id=sample_id, readset_id=readset_id, metric_id=metric_id, @@ -349,7 +352,7 @@ def metrics(project_id: str, patient_id: str=None, sample_id: str=None, readset_ else: action_output = db_action.metrics( project_id=project_id, - patient_id=patient_id, + specimen_id=specimen_id, sample_id=sample_id, readset_id=readset_id, metric_id=metric_id @@ -394,7 +397,7 @@ def readsets_from_samples(project_id: str, sample_id: str): @convcheck_project def digest_readset_file(project_id: str): """ - POST: json holding the list of Patient/Sample/Readset Name or id AND location endpoint + experiment nucleic_acid_type + POST: json holding the list of Specimen/Sample/Readset Name or id AND location endpoint + experiment nucleic_acid_type return: all information to create a "Genpipes readset file" """ @@ -415,7 +418,7 @@ def digest_readset_file(project_id: str): @convcheck_project def digest_pair_file(project_id: str): """ - POST: json holding the list of Patient/Sample/Readset Name or id AND location endpoint + experiment nucleic_acid_type + POST: json holding the list of Specimen/Sample/Readset Name or id AND location endpoint + experiment nucleic_acid_type return: all information to create a "Genpipes pair file" """ @@ -450,13 +453,11 @@ def ingest_run_processing(project_id: str): if isinstance(project_id, dict) and project_id.get("DB_ACTION_WARNING"): return project_id + out = db_action.ingest_run_processing(project_id=project_id, ingest_data=ingest_data) + logger.debug(f"ingest_run_processing: {out}") + out["DB_ACTION_OUTPUT"] = [i.flat_dict for i in out["DB_ACTION_OUTPUT"]] - if ingest_data[vc.PROJECT_NAME]: - project_id_from_name = str(db_action.name_to_id("Project", ingest_data[vc.PROJECT_NAME].upper())[0]) - if project_id != project_id_from_name: - return {"DB_ACTION_WARNING": f"Requested Project {project_id_from_name} in the input json is not matching the Project in the route {project_id}"} - - return [i.flat_dict for i in db_action.ingest_run_processing(project_id=project_id, ingest_data=ingest_data)] + return out @bp.route('//ingest_transfer', methods=['POST']) @@ -476,7 +477,11 @@ def ingest_transfer(project_id: str): if isinstance(project_id, dict) and project_id.get("DB_ACTION_WARNING"): return project_id - return [i.flat_dict for i in db_action.ingest_transfer(project_id=project_id, ingest_data=ingest_data)] + out = db_action.ingest_transfer(project_id=project_id, ingest_data=ingest_data) + out["DB_ACTION_OUTPUT"] = [i.flat_dict for i in out["DB_ACTION_OUTPUT"]] + + return out + @bp.route('//ingest_genpipes', methods=['POST']) @convcheck_project @@ -496,16 +501,10 @@ def ingest_genpipes(project_id: str): if isinstance(project_id, dict) and project_id.get("DB_ACTION_WARNING"): return project_id + out = db_action.ingest_genpipes(project_id=project_id, ingest_data=ingest_data) + out["DB_ACTION_OUTPUT"] = [i.flat_dict for i in out["DB_ACTION_OUTPUT"]] - if ingest_data[vc.PROJECT_NAME]: - project_id_from_name = str(db_action.name_to_id("Project", ingest_data[vc.PROJECT_NAME].upper())[0]) - if project_id != project_id_from_name: - return {"DB_ACTION_WARNING": f"Requested Project {project_id_from_name} in the input json is not matching the Project in the route {project_id}"} - - output = db_action.ingest_genpipes(project_id=project_id, ingest_data=ingest_data) - operation = output[0].flat_dict - jobs = [job.flat_dict for job in output[1]] - return [operation, jobs] + return out @bp.route('//digest_unanalyzed', methods=['POST']) @convcheck_project @@ -525,3 +524,23 @@ def digest_unanalyzed(project_id: str): return project_id return db_action.digest_unanalyzed(project_id=project_id, digest_data=ingest_data) + + +@bp.route('//digest_delivery', methods=['POST']) +@convcheck_project +def digest_delivery(project_id: str): + """ + POST: json holding the list of Specimen/Sample/Readset Name or id AND location endpoint + experiment nucleic_acid_type (optional) + return: Samples/Readsets unanalyzed with location endpoint + experiment nucleic_acid_type + """ + if request.method == 'POST': + try: + ingest_data = request.get_json(force=True) + except: + flash('Data does not seems to be json') + return redirect(request.url) + + if isinstance(project_id, dict) and project_id.get("DB_ACTION_WARNING"): + return project_id + + return db_action.digest_delivery(project_id=project_id, digest_data=ingest_data) diff --git a/project_tracking/db_action.py b/project_tracking/db_action.py index 0d88f12..856bb68 100644 --- a/project_tracking/db_action.py +++ b/project_tracking/db_action.py @@ -8,6 +8,7 @@ from datetime import datetime from sqlalchemy import select, exc +from sqlalchemy import delete as sql_delete from pathlib import Path from . import vocabulary as vb @@ -15,12 +16,13 @@ from .model import ( LaneEnum, SequencingTypeEnum, + StateEnum, StatusEnum, FlagEnum, AggregateEnum, readset_file, Project, - Patient, + Specimen, Sample, Experiment, Run, @@ -81,12 +83,12 @@ def __init__(self, message=None, entity=None, attribute=None, value=None): def unique_constraint_error(session, json_format, ingest_data): """ - When unique constraint errors, checks which entity is causing the error and returns it/them + When unique constraint errors, checks which entity is causing the error and returns it/them as a list """ ret = [] if json_format == "run_processing": - for patient_json in ingest_data[vb.PATIENT]: - for sample_json in patient_json[vb.SAMPLE]: + for specimen_json in ingest_data[vb.SPECIMEN]: + for sample_json in specimen_json[vb.SAMPLE]: for readset_json in sample_json[vb.READSET]: readset_name = readset_json[vb.READSET_NAME] stmt = ( @@ -100,7 +102,7 @@ def unique_constraint_error(session, json_format, ingest_data): def name_to_id(model_class, name, session=None): """ - Converting a given name into its id + Converting a given name into its id(s) for a given model_class """ if session is None: session = database.get_session() @@ -112,6 +114,132 @@ def name_to_id(model_class, name, session=None): return session.scalars(stmt).unique().all() + +def select_readsets_from_specimens(session, digest_data, nucleic_acid_type): + """Returning Readsets Objects based on requested specimens in digest_data""" + specimens = [] + readsets = [] + if vb.SPECIMEN_NAME in digest_data.keys(): + for specimen_name in digest_data[vb.SPECIMEN_NAME]: + specimen = session.scalars( + select(Specimen) + .where(Specimen.name == specimen_name) + .join(Specimen.samples) + .join(Sample.readsets) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) + .join(Readset.experiment) + .where(Experiment.nucleic_acid_type == nucleic_acid_type) + ).unique().first() + if specimen: + specimens.append(specimen) + else: + raise DidNotFindError(f"'Specimen' with 'name' '{specimen_name}' AND 'nucleic_acid_type' '{nucleic_acid_type}' doesn't exist on database") + if vb.SPECIMEN_ID in digest_data.keys(): + for specimen_id in digest_data[vb.SPECIMEN_ID]: + logger.debug(f"specimen_id: {specimen_id}") + specimen = session.scalars( + select(Specimen) + .where(Specimen.id == specimen_id) + .join(Specimen.samples) + .join(Sample.readsets) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) + .join(Readset.experiment) + .where(Experiment.nucleic_acid_type == nucleic_acid_type) + ).unique().first() + if specimen: + specimens.append(specimen) + else: + raise DidNotFindError(f"'Specimen' with 'id' '{specimen_id}' AND 'nucleic_acid_type' '{nucleic_acid_type}' doesn't exist on database") + if specimens: + set(specimens) + for specimen in specimens: + for sample in specimen.samples: + for readset in sample.readsets: + readsets.append(readset) + + return readsets + +def select_readsets_from_samples(session, digest_data, nucleic_acid_type): + """Returning Readsets Objects based on requested samples in digest_data""" + samples = [] + readsets = [] + if vb.SAMPLE_NAME in digest_data.keys(): + for sample_name in digest_data[vb.SAMPLE_NAME]: + sample = session.scalars( + select(Sample) + .where(Sample.name == sample_name) + .join(Sample.readsets) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) + .join(Readset.experiment) + .where(Experiment.nucleic_acid_type == nucleic_acid_type) + ).unique().first() + if sample: + samples.append(sample) + else: + raise DidNotFindError(f"'Sample' with 'name' '{sample_name}' AND 'nucleic_acid_type' '{nucleic_acid_type}' doesn't exist on database") + if vb.SAMPLE_ID in digest_data.keys(): + for sample_id in digest_data[vb.SAMPLE_ID]: + logger.debug(f"sample_id: {sample_id}") + sample = session.scalars( + select(Sample) + .where(Sample.id == sample_id) + .join(Sample.readsets) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) + .join(Readset.experiment) + .where(Experiment.nucleic_acid_type == nucleic_acid_type) + ).unique().first() + if sample: + samples.append(sample) + else: + raise DidNotFindError(f"'Sample' with 'id' '{sample_id}' AND 'nucleic_acid_type' '{nucleic_acid_type}' doesn't exist on database") + if samples: + set(samples) + for sample in samples: + for readset in sample.readsets: + readsets.append(readset) + + return readsets + +def select_readsets_from_readsets(session, digest_data, nucleic_acid_type): + """Returning Readsets Objects based on requested readsets in digest_data""" + readsets = [] + if vb.READSET_NAME in digest_data.keys(): + for readset_name in digest_data[vb.READSET_NAME]: + readset = session.scalars( + select(Readset) + .where(Readset.name == readset_name) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) + .join(Readset.experiment) + .where(Experiment.nucleic_acid_type == nucleic_acid_type) + ).unique().first() + if readset: + readsets.append(readset) + else: + raise DidNotFindError(f"'Readset' with 'name' '{readset_name}' AND 'nucleic_acid_type' '{nucleic_acid_type}' doesn't exist on database") + if vb.READSET_ID in digest_data.keys(): + for readset_id in digest_data[vb.READSET_ID]: + readset = session.scalars( + select(Readset) + .where(Readset.id == readset_id) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) + .join(Readset.experiment) + .where(Experiment.nucleic_acid_type == nucleic_acid_type) + ).unique().first() + if readset: + readsets.append(readset) + else: + raise DidNotFindError(f"'Readset' with 'id' '{readset_id}' AND 'nucleic_acid_type' '{nucleic_acid_type}' doesn't exist on database") + if readsets: + set(readsets) + + return readsets + def projects(project_id=None, session=None): """ Fetching all projects in database @@ -127,14 +255,16 @@ def projects(project_id=None, session=None): stmt = ( select(Project) .where(Project.id.in_(project_id)) + .where(Project.deprecated.is_(False)) + .where(Project.deleted.is_(False)) ) return session.scalars(stmt).unique().all() -def metrics_deliverable(project_id: str, deliverable: bool, patient_id=None, sample_id=None, readset_id=None, metric_id=None): +def metrics_deliverable(project_id: str, deliverable: bool, specimen_id=None, sample_id=None, readset_id=None, metric_id=None): """ - deliverable = True: Returns only patients that have a tumor and a normal sample - deliverable = False, Tumor = False: Returns patients that only have a normal samples + deliverable = True: Returns only specimens that have a tumor and a normal sample + deliverable = False, Tumor = False: Returns specimens that only have a normal samples """ session = database.get_session() @@ -148,23 +278,27 @@ def metrics_deliverable(project_id: str, deliverable: bool, patient_id=None, sam select(Metric) .where(Metric.deliverable == deliverable) .where(Metric.id.in_(metric_id)) + .where(Metric.deprecated.is_(False)) + .where(Metric.deleted.is_(False)) .join(Metric.readsets) .join(Readset.sample) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) - elif patient_id and project_id: - if isinstance(patient_id, int): - patient_id = [patient_id] + elif specimen_id and project_id: + if isinstance(specimen_id, int): + specimen_id = [specimen_id] stmt = ( select(Metric) .where(Metric.deliverable == deliverable) + .where(Metric.deprecated.is_(False)) + .where(Metric.deleted.is_(False)) .join(Metric.readsets) .join(Readset.sample) - .join(Sample.patient) - .where(Patient.id.in_(patient_id)) - .join(Patient.project) + .join(Sample.specimen) + .where(Specimen.id.in_(specimen_id)) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) elif sample_id and project_id: @@ -173,11 +307,13 @@ def metrics_deliverable(project_id: str, deliverable: bool, patient_id=None, sam stmt = ( select(Metric) .where(Metric.deliverable == deliverable) + .where(Metric.deprecated.is_(False)) + .where(Metric.deleted.is_(False)) .join(Metric.readsets) .join(Readset.sample) .where(Sample.id.in_(sample_id)) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) elif readset_id and project_id: @@ -186,11 +322,13 @@ def metrics_deliverable(project_id: str, deliverable: bool, patient_id=None, sam stmt = ( select(Metric) .where(Metric.deliverable == deliverable) + .where(Metric.deprecated.is_(False)) + .where(Metric.deleted.is_(False)) .join(Metric.readsets) .where(Readset.id.in_(readset_id)) .join(Readset.sample) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) else: @@ -199,9 +337,9 @@ def metrics_deliverable(project_id: str, deliverable: bool, patient_id=None, sam return session.scalars(stmt).unique().all() -def metrics(project_id=None, patient_id=None, sample_id=None, readset_id=None, metric_id=None): +def metrics(project_id=None, specimen_id=None, sample_id=None, readset_id=None, metric_id=None): """ - Fetching all metrics that are part of the project or patient or sample or readset + Fetching all metrics that are part of the project or specimen or sample or readset """ session = database.get_session() if isinstance(project_id, str): @@ -213,22 +351,26 @@ def metrics(project_id=None, patient_id=None, sample_id=None, readset_id=None, m stmt = ( select(Metric) .where(Metric.id.in_(metric_id)) + .where(Metric.deprecated.is_(False)) + .where(Metric.deleted.is_(False)) .join(Metric.readsets) .join(Readset.sample) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) - elif patient_id and project_id: - if isinstance(patient_id, int): - patient_id = [patient_id] + elif specimen_id and project_id: + if isinstance(specimen_id, int): + specimen_id = [specimen_id] stmt = ( select(Metric) + .where(Metric.deprecated.is_(False)) + .where(Metric.deleted.is_(False)) .join(Metric.readsets) .join(Readset.sample) - .join(Sample.patient) - .where(Patient.id.in_(patient_id)) - .join(Patient.project) + .join(Sample.specimen) + .where(Specimen.id.in_(specimen_id)) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) elif sample_id and project_id: @@ -236,11 +378,13 @@ def metrics(project_id=None, patient_id=None, sample_id=None, readset_id=None, m sample_id = [sample_id] stmt = ( select(Metric) + .where(Metric.deprecated.is_(False)) + .where(Metric.deleted.is_(False)) .join(Metric.readsets) .join(Readset.sample) .where(Sample.id.in_(sample_id)) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) elif readset_id and project_id: @@ -248,11 +392,13 @@ def metrics(project_id=None, patient_id=None, sample_id=None, readset_id=None, m readset_id = [readset_id] stmt = ( select(Metric) + .where(Metric.deprecated.is_(False)) + .where(Metric.deleted.is_(False)) .join(Metric.readsets) .where(Readset.id.in_(readset_id)) .join(Readset.sample) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) else: @@ -261,7 +407,7 @@ def metrics(project_id=None, patient_id=None, sample_id=None, readset_id=None, m return session.scalars(stmt).unique().all() -def files_deliverable(project_id: str, deliverable: bool, patient_id=None, sample_id=None, readset_id=None, file_id=None): +def files_deliverable(project_id: str, deliverable: bool, specimen_id=None, sample_id=None, readset_id=None, file_id=None): """ deliverable = True: Returns only files labelled as deliverable deliverable = False: Returns only files NOT labelled as deliverable @@ -276,25 +422,29 @@ def files_deliverable(project_id: str, deliverable: bool, patient_id=None, sampl file_id = [file_id] stmt = ( select(File) + .where(File.deprecated.is_(False)) + .where(File.deleted.is_(False)) .where(File.deliverable == deliverable) .where(File.id.in_(file_id)) .join(File.readsets) .join(Readset.sample) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) - elif patient_id and project_id: - if isinstance(patient_id, int): - patient_id = [patient_id] + elif specimen_id and project_id: + if isinstance(specimen_id, int): + specimen_id = [specimen_id] stmt = ( select(File) .where(File.deliverable == deliverable) + .where(File.deprecated.is_(False)) + .where(File.deleted.is_(False)) .join(File.readsets) .join(Readset.sample) - .join(Sample.patient) - .where(Patient.id.in_(patient_id)) - .join(Patient.project) + .join(Sample.specimen) + .where(Specimen.id.in_(specimen_id)) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) elif sample_id and project_id: @@ -303,11 +453,13 @@ def files_deliverable(project_id: str, deliverable: bool, patient_id=None, sampl stmt = ( select(File) .where(File.deliverable == deliverable) + .where(File.deprecated.is_(False)) + .where(File.deleted.is_(False)) .join(File.readsets) .join(Readset.sample) .where(Sample.id.in_(sample_id)) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) elif readset_id and project_id: @@ -316,11 +468,13 @@ def files_deliverable(project_id: str, deliverable: bool, patient_id=None, sampl stmt = ( select(File) .where(File.deliverable == deliverable) + .where(File.deprecated.is_(False)) + .where(File.deleted.is_(False)) .join(File.readsets) .where(Readset.id.in_(readset_id)) .join(Readset.sample) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) else: @@ -328,7 +482,7 @@ def files_deliverable(project_id: str, deliverable: bool, patient_id=None, sampl return session.scalars(stmt).unique().all() -def files(project_id=None, patient_id=None, sample_id=None, readset_id=None, file_id=None): +def files(project_id=None, specimen_id=None, sample_id=None, readset_id=None, file_id=None): """ Fetching all files that are linked to readset """ @@ -342,23 +496,27 @@ def files(project_id=None, patient_id=None, sample_id=None, readset_id=None, fil file_id = [file_id] stmt = ( select(File) + .where(File.deprecated.is_(False)) + .where(File.deleted.is_(False)) .where(File.id.in_(file_id)) .join(File.readsets) .join(Readset.sample) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) - elif patient_id and project_id: - if isinstance(patient_id, int): - patient_id = [patient_id] + elif specimen_id and project_id: + if isinstance(specimen_id, int): + specimen_id = [specimen_id] stmt = ( select(File) + .where(File.deprecated.is_(False)) + .where(File.deleted.is_(False)) .join(File.readsets) .join(Readset.sample) - .join(Sample.patient) - .where(Patient.id.in_(patient_id)) - .join(Patient.project) + .join(Sample.specimen) + .where(Specimen.id.in_(specimen_id)) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) elif sample_id and project_id: @@ -366,11 +524,13 @@ def files(project_id=None, patient_id=None, sample_id=None, readset_id=None, fil sample_id = [sample_id] stmt = ( select(File) + .where(File.deprecated.is_(False)) + .where(File.deleted.is_(False)) .join(File.readsets) .join(Readset.sample) .where(Sample.id.in_(sample_id)) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) elif readset_id and project_id: @@ -378,11 +538,13 @@ def files(project_id=None, patient_id=None, sample_id=None, readset_id=None, fil readset_id = [readset_id] stmt = ( select(File) + .where(File.deprecated.is_(False)) + .where(File.deleted.is_(False)) .join(File.readsets) .where(Readset.id.in_(readset_id)) .join(Readset.sample) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) else: @@ -401,13 +563,19 @@ def readsets(project_id=None, sample_id=None, readset_id=None): project_id = [project_id] if project_id is None and sample_id is None and readset_id is None: - stmt = select(Readset) + stmt = ( + select(Readset) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) + ) elif project_id and sample_id is None and readset_id is None: stmt = ( select(Readset) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) .join(Readset.sample) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) elif sample_id and project_id: @@ -415,11 +583,13 @@ def readsets(project_id=None, sample_id=None, readset_id=None): sample_id = [sample_id] stmt = ( select(Readset) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) .join(Readset.sample) .where(Sample.id.in_(sample_id)).where(Project.id.in_(project_id)) .join(Readset.sample) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) elif readset_id and project_id: @@ -427,56 +597,66 @@ def readsets(project_id=None, sample_id=None, readset_id=None): readset_id = [readset_id] stmt = ( select(Readset) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) .where(Readset.id.in_(readset_id)) .join(Readset.sample) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) return session.scalars(stmt).unique().all() -def patient_pair(project_id: str, pair: bool, patient_id=None, tumor: bool=True): +def specimen_pair(project_id: str, pair: bool, specimen_id=None, tumor: bool=True): """ - Pair = True: Returns only patients that have a tumor and a normal sample - Pair = False, Tumor = True: Returns patients that only have a tumor samples - Pair = False, Tumor = False: Returns patients that only have a normal samples + Pair = True: Returns only specimens that have a tumor and a normal sample + Pair = False, Tumor = True: Returns specimens that only have a tumor samples + Pair = False, Tumor = False: Returns specimens that only have a normal samples """ session = database.get_session() if isinstance(project_id, str): project_id = [project_id] - if patient_id is None: + if specimen_id is None: stmt1 = ( - select(Patient) - .join(Patient.samples) + select(Specimen) + .where(Specimen.deprecated.is_(False)) + .where(Specimen.deleted.is_(False)) + .join(Specimen.samples) .where(Sample.tumour.is_(True)) .where(Project.id.in_(project_id)) ) stmt2 = ( - select(Patient) - .join(Patient.samples) + select(Specimen) + .where(Specimen.deprecated.is_(False)) + .where(Specimen.deleted.is_(False)) + .join(Specimen.samples) .where(Sample.tumour.is_(False)) .where(Project.id.in_(project_id)) ) else: - if isinstance(patient_id, int): - patient_id = [patient_id] + if isinstance(specimen_id, int): + specimen_id = [specimen_id] stmt1 = ( - select(Patient) - .join(Patient.samples) + select(Specimen) + .where(Specimen.deprecated.is_(False)) + .where(Specimen.deleted.is_(False)) + .join(Specimen.samples) .where(Sample.tumour.is_(True)) .where(Project.id.in_(project_id)) - .where(Patient.id.in_(patient_id)) + .where(Specimen.id.in_(specimen_id)) ) stmt2 = ( - select(Patient) - .join(Patient.samples) + select(Specimen) + .where(Specimen.deprecated.is_(False)) + .where(Specimen.deleted.is_(False)) + .join(Specimen.samples) .where(Sample.tumour.is_(False)) .where(Project.id.in_(project_id)) - .where(Patient.id.in_(patient_id)) + .where(Specimen.id.in_(specimen_id)) ) s1 = set(session.scalars(stmt1).all()) s2 = set(session.scalars(stmt2).all()) @@ -488,29 +668,37 @@ def patient_pair(project_id: str, pair: bool, patient_id=None, tumor: bool=True) return s2.difference(s1) -def patients(project_id=None, patient_id=None): +def specimens(project_id=None, specimen_id=None): """ - Fetching all patients from projets or selected patient from id + Fetching all specimens from projets or selected specimen from id """ session = database.get_session() if isinstance(project_id, str): project_id = [project_id] - if project_id is None and patient_id is None: - stmt = select(Patient) - elif patient_id is None and project_id: + if project_id is None and specimen_id is None: + stmt = ( + select(Specimen) + .where(Specimen.deprecated.is_(False)) + .where(Specimen.deleted.is_(False)) + ) + elif specimen_id is None and project_id: stmt = ( - select(Patient) - .join(Patient.project) + select(Specimen) + .where(Specimen.deprecated.is_(False)) + .where(Specimen.deleted.is_(False)) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) else: - if isinstance(patient_id, int): - patient_id = [patient_id] + if isinstance(specimen_id, int): + specimen_id = [specimen_id] stmt = ( - select(Patient) - .where(Patient.id.in_(patient_id)) - .join(Patient.project) + select(Specimen) + .where(Specimen.deprecated.is_(False)) + .where(Specimen.deleted.is_(False)) + .where(Specimen.id.in_(specimen_id)) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) @@ -526,12 +714,18 @@ def samples(project_id=None, sample_id=None): project_id = [project_id] if project_id is None: - stmt = (select(Sample)) + stmt = ( + select(Sample) + .where(Sample.deprecated.is_(False)) + .where(Sample.deleted.is_(False)) + ) elif sample_id is None: stmt = ( select(Sample) - .join(Sample.patient) - .join(Patient.project) + .where(Sample.deprecated.is_(False)) + .where(Sample.deleted.is_(False)) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) else: @@ -539,16 +733,18 @@ def samples(project_id=None, sample_id=None): sample_id = [sample_id] stmt = ( select(Sample) + .where(Sample.deprecated.is_(False)) + .where(Sample.deleted.is_(False)) .where(Sample.id.in_(sample_id)) - .join(Sample.patient) - .join(Patient.project) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) return session.scalars(stmt).unique().all() -def create_project(project_name, fms_id=None, session=None): +def create_project(project_name, ext_id=None, ext_src=None, session=None): """ Creating new project Returns project even if it already exist @@ -556,7 +752,7 @@ def create_project(project_name, fms_id=None, session=None): if not session: session = database.get_session() - project = Project(name=project_name, fms_id=fms_id) + project = Project(name=project_name, ext_id=ext_id, ext_src=ext_src) session.add(project) @@ -578,6 +774,11 @@ def ingest_run_processing(project_id: str, ingest_data, session=None): if not session: session = database.get_session() + ret = { + "DB_ACTION_WARNING": [], + "DB_ACTION_OUTPUT": [] + } + project = projects(project_id=project_id, session=session)[0] operation = Operation( @@ -596,26 +797,27 @@ def ingest_run_processing(project_id: str, ingest_data, session=None): session.add(job) # Defining Run run = Run.from_attributes( - fms_id=ingest_data[vb.RUN_FMS_ID], + ext_id=ingest_data[vb.RUN_EXT_ID], + ext_src=ingest_data[vb.RUN_EXT_SRC], name=ingest_data[vb.RUN_NAME], instrument=ingest_data[vb.RUN_INSTRUMENT], date=datetime.strptime(ingest_data[vb.RUN_DATE], vb.DATE_LONG_FMT), session=session ) - for patient_json in ingest_data[vb.PATIENT]: - patient = Patient.from_name( - name=patient_json[vb.PATIENT_NAME], - cohort=patient_json[vb.PATIENT_COHORT], - institution=patient_json[vb.PATIENT_INSTITUTION], + for specimen_json in ingest_data[vb.SPECIMEN]: + specimen = Specimen.from_name( + name=specimen_json[vb.SPECIMEN_NAME], + cohort=specimen_json[vb.SPECIMEN_COHORT], + institution=specimen_json[vb.SPECIMEN_INSTITUTION], project=project, session=session ) - for sample_json in patient_json[vb.SAMPLE]: + for sample_json in specimen_json[vb.SAMPLE]: sample = Sample.from_name( name=sample_json[vb.SAMPLE_NAME], tumour=sample_json[vb.SAMPLE_TUMOUR], - patient=patient, + specimen=specimen, session=session ) for readset_json in sample_json[vb.READSET]: @@ -638,7 +840,6 @@ def ingest_run_processing(project_id: str, ingest_data, session=None): adapter1=readset_json[vb.READSET_ADAPTER1], adapter2=readset_json[vb.READSET_ADAPTER2], sequencing_type=SequencingTypeEnum(readset_json[vb.READSET_SEQUENCING_TYPE]), - quality_offset=readset_json[vb.READSET_QUALITY_OFFSET], sample=sample, experiment=experiment, run=run, @@ -650,11 +851,13 @@ def ingest_run_processing(project_id: str, ingest_data, session=None): file_type = os.path.splitext(file_json[vb.FILE_NAME])[-1][1:] if ".gz" in suffixes: file_type = "".join(suffixes[-2:]) + if file_type.startswith("."): + file_type = file_type[1:] if vb.FILE_DELIVERABLE in file_json: file_deliverable = file_json[vb.FILE_DELIVERABLE] else: file_deliverable = False - # Need to have an the following otherwise assigning extra_metadata to None converts null into json in the db + # Need to have the following otherwise assigning extra_metadata to None converts null into json in the db if vb.FILE_EXTRA_METADATA in file_json.keys(): file = File( name=file_json[vb.FILE_NAME], @@ -713,7 +916,13 @@ def ingest_run_processing(project_id: str, ingest_data, session=None): # job job = session.scalars(select(Job).where(Job.id == job_id)).first() - return [operation, job] + ret["DB_ACTION_OUTPUT"].append(operation) + # If no warning + if not ret["DB_ACTION_WARNING"]: + ret.pop("DB_ACTION_WARNING") + # return [operation, jobs] + + return ret def ingest_transfer(project_id: str, ingest_data, session=None, check_readset_name=True): @@ -724,6 +933,11 @@ def ingest_transfer(project_id: str, ingest_data, session=None, check_readset_na if not session: session = database.get_session() + ret = { + "DB_ACTION_WARNING": [], + "DB_ACTION_OUTPUT": [] + } + project = projects(project_id=project_id, session=session)[0] operation = Operation( @@ -743,30 +957,41 @@ def ingest_transfer(project_id: str, ingest_data, session=None, check_readset_na readset_list = [] for readset_json in ingest_data[vb.READSET]: readset_name = readset_json[vb.READSET_NAME] - readset_list.append(session.scalars(select(Readset).where(Readset.name == readset_name)).unique().first()) + readset_list.append( + session.scalars( + select(Readset) + .where(Readset.name == readset_name) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) + ).unique().first() + ) for file_json in readset_json[vb.FILE]: src_uri = file_json[vb.SRC_LOCATION_URI] dest_uri = file_json[vb.DEST_LOCATION_URI] if check_readset_name: file = session.scalars( select(File) + .where(File.deprecated.is_(False)) + .where(File.deleted.is_(False)) .join(File.readsets) .where(Readset.name == readset_name) .join(File.locations) .where(Location.uri == src_uri) ).unique().first() if not file: - raise DidNotFindError(f"No file with uri: {src_uri} and readset {readset_name}") + raise DidNotFindError(f"No 'File' with 'uri' '{src_uri}' and 'Readset' with 'name' '{readset_name}'") else: file = session.scalars( select(File) + .where(File.deprecated.is_(False)) + .where(File.deleted.is_(False)) .join(File.readsets) .where(Readset.name == readset_name) .join(File.locations) .where(Location.uri == src_uri) ).unique().first() if not file: - raise DidNotFindError(f"No file with uri: {src_uri}") + raise DidNotFindError(f"No 'File' with 'uri' '{src_uri}'") new_location = Location.from_uri(uri=dest_uri, file=file, session=session) file.jobs.append(job) @@ -790,7 +1015,12 @@ def ingest_transfer(project_id: str, ingest_data, session=None, check_readset_na # job job = session.scalars(select(Job).where(Job.id == job_id)).first() - return [operation, job] + ret["DB_ACTION_OUTPUT"].append(operation) + # If no warning + if not ret["DB_ACTION_WARNING"]: + ret.pop("DB_ACTION_WARNING") + # return [operation, jobs] + return ret def digest_readset_file(project_id: str, digest_data, session=None): @@ -798,11 +1028,9 @@ def digest_readset_file(project_id: str, digest_data, session=None): if not session: session = database.get_session() - patients = [] - samples = [] readsets = [] output = [] - errors = { + warnings = { "DB_ACTION_WARNING": [] } @@ -815,101 +1043,11 @@ def digest_readset_file(project_id: str, digest_data, session=None): else: raise RequestError(argument="experiment_nucleic_acid_type") - if vb.PATIENT_NAME in digest_data.keys(): - for patient_name in digest_data[vb.PATIENT_NAME]: - patient = session.scalars( - select(Patient) - .where(Patient.name == patient_name) - .join(Patient.samples) - .join(Sample.readsets) - .join(Readset.experiment) - .where(Experiment.nucleic_acid_type == nucleic_acid_type) - ).unique().first() - if patient: - patients.append(patient) - else: - raise DidNotFindError(f"'Patient' with 'name' '{patient_name}' AND 'nucleic_acid_type' '{nucleic_acid_type}' doesn't exist on database") - if vb.PATIENT_ID in digest_data.keys(): - for patient_id in digest_data[vb.PATIENT_ID]: - # logger.debug(f"\n\n{patient_id}\n\n") - patient = session.scalars( - select(Patient) - .where(Patient.id == patient_id) - .join(Patient.samples) - .join(Sample.readsets) - .join(Readset.experiment) - .where(Experiment.nucleic_acid_type == nucleic_acid_type) - ).unique().first() - if patient: - patients.append(patient) - else: - raise DidNotFindError(f"'Patient' with 'id' '{patient_id}' AND 'nucleic_acid_type' '{nucleic_acid_type}' doesn't exist on database") - if patients: - set(patients) - for patient in patients: - for sample in patient.samples: - for readset in sample.readsets: - readsets.append(readset) + readsets += select_readsets_from_specimens(session, digest_data, nucleic_acid_type) + readsets += select_readsets_from_samples(session, digest_data, nucleic_acid_type) + readsets += select_readsets_from_readsets(session, digest_data, nucleic_acid_type) - if vb.SAMPLE_NAME in digest_data.keys(): - for sample_name in digest_data[vb.SAMPLE_NAME]: - sample = session.scalars( - select(Sample) - .where(Sample.name == sample_name) - .join(Sample.readsets) - .join(Readset.experiment) - .where(Experiment.nucleic_acid_type == nucleic_acid_type) - ).unique().first() - if sample: - samples.append(sample) - else: - raise DidNotFindError(f"'Sample' with 'name' '{patient_name}' AND 'nucleic_acid_type' '{nucleic_acid_type}' doesn't exist on database") - if vb.SAMPLE_ID in digest_data.keys(): - for sample_id in digest_data[vb.SAMPLE_ID]: - # logger.debug(f"\n\n{sample_id}\n\n") - sample = session.scalars( - select(Sample) - .where(Sample.id == sample_id) - .join(Sample.readsets) - .join(Readset.experiment) - .where(Experiment.nucleic_acid_type == nucleic_acid_type) - ).unique().first() - if sample: - samples.append(sample) - else: - raise DidNotFindError(f"'Sample' with 'id' '{patient_name}' AND 'nucleic_acid_type' '{nucleic_acid_type}' doesn't exist on database") - if samples: - set(samples) - for sample in samples: - for readset in sample.readsets: - readsets.append(readset) - - if vb.READSET_NAME in digest_data.keys(): - for readset_name in digest_data[vb.READSET_NAME]: - readset = session.scalars( - select(Readset) - .where(Readset.name == readset_name) - .join(Readset.experiment) - .where(Experiment.nucleic_acid_type == nucleic_acid_type) - ).unique().first() - if readset: - readsets.append(readset) - else: - raise DidNotFindError(f"'Readset' with 'name' '{patient_name}' AND 'nucleic_acid_type' '{nucleic_acid_type}' doesn't exist on database") - if vb.READSET_ID in digest_data.keys(): - for readset_id in digest_data[vb.READSET_ID]: - readset = session.scalars( - select(Readset) - .where(Readset.id == readset_id) - .join(Readset.experiment) - .where(Experiment.nucleic_acid_type == nucleic_acid_type) - ).unique().first() - if readset: - readsets.append(readset) - else: - raise DidNotFindError(f"'Readset' with 'id' '{patient_name}' AND 'nucleic_acid_type' '{nucleic_acid_type}' doesn't exist on database") if readsets: - set(readsets) for readset in readsets: readset_files = [] bed = None @@ -929,21 +1067,21 @@ def digest_readset_file(project_id: str, digest_data, session=None): if location_endpoint == location.endpoint: fastq1 = location.uri.split("://")[-1] if not fastq1: - errors["DB_ACTION_WARNING"].append(f"Looking for fastq R1 file for Sample {readset.sample.name} and Readset {readset.name} in '{location_endpoint}', file only exists on {[l.endpoint for l in file.locations]} system") + warnings["DB_ACTION_WARNING"].append(f"Looking for R1 fastq 'File' for 'Sample' with 'name' '{readset.sample.name}' and 'Readset' with 'name' '{readset.name}' in '{location_endpoint}', file only exists on {[l.endpoint for l in file.locations]} system") elif file.extra_metadata["read_type"] == "R2": if location_endpoint: for location in file.locations: if location_endpoint == location.endpoint: fastq2 = location.uri.split("://")[-1] if not fastq2: - errors["DB_ACTION_WARNING"].append(f"Looking for fastq R2 file for Sample {readset.sample.name} and Readset {readset.name} in '{location_endpoint}', file only exists on {[l.endpoint for l in file.locations]} system") + warnings["DB_ACTION_WARNING"].append(f"Looking for R2 fastq 'File' for 'Sample' with 'name' '{readset.sample.name}' and 'Readset' with 'name' '{readset.name}' in '{location_endpoint}', file only exists on {[l.endpoint for l in file.locations]} system") elif file.type == "bam": if location_endpoint: for location in file.locations: if location_endpoint == location.endpoint: bam = location.uri.split("://")[-1] if not bam: - errors["DB_ACTION_WARNING"].append(f"Looking for bam file for Sample {readset.sample.name} and Readset {readset.name} in '{location_endpoint}', file only exists on {[l.endpoint for l in file.locations]} system") + warnings["DB_ACTION_WARNING"].append(f"Looking for bam 'File' for 'Sample' with 'name' '{readset.sample.name}' and 'Readset' with 'name' '{readset.name}' in '{location_endpoint}', file only exists on {[l.endpoint for l in file.locations]} system") if file.type == "bed": bed = file.name readset_line = { @@ -955,15 +1093,15 @@ def digest_readset_file(project_id: str, digest_data, session=None): "Lane": readset.lane.value, "Adapter1": readset.adapter1, "Adapter2": readset.adapter2, - "QualityOffset": readset.quality_offset, + "QualityOffset": "33", "BED": bed, "FASTQ1": fastq1, "FASTQ2": fastq2, "BAM": bam } output.append(readset_line) - if errors["DB_ACTION_WARNING"]: - ret = errors + if warnings["DB_ACTION_WARNING"]: + ret = warnings else: ret = output return json.dumps(ret) @@ -975,7 +1113,7 @@ def digest_pair_file(project_id: str, digest_data, session=None): pair_dict = {} samples = [] - patients = [] + specimens = [] # readsets = [] output = [] @@ -984,44 +1122,50 @@ def digest_pair_file(project_id: str, digest_data, session=None): else: raise RequestError(argument="experiment_nucleic_acid_type") - if vb.PATIENT_NAME in digest_data.keys(): - for patient_name in digest_data[vb.PATIENT_NAME]: - patient = session.scalars( - select(Patient) - .where(Patient.name == patient_name) - .join(Patient.samples) + if vb.SPECIMEN_NAME in digest_data.keys(): + for specimen_name in digest_data[vb.SPECIMEN_NAME]: + specimen = session.scalars( + select(Specimen) + .where(Specimen.deprecated.is_(False)) + .where(Specimen.deleted.is_(False)) + .where(Specimen.name == specimen_name) + .join(Specimen.samples) .join(Sample.readsets) .join(Readset.experiment) .where(Experiment.nucleic_acid_type == nucleic_acid_type) ).unique().first() - if patient: - patients.append(patient) + if specimen: + specimens.append(specimen) else: - raise DidNotFindError(table="Patient", attribute="name", query=patient_name) - if vb.PATIENT_ID in digest_data.keys(): - for patient_id in digest_data[vb.PATIENT_ID]: - patient = session.scalars( - select(Patient) - .where(Patient.id == patient_id) - .join(Patient.samples) + raise DidNotFindError(table="Specimen", attribute="name", query=specimen_name) + if vb.SPECIMEN_ID in digest_data.keys(): + for specimen_id in digest_data[vb.SPECIMEN_ID]: + specimen = session.scalars( + select(Specimen) + .where(Specimen.deprecated.is_(False)) + .where(Specimen.deleted.is_(False)) + .where(Specimen.id == specimen_id) + .join(Specimen.samples) .join(Sample.readsets) .join(Readset.experiment) .where(Experiment.nucleic_acid_type == nucleic_acid_type) ).unique().first() - if patient: - patients.append(patient) + if specimen: + specimens.append(specimen) else: - raise DidNotFindError(table="Patient", attribute="id", query=patient_id) - if patients: - set(patients) - for patient in patients: - for sample in patient.samples: + raise DidNotFindError(table="Specimen", attribute="id", query=specimen_id) + if specimens: + set(specimens) + for specimen in specimens: + for sample in specimen.samples: samples.append(sample) if vb.SAMPLE_NAME in digest_data.keys(): for sample_name in digest_data[vb.SAMPLE_NAME]: sample = session.scalars( select(Sample) + .where(Sample.deprecated.is_(False)) + .where(Sample.deleted.is_(False)) .where(Sample.name == sample_name) .join(Sample.readsets) .join(Readset.experiment) @@ -1035,6 +1179,8 @@ def digest_pair_file(project_id: str, digest_data, session=None): for sample_id in digest_data[vb.SAMPLE_ID]: sample = session.scalars( select(Sample) + .where(Sample.deprecated.is_(False)) + .where(Sample.deleted.is_(False)) .where(Sample.id == sample_id) .join(Sample.readsets) .join(Readset.experiment) @@ -1048,6 +1194,8 @@ def digest_pair_file(project_id: str, digest_data, session=None): for readset_name in digest_data[vb.READSET_NAME]: readset = session.scalars( select(Readset) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) .where(Readset.name == readset_name) .join(Readset.experiment) .where(Experiment.nucleic_acid_type == nucleic_acid_type) @@ -1060,6 +1208,8 @@ def digest_pair_file(project_id: str, digest_data, session=None): for readset_id in digest_data[vb.READSET_ID]: readset = session.scalars( select(Readset) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) .where(Readset.id == readset_id) .join(Readset.experiment) .where(Experiment.nucleic_acid_type == nucleic_acid_type) @@ -1071,19 +1221,19 @@ def digest_pair_file(project_id: str, digest_data, session=None): if samples: set(samples) for sample in samples: - if not sample.patient.name in pair_dict.keys(): - pair_dict[sample.patient.name] = { + if not sample.specimen.name in pair_dict.keys(): + pair_dict[sample.specimen.name] = { "T": None, "N": None } if sample.tumour: - pair_dict[sample.patient.name]["T"] = sample.name + pair_dict[sample.specimen.name]["T"] = sample.name else: - pair_dict[sample.patient.name]["N"] = sample.name + pair_dict[sample.specimen.name]["N"] = sample.name if pair_dict: - for patient_name, dict_tn in pair_dict.items(): + for specimen_name, dict_tn in pair_dict.items(): pair_line = { - "Patient": patient_name, + "Specimen": specimen_name, "Sample_N": dict_tn["N"], "Sample_T": dict_tn["T"] } @@ -1099,6 +1249,11 @@ def ingest_genpipes(project_id: str, ingest_data, session=None): if not session: session = database.get_session() + ret = { + "DB_ACTION_WARNING": [], + "DB_ACTION_OUTPUT": [] + } + project = projects(project_id=project_id, session=session)[0] operation_config = OperationConfig.from_attributes( @@ -1119,23 +1274,29 @@ def ingest_genpipes(project_id: str, ingest_data, session=None): ) readset_list = [] + if not ingest_data[vb.SAMPLE]: + raise RequestError("No 'Sample' found, this json won't be ingested.") for sample_json in ingest_data[vb.SAMPLE]: sample = session.scalars( select(Sample) + .where(Sample.deprecated.is_(False)) + .where(Sample.deleted.is_(False)) .where(Sample.name == sample_json[vb.SAMPLE_NAME]) ).unique().first() if not sample: - raise DidNotFindError(f"No sample named {sample_json[vb.SAMPLE_NAME]}") + raise DidNotFindError(f"No Sample named '{sample_json[vb.SAMPLE_NAME]}'") for readset_json in sample_json[vb.READSET]: readset = session.scalars( select(Readset) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) .where(Readset.name == readset_json[vb.READSET_NAME]) ).unique().first() readset_list.append(readset) if not readset: - raise DidNotFindError(f"No readset named {readset_json[vb.READSET_NAME]}") + raise DidNotFindError(f"No Readset named '{readset_json[vb.READSET_NAME]}'") if readset.sample != sample: - raise DidNotFindError(f"sample {sample_json[vb.SAMPLE_NAME]} not linked with readset {readset_json[vb.READSET_NAME]}") + raise DidNotFindError(f"'Sample' with 'name' '{sample_json[vb.SAMPLE_NAME]}' not linked with 'Readset' with 'name' '{readset_json[vb.READSET_NAME]}'") for job_json in readset_json[vb.JOB]: try: job_start = datetime.strptime(job_json[vb.JOB_START], vb.DATE_LONG_FMT) @@ -1202,13 +1363,18 @@ def ingest_genpipes(project_id: str, ingest_data, session=None): ) # If job status is null then skip it as we don't want to ingest data not generated else: - pass + ret["DB_ACTION_WARNING"].append(f"'Readset' with 'name' '{readset.name}' has 'Job' with 'name' '{job_json[vb.JOB_NAME]}' with no status, skipping.") - session.add(job) - session.flush() + try: + session.add(job) + session.flush() + except UnboundLocalError: + pass operation.readsets = readset_list operation_id = operation.id job_ids = [job.id for job in operation.jobs] + if not job_ids: + raise RequestError("No 'Job' has a status, this json won't be ingested.") try: session.commit() except exc.SQLAlchemyError as error: @@ -1218,9 +1384,11 @@ def ingest_genpipes(project_id: str, ingest_data, session=None): # operation operation = session.scalars(select(Operation).where(Operation.id == operation_id)).first() # jobs - jobs = [session.scalars(select(Job).where(Job.id == job_id)).first() for job_id in job_ids] - - return [operation, jobs] + ret["DB_ACTION_OUTPUT"].append(operation) + # If no warning + if not ret["DB_ACTION_WARNING"]: + ret.pop("DB_ACTION_WARNING") + return ret def digest_unanalyzed(project_id: str, digest_data, session=None): @@ -1242,29 +1410,50 @@ def digest_unanalyzed(project_id: str, digest_data, session=None): run_id = digest_data["run_id"] run_name = digest_data["run_name"] if run_name: - run_id = name_to_id("Run", run_name)[0] + try: + run_id = name_to_id("Run", run_name)[0] + except: + raise DidNotFindError(f"'Run' with 'name' '{run_name}' doesn't exist on database") experiment_nucleic_acid_type = digest_data["experiment_nucleic_acid_type"] location_endpoint = digest_data["location_endpoint"] if sample_name_flag: - stmt = select(Sample.name) + stmt = ( + select(Sample.name) + .where(Sample.deprecated.is_(False)) + .where(Sample.deleted.is_(False)) + .join(Sample.readsets) + ) key = "sample_name" elif sample_id_flag: - stmt = select(Sample.id) + stmt = ( + select(Sample.id) + .where(Sample.deprecated.is_(False)) + .where(Sample.deleted.is_(False)) + .join(Sample.readsets) + ) key = "sample_id" elif readset_name_flag: - stmt = select(Readset.name) + stmt = ( + select(Readset.name) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) + ) key = "readset_name" elif readset_id_flag: - stmt = select(Readset.id) + stmt = ( + select(Readset.id) + .where(Readset.deprecated.is_(False)) + .where(Readset.deleted.is_(False)) + ) key = "readset_id" stmt = ( - stmt.join(Sample.readsets) + stmt.where(Readset.state == StateEnum("VALID")) .join(Readset.operations) - .where(Operation.name.ilike(f"%genpipes%")) - .join(Sample.patient) - .join(Patient.project) + .where(Operation.name.notilike("%genpipes%")) + .join(Sample.specimen) + .join(Specimen.project) .where(Project.id.in_(project_id)) ) @@ -1274,6 +1463,7 @@ def digest_unanalyzed(project_id: str, digest_data, session=None): .join(Readset.run) ) if experiment_nucleic_acid_type: + logger.debug(f"run_name: {run_name}") stmt = ( stmt.where(Experiment.nucleic_acid_type == experiment_nucleic_acid_type) .join(Readset.experiment) @@ -1286,3 +1476,307 @@ def digest_unanalyzed(project_id: str, digest_data, session=None): } return json.dumps(output) + +def digest_delivery(project_id: str, digest_data, session=None): + """ + Getting delivery samples or readsets + """ + if not session: + session = database.get_session() + + location_endpoint = None + if vb.LOCATION_ENDPOINT in digest_data.keys(): + location_endpoint = digest_data[vb.LOCATION_ENDPOINT] + + readsets = [] + output = { + "location_endpoint": location_endpoint, + "readset": [] + } + warnings = { + "DB_ACTION_WARNING": [] + } + + if vb.EXPERIMENT_NUCLEIC_ACID_TYPE in digest_data.keys(): + nucleic_acid_type = digest_data[vb.EXPERIMENT_NUCLEIC_ACID_TYPE] + else: + raise RequestError(argument="experiment_nucleic_acid_type") + + readsets += select_readsets_from_specimens(session, digest_data, nucleic_acid_type) + readsets += select_readsets_from_samples(session, digest_data, nucleic_acid_type) + readsets += select_readsets_from_readsets(session, digest_data, nucleic_acid_type) + + if readsets: + for readset in readsets: + readset_files = [] + for file in readset.files: + if file.deliverable: + if location_endpoint: + logger.debug(f"File: {file}") + for location in file.locations: + logger.debug(f"Location: {location}") + if location_endpoint == location.endpoint: + file_deliverable = location.uri.split("://")[-1] + if not file_deliverable: + warnings["DB_ACTION_WARNING"].append(f"Looking for 'File' with 'name' '{file.name}' for 'Sample' with 'name' '{readset.sample.name}' and 'Readset' with 'name' '{readset.name}' in '{location_endpoint}', file only exists on {[l.endpoint for l in file.locations]} system") + else: + readset_files.append({ + "name": file.name, + "location": file_deliverable + }) + output["readset"].append({ + "name": readset.name, + "file": readset_files + }) + if warnings["DB_ACTION_WARNING"]: + ret = warnings + else: + ret = output + return json.dumps(ret) + +def edit(ingest_data, session=None): + """Edition of the database based on ingested_data""" + if not isinstance(ingest_data, dict): + ingest_data = json.loads(ingest_data) + + if not session: + session = database.get_session() + + ret = { + "DB_ACTION_WARNING": [], + "DB_ACTION_OUTPUT": [] + } + + if not ingest_data[vb.MODIFICATION]: + raise RequestError("No 'table' provided under 'modification' list, this json is malformed.") + + for table in ingest_data[vb.MODIFICATION]: + from . import model + the_table = getattr(model, table[vb.TABLE].title()) + for current_id in set(table[vb.ID]): + selected_table = session.scalars(select(the_table).where(the_table.id == current_id)).unique().first() + if not selected_table: + raise DidNotFindError(table=table[vb.TABLE], attribute="id", query=current_id) + old = getattr(selected_table, table[vb.COLUMN]) + latest_modification = selected_table.modification + # Skip the edit if the new value is the same as the old one + if old == table[vb.NEW]: + ret["DB_ACTION_WARNING"].append(f"Table '{table[vb.TABLE]}' with id '{current_id}' already has '{table[vb.COLUMN]}' with value '{old}'. Skipping...") + else: + setattr(selected_table, table[vb.COLUMN], table[vb.NEW]) + new = getattr(selected_table, table[vb.COLUMN]) + ret["DB_ACTION_OUTPUT"].append(f"Table '{table[vb.TABLE]}' edited: column '{table[vb.COLUMN]}' with id '{current_id}' changes from '{old}' to '{new}' (previous edit done '{latest_modification}').") + + try: + session.commit() + except exc.SQLAlchemyError as error: + logger.error("Error: %s", error) + session.rollback() + + # If no warning + if not ret["DB_ACTION_WARNING"]: + ret.pop("DB_ACTION_WARNING") + + return ret + +def delete(ingest_data, session=None): + """deletion of the database based on ingested_data""" + if not isinstance(ingest_data, dict): + ingest_data = json.loads(ingest_data) + + if not session: + session = database.get_session() + + ret = { + "DB_ACTION_WARNING": [], + "DB_ACTION_OUTPUT": [] + } + + if not ingest_data[vb.MODIFICATION]: + raise RequestError("No 'table' provided under 'modification' list, this json is malformed.") + + for table in ingest_data[vb.MODIFICATION]: + from . import model + the_table = getattr(model, table[vb.TABLE].title()) + for current_id in set(table[vb.ID]): + selected_table = session.scalars(select(the_table).where(the_table.id == current_id)).unique().first() + if not selected_table: + raise DidNotFindError(table=table[vb.TABLE], attribute="id", query=current_id) + if selected_table.deleted is True: + ret["DB_ACTION_WARNING"].append(f"'{table[vb.TABLE]}' with id '{current_id}' already deleted.. Skipping...") + else: + selected_table.deleted = True + ret["DB_ACTION_OUTPUT"].append(f"'{table[vb.TABLE]}' with id '{current_id}' deleted.") + + try: + session.commit() + except exc.SQLAlchemyError as error: + logger.error("Error: %s", error) + session.rollback() + + # If no warning + if not ret["DB_ACTION_WARNING"]: + ret.pop("DB_ACTION_WARNING") + + return ret + +def undelete(ingest_data, session=None): + """revert deletion of the database based on ingested_data""" + if not isinstance(ingest_data, dict): + ingest_data = json.loads(ingest_data) + + if not session: + session = database.get_session() + + ret = { + "DB_ACTION_WARNING": [], + "DB_ACTION_OUTPUT": [] + } + + if not ingest_data[vb.MODIFICATION]: + raise RequestError("No 'table' provided under 'modification' list, this json is malformed.") + + for table in ingest_data[vb.MODIFICATION]: + from . import model + the_table = getattr(model, table[vb.TABLE].title()) + for current_id in set(table[vb.ID]): + selected_table = session.scalars(select(the_table).where(the_table.id == current_id)).unique().first() + if not selected_table: + raise DidNotFindError(table=table[vb.TABLE], attribute="id", query=current_id) + if selected_table.deleted is False: + ret["DB_ACTION_WARNING"].append(f"'{table[vb.TABLE]}' with id '{current_id}' already undeleted.. Skipping...") + else: + selected_table.deleted = False + ret["DB_ACTION_OUTPUT"].append(f"'{table[vb.TABLE]}' with id '{current_id}' undeleted.") + + try: + session.commit() + except exc.SQLAlchemyError as error: + logger.error("Error: %s", error) + session.rollback() + + # If no warning + if not ret["DB_ACTION_WARNING"]: + ret.pop("DB_ACTION_WARNING") + + return ret + +def deprecate(ingest_data, session=None): + """deprecation of the database based on ingested_data""" + if not isinstance(ingest_data, dict): + ingest_data = json.loads(ingest_data) + + if not session: + session = database.get_session() + + ret = { + "DB_ACTION_WARNING": [], + "DB_ACTION_OUTPUT": [] + } + + if not ingest_data[vb.MODIFICATION]: + raise RequestError("No 'table' provided under 'modification' list, this json is malformed.") + + for table in ingest_data[vb.MODIFICATION]: + from . import model + the_table = getattr(model, table[vb.TABLE].title()) + for current_id in set(table[vb.ID]): + selected_table = session.scalars(select(the_table).where(the_table.id == current_id)).unique().first() + if not selected_table: + raise DidNotFindError(table=table[vb.TABLE], attribute="id", query=current_id) + if selected_table.deprecated is True: + ret["DB_ACTION_WARNING"].append(f"'{table[vb.TABLE]}' with id '{current_id}' already deprecated.. Skipping...") + else: + selected_table.deprecated = True + ret["DB_ACTION_OUTPUT"].append(f"'{table[vb.TABLE]}' with id '{current_id}' deprecated.") + + try: + session.commit() + except exc.SQLAlchemyError as error: + logger.error("Error: %s", error) + session.rollback() + + # If no warning + if not ret["DB_ACTION_WARNING"]: + ret.pop("DB_ACTION_WARNING") + + return ret + +def undeprecate(ingest_data, session=None): + """revert deprecation of the database based on ingested_data""" + if not isinstance(ingest_data, dict): + ingest_data = json.loads(ingest_data) + + if not session: + session = database.get_session() + + ret = { + "DB_ACTION_WARNING": [], + "DB_ACTION_OUTPUT": [] + } + + if not ingest_data[vb.MODIFICATION]: + raise RequestError("No 'table' provided under 'modification' list, this json is malformed.") + + for table in ingest_data[vb.MODIFICATION]: + from . import model + the_table = getattr(model, table[vb.TABLE].title()) + for current_id in set(table[vb.ID]): + selected_table = session.scalars(select(the_table).where(the_table.id == current_id)).unique().first() + if not selected_table: + raise DidNotFindError(table=table[vb.TABLE], attribute="id", query=current_id) + if selected_table.deprecated is False: + ret["DB_ACTION_WARNING"].append(f"'{table[vb.TABLE]}' with id '{current_id}' already undeprecated.. Skipping...") + else: + selected_table.deprecated = False + ret["DB_ACTION_OUTPUT"].append(f"'{table[vb.TABLE]}' with id '{current_id}' undeprecated.") + + try: + session.commit() + except exc.SQLAlchemyError as error: + logger.error("Error: %s", error) + session.rollback() + + # If no warning + if not ret["DB_ACTION_WARNING"]: + ret.pop("DB_ACTION_WARNING") + + return ret + +def curate(ingest_data, session=None): + """curate the database based on ingested_data""" + if not isinstance(ingest_data, dict): + ingest_data = json.loads(ingest_data) + + if not session: + session = database.get_session() + + ret = { + "DB_ACTION_WARNING": [], + "DB_ACTION_OUTPUT": [] + } + + if not ingest_data[vb.MODIFICATION]: + raise RequestError("No 'table' provided under 'modification' list, this json is malformed.") + + for table in ingest_data[vb.MODIFICATION]: + from . import model + the_table = getattr(model, table[vb.TABLE].title()) + for current_id in set(table[vb.ID]): + selected_table = session.scalars(select(the_table).where(the_table.id == current_id)).unique().first() + if not selected_table: + raise DidNotFindError(table=table[vb.TABLE], attribute="id", query=current_id) + session.delete(selected_table) + ret["DB_ACTION_OUTPUT"].append(f"'{table[vb.TABLE]}' with id '{current_id}' deleted.") + + try: + session.commit() + except exc.SQLAlchemyError as error: + logger.error("Error: %s", error) + session.rollback() + + # If no warning + if not ret["DB_ACTION_WARNING"]: + ret.pop("DB_ACTION_WARNING") + + return ret diff --git a/project_tracking/model.py b/project_tracking/model.py index 8ad1ee6..3d26559 100644 --- a/project_tracking/model.py +++ b/project_tracking/model.py @@ -62,6 +62,14 @@ class SequencingTypeEnum(enum.Enum): SINGLE_END = "SINGLE_END" PAIRED_END = "PAIRED_END" +class StateEnum(enum.Enum): + """ + state enum + """ + VALID = "VALID" + ON_HOLD = "ON_HOLD" + INVALID = "INVALID" + class StatusEnum(enum.Enum): """ @@ -82,6 +90,8 @@ class FlagEnum(enum.Enum): PASS = "PASS" WARNING = "WARNING" FAILED = "FAILED" + MISSING = "MISSING" + NOT_APPLICABLE = "NOT_APPLICABLE" class AggregateEnum(enum.Enum): @@ -158,6 +168,8 @@ class BaseTable(Base): creation timestamp modification timestamp extra_metadata json + ext_id integer + ext_src text """ __abstract__ = True @@ -165,10 +177,10 @@ class BaseTable(Base): deprecated: Mapped[bool] = mapped_column(default=False) deleted: Mapped[bool] = mapped_column(default=False) creation: Mapped[DateTime] = Column(DateTime(timezone=True), server_default=func.now()) - # creation: Mapped[datetime] = mapped_column(default=datetime.now(), nullable=True) - # modification: Mapped[datetime] = mapped_column(default=None, nullable=True) modification: Mapped[DateTime] = Column(DateTime(timezone=True), onupdate=func.now()) extra_metadata: Mapped[dict] = mapped_column(mutable_json_type(dbtype=JSON, nested=True), default=None, nullable=True) + ext_id: Mapped[int] = mapped_column(default=None, nullable=True) + ext_src: Mapped[str] = mapped_column(default=None, nullable=True) def __repr__(self): """ @@ -238,7 +250,6 @@ class Project(BaseTable): """ Project: id integer [PK] - fms_id integer name text (unique) alias json deprecated boolean @@ -249,20 +260,18 @@ class Project(BaseTable): """ __tablename__ = "project" - fms_id: Mapped[str] = mapped_column(default=None, nullable=True) name: Mapped[str] = mapped_column(default=None, nullable=False, unique=True) alias: Mapped[dict] = mapped_column(mutable_json_type(dbtype=JSON, nested=True), default=None, nullable=True) - patients: Mapped[list["Patient"]] = relationship(back_populates="project") - operations: Mapped[list["Operation"]] = relationship(back_populates="project") + specimens: Mapped[list["Specimen"]] = relationship(back_populates="project", cascade="all, delete") + operations: Mapped[list["Operation"]] = relationship(back_populates="project", cascade="all, delete") -class Patient(BaseTable): +class Specimen(BaseTable): """ - Patient: + Specimen: id integer [PK] project_id integer [ref: > project.id] - fms_id integer name text (unique) alias json cohort text @@ -273,44 +282,42 @@ class Patient(BaseTable): modification timestamp extra_metadata json """ - __tablename__ = "patient" + __tablename__ = "specimen" project_id: Mapped[int] = mapped_column(ForeignKey("project.id"), default=None) - fms_id: Mapped[str] = mapped_column(default=None, nullable=True) name: Mapped[str] = mapped_column(default=None, nullable=False, unique=True) alias: Mapped[dict] = mapped_column(mutable_json_type(dbtype=JSON, nested=True), default=None, nullable=True) cohort: Mapped[str] = mapped_column(default=None, nullable=True) institution: Mapped[str] = mapped_column(default=None, nullable=True) - project: Mapped["Project"] = relationship(back_populates="patients") - samples: Mapped[list["Sample"]] = relationship(back_populates="patient") + project: Mapped["Project"] = relationship(back_populates="specimens") + samples: Mapped[list["Sample"]] = relationship(back_populates="specimen", cascade="all, delete") @classmethod def from_name(cls, name, project, cohort=None, institution=None, session=None): """ - get patient if it exist, set it if it does not exist + get specimen if it exist, set it if it does not exist """ if session is None: session = database.get_session() # Name is unique - patient = session.scalars(select(cls).where(cls.name == name)).first() + specimen = session.scalars(select(cls).where(cls.name == name)).first() - if not patient: - patient = cls(name=name, cohort=cohort, institution=institution, project=project) + if not specimen: + specimen = cls(name=name, cohort=cohort, institution=institution, project=project) else: - if patient.project != project: - logger.error(f"patient {patient.name} already in project {patient.project}") + if specimen.project != project: + logger.error(f"specimen {specimen.name} already in project {specimen.project}") - return patient + return specimen class Sample(BaseTable): """ Sample: id integer [PK] - patient_id integer [ref: > patient.id] - fms_id integer + specimen_id integer [ref: > specimen.id] name text (unique) alias json tumour boolean @@ -322,17 +329,16 @@ class Sample(BaseTable): """ __tablename__ = "sample" - patient_id: Mapped[int] = mapped_column(ForeignKey("patient.id"), default=None) - fms_id: Mapped[str] = mapped_column(default=None, nullable=True) + specimen_id: Mapped[int] = mapped_column(ForeignKey("specimen.id"), default=None) name: Mapped[str] = mapped_column(default=None, nullable=False, unique=True) alias: Mapped[dict] = mapped_column(mutable_json_type(dbtype=JSON, nested=True), default=None, nullable=True) tumour: Mapped[bool] = mapped_column(default=False) - patient: Mapped["Patient"] = relationship(back_populates="samples") - readsets: Mapped[list["Readset"]] = relationship(back_populates="sample") + specimen: Mapped["Specimen"] = relationship(back_populates="samples") + readsets: Mapped[list["Readset"]] = relationship(back_populates="sample", cascade="all, delete") @classmethod - def from_name(cls, name, patient, tumour=None, session=None): + def from_name(cls, name, specimen, tumour=None, session=None): """ get sample if it exist, set it if it does not exist """ @@ -343,10 +349,10 @@ def from_name(cls, name, patient, tumour=None, session=None): sample = session.scalars(select(cls).where(cls.name == name)).first() if not sample: - sample = cls(name=name, patient=patient, tumour=tumour) + sample = cls(name=name, specimen=specimen, tumour=tumour) else: - if sample.patient != patient: - logger.error(f"sample {sample.patient} already attatched to project {patient.name}") + if sample.specimen != specimen: + logger.error(f"sample {sample.specimen} already attatched to project {specimen.name}") return sample @@ -374,7 +380,7 @@ class Experiment(BaseTable): library_kit: Mapped[str] = mapped_column(default=None, nullable=True) kit_expiration_date: Mapped[datetime] = mapped_column(default=None, nullable=True) - readsets: Mapped[list["Readset"]] = relationship(back_populates="experiment") + readsets: Mapped[list["Readset"]] = relationship(back_populates="experiment", cascade="all, delete") @classmethod def from_attributes( @@ -412,9 +418,8 @@ def from_attributes( class Run(BaseTable): """ - Patient: + Specimen: id integer [PK] - fms_id text name text instrument text date timestamp @@ -426,15 +431,14 @@ class Run(BaseTable): """ __tablename__ = "run" - fms_id: Mapped[str] = mapped_column(default=None, nullable=True) name: Mapped[str] = mapped_column(default=None, nullable=True) instrument: Mapped[str] = mapped_column(default=None, nullable=True) date: Mapped[datetime] = mapped_column(default=None, nullable=True) - readsets: Mapped[list["Readset"]] = relationship(back_populates="run") + readsets: Mapped[list["Readset"]] = relationship(back_populates="run", cascade="all, delete") @classmethod - def from_attributes(cls, fms_id=None, name=None, instrument=None, date=None, session=None): + def from_attributes(cls, ext_id=None, ext_src=None, name=None, instrument=None, date=None, session=None): """ get run if it exist, set it if it does not exist """ @@ -442,14 +446,16 @@ def from_attributes(cls, fms_id=None, name=None, instrument=None, date=None, ses session = database.get_session() run = session.scalars( select(cls) - .where(cls.fms_id == fms_id) + .where(cls.ext_id == ext_id) + .where(cls.ext_src == ext_src) .where(cls.name == name) .where(cls.instrument == instrument) .where(cls.date == date) ).first() if not run: run = cls( - fms_id=fms_id, + ext_id=ext_id, + ext_src=ext_src, name=name, instrument=instrument, date=date @@ -470,7 +476,7 @@ class Readset(BaseTable): adapter1 text adapter2 text sequencing_type sequencing_type - quality_offset text + state state deprecated boolean deleted boolean creation timestamp @@ -488,7 +494,7 @@ class Readset(BaseTable): adapter1: Mapped[str] = mapped_column(default=None, nullable=True) adapter2: Mapped[str] = mapped_column(default=None, nullable=True) sequencing_type: Mapped[SequencingTypeEnum] = mapped_column(default=None, nullable=True) - quality_offset: Mapped[str] = mapped_column(default=None, nullable=True) + state: Mapped[StateEnum] = mapped_column(default=StateEnum.VALID, nullable=True) sample: Mapped["Sample"] = relationship(back_populates="readsets") experiment: Mapped["Experiment"] = relationship(back_populates="readsets") @@ -523,6 +529,7 @@ class Operation(BaseTable): Operation: id integer [PK] operation_config_id integer [ref: > operation_config.id] + reference_id integer [ref: > operation_config.id] platform text cmd_line text name text @@ -537,16 +544,44 @@ class Operation(BaseTable): project_id: Mapped[int] = mapped_column(ForeignKey("project.id"), default=None) operation_config_id: Mapped[int] = mapped_column(ForeignKey("operation_config.id"), default=None, nullable=True) + reference_id: Mapped[int] = mapped_column(ForeignKey("reference.id"), default=None, nullable=True) platform: Mapped[str] = mapped_column(default=None, nullable=True) cmd_line: Mapped[str] = mapped_column(default=None, nullable=True) name: Mapped[str] = mapped_column(default=None, nullable=True) status: Mapped[StatusEnum] = mapped_column(default=StatusEnum.PENDING) operation_config: Mapped["OperationConfig"] = relationship(back_populates="operations") + reference: Mapped["Reference"] = relationship(back_populates="operations") project: Mapped["Project"] = relationship(back_populates="operations") - jobs: Mapped[list["Job"]] = relationship(back_populates="operation") + jobs: Mapped[list["Job"]] = relationship(back_populates="operation", cascade="all, delete") readsets: Mapped[list["Readset"]] = relationship(secondary=readset_operation, back_populates="operations") +class Reference(BaseTable): + """ + Reference: + name text // scientific name + alias text + assembly text + version test + taxon_id text + source text + deprecated boolean + deleted boolean + creation timestamp + modification timestamp + extra_metadata json + """ + __tablename__ = "reference" + + name: Mapped[str] = mapped_column(default=None, nullable=True) + alias: Mapped[str] = mapped_column(default=None, nullable=True) + assembly: Mapped[str] = mapped_column(default=None, nullable=True) + version: Mapped[str] = mapped_column(default=None, nullable=True) + taxon_id: Mapped[str] = mapped_column(default=None, nullable=True) + source: Mapped[str] = mapped_column(default=None, nullable=True) + + operations: Mapped[list["Operation"]] = relationship(back_populates="reference", cascade="all, delete") + class OperationConfig(BaseTable): """ @@ -570,7 +605,7 @@ class OperationConfig(BaseTable): md5sum: Mapped[str] = mapped_column(unique=True, default=None, nullable=True) data: Mapped[bytes] = mapped_column(LargeBinary, default=None, nullable=True) - operations: Mapped[list["Operation"]] = relationship(back_populates="operation_config") + operations: Mapped[list["Operation"]] = relationship(back_populates="operation_config", cascade="all, delete") @classmethod def config_data(cls, data): @@ -630,7 +665,7 @@ class Job(BaseTable): type: Mapped[str] = mapped_column(default=None, nullable=True) operation: Mapped["Operation"] = relationship(back_populates="jobs") - metrics: Mapped[list["Metric"]] = relationship(back_populates="job") + metrics: Mapped[list["Metric"]] = relationship(back_populates="job", cascade="all, delete") files: Mapped[list["File"]] = relationship(secondary=job_file,back_populates="jobs") readsets: Mapped[list["Readset"]] = relationship(secondary=readset_job, back_populates="jobs") @@ -725,6 +760,6 @@ class File(BaseTable): md5sum: Mapped[str] = mapped_column(default=None, nullable=True) deliverable: Mapped[bool] = mapped_column(default=False) - locations: Mapped[list["Location"]] = relationship(back_populates="file") + locations: Mapped[list["Location"]] = relationship(back_populates="file", cascade="all, delete") readsets: Mapped[list["Readset"]] = relationship(secondary=readset_file, back_populates="files") jobs: Mapped[list["Job"]] = relationship(secondary=job_file, back_populates="files") diff --git a/project_tracking/vocabulary.py b/project_tracking/vocabulary.py index 28920d5..bcad85c 100644 --- a/project_tracking/vocabulary.py +++ b/project_tracking/vocabulary.py @@ -3,23 +3,32 @@ # Generic DATE_LONG_FMT = "%Y-%m-%d %H:%M:%S" DATE_FMT = "%Y-%m-%d" +TABLE = "table" +COLUMN = "column" +OLD = "old" +NEW = "new" +ID = "id" +MODIFICATION = "modification" # project table -PROJECT_FMS_ID = "project_fms_id" +PROJECT_EXT_ID = "project_ext_id" +PROJECT_EXT_SRC = "project_ext_src" PROJECT_NAME = "project_name" -# patient table -PATIENT = "patient" -PATIENT_ID = "patient_id" -PATIENT_FMS_ID = "patient_fms_id" -PATIENT_NAME = "patient_name" -PATIENT_COHORT = "patient_cohort" -PATIENT_INSTITUTION = "patient_institution" +# specimen table +SPECIMEN = "specimen" +SPECIMEN_ID = "specimen_id" +SPECIMEN_EXT_ID = "specimen_ext_id" +SPECIMEN_EXT_SRC = "specimen_ext_src" +SPECIMEN_NAME = "specimen_name" +SPECIMEN_COHORT = "specimen_cohort" +SPECIMEN_INSTITUTION = "specimen_institution" # sample table SAMPLE = "sample" SAMPLE_ID = "sample_id" -SAMPLE_FMS_ID = "sample_fms_id" +SAMPLE_EXT_ID = "sample_ext_id" +SAMPLE_EXT_SRC = "sample_ext_src" SAMPLE_NAME = "sample_name" SAMPLE_TUMOUR = "sample_tumour" @@ -32,7 +41,8 @@ EXPERIMENT_TYPE_LIST = ["PCR-FREE", "RNASEQ"] # run table -RUN_FMS_ID = "run_fms_id" +RUN_EXT_ID = "run_ext_id" +RUN_EXT_SRC = "run_ext_src" RUN_NAME = "run_name" RUN_INSTRUMENT = "run_instrument" RUN_DATE = "run_date" diff --git a/pyproject.toml b/pyproject.toml index 08d7846..2f15d25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["hatchling"] +requires = ["hatchling", "hatch-vcs"] build-backend = "hatchling.build" [tool.pytest.ini_options] @@ -13,9 +13,15 @@ testpaths = [ run.branch = true run.source = ["project_tracking"] +[tool.hatch.version] +source = "vcs" + +[tool.hatch.build.hooks.vcs] +version-file = "_version.py" + [project] name = "project_tracking" -version = "0.0.1" +dynamic = ["version"] authors = [ { name="P-O Quirion", email="po.quirion@mcgill.ca" }, { name="Paul Streteowich", email="paul.stretenowich@mcgill.ca" }, @@ -31,6 +37,9 @@ classifiers = [ dependencies = [ "flask>=2.2.2", "sqlalchemy>=2.0.25", + "alembic>=1.13.1", + "alembic-utils", + "alembic-postgresql-enum", "gunicorn>=20.1.0", "sqlalchemy-json>=0.5.0" ] diff --git a/tests/conftest.py b/tests/conftest.py index 99b407a..8d24e23 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -30,9 +30,9 @@ def pre_filled_model(): sequencing_technology = 'Fancy Buzzword' exp = model.Experiment(nucleic_acid_type=model.NucleicAcidTypeEnum.DNA) pa_name = "P_O" - pa = model.Patient(name=pa_name, project=project) + pa = model.Specimen(name=pa_name, project=project) sa_name = 'gros_bobo' - sa = model.Sample(name=sa_name, patient=pa) + sa = model.Sample(name=sa_name, specimen=pa) ru_name = "cure the Conglomerat old director's partner 01" instrument = 'Grosse machine du 6e' ru = model.Run(instrument=instrument, name=ru_name) diff --git a/tests/data/run_processing.json b/tests/data/run_processing.json index 5e85ebd..a106115 100644 --- a/tests/data/run_processing.json +++ b/tests/data/run_processing.json @@ -1,20 +1,24 @@ { "operation_platform": "abacus", - "project_fms_id": null, + "project_ext_id": null, + "project_ext_src": null, "project_name": "MOH-Q", - "run_fms_id": null, + "run_ext_id": null, + "run_ext_src": null, "run_name": "220420_A01433_0157_BHM3NHDSX2_MoHRun08-novaseq", "run_instrument": "novaseq", "run_date": "2022-04-20 00:00:00", - "patient": [ + "specimen": [ { - "patient_fms_id": null, - "patient_name": "MoHQ-JG-9-23", - "patient_cohort": "MoHQ-JG-9", - "patient_institution": "JG", + "specimen_ext_id": null, + "specimen_ext_src": null, + "specimen_name": "MoHQ-JG-9-23", + "specimen_cohort": "MoHQ-JG-9", + "specimen_institution": "JG", "sample": [ { - "sample_fms_id": null, + "sample_ext_id": null, + "sample_ext_src": null, "sample_name": "MoHQ-JG-9-23-15000863775-19933DT", "sample_tumour": true, "readset": [ @@ -67,7 +71,8 @@ ] }, { - "sample_fms_id": null, + "sample_ext_id": null, + "sample_ext_src": null, "sample_name": "MoHQ-JG-9-23-15000936286-19866DN", "sample_tumour": false, "readset": [ @@ -122,13 +127,15 @@ ] }, { - "patient_fms_id": null, - "patient_name": "MoHQ-CM-1-3", - "patient_cohort": "MoHQ-CM-1", - "patient_institution": "CM", + "specimen_ext_id": null, + "specimen_ext_src": null, + "specimen_name": "MoHQ-CM-1-3", + "specimen_cohort": "MoHQ-CM-1", + "specimen_institution": "CM", "sample": [ { - "sample_fms_id": null, + "sample_ext_id": null, + "sample_ext_src": null, "sample_name": "MoHQ-CM-1-3-6929-1RT", "sample_tumour": true, "readset": [ @@ -187,7 +194,8 @@ ] }, { - "sample_fms_id": null, + "sample_ext_id": null, + "sample_ext_src": null, "sample_name": "MoHQ-CM-1-3-15000863775-19933DT", "sample_tumour": true, "readset": [ @@ -240,7 +248,8 @@ ] }, { - "sample_fms_id": null, + "sample_ext_id": null, + "sample_ext_src": null, "sample_name": "MoHQ-CM-1-3-15000936286-19866DN", "sample_tumour": false, "readset": [ diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index a92047b..91074cd 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -20,8 +20,8 @@ def test_create_api(client, run_processing_json, app): assert json.loads(response.data)['id'] == 1 response = client.post(f'project/{project_name}/ingest_run_processing', data=json.dumps(run_processing_json)) assert response.status_code == 200 - assert json.loads(response.data)[0]['name'] == "run_processing" - assert json.loads(response.data)[0]['id'] == 1 + assert json.loads(response.data)["DB_ACTION_OUTPUT"][0]['name'] == "run_processing" + assert json.loads(response.data)["DB_ACTION_OUTPUT"][0]['id'] == 1 with app.app_context(): s = database.get_session() @@ -31,28 +31,28 @@ def test_create(not_app_db, run_processing_json, transfer_json, genpipes_json): db_action.create_project(project_name, session=not_app_db) project_id = db_action.name_to_id("Project", project_name, session=not_app_db) - [run_processing_operation, run_processing_job] = db_action.ingest_run_processing(project_id, run_processing_json, not_app_db) + run_processing_out = db_action.ingest_run_processing(project_id, run_processing_json, not_app_db) - assert isinstance(run_processing_operation, model.Operation) - assert isinstance(run_processing_job, model.Job) + assert isinstance(run_processing_out["DB_ACTION_OUTPUT"][0], model.Operation) + # assert isinstance(run_processing_job, model.Job) assert not_app_db.scalars(select(model.Project)).first().name == project_name - for patient_json in run_processing_json[vb.PATIENT]: - assert not_app_db.scalars(select(model.Patient).where(model.Patient.name == patient_json[vb.PATIENT_NAME])).first().name == patient_json[vb.PATIENT_NAME] - for sample_json in patient_json[vb.SAMPLE]: + for specimen_json in run_processing_json[vb.SPECIMEN]: + assert not_app_db.scalars(select(model.Specimen).where(model.Specimen.name == specimen_json[vb.SPECIMEN_NAME])).first().name == specimen_json[vb.SPECIMEN_NAME] + for sample_json in specimen_json[vb.SAMPLE]: assert not_app_db.scalars(select(model.Sample).where(model.Sample.name == sample_json[vb.SAMPLE_NAME])).first().name == sample_json[vb.SAMPLE_NAME] for readset_json in sample_json[vb.READSET]: assert not_app_db.scalars(select(model.Readset).where(model.Readset.name == readset_json[vb.READSET_NAME])).first().name == readset_json[vb.READSET_NAME] - [transfer_operation, transfer_job] = db_action.ingest_transfer(project_id, transfer_json, not_app_db) + transfer_out = db_action.ingest_transfer(project_id, transfer_json, not_app_db) - assert isinstance(transfer_operation, model.Operation) - assert isinstance(transfer_job, model.Job) + assert isinstance(transfer_out["DB_ACTION_OUTPUT"][0], model.Operation) + # assert isinstance(transfer_job, model.Job) - [genpipes_operation, genpipes_jobs] = db_action.ingest_genpipes(project_id, genpipes_json, not_app_db) + genpipes_out = db_action.ingest_genpipes(project_id, genpipes_json, not_app_db) - assert isinstance(genpipes_operation, model.Operation) - for job in genpipes_jobs: - assert isinstance(job, model.Job) + assert isinstance(genpipes_out["DB_ACTION_OUTPUT"][0], model.Operation) + # for job in genpipes_jobs: + # assert isinstance(job, model.Job) # assert 1 == 2 diff --git a/tests/test_model.py b/tests/test_model.py index 9f36320..41e1e0a 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -13,7 +13,7 @@ def test_add_model(not_app_db, pre_filled_model): db.commit() assert not_app_db.query(model.Project).one().name == entry_dict["project_name"] - assert not_app_db.query(model.Patient).one().name == entry_dict["pa_name"] + assert not_app_db.query(model.Specimen).one().name == entry_dict["pa_name"] assert not_app_db.query(model.Operation).one().name == entry_dict["op_name"] m1 = not_app_db.query(model.Metric).first() r1 = m1.readsets[0] diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 473c8c9..aec9230 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -28,8 +28,8 @@ def test_serialization(not_app_db): project=project) exp = model.Experiment(nucleic_acid_type=model.NucleicAcidTypeEnum.DNA) - pa = model.Patient(name=pa_name, project=project) - sa = model.Sample(name=sa_name, patient=pa) + pa = model.Specimen(name=pa_name, project=project) + sa = model.Sample(name=sa_name, specimen=pa) ru = model.Run(instrument=instrument, name=ru_name) re1 = model.Readset(name=re1_name, sample=sa, experiment=exp, run=ru) re2 = model.Readset(name=re2_name, sample=sa, experiment=exp, run=ru)