From 67fd5129ddcbba93893c77f457fe369a8ac73541 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 11 Sep 2024 16:12:40 -0400 Subject: [PATCH 1/3] small cleaning and python support --- .github/workflows/run-pytest.yml | 2 +- README.md | 44 +++++++++- docs/img/geofetch_logo.svg | 141 ++++++++++++++++++++++++++++++ geofetch/__init__.py | 7 +- geofetch/__main__.py | 1 + geofetch/_version.py | 2 +- geofetch/cli.py | 2 + geofetch/finder.py | 25 +++--- geofetch/geofetch.py | 66 +++++++------- geofetch/sraconvert.py | 7 +- geofetch/utils.py | 9 +- mkdocs.yml | 40 --------- requirements/requirements-all.txt | 2 +- setup.py | 5 +- tests/test_geofetch.py | 7 +- 15 files changed, 255 insertions(+), 105 deletions(-) create mode 100644 docs/img/geofetch_logo.svg delete mode 100644 mkdocs.yml diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 489f0f6..1f6ffb6 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ["3.8", "3.12"] + python-version: ["3.8", "3.13"] os: [ubuntu-latest] steps: diff --git a/README.md b/README.md index 0b4d38d..d814bcd 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/geofetch/README.html) -`geofetch` is a command-line tool that downloads sequencing data and metadata from GEO and SRA and creates [standard PEPs](https://pep.databio.org/). `geofetch` is hosted at [pypi](https://pypi.org/project/geofetch/). You can convert the result of geofetch into unmapped `bam` or `fastq` files with the included `sraconvert` command. 
+**geofetch** is a command-line tool that downloads sequencing data and metadata from GEO and SRA and creates metadata tables in [standard PEP format](https://pep.databio.org/). `geofetch` is hosted at [pypi](https://pypi.org/project/geofetch/). You can convert the result of geofetch into unmapped `bam` or `fastq` files with the included `sraconvert` command. ## Key geofetch features: @@ -20,4 +20,44 @@ - Can search GEO to find relevant data - Can be used either as a command-line tool or from within Python using an API -For more information, see [complete documentation at geofetch.databio.org](http://geofetch.databio.org) (source in the [/docs](/docs) folder). +## Docs + +--- + +**Documentation**: https://pep.databio.org/geofetch/ + +**Source Code**: https://github.com/pepkit/geofetch/ + +--- + + +## Installation +To install `geofetch` use this command: +``` +pip install geofetch +``` +or install the latest version from the GitHub repository: +``` +pip install git+https://github.com/pepkit/geofetch.git +``` + + +## How to cite: +https://doi.org/10.1093/bioinformatics/btad069 +```bibtex +@article{10.1093/bioinformatics/btad069, + author = {Khoroshevskyi, Oleksandr and LeRoy, Nathan and Reuter, Vincent P and Sheffield, Nathan C}, + title = "{GEOfetch: a command-line tool for downloading data and standardized metadata from GEO and SRA}", + journal = {Bioinformatics}, + volume = {39}, + number = {3}, + pages = {btad069}, + year = {2023}, + month = {03}, + abstract = "{The Gene Expression Omnibus has become an important source of biological data for secondary analysis. However, there is no simple, programmatic way to download data and metadata from Gene Expression Omnibus (GEO) in a standardized annotation format. To address this, we present GEOfetch—a command-line tool that downloads and organizes data and metadata from GEO and SRA. 
GEOfetch formats the downloaded metadata as a Portable Encapsulated Project, providing universal format for the reanalysis of public data.GEOfetch is available on Bioconda and the Python Package Index (PyPI).}", + issn = {1367-4811}, + doi = {10.1093/bioinformatics/btad069}, + url = {https://doi.org/10.1093/bioinformatics/btad069}, + eprint = {https://academic.oup.com/bioinformatics/article-pdf/39/3/btad069/49407404/btad069.pdf}, +} +``` \ No newline at end of file diff --git a/docs/img/geofetch_logo.svg b/docs/img/geofetch_logo.svg new file mode 100644 index 0000000..372c82c --- /dev/null +++ b/docs/img/geofetch_logo.svg @@ -0,0 +1,141 @@ + + + + + + + + + + image/svg+xml + + + + + + + + + + + + geofetch + + + + + + + + + + + + diff --git a/geofetch/__init__.py b/geofetch/__init__.py index fcd9139..2e3620e 100644 --- a/geofetch/__init__.py +++ b/geofetch/__init__.py @@ -1,12 +1,11 @@ """ Package-level data """ -import logmuse import coloredlogs +import logmuse -from geofetch.geofetch import Geofetcher -from geofetch.finder import Finder from geofetch._version import __version__ - +from geofetch.finder import Finder +from geofetch.geofetch import Geofetcher __author__ = ["Oleksandr Khoroshevskyi", "Vince Reuter", "Nathan Sheffield"] __all__ = ["Finder", "Geofetcher", "__version__"] diff --git a/geofetch/__main__.py b/geofetch/__main__.py index 97e5466..077dace 100644 --- a/geofetch/__main__.py +++ b/geofetch/__main__.py @@ -1,4 +1,5 @@ import sys + from geofetch.geofetch import main if __name__ == "__main__": diff --git a/geofetch/_version.py b/geofetch/_version.py index 8e2394f..6ece8ad 100644 --- a/geofetch/_version.py +++ b/geofetch/_version.py @@ -1 +1 @@ -__version__ = "0.12.6" +__version__ = "0.12.7" diff --git a/geofetch/cli.py b/geofetch/cli.py index 6bb96b7..168b2a3 100644 --- a/geofetch/cli.py +++ b/geofetch/cli.py @@ -1,6 +1,8 @@ import argparse import os + import logmuse + from geofetch._version import __version__ diff --git a/geofetch/finder.py 
b/geofetch/finder.py index e41405e..587e1ae 100644 --- a/geofetch/finder.py +++ b/geofetch/finder.py @@ -1,19 +1,20 @@ +import logging +import os +import re +from datetime import datetime, timedelta + +import coloredlogs +import requests +import xmltodict + from .const import ( - RETMAX, - ETOOLS_GEO_GSE_BASE, - ETOOLS_ENDING, - TODAY_DATE, DATE_FILTER, + ETOOLS_ENDING, + ETOOLS_GEO_GSE_BASE, + RETMAX, THREE_MONTH_FILTER, + TODAY_DATE, ) -import requests -import xmltodict -import re -import os -import logging -import coloredlogs -from datetime import datetime -from datetime import timedelta __author__ = "Oleksandr Khoroshevskyi" diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index bdbd6a1..b490428 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -1,65 +1,65 @@ import copy import csv +import logging import os +import re import sys +import time +from typing import Dict, List, NoReturn, Tuple, Union + +import logmuse +import pandas as pd +import peppy import requests import xmltodict import yaml -import time -import logging - from rich.progress import track -import re -import logmuse from ubiquerg import expandpath -from typing import List, Union, Dict, Tuple, NoReturn -import peppy -import pandas as pd from geofetch.cli import _parse_cmdl from geofetch.const import ( - GSE_PATTERN, - SAMPLE_SUPP_METADATA_FILE, + CONFIG_PROCESSED_TEMPLATE_NAME, + CONFIG_RAW_TEMPLATE_NAME, + CONFIG_SRA_TEMPLATE, EXP_SUPP_METADATA_FILE, - NEW_GENOME_COL_NAME, + EXPERIMENT_PATTERN, FILE_RAW_NAME_SAMPLE_PATTERN, FILE_RAW_NAME_SUBSAMPLE_PATTERN, - CONFIG_RAW_TEMPLATE_NAME, - CONFIG_SRA_TEMPLATE, - CONFIG_PROCESSED_TEMPLATE_NAME, + GSE_PATTERN, + NCBI_EFETCH, + NCBI_ESEARCH, + NEW_GENOME_COL_NAME, NUM_RETRIES, + PROJECT_PATTERN, + SAMPLE_SUPP_METADATA_FILE, SER_SUPP_FILE_PATTERN, SUPP_FILE_PATTERN, - PROJECT_PATTERN, - NCBI_EFETCH, - NCBI_ESEARCH, - EXPERIMENT_PATTERN, ) from geofetch.utils import ( Accession, - build_prefetch_command, - parse_accessions, - 
parse_SOFT_line, - convert_size, - clean_soft_files, - run_subprocess, + _check_file_existance, + _create_dot_yaml, + _dict_to_list_converter, + _filter_gsm, _get_list_of_keys, _get_value, _read_tar_filelist, - _check_file_existance, - _separate_list_of_files, - _update_columns, - _sanitize_name, _sanitize_config_string, - _create_dot_yaml, - _which, - _dict_to_list_converter, - _standardize_colnames, + _sanitize_name, _separate_file_url, - _filter_gsm, + _separate_list_of_files, + _standardize_colnames, _unify_list_keys, + _update_columns, + _which, + build_prefetch_command, + clean_soft_files, + convert_size, gse_content_to_dict, is_prefetch_callable, + parse_accessions, + parse_SOFT_line, + run_subprocess, ) _LOGGER = logging.getLogger(__name__) diff --git a/geofetch/sraconvert.py b/geofetch/sraconvert.py index d2dd3bc..6e64a9a 100755 --- a/geofetch/sraconvert.py +++ b/geofetch/sraconvert.py @@ -1,10 +1,11 @@ #!/usr/bin/env python -from argparse import ArgumentParser import os -import pypiper -import logmuse import sys +from argparse import ArgumentParser + +import logmuse +import pypiper __version__ = "0.1.0" diff --git a/geofetch/utils.py b/geofetch/utils.py index dcab44f..c006e7b 100644 --- a/geofetch/utils.py +++ b/geofetch/utils.py @@ -1,14 +1,15 @@ """ Independently-importable utilities to circumvent true scripts. 
""" +import csv import logging import os +import re import subprocess import sys -import re -import requests from io import StringIO -import csv -from typing import Union, List, NoReturn, Dict +from typing import Dict, List, NoReturn, Union + +import requests _LOGGER = logging.getLogger(__name__) diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index e8bc1b9..0000000 --- a/mkdocs.yml +++ /dev/null @@ -1,40 +0,0 @@ -site_name: Geofetch -site_url: http://code.databio.org/geofetch/ -repo_url: http://github.com/pepkit/geofetch -site_logo: img/geofetch_logo_dark.svg -pypi_name: geofetch -paper_link: https://doi.org/10.1093/bioinformatics/btad069 - -nav: - - Getting started: - - Introduction: README.md - - Install and configure: install.md - - SRA convert: sra_convert.md - - Tutorials: - - Tutorial for processed data: processed-data-downloading.md - - Tutorial for raw data: raw-data-downloading.md - - geofetch from within Python: python-usage.md - - GSE Finder: gse_finder.md - - How-to Guides: - - Specifying samples to download: file-specification.md - - Set SRA data download location: howto-location.md - - Run SRA convert: how_to_convert_fastq_from_sra.md - - Reference: - - Metadata output: metadata_output.md - - Usage: usage.md - - FAQ: faq.md - - Support: http://github.com/pepkit/geofetch/issues - - Contributing: contributing.md - - Changelog: changelog.md - -theme: databio - - -plugins: - - search - - databio: - jupyter_source: "docs_jupyter" - jupyter_build: "docs_jupyter/build" - usage_template: "docs/usage_template.md" - usage_cmds: - -"geofetch --help diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 8cc987b..f9b4326 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -4,6 +4,6 @@ ubiquerg>=0.6.2 requests>=2.28.1 xmltodict>=0.13.0 pandas>=1.5.3 -peppy>=0.40.0 +peppy>=0.40.6 rich>=12.5.1 coloredlogs>=15.0.1 diff --git a/setup.py b/setup.py index 2d180f2..4cd56f2 100644 --- 
a/setup.py +++ b/setup.py @@ -1,9 +1,10 @@ #! /usr/bin/env python import os -from setuptools import setup import sys +from setuptools import setup + PACKAGE = "geofetch" REQDIR = "requirements" @@ -49,6 +50,8 @@ def read_reqs(reqs_name): "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Topic :: Scientific/Engineering :: Bio-Informatics", ], keywords="project, bioinformatics, sequencing, ngs, workflow, GUI", diff --git a/tests/test_geofetch.py b/tests/test_geofetch.py index 5e5d70f..e33fe0e 100644 --- a/tests/test_geofetch.py +++ b/tests/test_geofetch.py @@ -1,11 +1,12 @@ +import os +import shutil + import peppy +import pytest import geofetch from geofetch import Geofetcher, utils from geofetch.utils import parse_accessions -import os -import pytest -import shutil INPUT_ACC_FILE = "tests/test_files/input_acc.txt" GSE_FILES = "tests/test_files/soft_files" From 80175d2161086c4be7dbff5406725c448c18492b Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 11 Sep 2024 16:15:26 -0400 Subject: [PATCH 2/3] updated github actions --- .github/workflows/python-publish.yml | 4 ++-- .github/workflows/run-pytest.yml | 4 ++-- .readthedocs.yaml | 10 ---------- 3 files changed, 4 insertions(+), 14 deletions(-) delete mode 100644 .readthedocs.yaml diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index b120129..59c6af8 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -11,9 +11,9 @@ jobs: permissions: id-token: write steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: '3.x' - name: Install dependencies diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 1f6ffb6..055ea6e 100644 --- 
a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -15,10 +15,10 @@ jobs: os: [ubuntu-latest] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/.readthedocs.yaml b/.readthedocs.yaml deleted file mode 100644 index 69e650b..0000000 --- a/.readthedocs.yaml +++ /dev/null @@ -1,10 +0,0 @@ -version: 2 -build: - os: ubuntu-22.04 - tools: - python: "3.12" -mkdocs: - configuration: mkdocs.yml -python: - install: - - requirements: requirements/requirements-docs.txt From f52e0ceef9a552e48f370cef72326bc42ee7871b Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 11 Sep 2024 16:16:28 -0400 Subject: [PATCH 3/3] updated github actions --- .github/workflows/run-pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 055ea6e..3d158c5 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ["3.8", "3.13"] + python-version: ["3.8", "3.12"] os: [ubuntu-latest] steps: