Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add extra args to collate to handle edge cases [WIP] #302

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 88 additions & 7 deletions pycytominer/cyto_utils/collate.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ def collate(
add_image_features=True,
image_feature_categories=["Granularity", "Texture", "ImageQuality", "Threshold"],
printtoscreen=True,
append_metadata=False,
overwrite_metadata=False,
download_flags=[],
upload_flags=[],
):
"""Collate the CellProfiler-created CSVs into a single SQLite file by calling cytominer-database

Expand All @@ -49,22 +53,30 @@ def collate(
An existing column to be explicitly copied to a new column called Metadata_Plate if no Metadata_Plate column already explicitly exists
munge : bool, default False
Whether munge should be passed to cytominer-database, if True cytominer-database will expect a single all-object CSV; it will split each object into its own table
csv_dir : str, default 'analysis'
csv_dir : str, default "analysis"
The directory under the base directory where the analysis CSVs will be found. If running the analysis pipeline, this should nearly always be "analysis"
aws_remote : str, optional, default None
A remote AWS prefix, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run
aggregate_only : bool, default False
Whether to perform only the aggregation of existent SQLite files and bypass previous collation steps
tmp_dir: str, default '/tmp'
tmp_dir: str, default "/tmp"
The temporary directory to be used by cytominer-databases for output
overwrite: bool, optional, default False
Whether or not to overwrite an sqlite that exists in the temporary directory if it already exists
add_image_features: bool, optional, default True
Whether or not to add the image features to the profiles
image_feature_categories: list, optional, default ['Granularity','Texture','ImageQuality','Count','Threshold']
image_feature_categories: list, optional, default ["Granularity","Texture","ImageQuality","Count","Threshold"]
The list of image feature groups to be used by add_image_features during aggregation
printtoscreen: bool, optional, default True
Whether or not to print output to the terminal
append_metadata: bool, optional, default False
Whether or not to impute missing Metadata_Plate, Metadata_Well, and Metadata_Site columns in each per-site Image.csv (from its folder name) before ingestion
overwrite_metadata: bool, optional, default False
Whether or not to replace any existing Metadata_Plate, Metadata_Well, and Metadata_Site columns in each per-site Image.csv with values imputed from its folder name
download_flags: list, optional, default []
Extra flags to pass to the aws s3 download commands
upload_flags: list, optional, default []
Extra flags to pass to the aws s3 upload commands
"""

from pycytominer.cyto_utils.cells import SingleCells
Expand Down Expand Up @@ -98,11 +110,30 @@ def collate(

remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv"

sync_cmd = f"aws s3 sync --exclude * --include */Cells.csv --include */Nuclei.csv --include */Cytoplasm.csv --include */Image.csv {remote_input_dir} {input_dir}"
sync_cmd = [
"aws",
"s3",
"sync",
"--exclude",
"*",
"--include",
"*/Cells.csv",
"--include",
"*/Nuclei.csv",
"--include",
"*/Cytoplasm.csv",
"--include",
"*/Image.csv",
remote_input_dir,
input_dir,
] + download_flags
if printtoscreen:
print(f"Downloading CSVs from {remote_input_dir} to {input_dir}")
run_check_errors(sync_cmd)

if overwrite_metadata or append_metadata:
find_and_fix_metadata(input_dir, overwrite=overwrite_metadata)

ingest_cmd = [
"cytominer-database",
"ingest",
Expand Down Expand Up @@ -159,7 +190,13 @@ def collate(
if aws_remote:
if printtoscreen:
print(f"Uploading {cache_backend_file} to {remote_backend_file}")
cp_cmd = ["aws", "s3", "cp", cache_backend_file, remote_backend_file]
cp_cmd = [
"aws",
"s3",
"cp",
cache_backend_file,
remote_backend_file,
] + upload_flags
run_check_errors(cp_cmd)

if printtoscreen:
Expand All @@ -182,7 +219,7 @@ def collate(

remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv"

cp_cmd = ["aws", "s3", "cp", remote_backend_file, backend_file]
cp_cmd = ["aws", "s3", "cp", remote_backend_file, backend_file] + download_flags
if printtoscreen:
print(
f"Downloading SQLite files from {remote_backend_file} to {backend_file}"
Expand All @@ -208,11 +245,55 @@ def collate(
if aws_remote:
if printtoscreen:
print(f"Uploading {aggregated_file} to {remote_aggregated_file}")
csv_cp_cmd = ["aws", "s3", "cp", aggregated_file, remote_aggregated_file]
csv_cp_cmd = [
"aws",
"s3",
"cp",
aggregated_file,
remote_aggregated_file,
] + upload_flags
run_check_errors(csv_cp_cmd)

if printtoscreen:
print(f"Removing backend files from {backend_dir}")
import shutil

shutil.rmtree(backend_dir)


def find_and_fix_metadata(path_to_plate_folder, overwrite=False):
    """Repair the metadata columns of every per-site Image.csv under a plate folder.

    Parameters
    ----------
    path_to_plate_folder : str
        Directory whose immediate children are per-site folders, each of
        which may contain an ``Image.csv``.
    overwrite : bool, default False
        Passed through to ``append_metadata``; when True existing metadata
        columns are replaced rather than only added when missing.
    """
    for entry in os.listdir(path_to_plate_folder):
        candidate = os.path.join(path_to_plate_folder, entry, "Image.csv")
        # Entries without an Image.csv (files, unrelated folders) are skipped.
        if os.path.exists(candidate):
            append_metadata(candidate, overwrite)


def append_metadata(path_to_csv, overwrite=False):
    """Impute Metadata_Plate/Well/Site columns in a per-site Image.csv.

    The plate, well, and site names are parsed from the CSV's parent folder
    name, which is assumed to follow the ``<plate>-<well>-<site>`` naming
    convention (the plate name itself may contain hyphens).

    Parameters
    ----------
    path_to_csv : str
        Path to an Image.csv whose parent directory name encodes the metadata.
    overwrite : bool, default False
        If True, drop any pre-existing metadata columns and re-create them
        from the folder name; if False, only add the ones that are missing.
    """
    import os

    import pandas as pd

    # Parse "<plate>-<well>-<site>" from the parent folder name. Use os.path
    # instead of splitting on "/" so this also works with Windows separators.
    folder_name = os.path.basename(os.path.dirname(os.path.abspath(path_to_csv)))
    tokens = folder_name.split("-")
    plate = "-".join(tokens[:-2])
    well = tokens[-2]
    site = tokens[-1]

    df = pd.read_csv(path_to_csv)
    metadata_columns = ("Metadata_Plate", "Metadata_Well", "Metadata_Site")
    edited = False

    if overwrite:
        present = [col for col in metadata_columns if col in df.columns]
        if present:
            # Only mark the file dirty when a drop actually happened, so an
            # already-clean CSV is not needlessly rewritten.
            df.drop(columns=present, inplace=True)
            edited = True

    # Insert the metadata just before the first per-module bookkeeping column
    # when present; otherwise append at the end instead of raising ValueError.
    columns = list(df.columns)
    if "ModuleError_01LoadData" in columns:
        insertion_index = columns.index("ModuleError_01LoadData")
    else:
        insertion_index = len(columns)

    # Each insert lands at the same index, so the final column order is
    # Site, Well, Plate — matching the original implementation.
    for name, value in (
        ("Metadata_Plate", plate),
        ("Metadata_Well", well),
        ("Metadata_Site", site),
    ):
        if name not in df.columns:
            df.insert(insertion_index, name, value)
            edited = True

    if edited:
        df.to_csv(path_to_csv, index=False)
36 changes: 36 additions & 0 deletions pycytominer/cyto_utils/collate_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,38 @@
help="Whether to print status updates",
)

# Flags controlling metadata imputation in the per-site Image.csv files.
parser.add_argument(
    "--append-metadata",
    dest="append_metadata",
    action="store_true",
    default=False,
    help="Whether or not to add imputed plate, well, and/or site metadata if it's missing",
)

parser.add_argument(
    "--overwrite-metadata",
    dest="overwrite_metadata",
    action="store_true",
    default=False,
    help="Whether or not to add imputed plate, well, and/or site metadata, overwriting what's already there",
)

# Comma-separated flag lists. Split on commas, but never drop a single flag
# passed without a comma: the previous converter returned [] whenever the
# value contained no comma, silently discarding e.g. "--download-flags --quiet".
parser.add_argument(
    "--download-flags",
    dest="download_flags",
    type=lambda s: s.split(",") if s else [],
    default=[],
    help="Extra flags to pass to aws download commands. Multiple values can be passed in if comma separated with no spaces between them",
)

parser.add_argument(
    "--upload-flags",
    dest="upload_flags",
    type=lambda s: s.split(",") if s else [],
    default=[],
    help="Extra flags to pass to aws upload commands. Multiple values can be passed in if comma separated with no spaces between them",
)

args = parser.parse_args()

collate(
Expand All @@ -94,4 +126,8 @@
add_image_features=args.add_image_features,
image_feature_categories=args.image_feature_categories,
printtoscreen=args.printtoscreen,
append_metadata=args.append_metadata,
overwrite_metadata=args.overwrite_metadata,
download_flags=args.download_flags,
upload_flags=args.upload_flags,
)