Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

exit early if no data to process or all NA #141

Merged
merged 5 commits into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 23 additions & 54 deletions src/stepcount/stepcount.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import warnings
import os
import sys
import pathlib
import urllib
import shutil
Expand Down Expand Up @@ -82,28 +83,38 @@ def main():
data, info_read = utils.read(
args.filepath,
usecols=args.txyz,
resample_hz=30 if args.model_type == 'ssl' else None,
start_time=args.start,
end_time=args.end,
sample_rate=args.sample_rate,
resample_hz=30 if args.model_type == 'ssl' else None,
verbose=verbose
)
info.update(info_read)

# Set start/end times, if given
if args.start is not None:
data = data.loc[args.start:]
if args.end is not None:
data = data.loc[:args.end]

# Exclusion: first/last days
if args.exclude_first_last is not None:
data = utils.exclude_first_last_days(data, args.exclude_first_last)
data = utils.drop_first_last_days(data, args.exclude_first_last)

# Exclusion: days with wear time below threshold
if args.exclude_wear_below is not None:
data = utils.exclude_wear_below_days(data, args.exclude_wear_below)

# Summarize wear time
info.update(summarize_wear_time(data))
data = utils.flag_wear_below_days(data, args.exclude_wear_below)

# Update wear time stats after exclusions
info.update(utils.calculate_wear_stats(data))

# If no data, save Info.json and exit
if len(data) == 0 or data.isna().any(axis=1).all(): # TODO: check na only on x,y,z cols?
# Save Info.json
with open(f"{outdir}/{basename}-Info.json", 'w') as f:
json.dump(info, f, indent=4, cls=utils.NpEncoder)
# Print
print("\nSummary\n-------")
print(json.dumps(
{k: v for k, v in info.items() if not re.search(r'_Weekend|_Weekday|_Hour\d{2}', k)},
indent=4, cls=utils.NpEncoder
))
print("No data to process. Exiting early...")
sys.exit(0)

# Run model
if verbose:
Expand Down Expand Up @@ -428,48 +439,6 @@ def load_model(
return joblib.load(pth)


def summarize_wear_time(
    data: pd.DataFrame,
):
    """
    Summarize wear time information from raw accelerometer data.

    A row is treated as non-wear when any of its columns is NaN.

    Parameters:
    - data (pd.DataFrame): A pandas DataFrame of raw accelerometer data with columns 'x', 'y', 'z'
      and a datetime-like index.

    Returns:
    - dict: A dictionary with the keys
        'WearStartTime' / 'WearEndTime': formatted timestamps, or None when there is no valid data;
        'WearTime(days)' / 'NonwearTime(days)': durations, always expressed in days;
        'Covers24hOK': 1 if all 24 hours of the day are present and each has at least 1% valid
        rows, otherwise 0.

    Example:
        summary = summarize_wear_time(data)
    """

    TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
    SECONDS_PER_DAY = 60 * 60 * 24

    # Nothing recorded at all: return before trying to infer a sampling
    # period, since there are no timestamps to infer it from.
    if len(data) == 0:
        return {
            'WearStartTime': None,
            'WearEndTime': None,
            'WearTime(days)': 0.0,
            'NonwearTime(days)': 0.0,
            'Covers24hOK': 0
        }

    dt = utils.infer_freq(data.index).total_seconds()
    na = data.isna().any(axis=1)
    total_time = len(data) * dt / SECONDS_PER_DAY

    if na.all():
        wear_start = None
        wear_end = None
        # The whole recording is non-wear. Report it in days so the value
        # matches the 'NonwearTime(days)' key (it was previously in seconds).
        nonwear_time = total_time
        wear_time = 0.0
        covers24hok = 0
    else:
        wear_start = data.first_valid_index().strftime(TIME_FORMAT)
        wear_end = data.last_valid_index().strftime(TIME_FORMAT)
        nonwear_time = na.sum() * dt / SECONDS_PER_DAY
        wear_time = total_time - nonwear_time
        # Fraction of valid rows for each hour of the day that appears in the index
        coverage = (~na).groupby(na.index.hour).mean()
        covers24hok = int(len(coverage) == 24 and coverage.min() >= 0.01)

    return {
        'WearStartTime': wear_start,
        'WearEndTime': wear_end,
        'WearTime(days)': wear_time,
        'NonwearTime(days)': nonwear_time,
        'Covers24hOK': covers24hok
    }


def summarize_enmo(
data: pd.DataFrame,
adjust_estimates: bool = False
Expand Down
91 changes: 77 additions & 14 deletions src/stepcount/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@
def read(
filepath: str,
usecols: str = 'time,x,y,z',
resample_hz: str = 'uniform',
start_time: str = None,
end_time: str = None,
sample_rate: float = None,
resample_hz: str = 'uniform',
verbose: bool = True
):
"""
Expand Down Expand Up @@ -91,8 +93,6 @@ def read(
"Device": ftype,
"Filesize(MB)": fsize,
"SampleRate": sample_rate,
"StartTime": data.index[0].strftime('%Y-%m-%d %H:%M:%S'),
"EndTime": data.index[-1].strftime('%Y-%m-%d %H:%M:%S')
})

elif ftype in (".cwa", ".gt3x", ".bin"):
Expand All @@ -112,10 +112,73 @@ def read(
if 'ResampleRate' not in info:
info['ResampleRate'] = info['SampleRate']

# Trim the data if start/end times are specified
if start_time is not None:
data = data.loc[start_time:]
if end_time is not None:
data = data.loc[:end_time]

# Update wear stats
info.update(calculate_wear_stats(data))

return data, info


def exclude_wear_below_days(
def calculate_wear_stats(data: pd.DataFrame):
    """
    Calculate wear time and related information from raw accelerometer data.

    A row counts as wear only when none of its columns is NaN. The same rule
    is used for the wear start/end timestamps and for the wear/non-wear
    durations, so the reported values cannot contradict each other.

    Parameters:
    - data (pd.DataFrame): A pandas DataFrame of raw accelerometer data with columns 'x', 'y', 'z' and a DatetimeIndex.

    Returns:
    - dict: A dictionary with the keys
        'StartTime' / 'EndTime': first and last timestamps of the data, or None if the data is empty;
        'WearStartTime' / 'WearEndTime': first and last timestamps of wear rows, or None if there are none;
        'WearTime(days)' / 'NonwearTime(days)': durations in days;
        'Covers24hOK': 1 if all 24 hours of the day are present and each has at least 1% wear
        rows, otherwise 0.

    Example:
        info = calculate_wear_stats(data)
    """

    TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
    SECONDS_PER_DAY = 60 * 60 * 24

    n_data = len(data)

    # Defaults describe an empty recording; overwritten below when data exists.
    start_time = None
    end_time = None
    wear_start_time = None
    wear_end_time = None
    nonwear_duration = 0.0
    wear_duration = 0.0
    covers24hok = 0

    if n_data > 0:
        na = data.isna().any(axis=1)  # TODO: check na only on x,y,z cols?
        dt = infer_freq(data.index).total_seconds()
        start_time = data.index[0].strftime(TIME_FORMAT)
        end_time = data.index[-1].strftime(TIME_FORMAT)

        # DataFrame.first_valid_index()/last_valid_index() accept a row as
        # soon as ANY column is non-NA, which disagrees with `na` above
        # (any NA column => non-wear). Use the same mask for both.
        wear_index = data.index[(~na).to_numpy()]
        if len(wear_index) > 0:
            wear_start_time = wear_index[0].strftime(TIME_FORMAT)
            wear_end_time = wear_index[-1].strftime(TIME_FORMAT)

        nonwear_duration = na.sum() * dt / SECONDS_PER_DAY
        wear_duration = n_data * dt / SECONDS_PER_DAY - nonwear_duration
        # Fraction of wear rows for each hour of the day that appears in the index
        coverage = (~na).groupby(na.index.hour).mean()
        covers24hok = int(len(coverage) == 24 and coverage.min() >= 0.01)

    return {
        'StartTime': start_time,
        'EndTime': end_time,
        'WearStartTime': wear_start_time,
        'WearEndTime': wear_end_time,
        'WearTime(days)': wear_duration,
        'NonwearTime(days)': nonwear_duration,
        'Covers24hOK': covers24hok
    }


def flag_wear_below_days(
x: Union[pd.Series, pd.DataFrame],
min_wear: str = '12H'
):
Expand Down Expand Up @@ -154,34 +217,34 @@ def exclude_wear_below_days(
return x


def exclude_first_last_days(
def drop_first_last_days(
    x: Union[pd.Series, pd.DataFrame],
    first_or_last='both'
):
    """
    Drop the first day, last day, or both from a time series.

    Rows are removed (not set to NaN), so the result can be shorter than the
    input. The input object itself is not modified.

    Parameters:
    - x (pd.Series or pd.DataFrame): A pandas Series or DataFrame with a DatetimeIndex representing time series data.
    - first_or_last (str, optional): A string indicating which days to drop. Options are 'first', 'last', or 'both'. Default is 'both'.
      Any other value leaves the data unchanged.

    Returns:
    - pd.Series or pd.DataFrame: A pandas Series or DataFrame with the rows of the specified days dropped.

    Example:
        # Drop the first day from the series
        series = drop_first_last_days(series, first_or_last='first')
    """
    if len(x) == 0:
        print("No data to drop")
        return x

    # `index.date` builds a fresh array of date objects on every access;
    # compute it once and reuse it for both boundary comparisons.
    dates = x.index.date
    first_day = dates[0]
    last_day = dates[-1]

    if first_or_last == 'first':
        x = x[dates != first_day]
    elif first_or_last == 'last':
        x = x[dates != last_day]
    elif first_or_last == 'both':
        x = x[(dates != first_day) & (dates != last_day)]
    return x


Expand Down
Loading