
Commit

Merge pull request #24 from usnistgov/develop
Develop
kbtriangulum authored Feb 2, 2024
2 parents 471a84f + b23b271 commit 8430da9
Showing 1,243 changed files with 36,349 additions and 33,743 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,4 +1,4 @@
-# SDNist v2.3: Deidentified Data Report Tool
+# SDNist v2.4: Deidentified Data Report Tool

## [SDNist is the official software package for engaging in the NIST Collaborative Research Cycle](https://pages.nist.gov/privacy_collaborative_research_cycle)

@@ -37,7 +37,7 @@ Setting Up the SDNIST Report Tool

### Brief Setup Instructions

-SDNist requires Python version 3.7 or greater. If you have installed a previous version of the SDNist library, we recommend installing v2.3 in a virtual environment. v2.3 can be installed via [Release 2.3](https://github.com/usnistgov/SDNist/releases/tag/v2.3.0) or via the PyPI server: `pip install sdnist` or, if you already have a version installed, `pip install --upgrade sdnist`.
+SDNist requires Python version 3.7 or greater. If you have installed a previous version of the SDNist library, we recommend installing v2.4 in a virtual environment. v2.4 can be installed via [Release 2.4](https://github.com/usnistgov/SDNist/releases/tag/v2.4) or via the PyPI server: `pip install sdnist` or, if you already have a version installed, `pip install --upgrade sdnist`.

The NIST Diverse Community Excerpt data will download on the fly.

11 changes: 7 additions & 4 deletions sdnist/metrics/kmarginal.py
@@ -58,10 +58,13 @@ def __init__(self,
         self.features = self.td.columns.tolist()
         marg_cols = list(set(self.features).difference(['PUMA', 'INDP']))
         marg_cols = sorted(marg_cols)
-        self.marginals = [(f1, f2)
-                          for i, f1 in enumerate(marg_cols)
-                          for j, f2 in enumerate(marg_cols)
-                          if i < j]
+        if len(marg_cols) == 1:
+            self.marginals = [(marg_cols[0], marg_cols[0])]
+        else:
+            self.marginals = [(f1, f2)
+                              for i, f1 in enumerate(marg_cols)
+                              for j, f2 in enumerate(marg_cols)
+                              if i < j]

     def marginal_pairs(self):
         for _ in self.marginals:
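
The new branch covers the degenerate case where only one marginal column remains after excluding PUMA and INDP; previously the `i < j` comprehension yielded an empty pair list. For intuition, the comprehension is equivalent to `itertools.combinations` over the sorted columns; a standalone sketch with hypothetical feature names (not the package's data):

```python
from itertools import combinations

marg_cols = sorted(['AGEP', 'RAC1P', 'SEX'])  # hypothetical feature names

if len(marg_cols) == 1:
    # degenerate single-feature case: pair the lone feature with itself
    marginals = [(marg_cols[0], marg_cols[0])]
else:
    # every unordered feature pair, equivalent to the i < j comprehension
    marginals = list(combinations(marg_cols, 2))

print(marginals)  # [('AGEP', 'RAC1P'), ('AGEP', 'SEX'), ('RAC1P', 'SEX')]
```
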
8 changes: 5 additions & 3 deletions sdnist/metrics/pca.py
@@ -44,7 +44,7 @@ def __init__(self, target: pd.DataFrame, synthetic: pd.DataFrame,):
         self.comp_df = None

     def compute_pca(self):
-        cc = 5
+        cc = 5 if self.tar.shape[1] > 5 else self.tar.shape[1]
         t_pca = PCA(n_components=cc)

         tdf_v = self.tar.values
@@ -185,10 +185,12 @@ def plot_all_components_pairs(title: str,
         plt.close(fig)

     fig, ax = plt.subplots(cc, cc, figsize=(6, 6))

     for i, pc_i in enumerate(d.columns):
         for j, pc_j in enumerate(d.columns):
-            ax_t = ax[i, j]
+            if cc == 1:
+                ax_t = ax
+            else:
+                ax_t = ax[i, j]
             if pc_i == pc_j:
                 ax_t.text(0.5, 0.5, pc_i,
                           ha='center', va='center', color='black', fontsize=30)
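
Both pca.py changes guard the five-component default against tables with fewer than five features. A minimal standalone sketch (random data and assumed shapes, not the package's pipeline) of the two failure modes being avoided: scikit-learn's PCA requires n_components <= min(n_samples, n_features), and `plt.subplots(1, 1)` returns a single Axes rather than a 2-D array:

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

X = np.random.rand(100, 3)            # a table with only 3 numeric features
cc = min(5, X.shape[1])               # cap: PCA needs n_components <= n_features
components = PCA(n_components=cc).fit_transform(X)

fig, ax = plt.subplots(cc, cc, figsize=(6, 6))
# when cc == 1, plt.subplots returns a single Axes object, not a 2-D array,
# so ax[i, j] would raise a TypeError without the guard
ax_t = ax if cc == 1 else ax[0, 0]
```
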
9 changes: 6 additions & 3 deletions sdnist/metrics/unique_exact_matches.py
@@ -26,10 +26,13 @@ def unique_exact_matches(target_data: pd.DataFrame, deidentified_data: pd.DataFr
     # number of unique target records that exactly match in deidentified data
     t_rec_matched = merged.shape[0]

-    # percent of unique target records that exactly match in deidentified data
-    perc_t_rec_matched = t_rec_matched/t_unique_records * 100
+    if t_unique_records > 0:
+        # percent of unique target records that exactly match in deidentified data
+        perc_t_rec_matched = t_rec_matched/t_unique_records * 100

-    perc_t_rec_matched = round(perc_t_rec_matched, 2)
+        perc_t_rec_matched = round(perc_t_rec_matched, 2)
+    else:
+        perc_t_rec_matched = 0

     return t_rec_matched, perc_t_rec_matched, t_unique_records, perc_t_unique_records

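
The guard prevents a ZeroDivisionError when the target data contains no unique records. As background, a toy illustration of the counting that precedes it (invented two-column frames; the merge-based matching is an assumption consistent with the `merged.shape[0]` line above):

```python
import pandas as pd

target = pd.DataFrame({'A': [1, 1, 2], 'B': ['x', 'x', 'y']})
deid = pd.DataFrame({'A': [2, 3], 'B': ['y', 'z']})

# records occurring exactly once in the target data
unique_target = target.drop_duplicates(keep=False)
t_unique_records = unique_target.shape[0]                    # 1

# unique target records that also appear in the deidentified data
merged = unique_target.merge(deid.drop_duplicates(), how='inner')
t_rec_matched = merged.shape[0]                              # 1: the (2, 'y') record

# the new guard: avoid dividing by zero when nothing in target is unique
perc = round(t_rec_matched / t_unique_records * 100, 2) if t_unique_records > 0 else 0
```
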
2 changes: 1 addition & 1 deletion sdnist/report/dataset/__init__.py
@@ -66,7 +66,7 @@ def feature_space_size(target_df: pd.DataFrame, data_dict: Dict):
                    'DEAR']:
             size = size * len(data_dict[col]['values'])
         elif col in ['PUMA', 'DENSITY']:
-            size = size * len(target_df['PUMA'].unique())
+            size = size * len(target_df[col].unique())
         elif col in ['NOC', 'NPF', 'INDP']:
             size = size * len(target_df[col].unique())
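
The fix makes DENSITY contribute its own cardinality rather than PUMA's. In spirit, the function multiplies per-feature cardinalities to get the size of the joint feature space; a condensed sketch (hypothetical helper, not the function's full branching):

```python
import pandas as pd

def feature_space_size_sketch(target_df: pd.DataFrame) -> int:
    # product of each feature's number of distinct values
    size = 1
    for col in target_df.columns:
        size *= target_df[col].nunique()
    return size

df = pd.DataFrame({'PUMA': ['a', 'b', 'a'], 'DENSITY': [1, 2, 2]})
print(feature_space_size_sketch(df))  # 2 * 2 = 4
```
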
3 changes: 2 additions & 1 deletion sdnist/report/dataset/binning.py
@@ -32,6 +32,7 @@ def percentile_rank_synthetic(synthetic: pd.DataFrame,
         if f not in to.columns.tolist():
             continue
         nna_mask = s[~s[f].isin(['N'])].index  # not na mask
+
         st = pd.DataFrame(pd.to_numeric(s.loc[nna_mask, f]).astype(int), columns=[f])
         final_st = st.copy()
         max_b = 0
@@ -50,7 +51,7 @@ def percentile_rank_synthetic(synthetic: pd.DataFrame,
             else:
                 min_b = max_b
                 final_st.loc[(st[f] > min_b), f] = b
-        s.loc[nna_mask, f] = final_st
+        s.loc[nna_mask, f] = final_st[f]
     return s


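
The change assigns the column Series `final_st[f]` instead of the one-column DataFrame `final_st`. Pandas aligns a DataFrame assignment on both index and columns, which can leave the target slice unfilled or raise, depending on the pandas version; a small sketch of the working form (toy frame, hypothetical values):

```python
import pandas as pd

s = pd.DataFrame({'AGEP': ['N', '25', '40']})
nna_mask = s[~s['AGEP'].isin(['N'])].index               # rows 1 and 2
final_st = pd.DataFrame({'AGEP': [1, 2]}, index=nna_mask)

# assigning the Series aligns on the row index and fills the column as intended
s.loc[nna_mask, 'AGEP'] = final_st['AGEP']
print(s)  # AGEP column becomes ['N', 1, 2]
```
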
21 changes: 12 additions & 9 deletions sdnist/report/dataset/validate.py
@@ -26,9 +26,9 @@ def console_out(text: str):
"nan_records": len(nan_df),
"nan_features": nan_features
}
sd = sd.dropna()
console_out(f'Found {len(nan_df)} records with NaN values. '
f'Removed records with NaN values.')
# sd = sd.dropna()
# console_out(f'Found {len(nan_df)} records with NaN values. '
# f'Removed records with NaN values.')

     for f in features:
         # check feature has out of bound value
@@ -43,7 +43,8 @@ def console_out(text: str):
         fd.loc[mask, f] = pd.to_numeric(fd.loc[mask, f], errors="coerce")
         nans = fd[fd.isna().any(axis=1)]
         if len(nans):
-            vob_vals = list(set(synth_data.loc[nans.index, f].values.tolist()))
+            vob_vals = list(set([str(v)
+                                 for v in synth_data.loc[nans.index, f].values.tolist()]))
             vob_features.append((f, vob_vals))
             console_out(f'Value out of bound for feature {f}, '
                         f'out of bound values: {vob_vals}. '
@@ -62,7 +63,8 @@ def console_out(text: str):
         f_unique = sd['PUMA'].unique().tolist()
         v_intersect = set(f_unique).intersection(set(f_vals))
         if len(v_intersect) < len(f_unique):
-            vob_vals = list(set(f_unique).difference(v_intersect))
+            vob_vals = list(set([str(v) for v in
+                                 list(set(f_unique).difference(v_intersect))]))
             vob_features.append((f, vob_vals))
             console_out(f'Value out of bound for feature {f}, '
                         f'out of bound values: {vob_vals}. Dropping feature from evaluation.')
@@ -73,7 +75,8 @@ def console_out(text: str):
         nans = fd[fd.isna().any(axis=1)]
         vob_vals = []
         if len(nans):
-            vob_vals.extend(list(set(synth_data.loc[nans.index, f].values.tolist())))
+            vob_vals.extend(list(set([str(v)
+                                      for v in synth_data.loc[nans.index, f].values.tolist()])))
             fd = fd.dropna()

         mask = fd[fd[f] != 'N'].index if has_N else fd.index
@@ -87,7 +90,8 @@ def console_out(text: str):
             real_vals = [int(v) for v in real_vals]
         v_intersect = set(f_unique).intersection(set(real_vals))
         if len(v_intersect) < len(f_unique):
-            vob_vals.extend(list(set(f_unique).difference(v_intersect)))
+            vob_vals.extend(set([str(v)
+                                 for v in list(set(f_unique).difference(v_intersect))]))

         if len(vob_vals):
             vob_features.append((f, vob_vals))
@@ -102,11 +106,10 @@ def console_out(text: str):

         if len(vob_features):
             last_vob_f, vob_vals = vob_features[-1]
-
             if last_vob_f == f:
                 sd = sd.loc[:, sd.columns != f]
                 if has_nan:
-                    vob_features = (last_vob_f, ['nan'] + vob_vals)
+                    vob_features[-1] = (last_vob_f, ['nan'] + list(set(vob_vals)))

     validation_log['values_out_of_bound'] = dict(vob_features)
     return sd, validation_log
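
The recurring pattern across these hunks stringifies out-of-bound values before deduplicating, so mixed-type offenders (e.g. 3 vs '3') report consistently and the validation log stays serializable; the first hunk also stops dropping NaN records, leaving them logged instead, and the final hunk fixes a bug where a bare tuple overwrote the whole vob_features list rather than updating its last entry. A toy sketch of the stringify-then-dedupe step (invented values):

```python
# hypothetical out-of-bound values collected from a synthetic column
raw_vals = [3, '3', 'abc', 3.0]

# casting to str before deduplicating collapses equal-looking values and
# keeps the validation log JSON-serializable
vob_vals = sorted(set(str(v) for v in raw_vals))
print(vob_vals)  # ['3', '3.0', 'abc']
```
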
