
Commit

Merge pull request #24 from usnistgov/develop
Develop
kbtriangulum authored Feb 2, 2024
2 parents 471a84f + b23b271 commit 8430da9
Showing 1,243 changed files with 36,349 additions and 33,743 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,4 +1,4 @@
-# SDNist v2.3: Deidentified Data Report Tool
+# SDNist v2.4: Deidentified Data Report Tool

## [SDNist is the official software package for engaging in the NIST Collaborative Research Cycle](https://pages.nist.gov/privacy_collaborative_research_cycle)

@@ -37,7 +37,7 @@ Setting Up the SDNIST Report Tool

### Brief Setup Instructions

-SDNist requires Python version 3.7 or greater. If you have installed a previous version of the SDNist library, we recommend installing v2.3 in a virtual environment. v2.3 can be installed via [Release 2.3](https://github.com/usnistgov/SDNist/releases/tag/v2.3.0) or via the PyPI server: `pip install sdnist` or, if you already have a version installed, `pip install --upgrade sdnist`.
+SDNist requires Python version 3.7 or greater. If you have installed a previous version of the SDNist library, we recommend installing v2.4 in a virtual environment. v2.4 can be installed via [Release 2.4](https://github.com/usnistgov/SDNist/releases/tag/v2.4) or via the PyPI server: `pip install sdnist` or, if you already have a version installed, `pip install --upgrade sdnist`.

The NIST Diverse Community Excerpt data will download on the fly.

11 changes: 7 additions & 4 deletions sdnist/metrics/kmarginal.py
@@ -58,10 +58,13 @@ def __init__(self,
         self.features = self.td.columns.tolist()
         marg_cols = list(set(self.features).difference(['PUMA', 'INDP']))
         marg_cols = sorted(marg_cols)
-        self.marginals = [(f1, f2)
-                          for i, f1 in enumerate(marg_cols)
-                          for j, f2 in enumerate(marg_cols)
-                          if i < j]
+        if len(marg_cols) == 1:
+            self.marginals = [(marg_cols[0], marg_cols[0])]
+        else:
+            self.marginals = [(f1, f2)
+                              for i, f1 in enumerate(marg_cols)
+                              for j, f2 in enumerate(marg_cols)
+                              if i < j]

     def marginal_pairs(self):
         for _ in self.marginals:
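
The new branch covers the degenerate case where only one marginal column remains after excluding PUMA and INDP; previously the `i < j` comprehension yielded an empty pair list. For intuition, the comprehension is equivalent to `itertools.combinations` over the sorted columns; a standalone sketch with hypothetical feature names (not the package's data):

```python
from itertools import combinations

marg_cols = sorted(['AGEP', 'RAC1P', 'SEX'])  # hypothetical feature names

if len(marg_cols) == 1:
    # degenerate single-feature case: pair the lone feature with itself
    marginals = [(marg_cols[0], marg_cols[0])]
else:
    # every unordered feature pair, equivalent to the i < j comprehension
    marginals = list(combinations(marg_cols, 2))

print(marginals)  # [('AGEP', 'RAC1P'), ('AGEP', 'SEX'), ('RAC1P', 'SEX')]
```
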
8 changes: 5 additions & 3 deletions sdnist/metrics/pca.py
@@ -44,7 +44,7 @@ def __init__(self, target: pd.DataFrame, synthetic: pd.DataFrame,):
         self.comp_df = None

     def compute_pca(self):
-        cc = 5
+        cc = 5 if self.tar.shape[1] > 5 else self.tar.shape[1]
         t_pca = PCA(n_components=cc)

         tdf_v = self.tar.values
@@ -185,10 +185,12 @@ def plot_all_components_pairs(title: str,
         plt.close(fig)

     fig, ax = plt.subplots(cc, cc, figsize=(6, 6))

     for i, pc_i in enumerate(d.columns):
         for j, pc_j in enumerate(d.columns):
-            ax_t = ax[i, j]
+            if cc == 1:
+                ax_t = ax
+            else:
+                ax_t = ax[i, j]
             if pc_i == pc_j:
                 ax_t.text(0.5, 0.5, pc_i,
                           ha='center', va='center', color='black', fontsize=30)
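
Both pca.py changes guard the five-component default against tables with fewer than five features. A minimal standalone sketch (random data and assumed shapes, not the package's pipeline) of the two failure modes being avoided: scikit-learn's PCA requires n_components <= min(n_samples, n_features), and `plt.subplots(1, 1)` returns a single Axes rather than a 2-D array:

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

X = np.random.rand(100, 3)            # a table with only 3 numeric features
cc = min(5, X.shape[1])               # cap: PCA needs n_components <= n_features
components = PCA(n_components=cc).fit_transform(X)

fig, ax = plt.subplots(cc, cc, figsize=(6, 6))
# when cc == 1, plt.subplots returns a single Axes object, not a 2-D array,
# so ax[i, j] would raise a TypeError without the guard
ax_t = ax if cc == 1 else ax[0, 0]
```
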
9 changes: 6 additions & 3 deletions sdnist/metrics/unique_exact_matches.py
@@ -26,10 +26,13 @@ def unique_exact_matches(target_data: pd.DataFrame, deidentified_data: pd.DataFr
     # number of unique target records that exactly match in deidentified data
     t_rec_matched = merged.shape[0]

-    # percent of unique target records that exactly match in deidentified data
-    perc_t_rec_matched = t_rec_matched/t_unique_records * 100
+    if t_unique_records > 0:
+        # percent of unique target records that exactly match in deidentified data
+        perc_t_rec_matched = t_rec_matched/t_unique_records * 100

-    perc_t_rec_matched = round(perc_t_rec_matched, 2)
+        perc_t_rec_matched = round(perc_t_rec_matched, 2)
+    else:
+        perc_t_rec_matched = 0

     return t_rec_matched, perc_t_rec_matched, t_unique_records, perc_t_unique_records

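
The guard prevents a ZeroDivisionError when the target data contains no unique records. As background, a toy illustration of the counting that precedes it (invented two-column frames; the merge-based matching is an assumption consistent with the `merged.shape[0]` line above):

```python
import pandas as pd

target = pd.DataFrame({'A': [1, 1, 2], 'B': ['x', 'x', 'y']})
deid = pd.DataFrame({'A': [2, 3], 'B': ['y', 'z']})

# records occurring exactly once in the target data
unique_target = target.drop_duplicates(keep=False)
t_unique_records = unique_target.shape[0]                    # 1

# unique target records that also appear in the deidentified data
merged = unique_target.merge(deid.drop_duplicates(), how='inner')
t_rec_matched = merged.shape[0]                              # 1: the (2, 'y') record

# the new guard: avoid dividing by zero when nothing in target is unique
perc = round(t_rec_matched / t_unique_records * 100, 2) if t_unique_records > 0 else 0
```
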
2 changes: 1 addition & 1 deletion sdnist/report/dataset/__init__.py
@@ -66,7 +66,7 @@ def feature_space_size(target_df: pd.DataFrame, data_dict: Dict):
                    'DEAR']:
             size = size * len(data_dict[col]['values'])
         elif col in ['PUMA', 'DENSITY']:
-            size = size * len(target_df['PUMA'].unique())
+            size = size * len(target_df[col].unique())
         elif col in ['NOC', 'NPF', 'INDP']:
             size = size * len(target_df[col].unique())
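
The fix makes DENSITY contribute its own cardinality rather than PUMA's. In spirit, the function multiplies per-feature cardinalities to get the size of the joint feature space; a condensed sketch (hypothetical helper, not the function's full branching):

```python
import pandas as pd

def feature_space_size_sketch(target_df: pd.DataFrame) -> int:
    # product of each feature's number of distinct values
    size = 1
    for col in target_df.columns:
        size *= target_df[col].nunique()
    return size

df = pd.DataFrame({'PUMA': ['a', 'b', 'a'], 'DENSITY': [1, 2, 2]})
print(feature_space_size_sketch(df))  # 2 * 2 = 4
```
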
3 changes: 2 additions & 1 deletion sdnist/report/dataset/binning.py
@@ -32,6 +32,7 @@ def percentile_rank_synthetic(synthetic: pd.DataFrame,
         if f not in to.columns.tolist():
             continue
         nna_mask = s[~s[f].isin(['N'])].index  # not na mask
+
         st = pd.DataFrame(pd.to_numeric(s.loc[nna_mask, f]).astype(int), columns=[f])
         final_st = st.copy()
         max_b = 0
@@ -50,7 +51,7 @@ def percentile_rank_synthetic(synthetic: pd.DataFrame,
             else:
                 min_b = max_b
                 final_st.loc[(st[f] > min_b), f] = b
-        s.loc[nna_mask, f] = final_st
+        s.loc[nna_mask, f] = final_st[f]
     return s


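
The change assigns the column Series `final_st[f]` instead of the one-column DataFrame `final_st`. Pandas aligns a DataFrame assignment on both index and columns, which can leave the target slice unfilled or raise, depending on the pandas version; a small sketch of the working form (toy frame, hypothetical values):

```python
import pandas as pd

s = pd.DataFrame({'AGEP': ['N', '25', '40']})
nna_mask = s[~s['AGEP'].isin(['N'])].index               # rows 1 and 2
final_st = pd.DataFrame({'AGEP': [1, 2]}, index=nna_mask)

# assigning the Series aligns on the row index and fills the column as intended
s.loc[nna_mask, 'AGEP'] = final_st['AGEP']
print(s)  # AGEP column becomes ['N', 1, 2]
```
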
21 changes: 12 additions & 9 deletions sdnist/report/dataset/validate.py
@@ -26,9 +26,9 @@ def console_out(text: str):
"nan_records": len(nan_df),
"nan_features": nan_features
}
sd = sd.dropna()
console_out(f'Found {len(nan_df)} records with NaN values. '
f'Removed records with NaN values.')
# sd = sd.dropna()
# console_out(f'Found {len(nan_df)} records with NaN values. '
# f'Removed records with NaN values.')

     for f in features:
         # check feature has out of bound value
@@ -43,7 +43,8 @@ def console_out(text: str):
         fd.loc[mask, f] = pd.to_numeric(fd.loc[mask, f], errors="coerce")
         nans = fd[fd.isna().any(axis=1)]
         if len(nans):
-            vob_vals = list(set(synth_data.loc[nans.index, f].values.tolist()))
+            vob_vals = list(set([str(v)
+                                 for v in synth_data.loc[nans.index, f].values.tolist()]))
             vob_features.append((f, vob_vals))
             console_out(f'Value out of bound for feature {f}, '
                         f'out of bound values: {vob_vals}. '
@@ -62,7 +63,8 @@ def console_out(text: str):
         f_unique = sd['PUMA'].unique().tolist()
         v_intersect = set(f_unique).intersection(set(f_vals))
         if len(v_intersect) < len(f_unique):
-            vob_vals = list(set(f_unique).difference(v_intersect))
+            vob_vals = list(set([str(v) for v in
+                                 list(set(f_unique).difference(v_intersect))]))
             vob_features.append((f, vob_vals))
             console_out(f'Value out of bound for feature {f}, '
                         f'out of bound values: {vob_vals}. Dropping feature from evaluation.')
@@ -73,7 +75,8 @@ def console_out(text: str):
         nans = fd[fd.isna().any(axis=1)]
         vob_vals = []
         if len(nans):
-            vob_vals.extend(list(set(synth_data.loc[nans.index, f].values.tolist())))
+            vob_vals.extend(list(set([str(v)
+                                      for v in synth_data.loc[nans.index, f].values.tolist()])))
             fd = fd.dropna()

         mask = fd[fd[f] != 'N'].index if has_N else fd.index
@@ -87,7 +90,8 @@ def console_out(text: str):
             real_vals = [int(v) for v in real_vals]
         v_intersect = set(f_unique).intersection(set(real_vals))
         if len(v_intersect) < len(f_unique):
-            vob_vals.extend(list(set(f_unique).difference(v_intersect)))
+            vob_vals.extend(set([str(v)
+                                 for v in list(set(f_unique).difference(v_intersect))]))

         if len(vob_vals):
             vob_features.append((f, vob_vals))
@@ -102,11 +106,10 @@ def console_out(text: str):

         if len(vob_features):
             last_vob_f, vob_vals = vob_features[-1]
-
             if last_vob_f == f:
                 sd = sd.loc[:, sd.columns != f]
                 if has_nan:
-                    vob_features = (last_vob_f, ['nan'] + vob_vals)
+                    vob_features[-1] = (last_vob_f, ['nan'] + list(set(vob_vals)))

     validation_log['values_out_of_bound'] = dict(vob_features)
     return sd, validation_log
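
The recurring pattern across these hunks stringifies out-of-bound values before deduplicating, so mixed-type offenders (e.g. 3 vs '3') report consistently and the validation log stays serializable; the first hunk also stops dropping NaN records, leaving them logged instead, and the final hunk fixes a bug where a bare tuple overwrote the whole vob_features list rather than updating its last entry. A toy sketch of the stringify-then-dedupe step (invented values):

```python
# hypothetical out-of-bound values collected from a synthetic column
raw_vals = [3, '3', 'abc', 3.0]

# casting to str before deduplicating collapses equal-looking values and
# keeps the validation log JSON-serializable
vob_vals = sorted(set(str(v) for v in raw_vals))
print(vob_vals)  # ['3', '3.0', 'abc']
```
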
