diff --git a/README.md b/README.md index d631fe6..c61432d 100644 --- a/README.md +++ b/README.md @@ -28,13 +28,6 @@ SDNist v2.0 does not support the Temporal Map Challenge environment. To run the testing environment from the [*NIST PSCR Differential Privacy Temporal Map Challenge*](https://www.nist.gov/ctl/pscr/open-innovation-prize-challenges/past-prize-challenges/2020-differential-privacy-temporal) for the Chicago Taxi data sprint or the American Community Survey sprint, please go to the [Temporal Map Challenge assets repository](https://github.com/usnistgov/Differential-Privacy-Temporal-Map-Challenge-assets). -### Citing SDNist Deidentified Data Report Tool -If you publish work that utilizes the SDNist Deidentified Data Tool, please cite the software. Citation recommendation: -> Task C., Bhagat K., and Howarth G.S. (2023), SDNist v2: Deidentified Data Report Tool, -> National Institute of Standards and Technology, -> https://doi.org/10.18434/mds2-2943 -(NOTE: DOI is not yet active, but should be by 1 APR 2023). - Setting Up the SDNIST Report Tool ------------------------ @@ -268,3 +261,10 @@ by the sdnist.report package to generate a data quality report. 1. If the sdnist.report package is not able to download the datasets, you can download them from Github [Diverse Community Excerpts Data](https://github.com/usnistgov/SDNist/releases/download/v2.0.0/diverse_community_excerpts_data.zip). 3. Unzip the **diverse_community_excerpts_data.zip** file and move the unzipped **diverse_community_excerpts_data** directory to the **sdnist-project** directory. 4. Delete the **diverse_community_excerpts_data.zip** file once the data is successfully extracted from the zip. + +### Citing SDNist Deidentified Data Report Tool +If you publish work that utilizes the SDNist Deidentified Data Tool, please cite the software. Citation recommendation: +> Task C., Bhagat K., and Howarth G.S. 
(2023), SDNist v2: Deidentified Data Report Tool, +> National Institute of Standards and Technology, +> https://doi.org/10.18434/mds2-2943 +(NOTE: DOI is not yet active, but should be by 1 APR 2023). \ No newline at end of file diff --git a/nist diverse communities data excerpts/README.md b/nist diverse communities data excerpts/README.md index 36d7880..5f9a7e1 100644 --- a/nist diverse communities data excerpts/README.md +++ b/nist diverse communities data excerpts/README.md @@ -15,6 +15,13 @@ The requirements we sought to meet included the following: - Geographic regions that include enough (but not an overwhelming amount) individuals and sufficient complexity to investigate relationships between features - Geographic regions whose typical feature values vary radically to explore performance on diverse data sets +## Citing NIST Diverse Community Excerpts +If you publish work that utilizes the NIST Diverse Community Excerpt Data, please cite the resource. Citation recommendation: +> Task C., Bhagat K., Streat D., and Howarth G.S. (2023), +> NIST Diverse Community Excerpt Data, National Institute of Standards and Technology, +> https://doi.org/10.18434/mds2-2895 +(NOTE: DOI is not yet active, but should be live by 1 APR 2023). + ## Data set description We are currently offering three geographic regions, each contained within its own directory, containing the data and a description of each PUMA within data we call "postcards". 
@@ -95,9 +102,10 @@ Suggested partitioning feature: Sex + Disability ## Credits -- [Christine Task](https://github.com/ctask) - Project technical lead - christine.task@knexusresearch.com +- Christine Task - Project technical lead - christine.task@knexusresearch.com - [Karan Bhagat](https://github.com/kbtriangulum) - Contributor - [Aniruddha Sen](https://www.linkedin.com/in/senaniruddha/) - Contributor +- [Damon Streat](https://www.linkedin.com/in/damon-streat-244106190?original_referer=https%3A%2F%2Fwww.google.com%2F) - Contributor - [Gary Howarth](https://www.nist.gov/people/gary-howarth) - Project PI - gary.howarth@nist.gov diff --git a/sdnist/load.py b/sdnist/load.py index eeb9d66..cfc016f 100644 --- a/sdnist/load.py +++ b/sdnist/load.py @@ -82,7 +82,7 @@ def check_exists(root: Path, name: Path, download: bool, data_name: str = strs.D if not name.exists(): print(f"{name} does not exist.") zip_path = Path(root.parent, 'data.zip') - version = "1.4.0-b.1" + version = "2.0.0" version_v = f"v{version}" sdnist_version = DEFAULT_DATASET diff --git a/sdnist/report/plots/univariate.py b/sdnist/report/plots/univariate.py index cbb40c4..4a29e8b 100644 --- a/sdnist/report/plots/univariate.py +++ b/sdnist/report/plots/univariate.py @@ -219,20 +219,20 @@ def save_distribution_plot(self, "plot": relative_path(file_path) } - if i < 3: - self.feat_data[title] = dict() - if c1 >= c2*3 or f in ['PINCP']: - f_val = c_sort_merged.loc[0, f] - f_tc = c_sort_merged.loc[0, 'count_target'] - f_sc = c_sort_merged.loc[0, 'count_deidentified'] - c_sort_merged = c_sort_merged[~c_sort_merged[f].isin([f_val])] - self.feat_data[title] = { - "excluded": { - "feature_value": f_val, - "target_counts": int(f_tc), - "deidentified_counts": int(f_sc) - } + + self.feat_data[title] = dict() + if c1 >= c2*3 or f in ['PINCP']: + f_val = c_sort_merged.loc[0, f] + f_tc = c_sort_merged.loc[0, 'count_target'] + f_sc = c_sort_merged.loc[0, 'count_deidentified'] + c_sort_merged = 
c_sort_merged[~c_sort_merged[f].isin([f_val])] + self.feat_data[title] = { + "excluded": { + "feature_value": f_val, + "target_counts": int(f_tc), + "deidentified_counts": int(f_sc) } + } merged = c_sort_merged.sort_values(by=f) @@ -274,9 +274,9 @@ def save_distribution_plot(self, plt.savefig(Path(o_path, f'{f}.jpg'), bbox_inches='tight') plt.close() - if i < 3: - saved_file_paths.append(file_path) - self.feat_data[title]['path'] = file_path + # if i < 3: + saved_file_paths.append(file_path) + self.feat_data[title]['path'] = file_path return saved_file_paths