diff --git a/.gitignore b/.gitignore
index a62eef2..3c12061 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,21 +1,35 @@
-# include
-!sdnist/
-!sdnist/test/
-!sdnist/test/report/
-!sdnist/test/report/data/
-!sdnist/test/report/data/na2019_1000.csv
-
-# ignore
-report.json
-**.pyc
-**.DS_Store
-
-.ipynb_checkpoints
-toy_synthetic_data/
-dask-worker-space/
-results/
-build/
-sdnist.egg-info/
-
-**.pkl
-build
+# include
+!sdnist/
+!sdnist/test/
+!sdnist/test/report/
+!sdnist/test/report/data/
+!sdnist/test/report/data/na2019_1000.csv
+
+# ignore
+report.json
+**.pyc
+**.DS_Store
+
+.ipynb_checkpoints
+toy_synthetic_data/
+dask-worker-space/
+results/
+build/
+sdnist.egg-info/
+
+**.pkl
+build
+
+**/.idea/
+**/crc_acceleration_bundle_1.0/
+**/crc_n/
+**/crc_notebooks/
+**/create_data/
+**/data/
+**/diverse_communities_data_excerpts/
+**/meta_reports/
+**/reports/
+**/states_puma_geojson/
+**/venv/
+**/workspace/
+
diff --git a/CITATION.cff b/CITATION.cff
index ba96fc9..fe56622 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -4,7 +4,7 @@ abstract: "SDNist provides benchmark data and a suite of both machine- and human
 message: >-
   If you use this repository or present information about it publicly, please cite us.
 type: software
-version: 2.2
+version: 2.3
 doi: 10.18434/mds2-2943
 date-released: 2023-4-14
 contact:
diff --git a/README.md b/README.md
index 1516717..137589b 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# SDNist v2.2: Deidentified Data Report Tool
+# SDNist v2.3: Deidentified Data Report Tool
 
 ## [SDNist is the offical software package for engaging in the NIST Collaborative Research Cycle](https://pages.nist.gov/privacy_collaborative_research_cycle)
 
@@ -37,7 +37,7 @@ Setting Up the SDNIST Report Tool
 
 ### Brief Setup Instructions
 
-SDNist requires Python version 3.7 or greater. If you have installed a previous version of the SDNist library, we recommend installing v2.2 in a virtual environment. v2.2 can be installed via [Release 2.2](https://github.com/usnistgov/SDNist/releases/tag/v2.2.0) or via the Pypi server: `pip install sdnist` or, if you already have a version installed, `pip install --upgrade sdnist`.
+SDNist requires Python version 3.7 or greater. If you have installed a previous version of the SDNist library, we recommend installing v2.3 in a virtual environment. v2.3 can be installed via [Release 2.3](https://github.com/usnistgov/SDNist/releases/tag/v2.3.0) or via the Pypi server: `pip install sdnist` or, if you already have a version installed, `pip install --upgrade sdnist`.
 
 The NIST Diverse Community Exceprt data will download on the fly.
 
@@ -61,13 +61,13 @@ The NIST Diverse Community Exceprt data will download on the fly.
    ```
 
-4. In the already-opened terminal or powershell window, execute the following command to create a new Python environment. The sdnist library will be installed in this newly created Python environment:
+4. In the already-opened terminal or powershell window, execute the following command to create a new Python environment. The sdnist library will be installed in this newly created Python environment:
    ```
    c:\\sdnist-project> python -m venv venv
    ```
 
-6. The new Python environment will be created in the sdnist-project directory, and the files of the environment should be in the venv directory. To check whether a new Python environment was created successfully, use the following command to list all directories in the sdnist-project directory, and make sure the venv directory exists.
+5. The new Python environment will be created in the sdnist-project directory, and the files of the environment should be in the venv directory. To check whether a new Python environment was created successfully, use the following command to list all directories in the sdnist-project directory, and make sure the venv directory exists.
 
    **MAC OS/Linux:**
    ```
    sdnist-project> ls
    ```
    **Windows:**
    ```
    c:\\sdnist-project> dir
    ```
 
-7. Now activate the Python environment and install the sdnist library into it.
+6. Now activate the Python environment and install the sdnist library into it.
 
    **MAC OS/Linux:**
    ```
@@ -107,27 +107,12 @@ The NIST Diverse Community Exceprt data will download on the fly.
    Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine
    ```
 
-
-8. Per step 5 above, the sdnist-2.2.0-py3-none-any.whl file should already be present in the sdnist-project directory. Check whether that is true by listing the files in the sdnist-project directory.
-
-   **MAC OS/Linux:**
-   ```
-   (venv) sdnist-project> ls
-   ```
-   **Windows:**
-   ```
-   (venv) c:\\sdnist-project> dir
-   ```
-   The sdnist-2.2.0-py3-none-any.whl file should be in the list printed by the above command; otherwise, follow steps 4 and 5 again to download the .whl file.
-
-
-9. Install sdnist Python library:
+7. Install sdnist Python library:
    ```
    (venv) c:\\sdnist-project> pip install sdnist
    ```
-
-10. Installation is successful if executing the following command outputs a help menu for the sdnist.report package:
+8. Installation is successful if executing the following command outputs a help menu for the sdnist.report package:
    ```
    (venv) c:\\sdnist-project> python -m sdnist.report -h
    ```
@@ -162,8 +147,7 @@ The NIST Diverse Community Exceprt data will download on the fly.
       NATIONAL national2019
    ```
 
-
-11. These instructions install sdnist into a virtual environment. The virtual environment must be activated (step 9) each time a new terminal window is used with sdnist.
+9. These instructions install sdnist into a virtual environment. The virtual environment must be activated (step 9) each time a new terminal window is used with sdnist.
 
 
 Generate Data Quality Report
@@ -260,7 +244,7 @@ Setup Data for SDNIST Report Tool
 
 4. You can download the toy deidentified datasets from Github [Sdnist Toy Deidentified Dataset](https://github.com/usnistgov/SDNist/releases/download/v2.1.1/toy_deidentified_data.zip). Unzip the downloaded file, and move the unzipped toy_deidentified_dataset directory to the sdnist-project directory.
 
-5. Each toy deidentified dataset file is generated using the [Diverse Communities Data Excerpts](https://github.com/usnistgov/SDNist/releases/download/v2.2.0/diverse_communities_data_excerpts.zip). The syn_ma.csv, syn_tx.csv, and syn_national.csv deidentified dataset files are created from target datasets MA (ma2019.csv), TX (tx2019.csv), and NATIONAL(national2019.csv), respectively. You can use one of the toy deidentified dataset files for testing whether the sdnist.report package is installed correctly on your system.
+5. Each toy deidentified dataset file is generated using the [Diverse Communities Data Excerpts](https://github.com/usnistgov/SDNist/releases/download/v2.3.0/diverse_communities_data_excerpts.zip). The syn_ma.csv, syn_tx.csv, and syn_national.csv deidentified dataset files are created from target datasets MA (ma2019.csv), TX (tx2019.csv), and NATIONAL(national2019.csv), respectively. You can use one of the toy deidentified dataset files for testing whether the sdnist.report package is installed correctly on your system.
 
 6. Use the following commands for generating reports if you are using a toy deidentified dataset file:
@@ -287,7 +271,7 @@ by the sdnist.report package to generate a data quality report.
 Download Data Manually
 ----------------------
 
-1. If the sdnist.report package is not able to download the datasets, you can download them from Github [Diverse Communities Data Excerpts](https://github.com/usnistgov/SDNist/releases/download/v2.2.0/diverse_communities_data_excerpts.zip).
+1. If the sdnist.report package is not able to download the datasets, you can download them from Github [Diverse Communities Data Excerpts](https://github.com/usnistgov/SDNist/releases/download/v2.3.0/diverse_communities_data_excerpts.zip).
 
 3. Unzip the **diverse_community_excerpts_data.zip** file and move the unzipped **diverse_community_excerpts_data** directory to the **sdnist-project** directory.
 
 4. Delete the **diverse_community_excerpts_data.zip** file once the data is successfully extracted from the zip.
@@ -305,5 +289,4 @@ Credits
 - [Christine Task](mailto:christine.task@knexusresearch.com) - Project technical lead - christine.task@knexusresearch.com
 - [Karan Bhagat](https://github.com/kbtriangulum) - Contributor
 - [David Lee](https://www.linkedin.com/in/david-lee-13872922/) - Documentation
-- [Gary Howarth](https://www.nist.gov/people/gary-howarth) - Project PI - gary.howarth@nist.gov
-
+- [Gary Howarth](https://www.nist.gov/people/gary-howarth) - Project PI - gary.howarth@nist.gov
\ No newline at end of file
diff --git a/nist diverse communities data excerpts/data_dictionary.json b/nist diverse communities data excerpts/data_dictionary.json
index fe1f111..cdf5979 100644
--- a/nist diverse communities data excerpts/data_dictionary.json
+++ b/nist diverse communities data excerpts/data_dictionary.json
@@ -127,12 +127,13 @@
     },
     "INDP": {
         "description": "Industry codes",
+        "details": "There are a total of 271 possible codes for INDP, 269 of these codes appear in the Diverse Community Data Excerpts (233 in MA, 264 in Texas and National)",
         "link": "https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMS_Data_Dictionary_2019.pdf"
     },
     "INDP_CAT": {
         "description": "Industry categories",
         "values": {
-            "N": "N/A (less than 16 years old/NILF who last worked more than 5 years ago or never worked)",
+            "N": "N/A (less than 16 years old, or last worked more than 5 years ago, or never worked)",
             "0": "AGR: Agriculture, Forestry, Fishing and Hunting",
             "1": "EXT: Mining, Quarrying, and Oil and Gas Extraction",
             "2": "UTL: Utilities",
@@ -160,7 +161,7 @@
             "N": "N/A (less than 3 years old)",
             "1": "No schooling completed",
             "2": "Nursery school, Preschool, or Kindergarten",
-            "3": "Grade 4 to grade 8",
+            "3": "Grade 1 to grade 8",
             "4": "Grade 9 to grade 12, no diploma",
             "5": "High School diploma",
             "6": "GED",
@@ -181,7 +182,7 @@
         }
     },
     "PINCP_DECILE": {
-        "description": "Person's total income in 10-percentile bins",
+        "description": "Person's total income rank (with respect to their state) discretized into 10% bins.",
         "values": {
             "N": "N/A (less than 15 years old",
             "9": "90th percentile",
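
The revised PINCP_DECILE description above (income rank within a person's state, cut into 10% bins) can be illustrated with a small pandas sketch. This block is illustrative only and not part of the diff; the column names and the binning rule shown here are assumptions, not the package's actual implementation.

```
import pandas as pd

# Toy person records: a state code and a total income value (PINCP-like).
df = pd.DataFrame({
    "state": ["MA", "MA", "MA", "MA", "TX", "TX", "TX", "TX"],
    "income": [12000, 45000, 98000, 150000, 15000, 60000, 95000, 250000],
})

# Rank each income within its own state (percentile in (0, 1]), then
# discretize that rank into ten bins: 0 = bottom 10%, 9 = top 10%.
pct_rank = df.groupby("state")["income"].rank(pct=True)
df["PINCP_DECILE"] = (pct_rank * 10).clip(upper=9.999).astype(int)

print(df)
```
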
diff --git a/sdnist/load.py b/sdnist/load.py
index de57962..1c10f91 100644
--- a/sdnist/load.py
+++ b/sdnist/load.py
@@ -82,7 +82,7 @@ def check_exists(root: Path, name: Path, download: bool, data_name: str = strs.D
     if not name.exists():
         print(f"{name} does not exist.")
         zip_path = Path(root.parent, 'data.zip')
-        version = "2.2.0"
+        version = "2.3.0"
         version_v = f"v{version}"
         sdnist_version = DEFAULT_DATASET
diff --git a/sdnist/metrics/inconsistency.py b/sdnist/metrics/inconsistency.py
index 9320ed3..3f0e25f 100644
--- a/sdnist/metrics/inconsistency.py
+++ b/sdnist/metrics/inconsistency.py
@@ -268,7 +268,7 @@ def compute(self):
                  'inconsistency_features': ic_data[2],
                  'inconsistency_violations': int(ic_data[3].split(' ')[0]),
                  'inconsistent_data_indexes': ic_dict[i[NAME]],
-                 'inconsistent_record_example': relative_path(row_path)}
+                 'inconsistent_record_example': relative_path(row_path, level=3)}
             )
 
         # ------- Compute work-based Inconsistencies------------
@@ -298,7 +298,7 @@ def compute(self):
                  'inconsistency_features': ic_data[2],
                  'inconsistency_violations': int(ic_data[3].split(' ')[0]),
                  'inconsistent_data_indexes': ic_dict[i[NAME]],
-                 'inconsistent_record_example': relative_path(row_path)}
+                 'inconsistent_record_example': relative_path(row_path, level=3)}
             )
 
         # ------- Compute housing-based Inconsistencies------------
@@ -328,7 +328,7 @@ def compute(self):
                  'inconsistency_features': ic_data[2],
                  'inconsistency_violations': int(ic_data[3].split(' ')[0]),
                  'inconsistent_data_indexes': ic_dict[i[NAME]],
-                 'inconsistent_record_example': relative_path(row_path)}
+                 'inconsistent_record_example': relative_path(row_path, level=3)}
             )
 
         # -------- Compute overall stats---------------------
diff --git a/sdnist/metrics/pca.py b/sdnist/metrics/pca.py
index fd38d33..2dcf53d 100644
--- a/sdnist/metrics/pca.py
+++ b/sdnist/metrics/pca.py
@@ -48,11 +48,10 @@ def compute_pca(self):
         t_pca = PCA(n_components=cc)
 
         tdf_v = self.tar.values
-        sdf = self.syn.apply(lambda x: x - x.mean())
-        sdf_v = sdf.values
-
-        tdf_v = StandardScaler().fit_transform(tdf_v)
-        sdf_v = StandardScaler().fit_transform(sdf_v)
+        sdf_v = self.syn.values
+        scaler = StandardScaler().fit(tdf_v)
+        sdf_v = scaler.transform(sdf_v)
+        tdf_v = scaler.transform(tdf_v)
 
         t_pc = t_pca.fit_transform(tdf_v)
 
@@ -62,7 +61,7 @@ def compute_pca(self):
         self.t_comp_data = []
         for i, comp in enumerate(t_pca.components_):
             qc = [[n, round(v, 2)] for n, v in zip(self.tar.columns.tolist(), comp)]
-            qc = sorted(qc, key=lambda x: x[1], reverse=True)
+            qc = sorted(qc, key=lambda x: abs(x[1]), reverse=True)
             qc = [f'{v[0]} ({v[1]})' for v in qc]
             self.t_comp_data.append({"Principal Component": f"PC-{i}",
                                      "Features Contribution: "
@@ -88,7 +87,9 @@ def compute_pca(self):
         for c in self.t_pdf.columns:
             self.t_pdf_s[c] = min_max_scaling(self.t_pdf[c])
         for c in self.s_pdf.columns:
-            self.s_pdf_s[c] = min_max_scaling(self.s_pdf[c])
+            self.s_pdf_s[c] = min_max_scaling(self.s_pdf[c],
+                                              self.t_pdf[c].min(),
+                                              self.t_pdf[c].max())
 
     def plot(self, output_directory: Path) -> Dict[str, any]:
         s = time.time()
@@ -152,8 +153,13 @@ def plot(self, output_directory: Path) -> Dict[str, any]:
 
         return plot_paths
 
-def min_max_scaling(series):
-    return (series - series.min()) / (series.max() - series.min())
+def min_max_scaling(series, min_val=None, max_val=None):
+    if min_val is None:
+        min_val = series.min()
+    if max_val is None:
+        max_val = series.max()
+
+    return (series - min_val) / (max_val - min_val)
 
 
 def plot_all_components_pairs(title: str,
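
The pca.py hunks above make two changes: both datasets are now standardized with a single StandardScaler fitted on the target data (instead of scaling each dataset independently), and feature contributions to each principal component are ranked by absolute loading. The following is a minimal, self-contained sketch of that pattern on made-up data; it is illustrative only and not the report tool's actual pipeline.

```
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
target = rng.normal(size=(200, 4))                         # stand-in for target records
deid = target + rng.normal(scale=0.3, size=target.shape)   # stand-in for deidentified records

# Fit the scaler on the target data only, then apply the same transform to
# both datasets so they are compared on a common scale.
scaler = StandardScaler().fit(target)
target_s = scaler.transform(target)
deid_s = scaler.transform(deid)

# Fit PCA on the standardized target data and project both datasets.
pca = PCA(n_components=2)
target_pc = pca.fit_transform(target_s)
deid_pc = pca.transform(deid_s)

# Rank feature contributions to each component by absolute loading,
# mirroring the sorted(..., key=lambda x: abs(x[1])) change above.
for i, comp in enumerate(pca.components_):
    order = sorted(enumerate(comp), key=lambda t: abs(t[1]), reverse=True)
    print(f"PC-{i}:", [(f"feature_{j}", round(v, 2)) for j, v in order])
```
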
diff --git a/sdnist/metrics/regression.py b/sdnist/metrics/regression.py
index 395f204..2121210 100644
--- a/sdnist/metrics/regression.py
+++ b/sdnist/metrics/regression.py
@@ -216,11 +216,12 @@ def plots(self) -> List[Path]:
         self.report_data = {
             "target_counts": relative_path(save_data_frame(self.tcm,
                                                            self.o_path,
-                                                           'target_counts')),
+                                                           'target_counts'),
+                                           level=3),
             "target_deidentified_counts_difference":
                 relative_path(save_data_frame(self.diff,
                                               self.o_path,
-                                              "target_deidentified_counts_difference")),
-            "target_deidentified_difference_plot": relative_path(file_path),
+                                              "target_deidentified_counts_difference"),
+                              level=3),
+            "target_deidentified_difference_plot": relative_path(file_path, level=3),
             "target_regression_slope_and_intercept": (self.t_slope, self.t_intercept),
             "deidentified_regression_slope_and_intercept": (self.s_slope, self.s_intercept)
         }
diff --git a/sdnist/metrics/unique_exact_matches.py b/sdnist/metrics/unique_exact_matches.py
index 6bfa20b..ef22520 100644
--- a/sdnist/metrics/unique_exact_matches.py
+++ b/sdnist/metrics/unique_exact_matches.py
@@ -6,6 +6,7 @@
 from sdnist.report.dataset import Dataset
 import sdnist.utils as u
 
+
 def unique_exact_matches(target_data: pd.DataFrame, deidentified_data: pd.DataFrame):
     td, dd = target_data, deidentified_data
     cols = td.columns.tolist()
@@ -18,8 +19,7 @@ def unique_exact_matches(target_data: pd.DataFrame, deidentified_data: pd.DataFr
     perc_t_unique_records = round(t_unique_records/td.shape[0] * 100, 2)
 
     # Keep only one copy of each duplicate row in the deidentified data
-    # and also save the count of each row in the deidentified data
-    dd= dd.drop_duplicates(subset=cols)
+    dd = dd.drop_duplicates(subset=cols)
 
     merged = u_td.merge(dd, how='inner', on=cols)
 
@@ -27,12 +27,13 @@
     t_rec_matched = merged.shape[0]
 
     # percent of unique target records that exactly match in deidentified data
-    perc_t_rec_matched = t_rec_matched/td.shape[0] * 100
+    perc_t_rec_matched = t_rec_matched/t_unique_records * 100
 
     perc_t_rec_matched = round(perc_t_rec_matched, 2)
 
     return t_rec_matched, perc_t_rec_matched, t_unique_records, perc_t_unique_records
 
+
 if __name__ == '__main__':
     THIS_DIR = Path(__file__).parent
     s_path = Path(THIS_DIR, '..', '..',
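
The unique_exact_matches.py change above switches the reported percentage's denominator from all target records to the number of unique target records. A standalone sketch of that corrected calculation on toy frames (illustrative only, not the package's code):

```
import pandas as pd

target = pd.DataFrame({"a": [1, 1, 2, 3], "b": ["x", "x", "y", "z"]})
deid = pd.DataFrame({"a": [2, 3, 9], "b": ["y", "z", "q"]})
cols = target.columns.tolist()

# Records that appear exactly once in the target data.
unique_target = target.drop_duplicates(subset=cols, keep=False)

# Keep one copy of each deidentified row, then find exact joins on all columns.
matches = unique_target.merge(deid.drop_duplicates(subset=cols), on=cols, how="inner")

# The percentage is reported against the unique target records,
# not against the full target table.
pct = round(len(matches) / len(unique_target) * 100, 2)
print(f"{len(matches)} of {len(unique_target)} unique target records matched ({pct}%)")
```
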
diff --git a/sdnist/report/__main__.py b/sdnist/report/__main__.py
index 66a1672..daa8953 100644
--- a/sdnist/report/__main__.py
+++ b/sdnist/report/__main__.py
@@ -19,13 +19,14 @@
 from sdnist.load import DEFAULT_DATASET
 
+
 def run(synthetic_filepath: Path,
         output_directory: Path = REPORTS_DIR,
         dataset_name: TestDatasetName = TestDatasetName.NONE,
         data_root: Path = Path(DEFAULT_DATASET),
         labels_dict: Optional[Dict] = None,
         download: bool = False,
-        test_mode: bool = False):
+        show_report: bool = True):
     outfile = Path(output_directory, 'report.json')
     ui_data = ReportUIData(output_directory=output_directory)
     report_data = ReportData(output_directory=output_directory)
@@ -60,10 +61,11 @@ def run(synthetic_filepath: Path,
         ui_data = json.load(f)
     log.end_msg()
 
     # Generate Report
-    generate(ui_data, output_directory, test_mode)
+    generate(ui_data, output_directory, show_report)
     log.msg(f'Reports available at path: {output_directory}', level=0, timed=False,
             msg_type='important')
 
+
 def setup():
     bundled_datasets = {"MA": TestDatasetName.ma2019,
                         "TX": TestDatasetName.tx2019,
diff --git a/sdnist/report/dataset/__init__.py b/sdnist/report/dataset/__init__.py
index a948b0a..df33c1d 100644
--- a/sdnist/report/dataset/__init__.py
+++ b/sdnist/report/dataset/__init__.py
@@ -54,30 +54,24 @@ def unavailable_features(config: Dict, synthetic_data: pd.DataFrame):
     return cnf
 
 
-def compute_feature_space(data_dict: Dict,
-                          features: List[str]):
-    # list of features and their value length
-    f_list = []
-    for f in features:
-        if "values" not in data_dict[f]:
-            vals = [0] * 269  # in case of INDP feature
-        else:
-            vals = data_dict[f]["values"]
-        if "min" in vals and f != 'AGEP':
-            continue
-        if f == 'AGEP':
-            f_list.append([f, 100])
-        else:
-            f_list.append([f, len(vals)])
-    f_df = pd.DataFrame(f_list, columns=['feature', 'len'])
-    f_df = f_df.sort_values(by='len')
+def feature_space_size(target_df: pd.DataFrame, data_dict: Dict):
+    size = 1
 
-    # get product of all feature lengths
-    n_features = f_df['len'].astype(object).product()
+    for col in target_df.columns:
+        if col in ['PINCP', 'POVPIP', 'WGTP', 'PWGTP', 'AGEP']:
+            size = size * 100
+        elif col in ['SEX', 'MSP', 'HISP', 'RAC1P', 'HOUSING_TYPE', 'OWN_RENT',
+                     'INDP_CAT', 'EDU', 'PINCP_DECILE', 'DVET', 'DREM', 'DPHY', 'DEYE',
+                     'DEAR']:
+            size = size * len(data_dict[col]['values'])
+        elif col in ['PUMA', 'DENSITY']:
+            size = size * len(target_df['PUMA'].unique())
+        elif col in ['NOC', 'NPF', 'INDP']:
+            size = size * len(target_df[col].unique())
+
+    return size
 
-    # return number of features and sorted list of features
-    return n_features
 
 @dataclass
 class Dataset:
@@ -159,12 +153,12 @@ def __post_init__(self):
         self.features = list(set(self.features).difference(set(ind_features)))
         self.features = list(set(self.features).intersection(list(common_columns)))
 
-        self.feature_space = compute_feature_space(self.data_dict, self.features)
-
         # raw subset data
         self.target_data = self.target_data[self.features]
         self.synthetic_data = self.synthetic_data[self.features]
 
+        self.feature_space = feature_space_size(self.target_data, self.data_dict)
+
         # validation and clean data
         self.c_synthetic_data, self.validation_log = \
             validate(self.synthetic_data, self.data_dict, self.features, self.log)
@@ -176,6 +170,12 @@ def __post_init__(self):
         self.synthetic_data = self.synthetic_data[self.features]
         self.target_data = self.target_data[self.features]
 
+        # for f in self.target_data.columns:
+        #     if f not in ['PINCP', 'INDP', 'PWGTP', 'WGTP', 'POVPIP', 'DENSITY']:
+        #         print('T', f, self.target_data[f].unique().tolist())
+        #         print('S', f, self.synthetic_data[f].unique().tolist())
+        #         print()
+
         # sort columns in the data
         self.target_data = self.target_data.reindex(sorted(self.target_data.columns), axis=1)
         self.synthetic_data = self.synthetic_data.reindex(sorted(self.target_data.columns), axis=1)
@@ -299,13 +299,18 @@ def data_description(dataset: Dataset,
         f_desc = dataset.data_dict[feat]['description']
         feat_title = f'{feat}: {f_desc}'
         if 'link' in dataset.data_dict[feat] and feat == 'INDP':
-            data = f"" \
+            data_1 = f"" \
                    f"See codes in ACS data dictionary. " \
                    f"Find codes by searching the string: {feat}, in " \
                    f"the ACS data dictionary"
 
             dd_as.append(Attachment(name=feat_title,
-                                    _data=data,
+                                    _data=data_1,
                                     _type=AttachmentType.String))
+            if "details" in dataset.data_dict[feat]:
+                data_2 = dataset.data_dict[feat]['details']
+                dd_as.append(Attachment(name=None,
+                                        _data=data_2,
+                                        _type=AttachmentType.String))
 
         elif 'values' in dataset.data_dict[feat]:
             f_name = feat_title
diff --git a/sdnist/report/dataset/binning.py b/sdnist/report/dataset/binning.py
index b543c79..801dde7 100644
--- a/sdnist/report/dataset/binning.py
+++ b/sdnist/report/dataset/binning.py
@@ -3,6 +3,7 @@
 import numpy as np
 import math
 
+
 def percentile_rank_target(data: pd.DataFrame, features: List[str]):
     data = data.copy()
     for c in features:
diff --git a/sdnist/report/dataset/validate.py b/sdnist/report/dataset/validate.py
index 526f45c..8048a16 100644
--- a/sdnist/report/dataset/validate.py
+++ b/sdnist/report/dataset/validate.py
@@ -3,6 +3,7 @@
 
 from sdnist.utils import SimpleLogger
 
+
 def validate(synth_data: pd.DataFrame,
              data_dict: Dict,
              features: List[str],
diff --git a/sdnist/report/generate.py b/sdnist/report/generate.py
index 1485ed9..eef381d 100644
--- a/sdnist/report/generate.py
+++ b/sdnist/report/generate.py
@@ -79,11 +79,16 @@
 def generate(report_data: Dict[str, any],
              output_directory_path: Path,
-             test_mode: bool = False):
+             show_report: bool = True):
     out_dir = output_directory_path
     data = report_data
 
+    def debug(text):
+        print(text)
+        return ''
+
     env = Environment(loader=FileSystemLoader(Path(FILE_DIR, 'resources/templates')))
+    env.filters['debug'] = debug
     env.globals["enumerate"] = enumerate
     main_template = env.get_template('main.jinja2')
@@ -96,7 +101,7 @@ def generate(report_data: Dict[str, any],
     with open(out_path, 'w') as f:
         f.write(out)
 
-    if not test_mode:
+    if show_report:
         webbrowser.open(f"file://{out_path}", new=True)
 
     # html_to_pdf(out_path, out_pdf_path)
@@ -106,7 +111,7 @@ def generate(report_data: Dict[str, any],
     p_p = Path(FILE_DIR, '../../reports/TX_ACS_EXCERPT_2019_08-02-2022T15.14.12/report.pdf')
     p_o = Path(FILE_DIR, '../../reports/TX_ACS_EXCERPT_2019_08-02-2022T15.14.12/report0.pdf')
 
-    html_to_pdf_2(h_p, p_p)
+    # html_to_pdf_2(h_p, p_p)
diff --git a/sdnist/report/plots/pearson_correlation.py b/sdnist/report/plots/pearson_correlation.py
index 21b66ed..531c2eb 100644
--- a/sdnist/report/plots/pearson_correlation.py
+++ b/sdnist/report/plots/pearson_correlation.py
@@ -24,14 +24,15 @@ def _setup(self):
         if not self.o_path.exists():
             os.mkdir(self.o_path)
 
-    def save(self) -> List[Path]:
+    def save(self, path_level=2) -> List[Path]:
         file_path = Path(self.o_path, 'pearson_corr_diff.jpg')
 
         self.report_data = {
             "correlation_difference": relative_path(save_data_frame(self.cd,
                                                                     self.o_path,
-                                                                    'correlation_difference')),
-            "plot": relative_path(file_path)
+                                                                    'correlation_difference'),
+                                                    level=path_level),
+            "plot": relative_path(file_path, level=path_level)
         }
         cd = self.cd
         cd = cd.abs()
diff --git a/sdnist/report/plots/propensity.py b/sdnist/report/plots/propensity.py
index d56fc89..76f4467 100644
--- a/sdnist/report/plots/propensity.py
+++ b/sdnist/report/plots/propensity.py
@@ -32,7 +32,13 @@ def save(self,
              title: str = 'Distribution of data samples over 100 propensity bins') \
             -> List[Path]:
         file_path = Path(self.o_path, f'{filename}.jpg')
-        ax = self.p_dist.plot(title=title, xlabel="100 Propensity Bins", ylabel='Record Counts')
+        ax = self.p_dist.plot(title=title,
+                              xlabel="100 Propensity Bins",
+                              ylabel='Record Counts',
+                              color=['mediumblue', 'limegreen'],
+                              alpha=0.8,
+                              lw=2,
+                              figsize=(12, 6))
         fig = ax.get_figure()
         fig.savefig(file_path)
         self.report_data['plot'] = relative_path(file_path)
diff --git a/sdnist/report/plots/univariate.py b/sdnist/report/plots/univariate.py
index 9f83088..50efc6f 100644
--- a/sdnist/report/plots/univariate.py
+++ b/sdnist/report/plots/univariate.py
@@ -80,13 +80,14 @@ def _setup(self):
             raise Exception(f'Path {self.o_dir} does not exist. Cannot save plots')
         os.mkdir(self.out_path)
 
-    def report_data(self):
+    def report_data(self, level=2):
         return {"divergence": relative_path(save_data_frame(self.div_data,
                                                             self.out_path,
-                                                            'divergence')),
+                                                            'divergence'),
+                                            level=level),
                 "counts": self.uni_counts}
 
-    def save(self) -> Dict:
+    def save(self, level=2) -> Dict:
         if self.challenge == CENSUS:
             ignore_features = ['YEAR']
         elif self.challenge == TAXI:
@@ -106,7 +107,8 @@ def save(self) -> Dict:
                                     self.syn,
                                     self.tar,
                                     div_df[FEATURE].tolist(),
-                                    self.out_path)
+                                    self.out_path,
+                                    level=level)
         return self.feat_data
 
     def save_distribution_plot(self,
@@ -114,7 +116,8 @@ def save_distribution_plot(self,
                                synthetic: pd.DataFrame,
                                target: pd.DataFrame,
                                features: List,
-                               output_directory: Path):
+                               output_directory: Path,
+                               level=2):
         ds = dataset
         o_path = output_directory
         bar_width = 0.4
@@ -138,26 +141,24 @@ def save_distribution_plot(self,
                 st_df = o_tar[o_tar[INDP_CAT].isin([s])].copy()
                 st_df.loc[:, f] = pd.to_numeric(st_df[f]).astype(int)
                 ss_df = o_syn[o_syn[INDP_CAT].isin([int(s)])]
-                # print(s, type(s))
-                # print(o_syn[INDP_CAT].unique().tolist())
+
                 unique_ind_codes = st_df[f].unique().tolist()
                 set(unique_ind_codes).update(set(ss_df[f].unique().tolist()))
                 unique_ind_codes = list(unique_ind_codes)
 
                 val_df = pd.DataFrame(unique_ind_codes, columns=[f])
+                val_df[f] = val_df.astype(str)
 
                 t_counts_df = st_df.groupby(by=f)[f].size().reset_index(name='count_target')
                 s_counts_df = ss_df.groupby(by=f)[f].size().reset_index(name='count_deidentified')
-                # print(s)
-                # print(s_counts_df)
-                # print(ss_df[f].unique().tolist())
-                # print(ss_df.shape)
+                t_counts_df[f] = t_counts_df[f].astype(str)
+                s_counts_df[f] = s_counts_df[f].astype(str)
+
                 merged = pd.merge(left=val_df, right=t_counts_df, on=f, how='left')\
                     .fillna(0)
                 merged = pd.merge(left=merged, right=s_counts_df, on=f, how='left')\
                     .fillna(0)
                 div = l1(pk=merged['count_target'], qk=merged['count_deidentified'])
-                # print(s)
-                # print(merged[['count_target', 'count_deidentified']])
+
                 selected.append([merged, div, s])
             selected = sorted(selected, key=lambda l: l[1], reverse=True)
@@ -192,8 +193,9 @@ def save_distribution_plot(self,
                     "divergence": div,
                     "counts": relative_path(save_data_frame(merged,
                                                             o_path,
-                                                            f"Industry Category {s}")),
-                    "plot": relative_path(file_path)
+                                                            f"Industry Category {s}"),
+                                            level=level),
+                    "plot": relative_path(file_path, level=level)
                 }
                 # if j < 2:
                 saved_file_paths.append(file_path)
@@ -225,8 +227,9 @@ def save_distribution_plot(self,
             self.uni_counts[f] = {
                 "counts": relative_path(save_data_frame(c_sort_merged.copy(),
                                                         o_path,
-                                                        f'{f}_counts')),
-                "plot": relative_path(file_path)
+                                                        f'{f}_counts'),
+                                        level=level),
+                "plot": relative_path(file_path, level)
             }
 
             if self.worst_univariates_to_display is None \
@@ -267,13 +270,14 @@ def save_distribution_plot(self,
                 vals = updated_vals
             vals = [str(v) for v in vals]
+
             if "-1" in vals:
                 idx = vals.index("-1")
                 vals[idx] = "N"
 
             if f == 'PUMA':
                 f_val_dict = {i: v for i, v in enumerate(ds.schema[f]['values'])}
-                vals = [f_val_dict[int(v)] for v in vals]
+                vals = [f_val_dict[int(v)] if v != 'N' else 'N' for v in vals]
 
             plt.gca().set_xticks(x_axis, vals)
             plt.legend(loc='upper right')
diff --git a/sdnist/report/report_data.py b/sdnist/report/report_data.py
index 63ab1fd..99b8432 100644
--- a/sdnist/report/report_data.py
+++ b/sdnist/report/report_data.py
@@ -1,4 +1,5 @@
 import json
+import time
 from typing import List, Dict, Optional
 from dataclasses import dataclass, field
 from enum import Enum
@@ -32,13 +33,19 @@ class AttachmentType(Enum):
     ParaAndImage = 'para_and_image'
 
 
+
 @dataclass
 class Attachment:
     name: Optional[str]
     _data: any
+    group_id: int = -1
     _type: AttachmentType = field(default=AttachmentType.Table)
     dotted_break: bool = field(default=False)
 
+    def __post_init(self):
+        if self.group_id == -1:
+            self.group_id = int(time.time() * 100)
+
     @property
     def data(self) -> Dict[str, any]:
         d = self._data
@@ -59,10 +66,16 @@ class ScorePacket:
 
     @property
     def data(self) -> Dict[str, any]:
+        attachments = dict()
+        for a in self.attachment:
+            if a.group_id in attachments:
+                attachments[a.group_id].append(a.data)
+            else:
+                attachments[a.group_id] = [a.data]
         d = {
             'metric_name': self.metric_name,
             'scores': self.score,
-            'attachments': [a.data for a in self.attachment]
+            'attachments': attachments
         }
         if self.score is None:
             del d['scores']
@@ -152,9 +165,12 @@ class ReportUIData:
     feature_desc: Dict[str, any] = field(default_factory=dict, init=False)
     # list containing ScorePacket objects
     scores: List[ScorePacket] = field(default_factory=list, init=False)
+    key_val_pairs: Dict[str, any] = field(default_factory=dict, init=False)
 
     def add(self, score_packet: ScorePacket):
         self.scores.append(score_packet)
 
+    def add_key_val(self, key: str, val: any):
+        self.key_val_pairs[key] = val
 
     def add_data_description(self,
                              dataset_type: DatasetType,
@@ -192,6 +208,8 @@ def data(self) -> Dict[str, any]:
         d['comparisons'] = []
         d['motivation'] = []
         d['observations'] = []
+        for k, v in self.key_val_pairs.items():
+            d[k] = v
 
         for s_pkt in self.scores:
             if s_pkt.evaluation_type == EvaluationType.Utility:
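
The report_data.py hunks above replace the flat attachment list in ScorePacket.data with a dictionary keyed by each attachment's group_id, so attachments created together are rendered as one group (note that Python dataclasses only invoke this hook when it is spelled __post_init__). Below is a minimal sketch of the grouping idea; the simplified classes and names are assumptions for illustration, not the real report classes.

```
import time
from collections import defaultdict
from dataclasses import dataclass


@dataclass
class Attachment:
    name: str
    data: str
    group_id: int = -1          # attachments created together share a group id

    def __post_init__(self):
        if self.group_id == -1:
            # fall back to a timestamp-derived id when none was supplied
            self.group_id = int(time.time() * 100)


def group_attachments(attachments):
    grouped = defaultdict(list)
    for a in attachments:
        grouped[a.group_id].append(a.data)
    return dict(grouped)


pair = [Attachment("table", "counts table", group_id=1),
        Attachment("plot", "counts plot", group_id=1)]
single = [Attachment("note", "a standalone note")]
print(group_attachments(pair + single))
# e.g. {1: ['counts table', 'counts plot'], <timestamp id>: ['a standalone note']}
```
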
diff --git a/sdnist/report/resources/templates/main.jinja2 b/sdnist/report/resources/templates/main.jinja2
index 3e0a70b..da7a417 100644
--- a/sdnist/report/resources/templates/main.jinja2
+++ b/sdnist/report/resources/templates/main.jinja2
@@ -384,58 +384,62 @@
                 {% if 'scores' in data %}
                     Score: {{ data['scores'] }}
                 {% endif %}
-                {% for i, a in enumerate(data['attachments']) %}
-                    {% if a.type == 'image_links' and i > 0 %}
-                        {% set style = 'attachment-div-break' %}
-                    {% else %}
-                        {% set style = 'attachment-div' %}
-                    {% endif %}
+                {% for group, attachments in data['attachments'].items() %}
+