From a3fb32eb9bc1439ad4c9d16017ccaf6036fe529f Mon Sep 17 00:00:00 2001
From: Karan Bhagat
Date: Thu, 20 Apr 2023 10:52:39 +0530
Subject: [PATCH 01/16] Add feature to group attachments for report ui pdf generation

---
 sdnist/report/report_data.py                  |  15 ++-
 sdnist/report/resources/templates/main.jinja2 | 100 +++++++++---------
 2 files changed, 66 insertions(+), 49 deletions(-)

diff --git a/sdnist/report/report_data.py b/sdnist/report/report_data.py
index 63ab1fd..a5898ad 100644
--- a/sdnist/report/report_data.py
+++ b/sdnist/report/report_data.py
@@ -1,4 +1,5 @@
 import json
+import time
 from typing import List, Dict, Optional
 from dataclasses import dataclass, field
 from enum import Enum
@@ -32,13 +33,19 @@ class AttachmentType(Enum):
     ParaAndImage = 'para_and_image'

+
 @dataclass
 class Attachment:
     name: Optional[str]
     _data: any
+    group_id: int = -1
     _type: AttachmentType = field(default=AttachmentType.Table)
     dotted_break: bool = field(default=False)

+    def __post_init__(self):
+        if self.group_id == -1:
+            self.group_id = int(time.time() * 100)
+
     @property
     def data(self) -> Dict[str, any]:
         d = self._data
@@ -59,10 +66,16 @@ class ScorePacket:

     @property
     def data(self) -> Dict[str, any]:
+        attachments = dict()
+        for a in self.attachment:
+            if a.group_id in attachments:
+                attachments[a.group_id].append(a.data)
+            else:
+                attachments[a.group_id] = [a.data]
         d = {
             'metric_name': self.metric_name,
             'scores': self.score,
-            'attachments': [a.data for a in self.attachment]
+            'attachments': attachments
         }
         if self.score is None:
             del d['scores']
diff --git a/sdnist/report/resources/templates/main.jinja2 b/sdnist/report/resources/templates/main.jinja2
index 3e0a70b..e409d1d 100644
--- a/sdnist/report/resources/templates/main.jinja2
+++ b/sdnist/report/resources/templates/main.jinja2
@@ -384,58 +384,62 @@
     {% if 'scores' in data %}

Score: {{ data['scores'] }}

{% endif %} - {% for i, a in enumerate(data['attachments']) %} - {% if a.type == 'image_links' and i > 0 %} - {% set style = 'attachment-div-break' %} - {% else %} - {% set style = 'attachment-div' %} - {% endif %} + {% for group, attachments in data['attachments'].items() %} +
+ {% for i, a in enumerate(attachments) %} + {% if a.type == 'image_links' and i > 0 %} + {% set style = 'attachment-div-break' %} + {% else %} + {% set style = 'attachment-div' %} + {% endif %} - {% if a.dotted_break == True and i != data['attachments']|length - 1 %} - {% set dashed_line = 'dashed-line' %} - {% else %} - {% set dashed_line = '' %} - {% endif %} + {% if a.dotted_break == True and i != data['attachments']|length - 1 %} + {% set dashed_line = 'dashed-line' %} + {% else %} + {% set dashed_line = '' %} + {% endif %} -
- {% if a.name and a.name != '--no-brake--' %} -
- {{ heading_3(a.name) }} - {% elif a.name == None %} -
- {% endif %} - {% if a.type == "table" %} - {{ table(a.data) }} - {% elif a.type == "wide_table" %} - {{ wide_table(a.data) }} - {% elif a.type == "image_links" %} - {{ draw_images(a.data) }} - {% elif a.type == "image_links_horizontal" %} - {{ draw_images_horizontal(a.data) }} - {% elif a.type == "string" %} - {{ string_data(a.data) }} - {% elif a.type == "para_and_image" %} -
-
- {% for d in a.data['para'] %} - {% if d[0] == 'heading' %} - {{ string_data('h4' + d[1]) }} - {% elif d[0] == 'text' %} - {{ string_data(d[1]) }} - {% endif %} - {% endfor %} -
-
- {{ draw_images(a.data['image']) }} -
+
+ {% if a.name and a.name != '--no-brake--' %} +
+ {{ heading_3(a.name) }} + {% elif a.name == None %} +
+ {% endif %} + {% if a.type == "table" %} + {{ table(a.data) }} + {% elif a.type == "wide_table" %} + {{ wide_table(a.data) }} + {% elif a.type == "image_links" %} + {{ draw_images(a.data) }} + {% elif a.type == "image_links_horizontal" %} + {{ draw_images_horizontal(a.data) }} + {% elif a.type == "string" %} + {{ string_data(a.data) }} + {% elif a.type == "para_and_image" %} +
+
+ {% for d in a.data['para'] %} + {% if d[0] == 'heading' %} + {{ string_data('h4' + d[1]) }} + {% elif d[0] == 'text' %} + {{ string_data(d[1]) }} + {% endif %} + {% endfor %} +
+
+ {{ draw_images(a.data['image']) }} +
+
+ {% endif %}
- {% endif %} + {# {% if a.dotted_break == True %}#} + {#
#} + {#
#} + {#
#} + {# {% endif %}#} + {% endfor %}
-{# {% if a.dotted_break == True %}#} -{#
#} -{#
#} -{#
#} -{# {% endif %}#} {% endfor %} {% endmacro %} From f47b6515ba9a330612d4db8b98c6e758ea90a060 Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Thu, 20 Apr 2023 11:24:31 +0530 Subject: [PATCH 02/16] Allow an arbitrary key value to be added to the data for report ui --- sdnist/report/report_data.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sdnist/report/report_data.py b/sdnist/report/report_data.py index a5898ad..99b8432 100644 --- a/sdnist/report/report_data.py +++ b/sdnist/report/report_data.py @@ -165,9 +165,12 @@ class ReportUIData: feature_desc: Dict[str, any] = field(default_factory=dict, init=False) # list containing ScorePacket objects scores: List[ScorePacket] = field(default_factory=list, init=False) + key_val_pairs: Dict[str, any] = field(default_factory=dict, init=False) def add(self, score_packet: ScorePacket): self.scores.append(score_packet) + def add_key_val(self, key: str, val: any): + self.key_val_pairs[key] = val def add_data_description(self, dataset_type: DatasetType, @@ -205,6 +208,8 @@ def data(self) -> Dict[str, any]: d['comparisons'] = [] d['motivation'] = [] d['observations'] = [] + for k, v in self.key_val_pairs.items(): + d[k] = v for s_pkt in self.scores: if s_pkt.evaluation_type == EvaluationType.Utility: From c4926868363cf7850799462c13ba954ce3aa263f Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 12:54:28 +0530 Subject: [PATCH 03/16] Added paragraphs for unique exact matches metric --- .gitignore | 56 +++++++++++++++++++------------ sdnist/report/score/paragraphs.py | 42 ++++++++++++++++++++--- 2 files changed, 72 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index a62eef2..3c12061 100644 --- a/.gitignore +++ b/.gitignore @@ -1,21 +1,35 @@ -# include -!sdnist/ -!sdnist/test/ -!sdnist/test/report/ -!sdnist/test/report/data/ -!sdnist/test/report/data/na2019_1000.csv - -# ignore -report.json -**.pyc -**.DS_Store - -.ipynb_checkpoints -toy_synthetic_data/ -dask-worker-space/ -results/ -build/ -sdnist.egg-info/ - -**.pkl -build +# include +!sdnist/ +!sdnist/test/ +!sdnist/test/report/ +!sdnist/test/report/data/ +!sdnist/test/report/data/na2019_1000.csv + +# ignore +report.json +**.pyc +**.DS_Store + +.ipynb_checkpoints +toy_synthetic_data/ +dask-worker-space/ +results/ +build/ +sdnist.egg-info/ + +**.pkl +build + +**/.idea/ +**/crc_acceleration_bundle_1.0/ +**/crc_n/ +**/crc_notebooks/ +**/create_data/ +**/data/ +**/diverse_communities_data_excerpts/ +**/meta_reports/ +**/reports/ +**/states_puma_geojson/ +**/venv/ +**/workspace/ + diff --git a/sdnist/report/score/paragraphs.py b/sdnist/report/score/paragraphs.py index 8dd7467..3783223 100644 --- a/sdnist/report/score/paragraphs.py +++ b/sdnist/report/score/paragraphs.py @@ -43,7 +43,7 @@ "Joshua Snoke and Gillian Raab and Claire Bowen" \ ", all of whom have participated on the NIST Synthetic Data Challenges SME panels." -k_marg_break_para = "In the metrics above we’ve considered all of the data together; " \ +k_marg_break_para = "In the metrics above we've considered all of the data together; " \ "however we know that algorithms may behave differently on different " \ "subgroups in the population. Below we look in more detail at deidentification " \ "performance just in the worst performing PUMA, based on k-marginal score." @@ -78,7 +78,39 @@ "If the distribution is centered below 50% that means the deidentified records are very " \ "different from the target records, and the apparent matches are not real matches." 
-unique_exact_match_para = "This is a count of unique records in the target data that were exactly reproduced " \ - "in the deidentified data. Because these records were unique outliers in the " \ - "target data, and they still appear unchanged in the deidentified data, " \ - "they are potentially vulnerable to reidentification." \ No newline at end of file +unique_exact_match_para_1 = "Unique Exact Match (UEM) is a simple privacy metric that counts the " \ + "percentage of singleton records in the target that are also present in " \ + "the deidentified data; these uniquely identifiable individuals leaked " \ + "through the deidentification process." + +unique_exact_match_para_2 = "Below we also include an estimate of the feature space size. The feature " \ + "space is the set of all possible record values given the selected target " \ + "data and feature subset. For instance, if we had two features, Hat" \ + " [cap, bonnet] and Color [green, blue, purple], our feature space would " \ + "consist of 2 x 3 = 6 possible combinations (e.g. 'green cap', " \ + "'blue bonnet'). Note that feature spaces are based on the " \ + "feature set, not on what records actually exist in the data. " \ + "Purple bonnet is a possible combination in this feature space, " \ + "but it's likely no one in the hypothetical data owns a purple " \ + "bonnet (and the count of that record value would be 0)." + +unique_exact_match_para_3 = "As we add features to the feature set, we increase the size of the " \ + "feature space, but we don't change the actual number of records in " \ + "the data-- it's the same people, but now they're spread out more thinly " \ + "across a wider set of possible record values. Large feature spaces will " \ + "disperse populations very sparsely (most possible record values will have " \ + "count 0 or 1) and as a result the data will contain very many uniquely " \ + "identifiable records. Intuitively, once you know enough pieces of " \ + "information about someone, everyone becomes very distinct from everyone else. " \ + "This can pose a challenge for privacy. " + +unique_exact_match_para_4 = "The Target Data Properties below provides an estimate of the feature " \ + "space size (100 is used for continuous feature), " \ + "along with the portion of the records in the ground truth " \ + "target data that are unique (ie, they are the only person " \ + "with that record value, they have a count of 1). " \ + "The Deidentified Data Properties reports the percentage " \ + "of those uniquely identifiable individuals that are still " \ + "present in the deidentified data. " \ + "Because they are unique, real records, they are " \ + "potentially vulnerable to reidentification." 
\ No newline at end of file From 7a3f9441c1d5c362cc53b2017d921c792e67f74d Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 12:55:16 +0530 Subject: [PATCH 04/16] Add parameter to control opening web-browser after report generation --- sdnist/report/__main__.py | 6 ++++-- sdnist/report/generate.py | 11 ++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/sdnist/report/__main__.py b/sdnist/report/__main__.py index 66a1672..daa8953 100644 --- a/sdnist/report/__main__.py +++ b/sdnist/report/__main__.py @@ -19,13 +19,14 @@ from sdnist.load import DEFAULT_DATASET + def run(synthetic_filepath: Path, output_directory: Path = REPORTS_DIR, dataset_name: TestDatasetName = TestDatasetName.NONE, data_root: Path = Path(DEFAULT_DATASET), labels_dict: Optional[Dict] = None, download: bool = False, - test_mode: bool = False): + show_report: bool = True): outfile = Path(output_directory, 'report.json') ui_data = ReportUIData(output_directory=output_directory) report_data = ReportData(output_directory=output_directory) @@ -60,10 +61,11 @@ def run(synthetic_filepath: Path, ui_data = json.load(f) log.end_msg() # Generate Report - generate(ui_data, output_directory, test_mode) + generate(ui_data, output_directory, show_report) log.msg(f'Reports available at path: {output_directory}', level=0, timed=False, msg_type='important') + def setup(): bundled_datasets = {"MA": TestDatasetName.ma2019, "TX": TestDatasetName.tx2019, diff --git a/sdnist/report/generate.py b/sdnist/report/generate.py index 1485ed9..eef381d 100644 --- a/sdnist/report/generate.py +++ b/sdnist/report/generate.py @@ -79,11 +79,16 @@ def generate(report_data: Dict[str, any], output_directory_path: Path, - test_mode: bool = False): + show_report: bool = True): out_dir = output_directory_path data = report_data + def debug(text): + print(text) + return '' + env = Environment(loader=FileSystemLoader(Path(FILE_DIR, 'resources/templates'))) + env.filters['debug'] = debug env.globals["enumerate"] = enumerate main_template = env.get_template('main.jinja2') @@ -96,7 +101,7 @@ def generate(report_data: Dict[str, any], with open(out_path, 'w') as f: f.write(out) - if not test_mode: + if show_report: webbrowser.open(f"file://{out_path}", new=True) # html_to_pdf(out_path, out_pdf_path) @@ -106,7 +111,7 @@ def generate(report_data: Dict[str, any], p_p = Path(FILE_DIR, '../../reports/TX_ACS_EXCERPT_2019_08-02-2022T15.14.12/report.pdf') p_o = Path(FILE_DIR, '../../reports/TX_ACS_EXCERPT_2019_08-02-2022T15.14.12/report0.pdf') - html_to_pdf_2(h_p, p_p) + # html_to_pdf_2(h_p, p_p) From 772ddd5f669a1822588bda5e8a7efa099a8c73e9 Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 12:56:22 +0530 Subject: [PATCH 05/16] Update report paragraphs --- sdnist/report/score/privacy.py | 18 +++++++++++---- sdnist/report/score/utility/inconsistency.py | 23 ++++++++++++++++++++ sdnist/report/score/utility/pca.py | 6 ++--- 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/sdnist/report/score/privacy.py b/sdnist/report/score/privacy.py index 889b4f1..69279f6 100644 --- a/sdnist/report/score/privacy.py +++ b/sdnist/report/score/privacy.py @@ -22,13 +22,23 @@ def privacy_score(dataset: Dataset, ui_data: ReportUIData, report_data, log: Sim unique_target_records, perc_unique_target_records = \ unique_exact_matches(ds.c_target_data, ds.c_synthetic_data) perc_t_rec_matched = perc_t_rec_matched - uem_para_a = Attachment(name=None, - _data=unique_exact_match_para, + uem_para1_a = Attachment(name=None, + 
_data=unique_exact_match_para_1, + _type=AttachmentType.String) + uem_para2_a = Attachment(name=None, + _data=unique_exact_match_para_2, + _type=AttachmentType.String) + uem_para3_a = Attachment(name=None, + _data=unique_exact_match_para_3, + _type=AttachmentType.String) + uem_para4_a = Attachment(name=None, + _data=unique_exact_match_para_4, _type=AttachmentType.String) + feat_space_str = "{:0.3e}".format(ds.feature_space) target_matched_a = Attachment(name="Target Data Properties", _data=f"Feature space size (possible combinations): " - f"-Highlight-{f'{dataset.feature_space:,}'}-Highlight-
" + f"-Highlight-{feat_space_str}-Highlight-
" f"Number of unique records in Target Data: " f"-Highlight-{unique_target_records} " f"({perc_unique_target_records}%-Highlight-)", @@ -41,7 +51,7 @@ def privacy_score(dataset: Dataset, ui_data: ReportUIData, report_data, log: Sim _type=AttachmentType.String) r_ui_d.add(PrivacyScorePacket("Unique Exact Matches", None, - [uem_para_a, + [uem_para1_a, uem_para2_a, uem_para3_a, uem_para4_a, target_matched_a, deid_matched_a])) rd.add('unique_exact_matches', { diff --git a/sdnist/report/score/utility/inconsistency.py b/sdnist/report/score/utility/inconsistency.py index 612b221..1f2eed1 100644 --- a/sdnist/report/score/utility/inconsistency.py +++ b/sdnist/report/score/utility/inconsistency.py @@ -9,6 +9,22 @@ from sdnist.utils import * +ic_paragraphs = [ + "In real world tabular data, it's common for record features to have " + "some deterministic, publicly known dependencies on each other: " + "knowing someone's AGEP= 3 necessarily tells you something about " + "their marital status, income and educational attainment. " + "Different deidentification methods may be better or worse at " + "automatically preserving these relationships. " + "When they fail (ex: producing toddlers with PhDs) we say " + "those records contain \"inconsistencies\". Our consistency " + "check metric below is not exhaustive, we don't catch everything " + "the way a production-grade system should, but this will give you " + "a sense of how consistency is preserved for age, work and household " + "features. Note that different deidentification approaches may do " + "better or worse with different types of inconsistencies." +] + class InconsistenciesReport: """ @@ -42,6 +58,13 @@ def _create(self): # add inconsistencies stats and data to json report data self.rd.add('inconsistencies', self.ic.report_data) + # create report attachments + for p in ic_paragraphs: + para_a = Attachment(name=None, + _data=p, + _type=AttachmentType.String) + self.attachments.append(para_a) + # --------- Add inconsistencies stats and dat to ui report data # UI attachment for summary of inconsistencies found in the deidentified data a_sum_h = Attachment(name='Summary', diff --git a/sdnist/report/score/utility/pca.py b/sdnist/report/score/utility/pca.py index e349c89..4f5eb38 100644 --- a/sdnist/report/score/utility/pca.py +++ b/sdnist/report/score/utility/pca.py @@ -19,10 +19,10 @@ "dimensions of the original feature space). Descriptions " \ "of these new five dimensions (components) are " \ "given in the components table; the components will change " \ - "depending on which target data set you’re using. " \ + "depending on which target data set you're using. " \ "Five dimensions are better than 22, but we actually want to " \ "get down to two dimensions so we can plot the data " \ - "on simple (x,y) axes– the plots below show the data " \ + "on simple (x,y) axes the plots below show the data " \ "across each possible pair combination of our five components. " \ "You can compare how the shapes change between the target data " \ "and the deidentified data, and consider what that might mean in light " \ @@ -40,7 +40,7 @@ pca_highlight_para = "The queries below explore the PCA metric results in more detail " \ "by zooming in on a single component-pair panel and highlighting " \ - "all individuals that satisfy a given constraint (such as MSP = “N”, " \ + "all individuals that satisfy a given constraint (such as MSP='N', " \ "individuals who are unmarried because they are children). 
" \ "If the deidentified data preserves the structure and feature " \ "correlations of the target data, the highlighted areas should have " \ From 75c81e2098102757d70936c0fa710ae31ed49c40 Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 12:57:00 +0530 Subject: [PATCH 06/16] Add 1% and 5% subsampling error to kmarginal metric --- sdnist/report/score/utility/__init__.py | 30 ++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sdnist/report/score/utility/__init__.py b/sdnist/report/score/utility/__init__.py index 7079341..149362c 100644 --- a/sdnist/report/score/utility/__init__.py +++ b/sdnist/report/score/utility/__init__.py @@ -125,12 +125,13 @@ def worst_score_breakdown(worst_scores: List, up = UnivariatePlots(s, t, ds, out_dir, ds.challenge, worst_univariates_to_display=3) - u_feature_data = up.save() - k_marg_break_rd[f'worst_{len(wpf)}_puma_univariate'] = up.report_data() + u_feature_data = up.save(level=3) + k_marg_break_rd[f'worst_{len(wpf)}_puma_univariate'] = up.report_data(level=3) k_marg_break_rd[f'worst_{len(wpf)}_puma_k_marginal_scores'] = \ relative_path(save_data_frame(wsh, out_dir, - f'worst_{len(wpf)}_puma_k_marginal_scores')) + f'worst_{len(wpf)}_puma_k_marginal_scores'), + level=2) u_as = [] u_as.append(Attachment(name=None, _data=f"h3Univariate Distribution of Worst " @@ -189,7 +190,7 @@ def worst_score_breakdown(worst_scores: List, corr_features) pcd.compute() pcp = PearsonCorrelationPlot(pcd.pp_corr_diff, out_dir) - pcp_saved_file_paths = pcp.save() + pcp_saved_file_paths = pcp.save(path_level=3) k_marg_break_rd['correlation_difference'] = { "pearson_correlation_difference": pcp.report_data } @@ -246,16 +247,17 @@ def create_subsample(frac: float): # mapping of sub sample frac to k-marginal score of fraction ssample_score = dict() # subsample scores dictionary - # find k-marginal of 10%, 20% ... 90% of sub-sample of target data - for i in range(1, 11): + # find k-marginal of 1%, 5%, 10%, 20% ... 
90% of sub-sample of target data + sample_sizes = [1, 5] + [i*10 for i in range(1, 10)] + for i in sample_sizes: # using subsample of target data as synthetic data - s_sd = create_subsample(frac=i * 0.1) + s_sd = create_subsample(frac=i * 0.01) s_kmarg = k_marginal_cls(dataset.d_target_data, s_sd, group_features) s_kmarg.compute_score() s_score = int(s_kmarg.score) - ssample_score[i * 0.1] = s_score + ssample_score[i * 0.01] = s_score puma_scores = None if len(group_features): @@ -344,6 +346,7 @@ def min_index(data_list: List[float]): # add k-marginal subsample and deidentified data scores to json report k_marg_synop_rd['subsample_error_comparison'] = \ relative_path(save_data_frame(sedf_df, k_marg_synopsys_path, 'subsample_error_comparison')) + k_marg_synop_rd['sub_sampling_equivalent'] = int(min_frac * 100) k_marg_synop_rd['k_marginal_score'] = k_marginal_score report_data.add('k_marginal', { @@ -420,10 +423,7 @@ def min_index(data_list: List[float]): bs_a = Attachment(name=f"{len(best_scores)} Best Performing " + '-'.join(group_features), _data=best_scores) - report_data.add('k_marginal', { - "k_marginal_breakdown": k_marg_break_rd - }) - + report_data.add('worst_PUMA_breakdown', k_marg_break_rd) attachments.extend([as_para_a, as_a]) metric_attachments = [k_marg_break_para_a, ws_para_a, ws_a] @@ -436,7 +436,7 @@ def min_index(data_list: List[float]): metric_attachments.append(gp_a) metric_attachments.extend(worst_break_down) - kmarg_det_pkt = UtilityScorePacket('K-Marginal Score Breakdown', + kmarg_det_pkt = UtilityScorePacket('Worst Performing PUMAs Breakdown', None, metric_attachments) @@ -609,10 +609,10 @@ def utility_score(dataset: Dataset, ui_data: ReportUIData, report_data: ReportDa p_dist_plot = PropensityDistribution(s.prob_dist, r_ui_d.output_directory) # pps = PropensityPairPlot(s.std_two_way_scores, rd.output_directory) # - prop_rep_data = {**s.report_data, **p_dist_plot.report_data} - rd.add('propensity mean square error', prop_rep_data) p_dist_paths = p_dist_plot.save() + prop_rep_data = {**s.report_data, **p_dist_plot.report_data} + rd.add('propensity mean square error', prop_rep_data) # pps_paths = pps.save('spmse', # 'Two-Way Standardized Propensity Mean Square Error') rel_pd_path = ["/".join(list(p.parts)[-2:]) From e0ba6c0b9a9aa149939a26fa41d6e95180a0362d Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 12:58:44 +0530 Subject: [PATCH 07/16] Add separator between privacy metrics in report html --- sdnist/report/resources/templates/main.jinja2 | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sdnist/report/resources/templates/main.jinja2 b/sdnist/report/resources/templates/main.jinja2 index e409d1d..da7a417 100644 --- a/sdnist/report/resources/templates/main.jinja2 +++ b/sdnist/report/resources/templates/main.jinja2 @@ -492,8 +492,14 @@
{{ section_title('Privacy Evaluation') }} - {% for item in data['privacy'] %} + {% for i, item in enumerate(data['privacy']) %}
+ {% if i > 0 %} +
+
+
+
+ {% endif %} {{ evaluation(item) }}
{% endfor %} From a35e3622ec84c908fba9a45c262b95f300b697a1 Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 12:59:37 +0530 Subject: [PATCH 08/16] Fix feature space size computation --- sdnist/report/dataset/__init__.py | 54 +++++++++++++++++-------------- sdnist/report/dataset/binning.py | 1 + sdnist/report/dataset/validate.py | 1 + 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/sdnist/report/dataset/__init__.py b/sdnist/report/dataset/__init__.py index a948b0a..b990d32 100644 --- a/sdnist/report/dataset/__init__.py +++ b/sdnist/report/dataset/__init__.py @@ -54,30 +54,24 @@ def unavailable_features(config: Dict, synthetic_data: pd.DataFrame): return cnf -def compute_feature_space(data_dict: Dict, - features: List[str]): - # list of features and their value length - f_list = [] - for f in features: - if "values" not in data_dict[f]: - vals = [0] * 269 # in case of INDP feature - else: - vals = data_dict[f]["values"] - if "min" in vals and f != 'AGEP': - continue - if f == 'AGEP': - f_list.append([f, 100]) - else: - f_list.append([f, len(vals)]) - f_df = pd.DataFrame(f_list, columns=['feature', 'len']) - f_df = f_df.sort_values(by='len') +def feature_space_size(target_df: pd.DataFrame, data_dict: Dict): + size = 1 - # get product of all feature lengths - n_features = f_df['len'].astype(object).product() + for col in target_df.columns: + if col in ['PINCP', 'POVPIP', 'WGTP', 'PWGTP', 'AGEP']: + size = size * 100 + elif col in ['SEX', 'MSP', 'HISP', 'RAC1P', 'HOUSING_TYPE', 'OWN_RENT', + 'INDP_CAT', 'EDU', 'PINCP_DECILE', 'DVET', 'DREM', 'DPHY', 'DEYE', + 'DEAR']: + size = size * len(data_dict[col]['values']) + elif col in ['PUMA', 'DENSITY']: + size = size * len(target_df['PUMA'].unique()) + elif col in ['NOC', 'NPF', 'INDP']: + size = size * len(target_df[col].unique()) + + return size - # return number of features and sorted list of features - return n_features @dataclass class Dataset: @@ -159,12 +153,12 @@ def __post_init__(self): self.features = list(set(self.features).difference(set(ind_features))) self.features = list(set(self.features).intersection(list(common_columns))) - self.feature_space = compute_feature_space(self.data_dict, self.features) - # raw subset data self.target_data = self.target_data[self.features] self.synthetic_data = self.synthetic_data[self.features] + self.feature_space = feature_space_size(self.target_data, self.data_dict) + # validation and clean data self.c_synthetic_data, self.validation_log = \ validate(self.synthetic_data, self.data_dict, self.features, self.log) @@ -176,6 +170,12 @@ def __post_init__(self): self.synthetic_data = self.synthetic_data[self.features] self.target_data = self.target_data[self.features] + # for f in self.target_data.columns: + # if f not in ['PINCP', 'INDP', 'PWGTP', 'WGTP', 'POVPIP', 'DENSITY']: + # print('T', f, self.target_data[f].unique().tolist()) + # print('S', f, self.synthetic_data[f].unique().tolist()) + # print() + # sort columns in the data self.target_data = self.target_data.reindex(sorted(self.target_data.columns), axis=1) self.synthetic_data = self.synthetic_data.reindex(sorted(self.target_data.columns), axis=1) @@ -299,12 +299,16 @@ def data_description(dataset: Dataset, f_desc = dataset.data_dict[feat]['description'] feat_title = f'{feat}: {f_desc}' if 'link' in dataset.data_dict[feat] and feat == 'INDP': - data = f"" \ + data_1 = f"" \ f"See codes in ACS data dictionary. 
" \ f"Find codes by searching the string: {feat}, in " \ f"the ACS data dictionary" + data_2 = dataset.data_dict[feat]['details'] dd_as.append(Attachment(name=feat_title, - _data=data, + _data=data_1, + _type=AttachmentType.String)) + dd_as.append(Attachment(name=None, + _data=data_2, _type=AttachmentType.String)) elif 'values' in dataset.data_dict[feat]: diff --git a/sdnist/report/dataset/binning.py b/sdnist/report/dataset/binning.py index b543c79..801dde7 100644 --- a/sdnist/report/dataset/binning.py +++ b/sdnist/report/dataset/binning.py @@ -3,6 +3,7 @@ import numpy as np import math + def percentile_rank_target(data: pd.DataFrame, features: List[str]): data = data.copy() for c in features: diff --git a/sdnist/report/dataset/validate.py b/sdnist/report/dataset/validate.py index 526f45c..8048a16 100644 --- a/sdnist/report/dataset/validate.py +++ b/sdnist/report/dataset/validate.py @@ -3,6 +3,7 @@ from sdnist.utils import SimpleLogger + def validate(synth_data: pd.DataFrame, data_dict: Dict, features: List[str], From 0af87bab4b3859ea32f6b8a0b188eecd785ad93c Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 13:00:20 +0530 Subject: [PATCH 09/16] Update propensity plot figure size and colors --- sdnist/report/plots/propensity.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sdnist/report/plots/propensity.py b/sdnist/report/plots/propensity.py index d56fc89..76f4467 100644 --- a/sdnist/report/plots/propensity.py +++ b/sdnist/report/plots/propensity.py @@ -32,7 +32,13 @@ def save(self, title: str = 'Distribution of data samples over 100 propensity bins') \ -> List[Path]: file_path = Path(self.o_path, f'{filename}.jpg') - ax = self.p_dist.plot(title=title, xlabel="100 Propensity Bins", ylabel='Record Counts') + ax = self.p_dist.plot(title=title, + xlabel="100 Propensity Bins", + ylabel='Record Counts', + color=['mediumblue', 'limegreen'], + alpha=0.8, + lw=2, + figsize=(12, 6)) fig = ax.get_figure() fig.savefig(file_path) self.report_data['plot'] = relative_path(file_path) From cc133a0721517a68bf760484d5ffe2d730a30996 Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 13:00:43 +0530 Subject: [PATCH 10/16] Update relative paths to the report data --- sdnist/metrics/inconsistency.py | 6 ++-- sdnist/metrics/regression.py | 7 ++-- sdnist/report/plots/pearson_correlation.py | 7 ++-- sdnist/report/plots/univariate.py | 40 ++++++++++++---------- 4 files changed, 33 insertions(+), 27 deletions(-) diff --git a/sdnist/metrics/inconsistency.py b/sdnist/metrics/inconsistency.py index 9320ed3..3f0e25f 100644 --- a/sdnist/metrics/inconsistency.py +++ b/sdnist/metrics/inconsistency.py @@ -268,7 +268,7 @@ def compute(self): 'inconsistency_features': ic_data[2], 'inconsistency_violations': int(ic_data[3].split(' ')[0]), 'inconsistent_data_indexes': ic_dict[i[NAME]], - 'inconsistent_record_example': relative_path(row_path)} + 'inconsistent_record_example': relative_path(row_path, level=3)} ) # ------- Compute work-based Inconsistencies------------ @@ -298,7 +298,7 @@ def compute(self): 'inconsistency_features': ic_data[2], 'inconsistency_violations': int(ic_data[3].split(' ')[0]), 'inconsistent_data_indexes': ic_dict[i[NAME]], - 'inconsistent_record_example': relative_path(row_path)} + 'inconsistent_record_example': relative_path(row_path, level=3)} ) # ------- Compute housing-based Inconsistencies------------ @@ -328,7 +328,7 @@ def compute(self): 'inconsistency_features': ic_data[2], 'inconsistency_violations': int(ic_data[3].split(' ')[0]), 
'inconsistent_data_indexes': ic_dict[i[NAME]], - 'inconsistent_record_example': relative_path(row_path)} + 'inconsistent_record_example': relative_path(row_path, level=3)} ) # -------- Compute overall stats--------------------- diff --git a/sdnist/metrics/regression.py b/sdnist/metrics/regression.py index 395f204..2121210 100644 --- a/sdnist/metrics/regression.py +++ b/sdnist/metrics/regression.py @@ -216,11 +216,12 @@ def plots(self) -> List[Path]: self.report_data = { "target_counts": relative_path(save_data_frame(self.tcm, self.o_path, - 'target_counts')), + 'target_counts'), level=3), "target_deidentified_counts_difference": relative_path(save_data_frame(self.diff, self.o_path, - "target_deidentified_counts_difference")), - "target_deidentified_difference_plot": relative_path(file_path), + "target_deidentified_counts_difference"), + level=3), + "target_deidentified_difference_plot": relative_path(file_path, level=3), "target_regression_slope_and_intercept": (self.t_slope, self.t_intercept), "deidentified_regression_slope_and_intercept": (self.s_slope, self.s_intercept) } diff --git a/sdnist/report/plots/pearson_correlation.py b/sdnist/report/plots/pearson_correlation.py index 21b66ed..531c2eb 100644 --- a/sdnist/report/plots/pearson_correlation.py +++ b/sdnist/report/plots/pearson_correlation.py @@ -24,14 +24,15 @@ def _setup(self): if not self.o_path.exists(): os.mkdir(self.o_path) - def save(self) -> List[Path]: + def save(self, path_level=2) -> List[Path]: file_path = Path(self.o_path, 'pearson_corr_diff.jpg') self.report_data = { "correlation_difference": relative_path(save_data_frame(self.cd, self.o_path, - 'correlation_difference')), - "plot": relative_path(file_path) + 'correlation_difference'), + level=path_level), + "plot": relative_path(file_path, level=path_level) } cd = self.cd cd = cd.abs() diff --git a/sdnist/report/plots/univariate.py b/sdnist/report/plots/univariate.py index 9f83088..50efc6f 100644 --- a/sdnist/report/plots/univariate.py +++ b/sdnist/report/plots/univariate.py @@ -80,13 +80,14 @@ def _setup(self): raise Exception(f'Path {self.o_dir} does not exist. 
Cannot save plots') os.mkdir(self.out_path) - def report_data(self): + def report_data(self, level=2): return {"divergence": relative_path(save_data_frame(self.div_data, self.out_path, - 'divergence')), + 'divergence'), + level=level), "counts": self.uni_counts} - def save(self) -> Dict: + def save(self, level=2) -> Dict: if self.challenge == CENSUS: ignore_features = ['YEAR'] elif self.challenge == TAXI: @@ -106,7 +107,8 @@ def save(self) -> Dict: self.syn, self.tar, div_df[FEATURE].tolist(), - self.out_path) + self.out_path, + level=level) return self.feat_data def save_distribution_plot(self, @@ -114,7 +116,8 @@ def save_distribution_plot(self, synthetic: pd.DataFrame, target: pd.DataFrame, features: List, - output_directory: Path): + output_directory: Path, + level=2): ds = dataset o_path = output_directory bar_width = 0.4 @@ -138,26 +141,24 @@ def save_distribution_plot(self, st_df = o_tar[o_tar[INDP_CAT].isin([s])].copy() st_df.loc[:, f] = pd.to_numeric(st_df[f]).astype(int) ss_df = o_syn[o_syn[INDP_CAT].isin([int(s)])] - # print(s, type(s)) - # print(o_syn[INDP_CAT].unique().tolist()) + unique_ind_codes = st_df[f].unique().tolist() set(unique_ind_codes).update(set(ss_df[f].unique().tolist())) unique_ind_codes = list(unique_ind_codes) val_df = pd.DataFrame(unique_ind_codes, columns=[f]) + val_df[f] = val_df.astype(str) t_counts_df = st_df.groupby(by=f)[f].size().reset_index(name='count_target') s_counts_df = ss_df.groupby(by=f)[f].size().reset_index(name='count_deidentified') - # print(s) - # print(s_counts_df) - # print(ss_df[f].unique().tolist()) - # print(ss_df.shape) + t_counts_df[f] = t_counts_df[f].astype(str) + s_counts_df[f] = s_counts_df[f].astype(str) + merged = pd.merge(left=val_df, right=t_counts_df, on=f, how='left')\ .fillna(0) merged = pd.merge(left=merged, right=s_counts_df, on=f, how='left')\ .fillna(0) div = l1(pk=merged['count_target'], qk=merged['count_deidentified']) - # print(s) - # print(merged[['count_target', 'count_deidentified']]) + selected.append([merged, div, s]) selected = sorted(selected, key=lambda l: l[1], reverse=True) @@ -192,8 +193,9 @@ def save_distribution_plot(self, "divergence": div, "counts": relative_path(save_data_frame(merged, o_path, - f"Industry Category {s}")), - "plot": relative_path(file_path) + f"Industry Category {s}"), + level=level), + "plot": relative_path(file_path, level=level) } # if j < 2: saved_file_paths.append(file_path) @@ -225,8 +227,9 @@ def save_distribution_plot(self, self.uni_counts[f] = { "counts": relative_path(save_data_frame(c_sort_merged.copy(), o_path, - f'{f}_counts')), - "plot": relative_path(file_path) + f'{f}_counts'), + level=level), + "plot": relative_path(file_path, level) } if self.worst_univariates_to_display is None \ @@ -267,13 +270,14 @@ def save_distribution_plot(self, vals = updated_vals vals = [str(v) for v in vals] + if "-1" in vals: idx = vals.index("-1") vals[idx] = "N" if f == 'PUMA': f_val_dict = {i: v for i, v in enumerate(ds.schema[f]['values'])} - vals = [f_val_dict[int(v)] for v in vals] + vals = [f_val_dict[int(v)] if v != 'N' else 'N' for v in vals] plt.gca().set_xticks(x_axis, vals) plt.legend(loc='upper right') From adfdeb3e59b6d2c8a2588738fc454e4e143e1ddd Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 13:01:55 +0530 Subject: [PATCH 11/16] Fix issue with PCA plot scaling --- sdnist/metrics/pca.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/sdnist/metrics/pca.py b/sdnist/metrics/pca.py index fd38d33..2dcf53d 100644 
--- a/sdnist/metrics/pca.py +++ b/sdnist/metrics/pca.py @@ -48,11 +48,10 @@ def compute_pca(self): t_pca = PCA(n_components=cc) tdf_v = self.tar.values - sdf = self.syn.apply(lambda x: x - x.mean()) - sdf_v = sdf.values - - tdf_v = StandardScaler().fit_transform(tdf_v) - sdf_v = StandardScaler().fit_transform(sdf_v) + sdf_v = self.syn.values + scaler = StandardScaler().fit(tdf_v) + sdf_v = scaler.transform(sdf_v) + tdf_v = scaler.transform(tdf_v) t_pc = t_pca.fit_transform(tdf_v) @@ -62,7 +61,7 @@ def compute_pca(self): self.t_comp_data = [] for i, comp in enumerate(t_pca.components_): qc = [[n, round(v, 2)] for n, v in zip(self.tar.columns.tolist(), comp)] - qc = sorted(qc, key=lambda x: x[1], reverse=True) + qc = sorted(qc, key=lambda x: abs(x[1]), reverse=True) qc = [f'{v[0]} ({v[1]})' for v in qc] self.t_comp_data.append({"Principal Component": f"PC-{i}", "Features Contribution: " @@ -88,7 +87,9 @@ def compute_pca(self): for c in self.t_pdf.columns: self.t_pdf_s[c] = min_max_scaling(self.t_pdf[c]) for c in self.s_pdf.columns: - self.s_pdf_s[c] = min_max_scaling(self.s_pdf[c]) + self.s_pdf_s[c] = min_max_scaling(self.s_pdf[c], + self.t_pdf[c].min(), + self.t_pdf[c].max()) def plot(self, output_directory: Path) -> Dict[str, any]: s = time.time() @@ -152,8 +153,13 @@ def plot(self, output_directory: Path) -> Dict[str, any]: return plot_paths -def min_max_scaling(series): - return (series - series.min()) / (series.max() - series.min()) +def min_max_scaling(series, min_val=None, max_val=None): + if min_val is None: + min_val = series.min() + if max_val is None: + max_val = series.max() + + return (series - min_val) / (max_val - min_val) def plot_all_components_pairs(title: str, From 30ef49487db489598cdeb4191a53c7ebf0cb1a28 Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 13:02:21 +0530 Subject: [PATCH 12/16] Fix unique exact matches metric --- sdnist/metrics/unique_exact_matches.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sdnist/metrics/unique_exact_matches.py b/sdnist/metrics/unique_exact_matches.py index 6bfa20b..ef22520 100644 --- a/sdnist/metrics/unique_exact_matches.py +++ b/sdnist/metrics/unique_exact_matches.py @@ -6,6 +6,7 @@ from sdnist.report.dataset import Dataset import sdnist.utils as u + def unique_exact_matches(target_data: pd.DataFrame, deidentified_data: pd.DataFrame): td, dd = target_data, deidentified_data cols = td.columns.tolist() @@ -18,8 +19,7 @@ def unique_exact_matches(target_data: pd.DataFrame, deidentified_data: pd.DataFr perc_t_unique_records = round(t_unique_records/td.shape[0] * 100, 2) # Keep only one copy of each duplicate row in the deidentified data - # and also save the count of each row in the deidentified data - dd= dd.drop_duplicates(subset=cols) + dd = dd.drop_duplicates(subset=cols) merged = u_td.merge(dd, how='inner', on=cols) @@ -27,12 +27,13 @@ def unique_exact_matches(target_data: pd.DataFrame, deidentified_data: pd.DataFr t_rec_matched = merged.shape[0] # percent of unique target records that exactly match in deidentified data - perc_t_rec_matched = t_rec_matched/td.shape[0] * 100 + perc_t_rec_matched = t_rec_matched/t_unique_records * 100 perc_t_rec_matched = round(perc_t_rec_matched, 2) return t_rec_matched, perc_t_rec_matched, t_unique_records, perc_t_unique_records + if __name__ == '__main__': THIS_DIR = Path(__file__).parent s_path = Path(THIS_DIR, '..', '..', From cc69a30a25df3b4e2d97ca7d686633fa78897b1b Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 13:04:58 +0530 
Subject: [PATCH 13/16] Update data_dictionary.json to include detail for INDP --- .../data_dictionary.json | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nist diverse communities data excerpts/data_dictionary.json b/nist diverse communities data excerpts/data_dictionary.json index fe1f111..cdf5979 100644 --- a/nist diverse communities data excerpts/data_dictionary.json +++ b/nist diverse communities data excerpts/data_dictionary.json @@ -127,12 +127,13 @@ }, "INDP": { "description": "Industry codes", + "details": "There are a total of 271 possible codes for INDP, 269 of these codes appear in the Diverse Community Data Excerpts (233 in MA, 264 in Texas and National)", "link": "https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMS_Data_Dictionary_2019.pdf" }, "INDP_CAT": { "description": "Industry categories", "values": { - "N": "N/A (less than 16 years old/NILF who last worked more than 5 years ago or never worked)", + "N": "N/A (less than 16 years old, or last worked more than 5 years ago, or never worked)", "0": "AGR: Agriculture, Forestry, Fishing and Hunting", "1": "EXT: Mining, Quarrying, and Oil and Gas Extraction", "2": "UTL: Utilities", @@ -160,7 +161,7 @@ "N": "N/A (less than 3 years old)", "1": "No schooling completed", "2": "Nursery school, Preschool, or Kindergarten", - "3": "Grade 4 to grade 8", + "3": "Grade 1 to grade 8", "4": "Grade 9 to grade 12, no diploma", "5": "High School diploma", "6": "GED", @@ -181,7 +182,7 @@ } }, "PINCP_DECILE": { - "description": "Person's total income in 10-percentile bins", + "description": "Person's total income rank (with respect to their state) discretized into 10% bins.", "values": { "N": "N/A (less than 15 years old", "9": "90th percentile", From ea669b39011ac95f3fb28e3aeb1828b2e71c6664 Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 18:27:29 +0530 Subject: [PATCH 14/16] update version to 2.3 --- CITATION.cff | 2 +- README.md | 39 +++++++++++---------------------------- sdnist/version.py | 2 +- 3 files changed, 13 insertions(+), 30 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index ba96fc9..fe56622 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -4,7 +4,7 @@ abstract: "SDNist provides benchmark data and a suite of both machine- and human message: >- If you use this repository or present information about it publicly, please cite us. type: software -version: 2.2 +version: 2.3 doi: 10.18434/mds2-2943 date-released: 2023-4-14 contact: diff --git a/README.md b/README.md index 1516717..137589b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# SDNist v2.2: Deidentified Data Report Tool +# SDNist v2.3: Deidentified Data Report Tool ## [SDNist is the offical software package for engaging in the NIST Collaborative Research Cycle](https://pages.nist.gov/privacy_collaborative_research_cycle) @@ -37,7 +37,7 @@ Setting Up the SDNIST Report Tool ### Brief Setup Instructions -SDNist requires Python version 3.7 or greater. If you have installed a previous version of the SDNist library, we recommend installing v2.2 in a virtual environment. v2.2 can be installed via [Release 2.2](https://github.com/usnistgov/SDNist/releases/tag/v2.2.0) or via the Pypi server: `pip install sdnist` or, if you already have a version installed, `pip install --upgrade sdnist`. +SDNist requires Python version 3.7 or greater. If you have installed a previous version of the SDNist library, we recommend installing v2.3 in a virtual environment. 
v2.3 can be installed via [Release 2.3](https://github.com/usnistgov/SDNist/releases/tag/v2.3.0) or via the Pypi server: `pip install sdnist` or, if you already have a version installed, `pip install --upgrade sdnist`. The NIST Diverse Community Exceprt data will download on the fly. @@ -61,13 +61,13 @@ The NIST Diverse Community Exceprt data will download on the fly. ``` -4. In the already-opened terminal or powershell window, execute the following command to create a new Python environment. The sdnist library will be installed in this newly created Python environment: +4. In the already-opened terminal or powershell window, execute the following command to create a new Python environment. The sdnist library will be installed in this newly created Python environment: ``` c:\\sdnist-project> python -m venv venv ``` -6. The new Python environment will be created in the sdnist-project directory, and the files of the environment should be in the venv directory. To check whether a new Python environment was created successfully, use the following command to list all directories in the sdnist-project directory, and make sure the venv directory exists. +5. The new Python environment will be created in the sdnist-project directory, and the files of the environment should be in the venv directory. To check whether a new Python environment was created successfully, use the following command to list all directories in the sdnist-project directory, and make sure the venv directory exists. **MAC OS/Linux:** ``` @@ -78,7 +78,7 @@ The NIST Diverse Community Exceprt data will download on the fly. c:\\sdnist-project> dir ``` -7. Now activate the Python environment and install the sdnist library into it. +6. Now activate the Python environment and install the sdnist library into it. **MAC OS/Linux:** ``` @@ -107,27 +107,12 @@ The NIST Diverse Community Exceprt data will download on the fly. Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine ``` - -8. Per step 5 above, the sdnist-2.2.0-py3-none-any.whl file should already be present in the sdnist-project directory. Check whether that is true by listing the files in the sdnist-project directory. - - **MAC OS/Linux:** - ``` - (venv) sdnist-project> ls - ``` - **Windows:** - ``` - (venv) c:\\sdnist-project> dir - ``` - The sdnist-2.2.0-py3-none-any.whl file should be in the list printed by the above command; otherwise, follow steps 4 and 5 again to download the .whl file. - - -9. Install sdnist Python library: +7. Install sdnist Python library: ``` (venv) c:\\sdnist-project> pip install sdnist ``` - -10. Installation is successful if executing the following command outputs a help menu for the sdnist.report package: +8. Installation is successful if executing the following command outputs a help menu for the sdnist.report package: ``` (venv) c:\\sdnist-project> python -m sdnist.report -h ``` @@ -162,8 +147,7 @@ The NIST Diverse Community Exceprt data will download on the fly. NATIONAL national2019 ``` - -11. These instructions install sdnist into a virtual environment. The virtual environment must be activated (step 9) each time a new terminal window is used with sdnist. +9. These instructions install sdnist into a virtual environment. The virtual environment must be activated (step 9) each time a new terminal window is used with sdnist. Generate Data Quality Report @@ -260,7 +244,7 @@ Setup Data for SDNIST Report Tool 4. 
You can download the toy deidentified datasets from Github [Sdnist Toy Deidentified Dataset](https://github.com/usnistgov/SDNist/releases/download/v2.1.1/toy_deidentified_data.zip). Unzip the downloaded file, and move the unzipped toy_deidentified_dataset directory to the sdnist-project directory. -5. Each toy deidentified dataset file is generated using the [Diverse Communities Data Excerpts](https://github.com/usnistgov/SDNist/releases/download/v2.2.0/diverse_communities_data_excerpts.zip). The syn_ma.csv, syn_tx.csv, and syn_national.csv deidentified dataset files are created from target datasets MA (ma2019.csv), TX (tx2019.csv), and NATIONAL(national2019.csv), respectively. You can use one of the toy deidentified dataset files for testing whether the sdnist.report package is installed correctly on your system. +5. Each toy deidentified dataset file is generated using the [Diverse Communities Data Excerpts](https://github.com/usnistgov/SDNist/releases/download/v2.3.0/diverse_communities_data_excerpts.zip). The syn_ma.csv, syn_tx.csv, and syn_national.csv deidentified dataset files are created from target datasets MA (ma2019.csv), TX (tx2019.csv), and NATIONAL(national2019.csv), respectively. You can use one of the toy deidentified dataset files for testing whether the sdnist.report package is installed correctly on your system. 6. Use the following commands for generating reports if you are using a toy deidentified dataset file: @@ -287,7 +271,7 @@ by the sdnist.report package to generate a data quality report. Download Data Manually ---------------------- -1. If the sdnist.report package is not able to download the datasets, you can download them from Github [Diverse Communities Data Excerpts](https://github.com/usnistgov/SDNist/releases/download/v2.2.0/diverse_communities_data_excerpts.zip). +1. If the sdnist.report package is not able to download the datasets, you can download them from Github [Diverse Communities Data Excerpts](https://github.com/usnistgov/SDNist/releases/download/v2.3.0/diverse_communities_data_excerpts.zip). 3. Unzip the **diverse_community_excerpts_data.zip** file and move the unzipped **diverse_community_excerpts_data** directory to the **sdnist-project** directory. 4. Delete the **diverse_community_excerpts_data.zip** file once the data is successfully extracted from the zip. 
@@ -305,5 +289,4 @@ Credits - [Christine Task](mailto:christine.task@knexusresearch.com) - Project technical lead - christine.task@knexusresearch.com - [Karan Bhagat](https://github.com/kbtriangulum) - Contributor - [David Lee](https://www.linkedin.com/in/david-lee-13872922/) - Documentation -- [Gary Howarth](https://www.nist.gov/people/gary-howarth) - Project PI - gary.howarth@nist.gov - +- [Gary Howarth](https://www.nist.gov/people/gary-howarth) - Project PI - gary.howarth@nist.gov \ No newline at end of file diff --git a/sdnist/version.py b/sdnist/version.py index 8a124bf..55e4709 100644 --- a/sdnist/version.py +++ b/sdnist/version.py @@ -1 +1 @@ -__version__ = "2.2.0" +__version__ = "2.3.0" From c0e5b76c627099177e6f26c79e4ad46759aa4fd3 Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 22:08:53 +0530 Subject: [PATCH 15/16] Add backward compatibility with old diverse data --- sdnist/report/dataset/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sdnist/report/dataset/__init__.py b/sdnist/report/dataset/__init__.py index b990d32..df33c1d 100644 --- a/sdnist/report/dataset/__init__.py +++ b/sdnist/report/dataset/__init__.py @@ -303,13 +303,14 @@ def data_description(dataset: Dataset, f"See codes in ACS data dictionary. " \ f"Find codes by searching the string: {feat}, in " \ f"the ACS data dictionary" - data_2 = dataset.data_dict[feat]['details'] dd_as.append(Attachment(name=feat_title, _data=data_1, _type=AttachmentType.String)) - dd_as.append(Attachment(name=None, - _data=data_2, - _type=AttachmentType.String)) + if "details" in dataset.data_dict[feat]: + data_2 = dataset.data_dict[feat]['details'] + dd_as.append(Attachment(name=None, + _data=data_2, + _type=AttachmentType.String)) elif 'values' in dataset.data_dict[feat]: f_name = feat_title From fa08a1240ae2ad75452ee595df5f591746e1987d Mon Sep 17 00:00:00 2001 From: Karan Bhagat Date: Mon, 19 Jun 2023 22:14:54 +0530 Subject: [PATCH 16/16] Update diverse data download link --- sdnist/load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdnist/load.py b/sdnist/load.py index de57962..1c10f91 100644 --- a/sdnist/load.py +++ b/sdnist/load.py @@ -82,7 +82,7 @@ def check_exists(root: Path, name: Path, download: bool, data_name: str = strs.D if not name.exists(): print(f"{name} does not exist.") zip_path = Path(root.parent, 'data.zip') - version = "2.2.0" + version = "2.3.0" version_v = f"v{version}" sdnist_version = DEFAULT_DATASET
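
For reference, a minimal sanity check of the revised `unique_exact_matches` metric from PATCH 12 is sketched below. It is only an illustrative sketch: it assumes a local checkout of the sdnist package plus pandas, the SEX and OWN_RENT toy values are made up, and it assumes, per the UEM paragraphs added in PATCH 03, that a "unique" target record means a singleton record value.

```python
import pandas as pd

from sdnist.metrics.unique_exact_matches import unique_exact_matches

# Toy target data: the first two record values are singletons (each appears
# exactly once); the third record value appears twice, so it is not unique.
target = pd.DataFrame({'SEX': [1, 2, 1, 1],
                       'OWN_RENT': [0, 1, 2, 2]})

# Toy deidentified data: it reproduces one of the two target singletons
# (SEX=1, OWN_RENT=0) and contains two other, non-matching records.
deid = pd.DataFrame({'SEX': [1, 2, 2],
                     'OWN_RENT': [0, 0, 2]})

matched, perc_matched, uniques, perc_uniques = unique_exact_matches(target, deid)

# Under the singleton reading above, one of the two unique target records is
# reproduced, so the patched denominator (unique target records) gives
# perc_matched = 50.0; the old denominator (all four target records) would
# have reported 25.0 for the same data.
print(matched, perc_matched, uniques, perc_uniques)
```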