From 3108c5f32d8ff599bdade591574fc1d54f2ce165 Mon Sep 17 00:00:00 2001 From: Richard Preen Date: Wed, 5 Jun 2024 17:12:17 +0100 Subject: [PATCH] clean up repository and update packaging (#283) * mv notebooks to examples folder; rm dev_files * mv risk_examples inside examples * clean up readme * update README * update README * update links * update README * update README * update README * update README * remove pytest.ini * update examples README * add pyproject.toml and remove .pylintrc * separate metadata from setup.py to setup.cfg * update setup.cfg * update CHANGELOG.md --- .pylintrc | 615 ----------- CHANGELOG.md | 6 + CONTRIBUTING.md | 35 +- README.md | 59 +- development_files/safekeras.py.saved | 572 ----------- development_files/structural_attack.ipynb | 957 ------------------ development_files/test_safekeras2.py.saved | 796 --------------- docs/source/attacks/output_format.rst | 4 +- docs/source/safemodel/examples.rst | 10 +- docs/source/safemodel/safedecisiontree.rst | 2 +- docs/source/safemodel/safekeras.rst | 2 +- docs/source/safemodel/saferandomforest.rst | 3 +- docs/source/safemodel/safesvc.rst | 2 +- examples/README.md | 19 + .../notebooks}/README.md | 0 .../notebooks}/example-notebook-SVC.ipynb | 0 .../example-notebook-decisiontree.ipynb | 0 .../notebooks}/example-notebook-keras.ipynb | 0 .../example-notebook-randomforest.ipynb | 0 .../R/attribute_inference_ols.Rmd | 0 .../R/membership_inference_solvency.Rmd | 0 .../risk_examples}/README.md | 0 .../python/attribute_inference_cancer.ipynb | 0 .../python/instance_based_mimic.ipynb | 0 .../python/membership_inference_cancer.ipynb | 0 pyproject.toml | 32 + pytest.ini | 4 - setup.cfg | 84 ++ setup.py | 79 +- user_stories/README.md | 2 +- 30 files changed, 198 insertions(+), 3085 deletions(-) delete mode 100644 .pylintrc delete mode 100644 development_files/safekeras.py.saved delete mode 100644 development_files/structural_attack.ipynb delete mode 100644 development_files/test_safekeras2.py.saved create mode 100644 examples/README.md rename {example_notebooks => examples/notebooks}/README.md (100%) rename {example_notebooks => examples/notebooks}/example-notebook-SVC.ipynb (100%) rename {example_notebooks => examples/notebooks}/example-notebook-decisiontree.ipynb (100%) rename {example_notebooks => examples/notebooks}/example-notebook-keras.ipynb (100%) rename {example_notebooks => examples/notebooks}/example-notebook-randomforest.ipynb (100%) rename {risk_examples => examples/risk_examples}/R/attribute_inference_ols.Rmd (100%) rename {risk_examples => examples/risk_examples}/R/membership_inference_solvency.Rmd (100%) rename {risk_examples => examples/risk_examples}/README.md (100%) rename {risk_examples => examples/risk_examples}/python/attribute_inference_cancer.ipynb (100%) rename {risk_examples => examples/risk_examples}/python/instance_based_mimic.ipynb (100%) rename {risk_examples => examples/risk_examples}/python/membership_inference_cancer.ipynb (100%) create mode 100644 pyproject.toml delete mode 100644 pytest.ini create mode 100644 setup.cfg diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index a4a40419..00000000 --- a/.pylintrc +++ /dev/null @@ -1,615 +0,0 @@ -[MAIN] - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Load and enable all available extensions. 
Use --list-extensions to see a list -# all available extensions. -#enable-all-extensions= - -# In error mode, messages with a category besides ERROR or FATAL are -# suppressed, and no reports are done by default. Error mode is compatible with -# disabling specific errors. -#errors-only= - -# Always return a 0 (non-error) status code, even if lint errors are found. -# This is primarily useful in continuous integration scripts. -#exit-zero= - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code. -extension-pkg-allow-list= - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code. (This is an alternative name to extension-pkg-allow-list -# for backward compatibility.) -extension-pkg-whitelist= - -# Return non-zero exit code if any of these messages/categories are detected, -# even if score is above --fail-under value. Syntax same as enable. Messages -# specified are enabled, while categories only check already-enabled messages. -fail-on= - -# Specify a score threshold to be exceeded before program exits with error. -fail-under=10 - -# Interpret the stdin as a python script, whose filename needs to be passed as -# the module_or_package argument. -#from-stdin= - -# Files or directories to be skipped. They should be base names, not paths. -ignore=CVS - -# Add files or directories matching the regex patterns to the ignore-list. The -# regex matches against paths and can be in Posix or Windows format. -ignore-paths= - -# Files or directories matching the regex patterns are skipped. The regex -# matches against base names, not paths. The default value ignores Emacs file -# locks -ignore-patterns=^\.# - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis). It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules=multiprocess,sklearn.datasets - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the -# number of processors available to use, and will cap the count on Windows to -# avoid hangs. -jobs=1 - -# Control the amount of potential inferred values when inferring a single -# object. This can help the performance when dealing with large functions or -# complex, nested conditions. -limit-inference-results=100 - -# List of plugins (as comma separated values of python module names) to load, -# usually to register additional checkers. -load-plugins= - -# Pickle collected data for later comparisons. -persistent=yes - -# Minimum Python version to use for version dependent checks. Will default to -# the version used to run pylint. -py-version=3.9 - -# Discover python modules and packages in the file system subtree. -recursive=no - -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages. -suggestion-mode=yes - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. 
-unsafe-load-any-extension=no - -# In verbose mode, extra non-checker-related info will be displayed. -#verbose= - - -[REPORTS] - -# Python expression which should return a score less than or equal to 10. You -# have access to the variables 'fatal', 'error', 'warning', 'refactor', -# 'convention', and 'info' which contain the number of messages in each -# category, as well as 'statement' which is the total number of statements -# analyzed. This score is used by the global evaluation report (RP0004). -evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details. -msg-template= - -# Set the output format. Available formats are text, parseable, colorized, json -# and msvs (visual studio). You can also give a reporter class, e.g. -# mypackage.mymodule.MyReporterClass. -#output-format= - -# Tells whether to display a full report or only the messages. -reports=no - -# Activate the evaluation score. -score=yes - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, -# UNDEFINED. -confidence=HIGH, - CONTROL_FLOW, - INFERENCE, - INFERENCE_FAILURE, - UNDEFINED - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once). You can also use "--disable=all" to -# disable everything first and then re-enable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use "--disable=all --enable=classes -# --disable=W". -disable=raw-checker-failed, - bad-inline-option, - locally-disabled, - file-ignored, - suppressed-message, - deprecated-pragma, - use-symbolic-message-instead, - invalid-name - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=c-extension-no-member, - useless-suppression - - -[BASIC] - -# Naming style matching correct argument names. -argument-naming-style=snake_case - -# Regular expression matching correct argument names. Overrides argument- -# naming-style. If left empty, argument names will be checked with the set -# naming style. -#argument-rgx= - -# Naming style matching correct attribute names. -attr-naming-style=snake_case - -# Regular expression matching correct attribute names. Overrides attr-naming- -# style. If left empty, attribute names will be checked with the set naming -# style. -#attr-rgx= - -# Bad variable names which should always be refused, separated by a comma. -bad-names=foo, - bar, - baz, - toto, - tutu, - tata - -# Bad variable names regexes, separated by a comma. If names match any regex, -# they will always be refused -bad-names-rgxs= - -# Naming style matching correct class attribute names. -class-attribute-naming-style=any - -# Regular expression matching correct class attribute names. 
Overrides class- -# attribute-naming-style. If left empty, class attribute names will be checked -# with the set naming style. -#class-attribute-rgx= - -# Naming style matching correct class constant names. -class-const-naming-style=UPPER_CASE - -# Regular expression matching correct class constant names. Overrides class- -# const-naming-style. If left empty, class constant names will be checked with -# the set naming style. -#class-const-rgx= - -# Naming style matching correct class names. -class-naming-style=PascalCase - -# Regular expression matching correct class names. Overrides class-naming- -# style. If left empty, class names will be checked with the set naming style. -#class-rgx= - -# Naming style matching correct constant names. -const-naming-style=UPPER_CASE - -# Regular expression matching correct constant names. Overrides const-naming- -# style. If left empty, constant names will be checked with the set naming -# style. -#const-rgx= - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - -# Naming style matching correct function names. -function-naming-style=snake_case - -# Regular expression matching correct function names. Overrides function- -# naming-style. If left empty, function names will be checked with the set -# naming style. -#function-rgx= - -# Good variable names which should always be accepted, separated by a comma. -good-names=i, - j, - k, - ex, - Run, - _ - -# Good variable names regexes, separated by a comma. If names match any regex, -# they will always be accepted -good-names-rgxs= - -# Include a hint for the correct naming format with invalid-name. -include-naming-hint=no - -# Naming style matching correct inline iteration names. -inlinevar-naming-style=any - -# Regular expression matching correct inline iteration names. Overrides -# inlinevar-naming-style. If left empty, inline iteration names will be checked -# with the set naming style. -#inlinevar-rgx= - -# Naming style matching correct method names. -method-naming-style=snake_case - -# Regular expression matching correct method names. Overrides method-naming- -# style. If left empty, method names will be checked with the set naming style. -#method-rgx= - -# Naming style matching correct module names. -module-naming-style=snake_case - -# Regular expression matching correct module names. Overrides module-naming- -# style. If left empty, module names will be checked with the set naming style. -#module-rgx= - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=^_ - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -# These decorators are taken in consideration only for invalid-name. -property-classes=abc.abstractproperty - -# Regular expression matching correct type variable names. If left empty, type -# variable names will be checked with the set naming style. -#typevar-rgx= - -# Naming style matching correct variable names. -variable-naming-style=snake_case - -# Regular expression matching correct variable names. Overrides variable- -# naming-style. If left empty, variable names will be checked with the set -# naming style. 
-#variable-rgx= - - -[CLASSES] - -# Warn about protected attribute access inside special methods -check-protected-access-in-special-methods=no - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__, - __new__, - setUp, - __post_init__ - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict, - _fields, - _replace, - _source, - _make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=cls - - -[DESIGN] - -# List of regular expressions of class ancestor names to ignore when counting -# public methods (see R0903) -exclude-too-few-public-methods= - -# List of qualified class names to ignore when counting class parents (see -# R0901) -ignored-parents= - -# Maximum number of arguments for function / method. -max-args=5 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in an if statement (see R0916). -max-bool-expr=5 - -# Maximum number of branch for function / method body. -max-branches=12 - -# Maximum number of locals for function / method body. -max-locals=15 - -# Maximum number of parents for a class (see R0901). -max-parents=7 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - -# Maximum number of return / yield for function / method body. -max-returns=6 - -# Maximum number of statements in function / method body. -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when caught. -#overgeneral-exceptions=BaseException, -# Exception - - -[FORMAT] - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Maximum number of characters on a single line. -max-line-length=100 - -# Maximum number of lines in a module. -max-module-lines=1000 - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - - -[IMPORTS] - -# List of modules that can be imported at any level, not just the top level -# one. -allow-any-import-level= - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Deprecated modules which should not be used, separated by a comma. -deprecated-modules= - -# Output a graph (.gv or any supported image format) of external dependencies -# to the given file (report RP0402 must not be disabled). -ext-import-graph= - -# Output a graph (.gv or any supported image format) of all (i.e. internal and -# external) dependencies to the given file (report RP0402 must not be -# disabled). -import-graph= - -# Output a graph (.gv or any supported image format) of internal dependencies -# to the given file (report RP0402 must not be disabled). 
-int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - -# Couples of modules and preferred modules, separated by a comma. -preferred-modules= - - -[LOGGING] - -# The type of string formatting that logging methods do. `old` means using % -# formatting, `new` is for `{}` formatting. -logging-format-style=old - -# Logging modules to check that the string format arguments are in logging -# function parameter format. -logging-modules=logging - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME, - XXX, - TODO - -# Regular expression of note tags to take in consideration. -notes-rgx= - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - -# Complete name of functions that never returns. When checking for -# inconsistent-return-statements if a never returning function is called then -# it will be considered as an explicit return statement and no message will be -# printed. -never-returning-functions=sys.exit,argparse.parse_error - - -[SIMILARITIES] - -# Comments are removed from the similarity computation -ignore-comments=yes - -# Docstrings are removed from the similarity computation -ignore-docstrings=yes - -# Imports are removed from the similarity computation -ignore-imports=yes - -# Signatures are removed from the similarity computation -ignore-signatures=yes - -# Minimum lines number of a similarity. -min-similarity-lines=10 - - -[SPELLING] - -# Limits count of emitted suggestions for spelling mistakes. -max-spelling-suggestions=4 - -# Spelling dictionary name. Available dictionaries: none. To make it work, -# install the 'python-enchant' package. -spelling-dict= - -# List of comma separated words that should be considered directives if they -# appear at the beginning of a comment and should not be checked. -spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains the private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to the private dictionary (see the -# --spelling-private-dict-file option) instead of raising a message. -spelling-store-unknown-words=no - - -[STRING] - -# This flag controls whether inconsistent-quotes generates a warning when the -# character used as a quote delimiter is used inconsistently within a module. -check-quote-consistency=no - -# This flag controls whether the implicit-str-concat should generate a warning -# on implicit string concatenation in sequences defined over several lines. -check-str-concat-over-line-jumps=no - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members= - -# Tells whether to warn about missing members when the owner of the attribute -# is inferred to be None. 
-ignore-none=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# List of symbolic message names to ignore for Mixin members. -ignored-checks-for-mixins=no-member, - not-async-context-manager, - not-context-manager, - attribute-defined-outside-init - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values, - thread._local, - _thread._local, - argparse.Namespace, - attacks.worst_case_attack.WorstCaseAttackArgs - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. -missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - -# Regex pattern to define which classes are considered mixins. -mixin-class-rgx=.*[Mm]ixin - -# List of decorators that change the signature of a decorated function. -signature-mutators= - - -[VARIABLES] - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid defining new builtins when possible. -additional-builtins= - -# Tells whether unused global variables should be treated as a violation. -allow-global-unused-variables=yes - -# List of names allowed to shadow builtins -allowed-redefined-builtins= - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_, - _cb - -# A regular expression matching the name of dummy variables (i.e. expected to -# not be used). -dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore. -ignored-argument-names=_.*|^ignored_|^unused_ - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# List of qualified module names which can have objects that can redefine -# builtins. 
-redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io diff --git a/CHANGELOG.md b/CHANGELOG.md index c2a31db0..c07b9fd2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +Changes: +* Add support for scikit-learn MLPClassifier ([#276](https://github.com/AI-SDC/AI-SDC/pull/276)) +* Use default XGBoost params if not defined in structural attacks ([#277](https://github.com/AI-SDC/AI-SDC/pull/277)) +* Clean up documentation ([#282](https://github.com/AI-SDC/AI-SDC/pull/282)) +* Clean up repository and update packaging ([#283](https://github.com/AI-SDC/AI-SDC/pull/283)) + ## Version 1.1.3 (Apr 26, 2024) Changes: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 086bad5f..d260fa84 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,35 @@ -# General guidance for contributors +# General Guidance for Contributors + +## Development + +Clone the repository and install the local package including all dependencies within a virtual environment: + +``` +$ git clone https://github.com/AI-SDC/AI-SDC.git +$ cd AI-SDC +$ pip install .[test] +``` + +Then to run the tests: + +``` +$ pytest . +``` + +## Directory Structure + +* `aisdc` Contains the aisdc source code. + - `attacks` Contains a variety of privacy attacks on machine learning models. + - `preprocessing` Contains preprocessing modules for test datasets. + - `safemodel` The safemodel wrappers for common machine learning models. +* `docs` Contains Sphinx documentation files. +* `examples` Contains examples of how to run the code contained in this repository. +* `tests` Contains unit tests. +* `user_stories` Contains user guides. + +## Documentation + +Documentation is hosted here: https://ai-sdc.github.io/AI-SDC/ ## Style Guide @@ -26,8 +57,6 @@ To install as a hook that executes with every `git commit`: $ pre-commit install ``` -******************************************************************************* - ## Automatic Documentation The documentation is automatically built using [Sphinx](https://www.sphinx-doc.org) and github actions. diff --git a/README.md b/README.md index f54a73ce..1f826951 100644 --- a/README.md +++ b/README.md @@ -6,38 +6,19 @@ # AI-SDC -A collection of tools and resources for managing the statistical disclosure control of trained machine learning models. For a brief introduction, see [Smith et al. (2022)](https://doi.org/10.48550/arXiv.2212.01233). +A collection of tools and resources for managing the [statistical disclosure control](https://en.wikipedia.org/wiki/Statistical_disclosure_control) of trained [machine learning](https://en.wikipedia.org/wiki/Machine_learning) models. For a brief introduction, see [Smith et al. (2022)](https://doi.org/10.48550/arXiv.2212.01233). -### User Guides +The `aisdc` package provides: +* A variety of privacy attacks for assessing machine learning models. +* The safemodel package: a suite of open source wrappers for common machine learning frameworks, including [scikit-learn](https://scikit-learn.org) and [Keras](https://keras.io). It is designed for use by researchers in Trusted Research Environments (TREs) where disclosure control methods must be implemented. Safemodel aims to give researchers greater confidence that their models are more compliant with disclosure control. -A collection of user guides can be found in the 'user_stories' folder of this repository. These guides include configurable examples from the perspective of both a researcher and a TRE, with separate scripts for each. 
Instructions on how to use each of these scripts and which scripts to use are included in the README of the [`user_stories`](./user_stories) folder.
+A collection of user guides can be found in the [`user_stories`](user_stories) folder of this repository. These guides include configurable examples from the perspective of both a researcher and a TRE, with separate scripts for each. Instructions on how to use each of these scripts, and which to use, are included in the README in that folder.
 
-## Content
-
-* `aisdc`
-    - `attacks` Contains a variety of privacy attacks on machine learning models, including membership and attribute inference.
-    - `preprocessing` Contains preprocessing modules for test datasets.
-    - `safemodel` The safemodel package is an open source wrapper for common machine learning models. It is designed for use by researchers in Trusted Research Environments (TREs) where disclosure control methods must be implemented. Safemodel aims to give researchers greater confidence that their models are more compliant with disclosure control.
-* `docs` Contains Sphinx documentation files.
-* `example_notebooks` Contains short tutorials on the basic concept of "safe_XX" versions of machine learning algorithms, and examples of some specific algorithms.
-* `examples` Contains examples of how to run the code contained in this repository:
-    - How to simulate attribute inference attacks `attribute_inference_example.py`.
-    - How to simulate membership inference attacks:
-        + Worst case scenario attack `worst_case_attack_example.py`.
-        + LIRA scenario attack `lira_attack_example.py`.
-    - Integration of attacks into safemodel classes `safemodel_attack_integration_bothcalls.py`.
-* `risk_examples` Contains hypothetical examples of data leakage through machine learning models as described in the [Green Paper](https://doi.org/10.5281/zenodo.6896214).
-* `tests` Contains unit tests.
-
-## Documentation
-
-Documentation is hosted here: https://ai-sdc.github.io/AI-SDC/
-
-## Installation / End-user
+## Installation
 
 [![PyPI package](https://img.shields.io/pypi/v/aisdc.svg)](https://pypi.org/project/aisdc)
 
-Install `aisdc` (safest in a virtual env) and manually copy the [`examples`](examples/) and [`example_notebooks`](example_notebooks/).
+Install `aisdc` and manually copy the [`examples`](examples/) folder.
 
 To install only the base package, which includes the attacks used for assessing privacy:
 
 ```
 $ pip install aisdc
 ```
 
-To install the base package and the safemodel package, which includes defensive wrappers for popular ML frameworks including [scikit-learn](https://scikit-learn.org) and [Keras](https://keras.io):
+To additionally install the safemodel package:
 
 ```
 $ pip install aisdc[safemodel]
 ```
 
 ## Running
 
-To run an example, simply execute the desired script or start up `jupyter notebook` and run one of the notebooks.
-
-For example, to run the `lira_attack_example.py`:
+To run an example, simply execute the desired script or start up `jupyter notebook` and run one of the notebooks. For example, to run the LiRA example:
 
 ```
 $ python -m lira_attack_example
 ```
 
-## Development
-
-Clone the repository and install the local package including all dependencies (safest in a virtual env):
-
-```
-$ git clone https://github.com/AI-SDC/AI-SDC.git
-$ cd AI-SDC
-$ pip install .[test]
-```
-
-Then run the tests:
-
-```
-$ pytest .
-```
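For instance (the test paths and keywords here are hypothetical, not taken from the patch), pytest can also run a single file or a keyword-selected subset during development:

```
$ pytest tests/test_worst_case_attack.py
$ pytest -k "safemodel"
```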
-``` - ---- +## Acknowledgement -This work was funded by UK Research and Innovation under Grant Numbers MC_PC_21033 and MC_PC_23006 as part of Phase 1 of the DARE UK (Data and Analytics Research Environments UK) programme (https://dareuk.org.uk/), delivered in partnership with Health Data Research UK (HDR UK) and Administrative Data Research UK (ADR UK). The specific projects were Semi-Automatic checking of Research Outputs (SACRO -MC_PC_23006) and Guidelines and Resources for AI Model Access from TrusTEd Research environments (GRAIMATTER - MC_PC_21033).­ This project has also been supported by MRC and EPSRC [grant number MR/S010351/1]: PICTURES. +This work was funded by UK Research and Innovation under Grant Numbers MC_PC_21033 and MC_PC_23006 as part of Phase 1 of the [DARE UK](https://dareuk.org.uk) (Data and Analytics Research Environments UK) programme, delivered in partnership with Health Data Research UK (HDR UK) and Administrative Data Research UK (ADR UK). The specific projects were Semi-Automatic checking of Research Outputs (SACRO; MC_PC_23006) and Guidelines and Resources for AI Model Access from TrusTEd Research environments (GRAIMATTER; MC_PC_21033).­This project has also been supported by MRC and EPSRC [grant number MR/S010351/1]: PICTURES. diff --git a/development_files/safekeras.py.saved b/development_files/safekeras.py.saved deleted file mode 100644 index 33770987..00000000 --- a/development_files/safekeras.py.saved +++ /dev/null @@ -1,572 +0,0 @@ -"""Safekeras.py: - Jim Smith, Andrew McCarty and Richard Preen - UWE 2022. -""" -# general imports - - -import os -import warnings - -# import sys -from typing import Any, Tuple - -import numpy as np - -# tensorflow imports -import tensorflow as tf -import tensorflow_privacy as tfp -from dictdiffer import diff -from tensorflow.keras import Model as KerasModel # pylint: disable = import-error -from tensorflow_privacy import compute_dp_sgd_privacy - -# safemodel superclass -from ..reporting import get_reporting_string -from ..safemodel import SafeModel - -# suppress numerous deprecatino warnings -# shut tensorflow up -warnings.filterwarnings("ignore", category=DeprecationWarning) -warnings.filterwarnings("ignore", category=FutureWarning) - -# this is the current class that dpvarians of optimizers come from -# may change in later versions of tensorflow_privacy -DP_CLASS_STRING = ( - "tensorflow_privacy.privacy.optimizers.dp_optimizer_keras." - "make_keras_optimizer_class..DPOptimizerClass" -) - -DP_CLASS_STRING2 = ( - "tensorflow_privacy.privacy.optimizers.dp_optimizer_keras." 
-
-
-def same_configs(m1: Any, m2: Any) -> Tuple[bool, str]:
-    """Checks if two models have the same architecture."""
-    num_layers = len(m1.layers)
-    if len(m2.layers) != num_layers:
-        errstr = get_reporting_string(name="different_layer_count")
-        return False, errstr
-    for layer in range(num_layers):
-        m1_layer_config = m1.layers[layer].get_config()
-        _ = m1_layer_config.pop("name")
-        m2_layer_config = m2.layers[layer].get_config()
-        _ = m2_layer_config.pop("name")
-        match = list(diff(m1_layer_config, m2_layer_config, expand=True))
-        num_diffs = len(match)
-        if num_diffs > 0:
-            msg = get_reporting_string(
-                name="layer_configs_differ", layer=layer, length=num_diffs
-            )
-            # f"Layer {layer} configs differ in {len(match)} places:\n"
-            for i in range(num_diffs):
-                if match[i][0] == "change":
-                    msg += get_reporting_string(
-                        name="param_changed_from_to",
-                        key=match[i][1],
-                        val=match[i][2][0],
-                        cur_val=match[i][2][1],
-                    )
-                else:  # should not be reachable as dense objects cannot be modified
-                    msg += f"{match[i]}"  # pragma: no cover
-            return False, msg
-
-    return True, get_reporting_string(name="same_ann_config")
-
-
-def same_weights(m1: Any, m2: Any) -> Tuple[bool, str]:
-    """Checks if two nets with the same architecture have the same weights."""
-    num_layers = len(m1.layers)
-    if num_layers != len(m2.layers):
-        return False, "different numbers of layers"
-    # layer 0 is input layer determined by data size
-    for layer in range(1, num_layers):
-        m1layer = m1.layers[layer].get_weights()
-        m2layer = m2.layers[layer].get_weights()
-        if len(m1layer[0][0]) != len(m2layer[0][0]):
-            return False, f"layer {layer} not the same size."
-        for dim in range(len(m1layer)):  # pylint: disable=consider-using-enumerate
-            m1d = m1layer[dim]
-            m2d = m2layer[dim]
-            # print(type(m1d), m1d.shape)
-            if not np.array_equal(m1d, m2d):  # pragma: no cover
-                return False, f"dimension {dim} of layer {layer} differs"
-    return True, "weights match"
-
-
-def check_checkpoint_equality(v1: str, v2: str) -> Tuple[bool, str]:
-    """Compares two checkpoints saved with tensorflow save_model.
-    On the assumption that the optimiser is not going to be saved,
-    and that the model is going to be saved in frozen form,
-    this only checks the architecture and weights layer by layer.
-    """
-    msg = ""
-    same = True
-
-    try:
-        model1 = tf.keras.models.load_model(v1)
-    except Exception as e:  # pylint:disable=broad-except
-        msg = get_reporting_string(name="error_reloading_model_v1", e=e)
-        # f"Error re-loading model from {v1}: {e}"
-        return False, msg
-    try:
-        model2 = tf.keras.models.load_model(v2)
-    except Exception as e:  # pylint:disable=broad-except
-        msg = get_reporting_string(name="error_reloading_model_v2", e=e)
-        # f"Error re-loading model from {v2}: {e}"
-        return False, msg
-
-    same_config, config_message = same_configs(model1, model2)
-    if not same_config:
-        print("different config")
-        msg += config_message
-        same = False
-
-    same_weight, weights_message = same_weights(model1, model2)
-    if not same_weight:
-        print("different weights")
-        msg += weights_message
-        same = False
-
-    return same, msg
-
-
-def check_DP_used(optimizer) -> Tuple[bool, str]:
-    """Checks whether the DP optimizer was actually the one used."""
-
-    key_needed = "_was_dp_gradients_called"
-    critical_val = optimizer.__dict__.get(key_needed, "missing")
-
-    if critical_val is True:
-        reason = get_reporting_string(name="dp_optimizer_run")
-        DPused = True
-    elif critical_val == "missing":
-        reason = get_reporting_string(name="no_dp_gradients_key")
-        DPused = False
-    elif critical_val is False:
-        reason = get_reporting_string(name="changed_opt_no_fit")
-        DPused = False
-    else:  # pragma: no cover
-        # not currently reachable because optimizer class does
-        # not support assignment
-        # but leave in to future-proof
-        reason = get_reporting_string(
-            name="unrecognised_combination"
-        )  # pragma: no cover
-        DPused = False  # pragma: no cover
-
-    return DPused, reason
-
-
-def check_optimizer_allowed(optimizer) -> Tuple[bool, str]:
-    """Checks if the model's optimizer is in our white-list;
-    the default setting is not allowed.
-    """
-    allowed = False
-    opt_type = str(type(optimizer))
-    reason = get_reporting_string(name="optimizer_not_allowed", optimizer=opt_type)
-    if (DP_CLASS_STRING in opt_type) or (DP_CLASS_STRING2 in opt_type):
-        allowed = True
-        reason = get_reporting_string(name="optimizer_allowed", optimizer=opt_type)
-
-    return allowed, reason
-
-
-def check_optimizer_is_DP(optimizer) -> Tuple[bool, str]:
-    """Checks whether optimizer is one of tensorflow's DP versions."""
-    DPused = False
-    reason = "None"
-    if "_was_dp_gradients_called" not in optimizer.__dict__:
-        reason = get_reporting_string(name="no_dp_gradients_key")
-    else:
-        reason = get_reporting_string(name="found_dp_gradients_key")
-        DPused = True
-    return DPused, reason
-
-
-def load_safe_keras_model(name: str = "undefined") -> Tuple[bool, Any]:
-    """
-    Reads model from file in appropriate format.
-    Optimizer is deliberately excluded in the save.
-    This is to prevent the possibility of restarting training,
-    which could offer a possible back door for attacks.
-    Thus the optimizer cannot be loaded.
-    """
-    the_model = None
-    model_load_file = name
-    msg = ""
-    if model_load_file == "undefined":
-        msg = "Please input a name with extension for the model to load."
-
-    elif model_load_file[-3:] == ".tf":
-        # load from tf
-        the_model = tf.keras.models.load_model(
-            model_load_file  # , custom_objects={"SafeKerasModel"}
-        )
-        load = tf.keras.models.load_model(model_load_file, compile=False)
-        the_model.set_weights(load.get_weights())
-
-    else:
-        suffix = model_load_file.split(".")[-1]
-        msg = f"loading from a {suffix} file is currently not supported"
-
-    if the_model is not None:
-        return (True, the_model)
-    # else
-    return (False, msg)
-
-
-class SafeKerasModel(KerasModel, SafeModel):
-    """Privacy Protected Wrapper around the tf.keras.Model class from tensorflow 2.8.
-    Disabling pylint warnings about the number of instance attributes,
-    as this is necessarily complex.
-    """
-
-    # pylint: disable=too-many-instance-attributes
-    def __init__(self, *args: Any, **kwargs: Any) -> None:
-        """Creates model and applies constraints to params."""
-
-        # the_args = args
-        the_kwargs = kwargs
-
-        # initialise all the values that get provided as options to keras
-        # and also l2 norm clipping and learning rates, batch sizes
-        ##inputs = kwargs.get("inputs","notFound")
-        ##if inputs=="notFound":
-        ##    inputs = args[0] if len(args) == 3 else None
-        if "inputs" in kwargs.keys():  # pylint: disable=consider-iterating-dictionary
-            inputs = the_kwargs["inputs"]
-        elif len(args) == 3:  # defaults is for Model(input,outputs,names)
-            inputs = args[0]
-        self.outputs = None
-        ##outputs = kwargs.get("outputs","notFound")
-        ##if outputs=="notFound":
-        ##    outputs = args[1] if len(args) == 3 else None
-        if "outputs" in kwargs.keys():  # pylint: disable=consider-iterating-dictionary
-            outputs = the_kwargs["outputs"]
-        elif len(args) == 3:
-            # self.outputs = args[1]
-            outputs = args[1]
-
-        # call the keras super class first as this comes first in chain
-        super().__init__(  # pylint: disable=unexpected-keyword-arg
-            inputs=inputs, outputs=outputs  # pylint: disable=used-before-assignment
-        )
-
-        # set values where the user has supplied them
-        # if not supplied set to a value that preliminary_check
-        # will over-ride with TRE-specific values from rules.json
-        defaults = {
-            "l2_norm_clip": 1.0,
-            "noise_multiplier": 0.5,
-            "min_epsilon": 10,
-            "delta": 1e-5,
-            "batch_size": 25,
-            "num_microbatches": None,
-            "learning_rate": 0.1,
-            "optimizer": tfp.DPKerasSGDOptimizer,
-            "num_samples": 250,
-            "epochs": 20,
-            "current_epsilon": 999,
-        }
-
-        for key, val in defaults.items():
-            if kwargs.get(key, "missing") != "missing":
-                setattr(self, key, kwargs[key])
-            else:
-                setattr(self, key, val)
-
-        if self.batch_size == 0:
-            msg = get_reporting_string(name="batch_size_zero")
-            print(msg)
-            self.batch_size = 32
-
-        SafeModel.__init__(self)
-
-        self.model_type: str = "KerasModel"
-        # remove this from default class
-        _ = self.__dict__.pop("saved_model")
-        super().preliminary_check(apply_constraints=True, verbose=True)
-
-    def dp_epsilon_met(
-        self, num_examples: int, batch_size: int = 0, epochs: int = 0
-    ) -> Tuple[bool, str]:
-        """Checks if epsilon is sufficient for Differential Privacy.
-        Provides feedback to user if epsilon is not sufficient.
-        """
-        privacy = compute_dp_sgd_privacy(
-            n=num_examples,
-            batch_size=batch_size,
-            noise_multiplier=self.noise_multiplier,
-            epochs=epochs,
-            delta=self.delta,
-        )
-        ok = privacy[0] < self.min_epsilon
-        return ok, privacy[0]
-
-    def check_epsilon(
-        self, num_samples: int, batch_size: int, epochs: int
-    ) -> Tuple[bool, str]:
-        """Checks that the level of privacy guarantee is within recommended limits,
-        and produces feedback.
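A minimal sketch of the check this docstring describes, with illustrative numbers (it mirrors the `compute_dp_sgd_privacy` call made by `dp_epsilon_met` above; 10 and 1e-5 are the `min_epsilon` and `delta` class defaults, the other values are hypothetical):

```python
from tensorflow_privacy import compute_dp_sgd_privacy

# achieved epsilon for a hypothetical training configuration
eps, _ = compute_dp_sgd_privacy(
    n=1000,                # number of training samples
    batch_size=32,
    noise_multiplier=0.5,  # class default
    epochs=20,
    delta=1e-5,            # class default
)
print(eps, eps < 10)  # ok only if the achieved epsilon is below min_epsilon
```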
- """ - msg = "" - ok = False - if batch_size == 0: - msg += get_reporting_string(name="division_by_zero") - batch_size = 1 - ( - ok, - self.current_epsilon, # pylint: disable=attribute-defined-outside-init - ) = self.dp_epsilon_met( - num_examples=num_samples, batch_size=batch_size, epochs=epochs - ) - - key_name = "dp_requirements_met" if ok else "dp_requirements_not_met" - get_reporting_string( - name=key_name, - current_epsilon=self.current_epsilon, - num_samples=num_samples, - batch_size=batch_size, - epochs=epochs, - ) - print(msg) - return ok, msg - - def compile( - self, optimizer=None, loss="categorical_crossentropy", metrics=["accuracy"] - ): # pylint:disable=dangerous-default-value) - """ - Replaces the optimiser with a DP variant if needed and creates the - necessary DP params in the opt and loss dict, then calls tf compile. - Allow None as default value for optimizer param because we explicitly - deal with it. - """ - replace_message = get_reporting_string(name="warn_possible_disclosure_risk") - # "WARNING: model parameters may present a disclosure risk" - using_DP_SGD = get_reporting_string(name="using_dp_sgd") - # "Changed parameter optimizer = 'DPKerasSGDOptimizer'" - Using_DP_Adagrad = get_reporting_string(name="using_dp_adagrad") - # "Changed parameter optimizer = 'DPKerasAdagradOptimizer'" - using_DP_Adam = get_reporting_string(name="using_dp_adam") - # "Changed parameter optimizer = 'DPKerasAdamOptimizer'" - - optimizer_dict = { - None: (using_DP_SGD, tfp.DPKerasSGDOptimizer), - tfp.DPKerasSGDOptimizer: ("", tfp.DPKerasSGDOptimizer), - tfp.DPKerasAdagradOptimizer: ("", tfp.DPKerasAdagradOptimizer), - tfp.DPKerasAdamOptimizer: ("", tfp.DPKerasAdamOptimizer), - "Adagrad": ( - replace_message + Using_DP_Adagrad, - tfp.DPKerasAdagradOptimizer, - ), - "Adam": (replace_message + using_DP_Adam, tfp.DPKerasAdamOptimizer), - "SGD": (replace_message + using_DP_SGD, tfp.DPKerasSGDOptimizer), - } - - val = optimizer_dict.get(optimizer, "unknown") - if val == "unknown": - opt_msg = using_DP_SGD - opt_used = tfp.DPKerasSGDOptimizer - else: - opt_msg = val[0] - opt_used = val[1] - - self.optimizer = opt_used # pylint: disable=attribute-defined-outside-init - opt = opt_used( - l2_norm_clip=self.l2_norm_clip, - noise_multiplier=self.noise_multiplier, - num_microbatches=self.num_microbatches, - learning_rate=self.learning_rate, - ) - - if len(opt_msg) > 0: - print(get_reporting_string(name="during_compilation", opt_msg=opt_msg)) - - super().compile(opt, loss, metrics) - - def fit( # pylint:disable=too-many-arguments - self, - X: Any, - Y: Any, - validation_data: Any, - epochs: int, - batch_size: int, - refine_epsilon: bool = False, - ) -> Any: - """ - Overrides the tensorflow fit() method with some extra functionality: - (i) records number of samples for checking DP epsilon values. - (ii) does an automatic epsilon check and reports. - (iia) if user sets refine_epsilon = true, return without fitting the model. - (iii) then calls the tensorflow fit() function. - (iv) finally makes a saved copy of the newly fitted model. 
- """ - - # pylint can't cope that we first declared these via a dict :( - self.num_samples = X.shape[0] # pylint: disable=attribute-defined-outside-init - self.epochs = epochs # pylint: disable=attribute-defined-outside-init - self.batch_size = batch_size # pylint: disable=attribute-defined-outside-init - # make sure you are passing keywords through - but also checking batch size epochs - ok, msg = self.check_epsilon(X.shape[0], batch_size, epochs) - - if not ok: - print(msg) - if refine_epsilon: - print( - "Not continuing with fitting model, " - "as return epsilon was above max recommended value, " - "and user set refine_epsilon= True" - ) - return False, None - - returnval = super().fit( - X, - Y, - validation_data=validation_data, - epochs=epochs, - batch_size=batch_size, - ) - - # make a saved copy for later analysis - if not os.path.exists("tfsaves"): - os.mkdir("tfsaves") - self.save("tfsaves/fit_model.tf") - # pylint: disable=attribute-defined-outside-init - self.saved_was_dpused, self.saved_reason = check_DP_used(self.optimizer) - self.saved_epsilon = self.current_epsilon - return returnval - - def posthoc_check(self, verbose: bool = True) -> Tuple[str, bool]: - """Checks whether model should be considered unsafe - for example, has been changed since fit() was last run, - or does not meet DP policy. - """ - - disclosive = False - msg = "" - - # have the model architecture or weights been changed? - self.save("tfsaves/requested_model.tf") - models_same, same_msg = check_checkpoint_equality( - "tfsaves/fit_model.tf", - "tfsaves/requested_model.tf", - ) - if not models_same: - msg += same_msg - disclosive = True - - # was a dp-enbled optimiser provided? - allowed, allowedmessage = check_optimizer_allowed(self.optimizer) - if not allowed: - msg += allowedmessage - disclosive = True - - # was the dp-optimiser used during fit() - dpused, dpusedmessage = check_DP_used(self.optimizer) - if not dpused: - msg += dpusedmessage - disclosive = True - - # have values been changed since saved immediately after fit()? - if ( - dpused != self.saved_was_dpused - or dpusedmessage != self.saved_reason - or self.saved_epsilon != self.current_epsilon - ): - msg += get_reporting_string(name="opt_config_changed") - disclosive = True - - # if not what was the value of epsilon achieved - eps_met, cur_eps = self.dp_epsilon_met( - num_examples=self.num_samples, - batch_size=self.batch_size, - epochs=self.epochs, - ) - if not eps_met: - dpepsilonmessage = get_reporting_string( - name="epsilon_above_normal", - current_epsilon=cur_eps, - ) - if verbose: - print( - get_reporting_string( - name="recommend_further_discussion", msg=dpepsilonmessage - ) - ) - msg += dpepsilonmessage - disclosive = True - - if disclosive: - msg = get_reporting_string(name="recommend_not_release") + msg - return msg, True - - # passed all the tests!! - if verbose: - msg = get_reporting_string(name="recommend_allow_release") - msg += get_reporting_string( - name="allow_release_eps_below_max", current_epsilon=cur_eps - ) - return msg, False - - def save(self, name: str = "undefined") -> None: - """Writes model to file in appropriate format. - - Parameters - ---------- - - name : string - The name of the file to save - - Returns - ------- - - Notes - ----- - - No return value - - Optimizer is deliberately excluded. - To prevent possible to restart training and thus - possible back door into attacks. 
- """ - - self.model_save_file = name - while self.model_save_file == "undefined": - print(get_reporting_string(name="input_filename_with_extension")) - return - - thename = self.model_save_file.split(".") - # print(f'in save(), parsed filename is {thename}') - if len(thename) == 1: - print(get_reporting_string(name="filename_must_indicate_type")) - # "file name must indicate type as a suffix") - else: - suffix = self.model_save_file.split(".")[-1] - - if suffix in ("h5", "tf"): - try: - tf.keras.models.save_model( - self, - self.model_save_file, - include_optimizer=False, - # save_traces=False, - save_format=suffix, - ) - # pragma:no cover - except Exception as er: # pylint:disable=broad-except # pragma:no cover - print( # pragma:no cover - get_reporting_string( - name="error_saving_file", suffix=suffix, er=er - ) - ) - # f"saving as a {suffix} file gave this error message: {er}") - else: - print( - get_reporting_string( - name="suffix_not_supported_for_type", model_type=self.model_type - ) - ) - # f"{suffix} file suffix not supported " - # f"for models of type {self.model_type}.\n" diff --git a/development_files/structural_attack.ipynb b/development_files/structural_attack.ipynb deleted file mode 100644 index f64020d0..00000000 --- a/development_files/structural_attack.ipynb +++ /dev/null @@ -1,957 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "5b5bd89b-c0f9-476a-80a2-79ad044e11d2", - "metadata": {}, - "source": [ - "# Notebook for developing code to go into structural_attack class" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "dd7f7614-cbac-43a5-bf90-a59712eca953", - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "import os\n", - "\n", - "\n", - "# for development use local copy of aisdc in preference to installed version\n", - "sys.path.insert(0, os.path.abspath(\"..\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "711cbd17-2e8e-452c-b9be-0b662579e333", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "import numpy as np\n", - "from sklearn.datasets import load_breast_cancer\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.tree import DecisionTreeClassifier\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from xgboost import XGBClassifier\n", - "from sklearn.svm import SVC\n", - "\n", - "\n", - "from aisdc.attacks.structural_attack import (\n", - " StructuralAttack,\n", - ") # pylint: disable = import-error\n", - "from aisdc.attacks.target import Target # pylint: disable = import-error" - ] - }, - { - "cell_type": "markdown", - "id": "536bf3bd-b5cc-4c8e-abed-bcd6dcfdf96e", - "metadata": {}, - "source": [ - "## helper function for test" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b761fd2e-4a96-49f3-bbc4-888e26382c15", - "metadata": {}, - "outputs": [], - "source": [ - "def get_target(modeltype: str, **kwargs) -> Target:\n", - " \"\"\"loads dataset and creates target of the desired type\"\"\"\n", - "\n", - " X, y = load_breast_cancer(return_X_y=True, as_frame=False)\n", - " train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3)\n", - "\n", - " # these types should be handled\n", - " if modeltype == \"dt\":\n", - " target_model = DecisionTreeClassifier(**kwargs)\n", - " elif modeltype == \"rf\":\n", - " target_model = RandomForestClassifier(**kwargs)\n", - " elif modeltype == \"xgb\":\n", - " target_model = XGBClassifier(**kwargs)\n", - " # should get polite error but not DoF yet\n", - " 
elif modeltype == \"svc\":\n", - " target_model = SVC(**kwargs)\n", - " else:\n", - " raise NotImplementedError(\"model type passed to get_model unknown\")\n", - "\n", - " # Train the classifier\n", - " target_model.fit(train_X, train_y)\n", - "\n", - " # Wrap the model and data in a Target object\n", - " target = Target(model=target_model)\n", - " target.add_processed_data(train_X, train_y, test_X, test_y)\n", - "\n", - " return target" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "c4821ec0-f718-45f3-8912-3fcf69056e4e", - "metadata": {}, - "outputs": [], - "source": [ - "import importlib\n", - "import aisdc.attacks.structural_attack\n", - "\n", - "importlib.reload(aisdc.attacks.structural_attack)\n", - "from aisdc.attacks.structural_attack import StructuralAttack" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "8413ddf8-730f-4bf6-8299-7e01dc4806e3", - "metadata": {}, - "outputs": [], - "source": [ - "def test_dt():\n", - " \"\"\"test for decision tree classifier\"\"\"\n", - "\n", - " print(\"\\n\\n\\n====== Non Disclosive ====\\n\\n\")\n", - "\n", - " param_dict = {\"max_depth\": 1, \"min_samples_leaf\": 150}\n", - " target = get_target(\"dt\", **param_dict)\n", - " target_path = target.save(\"dt.sav\")\n", - " myattack = StructuralAttack(target_path=\"dt.sav\")\n", - " myattack.attack(target)\n", - " # assert myattack.DoF_risk ==0 ,\"should be no DoF risk with devision stump\"\n", - " # assert myattack.k_anonymity_risk ==0, 'should be no k-anonymity risk with min_samples_leaf 150'\n", - " # assert myattack.class_disclosure_risk ==0,'no class disclsoure risk for stump with min samles leaf 150'\n", - " # assert myattack.unnecessary_risk ==0, 'not unnecessary risk if max_depth < 3.5'\n", - " print(\n", - " f\"equiv_classes is {myattack.equiv_classes}\\n\"\n", - " f\"equiv_counts is {myattack.equiv_counts}\\n\"\n", - " f\"equiv_members is {myattack.equiv_members}\\n\"\n", - " )\n", - "\n", - " print(\"\\n\\n\\n====== Now Disclosive ====\\n\\n\")\n", - " # highly disclosive\n", - " param_dict = {\"max_depth\": None, \"min_samples_leaf\": 5, \"min_samples_split\": 2}\n", - " target2 = get_target(\"dt\", **param_dict)\n", - " myattack2 = StructuralAttack()\n", - " myattack2.attack(target2)\n", - " # assert myattack2.DoF_risk ==0 ,\"should be no DoF risk with decision stump\"\n", - " # assert myattack2.k_anonymity_risk ==1, 'should be k-anonymity risk with unlimited depth and min_samples_leaf 5'\n", - " # assert myattack2.class_disclosure_risk ==1,'should be class disclosure risk with unlimited depth and min_samples_leaf 5'\n", - " # assert myattack2.unnecessary_risk ==1, ' unnecessary risk with unlimited depth and min_samples_leaf 5'\n", - " # print(f' attack._get_param_names returns {myattack2._get_param_names()}')\n", - " # print(f' attack.get_params returns {myattack2.get_params()}')\n", - "\n", - " print(\n", - " f\"equiv_classes is {myattack2.equiv_classes}\\n\"\n", - " f\"equiv_counts is {myattack2.equiv_counts}\\n\"\n", - " f\"equiv_members is {myattack2.equiv_members}\\n\"\n", - " )\n", - "\n", - " # myattack.make_report()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "215781af-a74d-4300-b572-1a9f696457b8", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:acro:version: 0.4.2\n", - "INFO:acro:config: {'safe_threshold': 10, 'safe_dof_threshold': 10, 'safe_nk_n': 2, 'safe_nk_k': 0.9, 'safe_pratio_p': 0.1, 'check_missing_values': False}\n", - "INFO:acro:automatic 
suppression: False\n", - "INFO:structural_attack:Thresholds for count 10 and DoF 10\n", - "INFO:acro:version: 0.4.2\n", - "INFO:acro:config: {'safe_threshold': 10, 'safe_dof_threshold': 10, 'safe_nk_n': 2, 'safe_nk_k': 0.9, 'safe_pratio_p': 0.1, 'check_missing_values': False}\n", - "INFO:acro:automatic suppression: False\n", - "INFO:structural_attack:Thresholds for count 10 and DoF 10\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n", - "====== Non Disclosive ====\n", - "\n", - "\n", - "ingroup [ 0 2 5 7 9 10 11 12 14 15 17 19 20 22 24 26 28 29\n", - " 30 32 34 35 36 37 38 39 40 41 43 44 46 47 48 50 51 52\n", - " 53 54 57 58 61 63 64 65 66 69 71 72 73 76 78 81 82 85\n", - " 86 87 88 89 92 93 94 97 98 102 103 105 106 108 109 110 113 115\n", - " 116 117 118 121 123 125 126 128 130 131 132 134 135 137 138 139 140 141\n", - " 142 143 145 146 147 148 149 153 154 156 158 162 163 167 169 170 172 173\n", - " 176 178 180 181 182 183 184 186 187 188 192 193 195 196 197 198 199 201\n", - " 202 203 206 207 209 211 213 215 218 219 220 222 223 224 225 226 228 229\n", - " 231 233 235 237 238 240 241 242 245 246 247 248 250 252 254 256 258 259\n", - " 261 262 264 265 269 272 273 274 275 276 277 278 279 282 284 285 286 288\n", - " 289 294 299 300 303 304 307 308 309 311 312 314 315 316 317 318 319 321\n", - " 323 324 328 329 330 332 334 335 336 340 342 344 346 347 351 352 354 355\n", - " 357 358 360 362 365 366 367 371 373 374 375 377 379 380 381 384 386 388\n", - " 389 392 394 395 396 397],count 240\n", - "ingroup [ 1 3 4 6 8 13 16 18 21 23 25 27 31 33 42 45 49 55\n", - " 56 59 60 62 67 68 70 74 75 77 79 80 83 84 90 91 95 96\n", - " 99 100 101 104 107 111 112 114 119 120 122 124 127 129 133 136 144 150\n", - " 151 152 155 157 159 160 161 164 165 166 168 171 174 175 177 179 185 189\n", - " 190 191 194 200 204 205 208 210 212 214 216 217 221 227 230 232 234 236\n", - " 239 243 244 249 251 253 255 257 260 263 266 267 268 270 271 280 281 283\n", - " 287 290 291 292 293 295 296 297 298 301 302 305 306 310 313 320 322 325\n", - " 326 327 331 333 337 338 339 341 343 345 348 349 350 353 356 359 361 363\n", - " 364 368 369 370 372 376 378 382 383 385 387 390 391 393],count 158\n", - "equiv_classes is [1 2]\n", - "equiv_counts is [240 158]\n", - "equiv_members is [array([ 0, 2, 5, 7, 9, 10, 11, 12, 14, 15, 17, 19, 20,\n", - " 22, 24, 26, 28, 29, 30, 32, 34, 35, 36, 37, 38, 39,\n", - " 40, 41, 43, 44, 46, 47, 48, 50, 51, 52, 53, 54, 57,\n", - " 58, 61, 63, 64, 65, 66, 69, 71, 72, 73, 76, 78, 81,\n", - " 82, 85, 86, 87, 88, 89, 92, 93, 94, 97, 98, 102, 103,\n", - " 105, 106, 108, 109, 110, 113, 115, 116, 117, 118, 121, 123, 125,\n", - " 126, 128, 130, 131, 132, 134, 135, 137, 138, 139, 140, 141, 142,\n", - " 143, 145, 146, 147, 148, 149, 153, 154, 156, 158, 162, 163, 167,\n", - " 169, 170, 172, 173, 176, 178, 180, 181, 182, 183, 184, 186, 187,\n", - " 188, 192, 193, 195, 196, 197, 198, 199, 201, 202, 203, 206, 207,\n", - " 209, 211, 213, 215, 218, 219, 220, 222, 223, 224, 225, 226, 228,\n", - " 229, 231, 233, 235, 237, 238, 240, 241, 242, 245, 246, 247, 248,\n", - " 250, 252, 254, 256, 258, 259, 261, 262, 264, 265, 269, 272, 273,\n", - " 274, 275, 276, 277, 278, 279, 282, 284, 285, 286, 288, 289, 294,\n", - " 299, 300, 303, 304, 307, 308, 309, 311, 312, 314, 315, 316, 317,\n", - " 318, 319, 321, 323, 324, 328, 329, 330, 332, 334, 335, 336, 340,\n", - " 342, 344, 346, 347, 351, 352, 354, 355, 357, 358, 360, 362, 365,\n", - " 366, 367, 371, 373, 374, 375, 377, 379, 380, 
381, 384, 386, 388,\n", - " 389, 392, 394, 395, 396, 397]), array([ 1, 3, 4, 6, 8, 13, 16, 18, 21, 23, 25, 27, 31,\n", - " 33, 42, 45, 49, 55, 56, 59, 60, 62, 67, 68, 70, 74,\n", - " 75, 77, 79, 80, 83, 84, 90, 91, 95, 96, 99, 100, 101,\n", - " 104, 107, 111, 112, 114, 119, 120, 122, 124, 127, 129, 133, 136,\n", - " 144, 150, 151, 152, 155, 157, 159, 160, 161, 164, 165, 166, 168,\n", - " 171, 174, 175, 177, 179, 185, 189, 190, 191, 194, 200, 204, 205,\n", - " 208, 210, 212, 214, 216, 217, 221, 227, 230, 232, 234, 236, 239,\n", - " 243, 244, 249, 251, 253, 255, 257, 260, 263, 266, 267, 268, 270,\n", - " 271, 280, 281, 283, 287, 290, 291, 292, 293, 295, 296, 297, 298,\n", - " 301, 302, 305, 306, 310, 313, 320, 322, 325, 326, 327, 331, 333,\n", - " 337, 338, 339, 341, 343, 345, 348, 349, 350, 353, 356, 359, 361,\n", - " 363, 364, 368, 369, 370, 372, 376, 378, 382, 383, 385, 387, 390,\n", - " 391, 393])]\n", - "\n", - "\n", - "\n", - "\n", - "====== Now Disclosive ====\n", - "\n", - "\n", - "ingroup [ 34 89 95 100 157 252 282 296],count 8\n", - "ingroup [ 29 42 183 337 393],count 5\n", - "ingroup [ 1 2 3 4 6 7 9 10 12 13 16 20 21 22 27 28 35 36\n", - " 37 39 40 46 48 49 50 51 52 55 57 58 63 64 66 67 69 70\n", - " 72 76 77 79 80 81 85 93 97 102 104 105 106 110 111 112 114 115\n", - " 116 118 119 121 123 125 128 130 131 135 137 138 139 141 142 146 150 152\n", - " 155 156 158 159 160 162 164 165 167 168 169 170 171 172 175 176 178 180\n", - " 182 184 186 189 192 195 198 199 200 201 203 204 205 208 210 212 214 215\n", - " 216 219 220 221 222 225 226 228 230 235 238 240 241 244 246 247 250 251\n", - " 253 254 256 259 261 263 264 265 267 268 269 270 272 273 274 276 278 279\n", - " 281 284 287 288 290 291 293 295 297 298 299 300 301 303 304 311 312 317\n", - " 318 319 320 321 322 323 324 325 326 327 331 332 334 342 343 345 346 347\n", - " 348 349 353 356 359 360 362 364 365 369 370 373 375 376 377 378 380 382\n", - " 384 385 387 390 392 396 397],count 205\n", - "ingroup [133 193 285 330 352],count 5\n", - "ingroup [ 31 53 68 122 394],count 5\n", - "ingroup [ 60 103 237 262 266 391],count 6\n", - "ingroup [ 88 124 147 174 207 379],count 6\n", - "ingroup [ 75 113 140 179 181 232 307 361],count 8\n", - "ingroup [ 18 47 191 234 351],count 5\n", - "ingroup [ 11 117 134 149 242 271 277 329 338 371 395],count 11\n", - "ingroup [145 248 302 339 374],count 5\n", - "ingroup [ 19 255 341 366 368],count 5\n", - "ingroup [ 0 5 8 14 15 17 23 24 25 26 30 32 33 38 41 43 44 45\n", - " 54 56 59 61 62 65 71 73 74 78 82 83 84 86 87 90 91 92\n", - " 94 96 98 99 101 107 108 109 120 126 127 129 132 136 143 144 148 151\n", - " 153 154 161 163 166 173 177 185 187 188 190 194 196 197 202 206 209 211\n", - " 213 217 218 223 224 227 229 231 233 236 239 243 245 249 257 258 260 275\n", - " 280 283 286 289 292 294 305 306 308 309 310 313 314 315 316 328 333 335\n", - " 336 340 344 350 354 355 357 358 363 367 372 381 383 386 388 389],count 124\n", - "equiv_classes is [ 5 6 7 10 11 12 15 16 18 19 22 23 24]\n", - "equiv_counts is [ 8 5 205 5 5 6 6 8 5 11 5 5 124]\n", - "equiv_members is [array([ 34, 89, 95, 100, 157, 252, 282, 296]), array([ 29, 42, 183, 337, 393]), array([ 1, 2, 3, 4, 6, 7, 9, 10, 12, 13, 16, 20, 21,\n", - " 22, 27, 28, 35, 36, 37, 39, 40, 46, 48, 49, 50, 51,\n", - " 52, 55, 57, 58, 63, 64, 66, 67, 69, 70, 72, 76, 77,\n", - " 79, 80, 81, 85, 93, 97, 102, 104, 105, 106, 110, 111, 112,\n", - " 114, 115, 116, 118, 119, 121, 123, 125, 128, 130, 131, 135, 137,\n", - " 138, 139, 141, 142, 146, 150, 152, 155, 156, 158, 
159, 160, 162,\n",
- " 164, 165, 167, 168, 169, 170, 171, 172, 175, 176, 178, 180, 182,\n",
- " 184, 186, 189, 192, 195, 198, 199, 200, 201, 203, 204, 205, 208,\n",
- " 210, 212, 214, 215, 216, 219, 220, 221, 222, 225, 226, 228, 230,\n",
- " 235, 238, 240, 241, 244, 246, 247, 250, 251, 253, 254, 256, 259,\n",
- " 261, 263, 264, 265, 267, 268, 269, 270, 272, 273, 274, 276, 278,\n",
- " 279, 281, 284, 287, 288, 290, 291, 293, 295, 297, 298, 299, 300,\n",
- " 301, 303, 304, 311, 312, 317, 318, 319, 320, 321, 322, 323, 324,\n",
- " 325, 326, 327, 331, 332, 334, 342, 343, 345, 346, 347, 348, 349,\n",
- " 353, 356, 359, 360, 362, 364, 365, 369, 370, 373, 375, 376, 377,\n",
- " 378, 380, 382, 384, 385, 387, 390, 392, 396, 397]), array([133, 193, 285, 330, 352]), array([ 31, 53, 68, 122, 394]), array([ 60, 103, 237, 262, 266, 391]), array([ 88, 124, 147, 174, 207, 379]), array([ 75, 113, 140, 179, 181, 232, 307, 361]), array([ 18, 47, 191, 234, 351]), array([ 11, 117, 134, 149, 242, 271, 277, 329, 338, 371, 395]), array([145, 248, 302, 339, 374]), array([ 19, 255, 341, 366, 368]), array([ 0, 5, 8, 14, 15, 17, 23, 24, 25, 26, 30, 32, 33,\n",
- " 38, 41, 43, 44, 45, 54, 56, 59, 61, 62, 65, 71, 73,\n",
- " 74, 78, 82, 83, 84, 86, 87, 90, 91, 92, 94, 96, 98,\n",
- " 99, 101, 107, 108, 109, 120, 126, 127, 129, 132, 136, 143, 144,\n",
- " 148, 151, 153, 154, 161, 163, 166, 173, 177, 185, 187, 188, 190,\n",
- " 194, 196, 197, 202, 206, 209, 211, 213, 217, 218, 223, 224, 227,\n",
- " 229, 231, 233, 236, 239, 243, 245, 249, 257, 258, 260, 275, 280,\n",
- " 283, 286, 289, 292, 294, 305, 306, 308, 309, 310, 313, 314, 315,\n",
- " 316, 328, 333, 335, 336, 340, 344, 350, 354, 355, 357, 358, 363,\n",
- " 367, 372, 381, 383, 386, 388, 389])]\n",
- "\n"
- ]
- }
- ],
- "source": [
- "test_dt()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "deb9d05a-24aa-464c-9a7b-0647ae1a56e1",
- "metadata": {},
- "outputs": [],
- "source": [
- "def test_rf():\n",
- "    \"\"\"test for random forest classifier\"\"\"\n",
- "\n",
- "    print(\"\\n\\n\\n====== Non Disclosive ====\\n\\n\")\n",
- "\n",
- "    # stumps with large leaves should not be disclosive\n",
- "    param_dict = {\"max_depth\": 1, \"min_samples_leaf\": 150, \"n_estimators\": 5}\n",
- "    target = get_target(\"rf\", **param_dict)\n",
- "    target_path = target.save(\"rf.sav\")\n",
- "    myattack = StructuralAttack(target_path=\"rf.sav\")\n",
- "    myattack.attack(target)\n",
- "    # assert myattack.DoF_risk ==0 ,\"should be no DoF risk with decision stump\"\n",
- "    # assert myattack.k_anonymity_risk ==0, 'should be no k-anonymity risk with min_samples_leaf 150'\n",
- "    # assert myattack.class_disclosure_risk ==0,'no class disclosure risk for stump with min_samples_leaf 150'\n",
- "    # assert myattack.unnecessary_risk ==0, 'no unnecessary risk if max_depth < 3.5'\n",
- "    print(\n",
- "        f\" {len(myattack.equiv_classes)} equiv_classes:\\n{myattack.equiv_classes}\\n\"\n",
- "        f\"equiv_counts is {myattack.equiv_counts}\\n\"\n",
- "        # f'equiv_members is {myattack.equiv_members}\\n'\n",
- "    )\n",
- "    for i in range(len(myattack.equiv_members)):\n",
- "        print(\n",
- "            f\" {len(myattack.equiv_members[i])} members for group {i}\\n\"\n",
- "            f\"{myattack.equiv_members[i]}\"\n",
- "        )\n",
- "\n",
- "    print(\"\\n\\n\\n====== Now Disclosive ====\\n\\n\")\n",
- "    # highly disclosive: unlimited depth and tiny leaves\n",
- "    param_dict = {\n",
- "        \"max_depth\": None,\n",
- "        \"min_samples_leaf\": 5,\n",
- "        \"min_samples_split\": 2,\n",
- "        \"n_estimators\": 5,\n",
- "    }\n",
- "    target2 = get_target(\"rf\", **param_dict)\n",
- "    myattack2 = StructuralAttack()\n",
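- "    # note: no target_path is supplied for this second case, so attack() is\n",
- "    # applied directly to the in-memory Target, whereas the first case above\n",
- "    # also exercised the save-to-file / target_path route\n",
- "    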
myattack2.attack(target2)\n", - " # assert myattack2.DoF_risk ==0 ,\"should be no DoF risk with decision stump\"\n", - " # assert myattack2.k_anonymity_risk ==1, 'should be k-anonymity risk with unlimited depth and min_samples_leaf 5'\n", - " # assert myattack2.class_disclosure_risk ==1,'should be class disclosure risk with unlimited depth and min_samples_leaf 5'\n", - " # assert myattack2.unnecessary_risk ==1, ' unnecessary risk with unlimited depth and min_samples_leaf 5'\n", - " print(f\" attack._get_param_names returns {myattack2._get_param_names()}\")\n", - " print(f\" attack.get_params returns {myattack2.get_params()}\")\n", - "\n", - " print(\n", - " f\" {len(myattack2.equiv_classes)} equiv_classes:\\n{myattack2.equiv_classes}\\n\"\n", - " f\"equiv_counts is {myattack2.equiv_counts}\\n\"\n", - " # f'equiv_members is {myattack2.equiv_members}\\n'\n", - " )\n", - " for i in range(len(myattack2.equiv_members)):\n", - " print(\n", - " f\" {len(myattack2.equiv_members[i])} members for group {i}\\n\"\n", - " f\"{myattack2.equiv_members[i]}\"\n", - " )\n", - "\n", - " # myattack.make_report()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "b962b63e-4b6b-47e0-b718-7d0e649dabc5", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:acro:version: 0.4.2\n", - "INFO:acro:config: {'safe_threshold': 10, 'safe_dof_threshold': 10, 'safe_nk_n': 2, 'safe_nk_k': 0.9, 'safe_pratio_p': 0.1, 'check_missing_values': False}\n", - "INFO:acro:automatic suppression: False\n", - "INFO:structural_attack:Thresholds for count 10 and DoF 10\n", - "INFO:acro:version: 0.4.2\n", - "INFO:acro:config: {'safe_threshold': 10, 'safe_dof_threshold': 10, 'safe_nk_n': 2, 'safe_nk_k': 0.9, 'safe_pratio_p': 0.1, 'check_missing_values': False}\n", - "INFO:acro:automatic suppression: False\n", - "INFO:structural_attack:Thresholds for count 10 and DoF 10\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n", - "====== Non Disclosive ====\n", - "\n", - "\n", - " 1 equiv_classes:\n", - "[[0.33919598 0.66080402]]\n", - "equiv_counts is [398]\n", - "\n", - " 398 members for group 0\n", - "[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17\n", - " 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35\n", - " 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53\n", - " 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71\n", - " 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89\n", - " 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107\n", - " 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125\n", - " 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143\n", - " 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161\n", - " 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179\n", - " 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197\n", - " 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215\n", - " 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233\n", - " 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251\n", - " 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269\n", - " 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287\n", - " 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305\n", - " 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323\n", - " 324 325 326 327 328 329 
330 331 332 333 334 335 336 337 338 339 340 341\n", - " 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359\n", - " 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377\n", - " 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395\n", - " 396 397]\n", - "\n", - "\n", - "\n", - "====== Now Disclosive ====\n", - "\n", - "\n", - " attack._get_param_names returns ['risk_appetite_config', 'target_path', 'output_dir', 'report_name']\n", - " attack.get_params returns {'risk_appetite_config': 'default', 'target_path': None, 'output_dir': 'outputs_structural', 'report_name': 'report_structural'}\n", - " 88 equiv_classes:\n", - "[[0. 1. ]\n", - " [0.01818182 0.98181818]\n", - " [0.02 0.98 ]\n", - " [0.025 0.975 ]\n", - " [0.03333333 0.96666667]\n", - " [0.04040404 0.95959596]\n", - " [0.04444444 0.95555556]\n", - " [0.05333333 0.94666667]\n", - " [0.05714286 0.94285714]\n", - " [0.06 0.94 ]\n", - " [0.06444444 0.93555556]\n", - " [0.06666667 0.93333333]\n", - " [0.08 0.92 ]\n", - " [0.08222222 0.91777778]\n", - " [0.08666667 0.91333333]\n", - " [0.1 0.9 ]\n", - " [0.11428571 0.88571429]\n", - " [0.12 0.88 ]\n", - " [0.13333333 0.86666667]\n", - " [0.13611111 0.86388889]\n", - " [0.15555556 0.84444444]\n", - " [0.16444444 0.83555556]\n", - " [0.16666667 0.83333333]\n", - " [0.16666667 0.83333333]\n", - " [0.18222222 0.81777778]\n", - " [0.19428571 0.80571429]\n", - " [0.24040404 0.75959596]\n", - " [0.25555556 0.74444444]\n", - " [0.25714286 0.74285714]\n", - " [0.26984127 0.73015873]\n", - " [0.27111111 0.72888889]\n", - " [0.29047619 0.70952381]\n", - " [0.30277778 0.69722222]\n", - " [0.30707071 0.69292929]\n", - " [0.36103896 0.63896104]\n", - " [0.38608059 0.61391941]\n", - " [0.38698413 0.61301587]\n", - " [0.44761905 0.55238095]\n", - " [0.44888889 0.55111111]\n", - " [0.47777778 0.52222222]\n", - " [0.48770563 0.51229437]\n", - " [0.52031746 0.47968254]\n", - " [0.54761905 0.45238095]\n", - " [0.57818182 0.42181818]\n", - " [0.57936508 0.42063492]\n", - " [0.5847619 0.4152381 ]\n", - " [0.58888889 0.41111111]\n", - " [0.59261905 0.40738095]\n", - " [0.5956044 0.4043956 ]\n", - " [0.62 0.38 ]\n", - " [0.62666667 0.37333333]\n", - " [0.63142857 0.36857143]\n", - " [0.64322344 0.35677656]\n", - " [0.64484848 0.35515152]\n", - " [0.65555556 0.34444444]\n", - " [0.65655678 0.34344322]\n", - " [0.70989011 0.29010989]\n", - " [0.72380952 0.27619048]\n", - " [0.72666667 0.27333333]\n", - " [0.7556044 0.2443956 ]\n", - " [0.76103896 0.23896104]\n", - " [0.78888889 0.21111111]\n", - " [0.8 0.2 ]\n", - " [0.80285714 0.19714286]\n", - " [0.82222222 0.17777778]\n", - " [0.82666667 0.17333333]\n", - " [0.82698413 0.17301587]\n", - " [0.84131868 0.15868132]\n", - " [0.85333333 0.14666667]\n", - " [0.85555556 0.14444444]\n", - " [0.85555556 0.14444444]\n", - " [0.85714286 0.14285714]\n", - " [0.86 0.14 ]\n", - " [0.86666667 0.13333333]\n", - " [0.86989011 0.13010989]\n", - " [0.87179487 0.12820513]\n", - " [0.88888889 0.11111111]\n", - " [0.89333333 0.10666667]\n", - " [0.89846154 0.10153846]\n", - " [0.9047619 0.0952381 ]\n", - " [0.93142857 0.06857143]\n", - " [0.93333333 0.06666667]\n", - " [0.93846154 0.06153846]\n", - " [0.94285714 0.05714286]\n", - " [0.96 0.04 ]\n", - " [0.96666667 0.03333333]\n", - " [0.97142857 0.02857143]\n", - " [1. 0. 
]]\n", - "equiv_counts is [173 8 3 3 8 1 5 3 1 5 1 1 3 1 5 2 1 1\n", - " 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", - " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", - " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4\n", - " 1 2 1 1 2 1 1 1 3 5 1 1 1 7 1 86]\n", - "\n", - " 173 members for group 0\n", - "[ 1 2 3 5 6 7 10 11 15 18 20 22 26 28 29 30 32 36\n", - " 37 39 48 50 53 60 62 63 64 65 66 68 69 70 71 73 75 78\n", - " 81 82 84 88 92 93 96 102 103 109 110 115 117 119 122 123 124 126\n", - " 130 131 134 135 141 143 148 149 150 151 156 158 159 161 162 163 165 166\n", - " 174 180 181 182 186 189 191 192 195 197 198 199 200 202 203 206 210 212\n", - " 213 215 216 218 220 221 223 226 231 232 233 235 237 240 241 245 247 248\n", - " 253 254 256 259 260 261 263 264 267 269 270 271 272 281 284 285 290 291\n", - " 295 304 305 306 307 309 311 314 315 317 318 321 324 327 328 331 333 334\n", - " 335 337 338 339 341 342 345 349 350 355 360 361 365 369 370 373 375 377\n", - " 378 382 385 386 387 388 389 391 392 394 395]\n", - " 8 members for group 1\n", - "[ 16 43 104 121 152 177 229 384]\n", - " 3 members for group 2\n", - "[ 13 336 380]\n", - " 3 members for group 3\n", - "[ 98 287 301]\n", - " 8 members for group 4\n", - "[ 51 170 214 222 262 273 276 396]\n", - " 1 members for group 5\n", - "[279]\n", - " 5 members for group 6\n", - "[ 27 35 42 106 371]\n", - " 3 members for group 7\n", - "[ 55 129 138]\n", - " 1 members for group 8\n", - "[80]\n", - " 5 members for group 9\n", - "[ 79 116 164 252 286]\n", - " 1 members for group 10\n", - "[160]\n", - " 1 members for group 11\n", - "[77]\n", - " 3 members for group 12\n", - "[ 49 120 289]\n", - " 1 members for group 13\n", - "[176]\n", - " 5 members for group 14\n", - "[ 87 173 208 322 359]\n", - " 2 members for group 15\n", - "[91 99]\n", - " 1 members for group 16\n", - "[146]\n", - " 1 members for group 17\n", - "[376]\n", - " 2 members for group 18\n", - "[154 280]\n", - " 1 members for group 19\n", - "[196]\n", - " 1 members for group 20\n", - "[108]\n", - " 1 members for group 21\n", - "[46]\n", - " 1 members for group 22\n", - "[89]\n", - " 1 members for group 23\n", - "[234]\n", - " 1 members for group 24\n", - "[275]\n", - " 1 members for group 25\n", - "[72]\n", - " 1 members for group 26\n", - "[308]\n", - " 1 members for group 27\n", - "[136]\n", - " 1 members for group 28\n", - "[243]\n", - " 1 members for group 29\n", - "[368]\n", - " 1 members for group 30\n", - "[381]\n", - " 1 members for group 31\n", - "[67]\n", - " 1 members for group 32\n", - "[172]\n", - " 1 members for group 33\n", - "[111]\n", - " 1 members for group 34\n", - "[294]\n", - " 1 members for group 35\n", - "[348]\n", - " 1 members for group 36\n", - "[219]\n", - " 1 members for group 37\n", - "[178]\n", - " 1 members for group 38\n", - "[204]\n", - " 1 members for group 39\n", - "[312]\n", - " 1 members for group 40\n", - "[320]\n", - " 1 members for group 41\n", - "[169]\n", - " 1 members for group 42\n", - "[56]\n", - " 1 members for group 43\n", - "[364]\n", - " 1 members for group 44\n", - "[155]\n", - " 1 members for group 45\n", - "[113]\n", - " 1 members for group 46\n", - "[288]\n", - " 1 members for group 47\n", - "[54]\n", - " 1 members for group 48\n", - "[282]\n", - " 1 members for group 49\n", - "[351]\n", - " 1 members for group 50\n", - "[372]\n", - " 1 members for group 51\n", - "[105]\n", - " 1 members for group 52\n", - "[86]\n", - " 1 members for group 53\n", - "[8]\n", - " 1 members for group 54\n", - "[224]\n", - " 1 members for group 55\n", - "[225]\n", - " 1 
members for group 56\n",
- "[40]\n",
- " 1 members for group 57\n",
- "[313]\n",
- " 1 members for group 58\n",
- "[145]\n",
- " 1 members for group 59\n",
- "[19]\n",
- " 1 members for group 60\n",
- "[83]\n",
- " 1 members for group 61\n",
- "[193]\n",
- " 1 members for group 62\n",
- "[362]\n",
- " 1 members for group 63\n",
- "[14]\n",
- " 1 members for group 64\n",
- "[292]\n",
- " 1 members for group 65\n",
- "[356]\n",
- " 1 members for group 66\n",
- "[397]\n",
- " 1 members for group 67\n",
- "[367]\n",
- " 1 members for group 68\n",
- "[296]\n",
- " 2 members for group 69\n",
- "[ 0 207]\n",
- " 2 members for group 70\n",
- "[ 0 207]\n",
- " 4 members for group 71\n",
- "[ 52 242 302 344]\n",
- " 1 members for group 72\n",
- "[293]\n",
- " 2 members for group 73\n",
- "[323 330]\n",
- " 1 members for group 74\n",
- "[326]\n",
- " 1 members for group 75\n",
- "[379]\n",
- " 2 members for group 76\n",
- "[ 34 363]\n",
- " 1 members for group 77\n",
- "[266]\n",
- " 1 members for group 78\n",
- "[157]\n",
- " 1 members for group 79\n",
- "[297]\n",
- " 3 members for group 80\n",
- "[ 47 94 217]\n",
- " 5 members for group 81\n",
- "[ 24 168 205 319 329]\n",
- " 1 members for group 82\n",
- "[171]\n",
- " 1 members for group 83\n",
- "[268]\n",
- " 1 members for group 84\n",
- "[12]\n",
- " 7 members for group 85\n",
- "[114 127 132 209 250 283 298]\n",
- " 1 members for group 86\n",
- "[257]\n",
- " 86 members for group 87\n",
- "[ 4 9 17 21 23 25 31 33 38 41 44 45 57 58 59 61 74 76\n",
- " 85 90 95 97 100 101 107 112 118 125 128 133 137 139 140 142 144 147\n",
- " 153 167 175 179 183 184 185 187 188 190 194 201 211 227 228 230 236 238\n",
- " 239 244 246 249 251 255 258 265 274 277 278 299 300 303 310 316 325 332\n",
- " 340 343 346 347 352 353 354 357 358 366 374 383 390 393]\n"
- ]
- }
- ],
- "source": [
- "test_rf()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3d9fc566-9c2f-468f-b664-41ff71fa366f",
- "metadata": {},
- "outputs": [],
- "source": [
- "from acro import ACRO\n",
- "from scipy.io.arff import loadarff\n",
- "\n",
- "acro = ACRO()\n",
- "\n",
- "# load the nursery data and decode the byte strings\n",
- "path = os.path.join(\"../data\", \"nursery.arff\")\n",
- "data = loadarff(path)\n",
- "df = pd.DataFrame(data[0])\n",
- "df = df.select_dtypes([object])\n",
- "df = df.stack().str.decode(\"utf-8\").unstack()\n",
- "df.rename(columns={\"class\": \"recommend\"}, inplace=True)\n",
- "df.head()\n",
- "\n",
- "# make 'children' numeric, mapping 'more' to a random value in 4..9\n",
- "df[\"children\"].replace(to_replace={\"more\": \"4\"}, inplace=True)\n",
- "df[\"children\"] = pd.to_numeric(df[\"children\"])\n",
- "df[\"children\"] = df.apply(\n",
- "    lambda row: (\n",
- "        row[\"children\"] if row[\"children\"] in (1, 2, 3) else np.random.randint(4, 10)\n",
- "    ),\n",
- "    axis=1,\n",
- ")\n",
- "\n",
- "# ACRO-checked crosstab of mean number of children by recommendation and parents\n",
- "mytable = acro.crosstab(\n",
- "    [df.recommend, df.parents], df.has_nurs, values=df.children, aggfunc=\"mean\"\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8d5cbff4-08cb-4e4d-933b-5cfd7928615f",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_whitebox_class_disclosure(yprobs: np.ndarray,\n",
- "                                  true_labels: np.ndarray,\n",
- "                                  threshold: int,\n",
- "                                  ignore_zeros: bool) -> tuple[int, int]:\n",
- "    \"\"\"\n",
- "    Ingests the proba values created when a classifier is applied to a set\n",
- "    of records and returns details of whitebox group membership.\n",
- "\n",
- "    Parameters\n",
- "    ----------\n",
- "    yprobs: np.ndarray\n",
- "        numpy 2d array, one row per record, one column per output class\n",
- "    true_labels: np.ndarray\n",
- "        1d array, one element for each row in yprobs, giving the actual class label\n",
- "    threshold: int\n",
- "        minimum number of (non-zero) records of each class in each equivalence group\n",
- "    ignore_zeros: bool\n",
- "        whether the threshold checking should ignore 'evidential zeros', i.e. unrepresented classes\n",
- "\n",
- "    Returns\n",
- "    -------\n",
- "    tuple[int, int]:\n",
- "        model is whitebox class disclosive (1) or not (0) according to\n",
- "        probability*membership counts (tuple[0]) or the actual labels of\n",
- "        the group members (tuple[1])\n",
- "    \"\"\"\n",
- "    n_classes = yprobs.shape[1]\n",
- "    n_rows = yprobs.shape[0]\n",
- "    assert len(true_labels) == n_rows, f\"shape mismatch: lengths of yprobs {n_rows} and true_labels {len(true_labels)}\"\n",
- "\n",
- "    # groups are equivalence classes in predicted class probability space\n",
- "    uniques = np.unique(yprobs, axis=0, return_counts=True)\n",
- "    uniq_probs = uniques[0]\n",
- "    uniq_freqs = uniques[1]\n",
- "    class_freqs = np.zeros(uniq_probs.shape, dtype=float)\n",
- "    membership = []\n",
- "\n",
- "    # check disclosure according to the proba values: the estimated per-class\n",
- "    # counts are the group's probabilities times its size\n",
- "    disclosive_by_freqs = 0\n",
- "    for group in range(len(uniq_probs)):\n",
- "        class_freqs[group] = uniq_probs[group, :] * uniq_freqs[group]\n",
- "        for label in range(n_classes):\n",
- "            if class_freqs[group][label] == 0 and not ignore_zeros:\n",
- "                disclosive_by_freqs = 1\n",
- "            elif 0 < class_freqs[group][label] < threshold:\n",
- "                disclosive_by_freqs = 1\n",
- "\n",
- "    # now according to the true labels of the records falling into each group\n",
- "    disclosive_by_labels = 0\n",
- "    for prob_vals in uniq_probs:\n",
- "        ingroup = np.all(yprobs == prob_vals, axis=1)\n",
- "        membership.append(np.where(ingroup)[0])\n",
- "        label_counts = np.bincount(true_labels[ingroup].astype(int), minlength=n_classes)\n",
- "        for count in label_counts:\n",
- "            if count == 0 and not ignore_zeros:\n",
- "                disclosive_by_labels = 1\n",
- "            elif 0 < count < threshold:\n",
- "                disclosive_by_labels = 1\n",
- "\n",
- "    return disclosive_by_freqs, disclosive_by_labels\n",
- "\n",
- "\n",
- "# test_whitebox_class_disclosure: scratch test using a small synthetic yprobs array\n",
- "uniqvals = [[0.1, 0.2, 0.7],\n",
- "            [0.6, 0.4, 0.0],\n",
- "            [0.2, 0.4, 0.4]]\n",
- "\n",
- "# 20 records, each assigned one of the three probability rows at random\n",
- "yprobs = np.zeros((20, 3), dtype=float)\n",
- "for i in range(20):\n",
- "    randval = np.random.randint(0, 3)\n",
- "    yprobs[i] = np.array(uniqvals[randval])\n",
- "# print(f'yprobs is \\n{yprobs}')\n",
- "sorted_probs = yprobs[np.lexsort(([yprobs[:, i] for i in range(yprobs.shape[1] - 1, -1, -1)]))]\n",
- "# print(f'sorted_probs is \\n{sorted_probs}')\n",
- "uniques = np.unique(sorted_probs, axis=0, return_counts=True)\n",
- "print(f'np.unique gives {len(uniques[0])} groups')\n",
- "\n",
- "# each class_freqs row should sum to its group count, since probabilities sum to one\n",
- "uprobs = uniques[0]\n",
- "ufreqs = uniques[1]\n",
- "class_freqs = np.zeros(uprobs.shape, dtype=float)\n",
- "for group in range(len(uprobs)):\n",
- "    class_freqs[group] = uprobs[group, :] * ufreqs[group]\n",
- "    print(f'group {group} class_membership {class_freqs[group]}')\n",
- "    errmsg = f'class sum {class_freqs[group].sum()} should equal group count {ufreqs[group]}'\n",
- "    np.testing.assert_almost_equal(class_freqs[group].sum(), ufreqs[group], decimal=3, err_msg=errmsg)\n",
- "print(f'class freqs are:\\n{class_freqs}')\n",
- "\n",
- "# class disclosure step 3: loop through all similarity groups\n",
- "r_ends = []\n",
- "group_first = 0\n",
- "group_last = 0\n",
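- "# the scan below is intended to grow the current group while consecutive\n",
- "# sorted rows are identical, recording each group's end index in r_ends\n",
- 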
"possible_next=group_last+1\n", - "while possible_next