diff --git a/docs/source/apriori.md b/docs/source/apriori.md
new file mode 100644
index 0000000..f52da89
--- /dev/null
+++ b/docs/source/apriori.md
@@ -0,0 +1,35 @@
+# Apriori
+
+Apriori files can be used to control how TauArgus suppresses cells.
+These files can mark individual cells as safe or protected, or modify the suppression cost.
+
+## Use an existing file
+
+```python
+import piargus as pa
+
+apriori = pa.Apriori.from_hst("apriori.hst")
+```
+
+## Create an apriori file programmatically
+
+```python
+import piargus as pa
+
+apriori = pa.Apriori(expand_trivial=True)
+apriori.change_status(['A', 'ExampleDam'], pa.SAFE)
+apriori.change_status(['A', 'ExampleCity'], pa.SAFE)
+apriori.change_cost(['C', 'ExampleDam'], 10)
+apriori.change_protection_level(['C', 'ExampleCity'], 5)
+
+apriori.to_hst("apriori.hst")
+```
+
+## Attaching apriori to a table
+
+Simply pass it as a parameter when creating a `Table` or `TableData` instance:
+
+```python
+table = pa.Table(['symbol', 'regio'], 'income', ...,
+                 apriori=apriori)
+```
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 95c1ffd..1a56809 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -40,7 +40,7 @@
 html_theme = 'sphinx_rtd_theme'
 html_static_path = ['_static']
 
-myst_enable_extensions = ["colon_fence"]
+myst_enable_extensions = ["colon_fence", "dollarmath"]
 
 # Code is in src. Make sure sphinx can find it
 import sys
diff --git a/docs/source/hierarchies.md b/docs/source/hierarchies.md
index 242f36e..ac66580 100644
--- a/docs/source/hierarchies.md
+++ b/docs/source/hierarchies.md
@@ -1,12 +1,12 @@
 # Hierarchies #
 
-For explanatory variables, it is recommended to supply a hierarchy.
-There are 3 kinds of hierarchy supported by PiArgus.
+Hierarchies determine how the codes of an explanatory variable are aggregated into totals.
+PiArgus supports three types of hierarchy.
 
 ## FlatHierarchy ##
 
-This is the default if no hierarchy is supplied.
-All labels are of the same level with a single total.
+The `FlatHierarchy` is used by default if no hierarchy is specified.
+All codes add up to a single total.
 
 ```python
 import piargus as pa
@@ -15,6 +15,8 @@
 datacol = ["A", "B", "C", "B", "A"]
 hierarchy = pa.FlatHierarchy(total_code="Total")
 ```
 
+This creates a simple structure where all values are aggregated into one total.
+
 ```{mermaid}
 graph LR;
 Total --> A;
@@ -24,7 +26,7 @@
 Total --> C;
 ```
 
 ## LevelHierarchy ##
 
-A level hierarchy is useful when the hierarchy is encoded within the code itself.
+A `LevelHierarchy` is used when the hierarchical relationships are encoded directly within the data codes.
 
 ```python
 import piargus as pa
@@ -33,6 +35,9 @@
 datacol = ["11123", "11234", "23456"]
 hierarchy = pa.LevelHierarchy(levels=[2, 3], total_code="Total")
 ```
 
+In this example, the first two digits represent a higher-level grouping,
+and the next three digits represent a more detailed level within that group.
+
 ```{mermaid}
 graph LR;
 Total --> 11;
@@ -44,8 +49,7 @@
 Total --> 23;
 ```
 
 ## TreeHierarchy ##
 
-For complex hierarchies, a TreeHierarchy can be used.
-These are typically stored in a hrc-file.
+A `TreeHierarchy` is used for complex hierarchies, typically stored in `.hrc` files.
 
 ```python
 import piargus as pa
@@ -54,6 +58,8 @@
 datacol = ["PV20", "PV21", "PV22"]
 hierarchy = pa.TreeHierarchy.from_hrc("provinces.hrc", total_code="NL01")
 ```
 
+These hierarchies have a tree-like structure.
+
 ```{mermaid}
 graph LR;
 NL01 --> LD01;
@@ -63,22 +69,30 @@
 LD01 --> PV21;
 LD02 --> PV22;
 ```
 
-The file provinces.hrc may look like this:
-```hrc
-LD01
-@PV20
-@PV21
-LD02
-@PV22
-```
+### Creating a TreeHierarchy programmatically
+
+You can also create a TreeHierarchy programmatically, without relying on an external `.hrc` file.
 
-It can also be created programmatically:
 ```python
 import piargus as pa
 
 hierarchy = pa.TreeHierarchy(total_code="NL01")
-hierarchy.create_node(["NL01", "LD01", "PV20"])
-hierarchy.create_node(["NL01", "LD01", "PV21"])
-hierarchy.create_node(["NL01", "LD02", "PV22"])
+hierarchy.create_node(["LD01", "PV20"])
+hierarchy.create_node(["LD01", "PV21"])
+hierarchy.create_node(["LD02", "PV22"])
 hierarchy.to_hrc('provinces.hrc')
 ```
+
+## Attaching a hierarchy to inputdata
+
+To apply a hierarchy to your data, simply pass the hierarchy as part of the
+`MicroData` or `TableData` constructor:
+
+```python
+import piargus as pa
+
+pa.MicroData(data_df, ...,
+             hierarchies={"region": region_hierarchy})
+```
+
+This will apply the specified `region_hierarchy` to the `region` column in your data.
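+
+Hierarchies can be attached to `TableData` in the same way. A minimal sketch, where `table_df`
+is a hypothetical pre-aggregated dataframe with a `region` column:
+
+```python
+pa.TableData(table_df, ...,
+             hierarchies={"region": region_hierarchy})
+```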
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 2085a5a..7c377c7 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -3,13 +3,13 @@
 Download and install the latest version of `τ-ARGUS `_.
 Make sure to setup the location of the program on your path.
-For example (powershell):
+For example, in PowerShell:
 
 .. code-block:: powershell
 
     $env:path += ";\Path\To\Folder\Containing\TauArgus\Program"
     # Please adapt locally to put your own path here
 
-Use `pip `_ to install piargus.
+Next, use `pip `_ to install piargus.
 
 .. code-block:: powershell
diff --git a/docs/source/result.md b/docs/source/result.md
new file mode 100644
index 0000000..d9489a5
--- /dev/null
+++ b/docs/source/result.md
@@ -0,0 +1,88 @@
+# Result analysis #
+
+## Working directory
+
+The job can accept a `directory` argument.
+When provided, all temporary files and output tables will be created and stored in the specified location.
+After the job completes, this directory can be inspected for further analysis.
+
+```python
+import piargus as pa
+
+job = pa.Job(directory="argus_workdir")
+```
+
+If no directory is provided, a temporary directory will be created automatically.
+This directory will be cleaned up once the job is finished.
+
+## Report
+
+When Tau-Argus is run, it returns a report that can be printed.
+Printing it will display all output written to the logbook.
+
+```python
+import piargus as pa
+
+tau = pa.TauArgus()
+report = tau.run(job)
+print(report)
+```
+
+## Table result
+
+The resulting tables can be obtained from the `Table` specification.
+
+```python
+import piargus as pa
+
+table_spec = pa.Table(...)
+
+job = pa.Job(inputdata, [table_spec])
+
+try:
+    tau.run(job)
+except pa.TauArgusException as err:
+    print("An error occurred:")
+    print(err.result)
+else:
+    print("Job completed successfully")
+    table_result = table_spec.load_result()
+```
+
+### TableResult methods
+
+The `TableResult` object provides three key methods:
+
+```python
+table_result.safe(unsafe_marker='X')
+table_result.status()
+table_result.unsafe()
+```
+
+Each of these methods returns a pandas [Series](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html).
+The index is a multi-index containing the explanatory variables.
+You can reshape the result into a preferred format using pandas methods like `stack`, `unstack`, and `swaplevel`.
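+
+For example, assuming the table was specified with explanatory variables `sbi` and `regio`
+(as in the tutorial), a sketch of such a reshape:
+
+```python
+# Pivot the 'regio' index level into columns for a wide overview:
+status_wide = table_result.status().unstack('regio')
+```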
+
+#### `unsafe()`
+
+This returns the aggregated data in unprotected form.
+
+#### `safe(unsafe_marker='X')`
+
+This returns the aggregated data in its protected form.
+Unsafe cells are marked by a special value, with `X` as the default marker.
+Marking cells with a string converts the resulting `pd.Series` to a string data type;
+pass `pd.NA` or a numeric dummy value as marker to keep the result in a numeric format.
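+
+For example, a sketch keeping the protected table numeric:
+
+```python
+import pandas as pd
+
+# pd.NA marks unsafe cells without forcing the series to a string dtype:
+safe_numeric = table_result.safe(unsafe_marker=pd.NA)
+```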
+
+#### `status()`
+
+This method returns the safety status for each observation as a `pd.Series`.
+
+The following status codes are used:
+
+| Code | Meaning          |
+|------|------------------|
+| S    | Safe             |
+| P    | Protected        |
+| U    | Primary unsafe   |
+| M    | Secondary unsafe |
+| Z    | Empty            |
diff --git a/docs/source/suppression.md b/docs/source/suppression.md
new file mode 100644
index 0000000..18c9c4e
--- /dev/null
+++ b/docs/source/suppression.md
@@ -0,0 +1,55 @@
+# Safety methods and suppression #
+
+Tau-Argus performs two kinds of suppression:
+
+1. Primary suppression suppresses cells that violate safety rules.
+2. Secondary suppression suppresses cells to protect other cells.
+
+## Safety rules ##
+
+Cells directly violating one of these rules are protected during primary suppression.
+
+| Rule                      | Meaning                                    |
+|---------------------------|--------------------------------------------|
+| `pa.percent_rule(p, n)`   | $p\%$-rule                                 |
+| `pa.dominance_rule(n, k)` | $(n, k)$-dominance rule                    |
+| `pa.frequency_rule(n)`    | Every cell needs at least $n$ contributors |
+
+For example, under the $(n, k)$-dominance rule a cell is unsafe when its $n$ largest contributors account for more than $k\%$ of the cell total.
+
+## Suppression methods ##
+
+Methods for secondary suppression aim to minimize the suppression cost while protecting the data.
+
+| Method       | Description                                       | Optimality | Speed  |
+|--------------|---------------------------------------------------|------------|--------|
+| `pa.OPTIMAL` | Minimizes suppression costs                       | High       | Slow   |
+| `pa.MODULAR` | Protects sub-tables first and combines the result | Medium     | Medium |
+| `pa.GHMITER` | Hypercube method                                  | Low        | Fast   |
+
+## Specifying rules ##
+
+Safety rules can be set for individual observations.
+If some of the observations belong to the same unit, a safety rule can also be set at holding level.
+In that case the microdata should have a `holding` column.
+If there is no holding information, safety rules can only be set at the individual level (per cell).
+Suppression methods are set per table.
+
+```python
+import piargus as pa
+
+table = pa.Table(response, explanatory, ...,
+                 safety_rule={"individual": pa.percent_rule(20),
+                              "holding": pa.percent_rule(30)},
+                 suppress_method=pa.MODULAR)
+```
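+
+Safety rules can also be written in Tau-Argus's string syntax, where parts are separated by `|`
+(see the `Table` docstring for the part syntax). A sketch combining a $p\%$-rule with an
+$(n, k)$-dominance rule:
+
+```python
+# "P(20)" is a 20%-rule; "NK(3, 70)" marks a cell unsafe when its
+# three largest contributors account for more than 70% of the total.
+table = pa.Table(response, explanatory, ...,
+                 safety_rule="P(20)|NK(3, 70)")
+```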
+
+If there are multiple linked tables, a suppression method can also be set on the job:
+
+```python
+job = pa.Job(tables, ...,
+             linked_suppress_method=pa.MODULAR)
+```
+
+## Disclaimer
+
+For a more formal and theoretical explanation of suppression in Tau-Argus, please consult the [Tau-Argus manual](https://research.cbs.nl/casc/Software/TauManualV4.1.pdf).
+This page is meant as a practical overview, but is not authoritative.
diff --git a/docs/source/tauargus.md b/docs/source/tauargus.md
new file mode 100644
index 0000000..9ef0a5c
--- /dev/null
+++ b/docs/source/tauargus.md
@@ -0,0 +1,54 @@
+# TauArgus
+
+The `TauArgus` class wraps the `tauargus.exe` program.
+You can either add the directory containing `tauargus.exe` to your `PATH` environment variable or pass the executable's location as follows:
+
+```python
+import piargus as pa
+
+tau = pa.TauArgus(r"C:\path\to\argus.exe")
+```
+
+To test the setup:
+
+```python
+print("Tau:", tau.version_info())
+```
+
+## Running jobs
+
+If you have created a job, it can be run as follows:
+
+```python
+job = pa.Job(...)
+tau.run(job)
+```
+
+Multiple jobs can be run at the same time by passing them as a list:
+
+```python
+tau.run([job1, job2, ...])
+```
+
+## Running batch files
+
+If you have created a batch file, it can be run as follows:
+
+```python
+tau.run("myjob.arb")
+```
+
+To simplify the creation of batch files, `BatchWriter` may be used.
+
+```python
+with open("myjob.arb", "w") as output_file:
+    batch_writer = pa.BatchWriter(output_file)
+
+    batch_writer.open_microdata("microdata.csv")
+    batch_writer.open_metadata("metadata.rda")
+    batch_writer.specify_table(["explanatory1", "explanatory2"], "response")
+    batch_writer.safety_rule(individual="NK(3, 70)")
+    batch_writer.read_microdata()
+    batch_writer.suppress("MOD")
+    batch_writer.write_table(1, 2, "AS+", "protected.csv")
+```
diff --git a/docs/source/tutorial.md b/docs/source/tutorial.md
index 5169eef..1833663 100644
--- a/docs/source/tutorial.md
+++ b/docs/source/tutorial.md
@@ -1,7 +1,7 @@
-# Tutorial
+# Getting started
 
-Make sure to install piargus and pandas (dependency).
-If you have both installed, make sure to import them:
+Ensure that piargus and TauArgus are installed.
+If both are installed, you can start by importing piargus along with pandas:
 
 ```python
 import pandas as pd
@@ -10,28 +10,28 @@
 import piargus as pa
 ```
 
 ## Loading data
 
-You should initially get your data in the form of a pandas dataframe.
-The easiest way to do this is by `pd.read_csv()`.
+There are two primary ways to use piargus:
+starting from **microdata** or **table data**.
+In both cases, your data must be in the form of a pandas `DataFrame`.
+If your data is stored in a CSV file, it can be loaded using `pd.read_csv()`.
 
 ```python
-input_df = pd.read_csv('dataset.csv')
-print(input_df)  # To see if it looks correct.
+input_df = pd.read_csv('data.csv')
 ```
 
-Consult the [pandas reference](https://pandas.pydata.org/docs/reference/io.html) for more options.
-It's possible to work on microdata or tabledata.
-Depending on your choice, continue in the appropriate section below.
+For more options to load data, consult the [pandas documentation](https://pandas.pydata.org/docs/reference/io.html).
 
-## Starting from microdata
+## Starting from MicroData
+
+First, convert your `input_df` into a `MicroData` object:
 
-First convert `input_df` to a microdata-object:
 ```python
 input_data = pa.MicroData(input_df)
 ```
 
-If one of more of the columns is hierarchical, it is important to specify this.
-In this example, it is assumed regio is hierarchical and its hierarchy is stored in a file "regio.hrc".
-Depending on your data, there can also be other considerations that you may need to apply.
+If any columns are hierarchical, specify them.
+For example, if `regio` is hierarchical and its hierarchy is stored in a file `regio.hrc`,
+you can load the hierarchy as follows:
 
 ```python
 regio_hierarchy = pa.TreeHierarchy.load_hrc("regio.hrc")
@@ -42,12 +42,10 @@
 input_data = pa.MicroData(
     input_df,
     hierarchies={"regio": regio_hierarchy}
 )
 ```
 
-Now, we want to set up a table to generate.
-We want the table to use `sbi` and `regio` as independent variables and `income` as response variable.
-As primary suppression we will use the [p%-rule](https://link.springer.com/chapter/10.1007/978-3-642-33627-0_1).
-For secondary suppression, we will use `optimal`.
-This algorithm is generally slow for bigger datasets, but minimizes the suppression cost when the data is small.
-By default the suppression cost is the sum of the values, but this can be configured by a parameter `cost`.
+### Setting up a Table
+
+Set up a table with `sbi` and `regio` as explanatory variables and `income` as the response variable.
+Use the [p%-rule](https://link.springer.com/chapter/10.1007/978-3-642-33627-0_1) as a safety rule and `OPTIMAL` as a method for secondary suppression:
 
 ```python
 output_table = pa.Table(explanatory=['sbi', 'regio'],
@@ -56,7 +54,10 @@
                         suppression_method=pa.OPTIMAL)
 ```
 
-Now we can set up a job and run the problem:
+### Running the Job
+
+To run the table generation job with `TauArgus`:
+
 ```python
 tau = pa.TauArgus(r'')
 job = pa.Job(input_data, [output_table], directory='tau', name="my-microdata")
@@ -69,8 +70,7 @@
 print(table_result)
 table_result.dataframe().to_csv('output/microdata_result.csv')
 ```
 
-And we're mostly done.
-Output will look something like this:
+The output will look like this:
 
 ```
@@ -112,7 +112,7 @@
 C      Total        x  M  121.84
        ExampleCity  x  U   73.40
 ```
 
-The status can be interpreted as follows:
+### Interpreting status codes
 
 | Status | Meaning                         |
 |--------|---------------------------------|
 | S      | Safe                            |
 | P      | Protected                       |
 | U      | Unsafe by primary suppression   |
 | M      | Unsafe by secondary suppression |
 | Z      | Empty cell                      |
 
-So nearly everything but the total has been suppressed.
+## Starting from TableData
 
-## Starting from tabular data
+To work with tabular data, convert `input_df` into a `TableData` object:
 
-First convert `input_df` to a tabledata-object:
 ```python
 input_data = pa.TableData(
     input_df,
@@ -137,8 +136,7 @@
 )
 ```
 
-The additional parameters to TableData are the same as those to Table when supplying microdata.
-More parameters can be set:
+You can also specify additional parameters to `TableData`:
 
 | Parameter          | Meaning                                         | Example                   |
 |--------------------|-------------------------------------------------|---------------------------|
 | `frequency`        | Column with number of contributors to response. | `"n_obs"`                 |
 | `top_contributors` | Columns with top contributors.                  | `["max", "max2", "max3"]` |
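+
+For instance, a sketch extending the earlier `TableData` call with these optional parameters
+(the column names are illustrative):
+
+```python
+input_data = pa.TableData(
+    input_df,
+    ...,  # explanatory/response arguments as before
+    frequency="n_obs",
+    top_contributors=["max", "max2", "max3"],
+)
+```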
""" if hierarchies is None: diff --git a/src/piargus/job.py b/src/piargus/job.py index 28c478b..7fcd3ac 100644 --- a/src/piargus/job.py +++ b/src/piargus/job.py @@ -47,6 +47,7 @@ def __init__( Options are: * `GHMITER` ("GH"): Hypercube * `MODULAR` ("MOD"): Modular + :param linked_suppress_method_args: Parameters to pass to suppress_method. :param directory: Where to write tau-argus files. :param name: Name from which to derive the name of some temporary files. diff --git a/src/piargus/outputspec/table.py b/src/piargus/outputspec/table.py index e32ec7b..981faae 100644 --- a/src/piargus/outputspec/table.py +++ b/src/piargus/outputspec/table.py @@ -13,7 +13,7 @@ class Table: """ A Table describes what the protected table should look like. - Usually there is are a few explanatory columns one one response. + Usually there are a few explanatory columns and one response. """ def __init__( self, @@ -45,14 +45,15 @@ def __init__( Can be supplied as: * str where parts are separated by | * A sequence of parts - * A dict with keys {"individual": x "holding": y} with separate rules on individual and - holding level. + * A dict with keys {"individual": x "holding": y} with separate rules on individual and holding level. + Each part can be: * "P(p, n=1)": p% rule * "NK(n, k)": (n, k)-dominance rule * "ZERO(safety_range)": Zero rule * "FREQ(minfreq, safety_range)": Frequency rule * "REQ(percentage_1, percentage_2, safety_margin)": Request rule + See the Tau-Argus manual for details on those rules. :param apriori: Apriori file to change parameters. :param suppress_method: Method to use for secondary suppression. @@ -64,6 +65,7 @@ def __init__( * `ROUNDING` ("RND"): Controlled rounding * `TABULAR_ADJUSTMENT` ("CTA"): Controlled Tabular Adjustment * None: No secondary suppression is applied + See the Tau-Argus manual for details on those rules. :param suppress_method_args: Parameters to pass to suppress_method. """ diff --git a/src/piargus/result/tableresult.py b/src/piargus/result/tableresult.py index 9af318e..a9b447f 100644 --- a/src/piargus/result/tableresult.py +++ b/src/piargus/result/tableresult.py @@ -29,12 +29,14 @@ def status(self, recode=True) -> pd.Series: """Return the status of each response. :param recode: If True, readable codes will be returned. - S - safe - P - protected - U - primary unsafe - M - secondary unsafe - Z - empty - Otherwise raw status codes from Tau-Argus are returned. See the documentation of Tau-Argus. + The following codes are used by default: + * `S`: safe + * `P`: protected + * `U`: primary unsafe + * `M`: secondary unsafe + * `Z`: empty + + Otherwise, raw status codes from Tau-Argus are returned. See the documentation of Tau-Argus. :returns: Status for each combination. """ status_num = self._df['Status']