
Commit

Merge branch 'release/0.4.5'
jcampbell committed Dec 19, 2018
2 parents bbc6960 + 9eff17c commit c2c4741
Showing 59 changed files with 4,249 additions and 3,029 deletions.
34 changes: 34 additions & 0 deletions docs/source/autoinspection.rst
@@ -0,0 +1,34 @@
.. _autoinspection:

================================================================================
Autoinspection
================================================================================

It can be very convenient to have Great Expectations automatically review a \
dataset and suggest expectations that may be appropriate. Currently, there is \
a very basic, but easily extensible, autoinspection capability available.

Dataset objects have an `autoinspect` method which allows you to provide a \
function that will evaluate a dataset object and add expectations to it. \
By default `autoinspect` will call the autoinspect function \
:func:`columns_exist <great_expectations.dataset.autoinspect.columns_exist>` \
which will add an `expect_column_to_exist` expectation for each column \
currently present on the dataset.

To implement an additional autoinspection function, simply write a function that \
takes a single parameter, a Dataset, evaluates that dataset, and adds expectations to it.


.. code-block:: python

    >> import great_expectations as ge
    >> df = ge.dataset.PandasDataset({"col": [1, 2, 3, 4, 5]})
    >> df.autoinspect(ge.dataset.autoinspect.columns_exist)
    >> df.get_expectations_config()
    {'dataset_name': None,
     'meta': {'great_expectations.__version__': '0.4.4__develop'},
     'expectations': [
        {'expectation_type': 'expect_column_to_exist',
         'kwargs': {'column': 'col'}
        }]
    }
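
As a minimal sketch of such a function (assuming a `PandasDataset`, so pandas' `columns` \
attribute is available; the function name is illustrative, not part of the library):

.. code-block:: python

    def expect_columns_not_null(dataset):
        # Hypothetical autoinspection function: add a not-null expectation
        # for every column currently present on the dataset.
        for column in dataset.columns:
            dataset.expect_column_values_to_not_be_null(column)

    df.autoinspect(expect_columns_not_null)

Because `autoinspect` simply hands the dataset to your function, any expectation method \
available on the dataset can be added this way.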
2 changes: 1 addition & 1 deletion docs/source/custom_expectations.rst
@@ -74,7 +74,7 @@ For SqlAlchemyDataset, the decorators work slightly differently. See the MetaSql
mode_query = sa.select([
sa.column(column).label('value'),
sa.func.count(sa.column(column)).label('frequency')
]).select_from(sa.table(self.table_name)).group_by(sa.column(column)).order_by(sa.desc(sa.column('frequency')))
]).select_from(self._table).group_by(sa.column(column)).order_by(sa.desc(sa.column('frequency')))
mode = self.engine.execute(mode_query).scalar()
return {
9 changes: 9 additions & 0 deletions docs/source/dataset_module.rst
@@ -59,3 +59,12 @@ great_expectations.dataset.util
:members:
:undoc-members:
:show-inheritance:


great_expectations.dataset.autoinspect
----------------------------------------

.. automodule:: great_expectations.dataset.autoinspect
:members:
:undoc-members:
:show-inheritance:
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -28,6 +28,7 @@ Advanced

standard_arguments
result_format
autoinspection
evaluation_parameters
custom_expectations
conventions
6 changes: 3 additions & 3 deletions docs/source/result_format.rst
@@ -29,9 +29,9 @@ including interactive exploratory work and automatic validation.
+---------------------------------------+----------------+----------------+----------------+----------------+
| missing_percent |no |yes |yes |yes |
+---------------------------------------+----------------+----------------+----------------+----------------+
| detail (dictionary) |Defined on a per-expectation basis |
| details (dictionary) |Defined on a per-expectation basis |
+---------------------------------------+----------------+----------------+----------------+----------------+
| Fields defined only for `column_map_expectation` type expectations: |
| Fields defined for `column_map_expectation` type expectations: |
+---------------------------------------+----------------+----------------+----------------+----------------+
| unexpected_count |no |yes |yes |yes |
+---------------------------------------+----------------+----------------+----------------+----------------+
@@ -49,7 +49,7 @@ including interactive exploratory work and automatic validation.
+---------------------------------------+----------------+----------------+----------------+----------------+
| unexpected_list |no |no |no |yes |
+---------------------------------------+----------------+----------------+----------------+----------------+
| Fields defined only for `column_aggregate_expectation` type expectations: |
| Fields defined for `column_aggregate_expectation` type expectations: |
+---------------------------------------+----------------+----------------+----------------+----------------+
| observed_value |no |yes |yes |yes |
+---------------------------------------+----------------+----------------+----------------+----------------+
22 changes: 22 additions & 0 deletions docs/source/roadmap_changelog.rst
@@ -12,6 +12,28 @@ Planned Features
* Real-time/streaming and adaption of distributional expectations


v.0.4.5
----------------
* Add a new autoinspect API and remove default expectations.
* Improve details for expect_table_columns_to_match_ordered_list (#379, thanks @rlshuhart)
* Linting fixes (thanks @elsander)
* Add support for dataset_class in from_pandas (thanks @jtilly)
* Improve redshift compatibility by correcting faulty isnull operator (thanks @avanderm)
* Adjust partitions to use tail_weight to improve JSON compatibility and
support special cases of KL Divergence (thanks @anhollis)
* Enable custom_sql datasets for databases with multiple schemas, by
adding a fallback for column reflection (#387, thanks @elsander)
* Remove `IF NOT EXISTS` check for custom sql temporary tables, for
Redshift compatibility (#372, thanks @elsander)
* Allow users to pass args/kwargs for engine creation in
SqlAlchemyDataContext (#369, thanks @elsander)
* Add support for custom schema in SqlAlchemyDataset (#370, thanks @elsander)
* Use getfullargspec to avoid deprecation warnings.
* Add expect_column_values_to_be_unique to SqlAlchemyDataset
* Fix map expectations for categorical columns (thanks @eugmandel)
* Improve internal testing suite (thanks @anhollis and @ccnobbli)
* Consistently use value_set instead of mixing value_set and values_set (thanks @njsmith8)

v.0.4.4
----------------
* Improve CLI help and set CLI return value to the number of unmet expectations
113 changes: 56 additions & 57 deletions docs/source/validation.rst
@@ -20,58 +20,58 @@ Once you've constructed and stored Expectations, you can use them to validate ne
{
"results" : [
{
"expectation_type": "expect_column_to_exist",
"success": True,
"expectation_type": "expect_column_to_exist",
"success": True,
"kwargs": {
"column": "Unnamed: 0"
}
},
},
...
{
"unexpected_list": 30.397989417989415,
"expectation_type": "expect_column_mean_to_be_between",
"success": True,
"expectation_type": "expect_column_mean_to_be_between",
"success": True,
"kwargs": {
"column": "Age",
"max_value": 40,
"column": "Age",
"max_value": 40,
"min_value": 20
}
},
},
{
"unexpected_list": [],
"expectation_type": "expect_column_values_to_be_between",
"success": True,
"expectation_type": "expect_column_values_to_be_between",
"success": True,
"kwargs": {
"column": "Age",
"max_value": 80,
"column": "Age",
"max_value": 80,
"min_value": 0
}
},
},
{
"unexpected_list": [
"Downton (?Douton), Mr William James",
"Jacobsohn Mr Samuel",
"Downton (?Douton), Mr William James",
"Jacobsohn Mr Samuel",
"Seman Master Betros"
],
"expectation_type": "expect_column_values_to_match_regex",
"success": True,
],
"expectation_type": "expect_column_values_to_match_regex",
"success": True,
"kwargs": {
"regex": "[A-Z][a-z]+(?: \\([A-Z][a-z]+\\))?, ",
"column": "Name",
"regex": "[A-Z][a-z]+(?: \\([A-Z][a-z]+\\))?, ",
"column": "Name",
"mostly": 0.95
}
},
},
{
"unexpected_list": [
"*"
],
"expectation_type": "expect_column_values_to_be_in_set",
"success": False,
],
"expectation_type": "expect_column_values_to_be_in_set",
"success": False,
"kwargs": {
"column": "PClass",
"values_set": [
"1st",
"2nd",
"column": "PClass",
"value_set": [
"1st",
"2nd",
"3rd"
]
}
@@ -106,58 +106,58 @@ This is especially powerful when combined with great_expectations's command line
{
"results" : [
{
"expectation_type": "expect_column_to_exist",
"success": True,
"expectation_type": "expect_column_to_exist",
"success": True,
"kwargs": {
"column": "Unnamed: 0"
}
},
},
...
{
"unexpected_list": 30.397989417989415,
"expectation_type": "expect_column_mean_to_be_between",
"success": True,
"expectation_type": "expect_column_mean_to_be_between",
"success": True,
"kwargs": {
"column": "Age",
"max_value": 40,
"column": "Age",
"max_value": 40,
"min_value": 20
}
},
},
{
"unexpected_list": [],
"expectation_type": "expect_column_values_to_be_between",
"success": True,
"expectation_type": "expect_column_values_to_be_between",
"success": True,
"kwargs": {
"column": "Age",
"max_value": 80,
"column": "Age",
"max_value": 80,
"min_value": 0
}
},
},
{
"unexpected_list": [
"Downton (?Douton), Mr William James",
"Jacobsohn Mr Samuel",
"Downton (?Douton), Mr William James",
"Jacobsohn Mr Samuel",
"Seman Master Betros"
],
"expectation_type": "expect_column_values_to_match_regex",
"success": True,
],
"expectation_type": "expect_column_values_to_match_regex",
"success": True,
"kwargs": {
"regex": "[A-Z][a-z]+(?: \\([A-Z][a-z]+\\))?, ",
"column": "Name",
"regex": "[A-Z][a-z]+(?: \\([A-Z][a-z]+\\))?, ",
"column": "Name",
"mostly": 0.95
}
},
},
{
"unexpected_list": [
"*"
],
"expectation_type": "expect_column_values_to_be_in_set",
"success": False,
],
"expectation_type": "expect_column_values_to_be_in_set",
"success": False,
"kwargs": {
"column": "PClass",
"values_set": [
"1st",
"2nd",
"column": "PClass",
"value_set": [
"1st",
"2nd",
"3rd"
]
}
@@ -184,4 +184,3 @@ Useful deployment patterns include:
* Validate as part of an Airflow task: if Expectations are violated, raise an error and stop DAG propagation until the problem is resolved. Alternatively, you can implement expectations that raise warnings without halting the DAG.

For certain deployment patterns, it may be useful to parameterize expectations, and supply evaluation parameters at validation time. See :ref:`evaluation_parameters` for more information.
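
As a hedged sketch of that flow (file names and the parameter key are illustrative; \
`read_csv` and `validate(evaluation_parameters=...)` mirror the CLI implementation in this release):

.. code-block:: python

    import json
    import great_expectations as ge

    # Bind a stored expectations config to a new batch of data.
    with open("my_expectations.json") as f:
        expectations_config = json.load(f)
    df = ge.read_csv("new_data.csv", expectations_config=expectations_config)

    # Supply parameter values at validation time; expectations that reference
    # these parameters are resolved against the supplied values.
    result = df.validate(evaluation_parameters={"max_expected_age": 80})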

1 change: 1 addition & 0 deletions great_expectations/__init__.py
@@ -1,5 +1,6 @@
from .util import *
from great_expectations import dataset
from great_expectations.data_context import get_data_context
from .file_expectations import *

from .version import __version__
28 changes: 18 additions & 10 deletions great_expectations/cli.py
@@ -9,12 +9,14 @@


def dispatch(args):
parser = argparse.ArgumentParser(description='great_expectations command-line interface')
parser = argparse.ArgumentParser(
description='great_expectations command-line interface')

subparsers = parser.add_subparsers(dest='command')
subparsers.required = True

validate_parser = subparsers.add_parser('validate', description='Validate expectations for your dataset.')
validate_parser = subparsers.add_parser(
'validate', description='Validate expectations for your dataset.')
validate_parser.set_defaults(func=validate)

validate_parser.add_argument('dataset',
@@ -32,11 +32,12 @@ def dispatch(args):
help='Specify whether to only return expectations that are not met during evaluation (defaults to False).')
# validate_parser.add_argument('--no_catch_exceptions', '-e', default=True, action='store_false')
# validate_parser.add_argument('--only_return_failures', '-f', default=False, action='store_true')
custom_dataset_group = validate_parser.add_argument_group('custom_dataset', description='Arguments defining a custom dataset to use for validation.')
custom_dataset_group = validate_parser.add_argument_group(
'custom_dataset', description='Arguments defining a custom dataset to use for validation.')
custom_dataset_group.add_argument('--custom_dataset_module', '-m', default=None,
help='Path to a python module containing a custom dataset class.')
help='Path to a python module containing a custom dataset class.')
custom_dataset_group.add_argument('--custom_dataset_class', '-c', default=None,
help='Name of the custom dataset class to use during evaluation.')
help='Name of the custom dataset class to use during evaluation.')

version_parser = subparsers.add_parser('version')
version_parser.set_defaults(func=version)
@@ -61,20 +64,25 @@ def validate(parsed_args):
expectations_config = json.load(open(expectations_config_file))

if parsed_args["evaluation_parameters"] is not None:
evaluation_parameters = json.load(open(parsed_args["evaluation_parameters"]))
evaluation_parameters = json.load(
open(parsed_args["evaluation_parameters"]))
else:
evaluation_parameters = None

if parsed_args["custom_dataset_module"]:
sys.path.insert(0, os.path.dirname(parsed_args["custom_dataset_module"]))
module_name = os.path.basename(parsed_args["custom_dataset_module"]).split('.')[0]
sys.path.insert(0, os.path.dirname(
parsed_args["custom_dataset_module"]))
module_name = os.path.basename(
parsed_args["custom_dataset_module"]).split('.')[0]
custom_module = __import__(module_name)
dataset_class = getattr(custom_module, parsed_args["custom_dataset_class"])
dataset_class = getattr(
custom_module, parsed_args["custom_dataset_class"])

else:
dataset_class = PandasDataset

df = read_csv(data_set, expectations_config=expectations_config, dataset_class=dataset_class)
df = read_csv(data_set, expectations_config=expectations_config,
dataset_class=dataset_class)

result = df.validate(
evaluation_parameters=evaluation_parameters,
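
For reference, a hedged sketch of the flow this command implements, called directly from Python (the file, module, and class names are illustrative; `dataset_class` defaults to `PandasDataset` as above):

.. code-block:: python

    import json
    from great_expectations import read_csv
    from my_custom_module import MyCustomDataset  # a custom Dataset subclass

    with open("my_expectations.json") as f:
        expectations_config = json.load(f)

    # Mirrors the CLI: read the data with the stored config and a custom
    # dataset class, then validate.
    df = read_csv("data.csv",
                  expectations_config=expectations_config,
                  dataset_class=MyCustomDataset)
    result = df.validate()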
9 changes: 5 additions & 4 deletions great_expectations/data_context/__init__.py
@@ -1,7 +1,8 @@
from .pandas_context import PandasCSVDataContext
from .sqlalchemy_context import SqlAlchemyDataContext

def get_data_context(context_type, options):

def get_data_context(context_type, options, *args, **kwargs):
"""Return a data_context object which exposes options to list datasets and get a dataset from
that context. This is a new API in Great Expectations 0.4, and is subject to rapid change.
@@ -10,8 +11,8 @@ def get_data_context(context_type, options):
:return: a new DataContext object
"""
if context_type == "SqlAlchemy":
return SqlAlchemyDataContext(options)
return SqlAlchemyDataContext(options, *args, **kwargs)
elif context_type == "PandasCSV":
return PandasCSVDataContext(options)
return PandasCSVDataContext(options, *args, **kwargs)
else:
raise ValueError("Unknown data context.")
raise ValueError("Unknown data context.")
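
A hedged usage sketch of this API (the connection string is illustrative, the `echo` keyword simply illustrates the new pass-through of engine-creation kwargs, and the dataset-listing calls are assumed from the docstring rather than confirmed signatures):

.. code-block:: python

    import great_expectations as ge

    # Extra args/kwargs are now forwarded to SqlAlchemyDataContext for engine creation.
    context = ge.get_data_context("SqlAlchemy",
                                  "postgresql://user@localhost/mydb",
                                  echo=True)

    for dataset_name in context.list_datasets():      # assumed method name
        dataset = context.get_dataset(dataset_name)   # assumed method name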