diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 000000000000..b1f549104b67 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,4 @@ +[settings] +multi_line_output=3 +force_grid_wrap=0 +include_trailing_comma=1 diff --git a/.travis.yml b/.travis.yml index 5ddffc67a68f..1506850860d5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,21 +3,20 @@ branches: - master - develop - /^\d.*/ + - aylr.* # dist: xenial language: python +cache: pip os: - - linux +- linux +env: + global: + secure: kEzD0ak7sXZspOv4014PRC5+DWQirM1NsQyTfwuZt7u0flTCtYg80Sw4lOuYZmtdztXrY/XresnK6HpCUkFtwwVuODDnrOnj1nO4l4Sb9awoxZn6zo3Q5TyC/sflDrakPsxwKEvxIH1cvzWqRwzmGXc5LEqn5SJyHR8RBhx6zUSgt8KIXLkfLTu0naj4dJH0Hmu4pqjRTmekwfoFHnYJ7aWopZChvxc6Mo2FmFFhl7Zt70EKh6RRB+aUG5xwvwf5iE+YX9YUQfqnA+2ys3O3m4wWGTxwJcdt8ZoexlYjAnSrOePdVO7EvosaBQYEyyU3xS28DzKPQXGoyJLCLNuAmzpxq4yRzPCJJF4Swl0yqY4+ab2OJrNP4A7MuMrQUDdNebUA4BY6PoTM82zfVkDCxf/fDqYPgMOAlNJb+R3jUffRJVlVq/M4sCr3sbGrnzELeTAf63jNIyH0U5LNQzzQUAT/Kyiimey17YqC9RhOUTNrojTkyXJ+ckc7z24yEfT1hUo6DAMM2b0ko/iB1MT3+ujT6qxamHe5imeX+4W1PpZsqkaRgo/dvhIsYoVPFeoBkY/N7PLgIwXhBuuFCH61p2Kx+z8hs34lVZ5GYVHDjeoDEEfm3SQrOuM2uASL9rGWj5yXiUs0OtTjmmKgEBZtNQ9JWHPY0AAdnfgS/GhWm54= matrix: include: - dist: trusty python: 2.7 env: PANDAS=0.22.0 -# - dist: trusty -# python: 2.7 -# env: PANDAS=0.23.4 -# - dist: trusty -# python: 2.7 -# env: PANDAS=0.24.2 - dist: trusty python: 2.7 env: PANDAS=latest @@ -40,20 +39,23 @@ matrix: # python: 3.7 # env: PANDAS=latest addons: - postgresql: "9.4" + postgresql: '9.4' services: - - postgresql - - mysql + - postgresql + - mysql install: # - ./travis-java.sh - sudo apt-get install -y pandoc - pip install --only-binary=numpy,scipy numpy scipy - if [ "$PANDAS" = "latest" ]; then pip install pandas; else pip install pandas==$PANDAS; fi - pip install -r requirements-dev.txt + - pip install pytest-slack # Send a webhook when on Travis before_script: - psql -c 'create database test_ci;' -U postgres - mysql -u root --password="" -e 'create database test_ci;' script: - - pytest --cov=great_expectations tests/ + - pytest --cov=great_expectations --slack_hook=$SLACK_WEBHOOK --slack_report_link=$TRAVIS_BUILD_WEB_URL --slack_channel=notifications-great_expectations tests/ after_success: - coveralls + - bash <(curl -s https://codecov.io/bash) + diff --git a/docs/changelog/changelog.rst b/docs/changelog/changelog.rst index 73566b498afe..388ede4287fe 100644 --- a/docs/changelog/changelog.rst +++ b/docs/changelog/changelog.rst @@ -1,5 +1,73 @@ .. _changelog: +0.9.0 +----------------- + +Version 0.9.0 is a major update to Great Expectations! The DataContext has continued to evolve into a powerful tool +for ensuring that Expectation Suites can properly represent the way users think about their data, and upgrading will +make it much easier to store and share expectation suites, and to build data docs that support your whole team. +You’ll get awesome new features, including improvements to the look of data docs and the ability to choose and store metrics +for building flexible data quality dashboards. + +The changes for version 0.9.0 fall into several broad areas: + +1. Onboarding + +Release 0.9.0 of Great Expectations makes it much easier to get started with the project. The `init` flow has grown +to support a much wider array of use cases and to use more natural language rather than introducing +Great Expectations concepts too early.
You can more easily configure different backends and datasources, take advantage +of guided walkthroughs to find and profile data, and share project configurations with colleagues. + +If you have already completed the `init` flow using a previous version of Great Expectations, you do not need to +rerun the command. However, **there are some small changes to your configuration that will be required**. See +:ref:`migrating_versions` for details. + +2. CLI Command Improvements + +With this release we have introduced a consistent naming pattern for accessing subcommands based on the noun (a +Great Expectations object like `suite` or `docs`) and verb (an action like `edit` or `new`). The new user experience +will allow us to more naturally organize access to CLI tools as new functionality is added. + +3. Expectation Suite Naming and Namespace Changes + +Defining shared expectation suites and validating data from different sources is much easier in this release. The +DataContext, which manages storage and configuration of expectations, validations, profiling, and data docs, no +longer requires that expectation suites live in a datasource-specific “namespace.” Instead, you should name suites +with the logical name corresponding to your data, making it easy to share them or validate against different data +sources. For example, the expectation suite "npi" for National Provider Identifier data can now be shared across +teams who access the same logical data in local systems using Pandas, on a distributed Spark cluster, or via a +relational database. + +Batch Kwargs, or instructions for a datasource to build a batch of data, are similarly freed from a required +namespace, and you can more easily integrate Great Expectations into workflows where you do not need to use a +BatchKwargsGenerator (usually because you have a batch of data ready to validate, such as in a table or a known +directory). + +The most noticeable impact of this API change is in the complete removal of the DataAssetIdentifier class. For +example, the `create_expectation_suite` and `get_batch` methods now no longer require a data_asset_name parameter, +relying only on the expectation_suite_name and batch_kwargs to do their job. Similarly, there is no more asset name +normalization required. See the upgrade guide for more information. + +4. Metrics and Evaluation Parameter Stores + +Metrics have received much more love in this release of Great Expectations! We've improved the system for declaring +evaluation parameters that support dependencies between different expectation suites, so you can easily identify a +particular field in the result of one expectation to use as the input into another. And the MetricsStore is now much +more flexible, supporting a new ValidationAction that makes it possible to select metrics from a validation result +to be saved in a database where they can power a dashboard. + +5. Internal Type Changes and Improvements + +Finally, in this release, we have done a lot of work under the hood to make things more robust, including updating +all of the internal objects to be more strongly typed. That change, while largely invisible to end users, paves the +way for some really exciting opportunities for extending Great Expectations as we build a bigger community around +the project. + + +We are really excited about this release, and encourage you to upgrade right away to take advantage of the more +flexible naming and simpler API for creating, accessing, and sharing your expectations. 
As always feel free to join +us on Slack for questions you don't see addressed! + 0.8.9__develop ----------------- @@ -53,7 +121,7 @@ * Add support to S3 generator for retrieving directories by specifying the `directory_assets` configuration * Fix warning regarding implicit class_name during init flow * Expose build_generator API publicly on datasources -* Allow configuration of known extensions and return more informative message when SubdirReaderGenerator cannot find +* Allow configuration of known extensions and return more informative message when SubdirReaderBatchKwargsGenerator cannot find relevant files. * Add support for allow_relative_error on internal dataset quantile functions, and add support for build_continuous_partition_object in Redshift @@ -140,8 +208,8 @@ Highlights include: 3. Partitioners: Batch Kwargs are clarified and enhanced to help easily reference well-known chunks of data using a partition_id. Batch ID and Batch Fingerprint help round out support for enhanced metadata around data - assets that GE validates. See :ref:`batch_identifiers` for more information. The `GlobReaderGenerator`, - `QueryGenerator`, `S3Generator`, `SubdirReaderGenerator`, and `TableGenerator` all support partition_id for + assets that GE validates. See :ref:`batch_identifiers` for more information. The `GlobReaderBatchKwargsGenerator`, + `QueryBatchKwargsGenerator`, `S3GlobReaderBatchKwargsGenerator`, `SubdirReaderBatchKwargsGenerator`, and `TableBatchKwargsGenerator` all support partition_id for easily accessing data assets. 4. Other Improvements: @@ -166,7 +234,7 @@ v0.7.10 ----------------- * Fix an issue in generated documentation where the Home button failed to return to the index * Add S3 Generator to module docs and improve module docs formatting -* Add support for views to QueryGenerator +* Add support for views to QueryBatchKwargsGenerator * Add success/failure icons to index page * Return to uniform histogram creation during profiling to avoid large partitions for internal performance reasons @@ -200,7 +268,7 @@ v0.7.8 - PY2 failure on encountering unicode (#676) -v.0.7.7 +0.7.7 ----------------- * Standardize the way that plugin module loading works. DataContext will begin to use the new-style class and plugin identification moving forward; yml configs should specify class_name and module_name (with module_name optional for @@ -226,7 +294,7 @@ v.0.7.7 - Add run_id to path for validation files -v.0.7.6 +0.7.6 ----------------- * New Validation Renderer! Supports turning validation results into HTML and displays differences between the expected and the observed attributes of a dataset. @@ -239,11 +307,11 @@ v.0.7.6 * Bug fixes: improved internal logic of rendering data documentation, slack notification, and CLI profile command when datasource argument was not provided. -v.0.7.5 +0.7.5 ----------------- * Fix missing requirement for pypandoc brought in from markdown support for notes rendering. -v.0.7.4 +0.7.4 ----------------- * Fix numerous rendering bugs and formatting issues for rendering documentation. * Add support for pandas extension dtypes in pandas backend of expect_column_values_to_be_of_type and @@ -254,7 +322,7 @@ v.0.7.4 * Add support for rendering expectation_suite and expectation_level notes from meta in docs. * Fix minor formatting issue in readthedocs documentation. 
-v.0.7.3 +0.7.3 ----------------- * BREAKING: Harmonize expect_column_values_to_be_of_type and expect_column_values_to_be_in_type_list semantics in Pandas with other backends, including support for None type and type_list parameters to support profiling. @@ -270,7 +338,7 @@ v.0.7.3 * Allow user to specify data_assets to profile via CLI * Support CLI rendering of expectation_suite and EVR-based documentation -v.0.7.2 +0.7.2 ----------------- * Improved error detection and handling in CLI "add datasource" feature * Fixes in rendering of profiling results (descriptive renderer of validation results) @@ -278,7 +346,7 @@ v.0.7.2 * Added convenience methods to display HTML renderers of sections in Jupyter notebooks * Implemented prescriptive rendering of expectations for most expectation types -v.0.7.1 +0.7.1 ------------ * Added documentation/tutorials/videos for onboarding and new profiling and documentation features @@ -297,7 +365,7 @@ v.0.7.1 * Other internal improvements and bug fixes -v.0.7.0 +0.7.0 ------------ Version 0.7 of Great Expectations is HUGE. It introduces several major new features @@ -372,13 +440,13 @@ to top-level names. * Documentation reorganization and improvements * Introduce custom exceptions for more detailed error logs -v.0.6.1 +0.6.1 ------------ * Re-add testing (and support) for py2 * NOTE: Support for SqlAlchemyDataset and SparkDFDataset is enabled via optional install \ (e.g. ``pip install great_expectations[sqlalchemy]`` or ``pip install great_expectations[spark]``) -v.0.6.0 +0.6.0 ------------ * Add support for SparkDFDataset and caching (HUGE work from @cselig) * Migrate distributional expectations to new testing framework @@ -389,13 +457,13 @@ v.0.6.0 We anticipate this will become the future default behavior. * BREAKING CHANGE: Drop official support pandas < 0.22 -v.0.5.1 +0.5.1 --------------- * **Fix** issue where no result_format available for expect_column_values_to_be_null caused error * Use vectorized computation in pandas (#443, #445; thanks @RoyalTS) -v.0.5.0 +0.5.0 ---------------- * Restructured class hierarchy to have a more generic DataAsset parent that maintains expectation logic separate \ from the tabular organization of Dataset expectations @@ -412,7 +480,7 @@ v.0.5.0 * Minor documentation, warning, and testing improvements (thanks @zdog). -v.0.4.5 +0.4.5 ---------------- * Add a new autoinspect API and remove default expectations. 
* Improve details for expect_table_columns_to_match_ordered_list (#379, thanks @rlshuhart) @@ -434,14 +502,14 @@ v.0.4.5 * Improve internal testing suite (thanks @anhollis and @ccnobbli) * Consistently use value_set instead of mixing value_set and values_set (thanks @njsmith8) -v.0.4.4 +0.4.4 ---------------- * Improve CLI help and set CLI return value to the number of unmet expectations * Add error handling for empty columns to SqlAlchemyDataset, and associated tests * **Fix** broken support for older pandas versions (#346) * **Fix** pandas deepcopy issue (#342) -v.0.4.3 +0.4.3 ------- * Improve type lists in expect_column_type_to_be[_in_list] (thanks @smontanaro and @ccnobbli) * Update cli to use entry_points for conda compatibility, and add version option to cli @@ -454,7 +522,7 @@ v.0.4.3 * Implement expect_column_value_lenghts_to_[be_between|equal] for SQAlchemy (thanks @ccnobbli) * **Fix** PandasDataset subclasses to inherit child class -v.0.4.2 +0.4.2 ------- * **Fix** bugs in expect_column_values_to_[not]_be_null: computing unexpected value percentages and handling all-null (thanks @ccnobbli) * Support mysql use of Decimal type (thanks @bouke-nederstigt) @@ -465,11 +533,11 @@ v.0.4.2 * **Fix** documentation errors and other small errors (thanks @roblim, @ccnobbli) -v.0.4.1 +0.4.1 ------- * Correct inclusion of new data_context module in source distribution -v.0.4.0 +0.4.0 ------- * Initial implementation of data context API and SqlAlchemyDataset including implementations of the following \ expectations: @@ -505,19 +573,21 @@ v.0.4.0 * Behind-the-scenes improvements to testing framework to ensure parity across data contexts. * Documentation improvements, bug-fixes, and internal api improvements -v.0.3.2 +0.3.2 ------- * Include requirements file in source dist to support conda -v.0.3.1 +0.3.1 -------- * **Fix** infinite recursion error when building custom expectations * Catch dateutil parsing overflow errors -v.0.2 +0.2 ----- * Distributional expectations and associated helpers are improved and renamed to be more clear regarding the tests they apply * Expectation decorators have been refactored significantly to streamline implementing expectations and support custom expectations * API and examples for custom expectations are available * New output formats are available for all expectations * Significant improvements to test suite and compatibility + +*Last updated*: |lastupdate| diff --git a/docs/community.rst b/docs/community.rst index bb72eaadc464..0046ac6a6707 100644 --- a/docs/community.rst +++ b/docs/community.rst @@ -35,3 +35,5 @@ Contribute code or documentation We welcome contributions to Great Expectations. Please start with our :ref:`contributing` guide and don't be shy with questions! + +*last updated*: |lastupdate| diff --git a/docs/conf.py b/docs/conf.py index adeb1a1a3bca..9998d001f312 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -39,7 +39,9 @@ 'sphinx.ext.coverage', # 'sphinx.ext.mathjax' 'sphinx.ext.napoleon', - 'sphinxcontrib.contentui' + 'sphinxcontrib.contentui', + 'sphinxcontrib.lastupdate', + 'sphinx.ext.autosectionlabel' ] # Add any paths that contain templates here, relative to this directory. 
diff --git a/docs/conftest.py b/docs/conftest.py new file mode 100644 index 000000000000..9201958c56a4 --- /dev/null +++ b/docs/conftest.py @@ -0,0 +1,50 @@ +# import pytest +# +# import numpy +# import great_expectations +# +# +# @pytest.fixture +# def pandas_npi_dataset(): +# npi = great_expectations.dataset.PandasDataset({"provider_id": [1,2,3]}) +# return npi +# +# +# @pytest.fixture(autouse=True) +# def add_standard_imports(doctest_namespace, pandas_npi_dataset): +# doctest_namespace["np"] = numpy +# doctest_namespace["ge"] = great_expectations +# doctest_namespace["npi"] = pandas_npi_dataset +# doctest_namespace["ExpectationValidationResult"] = great_expectations.core.ExpectationValidationResult +# +# +# from os import chdir, getcwd +# from shutil import rmtree +# from tempfile import mkdtemp +# import pytest +# from sybil import Sybil +# from sybil.parsers.codeblock import CodeBlockParser +# from sybil.parsers.doctest import DocTestParser +# from sybil.parsers.skip import skip +# +# @pytest.fixture(scope="module") +# def tempdir(): +# # there are better ways to do temp directories, but it's a simple example: +# path = mkdtemp() +# cwd = getcwd() +# try: +# chdir(path) +# yield path +# finally: +# chdir(cwd) +# rmtree(path) +# +# pytest_collect_file = Sybil( +# parsers=[ +# DocTestParser(), +# CodeBlockParser(), +# skip +# ], +# pattern='*.rst', +# fixtures=['tempdir'] +# ).pytest() diff --git a/docs/example_code/custom_renderer.py b/docs/example_code/custom_renderer.py index d722b75f248f..a0472e5adcac 100644 --- a/docs/example_code/custom_renderer.py +++ b/docs/example_code/custom_renderer.py @@ -1,22 +1,26 @@ import altair as alt import pandas as pd +from great_expectations.render.renderer.renderer import Renderer from great_expectations.render.types import ( RenderedDocumentContent, RenderedSectionContent, RenderedComponentContent, -) -from great_expectations.render.renderer import Renderer + RenderedHeaderContent, RenderedBulletListContent, RenderedTableContent, RenderedStringTemplateContent, + RenderedGraphContent, ValueListContent) class CustomPageRenderer(Renderer): @classmethod def _get_header_content_block(cls, header="", subheader="", highlight=True): - return RenderedComponentContent(**{ + return RenderedHeaderContent(**{ "content_block_type": "header", - "header": { - "template": header - }, + "header": RenderedStringTemplateContent(**{ + "content_block_type": "string_template", + "string_template": { + "template": header, + } + }), "subheader": subheader, "styling": { "classes": ["col-12"], @@ -28,7 +32,7 @@ def _get_header_content_block(cls, header="", subheader="", highlight=True): @classmethod def _get_bullet_list_content_block(cls, header="", subheader="", col=12): - return RenderedComponentContent(**{ + return RenderedBulletListContent(**{ "content_block_type": "bullet_list", "header": header, "subheader": subheader, @@ -48,7 +52,7 @@ def _get_bullet_list_content_block(cls, header="", subheader="", col=12): @classmethod def _get_table_content_block(cls, header="", subheader="", col=12): - return RenderedComponentContent(**{ + return RenderedTableContent(**{ "content_block_type": "table", "header": header, "subheader": subheader, @@ -84,7 +88,7 @@ def _get_graph_content_block(cls, header="", subheader="", col=12): ).properties(height=200, width=200, autosize="fit") chart = bars.to_json() - return RenderedComponentContent(**{ + return RenderedGraphContent(**{ "content_block_type": "graph", "header": header, "subheader": subheader, @@ -99,7 +103,7 @@ def 
_get_graph_content_block(cls, header="", subheader="", col=12): @classmethod def _get_tooltip_string_template_content_block(cls): - return RenderedComponentContent(**{ + return RenderedStringTemplateContent(**{ "content_block_type": "string_template", "string_template": { "template": "This is a string template with tooltip, using a top-level custom tag.", @@ -118,7 +122,7 @@ def _get_tooltip_string_template_content_block(cls): @classmethod def _get_string_template_content_block(cls): - return RenderedComponentContent(**{ + return RenderedStringTemplateContent(**{ "content_block_type": "string_template", "string_template": { "template": "$icon This is a Font Awesome Icon, using a param-level custom tag\n$red_text\n$bold_serif", @@ -156,7 +160,7 @@ def _get_string_template_content_block(cls): @classmethod def _get_value_list_content_block(cls, header="", subheader="", col=12): - return RenderedComponentContent(**{ + return ValueListContent(**{ "content_block_type": "value_list", "header": header, "subheader": subheader, diff --git a/docs/features.rst b/docs/features.rst index f00693f8f2b8..4678154099ad 100644 --- a/docs/features.rst +++ b/docs/features.rst @@ -18,6 +18,9 @@ and our `blog `__ for more information on how /features/custom_expectations /features/data_context /features/validation_operators_and_actions + /features/profilers /features/datasource - /features/batch_generator + /features/batch_kwargs_generator /features/ge_on_teams + +*last updated*: |lastupdate| diff --git a/docs/features/batch_generator.rst b/docs/features/batch_generator.rst deleted file mode 100644 index 7c23147c0f09..000000000000 --- a/docs/features/batch_generator.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. _batch_generator: - -################## -Batch Generators -################## - -A generator builds instructions for GE datasources by inspecting data and helping to identify useful elements such as -batches. Batch generators produce identifying information, called "batch_kwargs" that datasources can use to get -individual batches of data. They add flexibility in how to obtain data such as with time-based partitioning, -downsampling, or other techniques appropriate for the datasource. - -For example, a generator could produce a SQL query that logically represents "rows in -the Events table with a timestamp on February 7, 2012," which a SQLAlchemyDatasource -could use to materialize a SQLAlchemyDataset corresponding to that batch of data and -ready for validation. - -******** -Batch -******** - -A batch is a sample from a data asset, sliced according to a particular rule. -For example, an hourly slide of the Events table or “most recent `users` records.” - -A Batch is the primary unit of validation in the Great Expectations DataContext. -Batches include metadata that identifies how they were constructed--the same “batch_kwargs” -assembled by the generator, While not every datasource will enable re-fetching a -specific batch of data, GE can store snapshots of batches or store metadata from an -external data version control system. - -See more detailed documentation on the :ref:`generator_module`. diff --git a/docs/features/batch_kwargs_generator.rst b/docs/features/batch_kwargs_generator.rst new file mode 100644 index 000000000000..b4f925dd3ade --- /dev/null +++ b/docs/features/batch_kwargs_generator.rst @@ -0,0 +1,35 @@ +.. 
_batch_kwargs_generator: + +######################## +Batch Kwargs Generators +######################## + +Batch Kwargs are specific instructions for a Datasource about what data should be prepared as a “batch” for +validation. The batch could be a specific database table, the most recent log file delivered to S3, or even a subset +of one of those objects such as the first 10,000 rows. + +A BatchKwargsGenerator builds those instructions for GE datasources by inspecting storage backends or data, or by +maintaining configuration such as commonly-used paths or filepath conventions. That allows BatchKwargsGenerators to add +flexibility in how to obtain data such as by exposing time-based partitions or sampling data. + +For example, a Batch Kwargs Generator could be **configured** to produce a SQL query that logically represents "rows in +the Events table with a type code of 'X' that occurred within seven days of a given timestamp." With that +configuration, you could provide a timestamp as a partition name, and the Batch Kwargs Generator will produce +instructions that a SQLAlchemyDatasource could use to materialize a SQLAlchemyDataset corresponding to that batch of +data and ready for validation. + +******** +Batch +******** + +A batch is a sample from a data asset, sliced according to a particular rule. +For example, an hourly slice of the Events table or “most recent `users` records.” + +A Batch is the primary unit of validation in the Great Expectations DataContext. +Batches include metadata that identifies how they were constructed--the same “batch_kwargs” +assembled by the generator, "batch_markers" that provide more detailed metadata to aid in replicating complicated +workflows, and optionally "batch_parameters" that include information such as an asset or partition name. + +See more detailed documentation on the :ref:`generator_module`. + +*last updated*: |lastupdate|
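To make the new terminology concrete, here is a minimal sketch of the kind of batch_kwargs a generator might emit. This is illustrative only: the exact keys depend on the datasource and generator, and the paths, query, and datasource names below are hypothetical.

.. code-block:: python

    # Hypothetical batch_kwargs for a Pandas/filesystem datasource: point at
    # the specific file that should become the batch.
    path_batch_kwargs = {
        "path": "/data/events/2012-02-07.csv",
        "datasource": "files_datasource",
    }

    # Hypothetical batch_kwargs for a SQL datasource: a query selecting a
    # time-based partition of the Events table.
    query_batch_kwargs = {
        "query": "SELECT * FROM events WHERE event_date = '2012-02-07'",
        "datasource": "warehouse",
    }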
diff --git a/docs/features/custom_expectations.rst index 429f937d6366..1a9826ae083e 100644 --- a/docs/features/custom_expectations.rst +++ b/docs/features/custom_expectations.rst @@ -21,3 +21,5 @@ Building custom expectations is easy and allows your custom logic to become part even profiling workflows that make Great Expectations stand out. See the guide on :ref:`custom_expectations_reference` for more information on building expectations and updating DataContext configurations to automatically load batches of data with custom Data Assets. + +*last updated*: |lastupdate| diff --git a/docs/features/data_context.rst index ea0ed9a8ce25..ec4eded2ca0e 100644 --- a/docs/features/data_context.rst +++ b/docs/features/data_context.rst @@ -11,42 +11,14 @@ expectation suites, datasources, notification settings, and data fixtures. The DataContext is configured via a yml file stored in a directory called great_expectations; the configuration file as well as managed expectation suites should be stored in version control. -DataContexts use data sources you're already familiar with. Generators help introspect data stores and data execution -frameworks (such as airflow, Nifi, dbt, or dagster) to describe and produce batches of data ready for analysis. This -enables fetching, validation, profiling, and documentation of your data in a way that is meaningful within your +DataContexts manage connections to your data and compute resources, and support integration with execution +frameworks (such as airflow, Nifi, dbt, or dagster) to describe and produce batches of data ready for analysis. Those +features enable fetching, validation, profiling, and documentation of your data in a way that is meaningful within your existing infrastructure and work environment. -DataContexts use a datasource-based namespace, where each accessible type of data has a three-part -normalized *data_asset_name*, consisting of *datasource/generator/generator_asset*. - -- The datasource actually connects to a source of data and returns Great Expectations DataAssets \ - connected to a compute environment and ready for validation. - -- The Generator knows how to introspect datasources and produce identifying "batch_kwargs" that define \ - particular slices of data. - -- The generator_asset is a specific name -- often a table name or other name familiar to users -- that \ - generators can slice into batches. - -An expectation suite is a collection of expectations ready to be applied to a batch of data. Since -in many projects it is useful to have different expectations evaluate in different contexts--profiling -vs. testing; warning vs. error; high vs. low compute; ML model or dashboard--suites provide a namespace -option for selecting which expectations a DataContext returns. - - -A Great Expectations DataContext describes data assets using a three-part namespace consisting of -**datasource_name**, **generator_name**, and **generator_asset**. - -To run validation for a data_asset, we need two additional elements: - -* a **batch** to validate; in our case it is a file loaded into a Pandas DataFrame -* an **expectation_suite** to validate against - -.. image:: ../images/data_asset_namespace.png - - -In many simple projects, the datasource or generator name may be omitted and the DataContext will infer -the correct name when there is no ambiguity. +DataContexts also manage Expectation Suites. Expectation Suites combine multiple Expectation Configurations into an +overall description of a dataset. Expectation Suites should have names corresponding to the kind of data they +define, like “NPI” for National Provider Identifier data or “company.users” for a users table. The DataContext also provides other services, such as storing and substituting evaluation parameters during validation. See :ref:`data_context_evaluation_parameter_store` for more information. diff --git a/docs/features/data_docs.rst index 5cff3097d53c..e984dc9222ec 100644 --- a/docs/features/data_docs.rst +++ b/docs/features/data_docs.rst @@ -6,15 +6,7 @@ Data Docs compiles raw Great Expectations objects including Expectations and Validations into structured documents such as HTML documentation that display -key characteristics of a dataset. Together, Data Docs, Profiling, and Validation -are the three core services offered by GE. - - -Data Docs is implemented in the :py:mod:`great_expectations.render` module. - -******************* -HTML documentation -******************* +key characteristics of a dataset. HTML documentation takes expectation suites and validation results and produces clear, functional, and self-healing documentation of expected and observed data characteristics.
Together with profiling, it can help to rapidly create @@ -29,16 +21,26 @@ And then detailed statistics for each column: .. image:: ../images/movie_db_profiling_screenshot_1.jpg - -The GE DataContext uses a configurable "data documentation site" to define which artifacts to compile and how to render them as documentation. Multiple sites can be configured inside a project, each suitable for a particular data documentation use case. +The Great Expectations DataContext uses a configurable "data documentation site" to define which artifacts to compile +and how to render them as documentation. Multiple sites can be configured inside a project, each suitable for a +particular data documentation use case. For example, we have identified three common use cases for using documentation in a data project. They are to: -1. Visualize all Great Expectations artifacts in the local repo of a project as HTML: expectation suites, validation results and profiling results. -2. Maintain a "shared source of truth" for a team working on a data project. This documentation renders all the artifacts committed in the source control system (expectation suites and profiling results) and a continuously updating data quality report, built from a chronological list of validations by run id. -3. Share a spec of a dataset with a client or a partner. This is similar to API documentation in software development. This documentation would include profiling results of the dataset to give the reader a quick way to grasp what the data looks like, and one or more expectation suites that encode what is expected from the data to be considered valid. +1. Visualize all Great Expectations artifacts from the local repository of a project as HTML: expectation suites, +validation results and profiling results. -To support these (and possibly other) use cases GE has a concept of "data documentation site". Multiple sites can be configured inside a project, each suitable for a particular data documentation use case. +2. Maintain a "shared source of truth" for a team working on a data project. Such documentation renders all the +artifacts committed in the source control system (expectation suites and profiling results) and a continuously +updating data quality report, built from a chronological list of validations by run id. + +3. Share a spec of a dataset with a client or a partner. This is similar to API documentation in software +development. This documentation would include profiling results of the dataset to give the reader a quick way to +grasp what the data looks like, and one or more expectation suites that encode what is expected from the data to be +considered valid. + +To support these (and possibly other) use cases, Great Expectations lets you configure a separate documentation site for each one. Here is an example of a site: @@ -62,3 +64,5 @@ The HTML documentation generated by Great Expectations Data Docs is fully custom of these pages or create your own, see :ref:`customizing_data_docs`. See the :ref:`data_docs_reference` for more information. + +*last updated*: |lastupdate| diff --git a/docs/features/datasource.rst index 4bb450a20589..6017930f6ef4 100644 --- a/docs/features/datasource.rst +++ b/docs/features/datasource.rst @@ -5,27 +5,16 @@ Datasources ############## Datasources are responsible for connecting data and compute infrastructure. Each Datasource provides
Each Datasource provides -Great Expectations DataAssets (or batches in a DataContext) connected to a specific compute environment, such as a +Great Expectations Data Assets connected to a specific compute environment, such as a SQL database, a Spark cluster, or a local in-memory Pandas DataFrame. Datasources know how to access data from relevant sources such as an existing object from a DAG runner, a SQL database, an S3 bucket, GCS, or a local filesystem. -To bridge the gap between those worlds, Datasources interact closely with *generators* which +To bridge the gap between those worlds, Datasources can interact closely with :ref:`batch_kwargs_generator` which are aware of a source of data and can produce produce identifying information, called -"batch_kwargs" that datasources can use to get individual batches of data. They add flexibility -in how to obtain data such as with time-based partitioning, downsampling, or other techniques -appropriate for the datasource. +"batch_kwargs" that datasources can use to get individual batches of data. -For example, a generator could produce a SQL query that logically represents "rows in the Events -table with a timestamp on February 7, 2012," which a SqlAlchemyDatasource could use to materialize -a SqlAlchemyDataset corresponding to that batch of data and ready for validation. - -Since opinionated DAG managers such as airflow, dbt, prefect.io, dagster can also act as datasources -and/or generators for a more generic datasource. - -When adding custom expectations by subclassing an existing DataAsset type, use the data_asset_type parameter -to configure the datasource to load and return DataAssets of the custom type. - -See :ref:`batch_generator` for more detail about how batch generators interact with datasources and DAG runners. +See :ref:`datasource_reference` for more detail about configuring and using datasources in your DataContext. See datasource module docs :ref:`datasource_module` for more detail about available datasources. +*last updated*: |lastupdate| diff --git a/docs/features/expectations.rst b/docs/features/expectations.rst index 21c8c33b5d9d..516c83c77512 100644 --- a/docs/features/expectations.rst +++ b/docs/features/expectations.rst @@ -8,7 +8,7 @@ Expectations are the workhorse abstraction in Great Expectations. Like assertion Expectations provide a flexible, declarative language for describing expected behavior. Unlike traditional unit tests, Great Expectations applies Expectations to data instead of code. -Expectations *enhance communication* about your data and and *amplify quality* in data applications. Using expectations +Expectations *enhance communication* about your data and *amplify quality* in data applications. Using expectations helps reduce trips to domain experts and avoids leaving insights about data on the "cutting room floor." ************************** @@ -29,8 +29,8 @@ There are several paths to generating expectations: expectations. Interviewing experts and encoding their tacit knowledge of common distributions, values, or failure conditions can be can excellent way to generate expectations. -3. Exploratory Analysis. Using GE in an exploratory analysis workflow such as enabled by the ``create_expectations`` - notebook is an important way to develop experience with both raw and derived datasets and generate useful and +3. Exploratory Analysis. Using GE in an exploratory analysis workflow (e.g. 
diff --git a/docs/features/expectations.rst index 21c8c33b5d9d..516c83c77512 100644 --- a/docs/features/expectations.rst +++ b/docs/features/expectations.rst @@ -8,7 +8,7 @@ Expectations are the workhorse abstraction in Great Expectations. Like assertion Expectations provide a flexible, declarative language for describing expected behavior. Unlike traditional unit tests, Great Expectations applies Expectations to data instead of code. -Expectations *enhance communication* about your data and and *amplify quality* in data applications. Using expectations +Expectations *enhance communication* about your data and *amplify quality* in data applications. Using expectations helps reduce trips to domain experts and avoids leaving insights about data on the "cutting room floor." ************************** @@ -29,8 +29,8 @@ There are several paths to generating expectations: expectations. Interviewing experts and encoding their tacit knowledge of common distributions, values, or failure conditions can be an excellent way to generate expectations. -3. Exploratory Analysis. Using GE in an exploratory analysis workflow such as enabled by the ``create_expectations`` - notebook is an important way to develop experience with both raw and derived datasets and generate useful and +3. Exploratory Analysis. Using GE in an exploratory analysis workflow (e.g. within Jupyter notebooks) is an important \ + way to develop experience with both raw and derived datasets and generate useful and testable expectations about characteristics that may be important for the data's eventual purpose, whether reporting or feeding another downstream model or data system. @@ -41,24 +41,18 @@ Expectations come to your data Great Expectations's connect-and-expect API makes it easy to declare Expectations within the tools you already use for data exploration: jupyter notebooks, the ipython console, scratch scripts, etc. -.. code-block:: bash - - >> import great_expectations as ge - >> my_df = ge.read_csv("./tests/examples/titanic.csv") - - >> my_df.expect_column_values_to_be_in_set( - "Sex", - ["male", "female"] - ) - { - 'success': True, - 'summary_obj': { - 'unexpected_count': 0, - 'unexpected_percent': 0.0, - 'unexpected_percent_nonmissing': 0.0, - 'partial_unexpected_list': [] - } +>>> import great_expectations as ge +>>> my_df = ge.read_csv("./tests/examples/titanic.csv") +>>> my_df.expect_column_values_to_be_in_set("Sex", ["male", "female"]) +{ + 'success': True, + 'summary_obj': { + 'unexpected_count': 0, + 'unexpected_percent': 0.0, + 'unexpected_percent_nonmissing': 0.0, + 'partial_unexpected_list': [] } +} @@ -144,7 +138,7 @@ practical use cases: * Form validation and regex pattern-matching for names, URLs, dates, addresses, etc. * Checks for missing data * Crosstabs -* Distributions for statistical modeling. +* Distributions for statistical modeling. * etc. You can also add notes or even structured metadata to expectations to describe the intent of an expectation or anything @@ -160,3 +154,5 @@ else relevant for understanding it: "source": "max@company.com" } ) + +*last updated*: |lastupdate| diff --git a/docs/features/metrics.rst new file mode 100644 index 000000000000..0b5a3883bec8 --- /dev/null +++ b/docs/features/metrics.rst @@ -0,0 +1,24 @@ +.. _metrics: + +############## +Metrics +############## + +Metrics are values derived from one or more Data Assets that can be used to evaluate expectations or to summarize the +result of Validation. A Metric is obtained from an ExpectationValidationResult or ExpectationSuiteValidationResult by +providing the `metric_name` and `metric_kwargs` or `metric_kwargs_id`. + +A metric name is a dot-delimited string that identifies the value, such as `expect_column_values_to_be_unique.success` +or `expect_column_values_to_be_between.result.unexpected_percent`. + +Metric Kwargs are key-value pairs that identify the metric within the context of the validation, such as "column": +"Age". Different metrics may require different Kwargs. + +A metric_kwargs_id is a string representation of the Metric Kwargs that can be used as a database key. For simple +cases, it could be easily readable, such as `column=Age`, but when there are multiple keys and values or complex +values, it will most likely be an md5 hash of key/value pairs. It can also be None in the case that there are no +kwargs required to identify the metric. + +See the :ref:`metrics_reference` or :ref:`metrics_tutorial` for more information. + +*Last updated:* |lastupdate|
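As a sketch of how a metric could be pulled out of a validation result: the ``get_metric`` accessor name below follows the access pattern described above but should be treated as an assumption, and the column data is purely illustrative.

.. code-block:: python

    import great_expectations as ge

    # Build a tiny in-memory data asset and validate one expectation.
    npi = ge.dataset.PandasDataset({"provider_id": [1, 2, 3]})
    npi.expect_column_values_to_be_between("provider_id", min_value=0, max_value=10)
    validation_result = npi.validate()

    # Metric names are dot-delimited; kwargs such as the column name identify
    # the metric within the result (assumed get_metric accessor).
    unexpected_percent = validation_result.get_metric(
        "expect_column_values_to_be_between.result.unexpected_percent",
        column="provider_id",
    )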
diff --git a/docs/features/profilers.rst new file mode 100644 index 000000000000..a28248be5836 --- /dev/null +++ b/docs/features/profilers.rst @@ -0,0 +1,24 @@ +.. _profilers: + +############## +Profilers +############## + +Great Expectations provides a mechanism to automatically generate expectations, using a feature called a `Profiler`. A +Profiler builds an Expectation Suite from one or more Data Assets. It usually also validates the data against the +newly-generated Expectation Suite to return a Validation Result. There are several Profilers included with Great +Expectations. + +A Profiler makes it possible to quickly create a starting point for generating expectations about a Dataset. For +example, during the `init` flow, Great Expectations uses the `SampleExpectationsDatasetProfiler` to demonstrate +important features of Expectations by creating and validating an Expectation Suite that has several different kinds of +expectations built from a small sample of data. A Profiler is also critical to generating the Expectation Suites used +during :ref:`profiling`. + +You can also extend Profilers to capture organizational knowledge about your data. For example, a team might have a +convention that all columns **named** "id" are primary keys, whereas all columns ending with the +**suffix** "_id" are foreign keys. In that case, when the team using Great Expectations first encounters a new +dataset that follows the convention, a Profiler could use that knowledge to add an expect_column_values_to_be_unique +Expectation to the "id" column (but not, for example, an "address_id" column). + +*Last updated:* |lastupdate| diff --git a/docs/features/profiling.rst index fc39592f18a3..f380679d2766 100644 --- a/docs/features/profiling.rst +++ b/docs/features/profiling.rst @@ -4,12 +4,18 @@ Profiling ############## -Profiling evaluates a data asset and summarizes its observed characteristics. By computing the observed properties of -data, Profiling helps to reason about the data's expected properties when creating expectation suites. +Profiling is a way of rendering Validation Results to produce a summary of observed characteristics. When Validation +Results are rendered as Profiling data, they create a new section in :ref:`data_docs`. By computing the **observed** +properties of data, Profiling helps to understand and reason about the data's **expected** properties. -Profiling results are usually rendered into HTML - see :ref:`data_docs`. -GE ships with the default BasicDatasetProfiler, which will produce an expectation_suite and so validation_results -that compile to a page for each table or DataFrame including an overview section: +To produce a useful data overview, Great Expectations uses a :ref:`profiler <profilers>` to build a special Expectation +Suite. Unlike the Expectations that are typically used for data validation, expectations for Profiling do not +necessarily apply any constraints. They can simply identify statistics or other data characteristics that should be +evaluated and made available in Great Expectations. For example, when the included ``BasicDatasetProfiler`` +encounters a numeric column, it will add an ``expect_column_mean_to_be_between`` expectation but choose the min_value +and max_value to both be None: essentially only saying that it expects a mean to exist. + +The default BasicDatasetProfiler will thus produce a page for each table or DataFrame including an overview section: .. image:: ../images/movie_db_profiling_screenshot_2.jpg And then detailed statistics for each column: .. image:: ../images/movie_db_profiling_screenshot_1.jpg - Profiling is still a beta feature in Great Expectations. Over time, we plan to extend and improve the ``BasicDatasetProfiler`` and also add additional profilers.
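For readers who want to try a profiler directly, a minimal sketch of profiling an in-memory batch (the ``profile`` classmethod and its two return values follow the behavior described above; the sample data is hypothetical):

.. code-block:: python

    import great_expectations as ge
    from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler

    # Keep the batch small: BasicDatasetProfiler evaluates the entire batch.
    batch = ge.dataset.PandasDataset({"rating": [3.5, 4.0, 4.5, 2.0]})

    # Profiling returns the generated Expectation Suite together with the
    # validation result produced by evaluating that suite against the batch.
    expectation_suite, validation_result = BasicDatasetProfiler.profile(batch)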
-Profiling relies on automated inspection of data batches to generate and encode expectations. Together, -encoding expectations, testing data, and presenting expectation validation results are the three core services -offered by GE. - -Warning: ``BasicDatasetProfiler`` will evaluate the entire batch -without limits or sampling, which may be very time consuming. As a rule of thumb, we recommend starting with batches -smaller than 100MB. - -**************************** -Expectations and Profiling -**************************** - -In order to characterize a data asset, profiling creates an expectation suite. Unlike the expectations that are -typically used for data validation, these expectations do not necessarily apply any constraints; they can simply -identify statistics or other data characteristics that should be evaluated and made available in GE. For example, when -the ``BasicDatasetProfiler`` it encounters a numeric column, it will add an ``expect_column_mean_to_be_between`` -expectation but choose the min_value and max_value to both be None: essentially saying only that it expects the mean -to exist. - -.. code-block:: json - - { - "expectation_type": "expect_column_mean_to_be_between", - "kwargs": { - "column": "rating", - "min_value": null, - "max_value": null - } - } - -To "profile" a datasource, therefore, the :class:`~great_expectations.profile.basic_dataset_profiler.\ -BasicDatasetProfiler` included in GE will generate a large number of very loosely-specified expectations. Effectively -it is asserting that the given statistic is relevant for evaluating batches of that data asset, but it is not yet sure -what the statistic's value should be. - -In addition to creating an expectation suite, profiling data tests the suite against data. -The validation_result contains the output of that expectation suite when validated against the same batch of data. -For a loosely specified expectation like in our example above, getting the observed value was the sole purpose of -the expectation. - -.. code-block:: json - - { - "success": true, - "result": { - "observed_value": 4.05, - "element_count": 10000, - "missing_count": 0, - "missing_percent": 0 - } - } - -Running a profiler on a data asset can also be useful to produce a large number of expectations to review -and potentially transfer to a new expectation suite used for validation in a pipeline. +Warning: ``BasicDatasetProfiler`` will evaluate the entire batch without limits or sampling, which may be very time +consuming. As a rule of thumb, we recommend starting with small batches of data. See the :ref:`profiling_reference` for more information. + +*last updated*: |lastupdate| diff --git a/docs/features/validation.rst b/docs/features/validation.rst index 5e817d6ebc89..33b48e734ec3 100644 --- a/docs/features/validation.rst +++ b/docs/features/validation.rst @@ -7,10 +7,10 @@ Validation Once you've constructed and stored Expectations, you can use them to validate new data. Validation generates a report that details any specific deviations from expected values. -We recommend using a :ref:`data_context` to manage expectation suites and coordinate validation across runs. +We recommend using :ref:`data_context` to manage expectation suites and coordinate validation across runs. 
******************* -Validation results +Validation Results ******************* The report contains information about: @@ -106,98 +106,21 @@ Reviewing Validation Results ***************************** The easiest way to review Validation Results is to view them from your local Data Docs site, where you can also conveniently -view Profiling Results and Expectation Suites. Out of the box, Great Expectations Data Docs is configured to compile a local +view Expectation Suites and, with additional configuration, Profiling Results (see :ref:`data_docs_site_configuration`). Out of the box, Great Expectations Data Docs is configured to compile a local data documentation site when you start a new project by running ``great_expectations init``. By default, this local site is -saved to the ``uncommitted/data_docs/local_site/`` directory of your project. +saved to the ``uncommitted/data_docs/local_site/`` directory of your project and will contain pages for Expectation Suites \ +and Validation Results. If you would like to review the raw validation results in JSON format, the default Validation Results directory is ``uncommitted/validations/``. Note that by default, Data Docs will only compile Validation Results located in this directory. To learn more about setting up Great Expectations for your team, read :ref:`using_ge_on_teams`. -Command-line validation -======================== - -This is especially powerful when combined with great_expectations's command line tool, which lets you validate in a one-line bash script. - -.. code-block:: bash - - $ great_expectations validate tests/examples/titanic.csv \ - tests/examples/titanic_expectations.json - { - "results" : [ - { - "expectation_type": "expect_column_to_exist", - "success": True, - "kwargs": { - "column": "Unnamed: 0" - } - }, - ... - { - "unexpected_list": 30.397989417989415, - "expectation_type": "expect_column_mean_to_be_between", - "success": True, - "kwargs": { - "column": "Age", - "max_value": 40, - "min_value": 20 - } - }, - { - "unexpected_list": [], - "expectation_type": "expect_column_values_to_be_between", - "success": True, - "kwargs": { - "column": "Age", - "max_value": 80, - "min_value": 0 - } - }, - { - "unexpected_list": [ - "Downton (?Douton), Mr William James", - "Jacobsohn Mr Samuel", - "Seman Master Betros" - ], - "expectation_type": "expect_column_values_to_match_regex", - "success": True, - "kwargs": { - "regex": "[A-Z][a-z]+(?: \\([A-Z][a-z]+\\))?, ", - "column": "Name", - "mostly": 0.95 - } - }, - { - "unexpected_list": [ - "*" - ], - "expectation_type": "expect_column_values_to_be_in_set", - "success": False, - "kwargs": { - "column": "PClass", - "value_set": [ - "1st", - "2nd", - "3rd" - ] - } - } - ] - "success", False, - "statistics": { - "evaluated_expectations": 10, - "successful_expectations": 9, - "unsuccessful_expectations": 1, - "success_percent": 90.0 - } - } - ********************* Validation Operators ********************* -The example above demonstrates how to validate one batch of data against one expectation suite. The `validate` method returns a dictionary of validation results. This is sufficient when you explore your data and get to know Great Expectations. +The example above demonstrates how to validate one batch of data against one expectation suite. The `validate` method returns a dictionary of validation results. This is sufficient when exploring your data and getting to know Great Expectations.
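As a refresher, here is a minimal sketch of that single-batch workflow (the titanic CSV ships with the Great Expectations test examples; calling ``validate()`` with no arguments re-runs the expectations declared on the data asset):

.. code-block:: python

    import great_expectations as ge

    my_df = ge.read_csv("./tests/examples/titanic.csv")
    my_df.expect_column_values_to_be_in_set("Sex", ["male", "female"])

    # Re-run every expectation declared on this data asset; the returned
    # object includes per-expectation results and summary statistics.
    results = my_df.validate()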
When deploying Great Expectations in a real data pipeline, you will typically discover additional needs: * validating a group of batches that are logically related * validating a batch against several expectation suites * doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.). Validation Operators provide a convenient abstraction for both bundling the validation of multiple expectation suites and the actions that should be taken after the validation. See the :ref:`validation_operators_and_actions` for more information. Useful deployment patterns include: * Include validation at the end of a complex data transformation, to verify that no cases were lost, duplicated, or improperly merged. * Include validation at the *beginning* of a script applying a machine learning model to a new batch of data, to verify that it is distributed similarly to the training and testing set. * Automatically trigger table-level validation when new data is dropped to an FTP site or S3 bucket, and send the validation report to the uploader and bucket owner if the data does not match expectations. * Schedule database validation jobs using cron, then capture errors and warnings (if any) and post them to a dedicated slack channel. * Validating as part of an Airflow task: if Great Expectations raises an exception, stop the DAG propagation. For certain deployment patterns, it may be useful to parameterize expectations, and supply evaluation parameters at \ validation time. See :ref:`evaluation_parameters` for more information. + +*last updated*: |lastupdate| diff --git a/docs/features/validation_operators_and_actions.rst index 750871710ffb..da4b54b0a43e 100644 --- a/docs/features/validation_operators_and_actions.rst +++ b/docs/features/validation_operators_and_actions.rst @@ -108,7 +108,7 @@ This is an example of invoking an instance of a Validation Operator from Python: validation_operator_name="perform_action_list_operator", ) -* `assets_to_validate` - an iterable that specifies the data assets that the operator will validate. The members of the list can be either batches or triples that will allow the operator to fetch the batch: (data_asset_name, expectation_suite_name, batch_kwargs) using this method: :py:meth:`~great_expectations.data_context.ConfigOnlyDataContext.get_batch` +* `assets_to_validate` - an iterable that specifies the data assets that the operator will validate. The members of the list can be either batches or triples that will allow the operator to fetch the batch: (data_asset_name, expectation_suite_name, batch_kwargs) using this method: :py:meth:`~great_expectations.data_context.BaseDataContext.get_batch` * run_id - pipeline run id, a timestamp or any other string that is meaningful to you and will help you refer to the result of this operation later * validation_operator_name - the name of an instance of a class that implements a Validation Operator @@ -127,10 +127,4 @@ The only requirement for an action is for it to have a take_action method. GE comes with a list of actions that we consider useful and that you can reuse in your pipelines. Most of them take in validation results and do something with them. - - - - - - - +*last updated*: |lastupdate| diff --git a/docs/getting_started.rst index 4325122b07bc..d4d910928f37 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -11,12 +11,17 @@ It's easy! Just use pip install: $ pip install great_expectations +(We recommend deploying within a virtual environment. If you're not familiar with pip, virtual environments, notebooks, +or git, you may want to check out the :ref:`supporting_resources` section before continuing.) + + Once Great Expectations is installed, follow this tutorial for a quick start. .. toctree:: :maxdepth: 1 /getting_started/cli_init + /getting_started/typical_workflow -(We recommend deploying within a virtual environment. If you're not familiar with pip, virtual environments, notebooks, -or git, you may want to check out the :ref:`supporting_resources` section before continuing.) + +*last updated*: |lastupdate| diff --git a/docs/getting_started/cli_init.rst index d211d7d28f08..28087d1996d7 100644 --- a/docs/getting_started/cli_init.rst +++ b/docs/getting_started/cli_init.rst @@ -3,452 +3,31 @@ Run ``great_expectations init`` =============================================== -Video ------ .. +The command line interface (CLI) provides the easiest way to start using Great Expectations. -The `init` command will walk you through setting up a new project and connecting to your data.
-Watch `the video on YouTube `_. +Make sure that the machine that you installed GE on has access to a filesystem with data files (e.g., CSV) or a database. - -Default Project Structure ----------------------------------------- - -Great Expectations provides a default project framework that simplifies operations such as connecting to data sources; -fetching, profiling and validating batches of data; and compiling to human-readable documentation. - -This tutorial uses example data from the United States Centers for Medicare and Medicaid Services `National Provider +If you prefer to use some sample data first, we suggest this example data from the United States Centers for Medicare and Medicaid Services `National Provider Identifier Standard `_ -(NPI). If you want to follow along with this exact example, start with: +(NPI). Some later Great Expectations tutorials use this dataset, so this will make it easy to follow along. + +To download this sample dataset: .. code-block:: bash git clone https://github.com/superconductive/ge_example_project.git cd ge_example_project -By default, everything in the Great Expectations deployment framework will be expressed in a directory structure -within a ``great_expectations/`` folder within your version control system. To create this folder, navigate to the -root of your project directory in a terminal and run: - -.. code-block:: bash - - great_expectations init - -The command line interface (CLI) will scaffold and populate the configuration -and other artifacts necessary to get started with Great Expectations. This can -be run to start a new project and to onboard a teammate to an existing project. - - -If you inspect the ``great_expectations/`` directory after the init command has run, it should contain: - -.. code-block:: bash - - great_expectations - ├── .gitignore - ├── datasources - ├── expectations - ├── fixtures - ├── great_expectations.yml - ├── notebooks - │   ├── pandas - │   ├── spark - │   └── sql - ├── plugins - └── uncommitted - ├── config_variables.yml - ├── documentation - │   └── local_site - ├── samples - └── validations - - -Adding Datasources ----------------------------------------- - -Next, the CLI will ask you if you want to configure a Datasource. - -Datasources allow you to configure connections to data to evaluate Expectations. Great Expectations currently supports -native evaluation of Expectations in three compute environments: - -1. Pandas DataFrames -2. Relational databases via SQL Alchemy -3. Spark DataFrames - -A Datasource could be a local pandas environment with some configuration to parse CSV files from a directory; a -connection to postgresql instance; a Spark cluster connected to an S3 bucket; etc. In the future, we plan to add -support for other compute environments, such as dask. (If you'd like to use or contribute to those environments, -please chime in on `GitHub issues `_.) - -Our example project has a ``data/`` folder containing several CSVs. Within the CLI, we can configure a Pandas DataFrame -Datasource like so: - -.. code-block:: bash - - ========== Datasources ========== - - See https://docs.greatexpectations.io/en/latest/features/datasource.html for more information about datasources. - - - Configure a datasource: - 1. Pandas DataFrame - 2. Relational database (SQL) - 3. Spark DataFrame - 4. Skip datasource configuration - : 1 - 1 - - Enter the path of the root directory where the data files are stored. - (The path may be either absolute or relative to current directory.) 
- : data - - Give your new data source a short name. - [data__dir]: - - - -This step adds a new block for Datasource configuration to ``great_expectations/great_expectations.yml``. Don't worry -about these details yet. For now, it's enough to know that we've configured a Datasource and the configuration -information is stored in this file. - -.. code-block:: bash - - datasources: - data__dir: - class_name: PandasDatasource - data_asset_type: - class_name: PandasDataset - generators: - default: - class_name: SubdirReaderGenerator - base_directory: ../data - reader_options: - sep: - engine: python - -For a SQL data source, configuration would look like this instead: - -.. code-block:: bash - - ========== Datasources ========== - - See https://docs.greatexpectations.io/en/latest/features/datasource.html for more information about datasources. - - - Configure a datasource: - 1. Pandas DataFrame - 2. Relational database (SQL) - 3. Spark DataFrame - 4. Skip datasource configuration - : 2 - 2 - - Give your new data source a short name. - [mydb]: my_db - - Great Expectations relies on sqlalchemy to connect to relational databases. - Please make sure that you have it installed. - - Next, we will configure database credentials and store them in the "my_db" section - of this config file: great_expectations/uncommitted/credentials/profiles.yml: - - What is the driver for the sqlalchemy connection? [postgres]: postgres - What is the host for the sqlalchemy connection? [localhost]: my_db_host.internal.priv - What is the port for the sqlalchemy connection? [5432]: - What is the username for the sqlalchemy connection? [postgres]: user - What is the password for the sqlalchemy connection?: - What is the database name for the sqlalchemy connection? [postgres]: - - -The corresponding config would be: - -.. code-block:: bash - - datasources: - my_db: - class_name: SqlAlchemyDatasource - credentials: ${my_db} - data_asset_type: - class_name: SqlAlchemyDataset - generators: - default: - class_name: TableGenerator - -Note: the SQL credentials you entered are stored in the ``uncommitted/config_variables.yml`` file. -Note that this file goes in the ``uncommitted/`` directory, which should *NOT* be committed to source control. -The ${my_db} variable is substituted with the credentials at runtime. - - -Configuring Slack Notifications ----------------------------------------- - -Great Expectations can post messages to a Slack channel each time a dataset is validated. This helps teams to monitor -data quality in their pipeline in real time. Here is what these messages look like: - -.. image:: ../images/validation_result_slack_message_example.png - :width: 400px -The ``great_expectations init`` command prompts you to enter a Slack webhook URL to enable this functionality. - -Obtaining this URL is easy. This article walks you through the steps: -`Incoming Webhooks For Slack `_ - -Since Slack webhook URLs are security credentials, we store them in the ``uncommitted/config_variables.yml`` file that -will not be checked in into your source control. The config property name is `validation_notification_slack_webhook` - -If you don't have a Slack webhook URL right now, you can decline the ``init`` command's prompt and configure this -feature later. - -Profiling data ----------------------------------------- - -Now that we've configured a DataSource, the next step is to profile it. Profiling will generate a very loose set of -Expectations for your data. 
By default, they will cover a wide range of statistics and other characteristics -of the Dataset that could be useful for future validation and data exploration. - -Profiling will also evaluate those Expectations against your actual data, producing a set of Expectation -Validation Results (EVRs), which will contain observed values and other context derived from the data itself. - -Profiling results can provide a lot of useful information for creating the Expectations you will -use later. They also provide the raw materials for first-pass data documentation. For more details on profiling, -please see :ref:`profiling`. - -Within the CLI, it's easy to profile our data. - -Note: the current default profiler uses first 1000 records of a table (or a file). - -.. code-block:: bash - - ========== Profiling ========== - - Profiling 'data__dir' will create expectations and documentation. - - Found 1 data assets from generator default - - Would you like to profile 'data__dir'? - [Y/n]: - Profiling 'data__dir' with 'BasicDatasetProfiler' - Profiling all 1 data assets from generator default - Profiling 'npidata'... - Preparing column 1 of 329: NPI - Preparing column 2 of 329: Entity Type Code - ... - ... - Preparing column 329 of 329: Healthcare Provider Taxonomy Group_15 - 2039 expectation(s) included in expectation_suite. - Profiled 329 columns using 18877 rows from npidata (17.647 sec) - - Profiled 1 of 1 named data assets, with 18877 total rows and 329 columns in 17.65 seconds. - Generated, evaluated, and stored 2039 Expectations. Please review results using data-docs. - -The default profiler (``BasicDatasetProfiler``) will add two JSON files in your ``great_expectations/`` directory. -They will be placed in subdirectories that include the three components of names described above. Great -Expectations' DataContexts can fetch these objects by name, so you won't usually need to access these files directly. -Still, it's useful to see how they're stored, to get a sense for how namespaces work. - -.. code-block:: bash - - great_expectations - ├── .gitignore - ├── datasources - ├── expectations - │   └── data__dir - │   └── default - │   └── npidata - │   └── BasicDatasetProfiler.json - ├── fixtures - ├── great_expectations.yml - ├── notebooks - │   ├── pandas - │   ├── spark - │   └── sql - ├── plugins - └── uncommitted - ├── config_variables.yml - ├── documentation - │   ├── local_site - │   └── team_site - ├── samples - └── validations - └── profiling - └── data__dir - └── default - └── npidata - └── BasicDatasetProfiler.json - - -We won't go into full detail on the contents of Expectation and EVR objects here. But as a quick illustration, -Expectation Suite JSON objects consist mainly of Expectations like: - -.. code-block:: json - - { - "expectation_type": "expect_column_distinct_values_to_be_in_set", - "kwargs": { - "column": "Entity Type Code", - "value_set": null, - "result_format": "SUMMARY" - }, - "meta": { - "BasicDatasetProfiler": { - "confidence": "very low" - } - } - } - -Expectation Suites created by the BasicDatasetProfiler are very loose and unopinionated. (Hence, the null -``value_set`` parameter.) They are more like placeholders for Expectations than actual Expectations. -(A tighter Expectation might include something like ``value_set=[1, 2]``.) That said, even these loose -Expectations can be evaluated against data to produce EVRs. - -EVRs contain Expectations, *plus* validation results from a evaluation against a specific batch of data. - -.. 
code-block:: bash - - { - "success": true, - "result": { - "observed_value": [ - 1.0, - 2.0 - ], - "element_count": 18877, - "missing_count": 382, - "missing_percent": 2.023626635588282, - "details": { - "value_counts": [ - { - "value": 1.0, - "count": 15689 - }, - { - "value": 2.0, - "count": 2806 - } - ] - } - }, - "expectation_config": { - "expectation_type": "expect_column_distinct_values_to_be_in_set", - "kwargs": { - "column": "Entity Type Code", - "value_set": null, - "result_format": "SUMMARY" - }, - "meta": { - "BasicDatasetProfiler": { - "confidence": "very low" - } - } - }, - "exception_info": { - "raised_exception": false, - "exception_message": null, - "exception_traceback": null - } - } - -The full Expectation Suite and EVR are JSON objects that also contain additional metadata, which we won't go into here. -For more information about these objects please see :ref:`validation_result`. - -Data Docs ----------------------------------------------------------- - -Expectation Suites and EVR's contain a huge amount of useful information about your data, but they aren't very easy to -consume as JSON objects. To make them more accessible, Great Expectations provides tools to render Expectation Suites -and EVRs to documentation. - -We call this feature "Compile to Docs." This approach to documentation has two significant advantages. - -First, for engineers, Compile to Docs makes it possible to automatically keep your documentation in sync with your -tests. This prevents documentation rot and can save a huge amount of time on otherwise unrewarding document maintenance. - -Second, the ability to translate Expectations back and forth betwen human- and machine-readable formats opens up -many opportunities for domain experts and stakeholders who aren't engineers to collaborate more closely with -engineers on data applications. - -Within the CLI, we compile to documentation as follows: - -.. code-block:: bash - - ========== Data Docs ========== - - Great Expectations can create data documentation from the data you just profiled. - - To learn more: https://docs.greatexpectations.io/en/latest/features/data_docs.html - - Build HTML Data Docs? [Y/n]: - - Building Data Docs... - ... - - The following data documentation HTML sites were generated: - - local_site: - great_expectations/uncommitted/data_docs/local_site/index.html - - -Opening `great_expectations/uncommitted/data_docs/local_site/index.html` in a browser will give you a page like: - -.. image:: ../images/index_render.png - -Clicking through to the profiling results will present an overview of the data, built from expectations and validated -using the batch that was just profiled. - -.. image:: ../images/profiling_render.png - -Clicking through to the second link will show you descriptive data documentation. This renders the full content of validation results, not just the Expectations themselves. - -.. image:: ../images/prescriptive_render.png - - -Note also that the default ``great_expectations/`` setup stores compiled documentation in the ``uncommitted/data_docs/`` -directory, with a subdirectory structure that mirrors the project namespace. - -After the init command completes, you should see the following directory structure : +Once you have decided which data you will use, you are ready to start. Run this command in the terminal: .. 
code-block:: bash

-    great_expectations
-    ├── .gitignore
-    ├── datasources
-    ├── expectations
-    │   └── data__dir
-    │       └── default
-    │           └── npidata
-    │               └── BasicDatasetProfiler.json
-    ├── fixtures
-    ├── great_expectations.yml
-    ├── notebooks
-    │   ├── pandas
-    │   ├── spark
-    │   └── sql
-    ├── plugins
-    └── uncommitted
-        ├── config_variables.yml
-        ├── documentation
-        │   └── local_site
-        │       ├── expectations
-        │       │   └── data__dir
-        │       │       └── default
-        │       │           └── npidata
-        │       │               └── BasicDatasetProfiler.html
-        │       ├── index.html
-        │       └── validations
-        │           └── profiling
-        │               └── data__dir
-        │                   └── default
-        │                       └── npidata
-        │                           └── BasicDatasetProfiler.html
-        └── validations
-            └── profiling
-                └── data__dir
-                    └── default
-                        └── npidata
-                            └── BasicDatasetProfiler.json
+    great_expectations init

-Next Steps
------------
+After you complete the `init` command, read this article to get a more complete picture of how data teams use Great Expectations: :ref:`typical_workflow`.

-Once you have opened datadocs, a prompt will suggest possible next steps, such as to :ref:`tutorial_create_expectations` or
-:ref:`tutorial_validate_data`.
+*last updated*: |lastupdate|
diff --git a/docs/getting_started/typical_workflow.rst b/docs/getting_started/typical_workflow.rst
new file mode 100644
index 000000000000..5562a22faae2
--- /dev/null
+++ b/docs/getting_started/typical_workflow.rst
@@ -0,0 +1,345 @@
+
+Typical Workflow
+===============================================
+
+This article describes how data teams typically use Great Expectations.
+
+The objective of this workflow is to gain control and confidence in your data pipeline and to address the challenges of validating and monitoring the quality and accuracy of your data.
+
+Once the setup is complete, the workflow looks like a loop over the following steps:
+
+1. Data team members capture and document their shared understanding of their data as expectations.
+2. As new data arrives in the pipeline, Great Expectations evaluates it against these expectations.
+3. If the observed properties of the data are found to be different from the expected ones, the team responds by rejecting (or fixing) the data, updating the expectations, or both.
+
+The article focuses on the "What" and the "Why" of each step in this workflow, and touches on the "How" only briefly. The exact details of configuring and executing these steps are intentionally left out - they can be found in the tutorials and reference linked from each section.
+
+If you have not installed Great Expectations and executed the CLI init command, as described in this :ref:`tutorial`, we recommend you do so before reading the rest of the article. This will make many of the concepts mentioned below more familiar to you.
+
+
+Setting up a project
+----------------------------------------
+
+To use Great Expectations in a new data project, a :ref:`Data Context` needs to be initialized.
+You will see references to the Data Context throughout the documentation.
+A Data Context provides the core services used in a Great Expectations project.
+
+The command line interface (CLI) command ``init`` does the initialization. Run this command in the terminal in the root of your project's repo:
+
+.. code-block:: bash
+
+    great_expectations init
+
+This command has to be run only once per project.
+
+The command creates a ``great_expectations`` subdirectory in the current directory.
+The team member who runs it commits the generated directory into version control. The contents of ``great_expectations`` look like this:
+
+.. code-block:: bash
+
+    great_expectations
+    ...
+    ├── expectations
+    ...
+    ├── great_expectations.yml
+    ├── notebooks
+    ...
+    ├── .gitignore
+    └── uncommitted
+        ├── config_variables.yml
+        ├── documentation
+        │   └── local_site
+        └── validations
+
+* The ``great_expectations/great_expectations.yml`` configuration file defines how to access the project's data, expectations, validation results, etc.
+* The ``expectations`` directory is where the expectations are stored as JSON files.
+* The ``uncommitted`` directory is the home for files that should not make it into version control - it is configured to be excluded in the ``.gitignore`` file. Each team member will have their own contents in this directory. These files stay out of version control for two main reasons:
+
+  * They contain sensitive information. For example, for the ``great_expectations/great_expectations.yml`` configuration file to be committed safely, it must not contain any database credentials or other secrets. These secrets are stored instead in ``uncommitted/config_variables.yml``, which is not checked in.
+
+  * They are not a "primary source of truth" and can be regenerated. For example, ``uncommitted/documentation`` contains generated data documentation (this article will cover data documentation in a later section).
+
+
+Adding Datasources
+----------------------------------------
+
+Evaluating an expectation against a batch of data is the fundamental operation in Great Expectations.
+
+For example, imagine that we have a movie ratings table in the database. This expectation says that we expect the column "rating" to take only the values 1, 2, 3, 4 and 5:
+
+.. code-block:: json
+
+    {
+      "kwargs": {
+        "column": "rating",
+        "value_set": [1, 2, 3, 4, 5]
+      },
+      "expectation_type": "expect_column_distinct_values_to_be_in_set"
+    }
+
+When Great Expectations evaluates this expectation against a dataset that has a column named "rating", it returns a validation result saying whether the data meets the expectation.
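+
+To make this concrete, here is a minimal sketch of evaluating the expectation above. For illustration it uses a small in-memory Pandas batch with made-up values, rather than a batch loaded through a configured Datasource:
+
+.. code-block:: python
+
+    import great_expectations as ge
+
+    # A tiny in-memory batch standing in for the movie ratings table.
+    batch = ge.dataset.PandasDataset({"rating": [1, 2, 5, 4]})
+
+    # Evaluate the expectation; the returned validation result
+    # reports whether the observed data met it.
+    result = batch.expect_column_distinct_values_to_be_in_set(
+        "rating", value_set=[1, 2, 3, 4, 5]
+    )
+    assert result.success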
+
+A :ref:`Datasource` is a connection to a compute environment (a backend such as Pandas, Spark, or a SQL-compatible database) and one or more storage environments.
+
+You can have multiple Datasources in a project (Data Context). This is useful, for example, if the team’s pipeline consists of both a Spark cluster and a Redshift database.
+
+All the Datasources that your project uses are configured in the project's configuration file ``great_expectations/great_expectations.yml``:
+
+.. code-block:: yaml
+
+    datasources:
+
+      our_product_postgres_database:
+        class_name: SqlAlchemyDatasource
+        data_asset_type:
+          class_name: SqlAlchemyDataset
+        credentials: ${prod_db_credentials}
+
+      our_redshift_warehouse:
+        class_name: SqlAlchemyDatasource
+        data_asset_type:
+          class_name: SqlAlchemyDataset
+        credentials: ${warehouse_credentials}
+
+The easiest way to add a datasource to the project is to use the CLI convenience command:
+
+.. code-block:: bash
+
+    great_expectations datasource new
+
+This command asks for the required connection attributes and tests the connection to the new Datasource.
+
+The intrepid can add Datasources by editing the configuration file directly, but there are fewer guardrails around this approach.
+
+A Datasource knows how to load data into the computation environment. For example, you can use a PySpark Datasource object to load data into a DataFrame from a directory on AWS S3. The details are beyond the scope of this article.
+
+After a team member adds a new Datasource to the Data Context, they commit the updated configuration file into version control in order to make the change available to the rest of the team.
+
+Because ``great_expectations/great_expectations.yml`` is committed into version control, the CLI command **does not store the credentials in this file**.
+Instead it saves them in a separate file, ``uncommitted/config_variables.yml``, which is not committed into version control.
+
+This means that when another team member checks out the updated configuration file with the newly added Datasource, they must add their own credentials to their ``uncommitted/config_variables.yml`` or supply them via environment variables.
+
+Setting up Data Docs
+----------------------------------------------------------
+
+:ref:`Data Docs` is a feature of Great Expectations that creates data documentation by compiling expectations and validation results into HTML.
+
+Data Docs produces a visual data quality report of what you expect from your data, and how the observed properties of your data differ from your expectations.
+It helps to keep your entire team on the same page as data evolves.
+
+Here is what the ``expect_column_distinct_values_to_be_in_set`` expectation about the `rating` column of the movie ratings table from the earlier example looks like in Data Docs:
+
+.. image:: ../images/exp_ratings_col_dist_val_set.png
+
+This approach to data documentation has two significant advantages.
+
+1. **Your docs are your tests** and **your tests are your docs.**
+For engineers, Data Docs makes it possible to **automatically keep your data documentation in sync with your tests**.
+This prevents documentation rot and can save a huge amount of time and pain maintaining documentation.
+
+2. The ability to translate expectations back and forth between human- and machine-readable formats opens up
+many opportunities for domain experts and stakeholders who aren't engineers to collaborate more closely with
+engineers on data applications.
+
+Multiple sites can be configured inside a project, each suitable for a particular use case.
+For example, some data teams use one site that has expectations and validation results from all the runs of their data pipeline for monitoring the pipeline's health,
+and another site that has only the expectations for communicating with their downstream clients.
+This is analogous to API documentation in software development.
+
+To set up Data Docs for a project, an entry ``data_docs_sites`` must be defined in the project's configuration file.
+By default, Data Docs site files are published to the local filesystem here: ``great_expectations/uncommitted/data_docs/``.
+You can see this by running:
+
+.. code-block:: bash
+
+    great_expectations docs build
+
+To make a site available more broadly, a team member could configure Great Expectations to publish the site to a shared location,
+such as an :ref:`AWS S3` bucket or GCS.
+
+The site's configuration defines what to compile and where to store results.
+Data Docs is very customizable - see the :ref:`Data Docs Reference` for more information.
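+
+For reference, here is what the default local site entry created by ``init`` looks like in ``great_expectations.yml`` (the class names and base directory below are the defaults; adjust them for additional sites):
+
+.. code-block:: yaml
+
+    data_docs_sites:
+      local_site:
+        class_name: SiteBuilder
+        store_backend:
+          class_name: TupleFilesystemStoreBackend
+          base_directory: uncommitted/data_docs/local_site/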
+
+Authoring expectation suites
+----------------------------------------------------------
+
+Earlier in this article we said that capturing and documenting the team's shared understanding of its data as expectations is the core part of this typical workflow.
+
+Expectation Suites combine multiple expectations into an overall description of a dataset. For example, a team can group all the expectations about its ``rating`` table in the movie ratings database from our previous example into an Expectation Suite and call it ``movieratings.ratings``. These names are completely flexible; the only constraint is that the name of a suite must be unique within a given project.
+
+Each Expectation Suite is saved as a JSON file in the ``great_expectations/expectations`` subdirectory of the Data Context. Users check these files into version control each time they are updated, the same way they treat their source files. This discipline allows data quality to be an integral part of versioned pipeline releases.
+
+The lifecycle of an Expectation Suite starts with creating it. Then it goes through an iterative loop of Review and Edit as the team's understanding of the data described by the suite evolves.
+
+Create
+********************************************
+
+While you could hand-author an Expectation Suite by writing a JSON file, just like with other features it is easier to let the CLI save you time and typos.
+Run this command in the root directory of your project (where the init command created the ``great_expectations`` subdirectory):
+
+.. code-block:: bash
+
+    great_expectations suite new
+
+This command prompts you to name your new Expectation Suite and to select a sample batch of data the suite will describe.
+Then it uses a sample of the selected data to add some initial expectations to the suite.
+The purpose of these expectations is to provide examples of data assertions, not to be meaningful.
+They are intended only as a starting point for you to build upon.
+
+The command concludes by saving the newly generated Expectation Suite as a JSON file and rendering the expectation suite into an HTML page in Data Docs.
+
+
+Review
+********************************************
+
+Reviewing expectations is best done visually in Data Docs. Here's an example of what that might look like:
+
+.. image:: ../images/sample_e_s_view.png
+
+Note that many of these expectations might have meaningless ranges.
+Also note that all expectations will have passed, since this is an example suite only.
+When you interactively edit your suite you will likely see failures as you iterate.
+
+
+Edit
+********************************************
+
+Editing an Expectation Suite means adding, removing, and modifying the arguments of existing expectations.
+
+Similar to writing SQL queries, Expectations are best edited interactively against your data.
+The best interface for this is a Jupyter notebook, where you can get instant feedback as you iterate.
+
+For every expectation type there is a Python method that sets its arguments, evaluates the expectation against a sample batch of data, and adds it to the Expectation Suite.
+
+The screenshot below shows the Python method and the Data Docs view for the same expectation (``expect_column_distinct_values_to_be_in_set``):
+
+.. image:: ../images/exp_html_python_side_by_side.png
+
+The Great Expectations CLI command ``suite edit`` generates a Jupyter notebook to edit a suite.
+This command saves you time by generating boilerplate that loads a batch of data and builds a cell for every expectation in the suite.
+This makes editing suites a breeze.
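+
+Inside the generated notebook, each cell simply calls the Python method for one expectation against the loaded batch. A cell for the rating expectation from earlier would look roughly like this (a sketch - the exact boilerplate the CLI generates varies):
+
+.. code-block:: python
+
+    # Re-evaluate the expectation against the sample batch; running the
+    # cell shows the validation result and updates the suite in memory.
+    batch.expect_column_distinct_values_to_be_in_set(
+        "rating", value_set=[1, 2, 3, 4, 5]
+    )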
+
+For example, to edit a suite called ``movieratings.ratings`` you would run:
+
+.. code-block:: bash
+
+    great_expectations suite edit movieratings.ratings
+
+These generated Jupyter notebooks can be discarded and should not be kept in source control, since they are auto-generated at will and may contain snippets of actual data.
+
+To make this easier still, the Data Docs page for each Expectation Suite has the CLI command syntax for you.
+Simply press the "How to Edit This Suite" button, and copy/paste the CLI command into your terminal.
+
+.. image:: ../images/edit_e_s_popup.png
+
+
+Deploying automated testing into a pipeline
+-------------------------------------------
+
+So far, your team members have used Great Expectations to capture and document their expectations about your data.
+
+It is time for your team to benefit from Great Expectations' automated testing, which systematically surfaces errors, discrepancies and surprises lurking in your data.
+A data engineer can add a :ref:`Validation Operator` to your pipeline and configure it.
+These Validation Operators evaluate the new batches of data that flow through your pipeline against the expectations your team defined in the previous sections.
+
+While data pipelines can be implemented with various technologies, at their core they are all DAGs (directed acyclic graphs) of computations and transformations over data.
+
+This drawing shows an example of a node in a pipeline that loads data from a CSV file into a database table.
+
+- Two expectation suites are deployed to monitor data quality in this pipeline.
+- The first suite validates the pipeline's input - the CSV file - before the pipeline executes.
+- The second suite validates the pipeline's output - the data loaded into the table.
+
+.. image:: ../images/pipeline_diagram_two_nodes.png
+
+To implement this validation logic, a data engineer inserts a Python code snippet into the pipeline - before and after the node. The code snippet prepares the data for the GE Validation Operator and calls the operator to perform the validation.
+
+The exact mechanism of deploying this code snippet depends on the technology used for the pipeline.
+
+If Airflow drives the pipeline, the engineer adds a new node in the Airflow DAG. This node will run a PythonOperator that executes this snippet. If the data is invalid, the Airflow PythonOperator will raise an error which will stop the rest of the execution.
+
+If the pipeline uses something other than Airflow for orchestration, this will work as long as it is possible to add a Python code snippet before and/or after a node.
+
+Below is an example of this code snippet, with comments that explain what each line does.
+
+.. code-block:: python
+
+    import great_expectations as ge
+
+    # Data Context is a GE object that represents your project.
+    # Your project's great_expectations.yml contains all the config
+    # options for the project's GE Data Context.
+    context = ge.data_context.DataContext()
+
+    datasource_name = "my_production_postgres" # a datasource configured in your great_expectations.yml
+
+    # Tell GE how to fetch the batch of data that should be validated...
+
+    # ... from the result set of a SQL query:
+    batch_kwargs = {"query": "your SQL query", "datasource": datasource_name}
+
+    # ... or from a database table:
+    # batch_kwargs = {"table": "name of your db table", "datasource": datasource_name}
+
+    # ... or from a file:
+    # batch_kwargs = {"path": "path to your data file", "datasource": datasource_name}
+
+    # ... or from a Pandas or PySpark DataFrame:
+    # batch_kwargs = {"dataset": "your Pandas or PySpark DataFrame", "datasource": datasource_name}
+
+    # Get the batch of data you want to validate.
+    # Specify the name of the expectation suite that holds the expectations.
+    expectation_suite_name = "movieratings.ratings" # this is an example of
+                                                    # a suite that you created
+    batch = context.get_batch(batch_kwargs, expectation_suite_name)
+
+    # A run id is any string that is meaningful to you and uniquely
+    # identifies this validation run - e.g., the Airflow run id or a timestamp.
+    run_id = "some_run_id"
+
+    # Call a validation operator to validate the batch.
+    # The operator will evaluate the data against the expectations
+    # and perform a list of actions, such as saving the validation
+    # result, updating Data Docs, and firing a notification (e.g., Slack).
+    results = context.run_validation_operator(
+        "action_list_operator",
+        assets_to_validate=[batch],
+        run_id=run_id)
+
+    if not results["success"]:
+        # Decide what your pipeline should do in case the data does not
+        # meet your expectations - for example, stop the pipeline:
+        raise ValueError("Batch failed validation; stopping the pipeline.")
+
+
+Responding to validation results
+----------------------------------------
+
+A :ref:`Validation Operator` is deployed at a particular point in your data pipeline.
+
+A new batch of data arrives and the operator validates it against an expectation suite (see the previous step).
+
+The :ref:`actions` of the operator store the validation result, add an HTML view of the result to the Data Docs website, and fire a configurable notification (by default, Slack).
+
+If the data meets all the expectations in the suite, no action is required. This is the beauty of automated testing. No team members have to be interrupted.
+
+If the data violates some expectations, team members must get involved.
+
+In the world of software testing, if a program does not pass a test, it usually means that the program is wrong and must be fixed.
+
+In pipeline and data testing, if data does not meet expectations, the response to a failing test is triaged into three categories:
+
+1. **The data is fine, and the validation result revealed a characteristic that the team was not aware of.**
+   The team's data scientists or domain experts update the expectations to reflect this new discovery.
+   They use the process described above in the Review and Edit sections to update the expectations while testing them against the data batch that failed validation.
+2. **The data is "broken"**, and **can be recovered.**
+   For example, the users table could have dates in an incorrect format.
+   Data engineers update the pipeline code to deal with this brokenness and fix it on the fly.
+3. **The data is "broken beyond repair".**
+   The owners of the pipeline go upstream to the team (or external partner) who produced the data and address it with them.
+   For example, columns in the users table could be missing entirely.
+   The validation results in Data Docs make it easy to communicate exactly what is broken, since they show the expectation that was not met and observed examples of non-conforming data.
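+
+To give a feel for what this triage looks like in code, here is a minimal sketch that inspects a failing validation result for a single in-memory batch (the column values are made up):
+
+.. code-block:: python
+
+    import great_expectations as ge
+
+    # A batch with an unexpected rating value (7).
+    batch = ge.dataset.PandasDataset({"rating": [1, 2, 7]})
+    result = batch.expect_column_distinct_values_to_be_in_set(
+        "rating", value_set=[1, 2, 3, 4, 5]
+    )
+
+    if not result.success:
+        # The observed values show exactly which data did not conform,
+        # which helps decide between updating the expectation, fixing
+        # the data, or escalating upstream.
+        print(result.result["observed_value"])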
diff --git a/docs/images/edit_e_s_popup.png b/docs/images/edit_e_s_popup.png
new file mode 100644
index 000000000000..b16951a5c862
Binary files /dev/null and b/docs/images/edit_e_s_popup.png differ
diff --git a/docs/images/exp_html_python_side_by_side.png b/docs/images/exp_html_python_side_by_side.png
new file mode 100644
index 000000000000..397b54c26be3
Binary files /dev/null and b/docs/images/exp_html_python_side_by_side.png differ
diff --git a/docs/images/exp_ratings_col_dist_val_set.png b/docs/images/exp_ratings_col_dist_val_set.png
new file mode 100644
index 000000000000..daf054eda469
Binary files /dev/null and b/docs/images/exp_ratings_col_dist_val_set.png differ
diff --git a/docs/images/pipeline_diagram_two_nodes.png b/docs/images/pipeline_diagram_two_nodes.png
new file mode 100644
index 000000000000..8e920f2e3e38
Binary files /dev/null and b/docs/images/pipeline_diagram_two_nodes.png differ
diff --git a/docs/images/sample_e_s_view.png b/docs/images/sample_e_s_view.png
new file mode 100644
index 000000000000..e96b41d2892d
Binary files /dev/null and b/docs/images/sample_e_s_view.png differ
diff --git a/docs/index.rst b/docs/index.rst
index c8db636503f2..0f6226d8cec0 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -7,9 +7,10 @@
 Welcome to Great Expectations!
 ################################
 
-Great Expectations is a leading tool for :ref:`profiling `, :ref:`validating `, and
-:ref:`documenting ` your data to maintain quality and improve communication between teams.
-Head over to the :ref:`intro` to learn more, or jump straight to our :ref:`getting_started` guide.
+Great Expectations is a leading tool for :ref:`validating `,
+:ref:`documenting `, and :ref:`profiling ` your data to maintain quality and improve
+communication between teams. Head over to the :ref:`intro` to learn more, or jump straight to our
+:ref:`getting_started` guide.
 
 .. toctree::
    :maxdepth: 2
@@ -30,3 +31,5 @@ Index
 
 * :ref:`genindex`
 * :ref:`modindex`
+
+*last updated*: |lastupdate|
diff --git a/docs/intro.rst b/docs/intro.rst
index 9d2b8a8ac927..62a178d197d2 100644
--- a/docs/intro.rst
+++ b/docs/intro.rst
@@ -55,11 +55,11 @@ Key features
 
 **Automated data profiling**
 
-    Writing pipeline test from scratch can be tedious and counterintuitive. Great Expectations jump starts the process by providing powerful tools for automated data profiling. This provides the double benefit of helping you explore data faster, and capturing knowledge for future documentation and testing.
+    Writing pipeline tests from scratch can be tedious and counterintuitive. Great Expectations jump starts the process by providing powerful tools for automated data profiling. This provides the double benefit of helping you explore data faster, and capturing knowledge for future documentation and testing.
 
 **DataContexts and DataSources**
 
-    ...allow you to configure connections your data stores, using names you’re already familiar with: “the ml_training_results bucket in S3,” “the Users table in Redshift.” Great Expectations provides convenience libraries to introspect most common data stores (Ex: SQL databases, data directories and S3 buckets.) We are also working to integrate with pipeline execution frameworks (Ex: airflow, dbt, dagster, prefect.io). The Great Expectations framework lets you fetch, validate, profile, and document your data in a way that’s meaningful within your existing infrastructure and work environment.
+    ...allow you to configure connections to your data stores, using names you’re already familiar with: “the ml_training_results bucket in S3,” “the Users table in Redshift.” Great Expectations provides convenience libraries to introspect most common data stores (Ex: SQL databases, data directories and S3 buckets.) We are also working to integrate with pipeline execution frameworks (Ex: Airflow, dbt, Dagster, Prefect). The Great Expectations framework lets you fetch, validate, profile, and document your data in a way that’s meaningful within your existing infrastructure and work environment.
 
 **Tooling for validation**
 
@@ -99,15 +99,15 @@ What does Great Expectations NOT do?
 
 **Great Expectations is NOT a pipeline execution framework.**
 
-    We aim to integrate seamlessly with DAG execution tools like `Spark `__, `Airflow `__, `dbt `__, `prefect `__, `dagster `__, `Kedro `__, etc. We DON'T execute your pipelines for you.
+    We aim to integrate seamlessly with DAG execution tools like `Spark `__, `Airflow `__, `dbt `__, `Prefect `__, `Dagster `__, `Kedro `__, etc. We DON'T execute your pipelines for you.
 
 **Great Expectations is NOT a data versioning tool.**
-
+
     Great Expectations does not store data itself. Instead, it deals in metadata about data: Expectations, validation results, etc. If you want to bring your data itself under version control, check out tools like: `DVC `__ and `Quilt `__.
 
-**Great Expectations currently works best in a python/bash environment.**
+**Great Expectations currently works best in a Python/Bash environment.**
 
-    Great Expectations is python-based. You can invoke it from the command line without using a python programming environment, but if you're working in another ecosystem, other tools might be a better choice. If you're running in a pure R environment, you might consider `assertR `__ as an alternative. Within the Tensorflow ecosystem, `TFDV `__ fulfills a similar function as Great Expectations.
+    Great Expectations is Python-based. You can invoke it from the command line without using a Python programming environment, but if you're working in another ecosystem, other tools might be a better choice. If you're running in a pure R environment, you might consider `assertR `__ as an alternative. Within the Tensorflow ecosystem, `TFDV `__ fulfills a similar function as Great Expectations.
 
***********************************
Who maintains Great Expectations?
@@ -115,9 +115,10 @@ Who maintains Great Expectations?
 
 Great Expectations is under active development by James Campbell, Abe Gong, Eugene Mandel and Rob Lim, with help from many others.
 
-If you have questions, comments, or just want to have a good old-fashioned chat about data pipelines, please hop on our public Slack channel:https://greatexpectations.io/slack
+If you have questions, comments, or just want to have a good old-fashioned chat about data pipelines, please hop on our public Slack channel: https://greatexpectations.io/slack
 
 If you'd like to contribute to Great Expectations, please head to the :ref:`community` section.
 
 If you'd like hands-on assistance setting up Great Expectations, establishing a healthy practice of data testing, or adding functionality to Great Expectations, please see options for consulting help `here `__.
+*last updated*: |lastupdate| diff --git a/docs/module_docs.rst b/docs/module_docs.rst index 4ce079d03989..dc41b024b4a4 100644 --- a/docs/module_docs.rst +++ b/docs/module_docs.rst @@ -16,3 +16,5 @@ Module docs /module_docs/store_module /module_docs/validation_operators_module /module_docs/great_expectations_module + +*last updated*: |lastupdate| diff --git a/docs/module_docs/data_context_module.rst b/docs/module_docs/data_context_module.rst index 3cd705447493..7ca464b8ca2a 100644 --- a/docs/module_docs/data_context_module.rst +++ b/docs/module_docs/data_context_module.rst @@ -24,3 +24,5 @@ DataContext :members: :undoc-members: :show-inheritance: + +*last updated*: |lastupdate| diff --git a/docs/module_docs/dataset_module.rst b/docs/module_docs/dataset_module.rst index 624bae8d7d45..5db2daaa7b49 100644 --- a/docs/module_docs/dataset_module.rst +++ b/docs/module_docs/dataset_module.rst @@ -81,3 +81,4 @@ util :undoc-members: :show-inheritance: +*last updated*: |lastupdate| diff --git a/docs/module_docs/datasource_module.rst b/docs/module_docs/datasource_module.rst index 8c7339e70f3e..ad5a85df19bc 100644 --- a/docs/module_docs/datasource_module.rst +++ b/docs/module_docs/datasource_module.rst @@ -41,3 +41,5 @@ SparkDFDatasource :members: :undoc-members: :show-inheritance: + +*last updated*: |lastupdate| diff --git a/docs/module_docs/generator_module.rst b/docs/module_docs/generator_module.rst index d6350bdefc36..0258044328ef 100644 --- a/docs/module_docs/generator_module.rst +++ b/docs/module_docs/generator_module.rst @@ -10,9 +10,9 @@ Generator Module :members: :undoc-members: :show-inheritance: - :exclude-members: BatchGenerator + :exclude-members: BatchKwargsGenerator - .. autoclass:: great_expectations.datasource.generator.batch_generator.BatchGenerator + .. autoclass:: great_expectations.datasource.generator.batch_kwargs_generator.BatchKwargsGenerator :members: :undoc-members: @@ -26,46 +26,48 @@ InMemoryGenerator :show-inheritance: -QueryGenerator +QueryBatchKwargsGenerator ------------------------------------------------------------------------ -.. autoclass:: great_expectations.datasource.generator.query_generator.QueryGenerator +.. autoclass:: great_expectations.datasource.generator.query_generator.QueryBatchKwargsGenerator :members: :undoc-members: :show-inheritance: -SubdirReaderGenerator +SubdirReaderBatchKwargsGenerator ---------------------------------------------------------------------------------------- -.. autoclass:: great_expectations.datasource.generator.subdir_reader_generator.SubdirReaderGenerator +.. autoclass:: great_expectations.datasource.generator.subdir_reader_generator.SubdirReaderBatchKwargsGenerator :members: :undoc-members: :show-inheritance: -GlobReaderGenerator +GlobReaderBatchKwargsGenerator ------------------------------------------------------------------------------------- -.. autoclass:: great_expectations.datasource.generator.glob_reader_generator.GlobReaderGenerator +.. autoclass:: great_expectations.datasource.generator.glob_reader_generator.GlobReaderBatchKwargsGenerator :members: :undoc-members: :show-inheritance: -S3Generator +S3GlobReaderBatchKwargsGenerator ------------------------------------------------------------------------------------- -.. autoclass:: great_expectations.datasource.generator.s3_generator.S3Generator +.. 
autoclass:: great_expectations.datasource.generator.s3_generator.S3GlobReaderBatchKwargsGenerator :members: :undoc-members: :show-inheritance: -DatabricksTableGenerator +DatabricksTableBatchKwargsGenerator --------------------------------------------------------------------------------------- -.. autoclass:: great_expectations.datasource.generator.databricks_generator.DatabricksTableGenerator +.. autoclass:: great_expectations.datasource.generator.databricks_generator.DatabricksTableBatchKwargsGenerator :members: :undoc-members: :show-inheritance: + +*last updated*: |lastupdate| diff --git a/docs/module_docs/profile_module.rst b/docs/module_docs/profile_module.rst index 74e42d06c6b0..e6a0c43f52db 100644 --- a/docs/module_docs/profile_module.rst +++ b/docs/module_docs/profile_module.rst @@ -14,3 +14,5 @@ Profile Module :members: :undoc-members: :show-inheritance: + +*last updated*: |lastupdate| diff --git a/docs/module_docs/render_module.rst b/docs/module_docs/render_module.rst index 6f6f295ff233..225647c730ac 100644 --- a/docs/module_docs/render_module.rst +++ b/docs/module_docs/render_module.rst @@ -116,3 +116,5 @@ View Module :members: :undoc-members: :show-inheritance: + +*last updated*: |lastupdate| diff --git a/docs/module_docs/store_module.rst b/docs/module_docs/store_module.rst index 3f0a5908f9b5..a12ebe9dc90a 100644 --- a/docs/module_docs/store_module.rst +++ b/docs/module_docs/store_module.rst @@ -30,3 +30,5 @@ Store Module :members: :undoc-members: :show-inheritance: + +*last updated*: |lastupdate| diff --git a/docs/module_docs/validation_operators_module.rst b/docs/module_docs/validation_operators_module.rst index 86495c37d8b6..5e28bfa7e260 100644 --- a/docs/module_docs/validation_operators_module.rst +++ b/docs/module_docs/validation_operators_module.rst @@ -36,3 +36,4 @@ WarningAndFailureExpectationSuitesValidationOperator :undoc-members: :show-inheritance: +*last updated*: |lastupdate| diff --git a/docs/reference.rst b/docs/reference.rst index 8670a2c6c1f3..c4b35ab8e158 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -87,3 +87,5 @@ Supporting Resources :maxdepth: 2 /reference/supporting_resources + +*last updated*: |lastupdate| diff --git a/docs/reference/__doctest_example.rst b/docs/reference/__doctest_example.rst new file mode 100644 index 000000000000..1ed6c4b22bad --- /dev/null +++ b/docs/reference/__doctest_example.rst @@ -0,0 +1,111 @@ +################################# +Doctest Examples +################################# + +Use these examples during migration to testable docs. + +The block below is not rendered in the final documentation, but *does* affect the namespace. + +.. invisible-code-block: python + + import great_expectations as ge + import pandas as pd + npi = ge.dataset.PandasDataset({"provider_id": [1,2,3]}) + from great_expectations.core import ExpectationValidationResult, ExpectationConfiguration + res = npi.expect_column_values_to_be_unique("provider_id") + + +This block is a standard doctest block, with one statement. + +>>> npi.expect_column_values_to_be_unique("provider_id") == ExpectationValidationResult( +... **{ +... "result": { +... "element_count": 3, +... "missing_count": 0, +... "missing_percent": 0.0, +... "unexpected_count": 0, +... "unexpected_percent": 0.0, +... "unexpected_percent_nonmissing": 0.0, +... "partial_unexpected_list": [] +... }, +... "success": True, +... "exception_info": None, +... "meta": {}, +... "expectation_config": ExpectationConfiguration(**{ +... "expectation_type": "expect_column_values_to_be_unique", +... 
"meta": {}, +... "kwargs": { +... "column": "provider_id", +... "result_format": "BASIC" +... } +... }) +... }) +True + + +This block is tested only in that it must not raise an exception. No output test happens from a code-block. + +.. code-block:: python + + assert npi.expect_column_values_to_be_unique("provider_id") != ExpectationValidationResult( + meta={}, + result={ + "element_count": 3, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + success=True, + expectation_config=ExpectationConfiguration(**{ + "expectation_type": "expect_column_values_to_be_unique", + "meta": {}, + "kwargs": { + "column": "provider_id", + "result_format": "BASIC" + } + }), + exception_info=None + ) + + +These three lines will be evaluated as classic doctest: + +>>> df = pd.read_csv("/opt/data/titanic/Titanic.csv") +>>> df = ge.dataset.PandasDataset(df) +>>> res = df.expect_column_values_to_be_in_set("Sex", ["male", "female"]) + +This section would often fail, but will be skipped because of the Sphinx comment. It **will** be rendered. + +.. skip: next + +>>> print(res) + { + "exception_info": null, + "success": true, + "meta": {}, + "result": { + "element_count": 1313, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_in_set", + "meta": {}, + "kwargs": { + "column": "Sex", + "value_set": [ + "male", + "female" + ], + "result_format": "BASIC" + } + } + } + diff --git a/docs/reference/batch_identification.rst b/docs/reference/batch_identification.rst index cc720e8b5621..17bbd0b10ab4 100644 --- a/docs/reference/batch_identification.rst +++ b/docs/reference/batch_identification.rst @@ -31,3 +31,5 @@ Batch Id ****************** Batch Fingerprint ****************** + +*last updated*: |lastupdate| diff --git a/docs/reference/batch_kwargs.rst b/docs/reference/batch_kwargs.rst index 6c0524365dfe..b49ed6691b07 100644 --- a/docs/reference/batch_kwargs.rst +++ b/docs/reference/batch_kwargs.rst @@ -9,3 +9,5 @@ Batch Kwargs represent the information required by a :ref:`Datasource` to fetch The `partition_id` provides a single string that can be used to represent a data asset inside the namespace defined by a given datasource/generator/generator_asset triple. + +*last updated*: |lastupdate| diff --git a/docs/reference/contributing.rst b/docs/reference/contributing.rst index 80517d262b74..e413eb4ccbeb 100644 --- a/docs/reference/contributing.rst +++ b/docs/reference/contributing.rst @@ -9,3 +9,5 @@ ecosystem including plugins and examples using GE. For contributing directly to great expectations, the contributors' guide is located `here `__. + +*last updated*: |lastupdate| diff --git a/docs/reference/creating_expectations.rst b/docs/reference/creating_expectations.rst index 3e1e0e352bae..5d8d4fa55b1d 100644 --- a/docs/reference/creating_expectations.rst +++ b/docs/reference/creating_expectations.rst @@ -29,3 +29,5 @@ This is how you always know what to expect from your data. >> my_df.save_expectation_suite("my_titanic_expectations.json") For more detail on how to control expectation output, please see :ref:`standard_arguments` and :ref:`result_format`. 
+ +*last updated*: |lastupdate| diff --git a/docs/reference/custom_expectations.rst b/docs/reference/custom_expectations.rst index 2829e06361f8..504e2ecf3bb2 100644 --- a/docs/reference/custom_expectations.rst +++ b/docs/reference/custom_expectations.rst @@ -355,7 +355,7 @@ A similar approach works for the command-line tool. .. code-block:: bash - >> great_expectations validate \ + >> great_expectations validation csv \ my_data_file.csv \ my_expectations.json \ dataset_class=custom_dataset.CustomPandasDataset @@ -387,7 +387,7 @@ CustomPandasDataset in a DataContext. Note the use of standard python dot notati class_name: CustomPandasDataset generators: default: - class_name: SubdirReaderGenerator + class_name: SubdirReaderBatchKwargsGenerator base_directory: /data reader_options: sep: \t @@ -432,3 +432,5 @@ structure below. "success": False, "unexpected_list": [2,2,2,2,2,2,2,2] } + +*last updated*: |lastupdate| diff --git a/docs/reference/data_asset_features.rst b/docs/reference/data_asset_features.rst index 10ffe0d1da2a..0541d5b390a1 100644 --- a/docs/reference/data_asset_features.rst +++ b/docs/reference/data_asset_features.rst @@ -23,11 +23,11 @@ At initialization .. code-block:: python - >> import great_expectations as ge - >> import pandas as pd - >> df = pd.read_csv("./tests/examples/titanic.csv") - >> ge_df = ge.dataset.PandasDataset(df, interactive_evaluation=False) - >> ge_df.expect_column_values_to_be_in_set('Sex', ["male", "female"]) + import great_expectations as ge + import pandas as pd + df = pd.read_csv("../tests/examples/titanic.csv") + ge_df = ge.dataset.PandasDataset(df, interactive_evaluation=False) + ge_df.expect_column_values_to_be_in_set('Sex', ["male", "female"]) { 'stored_configuration': { @@ -82,3 +82,4 @@ Dynamically adjusting interactive evaluation } } +*last updated*: |lastupdate| diff --git a/docs/reference/data_context_reference.rst b/docs/reference/data_context_reference.rst index 87f46647448c..f6735ca20825 100644 --- a/docs/reference/data_context_reference.rst +++ b/docs/reference/data_context_reference.rst @@ -23,7 +23,7 @@ Datasources Datasources tell Great Expectations where your data lives and how to get it. -Using the CLI command `great_expectations add-datasource` is the easiest way to +Using the CLI command ``great_expectations datasource new`` is the easiest way to add a new datasource. The `datasources` section declares which :ref:`datasource` objects should be available in the DataContext. @@ -50,7 +50,7 @@ represent two public datasets available from the resource. 
class_name: PandasDatasource generators: s3: - class_name: S3Generator + class_name: S3GlobReaderBatchKwargsGenerator bucket: nyc-tlc delimiter: '/' reader_options: @@ -78,7 +78,7 @@ Here is an example for a SQL based pipeline: class_name: SqlAlchemyDataset generators: default: - class_name: TableGenerator + class_name: TableBatchKwargsGenerator Note the ``credentials`` key references a corresponding key in the ``config_variables.yml`` file which is not in source control that would look @@ -139,15 +139,15 @@ would include the following: expectations_store: class_name: ExpectationsStore store_backend: - class_name: FixedLengthTupleFilesystemStoreBackend + class_name: TupleFilesystemStoreBackend base_directory: expectations/ validations_store: class_name: ValidationsStore store_backend: - class_name: FixedLengthTupleFilesystemStoreBackend + class_name: TupleFilesystemStoreBackend base_directory: uncommitted/validations/ evaluation_parameter_store: - class_name: InMemoryEvaluationParameterStore + class_name: EvaluationParameterStore The `expectations_store` provides access to expectations_suite objects, using the DataContext's namespace; the `validations_store` does the same for validations. See :ref:`evaluation_parameters` for more information on the @@ -168,18 +168,18 @@ providing the bucket/prefix combination: expectations_store: class_name: ExpectationsStore store_backend: - class_name: FixedLengthTupleS3StoreBackend + class_name: TupleS3StoreBackend base_directory: expectations/ bucket: ge.my_org.com prefix: validations_store: class_name: ValidationsStore store_backend: - class_name: FixedLengthTupleS3StoreBackend + class_name: TupleS3StoreBackend bucket: ge.my_org.com prefix: common_validations evaluation_parameter_store: - class_name: InMemoryEvaluationParameterStore + class_name: EvaluationParameterStore GE uses `boto3 `_ to access AWS, so credentials simply need to be available in any standard place searched by that library. You may also specify keyword arguments @@ -260,20 +260,16 @@ new directory or use this template: # Welcome to Great Expectations! Always know what to expect from your data. # - # Here you can define datasources, generators, integrations and more. This file - # is intended to be committed to your repo. For help with configuration please: + # Here you can define datasources, batch kwarg generators, integrations and + # more. This file is intended to be committed to your repo. For help with + # configuration please: # - Read our docs: https://docs.greatexpectations.io/en/latest/reference/data_context_reference.html#configuration # - Join our slack channel: http://greatexpectations.io/slack - # - # NOTE: GE uses the names of configured `datasources` and `generators` to manage - # how `expectations` and other artifacts are stored in the `expectations/` and - # `datasources/` folders. If you need to rename an existing `datasource` or - # `generator`, be sure to also update the relevant directory names. config_version: 1 # Datasources tell Great Expectations where your data lives and how to get it. - # You can use the CLI command `great_expectations add-datasource` to help you + # You can use the CLI command `great_expectations datasource new` to help you # add a new datasource. 
Read more at https://docs.greatexpectations.io/en/latest/features/datasource.html datasources: {} edw: @@ -283,7 +279,7 @@ new directory or use this template: class_name: SqlAlchemyDataset generators: default: - class_name: TableGenerator + class_name: TableBatchKwargsGenerator # This config file supports variable substitution which enables: 1) keeping # secrets out of source control & 2) environment-based configuration changes @@ -317,10 +313,10 @@ new directory or use this template: action_list: - name: store_validation_result action: - class_name: StoreAction + class_name: StoreValidationResultAction - name: store_evaluation_params action: - class_name: ExtractAndStoreEvaluationParamsAction + class_name: StoreEvaluationParametersAction - name: update_data_docs action: class_name: UpdateDataDocsAction @@ -343,17 +339,17 @@ new directory or use this template: expectations_store: class_name: ExpectationsStore store_backend: - class_name: FixedLengthTupleFilesystemStoreBackend + class_name: TupleFilesystemStoreBackend base_directory: expectations/ validations_store: class_name: ValidationsStore store_backend: - class_name: FixedLengthTupleFilesystemStoreBackend + class_name: TupleFilesystemStoreBackend base_directory: uncommitted/validations/ evaluation_parameter_store: # Evaluation Parameters enable dynamic expectations. Read more here: # https://docs.greatexpectations.io/en/latest/reference/evaluation_parameters.html - class_name: InMemoryEvaluationParameterStore + class_name: EvaluationParameterStore expectations_store_name: expectations_store validations_store_name: validations_store evaluation_parameter_store_name: evaluation_parameter_store @@ -366,7 +362,7 @@ new directory or use this template: local_site: class_name: SiteBuilder store_backend: - class_name: FixedLengthTupleFilesystemStoreBackend + class_name: TupleFilesystemStoreBackend base_directory: uncommitted/data_docs/local_site/ - +*last updated*: |lastupdate| diff --git a/docs/reference/data_docs_reference.rst b/docs/reference/data_docs_reference.rst index fa745533e6e9..508bba8e9f49 100644 --- a/docs/reference/data_docs_reference.rst +++ b/docs/reference/data_docs_reference.rst @@ -16,6 +16,8 @@ add a new site to the configuration is to copy the "local_site" configuration block in great_expectations.yml, give the copy a new name and modify the details as needed. +.. _data_docs_site_configuration: + *************************************** Data Docs Site Configuration *************************************** @@ -28,7 +30,7 @@ The default Data Docs site configuration looks like this: local_site: class_name: SiteBuilder store_backend: - class_name: FixedLengthTupleFilesystemStoreBackend + class_name: TupleFilesystemStoreBackend base_directory: uncommitted/data_docs/local_site/ Here is an example of a site configuration from great_expectations.yml with defaults defined explicitly: @@ -37,11 +39,10 @@ Here is an example of a site configuration from great_expectations.yml with defa data_docs_sites: local_site: # site name - datasource_whitelist: '*' # used to restrict the Datasources module_name: great_expectations.render.renderer.site_builder class_name: SiteBuilder store_backend: - class_name: FixedLengthTupleFilesystemStoreBackend + class_name: TupleFilesystemStoreBackend base_directory: uncommitted/data_docs/local_site/ site_index_builder: class_name: DefaultSiteIndexBuilder @@ -77,6 +78,25 @@ attribute allows to include (``eq`` for exact match) or exclude (``ne``) validat .. 
_customizing_data_docs_store_backend: +Limiting Validation Results +============================ + +If you would like to limit rendered Validation Results to the n most-recent, you may +do so by setting the `validation_results_limit` key in your Data Docs configuration: + +.. code-block:: yaml + + data_docs_sites: + local_site: + class_name: SiteBuilder + store_backend: + class_name: TupleFilesystemStoreBackend + base_directory: uncommitted/data_docs/local_site/ + site_index_builder: + class_name: DefaultSiteIndexBuilder + show_cta_footer: true + validation_results_limit: 5 + Automatically Publishing Data Docs ===================================== @@ -88,7 +108,7 @@ will automatically save the resulting site to that bucket. .. code-block:: yaml store_backend: - class_name: FixedLengthTupleS3StoreBackend + class_name: TupleS3StoreBackend bucket: data-docs.my_org.org prefix: @@ -112,7 +132,7 @@ the validations renderer, and no profiling results are rendered at all. local_site: class_name: SiteBuilder store_backend: - class_name: FixedLengthTupleFilesystemStoreBackend + class_name: TupleFilesystemStoreBackend base_directory: uncommitted/data_docs/local_site/ site_section_builders: expectations: @@ -162,7 +182,7 @@ suites available to the configured context and validations available in the .. code-block:: bash - great_expectations build-docs + great_expectations docs build When called without additional arguments, this command will render all the Data @@ -173,9 +193,9 @@ The command will print out the locations of index.html file for each site. To disable the web browser opening behavior, use the `--no-view` option. -To render just one site, use `--site_name SITE_NAME` option. +To render just one site, use the `--site-name SITE_NAME` option. -Here is when the `build-docs` command should be called: +Here is when the ``docs build`` command should be called: * when you want to fully rebuild a Data Docs site * after a new expectation suite is added or an existing one is edited @@ -201,18 +221,16 @@ for how to profile a single batch of data and build documentation from the valid from great_expectations.data_context.util import safe_mmkdir from great_expectations.render.view import DefaultJinjaPageView - profiling_html_filepath = '/path/into/which/to/save/results' + profiling_html_filepath = '/path/into/which/to/save/results.html' # obtain the DataContext object context = ge.data_context.DataContext() - # load a batch from the data asset - data_asset_name = context.normalize_data_asset_name('ratings') - context.create_expectation_suite(data_asset_name, 'default'), + # load a batch to profile + context.create_expectation_suite('default') batch = context.get_batch( - data_asset_name=data_asset_name, + batch_kwargs=context.build_batch_kwargs("my_datasource", "my_batch_kwargs_generator", "my_asset"), expectation_suite_name='default', - batch_kwargs=context.yield_batch_kwargs(data_asset_name) ) # run the profiler on the batch - this returns an expectation suite and validation results for this suite @@ -348,7 +366,7 @@ Before modifying your project configuration, the relevant section looks like thi local_site: class_name: SiteBuilder store_backend: - class_name:
FixedLengthTupleFilesystemStoreBackend + class_name: TupleFilesystemStoreBackend base_directory: uncommitted/data_docs/local_site/ site_section_builders: expectations: @@ -389,7 +407,7 @@ Note that if your ``data_docs_sites`` configuration contains a ``site_section_bu defaults for anything you would like rendered. By omitting the ``profiling`` key within ``site_section_builders``, your third goal is achieved and Data Docs will no longer render Profiling Results pages. -Lastly, to compile your newly-customized Data Docs local site, you run ``great_expectations build-docs`` from the command line. +Lastly, to compile your newly-customized Data Docs local site, you run ``great_expectations docs build`` from the command line. ``site_section_builders`` defaults: @@ -454,3 +472,6 @@ Dependencies * Vega-Lite 3.2.1 * Vega-Embed 4.0.0 +Data Docs is implemented in the :py:mod:`great_expectations.render` module. + +*last updated*: |lastupdate| diff --git a/docs/reference/datasource_reference.rst b/docs/reference/datasource_reference.rst new file mode 100644 index 000000000000..38aa0454ca26 --- /dev/null +++ b/docs/reference/datasource_reference.rst @@ -0,0 +1,32 @@ +.. _datasource_reference: + +############################# +Datasource Reference +############################# + +To have a Datasource produce Data Assets of a custom type, such as when adding custom expectations by subclassing an +existing DataAsset type, use the `data_asset_type` parameter to configure the datasource to load and return DataAssets +of the custom type. + +For example: + +.. code-block:: yaml + + datasources: + pandas: + class_name: PandasDatasource + data_asset_type: + class_name: MyCustomPandasAsset + module_name: internal_pandas_assets + +Given the above configuration, we can observe the following: + +>>> batch_kwargs = { ... "datasource": "pandas", ... "dataset": {"a": [1, 2, 3]} ... } +>>> batch = context.get_batch(batch_kwargs, my_suite) +>>> isinstance(batch, MyCustomPandasAsset) True + +*Last updated:* |lastupdate| diff --git a/docs/reference/distributional_expectations.rst b/docs/reference/distributional_expectations.rst index 7ec4ffc379ae..477d8eb34465 --- a/docs/reference/distributional_expectations.rst +++ b/docs/reference/distributional_expectations.rst @@ -48,11 +48,25 @@ For continuous data: Example continuous partition object: .. code-block:: python - - { - "bins": [ 0, 1, 2, 10], - "weights": [0.3, 0.3, 0.4] - } + partition = { + "bins": [0, 1, 2, 10], + "weights": [0.3, 0.3, 0.4] + } + +>>> import json +>>> print(json.dumps(partition, indent=2)) { "bins": [ 0, 1, 2, 10 ], "weights": [ 0.3, 0.3, 0.4 ] } For discrete/categorical data: @@ -106,3 +120,5 @@ For categorical data, the expect_column_chisquare_test_p_value_to_be_greater_tha Distributional Expectations Alternatives -------------------------------------------------------------------------------- The core partition density object used in current expectations focuses on a particular (partition-based) method of "compressing" the data into a testable form; however, it may be desirable to use alternative nonparametric approaches (e.g. Fourier transform/wavelets) to describe expected data.
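To make the shape of the partition object above concrete, here is a quick sanity check one can run before handing such an object to a distributional expectation. This is a plain-Python illustrative sketch, not part of the Great Expectations API: a valid continuous partition has exactly one more bin edge than it has weights, and the weights should sum to 1.

.. code-block:: python

    # Illustrative sanity check for a continuous partition object.
    partition = {
        "bins": [0, 1, 2, 10],
        "weights": [0.3, 0.3, 0.4],
    }

    # One more bin edge than weights.
    assert len(partition["bins"]) == len(partition["weights"]) + 1
    # Weights are fractions of the data and should sum to 1.
    assert abs(sum(partition["weights"]) - 1.0) < 1e-9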
+ +*last updated*: |lastupdate| diff --git a/docs/reference/evaluation_parameters.rst b/docs/reference/evaluation_parameters.rst index 7aa871cd1d9a..6243f1b48d9f 100644 --- a/docs/reference/evaluation_parameters.rst +++ b/docs/reference/evaluation_parameters.rst @@ -4,55 +4,44 @@ Evaluation Parameters ###################### -Often, the specific parameters associated with an expectation will be derived from upstream steps in a processing \ -pipeline. For example, we may want to `expect_table_row_count_to_equal` a value stored in a previous step, but we \ -may still want to ensure that we can use the same expectation configuration object. +Often, the specific parameters associated with an expectation will be derived from upstream steps in a processing +pipeline. For example, we may want to `expect_table_row_count_to_equal` a value stored in a previous step. -Great Expectations makes working with parameters of that kind easy! When declaring an expectation, you can specify that \ -a particular argument is an evaluation parameter that should be substituted at evaluation time, and provide a temporary \ -value that should be used during the initial evaluation of the expectation. +Great Expectations makes it possible to use "Evaluation Parameters" to accomplish that goal. We declare Expectations +using parameters that need to be provided at validation time; during interactive development, we can even provide a +temporary value that should be used during the initial evaluation of the expectation. -.. code-block:: python - - >> my_df.expect_table_row_count_to_equal( - value={"$PARAMETER": "upstream_row_count", - "$PARAMETER.upstream_row_count": 10}, - result_format={'result_format': 'BOOLEAN_ONLY'} - ) - { - 'success': True - } +>>> my_df.expect_table_row_count_to_equal( ... value={"$PARAMETER": "upstream_row_count", "$PARAMETER.upstream_row_count": 10}, ... result_format={'result_format': 'BOOLEAN_ONLY'}) { 'success': True } You can also store parameter values in a special dictionary called evaluation_parameters that is stored in the \ expectation_suite to be available to multiple expectations or while declaring additional expectations. -.. code-block:: python - - >> my_df.set_evaluation_parameter("upstream_row_count", 10) - >> my_df.get_evaluation_parameter("upstream_row_count") +>>> my_df.set_evaluation_parameter("upstream_row_count", 10) +>>> my_df.get_evaluation_parameter("upstream_row_count") 10 If a parameter has been stored, then it does not need to be provided for a new expectation to be declared: -.. code-block:: python - - >> my_df.set_evaluation_parameter("upstream_row_count", 10) - >> my_df.expect_table_row_count_to_be_between(max_value={"$PARAMETER": "upstream_row_count"}) +>>> my_df.set_evaluation_parameter("upstream_row_count", 10) +>>> my_df.expect_table_row_count_to_be_between(max_value={"$PARAMETER": "upstream_row_count"}) When validating expectations, you can provide evaluation parameters based on upstream results: -.. code-block:: python - - >> my_df.validate(expectation_suite=my_dag_step_config, evaluation_parameters={"upstream_row_count": upstream_row_count}) +>>> my_df.validate(expectation_suite=my_dag_step_config, evaluation_parameters={"upstream_row_count": upstream_row_count}) Finally, the command-line tool also allows you to provide a JSON file that contains parameters to use during evaluation: ..
code-block:: bash - >> cat my_parameters_file.json + >>> cat my_parameters_file.json { "upstream_row_count": 10 } - >> great_expectations validate --evaluation_parameters=my_parameters_file.json dataset_file.csv expectation_suite.json + >>> great_expectations validation csv --evaluation_parameters=my_parameters_file.json dataset_file.csv expectation_suite.json .. _data_context_evaluation_parameter_store: @@ -62,7 +51,7 @@ DataContext Evaluation Parameter Store *************************************** When a DataContext has a configured evaluation parameter store, it can automatically identify and store evaluation -parameters that are referenced in other expectation suites. The evaluation parameter store uses a URN schema for +parameters that are referenced in other expectation suites. The evaluation parameter store uses a URN schema for identifying dependencies between expectation suites. The DataContext-recognized URN must begin with the string ``urn:great_expectations:validations``. Valid URNs must have @@ -70,13 +59,13 @@ one of the following structures to be recognized by the Great Expectations DataC :: - urn:great_expectations:validations:<data_asset_name>:<expectation_suite_name>:expectations:<expectation_type>:columns:<column_name>:result:<result_key> - urn:great_expectations:validations:<data_asset_name>:<expectation_suite_name>:expectations:<expectation_type>:columns:<column_name>:details:<details_key> - urn:great_expectations:validations:<data_asset_name>:<expectation_suite_name>:expectations:<expectation_type>:result:<result_key> - urn:great_expectations:validations:<data_asset_name>:<expectation_suite_name>:expectations:<expectation_type>:details:<details_key> + urn:great_expectations:validations:<expectation_suite_name>:<metric_name> + urn:great_expectations:validations:<expectation_suite_name>:<metric_name>:<metric_kwargs_id> Replace names in ``<>`` with the desired name. For example: :: - urn:great_expectations:validations:my_source/default/notable_works_by_charles_dickens:my_suite:expectations:expect_column_proportion_of_unique_values_to_be_between:columns:Title:result:observed_value + urn:great_expectations:validations:dickens_data:expect_column_proportion_of_unique_values_to_be_between.result.observed_value:column=Title + +*last updated*: |lastupdate| diff --git a/docs/reference/extending_great_expectations.rst b/docs/reference/extending_great_expectations.rst index 5ba5d2bed02f..ef4d20e5fd30 --- a/docs/reference/extending_great_expectations.rst +++ b/docs/reference/extending_great_expectations.rst @@ -23,3 +23,5 @@ your dataset see consistent documentation no matter which backend is implementin `@DocInherit` overrides your function's __get__ method with one that will replace the local docstring with the docstring from its parent. It is defined in `Dataset.util`. + +*last updated*: |lastupdate| diff --git a/docs/reference/glossary.rst b/docs/reference/glossary.rst new file mode 100644 index 000000000000..c328e4f941a0 --- /dev/null +++ b/docs/reference/glossary.rst @@ -0,0 +1,123 @@ +.. _glossary: + + +################################ +Great Expectations Glossary +################################ + + +************* +Expectations +************* + +Expectations are assertions for data. They help accelerate data engineering and increase analytic integrity, by making it possible to answer a critical question: + +- What can I expect of my data? + +**Expectations** are declarative statements that a computer can evaluate, and that are semantically meaningful to +humans, like expect_column_values_to_be_unique or expect_column_mean_to_be_between. + +**Expectation Configurations** describe specific Expectations for data. They combine an Expectation and specific +parameters to make it possible to evaluate whether the expectation is true on a dataset. For example, they might provide expected values or the name of a column whose values should be unique.
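To make the idea of an Expectation Configuration concrete, here is an illustrative sketch of one as a plain Python dictionary, mirroring the serialized JSON shape used elsewhere in these docs: an expectation type plus the kwargs needed to evaluate it. The column name ``npi`` is a hypothetical example.

.. code-block:: python

    # An Expectation Configuration as it would appear when serialized:
    # the expectation type plus the parameters needed to evaluate it.
    expectation_configuration = {
        "expectation_type": "expect_column_values_to_be_unique",
        "kwargs": {"column": "npi"},  # hypothetical column name
    }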
+ +**Expectation Suites** combine multiple Expectation Configurations into an overall description of a dataset. Expectation +Suites should have names corresponding to the kind of data they define, like “NPI” for National Provider Identifier data or “company.users” for a users table. + +************************** +DataAssets and Validation +************************** +In addition to specifying Expectations, Great Expectations also allows you to validate your data against an Expectation Suite. Validation produces a detailed report of how the data meets your expectations -- and where it doesn’t. + +DataAssets and Validations answer the questions: + +- How do I describe my Expectations to Great Expectations? +- Does my data meet my Expectations? + +A **DataAsset** is a Great Expectations object that can create and validate Expectations against specific data. +DataAssets are connected to data. They can evaluate Expectations wherever you access your data, using Pandas, Spark, or SqlAlchemy. + +An **Expectation Validation Result** captures the output of checking an expectation against data. It describes whether +the data met the expectation, and additional metrics from the data such as the percentage of unique values or observed mean. + +An **Expectation Suite Validation Result** combines multiple Expectation Validation Results and metadata about the +validation into a single report. + +************************** +Datasources and Batches +************************** + +Great Expectations lets you focus on your data, not writing tests. It validates your expectations no matter where the data is located. + +Datasources, Generators, Batch Parameters and Batch Kwargs make it easier to connect Great Expectations to your data. Together, they address questions such as: + +- How do I get data into my Great Expectations data asset? +- How do I tell my Datasource how to access my specific data? +- How do I use Great Expectations to store Batch Kwargs configurations or logically describe data when I need to build +equivalent Batch Kwargs for different datasources? +- How do I know what data is available from my datasource? + +A **Datasource** is a connection to a compute environment (a backend such as Pandas, Spark, or a SQL-compatible +database) and one or more storage environments. It produces batches of data that Great Expectations can validate in that environment. + +**Batch Kwargs** are specific instructions for a Datasource about what data should be prepared as a “batch” for +validation. The batch could be a specific database table, the most recent log file delivered to S3, or even a subset of one of those objects such as the first 10,000 rows. + +**Batch Parameters** provide instructions for how to retrieve stored Batch Kwargs or build new Batch Kwargs that reflect +partitions, deliveries, or slices of logical data assets. + +A **Batch Kwargs Generator** translates Batch Parameters to datasource-specific Batch Kwargs. A Batch Kwargs Generator +can also identify data assets and partitions by inspecting a storage environment. + +************************** +Profiling +************************** +Profiling helps you understand your data by describing it and even building expectation suites based on previous batches of data. Profiling lets you ask: + +- What is this dataset like? + +A **Profiler** reviews data assets and produces new Expectation Suites and Expectation Suite Validation Results that +describe the data.
A profiler can create a “stub” of high-level expectations based on what it sees in the data. Profilers can also be extended to create more specific expectations based on team conventions or statistical properties. Finally, Profilers can take advantage of metrics produced by Great Expectations when validating data to create useful overviews of data. + +************************** +Data Docs +************************** + +With Great Expectations, your tests can update your docs, and your docs can validate your data. Data Docs makes it possible to produce clear visual descriptions of what you expect, what you observe, and how they differ, answering the question: does my data meet my expectations? + +An **Expectation Suite Renderer** creates a page that shows what you expect from data. Its language is prescriptive, for +example translating a fully-configured expect_column_values_to_not_be_null expectation into “column ‘address’ values must not be null, at least 80% of the time.” + +A **Validation Result Renderer** produces an overview of the result of validating a batch of data with an Expectation +Suite. It shows the difference between observed and expected values. + +A **Profiling Renderer** details the observed metrics produced from a validation without comparing them to +specific expected values. It provides a detailed look into what Great Expectations learned about your data. + +************************** +Data Context +************************** + +A **Data Context** stitches together all the features available with Great Expectations, making it possible to easily +manage configurations for datasources and data docs sites, and to store expectation suites and validations. Data Contexts also unlock more powerful features such as Evaluation Parameter Stores. + + +A **Data Context Configuration** is a yaml file that can be committed to source control to ensure that all the settings +related to your validation are appropriately versioned and visible to your team. It can flexibly describe plugins and other customizations for accessing datasources or building data docs sites. + + +A **Store** allows you to manage access to Expectations, Validations and other Great Expectations assets in a +standardized way, making it easy to share resources across a team that uses AWS, Azure, GCP, local storage, or something else entirely. + +A **Metric** is simply a value produced by Great Expectations when evaluating one or more batches of data, such as an +observed mean or distribution of data. + +An **Evaluation Parameter Store** makes it possible to build expectation suites that depend on values from other batches +of data, such as ensuring that the number of rows in a downstream dataset equals the number of unique values from an upstream one. A Data Context can manage a store to facilitate that validation scenario. + +************************** +Validation Operators +************************** + +A **Validation Operator** stitches together resources provided by the Data Context to build mini-programs that +demonstrate the full potential of Great Expectations. They take configurable Actions such as updating Data Docs, sending a notification to your team about validation results, or storing a result in a shared S3 bucket. + diff --git a/docs/reference/implemented_expectations.rst b/docs/reference/implemented_expectations.rst index af6345096d08..77fba4c34b86 --- a/docs/reference/implemented_expectations.rst +++ b/docs/reference/implemented_expectations.rst @@ -99,3 +99,5 @@ out the missing implementations!
+------------------------------------------------------------------------------+------------+---------+-----------+ |`expect_multicolumn_values_to_be_unique` | Y | N | N | +------------------------------------------------------------------------------+------------+---------+-----------+ + +*last updated*: |lastupdate| diff --git a/docs/reference/improving_library_documentation.rst b/docs/reference/improving_library_documentation.rst index 62b7236a4f39..96d72d3a001b --- a/docs/reference/improving_library_documentation.rst +++ b/docs/reference/improving_library_documentation.rst @@ -56,3 +56,4 @@ Resources * We follow the `Sphinx guide for sections `__. +*last updated*: |lastupdate| diff --git a/docs/reference/integrations/bigquery.rst b/docs/reference/integrations/bigquery.rst index b4987fcb9151..51c1adae6239 --- a/docs/reference/integrations/bigquery.rst +++ b/docs/reference/integrations/bigquery.rst @@ -6,7 +6,7 @@ BigQuery To add a BigQuery datasource, do this: -1. Run ``great_expectations add-datasource`` +1. Run ``great_expectations datasource new`` 2. Choose the *SQL* option from the menu. 3. When asked which sqlalchemy driver to use, enter ``bigquery``. 4. Consult the `PyBigQuery `_ diff --git a/docs/reference/metric_reference.rst b/docs/reference/metric_reference.rst new file mode 100644 index 000000000000..9516d5b67491 --- /dev/null +++ b/docs/reference/metric_reference.rst @@ -0,0 +1,24 @@ +.. _metrics_reference: + + +####################### +Metrics Reference +####################### + +Metrics are still a **beta feature** in Great Expectations. Expect changes to the API. + +A Metric is a value that Great Expectations can use to evaluate expectations or to store externally. A metric could +be a statistic, such as the minimum value of the column, or a more complex object, such as a histogram. + +Expectation Validation Results and Expectation Suite Validation Results can expose metrics that are defined by +specific expectations that have been validated, called "Expectation Defined Metrics." In the future, we plan to allow +Expectations to have more control over the metrics that they generate, expose, and use in testing. + +The following examples demonstrate how metrics are defined: + +.. code-block:: python + + res = df.expect_column_values_to_be_in_set("Sex", ["male", "female"]) + res.get_metric("expect_column_values_to_be_in_set.result.missing_count", column="Sex") + +*Last updated:* |lastupdate| diff --git a/docs/reference/migrating_versions.rst b/docs/reference/migrating_versions.rst index f9b7bbd4addb..b40488d2936b --- a/docs/reference/migrating_versions.rst +++ b/docs/reference/migrating_versions.rst @@ -7,7 +7,7 @@ Migrating Between Versions While we are committed to keeping Great Expectations as stable as possible, sometimes breaking changes are necessary to maintain our trajectory. This is especially true as the library has evolved from just a data quality tool to a -slightly more opinionated framework. +more capable framework including data docs and profiling in addition to validation. Great Expectations provides a warning when the currently-installed version is different from the version stored in the expectation suite. @@ -16,15 +16,17 @@ Since expectation semantics are usually consistent across versions, there is little change required when upgrading great expectations, with some exceptions noted here.
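As a brief aside before the migration details: the Metrics Reference example above can be expanded into a self-contained sketch. The data here is invented, and ``get_metric`` is the accessor described in that section.

.. code-block:: python

    import great_expectations as ge
    import pandas as pd

    # Invented data: one null value in the "Sex" column.
    df = ge.from_pandas(pd.DataFrame({"Sex": ["male", "female", None]}))
    res = df.expect_column_values_to_be_in_set("Sex", ["male", "female"])

    # Retrieve an Expectation Defined Metric from the validation result.
    missing = res.get_metric(
        "expect_column_values_to_be_in_set.result.missing_count", column="Sex")
    print(missing)  # expected: 1, for the single null value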
-********************************* -Using the check-config Command -********************************* +*************************************** +Using the project check-config Command +*************************************** To facilitate this substantial config format change, starting with version 0.8.0 -we introduced `check-config` to sanity check your config files. From your +we introduced ``project check-config`` to sanity check your config files. From your project directory, run: ->>> great_expectations check-config +.. code-block:: bash + + great_expectations project check-config This can be used at any time and will grow more robust and helpful as our internal config typing system improves. @@ -33,6 +35,210 @@ You will most likely be prompted to install a new template. Rest assured that your original yaml file will be archived automatically for you. Even so, it's in your source control system already, right? ;-) +************************* +Upgrading to 0.9.x +************************* + +In the 0.9.0 release, there are several changes to the DataContext API. + + +Follow these steps to upgrade your existing Great Expectations project: + +* In the terminal navigate to the parent of the ``great_expectations`` directory of your project. + +* Run this command: + +.. code-block:: bash + + great_expectations project check-config + +* For every item that needs to be renamed the command will display a message that looks like this: ``The class name 'X' has changed to 'Y'``. Replace all occurrences of X with Y in your project's ``great_expectations.yml`` config file. + +* After saving the config file, rerun the check-config command. + +* Depending on your configuration, you will see 3-6 of these messages. + +* The command will display this message when done: ``Your config file appears valid!``. + +* Rename your Expectation Suites to make them compatible with the new naming. Save this Python code snippet in a file called ``update_project.py``, then run it using the command: ``python update_project.py PATH_TO_GE_CONFIG_DIRECTORY``: + +.. code-block:: python + + #!/usr/bin/env python3 + import sys + import os + import json + import uuid + import shutil + def update_validation_result_name(validation_result): + data_asset_name = validation_result["meta"].get("data_asset_name") + if data_asset_name is None: + print(" No data_asset_name in this validation result. Unable to update it.") + return + data_asset_name_parts = data_asset_name.split("/") + if len(data_asset_name_parts) != 3: + print(" data_asset_name in this validation result does not appear to be normalized. Unable to update it.") + return + expectation_suite_suffix = validation_result["meta"].get("expectation_suite_name") + if expectation_suite_suffix is None: + print(" No expectation_suite_name found in this validation result. Unable to update it.") + return + expectation_suite_name = ".".join( + data_asset_name_parts + + [expectation_suite_suffix] + ) + validation_result["meta"]["expectation_suite_name"] = expectation_suite_name + try: + del validation_result["meta"]["data_asset_name"] + except KeyError: + pass + def update_expectation_suite_name(expectation_suite): + data_asset_name = expectation_suite.get("data_asset_name") + if data_asset_name is None: + print(" No data_asset_name in this expectation suite. Unable to update it.") + return + data_asset_name_parts = data_asset_name.split("/") + if len(data_asset_name_parts) != 3: + print(" data_asset_name in this expectation suite does not appear to be normalized. 
Unable to update it.") + return + expectation_suite_suffix = expectation_suite.get("expectation_suite_name") + if expectation_suite_suffix is None: + print(" No expectation_suite_name found in this expectation suite. Unable to update it.") + return + expectation_suite_name = ".".join( + data_asset_name_parts + + [expectation_suite_suffix] + ) + expectation_suite["expectation_suite_name"] = expectation_suite_name + try: + del expectation_suite["data_asset_name"] + except KeyError: + pass + def update_context_dir(context_root_dir): + # Update expectation suite names in expectation suites + expectations_dir = os.path.join(context_root_dir, "expectations") + for subdir, dirs, files in os.walk(expectations_dir): + for file in files: + if file.endswith(".json"): + print("Migrating suite located at: " + str(os.path.join(subdir, file))) + with open(os.path.join(subdir, file), 'r') as suite_fp: + suite = json.load(suite_fp) + update_expectation_suite_name(suite) + with open(os.path.join(subdir, file), 'w') as suite_fp: + json.dump(suite, suite_fp) + # Update expectation suite names in validation results + validations_dir = os.path.join(context_root_dir, "uncommitted", "validations") + for subdir, dirs, files in os.walk(validations_dir): + for file in files: + if file.endswith(".json"): + print("Migrating validation_result located at: " + str(os.path.join(subdir, file))) + try: + with open(os.path.join(subdir, file), 'r') as suite_fp: + suite = json.load(suite_fp) + update_validation_result_name(suite) + with open(os.path.join(subdir, file), 'w') as suite_fp: + json.dump(suite, suite_fp) + try: + run_id = suite["meta"].get("run_id") + es_name = suite["meta"].get("expectation_suite_name").split(".") + filename = "converted__" + str(uuid.uuid1()) + ".json" + os.makedirs(os.path.join( + context_root_dir, "uncommitted", "validations", + *es_name, run_id + ), exist_ok=True) + shutil.move(os.path.join(subdir, file), + os.path.join( + context_root_dir, "uncommitted", "validations", + *es_name, run_id, filename + ) + ) + except OSError as e: + print(" Unable to move validation result; file has been updated to new " + "format but not moved to new store location.") + except KeyError: + pass # error will have been generated above + except json.decoder.JSONDecodeError: + print(" Unable to process file: error reading JSON.") + if __name__ == "__main__": + if len(sys.argv) < 2: + print("Please provide a path to update.") + sys.exit(-1) + path = str(os.path.abspath(sys.argv[1])) + print("About to update context dir for path: " + path) + update_context_dir(path) + +* Rebuild Data Docs: + +.. code-block:: bash + + great_expectations docs build + +* This project has now been migrated to 0.9.0. Please see the list of changes below for more detailed information. + + +CONFIGURATION CHANGES: + +- FixedLengthTupleXXXX stores are renamed to TupleXXXX stores; they no + longer require (or allow) a key_length to be specified, but they + do allow `filepath_prefix` and/or `filepath_suffix` to be configured as an + alternative to the `filepath_template`. +- ExtractAndStoreEvaluationParamsAction is renamed to + StoreEvaluationParametersAction; a new StoreMetricsAction is available as + well to allow DataContext-configured metrics to be saved. +- The InMemoryEvaluationParameterStore is replaced with the + EvaluationParameterStore; EvaluationParameterStore and MetricsStore can + both be configured to use DatabaseStoreBackend instead of the + InMemoryStoreBackend.
+- The `type` key can no longer be used in place of `class_name` in + configuration. Use `class_name` instead. +- BatchKwargsGenerators are more explicitly named; we avoid use of the term + "Generator" because it is ambiguous. All existing BatchKwargsGenerators have + been renamed by substituting "BatchKwargsGenerator" for "Generator"; for + example GlobReaderGenerator is now GlobReaderBatchKwargsGenerator. +- ReaderMethod is no longer an enum; it is a string of the actual method to + be invoked (e.g. `read_csv` for pandas). That change makes it easy to + specify arbitrary reader_methods via batch_kwargs (including read_pickle), + BUT existing configurations using enum-based reader_method in batch_kwargs + will need to update their code. For example, a pandas datasource would use + `reader_method: read_csv` instead of `reader_method: csv`. + +CODE CHANGES: + +- DataAssetName and name normalization have been completely eliminated, which + causes several related changes to code using the DataContext. + + - data_asset_name is **no longer** a parameter in the + create_expectation_suite, get_expectation_suite, or get_batch commands; + expectation suite names exist in an independent namespace. + - batch_kwargs alone now define the batch to be received, and the + datasource name **must** be included in batch_kwargs as the "datasource" + key. + - **A generator name is therefore no longer required to get data or define + an expectation suite.** + - The BatchKwargsGenerators API has been simplified; `build_batch_kwargs` + should be the entrypoint for all cases of using a generator to get + batch_kwargs, including when explicitly specifying a partition, limiting + the number of returned rows, accessing saved kwargs, or using any other + BatchKwargsGenerator feature. BatchKwargsGenerators *must* be attached to + a specific datasource to be instantiated. + - This tutorial uses the latest API for validating data: :ref:`tutorial_validate_data` + +- **Database store tables are not compatible** between versions and require a + manual migration; the new default table names are: `ge_validations_store`, + `ge_expectations_store`, `ge_metrics`, and `ge_evaluation_parameters`. The + Validations Store uses a three-part compound primary key consisting of + run_id, expectation_suite_name, and batch_identifier; Expectations Store + uses the expectation_suite_name as its only key. Both Metrics and + Evaluation Parameters stores use `run_id`, `expectation_suite_name`, + `metric_id`, and `metric_kwargs_id` to form a compound primary key. +- The term "batch_fingerprint" is no longer used, and has been replaced with + "batch_markers". It is a dictionary that, like batch_kwargs, can be used to + construct an ID. +- `get_data_asset_name` and `save_data_asset_name` are removed. +- There are numerous behind-the-scenes changes to the internal types used in + GreatExpectations. These should be transparent to users. + + ************************* Upgrading to 0.8.x ************************* @@ -64,7 +270,7 @@ BREAKING: - InMemoryBatchKwargs use the key dataset instead of df to be more explicit -Pre-0.8.x configuration files ``great_expectations.yml`` are not compatible with 0.8.x.
Run ``great_expectations project check-config`` - it will offer to create a new config file. The new config file will not have any customizations you made, so you will have to copy these from the old file. If you run into any issues, please ask for help on `Slack `__. diff --git a/docs/reference/profiling_reference.rst b/docs/reference/profiling_reference.rst index 9ef036a3e0b2..450a45595573 --- a/docs/reference/profiling_reference.rst +++ b/docs/reference/profiling_reference.rst @@ -5,6 +5,55 @@ Profiling Reference ############################## +Profiling produces a special kind of :ref:`data_docs` that is purely descriptive. + +**************************** +Expectations and Profiling +**************************** + +In order to characterize a data asset, Profiling uses an Expectation Suite. Unlike the Expectations that are +typically used for data validation, these expectations do not necessarily apply any constraints; they can simply +identify statistics or other data characteristics that should be evaluated and made available in GE. For example, when +the ``BasicDatasetProfiler`` encounters a numeric column, it will add an ``expect_column_mean_to_be_between`` +expectation but choose the min_value and max_value to both be None: essentially only saying that it expects a mean +to exist. + +.. code-block:: json + + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "rating", + "min_value": null, + "max_value": null + } + } + +To "profile" a datasource, therefore, the :class:`~great_expectations.profile.basic_dataset_profiler.\ +BasicDatasetProfiler` included in GE will generate a large number of very loosely-specified expectations. Effectively +it is asserting that the given statistic is relevant for evaluating batches of that data asset, but it is not yet sure +what the statistic's value should be. + +In addition to creating an expectation suite, profiling validates the suite against the data. +The validation_result contains the output of that expectation suite when validated against the same batch of data. +For a loosely specified expectation like in our example above, getting the observed value was the sole purpose of +the expectation. + +.. code-block:: json + + { + "success": true, + "result": { + "observed_value": 4.05, + "element_count": 10000, + "missing_count": 0, + "missing_percent": 0 + } + } + +Running a profiler on a data asset can also be useful to produce a large number of expectations to review +and potentially transfer to a new expectation suite used for validation in a pipeline. + ********************** How to Run Profiling ********************** @@ -12,30 +61,29 @@ How to Run Profiling Run During Init =================== -The ``great_expectations init`` command offers to profile a newly added datasource. If you agree, data assets in that -datasource will be profiled (e.g., tables in the database). By default the profiler will select the first 20 data -assets. +The ``great_expectations init`` command will auto-generate an example Expectation Suite using a very basic profiler that +quickly glances at 1,000 rows of your data. This is not a production suite - it is only meant to show examples +of Expectations, many of which may not be meaningful. -Expectation suites generated by the profiler will be saved in the configured ``expectations`` directory for expectation -suites. The expectation suite name by default is the name of hte profiler that generated it.
Validation results will be -saved in the ``uncommitted/validations`` directory by default; the CLI will then offer to move them to the -``fixtures/validations`` directory from which data documentation is built. +Expectation Suites generated by the profiler will be saved in the configured ``expectations`` directory for Expectation +Suites. The Expectation Suite name by default is the name of the profiler that generated it. Validation results will be +saved in the ``uncommitted/validations`` directory by default. When profiling is complete, Great Expectations will +build and launch Data Docs based on your data. Run From Command Line ======================= -The GE command-line interface can also profile a datasource: +The GE command-line interface can profile a datasource: .. code-block:: bash - great_expectations profile DATASOURCE_NAME + great_expectations datasource profile DATASOURCE_NAME -Just as when running during init, expectation suites generated by the profiler will be saved in the configured -``expectations`` directory for expectation suites. The expectation suite name by default is the name of the profiler +Expectation Suites generated by the profiler will be saved in the configured +``expectations`` directory for Expectation Suites. The Expectation Suite name by default is the name of the profiler that generated it. Validation results will be saved in the ``uncommitted/validations`` directory by default. -The CLI will offer to move resulting validations to the -``fixtures/validations`` directory from which data documentation is built and to regenerate the HTML documentation. +When profiling is complete, Great Expectations will build and launch Data Docs based on your data. See :ref:`data_docs` for more information. @@ -67,8 +115,8 @@ Custom Profilers ******************* Like most things in Great Expectations, Profilers are designed to be extensible. You can develop your own profiler -by subclassing ``DataetProfiler``, or from the parent ``DataAssetProfiler`` class itself. For help, advice, and ideas -on developing custom profilers, please get in touch on `the Great Expectations slack channel \ `_. +by subclassing ``DatasetProfiler``, or from the parent ``DataAssetProfiler`` class itself. For help, advice, and ideas +on developing custom profilers, please get in touch on `the Great Expectations slack channel\ `_. @@ -89,3 +137,5 @@ Data Samples Since profiling and expectations are so tightly linked, getting samples of *expected* data requires a slightly different approach than the normal path for profiling. Stay tuned for more in this area! + +*last updated*: |lastupdate| diff --git a/docs/reference/standard_arguments.rst b/docs/reference/standard_arguments.rst index 59b1fa396c26..ea964d9717c9 --- a/docs/reference/standard_arguments.rst +++ b/docs/reference/standard_arguments.rst @@ -69,7 +69,10 @@ All Expectations accept a boolean `catch_exceptions` parameter. If true, executi `meta` ------------------------------------------------------------------------------ -All Expectations accept an optional `meta` parameter. If `meta` is a valid JSON-serializable dictionary, it will be passed through to the `expectation_result` object without modification. +All Expectations accept an optional `meta` parameter. If `meta` is a valid JSON-serializable dictionary, it will be \ +passed through to the `expectation_result` object without modification. The `meta` parameter can be used to add \ +helpful markdown annotations to Expectations (shown below).
These Expectation "notes" are rendered within \ +Expectation Suite pages in Data Docs. .. code-block:: bash @@ -77,15 +80,23 @@ All Expectations accept an optional `meta` parameter. If `meta` is a valid JSON- "my_column", ["a", "b", "c"], meta={ - "foo": "bar", - "baz": [1,2,3,4] + "notes": { + "format": "markdown", + "content": [ + "#### These are expectation notes \n - you can use markdown \n - or just strings" + ] + } } ) { "success": False, "meta": { - "foo": "bar", - "baz": [1,2,3,4] + "notes": { + "format": "markdown", + "content": [ + "#### These are expectation notes \n - you can use markdown \n - or just strings" + ] + } } } @@ -165,5 +176,6 @@ In validation mode, they can be overridden using flags: .. code-block:: bash - great_expectations my_dataset.csv my_expectations.json --result_format=BOOLEAN_ONLY --catch_exceptions=False --include_config=True + great_expectations validation csv my_dataset.csv my_expectations.json --result_format=BOOLEAN_ONLY --catch_exceptions=False --include_config=True +*last updated*: |lastupdate| diff --git a/docs/reference/stores_reference.rst b/docs/reference/stores_reference.rst index 05d18814248b..fba332849274 100644 --- a/docs/reference/stores_reference.rst +++ b/docs/reference/stores_reference.rst @@ -8,3 +8,4 @@ Stores Stores require a :ref:`data_context` which manages their creation and configuration. A store provides an abstraction for getting and setting key values in the GE ecosystem. +*last updated*: |lastupdate| diff --git a/docs/reference/supporting_resources.rst b/docs/reference/supporting_resources.rst index c3e23a04a39c..ade38cc5a0aa 100644 --- a/docs/reference/supporting_resources.rst +++ b/docs/reference/supporting_resources.rst @@ -53,3 +53,4 @@ useful-with-pip-install>`__ flag. $ git clone https://github.com/great-expectations/great_expectations.git $ pip install -e great_expectations/ +*last updated*: |lastupdate| diff --git a/docs/reference/usage_statistics.rst b/docs/reference/usage_statistics.rst index f58ca71ca5bf..333893631f36 100644 --- a/docs/reference/usage_statistics.rst +++ b/docs/reference/usage_statistics.rst @@ -8,3 +8,5 @@ Usage Statistics We use CDN fetch rates to get a sense of total community usage of Great Expectations. Specifically, we host images and style sheets on a public CDN and count the number of unique IPs from which resources are fetched. Other than standard web request data, we don’t collect any data data that could be used to identify individual users. You can suppress the images by changing `static_images_dir` in `great_expectations/render/view/templates/top_navbar.j2`. Please reach out on Slack if you have any questions or comments. 
+ +*last updated*: |lastupdate| diff --git a/docs/reference/validation_operators.rst b/docs/reference/validation_operators.rst index f257d28dc009..9535d5e4605e --- a/docs/reference/validation_operators.rst +++ b/docs/reference/validation_operators.rst @@ -11,3 +11,5 @@ Validation Operators /reference/validation_operators/action_list_validation_operator /reference/validation_operators/warning_and_failure_expectation_suites_validation_operator /reference/validation_operators/actions + +*last updated*: |lastupdate| diff --git a/docs/reference/validation_operators/action_list_validation_operator.rst b/docs/reference/validation_operators/action_list_validation_operator.rst index df2c292da74e..c602b3f05d98 --- a/docs/reference/validation_operators/action_list_validation_operator.rst +++ b/docs/reference/validation_operators/action_list_validation_operator.rst @@ -9,7 +9,7 @@ ActionListValidationOperator validates each batch in its `run` method's `assets_ Then it invokes a list of configured actions on every validation result. -Each action in the list must be an instance of NamespacedValidationAction +Each action in the list must be an instance of ValidationAction class (or its descendants). Read more about actions here: :ref:`actions`. The init command includes this operator in the default configuration file. @@ -31,7 +31,7 @@ An instance of ActionListValidationOperator is included in the default configura action_list: - name: store_validation_result action: - class_name: StoreAction + class_name: StoreValidationResultAction target_store_name: validations_store - name: send_slack_notification_on_validation_result action: @@ -60,7 +60,7 @@ This is an example of invoking an instance of a Validation Operator from Python: validation_operator_name="perform_action_list_operator", ) -* `assets_to_validate` - an iterable that specifies the data assets that the operator will validate. The members of the list can be either batches or triples that will allow the operator to fetch the batch: (data_asset_name, expectation_suite_name, batch_kwargs) using this method: :py:meth:`~great_expectations.data_context.ConfigOnlyDataContext.get_batch` +* `assets_to_validate` - an iterable that specifies the data assets that the operator will validate. The members of the list can be either batches or triples that will allow the operator to fetch the batch: (data_asset_name, expectation_suite_name, batch_kwargs) using this method: :py:meth:`~great_expectations.data_context.BaseDataContext.get_batch` * run_id - pipeline run id, a timestamp or any other string that is meaningful to you and will help you refer to the result of this operation later * `validation_operator_name` - the name of an instance of a class that implements a Validation Operator diff --git a/docs/reference/validation_operators/actions.rst b/docs/reference/validation_operators/actions.rst index 9b8e15dc3dae..eef3ec9015bc --- a/docs/reference/validation_operators/actions.rst +++ b/docs/reference/validation_operators/actions.rst @@ -31,10 +31,10 @@ Configuration class_name: SlackRenderer -StoreAction +StoreValidationResultAction --------------------------- -StoreAction is a namespace-aware validation action that stores a validation result +StoreValidationResultAction is a namespace-aware validation action that stores a validation result in the store.
Configuration @@ -44,16 +44,16 @@ - name: store_validation_result action: - class_name: StoreAction + class_name: StoreValidationResultAction # name of the store where the actions will store validation results # the name must refer to a store that is configured in the great_expectations.yml file target_store_name: validations_store -ExtractAndStoreEvaluationParamsAction +StoreEvaluationParametersAction ------------------------------------- -ExtractAndStoreEvaluationParamsAction is a namespace-aware validation action that +StoreEvaluationParametersAction is a namespace-aware validation action that extracts evaluation parameters from a validation result and stores them in the store configured for this action. @@ -67,7 +67,7 @@ Configuration - name: store_evaluation_params action: - class_name: ExtractAndStoreEvaluationParamsAction + class_name: StoreEvaluationParametersAction # name of the store where the action will store the parameters # the name must refer to a store that is configured in the great_expectations.yml file target_store_name: evaluation_parameter_store @@ -93,5 +93,7 @@ Configuration Dependencies ~~~~~~~~~~~~ -When configured inside action_list of an operator, StoreAction action has to be configured before this action, +When configured inside action_list of an operator, the StoreValidationResultAction has to be configured before this action, since the building of data docs fetches validation results from the store. + +*last updated*: |lastupdate| diff --git a/docs/reference/validation_operators/warning_and_failure_expectation_suites_validation_operator.rst b/docs/reference/validation_operators/warning_and_failure_expectation_suites_validation_operator.rst index a707ff832fad..c05c30ca82f4 --- a/docs/reference/validation_operators/warning_and_failure_expectation_suites_validation_operator.rst +++ b/docs/reference/validation_operators/warning_and_failure_expectation_suites_validation_operator.rst @@ -12,7 +12,7 @@ The "failure" expectation suite contains expectations that are considered import WarningAndFailureExpectationSuitesValidationOperator retrieves the two expectation suites ("failure" and "warning") for every data asset in the `assets_to_validate` argument of its `run` method. It does not require both suites to be present. The operator invokes a list of actions on every validation result. The list is configured for the operator. -Each action in the list must be an instance of NamespacedValidationAction +Each action in the list must be an instance of ValidationAction class (or its descendants). Read more about actions here: :ref:`actions`. After completing all the validations, it sends a Slack notification with the success status. @@ -51,11 +51,11 @@ Below is an example of this operator's configuration: action_list: - name: store_validation_result action: - class_name: StoreAction + class_name: StoreValidationResultAction target_store_name: validations_store - name: store_evaluation_params action: - class_name: ExtractAndStoreEvaluationParamsAction + class_name: StoreEvaluationParametersAction target_store_name: evaluation_parameter_store @@ -72,7 +72,7 @@ This is an example of invoking an instance of a Validation Operator from Python: validation_operator_name="operator_instance_name", ) -* `assets_to_validate` - an iterable that specifies the data assets that the operator will validate.
The members of the list can be either batches or triples that will allow the operator to fetch the batch: (data_asset_name, expectation_suite_name, batch_kwargs) using this method: :py:meth:`~great_expectations.data_context.ConfigOnlyDataContext.get_batch` +* `assets_to_validate` - an iterable that specifies the data assets that the operator will validate. The members of the list can be either batches or triples that will allow the operator to fetch the batch: (data_asset_name, expectation_suite_name, batch_kwargs) using this method: :py:meth:`~great_expectations.data_context.BaseDataContext.get_batch` * run_id - pipeline run id, a timestamp or any other string that is meaningful to you and will help you refer to the result of this operation later * `validation_operator_name` - the name of an instance of a class that implements a Validation Operator @@ -98,3 +98,5 @@ The value of "success" is True if no critical expectation suites ("failure") fai } } } + +*last updated*: |lastupdate| diff --git a/docs/requirements.txt b/docs/requirements.txt index 156eda004d79..09b133d07d0f --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,5 @@ sphinx>=2.1 +sybil>=1.2.1 sphinx_rtd_theme>=0.4.3 sphinxcontrib-contentui>=0.2.4 +sphinxcontrib-lastupdate>=1.1 \ No newline at end of file diff --git a/docs/roadmap_changelog.rst b/docs/roadmap_changelog.rst index fbf3184ef9ef..37d72c7a1c1f --- a/docs/roadmap_changelog.rst +++ b/docs/roadmap_changelog.rst @@ -22,3 +22,5 @@ Changelog :maxdepth: 2 /changelog/changelog + +*last updated*: |lastupdate| diff --git a/docs/tutorials.rst b/docs/tutorials.rst index e76c3d5a3e69..96313e5749f3 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -13,5 +13,6 @@ This is a collection of tutorials that walk you through a variety of useful Grea /tutorials/create_expectations /tutorials/validate_data /tutorials/publishing_data_docs_to_s3 + /tutorials/saving_metrics - +*last updated*: |lastupdate| diff --git a/docs/tutorials/create_expectations.rst b/docs/tutorials/create_expectations.rst index c1991bd0502c..be444d8734a8 --- a/docs/tutorials/create_expectations.rst +++ b/docs/tutorials/create_expectations.rst @@ -3,262 +3,213 @@ Create Expectations ============================== -Creating expectations is an opportunity to blend contextual knowledge from subject-matter experts and insights from -profiling and performing exploratory analysis on your dataset. This tutorial covers creating expectations for a data asset using a Jupyter notebook. +This tutorial covers the workflow of creating and editing expectations. -Video ------ +The tutorial assumes that you have created a new Data Context (project), as covered here: :ref:`tutorial_init`. -Want to watch a video walkthrough of this tutorial? `James `_ (one of the original core contributors to Great Expectations) walks you through this tutorial in a `video on YouTube `_. +Creating expectations is an opportunity to blend contextual knowledge from subject-matter experts and insights from +profiling and performing exploratory analysis on your dataset. -0. Open Jupyter Notebook ------------------------- +Once the initial setup of Great Expectations is complete, the workflow looks like a loop over the following steps: -This tutorial assumes that: +1. Data team members capture and document their shared understanding of their data as expectations. +2. As new data arrives in the pipeline, Great Expectations evaluates it against these expectations. +3.
If the observed properties of the data are found to be different from the expected ones, the team responds by rejecting (or fixing) the data, updating the expectations, or both. -* you ran ``great_expectations init`` and completed the steps covered in the previous tutorial: :ref:`tutorial_init`. -* your current directory is the root of the project where you ran ``great_expectations init`` +For a broader understanding of the typical workflow read this article: :ref:`typical_workflow`. -You can either follow the tutorial with the dataset that it uses or you can execute the same steps on your project with your own data. -If you get stuck, find a bug or want to ask a question, go to `our Slack `_ - this is the best way to get help from the contributors and other users. +Expectations are grouped into Expectation Suites. An Expectation Suite combines multiple expectations into an overall description of a dataset. For example, a team can group all the expectations about the ``rating`` table in the movie ratings database into an Expectation Suite and call it "movieratings.table.expectations". +Each Expectation Suite is saved as a JSON file in the ``great_expectations/expectations`` subdirectory of the Data Context. Users check these files into version control each time they are updated, in the same way they treat their source files. The ``great_expectations init`` command created a ``great_expectations/notebooks/`` folder in your project. The folder contains example notebooks for pandas, Spark and SQL datasources. +The lifecycle of an Expectation Suite starts with creating it. Then it goes through a loop of Review and Edit as the team's understanding of the data described by the suite evolves. -If you are following this tutorial using the NPI dataset, open the pandas notebook. If you are working with a different dataset, follow along in the notebook with instructions tailored to your datasource: +We will describe the Create, Review and Edit steps in brief: -.. content-tabs:: +Create an Expectation Suite +---------------------------------------- - .. tab-container:: tab0 - :title: pandas - .. code-block:: bash +Expectation Suites are saved as JSON files, so you *could* create a new suite by writing a file directly. However, the preferred way is to let the CLI save you time and typos. If you cannot use the CLI in your environment (e.g., in a Databricks cluster), you can create and edit an Expectation Suite in a notebook. Jump to this section for details: :ref:`Jupyter Notebook for Creating and Editing Expectation Suites`. - jupyter notebook great_expectations/notebooks/pandas/create_expectations.ipynb +To continue with the CLI, run this command in the root directory of your project (where the init command created the ``great_expectations`` subdirectory): - .. tab-container:: tab1 - :title: pyspark - .. code-block:: bash +.. code-block:: bash - jupyter notebook great_expectations/notebooks/spark/create_expectations.ipynb + great_expectations suite new - .. tab-container:: tab2 - :title: SQLAlchemy - .. code-block:: bash +This command prompts you to name your new Expectation Suite and to select a sample batch of the dataset the suite will describe. Then it profiles the selected sample and adds some initial expectations to the suite. The purpose of these expectations is to provide examples of what properties of data can be described using Great Expectations. They are only a starting point that the user builds on.
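For environments where the CLI is unavailable (such as the Databricks case mentioned above), the same outcome can be sketched in code. This is a minimal sketch assuming an already-configured project; the datasource, generator, and asset names below are hypothetical.

.. code-block:: python

    import great_expectations as ge

    # Load the project configuration from great_expectations.yml.
    context = ge.data_context.DataContext()
    suite_name = "movieratings.table.expectations"
    context.create_expectation_suite(suite_name)

    # Get a batch to develop expectations against; names are hypothetical.
    batch = context.get_batch(
        batch_kwargs=context.build_batch_kwargs(
            "my_datasource", "my_generator", "my_asset"),
        expectation_suite_name=suite_name,
    )
    batch.expect_table_row_count_to_be_between(min_value=1)
    batch.save_expectation_suite()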
- jupyter notebook great_expectations/notebooks/sql/create_expectations.ipynb
+The command concludes by saving the newly generated Expectation Suite as a JSON file and rendering the expectation suite into an HTML page in the Data Docs website of the Data Context.
-1. Get a DataContext Object
---------------------------
-A DataContext represents a Great Expectations project. It organizes datasources, notification settings, data documentation sites, and storage and access for expectation suites and validation results.
-The DataContext is configured via a yml file stored in a directory called ``great_expectations``.
-This entire directory, which includes configuration files as well as expectation suites, should be stored in version control.
+Review an Expectation Suite
+----------------------------------------
-Instantiating a DataContext loads your project configuration and all its resources.
+:ref:`Data Docs` is a feature of Great Expectations that creates data documentation by compiling expectations and validation results into HTML.
+Data Docs produces a visual data quality report of what you expect from your data, and how the observed properties of your data differ from your expectations.
+It helps to keep your entire team on the same page as data evolves.
+Reviewing expectations is best done in Data Docs:
-::
+.. image:: ../images/sample_e_s_view.png
- context = ge.data_context.DataContext()
+Edit an Expectation Suite
+----------------------------------------
-To read more about DataContext, see: :ref:`data_context`
+The best interface for editing an Expectation Suite is a Jupyter notebook.
+Editing an Expectation Suite means adding expectations, removing expectations, and modifying the arguments of existing expectations.
+For every expectation type there is a Python method that sets its arguments, evaluates this expectation against a sample batch of data and adds it to the Expectation Suite.
-2. List Data Assets
-------------------
-A Data Asset is data you can describe with expectations.
+Take a look at the screenshot below. It shows the HTML view and the Python method for the same expectation (``expect_column_distinct_values_to_be_in_set``) side by side:
-.. content-tabs::
+.. image:: ../images/exp_html_python_side_by_side.png
- .. tab-container:: tab0
- :title: pandas
+The CLI provides a command that, given an Expectation Suite, generates a Jupyter notebook to edit it. It takes care of generating a cell for every expectation in the suite and of getting a sample batch of data. The HTML page for each Expectation Suite shows the exact CLI command syntax, to make this easier for users.
.. image:: ../images/edit_e_s_popup.png
- A Pandas datasource generates data assets from Pandas DataFrames or CSV files. In this example the pipeline processes NPI data that it reads from CSV files in the ``npidata`` directory into Pandas DataFrames. This is the data you want to describe with expectations. That directory and its files form a data asset, named "npidata" (based on the directory name).
+The generated Jupyter notebook can be discarded after editing, since the CLI can regenerate it at any time.
- .. tab-container:: tab1
- :title: pyspark
+To understand this auto-generated notebook in more depth, jump to this section: :ref:`Jupyter Notebook for Creating and Editing Expectation Suites`.
- A Spark datasource generates data assets from Spark DataFrames or CSV files. The data loaded into a data asset is the data you want to describe and specify with expectations. If this example read CSV files in a directory called ``npidata`` into a Spark DataFrame, the resulting data asset would be called "npidata" based on the directory name.
- .. tab-container:: tab2
- :title: SQLAlchemy
- A SQLAlchemy datasource generates data assets from tables, views and query results.
- * If the data resided in a table (or view) in a database, it would be accessible as a data asset with the name of that table (or view).
- * If the data did not reside in one table ``npidata`` and, instead, the example pipeline ran an SQL query that fetched the data (probably from multiple tables), the result set of that query would be accessible as a data asset. The name of this data asset would be up to us (e.g., "npidata" or "npidata_query").
+Jupyter Notebook for Creating and Editing Expectation Suites
+------------------------------------------------------------
+If you used the CLI `suite new` command to create an Expectation Suite and then the `suite edit` command to edit it, the CLI generated a notebook in the ``great_expectations/uncommitted/`` folder for you. There is no need to check this notebook into version control. Next time you decide to
+edit this Expectation Suite, use the CLI again to generate a new notebook that reflects the expectations in the suite at that time.
-Use this convenience method to list all data assets and expectation suites in your project (using the `DataContext`).
-
-.. code-block:: python
+If you do not use the CLI, create a new notebook in the ``great_expectations/notebooks/`` folder in your project.
- great_expectations.jupyter_ux.list_available_data_asset_names(context)
-
-The output looks like this:
-
-.. image:: ../images/list_data_assets.png
- :width: 600px
-
-``npidata`` is the short name of the data asset. Full names of data assets in a DataContext consist of three parts, for example: ``data__dir/default/npidata``. You don't need to know (yet) how the namespace is managed and the exact meaning of each part. The :ref:`data_context` article describes this in detail.
-
-
-3. Pick a data asset and set the expectation suite name
--------------------------------------------------------
-
-The ``normalize_data_asset_name`` method converts the short name of a data asset to a full name:
-
-.. code-block:: python
-
- data_asset_name = "npidata"
- normalized_data_asset_name = context.normalize_data_asset_name(data_asset_name)
- normalized_data_asset_name
+1. Setup
+********************************************
.. code-block:: python
- expectation_suite_name = "warning"
+    import datetime
+    import great_expectations as ge
+    import great_expectations.jupyter_ux
+    from great_expectations.data_context.types.resource_identifiers import ValidationResultIdentifier
-4. Create a new empty expectation suite
---------------------------------------
-
-Individual Expectations are organized into expectation suites. We recommend 'warning' or 'default' as the name
-for a first expectation suite associated with a data asset.
-
-Let's create a new empty suite in our project so we can start writing Expectations!
-
-.. code-block:: python
+    # Data Context is a GE object that represents your project.
+    # Your project's great_expectations.yml contains all the config
+    # options for the project's GE Data Context.
+    context = ge.data_context.DataContext()
- context.create_expectation_suite(data_asset_name=data_asset_name,
- expectation_suite_name=expectation_suite_name)
+    # Create a new empty Expectation Suite
+    # and give it a name
+    expectation_suite_name = "ratings.table.warning"  # this is just an example
+    context.create_expectation_suite(
+        expectation_suite_name)
If an expectation suite with this name already exists for this data_asset, you will get an error. If you would like to overwrite this expectation suite, set ``overwrite_existing=True``.
-5. Load a batch of data to create Expectations
-----------------------------------------------
-
-Expectations describe data assets. Data assets are composed of batches. Validation checks expectations against a batch of data.
-
-For example, a batch could be the most recent day of log data. For a database table, a batch could be the data in that table at a particular time.
-
-To create expectations about a data asset you will load a batch of data as a Great Expectations :class:`Dataset ` and then call expectation methods.
-
-The DataContext's ``get_batch`` method is used to load a batch of a data asset:
-
-.. code-block:: python
-
- batch = context.get_batch(normalized_data_asset_name,
- expectation_suite_name,
- batch_kwargs)
-
+2. Load a batch of data to create Expectations
+**********************************************
-Calling this method asks the Context to get a batch of data from the data asset ``normalized_data_asset_name`` and attach the expectation suite ``expectation_suite_name`` to it. The ``batch_kwargs`` argument specifies which batch of the data asset should be loaded.
+Select a sample batch of the dataset the suite will describe.
-If you have no preference as to which batch of the data asset should be loaded, use the ``yield_batch_kwargs`` method on the data context:
-
-.. code-block:: python
-
- batch_kwargs = context.yield_batch_kwargs(data_asset_name)
-
-This is most likely sufficient for the purpose of this tutorial.
+``batch_kwargs`` provide detailed instructions that tell the datasource how to construct a batch. Each datasource accepts different types of ``batch_kwargs``:
+.. content-tabs::
-.. toggle-header::
- :header: **Click here to learn how to specify batch_kwargs for fetching a particular batch**
+ .. tab-container:: tab0
+ :title: pandas
- ``batch_kwargs`` provide detailed instructions for the datasource how to construct a batch. Each datasource accepts different types of ``batch_kwargs``:
+ A pandas datasource can accept ``batch_kwargs`` that describe either a path to a file or an existing DataFrame. For example, if the data asset is a collection of CSV files in a folder that are processed with Pandas, then a batch could be one of these files. Here is how to construct ``batch_kwargs`` that specify a particular file to load:
- .. content-tabs::
+ .. code-block:: python
- .. tab-container:: tab0
- :title: pandas
+ batch_kwargs = {'path': "PATH_OF_THE_FILE_YOU_WANT_TO_LOAD"}
- A pandas datasource can accept ``batch_kwargs`` that describe either a path to a file or an existing DataFrame. For example, if the data asset is a collection of CSV files in a folder that are processed with Pandas, then a batch could be one of these files. Here is how to construct ``batch_kwargs`` that specify a particular file to load:
+ To instruct ``get_batch`` to read CSV files with specific options (e.g., not to interpret the first line as the
+ header or to use a specific separator), add them to the ``batch_kwargs`` under the "reader_options" key.
- .. code-block:: python + See the complete list of options for `Pandas read_csv `__. - batch_kwargs = {'path': "PATH_OF_THE_FILE_YOU_WANT_TO_LOAD"} + ``batch_kwargs`` might look like the following: - To instruct ``get_batch`` to read CSV files with specific options (e.g., not to interpret the first line as the - header or to use a specific separator), add them to the the ``batch_kwargs``. + .. code-block:: json - See the complete list of options for `Pandas read_csv `__. + { + "path": "/data/npidata/npidata_pfile_20190902-20190908.csv", + "reader_options": { + "sep": "|" + } + } - ``batch_kwargs`` might look like the following: + | + If you already loaded the data into a Pandas DataFrame called `df`, you could use following ``batch_kwargs`` to instruct the datasource to use your DataFrame as a batch: - .. code-block:: json + .. code-block:: python - { - "path": "/data/npidata/npidata_pfile_20190902-20190908.csv", - "partition_id": "npidata_pfile_20190902-20190908", - "sep": null, - "engine": "python" - } + batch_kwargs = {'dataset': df} - | - If you already loaded the data into a Pandas DataFrame, here is how you construct ``batch_kwargs`` that instruct the datasource to use your dataframe as a batch: + .. tab-container:: tab1 + :title: pyspark - .. code-block:: python + A pyspark datasource can accept ``batch_kwargs`` that describe either a path to a file or an existing DataFrame. For example, if the data asset is a collection of CSV files in a folder that are processed with Pandas, then a batch could be one of these files. Here is how to construct ``batch_kwargs`` that specify a particular file to load: - batch_kwargs = {'df': "YOUR_PANDAS_DF"} + .. code-block:: python - .. tab-container:: tab1 - :title: pyspark + batch_kwargs = {'path': "PATH_OF_THE_FILE_YOU_WANT_TO_LOAD"} - A pyspark datasource can accept ``batch_kwargs`` that describe either a path to a file or an existing DataFrame. For example, if the data asset is a collection of CSV files in a folder that are processed with Pandas, then a batch could be one of these files. Here is how to construct ``batch_kwargs`` that specify a particular file to load: + To instruct ``get_batch`` to read CSV files with specific options (e.g., not to interpret the first line as the + header or to use a specific separator), add them to the ``batch_kwargs`` under the "reader_options" key. - .. code-block:: python + See the complete list of options for `Spark DataFrameReader `__ - batch_kwargs = {'path': "PATH_OF_THE_FILE_YOU_WANT_TO_LOAD"} + .. tab-container:: tab2 + :title: SQLAlchemy - To instruct ``get_batch`` to read CSV files with specific options (e.g., not to interpret the first line as the - header or to use a specific separator), add them to the the ``batch_kwargs``. + A SQLAlchemy datasource can accept ``batch_kwargs`` that instruct it load a batch from a table, a view, or a result set of a query: - See the complete list of options for `Spark DataFrameReader `__ + If you would like to validate an entire table (or a view) in your database's default schema: - .. tab-container:: tab2 - :title: SQLAlchemy + .. code-block:: python - A SQLAlchemy datasource can accept ``batch_kwargs`` that instruct it load a batch from a table, a view, or a result set of a query: + batch_kwargs = {'table': "YOUR TABLE NAME"} - If you would like to validate an entire table (or a view) in your database's default schema: + If you would like to validate an entire table or view from a non-default schema in your database: - .. code-block:: python + .. 
code-block:: python - batch_kwargs = {'table': "YOUR TABLE NAME"} + batch_kwargs = {'table': "YOUR TABLE NAME", "schema": "YOUR SCHEMA"} - If you would like to validate an entire table or view from a non-default schema in your database: + If you would like to validate using a query to construct a temporary table: - .. code-block:: python + .. code-block:: python - batch_kwargs = {'table': "YOUR TABLE NAME", "schema": "YOUR SCHEMA"} + batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE'} - If you would like to validate using a query to construct a temporary table: - .. code-block:: python - batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE'} +The DataContext's ``get_batch`` method is used to load a batch of a data asset: +.. code-block:: python - The examples of ``batch_kwargs`` above can also be the outputs of "generators" used by Great Expectations. You can read about the default Generators' behavior and how to implement additional generators in this article: :ref:`batch_generator`. + batch = context.get_batch(batch_kwargs, expectation_suite_name) + batch.head() +Calling this method asks the Context to get a batch of data and attach the expectation suite ``expectation_suite_name`` to it. The ``batch_kwargs`` argument specifies which batch of the data asset should be loaded. | -Now you have the contents of one of the files loaded as batch of the data asset ``data__dir/default/npidata``. - -6. Author Expectations ------------------------ +3. Author Expectations +******************************************** Now that you have a batch of data, you can call ``expect`` methods on the data asset in order to check whether this expectation is true for this batch of data. @@ -306,34 +257,9 @@ How do I know which types of expectations I can add? :width: 400px -7. Review and save your Expectations ------------------------------------- -.. image:: ../images/get_expectation_suite_output.png - -.. code-block:: python - - batch.save_expectation_suite() - -The ``expectations_store`` attribute in the ``great_expectations.yml`` configuration file controls the location where the DataContext saves the expectation suite. - -When you call ``get_expectation_suite``, you might see this warning in the output: - -.. image:: ../images/failing_expectations_warning.png - -That is produced since, by default, GE will drop any expectation that was not successful on its last run. - -Sometimes, you may want to save an expectation even though it did not validate successfully on the current batch (e.g., you -have a reason to believe that the expectation is correct and the current batch has bad entries). In this case, pass -an additional argument to the ``save_expectation_suite`` method: - -.. code-block:: python - - batch.save_expectation_suite(discard_failed_expectations=False) - - -8. View the Expectations in Data Docs -------------------------------------- +4. Finalize +******************************************** Data Docs compiles Expectations and Validations into HTML documentation. By default the HTML website is hosted on your local filesystem. When you are working in a team, the website can be hosted in the cloud (e.g., on S3) and serve as the shared source of truth for the team working on the data pipeline. @@ -341,15 +267,24 @@ To view the expectation suite you just created as HTML, rebuild the data docs an .. code-block:: python - context.build_data_docs() - context.open_data_docs() - -Read more about the capabilities and configuration of Data Docs here: :ref:`data_docs`. 
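+    # Step 3 added expectations by calling ``expect_*`` methods on the batch; a
+    # hypothetical example (the column name is a placeholder) would look like:
+    #
+    #     batch.expect_column_values_to_not_be_null("YOUR_COLUMN")
+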
+    # save the Expectation Suite (by default to a JSON file in great_expectations/expectations folder)
+    batch.save_expectation_suite(discard_failed_expectations=False)
+    # This step is optional, but useful - evaluate the expectations against the current batch of data
+    run_id = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")
+    results = context.run_validation_operator("action_list_operator", assets_to_validate=[batch], run_id=run_id)
+    expectation_suite_identifier = list(results["details"].keys())[0]
+    validation_result_identifier = ValidationResultIdentifier(
+        expectation_suite_identifier=expectation_suite_identifier,
+        batch_identifier=batch.batch_kwargs.to_id(),
+        run_id=run_id
+    )
+
+    # Update the Data Docs site to display the new Expectation Suite
+    # and open the site in the browser
+    context.build_data_docs()
+    context.open_data_docs(validation_result_identifier)
-
-Congratulations!
-----------------
-Now you you know the basics of creating expectations.
-What is next? This is a collection of tutorials that walk you through a variety of useful Great Expectations workflows: :ref:`tutorials`.
+*last updated*: |lastupdate|
diff --git a/docs/tutorials/publishing_data_docs_to_s3.rst b/docs/tutorials/publishing_data_docs_to_s3.rst
index 9d38656fde6d..ce9371239cf2 100644
--- a/docs/tutorials/publishing_data_docs_to_s3.rst
+++ b/docs/tutorials/publishing_data_docs_to_s3.rst
@@ -24,7 +24,7 @@ Configuring data docs requires three simple steps:
Configure your bucket policy to enable appropriate access. **IMPORTANT**: your policy should provide access only to
appropriate users; data-docs can include critical information about raw data and should generally **not** be
-publicly accessible. The example policy below **enforces IP-based access** access.
+publicly accessible. The example policy below **enforces IP-based access**.
**Modify the bucket name and IP addresses below for your situation.**
@@ -70,12 +70,12 @@ Modify the policy above and save it to a file called `ip-policy.json` in your lo
local_site:
class_name: SiteBuilder
store_backend:
- class_name: FixedLengthTupleFilesystemStoreBackend
+ class_name: TupleFilesystemStoreBackend
base_directory: uncommitted/data_docs/local_site/
s3_site:
class_name: SiteBuilder
store_backend:
- class_name: FixedLengthTupleS3StoreBackend
+ class_name: TupleS3StoreBackend
bucket: data-docs.my_org # UPDATE the bucket name here to match the bucket you configured above.
# ... additional configuration below
@@ -84,7 +84,7 @@ Modify the policy above and save it to a file called `ip-policy.json` in your lo
.. code-block:: bash
- > great_expectations build-docs
+ > great_expectations docs build
Building...
You're now ready to visit the site! Your site will be available at the following URL:
@@ -105,3 +105,5 @@ For more information on static site hosting in AWS, see the following:
- `AWS Website Hosting `_
- `AWS Static Site Access Permissions `_
- `AWS Website configuration `_
+
+*last updated*: |lastupdate|
diff --git a/docs/tutorials/saving_metrics.rst b/docs/tutorials/saving_metrics.rst
new file mode 100644
index 000000000000..a90f9d121ac1
--- /dev/null
+++ b/docs/tutorials/saving_metrics.rst
@@ -0,0 +1,80 @@
+.. _saving_metrics:
+
+###############
+Saving Metrics
+###############
+
+Saving metrics during Validation makes it easy to construct a new data series based on observed
+dataset characteristics computed by Great Expectations. That data series can serve as the source for a dashboard or
+overall data quality metrics, for example.
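+
+As a loose illustration of that idea, a stored metric series could later be pulled into a DataFrame to feed a
+dashboard. This is only a sketch: the connection string, table name, and column names below are assumptions, not the
+actual schema Great Expectations creates.
+
+.. code-block:: python
+
+    import pandas as pd
+    from sqlalchemy import create_engine
+
+    # Hypothetical query against a metrics table populated during validation.
+    engine = create_engine("postgresql://user:password@localhost/metrics_db")
+    series = pd.read_sql(
+        "SELECT run_id, value FROM my_metrics_table "
+        "WHERE metric_name = 'statistics.successful_expectations'",
+        engine,
+    )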
+
+Storing metrics is still a **beta** feature of Great Expectations, and we expect configuration and
+capability to evolve rapidly.
+
+*********************************
+Adding a MetricsStore
+*********************************
+
+A MetricStore is a special store that can store Metrics computed during Validation. A MetricStore tracks the run_id
+of the validation and the expectation suite name in addition to the metric name and metric kwargs.
+
+In most cases, a MetricStore will be configured as a SQL database. To add a MetricStore to your DataContext, add the
+following yaml block to the "stores" section:
+
+.. code-block:: yaml
+
+    stores:
+        # ...
+        metrics_store:  # You can choose any name for your metric store
+            class_name: MetricStore
+            store_backend:
+                class_name: DatabaseStoreBackend
+                # These credentials can be the same as those used in a Datasource configuration
+                credentials: ${my_store_credentials}
+
+
+The next time your DataContext is loaded, it will connect to the database and initialize a table to store metrics if
+one has not already been created. See the :ref:`metrics_reference` for more information on additional configuration
+options.
+
+*********************************
+Configuring a Validation Action
+*********************************
+
+Once a MetricStore is available, it is possible to configure a new `StoreMetricsAction` to save metrics during
+validation.
+
+Add the following yaml block to your DataContext validation operators configuration:
+
+.. code-block:: yaml
+
+    validation_operators:
+        # ...
+        action_list_operator:
+            class_name: ActionListValidationOperator
+            action_list:
+                # ...
+                - name: store_metrics
+                  action:
+                    class_name: StoreMetricsAction
+                    target_store_name: metrics_store  # This should match the name of the store configured above
+                    # Note that the syntax for selecting requested metrics will change in a future release
+                    requested_metrics:
+                      "*":  # The asterisk here matches *any* expectation suite name
+                        # use the 'kwargs' key to request metrics that are defined by kwargs,
+                        # for example because they are defined only for a particular column
+                        # - column:
+                        #     Age:
+                        #       - expect_column_min_to_be_between.result.observed_value
+                        - statistics.evaluated_expectations
+                        - statistics.successful_expectations
+
+
+The `StoreMetricsAction` processes an `ExpectationValidationResult` and stores Metrics to a configured Store.
+Now, when your operator is executed, the requested metrics will be available in your database!
+
+.. code-block:: python
+
+    context.run_validation_operator('action_list_operator', assets_to_validate=[(batch_kwargs, expectation_suite_name)])
+
+*last updated*: |lastupdate|
diff --git a/docs/tutorials/validate_data.rst b/docs/tutorials/validate_data.rst
index d07555870290..8ece6a3d1f27 100644
--- a/docs/tutorials/validate_data.rst
+++ b/docs/tutorials/validate_data.rst
@@ -5,34 +5,26 @@
Validate Data
==============
-Expectations describe data assets. Data assets are composed of batches. Validation checks expectations against a batch of data.
+Expectations describe Data Assets. Data Assets are composed of Batches. Validation checks Expectations against a Batch of data. Expectation Suites combine multiple Expectations into an overall description of a Batch.
-Validation = checking if a batch of data from a data asset X conforms to all expectations in expectation suite Y. Expectation suite Y is a collection of expectations that you created that specify what a valid batch of data asset X should look like.
+Validation = checking if a Batch of data from a Data Asset X conforms to all Expectations in Expectation Suite Y. Expectation Suite Y is a collection of Expectations that you created that specify what a valid Batch of Data Asset X should look like. -To run validation you need a **batch** of data. To get a **batch** of data you need: - -* to specify which **data asset** the batch is from -* to specify an **expectation suite** to validate against +To run Validation you need a **Batch** of data. To get a **Batch** of data you need: +* to provide `batch_kwargs` to a :ref:`Data Context` +* to specify an **Expectation Suite** to validate against This tutorial will explain each of these objects, show how to obtain them, execute validation and view its result. -Video ------- - -If you prefer videos to written tutorials, `James `_ (one of the original core contributors) walks you through this turorial in a `video on YouTube `_. - 0. Open Jupyter Notebook ------------------------ This tutorial assumes that: -* you ran ``great_expectations init`` and went through the steps covered in the previous tutorial: :ref:`tutorial_init`. +* you ran ``great_expectations init`` * your current directory is the root of the project where you ran ``great_expectations init`` -The dataset used in this tutorial is a folder with CSV files containing National Provider Identifier (NPI) data that are processed with pandas. - -You can either follow the tutorial with the dataset that it uses or you can execute the same steps on your project with your own data. +You can either follow the tutorial with the sample National Provider Identifier (NPI) dataset (processed with Pandas) referenced in the :ref:`great_expectations init` tutorial, or you can execute the same steps on your project with your own data. If you get stuck, find a bug or want to ask a question, go to `our Slack `_ - this is the best way to get help from the contributors and other users. @@ -71,9 +63,9 @@ If you are following this tutorial using the NPI dataset, open the pandas notebo 1. Get a DataContext Object --------------------------- -A DataContext represents a Great Expectations project. It organizes datasources, notification settings, data documentation sites, and storage and access for expectation suites and validation results. +A DataContext represents a Great Expectations project. It organizes Datasources, notification settings, data documentation sites, and storage and access for Expectation Suites and Validation Results. The DataContext is configured via a yml file stored in a directory called great_expectations; -the configuration file as well as managed expectation suites should be stored in version control. +the configuration file as well as managed Expectation Suites should be stored in version control. Instantiating a DataContext loads your project configuration and all its resources. @@ -84,231 +76,124 @@ Instantiating a DataContext loads your project configuration and all its resourc To read more about DataContexts, see: :ref:`data_context` - -2. List Data Assets -------------------- - -A Data Asset is data you can describe with expectations. - -.. content-tabs:: - - .. tab-container:: tab0 - :title: pandas - - A Pandas datasource generates data assets from Pandas DataFrames or CSV files. In this example the pipeline processes NPI data that it reads from CSV files in the ``npidata`` directory into Pandas DataFrames. This is the data you want to describe with expectations. 
That directory and its files form a data asset, named "npidata" (based on the directory name). - - .. tab-container:: tab1 - :title: pyspark - - A Spark datasource generates data assets from Spark DataFrames or CSV files. The data loaded into a data asset is the data you want to describe and specify with expectations. If this example read CSV files in a directory called ``npidata`` into a Spark DataFrame, the resulting data asset would be called "npidata" based on the directory name. - - .. tab-container:: tab2 - :title: SQLAlchemy - - A SQLAlchemy datasource generates data assets from tables, views and query results. - - * If the data resided in a table (or view) in a database, it would be accessible as a data asset with the name of that table (or view). - * If the data did not reside in one table ``npidata`` and, instead, the example pipeline ran an SQL query that fetched the data (probably from multiple tables), the result set of that query would be accessible as a data asset. The name of this data asset would be up to us (e.g., "npidata" or "npidata_query"). - - -Great Expectations' ``jupyter_ux`` module has a convenience method that lists all data assets and expectation suites known to a Data Context: - -.. code-block:: python - - great_expectations.jupyter_ux.list_available_data_asset_names(context) - -Here is the output of this method when executed in our example project: - -.. image:: ../images/list_data_assets.png - :width: 600px - -``npidata`` is the short name of the data asset. Full names of data assets in a DataContext consist of three parts, for example: ``data__dir/default/npidata``. You don't need to know (yet) how the namespace is managed and the exact meaning of each part. The :ref:`data_context` article describes this in detail. - - -3. Pick a data asset and expectation suite +2. Choose an Expectation Suite ------------------------------------------- -The previous section showed how to list all data assets and expectation suites in a project. - -In this section you choose a data asset name from this list. - -The ``normalize_data_asset_name`` method converts the short name of a data asset to a full name: +The ``context`` instantiated in the previous section has a convenience method that lists all Expectation Suites created in a project: .. code-block:: python - data_asset_name = "npidata" - normalized_data_asset_name = context.normalize_data_asset_name(data_asset_name) - normalized_data_asset_name + for expectation_suite_id in context.list_expectation_suites(): + print(expectation_suite_id.expectation_suite_name) - -Choose the expectation suite you will validate the batch against: +Choose the Expectation Suite you will use to validate a Batch of data: .. code-block:: python expectation_suite_name = "warning" -3.a. If you don't have an expectation suite, let's create a simple one -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If you don't have an expectation suite for this data asset, the notebook's next cell will create a suite of very basic expectations, so that you have some expectations to play with. The expectation suite will have `expect_column_to_exist` expectations for each column. - -If you created an expectation suite for this data asset, you can skip executing the next cell (if you execute it, it will do nothing). - - -4. Load a batch of data to validate ------------------------------------ - -Expectations describe data assets. Data assets are composed of batches. Validation checks expectations against a batch of data. 
- -For example, a batch could be the most recent day of log data. For a database table, a batch could be the data in that table at a particular time. - -In order to validate a batch of data you will load it as a Great Expectations :class:`Dataset `. - -The DataContext's ``get_batch`` method is used to load a batch of a data asset: - -.. code-block:: python - - batch = context.get_batch(normalized_data_asset_name, - expectation_suite_name, - batch_kwargs) - - -Calling this method asks the Context to get a batch of data from the data asset ``normalized_data_asset_name`` and attach the expectation suite ``expectation_suite_name`` to it. The ``batch_kwargs`` argument specifies which batch of the data asset should be loaded. - -If you have no preference as to which batch of the data asset should be loaded, use the ``yield_batch_kwargs`` method on the data context: - -.. code-block:: python - - batch_kwargs = context.yield_batch_kwargs(data_asset_name) - -This tutorial and its notebook provide a playground for validation. When Great Expectations is integrated into a data pipeline, the pipeline calls GE to validate a specific batch (an input to a pipeline's step or its output). +3. Load a batch of data you want to validate +--------------------------------------------- -.. toggle-header:: - :header: **Click here to learn how to specify batch_kwargs for fetching a particular batch** +Expectations describe Batches of data - Expectation Suites combine multiple Expectations into an overall description of a Batch. Validation checks a Batch against an Expectation Suite. - ``batch_kwargs`` provide detailed instructions for the datasource how to construct a batch. Each datasource accepts different types of ``batch_kwargs``: +For example, a Batch could be the most recent day of log data. For a database table, a Batch could be the data in that table at a particular time. - .. content-tabs:: +In order to validate a Batch of data, you will load it as a Great Expectations :class:`Dataset `. - .. tab-container:: tab0 - :title: pandas +Batches are obtained by using a Data Context's ``get_batch`` method, which accepts ``batch_kwargs`` and ``expectation_suite_name`` as arguments. - A pandas datasource can accept ``batch_kwargs`` that describe either a path to a file or an existing DataFrame. For example, if the data asset is a collection of CSV files in a folder that are processed with Pandas, then a batch could be one of these files. Here is how to construct ``batch_kwargs`` that specify a particular file to load: +Calling this method asks the Context to get a Batch of data using the provided ``batch_kwargs`` and attach the Expectation Suite ``expectation_suite_name`` to it. - .. code-block:: python +The ``batch_kwargs`` argument is a dictionary that specifies a batch of data - it contains all the information necessary for a Data Context to obtain a batch of data from a :ref:`Datasource`. The keys of a ``batch_kwargs`` +dictionary will vary depending on the type of Datasource and how it generates Batches, but will always have a ``datasource`` key with the name of a Datasource. To list the Datasources configured in a project, you may use a Data Context's ``list_datasources`` method. - batch_kwargs = {'path': "PATH_OF_THE_FILE_YOU_WANT_TO_LOAD"} - - To instruct ``get_batch`` to read CSV files with specific options (e.g., not to interpret the first line as the - header or to use a specific separator), add them to the the ``batch_kwargs``. - - See the complete list of options for `Pandas read_csv `__. 
-
- ``batch_kwargs`` might look like the following:
-
- .. code-block:: json
-
- {
- "path": "/data/npidata/npidata_pfile_20190902-20190908.csv",
- "partition_id": "npidata_pfile_20190902-20190908",
- "sep": null,
- "engine": "python"
- }
-
- |
- If you already loaded the data into a Pandas DataFrame, here is how you construct ``batch_kwargs`` that instruct the datasource to use your dataframe as a batch:
-
- .. code-block:: python
-
- batch_kwargs = {'df': "YOUR_PANDAS_DF"}
-
- .. tab-container:: tab1
- :title: pyspark
-
- A pyspark datasource can accept ``batch_kwargs`` that describe either a path to a file or an existing DataFrame. For example, if the data asset is a collection of CSV files in a folder that are processed with Pandas, then a batch could be one of these files. Here is how to construct ``batch_kwargs`` that specify a particular file to load:
-
- .. code-block:: python
-
- batch_kwargs = {'path': "PATH_OF_THE_FILE_YOU_WANT_TO_LOAD"}
-
- To instruct ``get_batch`` to read CSV files with specific options (e.g., not to interpret the first line as the
- header or to use a specific separator), add them to the the ``batch_kwargs``.
+.. content-tabs::
- See the complete list of options for `Spark DataFrameReader `__
+ .. tab-container:: tab0
+ :title: pandas
- .. tab-container:: tab2
- :title: SQLAlchemy
+ A Pandas Datasource generates Batches from Pandas DataFrames or CSV files. A Pandas Datasource can accept ``batch_kwargs`` that describe either a path to a file or an existing DataFrame:
- A SQLAlchemy datasource can accept ``batch_kwargs`` that instruct it load a batch from a table, a view, or a result set of a query:
+ .. code-block:: python
- If you would like to validate an entire table (or a view) in your database's default schema:
+ # list datasources of the type PandasDatasource in your project
+ [datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'PandasDatasource']
+ datasource_name = "YOUR_DATASOURCE_NAME"  # TODO: set to a datasource name from above
- .. code-block:: python
+ # If you would like to validate a file on a filesystem:
+ batch_kwargs = {'path': "YOUR_FILE_PATH", 'datasource': datasource_name}
- batch_kwargs = {'table': "YOUR TABLE NAME"}
+ # If you already loaded the data into a Pandas DataFrame:
+ batch_kwargs = {'dataset': "YOUR_DATAFRAME", 'datasource': datasource_name}
- If you would like to validate an entire table or view from a non-default schema in your database:
+ batch = context.get_batch(batch_kwargs, expectation_suite_name)
+ batch.head()
- .. code-block:: python
+ .. tab-container:: tab1
+ :title: pyspark
- batch_kwargs = {'table': "YOUR TABLE NAME", "schema": "YOUR SCHEMA"}
+ A Spark Datasource generates Batches from Spark DataFrames or CSV files. A Spark Datasource can accept ``batch_kwargs`` that describe either a path to a file or an existing DataFrame:
- If you would like to validate using a query to construct a temporary table:
+ .. code-block:: python
- .. code-block:: python
- batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE'}
+ # list datasources of the type SparkDFDatasource in your project
+ [datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SparkDFDatasource']
+ datasource_name = "YOUR_DATASOURCE_NAME"  # TODO: set to a datasource name from above
- batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE'}
+ # If you would like to validate a file on a filesystem:
+ batch_kwargs = {'path': "YOUR_FILE_PATH", 'datasource': datasource_name}
+ # To customize how Spark reads the file, you can add options under the reader_options key in batch_kwargs (e.g., header='true')
+ # If you already loaded the data into a PySpark DataFrame:
+ batch_kwargs = {'dataset': "YOUR_DATAFRAME", 'datasource': datasource_name}
- The examples of ``batch_kwargs`` above can also be the outputs of "generators" used by Great Expectations. You can read about the default Generators' behavior and how to implement additional generators in this article: :ref:`batch_generator`.
+ batch = context.get_batch(batch_kwargs, expectation_suite_name)
+ batch.head()
-|
-Now you have the contents of one of the files loaded as batch of the data asset ``data__dir/default/npidata``.
+ .. tab-container:: tab2
+ :title: SQLAlchemy
+ A SQLAlchemy Datasource generates Batches from tables, views and query results. A SQLAlchemy Datasource can accept ``batch_kwargs`` that instruct it to load a batch from a table, a view, or a result set of a query:
-5. Set a Run Id
----------------
+ .. code-block:: python
-A ``run_id`` links together validations of different data assets, making it possible to track "runs" of a pipeline and
-follow data assets as they are transformed, joined, annotated, enriched, or evaluated. The run id can be any string;
-by default, Great Expectations will use an ISO 8601-formatted UTC datetime string.
+ # list datasources of the type SqlAlchemyDatasource in your project
+ [datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SqlAlchemyDatasource']
+ datasource_name = "YOUR_DATASOURCE_NAME"  # TODO: set to a datasource name from above
+ # If you would like to validate an entire table or view in your database's default schema:
+ batch_kwargs = {'table': "YOUR_TABLE", 'datasource': datasource_name}
-The default ``run_id`` generated by Great Expectations is built using the following code:
+ # If you would like to validate an entire table or view from a non-default schema in your database:
+ batch_kwargs = {'table': "YOUR_TABLE", "schema": "YOUR_SCHEMA", 'datasource': datasource_name}
-.. code-block:: python
+ # If you would like to validate the result set of a query:
+ # batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}
- run_id = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")
+ batch = context.get_batch(batch_kwargs, expectation_suite_name)
+ batch.head()
+ The examples of ``batch_kwargs`` above can also be the outputs of "Generators" used by Great Expectations. You
+can read about the default Generators' behavior and how to implement additional Generators in this article:
+:ref:`batch_kwargs_generator`.
-When you integrate validation in your pipeline, your pipeline runner probably has a run id that can be inserted here to make smoother integration.
+4. Validate the batch
+-----------------------
-6. Validate the batch
----------------------
+When Great Expectations is integrated into a data pipeline, the pipeline calls GE to validate a specific batch (an input to a pipeline's step or its output).
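+
+In such an integration, a common pattern is to branch on whether validation succeeded. A minimal sketch, assuming the
+``batch`` obtained in the previous step:
+
+.. code-block:: python
+
+    validation_result = batch.validate()
+
+    if not validation_result.success:
+        # Stop the pipeline, quarantine the batch, alert the team, etc.
+        raise ValueError("Batch did not meet its Expectation Suite")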
-Validation evaluates your expectations against the given batch and produces a report that describes observed values and
-any places where expectations are not met. To validate the batch of data call the :meth:`~great_expectations.\
+Validation evaluates the Expectations of an Expectation Suite against the given Batch and produces a report that describes observed values and
+any places where Expectations are not met. To validate the Batch of data call the :meth:`~great_expectations.\
data_asset.data_asset.DataAsset.validate` method on the batch:
.. code-block:: python
- validation_result = batch.validate(run_id=run_id)
-
-
-In a data pipeline you may take specific actions based on the the result of the validation.
-
-A common pattern is to check the ``validation_result``'s ``success`` key (``True`` if the batch meets all the expectations in the expectation suite), and stop or issue a warning in the code in case of failure:
-
-.. code-block:: python
-
- if validation_result["success"]:
- logger.info("This file meets all expectations from a valid batch of {0:s}".format(str(data_asset_name)))
- else:
- logger.warning("This file is not a valid batch of {0:s}".format(str(data_asset_name)))
+ validation_result = batch.validate()
-The ``validation_result`` object has detailed information about every expectation in the suite that was used to validate the batch: whether the batch met the expectation and even more details if it did not. You can read more about the result object's structure here: :ref:`validation_result`.
+The ``validation_result`` object has detailed information about every Expectation in the Expectation Suite that was used to validate the Batch: whether the Batch met the Expectation and even more details if it did not. You can read more about the result object's structure here: :ref:`validation_result`.
You can print this object out:
@@ -324,22 +209,21 @@ Here is what a part of this object looks like:
Don't panic! This blob of JSON is meant for machines. :ref:`data_docs` are a compiled HTML view of both expectation suites and validation results that is far more suitable for humans. You will see how easy it is to build them in the next sections.
-7. Validation Operators
+5. Validation Operators
-----------------------
-The ``validate()`` method evaluates one batch of data against one expectation suite and returns a dictionary of validation results. This is sufficient when you explore your data and get to know Great Expectations.
+The ``validate()`` method evaluates one Batch of data against one Expectation Suite and returns a dictionary of Validation Results. This is sufficient when you explore your data and get to know Great Expectations.
When deploying Great Expectations in a real data pipeline, you will typically discover these additional needs:
-* Validating a group of batches that are logically related (e.g. Did all my salesforce integrations work last night?).
-* Validating a batch against several expectation suites (e.g. Did my nightly clickstream event job have any **critical** failures I need to deal with asap or **warnings** I should investigate later?).
-* Doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).
+* Validating a group of Batches that are logically related (e.g. Did all my Salesforce integrations work last night?).
+* Validating a Batch against several Expectation Suites (e.g. Did my nightly clickstream event job have any **critical** failures I need to deal with ASAP or **warnings** I should investigate later?).
+* Doing something with the Validation Results (e.g., saving them for a later review, sending notifications in case of failures, etc.).
-Validation Operators provide a convenient abstraction for both bundling the validation of multiple expectation suites and the actions that should be taken after the validation. See the
+Validation Operators provide a convenient abstraction for both bundling the validation of multiple Expectation Suites and the actions that should be taken after the validation. See the
:ref:`validation_operators_and_actions` for more information.
-An instance of ``action_list_operator`` operator is configured in the default ``great_expectations.yml`` configuration file. ``ActionListValidationOperator`` validates each batch in the list that is passed as ``assets_to_validate`` argument to its ``run`` method against the expectation suite included within that batch and then invokes a list of configured actions on every validation result.
+An instance of the ``action_list_operator`` operator is configured in the default ``great_expectations.yml`` configuration file. ``ActionListValidationOperator`` validates each Batch in the list that is passed as ``assets_to_validate`` argument to its ``run`` method against the Expectation Suite included within that Batch and then invokes a list of configured actions on every Validation Result.
Below is the operator's configuration snippet in the ``great_expectations.yml`` file:
@@ -350,10 +234,10 @@ Below is the operator's configuration snippet in the ``great_expectations.yml``
action_list:
- name: store_validation_result
action:
- class_name: StoreAction
+ class_name: StoreValidationResultAction
- name: store_evaluation_params
action:
- class_name: ExtractAndStoreEvaluationParamsAction
+ class_name: StoreEvaluationParametersAction
- name: update_data_docs
action:
class_name: UpdateDataDocsAction
@@ -383,14 +267,10 @@ as ``s3`` or ``gcs``, edit stores section of the DataContext configuration objec
validations_store:
class_name: ValidationsStore
store_backend:
- class_name: FixedLengthTupleS3Backend
+ class_name: TupleS3StoreBackend
bucket: my_bucket
prefix: my_prefix
-Validation results will be stored according to the same hierarchical namespace used to refer to data assets elsewhere
-in the context, and will have the run_id prepended:
-``base_location/run_id/datasource_name/generator_name/generator_asset/expectation_suite_name.json``.
-
Removing the store_validation_result action from the action_list_operator configuration will disable automatically
storing validation_result objects.
@@ -419,9 +299,32 @@ the slack webhook URL in the uncommitted/config_variables.yml file:
validation_notification_slack_webhook: https://slack.com/your_webhook_url
+Running the Validation Operator
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Before running the Validation Operator, create a ``run_id``. A ``run_id`` links together validations of different data assets, making it possible to track "runs" of a pipeline and
+follow data assets as they are transformed, joined, annotated, enriched, or evaluated. The run id can be any string;
+by default, Great Expectations will use an ISO 8601-formatted UTC datetime string.
+The default ``run_id`` generated by Great Expectations is built using the following code:
-8. View the Validation Results in Data Docs
+.. code-block:: python
+
+    run_id = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")
+
+When you integrate validation in your pipeline, your pipeline runner probably has a run id that can be inserted here for smoother integration.
+
+Finally, run the Validation Operator:
+
+.. code-block:: python
+
+    results = context.run_validation_operator(
+        "action_list_operator",
+        assets_to_validate=[batch],
+        run_id=run_id)
+
+
+6. View the Validation Results in Data Docs
-------------------------------------------
Data Docs compiles raw Great Expectations objects including Expectations and Validations into structured documents such as HTML documentation. By default the HTML website is hosted on your local filesystem. When you are working in a team, the website can be hosted in the cloud (e.g., on S3) and serve as the shared source of truth for the team working on the data pipeline.
@@ -437,11 +340,11 @@ You can open the page programmatically and examine the result:
context.open_data_docs()
-
Congratulations!
----------------
-Now you you know how to validate a batch of data.
+Now you know how to validate a Batch of data.
What is next? This is a collection of tutorials that walk you through a variety of useful Great Expectations workflows: :ref:`tutorials`.
+*last updated*: |lastupdate|
diff --git a/examples/integrations/airflow/operators/expectation_operator.py b/examples/integrations/airflow/operators/expectation_operator.py
index cec6ebe34f3e..439ccae5ae1f 100644
--- a/examples/integrations/airflow/operators/expectation_operator.py
+++ b/examples/integrations/airflow/operators/expectation_operator.py
@@ -169,7 +169,7 @@ def execute(self, context):
self._store_results(results)
for result in results['results']:
- if result['success'] is False:
+ if result.success is False:
if self.fail_on_error is True:
raise AirflowException("Validation failed for dataset {name}".format(name=self.dataset_name))
diff --git a/great_expectations/__init__.py b/great_expectations/__init__.py
index a074acf9a768..ff8b563bf123 100644
--- a/great_expectations/__init__.py
+++ b/great_expectations/__init__.py
@@ -1,8 +1,8 @@
from ._version import get_versions
+
__version__ = get_versions()['version']
rtd_url_ge_version = __version__.replace(".", "_")
del get_versions
from .util import from_pandas, read_csv, read_excel, read_json, read_parquet, read_pickle, read_table, validate
-
from great_expectations.data_context import DataContext
diff --git a/great_expectations/cli/__init__.py b/great_expectations/cli/__init__.py
index 43f4372ce1dc..a8120f0db96b 100755
--- a/great_expectations/cli/__init__.py
+++ b/great_expectations/cli/__init__.py
@@ -1 +1 @@
-from .cli import main, cli
+from .cli import cli, main
diff --git a/great_expectations/cli/cli.py b/great_expectations/cli/cli.py
index 76dd875c3cde..c1a8ae8e14cb 100644
--- a/great_expectations/cli/cli.py
+++ b/great_expectations/cli/cli.py
@@ -1,534 +1,62 @@
# -*- coding: utf-8 -*-
-import click
-import json
import logging
-import os
-import shutil
-import sys
-import warnings
-
-from great_expectations.cli.init_messages import (
-    BUILD_DOCS_PROMPT,
-    COMPLETE_ONBOARDING_PROMPT,
-    GREETING,
-    LETS_BEGIN_PROMPT,
-    NEW_TEMPLATE_INSTALLED,
-    NEW_TEMPLATE_PROMPT,
-    NO_DATASOURCES_FOUND,
-    ONBOARDING_COMPLETE,
-    PROJECT_IS_COMPLETE,
-    RUN_INIT_AGAIN,
-    SLACK_LATER,
-    SLACK_SETUP_INTRO,
-    SLACK_SETUP_COMPLETE,
-    SLACK_SETUP_PROMPT,
-    SLACK_WEBHOOK_PROMPT,
-)
-from .datasource import (
-    add_datasource as add_datasource_impl,
-    profile_datasource,
-    build_docs as
build_documentation_impl, - MSG_GO_TO_NOTEBOOK, -) -from great_expectations.cli.util import cli_message, is_sane_slack_webhook -from great_expectations.data_context import DataContext -from great_expectations.data_asset import FileDataAsset -from great_expectations.dataset import Dataset, PandasDataset -import great_expectations.exceptions as ge_exceptions -from great_expectations import __version__ as ge_version -from great_expectations import read_csv -#FIXME: This prevents us from seeing a huge stack of these messages in python 2. We'll need to fix that later. -# tests/test_cli.py::test_cli_profile_with_datasource_arg -# /Users/abe/Documents/superconductive/tools/great_expectations/tests/test_cli.py:294: Warning: Click detected the use of the unicode_literals __future__ import. This is heavily discouraged because it can introduce subtle bugs in your code. You should instead use explicit u"" literals for your unicode strings. For more information see https://click.palletsprojects.com/python3/ -# cli, ["profile", "my_datasource", "-d", project_root_dir]) -click.disable_unicode_literals_warning = True - - -warnings.filterwarnings('ignore') - -try: - from termcolor import colored -except ImportError: - colored = None +import click -# Take over the entire GE module logging namespace when running CLI -logger = logging.getLogger("great_expectations") +from great_expectations import __version__ as ge_version +from great_expectations.cli.cli_logging import _set_up_logger, logger +from great_expectations.cli.datasource import datasource +from great_expectations.cli.docs import docs +from great_expectations.cli.init import init +from great_expectations.cli.project import project +from great_expectations.cli.suite import suite -# class NaturalOrderGroup(click.Group): -# def __init__(self, name=None, commands=None, **attrs): -# if commands is None: -# commands = OrderedDict() -# elif not isinstance(commands, OrderedDict): -# commands = OrderedDict(commands) -# click.Group.__init__(self, name=name, -# commands=commands, -# **attrs) -# -# def list_commands(self, ctx): -# return self.commands.keys() # TODO: consider using a specified-order supporting class for help (but wasn't working with python 2) -# @click.group(cls=NaturalOrderGroup) @click.group() @click.version_option(version=ge_version) -@click.option('--verbose', '-v', is_flag=True, default=False, - help='Set great_expectations to use verbose output.') -def cli(verbose): - """great_expectations command-line interface""" - if verbose: - logger.setLevel(logging.DEBUG) - - -@cli.command() -@click.argument('dataset') -@click.argument('expectation_suite_file') -@click.option('--evaluation_parameters', '-p', default=None, - help='Path to a file containing JSON object used to evaluate parameters in expectations config.') -@click.option('--result_format', '-o', default="SUMMARY", - help='Result format to use when building evaluation responses.') -@click.option('--catch_exceptions', '-e', default=True, type=bool, - help='Specify whether to catch exceptions raised during evaluation of expectations (defaults to True).') -@click.option('--only_return_failures', '-f', default=False, type=bool, - help='Specify whether to only return expectations that are not met during evaluation ' - '(defaults to False).') -@click.option('--custom_dataset_module', '-m', default=None, - help='Path to a python module containing a custom dataset class.') -@click.option('--custom_dataset_class', '-c', default=None, - help='Name of the custom dataset class to use during 
evaluation.') -def validate( - dataset, - expectation_suite_file, - evaluation_parameters, - result_format, - catch_exceptions, only_return_failures, custom_dataset_module, custom_dataset_class): - """Validate a CSV file against an expectation suite. - - DATASET: Path to a file containing a CSV file to validate using the provided expectation_suite_file. - - EXPECTATION_SUITE_FILE: Path to a file containing a valid great_expectations expectations suite to use to \ -validate the data. - """ - - """ - Read a dataset file and validate it using an expectation suite saved in another file. Uses parameters defined in - the dispatch method. - - :param parsed_args: A Namespace object containing parsed arguments from the dispatch method. - :return: The number of unsuccessful expectations - """ - expectation_suite_file = expectation_suite_file - - expectation_suite = json.load(open(expectation_suite_file)) - - if evaluation_parameters is not None: - evaluation_parameters = json.load( - open(evaluation_parameters, "r")) - - # Use a custom data_asset module and class if provided. Otherwise infer from the expectation suite - if custom_dataset_module: - sys.path.insert(0, os.path.dirname( - custom_dataset_module)) - module_name = os.path.basename( - custom_dataset_module).split('.')[0] - custom_module = __import__(str(module_name)) - dataset_class = getattr( - custom_module, custom_dataset_class) - elif "data_asset_type" in expectation_suite: - if (expectation_suite["data_asset_type"] == "Dataset" or - expectation_suite["data_asset_type"] == "PandasDataset"): - dataset_class = PandasDataset - elif expectation_suite["data_asset_type"].endswith("Dataset"): - logger.info("Using PandasDataset to validate dataset of type %s." % - expectation_suite["data_asset_type"]) - dataset_class = PandasDataset - elif expectation_suite["data_asset_type"] == "FileDataAsset": - dataset_class = FileDataAsset - else: - logger.critical("Unrecognized data_asset_type %s. You may need to specify custom_dataset_module and \ - custom_dataset_class." % expectation_suite["data_asset_type"]) - return -1 - else: - dataset_class = PandasDataset - - if issubclass(dataset_class, Dataset): - da = read_csv(dataset, expectation_suite=expectation_suite, - dataset_class=dataset_class) - else: - da = dataset_class(dataset, config=expectation_suite) - - result = da.validate( - evaluation_parameters=evaluation_parameters, - result_format=result_format, - catch_exceptions=catch_exceptions, - only_return_failures=only_return_failures, - ) - - # Note: Should this be rendered through cli_message? - # Probably not, on the off chance that the JSON object contains tags - print(json.dumps(result, indent=2)) - sys.exit(result['statistics']['unsuccessful_expectations']) - - -@cli.command() -@click.option( - '--target_directory', - '-d', - default="./", - help='The root of the project directory where you want to initialize Great Expectations.' -) -@click.option( - # Note this --no-view option is mostly here for tests - "--view/--no-view", - help="By default open in browser unless you specify the --no-view flag", - default=True -) -def init(target_directory, view): - """ - Create a new project and help with onboarding. - - This guided input walks the user through setting up a new project and also - onboards a new developer in an existing project. - - It scaffolds directories, sets up notebooks, creates a project file, and - appends to a `.gitignore` file. 
- """ - target_directory = os.path.abspath(target_directory) - ge_dir = _get_full_path_to_ge_dir(target_directory) - ge_yml = os.path.join(ge_dir, DataContext.GE_YML) - - cli_message(GREETING) - - # TODO this should be a property - if os.path.isfile(ge_yml): - if DataContext.all_uncommitted_directories_exist(ge_dir) and \ - DataContext.config_variables_yml_exist(ge_dir): - # Ensure the context can be instantiated - try: - _ = DataContext(ge_dir) - cli_message(PROJECT_IS_COMPLETE) - except ge_exceptions.DataContextError as e: - cli_message("{}".format(e)) - exit(5) - else: - _complete_onboarding(target_directory) - - try: - # if expectations exist, offer to build docs - context = DataContext(ge_dir) - if context.list_expectation_suite_keys(): - if click.confirm(BUILD_DOCS_PROMPT, default=True): - context.build_data_docs() - context.open_data_docs() - except ge_exceptions.DataContextError as e: - cli_message("{}".format(e)) - else: - if not click.confirm(LETS_BEGIN_PROMPT, default=True): - cli_message(RUN_INIT_AGAIN) - exit(0) - - context, data_source_name, data_source_type = _create_new_project(target_directory) - if not data_source_name: # no datasource was created - return - - profile_datasource(context, data_source_name, open_docs=view, additional_batch_kwargs={"limit": 1000}) - cli_message("""\nGreat Expectations is now set up in your project!""") - - -def _slack_setup(context): - webhook_url = None - cli_message(SLACK_SETUP_INTRO) - if not click.confirm(SLACK_SETUP_PROMPT, default=True): - cli_message(SLACK_LATER) - return context - else: - webhook_url = click.prompt(SLACK_WEBHOOK_PROMPT, default="") - - while not is_sane_slack_webhook(webhook_url): - cli_message("That URL was not valid.\n") - if not click.confirm(SLACK_SETUP_PROMPT, default=True): - cli_message(SLACK_LATER) - return context - webhook_url = click.prompt(SLACK_WEBHOOK_PROMPT, default="") - - context.save_config_variable("validation_notification_slack_webhook", webhook_url) - cli_message(SLACK_SETUP_COMPLETE) - - return context - - -def _get_full_path_to_ge_dir(target_directory): - return os.path.abspath(os.path.join(target_directory, DataContext.GE_DIR)) - - -def _create_new_project(target_directory): - try: - context = DataContext.create(target_directory) - data_source_name, data_source_type = add_datasource_impl(context) - return context, data_source_name, data_source_type - except ge_exceptions.DataContextError as err: - logger.critical(err.message) - sys.exit(-1) - - -def _complete_onboarding(target_dir): - if click.confirm(COMPLETE_ONBOARDING_PROMPT, default=True): - DataContext.create(target_dir) - cli_message(ONBOARDING_COMPLETE) - else: - cli_message(RUN_INIT_AGAIN) - - - -@cli.command() @click.option( - '--directory', - '-d', - default=None, - help="The project's great_expectations directory." 
+ "--verbose", + "-v", + is_flag=True, + default=False, + help="Set great_expectations to use verbose output.", ) -@click.option( - "--view/--no-view", - help="By default open in browser unless you specify the --no-view flag", - default=True -) -def add_datasource(directory, view): - """Add a new datasource to the data context.""" - try: - context = DataContext(directory) - except ge_exceptions.ConfigNotFoundError as err: - cli_message("{}".format(err.message)) - return - except ge_exceptions.ZeroDotSevenConfigVersionError as err: - _offer_to_install_new_template(err, context.root_directory) - - data_source_name, data_source_type = add_datasource_impl(context) - - if not data_source_name: # no datasource was created - return - - profile_datasource(context, data_source_name, open_docs=view) - - -@cli.command() -@click.option( - '--directory', - '-d', - default=None, - help="The project's great_expectations directory." -) -def list_datasources(directory): - """List known datasources.""" - try: - context = DataContext(directory) - datasources = context.list_datasources() - # TODO Pretty up this console output - cli_message(str([d for d in datasources])) - except ge_exceptions.ConfigNotFoundError as err: - cli_message("{}".format(err.message)) - return - except ge_exceptions.ZeroDotSevenConfigVersionError as err: - _offer_to_install_new_template(err, context.root_directory) - - -@cli.command() -@click.argument('datasource_name', default=None, required=False) -@click.option('--data_assets', '-l', default=None, - help='Comma-separated list of the names of data assets that should be profiled. Requires datasource_name specified.') -@click.option('--profile_all_data_assets', '-A', is_flag=True, default=False, - help='Profile ALL data assets within the target data source. ' - 'If True, this will override --max_data_assets.') -@click.option( - "--directory", - "-d", - default=None, - help="The project's great_expectations directory." -) -@click.option('--batch_kwargs', default=None, - help='Additional keyword arguments to be provided to get_batch when loading the data asset. Must be a valid JSON dictionary') -@click.option( - "--view/--no-view", - help="By default open in browser unless you specify the --no-view flag", - default=True -) -def profile(datasource_name, data_assets, profile_all_data_assets, directory, view, batch_kwargs): - """ - Profile datasources from the specified context. - - If the optional data_assets and profile_all_data_assets arguments are not specified, the profiler will check - if the number of data assets in the datasource exceeds the internally defined limit. If it does, it will - prompt the user to either specify the list of data assets to profile or to profile all. - If the limit is not exceeded, the profiler will profile all data assets in the datasource. - - :param datasource_name: name of the datasource to profile - :param data_assets: if this comma-separated list of data asset names is provided, only the specified data assets will be profiled - :param profile_all_data_assets: if provided, all data assets will be profiled - :param directory: - :param view: Open the docs in a browser - :param batch_kwargs: Additional keyword arguments to be provided to get_batch when loading the data asset. - :return: +def cli(verbose): """ +Welcome to the great_expectations CLI! 
-    try:
-        context = DataContext(directory)
-    except ge_exceptions.ConfigNotFoundError as err:
-        cli_message("{}".format(err.message))
-        return
-    except ge_exceptions.ZeroDotSevenConfigVersionError as err:
-        _offer_to_install_new_template(err, context.root_directory)
-        return
-
-    if batch_kwargs is not None:
-        batch_kwargs = json.loads(batch_kwargs)
-
-    if datasource_name is None:
-        datasources = [datasource["name"] for datasource in context.list_datasources()]
-        if not datasources:
-            cli_message(NO_DATASOURCES_FOUND)
-            sys.exit(-1)
-        elif len(datasources) > 1:
-            cli_message(
-                "Error: please specify the datasource to profile. "\
-                "Available datasources: " + ", ".join(datasources) + ""
-            )
-            sys.exit(-1)
-        else:
-            profile_datasource(
-                context,
-                datasources[0],
-                data_assets=data_assets,
-                profile_all_data_assets=profile_all_data_assets,
-                open_docs=view,
-                additional_batch_kwargs=batch_kwargs
-            )
-    else:
-        profile_datasource(
-            context,
-            datasource_name,
-            data_assets=data_assets,
-            profile_all_data_assets=profile_all_data_assets,
-            open_docs=view,
-            additional_batch_kwargs=batch_kwargs
-        )
-
+Most commands follow this format: great_expectations <NOUN> <VERB>

-@cli.command()
-@click.option(
-    '--directory',
-    '-d',
-    default=None,
-    help="The project's great_expectations directory."
-)
-@click.option('--site_name', '-s',
-              help='The site for which to generate documentation. See data_docs section in great_expectations.yml')
-@click.option(
-    "--view/--no-view",
-    help="By default open in browser unless you specify the --no-view flag",
-    default=True
-)
-def build_docs(directory, site_name, view=True):
-    """Build Data Docs for a project."""
-    logger.debug("Starting cli.build_docs")
+The nouns are: datasource, docs, project, suite

-    try:
-        context = DataContext(directory)
-        build_documentation_impl(
-            context,
-            site_name=site_name
-        )
-        if view:
-            context.open_data_docs()
-    except ge_exceptions.ConfigNotFoundError as err:
-        cli_message("{}".format(err.message))
-        sys.exit(1)
-    except ge_exceptions.ZeroDotSevenConfigVersionError as err:
-        _offer_to_install_new_template(err, context.root_directory)
-        return
-    except ge_exceptions.PluginModuleNotFoundError as err:
-        cli_message(err.cli_colored_message)
-        sys.exit(1)
-    except ge_exceptions.PluginClassNotFoundError as err:
-        cli_message(err.cli_colored_message)
-        sys.exit(1)
+Most nouns accept the following verbs: new, list, edit

+In addition, the CLI supports the following special commands:

-@cli.command()
-@click.option(
-    '--directory',
-    '-d',
-    default="./great_expectations",
-    help="The project's great_expectations directory."
-) -def check_config(directory): - """Check a config for validity and help with migrations.""" - cli_message("Checking your config files for validity...\n") +- great_expectations init : same as `project new` - try: - is_config_ok, error_message = do_config_check(directory) - if is_config_ok: - cli_message("Your config file appears valid!") - else: - cli_message("Unfortunately, your config appears to be invalid:\n") - cli_message("{}".format(error_message)) - sys.exit(1) - except ge_exceptions.ZeroDotSevenConfigVersionError as err: - _offer_to_install_new_template(err, directory) +- great_expectations datasource profile : profile a datasource - -def _offer_to_install_new_template(err, ge_dir): - ge_dir = os.path.abspath(ge_dir) - cli_message("{}".format(err.message)) - ge_yml = os.path.join(ge_dir, DataContext.GE_YML) - archived_yml = ge_yml + ".archive" - - if click.confirm( - NEW_TEMPLATE_PROMPT.format(ge_yml, archived_yml), - default=True - ): - # archive existing project config - shutil.move(ge_yml, archived_yml) - DataContext.write_project_template_to_disk(ge_dir) - - cli_message(NEW_TEMPLATE_INSTALLED.format("file://" + ge_yml, "file://" + archived_yml)) - else: - cli_message( - """\nOK. To continue, you will need to upgrade your config file to the latest format. - - Please see the docs here: https://docs.greatexpectations.io/en/latest/reference/data_context_reference.html - - We are super sorry about this breaking change! :] - - If you are running into any problems, please reach out on Slack and we can - help you in realtime: https://greatexpectations.io/slack""" - ) - sys.exit(0) +- great_expectations docs build : compile documentation from expectations +""" + _set_up_logger() + if verbose: + # Note we are explicitly not using a logger in all CLI output to have + # more control over console UI. 
+ logger.setLevel(logging.DEBUG) -def do_config_check(target_directory): - try: - DataContext(context_root_dir=target_directory) - return True, None - except ( - ge_exceptions.InvalidConfigurationYamlError, - ge_exceptions.InvalidTopLevelConfigKeyError, - ge_exceptions.MissingTopLevelConfigKeyError, - ge_exceptions.InvalidConfigValueTypeError, - ge_exceptions.InvalidConfigVersionError, - ge_exceptions.UnsupportedConfigVersionError, - ge_exceptions.DataContextError, - ge_exceptions.PluginClassNotFoundError - ) as err: - return False, err.message +cli.add_command(datasource) +cli.add_command(docs) +cli.add_command(init) +cli.add_command(project) +cli.add_command(suite) def main(): - handler = logging.StreamHandler() - # Just levelname and message Could re-add other info if we want - formatter = logging.Formatter( - '%(message)s') - # '%(asctime)s %(name)-12s %(levelname)-8s %(message)s') - handler.setFormatter(formatter) - logger.addHandler(handler) - logger.setLevel(logging.INFO) cli() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/great_expectations/cli/cli_logging.py b/great_expectations/cli/cli_logging.py new file mode 100644 index 000000000000..a5f0e42a1127 --- /dev/null +++ b/great_expectations/cli/cli_logging.py @@ -0,0 +1,25 @@ +import logging +import warnings + +warnings.filterwarnings("ignore") + +logging.getLogger( + "great_expectations.datasource.generator.in_memory_generator" +).setLevel(logging.CRITICAL) +logging.getLogger( + "great_expectations.dataset.sqlalchemy_dataset" +).setLevel(logging.CRITICAL) +logging.getLogger( + "great_expectations.profile.sample_expectations_dataset_profiler" +).setLevel(logging.CRITICAL) + +# Take over the entire GE module logging namespace when running CLI +logger = logging.getLogger("great_expectations") + + +def _set_up_logger(): + handler = logging.StreamHandler() + formatter = logging.Formatter("%(message)s") + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(logging.WARNING) diff --git a/great_expectations/cli/datasource.py b/great_expectations/cli/datasource.py index 17af5ac9eecb..b234969748d5 100644 --- a/great_expectations/cli/datasource.py +++ b/great_expectations/cli/datasource.py @@ -1,18 +1,51 @@ +import datetime +import enum import importlib +import json +import logging import os -import enum -import click +import sys -from great_expectations.datasource import PandasDatasource, SparkDFDatasource, SqlAlchemyDatasource -from .util import cli_message -from great_expectations.exceptions import DatasourceInitializationError -from great_expectations.data_context import DataContext +import click -from great_expectations import rtd_url_ge_version +import great_expectations.exceptions as ge_exceptions +from great_expectations import DataContext, rtd_url_ge_version +from great_expectations.cli.docs import build_docs +from great_expectations.cli.init_messages import NO_DATASOURCES_FOUND +from great_expectations.cli.util import ( + _offer_to_install_new_template, + cli_message, +) +from great_expectations.core import ExpectationSuite +from great_expectations.data_context.types.resource_identifiers import ( + ValidationResultIdentifier, +) +from great_expectations.datasource import ( + PandasDatasource, + SparkDFDatasource, + SqlAlchemyDatasource, +) +from great_expectations.datasource.generator import ManualBatchKwargsGenerator +from great_expectations.datasource.generator.table_generator import ( + TableBatchKwargsGenerator, +) +from great_expectations.exceptions import ( + 
    BatchKwargsError,
+    DatasourceInitializationError,
+)
+from great_expectations.profile.sample_expectations_dataset_profiler import (
+    SampleExpectationsDatasetProfiler,
+)
+from great_expectations.validator.validator import Validator

-import logging
 logger = logging.getLogger(__name__)

+# FIXME: This prevents us from seeing a huge stack of these messages in python 2. We'll need to fix that later.
+# tests/test_cli.py::test_cli_profile_with_datasource_arg
+#   /Users/abe/Documents/superconductive/tools/great_expectations/tests/test_cli.py:294: Warning: Click detected the use of the unicode_literals __future__ import. This is heavily discouraged because it can introduce subtle bugs in your code. You should instead use explicit u"" literals for your unicode strings. For more information see https://click.palletsprojects.com/python3/
+#   cli, ["profile", "my_datasource", "-d", project_root_dir])
+click.disable_unicode_literals_warning = True


 class DatasourceTypes(enum.Enum):
     PANDAS = "pandas"
@@ -27,89 +60,263 @@ class DatasourceTypes(enum.Enum):
     "SqlAlchemyDatasource": DatasourceTypes.SQL,
 }

+MANUAL_GENERATOR_CLASSES = (ManualBatchKwargsGenerator,)
+

 class SupportedDatabases(enum.Enum):
     MYSQL = 'MySQL'
     POSTGRES = 'Postgres'
     REDSHIFT = 'Redshift'
     SNOWFLAKE = 'Snowflake'
-    OTHER = 'other'
+    OTHER = 'other - Do you have a working SQLAlchemy connection string?'
     # TODO MSSQL
     # TODO BigQuery


-def add_datasource(context):
-    cli_message(
-        """
-========== Datasources ===========
-""".format(rtd_url_ge_version)
-    )
-    data_source_selection = click.prompt(
-        msg_prompt_choose_datasource,
-        type=click.Choice(["1", "2", "3", "4"]),
+@click.group()
+def datasource():
+    """datasource operations"""
+    pass
+
+
+@datasource.command(name="new")
+@click.option(
+    '--directory',
+    '-d',
+    default=None,
+    help="The project's great_expectations directory."
+)
+def datasource_new(directory):
+    """Add a new datasource to the data context."""
+    try:
+        context = DataContext(directory)
+    except ge_exceptions.ConfigNotFoundError as err:
+        cli_message("{}".format(err.message))
+        return
+    except ge_exceptions.ZeroDotSevenConfigVersionError as err:
+        _offer_to_install_new_template(err, context.root_directory)
+
+    datasource_name, data_source_type = add_datasource(context)
+
+    if datasource_name:
+        cli_message("A new datasource '{}' was added to your project.".format(datasource_name))
+    else:  # no datasource was created
+        sys.exit(1)
+
+
+@datasource.command(name="list")
+@click.option(
+    '--directory',
+    '-d',
+    default=None,
+    help="The project's great_expectations directory."
+)
+def datasource_list(directory):
+    """List known datasources."""
+    try:
+        context = DataContext(directory)
+        datasources = context.list_datasources()
+        # TODO Pretty up this console output
+        cli_message(str([d for d in datasources]))
+    except ge_exceptions.ConfigNotFoundError as err:
+        cli_message("{}".format(err.message))
+        return
+    except ge_exceptions.ZeroDotSevenConfigVersionError as err:
+        _offer_to_install_new_template(err, context.root_directory)
+
+
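# Illustrative sketch (editorial example, not part of this changeset): the
# noun-verb commands above can be exercised in-process with click's test
# runner. The project path is hypothetical and the import path for the `cli`
# group is an assumption; adjust it to wherever the group is exposed.

from click.testing import CliRunner

from great_expectations.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["datasource", "list", "-d", "./great_expectations"])
print(result.output)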
+@datasource.command(name="profile")
+@click.argument('datasource', default=None, required=False)
+@click.option(
+    "--generator-name",
+    "-g",
+    default=None,
+    help="The name of the batch kwargs generator configured in the datasource. The generator will list data assets in the datasource."
+)
+@click.option('--data-assets', '-l', default=None,
+              help='Comma-separated list of the names of data assets that should be profiled. Requires a datasource to be specified.')
+@click.option('--profile_all_data_assets', '-A', is_flag=True, default=False,
+              help='Profile ALL data assets within the target data source. '
+                   'If True, this will override --max_data_assets.')
+@click.option(
+    "--directory",
+    "-d",
+    default=None,
+    help="The project's great_expectations directory."
+)
+@click.option(
+    "--view/--no-view",
+    help="By default open in browser unless you specify the --no-view flag",
+    default=True
+)
+@click.option('--additional-batch-kwargs', default=None,
+              help='Additional keyword arguments to be provided to get_batch when loading the data asset. Must be a valid JSON dictionary')
+def datasource_profile(datasource, generator_name, data_assets, profile_all_data_assets, directory, view, additional_batch_kwargs):
+    """
+    Profile a datasource.
+
+    If the optional data_assets and profile_all_data_assets arguments are not specified, the profiler will check
+    if the number of data assets in the datasource exceeds the internally defined limit. If it does, it will
+    prompt the user to either specify the list of data assets to profile or to profile all.
+    If the limit is not exceeded, the profiler will profile all data assets in the datasource.
+
+    :param datasource: name of the datasource to profile
+    :param generator_name: name of the batch kwargs generator to use to list data assets
+    :param data_assets: if this comma-separated list of data asset names is provided, only the specified data assets will be profiled
+    :param profile_all_data_assets: if provided, all data assets will be profiled
+    :param directory:
+    :param view: Open the docs in a browser
+    :param additional_batch_kwargs: Additional keyword arguments to be provided to get_batch when loading the data asset.
+    :return:
+    """
+
+    try:
+        context = DataContext(directory)
+    except ge_exceptions.ConfigNotFoundError as err:
+        cli_message("{}".format(err.message))
+        return
+    except ge_exceptions.ZeroDotSevenConfigVersionError as err:
+        _offer_to_install_new_template(err, context.root_directory)
+        return
+
+    if additional_batch_kwargs is not None:
+        # TODO refactor out json load check in suite edit and add here
+        additional_batch_kwargs = json.loads(additional_batch_kwargs)
+        # TODO refactor batch load check in suite edit and add here
+
+    if datasource is None:
+        datasources = [_datasource["name"] for _datasource in context.list_datasources()]
+        if not datasources:
+            cli_message(NO_DATASOURCES_FOUND)
+            sys.exit(1)
+        elif len(datasources) > 1:
+            cli_message(
+                "Error: please specify the datasource to profile. "\
+                "Available datasources: " + ", ".join(datasources) + ""
+            )
+            sys.exit(1)
+        else:
+            profile_datasource(
+                context,
+                datasources[0],
+                generator_name=generator_name,
+                data_assets=data_assets,
+                profile_all_data_assets=profile_all_data_assets,
+                open_docs=view,
+                additional_batch_kwargs=additional_batch_kwargs
+            )
+    else:
+        profile_datasource(
+            context,
+            datasource,
+            generator_name=generator_name,
+            data_assets=data_assets,
+            profile_all_data_assets=profile_all_data_assets,
+            open_docs=view,
+            additional_batch_kwargs=additional_batch_kwargs
+        )
+
+
+def add_datasource(context, choose_one_data_asset=False):
+    """
+    Interactive flow for adding a datasource to an existing context.
+ + :param context: + :param choose_one_data_asset: optional - if True, this signals the method that the intent + is to let user choose just one data asset (e.g., a file) and there is no need + to configure a generator that comprehensively scans the datasource for data assets + :return: a tuple: datasource_name, data_source_type + """ + + msg_prompt_where_is_your_data = """ +What data would you like Great Expectations to connect to? + 1. Files on a filesystem (for processing with Pandas or Spark) + 2. Relational database (SQL) +""" + + msg_prompt_files_compute_engine = """ +What are you processing your files with? + 1. Pandas + 2. PySpark +""" + + data_source_location_selection = click.prompt( + msg_prompt_where_is_your_data, + type=click.Choice(["1", "2"]), show_choices=False ) - cli_message(data_source_selection) - data_source_name = None + datasource_name = None data_source_type = None - if data_source_selection == "1": # pandas - data_source_type = DatasourceTypes.PANDAS - data_source_name = _add_pandas_datasource(context) - elif data_source_selection == "2": # sqlalchemy - data_source_type = DatasourceTypes.SQL - data_source_name = _add_sqlalchemy_datasource(context) - elif data_source_selection == "3": # Spark - data_source_type = DatasourceTypes.SPARK - data_source_name = _add_spark_datasource(context) - # if data_source_selection == "5": # dbt - # data_source_type = DatasourceTypes.DBT - # dbt_profile = click.prompt(msg_prompt_dbt_choose_profile) - # log_message(msg_dbt_go_to_notebook, color="blue") - # context.add_datasource("dbt", "dbt", profile=dbt_profile) - if data_source_selection == "4": # None of the above - cli_message(msg_unknown_data_source) - cli_message(""" -Skipping datasource configuration. - - Add one by running `great_expectations add-datasource` or - - ... 
by editing the `{}` file -""".format(DataContext.GE_YML) + if data_source_location_selection == "1": + data_source_compute_selection = click.prompt( + msg_prompt_files_compute_engine, + type=click.Choice(["1", "2"]), + show_choices=False ) - return data_source_name, data_source_type + if data_source_compute_selection == "1": # pandas + data_source_type = DatasourceTypes.PANDAS -def _add_pandas_datasource(context): - path = click.prompt( - msg_prompt_filesys_enter_base_path, - # default='/data/', - type=click.Path( - exists=True, - file_okay=False, - dir_okay=True, - readable=True - ), - show_default=True - ) - if path.startswith("./"): - path = path[2:] + datasource_name = _add_pandas_datasource(context, passthrough_generator_only=choose_one_data_asset) - if path.endswith("/"): - basenamepath = path[:-1] + elif data_source_compute_selection == "2": # Spark + + data_source_type = DatasourceTypes.SPARK + + datasource_name = _add_spark_datasource(context, passthrough_generator_only=choose_one_data_asset) else: - basenamepath = path + data_source_type = DatasourceTypes.SQL + datasource_name = _add_sqlalchemy_datasource(context, prompt_for_datasource_name=True) + + return datasource_name, data_source_type + + +def _add_pandas_datasource(context, passthrough_generator_only=True, prompt_for_datasource_name=True): + if passthrough_generator_only: + datasource_name = "files_datasource" + configuration = PandasDatasource.build_configuration() + + else: + path = click.prompt( + msg_prompt_filesys_enter_base_path, + type=click.Path( + exists=True, + file_okay=False, + dir_okay=True, + readable=True + ), + show_default=True + ) + if path.startswith("./"): + path = path[2:] + + if path.endswith("/"): + basenamepath = path[:-1] + else: + basenamepath = path + + datasource_name = os.path.basename(basenamepath) + "__dir" + if prompt_for_datasource_name: + datasource_name = click.prompt( + msg_prompt_datasource_name, + default=datasource_name, + show_default=True + ) + + configuration = PandasDatasource.build_configuration( + generators={ + "subdir_reader": { + "class_name": "SubdirReaderBatchKwargsGenerator", + "base_directory": os.path.join("..", path), + } + } + ) - default_data_source_name = os.path.basename(basenamepath) + "__dir" - data_source_name = click.prompt( - msg_prompt_datasource_name, - default=default_data_source_name, - show_default=True - ) - configuration = PandasDatasource.build_configuration(base_directory=os.path.join("..", path)) - context.add_datasource(name=data_source_name, class_name='PandasDatasource', **configuration) - return data_source_name + context.add_datasource(name=datasource_name, class_name='PandasDatasource', **configuration) + return datasource_name def load_library(library_name, install_instructions_string=None): @@ -132,19 +339,27 @@ def load_library(library_name, install_instructions_string=None): return True except ModuleNotFoundError as e: if install_instructions_string: - cli_message("""ERROR: Great Expectations relies on the library `{}` to connect to your database. + cli_message("""ERROR: Great Expectations relies on the library `{}` to connect to your data. - Please `{}` before trying again.""".format(library_name, install_instructions_string)) else: - cli_message("""ERROR: Great Expectations relies on the library `{}` to connect to your database. + cli_message("""ERROR: Great Expectations relies on the library `{}` to connect to your data. 
- Please `pip install {}` before trying again.""".format(library_name, library_name)) return False -def _add_sqlalchemy_datasource(context): +def _add_sqlalchemy_datasource(context, prompt_for_datasource_name=True): + msg_success_database = "\nGreat Expectations connected to your database!" + if not load_library("sqlalchemy"): return None + # TODO remove this nasty python 2 hack + try: + ModuleNotFoundError + except NameError: + ModuleNotFoundError = ImportError + db_choices = [str(x) for x in list(range(1, 1 + len(SupportedDatabases)))] selected_database = int( click.prompt( @@ -156,11 +371,15 @@ def _add_sqlalchemy_datasource(context): selected_database = list(SupportedDatabases)[selected_database] - data_source_name = click.prompt( - msg_prompt_datasource_name, - default="my_{}_db".format(selected_database.value.lower()), - show_default=True - ) + datasource_name = "my_{}_db".format(selected_database.value.lower()) + if selected_database == SupportedDatabases.OTHER: + datasource_name = "my_database" + if prompt_for_datasource_name: + datasource_name = click.prompt( + msg_prompt_datasource_name, + default=datasource_name, + show_default=True + ) credentials = {} # Since we don't want to save the database credentials in the config file that will be @@ -172,13 +391,15 @@ def _add_sqlalchemy_datasource(context): # GE will replace the ${datasource name} with the value from the credentials file in runtime. while True: - cli_message(msg_db_config.format(data_source_name)) + cli_message(msg_db_config.format(datasource_name)) if selected_database == SupportedDatabases.MYSQL: if not load_library("pymysql"): return None credentials = _collect_mysql_credentials(default_credentials=credentials) elif selected_database == SupportedDatabases.POSTGRES: + if not load_library("psycopg2"): + return None credentials = _collect_postgres_credentials(default_credentials=credentials) elif selected_database == SupportedDatabases.REDSHIFT: if not load_library("psycopg2"): @@ -198,15 +419,17 @@ def _add_sqlalchemy_datasource(context): "url": sqlalchemy_url } - context.save_config_variable(data_source_name, credentials) + context.save_config_variable(datasource_name, credentials) message = """ Cannot connect to the database. - Please check your environment and the configuration you provided. - Database Error: {0:s}""" try: - configuration = SqlAlchemyDatasource.build_configuration(credentials="${" + data_source_name + "}") - context.add_datasource(name=data_source_name, class_name='SqlAlchemyDatasource', **configuration) + cli_message("Attempting to connect to your database. 
This may take a moment...") + configuration = SqlAlchemyDatasource.build_configuration(credentials="${" + datasource_name + "}") + context.add_datasource(name=datasource_name, class_name='SqlAlchemyDatasource', **configuration) + cli_message(msg_success_database) break except ModuleNotFoundError as de: cli_message(message.format(str(de))) @@ -218,31 +441,27 @@ def _add_sqlalchemy_datasource(context): "Enter the credentials again?".format(str(de)), default=True ): - context.add_datasource(data_source_name, + context.add_datasource(datasource_name, initialize=False, module_name="great_expectations.datasource", class_name="SqlAlchemyDatasource", data_asset_type={ "class_name": "SqlAlchemyDataset"}, - credentials="${" + data_source_name + "}", - generators={ - "default": { - "class_name": "TableGenerator" - } - } + credentials="${" + datasource_name + "}", ) + # TODO this message about continuing may not be accurate cli_message( """ We saved datasource {0:s} in {1:s} and the credentials you entered in {2:s}. -Since we could not connect to the database, you can complete troubleshooting in the configuration files. Read here: +Since we could not connect to the database, you can complete troubleshooting in the configuration files documented here: https://docs.greatexpectations.io/en/latest/tutorials/add-sqlalchemy-datasource.html?utm_source=cli&utm_medium=init&utm_campaign={3:s}#{4:s} . -After you connect to the datasource, run great_expectations profile to continue. +After you connect to the datasource, run great_expectations init to continue. -""".format(data_source_name, DataContext.GE_YML, context.get_project_config().get("config_variables_file_path"), rtd_url_ge_version, selected_database.value.lower())) +""".format(datasource_name, DataContext.GE_YML, context.get_config()["config_variables_file_path"], rtd_url_ge_version, selected_database.value.lower())) return None - return data_source_name + return datasource_name def _collect_postgres_credentials(default_credentials={}): @@ -250,19 +469,19 @@ def _collect_postgres_credentials(default_credentials={}): "drivername": "postgres" } - credentials["host"] = click.prompt("What is the host for the sqlalchemy connection?", + credentials["host"] = click.prompt("What is the host for the postgres connection?", default=default_credentials.get("host", "localhost"), show_default=True) - credentials["port"] = click.prompt("What is the port for the sqlalchemy connection?", + credentials["port"] = click.prompt("What is the port for the postgres connection?", default=default_credentials.get("port", "5432"), show_default=True) - credentials["username"] = click.prompt("What is the username for the sqlalchemy connection?", + credentials["username"] = click.prompt("What is the username for the postgres connection?", default=default_credentials.get("username", "postgres"), show_default=True) - credentials["password"] = click.prompt("What is the password for the sqlalchemy connection?", + credentials["password"] = click.prompt("What is the password for the postgres connection?", default="", show_default=False, hide_input=True) - credentials["database"] = click.prompt("What is the database name for the sqlalchemy connection?", + credentials["database"] = click.prompt("What is the database name for the postgres connection?", default=default_credentials.get("database", "postgres"), show_default=True) @@ -299,12 +518,11 @@ def _collect_snowflake_credentials(default_credentials={}): # show_default=True) credentials["query"] = {} - 
credentials["query"]["warehouse_name"] = click.prompt("What is warehouse name for the snowflake connection?", - default=default_credentials.get("warehouse_name", ""), - show_default=True) - credentials["query"]["role_name"] = click.prompt("What is role name for the snowflake connection?", - default=default_credentials.get("role_name", ""), - show_default=True) + credentials["query"]["warehouse"] = click.prompt("What is warehouse name for the snowflake connection?", + default=default_credentials.get("warehouse", ""), + show_default=True) + credentials["query"]["role"] = click.prompt("What is role name for the snowflake connection?", + default=default_credentials.get("role", ""), show_default=True) return credentials @@ -355,13 +573,13 @@ def _collect_redshift_credentials(default_credentials={}): default=default_credentials.get("port", "5439"), show_default=True) credentials["username"] = click.prompt("What is the username for the Redshift connection?", - default=default_credentials.get("username", "postgres"), + default=default_credentials.get("username", ""), show_default=True) credentials["password"] = click.prompt("What is the password for the Redshift connection?", default="", show_default=False, hide_input=True) credentials["database"] = click.prompt("What is the database name for the Redshift connection?", - default=default_credentials.get("database", "postgres"), + default=default_credentials.get("database", ""), show_default=True) # optional @@ -373,35 +591,527 @@ def _collect_redshift_credentials(default_credentials={}): return credentials -def _add_spark_datasource(context): +def _add_spark_datasource(context, passthrough_generator_only=True, prompt_for_datasource_name=True): + if not load_library("pyspark"): + return None + + if passthrough_generator_only: + datasource_name = "files_spark_datasource" + + # configuration = SparkDFDatasource.build_configuration(generators={ + # "default": { + # "class_name": "PassthroughGenerator", + # } + # } + # ) + configuration = SparkDFDatasource.build_configuration() + + else: + path = click.prompt( + msg_prompt_filesys_enter_base_path, + # default='/data/', + type=click.Path( + exists=True, + file_okay=False, + dir_okay=True, + readable=True + ), + show_default=True + ) + if path.startswith("./"): + path = path[2:] + + if path.endswith("/"): + basenamepath = path[:-1] + else: + basenamepath = path + + datasource_name = os.path.basename(basenamepath) + "__dir" + if prompt_for_datasource_name: + datasource_name = click.prompt( + msg_prompt_datasource_name, + default=datasource_name, + show_default=True + ) + + configuration = SparkDFDatasource.build_configuration(generators={ + "subdir_reader": { + "class_name": "SubdirReaderBatchKwargsGenerator", + "base_directory": os.path.join("..", path) + } +} +) + + + context.add_datasource(name=datasource_name, class_name='SparkDFDatasource', **configuration) + return datasource_name + + +def select_datasource(context, datasource_name=None): + msg_prompt_select_data_source = "Select a datasource" + msg_no_datasources_configured = "No datasources found in the context. To add a datasource, run `great_expectations datasource new`" + + data_source = None + + if datasource_name is None: + data_sources = sorted(context.list_datasources(), key=lambda x: x["name"]) + if len(data_sources) == 0: + cli_message(msg_no_datasources_configured) + elif len(data_sources) ==1: + datasource_name = data_sources[0]["name"] + else: + choices = "\n".join([" {}. 
{}".format(i, data_source["name"]) for i, data_source in enumerate(data_sources, 1)]) + option_selection = click.prompt( + msg_prompt_select_data_source + "\n" + choices + "\n", + type=click.Choice([str(i) for i, data_source in enumerate(data_sources, 1)]), + show_choices=False + ) + datasource_name = data_sources[int(option_selection)-1]["name"] + + if datasource_name is not None: + data_source = context.get_datasource(datasource_name) + + return data_source + +def select_generator(context, datasource_name, available_data_assets_dict=None): + msg_prompt_select_generator = "Select generator" + + if available_data_assets_dict is None: + available_data_assets_dict = context.get_available_data_asset_names(datasource_names=datasource_name) + + available_data_asset_names_by_generator = {} + for key, value in available_data_assets_dict[datasource_name].items(): + if len(value["names"]) > 0: + available_data_asset_names_by_generator[key] = value["names"] + + if len(available_data_asset_names_by_generator.keys()) == 0: + return None + elif len(available_data_asset_names_by_generator.keys()) == 1: + return list(available_data_asset_names_by_generator.keys())[0] + else: # multiple generators + generator_names = list(available_data_asset_names_by_generator.keys()) + choices = "\n".join([" {}. {}".format(i, generator_name) for i, generator_name in enumerate(generator_names, 1)]) + option_selection = click.prompt( + msg_prompt_select_generator + "\n" + choices, + type=click.Choice([str(i) for i, generator_name in enumerate(generator_names, 1)]), + show_choices=False + ) + generator_name = generator_names[int(option_selection)-1] + + return generator_name + + +# TODO this method needs testing +def get_batch_kwargs(context, + datasource_name=None, + generator_name=None, + generator_asset=None, + additional_batch_kwargs=None): + """ + This method manages the interaction with user necessary to obtain batch_kwargs for a batch of a data asset. + + In order to get batch_kwargs this method needs datasource_name, generator_name and generator_asset + to combine them into a fully qualified data asset identifier(datasource_name/generator_name/generator_asset). + All three arguments are optional. If they are present, the method uses their values. Otherwise, the method + prompts user to enter them interactively. Since it is possible for any of these three components to be + passed to this method as empty values and to get their values after interacting with user, this method + returns these components' values in case they changed. + + If the datasource has generators that can list available data asset names, the method lets user choose a name + from that list (note: if there are multiple generators, user has to choose one first). If a name known to + the chosen generator is selected, the generator will be able to yield batch_kwargs. The method also gives user + an alternative to selecting the data asset name from the generator's list - user can type in a name for their + data asset. In this case a passthrough batch kwargs generator will be used to construct a fully qualified data asset + identifier (note: if the datasource has no passthrough generator configured, the method will exist with a failure). + Since no generator can yield batch_kwargs for this data asset name, the method prompts user to specify batch_kwargs + by choosing a file (if the datasource is pandas or spark) or by writing a SQL query (if the datasource points + to a database). 
+    try:
+        available_data_assets_dict = context.get_available_data_asset_names(datasource_names=datasource_name)
+    except ValueError:
+        # the datasource has no generators
+        available_data_assets_dict = {datasource_name: {}}
+
+    data_source = select_datasource(context, datasource_name=datasource_name)
+    datasource_name = data_source.name
+
+    if generator_name is None:
+        generator_name = select_generator(context, datasource_name,
+                                          available_data_assets_dict=available_data_assets_dict)
+
+    # if the user provided us with the generator name and the generator asset, we have everything we need -
+    # let's ask the generator to build batch kwargs for this asset - we are done.
+    if generator_name is not None and generator_asset is not None:
+        generator = data_source.get_generator(generator_name)
+        batch_kwargs = generator.build_batch_kwargs(generator_asset, **(additional_batch_kwargs or {}))
+        return (datasource_name, generator_name, generator_asset, batch_kwargs)
+
+    if isinstance(context.get_datasource(datasource_name), (PandasDatasource, SparkDFDatasource)):
+        generator_asset, batch_kwargs = _get_batch_kwargs_from_generator_or_from_file_path(
+            context,
+            datasource_name,
+            generator_name=generator_name,
+        )
+
+    elif isinstance(context.get_datasource(datasource_name), SqlAlchemyDatasource):
+        generator_asset, batch_kwargs = _load_query_as_data_asset_from_sqlalchemy_datasource(context,
+                                                                                             datasource_name,
+                                                                                             generator_name=generator_name,
+                                                                                             additional_batch_kwargs=additional_batch_kwargs)
+    else:
+        raise ge_exceptions.DataContextError("Datasource {0:s} is expected to be a PandasDatasource, SparkDFDatasource, or SqlAlchemyDatasource, but is {1:s}".format(datasource_name, str(type(context.get_datasource(datasource_name)))))
+
+    return (datasource_name, generator_name, generator_asset, batch_kwargs)
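# Illustrative sketch (editorial example, not part of this changeset): driving
# create_expectation_suite below with every batch argument pre-filled, so that
# only the profiler confirmation prompt remains interactive. All names are
# hypothetical.

def _example_create_suite(context):
    return create_expectation_suite(
        context,
        datasource_name="my_postgres_db",
        generator_name="default",
        generator_asset="npi",
        batch_kwargs={"table": "npi", "datasource": "my_postgres_db"},
        expectation_suite_name="npi.warning",
        open_docs=False,
    )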
+
+
+def create_expectation_suite(
+    context,
+    datasource_name=None,
+    generator_name=None,
+    generator_asset=None,
+    batch_kwargs=None,
+    expectation_suite_name=None,
+    additional_batch_kwargs=None,
+    show_intro_message=False,
+    open_docs=False
+):
+    """
+    Create a new expectation suite.
+
+    :param context:
+    :param datasource_name:
+    :param generator_name:
+    :param generator_asset:
+    :param batch_kwargs:
+    :param expectation_suite_name:
+    :param additional_batch_kwargs:
+    :return: a tuple: (success, suite name)
+    """
+
+    msg_intro = """
+========== Create sample Expectations ==========
+
+
+"""
+
+    msg_some_data_assets_not_found = """Some of the data assets you specified were not found: {0:s}
+    """
+
+    msg_prompt_what_will_profiler_do = """
+Great Expectations will choose a couple of columns and generate expectations about them
+to demonstrate some examples of assertions you can make about your data.
+
+Press Enter to continue
+"""
+
+    msg_prompt_expectation_suite_name = """
+Name the new expectation suite"""
+
+    msg_data_doc_intro = """
+========== Data Docs =========="""
+
+    msg_suite_already_exists = "An expectation suite named `{}` already exists. If you intend to edit the suite, please use `great_expectations suite edit {}`."
+
+    if show_intro_message:
+        cli_message(msg_intro)
+
+    data_source = select_datasource(context, datasource_name=datasource_name)
+    if data_source is None:
+        # select_datasource takes care of displaying an error message, so all that is left here is to exit.
+        sys.exit(1)
+
+    datasource_name = data_source.name
+
+    existing_suite_names = [expectation_suite_id.expectation_suite_name for expectation_suite_id in context.list_expectation_suites()]
+
+    if expectation_suite_name in existing_suite_names:
+        cli_message(
+            msg_suite_already_exists.format(
+                expectation_suite_name,
+                expectation_suite_name
+            )
+        )
+        sys.exit(1)
+
+    if generator_name is None or generator_asset is None or batch_kwargs is None:
+        datasource_name, generator_name, generator_asset, batch_kwargs = get_batch_kwargs(
+            context,
+            datasource_name=datasource_name,
+            generator_name=generator_name,
+            generator_asset=generator_asset,
+            additional_batch_kwargs=additional_batch_kwargs)
+        # In this case, we have "consumed" the additional_batch_kwargs
+        additional_batch_kwargs = {}
+
+    if expectation_suite_name is None:
+        if generator_asset:
+            default_expectation_suite_name = "{}.warning".format(generator_asset)
+        elif "query" in batch_kwargs:
+            default_expectation_suite_name = "query.warning"
+        else:
+            default_expectation_suite_name = "warning"
+        while True:
+            expectation_suite_name = click.prompt(msg_prompt_expectation_suite_name, default=default_expectation_suite_name, show_default=True)
+            if expectation_suite_name in existing_suite_names:
+                cli_message(
+                    msg_suite_already_exists.format(
+                        expectation_suite_name,
+                        expectation_suite_name
+                    )
+                )
+            else:
+                break
+
+    profiler = SampleExpectationsDatasetProfiler
+
+    click.prompt(msg_prompt_what_will_profiler_do, default=True, show_default=False)
+
+    cli_message("\nGenerating example Expectation Suite...")
+    run_id = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")
+
+    profiling_results = context.profile_data_asset(
+        datasource_name,
+        generator_name=generator_name,
+        data_asset_name=generator_asset,
+        batch_kwargs=batch_kwargs,
+        profiler=profiler,
+        expectation_suite_name=expectation_suite_name,
+        run_id=run_id,
+        additional_batch_kwargs=additional_batch_kwargs
+    )
+
+    if profiling_results['success']:
+        build_docs(context, view=False)
+        if open_docs:  # This is mostly to keep tests from spawning windows
+            try:
+                # TODO this is really brittle and not covered in tests
+                validation_result = profiling_results["results"][0][1]
+                validation_result_identifier = ValidationResultIdentifier.from_object(validation_result)
+                context.open_data_docs(resource_identifier=validation_result_identifier)
+            except (KeyError, IndexError):
+                context.open_data_docs()
+
+        return True, expectation_suite_name
+
+    if profiling_results['error']['code'] == DataContext.PROFILING_ERROR_CODE_SPECIFIED_DATA_ASSETS_NOT_FOUND:
+        raise ge_exceptions.DataContextError(msg_some_data_assets_not_found.format(",".join(profiling_results['error']['not_found_data_assets'])))
+    if not profiling_results['success']:  # unknown error
+        raise ge_exceptions.DataContextError("Unknown profiling error code: " + profiling_results['error']['code'])
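# Illustrative sketch (editorial example, not part of this changeset): how the
# file-path flow below resolves a reader method before fetching a batch. The
# datasource name and path are hypothetical.

def _example_file_batch(context):
    datasource = context.get_datasource("files_datasource")
    batch_kwargs = {"path": "/data/npi.csv", "datasource": "files_datasource"}
    try:
        # guess_reader_method_from_path maps extensions like .csv or .parquet
        # to a reader method, as the helper below relies on
        batch_kwargs["reader_method"] = datasource.guess_reader_method_from_path(
            batch_kwargs["path"]
        )["reader_method"]
    except BatchKwargsError:
        pass  # unknown extension; the helper below falls back to prompting
    return datasource.get_batch(batch_kwargs=batch_kwargs)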
+
+
+def _get_batch_kwargs_from_generator_or_from_file_path(context, datasource_name,
+                                                       generator_name=None,
+                                                       additional_batch_kwargs={}):
+    msg_prompt_generator_or_file_path = """
+Would you like to:
+    1. choose from a list of data assets in this datasource
+    2. enter the path of a data file
+"""
+    msg_prompt_file_path = """
+Enter the path (relative or absolute) of a data file
+"""
+
+    msg_prompt_enter_data_asset_name = "\nWhich data would you like to use?\n"
+
+    msg_prompt_enter_data_asset_name_suffix = "    Don't see the name of the data asset in the list above? Just type it\n"
+
+    msg_prompt_file_type = """
+We could not determine the format of the file. What is it?
+    1. CSV
+    2. Parquet
+    3. Excel
+    4. JSON
+"""
+
+    reader_method_file_extensions = {
+        "1": "csv",
+        "2": "parquet",
+        "3": "xlsx",
+        "4": "json",
+    }
+
+    generator_asset = None
+
+    datasource = context.get_datasource(datasource_name)
+    if generator_name is not None:
+        generator = datasource.get_generator(generator_name)
+
+        option_selection = click.prompt(
+            msg_prompt_generator_or_file_path,
+            type=click.Choice(["1", "2"]),
+            show_choices=False
+        )
+
+        if option_selection == "1":
+
+            available_data_asset_names = sorted(generator.get_available_data_asset_names()["names"], key=lambda x: x[0])
+            available_data_asset_names_str = ["{} ({})".format(name[0], name[1]) for name in
+                                              available_data_asset_names]
+
+            data_asset_names_to_display = available_data_asset_names_str[:50]
+            choices = "\n".join(["    {}. {}".format(i, name) for i, name in enumerate(data_asset_names_to_display, 1)])
+            prompt = msg_prompt_enter_data_asset_name + choices + "\n" + msg_prompt_enter_data_asset_name_suffix.format(
+                len(data_asset_names_to_display))
+
+            generator_asset_selection = click.prompt(prompt, default=None, show_default=False)
+
+            generator_asset_selection = generator_asset_selection.strip()
+            try:
+                data_asset_index = int(generator_asset_selection) - 1
+                try:
+                    generator_asset = \
+                        [name[0] for name in available_data_asset_names][data_asset_index]
+                except IndexError:
+                    pass
+            except ValueError:
+                generator_asset = generator_asset_selection
+
+            batch_kwargs = generator.build_batch_kwargs(generator_asset, **additional_batch_kwargs)
+            return (generator_asset, batch_kwargs)
+
+    # No generator name was passed or the user chose to enter a file path
+
+    # We should allow a directory for Spark, but not for Pandas
+    dir_okay = isinstance(datasource, SparkDFDatasource)
+
     path = click.prompt(
-        msg_prompt_filesys_enter_base_path,
-        # default='/data/',
+        msg_prompt_file_path,
         type=click.Path(
             exists=True,
-            file_okay=False,
-            dir_okay=True,
+            file_okay=True,
+            dir_okay=dir_okay,
             readable=True
         ),
         show_default=True
     )
-    if path.startswith("./"):
-        path = path[2:]
-    if path.endswith("/"):
-        path = path[:-1]
-    default_data_source_name = os.path.basename(path) + "__dir"
-    data_source_name = click.prompt(
-        msg_prompt_datasource_name, default=default_data_source_name, show_default=True)
+    path = os.path.abspath(path)
-    configuration = SparkDFDatasource.build_configuration(base_directory=os.path.join("..", path))
-    context.add_datasource(name=data_source_name, class_name='SparkDFDatasource', **configuration)
-    return data_source_name
+    batch_kwargs = {
+        "path": path,
+        "datasource": datasource_name
+    }
+
+    reader_method = None
+    try:
+        reader_method = datasource.guess_reader_method_from_path(path)["reader_method"]
+    except BatchKwargsError:
+        pass
+
+    if reader_method is None:
+
+        while True:
+
+            option_selection = click.prompt(
+                msg_prompt_file_type,
+                type=click.Choice(["1", "2", "3", "4"]),
+                show_choices=False
+            )
+
+            try:
+                reader_method = datasource.guess_reader_method_from_path(path + "." + reader_method_file_extensions[option_selection])["reader_method"]
+            except BatchKwargsError:
+                pass
+
+            if reader_method is not None:
+                batch_kwargs["reader_method"] = reader_method
+                batch = datasource.get_batch(batch_kwargs=batch_kwargs)
+                break
+    else:
+        # TODO: read the file and confirm with the user that we read it correctly (headers, columns, etc.)
+        batch = datasource.get_batch(batch_kwargs=batch_kwargs)
+
+    return (generator_asset, batch_kwargs)
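# Illustrative sketch (editorial example, not part of this changeset): the SQL
# flow below validates a query by materializing a throwaway batch, which
# surfaces SQL errors early. The query and datasource name are hypothetical.

def _example_query_batch_kwargs(context):
    batch_kwargs = {"query": "SELECT * FROM npi LIMIT 10", "datasource": "my_postgres_db"}
    datasource = context.get_datasource("my_postgres_db")
    Validator(
        batch=datasource.get_batch(batch_kwargs),
        expectation_suite=ExpectationSuite("throwaway"),
    ).get_dataset()
    return batch_kwargs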
+
+
+def _load_query_as_data_asset_from_sqlalchemy_datasource(context, datasource_name,
+                                                         generator_name=None,
+                                                         additional_batch_kwargs=None):
+    msg_prompt_query = """
+Enter an SQL query
+"""
+    msg_prompt_data_asset_name = """
+Give your new data asset a short name
+"""
+    msg_prompt_enter_data_asset_name = "\nWhich table would you like to use? (Choose one)\n"
+
+    msg_prompt_enter_data_asset_name_suffix = "    Don't see the table in the list above? Just type the SQL query\n"
+
+    if additional_batch_kwargs is None:
+        additional_batch_kwargs = {}
+
+    generator_asset = None
+
+    datasource = context.get_datasource(datasource_name)
+
+    temp_generator = TableBatchKwargsGenerator(name="temp", datasource=datasource)
+
+    available_data_asset_names = temp_generator.get_available_data_asset_names()["names"]
+    available_data_asset_names_str = ["{} ({})".format(name[0], name[1]) for name in
+                                      available_data_asset_names]
+
+    data_asset_names_to_display = available_data_asset_names_str[:5]
+    choices = "\n".join(["    {}. {}".format(i, name) for i, name in enumerate(data_asset_names_to_display, 1)])
+    prompt = msg_prompt_enter_data_asset_name + choices + "\n" + msg_prompt_enter_data_asset_name_suffix.format(
+        len(data_asset_names_to_display))
+
+    while True:
+        try:
+            query = None
+
+            if len(available_data_asset_names) > 0:
+                selection = click.prompt(prompt, default=None, show_default=False)
+
+                selection = selection.strip()
+                try:
+                    data_asset_index = int(selection) - 1
+                    try:
+                        generator_asset = \
+                            [name[0] for name in available_data_asset_names][data_asset_index]
+                    except IndexError:
+                        pass
+                except ValueError:
+                    query = selection
+
+            else:
+                query = click.prompt(msg_prompt_query, default=None, show_default=False)
+
+            if query is None:
+                batch_kwargs = temp_generator.build_batch_kwargs(generator_asset, **additional_batch_kwargs)
+            else:
+                batch_kwargs = {
+                    "query": query,
+                    "datasource": datasource_name
+                }
+
+            Validator(batch=datasource.get_batch(batch_kwargs), expectation_suite=ExpectationSuite("throwaway")).get_dataset()
+
+            break
+        except ge_exceptions.GreatExpectationsError as error:
+            cli_message("""ERROR: {}""".format(str(error)))
+        except KeyError as error:
+            cli_message("""ERROR: {}""".format(str(error)))
+
+    return generator_asset, batch_kwargs


 def profile_datasource(
     context,
-    data_source_name,
+    datasource_name,
+    generator_name=None,
     data_assets=None,
     profile_all_data_assets=False,
     max_data_assets=20,
@@ -409,6 +1119,11 @@
     open_docs=False,
 ):
     """Profile a named datasource using the specified context"""
+    # Note we are explicitly not using a logger in all CLI output to have
+    # more control over console UI.
+    logging.getLogger(
+        "great_expectations.profile.basic_dataset_profiler"
+    ).setLevel(logging.INFO)

     msg_intro = """
 ========== Profiling ==========
@@ -418,7 +1133,7 @@
     msg_confirm_ok_to_proceed = """Would you like to profile '{0:s}'?"""

     msg_skipping = "Skipping profiling for now. You can always do this later " \
-        "by running `great_expectations profile`."
+ "by running `great_expectations datasource profile`." msg_some_data_assets_not_found = """Some of the data assets you specified were not found: {0:s} """ @@ -426,6 +1141,14 @@ def profile_datasource( msg_too_many_data_assets = """There are {0:d} data assets in {1:s}. Profiling all of them might take too long. """ + msg_error_multiple_generators_found = """More than one batch kwarg generators found in datasource {0:s}. +Specify the one you want the profiler to use in generator_name argument. +""" + + msg_error_no_generators_found = """No batch kwarg generators can list available data assets in datasource {0:s}. +The datasource might be empty or a generator not configured in the config file. +""" + msg_prompt_enter_data_asset_list = """Enter comma-separated list of data asset names (e.g., {0:s}) """ @@ -440,14 +1163,15 @@ def profile_datasource( Great Expectations is building Data Docs from the data you just profiled!""" - cli_message(msg_intro.format(data_source_name, rtd_url_ge_version)) + cli_message(msg_intro.format(datasource_name, rtd_url_ge_version)) if data_assets: data_assets = [item.strip() for item in data_assets.split(",")] # Call the data context's profiling method to check if the arguments are valid profiling_results = context.profile_datasource( - data_source_name, + datasource_name, + generator_name=generator_name, data_assets=data_assets, profile_all_data_assets=profile_all_data_assets, max_data_assets=max_data_assets, @@ -455,16 +1179,17 @@ def profile_datasource( additional_batch_kwargs=additional_batch_kwargs ) - if profiling_results['success']: # data context is ready to profile - run profiling - if data_assets or profile_all_data_assets or click.confirm(msg_confirm_ok_to_proceed.format(data_source_name), default=True): + if profiling_results["success"] is True: # data context is ready to profile - run profiling + if data_assets or profile_all_data_assets or click.confirm(msg_confirm_ok_to_proceed.format(datasource_name), default=True): profiling_results = context.profile_datasource( - data_source_name, - data_assets=data_assets, - profile_all_data_assets=profile_all_data_assets, - max_data_assets=max_data_assets, - dry_run=False, - additional_batch_kwargs=additional_batch_kwargs - ) + datasource_name, + generator_name=generator_name, + data_assets=data_assets, + profile_all_data_assets=profile_all_data_assets, + max_data_assets=max_data_assets, + dry_run=False, + additional_batch_kwargs=additional_batch_kwargs + ) else: cli_message(msg_skipping) return @@ -474,7 +1199,15 @@ def profile_datasource( if profiling_results['error']['code'] == DataContext.PROFILING_ERROR_CODE_SPECIFIED_DATA_ASSETS_NOT_FOUND: cli_message(msg_some_data_assets_not_found.format("," .join(profiling_results['error']['not_found_data_assets']))) elif profiling_results['error']['code'] == DataContext.PROFILING_ERROR_CODE_TOO_MANY_DATA_ASSETS: - cli_message(msg_too_many_data_assets.format(profiling_results['error']['num_data_assets'], data_source_name)) + cli_message(msg_too_many_data_assets.format(profiling_results['error']['num_data_assets'], datasource_name)) + elif profiling_results['error']['code'] == DataContext.PROFILING_ERROR_CODE_MULTIPLE_GENERATORS_FOUND: + cli_message( + msg_error_multiple_generators_found.format(datasource_name)) + sys.exit(1) + elif profiling_results['error']['code'] == DataContext.PROFILING_ERROR_CODE_NO_GENERATOR_FOUND: + cli_message( + msg_error_no_generators_found.format(datasource_name)) + sys.exit(1) else: # unknown error raise ValueError("Unknown profiling error 
code: " + profiling_results['error']['code']) @@ -486,7 +1219,7 @@ def profile_datasource( if option_selection == "1": data_assets = click.prompt( - msg_prompt_enter_data_asset_list.format(", ".join(profiling_results['error']['data_assets'][:3])), + msg_prompt_enter_data_asset_list.format(", ".join([data_asset[0] for data_asset in profiling_results['error']['data_assets']][:3])), default=None, show_default=False ) @@ -494,6 +1227,7 @@ def profile_datasource( data_assets = [item.strip() for item in data_assets.split(",")] elif option_selection == "3": profile_all_data_assets = True + data_assets = None elif option_selection == "2": # skip cli_message(msg_skipping) return @@ -503,7 +1237,8 @@ def profile_datasource( # after getting the arguments from the user, let's try to run profiling again # (no dry run this time) profiling_results = context.profile_datasource( - data_source_name, + datasource_name, + generator_name=generator_name, data_assets=data_assets, profile_all_data_assets=profile_all_data_assets, max_data_assets=max_data_assets, @@ -511,39 +1246,15 @@ def profile_datasource( additional_batch_kwargs=additional_batch_kwargs ) - if profiling_results['success']: # data context is ready to profile + if profiling_results["success"]: # data context is ready to profile break cli_message(msg_data_doc_intro.format(rtd_url_ge_version)) - build_docs(context) + build_docs(context, view=open_docs) if open_docs: # This is mostly to keep tests from spawning windows context.open_data_docs() -def build_docs(context, site_name=None): - """Build documentation in a context""" - logger.debug("Starting cli.datasource.build_docs") - - cli_message("Building Data Docs...") - - if site_name is not None: - site_names = [site_name] - else: - site_names = None - - index_page_locator_infos = context.build_data_docs(site_names=site_names) - - msg = "The following Data Docs sites were generated:\n" - for site_name, index_page_locator_info in index_page_locator_infos.items(): - if os.path.isfile(index_page_locator_info): - msg += "- " + site_name + ":\n" - msg += " file://" + index_page_locator_info + "\n" - else: - msg += site_name + "\n" - - cli_message(msg) - - msg_prompt_choose_datasource = """Configure a datasource: 1. Pandas DataFrame 2. Relational database (SQL) @@ -553,7 +1264,7 @@ def build_docs(context, site_name=None): msg_prompt_choose_database = """ -Which database? +Which database backend are you using? {} """.format("\n".join([" {}. {}".format(i, db.value) for i, db in enumerate(SupportedDatabases, 1)])) @@ -569,8 +1280,7 @@ def build_docs(context, site_name=None): # """ msg_prompt_filesys_enter_base_path = """ -Enter the path of the root directory where the data files are stored. -(The path may be either absolute or relative to current directory.) +Enter the path (relative or absolute) of the root directory where the data files are stored. """ msg_prompt_datasource_name = """ @@ -578,18 +1288,11 @@ def build_docs(context, site_name=None): """ msg_db_config = """ -Next, we will configure database credentials and store them in the "{0:s}" section -of this config file: great_expectations/uncommitted/credentials/profiles.yml: +Next, we will configure database credentials and store them in the `{0:s}` section +of this config file: great_expectations/uncommitted/config_variables.yml: """ msg_unknown_data_source = """ Do we not have the type of data source you want? - Please create a GitHub issue here so we can discuss it! 
- https://github.com/great-expectations/great_expectations/issues/new""" - -# TODO also maybe add validation playground notebook or wait for the full onboarding flow -MSG_GO_TO_NOTEBOOK = """ -To create expectations for your data, start Jupyter and open a tutorial notebook: - - To launch with jupyter notebooks: - jupyter notebook great_expectations/notebooks/{}/create_expectations.ipynb -""" diff --git a/great_expectations/cli/docs.py b/great_expectations/cli/docs.py new file mode 100644 index 000000000000..fa20c1570e6d --- /dev/null +++ b/great_expectations/cli/docs.py @@ -0,0 +1,82 @@ +import os +import sys + +import click + +from great_expectations import DataContext +from great_expectations import exceptions as ge_exceptions +from great_expectations.cli.cli_logging import logger +from great_expectations.cli.util import ( + _offer_to_install_new_template, + cli_message, +) + + +@click.group() +def docs(): + """data docs operations""" + pass + + +@docs.command(name="build") +@click.option( + "--directory", + "-d", + default=None, + help="The project's great_expectations directory.", +) +@click.option( + "--site-name", + "-s", + help="The site for which to generate documentation. See data_docs section in great_expectations.yml", +) +@click.option( + "--view/--no-view", + help="By default open in browser unless you specify the --no-view flag", + default=True, +) +def docs_build(directory, site_name, view=True): + """Build Data Docs for a project.""" + try: + context = DataContext(directory) + build_docs(context, site_name=site_name, view=view) + except ge_exceptions.ConfigNotFoundError as err: + cli_message("{}".format(err.message)) + sys.exit(1) + except ge_exceptions.ZeroDotSevenConfigVersionError as err: + _offer_to_install_new_template(err, context.root_directory) + return + except ge_exceptions.PluginModuleNotFoundError as err: + cli_message(err.cli_colored_message) + sys.exit(1) + except ge_exceptions.PluginClassNotFoundError as err: + cli_message(err.cli_colored_message) + sys.exit(1) + + +def build_docs(context, site_name=None, view=True): + """Build documentation in a context""" + logger.debug("Starting cli.datasource.build_docs") + + cli_message("Building Data Docs...") + + if site_name is not None: + site_names = [site_name] + else: + site_names = None + + index_page_locator_infos = context.build_data_docs(site_names=site_names) + + msg = "The following Data Docs sites were built:\n" + for site_name, index_page_locator_info in index_page_locator_infos.items(): + if os.path.isfile(index_page_locator_info): + msg += "- " + site_name + ":\n" + msg += " file://" + index_page_locator_info + "\n" + else: + msg += site_name + "\n" + + msg = msg.rstrip("\n") + cli_message(msg) + + if view: + context.open_data_docs() diff --git a/great_expectations/cli/init.py b/great_expectations/cli/init.py new file mode 100644 index 000000000000..e44ee66e0e83 --- /dev/null +++ b/great_expectations/cli/init.py @@ -0,0 +1,171 @@ +import os +import sys + +import click + +from great_expectations import DataContext +from great_expectations import exceptions as ge_exceptions +from great_expectations.cli.datasource import \ + add_datasource as add_datasource_impl +from great_expectations.cli.datasource import \ + create_expectation_suite as create_expectation_suite_impl +from great_expectations.cli.docs import build_docs +from great_expectations.cli.init_messages import ( + BUILD_DOCS_PROMPT, + GREETING, + LETS_BEGIN_PROMPT, + ONBOARDING_COMPLETE, + PROJECT_IS_COMPLETE, + RUN_INIT_AGAIN, + SETUP_SUCCESS, + 
SLACK_LATER, + SLACK_SETUP_COMPLETE, + SLACK_SETUP_INTRO, + SLACK_SETUP_PROMPT, + SLACK_WEBHOOK_PROMPT, +) +from great_expectations.cli.util import cli_message, is_sane_slack_webhook +from great_expectations.exceptions import ( + DataContextError, + DatasourceInitializationError, +) + +try: + from sqlalchemy.exc import SQLAlchemyError +except ImportError: + # We'll redefine this error in code below to catch ProfilerError, which is caught above, so SA errors will + # just fall through + SQLAlchemyError = ge_exceptions.ProfilerError + + +@click.command() +@click.option( + "--target-directory", + "-d", + default="./", + help="The root of the project directory where you want to initialize Great Expectations.", +) +@click.option( + # Note this --no-view option is mostly here for tests + "--view/--no-view", + help="By default open in browser unless you specify the --no-view flag", + default=True, +) +def init(target_directory, view): + """ + Initialize a new Great Expectations project. + + This guided input walks the user through setting up a new project and also + onboards a new developer in an existing project. + + It scaffolds directories, sets up notebooks, creates a project file, and + appends to a `.gitignore` file. + """ + target_directory = os.path.abspath(target_directory) + ge_dir = _get_full_path_to_ge_dir(target_directory) + + cli_message(GREETING) + + if DataContext.does_config_exist_on_disk(ge_dir): + try: + if DataContext.is_project_initialized(ge_dir): + # Ensure the context can be instantiated + cli_message(PROJECT_IS_COMPLETE) + except (DataContextError, DatasourceInitializationError) as e: + cli_message("{}".format(e.message)) + sys.exit(1) + else: + try: + context = DataContext.create(target_directory) + cli_message(ONBOARDING_COMPLETE) + # TODO if this is correct, ensure this is covered by a test + # cli_message(SETUP_SUCCESS) + # exit(0) + except DataContextError as e: + cli_message("{}".format(e.message)) + # TODO ensure this is covered by a test + exit(5) + else: + if not click.confirm(LETS_BEGIN_PROMPT, default=True): + cli_message(RUN_INIT_AGAIN) + # TODO ensure this is covered by a test + exit(0) + + try: + context = DataContext.create(target_directory) + except DataContextError as e: + # TODO ensure this is covered by a test + cli_message("{}".format(e)) + + try: + # if expectations exist, offer to build docs + context = DataContext(ge_dir) + if context.list_expectation_suites(): + if click.confirm(BUILD_DOCS_PROMPT, default=True): + build_docs(context, view=view) + + else: + datasources = context.list_datasources() + if len(datasources) == 0: + datasource_name, data_source_type = add_datasource_impl(context, choose_one_data_asset=True) + if not datasource_name: # no datasource was created + sys.exit(1) + + datasources = context.list_datasources() + if len(datasources) == 1: + datasource_name = datasources[0]["name"] + + success, suite_name = create_expectation_suite_impl( + context, + datasource_name=datasource_name, + show_intro_message=False, + additional_batch_kwargs={"limit": 1000}, + open_docs=view, + ) + if success: + cli_message( + "A new Expectation suite '{}' was added to your project".format(suite_name) + ) + + cli_message(SETUP_SUCCESS) + sys.exit(0) + except ( + DataContextError, + ge_exceptions.ProfilerError, + IOError, + SQLAlchemyError + ) as e: + cli_message("{}".format(e)) + sys.exit(1) + + +def _slack_setup(context): + webhook_url = None + cli_message(SLACK_SETUP_INTRO) + if not click.confirm(SLACK_SETUP_PROMPT, default=True): + 
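+        # Declining is non-destructive: nothing is saved, and the user can
+        # re-run Slack setup later, so we return the context unchanged.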
cli_message(SLACK_LATER) + return context + else: + webhook_url = click.prompt(SLACK_WEBHOOK_PROMPT, default="") + + while not is_sane_slack_webhook(webhook_url): + cli_message("That URL was not valid.\n") + if not click.confirm(SLACK_SETUP_PROMPT, default=True): + cli_message(SLACK_LATER) + return context + webhook_url = click.prompt(SLACK_WEBHOOK_PROMPT, default="") + + context.save_config_variable("validation_notification_slack_webhook", webhook_url) + cli_message(SLACK_SETUP_COMPLETE) + + return context + + +def _get_full_path_to_ge_dir(target_directory): + return os.path.abspath(os.path.join(target_directory, DataContext.GE_DIR)) + + +def _complete_onboarding(target_dir): + DataContext.create(target_dir) + cli_message(ONBOARDING_COMPLETE) + return True diff --git a/great_expectations/cli/init_messages.py b/great_expectations/cli/init_messages.py index 3772e994f89c..2629b42a2bc5 100644 --- a/great_expectations/cli/init_messages.py +++ b/great_expectations/cli/init_messages.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -from great_expectations import rtd_url_ge_version, DataContext - -# !!! This injects a version tag into the docs. We should test that those versioned docs exist in RTD. +from great_expectations import DataContext GREETING = """\ ___ _ ___ _ _ _ @@ -10,9 +8,11 @@ \___|_| \___\__,_|\__| |___/_\_\ .__/\___\__|\__\__,_|\__|_\___/_||_/__/ |_| ~ Always know what to expect from your data ~ -""".format(rtd_url_ge_version) +""" + +LETS_BEGIN_PROMPT = """In a few minutes you will see Great Expectations in action on your data! -LETS_BEGIN_PROMPT = """Let's add Great Expectations to your project, by scaffolding a new great_expectations directory like this: +First, Great Expectations will create a new directory: great_expectations ├── expectations @@ -26,6 +26,7 @@ └── uncommitted ├── config_variables.yml └── ... + OK to proceed?""" PROJECT_IS_COMPLETE = "This looks like an existing project that appears complete! You are ready to roll.\n" @@ -38,7 +39,7 @@ SLACK_SETUP_INTRO = """ ========== Slack Notifications ========== -""".format(rtd_url_ge_version) +""" SLACK_SETUP_PROMPT = "Would you like to set up Slack data quality notifications?" @@ -54,8 +55,9 @@ OK. Slack is set up. To modify this in the future please see the slack section in the CLI init getting started guide.""" ONBOARDING_COMPLETE = """ -Done. You may see new files in `great_expectations/uncommitted`. - - Now add secrets to great_expectations/uncommitted/config_variables.yml to finish onboarding. +Great Expectations added some missing files required to run. + - You may see new files in `great_expectations/uncommitted`. + - You may need to add secrets to great_expectations/uncommitted/config_variables.yml to finish onboarding. """ BUILD_DOCS_PROMPT = "Would you like to build & view this project's Data Docs!?" @@ -70,6 +72,8 @@ """ NO_DATASOURCES_FOUND = """Error: No datasources were found. Please add one by: - - running `great_expectations add-datasource` or + - running `great_expectations datasource new` or - by editing the {} file """.format(DataContext.GE_YML) + +SETUP_SUCCESS = "\nGreat Expectations is now set up." 
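Note for reviewers: the new `docs build` command above is a thin wrapper around `build_docs`, so the same behavior is available from Python. A minimal sketch, assuming an already-initialized project at the illustrative path `./great_expectations`:

    from great_expectations import DataContext
    from great_expectations.cli.docs import build_docs

    # Equivalent of `great_expectations docs build --no-view`: build every
    # configured Data Docs site without opening a browser afterwards.
    context = DataContext("./great_expectations")
    build_docs(context, site_name=None, view=False)

Passing `site_name` (for example a hypothetical "local_site" entry from the data_docs section of great_expectations.yml) limits the build to that one site, mirroring the `--site-name` flag.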
diff --git a/great_expectations/cli/project.py b/great_expectations/cli/project.py
new file mode 100644
index 000000000000..1f270ab330b2
--- /dev/null
+++ b/great_expectations/cli/project.py
@@ -0,0 +1,55 @@
+import sys
+
+import click
+
+from great_expectations import DataContext
+from great_expectations import exceptions as ge_exceptions
+from great_expectations.cli.util import (
+    _offer_to_install_new_template,
+    cli_message,
+)
+
+
+@click.group()
+def project():
+    """project operations"""
+    pass
+
+
+@project.command(name="check-config")
+@click.option(
+    "--directory",
+    "-d",
+    default="./great_expectations",
+    help="The project's great_expectations directory.",
+)
+def project_check_config(directory):
+    """Check a config for validity and help with migrations."""
+    cli_message("Checking your config files for validity...\n")
+
+    try:
+        is_config_ok, error_message = do_config_check(directory)
+        if is_config_ok:
+            cli_message("Your config file appears valid!")
+        else:
+            cli_message("Unfortunately, your config appears to be invalid:\n")
+            cli_message("{}".format(error_message))
+            sys.exit(1)
+    except ge_exceptions.ZeroDotSevenConfigVersionError as err:
+        _offer_to_install_new_template(err, directory)
+
+
+def do_config_check(target_directory):
+    try:
+        DataContext(context_root_dir=target_directory)
+        return True, None
+    except (
+        ge_exceptions.InvalidConfigurationYamlError,
+        ge_exceptions.InvalidTopLevelConfigKeyError,
+        ge_exceptions.MissingTopLevelConfigKeyError,
+        ge_exceptions.InvalidConfigValueTypeError,
+        ge_exceptions.UnsupportedConfigVersionError,
+        ge_exceptions.DataContextError,
+        ge_exceptions.PluginClassNotFoundError,
+    ) as err:
+        return False, err.message
diff --git a/great_expectations/cli/suite.py b/great_expectations/cli/suite.py
new file mode 100644
index 000000000000..b017b856d36f
--- /dev/null
+++ b/great_expectations/cli/suite.py
@@ -0,0 +1,230 @@
+import json
+import os
+import subprocess
+import sys
+
+import click
+
+from great_expectations import DataContext
+from great_expectations import exceptions as ge_exceptions
+from great_expectations.cli.cli_logging import logger
+from great_expectations.cli.datasource import (
+    create_expectation_suite as create_expectation_suite_impl,
+    select_datasource,
+    get_batch_kwargs
+)
+from great_expectations.cli.util import (
+    _offer_to_install_new_template,
+    cli_message,
+)
+from great_expectations.data_asset import DataAsset
+from great_expectations.render.renderer.notebook_renderer import (
+    NotebookRenderer,
+)
+
+try:
+    json_parse_exception = json.decoder.JSONDecodeError
+except AttributeError:  # Python 2
+    json_parse_exception = ValueError
+
+try:
+    from sqlalchemy.exc import SQLAlchemyError
+except ImportError:
+    # If SQLAlchemy is not installed, alias SQLAlchemyError to ProfilerError,
+    # which the except clauses below already handle, so SQLAlchemy errors
+    # simply fall through.
+    SQLAlchemyError = ge_exceptions.ProfilerError
+
+
+@click.group()
+def suite():
+    """expectation suite operations"""
+    pass
+
+
+@suite.command(name="edit")
+@click.argument("suite")
+@click.option(
+    "--datasource",
+    "-ds",
+    default=None,
+    help="""The name of the datasource. It must contain a single BatchKwargsGenerator that can list the data assets in the datasource."""
+)
+@click.option(
+    "--batch-kwargs",
+    default=None,
+    help="""Batch_kwargs that specify the batch of data to be used as a sample when editing the suite. Must be a valid JSON dictionary.
+Make sure to escape quotes.
Example: "{\"datasource\": \"my_db\", \"query\": \"select * from my_table\"}" +""", +) +@click.option( + "--directory", + "-d", + default=None, + help="The project's great_expectations directory.", +) +@click.option( + "--jupyter/--no-jupyter", + is_flag=True, + help="By default launch jupyter notebooks unless you specify the --no-jupyter flag", + default=True, +) +def suite_edit(suite, datasource, directory, jupyter, batch_kwargs): + """ + Generate a Jupyter notebook for editing an existing expectation suite. + + The SUITE argument is required. This is the name you gave to the suite + when you created it. + + A batch of data is required to edit the suite, which is used as a sample. + + The edit command will help you specify a batch interactively. Or you can + specify them manually by providing --batch-kwargs in valid JSON format. + + Read more about specifying batches of data in the documentation: https://docs.greatexpectations.io/ + """ + try: + context = DataContext(directory) + except ge_exceptions.ConfigNotFoundError as err: + cli_message("{}".format(err.message)) + return + except ge_exceptions.ZeroDotSevenConfigVersionError as err: + _offer_to_install_new_template(err, context.root_directory) + return + + suite = _load_suite(context, suite) + + if batch_kwargs: + try: + batch_kwargs = json.loads(batch_kwargs) + if datasource: + batch_kwargs["datasource"] = datasource + _batch = context.get_batch(batch_kwargs, suite.expectation_suite_name) + assert isinstance(_batch, DataAsset) + except json_parse_exception as je: + cli_message("Please check that your batch_kwargs are valid JSON.\n{}".format(je)) + sys.exit(1) + except ge_exceptions.DataContextError: + cli_message("Please check that your batch_kwargs are able to load a batch.") + sys.exit(1) + except ValueError as ve: + cli_message("Please check that your batch_kwargs are able to load a batch.\n{}".format(ve)) + sys.exit(1) + else: + cli_message(""" +A batch of data is required to edit the suite - let's help you to specify it.""" + ) + + additional_batch_kwargs = None + try: + data_source = select_datasource(context, datasource_name=datasource) + except ValueError as ve: + cli_message("{}".format(ve)) + sys.exit(1) + + if not data_source: + cli_message("No datasources found in the context.") + sys.exit(1) + + if batch_kwargs is None: + datasource_name, batch_kwarg_generator, data_asset, batch_kwargs = get_batch_kwargs( + context, + datasource_name=data_source.name, + generator_name=None, + generator_asset=None, + additional_batch_kwargs=additional_batch_kwargs + ) + + notebook_name = "{}.ipynb".format(suite.expectation_suite_name) + + notebook_path = os.path.join(context.root_directory, context.GE_EDIT_NOTEBOOK_DIR, notebook_name) + NotebookRenderer().render_to_disk(suite, batch_kwargs, notebook_path) + + cli_message( + "To continue editing this suite, run jupyter notebook {}".format( + notebook_path + ) + ) + + if jupyter: + subprocess.call(["jupyter", "notebook", notebook_path]) + + +def _load_suite(context, suite_name): + if suite_name.endswith(".json"): + suite_name = suite_name[:-5] + try: + suite = context.get_expectation_suite(suite_name) + return suite + except ge_exceptions.DataContextError as e: + cli_message( + "Could not find a suite named `{}`. 
Please check the name and try again.".format(
+                suite_name
+            )
+        )
+        logger.info(e)
+        sys.exit(1)
+
+
+@suite.command(name="new")
+@click.option("--suite", "-es", default=None, help="Expectation suite name.")
+@click.option(
+    "--directory",
+    "-d",
+    default=None,
+    help="The project's great_expectations directory.",
+)
+@click.option(
+    "--view/--no-view",
+    help="By default open in browser unless you specify the --no-view flag",
+    default=True
+)
+@click.option(
+    "--batch-kwargs",
+    default=None,
+    help="Additional keyword arguments to be provided to get_batch when loading the data asset. Must be a valid JSON dictionary",
+)
+def suite_new(suite, directory, view, batch_kwargs):
+    """
+    Create a new expectation suite.
+
+    Great Expectations will choose a couple of columns and generate expectations about them
+    to demonstrate some examples of assertions you can make about your data.
+    """
+    try:
+        context = DataContext(directory)
+    except ge_exceptions.ConfigNotFoundError as err:
+        cli_message("{}".format(err.message))
+        return
+    except ge_exceptions.ZeroDotSevenConfigVersionError as err:
+        _offer_to_install_new_template(err, context.root_directory)
+        return
+
+    if batch_kwargs is not None:
+        batch_kwargs = json.loads(batch_kwargs)
+
+    datasource_name = None
+    generator_name = None
+    generator_asset = None
+
+    try:
+        success, suite_name = create_expectation_suite_impl(
+            context,
+            datasource_name=datasource_name,
+            generator_name=generator_name,
+            generator_asset=generator_asset,
+            batch_kwargs=batch_kwargs,
+            expectation_suite_name=suite,
+            additional_batch_kwargs={"limit": 1000},
+            show_intro_message=False,
+            open_docs=view,
+        )
+        if success:
+            cli_message("A new Expectation suite '{}' was added to your project".format(suite_name))
+    except (
+        ge_exceptions.DataContextError,
+        ge_exceptions.ProfilerError,
+        IOError,
+        SQLAlchemyError
+    ) as e:
+        cli_message("{}".format(e))
+        sys.exit(1)
diff --git a/great_expectations/cli/util.py b/great_expectations/cli/util.py
index 2935f5f27c5c..ea8451a5dc5b 100644
--- a/great_expectations/cli/util.py
+++ b/great_expectations/cli/util.py
@@ -1,5 +1,16 @@
-import six
+import os
 import re
+import shutil
+import sys
+
+import click
+import six
+
+from great_expectations import DataContext
+from great_expectations.cli.init_messages import (
+    NEW_TEMPLATE_INSTALLED,
+    NEW_TEMPLATE_PROMPT,
+)

 try:
     from termcolor import colored
@@ -9,36 +20,21 @@ def cli_message(string):
     # the DOTALL flag means that `.` includes newlines for multiline comments inside these tags
-    flags=re.DOTALL
+    flags = re.DOTALL
     mod_string = re.sub(
-        "<blue>(.*?)</blue>",
-        colored("\g<1>", "blue"),
-        string,
-        flags=flags
+        "<blue>(.*?)</blue>", colored("\g<1>", "blue"), string, flags=flags
     )
     mod_string = re.sub(
-        "<cyan>(.*?)</cyan>",
-        colored("\g<1>", "cyan"),
-        mod_string,
-        flags=flags
+        "<cyan>(.*?)</cyan>", colored("\g<1>", "cyan"), mod_string, flags=flags
     )
     mod_string = re.sub(
-        "<green>(.*?)</green>",
-        colored("\g<1>", "green"),
-        mod_string,
-        flags=flags
+        "<green>(.*?)</green>", colored("\g<1>", "green"), mod_string, flags=flags
     )
     mod_string = re.sub(
-        "<yellow>(.*?)</yellow>",
-        colored("\g<1>", "yellow"),
-        mod_string,
-        flags=flags
+        "<yellow>(.*?)</yellow>", colored("\g<1>", "yellow"), mod_string, flags=flags
     )
     mod_string = re.sub(
-        "<red>(.*?)</red>",
-        colored("\g<1>", "red"),
-        mod_string,
-        flags=flags
+        "<red>(.*?)</red>", colored("\g<1>", "red"), mod_string, flags=flags
     )

     six.print_(colored(mod_string))

@@ -50,3 +46,28 @@ def is_sane_slack_webhook(url):
         return False

     return "https://hooks.slack.com/" in url.strip()
+
+
+def _offer_to_install_new_template(err, ge_dir):
+    ge_dir =
os.path.abspath(ge_dir) + cli_message("{}".format(err.message)) + ge_yml = os.path.join(ge_dir, DataContext.GE_YML) + archived_yml = ge_yml + ".archive" + + if click.confirm(NEW_TEMPLATE_PROMPT.format(ge_yml, archived_yml), default=True): + # archive existing project config + shutil.move(ge_yml, archived_yml) + DataContext.write_project_template_to_disk(ge_dir) + + cli_message( + NEW_TEMPLATE_INSTALLED.format("file://" + ge_yml, "file://" + archived_yml) + ) + else: + cli_message( + """\nOK. To continue, you will need to upgrade your config file to the latest format. + - Please see the docs here: https://docs.greatexpectations.io/en/latest/reference/data_context_reference.html + - We are super sorry about this breaking change! :] + - If you are running into any problems, please reach out on Slack and we can + help you in realtime: https://greatexpectations.io/slack""" + ) + sys.exit(0) diff --git a/great_expectations/core/__init__.py b/great_expectations/core/__init__.py new file mode 100644 index 000000000000..0bca519f1312 --- /dev/null +++ b/great_expectations/core/__init__.py @@ -0,0 +1,806 @@ +import logging +import json +# PYTHON 2 - py2 - update to ABC direct use rather than __metaclass__ once we drop py2 support +from collections import namedtuple +from copy import deepcopy + +from six import string_types + +from marshmallow import Schema, fields, ValidationError, post_load, pre_dump + +from great_expectations import __version__ as ge_version +from great_expectations.core.id_dict import IDDict +from great_expectations.core.util import nested_update +from great_expectations.types import DictDot + +from great_expectations.exceptions import InvalidExpectationConfigurationError, InvalidExpectationKwargsError, \ + UnavailableMetricError, ParserError + +logger = logging.getLogger(__name__) + +RESULT_FORMATS = [ + "BOOLEAN_ONLY", + "BASIC", + "COMPLETE", + "SUMMARY" +] + +EvaluationParameterIdentifier = namedtuple("EvaluationParameterIdentifier", ["expectation_suite_name", "metric_name", + "metric_kwargs_id"]) + + +def get_metric_kwargs_id(metric_name, metric_kwargs): + ### + # + # WARNING + # WARNING + # THIS IS A PLACEHOLDER UNTIL WE HAVE REFACTORED EXPECTATIONS TO HANDLE THIS LOGIC THEMSELVES + # WE ARE NO WORSE OFF THAN THE PREVIOUS SYSTEM, BUT NOT FULLY CUSTOMIZABLE + # WARNING + # WARNING + # + ### + if "metric_kwargs_id" in metric_kwargs: + return metric_kwargs["metric_kwargs_id"] + if "column" in metric_kwargs: + return "column=" + metric_kwargs.get("column") + return None + + +def parse_evaluation_parameter_urn(urn): + if urn.startswith("urn:great_expectations:validations:"): + split = urn.split(":") + if len(split) == 6: + return EvaluationParameterIdentifier(split[3], split[4], split[5]) + elif len(split) == 5: + return EvaluationParameterIdentifier(split[3], split[4], None) + else: + raise ParserError("Unable to parse URN: must have 5 or 6 components to be a valid GE URN") + + raise ParserError("Unrecognized evaluation parameter urn {}".format(urn)) + + +def convert_to_json_serializable(data): + """ + Helper function to convert an object to one that is json serializable + + Args: + data: an object to attempt to convert a corresponding json-serializable object + + Returns: + (dict) A converted test_object + + Warning: + test_obj may also be converted in place. 
+ + """ + import numpy as np + import pandas as pd + from six import string_types, integer_types + import datetime + import decimal + import sys + + # If it's one of our types, we use our own conversion; this can move to full schema + # once nesting goes all the way down + if isinstance(data, (ExpectationConfiguration, ExpectationSuite, ExpectationValidationResult, + ExpectationSuiteValidationResult)): + return data.to_json_dict() + + try: + if not isinstance(data, list) and np.isnan(data): + # np.isnan is functionally vectorized, but we only want to apply this to single objects + # Hence, why we test for `not isinstance(list))` + return None + except TypeError: + pass + except ValueError: + pass + + if isinstance(data, (string_types, integer_types, float, bool)): + # No problem to encode json + return data + + elif isinstance(data, dict): + new_dict = {} + for key in data: + # A pandas index can be numeric, and a dict key can be numeric, but a json key must be a string + new_dict[str(key)] = convert_to_json_serializable(data[key]) + + return new_dict + + elif isinstance(data, (list, tuple, set)): + new_list = [] + for val in data: + new_list.append(convert_to_json_serializable(val)) + + return new_list + + elif isinstance(data, (np.ndarray, pd.Index)): + # test_obj[key] = test_obj[key].tolist() + # If we have an array or index, convert it first to a list--causing coercion to float--and then round + # to the number of digits for which the string representation will equal the float representation + return [convert_to_json_serializable(x) for x in data.tolist()] + + # Note: This clause has to come after checking for np.ndarray or we get: + # `ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()` + elif data is None: + # No problem to encode json + return data + + elif isinstance(data, (datetime.datetime, datetime.date)): + return data.isoformat() + + # Use built in base type from numpy, https://docs.scipy.org/doc/numpy-1.13.0/user/basics.types.html + # https://github.com/numpy/numpy/pull/9505 + elif np.issubdtype(type(data), np.bool_): + return bool(data) + + elif np.issubdtype(type(data), np.integer) or np.issubdtype(type(data), np.uint): + return int(data) + + elif np.issubdtype(type(data), np.floating): + # Note: Use np.floating to avoid FutureWarning from numpy + return float(round(data, sys.float_info.dig)) + + elif isinstance(data, pd.Series): + # Converting a series is tricky since the index may not be a string, but all json + # keys must be strings. So, we use a very ugly serialization strategy + index_name = data.index.name or "index" + value_name = data.name or "value" + return [{ + index_name: convert_to_json_serializable(idx), + value_name: convert_to_json_serializable(val) + } for idx, val in data.iteritems()] + + elif isinstance(data, pd.DataFrame): + return convert_to_json_serializable(data.to_dict(orient='records')) + + elif isinstance(data, decimal.Decimal): + if not (-1e-55 < decimal.Decimal.from_float(float(data)) - data < 1e-55): + logger.warning("Using lossy conversion for decimal %s to float object to support serialization." % str( + data)) + return float(data) + + else: + raise TypeError('%s is of type %s which cannot be serialized.' 
% ( + str(data), type(data).__name__)) + + +def ensure_json_serializable(data): + """ + Helper function to convert an object to one that is json serializable + + Args: + data: an object to attempt to convert a corresponding json-serializable object + + Returns: + (dict) A converted test_object + + Warning: + test_obj may also be converted in place. + + """ + import numpy as np + import pandas as pd + from six import string_types, integer_types + import datetime + import decimal + + # If it's one of our types, we use our own conversion; this can move to full schema + # once nesting goes all the way down + if isinstance(data, (ExpectationConfiguration, ExpectationSuite, ExpectationValidationResult, + ExpectationSuiteValidationResult)): + return + + try: + if not isinstance(data, list) and np.isnan(data): + # np.isnan is functionally vectorized, but we only want to apply this to single objects + # Hence, why we test for `not isinstance(list))` + return + except TypeError: + pass + except ValueError: + pass + + if isinstance(data, (string_types, integer_types, float, bool)): + # No problem to encode json + return + + elif isinstance(data, dict): + for key in data: + str(key) # key must be cast-able to string + ensure_json_serializable(data[key]) + + return + + elif isinstance(data, (list, tuple, set)): + for val in data: + ensure_json_serializable(val) + return + + elif isinstance(data, (np.ndarray, pd.Index)): + # test_obj[key] = test_obj[key].tolist() + # If we have an array or index, convert it first to a list--causing coercion to float--and then round + # to the number of digits for which the string representation will equal the float representation + _ = [ensure_json_serializable(x) for x in data.tolist()] + return + + # Note: This clause has to come after checking for np.ndarray or we get: + # `ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()` + elif data is None: + # No problem to encode json + return + + elif isinstance(data, (datetime.datetime, datetime.date)): + return + + # Use built in base type from numpy, https://docs.scipy.org/doc/numpy-1.13.0/user/basics.types.html + # https://github.com/numpy/numpy/pull/9505 + elif np.issubdtype(type(data), np.bool_): + return + + elif np.issubdtype(type(data), np.integer) or np.issubdtype(type(data), np.uint): + return + + elif np.issubdtype(type(data), np.floating): + # Note: Use np.floating to avoid FutureWarning from numpy + return + + elif isinstance(data, pd.Series): + # Converting a series is tricky since the index may not be a string, but all json + # keys must be strings. 
So, we use a very ugly serialization strategy + index_name = data.index.name or "index" + value_name = data.name or "value" + _ = [{index_name: ensure_json_serializable(idx), value_name: ensure_json_serializable(val)} + for idx, val in data.iteritems()] + return + elif isinstance(data, pd.DataFrame): + return ensure_json_serializable(data.to_dict(orient='records')) + + elif isinstance(data, decimal.Decimal): + return + + else: + raise InvalidExpectationConfigurationError('%s is of type %s which cannot be serialized to json' % ( + str(data), type(data).__name__)) + + +class ExpectationKwargs(dict): + ignored_keys = ['result_format', 'include_config', 'catch_exceptions'] + + """ExpectationKwargs store information necessary to evaluate an expectation.""" + def __init__(self, *args, **kwargs): + include_config = kwargs.pop("include_config", None) + if include_config is not None and not isinstance(include_config, bool): + raise InvalidExpectationKwargsError("include_config must be a boolean value") + + result_format = kwargs.get("result_format", None) + if result_format is None: + pass + elif result_format in RESULT_FORMATS: + pass + elif isinstance(result_format, dict) and result_format.get('result_format', None) in RESULT_FORMATS: + pass + else: + raise InvalidExpectationKwargsError("result format must be one of the valid formats: %s" + % str(RESULT_FORMATS)) + + catch_exceptions = kwargs.pop("catch_exceptions", None) + if catch_exceptions is not None and not isinstance(catch_exceptions, bool): + raise InvalidExpectationKwargsError("catch_exceptions must be a boolean value") + + super(ExpectationKwargs, self).__init__(*args, **kwargs) + ensure_json_serializable(self) + + def isEquivalentTo(self, other): + try: + n_self_keys = len([k for k in self.keys() if k not in self.ignored_keys]) + n_other_keys = len([k for k in other.keys() if k not in self.ignored_keys]) + return n_self_keys == n_other_keys and all([ + self[k] == other[k] for k in self.keys() if k not in self.ignored_keys + ]) + except KeyError: + return False + + def __repr__(self): + return json.dumps(self.to_json_dict()) + + def __str__(self): + return json.dumps(self.to_json_dict(), indent=2) + + def to_json_dict(self): + myself = convert_to_json_serializable(self) + return myself + + +class ExpectationConfiguration(DictDot): + """ExpectationConfiguration defines the parameters and name of a specific expectation.""" + + def __init__(self, expectation_type, kwargs, meta=None, success_on_last_run=None): + if not isinstance(expectation_type, string_types): + raise InvalidExpectationConfigurationError("expectation_type must be a string") + self._expectation_type = expectation_type + if not isinstance(kwargs, dict): + raise InvalidExpectationConfigurationError("expectation configuration kwargs must be an " + "ExpectationKwargs object.") + self._kwargs = ExpectationKwargs(kwargs) + if meta is None: + meta = {} + # We require meta information to be serializable, but do not convert until necessary + ensure_json_serializable(meta) + self.meta = meta + self.success_on_last_run = success_on_last_run + + @property + def expectation_type(self): + return self._expectation_type + + @property + def kwargs(self): + return self._kwargs + + def isEquivalentTo(self, other): + """ExpectationConfiguration equivalence does not include meta, and relies on *equivalence* of kwargs.""" + if not isinstance(other, self.__class__): + if isinstance(other, dict): + try: + other = expectationConfigurationSchema.load(other).data + except ValidationError: + 
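+                    # The dict could not be loaded as an ExpectationConfiguration;
+                    # equivalence cannot be evaluated, so fall through to NotImplemented.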
logger.debug("Unable to evaluate equivalence of ExpectationConfiguration object with dict because " + "dict other could not be instantiated as an ExpectationConfiguration") + return NotImplemented + else: + # Delegate comparison to the other instance + return NotImplemented + return all(( + self.expectation_type == other.expectation_type, + self.kwargs.isEquivalentTo(other.kwargs) + )) + + def __eq__(self, other): + """ExpectationConfiguration equality does include meta, but ignores instance identity.""" + if not isinstance(other, self.__class__): + # Delegate comparison to the other instance's __eq__. + return NotImplemented + return all(( + self.expectation_type == other.expectation_type, + self.kwargs == other.kwargs, + self.meta == other.meta + )) + + def __ne__(self, other): + # By using the == operator, the returned NotImplemented is handled correctly. + return not self == other + + def __repr__(self): + return json.dumps(self.to_json_dict()) + + def __str__(self): + return json.dumps(self.to_json_dict(), indent=2) + + def to_json_dict(self): + myself = expectationConfigurationSchema.dump(self).data + # NOTE - JPC - 20191031: migrate to expectation-specific schemas that subclass result with properly-typed + # schemas to get serialization all-the-way down via dump + myself['kwargs'] = convert_to_json_serializable(myself['kwargs']) + return myself + + def get_evaluation_parameter_dependencies(self): + dependencies = {} + for key, value in self.kwargs.items(): + if isinstance(value, dict) and '$PARAMETER' in value: + if value["$PARAMETER"].startswith("urn:great_expectations:validations:"): + try: + evaluation_parameter_id = parse_evaluation_parameter_urn(value["$PARAMETER"]) + except ParserError: + logger.warning("Unable to parse great_expectations urn {}".format(value["$PARAMETER"])) + continue + + if evaluation_parameter_id.metric_kwargs_id is None: + nested_update(dependencies, { + evaluation_parameter_id.expectation_suite_name: [evaluation_parameter_id.metric_name] + }) + else: + nested_update(dependencies, { + evaluation_parameter_id.expectation_suite_name: [{ + "metric_kwargs_id": { + evaluation_parameter_id.metric_kwargs_id: [evaluation_parameter_id.metric_name] + } + }] + }) + # if evaluation_parameter_id.expectation_suite_name not in dependencies: + # dependencies[evaluation_parameter_id.expectation_suite_name] = {"metric_kwargs_id": {}} + # + # if evaluation_parameter_id.metric_kwargs_id not in dependencies[evaluation_parameter_id.expectation_suite_name]["metric_kwargs_id"]: + # dependencies[evaluation_parameter_id.expectation_suite_name]["metric_kwargs_id"][evaluation_parameter_id.metric_kwargs_id] = [] + # dependencies[evaluation_parameter_id.expectation_suite_name]["metric_kwargs_id"][ + # evaluation_parameter_id.metric_kwargs_id].append(evaluation_parameter_id.metric_name) + + return dependencies + + +class ExpectationConfigurationSchema(Schema): + expectation_type = fields.Str( + required=True, + error_messages={"required": "expectation_type missing in expectation configuration"} + ) + kwargs = fields.Dict() + meta = fields.Dict() + + # noinspection PyUnusedLocal + @post_load + def make_expectation_configuration(self, data, **kwargs): + return ExpectationConfiguration(**data) + + +# TODO: re-enable once we can allow arbitrary keys but still add this sort of validation +# class MetaDictSchema(Schema): +# """The MetaDict """ +# +# # noinspection PyUnusedLocal +# @validates_schema +# def validate_json_serializable(self, data, **kwargs): +# import json +# try: +# 
json.dumps(data) +# except (TypeError, OverflowError): +# raise ValidationError("meta information must be json serializable.") + + +class ExpectationSuite(object): + def __init__( + self, + expectation_suite_name, + expectations=None, + evaluation_parameters=None, + data_asset_type=None, + meta=None + ): + self.expectation_suite_name = expectation_suite_name + if expectations is None: + expectations = [] + self.expectations = [ExpectationConfiguration(**expectation) if isinstance(expectation, dict) else + expectation for expectation in expectations] + if evaluation_parameters is None: + evaluation_parameters = {} + self.evaluation_parameters = evaluation_parameters + self.data_asset_type = data_asset_type + if meta is None: + meta = {"great_expectations.__version__": ge_version} + # We require meta information to be serializable, but do not convert until necessary + ensure_json_serializable(meta) + self.meta = meta + + def isEquivalentTo(self, other): + """ + ExpectationSuite equivalence relies only on expectations and evaluation parameters. It does not include: + - data_asset_name + - expectation_suite_name + - meta + - data_asset_type + """ + if not isinstance(other, self.__class__): + if isinstance(other, dict): + try: + other = expectationSuiteSchema.load(other).data + except ValidationError: + logger.debug("Unable to evaluate equivalence of ExpectationConfiguration object with dict because " + "dict other could not be instantiated as an ExpectationConfiguration") + return NotImplemented + else: + # Delegate comparison to the other instance + return NotImplemented + return all( + [mine.isEquivalentTo(theirs) for (mine, theirs) in zip(self.expectations, other.expectations)] + ) + + def __eq__(self, other): + """ExpectationSuite equality ignores instance identity, relying only on properties.""" + if not isinstance(other, self.__class__): + # Delegate comparison to the other instance's __eq__. + return NotImplemented + return all(( + self.expectation_suite_name == other.expectation_suite_name, + self.expectations == other.expectations, + self.evaluation_parameters == other.evaluation_parameters, + self.data_asset_type == other.data_asset_type, + self.meta == other.meta, + )) + + def __ne__(self, other): + # By using the == operator, the returned NotImplemented is handled correctly. 
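+        # Python 2 does not derive __ne__ from __eq__, so it is defined explicitly here.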
+ return not self == other + + def __repr__(self): + return json.dumps(self.to_json_dict(), indent=2) + + def __str__(self): + return json.dumps(self.to_json_dict(), indent=2) + + def to_json_dict(self): + myself = expectationSuiteSchema.dump(self).data + # NOTE - JPC - 20191031: migrate to expectation-specific schemas that subclass result with properly-typed + # schemas to get serialization all-the-way down via dump + myself['expectations'] = convert_to_json_serializable(myself['expectations']) + try: + myself['evaluation_parameters'] = convert_to_json_serializable(myself['evaluation_parameters']) + except KeyError: + pass # Allow evaluation parameters to be missing if empty + myself['meta'] = convert_to_json_serializable(myself['meta']) + return myself + + def get_evaluation_parameter_dependencies(self): + dependencies = {} + for expectation in self.expectations: + t = expectation.get_evaluation_parameter_dependencies() + nested_update(dependencies, t) + + return dependencies + + +class ExpectationSuiteSchema(Schema): + expectation_suite_name = fields.Str() + expectations = fields.List(fields.Nested(ExpectationConfigurationSchema)) + evaluation_parameters = fields.Dict(allow_none=True) + data_asset_type = fields.Str(allow_none=True) + meta = fields.Dict() + + # NOTE: 20191107 - JPC - we may want to remove clean_empty and update tests to require the other fields; + # doing so could also allow us not to have to make a copy of data in the pre_dump method. + def clean_empty(self, data): + if not hasattr(data, 'evaluation_parameters'): + pass + elif len(data.evaluation_parameters) == 0: + del data.evaluation_parameters + if not hasattr(data, 'meta'): + pass + elif len(data.meta) == 0: + del data.meta + return data + + # noinspection PyUnusedLocal + @pre_dump + def prepare_dump(self, data, **kwargs): + data = deepcopy(data) + data.meta = convert_to_json_serializable(data.meta) + data = self.clean_empty(data) + return data + + # noinspection PyUnusedLocal + @post_load + def make_expectation_suite(self, data, **kwargs): + return ExpectationSuite(**data) + + +class ExpectationValidationResult(object): + def __init__(self, success=None, expectation_config=None, result=None, meta=None, exception_info=None): + self.success = success + self.expectation_config = expectation_config + # TODO: re-add + # assert_json_serializable(result, "result") + if result is None: + result = {} + self.result = result + if meta is None: + meta = {} + # We require meta information to be serializable, but do not convert until necessary + ensure_json_serializable(meta) + self.meta = meta + self.exception_info = exception_info + + def __eq__(self, other): + """ExpectationValidationResult equality ignores instance identity, relying only on properties.""" + # NOTE: JPC - 20200213 - need to spend some time thinking about whether we want to + # consistently allow dict as a comparison alternative in situations like these... + # if isinstance(other, dict): + # try: + # other = ExpectationValidationResult(**other) + # except ValueError: + # return NotImplemented + if not isinstance(other, self.__class__): + # Delegate comparison to the other instance's __eq__. + return NotImplemented + try: + return all(( + self.success == other.success, + (self.expectation_config is None and other.expectation_config is None) or + (self.expectation_config is not None and self.expectation_config.isEquivalentTo( + other.expectation_config)), + # Result is a dictionary allowed to have nested dictionaries that are still of complex types (e.g. 
+ # numpy) consequently, series' comparison can persist. Wrapping in all() ensures comparision is + # handled appropriately. + (self.result is None and other.result is None) or (all(self.result) == all(other.result)), + self.meta == other.meta, + self.exception_info == other.exception_info + )) + except (ValueError, TypeError): + # if invalid comparisons are attempted, the objects are not equal. + return False + + def __repr__(self): + return json.dumps(self.to_json_dict(), indent=2) + + def __str__(self): + return json.dumps(self.to_json_dict(), indent=2) + + def to_json_dict(self): + myself = expectationValidationResultSchema.dump(self).data + # NOTE - JPC - 20191031: migrate to expectation-specific schemas that subclass result with properly-typed + # schemas to get serialization all-the-way down via dump + if 'result' in myself: + myself['result'] = convert_to_json_serializable(myself['result']) + if 'meta' in myself: + myself['meta'] = convert_to_json_serializable(myself['meta']) + if 'exception_info' in myself: + myself['exception_info'] = convert_to_json_serializable(myself['exception_info']) + return myself + + def get_metric(self, metric_name, **kwargs): + if not self.expectation_config: + raise UnavailableMetricError("No ExpectationConfig found in this ExpectationValidationResult. Unable to " + "return a metric.") + + metric_name_parts = metric_name.split(".") + metric_kwargs_id = get_metric_kwargs_id(metric_name, kwargs) + + if metric_name_parts[0] == self.expectation_config.expectation_type: + curr_metric_kwargs = get_metric_kwargs_id(metric_name, self.expectation_config.kwargs) + if metric_kwargs_id != curr_metric_kwargs: + raise UnavailableMetricError("Requested metric_kwargs_id (%s) does not match the configuration of this " + "ExpectationValidationResult (%s)." 
% (metric_kwargs_id or "None", + curr_metric_kwargs or "None")) + if len(metric_name_parts) < 2: + raise UnavailableMetricError("Expectation-defined metrics must include a requested metric.") + elif len(metric_name_parts) == 2: + if metric_name_parts[1] == "success": + return self.success + else: + raise UnavailableMetricError("Metric name must have more than two parts for keys other than " + "success.") + elif metric_name_parts[1] == "result": + try: + if len(metric_name_parts) == 3: + return self.result.get(metric_name_parts[2]) + elif metric_name_parts[2] == "details": + return self.result["details"].get(metric_name_parts[3]) + except KeyError: + raise UnavailableMetricError("Unable to get metric {} -- KeyError in " + "ExpectationValidationResult.".format(metric_name)) + raise UnavailableMetricError("Unrecognized metric name {}".format(metric_name)) + + +class ExpectationValidationResultSchema(Schema): + success = fields.Bool() + expectation_config = fields.Nested(ExpectationConfigurationSchema) + result = fields.Dict() + meta = fields.Dict() + exception_info = fields.Dict() + + # noinspection PyUnusedLocal + @pre_dump + def convert_result_to_serializable(self, data, **kwargs): + data = deepcopy(data) + data.result = convert_to_json_serializable(data.result) + return data + + # # noinspection PyUnusedLocal + # @pre_dump + # def clean_empty(self, data, **kwargs): + # # if not hasattr(data, 'meta'): + # # pass + # # elif len(data.meta) == 0: + # # del data.meta + # # return data + # pass + + # noinspection PyUnusedLocal + @post_load + def make_expectation_validation_result(self, data, **kwargs): + return ExpectationValidationResult(**data) + + +class ExpectationSuiteValidationResult(DictDot): + def __init__(self, success=None, results=None, evaluation_parameters=None, statistics=None, meta=None): + self.success = success + if results is None: + results = [] + self.results = results + if evaluation_parameters is None: + evaluation_parameters = {} + self.evaluation_parameters = evaluation_parameters + if statistics is None: + statistics = {} + self.statistics = statistics + if meta is None: + meta = {} + ensure_json_serializable(meta) # We require meta information to be serializable. + self.meta = meta + self._metrics = {} + + def __eq__(self, other): + """ExpectationSuiteValidationResult equality ignores instance identity, relying only on properties.""" + if not isinstance(other, self.__class__): + # Delegate comparison to the other instance's __eq__. 
+ return NotImplemented + return all(( + self.success == other.success, + self.results == other.results, + self.evaluation_parameters == other.evaluation_parameters, + self.statistics == other.statistics, + self.meta == other.meta + )) + + def __repr__(self): + return json.dumps(self.to_json_dict(), indent=2) + + def __str__(self): + return json.dumps(self.to_json_dict(), indent=2) + + def to_json_dict(self): + myself = deepcopy(self) + # NOTE - JPC - 20191031: migrate to expectation-specific schemas that subclass result with properly-typed + # schemas to get serialization all-the-way down via dump + myself['evaluation_parameters'] = convert_to_json_serializable(myself['evaluation_parameters']) + myself['statistics'] = convert_to_json_serializable(myself['statistics']) + myself['meta'] = convert_to_json_serializable(myself['meta']) + myself = expectationSuiteValidationResultSchema.dump(myself).data + return myself + + def get_metric(self, metric_name, **kwargs): + metric_name_parts = metric_name.split(".") + metric_kwargs_id = get_metric_kwargs_id(metric_name, kwargs) + + metric_value = None + # Expose overall statistics + if metric_name_parts[0] == "statistics": + if len(metric_name_parts) == 2: + return self.statistics.get(metric_name_parts[1]) + else: + raise UnavailableMetricError("Unrecognized metric {}".format(metric_name)) + + # Expose expectation-defined metrics + elif metric_name_parts[0].lower().startswith("expect_"): + # Check our cache first + if (metric_name, metric_kwargs_id) in self._metrics: + return self._metrics[(metric_name, metric_kwargs_id)] + else: + for result in self.results: + try: + if metric_name_parts[0] == result.expectation_config.expectation_type: + metric_value = result.get_metric(metric_name, **kwargs) + break + except UnavailableMetricError: + pass + if metric_value is not None: + self._metrics[(metric_name, metric_kwargs_id)] = metric_value + return metric_value + + raise UnavailableMetricError("Metric {} with metric_kwargs_id {} is not available.".format(metric_name, + metric_kwargs_id)) + + +class ExpectationSuiteValidationResultSchema(Schema): + success = fields.Bool() + results = fields.List(fields.Nested(ExpectationValidationResultSchema)) + evaluation_parameters = fields.Dict() + statistics = fields.Dict() + meta = fields.Dict(allow_none=True) + + # noinspection PyUnusedLocal + @pre_dump + def prepare_dump(self, data, **kwargs): + data = deepcopy(data) + data.meta = convert_to_json_serializable(data.meta) + return data + + # noinspection PyUnusedLocal + @post_load + def make_expectation_suite_validation_result(self, data, **kwargs): + return ExpectationSuiteValidationResult(**data) + + +expectationConfigurationSchema = ExpectationConfigurationSchema(strict=True) +expectationSuiteSchema = ExpectationSuiteSchema(strict=True) +expectationValidationResultSchema = ExpectationValidationResultSchema(strict=True) +expectationSuiteValidationResultSchema = ExpectationSuiteValidationResultSchema(strict=True) diff --git a/great_expectations/core/batch.py b/great_expectations/core/batch.py new file mode 100644 index 000000000000..e6f1807017b0 --- /dev/null +++ b/great_expectations/core/batch.py @@ -0,0 +1,35 @@ +from great_expectations.types import DictDot + + +class Batch(DictDot): + def __init__(self, datasource_name, batch_kwargs, data, batch_parameters, batch_markers, data_context): + self._datasource_name = datasource_name + self._batch_kwargs = batch_kwargs + self._data = data + self._batch_parameters = batch_parameters + self._batch_markers = 
batch_markers + self._data_context = data_context + + @property + def datasource_name(self): + return self._datasource_name + + @property + def batch_kwargs(self): + return self._batch_kwargs + + @property + def data(self): + return self._data + + @property + def batch_parameters(self): + return self._batch_parameters + + @property + def batch_markers(self): + return self._batch_markers + + @property + def data_context(self): + return self._data_context diff --git a/great_expectations/core/data_context_key.py b/great_expectations/core/data_context_key.py new file mode 100644 index 000000000000..a27a7b72cae6 --- /dev/null +++ b/great_expectations/core/data_context_key.py @@ -0,0 +1,36 @@ +from abc import ABCMeta, abstractmethod + + +class DataContextKey(object): + __metaclass__ = ABCMeta + """DataContextKey objects are used to uniquely identify resources used by the DataContext. + + A DataContextKey is designed to support clear naming with multiple representations including a hashable + version making it suitable for use as the key in a dictionary. + """ + @abstractmethod + def to_tuple(self): + pass + + @classmethod + def from_tuple(cls, tuple_): + return cls(*tuple_) + + def to_fixed_length_tuple(self): + raise NotImplementedError + + @classmethod + def from_fixed_length_tuple(cls, tuple_): + raise NotImplementedError + + def __eq__(self, other): + if not isinstance(other, self.__class__): + # Delegate comparison to the other instance's __eq__. + return NotImplemented + return self.to_tuple() == other.to_tuple() + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash(self.to_tuple()) diff --git a/great_expectations/core/id_dict.py b/great_expectations/core/id_dict.py new file mode 100644 index 000000000000..3bac92485a74 --- /dev/null +++ b/great_expectations/core/id_dict.py @@ -0,0 +1,29 @@ +import hashlib +import json + + +class IDDict(dict): + _id_ignore_keys = set() + + def to_id(self, id_keys=None, id_ignore_keys=None): + if id_keys is None: + id_keys = self.keys() + if id_ignore_keys is None: + id_ignore_keys = self._id_ignore_keys + id_keys = set(id_keys) - set(id_ignore_keys) + if len(id_keys) == 0: + return None + elif len(id_keys) == 1: + key = list(id_keys)[0] + return key + "=" + str(self[key]) + + _id_dict = {k: self[k] for k in id_keys} + return hashlib.md5(json.dumps(_id_dict, sort_keys=True).encode('utf-8')).hexdigest() + + +class BatchKwargs(IDDict): + pass + + +class MetricKwargs(IDDict): + pass diff --git a/great_expectations/core/metric.py b/great_expectations/core/metric.py new file mode 100644 index 000000000000..e86dda77d7b6 --- /dev/null +++ b/great_expectations/core/metric.py @@ -0,0 +1,170 @@ +from great_expectations.core.data_context_key import DataContextKey +from great_expectations.core.id_dict import IDDict +from great_expectations.data_context.types.resource_identifiers import ExpectationSuiteIdentifier +from great_expectations.exceptions import GreatExpectationsError + + +class Metric(object): + """A Metric associates a value with some name and configuration. The specific configuration parameters that are + relevant for a given metric's identity depend on the metric. For example, the metric `column_mean` depends on a + column name. 
+ """ + def __init__(self, metric_name, metric_kwargs, metric_value): + self._metric_name = metric_name + if not isinstance(metric_kwargs, IDDict): + metric_kwargs = IDDict(metric_kwargs) + self._metric_kwargs = metric_kwargs + self._metric_value = metric_value + + @property + def metric_name(self): + return self._metric_name + + @property + def metric_kwargs(self): + return self._metric_kwargs + + @property + def metric_kwargs_id(self): + return self._metric_kwargs.to_id() + + +class MetricIdentifier(DataContextKey): + """A MetricIdentifier serves as a key to store and retrieve Metrics.""" + def __init__(self, metric_name, metric_kwargs_id): + self._metric_name = metric_name + if metric_kwargs_id is None: + metric_kwargs_id = "__" # This is a placeholder + self._metric_kwargs_id = metric_kwargs_id + + @property + def metric_name(self): + return self._metric_name + + @property + def metric_kwargs_id(self): + if self._metric_kwargs_id == "__": + return None + return self._metric_kwargs_id + + @classmethod + def from_object(cls, metric): + if not isinstance(metric, Metric): + raise GreatExpectationsError("Unable to build MetricIdentifier from object of type {} when Metric is " + "expected.".format(type(metric))) + return cls(metric.metric_name, metric.metric_kwargs_id) + + def to_fixed_length_tuple(self): + return self.to_tuple() + + def to_tuple(self): + return tuple((self.metric_name, self._metric_kwargs_id)) # We use the placeholder in to_tuple + + @classmethod + def from_fixed_length_tuple(cls, tuple_): + return cls.from_tuple(tuple_) + + @classmethod + def from_tuple(cls, tuple_): + return cls(*tuple_) + + +class BatchMetric(Metric): + """A BatchMetric is a metric associated with a particular Batch of data.""" + def __init__(self, metric_name, metric_kwargs, batch_identifier, metric_value): + super(BatchMetric, self).__init__(metric_name, metric_kwargs, metric_value) + self._batch_identifier = batch_identifier + + @property + def batch_identifier(self): + return self._batch_identifier + + +class ValidationMetric(Metric): + def __init__(self, run_id, expectation_suite_identifier, metric_name, metric_kwargs, metric_value): + super(ValidationMetric, self).__init__(metric_name, metric_kwargs, metric_value) + if not isinstance(expectation_suite_identifier, ExpectationSuiteIdentifier): + expectation_suite_identifier = ExpectationSuiteIdentifier( + expectation_suite_name=expectation_suite_identifier) + self._run_id = run_id + self._expectation_suite_identifier = expectation_suite_identifier + + @property + def run_id(self): + return self._run_id + + @property + def expectation_suite_identifier(self): + return self._expectation_suite_identifier + + +class ValidationMetricIdentifier(MetricIdentifier): + + def __init__(self, run_id, expectation_suite_identifier, metric_name, metric_kwargs_id): + super(ValidationMetricIdentifier, self).__init__(metric_name, metric_kwargs_id) + if not isinstance(expectation_suite_identifier, ExpectationSuiteIdentifier): + expectation_suite_identifier = ExpectationSuiteIdentifier( + expectation_suite_name=expectation_suite_identifier) + self._run_id = run_id + self._expectation_suite_identifier = expectation_suite_identifier + + @property + def run_id(self): + return self._run_id + + @property + def expectation_suite_identifier(self): + return self._expectation_suite_identifier + + @classmethod + def from_object(cls, validation_metric): + if not isinstance(validation_metric, ValidationMetric): + raise GreatExpectationsError("Unable to build 
ValidationMetricIdentifier from object of type {} when " + "ValidationMetric is expected.".format(type(validation_metric))) + + return cls(validation_metric.run_id, validation_metric.expectation_suite_identifier, + validation_metric.metric_name, validation_metric.metric_kwargs_id) + + def to_tuple(self): + # Note use of _metric_kwargs_id instead of metric_kwargs_id to preserve no None semantics + return tuple([self.run_id] + list(self.expectation_suite_identifier.to_tuple()) + [self.metric_name, + self._metric_kwargs_id]) + + def to_fixed_length_tuple(self): + # Note use of _metric_kwargs_id instead of metric_kwargs_id to preserve no None semantics + return tuple([self.run_id] + list(self.expectation_suite_identifier.to_fixed_length_tuple()) + + [self.metric_name, self._metric_kwargs_id]) + + def to_evaluation_parameter_urn(self): + if self._metric_kwargs_id == "__": + return "urn:great_expectations:validations:" + ":".join( + list(self.expectation_suite_identifier.to_fixed_length_tuple()) + [self.metric_name] + ) + else: + return "urn:great_expectations:validations:" + ":".join( + list(self.expectation_suite_identifier.to_fixed_length_tuple()) + + [self.metric_name, self.metric_kwargs_id] + ) + + @classmethod + def from_tuple(cls, tuple_): + if len(tuple_) < 4: + raise GreatExpectationsError("ValidationMetricIdentifier tuple must have at least four components.") + return cls( + run_id=tuple_[0], + expectation_suite_identifier=ExpectationSuiteIdentifier.from_tuple(tuple_[1:-2]), + metric_name=tuple_[-2], + metric_kwargs_id=tuple_[-1] + ) + + @classmethod + def from_fixed_length_tuple(cls, tuple_): + if len(tuple_) != 4: + raise GreatExpectationsError("ValidationMetricIdentifier fixed length tuple must have exactly four " + "components.") + return cls( + run_id=tuple_[0], + expectation_suite_identifier=ExpectationSuiteIdentifier.from_fixed_length_tuple(tuple((tuple_[1],))), + metric_name=tuple_[2], + metric_kwargs_id=tuple_[3] + ) diff --git a/great_expectations/core/util.py b/great_expectations/core/util.py new file mode 100644 index 000000000000..d9c7f91372ae --- /dev/null +++ b/great_expectations/core/util.py @@ -0,0 +1,19 @@ +try: + from collections.abc import Mapping +except ImportError: + from collections import Mapping + + +# Updated from the stack overflow version below to concatenate lists +# https://stackoverflow.com/questions/3232943/update-value-of-a-nested-dictionary-of-varying-depth +def nested_update(d, u): + for k, v in u.items(): + if isinstance(v, Mapping): + d[k] = nested_update(d.get(k, {}), v) + elif isinstance(v, list) or (k in d and isinstance(d[k], list)): + l1 = d.get(k, []) + l2 = v or [] + d[k] = l1 + l2 + else: + d[k] = v + return d diff --git a/great_expectations/data_asset/data_asset.py b/great_expectations/data_asset/data_asset.py index fc003c29086d..930c6a95de9c 100644 --- a/great_expectations/data_asset/data_asset.py +++ b/great_expectations/data_asset/data_asset.py @@ -2,11 +2,14 @@ import json import inspect import copy +import uuid from functools import wraps import traceback import warnings import logging import datetime + +from marshmallow import ValidationError from six import PY3, string_types from collections import namedtuple, Hashable, Counter, defaultdict @@ -14,9 +17,11 @@ from great_expectations.data_asset.util import ( recursively_convert_to_json_serializable, parse_result_format, - get_empty_expectation_suite ) -from great_expectations.types import DotDict +from great_expectations.core import ExpectationSuite, ExpectationConfiguration,
ExpectationValidationResult, \ + ExpectationSuiteValidationResult, expectationSuiteSchema +from great_expectations.core.id_dict import BatchKwargs +from great_expectations.exceptions import GreatExpectationsError logger = logging.getLogger(__name__) logging.captureWarnings(True) @@ -45,11 +50,11 @@ def __init__(self, *args, **kwargs): interactive_evaluation = kwargs.pop("interactive_evaluation", True) profiler = kwargs.pop("profiler", None) expectation_suite = kwargs.pop("expectation_suite", None) - data_asset_name = kwargs.pop("data_asset_name", None) expectation_suite_name = kwargs.pop("expectation_suite_name", None) data_context = kwargs.pop("data_context", None) - batch_kwargs = kwargs.pop("batch_kwargs", None) - batch_id = kwargs.pop("batch_id", None) + batch_kwargs = kwargs.pop("batch_kwargs", BatchKwargs(ge_batch_id=str(uuid.uuid1()))) + batch_parameters = kwargs.pop("batch_parameters", {}) + batch_markers = kwargs.pop("batch_markers", {}) if "autoinspect_func" in kwargs: warnings.warn("Autoinspect_func is no longer supported; use a profiler instead (migration is easy!).", @@ -60,12 +65,12 @@ def __init__(self, *args, **kwargs): } self._initialize_expectations( expectation_suite=expectation_suite, - data_asset_name=data_asset_name, expectation_suite_name=expectation_suite_name ) self._data_context = data_context - self._batch_kwargs = batch_kwargs - self._batch_id = batch_id + self._batch_kwargs = BatchKwargs(batch_kwargs) + self._batch_markers = batch_markers + self._batch_parameters = batch_parameters # This special state variable tracks whether a validation run is going on, which will disable # saving expectation config objects @@ -194,24 +199,21 @@ def wrapper(self, *args, **kwargs): # This will become the stored config expectation_args = copy.deepcopy(all_args) - if "evaluation_parameters" in self._expectation_suite: + if self._expectation_suite.evaluation_parameters: evaluation_args = self._build_evaluation_parameters( expectation_args, - self._expectation_suite["evaluation_parameters"] + self._expectation_suite.evaluation_parameters ) else: evaluation_args = self._build_evaluation_parameters( expectation_args, None) # Construct the expectation_config object - expectation_config = DotDict({ - "expectation_type": method_name, - "kwargs": expectation_args - }) - - # Add meta to our expectation_config - if meta is not None: - expectation_config["meta"] = meta + expectation_config = ExpectationConfiguration( + expectation_type=method_name, + kwargs=expectation_args, + meta=meta + ) raised_exception = False exception_traceback = None @@ -221,22 +223,23 @@ def wrapper(self, *args, **kwargs): if self._config.get("interactive_evaluation", True) or self._active_validation: try: return_obj = func(self, **evaluation_args) - + if isinstance(return_obj, dict): + return_obj = ExpectationValidationResult(**return_obj) + except Exception as err: if catch_exceptions: raised_exception = True exception_traceback = traceback.format_exc() - exception_message = str(err) + exception_message = "{}: {}".format(type(err).__name__, str(err)) - return_obj = { - "success": False - } + return_obj = ExpectationValidationResult(success=False) else: raise err else: - return_obj = {"stored_configuration": expectation_config} + return_obj = ExpectationValidationResult(expectation_config=copy.deepcopy( + expectation_config)) # If validate has set active_validation to true, then we do not save the config to avoid # saving updating expectation configs to the same suite during validation runs @@ -247,16 +250,15 @@ 
def wrapper(self, *args, **kwargs): self._append_expectation(expectation_config) if include_config: - return_obj["expectation_config"] = copy.deepcopy( - expectation_config) + return_obj.expectation_config = copy.deepcopy(expectation_config) # If there was no interactive evaluation, success will not have been computed. - if "success" in return_obj: + if return_obj.success is not None: # Add a "success" object to the config - expectation_config["success_on_last_run"] = return_obj["success"] + expectation_config.success_on_last_run = return_obj.success if catch_exceptions: - return_obj["exception_info"] = { + return_obj.exception_info = { "raised_exception": raised_exception, "exception_message": exception_message, "exception_traceback": exception_traceback @@ -264,7 +266,7 @@ def wrapper(self, *args, **kwargs): # Add meta to return object if meta is not None: - return_obj['meta'] = meta + return_obj.meta = meta return_obj = recursively_convert_to_json_serializable( return_obj) @@ -278,7 +280,7 @@ def wrapper(self, *args, **kwargs): return outer_wrapper - def _initialize_expectations(self, expectation_suite=None, data_asset_name=None, expectation_suite_name=None): + def _initialize_expectations(self, expectation_suite=None, expectation_suite_name=None): """Instantiates `_expectation_suite` as empty by default or with a specified expectation `config`. In addition, this always sets the `default_expectation_args` to: `include_config`: False, @@ -295,9 +297,6 @@ def _initialize_expectations(self, expectation_suite=None, data_asset_name=None, If None, creates default `_expectation_suite` with an empty list of expectations and \ key value `data_asset_name` as `data_asset_name`. - data_asset_name (string): \ - The name to assign to `_expectation_suite.data_asset_name` - expectation_suite_name (string): \ The name to assign to the `expectation_suite.expectation_suite_name` @@ -305,33 +304,28 @@ def _initialize_expectations(self, expectation_suite=None, data_asset_name=None, None """ if expectation_suite is not None: - # TODO: validate the incoming expectation_suite with jsonschema here - self._expectation_suite = DotDict(copy.deepcopy(expectation_suite)) - - if data_asset_name is not None: - if self._expectation_suite["data_asset_name"] != data_asset_name: - logger.warning( - "Overriding existing data_asset_name {n1} with new name {n2}" - .format(n1=self._expectation_suite["data_asset_name"], n2=data_asset_name) - ) - self._expectation_suite["data_asset_name"] = data_asset_name + if isinstance(expectation_suite, dict): + expectation_suite = expectationSuiteSchema.load(expectation_suite).data + else: + expectation_suite = copy.deepcopy(expectation_suite) + self._expectation_suite = expectation_suite if expectation_suite_name is not None: - if self._expectation_suite["expectation_suite_name"] != expectation_suite_name: + if self._expectation_suite.expectation_suite_name != expectation_suite_name: logger.warning( "Overriding existing expectation_suite_name {n1} with new name {n2}" - .format(n1=self._expectation_suite["expectation_suite_name"], n2=expectation_suite_name) + .format(n1=self._expectation_suite.expectation_suite_name, n2=expectation_suite_name) ) - self._expectation_suite["expectation_suite_name"] = expectation_suite_name + self._expectation_suite.expectation_suite_name = expectation_suite_name else: if expectation_suite_name is None: expectation_suite_name = "default" - self._expectation_suite = get_empty_expectation_suite(data_asset_name, expectation_suite_name) + self._expectation_suite 
= ExpectationSuite(expectation_suite_name=expectation_suite_name) - self._expectation_suite["data_asset_type"] = self._data_asset_type + self._expectation_suite.data_asset_type = self._data_asset_type self.default_expectation_args = { - "include_config": False, + "include_config": True, "catch_exceptions": False, "result_format": 'BASIC', } @@ -353,13 +347,13 @@ def _append_expectation(self, expectation_config): May raise future errors once json-serializable tests are implemented to check for correct arg formatting """ - expectation_type = expectation_config['expectation_type'] + expectation_type = expectation_config.expectation_type # Test to ensure the new expectation is serializable. # FIXME: If it's not, are we sure we want to raise an error? # FIXME: Should we allow users to override the error? # FIXME: Should we try to convert the object using something like recursively_convert_to_json_serializable? - json.dumps(expectation_config) + # json.dumps(expectation_config) # Drop existing expectations with the same expectation_type. # For column_expectations, _append_expectation should only replace expectations @@ -368,17 +362,17 @@ def _append_expectation(self, expectation_config): # !!! it needs to be documented, and # !!! we need to provide syntax to override it. - if 'column' in expectation_config['kwargs']: - column = expectation_config['kwargs']['column'] + if 'column' in expectation_config.kwargs: + column = expectation_config.kwargs['column'] self._expectation_suite.expectations = [f for f in filter( - lambda exp: (exp['expectation_type'] != expectation_type) or ( - 'column' in exp['kwargs'] and exp['kwargs']['column'] != column), + lambda exp: (exp.expectation_type != expectation_type) or ( + 'column' in exp.kwargs and exp.kwargs['column'] != column), self._expectation_suite.expectations )] else: self._expectation_suite.expectations = [f for f in filter( - lambda exp: exp['expectation_type'] != expectation_type, + lambda exp: exp.expectation_type != expectation_type, self._expectation_suite.expectations )] @@ -414,18 +408,18 @@ def _copy_and_clean_up_expectation(self, del new_expectation["success_on_last_run"] if discard_result_format_kwargs: - if "result_format" in new_expectation["kwargs"]: - del new_expectation["kwargs"]["result_format"] + if "result_format" in new_expectation.kwargs: + del new_expectation.kwargs["result_format"] # discards["result_format"] += 1 if discard_include_config_kwargs: - if "include_config" in new_expectation["kwargs"]: - del new_expectation["kwargs"]["include_config"] + if "include_config" in new_expectation.kwargs: + del new_expectation.kwargs["include_config"] # discards["include_config"] += 1 if discard_catch_exceptions_kwargs: - if "catch_exceptions" in new_expectation["kwargs"]: - del new_expectation["kwargs"]["catch_exceptions"] + if "catch_exceptions" in new_expectation.kwargs: + del new_expectation.kwargs["catch_exceptions"] # discards["catch_exceptions"] += 1 return new_expectation @@ -499,7 +493,7 @@ def find_expectation_indexes(self, match_indexes = [] for i, exp in enumerate(self._expectation_suite.expectations): - if expectation_type is None or (expectation_type == exp['expectation_type']): + if expectation_type is None or (expectation_type == exp.expectation_type): # if column == None or ('column' not in exp['kwargs']) or # (exp['kwargs']['column'] == column) or (exp['kwargs']['column']==: match = True @@ -627,18 +621,22 @@ def batch_kwargs(self): @property def batch_id(self): - return self._batch_id + return self.batch_kwargs.to_id() 
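Since `batch_id` is now computed on the fly from `batch_kwargs` via the `IDDict.to_id` helper added in `great_expectations/core/id_dict.py` above, batch identity is deterministic. A minimal sketch of the scheme, runnable against only the code in this changeset; the kwargs values are illustrative::

    import hashlib
    import json

    from great_expectations.core.id_dict import BatchKwargs

    # A single identifying key yields a readable "key=value" id.
    assert BatchKwargs(ge_batch_id="0001").to_id() == "ge_batch_id=0001"

    # Multiple keys yield a stable md5 digest of the sorted kwargs.
    kwargs = BatchKwargs(datasource="my_db", table="npi")  # illustrative values
    expected = hashlib.md5(
        json.dumps({"datasource": "my_db", "table": "npi"}, sort_keys=True).encode("utf-8")
    ).hexdigest()
    assert kwargs.to_id() == expected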
@property - def batch_fingerprint(self): - return self._batch_id.batch_fingerprint + def batch_markers(self): + return self._batch_markers + + @property + def batch_parameters(self): + return self._batch_parameters def discard_failing_expectations(self): - res = self.validate(only_return_failures=True).get('results') + res = self.validate(only_return_failures=True).results if any(res): for item in res: - self.remove_expectation(expectation_type=item['expectation_config']['expectation_type'], - expectation_kwargs=item['expectation_config']['kwargs']) + self.remove_expectation(expectation_type=item.expectation_config.expectation_type, + expectation_kwargs=item.expectation_config['kwargs']) warnings.warn( "Removed %s expectations that were 'False'" % len(res)) @@ -651,7 +649,7 @@ def get_default_expectation_arguments(self): Ex:: { - "include_config" : False, + "include_config" : True, "catch_exceptions" : False, "result_format" : 'BASIC' } @@ -722,8 +720,8 @@ def get_expectation_suite(self, copy of _expectation_suite, not the original object. """ - expectation_suite = copy.deepcopy(dict(self._expectation_suite)) - expectations = expectation_suite["expectations"] + expectation_suite = copy.deepcopy(self._expectation_suite) + expectations = expectation_suite.expectations discards = defaultdict(int) @@ -733,9 +731,9 @@ def get_expectation_suite(self, for expectation in expectations: # Note: This is conservative logic. # Instead of retaining expectations IFF success==True, it discard expectations IFF success==False. - # In cases where expectation["success"] is missing or None, expectations are *retained*. + # In cases where expectation.success is missing or None, expectations are *retained*. # Such a case could occur if expectations were loaded from a config file and never run. - if "success_on_last_run" in expectation and expectation["success_on_last_run"] is False: + if expectation.success_on_last_run is False: discards["failed_expectations"] += 1 else: new_expectations.append(expectation) @@ -752,22 +750,21 @@ def get_expectation_suite(self, for expectation in expectations: # FIXME: Factor this out into a new function. The logic is duplicated in remove_expectation, # which calls _copy_and_clean_up_expectation - if "success_on_last_run" in expectation: - del expectation["success_on_last_run"] + expectation.success_on_last_run = None if discard_result_format_kwargs: - if "result_format" in expectation["kwargs"]: - del expectation["kwargs"]["result_format"] + if "result_format" in expectation.kwargs: + del expectation.kwargs["result_format"] discards["result_format"] += 1 if discard_include_config_kwargs: - if "include_config" in expectation["kwargs"]: - del expectation["kwargs"]["include_config"] + if "include_config" in expectation.kwargs: + del expectation.kwargs["include_config"] discards["include_config"] += 1 if discard_catch_exceptions_kwargs: - if "catch_exceptions" in expectation["kwargs"]: - del expectation["kwargs"]["catch_exceptions"] + if "catch_exceptions" in expectation.kwargs: + del expectation.kwargs["catch_exceptions"] discards["catch_exceptions"] += 1 settings_message = "" @@ -784,7 +781,7 @@ def get_expectation_suite(self, if len(settings_message) > 1: # Only add this if we added one of the settings above. settings_message += " settings filtered." 
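Throughout this module, raw dict expectations give way to the typed `ExpectationConfiguration` and `ExpectationSuite` classes, so expectations are read with attribute access (`exp.expectation_type`, `exp.kwargs`) rather than key lookups. A small sketch of building a suite with the new types; the expectation and column names are illustrative::

    from great_expectations.core import ExpectationConfiguration, ExpectationSuite

    suite = ExpectationSuite(expectation_suite_name="npi")  # no data_asset_name in 0.9.0
    config = ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "npi"},
        meta={"notes": "added while onboarding"},
    )
    suite.expectations.append(config)

    assert suite.expectations[0].expectation_type == "expect_column_values_to_not_be_null"
    assert config.kwargs["column"] == "npi"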
- expectation_suite["expectations"] = expectations + expectation_suite.expectations = expectations logger.info(message + settings_message) return expectation_suite @@ -834,55 +831,18 @@ def save_expectation_suite( if filepath is None and self._data_context is not None: self._data_context.save_expectation_suite(expectation_suite) elif filepath is not None: - expectation_config_str = json.dumps(expectation_suite, indent=2) - open(filepath, 'w').write(expectation_config_str) + with open(filepath, 'w') as outfile: + json.dump(expectationSuiteSchema.dump(expectation_suite).data, outfile, indent=2) else: raise ValueError("Unable to save config: filepath or data_context must be available.") - def _validate_single_expectation(self, - expectation, - result_format, - runtime_evaluation_parameters, - catch_exceptions): - """This method is factored out of the inner loop of validate - to simplify certain kinds of error trapping and mocking. - - Note that we have to return BOTH the expectation and result, - since the expectation is often modified as a side effect of other operations, - and this config is used later in the validate method. - """ - # copy the config so we can modify it below if needed - expectation = copy.deepcopy(expectation) - - expectation_method = getattr( - self, expectation['expectation_type']) - - if result_format is not None: - expectation['kwargs'].update({'result_format': result_format}) - - # A missing parameter should raise a KeyError - evaluation_args = self._build_evaluation_parameters( - expectation['kwargs'], runtime_evaluation_parameters) - - result = expectation_method( - catch_exceptions=catch_exceptions, - include_config=True, - **evaluation_args - ) - - return expectation, result - - # TODO: when validate is called and expectation editor is in data_context, need to bypass widget creation - # NOTE : Abe 2019/09/21 : This method contains a lot of logic that will need to be split between - # the DataContextAwareDataAsset and BasicDataAsset classes, when we created those typed classes. - # Some of the ContextAware logic may go to live in the DataContext itself. def validate(self, - expectation_suite=None, + expectation_suite=None, run_id=None, data_context=None, evaluation_parameters=None, - catch_exceptions=True, - result_format=None, + catch_exceptions=True, + result_format=None, only_return_failures=False): """Generates a JSON-formatted report describing the outcome of all expectations. @@ -970,8 +930,18 @@ def validate(self, discard_catch_exceptions_kwargs=False, ) elif isinstance(expectation_suite, string_types): - expectation_suite = json.load(open(expectation_suite, 'r')) - + try: + with open(expectation_suite, 'r') as infile: + expectation_suite = expectationSuiteSchema.loads(infile.read()).data + except ValidationError: + raise + except IOError: + raise GreatExpectationsError( + "Unable to load expectation suite: IO error while reading %s" % expectation_suite) + elif not isinstance(expectation_suite, ExpectationSuite): + logger.error("Unable to validate using the provided value for expectation suite; does it need to be " + "loaded from a dictionary?") + return ExpectationValidationResult(success=False) # Evaluation parameter priority is # 1. from provided parameters # 2. 
from expectation configuration @@ -979,12 +949,13 @@ def validate(self, # So, we load them in reverse order if data_context is not None: - runtime_evaluation_parameters = data_context.get_parameters_in_evaluation_parameter_store_by_run_id(run_id) + runtime_evaluation_parameters = \ + data_context.evaluation_parameter_store.get_bind_params(run_id) else: runtime_evaluation_parameters = {} - if "evaluation_parameters" in expectation_suite: - runtime_evaluation_parameters.update(expectation_suite["evaluation_parameters"]) + if expectation_suite.evaluation_parameters: + runtime_evaluation_parameters.update(expectation_suite.evaluation_parameters) if evaluation_parameters is not None: runtime_evaluation_parameters.update(evaluation_parameters) @@ -994,11 +965,11 @@ def validate(self, # Warn if our version is different from the version in the configuration try: - if expectation_suite['meta']['great_expectations.__version__'] != ge_version: + if expectation_suite.meta['great_expectations.__version__'] != ge_version: warnings.warn( "WARNING: This configuration object was built using version %s of great_expectations, but " - "is currently being valided by version %s." - % (expectation_suite['meta']['great_expectations.__version__'], ge_version)) + "is currently being validated by version %s." + % (expectation_suite.meta['great_expectations.__version__'], ge_version)) except KeyError: warnings.warn( "WARNING: No great_expectations version found in configuration object.") @@ -1012,9 +983,9 @@ def validate(self, # Group expectations by column columns = {} - for expectation in expectation_suite["expectations"]: - if "column" in expectation["kwargs"] and isinstance(expectation["kwargs"]["column"], Hashable): - column = expectation["kwargs"]["column"] + for expectation in expectation_suite.expectations: + if "column" in expectation.kwargs and isinstance(expectation.kwargs["column"], Hashable): + column = expectation.kwargs["column"] else: column = "_nocolumn" if column not in columns: @@ -1028,11 +999,22 @@ def validate(self, for expectation in expectations_to_evaluate: try: - expectation, result = self._validate_single_expectation( - expectation, - result_format, - runtime_evaluation_parameters, - catch_exceptions, + # copy the config so we can modify it below if needed + expectation = copy.deepcopy(expectation) + + expectation_method = getattr(self, expectation.expectation_type) + + if result_format is not None: + expectation.kwargs.update({'result_format': result_format}) + + # A missing parameter should raise a KeyError + evaluation_args = self._build_evaluation_parameters( + expectation.kwargs, runtime_evaluation_parameters) + + result = expectation_method( + catch_exceptions=catch_exceptions, + include_config=True, + **evaluation_args ) except Exception as err: @@ -1040,24 +1022,24 @@ def validate(self, raised_exception = True exception_traceback = traceback.format_exc() - result = { - "success": False, - "exception_info": { + result = ExpectationValidationResult( + success=False, + exception_info={ "raised_exception": raised_exception, "exception_traceback": exception_traceback, "exception_message": str(err) } - } + ) else: raise err # if include_config: - result["expectation_config"] = expectation + result.expectation_config = expectation # Add an empty exception_info object if no exception was caught - if catch_exceptions and ('exception_info' not in result): - result["exception_info"] = { + if catch_exceptions and result.exception_info is None: + result.exception_info = { "raised_exception": 
False, "exception_traceback": None, "exception_message": None @@ -1070,43 +1052,34 @@ def validate(self, if only_return_failures: abbrev_results = [] for exp in results: - if not exp["success"]: + if not exp.success: abbrev_results.append(exp) results = abbrev_results - data_asset_name = expectation_suite.get("data_asset_name", None) - expectation_suite_name = expectation_suite.get("expectation_suite_name", "default") + expectation_suite_name = expectation_suite.expectation_suite_name - result = { - "results": results, - "success": statistics.success, - "statistics": { + if run_id is None: + run_id = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ") + + result = ExpectationSuiteValidationResult( + results=results, + success=statistics.success, + statistics={ "evaluated_expectations": statistics.evaluated_expectations, "successful_expectations": statistics.successful_expectations, "unsuccessful_expectations": statistics.unsuccessful_expectations, "success_percent": statistics.success_percent, }, - "meta": { + evaluation_parameters=runtime_evaluation_parameters, + meta={ "great_expectations.__version__": ge_version, - "data_asset_name": data_asset_name, - "expectation_suite_name": expectation_suite_name + "expectation_suite_name": expectation_suite_name, + "run_id": run_id, + "batch_kwargs": self.batch_kwargs, + "batch_markers": self.batch_markers, + "batch_parameters": self.batch_parameters } - } - - if evaluation_parameters is not None: - result.update({"evaluation_parameters": runtime_evaluation_parameters}) - - if run_id is not None: - result["meta"].update({"run_id": run_id}) - else: - run_id = datetime.datetime.utcnow().isoformat().replace(":", "") + "Z" - result["meta"].update({"run_id": run_id}) - - if self._batch_kwargs is not None: - result["meta"].update({"batch_kwargs": self._batch_kwargs}) - - if self._batch_id is not None: - result["meta"].update({"batch_id": self._batch_id}) + ) self._data_context = validate__data_context except Exception: @@ -1126,9 +1099,8 @@ def get_evaluation_parameter(self, parameter_name, default_value=None): Returns: The current value of the evaluation parameter. 
""" - if "evaluation_parameters" in self._expectation_suite and \ - parameter_name in self._expectation_suite['evaluation_parameters']: - return self._expectation_suite['evaluation_parameters'][parameter_name] + if parameter_name in self._expectation_suite.evaluation_parameters: + return self._expectation_suite.evaluation_parameters[parameter_name] else: return default_value @@ -1140,28 +1112,30 @@ def set_evaluation_parameter(self, parameter_name, parameter_value): parameter_name (string): The name of the kwarg to be replaced at evaluation time parameter_value (any): The value to be used """ - - if 'evaluation_parameters' not in self._expectation_suite: - self._expectation_suite['evaluation_parameters'] = {} - - self._expectation_suite['evaluation_parameters'].update( + self._expectation_suite.evaluation_parameters.update( {parameter_name: parameter_value}) - def set_data_asset_name(self, data_asset_name): - """Sets the name of this data_asset as stored in the expectations configuration.""" - self._expectation_suite['data_asset_name'] = data_asset_name + # PENDING DELETION: 20200130 - JPC - Ready for deletion upon release of 0.9.0 with no data_asset_name + # + # @property + # def data_asset_name(self): + # """Gets the current name of this data_asset as stored in the expectations configuration.""" + # return self._expectation_suite.data_asset_name + # + # @data_asset_name.setter + # def data_asset_name(self, data_asset_name): + # """Sets the name of this data_asset as stored in the expectations configuration.""" + # self._expectation_suite.data_asset_name = data_asset_name - def get_data_asset_name(self): - """Gets the current name of this data_asset as stored in the expectations configuration.""" - return self._expectation_suite.get("data_asset_name", None) + @property + def expectation_suite_name(self): + """Gets the current expectation_suite name of this data_asset as stored in the expectations configuration.""" + return self._expectation_suite.expectation_suite_name - def save_expectation_suite_name(self, expectation_suite_name): + @expectation_suite_name.setter + def expectation_suite_name(self, expectation_suite_name): """Sets the expectation_suite name of this data_asset as stored in the expectations configuration.""" - self._expectation_suite["expectation_suite_name"] = expectation_suite_name - - def get_expectation_suite_name(self): - """Gets the current expectation_suite name of this data_asset as stored in the expectations configuration.""" - return self._expectation_suite.get("expectation_suite_name", None) + self._expectation_suite.expectation_suite_name = expectation_suite_name def _build_evaluation_parameters(self, expectation_args, evaluation_parameters): """Build a dictionary of parameters to evaluate, using the provided evaluation_parameters, @@ -1380,7 +1354,7 @@ def _calc_validation_statistics(validation_results): return ``ExpectationStatistics``. 
""" # calc stats - successful_expectations = sum(exp["success"] for exp in validation_results) + successful_expectations = sum(exp.success for exp in validation_results) evaluated_expectations = len(validation_results) unsuccessful_expectations = evaluated_expectations - successful_expectations success = successful_expectations == evaluated_expectations diff --git a/great_expectations/data_asset/file_data_asset.py b/great_expectations/data_asset/file_data_asset.py index cba9aff5f614..44228b58faed 100644 --- a/great_expectations/data_asset/file_data_asset.py +++ b/great_expectations/data_asset/file_data_asset.py @@ -154,7 +154,7 @@ def expect_file_line_regex_match_count_to_be_between(self, mostly=None, null_lines_regex=r"^\s*$", result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None, _lines=None): @@ -264,7 +264,7 @@ def expect_file_line_regex_match_count_to_equal(self, regex, mostly=None, nonnull_lines_regex=r"^\s*$", result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None, _lines=None): @@ -336,7 +336,7 @@ def expect_file_line_regex_match_count_to_equal(self, regex, @DataAsset.expectation(["value"]) def expect_file_hash_to_equal(self, value, hash_alg='md5', result_format=None, - include_config=False, catch_exceptions=None, + include_config=True, catch_exceptions=None, meta=None): """ @@ -390,7 +390,7 @@ def expect_file_hash_to_equal(self, value, hash_alg='md5', result_format=None, @DataAsset.expectation(["minsize", "maxsize"]) def expect_file_size_to_be_between(self, minsize=0, maxsize=None, result_format=None, - include_config=False, catch_exceptions=None, + include_config=True, catch_exceptions=None, meta=None): """ @@ -460,13 +460,13 @@ def expect_file_size_to_be_between(self, minsize=0, maxsize=None, result_format= return { "success": success, - "details": { - "filesize": size + "result": { + "observed_value": size } } @DataAsset.expectation(["filepath"]) - def expect_file_to_exist(self, filepath=None, result_format=None, include_config=False, + def expect_file_to_exist(self, filepath=None, result_format=None, include_config=True, catch_exceptions=None, meta=None): """ @@ -515,7 +515,7 @@ def expect_file_to_exist(self, filepath=None, result_format=None, include_config @DataAsset.expectation([]) def expect_file_to_have_valid_table_header(self, regex, skip=None, result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None): """ Checks to see if a file has a line with unique delimited values, @@ -556,7 +556,7 @@ def expect_file_to_have_valid_table_header(self, regex, skip=None, try: comp_regex = re.compile(regex) - except: + except re.error: raise ValueError("Must enter valid regular expression for regex") success = False @@ -587,7 +587,7 @@ def expect_file_to_have_valid_table_header(self, regex, skip=None, @DataAsset.expectation([]) def expect_file_to_be_valid_json(self, schema=None, result_format=None, - include_config=False, catch_exceptions=None, + include_config=True, catch_exceptions=None, meta=None): """ diff --git a/great_expectations/data_asset/util.py b/great_expectations/data_asset/util.py index 57b7c50d8c9c..54fee25dae46 100644 --- a/great_expectations/data_asset/util.py +++ b/great_expectations/data_asset/util.py @@ -13,8 +13,8 @@ from functools import wraps -from great_expectations import __version__ as ge_version -from great_expectations.types import DotDict +from great_expectations.core import ExpectationConfiguration, ExpectationSuite, 
ExpectationValidationResult, \ + ExpectationKwargs, ExpectationSuiteValidationResult def parse_result_format(result_format): @@ -102,6 +102,12 @@ def recursively_convert_to_json_serializable(test_obj): test_obj may also be converted in place. """ + # If it's one of our types, we pass + if isinstance(test_obj, (ExpectationConfiguration, ExpectationSuite, ExpectationValidationResult, + ExpectationSuiteValidationResult)): + return test_obj + + # Validate that all aruguments are of approved types, coerce if it's easy, else exception # print(type(test_obj), test_obj) # Note: Not 100% sure I've resolved this correctly... @@ -188,14 +194,3 @@ def recursively_convert_to_json_serializable(test_obj): else: raise TypeError('%s is of type %s which cannot be serialized.' % ( str(test_obj), type(test_obj).__name__)) - - -def get_empty_expectation_suite(data_asset_name=None, expectation_suite_name="default"): - return DotDict({ - 'data_asset_name': data_asset_name, - 'expectation_suite_name': expectation_suite_name, - 'meta': { - 'great_expectations.__version__': ge_version - }, - 'expectations': [] - }) diff --git a/great_expectations/data_context/__init__.py b/great_expectations/data_context/__init__.py index 3b141fcc14f3..9ec0aa920047 100644 --- a/great_expectations/data_context/__init__.py +++ b/great_expectations/data_context/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from .data_context import ( - ConfigOnlyDataContext, + BaseDataContext, DataContext, ExplorerDataContext, ) diff --git a/great_expectations/data_context/data_context.py b/great_expectations/data_context/data_context.py index f035a3d41a40..24a4c1688281 100644 --- a/great_expectations/data_context/data_context.py +++ b/great_expectations/data_context/data_context.py @@ -1,60 +1,60 @@ # -*- coding: utf-8 -*- +import copy +import datetime +import errno import glob -import os -import json import logging +import os import shutil +import sys +import warnings import webbrowser +from marshmallow import ValidationError from ruamel.yaml import YAML, YAMLError -import sys -import copy -import errno from six import string_types -import datetime -import warnings - -from great_expectations.util import file_relative_path -from .util import safe_mmkdir, substitute_all_config_variables, substitute_config_variable -from ..types.base import DotDict -import great_expectations.exceptions as ge_exceptions - -# FIXME : Consolidate all builder files and classes in great_expectations/render/builder, to make it clear that they aren't renderers. 
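The reorganized imports below pull in the `nested_update` helper defined in `great_expectations/core/util.py` earlier in this diff; unlike `dict.update`, it recurses into mappings and concatenates lists rather than overwriting them. A quick sketch of those merge semantics, with illustrative keys::

    from great_expectations.core.util import nested_update

    config = {"action_list": [{"name": "store"}], "runtime": {"result_format": "BASIC"}}
    overrides = {"action_list": [{"name": "notify"}], "runtime": {"catch_exceptions": True}}

    nested_update(config, overrides)
    # Lists are concatenated and nested mappings are merged in place:
    assert config == {
        "action_list": [{"name": "store"}, {"name": "notify"}],
        "runtime": {"result_format": "BASIC", "catch_exceptions": True},
    }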
- - -try: - from urllib.parse import urlparse -except ImportError: - from urlparse import urlparse - -from great_expectations.data_asset.util import get_empty_expectation_suite +from great_expectations.core import ExpectationSuite, get_metric_kwargs_id +from great_expectations.core.id_dict import BatchKwargs +from great_expectations.core.metric import ValidationMetricIdentifier +from great_expectations.core.util import nested_update +from great_expectations.data_context.types.base import ( + DataContextConfig, + dataContextConfigSchema, +) +from great_expectations.data_context.util import ( + file_relative_path, + substitute_config_variable, +) from great_expectations.dataset import Dataset -from great_expectations.datasource import ( - PandasDatasource, - SqlAlchemyDatasource, - SparkDFDatasource, - DBTDatasource +from great_expectations.profile.basic_dataset_profiler import ( + BasicDatasetProfiler, ) -from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler -from .types import ( - NormalizedDataAssetName, # TODO : Replace this with DataAssetIdentifier. - DataAssetIdentifier, - ExpectationSuiteIdentifier, - ValidationResultIdentifier, -) +import great_expectations.exceptions as ge_exceptions +from ..validator.validator import Validator from .templates import ( - PROJECT_TEMPLATE, CONFIG_VARIABLES_INTRO, CONFIG_VARIABLES_TEMPLATE, + PROJECT_TEMPLATE, +) +from .types.resource_identifiers import ( + ExpectationSuiteIdentifier, + ValidationResultIdentifier, ) from .util import ( + instantiate_class_from_config, load_class, - instantiate_class_from_config + safe_mmkdir, + substitute_all_config_variables, ) +try: + from urllib.parse import urlparse +except ImportError: + from urlparse import urlparse + try: from sqlalchemy.exc import SQLAlchemyError except ImportError: @@ -67,239 +67,67 @@ yaml.indent(mapping=2, sequence=4, offset=2) yaml.default_flow_style = False -ALLOWED_DELIMITERS = ['.', '/'] - -CURRENT_CONFIG_VERSION = 1 -MINIMUM_SUPPORTED_CONFIG_VERSION = 1 - -class ConfigOnlyDataContext(object): +class BaseDataContext(object): """ This class implements most of the functionality of DataContext, with a few exceptions. - 1. ConfigOnlyDataContext does not attempt to keep its project_config in sync with a file on disc. - 2. ConfigOnlyDataContext doesn't attempt to "guess" paths or objects types. Instead, that logic is pushed + 1. BaseDataContext does not attempt to keep its project_config in sync with a file on disc. + 2. BaseDataContext doesn't attempt to "guess" paths or objects types. Instead, that logic is pushed into DataContext class. - Together, these changes make ConfigOnlyDataContext class more testable. - - DataContext itself inherits from ConfigOnlyDataContext. It behaves essentially the same as the v0.7.* - implementation of DataContext. + Together, these changes make BaseDataContext class more testable. """ PROFILING_ERROR_CODE_TOO_MANY_DATA_ASSETS = 2 PROFILING_ERROR_CODE_SPECIFIED_DATA_ASSETS_NOT_FOUND = 3 + PROFILING_ERROR_CODE_NO_GENERATOR_FOUND = 4 + PROFILING_ERROR_CODE_MULTIPLE_GENERATORS_FOUND = 5 UNCOMMITTED_DIRECTORIES = ["data_docs", "samples", "validations"] + GE_UNCOMMITTED_DIR = "uncommitted" BASE_DIRECTORIES = [ - "datasources", "expectations", "notebooks", "plugins", - "uncommitted", + GE_UNCOMMITTED_DIR, ] NOTEBOOK_SUBDIRECTORIES = ["pandas", "spark", "sql"] GE_DIR = "great_expectations" GE_YML = "great_expectations.yml" - - # TODO: Consider moving this to DataContext, instead of ConfigOnlyDataContext, since it writes to disc. 
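Among the new imports above is `ValidationMetricIdentifier` from `great_expectations/core/metric.py` earlier in this diff, whose `to_evaluation_parameter_urn` method renders the identifier as a URN that evaluation parameters can reference. A sketch, assuming `ExpectationSuiteIdentifier`'s fixed-length tuple is simply the suite name; the run id, suite, and metric names are illustrative::

    from great_expectations.core.metric import ValidationMetricIdentifier

    identifier = ValidationMetricIdentifier(
        run_id="20200130T000000.000000Z",
        expectation_suite_identifier="npi",  # coerced to an ExpectationSuiteIdentifier
        metric_name="expect_table_row_count_to_be_between.result.observed_value",
        metric_kwargs_id=None,  # stored internally as the "__" placeholder
    )

    print(identifier.to_evaluation_parameter_urn())
    # urn:great_expectations:validations:npi:expect_table_row_count_to_be_between.result.observed_value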
- @classmethod - def create(cls, project_root_dir=None): - """ - Build a new great_expectations directory and DataContext object in the provided project_root_dir. - - `create` will not create a new "great_expectations" directory in the provided folder, provided one does not - already exist. Then, it will initialize a new DataContext in that folder and write the resulting config. - - Args: - project_root_dir: path to the root directory in which to create a new great_expectations directory - - Returns: - DataContext - """ - - if not os.path.isdir(project_root_dir): - raise ge_exceptions.DataContextError( - "The project_root_dir must be an existing directory in which " - "to initialize a new DataContext" - ) - - ge_dir = os.path.join(project_root_dir, cls.GE_DIR) - safe_mmkdir(ge_dir, exist_ok=True) - cls.scaffold_directories(ge_dir) - - if os.path.isfile(os.path.join(ge_dir, cls.GE_YML)): - message = """Warning. An existing `{}` was found here: {}. - - No action was taken.""".format(cls.GE_YML, ge_dir) - warnings.warn(message) - else: - cls.write_project_template_to_disk(ge_dir) - - if os.path.isfile(os.path.join(ge_dir, "notebooks")): - message = """Warning. An existing `notebooks` directory was found here: {}. - - No action was taken.""".format(ge_dir) - warnings.warn(message) - else: - cls.scaffold_notebooks(ge_dir) - - uncommitted_dir = os.path.join(ge_dir, "uncommitted") - if os.path.isfile(os.path.join(uncommitted_dir, "config_variables.yml")): - message = """Warning. An existing `config_variables.yml` was found here: {}. - - No action was taken.""".format(uncommitted_dir) - warnings.warn(message) - else: - cls.write_config_variables_template_to_disk(uncommitted_dir) - - return cls(ge_dir) - - @classmethod - def all_uncommitted_directories_exist(cls, ge_dir): - """Check if all uncommitted direcotries exist.""" - uncommitted_dir = os.path.join(ge_dir, "uncommitted") - for directory in cls.UNCOMMITTED_DIRECTORIES: - if not os.path.isdir(os.path.join(uncommitted_dir, directory)): - return False - - return True - - @classmethod - def config_variables_yml_exist(cls, ge_dir): - """Check if all config_variables.yml exists.""" - path_to_yml = os.path.join(ge_dir, cls.GE_YML) - - # TODO this is so brittle and gross - with open(path_to_yml, "r") as f: - config = yaml.load(f) - config_var_path = config.get("config_variables_file_path") - config_var_path = os.path.join(ge_dir, config_var_path) - return os.path.isfile(config_var_path) - - @classmethod - def write_config_variables_template_to_disk(cls, uncommitted_dir): - safe_mmkdir(uncommitted_dir) - config_var_file = os.path.join(uncommitted_dir, "config_variables.yml") - with open(config_var_file, "w") as template: - template.write(CONFIG_VARIABLES_TEMPLATE) - - @classmethod - def write_project_template_to_disk(cls, ge_dir): - file_path = os.path.join(ge_dir, cls.GE_YML) - with open(file_path, "w") as template: - template.write(PROJECT_TEMPLATE) - - @classmethod - def scaffold_directories(cls, base_dir): - """Safely create GE directories for a new project.""" - safe_mmkdir(base_dir, exist_ok=True) - open(os.path.join(base_dir, ".gitignore"), 'w').write("uncommitted/") - - for directory in cls.BASE_DIRECTORIES: - if directory == "plugins": - plugins_dir = os.path.join(base_dir, directory) - safe_mmkdir(plugins_dir, exist_ok=True) - safe_mmkdir(os.path.join(plugins_dir, "custom_data_docs"), exist_ok=True) - safe_mmkdir(os.path.join(plugins_dir, "custom_data_docs", "views"), exist_ok=True) - safe_mmkdir(os.path.join(plugins_dir, "custom_data_docs", 
"renderers"), exist_ok=True) - safe_mmkdir(os.path.join(plugins_dir, "custom_data_docs", "styles"), exist_ok=True) - cls.scaffold_custom_data_docs(plugins_dir) - else: - safe_mmkdir(os.path.join(base_dir, directory), exist_ok=True) - - uncommitted_dir = os.path.join(base_dir, "uncommitted") - - for new_directory in cls.UNCOMMITTED_DIRECTORIES: - new_directory_path = os.path.join(uncommitted_dir, new_directory) - safe_mmkdir( - new_directory_path, - exist_ok=True - ) - - notebook_path = os.path.join(base_dir, "notebooks") - for subdir in cls.NOTEBOOK_SUBDIRECTORIES: - safe_mmkdir(os.path.join(notebook_path, subdir), exist_ok=True) - - @classmethod - def scaffold_custom_data_docs(cls, plugins_dir): - """Copy custom data docs templates""" - styles_template = file_relative_path( - __file__, "../render/view/static/styles/data_docs_custom_styles_template.css") - styles_destination_path = os.path.join( - plugins_dir, "custom_data_docs", "styles", "data_docs_custom_styles.css") - shutil.copyfile(styles_template, styles_destination_path) - - @classmethod - def scaffold_notebooks(cls, base_dir): - """Copy template notebooks into the notebooks directory for a project.""" - template_dir = file_relative_path(__file__, "../init_notebooks/") - notebook_dir = os.path.join(base_dir, "notebooks/") - for subdir in cls.NOTEBOOK_SUBDIRECTORIES: - subdir_path = os.path.join(notebook_dir, subdir) - for notebook in glob.glob(os.path.join(template_dir, subdir, "*.ipynb")): - notebook_name = os.path.basename(notebook) - destination_path = os.path.join(subdir_path, notebook_name) - shutil.copyfile(notebook, destination_path) + GE_EDIT_NOTEBOOK_DIR = GE_UNCOMMITTED_DIR @classmethod def validate_config(cls, project_config): - required_keys = { - # TODO next version re-introduce config_version as required - # "config_version", - "plugins_directory", - "expectations_store_name", - "validations_store_name", - "evaluation_parameter_store_name", - "datasources", - "stores", - "data_docs_sites", - "validation_operators" - } - for key in required_keys: - if key not in project_config: - raise ge_exceptions.MissingTopLevelConfigKeyError("Missing top-level key %s" % key) - - allowed_keys = { - "config_version", - "config_variables_file_path", - "plugins_directory", - "expectations_store_name", - "validations_store_name", - "evaluation_parameter_store_name", - "datasources", - "stores", - "data_docs_sites", - "validation_operators", - } - for key in project_config.keys(): - if key not in allowed_keys: - raise ge_exceptions.InvalidTopLevelConfigKeyError("Invalid top-level config key %s" % key) - + if isinstance(project_config, DataContextConfig): + return True + try: + dataContextConfigSchema.load(project_config) + except ValidationError: + raise return True - - # TODO : Migrate to an expressive __init__ method, with the top level of configs unpacked into named arguments. - def __init__(self, project_config, context_root_dir, data_asset_name_delimiter='/'): + def __init__(self, project_config, context_root_dir=None): """DataContext constructor Args: context_root_dir: location to look for the ``great_expectations.yml`` file. If None, searches for the file \ based on conventions for project subdirectories. - data_asset_name_delimiter: the delimiter character to use when parsing data_asset_name parameters. 
\ - Defaults to '/' Returns: None """ - if not ConfigOnlyDataContext.validate_config(project_config): + if not BaseDataContext.validate_config(project_config): raise ge_exceptions.InvalidConfigError("Your project_config is not valid. Try using the CLI check-config command.") self._project_config = project_config - # FIXME: This should just be a property - self._context_root_directory = os.path.abspath(context_root_dir) - self._project_config_with_variables_substituted = dict(**self.get_config_with_variables_substituted()) - - - # Init plugins - sys.path.append(self.plugins_directory) + if context_root_dir is not None: + self._context_root_directory = os.path.abspath(context_root_dir) + else: + self._context_root_directory = context_root_dir + # Init plugin support + if self.plugins_directory is not None: + sys.path.append(self.plugins_directory) # Init data sources self._datasources = {} @@ -307,28 +135,32 @@ def __init__(self, project_config, context_root_dir, data_asset_name_delimiter=' self.get_datasource(datasource) # Init stores - self._stores = DotDict() + self._stores = dict() self._init_stores(self._project_config_with_variables_substituted["stores"]) # Init validation operators self.validation_operators = {} - # TODO : This key should NOT be optional in the project config. - # It can be empty, but not missing. - # However, for now, I'm adding this check, to avoid having to migrate all the test fixtures - # while still experimenting with the workings of validation operators and actions. - if "validation_operators" in self._project_config: - for validation_operator_name, validation_operator_config in \ - self._project_config_with_variables_substituted["validation_operators"].items(): - self.add_validation_operator( - validation_operator_name, - validation_operator_config, - ) + for validation_operator_name, validation_operator_config in self._project_config_with_variables_substituted["validation_operators"].items(): + self.add_validation_operator( + validation_operator_name, + validation_operator_config, + ) - self._compiled = False + self._evaluation_parameter_dependencies_compiled = False + self._evaluation_parameter_dependencies = {} - if data_asset_name_delimiter not in ALLOWED_DELIMITERS: - raise ge_exceptions.DataContextError("Invalid delimiter: delimiter must be '.' or '/'") - self._data_asset_name_delimiter = data_asset_name_delimiter + def _build_store(self, store_name, store_config): + new_store = instantiate_class_from_config( + config=store_config, + runtime_environment={ + "root_directory": self.root_directory, + }, + config_defaults={ + "module_name": "great_expectations.data_context.store" + } + ) + self._stores[store_name] = new_store + return new_store def _init_stores(self, store_configs): """Initialize all Stores for this DataContext. @@ -349,10 +181,7 @@ def _init_stores(self, store_configs): """ for store_name, store_config in store_configs.items(): - self.add_store( - store_name, - store_config - ) + self._build_store(store_name, store_config) def add_store(self, store_name, store_config): """Add a new Store to the DataContext and (for convenience) return the instantiated Store object. 
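As the hunk below shows, `add_store` now records the config and delegates construction to the shared `_build_store` helper, which instantiates the store through `instantiate_class_from_config` with a default `module_name` of `great_expectations.data_context.store`. A sketch of registering a store at runtime; the store name is illustrative and `context` is assumed to be an instantiated DataContext::

    new_store = context.add_store(
        "my_evaluation_parameter_store",  # illustrative name
        {
            # "module_name" falls back to "great_expectations.data_context.store"
            "class_name": "EvaluationParameterStore",
        },
    )
    assert context.stores["my_evaluation_parameter_store"] is new_store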
@@ -366,19 +195,7 @@ def add_store(self, store_name, store_config): """ self._project_config["stores"][store_name] = store_config - self._project_config_with_variables_substituted["stores"][store_name] = \ - self.get_config_with_variables_substituted(config=store_config) - new_store = instantiate_class_from_config( - config=self._project_config_with_variables_substituted["stores"][store_name], - runtime_config={ - "root_directory" : self.root_directory, - }, - config_defaults={ - "module_name" : "great_expectations.data_context.store" - } - ) - self._stores[store_name] = new_store - return new_store + return self._build_store(store_name, store_config) def add_validation_operator(self, validation_operator_name, validation_operator_config): """Add a new ValidationOperator to the DataContext and (for convenience) return the instantiated object. @@ -392,22 +209,21 @@ def add_validation_operator(self, validation_operator_name, validation_operator_ """ self._project_config["validation_operators"][validation_operator_name] = validation_operator_config - self._project_config_with_variables_substituted["validation_operators"][validation_operator_name] = \ - self.get_config_with_variables_substituted(config=validation_operator_config) new_validation_operator = instantiate_class_from_config( config=self._project_config_with_variables_substituted["validation_operators"][validation_operator_name], - runtime_config={ - "data_context" : self, + runtime_environment={ + "data_context": self, }, config_defaults={ - "module_name" : "great_expectations.validation_operators" + "module_name": "great_expectations.validation_operators" } ) self.validation_operators[validation_operator_name] = new_validation_operator return new_validation_operator - def _normalize_absolute_or_relative_path(self, path): + if path is None: + return if os.path.isabs(path): return path else: @@ -419,32 +235,56 @@ def _normalize_store_path(self, resource_store): resource_store["base_directory"] = os.path.join(self.root_directory, resource_store["base_directory"]) return resource_store - def get_existing_local_data_docs_sites_urls(self): - """Get file urls for all built local data docs.""" - from great_expectations.data_context.store import FixedLengthTupleFilesystemStoreBackend - ge_dir = os.path.abspath(self.root_directory) - sites = self.get_project_config().get("data_docs_sites") + def get_docs_sites_urls(self, resource_identifier=None): + """ + Get URLs for a resource for all data docs sites. + + This function will return URLs for any configured site even if the sites have not + been built yet. - existing_sites = [] + :param resource_identifier: optional. It can be an identifier of ExpectationSuite's, + ValidationResults and other resources that have typed identifiers. + If not provided, the method will return the URLs of the index page. + :return: a list of URLs. 
Each item is the URL for the resource for a data docs site + """ - for site_name, site in sites.items(): - store_backend = site.get("store_backend") - store_class = load_class( - store_backend.get("class_name"), - "great_expectations.data_context.store" - ) - # Only do this for local files - if issubclass(store_class, FixedLengthTupleFilesystemStoreBackend): - base_dir = store_backend.get("base_directory") - data_docs_index = os.path.join(ge_dir, base_dir, "index.html") - - if os.path.isfile(data_docs_index): - existing_sites.append("file://" + data_docs_index) - return existing_sites - - def open_data_docs(self): - """A stdlib cross-platform way to open a file in a browser.""" - data_docs_urls = self.get_existing_local_data_docs_sites_urls() + site_urls = [] + + site_names = None + sites = self._project_config_with_variables_substituted.get('data_docs_sites', []) + if sites: + logger.debug("Found data_docs_sites.") + + for site_name, site_config in sites.items(): + if (site_names and site_name in site_names) or not site_names: + complete_site_config = site_config + site_builder = instantiate_class_from_config( + config=complete_site_config, + runtime_environment={ + "data_context": self, + "root_directory": self.root_directory + }, + config_defaults={ + "module_name": "great_expectations.render.renderer.site_builder" + } + ) + + url = site_builder.get_resource_url(resource_identifier=resource_identifier) + + site_urls.append(url) + + return site_urls + + def open_data_docs(self, resource_identifier=None): + + """ + A stdlib cross-platform way to open a file in a browser. + + :param resource_identifier: ExpectationSuiteIdentifier, ValidationResultIdentifier + or any other type's identifier. The argument is optional - when + not supplied, the method returns the URL of the index page. + """ + data_docs_urls = self.get_docs_sites_urls(resource_identifier=resource_identifier) for url in data_docs_urls: logger.debug("Opening Data Docs found here: {}".format(url)) webbrowser.open(url) @@ -462,6 +302,10 @@ def plugins_directory(self): self._project_config_with_variables_substituted["plugins_directory"] ) + @property + def _project_config_with_variables_substituted(self): + return self.get_config_with_variables_substituted() + @property def stores(self): """A single holder for all Stores in this context""" @@ -476,74 +320,19 @@ def datasources(self): def expectations_store_name(self): return self._project_config_with_variables_substituted["expectations_store_name"] - # TODO: Decide whether this stays here or moves into NamespacedStore - @property - def data_asset_name_delimiter(self): - """Configurable delimiter character used to parse data asset name strings into \ - ``NormalizedDataAssetName`` objects.""" - return self._data_asset_name_delimiter - - @data_asset_name_delimiter.setter - def data_asset_name_delimiter(self, new_delimiter): - """data_asset_name_delimiter property setter method""" - if new_delimiter not in ALLOWED_DELIMITERS: - raise ge_exceptions.DataContextError("Invalid delimiter: delimiter must be one of: {}".format(ALLOWED_DELIMITERS)) - else: - self._data_asset_name_delimiter = new_delimiter - ##### # # Internal helper methods # ##### - # TODO : This method should be deprecated in favor of NamespaceReadWriteStore. - def _get_normalized_data_asset_name_filepath(self, data_asset_name, - expectation_suite_name, - base_path=None, - file_extension=".json"): - """Get the path where the project-normalized data_asset_name expectations are stored. 
This method is used - internally for constructing all absolute and relative paths for asset_name-based paths. - - Args: - data_asset_name: name of data asset for which to construct the path - expectation_suite_name: name of expectation suite for which to construct the path - base_path: base path from which to construct the path. If None, uses the DataContext root directory - file_extension: the file extension to append to the path - - Returns: - path (str): path for the requsted object. - """ - if base_path is None: - base_path = os.path.join(self.root_directory, "expectations") - - # We need to ensure data_asset_name is a valid filepath no matter its current state - if isinstance(data_asset_name, NormalizedDataAssetName): - name_parts = [name_part.replace("/", "__") for name_part in data_asset_name] - relative_path = "/".join(name_parts) - elif isinstance(data_asset_name, string_types): - # if our delimiter is not '/', we need to first replace any slashes that exist in the name - # to avoid extra layers of nesting (e.g. for dbt models) - relative_path = data_asset_name - if self.data_asset_name_delimiter != "/": - relative_path.replace("/", "__") - relative_path = relative_path.replace(self.data_asset_name_delimiter, "/") - else: - raise ge_exceptions.DataContextError("data_assset_name must be a NormalizedDataAssetName or string") - - expectation_suite_name += file_extension - - return os.path.join( - base_path, - relative_path, - expectation_suite_name - ) - def _load_config_variables_file(self): """Get all config variables from the default location.""" - # TODO: support stores + if not hasattr(self, "root_directory"): + # A BaseDataContext does not have a directory in which to look + return {} - config_variables_file_path = self.get_project_config().get("config_variables_file_path") + config_variables_file_path = self.get_config().config_variables_file_path if config_variables_file_path: try: with open(os.path.join(self.root_directory, @@ -561,11 +350,6 @@ def _load_config_variables_file(self): else: return {} - def get_project_config(self): - project_config = self._project_config - - return project_config - def get_config_with_variables_substituted(self, config=None): if not config: config = self._project_config @@ -584,7 +368,7 @@ def save_config_variable(self, config_variable_name, value): """ config_variables = self._load_config_variables_file() config_variables[config_variable_name] = value - config_variables_filepath = self.get_project_config().get("config_variables_file_path") + config_variables_filepath = self.get_config().config_variables_file_path if not config_variables_filepath: raise ge_exceptions.InvalidConfigError("'config_variables_file_path' property is not found in config - setting it is required to use this feature") @@ -628,7 +412,7 @@ def get_available_data_asset_names(self, datasource_names=None, generator_names= raise ValueError( "Datasource names must be a datasource name, list of datasource names or None (to list all datasources)" ) - + if generator_names is not None: if isinstance(generator_names, string_types): generator_names = [generator_names] @@ -649,191 +433,169 @@ def get_available_data_asset_names(self, datasource_names=None, generator_names= ) else: # generator_names is None for datasource_name in datasource_names: - datasource = self.get_datasource(datasource_name) - data_asset_names[datasource_name] = datasource.get_available_data_asset_names(None) + try: + datasource = self.get_datasource(datasource_name) + data_asset_names[datasource_name] = 
datasource.get_available_data_asset_names()
+            except ValueError:
+                # handle the edge case of a non-existent datasource
+                data_asset_names[datasource_name] = {}

        return data_asset_names

-    def yield_batch_kwargs(self, data_asset_name, **kwargs):
-        """Yields a the next batch_kwargs for the provided data_asset_name, supplemented by any kwargs provided inline.
+    def build_batch_kwargs(self, datasource, generator, name=None, partition_id=None, **kwargs):
+        """Builds batch kwargs using the provided datasource, generator, and batch_parameters.

        Args:
-            data_asset_name (str or NormalizedDataAssetName): the name from which to provide batch_kwargs
-            **kwargs: additional kwargs to supplement the returned batch_kwargs
+            datasource (str): the name of the datasource for which to build batch_kwargs
+            generator (str): the name of the generator to use to build batch_kwargs
+            name (str): an optional name batch_parameter to attach to the built batch_kwargs
+            **kwargs: additional batch_parameters

        Returns:
            BatchKwargs
        """
-        if not isinstance(data_asset_name, NormalizedDataAssetName):
-            data_asset_name = self.normalize_data_asset_name(data_asset_name)
-
-        datasource = self.get_datasource(data_asset_name.datasource)
-        generator = datasource.get_generator(data_asset_name.generator)
-        batch_kwargs = generator.yield_batch_kwargs(data_asset_name.generator_asset, **kwargs)
-
+        datasource_obj = self.get_datasource(datasource)
+        batch_kwargs = datasource_obj.build_batch_kwargs(generator=generator, name=name, **kwargs)
        return batch_kwargs

-    def build_batch_kwargs(self, data_asset_name, partition_id=None, **kwargs):
-        """Builds batch kwargs for the provided data_asset_name, using an optional partition_id or building from
-        provided kwargs.
-
-        build_batch_kwargs relies on the generator's implementation
+    def get_batch(self, batch_kwargs, expectation_suite_name, data_asset_type=None, batch_parameters=None):
+        """Build a batch of data using batch_kwargs, and return a DataAsset with expectation_suite_name attached. If
+        batch_parameters are included, they will be available as attributes of the batch.

        Args:
-            data_asset_name (str or NormalizedDataAssetName): the name from which to provide batch_kwargs
-            partition_id (str): partition_id to use when building batch_kwargs
-            **kwargs: additional kwargs to supplement the returned batch_kwargs
+            batch_kwargs: the batch_kwargs to use; must include a datasource key
+            expectation_suite_name: the name of the expectation_suite to get
+            data_asset_type: the type of data_asset to build, with associated expectation implementations. This can
+                generally be inferred from the datasource.
+            batch_parameters: optional parameters to store as the reference description of the batch. They should
+                reflect parameters that would provide the passed BatchKwargs.
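+
+        For example (a sketch; the datasource, generator, and suite names are
+        illustrative assumptions rather than part of this changeset)::
+
+            context = DataContext()
+            batch_kwargs = context.build_batch_kwargs("my_datasource", "my_generator", "my_table")
+            batch = context.get_batch(batch_kwargs, "my_table.warning")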
Returns: - BatchKwargs + DataAsset """ - if not isinstance(data_asset_name, (NormalizedDataAssetName, DataAssetIdentifier)): - data_asset_name = self.normalize_data_asset_name(data_asset_name) - - datasource = self.get_datasource(data_asset_name.datasource) - batch_kwargs = datasource.named_generator_build_batch_kwargs( - generator_name=data_asset_name.generator, - generator_asset=data_asset_name.generator_asset, - partition_id=partition_id, - **kwargs - ) + if isinstance(batch_kwargs, dict): + batch_kwargs = BatchKwargs(batch_kwargs) - return batch_kwargs + if not isinstance(batch_kwargs, BatchKwargs): + raise ge_exceptions.BatchKwargsError("BatchKwargs must be a BatchKwargs object or dictionary.") - def get_batch(self, data_asset_name, expectation_suite_name, batch_kwargs=None, **kwargs): - """ - Get a batch of data, using the namespace of the provided data_asset_name. + if not isinstance(expectation_suite_name, (ExpectationSuiteIdentifier, string_types)): + raise ge_exceptions.DataContextError("expectation_suite_name must be an ExpectationSuiteIdentifier or " + "string.") - get_batch constructs its batch by first normalizing the data_asset_name (if not already normalized) and then: - (1) getting data using the provided batch_kwargs; and - (2) attaching the named expectation suite - - A single partition_id may be used in place of batch_kwargs when using a data_asset_name whose generator - supports that partition type, and additional kwargs will be used to supplement the provided batch_kwargs. - - Args: - data_asset_name: name of the data asset. The name will be normalized. \ - (See :py:meth:`normalize_data_asset_name` ) - expectation_suite_name: name of the expectation suite to attach to the data_asset returned - batch_kwargs: key-value pairs describing the batch of data the datasource should fetch. \ - (See :class:`BatchGenerator` ) If no batch_kwargs are specified, then the context will get the next - available batch_kwargs for the data_asset. - **kwargs: additional key-value pairs to pass to the datasource when fetching the batch. - - Returns: - Great Expectations data_asset with attached expectation_suite and DataContext - """ - normalized_data_asset_name = self.normalize_data_asset_name(data_asset_name) - - datasource = self.get_datasource(normalized_data_asset_name.datasource) - if not datasource: - raise ge_exceptions.DataContextError( - "Can't find datasource {} in the config - please check your {}".format( - normalized_data_asset_name, - self.GE_YML - ) - ) - - if batch_kwargs is None: - batch_kwargs = self.build_batch_kwargs(data_asset_name, **kwargs) - - data_asset = datasource.get_batch(normalized_data_asset_name, - expectation_suite_name, - batch_kwargs, - **kwargs) - return data_asset + datasource = self.get_datasource(batch_kwargs.get("datasource")) + expectation_suite = self.get_expectation_suite(expectation_suite_name) + batch = datasource.get_batch(batch_kwargs=batch_kwargs, batch_parameters=batch_parameters) + if data_asset_type is None: + data_asset_type = datasource.config.get("data_asset_type") + validator = Validator(batch=batch, expectation_suite=expectation_suite, expectation_engine=data_asset_type) + return validator.get_dataset() def run_validation_operator( self, validation_operator_name, assets_to_validate, run_id=None, + **kwargs ): """ Run a validation operator to validate data assets and to perform the business logic around validation that the operator implements. 
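+
+        For example (a sketch; "action_list_operator" assumes a matching entry under
+        validation_operators in great_expectations.yml)::
+
+            batch = context.get_batch(batch_kwargs, "my_table.warning")
+            results = context.run_validation_operator(
+                "action_list_operator",
+                assets_to_validate=[batch],
+                run_id="20200101T000000.000000Z")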
- :param validation_operator_name: name of the operator, as appears in the context's config file - :param assets_to_validate: a list that specifies the data assets that the operator will validate. - The members of the list can be either batches (which means that have - data asset identifier, batch kwargs and expectation suite identifier) - or a triple that will allow the operator to fetch the batch: - (data asset identifier, expectation suite identifier, batch kwargs) - :param run_id: run id - this is set by the caller and should correspond to something - meaningful to the user (e.g., pipeline run id or timestamp) - :return: A result object that is defined by the class of the operator that is invoked. + Args: + validation_operator_name: name of the operator, as appears in the context's config file + assets_to_validate: a list that specifies the data assets that the operator will validate. The members of + the list can be either batches, or a tuple that will allow the operator to fetch the batch: + (batch_kwargs, expectation_suite_name) + run_id: The run_id for the validation; if None, a default value will be used + **kwargs: Additional kwargs to pass to the validation operator + + Returns: + ValidationOperatorResult """ + if run_id is None: + run_id = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ") + logger.info("Setting run_id to: {}".format(run_id)) + return self.validation_operators[validation_operator_name].run( assets_to_validate=assets_to_validate, run_id=run_id, + **kwargs ) def add_datasource(self, name, initialize=True, **kwargs): """Add a new datasource to the data context, with configuration provided as kwargs. Args: - name (str): the name for the new datasource to add - initialize - if False, add the datasource to the config, but do not - initialize it. Example: user needs to debug database connectivity. + name: the name for the new datasource to add + initialize: if False, add the datasource to the config, but do not + initialize it, for example if a user needs to debug database connectivity. kwargs (keyword arguments): the configuration for the new datasource - Note: - the type_ parameter is still supported as a way to add a datasource, but support will - be removed in a future release. Please update to using class_name instead. + Returns: datasource (Datasource) """ - logger.debug("Starting ConfigOnlyDataContext.add_datasource for %s" % name) - if "generators" not in kwargs: - logger.warning("Adding a datasource without configuring a generator will rely on default " - "generator behavior. Consider adding a generator.") - - if "type" in kwargs: - warnings.warn("Using type_ configuration to build datasource. 
Please update to using class_name.") - type_ = kwargs["type"] - datasource_class = self._get_datasource_class_from_type(type_) - else: - datasource_class = load_class( - kwargs.get("class_name"), - kwargs.get("module_name", "great_expectations.datasource") - ) + logger.debug("Starting BaseDataContext.add_datasource for %s" % name) + datasource_class = load_class( + kwargs.get("class_name"), + kwargs.get("module_name", "great_expectations.datasource") + ) # For any class that should be loaded, it may control its configuration construction # by implementing a classmethod called build_configuration if hasattr(datasource_class, "build_configuration"): config = datasource_class.build_configuration(**kwargs) + else: + config = kwargs + + self._project_config["datasources"][name] = config # We perform variable substitution in the datasource's config here before using the config # to instantiate the datasource object. Variable substitution is a service that the data # context provides. Datasources should not see unsubstituted variables in their config. - self._project_config_with_variables_substituted["datasources"][ - name] = self.get_config_with_variables_substituted(config) - if initialize: datasource = self._build_datasource_from_config( - **self._project_config_with_variables_substituted["datasources"][name]) + name, self._project_config_with_variables_substituted["datasources"][name]) self._datasources[name] = datasource else: datasource = None - self._project_config["datasources"][name] = config - return datasource + def add_generator(self, datasource_name, generator_name, class_name, **kwargs): + """Add a generator to the named datasource, using the provided configuration. + + Args: + datasource_name: name of datasource to which to add the new generator + generator_name: name of the generator to add + class_name: class of the generator to add + **kwargs: generator configuration, provided as kwargs + + Returns: + + """ + datasource_obj = self.get_datasource(datasource_name) + generator = datasource_obj.add_generator(name=generator_name, class_name=class_name, **kwargs) + return generator + def get_config(self): return self._project_config - def _build_datasource_from_config(self, **kwargs): - if "type" in kwargs: + def _build_datasource_from_config(self, name, config): + if "type" in config: warnings.warn("Using type configuration to build datasource. Please update to using class_name.") - type_ = kwargs.pop("type") + type_ = config.pop("type") datasource_class = self._get_datasource_class_from_type(type_) - kwargs.update({ + config.update({ "class_name": datasource_class.__name__ }) + config.update({ + "name": name + }) datasource = instantiate_class_from_config( - config=kwargs, - runtime_config={ + config=config, + runtime_environment={ "data_context": self }, config_defaults={ @@ -842,26 +604,6 @@ def _build_datasource_from_config(self, **kwargs): ) return datasource - def _get_datasource_class_from_type(self, datasource_type): - """NOTE: THIS METHOD OF BUILDING DATASOURCES IS DEPRECATED. - Instead, please specify class_name - """ - warnings.warn("Using the 'type' key to instantiate a datasource is deprecated. 
Please use class_name instead.") - if datasource_type == "pandas": - return PandasDatasource - elif datasource_type == "dbt": - return DBTDatasource - elif datasource_type == "sqlalchemy": - return SqlAlchemyDatasource - elif datasource_type == "spark": - return SparkDFDatasource - else: - try: - # Update to do dynamic loading based on plugin types - return PandasDatasource - except ImportError: - raise - def get_datasource(self, datasource_name="default"): """Get the named datasource @@ -878,253 +620,38 @@ def get_datasource(self, datasource_name="default"): self._project_config_with_variables_substituted["datasources"][datasource_name]) else: raise ValueError( - "Unable to load datasource %s -- no configuration found or invalid configuration." % datasource_name + "Unable to load datasource `%s` -- no configuration found or invalid configuration." % datasource_name ) - datasource = self._build_datasource_from_config(**datasource_config) + datasource = self._build_datasource_from_config(datasource_name, datasource_config) self._datasources[datasource_name] = datasource return datasource - - def list_expectation_suite_keys(self): - """Return a list of available expectation suite keys.""" - keys = self.stores[self.expectations_store_name].list_keys() + + def list_expectation_suites(self): + """Return a list of available expectation suite names.""" + try: + keys = self.stores[self.expectations_store_name].list_keys() + except KeyError as e: + raise ge_exceptions.InvalidConfigError("Unable to find configured store: %s" % str(e)) return keys def list_datasources(self): """List currently-configured datasources on this context. Returns: - List(dict): each dictionary includes "name" and "type" keys + List(dict): each dictionary includes "name" and "class_name" keys """ datasources = [] - # NOTE: 20190916 - JPC - Upon deprecation of support for type: configuration, this can be simplified for key, value in self._project_config_with_variables_substituted["datasources"].items(): - if "type" in value: - logger.warning("Datasource %s configured using type. Please use class_name instead." % key) - datasources.append({ - "name": key, - "type": value["type"], - "class_name": self._get_datasource_class_from_type(value["type"]).__name__ - }) - else: - datasources.append({ - "name": key, - "class_name": value["class_name"] - }) + datasources.append({ + "name": key, + "class_name": value["class_name"] + }) return datasources - def normalize_data_asset_name(self, data_asset_name): - """Normalizes data_asset_names for a data context. - - A data_asset_name is defined per-project and consists of three components that together define a "namespace" - for data assets, encompassing both expectation suites and batches. - - Within a namespace, an expectation suite effectively defines candidate "types" for batches of data, and - validating a batch of data determines whether that instance is of the candidate type. - - The data_asset_name namespace consists of three components: - - - a datasource name - - a generator_name - - a generator_asset - - It has a string representation consisting of each of those components delimited by a character defined in the - data_context ('/' by default). - - Args: - data_asset_name (str): The (unnormalized) data asset name to normalize. 
The name will be split \ - according to the currently-configured data_asset_name_delimiter - - Returns: - NormalizedDataAssetName - """ - - if isinstance(data_asset_name, NormalizedDataAssetName): - return data_asset_name - elif isinstance(data_asset_name, DataAssetIdentifier): - return NormalizedDataAssetName( - datasource=data_asset_name.datasource, - generator=data_asset_name.generator, - generator_asset=data_asset_name.generator_asset - ) - - split_name = data_asset_name.split(self.data_asset_name_delimiter) - - existing_expectation_suite_keys = self.list_expectation_suite_keys() - existing_namespaces = [] - for key in existing_expectation_suite_keys: - existing_namespaces.append( - NormalizedDataAssetName( - key.data_asset_name.datasource, - key.data_asset_name.generator, - key.data_asset_name.generator_asset, - ) - ) - - if len(split_name) > 3: - raise ge_exceptions.DataContextError( - "Invalid data_asset_name '{data_asset_name}': found too many components using delimiter '{delimiter}'" - .format( - data_asset_name=data_asset_name, - delimiter=self.data_asset_name_delimiter - ) - ) - - elif len(split_name) == 1: - # In this case, the name *must* refer to a unique data_asset_name - provider_names = set() - generator_asset = split_name[0] - for normalized_identifier in existing_namespaces: - curr_generator_asset = normalized_identifier[2] - if generator_asset == curr_generator_asset: - provider_names.add( - normalized_identifier - ) - - # NOTE: Current behavior choice is to continue searching to see whether the namespace is ambiguous - # based on configured generators *even* if there is *only one* namespace with expectation suites - # in it. - - # If generators' namespaces are enormous or if they are slow to provide all their available names, - # that behavior could become unwieldy, and perhaps should be revisited by using the escape hatch - # commented out below. - - # if len(provider_names) == 1: - # return provider_names[0] - # - # elif len(provider_names) > 1: - # raise ge_exceptions.DataContextError( - # "Ambiguous data_asset_name '{data_asset_name}'. Multiple candidates found: {provider_names}" - # .format(data_asset_name=data_asset_name, provider_names=provider_names) - # ) - - available_names = self.get_available_data_asset_names() - for datasource in available_names.keys(): - for generator in available_names[datasource].keys(): - names_set = available_names[datasource][generator] - if generator_asset in names_set: - provider_names.add( - NormalizedDataAssetName(datasource, generator, generator_asset) - ) - - if len(provider_names) == 1: - return provider_names.pop() - - elif len(provider_names) > 1: - raise ge_exceptions.DataContextError( - "Ambiguous data_asset_name '{data_asset_name}'. Multiple candidates found: {provider_names}" - .format(data_asset_name=data_asset_name, provider_names=provider_names) - ) - - # If we are here, then the data_asset_name does not belong to any configured datasource or generator - # If there is only a single datasource and generator, we assume the user wants to create a new - # namespace. 
- if (len(available_names.keys()) == 1 and # in this case, we know that the datasource name is valid - len(available_names[datasource].keys()) == 1): - return NormalizedDataAssetName( - datasource, - generator, - generator_asset - ) - - if len(available_names.keys()) == 0: - raise ge_exceptions.DataContextError( - "No datasource configured: a datasource is required to normalize an incomplete data_asset_name" - ) - - raise ge_exceptions.DataContextError( - "Ambiguous data_asset_name: no existing data_asset has the provided name, no generator provides it, " - " and there are multiple datasources and/or generators configured." - ) - - elif len(split_name) == 2: - # In this case, the name must be a datasource_name/generator_asset - - # If the data_asset_name is already defined by a config in that datasource, return that normalized name. - provider_names = set() - for normalized_identifier in existing_namespaces: - curr_datasource_name = normalized_identifier[0] - curr_generator_asset = normalized_identifier[2] - if curr_datasource_name == split_name[0] and curr_generator_asset == split_name[1]: - provider_names.add(normalized_identifier) - - # NOTE: Current behavior choice is to continue searching to see whether the namespace is ambiguous - # based on configured generators *even* if there is *only one* namespace with expectation suites - # in it. - - # If generators' namespaces are enormous or if they are slow to provide all their available names, - # that behavior could become unwieldy, and perhaps should be revisited by using the escape hatch - # commented out below. - - # if len(provider_names) == 1: - # return provider_names[0] - # - # elif len(provider_names) > 1: - # raise ge_exceptions.DataContextError( - # "Ambiguous data_asset_name '{data_asset_name}'. Multiple candidates found: {provider_names}" - # .format(data_asset_name=data_asset_name, provider_names=provider_names) - # ) - - available_names = self.get_available_data_asset_names() - for datasource_name in available_names.keys(): - for generator in available_names[datasource_name].keys(): - generator_assets = available_names[datasource_name][generator] - if split_name[0] == datasource_name and split_name[1] in generator_assets: - provider_names.add(NormalizedDataAssetName(datasource_name, generator, split_name[1])) - - if len(provider_names) == 1: - return provider_names.pop() - - elif len(provider_names) > 1: - raise ge_exceptions.DataContextError( - "Ambiguous data_asset_name '{data_asset_name}'. Multiple candidates found: {provider_names}" - .format(data_asset_name=data_asset_name, provider_names=provider_names) - ) - - # If we are here, then the data_asset_name does not belong to any configured datasource or generator - # If there is only a single generator for their provided datasource, we allow the user to create a new - # namespace. 
- if split_name[0] in available_names and len(available_names[split_name[0]]) == 1: - logger.info("Normalizing to a new generator name.") - return NormalizedDataAssetName( - split_name[0], - list(available_names[split_name[0]].keys())[0], - split_name[1] - ) - - if len(available_names.keys()) == 0: - raise ge_exceptions.DataContextError( - "No datasource configured: a datasource is required to normalize an incomplete data_asset_name" - ) - - raise ge_exceptions.DataContextError( - "No generator available to produce data_asset_name '{data_asset_name}' " - "with datasource '{datasource_name}'" - .format(data_asset_name=data_asset_name, datasource_name=datasource_name) - ) - - elif len(split_name) == 3: - # In this case, we *do* check that the datasource and generator names are valid, but - # allow the user to define a new generator asset - datasources = [datasource["name"] for datasource in self.list_datasources()] - if split_name[0] in datasources: - datasource = self.get_datasource(split_name[0]) - - generators = [generator["name"] for generator in datasource.list_generators()] - if split_name[1] in generators: - return NormalizedDataAssetName(*split_name) - - raise ge_exceptions.DataContextError( - "Invalid data_asset_name: no configured datasource '{datasource_name}' " - "with generator '{generator_name}'" - .format(datasource_name=split_name[0], generator_name=split_name[1]) - ) - - def create_expectation_suite(self, data_asset_name, expectation_suite_name, overwrite_existing=False): + def create_expectation_suite(self, expectation_suite_name, overwrite_existing=False): """Build a new expectation suite and save it into the data_context expectation store. Args: - data_asset_name: The name of the data_asset for which this suite will be stored. - data_asset_name will be normalized if it is a string expectation_suite_name: The name of the expectation_suite to create overwrite_existing (boolean): Whether to overwrite expectation suite if expectation suite with given name already exists. @@ -1135,159 +662,120 @@ def create_expectation_suite(self, data_asset_name, expectation_suite_name, over if not isinstance(overwrite_existing, bool): raise ValueError("Parameter overwrite_existing must be of type BOOL") - if not isinstance(data_asset_name, NormalizedDataAssetName): - data_asset_name = self.normalize_data_asset_name(data_asset_name) - - expectation_suite = get_empty_expectation_suite( - # FIXME: For now, we just cast this to a string to be close to the old behavior - self.data_asset_name_delimiter.join(data_asset_name), - expectation_suite_name - ) - - key = ExpectationSuiteIdentifier( - data_asset_name=DataAssetIdentifier(*data_asset_name), - expectation_suite_name=expectation_suite_name, - ) + expectation_suite = ExpectationSuite(expectation_suite_name=expectation_suite_name) + key = ExpectationSuiteIdentifier(expectation_suite_name=expectation_suite_name) if self._stores[self.expectations_store_name].has_key(key) and not overwrite_existing: raise ge_exceptions.DataContextError( - "expectation_suite with name {} already exists for data_asset "\ - "{}. If you would like to overwrite this expectation_suite, "\ - "set overwrite_existing=True.".format( - expectation_suite_name, - data_asset_name - ) + "expectation_suite with name {} already exists. 
If you would like to overwrite this "
+                "expectation_suite, set overwrite_existing=True.".format(expectation_suite_name)
            )
        else:
            self._stores[self.expectations_store_name].set(key, expectation_suite)

        return expectation_suite

-    def get_expectation_suite(self, data_asset_name, expectation_suite_name="default"):
+    def get_expectation_suite(self, expectation_suite_name):
-        """Get a named expectation suite for the provided data_asset_name.
+        """Get the named expectation suite from the configured expectations store.

        Args:
-            data_asset_name (str or NormalizedDataAssetName): the data asset name to which the expectation suite belongs
            expectation_suite_name (str): the name for the expectation suite

        Returns:
            expectation_suite
        """
-        if not isinstance(data_asset_name, NormalizedDataAssetName):
-            data_asset_name = self.normalize_data_asset_name(data_asset_name)
-
-        key = ExpectationSuiteIdentifier(
-            data_asset_name=DataAssetIdentifier(*data_asset_name),
-            expectation_suite_name=expectation_suite_name,
-        )
+        key = ExpectationSuiteIdentifier(expectation_suite_name=expectation_suite_name)

        if self.stores[self.expectations_store_name].has_key(key):
            return self.stores[self.expectations_store_name].get(key)
        else:
            raise ge_exceptions.DataContextError(
-                "No expectation_suite found for data_asset_name %s and expectation_suite_name %s" %
-                (data_asset_name, expectation_suite_name)
+                "expectation_suite %s not found" % expectation_suite_name
            )

-    def save_expectation_suite(self, expectation_suite, data_asset_name=None, expectation_suite_name=None):
+    def save_expectation_suite(self, expectation_suite, expectation_suite_name=None):
        """Save the provided expectation suite into the DataContext.

        Args:
            expectation_suite: the suite to save
-            data_asset_name: the data_asset_name for this expectation suite. If no name is provided, the name will\
                be read from the suite
            expectation_suite_name: the name of this expectation suite.
If no name is provided the name will \ be read from the suite Returns: None """ - if data_asset_name is None: - try: - data_asset_name = expectation_suite['data_asset_name'] - except KeyError: - raise ge_exceptions.DataContextError( - "data_asset_name must either be specified or present in the provided expectation suite") - else: - # Note: we ensure that the suite name is a string here, until we have typed ExpectationSuite - # objects that will know how to read the correct type back in - expectation_suite['data_asset_name'] = str(data_asset_name) - # expectation_suite['data_asset_name'] = data_asset_name - if expectation_suite_name is None: - try: - expectation_suite_name = expectation_suite['expectation_suite_name'] - except KeyError: - raise ge_exceptions.DataContextError( - "expectation_suite_name must either be specified or present in the provided expectation suite") + key = ExpectationSuiteIdentifier(expectation_suite_name=expectation_suite.expectation_suite_name) else: - expectation_suite['expectation_suite_name'] = expectation_suite_name + expectation_suite.expectation_suite_name = expectation_suite_name + key = ExpectationSuiteIdentifier(expectation_suite_name=expectation_suite_name) - if not isinstance(data_asset_name, NormalizedDataAssetName): - data_asset_name = self.normalize_data_asset_name(data_asset_name) + self.stores[self.expectations_store_name].set(key, expectation_suite) + self._evaluation_parameter_dependencies_compiled = False - self.stores[self.expectations_store_name].set(ExpectationSuiteIdentifier( - data_asset_name=DataAssetIdentifier(*data_asset_name), - expectation_suite_name=expectation_suite_name, - ), expectation_suite) + def _store_metrics(self, requested_metrics, validation_results, target_store_name): + """ + requested_metrics is a dictionary like this: + + requested_metrics: + *: # The asterisk here matches *any* expectation suite name + # use the 'kwargs' key to request metrics that are defined by kwargs, + # for example because they are defined only for a particular column + # - column: + # Age: + # - expect_column_min_to_be_between.result.observed_value + - statistics.evaluated_expectations + - statistics.successful_expectations - self._compiled = False + Args: + requested_metrics: + validation_results: + target_store_name: - def _extract_and_store_parameters_from_validation_results(self, validation_results, data_asset_name, expectation_suite_name, run_id): + Returns: - if not self._compiled: - self._compile() + """ + expectation_suite_name = validation_results.meta["expectation_suite_name"] + run_id = validation_results.meta["run_id"] + + for expectation_suite_dependency, metrics_list in requested_metrics.items(): + if (expectation_suite_dependency != "*") and (expectation_suite_dependency != expectation_suite_name): + continue + + if not isinstance(metrics_list, list): + raise ge_exceptions.DataContextError("Invalid requested_metrics configuration: metrics requested for " + "each expectation suite must be a list.") + + for metric_configuration in metrics_list: + metric_configurations = _get_metric_configuration_tuples(metric_configuration) + for metric_name, metric_kwargs in metric_configurations: + try: + metric_value = validation_results.get_metric(metric_name, **metric_kwargs) + self.stores[target_store_name].set( + ValidationMetricIdentifier( + run_id=run_id, + expectation_suite_identifier=ExpectationSuiteIdentifier(expectation_suite_name), + metric_name=metric_name, + metric_kwargs_id=get_metric_kwargs_id(metric_name, metric_kwargs) + ), + 
metric_value + ) + except ge_exceptions.UnavailableMetricError: + # This will happen frequently in larger pipelines + logger.debug("metric {} was requested by another expectation suite but is not available in " + "this validation result.".format(metric_name)) - if ("meta" not in validation_results or - "data_asset_name" not in validation_results["meta"] or - "expectation_suite_name" not in validation_results["meta"] - ): - logger.warning( - "Both data_asset_name and expectation_suite_name must be in validation results to " - "register evaluation parameters." - ) - return + def store_validation_result_metrics(self, requested_metrics, validation_results, target_store_name): + self._store_metrics(requested_metrics, validation_results, target_store_name) - elif (data_asset_name not in self._compiled_parameters["data_assets"] or - expectation_suite_name not in self._compiled_parameters["data_assets"][data_asset_name]): - # This is fine; short-circuit since we do not need to register any results from this dataset. - return - - for result in validation_results['results']: - # Unoptimized: loop over all results and check if each is needed - expectation_type = result['expectation_config']['expectation_type'] - if expectation_type in self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name]: - # First, bind column-style parameters - if (("column" in result['expectation_config']['kwargs']) and - ("columns" in self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_type]) and - (result['expectation_config']['kwargs']["column"] in - self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_type]["columns"])): - - column = result['expectation_config']['kwargs']["column"] - # Now that we have a small search space, invert logic, and look for the parameters in our result - for type_key, desired_parameters in self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_type]["columns"][column].items(): - # value here is the set of desired parameters under the type_key - for desired_param in desired_parameters: - desired_key = desired_param.split(":")[-1] - if type_key == "result" and desired_key in result['result']: - self.set_parameters_in_evaluation_parameter_store_by_run_id_and_key(run_id, desired_param, result["result"][desired_key]) - elif type_key == "details" and desired_key in result["result"]["details"]: - self.set_parameters_in_evaluation_parameter_store_by_run_id_and_key(run_id, desired_param, result["result"]["details"]) - else: - logger.warning("Unrecognized key for parameter %s" % desired_param) - - # Next, bind parameters that do not have column parameter - for type_key, desired_parameters in self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_type].items(): - if type_key == "columns": - continue - for desired_param in desired_parameters: - desired_key = desired_param.split(":")[-1] - if type_key == "result" and desired_key in result['result']: - self.set_parameters_in_evaluation_parameter_store_by_run_id_and_key(run_id, desired_param, result["result"][desired_key]) - elif type_key == "details" and desired_key in result["result"]["details"]: - self.set_parameters_in_evaluation_parameter_store_by_run_id_and_key(run_id, desired_param, result["result"]["details"]) - else: - logger.warning("Unrecognized key for parameter %s" % desired_param) + def store_evaluation_parameters(self, validation_results, target_store_name=None): + 
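+        # Usage sketch (illustrative): call this after a validation run so that any
+        # metrics other suites declared as evaluation parameter dependencies are
+        # copied into the evaluation parameter store:
+        #
+        #     results = batch.validate(run_id="20200101T000000.000000Z")
+        #     context.store_evaluation_parameters(results)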
if not self._evaluation_parameter_dependencies_compiled: + self._compile_evaluation_parameter_dependencies() + + if target_store_name is None: + target_store_name = self.evaluation_parameter_store_name + + self._store_metrics(self._evaluation_parameter_dependencies, validation_results, target_store_name) @property def evaluation_parameter_store(self): @@ -1305,231 +793,22 @@ def validations_store_name(self): def validations_store(self): return self.stores[self.validations_store_name] - def set_parameters_in_evaluation_parameter_store_by_run_id_and_key(self, run_id, key, value): - """Store a new validation parameter. - - Args: - run_id: current run_id - key: parameter key - value: parameter value - - Returns: - None - """ - run_params = self.get_parameters_in_evaluation_parameter_store_by_run_id(run_id) - run_params[key] = value - self.evaluation_parameter_store.set(run_id, run_params) - - def get_parameters_in_evaluation_parameter_store_by_run_id(self, run_id): - """Fetches all validation parameters for a given run_id. - - Args: - run_id: current run_id - - Returns: - value stored in evaluation_parameter_store for the provided run_id and key - """ - if self.evaluation_parameter_store.has_key(run_id): - return copy.deepcopy( - self.evaluation_parameter_store.get(run_id) - ) - else: - return {} - - #NOTE: Abe 2019/08/22 : Can we rename this to _compile_all_evaluation_parameters_from_expectation_suites, or something similar? - # A more descriptive name would have helped me grok this faster when I first encountered it - def _compile(self): - """Compiles all current expectation configurations in this context to be ready for result registration. - - Compilation only respects parameters with a URN structure beginning with urn:great_expectations:validations - It splits parameters by the : (colon) character; valid URNs must have one of the following structures to be - automatically recognized. 
- - "urn" : "great_expectations" : "validations" : data_asset_name : expectation_suite_name : "expectations" : expectation_name : "columns" : column_name : "result": result_key - [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] - - "urn" : "great_expectations" : "validations" : data_asset_name : expectation_suite_name : "expectations" : expectation_name : "columns" : column_name : "details": details_key - [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] - - "urn" : "great_expectations" : "validations" : data_asset_name : expectation_suite_name : "expectations" : expectation_name : "result": result_key - [0] [1] [2] [3] [4] [5] [6] [7] [8] - - "urn" : "great_expectations" : "validations" : data_asset_name : expectation_suite_name : "expectations" : expectation_name : "details": details_key - [0] [1] [2] [3] [4] [5] [6] [7] [8] - - Parameters are compiled to the following structure: - - :: json - - { - "raw": - "data_assets": { - data_asset_name: { - expectation_suite_name: { - expectation_name: { - "details": - "result": - column_name: { - "details": - "result": - } - } - } - } - } - } - - - """ - - # Full recompilation every time - self._compiled_parameters = { - "raw": set(), - "data_assets": {} - } - + def _compile_evaluation_parameter_dependencies(self): + self._evaluation_parameter_dependencies = {} for key in self.stores[self.expectations_store_name].list_keys(): - config = self.stores[self.expectations_store_name].get(key) - for expectation in config["expectations"]: - for _, value in expectation["kwargs"].items(): - if isinstance(value, dict) and '$PARAMETER' in value: - # Compile *only* respects parameters in urn structure - # beginning with urn:great_expectations:validations - if value["$PARAMETER"].startswith("urn:great_expectations:validations:"): - column_expectation = False - parameter = value["$PARAMETER"] - self._compiled_parameters["raw"].add(parameter) - param_parts = parameter.split(":") - try: - data_asset_name = param_parts[3] - expectation_suite_name = param_parts[4] - expectation_name = param_parts[6] - if param_parts[7] == "columns": - column_expectation = True - column_name = param_parts[8] - param_key = param_parts[9] - else: - param_key = param_parts[7] - except IndexError: - logger.warning("Invalid parameter urn (not enough parts): %s" % parameter) - continue - - normalized_data_asset_name = self.normalize_data_asset_name(data_asset_name) - - data_asset_name = DataAssetIdentifier(normalized_data_asset_name.datasource, - normalized_data_asset_name.generator, - normalized_data_asset_name.generator_asset) - if data_asset_name not in self._compiled_parameters["data_assets"]: - self._compiled_parameters["data_assets"][data_asset_name] = {} - - if expectation_suite_name not in self._compiled_parameters["data_assets"][data_asset_name]: - self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name] = {} - - if expectation_name not in self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name]: - self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_name] = {} - - if column_expectation: - if "columns" not in self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_name]: - self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_name]["columns"] = {} - if column_name not in self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_name]["columns"]: - 
self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_name]["columns"][column_name] = {} - if param_key not in self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_name]["columns"][column_name]: - self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_name]["columns"][column_name][param_key] = set() - self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_name]["columns"][column_name][param_key].add(parameter) - - elif param_key in ["result", "details"]: - if param_key not in self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_name]: - self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_name][param_key] = set() - self._compiled_parameters["data_assets"][data_asset_name][expectation_suite_name][expectation_name][param_key].add(parameter) - - else: - logger.warning("Invalid parameter urn (unrecognized structure): %s" % parameter) - - self._compiled = True - - # # TDOD : Deprecate this method in favor of Stores. - # def write_resource( - # self, - # resource, # bytes - # resource_name, # name to be used inside namespace, e.g. "my_file.html" - # resource_store, # store to use to write the resource - # resource_namespace=None, # An arbitrary name added to the resource namespace - # data_asset_name=None, # A name that will be normalized by the data_context and used in the namespace - # expectation_suite_name=None, # A string that is part of the namespace - # run_id=None - # ): # A string that is part of the namespace - # """Writes the bytes in "resource" according to the resource_store's writing method, with a name constructed - # as follows: - # - # resource_namespace/run_id/data_asset_name/expectation_suite_name/resource_name - # - # If any of those components is None, it is omitted from the namespace. 
- # - # Args: - # resource: - # resource_name: - # resource_store: - # resource_namespace: - # data_asset_name: - # expectation_suite_name: - # run_id: - # - # Returns: - # A dictionary describing how to locate the resource (specific to resource_store type) - # """ - # logger.debug("Starting DatContext.write_resource") - # - # if resource_store is None: - # logger.error("No resource store specified") - # return - # - # resource_locator_info = {} - # - # if resource_store['type'] == "s3": - # raise NotImplementedError("s3 is not currently a supported resource_store type for writing") - # elif resource_store['type'] == 'filesystem': - # resource_store = self._normalize_store_path(resource_store) - # path_components = [resource_store['base_directory']] - # if resource_namespace is not None: - # path_components.append(resource_namespace) - # if run_id is not None: - # path_components.append(run_id) - # if data_asset_name is not None: - # if not isinstance(data_asset_name, NormalizedDataAssetName): - # normalized_name = self.normalize_data_asset_name(data_asset_name) - # else: - # normalized_name = data_asset_name - # if expectation_suite_name is not None: - # path_components.append(self._get_normalized_data_asset_name_filepath(normalized_name, expectation_suite_name, base_path="", file_extension="")) - # else: - # path_components.append( - # self._get_normalized_data_asset_name_filepath(normalized_name, "", - # base_path="", file_extension="")) - # else: - # if expectation_suite_name is not None: - # path_components.append(expectation_suite_name) - # - # path_components.append(resource_name) - # - # path = os.path.join( - # *path_components - # ) - # safe_mmkdir(os.path.dirname(path)) - # with open(path, "w") as writer: - # writer.write(resource) - # - # resource_locator_info['path'] = path - # else: - # raise ge_exceptions.DataContextError("Unrecognized resource store type.") - # - # return resource_locator_info + expectation_suite = self.stores[self.expectations_store_name].get(key) + dependencies = expectation_suite.get_evaluation_parameter_dependencies() + if len(dependencies) > 0: + nested_update(self._evaluation_parameter_dependencies, dependencies) + + self._evaluation_parameter_dependencies_compiled = True def get_validation_result( self, - data_asset_name, - expectation_suite_name="default", + expectation_suite_name, run_id=None, - validations_store_name="validations_store", + batch_identifier=None, + validations_store_name=None, failed_only=False, ): """Get validation results from a configured store. @@ -1545,45 +824,49 @@ def get_validation_result( validation_result """ - + if validations_store_name is None: + validations_store_name = self.validations_store_name selected_store = self.stores[validations_store_name] - if not isinstance(data_asset_name, NormalizedDataAssetName): - data_asset_name = self.normalize_data_asset_name(data_asset_name) - - if not isinstance(data_asset_name, DataAssetIdentifier): - data_asset_name = DataAssetIdentifier( - datasource=data_asset_name.datasource, - generator=data_asset_name.generator, - generator_asset=data_asset_name.generator_asset - ) - - if run_id == None: + if run_id is None or batch_identifier is None: #Get most recent run id # NOTE : This method requires a (potentially very inefficient) list_keys call. # It should probably move to live in an appropriate Store class, # but when we do so, that Store will need to function as more than just a key-value Store. 
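+        # Usage sketch (suite name illustrative): with run_id and batch_identifier
+        # omitted, the most recent matching key in the store is returned:
+        #
+        #     result = context.get_validation_result("my_table.warning", failed_only=True)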
key_list = selected_store.list_keys() - run_id_set = set([key.run_id for key in key_list]) - if len(run_id_set) == 0: + filtered_key_list = [] + for key in key_list: + if run_id is not None and key.run_id != run_id: + continue + if batch_identifier is not None and key.batch_identifier != batch_identifier: + continue + filtered_key_list.append(key) + + # run_id_set = set([key.run_id for key in filtered_key_list]) + if len(filtered_key_list) == 0: logger.warning("No valid run_id values found.") return {} - run_id = max(run_id_set) + filtered_key_list = sorted(filtered_key_list, key=lambda x: x.run_id) + + if run_id is None: + run_id = filtered_key_list[-1].run_id + if batch_identifier is None: + batch_identifier = filtered_key_list[-1].batch_identifier key = ValidationResultIdentifier( expectation_suite_identifier=ExpectationSuiteIdentifier( - data_asset_name=data_asset_name, expectation_suite_name=expectation_suite_name ), - run_id=run_id - ) + run_id=run_id, + batch_identifier=batch_identifier + ) results_dict = selected_store.get(key) #TODO: This should be a convenience method of ValidationResultSuite if failed_only: - failed_results_list = [result for result in results_dict["results"] if not result["success"]] - results_dict["results"] = failed_results_list + failed_results_list = [result for result in results_dict.results if not result.success] + results_dict.results = failed_results_list return results_dict else: return results_dict @@ -1633,29 +916,28 @@ def build_data_docs(self, site_names=None, resource_identifiers=None): for site_name, site_config in sites.items(): logger.debug("Building Data Docs Site %s" % site_name,) - # NOTE: 20191007 - JPC: removed condition that zero-length site_names mean build all sites if (site_names and site_name in site_names) or not site_names: complete_site_config = site_config site_builder = instantiate_class_from_config( config=complete_site_config, - runtime_config={ + runtime_environment={ "data_context": self, + "root_directory": self.root_directory, + "site_name": site_name }, config_defaults={ "module_name": "great_expectations.render.renderer.site_builder" } ) - index_page_locator_info = site_builder.build(resource_identifiers)[0] + index_page_resource_identifier_tuple = site_builder.build(resource_identifiers) + if index_page_resource_identifier_tuple: + index_page_locator_infos[site_name] = index_page_resource_identifier_tuple[0] - if index_page_locator_info: - index_page_locator_infos[site_name] = index_page_locator_info else: logger.debug("No data_docs_config found. No site(s) built.") return index_page_locator_infos - # Proposed TODO : Abe 2019/09/21 : I think we want to convert this method into a configurable profiler class, so that - # it can be pluggable and configurable def profile_datasource(self, datasource_name, generator_name=None, @@ -1664,6 +946,7 @@ def profile_datasource(self, profile_all_data_assets=True, profiler=BasicDatasetProfiler, dry_run=False, + run_id="profiling", additional_batch_kwargs=None): """Profile the named datasource using the named profiler. @@ -1688,60 +971,109 @@ def profile_datasource(self, When success = False, the error details are under "error" key """ + # We don't need the datasource object, but this line serves to check if the datasource by the name passed as + # an arg exists and raise an error if it does not. 
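+        # Usage sketch (datasource name illustrative): profile up to max_data_assets
+        # assets and then rebuild the docs that render the generated suites:
+        #
+        #     results = context.profile_datasource("my_datasource", max_data_assets=20)
+        #     if results["success"]:
+        #         context.build_data_docs()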
+        datasource = self.get_datasource(datasource_name)

        if not dry_run:
            logger.info("Profiling '%s' with '%s'" % (datasource_name, profiler.__name__))

        profiling_results = {}

-        # Get data_asset_name_list
-        data_asset_names = self.get_available_data_asset_names(datasource_name)
+        # Build the list of available data asset names (each item a tuple of name and type)
+
+        data_asset_names_dict = self.get_available_data_asset_names(datasource_name)
+
+        available_data_asset_name_list = []
+        try:
+            datasource_data_asset_names_dict = data_asset_names_dict[datasource_name]
+        except KeyError:
+            # KeyError will happen if there is no datasource with that name
+            raise ge_exceptions.ProfilerError(
+                "No datasource {} found.".format(datasource_name))
+
        if generator_name is None:
-            if len(data_asset_names[datasource_name].keys()) == 1:
-                generator_name = list(data_asset_names[datasource_name].keys())[0]
-            if generator_name not in data_asset_names[datasource_name]:
-                raise ge_exceptions.ProfilerError("Generator %s not found for datasource %s" % (generator_name, datasource_name))
+            # if no generator name is passed as an arg and the datasource has only
+            # one generator with data asset names, use it.
+            # if ambiguous, raise an exception
+            for name in datasource_data_asset_names_dict.keys():
+                if generator_name is not None:
+                    profiling_results = {
+                        'success': False,
+                        'error': {
+                            'code': DataContext.PROFILING_ERROR_CODE_MULTIPLE_GENERATORS_FOUND
+                        }
+                    }
+                    return profiling_results
+
+                if len(datasource_data_asset_names_dict[name]["names"]) > 0:
+                    available_data_asset_name_list = datasource_data_asset_names_dict[name]["names"]
+                    generator_name = name
+
+            if generator_name is None:
+                profiling_results = {
+                    'success': False,
+                    'error': {
+                        'code': DataContext.PROFILING_ERROR_CODE_NO_GENERATOR_FOUND
+                    }
+                }
+                return profiling_results
+        else:
+            # if the generator name is passed as an arg, get this generator's available data asset names
+            try:
+                available_data_asset_name_list = datasource_data_asset_names_dict[generator_name]["names"]
+            except KeyError:
+                raise ge_exceptions.ProfilerError(
+                    "Batch Kwarg Generator {} not found. Specify the name of a generator configured in this datasource".format(generator_name))

-        data_asset_name_list = list(data_asset_names[datasource_name][generator_name])
-        total_data_assets = len(data_asset_name_list)
+        available_data_asset_name_list = sorted(available_data_asset_name_list, key=lambda x: x[0])

-        if data_assets and len(data_assets) > 0:
-            not_found_data_assets = [name for name in data_assets if name not in data_asset_name_list]
+        if len(available_data_asset_name_list) == 0:
+            raise ge_exceptions.ProfilerError(
+                "No Data Assets found in Datasource {}.
Used generator: {}.".format( + datasource_name, + generator_name) + ) + total_data_assets = len(available_data_asset_name_list) + + data_asset_names_to_profiled = None + + if isinstance(data_assets, list) and len(data_assets) > 0: + not_found_data_assets = [name for name in data_assets if name not in [da[0] for da in available_data_asset_name_list]] if len(not_found_data_assets) > 0: profiling_results = { 'success': False, 'error': { 'code': DataContext.PROFILING_ERROR_CODE_SPECIFIED_DATA_ASSETS_NOT_FOUND, 'not_found_data_assets': not_found_data_assets, - 'data_assets': data_asset_name_list + 'data_assets': available_data_asset_name_list } } return profiling_results - - data_asset_name_list = data_assets - data_asset_name_list.sort() - total_data_assets = len(data_asset_name_list) + data_assets.sort() + data_asset_names_to_profiled = data_assets + total_data_assets = len(available_data_asset_name_list) if not dry_run: - logger.info("Profiling the white-listed data assets: %s, alphabetically." % (",".join(data_asset_name_list))) + logger.info("Profiling the white-listed data assets: %s, alphabetically." % (",".join(data_assets))) else: - if profile_all_data_assets: - data_asset_name_list.sort() - else: + if not profile_all_data_assets: if total_data_assets > max_data_assets: profiling_results = { 'success': False, 'error': { 'code': DataContext.PROFILING_ERROR_CODE_TOO_MANY_DATA_ASSETS, 'num_data_assets': total_data_assets, - 'data_assets': data_asset_name_list + 'data_assets': available_data_asset_name_list } } return profiling_results + data_asset_names_to_profiled = [name[0] for name in available_data_asset_name_list] if not dry_run: - logger.info("Profiling all %d data assets from generator %s" % (len(data_asset_name_list), generator_name)) + logger.info("Profiling all %d data assets from generator %s" % (len(available_data_asset_name_list), generator_name)) else: - logger.debug("Found %d data assets from generator %s" % (len(data_asset_name_list), generator_name)) + logger.info("Found %d data assets from generator %s" % (len(available_data_asset_name_list), generator_name)) profiling_results['success'] = True @@ -1749,82 +1081,29 @@ def profile_datasource(self, profiling_results['results'] = [] total_columns, total_expectations, total_rows, skipped_data_assets = 0, 0, 0, 0 total_start_time = datetime.datetime.now() - # run_id = total_start_time.isoformat().replace(":", "") + "Z" - run_id = "profiling" - for name in data_asset_name_list: + for name in data_asset_names_to_profiled: logger.info("\tProfiling '%s'..." % name) try: - start_time = datetime.datetime.now() - - # FIXME: There needs to be an affordance here to limit to 100 rows, or downsample, etc. 
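+        # Sketch of the whitelist error path described above (names illustrative):
+        #
+        #     results = context.profile_datasource("my_datasource", data_assets=["missing_table"])
+        #     if not results["success"]:
+        #         print(results["error"]["code"], results["error"]["not_found_data_assets"])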
- if additional_batch_kwargs is None: - additional_batch_kwargs = {} - - normalized_data_asset_name = self.normalize_data_asset_name(name) - expectation_suite_name = profiler.__name__ - self.create_expectation_suite( - data_asset_name=normalized_data_asset_name, - expectation_suite_name=expectation_suite_name, - overwrite_existing=True - ) - batch_kwargs = self.yield_batch_kwargs( - data_asset_name=normalized_data_asset_name, - **additional_batch_kwargs - ) - - batch = self.get_batch( - data_asset_name=normalized_data_asset_name, - expectation_suite_name=expectation_suite_name, - batch_kwargs=batch_kwargs - ) - - if not profiler.validate(batch): - raise ge_exceptions.ProfilerError( - "batch '%s' is not a valid batch for the '%s' profiler" % (name, profiler.__name__) - ) - - # Note: This logic is specific to DatasetProfilers, which profile a single batch. Multi-batch profilers - # will have more to unpack. - expectation_suite, validation_results = profiler.profile(batch, run_id=run_id) - profiling_results['results'].append((expectation_suite, validation_results)) - - self.validations_store.set( - key=ValidationResultIdentifier( - expectation_suite_identifier=ExpectationSuiteIdentifier( - data_asset_name=DataAssetIdentifier( - *normalized_data_asset_name - ), - expectation_suite_name=expectation_suite_name - ), - run_id=run_id - ), - value=validation_results + profiling_results['results'].append( + self.profile_data_asset( + datasource_name=datasource_name, + generator_name=generator_name, + data_asset_name=name, + profiler=profiler, + run_id=run_id, + additional_batch_kwargs=additional_batch_kwargs + )["results"][0] ) - if isinstance(batch, Dataset): - # For datasets, we can produce some more detailed statistics - row_count = batch.get_row_count() - total_rows += row_count - new_column_count = len(set([exp["kwargs"]["column"] for exp in expectation_suite["expectations"] if "column" in exp["kwargs"]])) - total_columns += new_column_count - - new_expectation_count = len(expectation_suite["expectations"]) - total_expectations += new_expectation_count - - self.save_expectation_suite(expectation_suite) - duration = (datetime.datetime.now() - start_time).total_seconds() - logger.info("\tProfiled %d columns using %d rows from %s (%.3f sec)" % - (new_column_count, row_count, name, duration)) - except ge_exceptions.ProfilerError as err: logger.warning(err.message) except IOError as err: - logger.warning("IOError while profiling %s. (Perhaps a loading error?) Skipping." % name) + logger.warning("IOError while profiling %s. (Perhaps a loading error?) Skipping." % name[1]) logger.debug(str(err)) skipped_data_assets += 1 except SQLAlchemyError as e: - logger.warning("SqlAlchemyError while profiling %s. Skipping." % name) + logger.warning("SqlAlchemyError while profiling %s. Skipping." % name[1]) logger.debug(str(e)) skipped_data_assets += 1 @@ -1832,7 +1111,7 @@ def profile_datasource(self, logger.info(""" Profiled %d of %d named data assets, with %d total rows and %d columns in %.2f seconds. Generated, evaluated, and stored %d Expectations during profiling. 
Please review results using data-docs.""" % ( - len(data_asset_name_list), + len(data_asset_names_to_profiled), total_data_assets, total_rows, total_columns, @@ -1845,8 +1124,142 @@ def profile_datasource(self, profiling_results['success'] = True return profiling_results + def profile_data_asset(self, + datasource_name, + generator_name=None, + data_asset_name=None, + batch_kwargs=None, + expectation_suite_name=None, + profiler=BasicDatasetProfiler, + run_id="profiling", + additional_batch_kwargs=None): + """ + Profile a data asset + + :param datasource_name: the name of the datasource to which the profiled data asset belongs + :param generator_name: the name of the generator to use to get batches (only if batch_kwargs are not provided) + :param data_asset_name: the name of the profiled data asset + :param batch_kwargs: optional - if set, the method will use the value to fetch the batch to be profiled. If not passed, the generator (generator_name arg) will choose a batch + :param profiler: the profiler class to use + :param run_id: optional - if set, the validation result created by the profiler will be under the provided run_id + :param additional_batch_kwargs: + :returns + A dictionary:: + + { + "success": True/False, + "results": List of (expectation_suite, EVR) tuples for each of the data_assets found in the datasource + } + + When success = False, the error details are under "error" key + """ + + logger.info("Profiling '%s' with '%s'" % (datasource_name, profiler.__name__)) + + if not additional_batch_kwargs: + additional_batch_kwargs = {} + + if batch_kwargs is None: + try: + generator = self.get_datasource(datasource_name=datasource_name).get_generator(generator_name=generator_name) + batch_kwargs = generator.build_batch_kwargs(data_asset_name, **additional_batch_kwargs) + except ge_exceptions.BatchKwargsError: + raise ge_exceptions.ProfilerError( + "Unable to build batch_kwargs for datasource {}, using generator {} for name {}".format( + datasource_name, + generator_name, + data_asset_name + )) + except ValueError: + raise ge_exceptions.ProfilerError( + "Unable to find datasource {} or generator {}.".format(datasource_name, generator_name) + ) + else: + batch_kwargs.update(additional_batch_kwargs) + + profiling_results = { + "success": False, + "results": [] + } + + total_columns, total_expectations, total_rows, skipped_data_assets = 0, 0, 0, 0 + total_start_time = datetime.datetime.now() + + name = data_asset_name + # logger.info("\tProfiling '%s'..." % name) + + start_time = datetime.datetime.now() + + if expectation_suite_name is None: + if generator_name is None and data_asset_name is None: + expectation_suite_name = datasource_name + "." + profiler.__name__ + "." + BatchKwargs( + batch_kwargs).to_id() + else: + expectation_suite_name = datasource_name + "." + generator_name + "." + data_asset_name + "." + \ + profiler.__name__ + + self.create_expectation_suite( + expectation_suite_name=expectation_suite_name, + overwrite_existing=True + ) + + # TODO: Add batch_parameters + batch = self.get_batch( + expectation_suite_name=expectation_suite_name, + batch_kwargs=batch_kwargs, + ) + + if not profiler.validate(batch): + raise ge_exceptions.ProfilerError( + "batch '%s' is not a valid batch for the '%s' profiler" % (name, profiler.__name__) + ) + + # Note: This logic is specific to DatasetProfilers, which profile a single batch. Multi-batch profilers + # will have more to unpack. 
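+        # For instance (sketch; BasicDatasetProfiler is the default profiler here),
+        # a single-batch profile run yields one (suite, validation results) pair:
+        #
+        #     suite, results = BasicDatasetProfiler.profile(batch, run_id=run_id)
+        #     print(len(suite.expectations), results.success)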
+        expectation_suite, validation_results = profiler.profile(batch, run_id=run_id)
+        profiling_results['results'].append((expectation_suite, validation_results))
+
+        self.validations_store.set(
+            key=ValidationResultIdentifier(
+                expectation_suite_identifier=ExpectationSuiteIdentifier(
+                    expectation_suite_name=expectation_suite_name
+                ),
+                run_id=run_id,
+                batch_identifier=batch.batch_id
+            ),
+            value=validation_results
+        )
+
+        if isinstance(batch, Dataset):
+            # For datasets, we can produce some more detailed statistics
+            row_count = batch.get_row_count()
+            total_rows += row_count
+            new_column_count = len(set([exp.kwargs["column"] for exp in expectation_suite.expectations if "column" in exp.kwargs]))
+            total_columns += new_column_count
+
+        new_expectation_count = len(expectation_suite.expectations)
+        total_expectations += new_expectation_count
+
+        self.save_expectation_suite(expectation_suite)
+        duration = (datetime.datetime.now() - start_time).total_seconds()
+        logger.info("\tProfiled %d columns using %d rows from %s (%.3f sec)" %
+                    (new_column_count, row_count, name, duration))
+
+        total_duration = (datetime.datetime.now() - total_start_time).total_seconds()
+        logger.info("""
+Profiled the data asset, with %d total rows and %d columns in %.2f seconds.
+Generated, evaluated, and stored %d Expectations during profiling. Please review results using data-docs.""" % (
+            total_rows,
+            total_columns,
+            total_duration,
+            total_expectations,
+        ))
+
+        profiling_results['success'] = True
+        return profiling_results
+
-class DataContext(ConfigOnlyDataContext):
+class DataContext(BaseDataContext):
     """A DataContext represents a Great Expectations project. It organizes storage and access for
     expectation suites, datasources, notification settings, and data fixtures.
@@ -1883,9 +1296,143 @@ class DataContext(ConfigOnlyDataContext):
     Similarly, if no expectation suite name is provided, the DataContext will assume the name "default".
     """
+    @classmethod
+    def create(cls, project_root_dir=None):
+        """
+        Build a new great_expectations directory and DataContext object in the provided project_root_dir.
+
+        `create` will create a new "great_expectations" directory in the provided folder, provided one does not
+        already exist. Then, it will initialize a new DataContext in that folder and write the resulting config.
+
+        Args:
+            project_root_dir: path to the root directory in which to create a new great_expectations directory
+
+        Returns:
+            DataContext
+        """
+
+        if not os.path.isdir(project_root_dir):
+            raise ge_exceptions.DataContextError(
+                "The project_root_dir must be an existing directory in which "
+                "to initialize a new DataContext"
+            )
+
+        ge_dir = os.path.join(project_root_dir, cls.GE_DIR)
+        safe_mmkdir(ge_dir, exist_ok=True)
+        cls.scaffold_directories(ge_dir)
+
+        if os.path.isfile(os.path.join(ge_dir, cls.GE_YML)):
+            message = """Warning. An existing `{}` was found here: {}.
+    - No action was taken.""".format(cls.GE_YML, ge_dir)
+            warnings.warn(message)
+        else:
+            cls.write_project_template_to_disk(ge_dir)
+
+        if os.path.isfile(os.path.join(ge_dir, "notebooks")):
+            message = """Warning. An existing `notebooks` directory was found here: {}.
+    - No action was taken.""".format(ge_dir)
+            warnings.warn(message)
+        else:
+            cls.scaffold_notebooks(ge_dir)
+
+        uncommitted_dir = os.path.join(ge_dir, cls.GE_UNCOMMITTED_DIR)
+        if os.path.isfile(os.path.join(uncommitted_dir, "config_variables.yml")):
+            message = """Warning. An existing `config_variables.yml` was found here: {}.
+    - No action was taken.""".format(uncommitted_dir)
+            warnings.warn(message)
+        else:
+            cls.write_config_variables_template_to_disk(uncommitted_dir)
+
+        return cls(ge_dir)
+
+    @classmethod
+    def all_uncommitted_directories_exist(cls, ge_dir):
+        """Check if all uncommitted directories exist."""
+        uncommitted_dir = os.path.join(ge_dir, cls.GE_UNCOMMITTED_DIR)
+        for directory in cls.UNCOMMITTED_DIRECTORIES:
+            if not os.path.isdir(os.path.join(uncommitted_dir, directory)):
+                return False
+
+        return True
+
+    @classmethod
+    def config_variables_yml_exist(cls, ge_dir):
+        """Check if the config_variables.yml file exists."""
+        path_to_yml = os.path.join(ge_dir, cls.GE_YML)
+
+        # TODO this is so brittle and gross
+        with open(path_to_yml, "r") as f:
+            config = yaml.load(f)
+        config_var_path = config.get("config_variables_file_path")
+        config_var_path = os.path.join(ge_dir, config_var_path)
+        return os.path.isfile(config_var_path)
+
+    @classmethod
+    def write_config_variables_template_to_disk(cls, uncommitted_dir):
+        safe_mmkdir(uncommitted_dir)
+        config_var_file = os.path.join(uncommitted_dir, "config_variables.yml")
+        with open(config_var_file, "w") as template:
+            template.write(CONFIG_VARIABLES_TEMPLATE)
+
+    @classmethod
+    def write_project_template_to_disk(cls, ge_dir):
+        file_path = os.path.join(ge_dir, cls.GE_YML)
+        with open(file_path, "w") as template:
+            template.write(PROJECT_TEMPLATE)
+
+    @classmethod
+    def scaffold_directories(cls, base_dir):
+        """Safely create GE directories for a new project."""
+        safe_mmkdir(base_dir, exist_ok=True)
+        open(os.path.join(base_dir, ".gitignore"), 'w').write("uncommitted/")
+
+        for directory in cls.BASE_DIRECTORIES:
+            if directory == "plugins":
+                plugins_dir = os.path.join(base_dir, directory)
+                safe_mmkdir(plugins_dir, exist_ok=True)
+                safe_mmkdir(os.path.join(plugins_dir, "custom_data_docs"), exist_ok=True)
+                safe_mmkdir(os.path.join(plugins_dir, "custom_data_docs", "views"), exist_ok=True)
+                safe_mmkdir(os.path.join(plugins_dir, "custom_data_docs", "renderers"), exist_ok=True)
+                safe_mmkdir(os.path.join(plugins_dir, "custom_data_docs", "styles"), exist_ok=True)
+                cls.scaffold_custom_data_docs(plugins_dir)
+            else:
+                safe_mmkdir(os.path.join(base_dir, directory), exist_ok=True)
+
+        uncommitted_dir = os.path.join(base_dir, cls.GE_UNCOMMITTED_DIR)

-    # def __init__(self, config, filepath, data_asset_name_delimiter='/'):
-    def __init__(self, context_root_dir=None, active_environment_name='default', data_asset_name_delimiter='/'):
+        for new_directory in cls.UNCOMMITTED_DIRECTORIES:
+            new_directory_path = os.path.join(uncommitted_dir, new_directory)
+            safe_mmkdir(
+                new_directory_path,
+                exist_ok=True
+            )
+
+        notebook_path = os.path.join(base_dir, "notebooks")
+        for subdir in cls.NOTEBOOK_SUBDIRECTORIES:
+            safe_mmkdir(os.path.join(notebook_path, subdir), exist_ok=True)
+
+    @classmethod
+    def scaffold_custom_data_docs(cls, plugins_dir):
+        """Copy custom data docs templates."""
+        styles_template = file_relative_path(
+            __file__, "../render/view/static/styles/data_docs_custom_styles_template.css")
+        styles_destination_path = os.path.join(
+            plugins_dir, "custom_data_docs", "styles", "data_docs_custom_styles.css")
+        shutil.copyfile(styles_template, styles_destination_path)
+
+    @classmethod
+    def scaffold_notebooks(cls, base_dir):
+        """Copy template notebooks into the notebooks directory for a project."""
+        template_dir = file_relative_path(__file__, "../init_notebooks/")
+        notebook_dir = os.path.join(base_dir, "notebooks/")
+        for subdir in cls.NOTEBOOK_SUBDIRECTORIES:
subdir_path = os.path.join(notebook_dir, subdir) + for notebook in glob.glob(os.path.join(template_dir, subdir, "*.ipynb")): + notebook_name = os.path.basename(notebook) + destination_path = os.path.join(subdir_path, notebook_name) + shutil.copyfile(notebook, destination_path) + + def __init__(self, context_root_dir=None): # Determine the "context root directory" - this is the parent of "great_expectations" dir if context_root_dir is None: @@ -1893,17 +1440,13 @@ def __init__(self, context_root_dir=None, active_environment_name='default', dat context_root_directory = os.path.abspath(os.path.expanduser(context_root_dir)) self._context_root_directory = context_root_directory - self.active_environment_name = active_environment_name - project_config = self._load_project_config() super(DataContext, self).__init__( project_config, - context_root_directory, - data_asset_name_delimiter, + context_root_directory ) - # TODO : This should use a Store so that the DataContext doesn't need to be aware of reading and writing to disk. def _load_project_config(self): """ Reads the project configuration from the project configuration file. @@ -1924,52 +1467,22 @@ def _load_project_config(self): except IOError: raise ge_exceptions.ConfigNotFoundError() - version = config_dict.get("config_version", 0) - - # TODO clean this up once type-checking configs is more robust - if not isinstance(version, int): - raise ge_exceptions.InvalidConfigValueTypeError("The key `config_version` must be an integer. Please check your config file.") - - # When migrating from 0.7.x to 0.8.0 - if version == 0 and ("validations_store" in list(config_dict.keys()) or "validations_stores" in list(config_dict.keys())): - raise ge_exceptions.ZeroDotSevenConfigVersionError( - "You appear to be using a config version from the 0.7.x series. This version is no longer supported." - ) - elif version < MINIMUM_SUPPORTED_CONFIG_VERSION: - raise ge_exceptions.UnsupportedConfigVersionError( - "You appear to have an invalid config version ({}).\n The version number must be between {} and {}.".format( - version, - MINIMUM_SUPPORTED_CONFIG_VERSION, - CURRENT_CONFIG_VERSION, - ) - ) - elif version > CURRENT_CONFIG_VERSION: - raise ge_exceptions.InvalidConfigVersionError( - "You appear to have an invalid config version ({}).\n The maximum valid version is {}.".format( - version, - CURRENT_CONFIG_VERSION - ) - ) - - # return DataContextConfig(**config_dict) - return config_dict - + try: + return DataContextConfig.from_commented_map(config_dict) + except ge_exceptions.InvalidDataContextConfigError: + # Just to be explicit about what we intended to catch + raise - # TODO : This should use a Store so that the DataContext doesn't need to be aware of reading and writing to disk. 
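Taken together, `create` and the constructor above give a project a simple two-step lifecycle: scaffold once, then reload by path in later sessions. A minimal sketch (the project path below is hypothetical)::

    from great_expectations.data_context import DataContext

    # Scaffolding is idempotent: existing files trigger warnings rather than overwrites.
    context = DataContext.create(project_root_dir="/path/to/project")

    # Later sessions reload the same project from its great_expectations/ directory.
    context = DataContext(context_root_dir="/path/to/project/great_expectations")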
def _save_project_config(self): """Save the current project to disk.""" logger.debug("Starting DataContext._save_project_config") config_filepath = os.path.join(self.root_directory, self.GE_YML) - with open(config_filepath, "w") as data: - config = copy.deepcopy( - self._project_config - ) - - yaml.dump(config, data) + with open(config_filepath, "w") as outfile: + self._project_config.to_yaml(outfile) def add_store(self, store_name, store_config): - logger.debug("Starting DataContext.add_store") + logger.debug("Starting DataContext.add_store for store %s" % store_name) new_store = super(DataContext, self).add_store(store_name, store_config) self._save_project_config() @@ -2002,13 +1515,15 @@ def find_context_root_dir(cls): if result is None: raise ge_exceptions.ConfigNotFoundError() - logger.info("Using project config: {}".format(yml_path)) + logger.debug("Using project config: {}".format(yml_path)) return result @classmethod - def find_context_yml_file(cls, search_start_dir=os.getcwd()): + def find_context_yml_file(cls, search_start_dir=None): """Search for the yml file starting here and moving upward.""" yml_path = None + if search_start_dir is None: + search_start_dir = os.getcwd() for i in range(4): logger.debug("Searching for config file {} ({} layer deep)".format(search_start_dir, i)) @@ -2026,18 +1541,73 @@ def find_context_yml_file(cls, search_start_dir=os.getcwd()): return yml_path + @classmethod + def does_config_exist_on_disk(cls, context_root_dir): + """Return True if the great_expectations.yml exists on disk.""" + return os.path.isfile(os.path.join(context_root_dir, cls.GE_YML)) + + @classmethod + def is_project_initialized(cls, ge_dir): + """ + Return True if the project is initialized. + + To be considered initialized, all of the following must be true: + - all project directories exist (including uncommitted directories) + - a valid great_expectations.yml is on disk + - a config_variables.yml is on disk + - the project has at least one datasource + - the project has at least one suite + """ + return ( + cls.does_config_exist_on_disk(ge_dir) + and cls.all_uncommitted_directories_exist(ge_dir) + and cls.config_variables_yml_exist(ge_dir) + and cls._does_context_have_at_least_one_datasource(ge_dir) + and cls._does_context_have_at_least_one_suite(ge_dir) + ) + + @classmethod + def does_project_have_a_datasource_in_config_file(cls, ge_dir): + if not cls.does_config_exist_on_disk(ge_dir): + return False + return cls._does_context_have_at_least_one_datasource(ge_dir) + + @classmethod + def _does_context_have_at_least_one_datasource(cls, ge_dir): + context = cls._attempt_context_instantiation(ge_dir) + if not isinstance(context, DataContext): + return False + return len(context.list_datasources()) >= 1 + + @classmethod + def _does_context_have_at_least_one_suite(cls, ge_dir): + context = cls._attempt_context_instantiation(ge_dir) + if not isinstance(context, DataContext): + return False + return len(context.list_expectation_suites()) >= 1 + + @classmethod + def _attempt_context_instantiation(cls, ge_dir): + try: + context = DataContext(ge_dir) + return context + except ( + ge_exceptions.DataContextError, + ge_exceptions.InvalidDataContextConfigError + ) as e: + logger.warning(e) + class ExplorerDataContext(DataContext): - def __init__(self, context_root_dir=None, expectation_explorer=True, data_asset_name_delimiter='/'): + def __init__(self, context_root_dir=None, expectation_explorer=True): """ expectation_explorer: If True, load the expectation explorer manager, which will modify 
GE return objects \ to include ipython notebook widgets. """ super(ExplorerDataContext, self).__init__( - context_root_dir, - data_asset_name_delimiter, + context_root_dir ) self._expectation_explorer = expectation_explorer @@ -2059,3 +1629,38 @@ def update_return_obj(self, data_asset, return_obj): return self._expectation_explorer_manager.create_expectation_widget(data_asset, return_obj) else: return return_obj + + +def _get_metric_configuration_tuples(metric_configuration, base_kwargs=None): + if base_kwargs is None: + base_kwargs = {} + + if isinstance(metric_configuration, string_types): + return [(metric_configuration, base_kwargs)] + + metric_configurations_list = [] + for kwarg_name in metric_configuration.keys(): + if not isinstance(metric_configuration[kwarg_name], dict): + raise ge_exceptions.DataContextError("Invalid metric_configuration: each key must contain a " + "dictionary.") + if kwarg_name == "metric_kwargs_id": # this special case allows a hash of multiple kwargs + for metric_kwargs_id in metric_configuration[kwarg_name].keys(): + if base_kwargs != {}: + raise ge_exceptions.DataContextError("Invalid metric_configuration: when specifying " + "metric_kwargs_id, no other keys or values may be defined.") + if not isinstance(metric_configuration[kwarg_name][metric_kwargs_id], list): + raise ge_exceptions.DataContextError("Invalid metric_configuration: each value must contain a " + "list.") + metric_configurations_list += [(metric_name, {"metric_kwargs_id": metric_kwargs_id}) for metric_name + in metric_configuration[kwarg_name][metric_kwargs_id]] + else: + for kwarg_value in metric_configuration[kwarg_name].keys(): + base_kwargs.update({kwarg_name: kwarg_value}) + if not isinstance(metric_configuration[kwarg_name][kwarg_value], list): + raise ge_exceptions.DataContextError("Invalid metric_configuration: each value must contain a " + "list.") + for nested_configuration in metric_configuration[kwarg_name][kwarg_value]: + metric_configurations_list += _get_metric_configuration_tuples(nested_configuration, + base_kwargs=base_kwargs) + + return metric_configurations_list diff --git a/great_expectations/data_context/store/__init__.py b/great_expectations/data_context/store/__init__.py index 71ae488276c9..7a7203c520e6 100644 --- a/great_expectations/data_context/store/__init__.py +++ b/great_expectations/data_context/store/__init__.py @@ -1,25 +1,13 @@ -from .store_backend import ( - StoreBackend, - InMemoryStoreBackend, - # FilesystemStoreBackend, - FixedLengthTupleFilesystemStoreBackend, - FixedLengthTupleS3StoreBackend, - FixedLengthTupleGCSStoreBackend -) - -from .store import ( - WriteOnlyStore, - ReadWriteStore, - BasicInMemoryStore, -) +from .store import Store +from .validations_store import ValidationsStore +from .expectations_store import ExpectationsStore +from .html_site_store import HtmlSiteStore +from .metric_store import MetricStore, EvaluationParameterStore -from .namespaced_read_write_store import ( - NamespacedReadWriteStore, - ValidationsStore, - ExpectationsStore, - HtmlSiteStore, +from .store_backend import StoreBackend, InMemoryStoreBackend +from .tuple_store_backend import ( + TupleFilesystemStoreBackend, + TupleS3StoreBackend, + TupleGCSStoreBackend ) - -from .evaluation_parameter_store import ( - InMemoryEvaluationParameterStore, -) \ No newline at end of file +from .database_store_backend import DatabaseStoreBackend diff --git a/great_expectations/data_context/store/data_snapshot_store.py b/great_expectations/data_context/store/data_snapshot_store.py 
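The recursive expansion in _get_metric_configuration_tuples is easiest to see by example. A sketch, assuming the helper stays module-private in great_expectations.data_context.data_context (the metric names below are hypothetical)::

    from great_expectations.data_context.data_context import _get_metric_configuration_tuples

    # A bare string is the base case: no kwargs.
    assert _get_metric_configuration_tuples("statistics.evaluated_expectations") == [
        ("statistics.evaluated_expectations", {})
    ]

    # A nested dict accumulates each level of keys and values into the metric kwargs.
    nested = {"column": {"provider_id": [
        "expect_column_values_to_not_be_null.result.unexpected_count"
    ]}}
    assert _get_metric_configuration_tuples(nested) == [
        ("expect_column_values_to_not_be_null.result.unexpected_count",
         {"column": "provider_id"})
    ]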
deleted file mode 100644 index 00169ac8917b..000000000000 --- a/great_expectations/data_context/store/data_snapshot_store.py +++ /dev/null @@ -1,96 +0,0 @@ -# NOTE: Deprecated. Retain until DataSnapshotStore is implemented -# class NamespacedReadWriteStoreConfig(ReadWriteStoreConfig): -# _allowed_keys = set({ -# "serialization_type", -# "resource_identifier_class_name", -# "store_backend", -# }) -# _required_keys = set({ -# "resource_identifier_class_name", -# "store_backend", -# }) - -# class DataSnapshotStore(WriteOnlyStore): - -# config_class = NamespacedReadWriteStoreConfig - -# def __init__(self, config, root_directory): -# # super(NamespacedReadWriteStore, self).__init__(config, root_directory) - -# # TODO: This method was copied and modified from the base class. -# # We need to refactor later to inherit sensibly. -# assert hasattr(self, 'config_class') - -# assert isinstance(config, self.config_class) -# self.config = config - -# self.root_directory = root_directory - -# # NOTE: hm. This is tricky. -# # At this point, we need to add some keys to the store_backend config. -# # The config from THIS class should be typed by this point. -# # But if we insist that it's recursively typed, it will have failed before arriving at this point. -# if self.config["store_backend"]["class_name"] == "FilesystemStoreBackend": -# self.config["store_backend"]["key_length"] = self.resource_identifier_class._recursively_get_key_length()#+1 #Only add one if we prepend the identifier type -# self.store_backend = self._configure_store_backend(self.config["store_backend"]) -# self.store_backend.verify_that_key_to_filepath_operation_is_reversible() - -# else: -# self.store_backend = self._configure_store_backend(self.config["store_backend"]) - - -# self._setup() - - -# def _get(self, key): -# key_tuple = self._convert_resource_identifier_to_tuple(key) -# return self.store_backend.get(key_tuple) - -# def _set(self, key, serialized_value): -# key_tuple = self._convert_resource_identifier_to_tuple(key) -# return self.store_backend.set(key_tuple, serialized_value) - -# def list_keys(self): -# return [self._convert_tuple_to_resource_identifier(key) for key in self.store_backend.list_keys()] - -# def _convert_resource_identifier_to_tuple(self, key): -# # TODO : Optionally prepend a source_id (the frontend Store name) to the tuple. - -# # TODO : Optionally prepend a resource_identifier_type to the tuple. 
-# # list_ = [self.config.resource_identifier_class_name] - -# list_ = [] -# list_ += self._convert_resource_identifier_to_list(key) - -# return tuple(list_) - -# def _convert_resource_identifier_to_list(self, key): -# # The logic in this function is recursive, so it can't return a tuple - -# list_ = [] -# #Fetch keys in _key_order to guarantee tuple ordering in both python 2 and 3 -# for key_name in key._key_order: -# key_element = key[key_name] -# if isinstance( key_element, DataContextResourceIdentifier ): -# list_ += self._convert_resource_identifier_to_list(key_element) -# else: -# list_.append(key_element) - -# return list_ - -# def _convert_tuple_to_resource_identifier(self, tuple_): -# new_identifier = self.resource_identifier_class(*tuple_)#[1:]) #Only truncate one if we prepended the identifier type -# return new_identifier - -# @property -# def resource_identifier_class(self): -# module = importlib.import_module("great_expectations.data_context.types.resource_identifiers") -# class_ = getattr(module, self.config.resource_identifier_class_name) -# return class_ - -# def _validate_key(self, key): -# if not isinstance(key, self.resource_identifier_class): -# raise TypeError("key: {!r} must be a DataContextResourceIdentifier, not {!r}".format( -# key, -# type(key), -# )) diff --git a/great_expectations/data_context/store/database_store_backend.py b/great_expectations/data_context/store/database_store_backend.py new file mode 100644 index 000000000000..19414d143c26 --- /dev/null +++ b/great_expectations/data_context/store/database_store_backend.py @@ -0,0 +1,71 @@ +import json +import great_expectations.exceptions as ge_exceptions + +try: + import sqlalchemy + from sqlalchemy import create_engine, Column, String, MetaData, Table, select, and_, column + from sqlalchemy.engine.url import URL +except ImportError: + sqlalchemy = None + create_engine = None + +from great_expectations.data_context.store.store_backend import StoreBackend + + +class DatabaseStoreBackend(StoreBackend): + + def __init__(self, credentials, table_name, key_columns, fixed_length_key=True): + super(DatabaseStoreBackend, self).__init__(fixed_length_key=fixed_length_key) + if not sqlalchemy: + raise ge_exceptions.DataContextError("ModuleNotFoundError: No module named 'sqlalchemy'") + + if not self.fixed_length_key: + raise ValueError("DatabaseStoreBackend requires use of a fixed-length-key") + + meta = MetaData() + self.key_columns = key_columns + # Dynamically construct a SQLAlchemy table with the name and column names we'll use + cols = [] + for column in key_columns: + if column == "value": + raise ValueError("'value' cannot be used as a key_element name") + cols.append(Column(column, String, primary_key=True)) + + cols.append(Column("value", String)) + self._table = Table( + table_name, meta, + *cols + ) + + drivername = credentials.pop("drivername") + options = URL(drivername, **credentials) + self.engine = create_engine(options) + meta.create_all(self.engine) + + def _get(self, key): + sel = select([column("value")]).select_from(self._table).where( + and_( + *[getattr(self._table.columns, key_col) == val for key_col, val in zip(self.key_columns, key)] + ) + ) + res = self.engine.execute(sel).fetchone() + if res: + return self.engine.execute(sel).fetchone()[0] + + def _set(self, key, value, **kwargs): + cols = {k: v for (k, v) in zip(self.key_columns, key)} + cols["value"] = value + ins = self._table.insert().values(**cols) + self.engine.execute(ins) + + def _has_key(self, key): + pass + + def list_keys(self, 
prefix=()): + sel = select([column(col) for col in self.key_columns]).select_from(self._table).where( + and_( + *[getattr(self._table.columns, key_col) == val for key_col, val in + zip(self.key_columns[:len(prefix)], prefix)] + ) + ) + return [tuple(row) for row in self.engine.execute(sel).fetchall()] diff --git a/great_expectations/data_context/store/evaluation_parameter_store.py b/great_expectations/data_context/store/evaluation_parameter_store.py deleted file mode 100644 index b961ded9e526..000000000000 --- a/great_expectations/data_context/store/evaluation_parameter_store.py +++ /dev/null @@ -1,17 +0,0 @@ -class InMemoryEvaluationParameterStore(object): - """You want to be a dict. You get to be a dict. But we call you a Store.""" - - def __init__(self, root_directory=None): - self.store = {} - - def get(self, key): - return self.store[key] - - def set(self, key, value): - self.store[key] = value - - def has_key(self, key): - return key in self.store - - def list_keys(self): - return list(self.store.keys()) diff --git a/great_expectations/data_context/store/expectations_store.py b/great_expectations/data_context/store/expectations_store.py new file mode 100644 index 000000000000..449065faa486 --- /dev/null +++ b/great_expectations/data_context/store/expectations_store.py @@ -0,0 +1,36 @@ +from great_expectations.core import ExpectationSuiteSchema +from great_expectations.data_context.store.database_store_backend import DatabaseStoreBackend +from great_expectations.data_context.store.tuple_store_backend import TupleStoreBackend +from great_expectations.data_context.store.store import Store +from great_expectations.data_context.types.resource_identifiers import ExpectationSuiteIdentifier +from great_expectations.data_context.util import load_class + + +class ExpectationsStore(Store): + _key_class = ExpectationSuiteIdentifier + + def __init__(self, store_backend=None, runtime_environment=None): + self._expectationSuiteSchema = ExpectationSuiteSchema(strict=True) + + if store_backend is not None: + store_backend_module_name = store_backend.get("module_name", "great_expectations.data_context.store") + store_backend_class_name = store_backend.get("class_name", "InMemoryStoreBackend") + store_backend_class = load_class(store_backend_class_name, store_backend_module_name) + + if issubclass(store_backend_class, TupleStoreBackend): + # Provide defaults for this common case + store_backend["filepath_suffix"] = store_backend.get("filepath_suffix", ".json") + elif issubclass(store_backend_class, DatabaseStoreBackend): + # Provide defaults for this common case + store_backend["table_name"] = store_backend.get("table_name", "ge_expectations_store") + store_backend["key_columns"] = store_backend.get( + "key_columns", ["expectation_suite_name"] + ) + + super(ExpectationsStore, self).__init__(store_backend=store_backend, runtime_environment=runtime_environment) + + def serialize(self, key, value): + return self._expectationSuiteSchema.dumps(value).data + + def deserialize(self, key, value): + return self._expectationSuiteSchema.loads(value).data diff --git a/great_expectations/data_context/store/html_site_store.py b/great_expectations/data_context/store/html_site_store.py new file mode 100644 index 000000000000..0784184b22d8 --- /dev/null +++ b/great_expectations/data_context/store/html_site_store.py @@ -0,0 +1,214 @@ +import os +import logging +from mimetypes import guess_type + +from great_expectations.data_context.types.resource_identifiers import ( + ExpectationSuiteIdentifier, + 
+    ValidationResultIdentifier,
+    SiteSectionIdentifier,
+)
+from .tuple_store_backend import TupleStoreBackend
+from great_expectations.data_context.util import (
+    load_class,
+    instantiate_class_from_config,
+    file_relative_path
+)
+from great_expectations.exceptions import DataContextError
+from ...core.data_context_key import DataContextKey
+
+logger = logging.getLogger(__name__)
+
+
+class HtmlSiteStore(object):
+    _key_class = SiteSectionIdentifier
+
+    def __init__(self, store_backend=None, runtime_environment=None):
+        store_backend_module_name = store_backend.get("module_name", "great_expectations.data_context.store")
+        store_backend_class_name = store_backend.get("class_name", "TupleFilesystemStoreBackend")
+        store_class = load_class(store_backend_class_name, store_backend_module_name)
+
+        if not issubclass(store_class, TupleStoreBackend):
+            raise DataContextError("Invalid configuration: HtmlSiteStore needs a TupleStoreBackend")
+        if "filepath_template" in store_backend or ("fixed_length_key" in store_backend and
+                                                    store_backend["fixed_length_key"] is True):
+            logger.warning("Configuring a filepath_template or using fixed_length_key is not supported in SiteBuilder: "
+                           "filepaths will be selected based on the type of asset rendered.")
+
+        # One thing to watch for is reversibility of keys.
+        # If several types are being written to overlapping directories, we could get collisions.
+        self.store_backends = {
+            ExpectationSuiteIdentifier: instantiate_class_from_config(
+                config=store_backend,
+                runtime_environment=runtime_environment,
+                config_defaults={
+                    "module_name": "great_expectations.data_context.store",
+                    "filepath_prefix": "expectations/",
+                    "filepath_suffix": ".html"
+                }
+            ),
+            ValidationResultIdentifier: instantiate_class_from_config(
+                config=store_backend,
+                runtime_environment=runtime_environment,
+                config_defaults={
+                    "module_name": "great_expectations.data_context.store",
+                    "filepath_prefix": "validations/",
+                    "filepath_suffix": ".html"
+                }
+            ),
+            "index_page": instantiate_class_from_config(
+                config=store_backend,
+                runtime_environment=runtime_environment,
+                config_defaults={
+                    "module_name": "great_expectations.data_context.store",
+                    "filepath_template": 'index.html',
+                }
+            ),
+            "static_assets": instantiate_class_from_config(
+                config=store_backend,
+                runtime_environment=runtime_environment,
+                config_defaults={
+                    "module_name": "great_expectations.data_context.store",
+                    "filepath_template": None,
+                }
+            ),
+        }
+
+        # NOTE: Instead of using the filesystem as the source of record for keys,
+        # this class tracks keys separately in an internal set.
+        # This means that keys are stored for a specific session, but can't be fetched after the original
+        # HtmlSiteStore instance leaves scope.
+        # Doing it this way allows us to prevent namespace collisions among keys while still having multiple
+        # backends that write to the same directory structure.
+        # It's a pretty reasonable way for HtmlSiteStore to do its job---you just have to remember that it
+        # can't necessarily set and list_keys like most other Stores.
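The per-type backends configured above mean one HtmlSiteStore fans a single store_backend config out across expectation pages, validation pages, the index page, and static assets. A rough usage sketch (the base_directory is hypothetical, and get_url_for_key is assumed to resolve to a file:// URL for the filesystem backend)::

    from great_expectations.data_context.store import HtmlSiteStore
    from great_expectations.data_context.types.resource_identifiers import (
        ExpectationSuiteIdentifier,
        SiteSectionIdentifier,
    )

    site_store = HtmlSiteStore(
        store_backend={
            "class_name": "TupleFilesystemStoreBackend",
            "base_directory": "/tmp/data_docs/local_site",  # hypothetical
        }
    )
    suite_id = ExpectationSuiteIdentifier(expectation_suite_name="npi.warning")
    key = SiteSectionIdentifier(site_section_name="expectations", resource_identifier=suite_id)
    site_store.set(key, "<html>rendered suite page</html>")
    url = site_store.get_url_for_resource(resource_identifier=suite_id)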
+        self.keys = set()
+
+    def get(self, key):
+        self._validate_key(key)
+        return self.store_backends[
+            type(key.resource_identifier)
+        ].get(key.to_tuple())
+
+    def set(self, key, serialized_value):
+        self._validate_key(key)
+        self.keys.add(key)
+
+        return self.store_backends[
+            type(key.resource_identifier)
+        ].set(key.resource_identifier.to_tuple(), serialized_value,
+              content_encoding='utf-8', content_type='text/html; charset=utf-8')
+
+    def get_url_for_resource(self, resource_identifier=None):
+        """
+        Return the URL of the HTML document that renders a resource
+        (e.g., an expectation suite or a validation result).
+
+        :param resource_identifier: ExpectationSuiteIdentifier, ValidationResultIdentifier
+            or any other type's identifier. The argument is optional - when
+            not supplied, the method returns the URL of the index page.
+        :return: URL (string)
+        """
+        if resource_identifier is None:
+            store_backend = self.store_backends["index_page"]
+            key = ()
+        elif isinstance(resource_identifier, ExpectationSuiteIdentifier):
+            store_backend = self.store_backends[ExpectationSuiteIdentifier]
+            key = resource_identifier.to_tuple()
+        elif isinstance(resource_identifier, ValidationResultIdentifier):
+            store_backend = self.store_backends[ValidationResultIdentifier]
+            key = resource_identifier.to_tuple()
+        else:
+            # this method does not support getting the URL of static assets
+            raise ValueError("Cannot get URL for resource {0:s}".format(str(resource_identifier)))
+
+        return store_backend.get_url_for_key(key)
+
+    def _validate_key(self, key):
+        if not isinstance(key, SiteSectionIdentifier):
+            raise TypeError("key: {!r} must be a SiteSectionIdentifier, not {!r}".format(
+                key,
+                type(key),
+            ))
+
+        for key_class in self.store_backends.keys():
+            try:
+                if isinstance(key.resource_identifier, key_class):
+                    return
+            except TypeError:
+                # it's ok to have a key that is not a type (e.g. the string "index_page")
+                continue
+
+        # The key's resource_identifier didn't match any known key_class
+        raise TypeError("resource_identifier in key: {!r} must be one of {}, not {!r}".format(
+            key,
+            set(self.store_backends.keys()),
+            type(key),
+        ))
+
+    def list_keys(self):
+        keys = []
+        for type_, backend in self.store_backends.items():
+            try:
+                # If the store_backend does not support list_keys...
+                key_tuples = backend.list_keys()
+            except NotImplementedError:
+                pass
+            try:
+                if issubclass(type_, DataContextKey):
+                    keys += [type_.from_tuple(tuple_) for tuple_ in key_tuples]
+            except TypeError:
+                # If the key in store_backends is not itself a type...
+                pass
+        return keys
+
+    def write_index_page(self, page):
+        """This third store has a special method, which uses a zero-length tuple as a key."""
+        return self.store_backends["index_page"].set((), page, content_encoding='utf-8', content_type='text/html; '
+                                                                                                      'charset=utf-8')
+
+    def copy_static_assets(self, static_assets_source_dir=None):
+        """
+        Copies static assets, using a special "static_assets" backend store that accepts variable-length tuples as
+        keys, with no filepath_template.
+ """ + file_exclusions = [".DS_Store"] + dir_exclusions = [] + + if not static_assets_source_dir: + static_assets_source_dir = file_relative_path(__file__, "../../render/view/static") + + for item in os.listdir(static_assets_source_dir): + # Directory + if os.path.isdir(os.path.join(static_assets_source_dir, item)): + if item in dir_exclusions: + continue + # Recurse + new_source_dir = os.path.join(static_assets_source_dir, item) + self.copy_static_assets(new_source_dir) + # File + else: + # Copy file over using static assets store backend + if item in file_exclusions: + continue + source_name = os.path.join(static_assets_source_dir, item) + with open(source_name, 'rb') as f: + # Only use path elements starting from static/ for key + store_key = tuple(os.path.normpath(source_name).split(os.sep)) + store_key = store_key[store_key.index('static'):] + content_type, content_encoding = guess_type(item, strict=False) + + if content_type is None: + # Use GE-known content-type if possible + if source_name.endswith(".otf"): + content_type = "font/opentype" + else: + # fallback + logger.warning("Unable to automatically determine content_type for {}".format(source_name)) + content_type = "text/html; charset=utf8" + + self.store_backends["static_assets"].set( + store_key, + f.read(), + content_encoding=content_encoding, + content_type=content_type + ) \ No newline at end of file diff --git a/great_expectations/data_context/store/metric_store.py b/great_expectations/data_context/store/metric_store.py new file mode 100644 index 000000000000..a12423e4bd4b --- /dev/null +++ b/great_expectations/data_context/store/metric_store.py @@ -0,0 +1,64 @@ +import json + +from great_expectations.core import ensure_json_serializable +from great_expectations.core.metric import ValidationMetricIdentifier +from great_expectations.data_context.store.database_store_backend import DatabaseStoreBackend +from great_expectations.data_context.store.store import Store +from great_expectations.util import load_class + + +class MetricStore(Store): + _key_class = ValidationMetricIdentifier + + def __init__(self, store_backend=None): + if store_backend is not None: + store_backend_module_name = store_backend.get("module_name", "great_expectations.data_context.store") + store_backend_class_name = store_backend.get("class_name", "InMemoryStoreBackend") + store_backend_class = load_class(store_backend_class_name, store_backend_module_name) + + if issubclass(store_backend_class, DatabaseStoreBackend): + # Provide defaults for this common case + store_backend["table_name"] = store_backend.get("table_name", "ge_metrics") + store_backend["key_columns"] = store_backend.get( + "key_columns", [ + "run_id", + "expectation_suite_identifier", + "metric_name", + "metric_kwargs_id", + ] + ) + + super(MetricStore, self).__init__(store_backend=store_backend) + + # noinspection PyMethodMayBeStatic + def _validate_value(self, value): + # Values must be json serializable since they must be inputs to expectation configurations + ensure_json_serializable(value) + + def serialize(self, key, value): + return json.dumps({"value": value}) + + def deserialize(self, key, value): + if value: + return json.loads(value)["value"] + + +class EvaluationParameterStore(MetricStore): + + def __init__(self, store_backend=None): + if store_backend is not None: + store_backend_module_name = store_backend.get("module_name", "great_expectations.data_context.store") + store_backend_class_name = store_backend.get("class_name", "InMemoryStoreBackend") + store_backend_class = 
load_class(store_backend_class_name, store_backend_module_name) + + if issubclass(store_backend_class, DatabaseStoreBackend): + # Provide defaults for this common case + store_backend["table_name"] = store_backend.get("table_name", "ge_evaluation_parameters") + super(EvaluationParameterStore, self).__init__(store_backend=store_backend) + + def get_bind_params(self, run_id): + params = {} + for k in self._store_backend.list_keys((run_id,)): + key = self.tuple_to_key(k) + params[key.to_evaluation_parameter_urn()] = self.get(key) + return params diff --git a/great_expectations/data_context/store/namespaced_read_write_store.py b/great_expectations/data_context/store/namespaced_read_write_store.py deleted file mode 100644 index cf0102afef68..000000000000 --- a/great_expectations/data_context/store/namespaced_read_write_store.py +++ /dev/null @@ -1,371 +0,0 @@ -import logging - -import copy -from mimetypes import guess_type -import os - -from ..types.base_resource_identifiers import ( - DataContextKey, -) -from great_expectations.data_context.types.resource_identifiers import ( - ExpectationSuiteIdentifier, - ValidationResultIdentifier, - SiteSectionIdentifier, -) -from .store import ( - ReadWriteStore, -) -from .store_backend import ( - FixedLengthTupleStoreBackend -) -from great_expectations.data_context.util import ( - load_class, - instantiate_class_from_config -) -from great_expectations.exceptions import DataContextError -from great_expectations.util import file_relative_path - -logger = logging.getLogger(__name__) - - -class NamespacedReadWriteStore(ReadWriteStore): - - def __init__(self, - store_backend, - root_directory, - serialization_type="json" - ): - self.store_backend = self._init_store_backend( - copy.deepcopy(store_backend), - runtime_config={ - "root_directory": root_directory - } - ) - self.root_directory = root_directory - self.serialization_type = serialization_type - - def _init_store_backend(self, store_backend_config, runtime_config): - if store_backend_config["class_name"] == "FixedLengthTupleFilesystemStoreBackend": - config_defaults = { - "key_length" : 5, - "module_name" : "great_expectations.data_context.store", - } - elif store_backend_config["class_name"] == "FixedLengthTupleS3StoreBackend": - config_defaults = { - "key_length": 5, # NOTE: Eugene: 2019-09-06: ??? - "module_name": "great_expectations.data_context.store", - } - else: - config_defaults = { - "module_name" : "great_expectations.data_context.store", - } - - return instantiate_class_from_config( - config=store_backend_config, - runtime_config=runtime_config, - config_defaults=config_defaults, - ) - - def _get(self, key): - self._validate_key(key) - - key_tuple = self._convert_resource_identifier_to_tuple(key) - return self.store_backend.get(key_tuple) - - def _set(self, key, serialized_value): - self._validate_key(key) - - key_tuple = self._convert_resource_identifier_to_tuple(key) - return self.store_backend.set(key_tuple, serialized_value) - - def list_keys(self): - return [self._convert_tuple_to_resource_identifier(key) for key in self.store_backend.list_keys()] - - def has_key(self, key): - # NOTE: This is not efficient - return key in self.list_keys() - - def _convert_resource_identifier_to_tuple(self, key): - # TODO : Optionally prepend a source_id (the frontend Store name) to the tuple. 
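Stepping back from the deleted namespace machinery: the new EvaluationParameterStore above closes the loop between validation results and dependent suites, since metrics saved under a run_id come back from get_bind_params as {URN: value} pairs. A rough sketch against the default in-memory backend (the ValidationMetricIdentifier arguments below mirror the DatabaseStoreBackend key_columns and are assumed, not taken from this patch; "__" is assumed to stand in for "no metric kwargs")::

    from great_expectations.core.metric import ValidationMetricIdentifier
    from great_expectations.data_context.store import EvaluationParameterStore
    from great_expectations.data_context.types.resource_identifiers import ExpectationSuiteIdentifier

    store = EvaluationParameterStore()  # no config -> in-memory backend
    key = ValidationMetricIdentifier(
        run_id="20190926T134241.000000Z",  # hypothetical run
        expectation_suite_identifier=ExpectationSuiteIdentifier(expectation_suite_name="npi.warning"),
        metric_name="expect_table_row_count_to_be_between.result.observed_value",
        metric_kwargs_id="__",  # assumed placeholder for "no kwargs"
    )
    store.set(key, 71)
    # {urn -> value} pairs, ready to bind into a dependent suite's evaluation parameters
    params = store.get_bind_params(run_id="20190926T134241.000000Z")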
- - list_ = [] - list_ += self._convert_resource_identifier_to_list(key) - - return tuple(list_) - - def _convert_resource_identifier_to_list(self, key): - # The logic in this function is recursive, so it can't return a tuple - - list_ = [] - #Fetch keys in _key_order to guarantee tuple ordering in both python 2 and 3 - for key_name in key._key_order: - key_element = key[key_name] - if isinstance( key_element, DataContextKey ): - list_ += self._convert_resource_identifier_to_list(key_element) - else: - list_.append(key_element) - - return list_ - - def _convert_tuple_to_resource_identifier(self, tuple_): - new_identifier = self.key_class(*tuple_) - return new_identifier - - def _validate_key(self, key): - if not isinstance(key, self.key_class): - raise TypeError("key: {!r} must be a {}, not {!r}".format( - key, - self.key_class, - type(key), - )) - - -class ExpectationsStore(NamespacedReadWriteStore): - # Note : As of 2019/09/06, this method is untested. - # It shares virtually all of its business logic with ValidationStore, which is under test. - - def _init_store_backend(self, store_backend_config, runtime_config): - self.key_class = ExpectationSuiteIdentifier - - store_backend_class_name = store_backend_config.get("class_name", "FixedLengthTupleFilesystemStoreBackend") - store_backend_module_name = store_backend_config.get("module_name", "great_expectations.data_context.store") - store_backend_class = load_class( - class_name=store_backend_class_name, - module_name=store_backend_module_name - ) - if issubclass(store_backend_class, FixedLengthTupleStoreBackend): - config_defaults = { - "key_length": 4, - "module_name": "great_expectations.data_context.store", - "filepath_template": "{0}/{1}/{2}/{3}.json" - } - else: - config_defaults = { - "module_name": "great_expectations.data_context.store", - } - - return instantiate_class_from_config( - config=store_backend_config, - runtime_config=runtime_config, - config_defaults=config_defaults, - ) - - -class ValidationsStore(NamespacedReadWriteStore): - - def _init_store_backend(self, store_backend_config, runtime_config): - self.key_class = ValidationResultIdentifier - - store_backend_class_name = store_backend_config.get("class_name", "FixedLengthTupleFilesystemStoreBackend") - store_backend_module_name = store_backend_config.get("module_name", "great_expectations.data_context.store") - store_backend_class = load_class( - class_name=store_backend_class_name, - module_name=store_backend_module_name - ) - if issubclass(store_backend_class, FixedLengthTupleStoreBackend): - config_defaults = { - "key_length": 5, - "module_name": "great_expectations.data_context.store", - "filepath_template": "{4}/{0}/{1}/{2}/{3}.json" - } - else: - config_defaults = { - "module_name": "great_expectations.data_context.store", - } - - return instantiate_class_from_config( - config=store_backend_config, - runtime_config=runtime_config, - config_defaults=config_defaults, - ) - - -class HtmlSiteStore(NamespacedReadWriteStore): - - def __init__(self, - root_directory, - serialization_type=None, - store_backend=None - ): - self.key_class = SiteSectionIdentifier - store_backend_module_name = store_backend.get("module_name", "great_expectations.data_context.store") - store_backend_class_name = store_backend.get("class_name", "FixedLengthTupleFilesystemStoreBackend") - store_class = load_class(store_backend_class_name, store_backend_module_name) - - if not issubclass(store_class, FixedLengthTupleStoreBackend): - raise DataContextError("Invalid configuration: HtmlSiteStore 
needs a FixedLengthTupleStoreBackend") - if "filepath_template" in store_backend or "key_length" in store_backend: - logger.warning("Configuring a filepath_template or key_length is not supported in SiteBuilder: " - "filepaths will be selected based on the type of asset rendered.") - - # # Each key type gets its own backend. - # # If backends were DB connections, this could be inefficient, but it doesn't much matter for filepaths. - # # One thing to watch for is reversibility of keys. - # # If several types are being writtten to overlapping directories, we could get collisions. - # expectations_backend_config = copy.deepcopy(store_backend) - # if "base_directory" in expectations_backend_config: - # expectations_backend_config["base_directory"] = os.path.join(expectations_backend_config["base_directory"], "expectations") - # elif "prefix" in expectations_backend_config: - # expectations_backend_config["prefix"] = os.path.join(expectations_backend_config["prefix"], "expectations") - # - # validations_backend_config = copy.deepcopy(store_backend) - # if "base_directory" in validations_backend_config: - # validations_backend_config["base_directory"] = os.path.join(validations_backend_config["base_directory"], "validations") - # elif "prefix" in validations_backend_config: - # validations_backend_config["prefix"] = os.path.join(validations_backend_config["prefix"], "validations") - - self.store_backends = { - ExpectationSuiteIdentifier: instantiate_class_from_config( - config=store_backend, - runtime_config={ - "root_directory": root_directory - }, - config_defaults={ - "module_name": "great_expectations.data_context.store", - "key_length": 4, - "filepath_template": 'expectations/{0}/{1}/{2}/{3}.html', - } - ), - ValidationResultIdentifier: instantiate_class_from_config( - config=store_backend, - runtime_config={ - "root_directory": root_directory - }, - config_defaults={ - "module_name": "great_expectations.data_context.store", - "key_length": 5, - "filepath_template": 'validations/{4}/{0}/{1}/{2}/{3}.html', - } - ), - "index_page": instantiate_class_from_config( - config=store_backend, - runtime_config={ - "root_directory": root_directory - }, - config_defaults={ - "module_name": "great_expectations.data_context.store", - "key_length": 0, - "filepath_template": 'index.html', - } - ), - "static_assets": instantiate_class_from_config( - config=store_backend, - runtime_config={ - "root_directory": root_directory - }, - config_defaults={ - "module_name": "great_expectations.data_context.store", - "key_length": None, - "filepath_template": None, - } - ), - } - - self.root_directory = root_directory - self.serialization_type = serialization_type - - # NOTE: Instead of using the filesystem as the source of record for keys, - # this class trackes keys separately in an internal set. - # This means that keys are stored for a specific session, but can't be fetched after the original - # HtmlSiteStore instance leaves scope. - # Doing it this way allows us to prevent namespace collisions among keys while still having multiple - # backends that write to the same directory structure. - # It's a pretty reasonable way for HtmlSiteStore to do its job---you just have to remember that it - # can't necessarily set and list_keys like most other Stores. 
- self.keys = set() - - def _convert_tuple_to_resource_identifier(self, tuple_): - if tuple_[0] == "expectations": - resource_identifier = ExpectationSuiteIdentifier(*tuple_[1]) - elif tuple_[0] == "validations": - resource_identifier = ValidationResultIdentifier(*tuple_[1]) - else: - raise Exception("unknown section name: " + tuple_[0]) - new_identifier = SiteSectionIdentifier(site_section_name=tuple_[0], resource_identifier=resource_identifier) - return new_identifier - - def _get(self, key): - self._validate_key(key) - - key_tuple = self._convert_resource_identifier_to_tuple(key.resource_identifier) - return self.store_backends[ - type(key.resource_identifier) - ].get(key_tuple) - - def _set(self, key, serialized_value): - self._validate_key(key) - - self.keys.add(key) - - key_tuple = self._convert_resource_identifier_to_tuple(key.resource_identifier) - return self.store_backends[ - type(key.resource_identifier) - ].set(key_tuple, serialized_value, content_encoding='utf-8', content_type='text/html; charset=utf-8') - - def _validate_key(self, key): - if not isinstance(key, SiteSectionIdentifier): - raise TypeError("key: {!r} must a SiteSectionIdentifier, not {!r}".format( - key, - type(key), - )) - - for key_class in self.store_backends.keys(): - try: - if isinstance(key.resource_identifier, key_class): - return - except TypeError: - # it's ok to have a key that is not a type (e.g. the string "index_page") - continue - - # The key's resource_identifier didn't match any known key_class - raise TypeError("resource_identifier in key: {!r} must one of {}, not {!r}".format( - key, - set(self.store_backends.keys()), - type(key), - )) - - def list_keys(self): - return [self._convert_tuple_to_resource_identifier(("expectations", key)) for key in self.store_backends[ExpectationSuiteIdentifier].list_keys()] + \ - [self._convert_tuple_to_resource_identifier(("validations", key)) for key in self.store_backends[ValidationResultIdentifier].list_keys()] - - def write_index_page(self, page): - """This third store has a special method, which uses a zero-length tuple as a key.""" - return self.store_backends["index_page"].set((), page, content_encoding='utf-8', content_type='text/html; ' - 'charset=utf-8') - - def copy_static_assets(self, static_assets_source_dir=None): - """ - Copies static assets, using a special "static_assets" backend store that accepts variable-length tuples as - keys, with no filepath_template. 
- """ - file_exclusions = [".DS_Store"] - dir_exclusions = [] - - if not static_assets_source_dir: - static_assets_source_dir = file_relative_path(__file__, "../../render/view/static") - - for item in os.listdir(static_assets_source_dir): - # Directory - if os.path.isdir(os.path.join(static_assets_source_dir, item)): - if item in dir_exclusions: - continue - # Recurse - new_source_dir = os.path.join(static_assets_source_dir, item) - self.copy_static_assets(new_source_dir) - # File - else: - # Copy file over using static assets store backend - if item in file_exclusions: - continue - source_name = os.path.join(static_assets_source_dir, item) - with open(source_name, 'rb') as f: - # Only use path elements starting from static/ for key - store_key = tuple(os.path.normpath(source_name).split(os.sep)) - store_key = store_key[store_key.index('static'):] - content_type, content_encoding = guess_type(item, strict=False) - self.store_backends["static_assets"].set( - store_key, - f.read(), - content_encoding=content_encoding, - content_type=content_type - ) diff --git a/great_expectations/data_context/store/store.py b/great_expectations/data_context/store/store.py index dadec1151c8d..98dc50d867ea 100644 --- a/great_expectations/data_context/store/store.py +++ b/great_expectations/data_context/store/store.py @@ -1,158 +1,78 @@ import logging -logger = logging.getLogger(__name__) -from six import string_types -import json +from great_expectations.core.data_context_key import DataContextKey +from great_expectations.data_context.store.store_backend import StoreBackend +from great_expectations.data_context.util import instantiate_class_from_config +from great_expectations.exceptions import DataContextError -import pandas as pd +logger = logging.getLogger(__name__) -from great_expectations.data_context.util import ( - instantiate_class_from_config -) -# TODO : Add a ConfigReadWriteStore. +class Store(object): + """A store is responsible for reading and writing Great Expectations objects + to appropriate backends. It provides a generic API that the DataContext can + use independently of any particular ORM and backend. -class WriteOnlyStore(object): - """This base class supports writing, but not reading. + An implementation of a store will generally need to define the following: + - serialize + - deserialize + - _key_class (class of expected key type) - It's suitable for things like HTML files that are information sinks. + All keys must have a to_tuple() method. """ - - def __init__(self, serialization_type=None, root_directory=None): - self.serialization_type = serialization_type - self.root_directory = root_directory - - def set(self, key, value, serialization_type=None): - self._validate_key(key) - - if serialization_type: - serialization_method = self._get_serialization_method( - serialization_type) - else: - serialization_method = self._get_serialization_method( - self.serialization_type) - - serialized_value = serialization_method(value) - return self._set(key, serialized_value) - - - # NOTE : Abe 2019/09/06 : It's unclear whether this serialization logic belongs here, - # or should be factored out to individual classes on a case-by-case basis. 
- - def _get_serialization_method(self, serialization_type): - if serialization_type == None: - return lambda x: x - - elif serialization_type == "json": - return json.dumps - - elif serialization_type == "pandas_csv": - - def convert_to_csv(df): - logger.debug("Starting convert_to_csv") - - assert isinstance(df, pd.DataFrame) - - return df.to_csv(index=None) - - return convert_to_csv - - # TODO: Add more serialization methods as needed + _key_class = DataContextKey + + def __init__(self, store_backend=None, runtime_environment=None): + """Runtime environment may be necessary to instantiate store backend elements.""" + if store_backend is None: + store_backend = { + "class_name": "InMemoryStoreBackend" + } + logger.debug("Building store_backend.") + self._store_backend = instantiate_class_from_config( + config=store_backend, + runtime_environment=runtime_environment or {}, + config_defaults={ + "module_name": "great_expectations.data_context.store" + } + ) + if not isinstance(self._store_backend, StoreBackend): + raise DataContextError("Invalid StoreBackend configuration: expected a StoreBackend instance.") + self._use_fixed_length_key = self._store_backend.fixed_length_key def _validate_key(self, key): - raise NotImplementedError - - def _validate_value(self, value): - # NOTE : This is probably mainly a check of serializability using the chosen serialization_type. - # Might be redundant. - raise NotImplementedError + if not isinstance(key, self._key_class): + raise TypeError("key must be an instance of %s, not %s" % (self._key_class.__name__, type(key))) - def _setup(self): - pass + # noinspection PyMethodMayBeStatic + def serialize(self, key, value): + return value - def _set(self, key, value): - raise NotImplementedError + # noinspection PyMethodMayBeStatic + def key_to_tuple(self, key): + if self._use_fixed_length_key: + return key.to_fixed_length_tuple() + return key.to_tuple() + def tuple_to_key(self, tuple_): + if self._use_fixed_length_key: + return self._key_class.from_fixed_length_tuple(tuple_) + return self._key_class.from_tuple(tuple_) -class ReadWriteStore(WriteOnlyStore): - """This base class supports both reading and writing. - - Most of the core objects in DataContext are handled by subclasses of ReadWriteStore. - """ + # noinspection PyMethodMayBeStatic + def deserialize(self, key, value): + return value - def get(self, key, serialization_type=None): + def get(self, key): self._validate_key(key) + return self.deserialize(key, self._store_backend.get(self.key_to_tuple(key))) - value = self._get(key) - - if serialization_type: - deserialization_method = self._get_deserialization_method( - serialization_type) - else: - deserialization_method = self._get_deserialization_method( - self.serialization_type) - - deserialized_value = deserialization_method(value) - return deserialized_value - - def _get(self, key): - raise NotImplementedError + def set(self, key, value): + self._validate_key(key) + return self._store_backend.set(self.key_to_tuple(key), self.serialize(key, value)) def list_keys(self): - raise NotImplementedError - - def has_key(self, key): - raise NotImplementedError - - def _get_deserialization_method(self, serialization_type): - if serialization_type == None: - return lambda x: x - - elif serialization_type == "json": - return json.loads - - elif serialization_type == "pandas_csv": - # TODO: - raise NotImplementedError - - # TODO: Add more serialization methods as needed - -class BasicInMemoryStore(ReadWriteStore): - """Like a dict, but much harder to write. 
- - This class uses an InMemoryStoreBackend, but I question whether it's worth it. - It would be easier just to wrap a dict. - - This class is used for testing and not much else. - """ - - def __init__(self, serialization_type=None, root_directory=None): - self.serialization_type = serialization_type - self.root_directory = root_directory - - self.store_backend = instantiate_class_from_config( - config={ - "module_name" : "great_expectations.data_context.store", - "class_name" : "InMemoryStoreBackend", - "separator" : ".", - }, - runtime_config={ - "root_directory": root_directory, - }, - config_defaults={}, - ) - - def _validate_key(self, key): - assert isinstance(key, string_types) - - def _get(self, key): - return self.store_backend.get((key,)) - - def _set(self, key, value): - self.store_backend.set((key,), value) + return [self.tuple_to_key(key) for key in self._store_backend.list_keys()] def has_key(self, key): - return self.store_backend.has_key((key,)) - - def list_keys(self): - return [key for key, in self.store_backend.list_keys()] + return self._store_backend.has_key(key.to_tuple()) diff --git a/great_expectations/data_context/store/store_backend.py b/great_expectations/data_context/store/store_backend.py index 5e242f7644fc..7192a040f5b8 100644 --- a/great_expectations/data_context/store/store_backend.py +++ b/great_expectations/data_context/store/store_backend.py @@ -1,47 +1,27 @@ -import random +# PYTHON 2 - py2 - update to ABC direct use rather than __metaclass__ once we drop py2 support +from abc import ABCMeta, abstractmethod -from ..types import ( - DataAssetIdentifier, - ValidationResultIdentifier, -) -from ..types.base_resource_identifiers import ( - DataContextKey, -) -from ..util import safe_mmkdir -import pandas as pd -import six -import io -import os -import json -import logging -logger = logging.getLogger(__name__) -import importlib -import re from six import string_types -from ..util import ( - parse_string_to_data_context_resource_identifier -) -from ...types import ( - ListOf, - AllowedKeysDotDict, -) - -# TODO : Add docstrings to these classes. -# TODO : Implement S3StoreBackend with mocks and tests - -# NOTE : Abe 2019/08/30 : Currently, these classes behave as key-value stores. -# We almost certainly want to extend that functionality to allow other operations class StoreBackend(object): - """a key-value store, to abstract away reading and writing to a persistence layer + __metaclass__ = ABCMeta + """A store backend acts as a key-value store that can accept tuples as keys, to abstract away + reading and writing to a persistence layer. 
+ + In general a StoreBackend implementation must provide implementations of: + - _get + - _set + - list_keys + - _has_key """ - def __init__( - self, - root_directory=None, # NOTE: Eugene: 2019-09-06: I think this should be moved into filesystem-specific children classes - ): - self.root_directory = root_directory + def __init__(self, fixed_length_key=False): + self._fixed_length_key = fixed_length_key + + @property + def fixed_length_key(self): + return self._fixed_length_key def get(self, key): self._validate_key(key) @@ -58,32 +38,41 @@ def has_key(self, key): self._validate_key(key) return self._has_key(key) + def get_url_for_key(self, key, protocol=None): + raise NotImplementedError( + "Store backend of type {0:s} does not have an implementation of get_url_for_key".format( + type(self).__name__)) + def _validate_key(self, key): - if not isinstance(key, tuple): + if isinstance(key, tuple): + for key_element in key: + if not isinstance(key_element, string_types): + raise TypeError( + "Elements within tuples passed as keys to {0} must be instances of {1}, not {2}".format( + self.__class__.__name__, + string_types, + type(key_element), + )) + else: raise TypeError("Keys in {0} must be instances of {1}, not {2}".format( self.__class__.__name__, tuple, type(key), )) - - for key_element in key: - if not isinstance(key_element, string_types): - raise TypeError("Elements within tuples passed as keys to {0} must be instances of {1}, not {2}".format( - self.__class__.__name__, - string_types, - type(key_element), - )) def _validate_value(self, value): pass + @abstractmethod def _get(self, key): raise NotImplementedError + @abstractmethod def _set(self, key, value, **kwargs): raise NotImplementedError - def list_keys(self): + @abstractmethod + def list_keys(self, prefix=()): raise NotImplementedError def _has_key(self, key): @@ -92,433 +81,21 @@ def _has_key(self, key): class InMemoryStoreBackend(StoreBackend): """Uses an in-memory dictionary as a store backend. - - Note: currently, this class turns the whole key into a single key_string. - This works, but it's blunt. 
""" - def __init__( - self, - separator=".", - root_directory=None - ): - self.store = {} - self.separator = separator + # noinspection PyUnusedLocal + def __init__(self, runtime_environment=None, fixed_length_key=False): + super(InMemoryStoreBackend, self).__init__(fixed_length_key=fixed_length_key) + self._store = {} def _get(self, key): - return self.store[self._convert_tuple_to_string(key)] + return self._store[key] def _set(self, key, value, **kwargs): - self.store[self._convert_tuple_to_string(key)] = value + self._store[key] = value - def _validate_key(self, key): - super(InMemoryStoreBackend, self)._validate_key(key) - - if self.separator in key: - raise ValueError("Keys in {0} must not contain the separator character {1} : {2}".format( - self.__class__.__name__, - self.separator, - key, - )) - - def _convert_tuple_to_string(self, tuple_): - return self.separator.join(tuple_) - - def _convert_string_to_tuple(self, string): - return tuple(string.split(self.separator)) - - def list_keys(self): - return [self._convert_string_to_tuple(key_str) for key_str in list(self.store.keys())] + def list_keys(self, prefix=()): + return [key for key in self._store.keys() if key[:len(prefix)] == prefix] def _has_key(self, key): - return self._convert_tuple_to_string(key) in self.store - - -class FixedLengthTupleStoreBackend(StoreBackend): - """ - If key_length is provided, the key to this StoreBackend abstract class must be a tuple with fixed length equal - to key_length. The filepath_template is a string template used to convert the key to a filepath. - There's a bit of regex magic in _convert_filepath_to_key that reverses this process, - so that we can write AND read using filenames as keys. - - Another class should get this logic through multiple inheritance. - """ - - def __init__( - self, - # base_directory, - filepath_template, - key_length, - root_directory, - forbidden_substrings=None, - platform_specific_separator=True - ): - assert isinstance(key_length, int) or key_length is None - self.key_length = key_length - if forbidden_substrings is None: - forbidden_substrings = ["/", "\\"] - self.forbidden_substrings = forbidden_substrings - self.platform_specific_separator = platform_specific_separator - - self.filepath_template = filepath_template - if key_length: - self.verify_that_key_to_filepath_operation_is_reversible() - - def _validate_key(self, key): - super(FixedLengthTupleStoreBackend, self)._validate_key(key) - - for key_element in key: - for substring in self.forbidden_substrings: - if substring in key_element: - raise ValueError("Keys in {0} must not contain substrings in {1} : {2}".format( - self.__class__.__name__, - self.forbidden_substrings, - key, - )) - - def _validate_value(self, value): - if not isinstance(value, string_types) and not isinstance(value, bytes): - raise TypeError("Values in {0} must be instances of {1} or {2}, not {3}".format( - self.__class__.__name__, - string_types, - bytes, - type(value), - )) - - def _convert_key_to_filepath(self, key): - # NOTE: At some point in the future, it might be better to replace this logic with os.path.join. - # That seems more correct, but the configs will be a lot less intuitive. - # In the meantime, there is some chance that configs will not be cross-OS compatible. 
-
-        # NOTE : These methods support variable-length keys if no filepath template is provided, in which case
-        # all key elements are joined to generate the filepath
-        self._validate_key(key)
-        if self.filepath_template:
-            converted_string = self.filepath_template.format(*list(key))
-        else:
-            converted_string = '/'.join(key)
-        if self.platform_specific_separator:
-            converted_string = os.path.join(*converted_string.split('/'))
-        return converted_string
-
-    def _convert_filepath_to_key(self, filepath):
-        # filepath_template (for now) is always specified with forward slashes, but it is then
-        # used to (1) dynamically construct and evaluate a regex, and (2) split the provided (observed) filepath
-        if not self.filepath_template:
-            return tuple(filepath.split(os.sep))
-
-        if self.platform_specific_separator:
-            filepath_template = os.path.join(*self.filepath_template.split('/'))
-            filepath_template = filepath_template.replace('\\', '\\\\')
-        else:
-            filepath_template = self.filepath_template
-
-        # Convert the template to a regex
-        indexed_string_substitutions = re.findall(r"{\d+}", filepath_template)
-        tuple_index_list = ["(?P<tuple_index_{0}>.*)".format(i, ) for i in range(len(indexed_string_substitutions))]
-        intermediate_filepath_regex = re.sub(
-            r"{\d+}",
-            lambda m, r=iter(tuple_index_list): next(r),
-            filepath_template
-        )
-        filepath_regex = intermediate_filepath_regex.format(*tuple_index_list)
-
-        # Apply the regex to the filepath
-        matches = re.compile(filepath_regex).match(filepath)
-        if matches is None:
-            return None
-
-        # Map key elements into the appropriate parts of the tuple
-        new_key = list([None for element in range(self.key_length)])
-        for i in range(len(tuple_index_list)):
-            tuple_index = int(re.search('\d+', indexed_string_substitutions[i]).group(0))
-            key_element = matches.group('tuple_index_' + str(i))
-            new_key[tuple_index] = key_element
-
-        new_key = tuple(new_key)
-        return new_key
-
-    def verify_that_key_to_filepath_operation_is_reversible(self):
-        def get_random_hex(len=4):
-            return "".join([random.choice(list("ABCDEF0123456789")) for i in range(len)])
-
-        key = tuple([get_random_hex() for j in range(self.key_length)])
-        filepath = self._convert_key_to_filepath(key)
-        new_key = self._convert_filepath_to_key(filepath)
-        if key != new_key:
-            raise ValueError(
-                "filepath template {0} for class {1} is not reversible for a tuple of length {2}. Have you included all elements in the key tuple?".format(
-                    self.filepath_template,
-                    self.__class__.__name__,
-                    self.key_length,
-                ))
-
-
-class FixedLengthTupleFilesystemStoreBackend(FixedLengthTupleStoreBackend):
-    """Uses a local filepath as a store.
-
-    The key to this StoreBackend must be a tuple with fixed length equal to key_length.
-    The filepath_template is a string template used to convert the key to a filepath.
-    There's a bit of regex magic in _convert_filepath_to_key that reverses this process,
-    so that we can write AND read using filenames as keys.
-    """
-
-    def __init__(
-        self,
-        base_directory,
-        filepath_template,
-        key_length,
-        root_directory,
-        forbidden_substrings=None,
-        platform_specific_separator=True
-    ):
-        super(FixedLengthTupleFilesystemStoreBackend, self).__init__(
-            root_directory=root_directory,
-            filepath_template=filepath_template,
-            key_length=key_length,
-            forbidden_substrings=forbidden_substrings,
-            platform_specific_separator=platform_specific_separator
-        )
-
-        self.base_directory = base_directory
-
-        if not os.path.isabs(root_directory):
-            raise ValueError("root_directory must be an absolute path.
Got {0} instead.".format(root_directory)) - - self.root_directory = root_directory - - self.full_base_directory = os.path.join( - self.root_directory, - self.base_directory, - ) - - safe_mmkdir(str(os.path.dirname(self.full_base_directory))) - - def _get(self, key): - filepath = os.path.join( - self.full_base_directory, - self._convert_key_to_filepath(key) - ) - with open(filepath, 'r') as infile: - return infile.read() - - def _set(self, key, value, **kwargs): - filepath = os.path.join( - self.full_base_directory, - self._convert_key_to_filepath(key) - ) - path, filename = os.path.split(filepath) - - safe_mmkdir(str(path)) - with open(filepath, "wb") as outfile: - if isinstance(value, bytes): - outfile.write(value) - else: - outfile.write(value.encode("utf-8")) - return filepath - - def list_keys(self): - key_list = [] - for root, dirs, files in os.walk(self.full_base_directory): - for file_ in files: - full_path, file_name = os.path.split(os.path.join(root, file_)) - relative_path = os.path.relpath( - full_path, - self.full_base_directory, - ) - if relative_path == ".": - filepath = file_name - else: - filepath = os.path.join( - relative_path, - file_name - ) - - key = self._convert_filepath_to_key(filepath) - if key: - key_list.append(key) - - return key_list - - def has_key(self, key): - assert isinstance(key, string_types) - - all_keys = self.list_keys() - return key in all_keys - - -class FixedLengthTupleS3StoreBackend(FixedLengthTupleStoreBackend): - """ - Uses an S3 bucket as a store. - - The key to this StoreBackend must be a tuple with fixed length equal to key_length. - The filepath_template is a string template used to convert the key to a filepath. - There's a bit of regex magic in _convert_filepath_to_key that reverses this process, - so that we can write AND read using filenames as keys. 
- """ - def __init__( - self, - root_directory, - filepath_template, - key_length, - bucket, - prefix="", - boto3_options=None, - forbidden_substrings=None, - platform_specific_separator=False - ): - super(FixedLengthTupleS3StoreBackend, self).__init__( - root_directory=root_directory, - filepath_template=filepath_template, - key_length=key_length, - forbidden_substrings=forbidden_substrings, - platform_specific_separator=platform_specific_separator - ) - self.bucket = bucket - self.prefix = prefix - if boto3_options is None: - boto3_options = {} - self._boto3_options = boto3_options - - def _get(self, key): - s3_object_key = os.path.join( - self.prefix, - self._convert_key_to_filepath(key) - ) - - import boto3 - s3 = boto3.client('s3', **self._boto3_options) - s3_response_object = s3.get_object(Bucket=self.bucket, Key=s3_object_key) - return s3_response_object['Body'].read().decode(s3_response_object.get("ContentEncoding", 'utf-8')) - - def _set(self, key, value, content_encoding='utf-8', content_type='application/json', **kwargs): - s3_object_key = os.path.join( - self.prefix, - self._convert_key_to_filepath(key) - ) - - import boto3 - s3 = boto3.resource('s3', **self._boto3_options) - result_s3 = s3.Object(self.bucket, s3_object_key) - if isinstance(value, string_types): - result_s3.put(Body=value.encode(content_encoding), ContentEncoding=content_encoding, - ContentType=content_type) - else: - result_s3.put(Body=value, ContentType=content_type) - return s3_object_key - - def list_keys(self): - key_list = [] - - import boto3 - s3 = boto3.client('s3', **self._boto3_options) - - for s3_object_info in s3.list_objects(Bucket=self.bucket, Prefix=self.prefix)['Contents']: - s3_object_key = s3_object_info['Key'] - s3_object_key = os.path.relpath( - s3_object_key, - self.prefix, - ) - - key = self._convert_filepath_to_key(s3_object_key) - if key: - key_list.append(key) - - return key_list - - def has_key(self, key): - assert isinstance(key, string_types) - - all_keys = self.list_keys() - return key in all_keys - - -class FixedLengthTupleGCSStoreBackend(FixedLengthTupleStoreBackend): - """ - Uses a GCS bucket as a store. - - The key to this StoreBackend must be a tuple with fixed length equal to key_length. - The filepath_template is a string template used to convert the key to a filepath. - There's a bit of regex magic in _convert_filepath_to_key that reverses this process, - so that we can write AND read using filenames as keys. 
- """ - def __init__( - self, - root_directory, - filepath_template, - key_length, - bucket, - prefix, - project, - forbidden_substrings=None, - platform_specific_separator=False - ): - super(FixedLengthTupleGCSStoreBackend, self).__init__( - root_directory=root_directory, - filepath_template=filepath_template, - key_length=key_length, - forbidden_substrings=forbidden_substrings, - platform_specific_separator=platform_specific_separator - ) - self.bucket = bucket - self.prefix = prefix - self.project = project - - - def _get(self, key): - gcs_object_key = os.path.join( - self.prefix, - self._convert_key_to_filepath(key) - ) - - from google.cloud import storage - gcs = storage.Client(project=self.project) - bucket = gcs.get_bucket(self.bucket) - gcs_response_object = bucket.get_blob(gcs_object_key) - return gcs_response_object.download_as_string().decode("utf-8") - - def _set(self, key, value, content_encoding='utf-8', content_type='application/json', **kwargs): - gcs_object_key = os.path.join( - self.prefix, - self._convert_key_to_filepath(key) - ) - - from google.cloud import storage - gcs = storage.Client(project=self.project) - bucket = gcs.get_bucket(self.bucket) - blob = bucket.blob(gcs_object_key) - if isinstance(value, string_types): - # Following try/except is to support py2, since both str and bytes objects pass above condition - try: - blob.upload_from_string(value.encode(content_encoding), content_encoding=content_encoding, - content_type=content_type) - except TypeError: - blob.upload_from_string(value, content_type=content_type) - else: - blob.upload_from_string(value, content_type=content_type) - return gcs_object_key - - def list_keys(self): - key_list = [] - - from google.cloud import storage - gcs = storage.Client(self.project) - - for blob in gcs.list_blobs(self.bucket, prefix=self.prefix): - gcs_object_name = blob.name - gcs_object_key = os.path.relpath( - gcs_object_name, - self.prefix, - ) - - key = self._convert_filepath_to_key(gcs_object_key) - if key: - key_list.append(key) - - return key_list - - def has_key(self, key): - assert isinstance(key, string_types) - - all_keys = self.list_keys() - return key in all_keys + return key in self._store diff --git a/great_expectations/data_context/store/tuple_store_backend.py b/great_expectations/data_context/store/tuple_store_backend.py new file mode 100644 index 000000000000..2852f27482c6 --- /dev/null +++ b/great_expectations/data_context/store/tuple_store_backend.py @@ -0,0 +1,466 @@ +import os +import random +import re +import logging +# PYTHON 2 - py2 - update to ABC direct use rather than __metaclass__ once we drop py2 support +from abc import ABCMeta + +from six import string_types + +from great_expectations.data_context.store.store_backend import StoreBackend +from great_expectations.data_context.util import safe_mmkdir + +logger = logging.getLogger(__name__) + + +class TupleStoreBackend(StoreBackend): + __metaclass__ = ABCMeta + """ + If filepath_template is provided, the key to this StoreBackend abstract class must be a tuple with + fixed length equal to the number of unique components matching the regex r"{\d+}" + + For example, in the following template path: expectations/{0}/{1}/{2}/prefix-{2}.json, keys must have + three components. 
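+
+    Concretely, with that template the (illustrative) key ("my_db", "default", "my_table")
+    converts to the filepath "expectations/my_db/default/my_table/prefix-my_table.json",
+    and _convert_filepath_to_key reverses the mapping.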
+ """ + + def __init__(self, filepath_template=None, filepath_prefix=None, filepath_suffix=None, forbidden_substrings=None, + platform_specific_separator=True, fixed_length_key=False): + super(TupleStoreBackend, self).__init__(fixed_length_key=fixed_length_key) + if forbidden_substrings is None: + forbidden_substrings = ["/", "\\"] + self.forbidden_substrings = forbidden_substrings + self.platform_specific_separator = platform_specific_separator + + if filepath_template is not None and filepath_suffix is not None: + raise ValueError("filepath_suffix may only be used when filepath_template is None") + + self.filepath_template = filepath_template + self.filepath_prefix = filepath_prefix + self.filepath_suffix = filepath_suffix + + if filepath_template is not None: + # key length is the number of unique values to be substituted in the filepath_template + self.key_length = len( + set( + re.findall(r"{\d+}", filepath_template) + ) + ) + + self.verify_that_key_to_filepath_operation_is_reversible() + self._fixed_length_key = True + + def _validate_key(self, key): + super(TupleStoreBackend, self)._validate_key(key) + + for key_element in key: + for substring in self.forbidden_substrings: + if substring in key_element: + raise ValueError("Keys in {0} must not contain substrings in {1} : {2}".format( + self.__class__.__name__, + self.forbidden_substrings, + key, + )) + + def _validate_value(self, value): + if not isinstance(value, string_types) and not isinstance(value, bytes): + raise TypeError("Values in {0} must be instances of {1} or {2}, not {3}".format( + self.__class__.__name__, + string_types, + bytes, + type(value), + )) + + def _convert_key_to_filepath(self, key): + self._validate_key(key) + if self.filepath_template: + converted_string = self.filepath_template.format(*list(key)) + else: + converted_string = '/'.join(key) + + if self.filepath_prefix: + converted_string = self.filepath_prefix + converted_string + if self.filepath_suffix: + converted_string += self.filepath_suffix + if self.platform_specific_separator: + converted_string = os.path.join(*converted_string.split('/')) + + return converted_string + + def _convert_filepath_to_key(self, filepath): + # filepath_template (for now) is always specified with forward slashes, but it is then + # used to (1) dynamically construct and evaluate a regex, and (2) split the provided (observed) filepath + if self.filepath_prefix: + if not filepath.startswith(self.filepath_prefix): + # If filepath_prefix is set, we expect that it is the first component of a valid filepath. + raise ValueError("filepath must start with the filepath_prefix when one is set by the store_backend") + else: + # Remove the prefix before processing + filepath = filepath[len(self.filepath_prefix):] + + if self.filepath_suffix: + if not filepath.endswith(self.filepath_suffix): + # If filepath_suffix is set, we expect that it is the last component of a valid filepath. 
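+                # For example, a backend configured with filepath_suffix=".json" would reject "foo/bar.txt" here.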
+                raise ValueError("filepath must end with the filepath_suffix when one is set by the store_backend")
+            else:
+                # Remove the suffix before processing
+                filepath = filepath[:-len(self.filepath_suffix)]
+
+        if self.filepath_template:
+            if self.platform_specific_separator:
+                filepath_template = os.path.join(*self.filepath_template.split('/'))
+                filepath_template = filepath_template.replace('\\', '\\\\')
+            else:
+                filepath_template = self.filepath_template
+
+            # Convert the template to a regex
+            indexed_string_substitutions = re.findall(r"{\d+}", filepath_template)
+            tuple_index_list = ["(?P<tuple_index_{0}>.*)".format(i, ) for i in range(len(indexed_string_substitutions))]
+            intermediate_filepath_regex = re.sub(
+                r"{\d+}",
+                lambda m, r=iter(tuple_index_list): next(r),
+                filepath_template
+            )
+            filepath_regex = intermediate_filepath_regex.format(*tuple_index_list)
+
+            # Apply the regex to the filepath
+            matches = re.compile(filepath_regex).match(filepath)
+            if matches is None:
+                return None
+
+            # Map key elements into the appropriate parts of the tuple
+            new_key = [None] * self.key_length
+            for i in range(len(tuple_index_list)):
+                tuple_index = int(re.search(r'\d+', indexed_string_substitutions[i]).group(0))
+                key_element = matches.group('tuple_index_' + str(i))
+                new_key[tuple_index] = key_element
+
+            new_key = tuple(new_key)
+        else:
+            new_key = tuple(filepath.split(os.sep))
+
+        return new_key
+
+    def verify_that_key_to_filepath_operation_is_reversible(self):
+        def get_random_hex(size=4):
+            return "".join([random.choice(list("ABCDEF0123456789")) for _ in range(size)])
+
+        key = tuple([get_random_hex() for _ in range(self.key_length)])
+        filepath = self._convert_key_to_filepath(key)
+        new_key = self._convert_filepath_to_key(filepath)
+        if key != new_key:
+            raise ValueError(
+                "filepath template {0} for class {1} is not reversible for a tuple of length {2}. "
+                "Have you included all elements in the key tuple?".format(
+                    self.filepath_template,
+                    self.__class__.__name__,
+                    self.key_length,
+                ))
+
+
+class TupleFilesystemStoreBackend(TupleStoreBackend):
+    """Uses a local filepath as a store.
+
+    The key to this StoreBackend must be a tuple with fixed length based on the filepath_template,
+    or a variable-length tuple may be used and returned with an optional filepath_suffix (to be) added.
+    The filepath_template is a string template used to convert the key to a filepath.
+    """
+
+    def __init__(self,
+                 base_directory,
+                 filepath_template=None,
+                 filepath_prefix=None,
+                 filepath_suffix=None,
+                 forbidden_substrings=None,
+                 platform_specific_separator=True,
+                 root_directory=None,
+                 fixed_length_key=False):
+        super(TupleFilesystemStoreBackend, self).__init__(
+            filepath_template=filepath_template,
+            filepath_prefix=filepath_prefix,
+            filepath_suffix=filepath_suffix,
+            forbidden_substrings=forbidden_substrings,
+            platform_specific_separator=platform_specific_separator,
+            fixed_length_key=fixed_length_key
+        )
+        if os.path.isabs(base_directory):
+            self.full_base_directory = base_directory
+        else:
+            if root_directory is None:
+                raise ValueError("base_directory must be an absolute path if root_directory is not provided")
+            elif not os.path.isabs(root_directory):
+                raise ValueError("root_directory must be an absolute path.
Got {0} instead.".format(root_directory)) + else: + self.full_base_directory = os.path.join(root_directory, base_directory) + + safe_mmkdir(str(os.path.dirname(self.full_base_directory))) + + def _get(self, key): + filepath = os.path.join( + self.full_base_directory, + self._convert_key_to_filepath(key) + ) + with open(filepath, 'r') as infile: + return infile.read() + + def _set(self, key, value, **kwargs): + if not isinstance(key, tuple): + key = key.to_tuple() + filepath = os.path.join( + self.full_base_directory, + self._convert_key_to_filepath(key) + ) + path, filename = os.path.split(filepath) + + safe_mmkdir(str(path)) + with open(filepath, "wb") as outfile: + if isinstance(value, string_types): + # Following try/except is to support py2, since both str and bytes objects pass above condition + try: + outfile.write(value.encode("utf-8")) + except UnicodeDecodeError: + outfile.write(value) + else: + outfile.write(value) + return filepath + + def list_keys(self, prefix=()): + key_list = [] + for root, dirs, files in os.walk(os.path.join(self.full_base_directory, *prefix)): + for file_ in files: + full_path, file_name = os.path.split(os.path.join(root, file_)) + relative_path = os.path.relpath( + full_path, + self.full_base_directory, + ) + if relative_path == ".": + filepath = file_name + else: + filepath = os.path.join( + relative_path, + file_name + ) + + if self.filepath_prefix and not filepath.startswith(self.filepath_prefix): + continue + elif self.filepath_suffix and not filepath.endswith(self.filepath_suffix): + continue + else: + key = self._convert_filepath_to_key(filepath) + if key: + key_list.append(key) + + return key_list + + def get_url_for_key(self, key, protocol=None): + path = self._convert_key_to_filepath(key) + full_path = os.path.join(self.full_base_directory, path) + if protocol is None: + protocol = "file:" + url = protocol + "//" + full_path + + return url + + def _has_key(self, key): + return os.path.isfile(os.path.join(self.full_base_directory, self._convert_key_to_filepath(key))) + + +class TupleS3StoreBackend(TupleStoreBackend): + """ + Uses an S3 bucket as a store. + + The key to this StoreBackend must be a tuple with fixed length based on the filepath_template, + or a variable-length tuple may be used and returned with an optional filepath_suffix (to be) added. + The filepath_template is a string template used to convert the key to a filepath. 
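+
+    A usage sketch (bucket, prefix, and key are illustrative; boto3 must be
+    installed and AWS credentials available in the environment)::
+
+        backend = TupleS3StoreBackend(bucket="my-bucket", prefix="validations")
+        backend.set(("my_suite", "20190926T120000", "batch_id"), '{"success": true}')
+        # the object is written to s3://my-bucket/validations/my_suite/20190926T120000/batch_id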
+ """ + + def __init__( + self, + bucket, + prefix="", + filepath_template=None, + filepath_prefix=None, + filepath_suffix=None, + forbidden_substrings=None, + platform_specific_separator=False, + fixed_length_key=False + ): + super(TupleS3StoreBackend, self).__init__( + filepath_template=filepath_template, + filepath_prefix=filepath_prefix, + filepath_suffix=filepath_suffix, + forbidden_substrings=forbidden_substrings, + platform_specific_separator=platform_specific_separator, + fixed_length_key=fixed_length_key + ) + self.bucket = bucket + self.prefix = prefix + + def _get(self, key): + s3_object_key = os.path.join( + self.prefix, + self._convert_key_to_filepath(key) + ) + + import boto3 + s3 = boto3.client('s3') + s3_response_object = s3.get_object(Bucket=self.bucket, Key=s3_object_key) + return s3_response_object['Body'].read().decode(s3_response_object.get("ContentEncoding", 'utf-8')) + + def _set(self, key, value, content_encoding='utf-8', content_type='application/json'): + s3_object_key = os.path.join( + self.prefix, + self._convert_key_to_filepath(key) + ) + + import boto3 + s3 = boto3.resource('s3') + result_s3 = s3.Object(self.bucket, s3_object_key) + if isinstance(value, string_types): + # Following try/except is to support py2, since both str and bytes objects pass above condition + try: + result_s3.put(Body=value.encode(content_encoding), ContentEncoding=content_encoding, + ContentType=content_type) + except TypeError: + result_s3.put(Body=value, ContentType=content_type) + else: + result_s3.put(Body=value, ContentType=content_type) + return s3_object_key + + def list_keys(self): + key_list = [] + + import boto3 + s3 = boto3.client('s3') + + s3_objects = s3.list_objects(Bucket=self.bucket, Prefix=self.prefix) + if "Contents" in s3_objects: + objects = s3_objects["Contents"] + elif "CommonPrefixes" in s3_objects: + logger.warning("TupleS3StoreBackend returned CommonPrefixes, but delimiter should not have been set.") + objects = [] + else: + # No objects found in store + objects = [] + + for s3_object_info in objects: + s3_object_key = s3_object_info['Key'] + s3_object_key = os.path.relpath( + s3_object_key, + self.prefix, + ) + if self.filepath_prefix and not s3_object_key.startswith(self.filepath_prefix): + # There can be other keys located in the same bucket; they are *not* our keys + continue + + key = self._convert_filepath_to_key(s3_object_key) + if key: + key_list.append(key) + + return key_list + + def get_url_for_key(self, key, protocol=None): + import boto3 + + location = boto3.client('s3').get_bucket_location(Bucket=self.bucket)['LocationConstraint'] + if location is None: + location = "s3" + else: + location = "s3-" + location + s3_key = self._convert_key_to_filepath(key) + return "https://%s.amazonaws.com/%s/%s/%s" % (location, self.bucket, self.prefix, s3_key) + + def _has_key(self, key): + all_keys = self.list_keys() + return key in all_keys + + +class TupleGCSStoreBackend(TupleStoreBackend): + """ + Uses a GCS bucket as a store. + + The key to this StoreBackend must be a tuple with fixed length based on the filepath_template, + or a variable-length tuple may be used and returned with an optional filepath_suffix (to be) added. + + The filepath_template is a string template used to convert the key to a filepath. 
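+
+    A configuration sketch (project, bucket, and prefix are illustrative; the
+    google-cloud-storage client library must be installed)::
+
+        backend = TupleGCSStoreBackend(
+            bucket="my-bucket",
+            prefix="ge/validations",
+            project="my-project",
+            filepath_suffix=".json",
+        )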
+ """ + + def __init__( + self, + bucket, + prefix, + project, + filepath_template=None, + filepath_prefix=None, + filepath_suffix=None, + forbidden_substrings=None, + platform_specific_separator=False, + fixed_length_key=False + ): + super(TupleGCSStoreBackend, self).__init__( + filepath_template=filepath_template, + filepath_prefix=filepath_prefix, + filepath_suffix=filepath_suffix, + forbidden_substrings=forbidden_substrings, + platform_specific_separator=platform_specific_separator, + fixed_length_key=fixed_length_key + ) + self.bucket = bucket + self.prefix = prefix + self.project = project + + def _get(self, key): + gcs_object_key = os.path.join( + self.prefix, + self._convert_key_to_filepath(key) + ) + + from google.cloud import storage + gcs = storage.Client(project=self.project) + bucket = gcs.get_bucket(self.bucket) + gcs_response_object = bucket.get_blob(gcs_object_key) + return gcs_response_object.download_as_string().decode("utf-8") + + def _set(self, key, value, content_encoding='utf-8', content_type='application/json'): + gcs_object_key = os.path.join( + self.prefix, + self._convert_key_to_filepath(key) + ) + + from google.cloud import storage + gcs = storage.Client(project=self.project) + bucket = gcs.get_bucket(self.bucket) + blob = bucket.blob(gcs_object_key) + if isinstance(value, string_types): + # Following try/except is to support py2, since both str and bytes objects pass above condition + try: + blob.upload_from_string(value.encode(content_encoding), content_encoding=content_encoding, + content_type=content_type) + except TypeError: + blob.upload_from_string(value, content_type=content_type) + else: + blob.upload_from_string(value, content_type=content_type) + return gcs_object_key + + def list_keys(self): + key_list = [] + + from google.cloud import storage + gcs = storage.Client(self.project) + + for blob in gcs.list_blobs(self.bucket, prefix=self.prefix): + gcs_object_name = blob.name + gcs_object_key = os.path.relpath( + gcs_object_name, + self.prefix, + ) + + key = self._convert_filepath_to_key(gcs_object_key) + if key: + key_list.append(key) + + return key_list + + def _has_key(self, key): + all_keys = self.list_keys() + return key in all_keys + diff --git a/great_expectations/data_context/store/validations_store.py b/great_expectations/data_context/store/validations_store.py new file mode 100644 index 000000000000..4655720d922a --- /dev/null +++ b/great_expectations/data_context/store/validations_store.py @@ -0,0 +1,39 @@ +from great_expectations.core import ExpectationSuiteValidationResultSchema +from great_expectations.data_context.store.database_store_backend import DatabaseStoreBackend +from great_expectations.data_context.store.store import Store +from great_expectations.data_context.store.tuple_store_backend import TupleStoreBackend +from great_expectations.data_context.types.resource_identifiers import ValidationResultIdentifier +from great_expectations.data_context.util import load_class + + +class ValidationsStore(Store): + _key_class = ValidationResultIdentifier + + def __init__(self, store_backend=None, runtime_environment=None): + self._expectationSuiteValidationResultSchema = ExpectationSuiteValidationResultSchema(strict=True) + + if store_backend is not None: + store_backend_module_name = store_backend.get("module_name", "great_expectations.data_context.store") + store_backend_class_name = store_backend.get("class_name", "InMemoryStoreBackend") + store_backend_class = load_class(store_backend_class_name, store_backend_module_name) + + if 
issubclass(store_backend_class, TupleStoreBackend): + # Provide defaults for this common case + store_backend["filepath_suffix"] = store_backend.get("filepath_suffix", ".json") + elif issubclass(store_backend_class, DatabaseStoreBackend): + # Provide defaults for this common case + store_backend["table_name"] = store_backend.get("table_name", "ge_validations_store") + store_backend["key_columns"] = store_backend.get( + "key_columns", [ + "expectation_suite_name", + "run_id", + "batch_identifier" + ] + ) + super(ValidationsStore, self).__init__(store_backend=store_backend, runtime_environment=runtime_environment) + + def serialize(self, key, value): + return self._expectationSuiteValidationResultSchema.dumps(value).data + + def deserialize(self, key, value): + return self._expectationSuiteValidationResultSchema.loads(value).data diff --git a/great_expectations/data_context/templates.py b/great_expectations/data_context/templates.py index d8414ed1a7a7..493c507af4a5 100644 --- a/great_expectations/data_context/templates.py +++ b/great_expectations/data_context/templates.py @@ -1,23 +1,18 @@ # -*- coding: utf-8 -*- -from great_expectations import rtd_url_ge_version PROJECT_HELP_COMMENT = """ # Welcome to Great Expectations! Always know what to expect from your data. # -# Here you can define datasources, generators, integrations and more. This file -# is intended to be committed to your repo. For help with configuration please: +# Here you can define datasources, batch kwarg generators, integrations and +# more. This file is intended to be committed to your repo. For help with +# configuration please: # - Read our docs: https://docs.greatexpectations.io/en/latest/reference/data_context_reference.html#configuration # - Join our slack channel: http://greatexpectations.io/slack -# -# NOTE: GE uses the names of configured `datasources` and `generators` to manage -# how `expectations` and other artifacts are stored in the `expectations/` and -# `datasources/` folders. If you need to rename an existing `datasource` or -# `generator`, be sure to also update the relevant directory names. config_version: 1 # Datasources tell Great Expectations where your data lives and how to get it. -# You can use the CLI command `great_expectations add-datasource` to help you +# You can use the CLI command `great_expectations datasource new` to help you # add a new datasource. Read more at https://docs.greatexpectations.io/en/latest/features/datasource.html datasources: {} """ @@ -28,7 +23,7 @@ # such as staging vs prod. # # When GE encounters substitution syntax (like `my_key: ${my_value}` or -# `my_key: $my_value`) in the config file it will attempt to replace the value +# `my_key: $my_value`) in the config file, it will attempt to replace the value # of `my_key` with the value from an environment variable `my_value` or a # corresponding key read from the file specified using # `config_variables_file_path`. Environment variables take precedence. 
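For example (names and values here are illustrative), a datasource credentials entry in great_expectations.yml can defer a secret to an environment variable or to the config variables file:

    my_postgres_db:
      credentials:
        username: ge_user
        password: ${POSTGRES_PASSWORD}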
@@ -60,10 +55,10 @@ action_list: - name: store_validation_result action: - class_name: StoreAction + class_name: StoreValidationResultAction - name: store_evaluation_params action: - class_name: ExtractAndStoreEvaluationParamsAction + class_name: StoreEvaluationParametersAction - name: update_data_docs action: class_name: UpdateDataDocsAction @@ -88,19 +83,19 @@ expectations_store: class_name: ExpectationsStore store_backend: - class_name: FixedLengthTupleFilesystemStoreBackend + class_name: TupleFilesystemStoreBackend base_directory: expectations/ validations_store: class_name: ValidationsStore store_backend: - class_name: FixedLengthTupleFilesystemStoreBackend + class_name: TupleFilesystemStoreBackend base_directory: uncommitted/validations/ evaluation_parameter_store: # Evaluation Parameters enable dynamic expectations. Read more here: # https://docs.greatexpectations.io/en/latest/reference/evaluation_parameters.html - class_name: InMemoryEvaluationParameterStore + class_name: EvaluationParameterStore expectations_store_name: expectations_store validations_store_name: validations_store @@ -114,7 +109,7 @@ local_site: class_name: SiteBuilder store_backend: - class_name: FixedLengthTupleFilesystemStoreBackend + class_name: TupleFilesystemStoreBackend base_directory: uncommitted/data_docs/local_site/ site_index_builder: class_name: DefaultSiteIndexBuilder diff --git a/great_expectations/data_context/types/__init__.py b/great_expectations/data_context/types/__init__.py index 7f37f86584eb..e69de29bb2d1 100644 --- a/great_expectations/data_context/types/__init__.py +++ b/great_expectations/data_context/types/__init__.py @@ -1,23 +0,0 @@ -from collections import namedtuple -from .base import ( - NormalizedDataAssetName, -) - -from .metrics import ( - Metric, - NamespaceAwareValidationMetric -) - -# from .configurations import ( -# DataContextConfig -# ) -from .base_resource_identifiers import ( - DataContextKey, - OrderedDataContextKey, -) -from .resource_identifiers import ( - DataAssetIdentifier, - ExpectationSuiteIdentifier, - ValidationResultIdentifier, - SiteSectionIdentifier, -) diff --git a/great_expectations/data_context/types/base.py b/great_expectations/data_context/types/base.py index 074ea18e954d..c5967fa2f379 100644 --- a/great_expectations/data_context/types/base.py +++ b/great_expectations/data_context/types/base.py @@ -1,8 +1,187 @@ -from collections import namedtuple - -# TODO: Deprecate this in favor of DataAssetIdentifier -NormalizedDataAssetName = namedtuple("NormalizedDataAssetName", [ - "datasource", - "generator", - "generator_asset" -]) \ No newline at end of file +from copy import deepcopy +import logging + +from marshmallow import Schema, fields, ValidationError, pre_dump, post_load, validates_schema +from ruamel.yaml import YAML +from ruamel.yaml.comments import CommentedMap + +import great_expectations.exceptions as ge_exceptions +from great_expectations.types import DictDot +from great_expectations.types.configurations import ClassConfigSchema + +logger = logging.getLogger(__name__) + +yaml = YAML() + +CURRENT_CONFIG_VERSION = 1 +MINIMUM_SUPPORTED_CONFIG_VERSION = 1 + + +class DataContextConfig(DictDot): + + def __init__( + self, + config_version, + datasources, + expectations_store_name, + validations_store_name, + evaluation_parameter_store_name, + plugins_directory, + validation_operators, + stores, + data_docs_sites, + config_variables_file_path=None, + commented_map=None + ): + if commented_map is None: + commented_map = CommentedMap() + self._commented_map = 
commented_map
+        self._config_version = config_version
+        self.datasources = datasources
+        self.expectations_store_name = expectations_store_name
+        self.validations_store_name = validations_store_name
+        self.evaluation_parameter_store_name = evaluation_parameter_store_name
+        self.plugins_directory = plugins_directory
+        if not isinstance(validation_operators, dict):
+            raise ValueError("validation_operators must be configured with a dictionary")
+        self.validation_operators = validation_operators
+        self.stores = stores
+        self.data_docs_sites = data_docs_sites
+        self.config_variables_file_path = config_variables_file_path
+
+    @property
+    def commented_map(self):
+        return self._commented_map
+
+    @classmethod
+    def from_commented_map(cls, commented_map):
+        try:
+            config = dataContextConfigSchema.load(commented_map).data
+            return cls(commented_map=commented_map, **config)
+        except ValidationError:
+            logger.error("Encountered errors during loading data context config. See ValidationError for more details.")
+            raise
+
+    def to_yaml(self, outfile):
+        commented_map = deepcopy(self.commented_map)
+        commented_map.update(dataContextConfigSchema.dump(self).data)
+        yaml.dump(commented_map, outfile)
+
+    def as_dict(self):
+        myself = {
+            "config_version": self._config_version,
+            "datasources": self.datasources,
+            "expectations_store_name": self.expectations_store_name,
+            "validations_store_name": self.validations_store_name,
+            "evaluation_parameter_store_name": self.evaluation_parameter_store_name,
+            "plugins_directory": self.plugins_directory,
+            "validation_operators": self.validation_operators,
+            "stores": self.stores,
+            "data_docs_sites": self.data_docs_sites,
+            "config_variables_file_path": self.config_variables_file_path,
+        }
+        if self.config_variables_file_path is None:
+            del myself['config_variables_file_path']
+        return myself
+
+
+class DatasourceConfig(DictDot):
+    def __init__(self, class_name, module_name=None, data_asset_type=None, generators=None):
+        self._class_name = class_name
+        self._module_name = module_name
+        self.data_asset_type = data_asset_type
+        self.generators = generators
+
+    @property
+    def class_name(self):
+        return self._class_name
+
+    @property
+    def module_name(self):
+        return self._module_name
+
+
+class DatasourceConfigSchema(Schema):
+    class_name = fields.Str()
+    # REMOVE WHEN SUPPORT FOR TYPE CONFIGURATION NO LONGER NEEDED
+    type = fields.Str(allow_none=True)
+    module_name = fields.Str(allow_none=True)
+    data_asset_type = fields.Nested(ClassConfigSchema)
+    # TODO: Update to generator-specific
+    # generators = fields.Mapping(keys=fields.Str(), values=fields.Nested(fields.GeneratorSchema))
+    generators = fields.Dict(keys=fields.Str(), values=fields.Dict())
+    credentials = fields.Raw(allow_none=True)
+
+    # noinspection PyUnusedLocal
+    @post_load
+    def make_datasource_config(self, data, **kwargs):
+        return DatasourceConfig(**data)
+
+
+class DataContextConfigSchema(Schema):
+    config_version = fields.Number(validate=lambda x: 0 < x < 100, error_messages={"invalid": "BLARG!"})
+    datasources = fields.Dict(keys=fields.Str(), values=fields.Nested(DatasourceConfigSchema))
+    expectations_store_name = fields.Str()
+    validations_store_name = fields.Str()
+    evaluation_parameter_store_name = fields.Str()
+    plugins_directory = fields.Str(allow_none=True)
+    validation_operators = fields.Dict(keys=fields.Str(), values=fields.Dict())
+    stores = fields.Dict(keys=fields.Str(), values=fields.Dict())
+    data_docs_sites = fields.Dict(keys=fields.Str(), values=fields.Dict(), allow_none=True)
+    
config_variables_file_path = fields.Str(allow_none=True) + + # noinspection PyUnusedLocal + @pre_dump + def handle_nested_dictionaries(self, data, **kwargs): + # PY2 support -> this local handling will be unnecessary when we can upgrade to marshmallow 3 + datasources = {name: datasourceConfigSchema.dump(config).data for name, config in data['datasources'].items()} + data['datasources'] = datasources + return data + + # noinspection PyMethodMayBeStatic + # noinspection PyUnusedLocal + def handle_error(self, exc, data, **kwargs): + """Log and raise our custom exception when (de)serialization fails.""" + logger.error(exc.messages) + raise ge_exceptions.InvalidDataContextConfigError("Error while processing DataContextConfig.", + exc) + + @validates_schema + def validate_schema(self, data): + if 'config_version' not in data: + raise ge_exceptions.InvalidDataContextConfigError( + "The key `config_version` is missing; please check your config file.", + validation_error=ValidationError("no config_version key")) + + if not isinstance(data['config_version'], (int, float)): + raise ge_exceptions.InvalidDataContextConfigError( + "The key `config_version` must be a number. Please check your config file.", + validation_error=ValidationError("config version not a number") + ) + + # When migrating from 0.7.x to 0.8.0 + if data['config_version'] == 0 and ( + "validations_store" in list(data.keys()) or "validations_stores" in list(data.keys())): + raise ge_exceptions.ZeroDotSevenConfigVersionError( + "You appear to be using a config version from the 0.7.x series. This version is no longer supported." + ) + elif data['config_version'] < MINIMUM_SUPPORTED_CONFIG_VERSION: + raise ge_exceptions.UnsupportedConfigVersionError( + "You appear to have an invalid config version ({}).\n The version number must be between {} and {}.".format( + data['config_version'], + MINIMUM_SUPPORTED_CONFIG_VERSION, + CURRENT_CONFIG_VERSION, + ), + ) + elif data['config_version'] > CURRENT_CONFIG_VERSION: + raise ge_exceptions.InvalidDataContextConfigError( + "You appear to have an invalid config version ({}).\n The maximum valid version is {}.".format( + data['config_version'], + CURRENT_CONFIG_VERSION + ), + validation_error=ValidationError("config version too high") + ) + + +dataContextConfigSchema = DataContextConfigSchema(strict=True) +datasourceConfigSchema = DatasourceConfigSchema(strict=True) diff --git a/great_expectations/data_context/types/base_resource_identifiers.py b/great_expectations/data_context/types/base_resource_identifiers.py index 0224e08f44d3..9c3a6f4d4a94 100644 --- a/great_expectations/data_context/types/base_resource_identifiers.py +++ b/great_expectations/data_context/types/base_resource_identifiers.py @@ -1,101 +1 @@ -import logging -logger = logging.getLogger(__name__) - -from collections import Iterable -from six import string_types, class_types - -from great_expectations.types import ( - RequiredKeysDotDict, - AllowedKeysDotDict, - OrderedKeysDotDict, -) - -class DataContextKey(AllowedKeysDotDict): - """is used to uniquely identify resources used by the DataContext. - - DataContextKey is based OrderedKeysDotDict. - It extends the base class with a to_string method that converts - the full, nested structure of the identifier into a string. 
- - For example: - - "DataAssetIdentifier.my_db.default_generator.my_table" - "ValidationResultIdentifier.my_db.default_generator.my_table.default_expectations.warnings.prod.20190801" - - These strings are also used for hashing, so that DataContextKey can be used in sets, etc. - - The parse_string_to_data_context_resource_identifier convenience method great_expectations.util - can instantiate a valid identifier from any full identifier string. - - - Notes on usage and convention: - DataContextKeys define the "namespace" of the DataContext. - In an import dependency sense, these Keys exist prior to the DataContext class: DataContext imports and makes use of many Ids. - That said, DataContextKeys exist primarily to make possible the work of the DataContext, so it's hard to separate the concepts cleanly. - - DataContextKeys are the internal typing system for DataContexts. - Within the DataContext (and related classes, like DataSources, Actions, Stores, etc.), concepts that can be typed as existing DataContextKeys should always be cast to DataContextKeys format---never strings or dictionaries. - Methods that expect a DataContextKeys should check types using isinstance at the entry point when they receive the input. - In particular, most ReadWriteStores are usually bound tightly to DataContextKeys. (This is not necessarily true for WriteOnlyStores, which might be creating un-typed data, such as HTML.) - If you're reading something from a Store and it's not keyed on a DataContextKeys, it probably should be. - - Note on typing in python in general: - We sometimes joke that adding types is turning the Great Expectations codebase into Java. - This may feel un-pythonic, but for the core classes and abstractions of the library, - we judge that the purpose of the code is closer to software engineering than data manipulation and analysis. - In that context, typing is far more helpful than harmful. - - Note on transition plans: - * The OrderedKeysDotDict class is a homegrown typing system that we regard as a transitionary state. - * When we deprecate python 2 (and possibly sooner), this class will be replaced by a more standard approach to typing. - """ - - pass - -class OrderedDataContextKey(DataContextKey, OrderedKeysDotDict): - """ - """ - - def __init__(self, *args, **kwargs): - # TODO : Pull out all of this logic into a `from_string` classmethod: - from_string = kwargs.pop("from_string", None) - - if from_string == None: - super(DataContextKey, self).__init__( - *args, **kwargs - ) - - else: - # /END TODO - super(DataContextKey, self).__init__( - *from_string.split(".")[1:], - **kwargs - ) - - # TODO : Change this to __str__ - def to_string(self, include_class_prefix=True, separator="."): - return separator.join(self._get_string_elements(include_class_prefix)) - - # NOTE: This logic has been pulled into NamespacedReadWriteStore. - # I'm not sure if we should keep a copy here. 
- def _get_string_elements(self, include_class_prefix=True): - string_elements = [] - - if include_class_prefix: - string_elements.append(self.__class__.__name__) - - for key in self._key_order: - if isinstance(self[key], DataContextKey): - string_elements += self[key]._get_string_elements(include_class_prefix=False) - else: - string_elements.append(str(self[key])) - - return string_elements - - # This is required to make OrderedDataContextKeys hashable - def __hash__(self): - return hash(self.to_string()) - - def __eq__(self, other): - return self.__hash__() == other.__hash__() - +# DELETE diff --git a/great_expectations/data_context/types/metrics.py b/great_expectations/data_context/types/metrics.py deleted file mode 100644 index 5158f0f2502c..000000000000 --- a/great_expectations/data_context/types/metrics.py +++ /dev/null @@ -1,186 +0,0 @@ -from six import string_types - -from great_expectations.data_context.types import NormalizedDataAssetName -from great_expectations.datasource.types import BatchFingerprint -from great_expectations.types import AllowedKeysDotDict -from great_expectations.profile.metrics_utils import make_dictionary_key - -try: - from urllib.parse import urlencode -except ImportError: - from urllib import urlencode - -# TODO : separate out a MetricIdentifier class, subclassed from DataContextKey, -# so that we can support operations like isinstance(foo, MetricIdentifier) -class Metric(AllowedKeysDotDict): - """Stores a named metric.""" - _allowed_keys = { - "metric_name", - "metric_value" - } - _required_keys = { - "metric_name", - "metric_value" - } - -# TODO : separate out a NamespaceAwareValidationMetricIdentifier class, subclassed from DataContextKey -class NamespaceAwareValidationMetric(Metric): - """Captures information from a validation result in a fully namespace aware way suitable to be accessed - in evaluation parameters, multi-batch validation meta analysis or multi batch validation.""" - _allowed_keys = { - "data_asset_name", - "batch_fingerprint", - "metric_name", - "metric_kwargs", - "metric_value" - } - _required_keys = { - "data_asset_name", - "batch_fingerprint", - "metric_name", - "metric_kwargs", - } - _key_types = { - "data_asset_name": NormalizedDataAssetName, - "batch_fingerprint": BatchFingerprint, - "metric_name": string_types, - "metric_kwargs": dict - } - - @property - def key(self): - return ('NamespaceAwareValidationMetric', - self.data_asset_name, - self.batch_fingerprint, - self.metric_name, - make_dictionary_key(self.metric_kwargs)) - - @property - def multi_batch_key(self): - return ('NamespaceAwareValidationMetric', - self.data_asset_name, - self.metric_name, - make_dictionary_key(self.metric_kwargs)) - -# TODO : separate out a NamespaceAwareExpectationDefinedValidationMetricIdentifier class, subclassed from DataContextKey -class NamespaceAwareExpectationDefinedValidationMetric(Metric): - """Captures information from a validation result in a fully namespace aware way suitable to be accessed - in evaluation parameters, multi-batch validation meta analysis or multi batch validation.""" - _allowed_keys = { - "data_asset_name", - "batch_fingerprint", - "expectation_type", - # the path to the key in the result dictionary that holds the metric, encoded as a tuple - # examples: - # for {'foo': 1} result_key will be ('foo',), - # for {'foo': {'bar': 1}} result_key will be ('foo','bar') - "result_key", - "metric_kwargs", - "metric_value" - } - _required_keys = { - "data_asset_name", - "batch_fingerprint", - "expectation_type", - "result_key", - 
"metric_kwargs" - } - _key_types = { - "data_asset_name": NormalizedDataAssetName, - "batch_fingerprint": BatchFingerprint, - "expectation_type": string_types, - "result_key": tuple, - "metric_kwargs": dict - } - - @property - def key(self): - return ('NamespaceAwareExpectationDefinedValidationMetric', - self.data_asset_name, - self.batch_fingerprint, - self.expectation_type, - self.result_key, - make_dictionary_key(self.metric_kwargs)) - - @property - def multi_batch_key(self): - return ('NamespaceAwareExpectationDefinedValidationMetric', - self.data_asset_name, - self.expectation_type, - self.result_key, - make_dictionary_key(self.metric_kwargs)) - -# TODO : separate out a MultiBatchNamespaceAwareValidationMetricIdentifier class, subclassed from DataContextKey -class MultiBatchNamespaceAwareValidationMetric(Metric): - """Holds values of a metric captured from validation results of multiple batches.""" - - _allowed_keys = { - "data_asset_name", - "metric_name", - "metric_kwargs", - "batch_fingerprints", - "batch_metric_values" - } - _required_keys = { - "data_asset_name", - "metric_name", - "metric_kwargs", - "batch_fingerprints", - "batch_metric_values" - } - _key_types = { - "data_asset_name": NormalizedDataAssetName, - "metric_name": string_types, - "metric_kwargs": dict, - "batch_fingerprints": list, - "batch_metric_values": list - } - - @property - def key(self): - return ('MultiBatchNamespaceAwareValidationMetric', - self.data_asset_name, - self.metric_name, - make_dictionary_key(self.metric_kwargs)) - - -# TODO : separate out a MultiBatchNamespaceAwareExpectationDefinedValidationMetricIdentifier class, subclassed from DataContextKey -class MultiBatchNamespaceAwareExpectationDefinedValidationMetric(Metric): - """Holds values of a metric captured from validation results of multiple batches.""" - - _allowed_keys = { - "data_asset_name", - # the path to the key in the result dictionary that holds the metric, encoded as a tuple - # examples: - # for {'foo': 1} result_key will be ('foo',), - # for {'foo': {'bar': 1}} result_key will be ('foo','bar') - "result_key", - "metric_kwargs", - "expectation_type", - "batch_fingerprints", - "batch_metric_values" - } - _required_keys = { - "data_asset_name", - "result_key", - "metric_kwargs", - "expectation_type", - "batch_fingerprints", - "batch_metric_values" - } - _key_types = { - "data_asset_name": NormalizedDataAssetName, - "result_key": tuple, - "metric_kwargs": dict, - "expectation_type": string_types, - "batch_fingerprints": list, - "batch_metric_values": list - } - - @property - def key(self): - return ('MultiBatchNamespaceAwareExpectationDefinedValidationMetric', - self.data_asset_name, - self.expectation_type, - self.result_key, - make_dictionary_key(self.metric_kwargs)) diff --git a/great_expectations/data_context/types/resource_identifiers.py b/great_expectations/data_context/types/resource_identifiers.py index 5350784cd4b4..e44cd518a289 100644 --- a/great_expectations/data_context/types/resource_identifiers.py +++ b/great_expectations/data_context/types/resource_identifiers.py @@ -1,94 +1,212 @@ import logging + +from marshmallow import Schema, fields, post_load + +from great_expectations.core import IDDict +from great_expectations.core.data_context_key import DataContextKey +from great_expectations.exceptions import InvalidDataContextKeyError, DataContextError + logger = logging.getLogger(__name__) -from six import string_types - -from great_expectations.data_context.types.base_resource_identifiers import ( - DataContextKey, - 
OrderedDataContextKey,
-)
-
-
-
-# TODO: Rename to DataAssetKey, for consistency
-class DataAssetIdentifier(OrderedDataContextKey):
-
-    def __init__(self, *args, **kwargs):
-        delimiter = kwargs.pop('delimiter', '/')
-        super(DataAssetIdentifier, self).__init__(*args, **kwargs)
-        self.__delimiter = delimiter
-
-    _key_order = [
-        "datasource",
-        "generator",
-        "generator_asset"
-    ]
-    _key_types = {
-        "datasource": string_types,
-        "generator": string_types,
-        "generator_asset": string_types
-    }
-    # NOTE: This pattern is kinda awkward. It would be nice to ONLY specify _key_order
-    _required_keys = set(_key_order)
-    _allowed_keys = set(_key_order) | {"_DataAssetIdentifier__delimiter"}
-
-    def __str__(self):
-        return self.__delimiter.join(
-            (self.datasource,
-             self.generator,
-             self.generator_asset)
+
+class ExpectationSuiteIdentifier(DataContextKey):
+
+    def __init__(self, expectation_suite_name):
+        super(ExpectationSuiteIdentifier, self).__init__()
+        self._expectation_suite_name = expectation_suite_name
+
+    @property
+    def expectation_suite_name(self):
+        return self._expectation_suite_name
+
+    def to_tuple(self):
+        return tuple(self.expectation_suite_name.split("."))
+
+    def to_fixed_length_tuple(self):
+        return self.expectation_suite_name,
+
+    @classmethod
+    def from_tuple(cls, tuple_):
+        return cls(".".join(tuple_))
+
+    @classmethod
+    def from_fixed_length_tuple(cls, tuple_):
+        return cls(expectation_suite_name=tuple_[0])
+
+
+class ExpectationSuiteIdentifierSchema(Schema):
+    expectation_suite_name = fields.Str()
+
+    # noinspection PyUnusedLocal
+    @post_load
+    def make_expectation_suite_identifier(self, data, **kwargs):
+        return ExpectationSuiteIdentifier(**data)
+
+
+class BatchIdentifier(DataContextKey):
+
+    def __init__(self, batch_identifier):
+        super(BatchIdentifier, self).__init__()
+        # batch_kwargs
+        # if isinstance(batch_identifier, (BatchKwargs, dict)):
+        #     self._batch_identifier = batch_identifier.batch_fingerprint
+        # else:
+        self._batch_identifier = batch_identifier
+
+    @property
+    def batch_identifier(self):
+        return self._batch_identifier
+
+    def to_tuple(self):
+        return self.batch_identifier,
+
+    @classmethod
+    def from_tuple(cls, tuple_):
+        return cls(batch_identifier=tuple_[0])
+
+
+class BatchIdentifierSchema(Schema):
+    batch_identifier = fields.Str()
+
+    # noinspection PyUnusedLocal
+    @post_load
+    def make_batch_identifier(self, data, **kwargs):
+        return BatchIdentifier(**data)
+
+
+class ValidationResultIdentifier(DataContextKey):
+    """A ValidationResultIdentifier identifies a validation result by the fully qualified
+    expectation_suite_identifier, run_id, and batch_identifier.
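+
+    For example (all values illustrative), the identifier built from the suite name
+    "npi.warning", run id "20190926T120000.000000Z", and batch identifier
+    "a81a4a3e7f5f4e9b" yields
+
+        to_tuple() == ("npi", "warning", "20190926T120000.000000Z", "a81a4a3e7f5f4e9b")
+
+    because the expectation suite name is split on "." when building the tuple.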
+ """ + + def __init__(self, expectation_suite_identifier, run_id, batch_identifier): + """Constructs a ValidationResultIdentifier + + Args: + expectation_suite_identifier (ExpectationSuiteIdentifier, list, tuple, or dict): + identifying information for the fully qualified expectation suite used to validate + run_id (str): The run_id for which validation occurred + """ + super(ValidationResultIdentifier, self).__init__() + self._expectation_suite_identifier = expectation_suite_identifier + self._run_id = run_id + self._batch_identifier = batch_identifier + + @property + def expectation_suite_identifier(self): + return self._expectation_suite_identifier + + @property + def run_id(self): + return self._run_id + + @property + def batch_identifier(self): + return self._batch_identifier + + def to_tuple(self): + return tuple( + list(self.expectation_suite_identifier.to_tuple()) + [ + self.run_id or "__none__", + self.batch_identifier or "__none__" + ] + ) + + def to_fixed_length_tuple(self): + return self.expectation_suite_identifier.expectation_suite_name, self.run_id or "__none__", \ + self.batch_identifier or "__none__" + + @classmethod + def from_tuple(cls, tuple_): + return cls(ExpectationSuiteIdentifier.from_tuple(tuple_[0:-2]), tuple_[-2], tuple_[-1]) + + @classmethod + def from_fixed_length_tuple(cls, tuple_): + return cls(ExpectationSuiteIdentifier(tuple_[0]), tuple_[1], tuple_[2]) + + @classmethod + def from_object(cls, validation_result): + batch_kwargs = validation_result.meta.get("batch_kwargs", {}) + if isinstance(batch_kwargs, IDDict): + batch_identifier = batch_kwargs.to_id() + elif isinstance(batch_kwargs, dict): + batch_identifier = IDDict(batch_kwargs).to_id() + else: + raise DataContextError("Unable to construct ValidationResultIdentifier from provided object.") + return cls( + expectation_suite_identifier=ExpectationSuiteIdentifier(validation_result.meta["expectation_suite_name"]), + run_id=validation_result.meta.get("run_id"), + batch_identifier=batch_identifier ) - def __repr__(self): - return str(self) - - -# TODO: Rename to ExpectationSuiteKey, for consistency -class ExpectationSuiteIdentifier(OrderedDataContextKey): - _key_order = [ - "data_asset_name", - "expectation_suite_name", - ] - _key_types = { - "data_asset_name" : DataAssetIdentifier, - "expectation_suite_name" : string_types, - } - # NOTE: This pattern is kinda awkward. It would be nice to ONLY specify _key_order - _required_keys = set(_key_order) - _allowed_keys = set(_key_order) - -# TODO: Rename to ValidatioResultKey, for consistency -class ValidationResultIdentifier(OrderedDataContextKey): - _key_order = [ - "expectation_suite_identifier", - "run_id", - # "purpose" - ] - _key_types = { - "expectation_suite_identifier": ExpectationSuiteIdentifier, - "run_id": string_types - } - # NOTE: This pattern is kinda awkward. 
It would be nice to ONLY specify _key_order - _required_keys = set(_key_order) - _allowed_keys = set(_key_order) - -# TODO: Rename to SiteSectionKey, for consistency + +class ValidationResultIdentifierSchema(Schema): + expectation_suite_identifier = fields.Nested(ExpectationSuiteIdentifierSchema, required=True, error_messages={ + 'required': 'expectation_suite_identifier is required for a ValidationResultIdentifier'}) + run_id = fields.Str(required=True, error_messages={'required': "run_id is required for a " + "ValidationResultIdentifier"}) + batch_identifier = fields.Nested(BatchIdentifierSchema, required=True) + + # noinspection PyUnusedLocal + @post_load + def make_validation_result_identifier(self, data, **kwargs): + return ValidationResultIdentifier(**data) + + class SiteSectionIdentifier(DataContextKey): - _required_keys = set([ - "site_section_name", - "resource_identifier", - ]) - _allowed_keys = _required_keys - _key_types = { - "site_section_name" : string_types, - "resource_identifier" : DataContextKey, - # "resource_identifier", ... is NOT strictly typed, since it can contain any type of ResourceIdentifier - } - - def __hash__(self): - return hash(self.site_section_name+"::"+self.resource_identifier.to_string()) - - def __eq__(self, other): - print(self) - print(other) - return self.__hash__() == other.__hash__() + def __init__(self, site_section_name, resource_identifier): + self._site_section_name = site_section_name + if site_section_name in ["validations", "profiling"]: + if isinstance(resource_identifier, ValidationResultIdentifier): + self._resource_identifier = resource_identifier + elif isinstance(resource_identifier, (tuple, list)): + self._resource_identifier = ValidationResultIdentifier(*resource_identifier) + else: + self._resource_identifier = ValidationResultIdentifier(**resource_identifier) + elif site_section_name == "expectations": + if isinstance(resource_identifier, ExpectationSuiteIdentifier): + self._resource_identifier = resource_identifier + elif isinstance(resource_identifier, (tuple, list)): + self._resource_identifier = ExpectationSuiteIdentifier(*resource_identifier) + else: + self._resource_identifier = ExpectationSuiteIdentifier(**resource_identifier) + else: + raise InvalidDataContextKeyError( + "SiteSectionIdentifier only supports 'validations' and 'expectations' as site section names" + ) + + @property + def site_section_name(self): + return self._site_section_name + + @property + def resource_identifier(self): + return self._resource_identifier + + def to_tuple(self): + # if PY3: + # return (self.site_section_name, *self.resource_identifier.to_tuple()) + # else: + site_section_identifier_tuple_list = [self.site_section_name] + list(self.resource_identifier.to_tuple()) + return tuple(site_section_identifier_tuple_list) + + @classmethod + def from_tuple(cls, tuple_): + if tuple_[0] == "validations": + return cls( + site_section_name=tuple_[0], + resource_identifier=ValidationResultIdentifier.from_tuple(tuple_[1:]) + ) + elif tuple_[0] == "expectations": + return cls( + site_section_name=tuple_[0], + resource_identifier=ExpectationSuiteIdentifier.from_tuple(tuple_[1:]) + ) + else: + raise InvalidDataContextKeyError( + "SiteSectionIdentifier only supports 'validations' and 'expectations' as site section names" + ) + + +expectationSuiteIdentifierSchema = ExpectationSuiteIdentifierSchema(strict=True) +validationResultIdentifierSchema = ValidationResultIdentifierSchema(strict=True) diff --git a/great_expectations/data_context/util.py 
b/great_expectations/data_context/util.py index bc52b1ffe943..42265bd30544 100644 --- a/great_expectations/data_context/util.py +++ b/great_expectations/data_context/util.py @@ -5,8 +5,10 @@ import importlib import copy import re +import inspect from collections import OrderedDict +from great_expectations.data_context.types.base import DataContextConfig from great_expectations.exceptions import ( PluginModuleNotFoundError, PluginClassNotFoundError, @@ -32,20 +34,6 @@ def safe_mmkdir(directory, exist_ok=True): raise -# TODO : Consider moving this into types.resource_identifiers.DataContextKey. -# NOTE : We **don't** want to encourage stringification of keys, other than in tests, etc. -# TODO : Rename to parse_string_to_data_context_key -def parse_string_to_data_context_resource_identifier(string, separator="."): - string_elements = string.split(separator) - - loaded_module = importlib.import_module("great_expectations.data_context.types.resource_identifiers") - class_ = getattr(loaded_module, string_elements[0]) - - class_instance = class_(*(string_elements[1:])) - - return class_instance - - def load_class(class_name, module_name): """Dynamically load a class from strings or raise a helpful error.""" @@ -68,10 +56,9 @@ def load_class(class_name, module_name): return class_ -# TODO: Rename runtime_config to runtime_environment and pass it through as a typed object, rather than unpacking it. # TODO: Rename config to constructor_kwargs and config_defaults -> constructor_kwarg_default # TODO: Improve error messages in this method. Since so much of our workflow is config-driven, this will be a *super* important part of DX. -def instantiate_class_from_config(config, runtime_config, config_defaults=None): +def instantiate_class_from_config(config, runtime_environment, config_defaults=None): """Build a GE class from configuration dictionaries.""" if config_defaults is None: @@ -109,7 +96,21 @@ def instantiate_class_from_config(config, runtime_config, config_defaults=None): config_with_defaults = copy.deepcopy(config_defaults) config_with_defaults.update(config) - config_with_defaults.update(runtime_config) + if runtime_environment is not None: + # If there are additional kwargs available in the runtime_environment requested by a + # class to be instantiated, provide them + if six.PY3: + argspec = inspect.getfullargspec(class_.__init__)[0][1:] + else: + argspec = inspect.getargspec(class_.__init__)[0][1:] + missing_args = set(argspec) - set(config_with_defaults.keys()) + config_with_defaults.update( + {missing_arg: runtime_environment[missing_arg] for missing_arg in missing_args + if missing_arg in runtime_environment} + ) + # Add the entire runtime_environment as well if it's requested + if "runtime_environment" in missing_args: + config_with_defaults.update({"runtime_environment": runtime_environment}) try: class_instance = class_(**config_with_defaults) @@ -164,7 +165,7 @@ def substitute_config_variable(template_str, config_variables_dict): else: return template_str[:match.start()] + config_variable_value + template_str[match.end():] - raise InvalidConfigError("Unable to find match for config variable {:s}".format(match.group(1))) + raise InvalidConfigError("Unable to find match for config variable {:s}. 
See https://great-expectations.readthedocs.io/en/latest/reference/data_context_reference.html#managing-environment-and-secrets".format(match.group(1))) return template_str @@ -180,9 +181,27 @@ def substitute_all_config_variables(data, replace_variables_dict): :param replace_variables_dict: :return: a dictionary with all the variables replaced with their values """ + if isinstance(data, DataContextConfig): + data = data.as_dict() + if isinstance(data, dict) or isinstance(data, OrderedDict): return {k: substitute_all_config_variables(v, replace_variables_dict) for k, v in data.items()} elif isinstance(data, list): return [substitute_all_config_variables(v, replace_variables_dict) for v in data] return substitute_config_variable(data, replace_variables_dict) + + +def file_relative_path(dunderfile, relative_path): + """ + This function is useful when one needs to load a file that is + relative to the position of the current file. (Such as when + you encode a configuration file path in a source file and want + it runnable from any current working directory) + + It is meant to be used like the following: + file_relative_path(__file__, 'path/relative/to/file') + + H/T https://github.com/dagster-io/dagster/blob/8a250e9619a49e8bff8e9aa7435df89c2d2ea039/python_modules/dagster/dagster/utils/__init__.py#L34 + """ + return os.path.join(os.path.dirname(dunderfile), relative_path) diff --git a/great_expectations/dataset/dataset.py b/great_expectations/dataset/dataset.py index c6aed48de3ea..6f7922839d1e 100644 --- a/great_expectations/dataset/dataset.py +++ b/great_expectations/dataset/dataset.py @@ -138,8 +138,7 @@ def inner_wrapper(self, column, result_format=None, *args, **kwargs): if result_format['result_format'] in ["SUMMARY", "COMPLETE"]: return return_obj - raise ValueError("Unknown result_format %s." % - (result_format['result_format'],)) + raise ValueError("Unknown result_format %s." % result_format['result_format']) return inner_wrapper @@ -382,7 +381,7 @@ def test_column_aggregate_expectation_function(self, function, *args, **kwargs): def expect_column_to_exist( self, column, column_index=None, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect the specified column to exist. @@ -434,7 +433,7 @@ def expect_table_columns_to_match_ordered_list( self, column_list, result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None, ): @@ -504,7 +503,7 @@ def expect_table_columns_to_match_ordered_list( def expect_table_column_count_to_be_between( self, min_value=None, max_value=None, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None, ): """Expect the number of columns to be between two values. @@ -590,7 +589,7 @@ def expect_table_column_count_to_be_between( def expect_table_column_count_to_equal( self, value, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect the number of columns to equal a value. @@ -647,7 +646,7 @@ def expect_table_column_count_to_equal( def expect_table_row_count_to_be_between( self, min_value=None, max_value=None, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None, ): """Expect the number of rows to be between two values.
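The hunks above (and those that follow) flip the `include_config` default from `False` to `True` across the expectation methods, so every validation result now embeds the expectation configuration that produced it. A minimal sketch of what that buys, assuming the 0.9.0-style typed result object with an `expectation_config` attribute:

```python
import pandas as pd
import great_expectations as ge

df = ge.from_pandas(pd.DataFrame({"id": [1, 2, 3]}))
result = df.expect_table_row_count_to_be_between(min_value=1, max_value=10)

assert result.success
# With include_config=True as the new default, the result carries its own
# configuration; previously this required passing include_config=True explicitly.
print(result.expectation_config.expectation_type)
# expect_table_row_count_to_be_between
```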
@@ -733,7 +732,7 @@ def expect_table_row_count_to_be_between( def expect_table_row_count_to_equal( self, value, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect the number of rows to equal a value. @@ -793,7 +792,7 @@ def expect_table_row_count_to_equal( def expect_column_values_to_be_unique(self, column, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect each column value to be unique. @@ -839,7 +838,7 @@ def expect_column_values_to_be_unique(self, def expect_column_values_to_not_be_null(self, column, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column values to not be null. @@ -888,7 +887,7 @@ def expect_column_values_to_not_be_null(self, def expect_column_values_to_be_null(self, column, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column values to be null. @@ -936,7 +935,7 @@ def expect_column_values_to_be_of_type( column, type_, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect a column to contain values of a specified data type. @@ -998,7 +997,7 @@ def expect_column_values_to_be_in_type_list( column, type_list, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect a column to contain values from a specified type list. @@ -1063,7 +1062,7 @@ def expect_column_values_to_be_in_set(self, value_set, mostly=None, parse_strings_as_datetimes=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): # noinspection PyUnresolvedReferences """Expect each column value to be in a given set. @@ -1136,7 +1135,7 @@ def expect_column_values_to_not_be_in_set(self, value_set, mostly=None, parse_strings_as_datetimes=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): # noinspection PyUnresolvedReferences """Expect column entries to not be in the set. @@ -1213,7 +1212,7 @@ def expect_column_values_to_be_between(self, parse_strings_as_datetimes=False, output_strftime_format=None, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column entries to be between a minimum value and a maximum value (inclusive). @@ -1279,7 +1278,7 @@ def expect_column_values_to_be_increasing(self, strictly=None, parse_strings_as_datetimes=False, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column values to be increasing. 
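Stepping back to the `instantiate_class_from_config` change in `great_expectations/data_context/util.py` above: `runtime_environment` is now merged selectively, injecting only constructor arguments that the target class actually declares and that the config did not already supply. A standalone sketch of that mechanism (the `ToyStore` class is hypothetical, purely for illustration):

```python
import inspect

class ToyStore(object):
    # Hypothetical stand-in for a GE store or generator class.
    def __init__(self, base_directory, datasource=None):
        self.base_directory = base_directory
        self.datasource = datasource

config = {"base_directory": "/tmp/ge"}
runtime_environment = {"datasource": "my_datasource", "unrelated": 42}

# Same selection logic as the diff: inspect the constructor's argument
# names (dropping "self"), then fill in only the ones config is missing.
argspec = inspect.getfullargspec(ToyStore.__init__)[0][1:]
missing_args = set(argspec) - set(config.keys())
config.update({
    arg: runtime_environment[arg]
    for arg in missing_args if arg in runtime_environment
})

instance = ToyStore(**config)
assert instance.datasource == "my_datasource"  # "unrelated" was never injected
```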
@@ -1337,7 +1336,7 @@ def expect_column_values_to_be_decreasing(self, strictly=None, parse_strings_as_datetimes=False, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column values to be decreasing. @@ -1402,7 +1401,7 @@ def expect_column_value_lengths_to_be_between( min_value=None, max_value=None, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column entries to be strings with length between a minimum value and a maximum value (inclusive). @@ -1462,7 +1461,7 @@ def expect_column_value_lengths_to_equal(self, column, value, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column entries to be strings with length equal to the provided value. @@ -1513,7 +1512,7 @@ def expect_column_values_to_match_regex(self, column, regex, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column entries to be strings that match a given regular expression. Valid matches can be found \ anywhere in the string, for example "[at]+" will identify the following strings as expected: "cat", "hat", \ @@ -1568,7 +1567,7 @@ def expect_column_values_to_not_match_regex( column, regex, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column entries to be strings that do NOT match a given regular expression. The regex must not match \ any portion of the provided string. For example, "[at]+" would identify the following strings as expected: \ @@ -1622,7 +1621,7 @@ def expect_column_values_to_match_regex_list( self, column, regex_list, match_on="any", mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect the column entries to be strings that can be matched to either any of or all of a list of regular expressions. Matches can be anywhere in the string. @@ -1679,7 +1678,7 @@ def expect_column_values_to_not_match_regex_list( self, column, regex_list, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None): """Expect the column entries to be strings that do not match any of a list of regular expressions. Matches can be anywhere in the string. @@ -1736,7 +1735,7 @@ def expect_column_values_to_match_strftime_format( column, strftime_format, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column entries to be strings representing a date or time with a given format. @@ -1781,7 +1780,7 @@ def expect_column_values_to_be_dateutil_parseable( self, column, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column entries to be parsable using dateutil. 
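The `file_relative_path` helper added to `great_expectations/data_context/util.py` above resolves paths against the calling module rather than the current working directory. A short usage sketch (the fixture path is illustrative only):

```python
import os

def file_relative_path(dunderfile, relative_path):
    # Same implementation as the helper added in the util.py hunk above.
    return os.path.join(os.path.dirname(dunderfile), relative_path)

# Resolves relative to this module's location, so the script behaves the
# same regardless of the directory it is launched from.
fixture_path = file_relative_path(__file__, "fixtures/example.csv")
```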
@@ -1824,7 +1823,7 @@ def expect_column_values_to_be_json_parseable( self, column, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column entries to be data written in JavaScript Object Notation. @@ -1872,7 +1871,7 @@ def expect_column_values_to_match_json_schema( column, json_schema, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column entries to be JSON objects matching a given JSON schema. @@ -1927,7 +1926,7 @@ def expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than( self, column, distribution, p_value=0.05, params=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): """ Expect the column values to be distributed similarly to a scipy distribution. \ @@ -2008,7 +2007,7 @@ def expect_column_distinct_values_to_be_in_set( column, value_set, parse_strings_as_datetimes=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): # noinspection PyUnresolvedReferences """Expect the set of distinct column values to be contained by a given set. @@ -2119,7 +2118,7 @@ def expect_column_distinct_values_to_equal_set( column, value_set, parse_strings_as_datetimes=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): # noinspection PyUnresolvedReferences """Expect the set of distinct column values to equal a given set. @@ -2207,7 +2206,7 @@ def expect_column_distinct_values_to_contain_set( column, value_set, parse_strings_as_datetimes=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): # noinspection PyUnresolvedReferences """Expect the set of distinct column values to contain a given set. @@ -2295,7 +2294,7 @@ def expect_column_mean_to_be_between( column, min_value=None, max_value=None, strict_min=False, strict_max=False, # tolerance=1e-9, - result_format=None, include_config=False, catch_exceptions=None, meta=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None, ): """Expect the column mean to be between a minimum value and a maximum value (inclusive). @@ -2411,7 +2410,7 @@ def expect_column_median_to_be_between( column, min_value=None, max_value=None, strict_min=False, strict_max=False, # tolerance=1e-9, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None, ): """Expect the column median to be between a minimum value and a maximum value. 
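Tying back to the rewritten `resource_identifiers.py` above: `ExpectationSuiteIdentifier` now round-trips a dotted suite name through a tuple, which is what lets tuple-keyed store backends lay suites out as nested paths. A small sketch using only the methods shown in the diff:

```python
from great_expectations.data_context.types.resource_identifiers import (
    ExpectationSuiteIdentifier,
)

identifier = ExpectationSuiteIdentifier("npi.warning")

# to_tuple() splits on ".", so a tuple-keyed filesystem backend can store
# this suite at npi/warning.json; from_tuple() reverses the split.
as_tuple = identifier.to_tuple()          # ("npi", "warning")
restored = ExpectationSuiteIdentifier.from_tuple(as_tuple)
assert restored.expectation_suite_name == "npi.warning"
```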
@@ -2521,7 +2520,7 @@ def expect_column_quantile_values_to_be_between( column, quantile_ranges, allow_relative_error=False, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None, ): # noinspection PyUnresolvedReferences @@ -2657,7 +2656,7 @@ def expect_column_stdev_to_be_between( column, min_value=None, max_value=None, strict_min=False, strict_max=False, # tolerance=1e-9, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect the column standard deviation to be between a minimum value and a maximum value. @@ -2758,7 +2757,7 @@ def expect_column_unique_value_count_to_be_between( self, column, min_value=None, max_value=None, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None, ): """Expect the number of unique values to be between a minimum value and a maximum value. @@ -2848,7 +2847,7 @@ def expect_column_proportion_of_unique_values_to_be_between( column, min_value=0, max_value=1, strict_min=False, strict_max=False, # tolerance=1e-9, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None, ): """Expect the proportion of unique values to be between a minimum value and a maximum value. @@ -2962,7 +2961,7 @@ def expect_column_most_common_value_to_be_in_set( column, value_set, ties_okay=None, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None, ): """Expect the most common value to be within the designated value set @@ -3037,7 +3036,7 @@ def expect_column_sum_to_be_between( column, min_value=None, max_value=None, strict_min=False, strict_max=False, # tolerance=1e-9, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect the column sum to be between a min and max value @@ -3144,7 +3143,7 @@ def expect_column_min_to_be_between( strict_min=False, strict_max=False, # tolerance=1e-9, parse_strings_as_datetimes=False, output_strftime_format=None, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect the column minimum to be between a min and max value @@ -3284,7 +3283,7 @@ def expect_column_max_to_be_between( # tolerance=1e-9, parse_strings_as_datetimes=False, output_strftime_format=None, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect the column max to be between a min and max value @@ -3422,7 +3421,7 @@ def expect_column_chisquare_test_p_value_to_be_greater_than( partition_object=None, p=0.05, tail_weight_holdout=0, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None, ): """Expect column values to be distributed similarly to the provided categorical partition. \
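The `get_column_value_counts` fix in the `pandas_dataset.py` hunk below guards against object columns that mix types, where pandas raises a `TypeError` during sorting. A minimal reproduction of the failure and of the cast-to-str workaround the diff applies:

```python
import pandas as pd

# An object column holding both strings and floats: comparing them during
# sort_index raises TypeError under Python 3.
counts = pd.Series(["a", 1.5, "a", 1.5, "b"]).value_counts()
try:
    counts.sort_index(inplace=True)
except TypeError:
    # Same workaround as the diff: cast the index to str so it sorts.
    counts.index = counts.index.astype(str)
    counts.sort_index(inplace=True)
```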
@@ -3541,7 +3540,7 @@ def expect_column_bootstrapped_ks_test_p_value_to_be_greater_than( column, partition_object=None, p=0.05, bootstrap_samples=None, bootstrap_sample_size=None, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect column values to be distributed similarly to the provided continuous partition. This expectation \ @@ -3633,7 +3632,7 @@ def expect_column_kl_divergence_to_be_less_than( tail_weight_holdout=0, internal_weight_holdout=0, bucketize_data=True, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """Expect the Kullback-Leibler (KL) divergence (relative entropy) of the specified column with respect to the \ @@ -4027,7 +4026,7 @@ def expect_column_pair_values_to_be_equal( self, column_A, column_B, ignore_row_if="both_values_are_missing", - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """ @@ -4071,7 +4070,7 @@ def expect_column_pair_values_A_to_be_greater_than_B( parse_strings_as_datetimes=False, allow_cross_type_comparisons=None, ignore_row_if="both_values_are_missing", - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """ @@ -4118,7 +4117,7 @@ def expect_column_pair_values_to_be_in_set( column_B, value_pairs_set, ignore_row_if="both_values_are_missing", - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """ @@ -4165,7 +4164,7 @@ def expect_multicolumn_values_to_be_unique( self, column_list, ignore_row_if="all_values_are_missing", - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None ): """ diff --git a/great_expectations/dataset/pandas_dataset.py b/great_expectations/dataset/pandas_dataset.py index bd3a52ff530e..adc8cd760f07 100644 --- a/great_expectations/dataset/pandas_dataset.py +++ b/great_expectations/dataset/pandas_dataset.py @@ -13,6 +13,7 @@ from scipy import stats from six import PY2, PY3, integer_types, string_types +from great_expectations.core import ExpectationConfiguration from great_expectations.data_asset import DataAsset from .dataset import Dataset from great_expectations.data_asset.util import DocInherit, parse_result_format @@ -368,7 +369,14 @@ def get_column_value_counts(self, column, sort="value", collate=None): ) counts = self[column].value_counts() if sort == "value": - counts.sort_index(inplace=True) + try: + counts.sort_index(inplace=True) + except TypeError: + # Having values of multiple types in an object dtype column (e.g., strings and floats) + # raises a TypeError when the sorting method performs comparisons.
+ if self[column].dtype == object: + counts.index = counts.index.astype(str) + counts.sort_index(inplace=True) elif sort == "counts": counts.sort_values(inplace=True) counts.name = "count" @@ -423,7 +431,7 @@ def get_column_count_in_range(self, column, min_val=None, max_val=None, strict_m @MetaPandasDataset.column_map_expectation def expect_column_values_to_be_unique(self, column, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): return ~column.duplicated(keep=False) @@ -431,7 +439,7 @@ def expect_column_values_to_be_unique(self, column, @MetaPandasDataset.column_map_expectation def expect_column_values_to_not_be_null(self, column, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None, include_nulls=True): + result_format=None, include_config=True, catch_exceptions=None, meta=None, include_nulls=True): return ~column.isnull() @@ -439,7 +447,7 @@ def expect_column_values_to_not_be_null(self, column, @MetaPandasDataset.column_map_expectation def expect_column_values_to_be_null(self, column, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): return column.isnull() @@ -493,7 +501,7 @@ def expect_column_values_to_be_of_type( "expect_column_values_to_be_of_type", column ) if len(existing_expectations) == 1: - self._expectation_suite["expectations"].pop(existing_expectations[0]) + self._expectation_suite.expectations.pop(existing_expectations[0]) # Now, rename the expectation we just added @@ -501,9 +509,14 @@ def expect_column_values_to_be_of_type( "_expect_column_values_to_be_of_type__aggregate", column ) assert len(new_expectations) == 1 - expectation_index = new_expectations[0] - self._expectation_suite["expectations"][expectation_index]["expectation_type"] = \ - "expect_column_values_to_be_of_type" + old_config = self._expectation_suite.expectations[new_expectations[0]] + new_config = ExpectationConfiguration( + expectation_type="expect_column_values_to_be_of_type", + kwargs=old_config.kwargs, + meta=old_config.meta, + success_on_last_run=old_config.success_on_last_run + ) + self._expectation_suite.expectations[new_expectations[0]] = new_config else: res = self._expect_column_values_to_be_of_type__map( column, type_, **kwargs @@ -521,16 +534,21 @@ def expect_column_values_to_be_of_type( "expect_column_values_to_be_of_type", column ) if len(existing_expectations) == 1: - self._expectation_suite["expectations"].pop(existing_expectations[0]) + self._expectation_suite.expectations.pop(existing_expectations[0]) # Now, rename the expectation we just added new_expectations = self.find_expectation_indexes( "_expect_column_values_to_be_of_type__map", column ) assert len(new_expectations) == 1 - expectation_index = new_expectations[0] - self._expectation_suite["expectations"][expectation_index]["expectation_type"] = \ - "expect_column_values_to_be_of_type" + old_config = self._expectation_suite.expectations[new_expectations[0]] + new_config = ExpectationConfiguration( + expectation_type="expect_column_values_to_be_of_type", + kwargs=old_config.kwargs, + meta=old_config.meta, + success_on_last_run=old_config.success_on_last_run + ) + self._expectation_suite.expectations[new_expectations[0]] = new_config return res @@ -539,7 +557,7 @@ def _expect_column_values_to_be_of_type__aggregate( self, column, type_, mostly=None, - 
result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): if mostly is not None: raise ValueError("PandasDataset cannot support mostly for a column with a non-object dtype.") @@ -607,7 +625,7 @@ def _expect_column_values_to_be_of_type__map( self, column, type_, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): comp_types = [] try: @@ -685,15 +703,20 @@ def expect_column_values_to_be_in_type_list( "expect_column_values_to_be_in_type_list", column ) if len(existing_expectations) == 1: - self._expectation_suite["expectations"].pop(existing_expectations[0]) + self._expectation_suite.expectations.pop(existing_expectations[0]) new_expectations = self.find_expectation_indexes( "_expect_column_values_to_be_in_type_list__aggregate", column ) assert len(new_expectations) == 1 - expectation_index = new_expectations[0] - self._expectation_suite["expectations"][expectation_index]["expectation_type"] = \ - "expect_column_values_to_be_in_type_list" + old_config = self._expectation_suite.expectations[new_expectations[0]] + new_config = ExpectationConfiguration( + expectation_type="expect_column_values_to_be_in_type_list", + kwargs=old_config.kwargs, + meta=old_config.meta, + success_on_last_run=old_config.success_on_last_run + ) + self._expectation_suite.expectations[new_expectations[0]] = new_config else: res = self._expect_column_values_to_be_in_type_list__map( column, type_list, **kwargs @@ -711,16 +734,21 @@ def expect_column_values_to_be_in_type_list( "expect_column_values_to_be_in_type_list", column ) if len(existing_expectations) == 1: - self._expectation_suite["expectations"].pop(existing_expectations[0]) + self._expectation_suite.expectations.pop(existing_expectations[0]) # Now, rename the expectation we just added new_expectations = self.find_expectation_indexes( "_expect_column_values_to_be_in_type_list__map", column ) assert len(new_expectations) == 1 - expectation_index = new_expectations[0] - self._expectation_suite["expectations"][expectation_index]["expectation_type"] = \ - "expect_column_values_to_be_in_type_list" + old_config = self._expectation_suite.expectations[new_expectations[0]] + new_config = ExpectationConfiguration( + expectation_type="expect_column_values_to_be_in_type_list", + kwargs=old_config.kwargs, + meta=old_config.meta, + success_on_last_run=old_config.success_on_last_run + ) + self._expectation_suite.expectations[new_expectations[0]] = new_config return res @@ -729,7 +757,7 @@ def _expect_column_values_to_be_in_type_list__aggregate( self, column, type_list, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): if mostly is not None: raise ValueError("PandasDataset cannot support mostly for a column with a non-object dtype.") @@ -774,7 +802,7 @@ def _expect_column_values_to_be_in_type_list__map( self, column, type_list, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): comp_types = [] for type_ in type_list: @@ -809,7 +837,7 @@ def _expect_column_values_to_be_in_type_list__map( def expect_column_values_to_be_in_set(self, column, value_set, mostly=None, parse_strings_as_datetimes=None, - result_format=None, include_config=False, 
catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): if value_set is None: # Vacuously true return np.ones(len(column), dtype=np.bool_) @@ -825,7 +853,7 @@ def expect_column_values_to_be_in_set(self, column, value_set, def expect_column_values_to_not_be_in_set(self, column, value_set, mostly=None, parse_strings_as_datetimes=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): if parse_strings_as_datetimes: parsed_value_set = self._parse_value_set(value_set) else: @@ -843,7 +871,7 @@ def expect_column_values_to_be_between(self, output_strftime_format=None, allow_cross_type_comparisons=None, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): if min_value is None and max_value is None: raise ValueError("min_value and max_value cannot both be None") @@ -862,7 +890,10 @@ def expect_column_values_to_be_between(self, if max_value: max_value = parse(max_value) - temp_column = column.map(parse) + try: + temp_column = column.map(parse) + except TypeError as e: + temp_column = column else: temp_column = column @@ -954,7 +985,7 @@ def is_between(val): @MetaPandasDataset.column_map_expectation def expect_column_values_to_be_increasing(self, column, strictly=None, parse_strings_as_datetimes=None, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): if parse_strings_as_datetimes: temp_column = column.map(parse) @@ -982,7 +1013,7 @@ def expect_column_values_to_be_increasing(self, column, strictly=None, parse_str @MetaPandasDataset.column_map_expectation def expect_column_values_to_be_decreasing(self, column, strictly=None, parse_strings_as_datetimes=None, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): if parse_strings_as_datetimes: temp_column = column.map(parse) @@ -1012,7 +1043,7 @@ def expect_column_value_lengths_to_be_between(self, column, min_value=None, max_value=None, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): if min_value is None and max_value is None: raise ValueError("min_value and max_value cannot both be None") @@ -1046,28 +1077,28 @@ def expect_column_value_lengths_to_be_between(self, column, @MetaPandasDataset.column_map_expectation def expect_column_value_lengths_to_equal(self, column, value, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): return column.str.len() == value @DocInherit @MetaPandasDataset.column_map_expectation def expect_column_values_to_match_regex(self, column, regex, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): return column.astype(str).str.contains(regex) @DocInherit @MetaPandasDataset.column_map_expectation def expect_column_values_to_not_match_regex(self, column, regex, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, 
include_config=True, catch_exceptions=None, meta=None): return ~column.astype(str).str.contains(regex) @DocInherit @MetaPandasDataset.column_map_expectation def expect_column_values_to_match_regex_list(self, column, regex_list, match_on="any", mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): regex_matches = [] for regex in regex_list: @@ -1086,7 +1117,7 @@ def expect_column_values_to_match_regex_list(self, column, regex_list, match_on= @MetaPandasDataset.column_map_expectation def expect_column_values_to_not_match_regex_list(self, column, regex_list, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): regex_matches = [] for regex in regex_list: regex_matches.append(column.astype(str).str.contains(regex)) @@ -1098,7 +1129,7 @@ def expect_column_values_to_not_match_regex_list(self, column, regex_list, @MetaPandasDataset.column_map_expectation def expect_column_values_to_match_strftime_format(self, column, strftime_format, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, + result_format=None, include_config=True, catch_exceptions=None, meta=None): # Below is a simple validation that the provided format can both format and parse a datetime object. # %D is an example of a format that can format but not parse, e.g. @@ -1125,7 +1156,7 @@ def is_parseable_by_format(val): @MetaPandasDataset.column_map_expectation def expect_column_values_to_be_dateutil_parseable(self, column, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): def is_parseable(val): try: if type(val) != str: @@ -1144,7 +1175,7 @@ def is_parseable(val): @MetaPandasDataset.column_map_expectation def expect_column_values_to_be_json_parseable(self, column, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): def is_json(val): try: json.loads(val) @@ -1158,7 +1189,7 @@ def is_json(val): @MetaPandasDataset.column_map_expectation def expect_column_values_to_match_json_schema(self, column, json_schema, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): def matches_json_schema(val): try: val_json = json.loads(val) @@ -1180,7 +1211,7 @@ def matches_json_schema(val): def expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(self, column, distribution, p_value=0.05, params=None, result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None): column = self[column] @@ -1219,7 +1250,7 @@ def expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than( @DocInherit @MetaPandasDataset.column_aggregate_expectation def expect_column_bootstrapped_ks_test_p_value_to_be_greater_than(self, column, partition_object=None, p=0.05, bootstrap_samples=None, bootstrap_sample_size=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): column = self[column] if not is_valid_continuous_partition_object(partition_object): @@ -1319,7 +1350,7 @@ def 
expect_column_pair_values_to_be_equal(self, column_A, column_B, ignore_row_if="both_values_are_missing", - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): return column_A == column_B @@ -1332,7 +1363,7 @@ def expect_column_pair_values_A_to_be_greater_than_B(self, parse_strings_as_datetimes=None, allow_cross_type_comparisons=None, ignore_row_if="both_values_are_missing", - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): # FIXME if allow_cross_type_comparisons == True: @@ -1358,7 +1389,7 @@ def expect_column_pair_values_to_be_in_set(self, column_B, value_pairs_set, ignore_row_if="both_values_are_missing", - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): if value_pairs_set is None: # vacuously true @@ -1388,7 +1419,7 @@ def expect_column_pair_values_to_be_in_set(self, def expect_multicolumn_values_to_be_unique(self, column_list, ignore_row_if="all_values_are_missing", - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): threshold = len(column_list.columns) # Do not dropna here, since we have separately dealt with na in decorator diff --git a/great_expectations/dataset/sparkdf_dataset.py b/great_expectations/dataset/sparkdf_dataset.py index ca2803d71ae5..cbc67c660fb9 100644 --- a/great_expectations/dataset/sparkdf_dataset.py +++ b/great_expectations/dataset/sparkdf_dataset.py @@ -555,7 +555,7 @@ def expect_column_values_to_be_in_set( mostly=None, parse_strings_as_datetimes=None, result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None, ): @@ -581,7 +581,7 @@ def expect_column_values_to_not_be_in_set( value_set, # List[Any] mostly=None, result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None, ): @@ -601,7 +601,7 @@ def expect_column_values_to_be_between(self, output_strftime_format=None, allow_cross_type_comparisons=None, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): # NOTE: This function is implemented using native functions instead of UDFs, which is a faster # implementation. 
Please ensure new spark implementations migrate to the new style where possible @@ -640,7 +640,7 @@ def expect_column_values_to_be_between(self, @MetaSparkDFDataset.column_map_expectation def expect_column_value_lengths_to_be_between(self, column, min_value=None, max_value=None, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): if min_value is None and max_value is None: return column.withColumn('__success', lit(True)) elif min_value is None: @@ -662,7 +662,7 @@ def expect_column_values_to_be_unique( column, mostly=None, result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None, ): @@ -676,7 +676,7 @@ def expect_column_value_lengths_to_equal( value, # int mostly=None, result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None, ): @@ -690,7 +690,7 @@ def expect_column_values_to_match_strftime_format( strftime_format, # str mostly=None, result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None, ): @@ -722,7 +722,7 @@ def expect_column_values_to_not_be_null( column, mostly=None, result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None, ): @@ -735,7 +735,7 @@ def expect_column_values_to_be_null( column, mostly=None, result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None, ): @@ -748,7 +748,7 @@ def expect_column_values_to_be_of_type( column, type_, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): if mostly is not None: raise ValueError("SparkDFDataset does not support column map semantics for column types") @@ -785,7 +785,7 @@ def expect_column_values_to_be_in_type_list( column, type_list, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): if mostly is not None: raise ValueError("SparkDFDataset does not support column map semantics for column types") @@ -827,7 +827,7 @@ def expect_column_values_to_match_regex( regex, mostly=None, result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None, ): @@ -841,7 +841,7 @@ def expect_column_values_to_not_match_regex( regex, mostly=None, result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None, ): @@ -855,7 +855,7 @@ def expect_column_pair_values_to_be_equal( column_B, ignore_row_if="both_values_are_missing", result_format=None, - include_config=False, + include_config=True, catch_exceptions=None, meta=None ): diff --git a/great_expectations/dataset/sqlalchemy_dataset.py b/great_expectations/dataset/sqlalchemy_dataset.py index d0dfffb89175..d6874c04be34 100644 --- a/great_expectations/dataset/sqlalchemy_dataset.py +++ b/great_expectations/dataset/sqlalchemy_dataset.py @@ -51,6 +51,42 @@ pybigquery = None +class SqlAlchemyBatchReference(object): + + def __init__(self, engine, table_name=None, schema=None, query=None): + self._engine = engine + if table_name is None and query is None: + raise ValueError("Table_name or query must be specified") + + self._table_name = table_name + self._schema = schema + self._query = query + + def get_init_kwargs(self): + if self._table_name and self._query: + # This is allowed in 
BigQuery where a temporary table name must be provided *with* the + # custom sql to execute. + kwargs = { + "engine": self._engine, + "table_name": self._table_name, + "custom_sql": self._query + } + elif self._table_name: + kwargs = { + "engine": self._engine, + "table_name": self._table_name + } + else: + kwargs = { + "engine": self._engine, + "custom_sql": self._query + } + if self._schema: + kwargs["schema"] = self._schema + + return kwargs + + class MetaSqlAlchemyDataset(Dataset): def __init__(self, *args, **kwargs): @@ -209,6 +245,9 @@ def __init__(self, table_name=None, engine=None, connection_string=None, custom_sql=None, schema=None, *args, **kwargs): if custom_sql and not table_name: + #NOTE: Eugene 2020-01-31: @James, this is a not a proper fix, but without it the "public" schema + #was used for a temp table and raising an error + schema = None # dashes are special characters in most databases so use underscores table_name = "ge_tmp_" + str(uuid.uuid4()).replace("-", "_") generated_table_name = table_name @@ -236,6 +275,10 @@ def __init__(self, table_name=None, engine=None, connection_string=None, if self.engine.dialect.name.lower() in ["postgresql", "mysql", "sqlite", "oracle", "mssql", "oracle"]: # These are the officially included and supported dialects by sqlalchemy self.dialect = import_module("sqlalchemy.dialects." + self.engine.dialect.name) + + if engine and engine.dialect.name.lower() == "sqlite": + # sqlite temp tables only persist within a connection so override the engine + self.engine = engine.connect() elif self.engine.dialect.name.lower() == "snowflake": self.dialect = import_module("snowflake.sqlalchemy.snowdialect") elif self.engine.dialect.name.lower() == "redshift": @@ -310,7 +353,7 @@ def head(self, n=5): def get_row_count(self): count_query = sa.select([sa.func.count()]).select_from( self._table) - return self.engine.execute(count_query).scalar() + return int(self.engine.execute(count_query).scalar()) def get_column_count(self): return len(self.columns) @@ -331,8 +374,8 @@ def get_column_nonnull_count(self, column): ).label('null_count'), ]).select_from(self._table) count_results = dict(self.engine.execute(count_query).fetchone()) - element_count = count_results['element_count'] - null_count = count_results['null_count'] or 0 + element_count = int(count_results.get('element_count') or 0) + null_count = int(count_results.get('null_count') or 0) return element_count - null_count def get_column_sum(self, column): @@ -416,7 +459,7 @@ def get_column_median(self, column): column_median = None elif nonnull_count % 2 == 0: # An even number of column values: take the average of the two center values - column_median = ( + column_median = float( column_values[0][0] + # left center value column_values[1][0] # right center value ) / 2.0 # Average center values @@ -604,7 +647,7 @@ def column_reflection_fallback(self): def expect_column_values_to_be_null(self, column, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): return sa.column(column) == None @@ -614,7 +657,7 @@ def expect_column_values_to_be_null(self, def expect_column_values_to_not_be_null(self, column, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): return sa.column(column) != None @@ -648,7 +691,7 @@ def expect_column_values_to_be_of_type( column, type_, mostly=None, - 
result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): if mostly is not None: raise ValueError("SqlAlchemyDataset does not support column map semantics for column types") @@ -695,7 +738,7 @@ def expect_column_values_to_be_in_type_list( column, type_list, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): if mostly is not None: raise ValueError("SqlAlchemyDataset does not support column map semantics for column types") @@ -748,7 +791,7 @@ def expect_column_values_to_be_in_set(self, value_set, mostly=None, parse_strings_as_datetimes=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): if value_set is None: # vacuously true @@ -767,7 +810,7 @@ def expect_column_values_to_not_be_in_set(self, value_set, mostly=None, parse_strings_as_datetimes=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): if parse_strings_as_datetimes: parsed_value_set = self._parse_value_set(value_set) @@ -787,7 +830,7 @@ def expect_column_values_to_be_between(self, parse_strings_as_datetimes=None, output_strftime_format=None, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): if parse_strings_as_datetimes: if min_value: @@ -842,7 +885,7 @@ def expect_column_value_lengths_to_equal(self, column, value, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): return sa.func.length(sa.column(column)) == value @@ -853,7 +896,7 @@ def expect_column_value_lengths_to_be_between(self, min_value=None, max_value=None, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): if min_value is None and max_value is None: @@ -882,7 +925,7 @@ def expect_column_value_lengths_to_be_between(self, @MetaSqlAlchemyDataset.column_map_expectation def expect_column_values_to_be_unique(self, column, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): # Duplicates are found by filtering a group by query dup_query = sa.select([sa.column(column)]).\ select_from(self._table).\ @@ -932,7 +975,7 @@ def expect_column_values_to_match_regex( column, regex, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): regex_fn = self._get_dialect_regex_fn(positive=True) @@ -948,7 +991,7 @@ def expect_column_values_to_not_match_regex( column, regex, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None + result_format=None, include_config=True, catch_exceptions=None, meta=None ): regex_fn = self._get_dialect_regex_fn(positive=False) if regex_fn is None: @@ -963,7 +1006,7 @@ def expect_column_values_to_match_regex_list(self, regex_list, match_on="any", mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None 
+ result_format=None, include_config=True, catch_exceptions=None, meta=None ): if match_on not in ["any", "all"]: @@ -989,7 +1032,7 @@ def expect_column_values_to_match_regex_list(self, @MetaSqlAlchemyDataset.column_map_expectation def expect_column_values_to_not_match_regex_list(self, column, regex_list, mostly=None, - result_format=None, include_config=False, catch_exceptions=None, meta=None): + result_format=None, include_config=True, catch_exceptions=None, meta=None): regex_fn = self._get_dialect_regex_fn(positive=False) if regex_fn is None: diff --git a/great_expectations/datasource/__init__.py b/great_expectations/datasource/__init__.py index 7dd4adc3f96a..a74e8404ae44 100644 --- a/great_expectations/datasource/__init__.py +++ b/great_expectations/datasource/__init__.py @@ -2,4 +2,3 @@ from .pandas_datasource import PandasDatasource from .sqlalchemy_datasource import SqlAlchemyDatasource from .sparkdf_datasource import SparkDFDatasource -from .dbt_datasource import DBTDatasource diff --git a/great_expectations/datasource/datasource.py b/great_expectations/datasource/datasource.py index 58879db65f4c..8c83c4493366 100644 --- a/great_expectations/datasource/datasource.py +++ b/great_expectations/datasource/datasource.py @@ -7,21 +7,13 @@ from ruamel.yaml import YAML -from great_expectations.data_context.types import ( - DataAssetIdentifier, - NormalizedDataAssetName, -) from great_expectations.data_context.util import ( load_class, instantiate_class_from_config ) -from great_expectations.data_asset.util import get_empty_expectation_suite -from great_expectations.exceptions import BatchKwargsError -from great_expectations.datasource.types import ReaderMethods + from great_expectations.types import ClassConfig -from great_expectations.exceptions import InvalidConfigError import warnings -from importlib import import_module logger = logging.getLogger(__name__) yaml = YAML() @@ -29,20 +21,24 @@ class Datasource(object): - """Datasources are responsible for connecting data and compute infrastructure. Each Datasource provides - Great Expectations DataAssets (or batches in a DataContext) connected to a specific compute environment, such as a - SQL database, a Spark cluster, or a local in-memory Pandas DataFrame. Datasources know how to access data from + """A Datasource connects to a compute environment and one or more storage environments and produces batches of data + that Great Expectations can validate in that compute environment. + + Each Datasource provides Batches connected to a specific compute environment, such as a + SQL database, a Spark cluster, or a local in-memory Pandas DataFrame. + + Datasources use Batch Kwargs to specify instructions for how to access data from relevant sources such as an existing object from a DAG runner, a SQL database, S3 bucket, or local filesystem. To bridge the gap between those worlds, Datasources interact closely with *generators* which - are aware of a source of data and can produce produce identifying information, called - "batch_kwargs" that datasources can use to get individual batches of data. They add flexibility + are aware of a source of data and can produce identifying information, called + "batch_kwargs" that datasources can use to get individual batches of data. They add flexibility in how to obtain data such as with time-based partitioning, downsampling, or other techniques appropriate for the datasource.
diff --git a/great_expectations/datasource/__init__.py b/great_expectations/datasource/__init__.py
index 7dd4adc3f96a..a74e8404ae44 100644
--- a/great_expectations/datasource/__init__.py
+++ b/great_expectations/datasource/__init__.py
@@ -2,4 +2,3 @@
 from .pandas_datasource import PandasDatasource
 from .sqlalchemy_datasource import SqlAlchemyDatasource
 from .sparkdf_datasource import SparkDFDatasource
-from .dbt_datasource import DBTDatasource
diff --git a/great_expectations/datasource/datasource.py b/great_expectations/datasource/datasource.py
index 58879db65f4c..8c83c4493366 100644
--- a/great_expectations/datasource/datasource.py
+++ b/great_expectations/datasource/datasource.py
@@ -7,21 +7,13 @@
 from ruamel.yaml import YAML

-from great_expectations.data_context.types import (
-    DataAssetIdentifier,
-    NormalizedDataAssetName,
-)
 from great_expectations.data_context.util import (
     load_class,
     instantiate_class_from_config
 )
-from great_expectations.data_asset.util import get_empty_expectation_suite
-from great_expectations.exceptions import BatchKwargsError
-from great_expectations.datasource.types import ReaderMethods
+
 from great_expectations.types import ClassConfig
-from great_expectations.exceptions import InvalidConfigError
 import warnings
-from importlib import import_module

 logger = logging.getLogger(__name__)
 yaml = YAML()
@@ -29,20 +21,24 @@

 class Datasource(object):
-    """Datasources are responsible for connecting data and compute infrastructure. Each Datasource provides
-    Great Expectations DataAssets (or batches in a DataContext) connected to a specific compute environment, such as a
-    SQL database, a Spark cluster, or a local in-memory Pandas DataFrame. Datasources know how to access data from
+    """A Datasource connects to a compute environment and one or more storage environments and produces batches of data
+    that Great Expectations can validate in that compute environment.
+
+    Each Datasource provides Batches connected to a specific compute environment, such as a
+    SQL database, a Spark cluster, or a local in-memory Pandas DataFrame.
+
+    Datasources use Batch Kwargs to specify instructions for how to access data from
     relevant sources such as an existing object from a DAG runner, a SQL database, S3 bucket, or local filesystem.

     To bridge the gap between those worlds, Datasources interact closely with *generators* which
-    are aware of a source of data and can produce produce identifying information, called
-    "batch_kwargs" that datasources can use to get individual batches of data. They add flexibility
+    are aware of a source of data and can produce identifying information, called
+    "batch_kwargs", that datasources can use to get individual batches of data. They add flexibility
     in how to obtain data such as with time-based partitioning, downsampling, or other techniques
     appropriate for the datasource.

     For example, a generator could produce a SQL query that logically represents "rows in the Events
     table with a timestamp on February 7, 2012," which a SqlAlchemyDatasource could use to materialize
-    a SqlAlchemyDataset corresponding to that batch of data and ready for validation.
+    a SqlAlchemyDataset corresponding to that batch of data and ready for validation.

     Opinionated DAG managers such as airflow, dbt, prefect.io, and dagster can also act as datasources
     and/or generators for a more generic datasource.
@@ -50,6 +46,7 @@ class Datasource(object):
     When adding custom expectations by subclassing an existing DataAsset type, use the data_asset_type parameter
     to configure the datasource to load and return DataAssets of the custom type.
     """
+    recognized_batch_parameters = {'limit'}

     @classmethod
     def from_configuration(cls, **kwargs):
@@ -66,7 +63,14 @@ def from_configuration(cls, **kwargs):
         return cls(**kwargs)

     @classmethod
-    def build_configuration(cls, class_name, module_name="great_expectations.datasource", data_asset_type=None, generators=None, **kwargs):
+    def build_configuration(
+            cls,
+            class_name,
+            module_name="great_expectations.datasource",
+            data_asset_type=None,
+            generators=None,
+            **kwargs
+    ):
         """
         Build a full configuration object for a datasource, potentially including generators with defaults.

@@ -102,29 +106,30 @@ def __init__(self, name, data_context=None, data_asset_type=None, generators=None, **kwargs):
                 "String-only configuration for data_asset_type is deprecated. Use module_name and class_name instead.",
                 DeprecationWarning)
             self._data_asset_type = data_asset_type
-        self._generators = {}
-        if generators is None:
-            generators = {}
         self._datasource_config = kwargs
+        self._generators = {}

-        self._datasource_config.update({
-            "generators": generators,
-            "data_asset_type": data_asset_type
-        })
+        self._datasource_config["data_asset_type"] = data_asset_type
+        if generators is not None:
+            self._datasource_config["generators"] = generators

     @property
-    def data_context(self):
+    def name(self):
         """
-        Property for attached DataContext
+        Property for datasource name
         """
-        return self._data_context
+        return self._name

     @property
-    def name(self):
+    def config(self):
+        return copy.deepcopy(self._datasource_config)
+
+    @property
+    def data_context(self):
         """
-        Property for datasource name
+        Property for attached DataContext
         """
-        return self._name
+        return self._data_context

     def _build_generators(self):
         """
@@ -133,31 +138,46 @@ def _build_generators(self):
         Returns:
             None
         """
-        for generator in self._datasource_config["generators"].keys():
-            self.get_generator(generator)
+        try:
+            for generator in self._datasource_config["generators"].keys():
+                self.get_generator(generator)
+        except KeyError:
+            pass

-    def get_config(self):
-        """
-        Get the current configuration.
+    def add_generator(self, name, class_name, **kwargs):
+        """Add a generator to the datasource.
+
+        Args:
+            name (str): the name of the new generator to add
+            class_name: class of the generator to add
+            kwargs: additional keyword arguments will be passed directly to the new generator's constructor

         Returns:
-            datasource configuration dictionary
+            generator (Generator)
         """
-        return self._datasource_config

-    def build_generator(self, **kwargs):
+        # PENDING DELETION - 20200130 - JPC
+        # 0.9.0 removes support for the type system
+        # if isinstance(generator_config, string_types):
+        #     warnings.warn("Configuring generators with a type name is no longer supported. Please update to new-style "
+        #                   "configuration.")
+        #     generator_config = {
+        #         "type": generator_config
+        #     }
+        # generator_config.update(kwargs)
+        kwargs["class_name"] = class_name
+        generator = self._build_generator(**kwargs)
+        if "generators" not in self._datasource_config:
+            self._datasource_config["generators"] = dict()
+        self._datasource_config["generators"][name] = kwargs
+
+        return generator
+
+    def _build_generator(self, **kwargs):
         """Build a generator using the provided configuration and return the newly-built generator."""
-        if "type" in kwargs:
-            warnings.warn("Using type to configure generators is now deprecated. Please use module_name and class_name"
-                          "instead.")
-            type_ = kwargs.pop("type")
-            generator_class = self._get_generator_class_from_type(type_)
-            kwargs.update({
-                "class_name": generator_class.__name__
-            })
         generator = instantiate_class_from_config(
             config=kwargs,
-            runtime_config={
+            runtime_environment={
                 "datasource": self
             },
             config_defaults={
@@ -166,30 +186,7 @@ def build_generator(self, **kwargs):
         )
         return generator

-    def add_generator(self, name, generator_config, **kwargs):
-        """Add a generator to the datasource.
-
-        Args:
-            name (str): the name of the new generator to add
-            generator_config: the configuration parameters to add to the datasource
-            kwargs: additional keyword arguments will be passed directly to the new generator's constructor
-
-        Returns:
-            generator (Generator)
-        """
-        if isinstance(generator_config, string_types):
-            warnings.warn("Configuring generators with a type name is no longer supported. Please update to new-style "
-                          "configuration.")
-            generator_config = {
-                "type": generator_config
-            }
-        generator_config.update(kwargs)
-        generator = self.build_generator(**generator_config)
-        self._datasource_config["generators"][name] = generator_config
-
-        return generator
-
-    def get_generator(self, generator_name="default"):
+    def get_generator(self, generator_name):
         """Get the (named) generator from a datasource

         Args:
@@ -200,13 +197,13 @@ def get_generator(self, generator_name):
         """
         if generator_name in self._generators:
             return self._generators[generator_name]
-        elif generator_name in self._datasource_config["generators"]:
+        elif "generators" in self._datasource_config and generator_name in self._datasource_config["generators"]:
             generator_config = copy.deepcopy(self._datasource_config["generators"][generator_name])
         else:
             raise ValueError(
                 "Unable to load generator %s -- no configuration found or invalid configuration." % generator_name
             )
-        generator = self.build_generator(**generator_config)
+        generator = self._build_generator(**generator_config)
         self._generators[generator_name] = generator
         return generator

@@ -217,126 +214,69 @@ def list_generators(self):
             List(dict): each dictionary includes "name" and "type" keys
         """
         generators = []
-        # NOTE: 20190916 - JPC - Upon deprecation of support for type: configuration, this can be simplified
-        for key, value in self._datasource_config["generators"].items():
-            if "type" in value:
-                logger.warning("Generator %s configured using type. Please use class_name instead." % key)
-                generators.append({
-                    "name": key,
-                    "type": value["type"],
-                    "class_name": self._get_generator_class_from_type(value["type"]).__name__
-                })
-            else:
+
+        if "generators" in self._datasource_config:
+            for key, value in self._datasource_config["generators"].items():
                 generators.append({
                     "name": key,
                     "class_name": value["class_name"]
                 })
+        else:
+            generators.append({
+                "name": None,
+                "class_name": None
+            })

         return generators

-    def get_batch(self, data_asset_name, expectation_suite_name, batch_kwargs, **kwargs):
-        """
-        Get a batch of data from the datasource.
-
-        If a DataContext is attached, then expectation_suite_name can be used to define an expectation suite to
-        attach to the data_asset being fetched. Otherwise, the expectation suite will be empty.
-
-        If no batch_kwargs are specified, the next kwargs for the named data_asset will be fetched from the generator
-        first.
-
-        Specific datasource types implement the internal _get_data_asset method to use appropriate batch_kwargs to
-        construct and return GE data_asset objects.
+    def process_batch_parameters(self, limit=None):
+        """Use datasource-specific configuration to translate any batch parameters into batch kwargs at the datasource
+        level.

         Args:
-            data_asset_name: the name of the data asset for which to fetch data.
-            expectation_suite_name: the name of the expectation suite to attach to the batch
-            batch_kwargs: dictionary of key-value pairs describing the batch to get, or a single identifier if \
-            that can be unambiguously translated to batch_kwargs
-            **kwargs: Additional key-value pairs to pass to the datasource, such as reader parameters
+            limit (int): a parameter all datasources must accept to allow limiting a batch to a smaller number of rows.

         Returns:
-            A data_asset consisting of the specified batch of data with the named expectation suite connected.
-
+            batch_parameters, batch_kwargs: a tuple containing all defined batch_parameters and batch_kwargs. Result
+            will include both parameters passed via argument and configured parameters.
         """
-        if isinstance(data_asset_name, NormalizedDataAssetName):  # this richer type can include more metadata
-            if self._data_context is not None:
-                expectation_suite = self._data_context.get_expectation_suite(
-                    data_asset_name,
-                    expectation_suite_name
-                )
-            else:
-                expectation_suite = None
-                # If data_context is not set, we cannot definitely use a fully normalized data_asset reference.
-                # This would mean someone got a normalized name without a data context which is unusual
-                logger.warning(
-                    "Using NormalizedDataAssetName type without a data_context could result in unexpected behavior: "
-                    "using '/' as a default delimiter."
-                )
-        else:
-            expectation_suite = get_empty_expectation_suite(data_asset_name=data_asset_name,
-                                                            expectation_suite_name=expectation_suite_name)
+        batch_kwargs = self._datasource_config.get("batch_kwargs", {})

-        # Support partition_id or other mechanisms of building batch_kwargs
-        if not isinstance(batch_kwargs, dict):
-            batch_kwargs = self.build_batch_kwargs(data_asset_name, batch_kwargs)
+        if limit is not None:
+            batch_kwargs["limit"] = limit

-        return self._get_data_asset(batch_kwargs, expectation_suite, **kwargs)
+        return batch_kwargs

-    def get_data_asset(self,
-                       generator_asset,
-                       generator_name=None,
-                       expectation_suite=None,
-                       batch_kwargs=None,
-                       **kwargs):
-        """
-        Get a DataAsset using a datasource. generator_asset and generator_name are required.
+    def get_batch(self, batch_kwargs, batch_parameters=None):
+        """Get a batch of data from the datasource.

         Args:
-            generator_asset: The name of the asset as identified by the generator to return.
-            generator_name: The name of the configured generator to use.
-            expectation_suite: The expectation suite to attach to the data_asset
-            batch_kwargs: Additional batch_kwargs that can
-            **kwargs: Additional kwargs that can be used to supplement batch_kwargs
-
-        Returns:
-            DataAsset
-        """
-        if batch_kwargs is None:
-            # noinspection PyUnboundLocalVariable
-            generator = self.get_generator(generator_name)
-            if generator is not None:
-                batch_kwargs = generator.yield_batch_kwargs(generator_asset, **kwargs)
+            batch_kwargs: the BatchKwargs to use to construct the batch
+            batch_parameters: optional parameters to store as the reference description of the batch. They should
+                reflect the parameters that would produce the passed BatchKwargs.

-        return self._get_data_asset(batch_kwargs, expectation_suite, **kwargs)
-
-    def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs):
-        """
-        Internal implementation of batch fetch logic. Note that this must be overridden by datasource implementations.
-
-        Args:
-            batch_kwargs: the identifying information to use to fetch the batch.
-            expectation_suite: the expectation suite to attach to the batch.
-            **kwargs: additional key-value pairs to use when fetching the batch of data

         Returns:
-            A data_asset consisting of the specified batch of data with the named expectation suite connected.
+            Batch
         """
         raise NotImplementedError

     def get_available_data_asset_names(self, generator_names=None):
-        """Returns a dictionary of data_asset_names that the specified generator can provide. Note that some generators,
-        such as the "no-op" in-memory generator may not be capable of describing specific named data assets, and some
+        """Returns a dictionary of data_asset_names that the specified generator can provide. Note that some generators
+        may not be capable of describing specific named data assets, and some
         generators (such as filesystem glob generators) require the user to configure data asset names.

         Args:
-            generator_names: the generators for which to fetch available data asset names.
+            generator_names: the generators for which to get available data asset names.

         Returns:
            dictionary consisting of sets of generator assets available for the specified generators:
             ::

                 {
-                  generator_name: [ data_asset_1, data_asset_2, ... ]
+                  generator_name: {
+                    names: [ (data_asset_1, data_asset_1_type), (data_asset_2, data_asset_2_type) ... ]
+                  }
                   ...
                 }

@@ -352,147 +292,8 @@ def get_available_data_asset_names(self, generator_names=None):
             available_data_asset_names[generator_name] = generator.get_available_data_asset_names()
         return available_data_asset_names

-    def build_batch_kwargs(self, data_asset_name, *args, **kwargs):
-        """
-        Build batch kwargs for a requested data_asset. Try to use a generator where possible to support partitioning,
-        but fall back to datasource-default behavior if the generator cannot be identified.
-
-        Args:
-            data_asset_name: the data asset for which to build batch_kwargs; if a normalized name is provided,
-            use the named generator.
-            *args: at most exactly one positional argument can be provided from which to build kwargs
-            **kwargs: additional keyword arguments to be used to build the batch_kwargs
-
-        Returns:
-            A PandasDatasourceBatchKwargs object suitable for building a batch of data from this datasource
-
-        """
-        if isinstance(data_asset_name, (NormalizedDataAssetName, DataAssetIdentifier)):
-            generator_name = data_asset_name.generator
-            generator_asset = data_asset_name.generator_asset
-        elif len(self._datasource_config["generators"]) == 1:
-            logger.warning("Falling back to only configured generator to build batch_kwargs; consider explicitly "
-                           "declaring the generator using named_generator_build_batch_kwargs or a DataAssetIdentifier.")
-            generator_name = list(self._datasource_config["generators"].keys())[0]
-            generator_asset = data_asset_name
-        else:
-            raise BatchKwargsError(
-                "Unable to determine generator. Consider using named_generator_build_batch_kwargs or a "
-                "DataAssetIdentifier.",
-                {"args": args,
-                 "kwargs": kwargs}
-            )
-
-        return self.named_generator_build_batch_kwargs(
-            generator_name,
-            generator_asset,
-            *args,
-            **kwargs
-        )
-
-    def named_generator_build_batch_kwargs(self, generator_name, generator_asset, partition_id=None, **kwargs):
-        """Use the named generator to build batch_kwargs"""
-        generator = self.get_generator(generator_name=generator_name)
-        if partition_id:
-            batch_kwargs = generator.build_batch_kwargs_from_partition_id(
-                generator_asset=generator_asset,
-                partition_id=partition_id,
-                **kwargs
-            )
-        else:
-            if len(kwargs) > 0:
-                batch_kwargs = generator.yield_batch_kwargs(generator_asset, **kwargs)
-            else:
-                raise BatchKwargsError(
-                    "Unable to build batch_kwargs: no partition_id or base kwargs found to pass to generator.",
-                    batch_kwargs=kwargs
-                )
-
-        return batch_kwargs
-
-    def get_data_context(self):
-        """Getter for the currently-configured data context."""
-        return self._data_context
-
-    @staticmethod
-    def _guess_reader_method_from_path(path):
-        """Static helper for parsing reader types from file path extensions.
-
-        Args:
-            path (str): the to use to guess
-
-        Returns:
-            ReaderMethod to use for the filepath
-
-        """
-        if path.endswith(".csv") or path.endswith(".tsv"):
-            return ReaderMethods.CSV
-        elif path.endswith(".parquet"):
-            return ReaderMethods.parquet
-        elif path.endswith(".xlsx") or path.endswith(".xls"):
-            return ReaderMethods.excel
-        elif path.endswith(".json"):
-            return ReaderMethods.JSON
-        elif path.endswith(".csv.gz") or path.endswith(".csv.gz"):
-            return ReaderMethods.CSV_GZ
-        elif path.endswith(".pkl"):
-            return ReaderMethods.pickle
-        else:
-            return None
-
-    def _get_generator_class_from_type(self, type_):
-        """DEPRECATED.
-
-        This method can be used to support legacy-style type-only declaration of generators."""
-        raise NotImplementedError
-
-    def _get_data_asset_class(self, data_asset_type):
-        """Returns the class to be used to generate a data_asset from this datasource"""
-        if isinstance(data_asset_type, string_types):
-            # We have a custom type, but it is defined with only a string
-            try:
-                logger.warning("Use of custom_data_assets module is deprecated. Please define data_asset_type"
-                               "using a module_name and class_name.")
-                # FOR LEGACY REASONS support the fixed "custom_data_assets" name
-                # FIXME: this option should be removed in a future release
-                custom_data_assets_module = __import__("custom_data_assets", fromlist=["custom_data_assets"])
-                data_asset_type_class = getattr(custom_data_assets_module, data_asset_type)
-                return data_asset_type_class
-            except ImportError:
-                logger.error(
-                    "Unable to import custom_data_asset module. "
-                    "Check the plugins directory for 'custom_data_assets'."
-                )
-                raise InvalidConfigError(
-                    "Unable to import custom_data_asset module. "
-                    "Check the plugins directory for 'custom_data_assets'."
-                )
-            except AttributeError:
-                logger.error(
-                    "Unable to find data_asset_type: '%s'." % data_asset_type
-                )
-                raise InvalidConfigError("Unable to find data_asset_type: '%s'." % data_asset_type)
-        elif isinstance(data_asset_type, ClassConfig):
-            try:
-                if data_asset_type.module_name is None:
-                    data_asset_type.module_name = "great_expectations.dataset"
-
-                loaded_module = import_module(data_asset_type.module_name)
-                data_asset_type_class = getattr(loaded_module, data_asset_type.class_name)
-                return data_asset_type_class
-            except ImportError:
-                logger.error(
-                    "Unable to find module '%s'." % data_asset_type.module_name
-                )
-                raise InvalidConfigError("Unable to find module '%s'." % data_asset_type.module_name)
-            except AttributeError:
-                logger.error(
-                    "Unable to find data_asset_type: '%s' in module '%s'."
-                    % (data_asset_type.class_name, data_asset_type.module_name)
-                )
-                raise InvalidConfigError(
-                    "Unable to find data_asset_type: '%s' in module '%s'."
-                    % (data_asset_type.class_name, data_asset_type.module_name)
-                )
-        else:
-            raise InvalidConfigError("Invalid configuration for data_asset_type")
+    def build_batch_kwargs(self, generator, name=None, partition_id=None, **kwargs):
+        generator_obj = self.get_generator(generator)
+        if partition_id is not None:
+            kwargs["partition_id"] = partition_id
+        return generator_obj.build_batch_kwargs(name=name, **kwargs)
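Taken together, the new build_batch_kwargs and get_batch replace the old name-normalized flow with a two-step pattern: ask a named generator for batch kwargs, then fetch the batch. A sketch under the 0.9.0 API, where the datasource, generator, asset, and suite names are hypothetical placeholders:

import great_expectations as ge

context = ge.data_context.DataContext()
datasource = context.get_datasource("my_datasource")  # hypothetical name

# Step 1: a named generator translates batch parameters into batch kwargs.
batch_kwargs = datasource.build_batch_kwargs(
    "default",                  # generator name
    name="npi",                 # logical asset name
    partition_id="2020-01-04",  # optional partition
)

# Step 2: the DataContext attaches the expectation suite and returns a Batch.
batch = context.get_batch(batch_kwargs, expectation_suite_name="npi.warning")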
diff --git a/great_expectations/datasource/dbt_datasource.py b/great_expectations/datasource/dbt_datasource.py
deleted file mode 100644
index 1910602a95f0..000000000000
--- a/great_expectations/datasource/dbt_datasource.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import os
-import time
-import logging
-import errno
-
-from ruamel.yaml import YAML
-
-from .sqlalchemy_datasource import SqlAlchemyDatasource
-from great_expectations.datasource.generator.batch_generator import BatchGenerator
-
-yaml = YAML(typ='safe')
-logger = logging.getLogger(__name__)
-
-try:
-    import sqlalchemy
-    from sqlalchemy import create_engine, MetaData
-except ImportError:
-    logger.debug("Unable to import sqlalchemy.")
-
-
-class DBTModelGenerator(BatchGenerator):
-    """This is a helper class that makes using great expectations with dbt easy!"""
-
-    def __init__(self, name="dbt_models", datasource=None):
-        super(DBTModelGenerator, self).__init__(name, type_="dbt_models", datasource=datasource)
-        self.dbt_target_path = datasource.dbt_target_path
-
-    def _get_iterator(self, data_asset_name, **kwargs):
-        """
-        Read compiled SQL of a dbt model.
-
-        :param data_asset_name: model name. For model file blah/boo/mymodel.sql, pass the value "blah/boo/mymodel"
-
-        :return: iterator over batch_kwargs with a query parameter equal to the content of the relevant model file
-        """
-        try:
-            with open(os.path.join(self.dbt_target_path, data_asset_name) + ".sql", "r") as data:
-                return iter([{
-                    "query": data.read(),
-                    "timestamp": time.time()
-                }])
-        except IOError as e:
-            if e.errno == errno.NOENT:
-                raise IOError(
-                    "dbt model %s was not found in the compiled directory. Please run `dbt compile` or `dbt run` and try again. Or, check the directory." % data_asset_name
-                )
-            else:
-                raise
-
-    def get_available_data_asset_names(self):
-        return set([path for path in os.walk(self.dbt_target_path) if path.endswith(".sql")])
-
-
-class DBTDatasource(SqlAlchemyDatasource):
-    """
-    A DBTDataSource creates a SQLAlchemy connection to the database used by a dbt project.
-
-    and allows to create, manage and validate expectations on the models that exist in that dbt project.
-    """
-
-    def __init__(self,
-                 name="dbt",
-                 data_context=None,
-                 generators=None,
-                 profile="default",
-                 project_filepath="dbt_project.yml",
-                 profiles_filepath="~/.dbt/profiles.yml",
-                 **kwargs
-                 ):
-        if generators is None:
-            generators = {
-                "dbt_models": {"type": "dbt_models"}
-            }
-        super(DBTDatasource, self).__init__(name, type_="dbt", data_context=data_context, generators=generators)
-        self._datasource_config.update({
-            "profile": profile,
-            "project_filepath": project_filepath,
-            "profiles_filepath": profiles_filepath
-        })
-        self._datasource_config.update(kwargs)
-
-        with open(os.path.join(self._data_context.root_directory,
-                               self._datasource_config["project_filepath"]), "r") as f:
-            self._dbt_project = yaml.load(f) or {}
-
-        self.dbt_target_path = os.path.join(
-            self._data_context.root_directory,
-            self._dbt_project["target-path"],
-            "compiled",
-            self._dbt_project["name"],
-        )
-
-        self._options = self._get_sqlalchemy_connection_options()
-        self._connect(self._get_sqlalchemy_connection_options(**kwargs))
-        self._build_generators()
-
-    def _get_sqlalchemy_connection_options(self, **kwargs):
-        with open(os.path.expanduser(self._datasource_config["profiles_filepath"]), "r") as data:
-            profiles_config = yaml.load(data) or {}
-
-        target = profiles_config[self._datasource_config["profile"]]["target"]
-        db_config = profiles_config[self._datasource_config["profile"]]["outputs"][target]
-        options = \
-            sqlalchemy.engine.url.URL(
-                db_config["type"],
-                username=db_config["user"],
-                password=db_config["pass"],
-                host=db_config["host"],
-                port=db_config["port"],
-                database=db_config["dbname"],
-            )
-        return options
-
-    def _get_generator_class(self, type_):
-        if type_ == "dbt_models":
-            return DBTModelGenerator
-        else:
-            raise ValueError("Unrecognized DataAssetGenerator type %s" % type_)
-
-    def build_batch_kwargs(self, *args, **kwargs):
-        if len(args) > 0:
-            # Allow a model name here
-            generator = self.get_generator()
-            if isinstance(generator, DBTModelGenerator):
-                batch_kwargs = generator.yield_batch_kwargs(args[0])
-            else:
-                batch_kwargs = {}
-        else:
-            batch_kwargs = {}
-        batch_kwargs.update({
-            "timestamp": time.time()
-        })
-        return batch_kwargs
diff --git a/great_expectations/datasource/generator/__init__.py b/great_expectations/datasource/generator/__init__.py
index 5e7296daa6fd..1f69b490fd14 100644
--- a/great_expectations/datasource/generator/__init__.py
+++ b/great_expectations/datasource/generator/__init__.py
@@ -1,7 +1,7 @@
-from .databricks_generator import DatabricksTableGenerator
-from .glob_reader_generator import GlobReaderGenerator
-from .subdir_reader_generator import SubdirReaderGenerator
-from .in_memory_generator import InMemoryGenerator
-from .query_generator import QueryGenerator
-from .table_generator import TableGenerator
-from .s3_generator import S3Generator
+from .databricks_generator import DatabricksTableBatchKwargsGenerator
+from .glob_reader_generator import GlobReaderBatchKwargsGenerator
+from .subdir_reader_generator import SubdirReaderBatchKwargsGenerator
+from .query_generator import QueryBatchKwargsGenerator
+from .table_generator import TableBatchKwargsGenerator
+from .s3_generator import S3GlobReaderBatchKwargsGenerator
+from .manual_generator import ManualBatchKwargsGenerator
diff --git a/great_expectations/datasource/generator/batch_generator.py b/great_expectations/datasource/generator/batch_kwargs_generator.py
similarity index 74%
rename from great_expectations/datasource/generator/batch_generator.py
rename to great_expectations/datasource/generator/batch_kwargs_generator.py
index 144f2dfb5c43..887b7d992c32 100644
--- a/great_expectations/datasource/generator/batch_generator.py
+++ b/great_expectations/datasource/generator/batch_kwargs_generator.py
@@ -2,13 +2,16 @@

 import logging

-from great_expectations.datasource.types import BatchKwargs
+from six import string_types
+
+from great_expectations.core.id_dict import BatchKwargs

 logger = logging.getLogger(__name__)


-class BatchGenerator(object):
-    """Generators produce identifying information, called "batch_kwargs" that datasources
+class BatchKwargsGenerator(object):
+    """
+    BatchKwargsGenerators produce identifying information, called "batch_kwargs" that datasources
     can use to get individual batches of data. They add flexibility in how to obtain data
     such as with time-based partitioning, downsampling, or other techniques appropriate
     for the datasource.
@@ -36,7 +39,7 @@ class BatchKwargsGenerator(object):
             # and "data_asset_2" keys. The file_logs asset will be partitioned according to the match group
             # defined in partition_regex
             default:
-                class_name: GlobReaderGenerator
+                class_name: GlobReaderBatchKwargsGenerator
                 base_directory: /var/logs
                 reader_options:
                   sep: "
@@ -53,7 +56,7 @@ class BatchKwargsGenerator(object):
             # This generator will create one data asset per subdirectory in /data
             # Each asset will have partitions corresponding to the filenames in that subdirectory
             default:
-                class_name: SubdirReaderGenerator
+                class_name: SubdirReaderBatchKwargsGenerator
                 reader_options:
                   sep: "
                 base_directory: /data
@@ -64,21 +67,28 @@ class BatchKwargsGenerator(object):
             # This generator will search for a file named with the name of the requested generator asset and the
             # .sql suffix to open with a query to use to generate data
             default:
-                class_name: QueryGenerator
+                class_name: QueryBatchKwargsGenerator

     """

     _batch_kwargs_type = BatchKwargs
+    recognized_batch_parameters = set()

-    def __init__(self, name, datasource=None):
+    def __init__(self, name, datasource):
         self._name = name
         self._generator_config = {
             "class_name": self.__class__.__name__
         }
         self._data_asset_iterators = {}
+        if datasource is None:
+            raise ValueError("datasource must be provided for a BatchKwargsGenerator")
         self._datasource = datasource

+    @property
+    def name(self):
+        return self._name
+
     def _get_iterator(self, generator_asset, **kwargs):
         raise NotImplementedError

@@ -121,18 +131,26 @@ def get_iterator(self, generator_asset, **kwargs):
             self.reset_iterator(generator_asset, **kwargs)
             return self._data_asset_iterators[generator_asset][0]

-    def build_batch_kwargs_from_partition_id(self, generator_asset, partition_id=None, batch_kwargs=None, **kwargs):
-        """
-        Build batch kwargs for the named generator_asset based on partition_id and optionally existing batch_kwargs.
-
-        Args:
-            generator_asset: the generator_asset for which to build batch_kwargs
-            partition_id: the partition id
-            batch_kwargs: any existing batch_kwargs object to use. Will be supplemented with configured information.
-            **kwargs: any addition kwargs to use. Will be added to returned batch_kwargs
-
-        Returns: BatchKwargs object
-
-        """
+    def build_batch_kwargs(self, name=None, partition_id=None, **kwargs):
+        """Build batch kwargs from batch parameters, merging explicit arguments with configured generator and
+        datasource parameters."""
+        if name is not None:
+            batch_parameters = {"name": name}
+        else:
+            batch_parameters = dict()
+        if partition_id is not None:
+            batch_parameters["partition_id"] = partition_id
+        batch_parameters.update(kwargs)
+        param_keys = set(batch_parameters.keys())
+        recognized_params = (self.recognized_batch_parameters | self._datasource.recognized_batch_parameters)
+        if not param_keys <= recognized_params:
+            logger.warning("Unrecognized batch_parameter(s): %s" % str(param_keys - recognized_params))
+
+        batch_kwargs = self._build_batch_kwargs(batch_parameters)
+        # Track the datasource *in batch_kwargs* when building from a context so that the context can easily reuse them.
+        batch_kwargs["datasource"] = self._datasource.name
+        return batch_kwargs
+
+    def _build_batch_kwargs(self, batch_parameters):
         raise NotImplementedError

     def yield_batch_kwargs(self, generator_asset, **kwargs):
@@ -145,18 +163,22 @@ def yield_batch_kwargs(self, generator_asset, **kwargs):
             self.reset_iterator(generator_asset, **kwargs)
         data_asset_iterator, passed_kwargs = self._data_asset_iterators[generator_asset]
         try:
-            return next(data_asset_iterator)
+            batch_kwargs = next(data_asset_iterator)
+            batch_kwargs["datasource"] = self._datasource.name
+            return batch_kwargs
         except StopIteration:
             self.reset_iterator(generator_asset, **kwargs)
             data_asset_iterator, passed_kwargs = self._data_asset_iterators[generator_asset]
             if passed_kwargs != kwargs:
                 logger.warning(
-                    "Asked to yield batch_kwargs using different supplemental kwargs. Resetting iterator to "
-                    "use different supplemental kwargs.")
+                    "Asked to yield batch_kwargs using different batch parameters. Resetting iterator to "
+                    "use different batch parameters.")
                 self.reset_iterator(generator_asset, **kwargs)
                 data_asset_iterator, passed_kwargs = self._data_asset_iterators[generator_asset]
             try:
-                return next(data_asset_iterator)
+                batch_kwargs = next(data_asset_iterator)
+                batch_kwargs["datasource"] = self._datasource.name
+                return batch_kwargs
             except StopIteration:
                 # This is a degenerate case in which no kwargs are actually being generated
                 logger.warning("No batch_kwargs found for generator_asset %s" % generator_asset)
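The renamed base class also centralizes parameter handling: build_batch_kwargs merges explicit parameters, warns on anything outside the union of the generator's and datasource's recognized_batch_parameters, delegates to _build_batch_kwargs, and stamps the datasource name on the result. A minimal custom subclass sketch (the class and query shown are hypothetical, not part of this change):

from great_expectations.datasource.generator.batch_kwargs_generator import BatchKwargsGenerator


class StaticQueryBatchKwargsGenerator(BatchKwargsGenerator):
    # Declare the batch parameters this generator understands; anything
    # else passed to build_batch_kwargs() triggers a logged warning.
    recognized_batch_parameters = {"name"}

    def _build_batch_kwargs(self, batch_parameters):
        # build_batch_kwargs() has already merged and checked parameters and
        # will add the "datasource" key to whatever we return here.
        return {"query": "SELECT * FROM {}".format(batch_parameters["name"])}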
diff --git a/great_expectations/datasource/generator/databricks_generator.py b/great_expectations/datasource/generator/databricks_generator.py
index efc75366c833..10f14b0009ed 100644
--- a/great_expectations/datasource/generator/databricks_generator.py
+++ b/great_expectations/datasource/generator/databricks_generator.py
@@ -1,7 +1,7 @@
 import time
 import logging

-from great_expectations.datasource.generator.batch_generator import BatchGenerator
+from great_expectations.datasource.generator.batch_kwargs_generator import BatchKwargsGenerator

 logger = logging.getLogger(__name__)

@@ -11,14 +11,14 @@
     logger.debug("Unable to load spark context; install optional spark dependency for support.")


-class DatabricksTableGenerator(BatchGenerator):
+class DatabricksTableBatchKwargsGenerator(BatchKwargsGenerator):
     """Meant to be used in a Databricks notebook
     """

     def __init__(self, name="default",
                  datasource=None,
                  database="default"):
-        super(DatabricksTableGenerator, self).__init__(name, datasource=datasource)
+        super(DatabricksTableBatchKwargsGenerator, self).__init__(name, datasource=datasource)
         self.database = database
         try:
             self.spark = SparkSession.builder.getOrCreate()
@@ -29,10 +29,10 @@ def __init__(self, name="default",
     def get_available_data_asset_names(self):
         if self.spark is None:
             logger.warning("No sparkSession available to query for tables.")
-            return set()
+            return {"names": []}

         tables = self.spark.sql('show tables in {}'.format(self.database))
-        return [row.tableName for row in tables.collect()]
+        return {"names": [(row.tableName, "table") for row in tables.collect()]}

     def _get_iterator(self, generator_asset, **kwargs):
         query = 'select * from {}.{}'.format(self.database, generator_asset)
diff --git a/great_expectations/datasource/generator/glob_reader_generator.py b/great_expectations/datasource/generator/glob_reader_generator.py
index c382d87b9005..d75f48dba75a 100644
--- a/great_expectations/datasource/generator/glob_reader_generator.py
+++ b/great_expectations/datasource/generator/glob_reader_generator.py
@@ -3,19 +3,17 @@
 import re
 import datetime
 import logging
-import warnings

-from six import string_types
-from great_expectations.datasource.generator.batch_generator import BatchGenerator
+from great_expectations.datasource.generator.batch_kwargs_generator import BatchKwargsGenerator
 from great_expectations.datasource.types import PathBatchKwargs
 from great_expectations.exceptions import BatchKwargsError

 logger = logging.getLogger(__name__)


-class GlobReaderGenerator(BatchGenerator):
-    r"""GlobReaderGenerator processes files in a directory according to glob patterns to produce batches of data.
+class GlobReaderBatchKwargsGenerator(BatchKwargsGenerator):
+    r"""GlobReaderBatchKwargsGenerator processes files in a directory according to glob patterns to produce batches of data.

     A more interesting asset_glob might look like the following::

@@ -29,13 +27,13 @@ class GlobReaderGenerator(BatchGenerator):
     forward slash, period, or null separated) will be identified by a partition_id equal to just the date portion of
     their name.

-    A fully configured GlobReaderGenerator in yml might look like the following::
+    A fully configured GlobReaderBatchKwargsGenerator in yml might look like the following::

         my_datasource:
           class_name: PandasDatasource
           generators:
             my_generator:
-              class_name: GlobReaderGenerator
+              class_name: GlobReaderBatchKwargsGenerator
               base_directory: /var/log
               reader_options:
                 sep: %
@@ -47,6 +45,7 @@ class GlobReaderGenerator(BatchGenerator):
               partition_regex: wifi-((0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])-20\d\d).*\.log
               reader_method: csv
     """
+    recognized_batch_parameters = {"name", "reader_method", "reader_options", "limit"}

     def __init__(self, name="default",
                  datasource=None,
@@ -54,8 +53,8 @@ def __init__(self, name="default",
                  reader_options=None,
                  asset_globs=None,
                  reader_method=None):
-        logger.debug("Constructing GlobReaderGenerator {!r}".format(name))
-        super(GlobReaderGenerator, self).__init__(name, datasource=datasource)
+        logger.debug("Constructing GlobReaderBatchKwargsGenerator {!r}".format(name))
+        super(GlobReaderBatchKwargsGenerator, self).__init__(name, datasource=datasource)

         if reader_options is None:
             reader_options = {}
@@ -65,7 +64,7 @@ def __init__(self, name="default",
                 "glob": "*",
                 "partition_regex": r"^((19|20)\d\d[- /.]?(0[1-9]|1[012])[- /.]?(0[1-9]|[12][0-9]|3[01])_(.*))\.csv",
                 "match_group_id": 1,
-                "reader_method": 'csv'
+                "reader_method": 'read_csv'
             }
         }

@@ -98,13 +97,13 @@ def base_directory(self):
     def get_available_data_asset_names(self):
         known_assets = []
         if not os.path.isdir(self.base_directory):
-            return known_assets
+            return {"names": [(asset, "path") for asset in known_assets]}
         for generator_asset in self.asset_globs.keys():
             batch_paths = self._get_generator_asset_paths(generator_asset)
             if len(batch_paths) > 0 and generator_asset not in known_assets:
                 known_assets.append(generator_asset)

-        return known_assets
+        return {"names": [(asset, "path") for asset in known_assets]}

     def get_available_partition_ids(self, generator_asset):
         glob_config = self._get_generator_asset_config(generator_asset)
@@ -115,20 +114,30 @@ def get_available_partition_ids(self, generator_asset):
         ]
         return partition_ids

-    def build_batch_kwargs_from_partition_id(self, generator_asset, partition_id=None, reader_options=None, limit=None):
-        """Build batch kwargs from a partition id."""
+    def _build_batch_kwargs(self, batch_parameters):
+        try:
+            generator_asset = batch_parameters.pop("name")
+        except KeyError:
+            raise BatchKwargsError("Unable to build BatchKwargs: no name provided in batch_parameters.",
+                                   batch_kwargs=batch_parameters)
+
         glob_config = self._get_generator_asset_config(generator_asset)
         batch_paths = self._get_generator_asset_paths(generator_asset)
-        path = [path for path in batch_paths if self._partitioner(path, glob_config) == partition_id]
-        if len(path) != 1:
-            raise BatchKwargsError("Unable to identify partition %s for asset %s" % (partition_id, generator_asset),
-                                   {
-                                       generator_asset: generator_asset,
-                                       partition_id: partition_id
-                                   })
-        batch_kwargs = self._build_batch_kwargs_from_path(path[0], glob_config, reader_options=reader_options,
-                                                          limit=limit, partition_id=partition_id)
-        return batch_kwargs
+        partition_id = batch_parameters.pop("partition_id", None)
+
+        if partition_id:
+            path = [path for path in batch_paths if self._partitioner(path, glob_config) == partition_id]
+            if len(path) != 1:
+                raise BatchKwargsError("Unable to identify partition %s for asset %s" % (partition_id, generator_asset),
+                                       {
+                                           generator_asset: generator_asset,
+                                           partition_id: partition_id
+                                       })
+            batch_kwargs = self._build_batch_kwargs_from_path(path[0], glob_config, **batch_parameters)
+            return batch_kwargs
+
+        else:
+            return self.yield_batch_kwargs(generator_asset=generator_asset, **batch_parameters)

     def _get_generator_asset_paths(self, generator_asset):
         """
@@ -144,66 +153,37 @@ def _get_generator_asset_paths(self, generator_asset):
         return glob.glob(os.path.join(self.base_directory, glob_config["glob"]))

     def _get_generator_asset_config(self, generator_asset):
-        if generator_asset not in self._asset_globs:
+        try:
+            return self.asset_globs[generator_asset]
+        except KeyError:
             batch_kwargs = {
                 "generator_asset": generator_asset,
             }
             raise BatchKwargsError("Unknown asset_name %s" % generator_asset, batch_kwargs)

-        if isinstance(self.asset_globs[generator_asset], string_types):
-            warnings.warn("String-only glob configuration has been deprecated and will be removed in a future"
-                          "release. See GlobReaderGenerator docstring for more information on the new configuration"
-                          "format.", DeprecationWarning)
-            glob_config = {"glob": self.asset_globs[generator_asset]}
-        else:
-            glob_config = self.asset_globs[generator_asset]
-        return glob_config
-
-    def _get_iterator(self, generator_asset, reader_options=None, limit=None):
+    def _get_iterator(self, generator_asset, reader_method=None, reader_options=None, limit=None):
         glob_config = self._get_generator_asset_config(generator_asset)
         paths = glob.glob(os.path.join(self.base_directory, glob_config["glob"]))
-        return self._build_batch_kwargs_path_iter(paths, glob_config, reader_options=reader_options, limit=limit)
+        return self._build_batch_kwargs_path_iter(paths, glob_config, reader_method=reader_method,
+                                                  reader_options=reader_options,
+                                                  limit=limit)

-    def _build_batch_kwargs_path_iter(self, path_list, glob_config, reader_options=None, limit=None):
+    def _build_batch_kwargs_path_iter(self, path_list, glob_config, reader_method=None, reader_options=None,
+                                      limit=None):
         for path in path_list:
-            yield self._build_batch_kwargs_from_path(path, glob_config, reader_options=reader_options, limit=limit)
-
-    def _build_batch_kwargs_from_path(self, path, glob_config, reader_options=None, limit=None, partition_id=None):
-        # We could add MD5 (e.g. for smallish files)
-        # but currently don't want to assume the extra read is worth it
-        # unless it's configurable
-        # with open(path,'rb') as f:
-        #     md5 = hashlib.md5(f.read()).hexdigest()
-        batch_kwargs = PathBatchKwargs({
-            "path": path
-        })
-        computed_partition_id = self._partitioner(path, glob_config)
-        if partition_id and computed_partition_id:
-            if partition_id != computed_partition_id:
-                logger.warning("Provided partition_id does not match computed partition_id; consider explicitly "
-                               "defining the asset or updating your partitioner.")
-            batch_kwargs["partition_id"] = partition_id
-        elif partition_id:
-            batch_kwargs["partition_id"] = partition_id
-        elif computed_partition_id:
-            batch_kwargs["partition_id"] = computed_partition_id
-
-        # Apply globally-configured reader options first
-        batch_kwargs['reader_options'] = self.reader_options
-        if reader_options:
-            # Then update with any locally-specified reader options
-            batch_kwargs['reader_options'].update(reader_options)
-
-        if limit is not None:
-            batch_kwargs['limit'] = limit
-
-        if self.reader_method is not None:
-            batch_kwargs['reader_method'] = self.reader_method
-
-        if glob_config.get("reader_method"):
-            batch_kwargs['reader_method'] = glob_config.get("reader_method")
-
-        return batch_kwargs
+            yield self._build_batch_kwargs_from_path(path, glob_config, reader_method=reader_method,
+                                                     reader_options=reader_options,
+                                                     limit=limit)
+
+    def _build_batch_kwargs_from_path(self, path, glob_config, reader_method=None, reader_options=None, limit=None):
+        batch_kwargs = self._datasource.process_batch_parameters(
+            reader_method=reader_method or glob_config.get("reader_method") or self.reader_method,
+            reader_options=reader_options or glob_config.get("reader_options") or self.reader_options,
+            limit=limit or glob_config.get("limit")
+        )
+        batch_kwargs["path"] = path
+        batch_kwargs["datasource"] = self._datasource.name
+        return PathBatchKwargs(batch_kwargs)

     def _partitioner(self, path, glob_config):
         if "partition_regex" in glob_config:
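Note the new return shape used consistently across generators in this release: get_available_data_asset_names now returns a dictionary whose "names" entry pairs each asset with a type, rather than a bare list. A short sketch of consuming it, assuming `datasource` is a configured Datasource as in the earlier example:

# Old shape: ["asset_1", "asset_2"]
# New shape: {"generator_name": {"names": [("asset_1", "file"), ...]}}
available = datasource.get_available_data_asset_names()
for generator_name, assets in available.items():
    for asset_name, asset_type in assets["names"]:
        print(generator_name, asset_name, asset_type)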
diff --git a/great_expectations/datasource/generator/in_memory_generator.py b/great_expectations/datasource/generator/in_memory_generator.py
deleted file mode 100644
index f2facd1fd43d..000000000000
--- a/great_expectations/datasource/generator/in_memory_generator.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import logging
-
-from .batch_generator import BatchGenerator
-from great_expectations.datasource.types import InMemoryBatchKwargs
-from great_expectations.exceptions import BatchKwargsError
-
-logger = logging.getLogger(__name__)
-
-
-class InMemoryGenerator(BatchGenerator):
-    """A basic generator that simply captures an existing object."""
-
-    def __init__(self, name="default", datasource=None):
-        super(InMemoryGenerator, self).__init__(name, datasource=datasource)
-
-    def _get_iterator(self, generator_asset, **kwargs):
-        return iter([])
-
-    def get_available_data_asset_names(self):
-        logger.warning(
-            "InMemoryGenerator cannot identify data_asset_names, but can accept any object as a valid data_asset."
-        )
-        return []
-
-    def get_available_partition_ids(self, generator_asset):
-        logger.warning(
-            "InMemoryGenerator cannot identify partition_ids, but can accept partition_id together with a valid GE "
-            "object."
-        )
-        return []
-
-    def build_batch_kwargs_from_partition_id(self, generator_asset, partition_id=None, batch_kwargs=None, **kwargs):
-        kwargs.update(batch_kwargs)
-        if "dataset" not in kwargs:
-            raise BatchKwargsError(
-                "InMemoryGenerator cannot build batch_kwargs without an explicit dataset, but it can provide"
-                "a namespace for any data asset.",
-                kwargs
-            )
-
-        batch_kwargs = InMemoryBatchKwargs(kwargs)
-        if partition_id is not None:
-            batch_kwargs["partition_id"] = partition_id
-        return batch_kwargs
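With the InMemoryGenerator gone, validating an already-loaded object no longer requires a generator at all: the batch kwargs can name the dataset directly. A sketch under the 0.9.0 API, assuming `context` from the earlier example, a pandas datasource named "my_datasource", and an existing suite (all names hypothetical):

import pandas as pd

df = pd.DataFrame({"npi": [1003000126, 1003000134]})  # hypothetical data

# The "dataset" key hands the in-memory object straight to the datasource.
batch = context.get_batch(
    {"dataset": df, "datasource": "my_datasource"},
    expectation_suite_name="npi.warning",
)
results = context.run_validation_operator(
    "action_list_operator", assets_to_validate=[batch]
)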
diff --git a/great_expectations/datasource/generator/manual_generator.py b/great_expectations/datasource/generator/manual_generator.py
new file mode 100644
index 000000000000..1ac2b20d29bc
--- /dev/null
+++ b/great_expectations/datasource/generator/manual_generator.py
@@ -0,0 +1,118 @@
+import logging
+from copy import deepcopy
+
+from great_expectations.datasource.generator.batch_kwargs_generator import BatchKwargsGenerator
+from great_expectations.exceptions import BatchKwargsError, InvalidBatchKwargsError
+
+logger = logging.getLogger(__name__)
+
+
+class ManualBatchKwargsGenerator(BatchKwargsGenerator):
+    """ManualBatchKwargsGenerator returns manually-configured batch_kwargs for named data assets. It provides a convenient way to
+    capture complete batch definitions without requiring the configuration of a more fully-featured generator.
+
+    A fully configured ManualBatchKwargsGenerator in yml might look like the following::
+
+        my_datasource:
+          class_name: PandasDatasource
+          generators:
+            my_generator:
+              class_name: ManualBatchKwargsGenerator
+              assets:
+                asset1:
+                  - partition_id: 1
+                    path: /data/file_1.csv
+                    reader_options:
+                      sep: ;
+                  - partition_id: 2
+                    path: /data/file_2.csv
+                    reader_options:
+                      header: 0
+                logs:
+                  path: data/log.csv
+    """
+    recognized_batch_parameters = {"name", "partition_id"}
+
+    def __init__(self, name="default",
+                 datasource=None,
+                 assets=None):
+        logger.debug("Constructing ManualBatchKwargsGenerator {!r}".format(name))
+        super(ManualBatchKwargsGenerator, self).__init__(name, datasource=datasource)
+
+        if assets is None:
+            assets = {}
+
+        self._assets = assets
+
+    @property
+    def assets(self):
+        return self._assets
+
+    def get_available_data_asset_names(self):
+        return {"names": [(key, "manual") for key in self.assets.keys()]}
+
+    def _get_generator_asset_config(self, generator_asset):
+        if generator_asset is None:
+            return
+
+        elif generator_asset in self.assets:
+            return self.assets[generator_asset]
+
+        raise InvalidBatchKwargsError("No asset definition for requested asset %s" % generator_asset)
+
+    def _get_iterator(self, generator_asset, **kwargs):
+        datasource_batch_kwargs = self._datasource.process_batch_parameters(**kwargs)
+        asset_definition = deepcopy(self._get_generator_asset_config(generator_asset))
+        if isinstance(asset_definition, list):
+            for batch_definition in asset_definition:
+                batch_definition.update(datasource_batch_kwargs)
+            return iter(asset_definition)
+        else:
+            asset_definition.update(datasource_batch_kwargs)
+            return iter([asset_definition])
+
+    def get_available_partition_ids(self, generator_asset):
+        partition_ids = []
+        asset_definition = self._get_generator_asset_config(generator_asset=generator_asset)
+        if isinstance(asset_definition, list):
+            for batch_definition in asset_definition:
+                try:
+                    partition_ids.append(batch_definition['partition_id'])
+                except KeyError:
+                    pass
+        elif isinstance(asset_definition, dict):
+            try:
+                partition_ids.append(asset_definition['partition_id'])
+            except KeyError:
+                pass
+        return partition_ids
+
+    def _build_batch_kwargs(self, batch_parameters):
+        """Build batch kwargs from a partition id."""
+        partition_id = batch_parameters.pop("partition_id", None)
+        batch_kwargs = self._datasource.process_batch_parameters(batch_parameters)
+        if partition_id:
+            asset_definition = self._get_generator_asset_config(generator_asset=batch_parameters.get("name"))
+            if isinstance(asset_definition, list):
+                for batch_definition in asset_definition:
+                    try:
+                        if batch_definition['partition_id'] == partition_id:
+                            batch_kwargs = deepcopy(batch_definition)
+                            batch_kwargs.pop("partition_id")
+                    except KeyError:
+                        pass
+            elif isinstance(asset_definition, dict):
+                try:
+                    if asset_definition['partition_id'] == partition_id:
+                        batch_kwargs = deepcopy(asset_definition)
+                        batch_kwargs.pop("partition_id")
+                except KeyError:
+                    pass
+        else:
+            batch_kwargs = next(self._get_iterator(batch_parameters.get("name")))
+
+        if batch_kwargs is not None:
+            return batch_kwargs
+        else:
+            raise BatchKwargsError("Unable to find batch_kwargs for given batch_parameters", batch_parameters)
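The new ManualBatchKwargsGenerator fills the gap the InMemoryGenerator left for fully specified batch definitions kept in configuration. A programmatic sketch mirroring the yml example above, where the generator name and paths are hypothetical and `datasource` is a configured Datasource:

from great_expectations.datasource.generator.manual_generator import ManualBatchKwargsGenerator

generator = ManualBatchKwargsGenerator(
    name="my_generator",
    datasource=datasource,  # required in 0.9.0; a configured Datasource
    assets={
        "asset1": [
            {"partition_id": 1, "path": "/data/file_1.csv"},
            {"partition_id": 2, "path": "/data/file_2.csv"},
        ]
    },
)

# Resolves to the second definition and strips the partition_id key.
batch_kwargs = generator.build_batch_kwargs(name="asset1", partition_id=2)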
diff --git a/great_expectations/datasource/generator/query_generator.py b/great_expectations/datasource/generator/query_generator.py
index e8a3498a6a8b..865c7e21418c 100644
--- a/great_expectations/datasource/generator/query_generator.py
+++ b/great_expectations/datasource/generator/query_generator.py
@@ -1,10 +1,10 @@
 import os
 import logging
-from string import Template

-from .batch_generator import BatchGenerator
+from .batch_kwargs_generator import BatchKwargsGenerator
 from great_expectations.datasource.types import SqlAlchemyDatasourceQueryBatchKwargs
 from great_expectations.exceptions import BatchKwargsError
+from ...data_context.util import instantiate_class_from_config

 logger = logging.getLogger(__name__)

@@ -19,119 +19,90 @@
     logger.debug("Unable to import sqlalchemy.")


-class QueryGenerator(BatchGenerator):
+class QueryBatchKwargsGenerator(BatchKwargsGenerator):
     """Produce query-style batch_kwargs from sql files stored on disk
     """
+    recognized_batch_parameters = {'query_parameters', 'partition_id'}
+
+    def __init__(self, name="default", datasource=None, query_store_backend=None, queries=None):
+        super(QueryBatchKwargsGenerator, self).__init__(name=name, datasource=datasource)
+        root_directory = None
+        if query_store_backend is None:
+            # We will choose a Tuple store if there is a configured DataContext with a root_directory,
+            # and an InMemoryStore otherwise
+            if datasource and datasource.data_context and datasource.data_context.root_directory:
+                query_store_backend = {
+                    "class_name": "TupleFilesystemStoreBackend",
+                    "base_directory": os.path.join(datasource.data_context.root_directory, "datasources",
+                                                   datasource.name, "generators", name),
+                    "filepath_suffix": ".sql"
+                }
+                root_directory = datasource.data_context.root_directory
+            else:
+                query_store_backend = {
+                    "class_name": "InMemoryStoreBackend"
+                }
+        self._store_backend = instantiate_class_from_config(
+            config=query_store_backend,
+            runtime_environment={
+                "root_directory": root_directory
+            },
+            config_defaults={
+                "module_name": "great_expectations.data_context.store"
+            }

-    # FIXME: This needs to be updated to use a store so that the query generator does not have to manage storage itself
-    # FIXME: New tests should then be added
-    def __init__(self, name="default", datasource=None, queries=None):
-        super(QueryGenerator, self).__init__(name=name, datasource=datasource)
-        if (
-                datasource is not None and
-                datasource.data_context is not None and
-                os.path.isdir(os.path.join(self._datasource.data_context.root_directory,
-                                           "datasources",
-                                           self._datasource.name,
-                                           "generators",
-                                           self._name,
-                                           "queries")
-                              )
-        ):
-            self._queries_path = os.path.join(self._datasource.data_context.root_directory,
-                                              "datasources",
-                                              self._datasource.name,
-                                              "generators",
-                                              self._name,
-                                              "queries")
-        else:
-            self._queries_path = None
-
-        if queries is None:
-            queries = {}
-
-        self._queries = queries
+        )
+        if queries is not None:
+            for query_name, query in queries.items():
+                self.add_query(query_name, query)

     def _get_raw_query(self, generator_asset):
-        raw_query = None
-        if self._queries_path:
-            if generator_asset in [path[:-4] for path in os.listdir(self._queries_path) if str(path).endswith(".sql")]:
-                with open(os.path.join(self._queries_path, generator_asset) + ".sql", "r") as data:
-                    raw_query = data.read()
-        elif self._queries:
-            if generator_asset in self._queries:
-                raw_query = self._queries[generator_asset]
-
-        return raw_query
-
-    def _get_iterator(self, generator_asset, query_params=None):
+        return self._store_backend.get((generator_asset,))
+
+    def _get_iterator(self, generator_asset, query_parameters=None):
         raw_query = self._get_raw_query(generator_asset)
         if raw_query is None:
             logger.warning("No query defined for generator asset: %s" % generator_asset)
             # There is no valid query path or temp query storage defined with the generator_asset
             return None

-        if query_params is None:
-            query_params = {}
-        try:
-            substituted_query = Template(raw_query).substitute(query_params)
-        except KeyError:
-            raise BatchKwargsError(
-                "Unable to generate batch kwargs for asset '" + generator_asset + "': "
-                "missing template key",
-                {
-                    "generator_asset": generator_asset,
-                    "query_template": raw_query
-                }
-            )
-        return iter([
-            SqlAlchemyDatasourceQueryBatchKwargs(
-                query=substituted_query,
-                raw_query=raw_query,
-                query_params=query_params
-            )])
+        if query_parameters is None:
+            iter_ = iter([
+                SqlAlchemyDatasourceQueryBatchKwargs(
+                    query=raw_query
+                )])
+        else:
+            iter_ = iter([
+                SqlAlchemyDatasourceQueryBatchKwargs(
+                    query=raw_query,
+                    query_parameters=query_parameters
+                )])
+
+        return iter_

     def add_query(self, generator_asset, query):
-        if self._queries_path:
-            with open(os.path.join(self._queries_path, generator_asset + ".sql"), "w") as queryfile:
-                queryfile.write(query)
-        else:
-            logger.info("Adding query to temporary storage only.")
-            self._queries[generator_asset] = query
+        # Backends must have a tuple key; we use only a single-element tuple
+        self._store_backend.set((generator_asset,), query)

     def get_available_data_asset_names(self):
-        if self._queries_path:
-            defined_queries = [path[:-4] for path in os.listdir(self._queries_path) if str(path).endswith(".sql")]
-        else:
-            defined_queries = list(self._queries.keys())
-
-        return defined_queries
+        defined_queries = self._store_backend.list_keys()
+        # Backends must have a tuple key; we use only a single-element tuple
+        return {"names": [(query_key_tuple[0], "query") for query_key_tuple in defined_queries]}

-    def build_batch_kwargs_from_partition_id(self, generator_asset, partition_id=None, query_params=None):
+    def _build_batch_kwargs(self, batch_parameters):
         """Build batch kwargs from a partition id."""
+        generator_asset = batch_parameters.pop("name")
         raw_query = self._get_raw_query(generator_asset)
-        if "$partition_id" not in raw_query and "${partition_id}" not in raw_query:
-            raise BatchKwargsError("No partition_id parameter found in the requested query.", {})
-        try:
-            if query_params is None:
-                query_params = {}
-            query_params.update({'partition_id': partition_id})
-            substituted_query = Template(raw_query).substitute(query_params)
-        except KeyError:
-            raise BatchKwargsError(
-                "Unable to generate batch kwargs for asset '" + generator_asset + "': "
-                "missing template key",
-                {
-                    "generator_asset": generator_asset,
-                    "query_template": raw_query
-                }
-            )
-        return SqlAlchemyDatasourceQueryBatchKwargs(
-            query=substituted_query,
-            raw_query=raw_query,
-            query_params=query_params
-        )
+        partition_id = batch_parameters.pop("partition_id", None)
+        batch_kwargs = self._datasource.process_batch_parameters(**batch_parameters)
+        batch_kwargs["query"] = raw_query
+
+        if partition_id:
+            if not batch_kwargs.get("query_parameters"):
+                batch_kwargs["query_parameters"] = {}
+            batch_kwargs["query_parameters"]["partition_id"] = partition_id
+
+        return SqlAlchemyDatasourceQueryBatchKwargs(batch_kwargs)

     def get_available_partition_ids(self, generator_asset):
-        raise BatchKwargsError("QueryGenerator cannot identify partitions, however any asset defined with"
-                               "a single parameter can be accessed using that parameter as a partition_id.", {})
+        raise BatchKwargsError("QueryBatchKwargsGenerator cannot identify partitions.", {})
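Because queries now live behind a store backend (filesystem-backed when a DataContext root directory exists, in-memory otherwise), adding and using a query goes through add_query and the standard batch-kwargs flow. A sketch with hypothetical names, assuming `datasource` is a configured SqlAlchemy datasource; template substitution itself happens downstream using query_parameters:

from great_expectations.datasource.generator.query_generator import QueryBatchKwargsGenerator

generator = QueryBatchKwargsGenerator(name="queries", datasource=datasource)
generator.add_query("npi", "SELECT * FROM npi WHERE state = $partition_id")

# partition_id is routed into query_parameters for downstream substitution.
batch_kwargs = generator.build_batch_kwargs(name="npi", partition_id="CA")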
generators: my_s3_generator: - class_name: S3Generator + class_name: S3GlobReaderBatchKwargsGenerator bucket: my_bucket.my_organization.priv reader_method: parquet # This will be automatically inferred from suffix where possible, but can be explicitly specified as well reader_options: # Note that reader options can be specified globally or per-asset @@ -60,7 +60,7 @@ def __init__(self, reader_method=None, boto3_options=None, max_keys=1000): - """Initialize a new S3Generator + """Initialize a new S3GlobReaderBatchKwargsGenerator Args: name: the name of the generator @@ -73,7 +73,7 @@ def __init__(self, boto3_options: dictionary of key-value pairs to use when creating boto3 client or resource objects max_keys: the maximum number of keys to fetch in a single list_objects request to s3 """ - super(S3Generator, self).__init__(name, datasource=datasource) + super(S3GlobReaderBatchKwargsGenerator, self).__init__(name, datasource=datasource) if reader_options is None: reader_options = {} @@ -112,10 +112,11 @@ def bucket(self): return self._bucket def get_available_data_asset_names(self): - return self._assets.keys() + return {"names": [(key, "file") for key in self._assets.keys()]} + def _get_iterator(self, generator_asset, reader_options=None, limit=None): - logger.debug("Beginning S3Generator _get_iterator for generator_asset: %s" % generator_asset) + logger.debug("Beginning S3GlobReaderBatchKwargsGenerator _get_iterator for generator_asset: %s" % generator_asset) if generator_asset not in self._assets: batch_kwargs = { @@ -205,7 +206,7 @@ def _get_asset_options(self, asset_config, iterator_dict): if directory_assets: if "CommonPrefixes" not in asset_options: raise BatchKwargsError( - "Unable to build batch_kwargs. The asset may not be configured correctly. If dictionary assets " + "Unable to build batch_kwargs. The asset may not be configured correctly. If directory assets " "are requested, then common prefixes must be returned.", { "asset_configuration": asset_config, diff --git a/great_expectations/datasource/generator/subdir_reader_generator.py b/great_expectations/datasource/generator/subdir_reader_generator.py index 179d0bbc39bd..78dca20f1604 100644 --- a/great_expectations/datasource/generator/subdir_reader_generator.py +++ b/great_expectations/datasource/generator/subdir_reader_generator.py @@ -1,7 +1,7 @@ import os import logging -from great_expectations.datasource.generator.batch_generator import BatchGenerator +from great_expectations.datasource.generator.batch_kwargs_generator import BatchKwargsGenerator from great_expectations.datasource.types import PathBatchKwargs from great_expectations.exceptions import BatchKwargsError @@ -10,21 +10,21 @@ KNOWN_EXTENSIONS = ['.csv', '.tsv', '.parquet', '.xls', '.xlsx', '.json', '.csv.gz', '.tsv.gz'] -class SubdirReaderGenerator(BatchGenerator): - """The SubdirReaderGenerator inspects a filesystem and produces path-based batch_kwargs. +class SubdirReaderBatchKwargsGenerator(BatchKwargsGenerator): + """The SubdirReaderBatchKwargsGenerator inspects a filesystem and produces path-based batch_kwargs. 
-    SubdirReaderGenerator recognizes generator_assets using two criteria:
+    SubdirReaderBatchKwargsGenerator recognizes generator_assets using two criteria:
        - for files directly in 'base_directory' with recognized extensions (.csv, .tsv, .parquet, .xls, .xlsx, .json), it uses the name of the file without the extension
        - for other files or directories in 'base_directory', it uses the file or directory name

-    SubdirReaderGenerator sees all files inside a directory of base_directory as batches of one datasource.
+    SubdirReaderBatchKwargsGenerator sees all files inside a directory of base_directory as batches of one datasource.

-    SubdirReaderGenerator can also include configured reader_options which will be added to batch_kwargs generated
+    SubdirReaderBatchKwargsGenerator can also include configured reader_options which will be added to batch_kwargs generated
    by this generator.
    """
-    _default_reader_options = {}
+    recognized_batch_parameters = {'name', 'partition_id'}

    def __init__(self, name="default",
                 datasource=None,
@@ -32,7 +32,7 @@ def __init__(self, name="default",
                 reader_options=None,
                 known_extensions=None,
                 reader_method=None):
-        super(SubdirReaderGenerator, self).__init__(name, datasource=datasource)
+        super(SubdirReaderBatchKwargsGenerator, self).__init__(name, datasource=datasource)
        if reader_options is None:
            reader_options = self._default_reader_options
@@ -60,16 +60,20 @@ def reader_method(self):
    def base_directory(self):
        # If base directory is a relative path, interpret it as relative to the data context's
        # context root directory (parent directory of great_expectation dir)
-        if os.path.isabs(self._base_directory) or self._datasource.get_data_context() is None:
+        if os.path.isabs(self._base_directory) or self._datasource.data_context is None:
            return self._base_directory
        else:
-            return os.path.join(self._datasource.get_data_context().root_directory, self._base_directory)
+            return os.path.join(self._datasource.data_context.root_directory, self._base_directory)

    def get_available_data_asset_names(self):
        if not os.path.isdir(self.base_directory):
-            return []
+            return {"names": [],
+                    "is_complete_list": True
+                    }
        known_assets = self._get_valid_file_options(base_directory=self.base_directory)
-        return known_assets
+        return {"names": known_assets,
+                "is_complete_list": True
+                }

    def get_available_partition_ids(self, generator_asset):
        # If the generator asset names a single known *file*, return ONLY that
@@ -80,31 +84,51 @@ def get_available_partition_ids(self, generator_asset):
            return [generator_asset]

        # Otherwise, subdir files are partition ids
-        return self._get_valid_file_options(base_directory=os.path.join(self.base_directory, generator_asset))
+        return [path for (path, type) in self._get_valid_file_options(base_directory=os.path.join(
+            self.base_directory, generator_asset))]

-    def build_batch_kwargs_from_partition_id(self, generator_asset, partition_id=None, reader_options=None, limit=None):
-        path = None
-        for extension in self.known_extensions:
-            if os.path.isfile(os.path.join(self.base_directory, generator_asset, partition_id + extension)):
-                path = os.path.join(self.base_directory, generator_asset, partition_id + extension)
+    def _build_batch_kwargs(self, batch_parameters):
+        """
+
+        Args:
+            batch_parameters: dict of batch parameters; must include "name" and may include "partition_id"
+
+        Returns:
+            batch_kwargs
-        if path is None:
-            # Fall through to this case in the event that there is not a subdir available, so partition_id is
-            # the same as the generator asset
-            if os.path.isfile(os.path.join(self.base_directory, generator_asset)):
-                path = 
os.path.join(self.base_directory, generator_asset)
+        """
+        try:
+            generator_asset = batch_parameters.pop("name")
+        except KeyError:
+            raise BatchKwargsError("Unable to build BatchKwargs: no name provided in batch_parameters.",
+                                   batch_kwargs=batch_parameters)

+        if "partition_id" in batch_parameters:
+            partition_id = batch_parameters.pop("partition_id")
+            # Find the path
+            path = None
            for extension in self.known_extensions:
-                if os.path.isfile(os.path.join(self.base_directory, generator_asset + extension)):
-                    path = os.path.join(self.base_directory, generator_asset + extension)
+                if os.path.isfile(os.path.join(self.base_directory, generator_asset, partition_id + extension)):
+                    path = os.path.join(self.base_directory, generator_asset, partition_id + extension)

-        if path is None:
-            raise BatchKwargsError("Unable to build batch kwargs from partition_id for asset '%s'" % generator_asset, {
-                "partition_id": partition_id
-            })
+            if path is None:
+                logger.warning("Unable to find path with the provided partition; searching for asset-name partitions.")
+                # Fall through to this case in the event that there is not a subdir available, or if partition_id was
+                # not provided
+                if os.path.isfile(os.path.join(self.base_directory, generator_asset)):
+                    path = os.path.join(self.base_directory, generator_asset)

-        return self._build_batch_kwargs_from_path(path, reader_options=reader_options, limit=limit,
-                                                  partition_id=partition_id)
+                for extension in self.known_extensions:
+                    if os.path.isfile(os.path.join(self.base_directory, generator_asset + extension)):
+                        path = os.path.join(self.base_directory, generator_asset + extension)
+
+            if path is None:
+                raise BatchKwargsError("Unable to build batch kwargs for asset '%s'" % generator_asset,
+                                       batch_parameters)
+            return self._build_batch_kwargs_from_path(path, **batch_parameters)
+
+        else:
+            return self.yield_batch_kwargs(generator_asset=generator_asset, **batch_parameters)

    def _get_valid_file_options(self, base_directory=None):
        valid_options = []
@@ -114,18 +138,18 @@ def _get_valid_file_options(self, base_directory=None):
        for file_option in file_options:
            for extension in self.known_extensions:
                if (file_option.endswith(extension) and not file_option.startswith(".") and
-                        file_option[:-len(extension)] not in valid_options):
-                    valid_options.append(file_option[:-len(extension)])
+                        (file_option[:-len(extension)], "file") not in valid_options):
+                    valid_options.append((file_option[:-len(extension)], "file"))
            elif os.path.isdir(os.path.join(self.base_directory, file_option)):
                # Make sure there's at least one valid file inside the subdir
                subdir_options = self._get_valid_file_options(base_directory=os.path.join(base_directory, file_option))
-                if len(subdir_options) > 0 and file_option not in valid_options:
-                    valid_options.append(file_option)
+                if len(subdir_options) > 0 and (file_option, "directory") not in valid_options:
+                    valid_options.append((file_option, "directory"))
        return valid_options

    def _get_iterator(self, generator_asset, reader_options=None, limit=None):
-        logger.debug("Beginning SubdirReaderGenerator _get_iterator for generator_asset: %s" % generator_asset)
+        logger.debug("Beginning SubdirReaderBatchKwargsGenerator _get_iterator for generator_asset: %s" % generator_asset)
        # If the generator_asset is a file, then return the path. 
# Otherwise, use files in a subdir as batches if os.path.isdir(os.path.join(self.base_directory, generator_asset)): @@ -156,40 +180,11 @@ def _build_batch_kwargs_path_iter(self, path_list, reader_options=None, limit=No for path in path_list: yield self._build_batch_kwargs_from_path(path, reader_options=reader_options, limit=limit) - def _build_batch_kwargs_from_path(self, path, reader_options=None, limit=None, partition_id=None): - # We could add MD5 (e.g. for smallish files) - # but currently don't want to assume the extra read is worth it - # unless it's configurable - # with open(path,'rb') as f: - # md5 = hashlib.md5(f.read()).hexdigest() - batch_kwargs = PathBatchKwargs({ - "path": path - }) - computed_partition_id = self._partitioner(path) - if partition_id and computed_partition_id: - if partition_id != computed_partition_id: - logger.warning("Provided partition_id does not match computed partition_id; consider explicitly " - "defining the asset or updating your partitioner.") - batch_kwargs["partition_id"] = partition_id - elif partition_id: - batch_kwargs["partition_id"] = partition_id - elif computed_partition_id: - batch_kwargs["partition_id"] = computed_partition_id - - # Apply globally-configured reader options first - batch_kwargs['reader_options'] = self.reader_options - if reader_options: - # Then update with any locally-specified reader options - batch_kwargs['reader_options'].update(reader_options) - - if limit is not None: - batch_kwargs['limit'] = limit - - if self.reader_method is not None: - batch_kwargs['reader_method'] = self.reader_method - - return batch_kwargs - - # noinspection PyMethodMayBeStatic - def _partitioner(self, path): - return os.path.basename(path).rpartition(".")[0] + def _build_batch_kwargs_from_path(self, path, reader_method=None, reader_options=None, limit=None): + batch_kwargs = self._datasource.process_batch_parameters( + reader_method=reader_method or self.reader_method, + reader_options=reader_options or self.reader_options, + limit=limit) + batch_kwargs["path"] = path + batch_kwargs["datasource"] = self._datasource.name + return PathBatchKwargs(batch_kwargs) diff --git a/great_expectations/datasource/generator/table_generator.py b/great_expectations/datasource/generator/table_generator.py index 0f409cb86211..fb570a452818 100644 --- a/great_expectations/datasource/generator/table_generator.py +++ b/great_expectations/datasource/generator/table_generator.py @@ -3,7 +3,7 @@ from marshmallow import Schema, fields, post_load, ValidationError -from .batch_generator import BatchGenerator +from .batch_kwargs_generator import BatchKwargsGenerator from great_expectations.exceptions import BatchKwargsError, GreatExpectationsError from great_expectations.datasource.types import SqlAlchemyDatasourceTableBatchKwargs @@ -47,10 +47,10 @@ def schema(self): assetConfigurationSchema = AssetConfigurationSchema() -class TableGenerator(BatchGenerator): +class TableBatchKwargsGenerator(BatchKwargsGenerator): """Provide access to already materialized tables or views in a database. - TableGenerator can be used to define specific data asset names that take and substitute parameters, + TableBatchKwargsGenerator can be used to define specific data asset names that take and substitute parameters, for example to support referring to the same data asset but with different schemas depending on provided batch_kwargs. 
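A quick aside on the reworked flow above: `_build_batch_kwargs_from_path` now contributes only the resolved `path` and the datasource name, delegating all reader handling to the datasource. A minimal sketch of that division of labor, using a hypothetical stand-in (`DatasourceStub` is not a Great Expectations class, and a real `process_batch_parameters` also merges in datasource-level defaults):

```python
# Sketch only: a stand-in datasource whose process_batch_parameters echoes its inputs.
class DatasourceStub(object):
    name = "my_datasource"

    def process_batch_parameters(self, reader_method=None, reader_options=None, limit=None):
        batch_kwargs = {}
        if reader_method:
            batch_kwargs["reader_method"] = reader_method
        if reader_options:
            batch_kwargs["reader_options"] = reader_options
        if limit is not None:
            batch_kwargs["limit"] = limit
        return batch_kwargs


datasource = DatasourceStub()
# Mirroring _build_batch_kwargs_from_path: reader concerns come from the datasource,
# while the generator layers on the resolved path and the datasource name.
batch_kwargs = datasource.process_batch_parameters(reader_options={"sep": ","})
batch_kwargs["path"] = "/data/my_asset/2020-01-01.csv"
batch_kwargs["datasource"] = datasource.name
print(batch_kwargs)
# {'reader_options': {'sep': ','}, 'path': '/data/my_asset/2020-01-01.csv', 'datasource': 'my_datasource'}
```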
@@ -58,7 +58,7 @@ class TableGenerator(BatchGenerator): following configurations:: my_generator: - class_name: TableGenerator + class_name: TableBatchKwargsGenerator assets: my_table: schema: $schema @@ -69,9 +69,10 @@ class TableGenerator(BatchGenerator): defined in batch_kwargs. """ + recognized_batch_parameters = {'name', 'limit', 'offset', 'query_parameters'} def __init__(self, name="default", datasource=None, assets=None): - super(TableGenerator, self).__init__(name=name, datasource=datasource) + super(TableBatchKwargsGenerator, self).__init__(name=name, datasource=datasource) if not assets: assets = {} try: @@ -80,7 +81,7 @@ def __init__(self, name="default", datasource=None, assets=None): (asset_name, asset_config) in assets.items() } except ValidationError as err: - raise GreatExpectationsError("Unable to load asset configuration in TableGenerator '%s': " + raise GreatExpectationsError("Unable to load asset configuration in TableBatchKwargsGenerator '%s': " "validation error: %s." % (name, str(err))) if datasource is not None: @@ -92,18 +93,18 @@ def __init__(self, name="default", datasource=None, assets=None): logger.warning("Unable to create inspector from engine in generator '%s'" % name) self.inspector = None - def _get_iterator(self, generator_asset, query_params=None, limit=None, offset=None, partition_id=None): + def _get_iterator(self, generator_asset, query_parameters=None, limit=None, offset=None, partition_id=None): batch_kwargs = None # First, we check if we have a configured asset if generator_asset in self._assets: asset_config = self._assets[generator_asset] try: - if query_params is None: - query_params = {} - table_name = Template(asset_config.table).substitute(query_params) + if query_parameters is None: + query_parameters = {} + table_name = Template(asset_config.table).substitute(query_parameters) schema_name = None if asset_config.schema is not None: - schema_name = Template(asset_config.schema).substitute(query_params) + schema_name = Template(asset_config.schema).substitute(query_parameters) except KeyError: raise BatchKwargsError("Unable to generate batch kwargs for asset '" + generator_asset + "': " "missing template key", @@ -149,6 +150,8 @@ def _get_iterator(self, generator_asset, query_params=None, limit=None, offset=N return def get_available_data_asset_names(self): + # TODO: limit and is_complete_list logic + is_complete_list = True defined_assets = list(self._assets.keys()) tables = [] if self.engine is not None and self.inspector is not None: @@ -167,16 +170,16 @@ def get_available_data_asset_names(self): continue tables.extend( - [table_name if self.inspector.default_schema_name == schema_name else - schema_name + "." + table_name + [(table_name, "table") if self.inspector.default_schema_name == schema_name else + (schema_name + "." + table_name, "table") for table_name in self.inspector.get_table_names(schema=schema_name) if table_name not in known_system_tables ] ) try: tables.extend( - [table_name if self.inspector.default_schema_name == schema_name else - schema_name + "." + table_name + [(table_name, "view") if self.inspector.default_schema_name == schema_name else + (schema_name + "." 
+ table_name, "view") for table_name in self.inspector.get_view_names(schema=schema_name) if table_name not in known_system_tables ] @@ -185,17 +188,21 @@ def get_available_data_asset_names(self): # Not implemented by bigquery dialect pass - return defined_assets + tables + return {"names": defined_assets + tables, + "is_complete_list": is_complete_list + } - def build_batch_kwargs_from_partition_id(self, generator_asset, partition_id=None, limit=None, offset=None, - query_params=None): - if query_params is None: - query_params = {} - - return next(self._get_iterator(generator_asset, query_params=query_params, limit=limit, - offset=offset, partition_id=partition_id)) + def _build_batch_kwargs(self, batch_parameters): + return next( + self._get_iterator( + batch_parameters.get("name"), + query_parameters=batch_parameters.get("query_parameters", {}), + limit=batch_parameters.get("limit"), + offset=batch_parameters.get("offset") + ) + ) def get_available_partition_ids(self, generator_asset): - raise BatchKwargsError("TableGenerator cannot identify partitions, however any existing table may" + raise BatchKwargsError("TableBatchKwargsGenerator cannot identify partitions, however any existing table may" "already be referenced by accessing a generator_asset with the name of the " "table or of the form SCHEMA.TABLE", {}) diff --git a/great_expectations/datasource/pandas_datasource.py b/great_expectations/datasource/pandas_datasource.py index 954975f580ec..4bf059755997 100644 --- a/great_expectations/datasource/pandas_datasource.py +++ b/great_expectations/datasource/pandas_datasource.py @@ -1,6 +1,9 @@ -import time +import datetime +import uuid import hashlib import logging +from functools import partial + try: from io import StringIO @@ -11,15 +14,9 @@ import pandas as pd -from .datasource import Datasource, ReaderMethods -from great_expectations.datasource.generator.in_memory_generator import InMemoryGenerator -from great_expectations.datasource.generator.subdir_reader_generator import SubdirReaderGenerator -from great_expectations.datasource.generator.glob_reader_generator import GlobReaderGenerator -from great_expectations.datasource.generator.s3_generator import S3Generator -from great_expectations.datasource.types import ( - BatchId -) -from great_expectations.dataset.pandas_dataset import PandasDataset +from .datasource import Datasource +from great_expectations.datasource.types import BatchMarkers +from great_expectations.core.batch import Batch from great_expectations.types import ClassConfig from great_expectations.exceptions import BatchKwargsError from .util import S3Url @@ -30,13 +27,15 @@ class PandasDatasource(Datasource): - """The PandasDatasource produces PandasDataset objects and supports generators capable of + """The PandasDatasource produces PandasDataset objects and supports generators capable of interacting with the local filesystem (the default subdir_reader generator), and from existing in-memory dataframes. """ + recognized_batch_parameters = {'reader_method', 'reader_options', 'limit'} @classmethod - def build_configuration(cls, data_asset_type=None, generators=None, boto3_options=None, **kwargs): + def build_configuration(cls, data_asset_type=None, generators=None, boto3_options=None, reader_method=None, + reader_options=None, limit=None, **kwargs): """ Build a full configuration object for a datasource, potentially including generators with defaults. 
@@ -44,28 +43,36 @@ def build_configuration(cls, data_asset_type=None, generators=None, boto3_option
            data_asset_type: A ClassConfig dictionary
            generators: Generator configuration dictionary
            boto3_options: Optional dictionary with key-value pairs to pass to boto3 during instantiation.
+            reader_method: Optional default reader_method for generated batches
+            reader_options: Optional default reader_options for generated batches
+            limit: Optional default limit for generated batches
            **kwargs: Additional kwargs to be part of the datasource constructor's initialization

        Returns:
            A complete datasource configuration.

        """
-        if generators is None:
+
+        # PENDING DELETION - JPC - 20200130
+        # if generators is None:
            # Provide a gentle way to build a datasource with a sane default,
            # including ability to specify the base_directory and reader_options
-            base_directory = kwargs.pop("base_directory", "data")
+            # base_directory = kwargs.pop("base_directory", "data")
            # By default, use CSV sniffer to infer separator, which requires the python engine
-            reader_options = kwargs.pop("reader_options", {
-                "sep": None,
-                "engine": "python"
-            })
-            generators = {
-                "default": {
-                    "class_name": "SubdirReaderGenerator",
-                    "base_directory": base_directory,
-                    "reader_options": reader_options
-                }
-            }
+            # reader_options = kwargs.pop("reader_options", {
+            #     "sep": None,
+            #     "engine": "python"
+            # })
+            # generators = {
+            #     # "default": {
+            #     #     "class_name": "SubdirReaderBatchKwargsGenerator",
+            #     #     "base_directory": base_directory,
+            #     #     "reader_options": reader_options
+            #     # },
+            #     # "passthrough": {
+            #     #     "class_name": "PassthroughGenerator",
+            #     # }
+            # }
        if data_asset_type is None:
            data_asset_type = ClassConfig(
                class_name="PandasDataset")
@@ -77,53 +84,91 @@ def build_configuration(cls, data_asset_type=None, generators=None, boto3_option
                pass

        configuration = kwargs
-        configuration.update({
-            "data_asset_type": data_asset_type,
-            "generators": generators,
-        })
+        configuration["data_asset_type"] = data_asset_type
+        if generators:
+            configuration["generators"] = generators
+
        if boto3_options is not None:
            if isinstance(boto3_options, dict):
                configuration.update(boto3_options)
            else:
                raise ValueError("boto3_options must be a dictionary of key-value pairs to pass to boto3 upon "
                                 "initialization.")
+
+        if reader_options is not None:
+            if isinstance(reader_options, dict):
+                configuration["reader_options"] = reader_options
+            else:
+                raise ValueError("reader_options must be a dictionary of key-value pairs to pass to the pandas "
+                                 "reader upon initialization.")
+
+        if reader_method is not None:
+            configuration["reader_method"] = reader_method
+
+        if limit is not None:
+            configuration["limit"] = limit
+
        return configuration

    def __init__(self, name="pandas", data_context=None, data_asset_type=None, generators=None,
-                 boto3_options=None, **kwargs):
+                 boto3_options=None, reader_method=None, reader_options=None, limit=None, **kwargs):
        configuration_with_defaults = PandasDatasource.build_configuration(data_asset_type, generators,
-                                                                           boto3_options, **kwargs)
+                                                                           boto3_options,
+                                                                           reader_method=reader_method,
+                                                                           reader_options=reader_options,
+                                                                           limit=limit,
+                                                                           **kwargs)
+
        data_asset_type = configuration_with_defaults.pop("data_asset_type")
-        generators = configuration_with_defaults.pop("generators")
+        generators = configuration_with_defaults.pop("generators", None)
        super(PandasDatasource, self).__init__(name,
                                               data_context=data_context,
                                               data_asset_type=data_asset_type,
                                               generators=generators,
                                               **configuration_with_defaults)
+        self._build_generators()
        self._boto3_options = 
configuration_with_defaults.get("boto3_options", {})
+        self._reader_method = configuration_with_defaults.get("reader_method", None)
+        self._reader_options = configuration_with_defaults.get("reader_options", None)
+        self._limit = configuration_with_defaults.get("limit", None)

-    def _get_generator_class_from_type(self, type_):
-        if type_ == "subdir_reader":
-            return SubdirReaderGenerator
-        elif type_ == "glob_reader":
-            return GlobReaderGenerator
-        elif type_ == "memory":
-            return InMemoryGenerator
-        elif type_ == "s3":
-            return S3Generator
-        else:
-            raise ValueError("Unrecognized BatchGenerator type %s" % type_)
-
-    def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs):
-        for k, v in kwargs.items():
-            if isinstance(v, dict):
-                if k in batch_kwargs and isinstance(batch_kwargs[k], dict):
-                    batch_kwargs[k].update(v)
-                else:
-                    batch_kwargs[k] = v
-            else:
-                batch_kwargs[k] = v
+    def process_batch_parameters(self, reader_method=None, reader_options=None, limit=None):
+        # Note that we do not pass any parameters up, since *all* will be handled by PandasDatasource
+        batch_kwargs = super(PandasDatasource, self).process_batch_parameters()
+
+        # Apply globally-configured reader options first
+        if self._reader_options:
+            # Ensure a reader_options dict exists before merging in the configured defaults
+            if not batch_kwargs.get("reader_options"):
+                batch_kwargs["reader_options"] = dict()
+            batch_kwargs["reader_options"].update(self._reader_options)
+
+        # Then update with any locally-specified reader options
+        if reader_options:
+            if not batch_kwargs.get("reader_options"):
+                batch_kwargs["reader_options"] = dict()
+            batch_kwargs["reader_options"].update(reader_options)
+
+        if self._limit:
+            if not batch_kwargs.get("reader_options"):
+                batch_kwargs["reader_options"] = dict()
+            batch_kwargs['reader_options']['nrows'] = self._limit
+
+        if limit is not None:
+            if not batch_kwargs.get("reader_options"):
+                batch_kwargs["reader_options"] = dict()
+            batch_kwargs['reader_options']['nrows'] = limit
+
+        if self._reader_method:
+            batch_kwargs["reader_method"] = self._reader_method
+
+        if reader_method is not None:
+            batch_kwargs["reader_method"] = reader_method
+
+        return batch_kwargs
+
+    def get_batch(self, batch_kwargs, batch_parameters=None):
        # pandas cannot take unicode as a delimiter, which can happen in py2. Handle this case explicitly.
        # We handle it here so that the updated value will be in the batch_kwargs for transparency to the user. 
if PY2 and "reader_options" in batch_kwargs and "sep" in batch_kwargs['reader_options'] and \ @@ -132,38 +177,16 @@ def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs): # We will use and manipulate reader_options along the way reader_options = batch_kwargs.get("reader_options", {}) - # We need to build a batch_id to be used in the dataframe - batch_id = BatchId({ - "timestamp": time.time() + # We need to build a batch_markers to be used in the dataframe + batch_markers = BatchMarkers({ + "ge_load_time": datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ") }) - if "data_asset_type" in batch_kwargs: - data_asset_type_config = reader_options.pop("data_asset_type") # Get and remove the config - try: - data_asset_type_config = ClassConfig(**data_asset_type_config) - except TypeError: - # We tried; we'll pass the config downstream, probably as a string, and handle an error later - pass - else: - data_asset_type_config = self._data_asset_type - - data_asset_type = self._get_data_asset_class(data_asset_type_config) - - if not issubclass(data_asset_type, PandasDataset): - raise ValueError("PandasDatasource cannot instantiate batch with data_asset_type: '%s'. It " - "must be a subclass of PandasDataset." % data_asset_type.__name__) - - if "limit" in batch_kwargs: - reader_options['nrows'] = batch_kwargs['limit'] - if "path" in batch_kwargs: path = batch_kwargs['path'] reader_method = batch_kwargs.get("reader_method") - reader_fn, reader_fn_options = self._get_reader_fn(reader_method, path, reader_options) - try: - df = getattr(pd, reader_fn)(path, **reader_fn_options) - except AttributeError: - raise BatchKwargsError("Unsupported reader: %s" % reader_method.name, batch_kwargs) + reader_fn = self._get_reader_fn(reader_method, path) + df = reader_fn(path, **reader_options) elif "s3" in batch_kwargs: try: @@ -176,60 +199,80 @@ def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs): url = S3Url(raw_url) logger.debug("Fetching s3 object. 
Bucket: %s Key: %s" % (url.bucket, url.key)) s3_object = s3.get_object(Bucket=url.bucket, Key=url.key) - reader_fn, reader_fn_options = self._get_reader_fn(reader_method, url.key, reader_options) - - try: - df = getattr(pd, reader_fn)( - StringIO(s3_object["Body"].read().decode(s3_object.get("ContentEncoding", "utf-8"))), - **reader_fn_options - ) - except AttributeError: - raise BatchKwargsError("Unsupported reader: %s" % reader_method.name, batch_kwargs) - except IOError: - raise + reader_fn = self._get_reader_fn(reader_method, url.key) + df = reader_fn( + StringIO(s3_object["Body"].read().decode(s3_object.get("ContentEncoding", "utf-8"))), + **reader_options + ) elif "dataset" in batch_kwargs and isinstance(batch_kwargs["dataset"], (pd.DataFrame, pd.Series)): df = batch_kwargs.get("dataset") # We don't want to store the actual dataframe in kwargs; copy the remaining batch_kwargs batch_kwargs = {k: batch_kwargs[k] for k in batch_kwargs if k != 'dataset'} - # Record this in the kwargs *and* the id batch_kwargs["PandasInMemoryDF"] = True - batch_id["PandasInMemoryDF"] = True + batch_kwargs["ge_batch_id"] = str(uuid.uuid1()) else: raise BatchKwargsError("Invalid batch_kwargs: path, s3, or df is required for a PandasDatasource", batch_kwargs) if df.memory_usage().sum() < HASH_THRESHOLD: - batch_id["fingerprint"] = hashlib.md5(pd.util.hash_pandas_object(df, index=True).values).hexdigest() - return data_asset_type(df, - expectation_suite=expectation_suite, - data_context=self._data_context, - batch_kwargs=batch_kwargs, - batch_id=batch_id) - - def _get_reader_fn(self, reader_method, path, reader_options): + batch_markers["pandas_data_fingerprint"] = hashlib.md5(pd.util.hash_pandas_object( + df, index=True).values).hexdigest() + + return Batch( + datasource_name=self.name, + batch_kwargs=batch_kwargs, + data=df, + batch_parameters=batch_parameters, + batch_markers=batch_markers, + data_context=self._data_context + ) + + @staticmethod + def guess_reader_method_from_path(path): + if path.endswith(".csv") or path.endswith(".tsv"): + return {"reader_method": "read_csv"} + elif path.endswith(".parquet"): + return {"reader_method": "read_parquet"} + elif path.endswith(".xlsx") or path.endswith(".xls"): + return {"reader_method": "read_excel"} + elif path.endswith(".json"): + return {"reader_method": "read_json"} + elif path.endswith(".pkl"): + return {"reader_method": "read_pickle"} + elif path.endswith(".csv.gz") or path.endswith(".csv.gz"): + return {"reader_method": "read_csv", "reader_options": {"compression": "gzip"}} + + raise BatchKwargsError("Unable to determine reader method from path: %s" % path, {"path": path}) + + def _get_reader_fn(self, reader_method=None, path=None): + """Static helper for parsing reader types. If reader_method is not provided, path will be used to guess the + correct reader_method. + + Args: + reader_method (str): the name of the reader method to use, if available. 
+            path (str): the path to use to guess the reader_method
+
+        Returns:
+            The pandas reader function to use for the filepath
+
+        """
+        if reader_method is None and path is None:
+            raise BatchKwargsError("Unable to determine pandas reader function without reader_method or path.",
+                                   {"reader_method": reader_method})
+
+        reader_options = None
        if reader_method is None:
-            reader_method = self._guess_reader_method_from_path(path)
-            if reader_method is None:
-                raise BatchKwargsError("Unable to determine reader for path: %s" % path, reader_options)
-        else:
-            try:
-                reader_method = ReaderMethods[reader_method]
-            except KeyError:
-                raise BatchKwargsError("Unknown reader method: %s" % reader_method, reader_options)
-
-        if reader_method == ReaderMethods.CSV:
-            return "read_csv", reader_options
-        elif reader_method == ReaderMethods.parquet:
-            return "read_parquet", reader_options
-        elif reader_method == ReaderMethods.excel:
-            return "read_excel", reader_options
-        elif reader_method == ReaderMethods.JSON:
-            return "read_json", reader_options
-        elif reader_method == ReaderMethods.CSV_GZ:
-            return "read_csv", reader_options.update({"compression": "gzip"})
-        elif reader_method == ReaderMethods.pickle:
-            return "read_pickle", reader_options
-
-        return None
+            path_guess = self.guess_reader_method_from_path(path)
+            reader_method = path_guess["reader_method"]
+            reader_options = path_guess.get("reader_options")  # This may not be there; use None in that case
+
+        try:
+            reader_fn = getattr(pd, reader_method)
+            if reader_options:
+                reader_fn = partial(reader_fn, **reader_options)
+            return reader_fn
+        except AttributeError:
+            raise BatchKwargsError("Unable to find reader_method %s in pandas." % reader_method, {"reader_method":
+                                                                                                  reader_method})
diff --git a/great_expectations/datasource/sparkdf_datasource.py b/great_expectations/datasource/sparkdf_datasource.py
index f23d87e619fa..ea4994390fbb 100644
--- a/great_expectations/datasource/sparkdf_datasource.py
+++ b/great_expectations/datasource/sparkdf_datasource.py
@@ -1,21 +1,18 @@
 import logging
-import time
+import datetime
+import uuid

-from great_expectations.datasource.types import BatchId
+from great_expectations.datasource.types import BatchMarkers
+from ..core.batch import Batch
+from ..dataset import SparkDFDataset
 from ..exceptions import BatchKwargsError
-from .datasource import Datasource, ReaderMethods
-from great_expectations.datasource.generator.subdir_reader_generator import SubdirReaderGenerator
-from great_expectations.datasource.generator.databricks_generator import DatabricksTableGenerator
-from great_expectations.datasource.generator.in_memory_generator import InMemoryGenerator
-from great_expectations.datasource.generator.s3_generator import S3Generator
-
+from .datasource import Datasource
 from great_expectations.types import ClassConfig

 logger = logging.getLogger(__name__)

 try:
-    from great_expectations.dataset.sparkdf_dataset import SparkDFDataset
     from pyspark.sql import SparkSession, DataFrame
 except ImportError:
     SparkSession = None
@@ -26,7 +23,13 @@

 class SparkDFDatasource(Datasource):
     """The SparkDFDatasource produces SparkDFDatasets and supports generators capable of interacting with local
    filesystem (the default subdir_reader generator) and databricks notebooks. 
+
+    Accepted Batch Kwargs:
+        - PathBatchKwargs ("path" or "s3" keys)
+        - InMemoryBatchKwargs ("dataset" key)
+        - QueryBatchKwargs ("query" key)
    """
+    recognized_batch_parameters = {'reader_method', 'reader_options', 'limit'}

    @classmethod
    def build_configuration(cls, data_asset_type=None, generators=None, spark_config=None, **kwargs):
@@ -43,18 +46,24 @@
            A complete datasource configuration.

        """
-        if generators is None:
-            # Provide a gentle way to build a datasource with a sane default,
-            # including ability to specify the base_directory
-            base_directory = kwargs.pop("base_directory", "/data")
-            reader_options = kwargs.pop("reader_options", {})
-            generators = {
-                "default": {
-                    "class_name": "SubdirReaderGenerator",
-                    "base_directory": base_directory,
-                    "reader_options": reader_options
-                }
-            }
+        # No more default generators
+
+        # PENDING DELETION - JPC - 20200130
+        # if generators is None:
+        #     # Provide a gentle way to build a datasource with a sane default,
+        #     # including ability to specify the base_directory
+        #     base_directory = kwargs.pop("base_directory", "/data")
+        #     reader_options = kwargs.pop("reader_options", {})
+        #     generators = {
+        #         "default": {
+        #             "class_name": "SubdirReaderBatchKwargsGenerator",
+        #             "base_directory": base_directory,
+        #             "reader_options": reader_options
+        #         },
+        #         "passthrough": {
+        #             "class_name": "PassthroughGenerator",
+        #         }
+        #     }

        if data_asset_type is None:
            data_asset_type = ClassConfig(
@@ -73,9 +82,11 @@
        configuration = kwargs
        configuration.update({
            "data_asset_type": data_asset_type,
-            "generators": generators,
            "spark_config": spark_config
        })
+        if generators:
+            configuration["generators"] = generators
+
        return configuration

    def __init__(self, name="default", data_context=None, data_asset_type=None, generators=None,
@@ -93,7 +104,7 @@
        configuration_with_defaults = SparkDFDatasource.build_configuration(data_asset_type, generators,
                                                                            spark_config, **kwargs)
        data_asset_type = configuration_with_defaults.pop("data_asset_type")
-        generators = configuration_with_defaults.pop("generators")
+        generators = configuration_with_defaults.pop("generators", None)
        super(SparkDFDatasource, self).__init__(
            name,
            data_context=data_context,
@@ -112,84 +123,45 @@ def __init__(self, name="default", data_context=None, data_asset_type=None, gene

        self._build_generators()

-    def _get_generator_class_from_type(self, type_):
-        if type_ == "subdir_reader":
-            return SubdirReaderGenerator
-        elif type_ == "databricks":
-            return DatabricksTableGenerator
-        elif type_ == "memory":
-            return InMemoryGenerator
-        elif type_ == "s3":
-            return S3Generator
-        else:
-            raise ValueError("Unrecognized BatchGenerator type %s" % type_)
+    def process_batch_parameters(self, reader_method=None, reader_options=None, limit=None):
+        batch_kwargs = super(SparkDFDatasource, self).process_batch_parameters(limit=limit)

-    def _get_data_asset(self, batch_kwargs, expectation_suite, caching=True, **kwargs):
+        # Apply any reader options passed in with the batch parameters
+        if reader_options:
+            # Ensure a reader_options dict exists before updating
+            if not batch_kwargs.get("reader_options"):
+                batch_kwargs["reader_options"] = dict()
+            batch_kwargs["reader_options"].update(reader_options)
+
+        if reader_method is not None:
+            batch_kwargs["reader_method"] = reader_method
+
+        return batch_kwargs
+
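A minimal sketch of the merge logic in `process_batch_parameters` just above, written as a standalone function. The empty dict stands in for the base-class call, which is assumed here to contribute only a `limit` key when one is provided:

```python
def process_batch_parameters(reader_method=None, reader_options=None, limit=None):
    # Stand-in for super().process_batch_parameters(limit=limit).
    batch_kwargs = {}
    if limit is not None:
        batch_kwargs["limit"] = limit
    # Fold any provided reader options into a reader_options dict...
    if reader_options:
        batch_kwargs.setdefault("reader_options", {}).update(reader_options)
    # ...and record an explicit reader_method when one was given.
    if reader_method is not None:
        batch_kwargs["reader_method"] = reader_method
    return batch_kwargs


print(process_batch_parameters(reader_method="csv",
                               reader_options={"header": "true"},
                               limit=100))
# {'limit': 100, 'reader_options': {'header': 'true'}, 'reader_method': 'csv'}
```

(get_batch, shown next, then applies `limit` via `df.limit(...)` for Spark rather than through reader options.)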
+    def get_batch(self, batch_kwargs, batch_parameters=None):
        """Build a Batch of data from the provided batch_kwargs (path, s3, query, or dataset)."""
        if self.spark is None:
            logger.error("No spark session available")
            return None

-        for k, v in kwargs.items():
-            if isinstance(v, dict):
-                if k in batch_kwargs and isinstance(batch_kwargs[k], dict):
-                    batch_kwargs[k].update(v)
-                else:
-                    batch_kwargs[k] = v
-            else:
-                batch_kwargs[k] = v
-
        reader_options = batch_kwargs.get("reader_options", {})

-        # We need to build a batch_id to be used in the dataframe
-        batch_id = BatchId({
-            "timestamp": time.time()
+        # We need to build batch_markers to be used with the DataFrame
+        batch_markers = BatchMarkers({
+            "ge_load_time": datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")
        })

-        if "data_asset_type" in batch_kwargs:
-            data_asset_type_config = reader_options.pop("data_asset_type")  # Get and remove the config
-            try:
-                data_asset_type_config = ClassConfig(**data_asset_type_config)
-            except TypeError:
-                # We tried; we'll pass the config downstream, probably as a string, and handle an error later
-                pass
-        else:
-            data_asset_type_config = self._data_asset_type
-
-        data_asset_type = self._get_data_asset_class(data_asset_type_config)
-
-        if not issubclass(data_asset_type, SparkDFDataset):
-            raise ValueError("SparkDFDatasource cannot instantiate batch with data_asset_type: '%s'. It "
-                             "must be a subclass of SparkDFDataset." % data_asset_type.__name__)
-
        if "path" in batch_kwargs or "s3" in batch_kwargs:
            # If both are present, let s3 override path
            path = batch_kwargs.get("path")
            path = batch_kwargs.get("s3", path)
            reader_method = batch_kwargs.get("reader_method")
-            if reader_method is None:
-                reader_method = self._guess_reader_method_from_path(path)
-                if reader_method is None:
-                    raise BatchKwargsError("Unable to determine reader for path: %s" % path, batch_kwargs)
-            else:
-                try:
-                    reader_method = ReaderMethods[reader_method]
-                except KeyError:
-                    raise BatchKwargsError("Unknown reader method: %s" % reader_method, batch_kwargs)
-
            reader = self.spark.read

            for option in reader_options.items():
                reader = reader.option(*option)
-
-            if reader_method == ReaderMethods.CSV:
-                df = reader.csv(path)
-            elif reader_method == ReaderMethods.parquet:
-                df = reader.parquet(path)
-            elif reader_method == ReaderMethods.delta:
-                df = reader.format("delta").load(path)
-            else:
-                raise BatchKwargsError("Unsupported reader: %s" % reader_method.name, batch_kwargs)
+            reader_fn = self._get_reader_fn(reader, reader_method, path)
+            df = reader_fn(path)

        elif "query" in batch_kwargs:
            df = self.spark.sql(batch_kwargs["query"])
@@ -203,7 +175,7 @@ def _get_data_asset(self, batch_kwargs, expectation_suite, caching=True, **kwarg
            df = df.spark_df
            # Record this in the kwargs and assign a batch id
            batch_kwargs["SparkDFRef"] = True
-            batch_id["SparkDFRef"] = True
+            batch_kwargs["ge_batch_id"] = str(uuid.uuid1())
        else:
            raise BatchKwargsError("Unrecognized batch_kwargs for spark_source", batch_kwargs)

@@ -211,9 +183,48 @@ def _get_data_asset(self, batch_kwargs, expectation_suite, caching=True, **kwarg
        if "limit" in batch_kwargs:
            df = df.limit(batch_kwargs['limit'])

-        return data_asset_type(df,
-                               expectation_suite=expectation_suite,
-                               data_context=self._data_context,
-                               batch_kwargs=batch_kwargs,
-                               caching=caching,
-                               batch_id=batch_id)
+        return Batch(
+            datasource_name=self.name,
+            batch_kwargs=batch_kwargs,
+            data=df,
+            batch_parameters=batch_parameters,
+            batch_markers=batch_markers,
+            data_context=self._data_context
+        )
+
+    @staticmethod
+    def guess_reader_method_from_path(path):
+        if 
path.endswith(".csv") or path.endswith(".tsv"): + return {"reader_method": "csv"} + elif path.endswith(".parquet"): + return {"reader_method": "parquet"} + + raise BatchKwargsError("Unable to determine reader method from path: %s" % path, {"path": path}) + + def _get_reader_fn(self, reader, reader_method=None, path=None): + """Static helper for providing reader_fn + + Args: + reader: the base spark reader to use; this should have had reader_options applied already + reader_method: the name of the reader_method to use, if specified + path (str): the path to use to guess reader_method if it was not specified + + Returns: + ReaderMethod to use for the filepath + + """ + if reader_method is None and path is None: + raise BatchKwargsError("Unable to determine spark reader function without reader_method or path.", + {"reader_method": reader_method}) + + if reader_method is None: + reader_method = self.guess_reader_method_from_path(path=path)["reader_method"] + + try: + if reader_method.lower() == "delta": + return reader.format("delta").load + + return getattr(reader, reader_method) + except AttributeError: + raise BatchKwargsError("Unable to find reader_method %s in spark." % reader_method, + {"reader_method": reader_method}) diff --git a/great_expectations/datasource/sqlalchemy_datasource.py b/great_expectations/datasource/sqlalchemy_datasource.py index 4c7b15ee0447..4f9568169605 100644 --- a/great_expectations/datasource/sqlalchemy_datasource.py +++ b/great_expectations/datasource/sqlalchemy_datasource.py @@ -1,13 +1,18 @@ -import time import logging +import datetime from string import Template from great_expectations.datasource import Datasource -from great_expectations.datasource.types import BatchId -from great_expectations.dataset.sqlalchemy_dataset import SqlAlchemyDataset -from .generator.query_generator import QueryGenerator +from great_expectations.datasource.types import ( + SqlAlchemyDatasourceQueryBatchKwargs, + SqlAlchemyDatasourceTableBatchKwargs, + BatchMarkers +) +from great_expectations.dataset.sqlalchemy_dataset import SqlAlchemyBatchReference from great_expectations.exceptions import DatasourceInitializationError from great_expectations.types import ClassConfig +from great_expectations.core.batch import Batch +from great_expectations.core.util import nested_update logger = logging.getLogger(__name__) @@ -29,6 +34,7 @@ class SqlAlchemyDatasource(Datasource): that query. The query can be parameterized according to the standard python Template engine, which uses $parameter, with additional kwargs passed to the get_batch method. """ + recognized_batch_parameters = {'query_parameters', 'limit'} @classmethod def build_configuration(cls, data_asset_type=None, generators=None, **kwargs): @@ -44,12 +50,16 @@ def build_configuration(cls, data_asset_type=None, generators=None, **kwargs): A complete datasource configuration. 
""" - if generators is None: - generators = { - "default": { - "class_name": "TableGenerator" - } - } + + # As of 0.9.0, we do not require generators be configured + # generators = { + # "default": { + # "class_name": "TableBatchKwargsGenerator" + # }, + # "passthrough": { + # "class_name": "PassthroughGenerator", + # } + # } if data_asset_type is None: data_asset_type = ClassConfig( @@ -62,10 +72,10 @@ def build_configuration(cls, data_asset_type=None, generators=None, **kwargs): pass configuration = kwargs - configuration.update({ - "data_asset_type": data_asset_type, - "generators": generators, - }) + configuration["data_asset_type"] = data_asset_type + if generators is not None: + configuration["generators"] = generators + return configuration def __init__(self, name="default", data_context=None, data_asset_type=None, credentials=None, generators=None, **kwargs): @@ -74,7 +84,7 @@ def __init__(self, name="default", data_context=None, data_asset_type=None, cred configuration_with_defaults = SqlAlchemyDatasource.build_configuration(data_asset_type, generators, **kwargs) data_asset_type = configuration_with_defaults.pop("data_asset_type") - generators = configuration_with_defaults.pop("generators") + generators = configuration_with_defaults.pop("generators", None) super(SqlAlchemyDatasource, self).__init__( name, data_context=data_context, @@ -139,43 +149,10 @@ def _get_sqlalchemy_connection_options(self, **kwargs): return options, drivername - def _get_generator_class_from_type(self, type_): - if type_ == "queries": - return QueryGenerator - else: - raise ValueError("Unrecognized DataAssetGenerator type %s" % type_) - - def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs): - for k, v in kwargs.items(): - if isinstance(v, dict): - if k in batch_kwargs and isinstance(batch_kwargs[k], dict): - batch_kwargs[k].update(v) - else: - batch_kwargs[k] = v - else: - batch_kwargs[k] = v - - if "data_asset_type" in batch_kwargs: - # Sqlalchemy does not use reader_options or need to remove batch_kwargs since it does not pass - # options through to a later reader - data_asset_type_config = batch_kwargs["data_asset_type"] - try: - data_asset_type_config = ClassConfig(**data_asset_type_config) - except TypeError: - # We tried; we'll pass the config downstream, probably as a string, and handle an error later - pass - else: - data_asset_type_config = self._data_asset_type - - data_asset_type = self._get_data_asset_class(data_asset_type_config) - - if not issubclass(data_asset_type, SqlAlchemyDataset): - raise ValueError("SqlAlchemyDatasource cannot instantiate batch with data_asset_type: '%s'. It " - "must be a subclass of SqlAlchemyDataset." 
% data_asset_type.__name__)
-
+    def get_batch(self, batch_kwargs, batch_parameters=None):
        # We need to build batch_markers to be used with the batch
-        batch_id = BatchId({
-            "timestamp": time.time()
+        batch_markers = BatchMarkers({
+            "ge_load_time": datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if "schema" in batch_kwargs:
@@ -193,25 +170,11 @@ def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs):
                .offset(offset)\
                .limit(limit)
            query = str(raw_query.compile(self.engine, compile_kwargs={"literal_binds": True}))
-            return data_asset_type(
-                custom_sql=query,
-                engine=self.engine,
-                data_context=self._data_context,
-                expectation_suite=expectation_suite,
-                batch_kwargs=batch_kwargs,
-                batch_id=batch_id
-            )
-
+            batch_reference = SqlAlchemyBatchReference(engine=self.engine, query=query,
+                                                       schema=batch_kwargs.get("schema"))
        else:
-            return data_asset_type(
-                table_name=batch_kwargs["table"],
-                engine=self.engine,
-                schema=schema,
-                data_context=self._data_context,
-                expectation_suite=expectation_suite,
-                batch_kwargs=batch_kwargs,
-                batch_id=batch_id
-            )
+            batch_reference = SqlAlchemyBatchReference(engine=self.engine, table_name=batch_kwargs["table"],
+                                                       schema=batch_kwargs.get("schema"))

        elif "query" in batch_kwargs:
            if "limit" in batch_kwargs or "offset" in batch_kwargs:
@@ -221,17 +184,27 @@ def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs):
                table_name = batch_kwargs.get("bigquery_temp_table")
            else:
                table_name = None
-
-            query = Template(batch_kwargs["query"]).safe_substitute(**kwargs)
-            return data_asset_type(
-                custom_sql=query,
-                engine=self.engine,
-                table_name=table_name,
-                data_context=self._data_context,
-                expectation_suite=expectation_suite,
-                batch_kwargs=batch_kwargs,
-                batch_id=batch_id
-            )
+
+            if "query_parameters" in batch_kwargs:
+                query = Template(batch_kwargs["query"]).safe_substitute(batch_kwargs["query_parameters"])
+            else:
+                query = batch_kwargs["query"]
+            batch_reference = SqlAlchemyBatchReference(engine=self.engine, query=query, table_name=table_name,
+                                                       schema=batch_kwargs.get("schema"))
        else:
            raise ValueError("Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified")
+
+        return Batch(
+            datasource_name=self.name,
+            batch_kwargs=batch_kwargs,
+            data=batch_reference,
+            batch_parameters=batch_parameters,
+            batch_markers=batch_markers,
+            data_context=self._data_context
+        )
+
+    def process_batch_parameters(self, query_parameters=None, limit=None):
+        batch_kwargs = super(SqlAlchemyDatasource, self).process_batch_parameters(limit=limit)
+        nested_update(batch_kwargs, {"query_parameters": query_parameters})
+        return batch_kwargs
diff --git a/great_expectations/datasource/types/__init__.py b/great_expectations/datasource/types/__init__.py
index af41aafea23a..2030b5317ab2 100644
--- a/great_expectations/datasource/types/__init__.py
+++ b/great_expectations/datasource/types/__init__.py
@@ -1,2 +1 @@
 from .batch_kwargs import *
-from .reader_methods import ReaderMethods
diff --git a/great_expectations/datasource/types/batch_kwargs.py b/great_expectations/datasource/types/batch_kwargs.py
index a08e3417921a..c1ef7a4f6094 100644
--- a/great_expectations/datasource/types/batch_kwargs.py
+++ b/great_expectations/datasource/types/batch_kwargs.py
@@ -1,96 +1,31 @@
 import logging
-import copy
-from hashlib import md5
-import datetime

-import pandas as pd
-from six import string_types

-from great_expectations.types import RequiredKeysDotDict, ClassConfig
-from great_expectations.data_context.types.base_resource_identifiers import 
OrderedDataContextKey +# PYTHON 2 - py2 - update to ABC direct use rather than __metaclass__ once we drop py2 support +from abc import ABCMeta -try: - import pyspark -except ImportError: - pyspark = None +from great_expectations.core.data_context_key import DataContextKey +from great_expectations.core.id_dict import BatchKwargs +from great_expectations.exceptions import InvalidBatchKwargsError, InvalidBatchIdError logger = logging.getLogger(__name__) -class BatchFingerprint(OrderedDataContextKey): - _allowed_keys = OrderedDataContextKey._allowed_keys | { - "partition_id", - "fingerprint" - } - _required_keys = OrderedDataContextKey._required_keys | { - "partition_id", - "fingerprint" - } - _key_types = copy.copy(OrderedDataContextKey._key_types) - _key_types.update({ - "partition_id": string_types, - "fingerprint": string_types - }) - _key_order = copy.copy(OrderedDataContextKey._key_order) - _key_order.extend(["partition_id", "fingerprint"]) - - -class BatchKwargs(RequiredKeysDotDict): - """BatchKwargs represent information required by a datasource to fetch a batch of data. - - BatchKwargs are usually generated by BatchGenerator objects and interpreted by Datasource objects. - """ - _required_keys = set() - _partition_id_key = "partition_id" # a partition id can be used as shorthand to access a batch of data - - # _batch_fingerprint_ignored_keys makes it possible to define keys which, if present, are ignored for purposes - # of determining the unique batch id, such that batches differing only in the value in these keys are given - # the same id - _batch_fingerprint_ignored_keys = { - "data_asset_type" - } - _key_types = { - "data_asset_type": ClassConfig - } - - @property - def batch_fingerprint(self): - partition_id = self.get(self._partition_id_key, None) - # We do not allow a "None" partition_id, even if it's explicitly present as such in batch_kwargs - if partition_id is None: - partition_id = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ") - id_keys = (set(self.keys()) - set(self._batch_fingerprint_ignored_keys)) - {self._partition_id_key} - if len(id_keys) == 1: - key = list(id_keys)[0] - hash_ = key + ":" + str(self[key]) - else: - hash_dict = {k: self[k] for k in id_keys} - hash_ = md5(str(sorted(hash_dict.items())).encode("utf-8")).hexdigest() - - return BatchFingerprint(partition_id=partition_id, fingerprint=hash_) - - @classmethod - def build_batch_fingerprint(cls, dict_): - try: - return BatchKwargs(dict_).batch_fingerprint - except (KeyError, TypeError): - logger.warning("Unable to build BatchKwargs from provided dictionary.") - return None - - -class BatchId(BatchKwargs): - """A BatchId is a special type of BatchKwargs (so that it has a batch_fingerprint) but it generally does +class BatchMarkers(BatchKwargs): + """A BatchMarkers is a special type of BatchKwargs (so that it has a batch_fingerprint) but it generally does NOT require specific keys and instead captures information about the OUTPUT of a datasource's fetch process, such as the timestamp at which a query was executed.""" - _required_keys = BatchKwargs._required_keys | { - "timestamp" - } - _key_types = copy.copy(BatchKwargs._key_types) - _key_types.update({ - "timestamp": float - }) + def __init__(self, *args, **kwargs): + super(BatchMarkers, self).__init__(*args, **kwargs) + if "ge_load_time" not in self: + raise InvalidBatchIdError("BatchMarkers requires a ge_load_time") + + @property + def ge_load_time(self): + return self.get("ge_load_time") class PandasDatasourceBatchKwargs(BatchKwargs): + 
__metaclass__ = ABCMeta
    """This is an abstract class and should not be instantiated. It's relevant for testing whether a subclass is
    allowed
    """
@@ -98,6 +33,7 @@ class PandasDatasourceBatchKwargs(BatchKwargs):


 class SparkDFDatasourceBatchKwargs(BatchKwargs):
+    __metaclass__ = ABCMeta
    """This is an abstract class and should not be instantiated. It's relevant for testing whether a subclass is
    allowed
    """
@@ -105,79 +41,112 @@ class SparkDFDatasourceBatchKwargs(BatchKwargs):


 class SqlAlchemyDatasourceBatchKwargs(BatchKwargs):
+    __metaclass__ = ABCMeta
    """This is an abstract class and should not be instantiated. It's relevant for testing whether a subclass is
    allowed
    """
-    pass
+    @property
+    def limit(self):
+        return self.get("limit")
+
+    @property
+    def schema(self):
+        return self.get("schema")


 class PathBatchKwargs(PandasDatasourceBatchKwargs, SparkDFDatasourceBatchKwargs):
-    """PathBatchKwargs represents kwargs suitable for reading a file from a given path."""
-    _required_keys = {
-        "path"
-    }
-    _key_types = {
-        "path": string_types,
-        "reader_method": string_types
-    }
+    def __init__(self, *args, **kwargs):
+        super(PathBatchKwargs, self).__init__(*args, **kwargs)
+        if "path" not in self:
+            raise InvalidBatchKwargsError("PathBatchKwargs requires a path element")
+
+    @property
+    def path(self):
+        return self.get("path")
+
+    @property
+    def reader_method(self):
+        return self.get("reader_method")


 class S3BatchKwargs(PandasDatasourceBatchKwargs, SparkDFDatasourceBatchKwargs):
-    """PathBatchKwargs represents kwargs suitable for reading a file from a given path."""
-    _required_keys = {
-        "s3"
-    }
-    _key_types = {
-        "s3": string_types,
-        "reader_method": string_types
-    }
+    def __init__(self, *args, **kwargs):
+        super(S3BatchKwargs, self).__init__(*args, **kwargs)
+        if "s3" not in self:
+            raise InvalidBatchKwargsError("S3BatchKwargs requires an 's3' element")
+
+    @property
+    def s3(self):
+        return self.get("s3")
+
+    @property
+    def reader_method(self):
+        return self.get("reader_method")


 class InMemoryBatchKwargs(PandasDatasourceBatchKwargs, SparkDFDatasourceBatchKwargs):
-    _required_keys = {
-        "dataset"
-    }
+    def __init__(self, *args, **kwargs):
+        super(InMemoryBatchKwargs, self).__init__(*args, **kwargs)
+        if "dataset" not in self:
+            raise InvalidBatchKwargsError("InMemoryBatchKwargs requires a 'dataset' element")
+
+    @property
+    def dataset(self):
+        return self.get("dataset")


-class PandasDatasourceMemoryBatchKwargs(InMemoryBatchKwargs):
-    _key_types = {
-        "dataset": pd.DataFrame
-    }
+class PandasDatasourceInMemoryBatchKwargs(InMemoryBatchKwargs):
+    def __init__(self, *args, **kwargs):
+        super(PandasDatasourceInMemoryBatchKwargs, self).__init__(*args, **kwargs)
+        import pandas as pd
+        if not isinstance(self["dataset"], pd.DataFrame):
+            raise InvalidBatchKwargsError("PandasDatasourceInMemoryBatchKwargs 'dataset' must be a pandas DataFrame")


-class SparkDFDatasourceMemoryBatchKwargs(InMemoryBatchKwargs):
-    try:
-        _key_types = {
-            "dataset": pyspark.sql.DataFrame
-        }
-    except AttributeError:
-        _key_types = {
-            "dataset": None  # If we were unable to import pyspark, these are invalid
-        }
+class SparkDFDatasourceInMemoryBatchKwargs(InMemoryBatchKwargs):
+    def __init__(self, *args, **kwargs):
+        super(SparkDFDatasourceInMemoryBatchKwargs, self).__init__(*args, **kwargs)
+        try:
+            import pyspark
+        except ImportError:
+            raise InvalidBatchKwargsError(
+                "SparkDFDatasourceInMemoryBatchKwargs requires a valid pyspark installation, but pyspark import failed. 
+ ) + if not isinstance(self["dataset"], pyspark.sql.DataFrame): + raise InvalidBatchKwargsError("SparkDFDatasourceInMemoryBatchKwargs 'dataset' must be a spark DataFrame") class SqlAlchemyDatasourceTableBatchKwargs(SqlAlchemyDatasourceBatchKwargs): - _required_keys = { - "table" - } - _key_types = { - "table": string_types - } + def __init__(self, *args, **kwargs): + super(SqlAlchemyDatasourceTableBatchKwargs, self).__init__(*args, **kwargs) + if "table" not in self: + raise InvalidBatchKwargsError("SqlAlchemyDatasourceTableBatchKwargs requires a 'table' element") + + @property + def table(self): + return self.get("table") class SqlAlchemyDatasourceQueryBatchKwargs(SqlAlchemyDatasourceBatchKwargs): - _required_keys = { - "query" - } - _key_types = { - "query": string_types - } + def __init__(self, *args, **kwargs): + super(SqlAlchemyDatasourceQueryBatchKwargs, self).__init__(*args, **kwargs) + if "query" not in self: + raise InvalidBatchKwargsError("SqlAlchemyDatasourceQueryBatchKwargs requires a 'query' element") + + @property + def query(self): + return self.get("query") + + @property + def query_parameters(self): + return self.get("query_parameters") class SparkDFDatasourceQueryBatchKwargs(SparkDFDatasourceBatchKwargs): - _required_keys = { - "query" - } - _key_types = { - "query": string_types - } + def __init__(self, *args, **kwargs): + super(SparkDFDatasourceQueryBatchKwargs, self).__init__(*args, **kwargs) + if "query" not in self: + raise InvalidBatchKwargsError("SparkDFDatasourceQueryBatchKwargs requires a 'query' element") + + @property + def query(self): + return self.get("query") \ No newline at end of file diff --git a/great_expectations/datasource/types/reader_methods.py b/great_expectations/datasource/types/reader_methods.py index a091ded261f2..e69de29bb2d1 100644 --- a/great_expectations/datasource/types/reader_methods.py +++ b/great_expectations/datasource/types/reader_methods.py @@ -1,15 +0,0 @@ -from enum import Enum - - -class ReaderMethods(Enum): - CSV = 1 - csv = 1 - parquet = 2 - excel = 3 - xls = 3 - xlsx = 3 - JSON = 4 - json = 4 - delta = 5 - CSV_GZ = 6 - pickle = 7 diff --git a/great_expectations/exceptions.py b/great_expectations/exceptions.py index 0c6b46bf4178..04ed6fd1f8f4 100644 --- a/great_expectations/exceptions.py +++ b/great_expectations/exceptions.py @@ -1,15 +1,33 @@ -import os +from marshmallow import ValidationError class GreatExpectationsError(Exception): def __init__(self, message): - self.message = message + self.message = message + + +class GreatExpectationsValidationError(ValidationError, GreatExpectationsError): + def __init__(self, message, validation_error): + self.message = message + self.messages = validation_error.messages + self.data = validation_error.data + self.field_names = validation_error.field_names + self.fields = validation_error.fields + self.kwargs = validation_error.kwargs class DataContextError(GreatExpectationsError): pass +class UnavailableMetricError(GreatExpectationsError): + pass + + +class ParserError(GreatExpectationsError): + pass + + class InvalidConfigurationYamlError(GreatExpectationsError): pass @@ -18,15 +36,23 @@ class InvalidTopLevelConfigKeyError(GreatExpectationsError): pass -class MissingTopLevelConfigKeyError(GreatExpectationsError): +class MissingTopLevelConfigKeyError(GreatExpectationsValidationError): pass -class InvalidConfigValueTypeError(GreatExpectationsError): +class InvalidDataContextConfigError(GreatExpectationsValidationError): pass -class InvalidConfigVersionError(GreatExpectationsError): 
+class InvalidBatchKwargsError(GreatExpectationsError): + pass + + +class InvalidBatchIdError(GreatExpectationsError): + pass + + +class InvalidDataContextKeyError(DataContextError): pass @@ -47,6 +73,32 @@ def __init__(self, message): self.message = message +class AmbiguousDataAssetNameError(DataContextError): + def __init__(self, message, candidates=None): + self.message = message + self.candidates = candidates + + +class StoreConfigurationError(DataContextError): + pass + + +class InvalidExpectationKwargsError(GreatExpectationsError): + pass + + +class InvalidExpectationConfigurationError(GreatExpectationsError): + pass + + +class InvalidValidationResultError(GreatExpectationsError): + pass + + +class GreatExpectationsTypeError(TypeError): + pass + + class ConfigNotFoundError(DataContextError): """The great_expectations dir could not be found.""" def __init__(self): @@ -74,22 +126,55 @@ def __init__(self, module_name): ) -class PluginClassNotFoundError(GreatExpectationsError, AttributeError): +class PluginClassNotFoundError(DataContextError, AttributeError): """A module import failed.""" def __init__(self, module_name, class_name): - template = """Error: The module: `{}` does not contain the class: `{}`. - - Please verify this class name `{}`. -""" - self.message = template.format(module_name, class_name, class_name) + class_name_changes = { + "FixedLengthTupleFilesystemStoreBackend": "TupleFilesystemStoreBackend", + "FixedLengthTupleS3StoreBackend": "TupleS3StoreBackend", + "FixedLengthTupleGCSStoreBackend": "TupleGCSStoreBackend", + "InMemoryEvaluationParameterStore": "EvaluationParameterStore", + "DatabricksTableGenerator": "DatabricksTableBatchKwargsGenerator", + "GlobReaderGenerator": "GlobReaderBatchKwargsGenerator", + "SubdirReaderGenerator": "SubdirReaderBatchKwargsGenerator", + "QueryGenerator": "QueryBatchKwargsGenerator", + "TableGenerator": "TableBatchKwargsGenerator", + "S3Generator": "S3GlobReaderBatchKwargsGenerator", + "ExtractAndStoreEvaluationParamsAction": "StoreEvaluationParametersAction", + "StoreAction": "StoreValidationResultAction" + } + + if class_name_changes.get(class_name): + template = """Error: The module: `{}` does not contain the class: `{}`. + The class name `{}` has changed to `{}`.""" + self.message = template.format( + module_name, + class_name, + class_name, + class_name_changes.get(class_name) + ) + else: + template = """Error: The module: `{}` does not contain the class: `{}`. 
+ - Please verify this class name `{}`.""" + self.message = template.format(module_name, class_name, class_name) colored_template = "" + template + "" module_snippet = "" + module_name + "" class_snippet = "" + class_name + "" - self.cli_colored_message = colored_template.format( - module_snippet, - class_snippet, - class_snippet, - ) + if class_name_changes.get(class_name): + new_class_snippet = "" + class_name_changes.get(class_name) + "" + self.cli_colored_message = colored_template.format( + module_snippet, + class_snippet, + class_snippet, + new_class_snippet + ) + else: + self.cli_colored_message = colored_template.format( + module_snippet, + class_snippet, + class_snippet, + ) class ExpectationSuiteNotFoundError(GreatExpectationsError): @@ -99,7 +184,7 @@ def __init__(self, data_asset_name): class BatchKwargsError(DataContextError): - def __init__(self, message, batch_kwargs): + def __init__(self, message, batch_kwargs=None): self.message = message self.batch_kwargs = batch_kwargs @@ -107,3 +192,7 @@ def __init__(self, message, batch_kwargs): class DatasourceInitializationError(GreatExpectationsError): def __init__(self, datasource_name, message): self.message = "Cannot initialize datasource %s, error: %s" % (datasource_name, message) + + +class InvalidConfigValueTypeError(DataContextError): + pass diff --git a/tests/store/__init__.py b/great_expectations/expectation_suite.py similarity index 100% rename from tests/store/__init__.py rename to great_expectations/expectation_suite.py diff --git a/great_expectations/init_notebooks/pandas/create_expectations.ipynb b/great_expectations/init_notebooks/pandas/create_expectations.ipynb deleted file mode 100644 index 640b4dcac867..000000000000 --- a/great_expectations/init_notebooks/pandas/create_expectations.ipynb +++ /dev/null @@ -1,349 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Author Expectations\n", - "\n", - "Watch a [short tutorial video](https://greatexpectations.io/videos/getting_started/create_expectations?utm_source=notebook&utm_medium=create_expectations) or read [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations)\n", - "\n", - "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import os\n", - "import great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#get-a-datacontext-object)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context = ge.data_context.DataContext()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. List the CSVs in your folder\n", - "\n", - "The `DataContext` will now introspect your pandas `Datasource` and list the CSVs it finds. 
[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#list-data-assets)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "great_expectations.jupyter_ux.list_available_data_asset_names(context)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Pick a CSV and set the expectation suite name\n", - "\n", - "Internally, Great Expectations represents CSVs and dataframes as `DataAsset`s and uses this notion to link them to `Expectation Suites`. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#pick-a-data-asset-and-set-the-expectation-suite-name)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_asset_name = \"ONE_OF_THE_CSV_DATA_ASSET_NAMES_FROM_ABOVE\" # TODO: replace with your value!\n", - "normalized_data_asset_name = context.normalize_data_asset_name(data_asset_name)\n", - "normalized_data_asset_name" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We recommend naming your first expectation suite for a table `warning`. Later, as you identify some of the expectations that you add to this suite as critical, you can move these expectations into another suite and call it `failure`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "expectation_suite_name = \"warning\" # TODO: replace with your value!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Create a new empty expectation suite" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.create_expectation_suite(data_asset_name=data_asset_name, expectation_suite_name=expectation_suite_name, overwrite_existing=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Load a batch of data you want to use to create `Expectations`\n", - "\n", - "To learn more about `get_batch` with other data types (such as existing pandas dataframes, SQL tables or Spark), see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#load-a-batch-of-data-to-create-expectations)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch_kwargs = context.yield_batch_kwargs(data_asset_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load a batch of data and take a peek at the first few rows." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch = context.get_batch(data_asset_name, expectation_suite_name, batch_kwargs)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Optionally, customize and review batch options\n", - "\n", - "`BatchKwargs` are extremely flexible - to learn more [read the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#load-a-batch-of-data-to-create-expectations)\n", - "\n", - "Here are the batch kwargs used to load your batch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.batch_kwargs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# The datasource can add and store additional identifying information to ensure you can track a batch through\n", - "# your pipeline\n", - "batch.batch_id" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Author Expectations\n", - "\n", - "With a batch, you can add expectations by calling specific expectation methods. They all begin with `.expect_` which makes autocompleting easy.\n", - "\n", - "See available expectations in the [expectation glossary](https://docs.greatexpectations.io/en/latest/glossary.html?utm_source=notebook&utm_medium=create_expectations).\n", - "You can also see available expectations by hovering over data elements in the HTML page generated by profiling your dataset.\n", - "\n", - "Below is an example expectation that checks if the values in the batch's first column are null.\n", - "\n", - "[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#author-expectations)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "column_name = batch.get_table_columns()[0]\n", - "batch.expect_column_values_to_not_be_null(column_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add more expectations here. **Hint** start with `batch.expect_` and hit tab for Jupyter's autocomplete to see all the expectations!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 7. Review and save your Expectations\n", - "\n", - "Expectations that are `True` on this data batch are added automatically. Let's view all the expectations you created in machine-readable JSON." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.get_expectation_suite()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " \n", - " \n", - "If you decide not to save some expectations that you created, use [remove_expectaton method](https://docs.greatexpectations.io/en/latest/module_docs/data_asset_module.html?highlight=remove_expectation&utm_source=notebook&utm_medium=create_expectations#great_expectations.data_asset.data_asset.DataAsset.remove_expectation). You can also choose not to filter expectations that were `False` on this batch.\n", - "\n", - "\n", - "The following method will save the expectation suite as a JSON file in the `great_expectations/expectations` directory of your project:\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.save_expectation_suite()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 8. View the Expectations in Data Docs\n", - "\n", - "Let's now build and look at your Data Docs. These will now include an **Expectation Suite Overview** built from the expectations you just created that helps you communicate about your data with both machines and humans." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.build_data_docs()\n", - "context.open_data_docs()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Congratulations! You created and saved Expectations\n", - "\n", - "## Next steps:\n", - "\n", - "### 1. Play with Validation\n", - "\n", - "Validation is the process of checking if new batches of this data meet to your expectations before they are processed by your pipeline. Go to [validation_playground.ipynb](validation_playground.ipynb) to see how!\n", - "\n", - "\n", - "### 2. Explore the documentation & community\n", - "\n", - "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
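
The notebook removed above was organized around normalize_data_asset_name; for contrast, here is a sketch of the equivalent authoring flow under the 0.9.0 API, using the get_batch(batch_kwargs, expectation_suite_name) convention from the updated playground notebooks below. The create_expectation_suite call with only a suite name is an assumption, and the path and names are placeholders:

import great_expectations as ge

context = ge.data_context.DataContext()
# Suites are now named directly; there is no data asset name normalization step.
context.create_expectation_suite("warning", overwrite_existing=True)  # assumed signature
batch_kwargs = {"path": "YOUR_FILE_PATH", "datasource": "YOUR_PANDAS_DATASOURCE"}
batch = context.get_batch(batch_kwargs, "warning")
batch.expect_column_values_to_not_be_null(batch.get_table_columns()[0])
batch.save_expectation_suite()
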
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/great_expectations/init_notebooks/pandas/validation_playground.ipynb b/great_expectations/init_notebooks/pandas/validation_playground.ipynb index ecca936094a4..5b57896cca3a 100644 --- a/great_expectations/init_notebooks/pandas/validation_playground.ipynb +++ b/great_expectations/init_notebooks/pandas/validation_playground.ipynb @@ -8,6 +8,10 @@ "\n", "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", "\n", + "#### This notebook assumes that you created at least one expectation suite in your project.\n", + "#### Here you will learn how to validate data loaded into a Pandas DataFrame against an expectation suite.\n", + "\n", + "\n", "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" ] }, @@ -19,7 +23,6 @@ "source": [ "import json\n", "import great_expectations as ge\n", - "from great_expectations.profile import ColumnsExistProfiler\n", "import great_expectations.jupyter_ux\n", "from great_expectations.datasource.types import BatchKwargs\n", "from datetime import datetime" @@ -30,7 +33,7 @@ "metadata": {}, "source": [ "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#get-a-datacontext-object)" + "This represents your **project** that you just created using `great_expectations init`." ] }, { @@ -46,9 +49,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. List the CSVs in your folder\n", - "\n", - "The `DataContext` will now introspect your pandas `Datasource` and list the CSVs it finds. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#list-data-assets)" + "## 2. Choose an Expectation Suite\n" ] }, { @@ -57,34 +58,10 @@ "metadata": {}, "outputs": [], "source": [ - "ge.jupyter_ux.list_available_data_asset_names(context)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Pick a csv and the expectation suite\n", + "# list expectation suites that you created in your project\n", "\n", - "Internally, Great Expectations represents csvs and dataframes as `DataAsset`s and uses this notion to link them to `Expectation Suites`. 
[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#pick-a-data-asset-and-expectation-suite)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_asset_name = \"ONE_OF_THE_CSV_DATA_ASSET_NAMES_FROM_ABOVE\" # TODO: replace with your value!\n", - "normalized_data_asset_name = context.normalize_data_asset_name(data_asset_name)\n", - "normalized_data_asset_name" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We recommend naming your first expectation suite for a table `warning`. Later, as you identify some of the expectations that you add to this suite as critical, you can move these expectations into another suite and call it `failure`. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/pipeline_integration.html?utm_source=notebook&utm_medium=integrate_validation#choose-data-asset-and-expectation-suite)" + "for expectation_suite_id in context.list_expectation_suites():\n", + " print(expectation_suite_id.expectation_suite_name)" ] }, { @@ -93,23 +70,16 @@ "metadata": {}, "outputs": [], "source": [ - "expectation_suite_name = \"warning\" # TODO: replace with your value!" + "expectation_suite_name = # TODO: set to a name from the list above" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### 3.a. If you don't have an expectation suite, let's create a simple one\n", - "\n", - "You need expectations to validate your data. Expectations are grouped into Expectation Suites. \n", - "\n", - "If you don't have an expectation suite for this data asset, the notebook's next cell will create a suite of very basic expectations, so that you have some expectations to play with. The expectation suite will have `expect_column_to_exist` expectations for each column.\n", - "\n", - "If you created an expectation suite for this data asset, you can skip executing the next cell (if you execute it, it will do nothing).\n", + "## 3. Load a batch of data you want to validate\n", "\n", - "To create a more interesting suite, open the [create_expectations.ipynb](create_expectations.ipynb) notebook.\n", - "\n" + "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" ] }, { @@ -118,25 +88,8 @@ "metadata": {}, "outputs": [], "source": [ - "try:\n", - " context.get_expectation_suite(normalized_data_asset_name, expectation_suite_name)\n", - "except great_expectations.exceptions.DataContextError:\n", - " context.create_expectation_suite(data_asset_name=normalized_data_asset_name, expectation_suite_name=expectation_suite_name, overwrite_existing=True);\n", - " batch_kwargs = context.yield_batch_kwargs(data_asset_name)\n", - " batch = context.get_batch(normalized_data_asset_name, expectation_suite_name, batch_kwargs)\n", - " ColumnsExistProfiler().profile(batch)\n", - " batch.save_expectation_suite()\n", - " expectation_suite = context.get_expectation_suite(normalized_data_asset_name, expectation_suite_name)\n", - " context.build_data_docs()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. 
Load a batch of data you want to validate\n", - "\n", - "To learn more about `get_batch` with other data types (such as existing pandas dataframes, SQL tables or Spark), see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" + "# list datasources of the type PandasDatasource in your project\n", + "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'PandasDatasource']" ] }, { @@ -145,19 +98,7 @@ "metadata": {}, "outputs": [], "source": [ - "batch_kwargs = context.yield_batch_kwargs(data_asset_name)\n", - "batch = context.get_batch(normalized_data_asset_name, expectation_suite_name, batch_kwargs)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Get a pipeline run id\n", - "\n", - "Generate a run id, a timestamp, or a meaningful string that will help you refer to validation results. We recommend they be chronologically sortable.\n", - "[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/pipeline_integration.html?utm_source=notebook&utm_medium=validate_data#set-a-run-id)" + "datasource_name = # TODO: set to a datasource name from above" ] }, { @@ -166,18 +107,22 @@ "metadata": {}, "outputs": [], "source": [ - "# Let's make a simple sortable timestamp. Note this could come from your pipeline runner.\n", - "run_id = datetime.utcnow().isoformat().replace(\":\", \"\") + \"Z\"\n", - "run_id" + "# If you would like to validate a file on a filesystem:\n", + "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", + "\n", + "# If you already loaded the data into a Pandas Data Frame:\n", + "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", + "\n", + "\n", + "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", + "batch.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 6. Validate the batch\n", - "\n", - "This is the \"workhorse\" of Great Expectations. Call it in your pipeline code after loading data and just before passing it to your computation.\n", + "## 4. Validate the batch\n", "\n", "[Read more about the validate method in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#validate-the-batch)\n" ] @@ -188,19 +133,19 @@ "metadata": {}, "outputs": [], "source": [ - "validation_result = batch.validate(run_id=run_id)\n", + "validation_result = batch.validate()\n", "\n", "if validation_result[\"success\"]:\n", - " print(\"This data meets all expectations for {}\".format(str(data_asset_name)))\n", + " print(\"This data meets all expectations in {}\".format(expectation_suite_name))\n", "else:\n", - " print(\"This data is not a valid batch of {}\".format(str(data_asset_name)))" + " print(\"This data does not meet some expectations in {}\".format(expectation_suite_name))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 6.a. OPTIONAL: Review the JSON validation results\n", + "## 4.a. OPTIONAL: Review the JSON validation results\n", "\n", "Don't worry - this blob of JSON is meant for machines. Continue on or skip this to see this in Data Docs!" ] @@ -211,14 +156,14 @@ "metadata": {}, "outputs": [], "source": [ - "# print(json.dumps(validation_result, indent=4))" + "#validation_result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 7. 
Validation Operators\n", + "## 5. Validation Operators\n", "\n", "The `validate` method evaluates one batch of data against one expectation suite and returns a dictionary of validation results. This is sufficient when you explore your data and get to know Great Expectations.\n", "When deploying Great Expectations in a **real data pipeline, you will typically discover additional needs**:\n", @@ -240,18 +185,22 @@ "source": [ "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", "\n", + "#Generate a run id, a timestamp, or a meaningful string that will help you refer to validation results. We recommend they be chronologically sortable.\n", + "# Let's make a simple sortable timestamp. Note this could come from your pipeline runner (e.g., Airflow run id).\n", + "run_id = datetime.utcnow().isoformat().replace(\":\", \"\") + \"Z\"\n", + "\n", "results = context.run_validation_operator(\n", - " assets_to_validate=[batch],\n", - " run_id=run_id,\n", - " validation_operator_name=\"action_list_operator\",\n", - ")" + " \"action_list_operator\", \n", + " assets_to_validate=[batch], \n", + " run_id=run_id)\n", + "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 8. View the Validation Results in Data Docs\n", + "## 6. View the Validation Results in Data Docs\n", "\n", "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", "\n", @@ -275,9 +224,9 @@ "\n", "## Next steps:\n", "\n", - "### 1. Author more interesting Expectations\n", + "### 1. Read about the typical workflow with Great Expectations:\n", "\n", - "Here we used some **extremely basic** `Expectations`. To really harness the power of Great Expectations you can author much more interesting and specific `Expectations` to protect your data pipelines and defeat pipeline debt. Go to [create_expectations.ipynb](create_expectations.ipynb) to see how!\n", + "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", "\n", "### 2. 
Explore the documentation & community\n", "\n", diff --git a/great_expectations/init_notebooks/spark/create_expectations.ipynb b/great_expectations/init_notebooks/spark/create_expectations.ipynb deleted file mode 100644 index bcd5e389a832..000000000000 --- a/great_expectations/init_notebooks/spark/create_expectations.ipynb +++ /dev/null @@ -1,349 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Author Expectations\n", - "\n", - "Watch a [short tutorial video](https://greatexpectations.io/videos/getting_started/create_expectations?utm_source=notebook&utm_medium=create_expectations) or read [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations)\n", - "\n", - "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import os\n", - "import great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#get-a-datacontext-object)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context = ge.data_context.DataContext()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. List the CSVs in your folder\n", - "\n", - "The `DataContext` will now introspect your pyspark `Datasource` and list the CSVs it finds. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#list-data-assets)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "great_expectations.jupyter_ux.list_available_data_asset_names(context)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Pick a CSV and set the expectation suite name\n", - "\n", - "Internally, Great Expectations represents CSVs and dataframes as `DataAsset`s and uses this notion to link them to `Expectation Suites`. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#pick-a-data-asset-and-set-the-expectation-suite-name)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_asset_name = \"ONE_OF_THE_CSV_DATA_ASSET_NAMES_FROM_ABOVE\" # TODO: replace with your value!\n", - "normalized_data_asset_name = context.normalize_data_asset_name(data_asset_name)\n", - "normalized_data_asset_name" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We recommend naming your first expectation suite for a table `warning`. Later, as you identify some of the expectations that you add to this suite as critical, you can move these expectations into another suite and call it `failure`." 
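
Pulled together, the validation flow from the updated pandas playground cells earlier in this diff reads roughly as follows; the path and datasource name are placeholders, and at least one expectation suite is assumed to exist:

import great_expectations as ge
from datetime import datetime

context = ge.data_context.DataContext()
# Pick one of the suites listed by the context.
suite_names = [s.expectation_suite_name for s in context.list_expectation_suites()]
expectation_suite_name = suite_names[0]

batch_kwargs = {"path": "YOUR_FILE_PATH", "datasource": "YOUR_PANDAS_DATASOURCE"}
batch = context.get_batch(batch_kwargs, expectation_suite_name)

validation_result = batch.validate()
print(validation_result["success"])

# A sortable run id ties results from one pipeline run together.
run_id = datetime.utcnow().isoformat().replace(":", "") + "Z"
results = context.run_validation_operator(
    "action_list_operator",
    assets_to_validate=[batch],
    run_id=run_id,
)
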
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "expectation_suite_name = \"warning\" # TODO: replace with your value!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Create a new empty expectation suite" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.create_expectation_suite(data_asset_name=data_asset_name, expectation_suite_name=expectation_suite_name, overwrite_existing=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Load a batch of data you want to use to create `Expectations`\n", - "\n", - "To learn more about batches and `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#load-a-batch-of-data-to-create-expectations)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch_kwargs = context.yield_batch_kwargs(data_asset_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load a batch of data and take a peek at the first few rows." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch = context.get_batch(data_asset_name, expectation_suite_name, batch_kwargs)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Optionally, customize and review batch options\n", - "\n", - "`BatchKwargs` are extremely flexible - to learn more [read the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#load-a-batch-of-data-to-create-expectations)\n", - "\n", - "Here are the batch kwargs used to load your batch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.batch_kwargs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# The datasource can add and store additional identifying information to ensure you can track a batch through\n", - "# your pipeline\n", - "batch.batch_id" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Author Expectations\n", - "\n", - "With a batch, you can add expectations by calling specific expectation methods. They all begin with `.expect_` which makes autocompleting easy.\n", - "\n", - "See available expectations in the [expectation glossary](https://docs.greatexpectations.io/en/latest/glossary.html?utm_source=notebook&utm_medium=create_expectations).\n", - "You can also see available expectations by hovering over data elements in the HTML page generated by profiling your dataset.\n", - "\n", - "Below is an example expectation that checks if the values in the batch's first column are null.\n", - "\n", - "[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#author-expectations)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "column_name = batch.get_table_columns()[0]\n", - "batch.expect_column_values_to_not_be_null(column_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add more expectations here. 
**Hint** start with `batch.expect_` and hit tab for Jupyter's autocomplete to see all the expectations!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 7. Review and save your Expectations\n", - "\n", - "Expectations that are `True` on this data batch are added automatically. Let's view all the expectations you created in machine-readable JSON." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.get_expectation_suite()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " \n", - " \n", - "If you decide not to save some expectations that you created, use [remove_expectaton method](https://docs.greatexpectations.io/en/latest/module_docs/data_asset_module.html?highlight=remove_expectation&utm_source=notebook&utm_medium=create_expectations#great_expectations.data_asset.data_asset.DataAsset.remove_expectation). You can also choose not to filter expectations that were `False` on this batch.\n", - "\n", - "\n", - "The following method will save the expectation suite as a JSON file in the `great_expectations/expectations` directory of your project:\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.save_expectation_suite()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 8. View the Expectations in Data Docs\n", - "\n", - "Let's now build and look at your Data Docs. These will now include an **Expectation Suite Overview** built from the expectations you just created that helps you communicate about your data with both machines and humans." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.build_data_docs()\n", - "context.open_data_docs()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Congratulations! You created and saved Expectations\n", - "\n", - "## Next steps:\n", - "\n", - "### 1. Play with Validation\n", - "\n", - "Validation is the process of checking if new batches of this data meet to your expectations before they are processed by your pipeline. Go to [validation_playground.ipynb](validation_playground.ipynb) to see how!\n", - "\n", - "\n", - "### 2. Explore the documentation & community\n", - "\n", - "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
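
Relatedly, the typed in-memory kwargs classes at the top of this diff provide the same checks programmatically for Spark dataframes. A sketch, assuming a live Spark session and the great_expectations.datasource.types module path used above:

from pyspark.sql import SparkSession

from great_expectations.datasource.types import SparkDFDatasourceInMemoryBatchKwargs  # assumed path

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])

# Construction raises InvalidBatchKwargsError if pyspark is missing
# or if "dataset" is not a Spark DataFrame.
kwargs = SparkDFDatasourceInMemoryBatchKwargs(dataset=df)
assert kwargs.dataset is df
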
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/great_expectations/init_notebooks/spark/validation_playground.ipynb b/great_expectations/init_notebooks/spark/validation_playground.ipynb index 8d82097ed939..36b2ee8d3a3f 100644 --- a/great_expectations/init_notebooks/spark/validation_playground.ipynb +++ b/great_expectations/init_notebooks/spark/validation_playground.ipynb @@ -8,6 +8,10 @@ "\n", "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", "\n", + "#### This notebook assumes that you created at least one expectation suite in your project.\n", + "#### Here you will learn how to validate data loaded into a PySpark DataFrame against an expectation suite.\n", + "\n", + "\n", "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" ] }, @@ -19,7 +23,6 @@ "source": [ "import json\n", "import great_expectations as ge\n", - "from great_expectations.profile import ColumnsExistProfiler\n", "import great_expectations.jupyter_ux\n", "from great_expectations.datasource.types import BatchKwargs\n", "from datetime import datetime" @@ -30,7 +33,7 @@ "metadata": {}, "source": [ "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#get-a-datacontext-object)" + "This represents your **project** that you just created using `great_expectations init`." ] }, { @@ -46,9 +49,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. List the CSVs in your folder\n", - "\n", - "The `DataContext` will now introspect your pyspark `Datasource` and list the CSVs it finds. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#list-data-assets)" + "## 2. Choose an Expectation Suite\n" ] }, { @@ -57,34 +58,10 @@ "metadata": {}, "outputs": [], "source": [ - "ge.jupyter_ux.list_available_data_asset_names(context)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Pick a csv and the expectation suite\n", + "# list expectation suites that you created in your project\n", "\n", - "Internally, Great Expectations represents csvs and dataframes as `DataAsset`s and uses this notion to link them to `Expectation Suites`. 
[Read more about the validate method in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#pick-a-data-asset-and-expectation-suite)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_asset_name = \"ONE_OF_THE_CSV_DATA_ASSET_NAMES_FROM_ABOVE\" # TODO: replace with your value!\n", - "normalized_data_asset_name = context.normalize_data_asset_name(data_asset_name)\n", - "normalized_data_asset_name" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We recommend naming your first expectation suite for a table `warning`. Later, as you identify some of the expectations that you add to this suite as critical, you can move these expectations into another suite and call it `failure`. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/pipeline_integration.html?utm_source=notebook&utm_medium=integrate_validation#choose-data-asset-and-expectation-suite)" + "for expectation_suite_id in context.list_expectation_suites():\n", + " print(expectation_suite_id.expectation_suite_name)" ] }, { @@ -93,23 +70,16 @@ "metadata": {}, "outputs": [], "source": [ - "expectation_suite_name = \"warning\" # TODO: replace with your value!" + "expectation_suite_name = # TODO: set to a name from the list above" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### 3.a. If you don't have an expectation suite, let's create a simple one\n", - "\n", - "You need expectations to validate your data. Expectations are grouped into Expectation Suites. \n", - "\n", - "If you don't have an expectation suite for this data asset, the notebook's next cell will create a suite of very basic expectations, so that you have some expectations to play with. The expectation suite will have `expect_column_to_exist` expectations for each column.\n", - "\n", - "If you created an expectation suite for this data asset, you can skip executing the next cell (if you execute it, it will do nothing).\n", + "## 3. Load a batch of data you want to validate\n", "\n", - "To create a more interesting suite, open the [create_expectations.ipynb](create_expectations.ipynb) notebook.\n", - "\n" + "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" ] }, { @@ -118,25 +88,8 @@ "metadata": {}, "outputs": [], "source": [ - "try:\n", - " context.get_expectation_suite(normalized_data_asset_name, expectation_suite_name)\n", - "except great_expectations.exceptions.DataContextError:\n", - " context.create_expectation_suite(data_asset_name=normalized_data_asset_name, expectation_suite_name=expectation_suite_name, overwrite_existing=True);\n", - " batch_kwargs = context.yield_batch_kwargs(data_asset_name)\n", - " batch = context.get_batch(normalized_data_asset_name, expectation_suite_name, batch_kwargs)\n", - " ColumnsExistProfiler().profile(batch)\n", - " batch.save_expectation_suite()\n", - " expectation_suite = context.get_expectation_suite(normalized_data_asset_name, expectation_suite_name)\n", - " context.build_data_docs()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. 
Load a batch of data you want to validate\n", - "\n", - "To learn more about `get_batch` with other data types (such as existing pandas dataframes, SQL tables or Spark), see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" + "# list datasources of the type SparkDFDatasource in your project\n", + "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SparkDFDatasource']" ] }, { @@ -145,19 +98,7 @@ "metadata": {}, "outputs": [], "source": [ - "batch_kwargs = context.yield_batch_kwargs(data_asset_name)\n", - "batch = context.get_batch(normalized_data_asset_name, expectation_suite_name, batch_kwargs)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Get a pipeline run id\n", - "\n", - "Generate a run id, a timestamp, or a meaningful string that will help you refer to validation results. We recommend they be chronologically sortable.\n", - "[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/pipeline_integration.html?utm_source=notebook&utm_medium=validate_data#set-a-run-id)" + "datasource_name = # TODO: set to a datasource name from above" ] }, { @@ -166,18 +107,23 @@ "metadata": {}, "outputs": [], "source": [ - "# Let's make a simple sortable timestamp. Note this could come from your pipeline runner.\n", - "run_id = datetime.utcnow().isoformat().replace(\":\", \"\") + \"Z\"\n", - "run_id" + "# If you would like to validate a file on a filesystem:\n", + "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", + "# To customize how Spark reads the file, you can add options under reader_options key in batch_kwargs (e.g., header='true') \n", + "\n", + "# If you already loaded the data into a PySpark Data Frame:\n", + "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", + "\n", + "\n", + "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", + "batch.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 6. Validate the batch\n", - "\n", - "This is the \"workhorse\" of Great Expectations. Call it in your pipeline code after loading data and just before passing it to your computation.\n", + "## 4. Validate the batch\n", "\n", "[Read more about the validate method in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#validate-the-batch)\n" ] @@ -188,19 +134,19 @@ "metadata": {}, "outputs": [], "source": [ - "validation_result = batch.validate(run_id=run_id)\n", + "validation_result = batch.validate()\n", "\n", "if validation_result[\"success\"]:\n", - " print(\"This data meets all expectations for {}\".format(str(data_asset_name)))\n", + " print(\"This data meets all expectations in {}\".format(expectation_suite_name))\n", "else:\n", - " print(\"This data is not a valid batch of {}\".format(str(data_asset_name)))" + " print(\"This data does not meet some expectations in {}\".format(expectation_suite_name))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 6.a. OPTIONAL: Review the JSON validation results\n", + "## 4.a. OPTIONAL: Review the JSON validation results\n", "\n", "Don't worry - this blob of JSON is meant for machines. Continue on or skip this to see this in Data Docs!" 
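
As the comment in the batch-loading cell above notes, Spark reader options ride along inside batch_kwargs under a reader_options key. A sketch, reusing the context, datasource name, and suite name from the surrounding cells (the file path is a placeholder):

batch_kwargs = {
    "path": "YOUR_FILE_PATH",
    "datasource": datasource_name,
    # Passed through to Spark's reader, e.g. treat the first row as a header.
    "reader_options": {"header": "true"},
}
batch = context.get_batch(batch_kwargs, expectation_suite_name)
batch.head()
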
] @@ -211,14 +157,14 @@ "metadata": {}, "outputs": [], "source": [ - "# print(json.dumps(validation_result, indent=4))" + "#validation_result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 7. Validation Operators\n", + "## 5. Validation Operators\n", "\n", "The `validate` method evaluates one batch of data against one expectation suite and returns a dictionary of validation results. This is sufficient when you explore your data and get to know Great Expectations.\n", "When deploying Great Expectations in a **real data pipeline, you will typically discover additional needs**:\n", @@ -240,18 +186,22 @@ "source": [ "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", "\n", + "#Generate a run id, a timestamp, or a meaningful string that will help you refer to validation results. We recommend they be chronologically sortable.\n", + "# Let's make a simple sortable timestamp. Note this could come from your pipeline runner (e.g., Airflow run id).\n", + "run_id = datetime.utcnow().isoformat().replace(\":\", \"\") + \"Z\"\n", + "\n", "results = context.run_validation_operator(\n", - " assets_to_validate=[batch],\n", - " run_id=run_id,\n", - " validation_operator_name=\"action_list_operator\",\n", - ")" + " \"action_list_operator\", \n", + " assets_to_validate=[batch], \n", + " run_id=run_id)\n", + "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 8. View the Validation Results in Data Docs\n", + "## 6. View the Validation Results in Data Docs\n", "\n", "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", "\n", @@ -275,9 +225,9 @@ "\n", "## Next steps:\n", "\n", - "### 1. Author more interesting Expectations\n", + "### 1. Read about the typical workflow with Great Expectations:\n", "\n", - "Here we used some **extremely basic** `Expectations`. To really harness the power of Great Expectations you can author much more interesting and specific `Expectations` to protect your data pipelines and defeat pipeline debt. Go to [create_expectations.ipynb](create_expectations.ipynb) to see how!\n", + "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", "\n", "### 2. 
Explore the documentation & community\n", "\n", diff --git a/great_expectations/init_notebooks/sql/create_expectations.ipynb b/great_expectations/init_notebooks/sql/create_expectations.ipynb deleted file mode 100644 index 1c78c4f0fa8e..000000000000 --- a/great_expectations/init_notebooks/sql/create_expectations.ipynb +++ /dev/null @@ -1,359 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Author Expectations\n", - "\n", - "Watch a [short tutorial video](https://greatexpectations.io/videos/getting_started/create_expectations?utm_source=notebook&utm_medium=create_expectations) or read [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations)\n", - "\n", - "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import os\n", - "import great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#get-a-datacontext-object)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context = ge.data_context.DataContext()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. List the tables in your database\n", - "\n", - "The `DataContext` will now introspect your pandas `Datasource` and list the CSVs it finds. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#list-data-assets)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "great_expectations.jupyter_ux.list_available_data_asset_names(context)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Pick a table and set the expectation suite name\n", - "\n", - "Internally, Great Expectations represents CSVs and dataframes as `DataAsset`s and uses this notion to link them to `Expectation Suites`. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#pick-a-data-asset-and-set-the-expectation-suite-name)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_asset_name = \"YOUR_TABLE_NAME_LISTED_ABOVE\" # TODO: replace with your value!\n", - "normalized_data_asset_name = context.normalize_data_asset_name(data_asset_name)\n", - "normalized_data_asset_name" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We recommend naming your first expectation suite for a table `warning`. Later, as you identify some of the expectations that you add to this suite as critical, you can move these expectations into another suite and call it `failure`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "expectation_suite_name = \"warning\" # TODO: replace with your value!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Create a new empty expectation suite" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.create_expectation_suite(data_asset_name=data_asset_name, expectation_suite_name=expectation_suite_name, overwrite_existing=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Load a batch of data you want to use to create `Expectations`\n", - "\n", - "To learn more about batches and `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#load-a-batch-of-data-to-create-expectations)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If you would like to validate an entire table or view in your database's default schema:\n", - "batch_kwargs = {'table': \"YOUR_TABLE\"}\n", - "\n", - "# If you would like to validate an entire table or view from a non-default schema in your database:\n", - "batch_kwargs = {'table': \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\"}\n", - "\n", - "# If you would like to validate using a query to construct a temporary table:\n", - "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE'}\n", - "\n", - "batch = context.get_batch(normalized_data_asset_name, expectation_suite_name, batch_kwargs)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load a batch of data and take a peek at the first few rows." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch = context.get_batch(data_asset_name, expectation_suite_name, batch_kwargs)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Optionally, customize and review batch options\n", - "\n", - "`BatchKwargs` are extremely flexible - to learn more [read the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#load-a-batch-of-data-to-create-expectations)\n", - "\n", - "Here are the batch kwargs used to load your batch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.batch_kwargs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# The datasource can add and store additional identifying information to ensure you can track a batch through\n", - "# your pipeline\n", - "batch.batch_id" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Author Expectations\n", - "\n", - "With a batch, you can add expectations by calling specific expectation methods. 
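
The deleted SQL notebook above loads batches with table-, schema-, and query-style kwargs; those keys carry over to the SqlAlchemyDatasource batch kwargs types at the top of this diff. A sketch under the new get_batch(batch_kwargs, expectation_suite_name) convention, reusing the context and suite name from the earlier sketches (table, schema, and datasource names are placeholders):

# Whole table or view, optionally in a non-default schema:
batch_kwargs = {"table": "YOUR_TABLE", "schema": "YOUR_SCHEMA", "datasource": "YOUR_SQL_DATASOURCE"}

# Or a query that builds a temporary table:
# batch_kwargs = {"query": "SELECT YOUR_ROWS FROM YOUR_TABLE", "datasource": "YOUR_SQL_DATASOURCE"}

batch = context.get_batch(batch_kwargs, expectation_suite_name)
batch.head()
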
They all begin with `.expect_` which makes autocompleting easy.\n", - "\n", - "See available expectations in the [expectation glossary](https://docs.greatexpectations.io/en/latest/glossary.html?utm_source=notebook&utm_medium=create_expectations).\n", - "You can also see available expectations by hovering over data elements in the HTML page generated by profiling your dataset.\n", - "\n", - "Below is an example expectation that checks if the values in the batch's first column are null.\n", - "\n", - "[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#author-expectations)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "column_name = batch.get_table_columns()[0]\n", - "batch.expect_column_values_to_not_be_null(column_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add more expectations here. **Hint** start with `batch.expect_` and hit tab for Jupyter's autocomplete to see all the expectations!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 7. Review and save your Expectations\n", - "\n", - "Expectations that are `True` on this data batch are added automatically. Let's view all the expectations you created in machine-readable JSON." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.get_expectation_suite()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " \n", - " \n", - "If you decide not to save some expectations that you created, use [remove_expectaton method](https://docs.greatexpectations.io/en/latest/module_docs/data_asset_module.html?highlight=remove_expectation&utm_source=notebook&utm_medium=create_expectations#great_expectations.data_asset.data_asset.DataAsset.remove_expectation). You can also choose not to filter expectations that were `False` on this batch.\n", - "\n", - "\n", - "The following method will save the expectation suite as a JSON file in the `great_expectations/expectations` directory of your project:\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.save_expectation_suite()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 8. View the Expectations in Data Docs\n", - "\n", - "Let's now build and look at your Data Docs. These will now include an **Expectation Suite Overview** built from the expectations you just created that helps you communicate about your data with both machines and humans." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.build_data_docs()\n", - "context.open_data_docs()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Congratulations! You created and saved Expectations\n", - "\n", - "## Next steps:\n", - "\n", - "### 1. 
Play with Validation\n", - "\n", - "Validation is the process of checking if new batches of this data meet to your expectations before they are processed by your pipeline. Go to [validation_playground.ipynb](validation_playground.ipynb) to see how!\n", - "\n", - "\n", - "### 2. Explore the documentation & community\n", - "\n", - "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/great_expectations/init_notebooks/sql/validation_playground.ipynb b/great_expectations/init_notebooks/sql/validation_playground.ipynb index 16b1a8aa668c..4157e30cbb5b 100644 --- a/great_expectations/init_notebooks/sql/validation_playground.ipynb +++ b/great_expectations/init_notebooks/sql/validation_playground.ipynb @@ -8,6 +8,10 @@ "\n", "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", "\n", + "#### This notebook assumes that you created at least one expectation suite in your project.\n", + "#### Here you will learn how to validate data in a SQL database against an expectation suite.\n", + "\n", + "\n", "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" ] }, @@ -19,7 +23,6 @@ "source": [ "import json\n", "import great_expectations as ge\n", - "from great_expectations.profile import ColumnsExistProfiler\n", "import great_expectations.jupyter_ux\n", "from great_expectations.datasource.types import BatchKwargs\n", "from datetime import datetime" @@ -30,7 +33,7 @@ "metadata": {}, "source": [ "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#get-a-datacontext-object)" + "This represents your **project** that you just created using `great_expectations init`." ] }, { @@ -46,9 +49,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. List the tables in your database\n", - "\n", - "The `DataContext` will now introspect your database `Datasource` and list the tables. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#list-data-assets)" + "## 2. Choose an Expectation Suite\n" ] }, { @@ -57,16 +58,10 @@ "metadata": {}, "outputs": [], "source": [ - "ge.jupyter_ux.list_available_data_asset_names(context)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Pick a table and set the expectation suite name\n", + "# list expectation suites that you created in your project\n", "\n", - "Internally, Great Expectations represents tables and views as `DataAsset`s and uses this notion to link them to `Expectation Suites`. [Read more about the validate method in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#pick-a-data-asset-and-expectation-suite)\n" + "for expectation_suite_id in context.list_expectation_suites():\n", + " print(expectation_suite_id.expectation_suite_name)" ] }, { @@ -75,16 +70,16 @@ "metadata": {}, "outputs": [], "source": [ - "data_asset_name = \"YOUR_TABLE_NAME_LISTED_ABOVE\" # TODO: replace with your value!\n", - "normalized_data_asset_name = context.normalize_data_asset_name(data_asset_name)\n", - "normalized_data_asset_name" + "expectation_suite_name = # TODO: set to a name from the list above" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We recommend naming your first expectation suite for a table `warning`. Later, as you identify some of the expectations that you add to this suite as critical, you can move these expectations into another suite and call it `failure`. [Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/pipeline_integration.html?utm_source=notebook&utm_medium=integrate_validation#choose-data-asset-and-expectation-suite)" + "## 3. Load a batch of data you want to validate\n", + "\n", + "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" ] }, { @@ -93,23 +88,8 @@ "metadata": {}, "outputs": [], "source": [ - "expectation_suite_name = \"warning\" # TODO: replace with your value!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 3.a. If you don't have an expectation suite, let's create a simple one\n", - "\n", - "You need expectations to validate your data. Expectations are grouped into Expectation Suites. \n", - "\n", - "If you don't have an expectation suite for this data asset, the notebook's next cell will create a suite of very basic expectations, so that you have some expectations to play with. 
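Roughly speaking, the generated suite amounts to calls like these (a sketch; the column names are hypothetical):

batch.expect_column_to_exist("provider_id")
batch.expect_column_to_exist("provider_name")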
The expectation suite will have `expect_column_to_exist` expectations for each column.\n", - "\n", - "If you created an expectation suite for this data asset, you can skip executing the next cell (if you execute it, it will do nothing).\n", - "\n", - "To create a more interesting suite, open the [create_expectations.ipynb](create_expectations.ipynb) notebook.\n", - "\n" + "# list datasources of the type SqlAlchemyDatasource in your project\n", + "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SqlAlchemyDatasource']" ] }, { @@ -118,25 +98,7 @@ "metadata": {}, "outputs": [], "source": [ - "try:\n", - " context.get_expectation_suite(normalized_data_asset_name, expectation_suite_name)\n", - "except great_expectations.exceptions.DataContextError:\n", - " context.create_expectation_suite(data_asset_name=normalized_data_asset_name, expectation_suite_name=expectation_suite_name, overwrite_existing=True);\n", - " batch_kwargs = context.yield_batch_kwargs(data_asset_name)\n", - " batch = context.get_batch(normalized_data_asset_name, expectation_suite_name, batch_kwargs)\n", - " ColumnsExistProfiler().profile(batch)\n", - " batch.save_expectation_suite()\n", - " expectation_suite = context.get_expectation_suite(normalized_data_asset_name, expectation_suite_name)\n", - " context.build_data_docs()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Load a batch of data you want to validate\n", - "\n", - "To learn more about `get_batch` with other data types (such as csv files, pandas, or Spark), see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" + "datasource_name = # TODO: set to a datasource name from above" ] }, { @@ -146,46 +108,25 @@ "outputs": [], "source": [ "# If you would like to validate an entire table or view in your database's default schema:\n", - "batch_kwargs = {'table': \"YOUR_TABLE\"}\n", + "batch_kwargs = {'table': \"YOUR_TABLE\", 'datasource': datasource_name}\n", "\n", "# If you would like to validate an entire table or view from a non-default schema in your database:\n", - "batch_kwargs = {'table': \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\"}\n", + "batch_kwargs = {'table': \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\", 'datasource': datasource_name}\n", "\n", - "# If you would like to validate using a query to construct a temporary table:\n", - "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE'}\n", + "# If you would like to validate the result set of a query:\n", + "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}\n", "\n", - "batch = context.get_batch(normalized_data_asset_name, expectation_suite_name, batch_kwargs)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Get a pipeline run id\n", "\n", - "Generate a run id, a timestamp, or a meaningful string that will help you refer to validation results. We recommend they be chronologically sortable.\n", - "[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/pipeline_integration.html?utm_source=notebook&utm_medium=validate_data#set-a-run-id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Let's make a simple sortable timestamp. 
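(For example, the id built on the next line renders like 2020-02-14T101500.123456Z, with colons removed, so the ids sort lexicographically in chronological order.)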
Note this could come from your pipeline runner.\n", - "run_id = datetime.utcnow().isoformat().replace(\":\", \"\") + \"Z\"\n", - "run_id" + "\n", + "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", + "batch.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 6. Validate the batch\n", - "\n", - "This is the \"workhorse\" of Great Expectations. Call it in your pipeline code after loading data and just before passing it to your computation.\n", + "## 4. Validate the batch\n", "\n", "[Read more about the validate method in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#validate-the-batch)\n" ] @@ -196,19 +137,19 @@ "metadata": {}, "outputs": [], "source": [ - "validation_result = batch.validate(run_id=run_id)\n", + "validation_result = batch.validate()\n", "\n", "if validation_result[\"success\"]:\n", - " print(\"This data meets all expectations for {}\".format(str(data_asset_name)))\n", + " print(\"This data meets all expectations in {}\".format(expectation_suite_name))\n", "else:\n", - " print(\"This data is not a valid batch of {}\".format(str(data_asset_name)))" + " print(\"This data does not meet some expectations in {}\".format(expectation_suite_name))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 6.a. OPTIONAL: Review the JSON validation results\n", + "## 4.a. OPTIONAL: Review the JSON validation results\n", "\n", "Don't worry - this blob of JSON is meant for machines. Continue on or skip this to see this in Data Docs!" ] @@ -219,14 +160,14 @@ "metadata": {}, "outputs": [], "source": [ - "# print(json.dumps(validation_result, indent=4))" + "#validation_result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 7. Validation Operators\n", + "## 5. Validation Operators\n", "\n", "The `validate` method evaluates one batch of data against one expectation suite and returns a dictionary of validation results. This is sufficient when you explore your data and get to know Great Expectations.\n", "When deploying Great Expectations in a **real data pipeline, you will typically discover additional needs**:\n", @@ -248,18 +189,22 @@ "source": [ "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", "\n", + "#Generate a run id, a timestamp, or a meaningful string that will help you refer to validation results. We recommend they be chronologically sortable.\n", + "# Let's make a simple sortable timestamp. Note this could come from your pipeline runner (e.g., Airflow run id).\n", + "run_id = datetime.utcnow().isoformat().replace(\":\", \"\") + \"Z\"\n", + "\n", "results = context.run_validation_operator(\n", - " assets_to_validate=[batch],\n", - " run_id=run_id,\n", - " validation_operator_name=\"action_list_operator\",\n", - ")" + " \"action_list_operator\", \n", + " assets_to_validate=[batch], \n", + " run_id=run_id)\n", + "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 8. View the Validation Results in Data Docs\n", + "## 6. View the Validation Results in Data Docs\n", "\n", "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", "\n", @@ -283,9 +228,9 @@ "\n", "## Next steps:\n", "\n", - "### 1. Author more interesting Expectations\n", + "### 1. 
Read about the typical workflow with Great Expectations:\n", "\n", - "Here we used some **extremely basic** `Expectations`. To really harness the power of Great Expectations you can author much more interesting and specific `Expectations` to protect your data pipelines and defeat pipeline debt. Go to [create_expectations.ipynb](create_expectations.ipynb) to see how!\n", + "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", "\n", "### 2. Explore the documentation & community\n", "\n", diff --git a/great_expectations/jupyter_ux/__init__.py b/great_expectations/jupyter_ux/__init__.py index e3b4cef10680..bb36992bdf3f 100755 --- a/great_expectations/jupyter_ux/__init__.py +++ b/great_expectations/jupyter_ux/__init__.py @@ -4,12 +4,16 @@ import logging import sys -import great_expectations.render as render from datetime import datetime import tzlocal from IPython.core.display import display, HTML +from great_expectations.render.renderer import ProfilingResultsColumnSectionRenderer, \ + ExpectationSuiteColumnSectionRenderer +from great_expectations.render.types import RenderedSectionContent +from great_expectations.render.view import DefaultJinjaSectionView + def set_data_source(context, data_source_type=None): """ @@ -115,7 +119,7 @@ def formatTime(self, record, datefmt=None): # warnings.filterwarnings('ignore') -def list_available_data_asset_names(context, data_source_name=None): +def show_available_data_asset_names(context, data_source_name=None): """ List asset names found in the current context. """ # TODO: Needs tests. styles = """ @@ -134,7 +138,7 @@ def list_available_data_asset_names(context, data_source_name=None): """ print("Inspecting your data sources. This may take a moment...") - expectation_suite_keys = context.list_expectation_suite_keys() + expectation_suite_keys = context.list_expectation_suites() datasources = context.list_datasources() html = "" for datasource in datasources: @@ -146,7 +150,14 @@ def list_available_data_asset_names(context, data_source_name=None): for generator_info in generators: html += "generator: {0:s} ({1:s})".format(generator_info['name'], generator_info['class_name']) generator = ds.get_generator(generator_info['name']) - data_asset_names = sorted(generator.get_available_data_asset_names()) + + # TODO hacks to deal w/ inconsistent return types. Remove urgently + mystery_object = generator.get_available_data_asset_names() + if isinstance(mystery_object, dict) and "names" in mystery_object.keys(): + data_asset_names = sorted([name[0] for name in mystery_object["names"]]) + elif isinstance(mystery_object, list): + data_asset_names = sorted(mystery_object) + if len(data_asset_names) > 0: html += "
Data Assets Found:
" html += styles @@ -243,17 +254,12 @@ def display_column_expectations_as_section( """ #TODO: replace this with a generic utility function, preferably a method on an ExpectationSuite class - column_expectation_list = [ e for e in expectation_suite["expectations"] if "column" in e["kwargs"] and e["kwargs"]["column"] == column ] + column_expectation_list = [ e for e in expectation_suite.expectations if "column" in e.kwargs and e.kwargs["column"] == column ] #TODO: Handle the case where zero evrs match the column name - document = render.renderer.ExpectationSuiteColumnSectionRenderer().render(column_expectation_list) - view = render.view.DefaultJinjaSectionView().render( - render.types.RenderedComponentContentWrapper(**{ - "section": document, - "section_loop": {"index": 1}, - }) - ) + document = ExpectationSuiteColumnSectionRenderer().render(column_expectation_list).to_json_dict() + view = DefaultJinjaSectionView().render({"section": document, "section_loop": 1}) if include_styling: html_to_display = bootstrap_link_element+cooltip_style_element+view @@ -266,43 +272,43 @@ def display_column_expectations_as_section( display(HTML(html_to_display)) -def display_column_evrs_as_section( - evrs, - column, - include_styling=True, - return_without_displaying=False, -): - """This is a utility function to render all of the EVRs in an ExpectationSuite with the same column name as an HTML block. - - By default, the HTML block is rendered using ExpectationSuiteColumnSectionRenderer and the view is rendered using DefaultJinjaSectionView. - Therefore, it should look exactly the same as the default renderer for build_docs. - - Example usage: - display_column_evrs_as_section(exp, "my_column") - """ - - #TODO: replace this with a generic utility function, preferably a method on an ExpectationSuite class - column_evr_list = [ e for e in evrs["results"] if "column" in e["expectation_config"]["kwargs"] and e["expectation_config"]["kwargs"]["column"] == column ] - - #TODO: Handle the case where zero evrs match the column name - - document = render.renderer.ProfilingResultsColumnSectionRenderer().render(column_evr_list) - view = render.view.DefaultJinjaSectionView().render( - render.types.RenderedComponentContentWrapper(**{ - "section": document, - "section_loop": {"index": 1}, - }) - ) - - if include_styling: - html_to_display = bootstrap_link_element+cooltip_style_element+view - else: - html_to_display = view - - if return_without_displaying: - return html_to_display - else: - display(HTML(html_to_display)) +# def display_column_evrs_as_section( +# evrs, +# column, +# include_styling=True, +# return_without_displaying=False, +# ): +# """This is a utility function to render all of the EVRs in an ExpectationSuite with the same column name as an HTML block. +# +# By default, the HTML block is rendered using ExpectationSuiteColumnSectionRenderer and the view is rendered using DefaultJinjaSectionView. +# Therefore, it should look exactly the same as the default renderer for build_docs. 
+# +# Example usage: +# display_column_evrs_as_section(exp, "my_column") +# """ +# +# #TODO: replace this with a generic utility function, preferably a method on an ExpectationSuite class +# column_evr_list = [ e for e in evrs.results if "column" in e.expectation_config.kwargs and e.expectation_config.kwargs["column"] == column ] +# +# #TODO: Handle the case where zero evrs match the column name +# +# document = ProfilingResultsColumnSectionRenderer().render(column_evr_list) +# view = DefaultJinjaSectionView().render( +# { +# "section": document, +# "section_loop": {"index": 1}, +# } +# ) +# +# if include_styling: +# html_to_display = bootstrap_link_element+cooltip_style_element+view +# else: +# html_to_display = view +# +# if return_without_displaying: +# return html_to_display +# else: +# display(HTML(html_to_display)) # When importing the jupyter_ux module, we set up a preferred logging configuration diff --git a/great_expectations/jupyter_ux/expectation_explorer.py b/great_expectations/jupyter_ux/expectation_explorer.py index 587dd1131683..aa418e3109fe 100644 --- a/great_expectations/jupyter_ux/expectation_explorer.py +++ b/great_expectations/jupyter_ux/expectation_explorer.py @@ -103,8 +103,8 @@ def __init__(self): def update_result(self, data_asset_name, new_result, column=None): new_success_value = new_result.get('success') - expectation_type = new_result['expectation_config'].get('expectation_type') - new_result_widgets = self.generate_expectation_result_detail_widgets(result=new_result.get('result', {})) + expectation_type = new_result.expectation_config.expectation_type + new_result_widgets = self.generate_expectation_result_detail_widgets(result=new_result.result) new_border_color = 'green' if new_success_value else 'red' data_asset_expectations = self.state['data_assets'][data_asset_name]['expectations'] @@ -143,7 +143,7 @@ def get_expectation_state(self, data_asset_name, expectation_type, column=None): return non_column_expectations.get(expectation_type) def initialize_data_asset_state(self, data_asset): - data_asset_name = data_asset.get_data_asset_name() + data_asset_name = data_asset.data_asset_name self.state['data_assets'][data_asset_name] = { "data_asset": data_asset, @@ -151,7 +151,7 @@ def initialize_data_asset_state(self, data_asset): } def set_expectation_state(self, data_asset, expectation_state, column=None): - data_asset_name = data_asset.get_data_asset_name() + data_asset_name = data_asset.data_asset_name expectation_type = expectation_state.get('expectation_type') data_asset_state = self.state['data_assets'].get(data_asset_name) @@ -231,7 +231,7 @@ def min_max_value_to_string(widget_dict, number_kwarg): def update_expectation_state(self, existing_expectation_state, expectation_validation_result, validation_time): expectation_editor_widget = existing_expectation_state.get( 'editor_widget') - new_ge_expectation_kwargs = expectation_validation_result['expectation_config']['kwargs'] + new_ge_expectation_kwargs = expectation_validation_result.expectation_config['kwargs'] current_expectation_kwarg_dict = existing_expectation_state['kwargs'] column = current_expectation_kwarg_dict.get('column') data_asset_name = existing_expectation_state.get('data_asset_name') @@ -1349,10 +1349,10 @@ def create_expectation_widget( collapsed=False ): validation_time = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M") - data_asset_name = data_asset.get_data_asset_name() + data_asset_name = data_asset.data_asset_name data_asset_state = self.state['data_assets'].get(data_asset_name) - 
expectation_type = expectation_validation_result['expectation_config']['expectation_type'] - expectation_kwargs = expectation_validation_result['expectation_config']['kwargs'] + expectation_type = expectation_validation_result.expectation_config.expectation_type + expectation_kwargs = expectation_validation_result.expectation_config['kwargs'] column = expectation_kwargs.get('column') if data_asset_state: @@ -1367,13 +1367,13 @@ def create_expectation_widget( self.initialize_data_asset_state(data_asset) # success_widget - success = expectation_validation_result['success'] + success = expectation_validation_result.success success_widget = widgets.HTML( value="Success: {success}".format(success=str(success))) # widget with result details result_detail_widget = widgets.VBox(children=self.generate_expectation_result_detail_widgets( - result=expectation_validation_result.get("result", {}) + result=expectation_validation_result.result )) # accordion container for result_detail_widget @@ -1500,7 +1500,7 @@ def get_expectation_types(self, data_asset_name): return list(set(expectation_types)) def generate_expectation_suite_editor_widgets(self, data_asset, expectation_suite): - data_asset_name = data_asset.get_data_asset_name() + data_asset_name = data_asset.data_asset_name column_names = self.get_column_names(data_asset_name) column_accordions = [] data_asset_state = self.state['data_assets'].get(data_asset_name, {}) @@ -1557,7 +1557,7 @@ def generate_expectation_suite_editor_widgets(self, data_asset, expectation_suit return [summary_widget] + column_accordions def edit_expectation_suite(self, data_asset): - data_asset_name = data_asset.get_data_asset_name() + data_asset_name = data_asset.data_asset_name expectation_suite = data_asset.get_expectation_suite( discard_failed_expectations=False) expectations = expectation_suite.get('expectations') diff --git a/great_expectations/profile/__init__.py b/great_expectations/profile/__init__.py index a5ec2c438857..c374b17864b3 100644 --- a/great_expectations/profile/__init__.py +++ b/great_expectations/profile/__init__.py @@ -1 +1,2 @@ -from .columns_exist import ColumnsExistProfiler \ No newline at end of file +from .columns_exist import ColumnsExistProfiler +from .basic_dataset_profiler import BasicDatasetProfiler diff --git a/great_expectations/profile/base.py b/great_expectations/profile/base.py index 55e86472b525..3c973592599f 100644 --- a/great_expectations/profile/base.py +++ b/great_expectations/profile/base.py @@ -22,41 +22,36 @@ def validate(cls, dataset): @classmethod def add_expectation_meta(cls, expectation): - if not "meta" in expectation: - expectation["meta"] = {} - - expectation["meta"][str(cls.__name__)] = { + expectation.meta[str(cls.__name__)] = { "confidence": "very low" } return expectation @classmethod def add_meta(cls, expectation_suite, batch_kwargs=None): - if not "meta" in expectation_suite: - expectation_suite["meta"] = {} - class_name = str(cls.__name__) - expectation_suite["meta"][class_name] = { + expectation_suite.meta[class_name] = { "created_by": class_name, "created_at": time.time(), } if batch_kwargs is not None: - expectation_suite["meta"][class_name]["batch_kwargs"] = batch_kwargs + expectation_suite.meta[class_name]["batch_kwargs"] = batch_kwargs new_expectations = [cls.add_expectation_meta( - exp) for exp in expectation_suite["expectations"]] - expectation_suite["expectations"] = new_expectations - - expectation_suite["meta"]["notes"] = { - "format": "markdown", - "content": [ - "_To add additional notes, edit the 
meta.notes.content field in the appropriate Expectation json file._" - #TODO: be more helpful to the user by piping in the filename. - #This will require a minor refactor to make more DataContext information accessible from this method. - # "_To add additional notes, edit the meta.notes.content field in expectations/mydb/default/movies/BasicDatasetProfiler.json_" - ] - } + exp) for exp in expectation_suite.expectations] + expectation_suite.expectations = new_expectations + + if not "notes" in expectation_suite.meta: + expectation_suite.meta["notes"] = { + "format": "markdown", + "content": [ + "_To add additional notes, edit the meta.notes.content field in the appropriate Expectation json file._" + #TODO: be more helpful to the user by piping in the filename. + #This will require a minor refactor to make more DataContext information accessible from this method. + # "_To add additional notes, edit the meta.notes.content field in expectations/mydb/default/movies/BasicDatasetProfiler.json_" + ] + } return expectation_suite @classmethod diff --git a/great_expectations/profile/basic_dataset_profiler.py b/great_expectations/profile/basic_dataset_profiler.py index 198b24198505..5ac56a41fc4c 100644 --- a/great_expectations/profile/basic_dataset_profiler.py +++ b/great_expectations/profile/basic_dataset_profiler.py @@ -1,18 +1,26 @@ import logging + +# Gross legacy python 2 hacks +try: + ModuleNotFoundError +except NameError: + ModuleNotFoundError = ImportError + +try: + from sqlalchemy.exc import OperationalError +except ModuleNotFoundError: + OperationalError = RuntimeError + from .base import DatasetProfiler logger = logging.getLogger(__name__) -class BasicDatasetProfiler(DatasetProfiler): - """BasicDatasetProfiler is inspired by the beloved pandas_profiling project. - - The profiler examines a batch of data and creates a report that answers the basic questions - most data practitioners would ask about a dataset during exploratory data analysis. - The profiler reports how unique the values in the column are, as well as the percentage of empty values in it. - Based on the column's type it provides a description of the column by computing a number of statistics, - such as min, max, mean and median, for numeric columns, and distribution of values, when appropriate. +class BasicDatasetProfilerBase(DatasetProfiler): + """BasicDatasetProfilerBase provides basic logic of inferring the type and the cardinality of columns + that is used by the dataset profiler classes that extend this class. 
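+
+    Type inference is heuristic: _get_column_type probes a column with
+    expect_column_values_to_be_in_type_list against each family of known type names
+    (INT_TYPE_NAMES, FLOAT_TYPE_NAMES, STRING_TYPE_NAMES, BOOLEAN_TYPE_NAMES,
+    DATETIME_TYPE_NAMES) and returns the first family whose check succeeds; the sets mix
+    pandas, Spark, and SQL type names so the same logic can serve all of those backends.
+    _get_column_cardinality similarly buckets a column by its observed unique-value count
+    and proportion of unique values. Subclasses such as BasicDatasetProfiler build their
+    _profile implementations on these helpers.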
""" + INT_TYPE_NAMES = {"INTEGER", "int", "INT", "TINYINT", "BYTEINT", "SMALLINT", "BIGINT", "IntegerType", "LongType", "DECIMAL"} FLOAT_TYPE_NAMES = {"FLOAT", "FLOAT4", "FLOAT8", "DOUBLE_PRECISION", "NUMERIC", "FloatType", "DoubleType", "float"} STRING_TYPE_NAMES = {"CHAR", "VARCHAR", "TEXT", "StringType", "string", "str"} @@ -21,22 +29,23 @@ class BasicDatasetProfiler(DatasetProfiler): @classmethod def _get_column_type(cls, df, column): + # list of types is used to support pandas and sqlalchemy df.set_config_value("interactive_evaluation", True) try: - if df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.INT_TYPE_NAMES)))["success"]: + if df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.INT_TYPE_NAMES))).success: type_ = "int" - elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.FLOAT_TYPE_NAMES)))["success"]: + elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.FLOAT_TYPE_NAMES))).success: type_ = "float" - elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.STRING_TYPE_NAMES)))["success"]: + elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.STRING_TYPE_NAMES))).success: type_ = "string" - elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.BOOLEAN_TYPE_NAMES)))["success"]: + elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.BOOLEAN_TYPE_NAMES))).success: type_ = "bool" - elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.DATETIME_TYPE_NAMES)))["success"]: + elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.DATETIME_TYPE_NAMES))).success: type_ = "datetime" else: @@ -55,10 +64,9 @@ def _get_column_cardinality(cls, df, column): df.set_config_value("interactive_evaluation", True) try: - num_unique = df.expect_column_unique_value_count_to_be_between(column, None, None)[ - 'result']['observed_value'] + num_unique = df.expect_column_unique_value_count_to_be_between(column, None, None).result['observed_value'] pct_unique = df.expect_column_proportion_of_unique_values_to_be_between( - column, None, None)['result']['observed_value'] + column, None, None).result['observed_value'] except KeyError: # if observed_value value is not set logger.error("Failed to get cardinality of column {0:s} - continuing...".format(column)) @@ -96,6 +104,17 @@ def _get_column_cardinality(cls, df, column): return cardinality + +class BasicDatasetProfiler(BasicDatasetProfilerBase): + """BasicDatasetProfiler is inspired by the beloved pandas_profiling project. + + The profiler examines a batch of data and creates a report that answers the basic questions + most data practitioners would ask about a dataset during exploratory data analysis. + The profiler reports how unique the values in the column are, as well as the percentage of empty values in it. + Based on the column's type it provides a description of the column by computing a number of statistics, + such as min, max, mean and median, for numeric columns, and distribution of values, when appropriate. 
+ """ + @classmethod def _profile(cls, dataset): df = dataset @@ -119,7 +138,7 @@ def _profile(cls, dataset): # df.expect_column_to_exist(column) type_ = cls._get_column_type(df, column) - cardinality= cls._get_column_cardinality(df, column) + cardinality = cls._get_column_cardinality(df, column) df.expect_column_values_to_not_be_null(column, mostly=0.5) # The renderer will show a warning for columns that do not meet this expectation df.expect_column_values_to_be_in_set(column, [], result_format="SUMMARY") @@ -165,11 +184,11 @@ def _profile(cls, dataset): df.expect_column_kl_divergence_to_be_less_than(column, partition_object=None, threshold=None, result_format='COMPLETE') - else: # unknown cardinality - skip + else: # unknown cardinality - skip pass elif type_ == "string": - # Check for leading and tralining whitespace. + # Check for leading and trailing whitespace. #!!! It would be nice to build additional Expectations here, but #!!! the default logic for remove_expectations prevents us. df.expect_column_values_to_not_match_regex(column, r"^\s+|\s+$") @@ -209,9 +228,6 @@ def _profile(cls, dataset): df.set_config_value("interactive_evaluation", True) expectation_suite = df.get_expectation_suite(suppress_warnings=True, discard_failed_expectations=False) - if not "meta" in expectation_suite: - expectation_suite["meta"] = {"columns": meta_columns} - else: - expectation_suite["meta"]["columns"] = meta_columns + expectation_suite.meta["columns"] = meta_columns return expectation_suite diff --git a/great_expectations/profile/metrics_store.py b/great_expectations/profile/metrics_store.py deleted file mode 100644 index 234144fb2f28..000000000000 --- a/great_expectations/profile/metrics_store.py +++ /dev/null @@ -1,143 +0,0 @@ -import collections -from functools import reduce -import operator -from collections import defaultdict -from great_expectations.data_context.types.metrics import ( -NamespaceAwareValidationMetric, -MultiBatchNamespaceAwareValidationMetric, -NamespaceAwareExpectationDefinedValidationMetric, -MultiBatchNamespaceAwareExpectationDefinedValidationMetric, -) - - -class MetricsStore(object): - def __init__(self): - self.single_batch_metrics = [] - self.dict_single_batch_metrics_by_multi_batch_key_by_batch = {} - - - def add_single_batch_metric( - self, - data_asset_name, - batch_fingerprint, - metric_name, - metric_kwargs, - metric_value): - - new_metric = NamespaceAwareValidationMetric( - data_asset_name=data_asset_name, - batch_fingerprint=batch_fingerprint, - metric_name=metric_name, - metric_kwargs=metric_kwargs, - metric_value=metric_value) - - self.single_batch_metrics.append(new_metric) - - batch_metrics = self.dict_single_batch_metrics_by_multi_batch_key_by_batch.get(batch_fingerprint.fingerprint) - if not batch_metrics: - batch_metrics = {} - self.dict_single_batch_metrics_by_multi_batch_key_by_batch[batch_fingerprint.fingerprint] = batch_metrics - self.dict_single_batch_metrics_by_multi_batch_key_by_batch[batch_fingerprint.fingerprint][new_metric.multi_batch_key] = new_metric - - def add_single_batch_expectation_defined_metric( - self, - data_asset_name, - batch_fingerprint, - expectation_type, - result_key, - metric_kwargs, - metric_value): - - new_metric = NamespaceAwareExpectationDefinedValidationMetric( - data_asset_name=data_asset_name, - batch_fingerprint=batch_fingerprint, - expectation_type=expectation_type, - result_key=result_key, - metric_kwargs=metric_kwargs, - metric_value=metric_value) - - self.single_batch_metrics.append(new_metric) - batch_metrics = 
self.dict_single_batch_metrics_by_multi_batch_key_by_batch.get(batch_fingerprint.fingerprint) - if not batch_metrics: - batch_metrics = {} - self.dict_single_batch_metrics_by_multi_batch_key_by_batch[batch_fingerprint.fingerprint] = batch_metrics - self.dict_single_batch_metrics_by_multi_batch_key_by_batch[batch_fingerprint.fingerprint][new_metric.multi_batch_key] = new_metric - - def get_multi_batch_metrics(self, batch_kwargs_list): - """ - Return a list of multi batch metrics for a list of batches - :param batch_fingerprints: - :return: dict of multi batch metrics (by mb metric key). - Values are MultiBatchNamespaceAwareValidationMetric or - MultiBatchNamespaceAwareExpectationDefinedValidationMetric - """ - - dict_selected_batches = {} - for batch_fingerprint, batch_metrics in self.dict_single_batch_metrics_by_multi_batch_key_by_batch.items(): - if batch_fingerprint in [bk.batch_fingerprint.fingerprint for bk in batch_kwargs_list]: - dict_selected_batches[batch_fingerprint] = batch_metrics - - # let's compute the union of all metrics names that come from all the batches. - # this will help us fill with nulls if a particular metric is missing from a batch - # (e.g., due to the column missing) - # Not performing this steps would result in non-uniform lengths of lists and we would - # not be able to convert this dict of lists into a dataframe. - metric_names_union = set() - for batch_id, batch_metrics in dict_selected_batches.items(): - metric_names_union = metric_names_union.union(batch_metrics.keys()) - - metrics_dict_of_lists = defaultdict(list) - - batch_index = list(self.dict_single_batch_metrics_by_multi_batch_key_by_batch.keys()) - - for batch_id, batch_metrics in dict_selected_batches.items(): - # fill in the metrics that are present in the batch - for metric_name, metric_value in batch_metrics.items(): - metrics_dict_of_lists[metric_name].append(metric_value) - - # fill in the metrics that are missing in the batch - metrics_missing_in_batch = metric_names_union - set(batch_metrics.keys()) - for metric_name in metrics_missing_in_batch: - metrics_dict_of_lists[metric_name].append(None) - - mb_metrics = {} - for metric_key, single_batch_metric_list in metrics_dict_of_lists.items(): - mb_metric = self._make_multi_batch_metric_from_list_of_single_batch_metrics(metric_key[0], single_batch_metric_list, - batch_index) - mb_metrics[mb_metric.key] = mb_metric - - return mb_metrics - - def _make_multi_batch_metric_from_list_of_single_batch_metrics(self, single_batch_metric_name, single_batch_metric_list, batch_index): - """ - Utility method that gets a list of single batch metrics with the same multi-batch key (meaning that they are the same - metric with the same kwargs, but obtained by validating different batches of the same data asset) and - constructs a multi-batch metric for that key. 
- - :param single_batch_metric_name: - :param single_batch_metric_list: - :param batch_index: - :return: - """ - first_non_null_single_batch_metric = [item for item in single_batch_metric_list if item is not None][0] - - if 'NamespaceAwareValidationMetric' == single_batch_metric_name: - mb_metric = MultiBatchNamespaceAwareValidationMetric( - data_asset_name=first_non_null_single_batch_metric.data_asset_name, - metric_name=first_non_null_single_batch_metric.metric_name, - metric_kwargs=first_non_null_single_batch_metric.metric_kwargs, - batch_fingerprints=batch_index, - batch_metric_values=[None if metric is None else metric.metric_value for metric in - single_batch_metric_list] - ) - elif 'NamespaceAwareExpectationDefinedValidationMetric' == single_batch_metric_name: - mb_metric = MultiBatchNamespaceAwareExpectationDefinedValidationMetric( - data_asset_name = first_non_null_single_batch_metric.data_asset_name, - result_key = first_non_null_single_batch_metric.result_key, - expectation_type = first_non_null_single_batch_metric.expectation_type, - metric_kwargs = first_non_null_single_batch_metric.metric_kwargs, - batch_fingerprints = batch_index, - batch_metric_values = [None if metric is None else metric.metric_value for metric in single_batch_metric_list] - ) - - return mb_metric diff --git a/great_expectations/profile/metrics_utils.py b/great_expectations/profile/metrics_utils.py index d9bec14f5c0d..2ba0246e0f42 100644 --- a/great_expectations/profile/metrics_utils.py +++ b/great_expectations/profile/metrics_utils.py @@ -1,68 +1,17 @@ -import collections -from functools import reduce -import operator +from hashlib import md5 -def acts_as_a_number(var): - try: - 0 + var - except TypeError: - return False - else: - return True -def make_dictionary_key(d): - """ +def tuple_to_hash(tuple_): + return md5(str(tuple_).encode("utf-8")).hexdigest() - :param d: - :return: - """ - return tuple(sorted([item for item in flatten_nested_dictionary_to_hashable_tuple_list(d)],\ - key=lambda item: '.'.join(item[0]) if type(item[0]) is tuple else item[0])) - -def flatten_nested_dictionary_to_hashable_tuple_list(d, nested_key_tuple=()): - """ - Assumption: leaf values can be either lists or primitive - - :param d: - :param nested_key_tuple: - :return: - """ - for key, value in d.items(): - if isinstance(value, collections.Mapping): - for inner_key, inner_value in flatten_nested_dictionary_to_hashable_tuple_list(value, nested_key_tuple=nested_key_tuple + (key,)): - yield inner_key, inner_value - else: - yield (key if nested_key_tuple==() else (nested_key_tuple + (key,)), tuple(value) if type(value) is list else value) - - -def result_contains_numeric_observed_value(result): - """ - - :param result: - :return: - """ - return ('observed_value' in result['result'] \ - and acts_as_a_number(result['result'].get('observed_value'))) \ - and set(result['result'].keys()) <= set( - ['observed_value', 'element_count', 'missing_count', 'missing_percent']) - - -def result_contains_unexpected_pct(result): - """ - - :param result: - :return: - """ - return 'unexpected_percent' in result['result'] \ - and result['expectation_config']['expectation_type'] != 'expect_column_values_to_be_in_set' - - - -def get_nested_value_from_dict(d, key_path): - return reduce(operator.getitem, key_path, d) - -def set_nested_value_in_dict(d, key_path, value): - for key in key_path[:-1]: - d = d.setdefault(key, {}) - d[key_path[-1]] = value +def kwargs_to_tuple(d): + """Convert expectation configuration kwargs to a canonical tuple.""" + if 
isinstance(d, list): + return tuple([kwargs_to_tuple(v) for v in sorted(d)]) + elif isinstance(d, dict): + return tuple([(k, kwargs_to_tuple(v)) for k, v in sorted(d.items()) + if k not in [ + "result_format", "include_config", "catch_exceptions", "meta" + ]]) + return d diff --git a/great_expectations/profile/multi_batch_validation_meta_analysis.py b/great_expectations/profile/multi_batch_validation_meta_analysis.py index d1900cf85a2c..0a31d973f76a 100644 --- a/great_expectations/profile/multi_batch_validation_meta_analysis.py +++ b/great_expectations/profile/multi_batch_validation_meta_analysis.py @@ -1,167 +1,167 @@ -import logging -from collections import defaultdict -import collections - -import warnings -from great_expectations.datasource.types import BatchKwargs -from great_expectations.profile.metrics_store import MetricsStore -from great_expectations.profile.metrics_utils import ( -set_nested_value_in_dict, -get_nested_value_from_dict -) - -logger = logging.getLogger(__name__) - - -class MultiBatchValidationMetaAnalysis(object): - """MultiBatchValidationMetaAnalysis takes a list of validation results - (same expectation suite evaluated against multiple batches) - and returns multi-batch metrics from these results. - - """ - - # (expectation type, result key) -> (expectation kwargs that should become metric kwargs) - # result key is a string or a tuple if the key is nested. same for the expectation kwargs - - # NOTE: Eugene: 2019-09-04: Add more entries - EXPECTATION_DEFINED_METRICS_LOOKUP_TABLE = { - ('expect_column_values_to_not_be_null', ('unexpected_percent',)): ('column',), # note: "," is important - it makes it a tuple! - ('expect_column_quantile_values_to_be_between', ('observed_value', 'values')): ( - 'column', ('quantile_ranges', 'quantiles')), - - } - - @classmethod - def add_expectation_defined_metric_for_result_key(cls, d, result, data_asset_name, batch_kwargs, metrics_store, t=()): - for key, value in d.items(): - if isinstance(value, collections.Mapping): - cls.add_expectation_defined_metric_for_result_key(value, result, data_asset_name, batch_kwargs, metrics_store, t + (key,)) - else: - # result_key_lookup_key = key if t==() else (t + (key,)) - result_key_lookup_key = (t + (key,)) - full_lookup_key = (result['expectation_config']['expectation_type'], result_key_lookup_key) - metric_kwargs_names = cls.EXPECTATION_DEFINED_METRICS_LOOKUP_TABLE.get(full_lookup_key) - if metric_kwargs_names: - metric_kwargs = {} - for metric_kwarg_name in metric_kwargs_names: - if isinstance(metric_kwarg_name, tuple): - set_nested_value_in_dict(metric_kwargs, metric_kwarg_name, get_nested_value_from_dict(result['expectation_config']['kwargs'], metric_kwarg_name)) - else: - metric_kwargs[metric_kwarg_name] = result['expectation_config']['kwargs'][metric_kwarg_name] - - metrics_store.add_single_batch_expectation_defined_metric( - data_asset_name, - batch_kwargs.batch_fingerprint, - result['expectation_config']['expectation_type'], - result_key_lookup_key, - metric_kwargs, - value) - - @classmethod - def add_metrics_from_single_expectation_validation_result(cls, result, data_asset_name, batch_kwargs, metrics_store): - """ - Extract metrics from a validation result of one expectation and store them. - Depending on the type of the expectation, this method chooses the key - in the result dictionary that should be returned as a metric - (e.g., "observed_value" or "unexpected_percent"). 
- - :param result: a validation result dictionary of one expectation - :param data_asset_name: - :param batch_kwargs: BatchKwargs of the batch that was validated - :param metrics_store - """ - # NOTE: Eugene: 2019-09-04: Add more entries - expectation_metrics = { - # 'expect_column_distinct_values_to_be_in_set' - # 'expect_column_kl_divergence_to_be_less_than', - 'expect_column_max_to_be_between': { - 'observed_value': 'column_max' - }, - 'expect_column_mean_to_be_between': { - 'observed_value': 'column_mean' - }, - 'expect_column_median_to_be_between': { - 'observed_value': 'column_median' - }, - 'expect_column_min_to_be_between': { - 'observed_value': 'column_min' - }, - 'expect_column_proportion_of_unique_values_to_be_between': { - 'observed_value': 'column_proportion_of_unique_values' - }, - # 'expect_column_quantile_values_to_be_between', - 'expect_column_stdev_to_be_between': { - 'observed_value': 'column_stdev' - }, - 'expect_column_unique_value_count_to_be_between': { - 'observed_value': 'column_unique_count' - }, - # 'expect_column_values_to_be_between', - # 'expect_column_values_to_be_in_set', - # 'expect_column_values_to_be_in_type_list', - 'expect_column_values_to_be_unique': { - - }, - # 'expect_table_columns_to_match_ordered_list', - 'expect_table_row_count_to_be_between': { - 'observed_value': 'row_count' - } - - } - - metrics = [] - if result.get('result'): - entry = expectation_metrics.get(result['expectation_config']['expectation_type']) - if entry: - for key in result['result'].keys(): - metric_name = entry.get(key) - if metric_name: - metric_kwargs = {"column": result['expectation_config']['kwargs']['column']} if result['expectation_config'][ - 'kwargs'].get('column') else {} - - metrics_store.add_single_batch_metric( - data_asset_name, - batch_kwargs.batch_fingerprint, - metric_name, - metric_kwargs, - result['result'][key]) - - else: - cls.add_expectation_defined_metric_for_result_key(result['result'], result, - data_asset_name, batch_kwargs, metrics_store) - - @classmethod - def get_metrics(cls, validation_results_list, data_context): - """ - Get multi-batch metrics from a list of validation results - - :param validation_results_list: a list validation results where each item is a - result of validating a batch against the same expectation suite - :return: a dict: {multi-batch metric urn -> multi-batch metric} - """ - - # NOTE: Eugene: 2019-09-04: For now we are creating an instance of metrics store here - # but it probably should be some singleton obtained from a factory/manager. - metrics_store = MetricsStore() - - batch_kwargs_list = [] - for j, one_batch_validation_results in enumerate(validation_results_list): - # print(json.dumps(one_batch_validation_results['meta'], indent=2)) - batch_kwargs = BatchKwargs(one_batch_validation_results['meta']['batch_kwargs']) - batch_kwargs_list.append(batch_kwargs) - - # NOTE: Eugene 2019-08-25: when validation results be a typed object, - # that object will have data_asset_name property method that will - # return a NormalizedDataAssetName. 
Until then we are constructing - # a NormalizedDataAssetName from the string that we fetch from the dictionary - normalized_data_asset_name = data_context.normalize_data_asset_name( - one_batch_validation_results['meta']['data_asset_name']) - for i, result in enumerate(one_batch_validation_results['results']): - cls.add_metrics_from_single_expectation_validation_result(result, - normalized_data_asset_name, - batch_kwargs, - metrics_store) - - mb_metrics = metrics_store.get_multi_batch_metrics(batch_kwargs_list) - - return mb_metrics +# import logging +# from collections import defaultdict +# import collections +# +# import warnings +# from great_expectations.datasource.types import BatchKwargs +# from great_expectations.profile.metrics_store import MetricsStore +# from great_expectations.profile.metrics_utils import ( +# set_nested_value_in_dict, +# get_nested_value_from_dict +# ) +# +# logger = logging.getLogger(__name__) +# +# +# class MultiBatchValidationMetaAnalysis(object): +# """MultiBatchValidationMetaAnalysis takes a list of validation results +# (same expectation suite evaluated against multiple batches) +# and returns multi-batch metrics from these results. +# +# """ +# +# # (expectation type, result key) -> (expectation kwargs that should become metric kwargs) +# # result key is a string or a tuple if the key is nested. same for the expectation kwargs +# +# # NOTE: Eugene: 2019-09-04: Add more entries +# EXPECTATION_DEFINED_METRICS_LOOKUP_TABLE = { +# ('expect_column_values_to_not_be_null', ('unexpected_percent',)): ('column',), # note: "," is important - it makes it a tuple! +# ('expect_column_quantile_values_to_be_between', ('observed_value', 'values')): ( +# 'column', ('quantile_ranges', 'quantiles')), +# +# } +# +# @classmethod +# def add_expectation_defined_metric_for_result_key(cls, d, result, data_asset_name, batch_kwargs, metrics_store, t=()): +# for key, value in d.items(): +# if isinstance(value, collections.Mapping): +# cls.add_expectation_defined_metric_for_result_key(value, result, data_asset_name, batch_kwargs, metrics_store, t + (key,)) +# else: +# # result_key_lookup_key = key if t==() else (t + (key,)) +# result_key_lookup_key = (t + (key,)) +# full_lookup_key = (result.expectation_config.expectation_type, result_key_lookup_key) +# metric_kwargs_names = cls.EXPECTATION_DEFINED_METRICS_LOOKUP_TABLE.get(full_lookup_key) +# if metric_kwargs_names: +# metric_kwargs = {} +# for metric_kwarg_name in metric_kwargs_names: +# if isinstance(metric_kwarg_name, tuple): +# set_nested_value_in_dict(metric_kwargs, metric_kwarg_name, get_nested_value_from_dict(result.expectation_config['kwargs'], metric_kwarg_name)) +# else: +# metric_kwargs[metric_kwarg_name] = result.expectation_config['kwargs'][metric_kwarg_name] +# +# metrics_store.add_single_batch_expectation_defined_metric( +# data_asset_name, +# batch_kwargs.batch_fingerprint, +# result.expectation_config.expectation_type, +# result_key_lookup_key, +# metric_kwargs, +# value) +# +# @classmethod +# def add_metrics_from_single_expectation_validation_result(cls, result, data_asset_name, batch_kwargs, metrics_store): +# """ +# Extract metrics from a validation result of one expectation and store them. +# Depending on the type of the expectation, this method chooses the key +# in the result dictionary that should be returned as a metric +# (e.g., "observed_value" or "unexpected_percent"). 
+# +# :param result: a validation result dictionary of one expectation +# :param data_asset_name: +# :param batch_kwargs: BatchKwargs of the batch that was validated +# :param metrics_store +# """ +# # NOTE: Eugene: 2019-09-04: Add more entries +# expectation_metrics = { +# # 'expect_column_distinct_values_to_be_in_set' +# # 'expect_column_kl_divergence_to_be_less_than', +# 'expect_column_max_to_be_between': { +# 'observed_value': 'column_max' +# }, +# 'expect_column_mean_to_be_between': { +# 'observed_value': 'column_mean' +# }, +# 'expect_column_median_to_be_between': { +# 'observed_value': 'column_median' +# }, +# 'expect_column_min_to_be_between': { +# 'observed_value': 'column_min' +# }, +# 'expect_column_proportion_of_unique_values_to_be_between': { +# 'observed_value': 'column_proportion_of_unique_values' +# }, +# # 'expect_column_quantile_values_to_be_between', +# 'expect_column_stdev_to_be_between': { +# 'observed_value': 'column_stdev' +# }, +# 'expect_column_unique_value_count_to_be_between': { +# 'observed_value': 'column_unique_count' +# }, +# # 'expect_column_values_to_be_between', +# # 'expect_column_values_to_be_in_set', +# # 'expect_column_values_to_be_in_type_list', +# 'expect_column_values_to_be_unique': { +# +# }, +# # 'expect_table_columns_to_match_ordered_list', +# 'expect_table_row_count_to_be_between': { +# 'observed_value': 'row_count' +# } +# +# } +# +# metrics = [] +# if result.get('result'): +# entry = expectation_metrics.get(result.expectation_config.expectation_type) +# if entry: +# for key in result['result'].keys(): +# metric_name = entry.get(key) +# if metric_name: +# metric_kwargs = {"column": result.expectation_config['kwargs']['column']} if result.expectation_config[ +# 'kwargs'].get('column') else {} +# +# metrics_store.add_single_batch_metric( +# data_asset_name, +# batch_kwargs.batch_fingerprint, +# metric_name, +# metric_kwargs, +# result['result'][key]) +# +# else: +# cls.add_expectation_defined_metric_for_result_key(result['result'], result, +# data_asset_name, batch_kwargs, metrics_store) +# +# @classmethod +# def get_metrics(cls, validation_results_list, data_context): +# """ +# Get multi-batch metrics from a list of validation results +# +# :param validation_results_list: a list validation results where each item is a +# result of validating a batch against the same expectation suite +# :return: a dict: {multi-batch metric urn -> multi-batch metric} +# """ +# +# # NOTE: Eugene: 2019-09-04: For now we are creating an instance of metrics store here +# # but it probably should be some singleton obtained from a factory/manager. +# metrics_store = MetricsStore() +# +# batch_kwargs_list = [] +# for j, one_batch_validation_results in enumerate(validation_results_list): +# # print(json.dumps(one_batch_validation_results['meta'], indent=2)) +# batch_kwargs = BatchKwargs(one_batch_validation_results['meta']['batch_kwargs']) +# batch_kwargs_list.append(batch_kwargs) +# +# # NOTE: Eugene 2019-08-25: when validation results be a typed object, +# # that object will have data_asset_name property method that will +# # return a NormalizedDataAssetName. 
Until then we are constructing
+# a NormalizedDataAssetName from the string that we fetch from the dictionary
+# normalized_data_asset_name = data_context.normalize_data_asset_name(
+# one_batch_validation_results['meta']['data_asset_name'])
+# for i, result in enumerate(one_batch_validation_results['results']):
+# cls.add_metrics_from_single_expectation_validation_result(result,
+# normalized_data_asset_name,
+# batch_kwargs,
+# metrics_store)
+#
+# mb_metrics = metrics_store.get_multi_batch_metrics(batch_kwargs_list)
+#
+# return mb_metrics
diff --git a/great_expectations/profile/sample_expectations_dataset_profiler.py b/great_expectations/profile/sample_expectations_dataset_profiler.py
new file mode 100644
index 000000000000..7db1290dffcd
--- /dev/null
+++ b/great_expectations/profile/sample_expectations_dataset_profiler.py
@@ -0,0 +1,304 @@
+import datetime
+
+from dateutil.parser import parse
+
+from great_expectations.dataset.util import build_categorical_partition_object
+from great_expectations.profile.basic_dataset_profiler import (
+    BasicDatasetProfilerBase,
+    logger,
+)
+
+
+class SampleExpectationsDatasetProfiler(BasicDatasetProfilerBase):
+    """The goal of SampleExpectationsDatasetProfiler is to generate an expectation suite that
+    contains one instance of every interesting expectation type.
+
+    This expectation suite is intended to serve as a demo of the expressive power of expectations
+    and provide a service similar to the one the expectations glossary documentation page provides,
+    but on users' own data.
+
+    Ranges of acceptable values in the expectations created by this profiler (e.g., min/max
+    of the median in expect_column_median_to_be_between) are created only to demonstrate
+    the functionality and should not be taken as the actual ranges outside which the data
+    should be considered incorrect.
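+
+    A sketch of typical usage, assuming the classmethod interface inherited from
+    DatasetProfiler (the batch variable is illustrative, e.g. one returned by
+    context.get_batch):
+
+        expectation_suite, validation_result = SampleExpectationsDatasetProfiler.profile(batch)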
+ """ + + @classmethod + def _get_column_type_with_caching(cls, dataset, column_name, cache): + column_cache_entry = cache.get(column_name) + if not column_cache_entry: + column_cache_entry = {} + cache[column_name] = column_cache_entry + column_type = column_cache_entry.get("type") + if not column_type: + column_type = cls._get_column_type(dataset, column_name) + column_cache_entry["type"] = column_type + # remove the expectation + dataset.remove_expectation(expectation_type="expect_column_values_to_be_in_type_list") + dataset.set_config_value('interactive_evaluation', True) + + return column_type + + + @classmethod + def _get_column_cardinality_with_caching(cls, dataset, column_name, cache): + column_cache_entry = cache.get(column_name) + if not column_cache_entry: + column_cache_entry = {} + cache[column_name] = column_cache_entry + column_cardinality = column_cache_entry.get("cardinality") + if not column_cardinality: + column_cardinality = cls._get_column_cardinality(dataset, column_name) + column_cache_entry["cardinality"] = column_cardinality + # remove the expectations + dataset.remove_expectation(expectation_type="expect_column_unique_value_count_to_be_between") + dataset.remove_expectation(expectation_type="expect_column_proportion_of_unique_values_to_be_between") + dataset.set_config_value('interactive_evaluation', True) + + return column_cardinality + + @classmethod + def _create_expectations_for_low_card_column(cls, dataset, column, column_cache): + cls._create_non_nullity_expectations(dataset, column) + + value_set = \ + dataset.expect_column_distinct_values_to_be_in_set(column, value_set=None, result_format="SUMMARY").result[ + "observed_value"] + dataset.expect_column_distinct_values_to_be_in_set(column, value_set=value_set, result_format="SUMMARY") + + if cls._get_column_cardinality_with_caching(dataset, column, column_cache) in ["two", "very few"]: + partition_object = build_categorical_partition_object(dataset, column) + dataset.expect_column_kl_divergence_to_be_less_than(column, partition_object=partition_object, + threshold=0.6, catch_exceptions=True) + + @classmethod + def _create_non_nullity_expectations(cls, dataset, column): + not_null_result = dataset.expect_column_values_to_not_be_null(column) + if not not_null_result.success: + mostly_value = max(0.001, (100.0 - not_null_result.result["unexpected_percent"] - 10) / 100.0) + dataset.expect_column_values_to_not_be_null(column, mostly=mostly_value) + + @classmethod + def _create_expectations_for_numeric_column(cls, dataset, column): + cls._create_non_nullity_expectations(dataset, column) + + value = \ + dataset.expect_column_min_to_be_between(column, min_value=None, max_value=None, result_format="SUMMARY").result[ + "observed_value"] + value = dataset.expect_column_min_to_be_between(column, min_value=value - 1, max_value=value + 1) + + value = \ + dataset.expect_column_max_to_be_between(column, min_value=None, max_value=None, result_format="SUMMARY").result[ + "observed_value"] + value = dataset.expect_column_max_to_be_between(column, min_value=value - 1, max_value=value + 1) + + value = dataset.expect_column_mean_to_be_between(column, min_value=None, max_value=None, + result_format="SUMMARY").result["observed_value"] + dataset.expect_column_mean_to_be_between(column, min_value=value - 1, max_value=value + 1) + + value = dataset.expect_column_median_to_be_between(column, min_value=None, max_value=None, + result_format="SUMMARY").result["observed_value"] + dataset.expect_column_median_to_be_between(column, 
+
+        result = dataset.expect_column_quantile_values_to_be_between(
+            column,
+            quantile_ranges={
+                "quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
+                "value_ranges": [
+                    [None, None],
+                    [None, None],
+                    [None, None],
+                    [None, None],
+                    [None, None],
+                ],
+            },
+            result_format="SUMMARY",
+            catch_exceptions=True
+        )
+        if result.exception_info and (
+            result.exception_info["exception_traceback"]
+            or result.exception_info["exception_message"]
+        ):
+            # TODO quantiles are not implemented correctly on sqlite, and likely other sql dialects
+            logger.debug(result.exception_info["exception_traceback"])
+            logger.debug(result.exception_info["exception_message"])
+        else:
+            # interactive_evaluation is turned off so the tightened expectation is
+            # stored without being re-run against the data
+            dataset.set_config_value('interactive_evaluation', False)
+            dataset.expect_column_quantile_values_to_be_between(
+                column,
+                quantile_ranges={
+                    "quantiles": result.result["observed_value"]["quantiles"],
+                    "value_ranges": [
+                        [v - 1, v + 1] for v in result.result["observed_value"]["values"]
+                    ],
+                },
+                catch_exceptions=True
+            )
+            dataset.set_config_value('interactive_evaluation', True)
+
+    @classmethod
+    def _create_expectations_for_string_column(cls, dataset, column):
+        cls._create_non_nullity_expectations(dataset, column)
+        dataset.expect_column_value_lengths_to_be_between(column, min_value=1)
+
+    @classmethod
+    def _find_next_low_card_column(cls, dataset, columns, profiled_columns, column_cache):
+        for column in columns:
+            if column in profiled_columns["low_card"]:
+                continue
+            cardinality = cls._get_column_cardinality_with_caching(dataset, column, column_cache)
+            if cardinality in ["two", "very few", "few"]:
+                return column
+
+        return None
+
+    @classmethod
+    def _find_next_numeric_column(cls, dataset, columns, profiled_columns, column_cache):
+        for column in columns:
+            if column in profiled_columns["numeric"]:
+                continue
+            if column.lower().strip() == "id" or column.lower().strip().find("_id") > -1:
+                continue
+
+            cardinality = cls._get_column_cardinality_with_caching(dataset, column, column_cache)
+            type = cls._get_column_type_with_caching(dataset, column, column_cache)
+
+            if cardinality in ["many", "very many", "unique"] and type in ["int", "float"]:
+                return column
+
+        return None
+
+    @classmethod
+    def _find_next_string_column(cls, dataset, columns, profiled_columns, column_cache):
+        for column in columns:
+            if column in profiled_columns["string"]:
+                continue
+
+            cardinality = cls._get_column_cardinality_with_caching(dataset, column, column_cache)
+            type = cls._get_column_type_with_caching(dataset, column, column_cache)
+
+            if cardinality in ["many", "very many", "unique"] and type in ["string", "unknown"]:
+                return column
+
+        return None
+
+    @classmethod
+    def _find_next_datetime_column(cls, dataset, columns, profiled_columns, column_cache):
+        for column in columns:
+            if column in profiled_columns["datetime"]:
+                continue
+
+            cardinality = cls._get_column_cardinality_with_caching(dataset, column, column_cache)
+            type = cls._get_column_type_with_caching(dataset, column, column_cache)
+
+            if cardinality in ["many", "very many", "unique"] and type in ["datetime"]:
+                return column
+
+        return None
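+
+    # The _find_next_* helpers above scan columns in order and return the first
+    # not-yet-profiled column that matches a heuristic (for example, a numeric
+    # candidate must be high-cardinality and must not look like an "id"
+    # column), so the demo suite touches at most one column of each flavor.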
+
+    @classmethod
+    def _create_expectations_for_datetime_column(cls, dataset, column):
+        cls._create_non_nullity_expectations(dataset, column)
+
+        min_value = dataset.expect_column_min_to_be_between(
+            column, min_value=None, max_value=None, result_format="SUMMARY").result["observed_value"]
+
+        if min_value is not None:
+            dataset.remove_expectation(expectation_type="expect_column_min_to_be_between", column=column)
+            try:
+                min_value = min_value + datetime.timedelta(days=-365)
+            except OverflowError:
+                min_value = datetime.datetime.min
+            except TypeError:
+                # the observed value came back as a string; parse it before shifting
+                min_value = parse(min_value) + datetime.timedelta(days=-365)
+
+        max_value = dataset.expect_column_max_to_be_between(
+            column, min_value=None, max_value=None, result_format="SUMMARY").result["observed_value"]
+        if max_value is not None:
+            dataset.remove_expectation(expectation_type="expect_column_max_to_be_between", column=column)
+            try:
+                max_value = max_value + datetime.timedelta(days=365)
+            except OverflowError:
+                max_value = datetime.datetime.max
+            except TypeError:
+                max_value = parse(max_value) + datetime.timedelta(days=365)
+
+        if min_value is not None or max_value is not None:
+            dataset.expect_column_values_to_be_between(column, min_value, max_value, parse_strings_as_datetimes=True)
+
+    @classmethod
+    def _profile(cls, dataset):
+
+        dataset.set_default_expectation_argument("catch_exceptions", False)
+
+        value = dataset.expect_table_row_count_to_be_between(min_value=0, max_value=None).result["observed_value"]
+        dataset.expect_table_row_count_to_be_between(min_value=max(0, value - 10), max_value=value + 10)
+
+        dataset.set_config_value('interactive_evaluation', True)
+
+        columns = dataset.get_table_columns()
+
+        dataset.expect_table_column_count_to_equal(len(columns))
+        dataset.expect_table_columns_to_match_ordered_list(columns)
+
+        meta_columns = {}
+        for column in columns:
+            meta_columns[column] = {"description": ""}
+
+        column_cache = {}
+        profiled_columns = {
+            "numeric": [],
+            "low_card": [],
+            "string": [],
+            "datetime": []
+        }
+
+        column = cls._find_next_low_card_column(dataset, columns, profiled_columns, column_cache)
+        if column:
+            cls._create_expectations_for_low_card_column(dataset, column, column_cache)
+            profiled_columns["low_card"].append(column)
+
+        column = cls._find_next_numeric_column(dataset, columns, profiled_columns, column_cache)
+        if column:
+            cls._create_expectations_for_numeric_column(dataset, column)
+            profiled_columns["numeric"].append(column)
+
+        column = cls._find_next_string_column(dataset, columns, profiled_columns, column_cache)
+        if column:
+            cls._create_expectations_for_string_column(dataset, column)
+            profiled_columns["string"].append(column)
+
+        column = cls._find_next_datetime_column(dataset, columns, profiled_columns, column_cache)
+        if column:
+            cls._create_expectations_for_datetime_column(dataset, column)
+            profiled_columns["datetime"].append(column)
+
+        expectation_suite = dataset.get_expectation_suite(suppress_warnings=True, discard_failed_expectations=True)
+        if not expectation_suite.meta:
+            # "notes" is populated immediately below, so only "columns" needs seeding here
+            expectation_suite.meta = {"columns": meta_columns}
+        else:
+            expectation_suite.meta["columns"] = meta_columns
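+
+        # suite.meta["notes"] is rendered at the top of the suite's data docs
+        # page; "content" holds a list of markdown strings.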
+""" + ] + } + + return expectation_suite diff --git a/great_expectations/render/exceptions.py b/great_expectations/render/exceptions.py new file mode 100644 index 000000000000..08a986b10ccb --- /dev/null +++ b/great_expectations/render/exceptions.py @@ -0,0 +1,5 @@ +from ..exceptions import GreatExpectationsTypeError + + +class InvalidRenderedContentError(GreatExpectationsTypeError): + pass diff --git a/great_expectations/render/renderer/column_section_renderer.py b/great_expectations/render/renderer/column_section_renderer.py index 44b5df7e9c57..a8602f745879 100644 --- a/great_expectations/render/renderer/column_section_renderer.py +++ b/great_expectations/render/renderer/column_section_renderer.py @@ -1,19 +1,33 @@ import json -from builtins import str # PY2 compatibility +import logging import re +from builtins import str # PY2 compatibility import altair as alt import pandas as pd -from .renderer import Renderer +from great_expectations.core import ( + ExpectationConfiguration, + ExpectationValidationResult, +) +from great_expectations.data_context.util import instantiate_class_from_config +from great_expectations.render.renderer.content_block import ( + ExceptionListContentBlockRenderer, +) +from great_expectations.render.renderer.renderer import Renderer +from great_expectations.render.types import ( + RenderedBulletListContent, + RenderedGraphContent, + RenderedHeaderContent, + RenderedSectionContent, + RenderedStringTemplateContent, + RenderedTableContent, + TextContent, + ValueListContent, +) from great_expectations.util import load_class -from .content_block import ExceptionListContentBlockRenderer - -from ..types import RenderedSectionContent -from ..types import ( - RenderedComponentContent, -) +logger = logging.getLogger(__name__) def convert_to_string_and_escape(var): @@ -29,30 +43,53 @@ def _get_column_name(cls, ge_object): else: candidate_object = ge_object try: - if "kwargs" in candidate_object: - # This is an expectation - return candidate_object["kwargs"]["column"] - elif "expectation_config" in candidate_object: - # This is a validation - return candidate_object["expectation_config"]["kwargs"]["column"] + if isinstance(candidate_object, ExpectationConfiguration): + return candidate_object.kwargs["column"] + elif isinstance(candidate_object, ExpectationValidationResult): + return candidate_object.expectation_config.kwargs["column"] else: raise ValueError( "Provide a column section renderer an expectation, list of expectations, evr, or list of evrs.") except KeyError: - return None + return "Table-Level Expectations" class ProfilingResultsColumnSectionRenderer(ColumnSectionRenderer): - def __init__(self, overview_table_renderer=None): + def __init__(self, overview_table_renderer=None, expectation_string_renderer=None, runtime_environment=None): if overview_table_renderer is None: overview_table_renderer = { "class_name": "ProfilingOverviewTableContentBlockRenderer" } - self._overview_table_renderer = load_class( - class_name=overview_table_renderer.get("class_name"), - module_name=overview_table_renderer.get("module_name", "great_expectations.render.renderer.content_block") + if expectation_string_renderer is None: + expectation_string_renderer = { + "class_name": "ExpectationStringRenderer" + } + self._overview_table_renderer = instantiate_class_from_config( + config=overview_table_renderer, + runtime_environment=runtime_environment, + config_defaults={ + "module_name": "great_expectations.render.renderer.content_block" + } ) + self._expectation_string_renderer = 
 
     #Note: Seems awkward to pass section_name and column_type into this renderer.
     #Can't we figure that out internally?
@@ -64,24 +101,15 @@ def render(self, evrs, section_name=None, column_type=None):
 
         content_blocks = []
 
-        content_blocks.append(self._render_header(evrs, column_type))
-        # content_blocks.append(cls._render_column_type(evrs))
-        content_blocks.append(self._render_overview_table(evrs))
-        content_blocks.append(self._render_quantile_table(evrs))
-        content_blocks.append(self._render_stats_table(evrs))
-        content_blocks.append(self._render_histogram(evrs))
-        content_blocks.append(self._render_values_set(evrs))
-        content_blocks.append(self._render_bar_chart_table(evrs))
-
-        # content_blocks.append(cls._render_statistics(evrs))
-        # content_blocks.append(cls._render_common_values(evrs))
-        # content_blocks.append(cls._render_extreme_values(evrs))
-        # content_blocks.append(cls._render_frequency(evrs))
-        # content_blocks.append(cls._render_composition(evrs))
-        # content_blocks.append(cls._render_expectation_types(evrs))
-        # content_blocks.append(cls._render_unrecognized(evrs))
-
-        content_blocks.append(self._render_failed(evrs))
+        for content_block_function_name in self.content_block_function_names:
+            try:
+                if content_block_function_name == "_render_header":
+                    content_blocks.append(getattr(self, content_block_function_name)(evrs, column_type))
+                else:
+                    content_blocks.append(getattr(self, content_block_function_name)(evrs))
+            except Exception as e:
+                logger.error("Exception occurred during data docs rendering: %s", e, exc_info=True)
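+        # Failures are isolated per content block: a broken renderer drops its
+        # own block (and logs the traceback) instead of taking down the page.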
 
         # NOTE : Some render* functions return None so we filter them out
         populated_content_blocks = list(filter(None, content_blocks))
@@ -94,30 +122,45 @@ def render(self, evrs, section_name=None, column_type=None):
     def _render_header(cls, evrs, column_type=None):
         # NOTE: This logic is brittle
         try:
-            column_name = evrs[0]["expectation_config"]["kwargs"]["column"]
+            column_name = evrs[0].expectation_config.kwargs["column"]
         except KeyError:
             column_name = "Table-level expectations"
 
-        return RenderedComponentContent(**{
+        return RenderedHeaderContent(**{
             "content_block_type": "header",
-            "header": {
-                "template": convert_to_string_and_escape(column_name),
-                "tooltip": {
-                    "content": "expect_column_to_exist",
-                    "placement": "top"
-                },
-            },
-            "subheader": {
-                "template": "Type: {column_type}".format(column_type=column_type),
-                "tooltip": {
-                    "content": "expect_column_values_to_be_of_type expect_column_values_to_be_in_type_list",
-                },
-            },
+            "header": RenderedStringTemplateContent(**{
+                "content_block_type": "string_template",
+                "string_template": {
+                    "template": convert_to_string_and_escape(column_name),
+                    "tooltip": {
+                        "content": "expect_column_to_exist",
+                        "placement": "top"
+                    },
+                    "tag": "h5",
+                    "styling": {
+                        "classes": ["m-0", "p-0"]
+                    }
+                }
+            }),
+            "subheader": RenderedStringTemplateContent(**{
+                "content_block_type": "string_template",
+                "string_template": {
+                    "template": "Type: {column_type}".format(column_type=column_type),
+                    "tooltip": {
+                        "content":
+                            "expect_column_values_to_be_of_type expect_column_values_to_be_in_type_list",
+                    },
+                    "tag": "h6",
+                    "styling": {
+                        "classes": ["mt-1", "mb-0"]
+                    }
+                }
+            }),
             # {
             #     "template": column_type,
             # },
             "styling": {
-                "classes": ["col-12"],
+                "classes": ["col-12", "p-0"],
                 "header": {
                     "classes": ["alert", "alert-secondary"]
                 }
@@ -132,7 +175,7 @@ def _render_expectation_types(cls, evrs, content_blocks):
 
         # type_counts = defaultdict(int)
 
         # for evr in evrs:
-        #     type_counts[evr["expectation_config"]["expectation_type"]] += 1
+        #     type_counts[evr.expectation_config.expectation_type] += 1
 
         # bullet_list = sorted(type_counts.items(), key=lambda kv: -1*kv[1])
 
@@ -141,8 +184,8 @@ def _render_expectation_types(cls, evrs, content_blocks):
                 "string_template": {
                     "template": "$expectation_type $is_passing",
                     "params": {
-                        "expectation_type": evr["expectation_config"]["expectation_type"],
-                        "is_passing": str(evr["success"]),
+                        "expectation_type": evr.expectation_config.expectation_type,
+                        "is_passing": str(evr.success),
                     },
                     "styling": {
                         "classes": ["list-group-item", "d-flex", "justify-content-between", "align-items-center"],
@@ -155,17 +198,19 @@ def _render_expectation_types(cls, evrs, content_blocks):
             }
         } for evr in evrs]
 
-        content_blocks.append(RenderedComponentContent(**{
+        content_blocks.append(RenderedBulletListContent(**{
             "content_block_type": "bullet_list",
-            "header": 'Expectation types ',
+            "header": RenderedStringTemplateContent(**{
+                "content_block_type": "string_template",
+                "string_template": {
+                    "template": 'Expectation types ',
+                    "tag": "h6"
+                }
+            }),
             "bullet_list": bullet_list,
             "styling": {
-                "classes": ["col-12"],
-                "styles": {
-                    "margin-top": "20px"
-                },
+                "classes": ["col-12", "mt-1"],
                 "header": {
-                    # "classes": ["alert", "alert-secondary"],
                     "classes": ["collapsed"],
                     "attributes": {
                         "data-toggle": "collapse",
@@ -197,16 +242,19 @@ def _render_overview_table(self, evrs):
             evrs,
             "expect_column_values_to_not_be_null"
         )
-        evrs = [evr for evr in [unique_n, unique_proportion, null_evr] if (evr is not None and "result" in evr)]
+        evrs = [evr for evr in [unique_n, unique_proportion, null_evr] if (evr is not None)]
 
         if len(evrs) > 0:
             new_content_block = self._overview_table_renderer.render(evrs)
-            new_content_block["header"] = "Properties"
-            new_content_block["styling"] = {
-                "classes": ["col-4", ],
-                "styles": {
-                    "margin-top": "20px"
-                },
+            new_content_block.header = RenderedStringTemplateContent(**{
+                "content_block_type": "string_template",
+                "string_template": {
+                    "template": 'Properties',
+                    "tag": "h6"
+                }
+            })
+            new_content_block.styling = {
+                "classes": ["col-3", "mt-1", "pl-1", "pr-1"],
                 "body": {
                     "classes": ["table", "table-sm", "table-unbordered"],
                     "styles": {
@@ -226,18 +274,18 @@ def _render_quantile_table(cls, evrs):
             "expect_column_quantile_values_to_be_between"
         )
 
-        if not quantile_evr or "result" not in quantile_evr:
+        if not quantile_evr or quantile_evr.exception_info["raised_exception"]:
             return
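+        # EVRs are now typed ExpectationValidationResult objects, so fields
+        # are read as attributes (quantile_evr.result) rather than as
+        # dictionary keys (quantile_evr["result"]).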
"Quantiles", + "header": RenderedStringTemplateContent(**{ + "content_block_type": "string_template", + "string_template": { + "template": 'Quantiles', + "tag": "h6" + } + }), "table": table_rows, "styling": { - "classes": ["col-4"], - "styles": { - "margin-top": "20px" - }, + "classes": ["col-3", "mt-1", "pl-1", "pr-1"], "body": { "classes": ["table", "table-sm", "table-unbordered"], } @@ -277,11 +328,11 @@ def _render_stats_table(cls, evrs): "expect_column_mean_to_be_between" ) - if not mean_evr or "result" not in mean_evr: + if not mean_evr or mean_evr.exception_info["raised_exception"]: return mean_value = "{:.2f}".format( - mean_evr['result']['observed_value']) if mean_evr else None + mean_evr.result['observed_value']) if mean_evr else None if mean_value: table_rows.append([ { @@ -301,7 +352,7 @@ def _render_stats_table(cls, evrs): "expect_column_min_to_be_between" ) min_value = "{:.2f}".format( - min_evr['result']['observed_value']) if min_evr else None + min_evr.result['observed_value']) if min_evr else None if min_value: table_rows.append([ { @@ -321,7 +372,7 @@ def _render_stats_table(cls, evrs): "expect_column_max_to_be_between" ) max_value = "{:.2f}".format( - max_evr['result']['observed_value']) if max_evr else None + max_evr.result['observed_value']) if max_evr else None if max_value: table_rows.append([ { @@ -337,15 +388,18 @@ def _render_stats_table(cls, evrs): ]) if len(table_rows) > 0: - return RenderedComponentContent(**{ + return RenderedTableContent(**{ "content_block_type": "table", - "header": "Statistics", + "header": RenderedStringTemplateContent(**{ + "content_block_type": "string_template", + "string_template": { + "template": 'Statistics', + "tag": "h6" + } + }), "table": table_rows, "styling": { - "classes": ["col-4"], - "styles": { - "margin-top": "20px" - }, + "classes": ["col-3", "mt-1", "pl-1", "pr-1"], "body": { "classes": ["table", "table-sm", "table-unbordered"], } @@ -361,36 +415,38 @@ def _render_values_set(cls, evrs): "expect_column_values_to_be_in_set" ) - if not set_evr or "result" not in set_evr: + if not set_evr or set_evr.exception_info["raised_exception"]: return - if set_evr and "partial_unexpected_counts" in set_evr["result"]: - partial_unexpected_counts = set_evr["result"]["partial_unexpected_counts"] + if set_evr and "partial_unexpected_counts" in set_evr.result: + partial_unexpected_counts = set_evr.result["partial_unexpected_counts"] values = [str(v["value"]) for v in partial_unexpected_counts] - elif set_evr and "partial_unexpected_list" in set_evr["result"]: - values = [str(item) for item in set_evr["result"]["partial_unexpected_list"]] + elif set_evr and "partial_unexpected_list" in set_evr.result: + values = [str(item) for item in set_evr.result["partial_unexpected_list"]] else: return - if len(" ".join(values)) > 100: - classes = ["col-12"] - else: - classes = ["col-4"] + classes = ["col-3", "mt-1", "pl-1", "pr-1"] if any(len(value) > 80 for value in values): content_block_type = "bullet_list" + content_block_class = RenderedBulletListContent else: content_block_type = "value_list" + content_block_class = ValueListContent - new_block = RenderedComponentContent(**{ + new_block = content_block_class(**{ "content_block_type": content_block_type, - "header": - { + "header": RenderedStringTemplateContent(**{ + "content_block_type": "string_template", + "string_template": { "template": "Example Values", "tooltip": { "content": "expect_column_values_to_be_in_set" - } - }, + }, + "tag": "h6" + } + }), content_block_type: [{ 
"content_block_type": "string_template", "string_template": { @@ -410,113 +466,55 @@ def _render_values_set(cls, evrs): } for value in values], "styling": { "classes": classes, - "styles": { - "margin-top": "20px", - } } }) return new_block - @classmethod - def _render_histogram(cls, evrs): + def _render_histogram(self, evrs): # NOTE: This code is very brittle - kl_divergence_evr = cls._find_evr_by_type( + kl_divergence_evr = self._find_evr_by_type( evrs, "expect_column_kl_divergence_to_be_less_than" ) # print(json.dumps(kl_divergence_evr, indent=2)) - if not kl_divergence_evr or "result" not in kl_divergence_evr or "details" not in kl_divergence_evr.get("result", {}): + if kl_divergence_evr is None or kl_divergence_evr.result is None or "details" not in kl_divergence_evr.result: return - weights = kl_divergence_evr["result"]["details"]["observed_partition"]["weights"] - + observed_partition_object = kl_divergence_evr.result["details"]["observed_partition"] + weights = observed_partition_object["weights"] if len(weights) > 60: return None - else: - chart_pixel_width = (len(weights) / 60.0) * 1000 - if chart_pixel_width < 200: - chart_pixel_width = 200 - chart_container_col_width = round((len(weights) / 60.0) * 12) - if chart_container_col_width < 4: - chart_container_col_width = 4 - elif chart_container_col_width > 8: - chart_container_col_width = 12 - elif chart_container_col_width > 4: - chart_container_col_width = 8 - - mark_bar_args = {} - if len(weights) == 1: - mark_bar_args["size"] = 20 - - if kl_divergence_evr["result"]["details"]["observed_partition"].get("bins"): - bins = kl_divergence_evr["result"]["details"]["observed_partition"]["bins"] - bins_x1 = [round(value, 1) for value in bins[:-1]] - bins_x2 = [round(value, 1) for value in bins[1:]] - df = pd.DataFrame({ - "bin_min": bins_x1, - "bin_max": bins_x2, - "fraction": weights, - }) - df.fraction *= 100 - - bars = alt.Chart(df).mark_bar(**mark_bar_args).encode( - x='bin_min:O', - x2='bin_max:O', - y="fraction:Q", - tooltip=["bin_min", "bin_max", "fraction"] - ).properties(width=chart_pixel_width, height=400, autosize="fit") - chart = bars.to_json() - elif kl_divergence_evr["result"]["details"]["observed_partition"].get("values"): - values = kl_divergence_evr["result"]["details"]["observed_partition"]["values"] - - df = pd.DataFrame({ - "values": values, - "fraction": weights - }) - df.fraction *= 100 - - bars = alt.Chart(df).mark_bar(**mark_bar_args).encode( - x='values:N', - y="fraction:Q", - tooltip=["values", "fraction"] - ).properties(width=chart_pixel_width, height=400, autosize="fit") - chart = bars.to_json() - - return RenderedComponentContent(**{ - "content_block_type": "graph", - "header": - { - "template": "Histogram", - "tooltip": { - "content": "expect_column_kl_divergence_to_be_less_than" - } + header = RenderedStringTemplateContent(**{ + "content_block_type": "string_template", + "string_template": { + "template": "Histogram", + "tooltip": { + "content": "expect_column_kl_divergence_to_be_less_than" }, - "graph": chart, - "styling": { - "classes": ["col-" + str(chart_container_col_width)], - "styles": { - "margin-top": "20px", - } + "tag": "h6" } }) + return self._expectation_string_renderer._get_kl_divergence_chart(observed_partition_object, header) + @classmethod def _render_bar_chart_table(cls, evrs): distinct_values_set_evr = cls._find_evr_by_type( evrs, "expect_column_distinct_values_to_be_in_set" ) - # print(json.dumps(kl_divergence_evr, indent=2)) - if not distinct_values_set_evr or "result" not in 
     @classmethod
     def _render_bar_chart_table(cls, evrs):
         distinct_values_set_evr = cls._find_evr_by_type(
             evrs,
             "expect_column_distinct_values_to_be_in_set"
         )
-        # print(json.dumps(kl_divergence_evr, indent=2))
-        if not distinct_values_set_evr or "result" not in distinct_values_set_evr:
+        if not distinct_values_set_evr or distinct_values_set_evr.exception_info["raised_exception"]:
             return
 
-        value_count_dicts = distinct_values_set_evr['result']['details']['value_counts']
-        values = [value_count_dict['value']
-                  for value_count_dict in value_count_dicts]
-        counts = [value_count_dict['count']
-                  for value_count_dict in value_count_dicts]
+        value_count_dicts = distinct_values_set_evr.result['details']['value_counts']
+        if isinstance(value_count_dicts, pd.Series):
+            values = value_count_dicts.index.tolist()
+            counts = value_count_dicts.tolist()
+        else:
+            values = [value_count_dict['value'] for value_count_dict in value_count_dicts]
+            counts = [value_count_dict['count'] for value_count_dict in value_count_dicts]
 
         df = pd.DataFrame({
             "value": values,
@@ -526,16 +524,16 @@ def _render_bar_chart_table(cls, evrs):
         if len(values) > 60:
             return None
         else:
-            chart_pixel_width = (len(values) / 60.0) * 1000
-            if chart_pixel_width < 200:
-                chart_pixel_width = 200
-            chart_container_col_width = round((len(values) / 60.0) * 12)
+            chart_pixel_width = (len(values) / 60.0) * 500
+            if chart_pixel_width < 250:
+                chart_pixel_width = 250
+            chart_container_col_width = round((len(values) / 60.0) * 6)
             if chart_container_col_width < 4:
                 chart_container_col_width = 4
-            elif chart_container_col_width > 8:
-                chart_container_col_width = 12
-            elif chart_container_col_width > 4:
-                chart_container_col_width = 8
+            elif chart_container_col_width >= 5:
+                chart_container_col_width = 6
+            elif chart_container_col_width >= 4:
+                chart_container_col_width = 5
 
         mark_bar_args = {}
         if len(values) == 1:
@@ -549,21 +547,21 @@ def _render_bar_chart_table(cls, evrs):
 
         chart = bars.to_json()
 
-        new_block = RenderedComponentContent(**{
+        new_block = RenderedGraphContent(**{
             "content_block_type": "graph",
-            "header":
-                {
-                    "template": "Value Counts",
-                    "tooltip": {
-                        "content": "expect_column_distinct_values_to_be_in_set"
-                    }
-                },
+            "header": RenderedStringTemplateContent(**{
+                "content_block_type": "string_template",
+                "string_template": {
+                    "template": "Value Counts",
+                    "tooltip": {
+                        "content": "expect_column_distinct_values_to_be_in_set"
+                    },
+                    "tag": "h6"
+                }
+            }),
             "graph": chart,
             "styling": {
-                "classes": ["col-" + str(chart_container_col_width)],
-                "styles": {
-                    "margin-top": "20px",
-                }
+                "classes": ["col-" + str(chart_container_col_width), "mt-1"],
             }
         })
 
@@ -578,7 +576,7 @@ def _render_unrecognized(cls, evrs, content_blocks):
         unrendered_blocks = []
         new_block = None
         for evr in evrs:
-            if evr["expectation_config"]["expectation_type"] not in [
+            if evr.expectation_config.expectation_type not in [
                 "expect_column_to_exist",
                 "expect_column_values_to_be_of_type",
                 "expect_column_values_to_be_in_set",
@@ -589,9 +587,9 @@ def _render_unrecognized(cls, evrs, content_blocks):
                 "expect_column_mean_to_be_between",
                 "expect_column_min_to_be_between"
             ]:
-                new_block = RenderedComponentContent(**{
+                new_block = TextContent(**{
                     "content_block_type": "text",
-                    "content": []
+                    "text": []
                 })
                 new_block["content"].append("""