From df818fddf6682aa25b0fcf66c461854471498fc3 Mon Sep 17 00:00:00 2001
From: wrongkindofdoctor <20195932+wrongkindofdoctor@users.noreply.github.com>
Date: Thu, 4 Apr 2024 14:23:37 -0400
Subject: [PATCH] clean up dev docs and copy_external_docs; update toc files

---
 doc/copy_external_docs.py      |   6 +-
 doc/index.rst                  |   1 +
 doc/sphinx/dev_cheatsheet.rst  |  63 ++++++++----
 doc/sphinx/dev_coding_tips.rst | 117 ++++++++++++++----
 doc/sphinx/dev_guidelines.rst  |  18 ++--
 doc/sphinx/dev_walkthrough.rst | 170 ---------------------------------
 doc/sphinx/pod_dev_toc.rst     |   2 +-
 doc/tex_all.rst                |   2 +-
 8 files changed, 133 insertions(+), 246 deletions(-)
 delete mode 100644 doc/sphinx/dev_walkthrough.rst

diff --git a/doc/copy_external_docs.py b/doc/copy_external_docs.py
index e01574a77..e4682b8f2 100644
--- a/doc/copy_external_docs.py
+++ b/doc/copy_external_docs.py
@@ -68,7 +68,7 @@ def _docname(item):
         os.makedirs(sphinx_dir)
 
     # find PODs or sites as directories under search_root
-    entries = [x for x in os.listdir(search_root) \
+    entries = [x for x in os.listdir(search_root)
                if os.path.isdir(os.path.join(search_root, x)) and x[0].isalnum()
     ]
     # Case-insensitive alpha sort
@@ -77,10 +77,6 @@ def _docname(item):
     if 'example' in entries:
         entries.remove('example')
         entries.insert(0, 'example')
-    # put local site documentation first
-    elif 'local' in entries:
-        entries.remove('local')
-        entries.insert(0, 'local')
 
     # find documentation files
     # = all non-PDF files (.rst and graphics) in /doc subdirectory

diff --git a/doc/index.rst b/doc/index.rst
index bc5e6eb5e..12be4bf6c 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -21,6 +21,7 @@ This site contains documentation for the MDTF-Diagnostics package. The code is f
    sphinx/ref_toc
    sphinx/fmwk_toc
    sphinx/pod_toc
+   sphinx/tools_toc
 
 
 Acknowledgements

diff --git a/doc/sphinx/dev_cheatsheet.rst b/doc/sphinx/dev_cheatsheet.rst
index 325c1700b..925693cc6 100644
--- a/doc/sphinx/dev_cheatsheet.rst
+++ b/doc/sphinx/dev_cheatsheet.rst
@@ -5,31 +5,45 @@ Creating and submitting a POD
 -----------------------------
 
 1. Prepare for implementation
-   - Run the unmodified MDTF-diagnostics package to make sure that your conda installation, directory structure, etc... are set up properly
-   - Modify the conda environment to work for your POD by adding a configuration file ``MDTF_diagnostics/src/conda/env_[YOUR POD NAME].yml`` with any new required modules. Be sure to re-run ``MDTF-diagnostics/src/conda/conda_env_setup.sh`` to install your POD's environment if it requires a separate YAML file with additional modules.
-   - Name your POD, make a directory for your POD in MDTF-diagnostics/diagnostics, and move your code to your POD directory
+   - Run the unmodified MDTF-diagnostics package to make sure that your conda installation, directory structure, etc.
+     are set up properly
+   - Modify the conda environment to work for your POD by adding a configuration file
+     ``MDTF-diagnostics/src/conda/env_[YOUR POD NAME].yml`` with any new required modules. Be sure to re-run
+     ``MDTF-diagnostics/src/conda/conda_env_setup.sh`` to install your POD's environment if it requires a separate YAML
+     file with additional modules.
+   - Name your POD, make a directory for your POD in MDTF-diagnostics/diagnostics, and move your code to your POD
+     directory
   - ``cp`` your observational data to ``MDTF_diagnostics/../inputdata/obs_data/[YOUR POD NAME]``
 
 2. Link your POD code into the framework
 
   - Modify your POD's driver script (e.g., ``driver.py``) to interface with your code
   - Modify your POD's ``settings.jsonc`` to specify variables that will be passed to the framework
-  - Modify your code to use ``ENV_VARS`` provided by the framework (see the *Notes* for descriptions of the available environment variables)
+  - Modify your code to use ``ENV_VARS`` provided by the framework (see the *Notes* for descriptions of the available
+    environment variables)
   - Input files:
     - model input data: specified in an ESM-intake catalog
     - observational input data: ``MDTF-diagnostics/../inputdata/obs_data/[POD name]``
-    - You may re-define input data locations in the ``OBS_DATA_ROOT`` setting in your runtime configuration file (or whatever the name of your runtime settings jsonc file is).
+    - You may re-define input data locations in the ``OBS_DATA_ROOT`` setting in your runtime configuration file
+      (or whatever the name of your runtime settings jsonc file is).
   - Working files:
-    - ``${WORK_DIR}`` is a framework environment variable defining the working directory. It is set to ``MDTF-diagnostics/../wkdir`` by default.
+    - ``${WORK_DIR}`` is a framework environment variable defining the working directory. It is set to
+      ``MDTF-diagnostics/../wkdir`` by default.
     - ``${WORK_DIR}`` contains temporary files and logs.
-    - You can modify ``${WORK_DIR}`` by changing "WORK_DIR" to the desired location in ``templates/runtime.[jsonc |yml}``
+    - You can modify ``${WORK_DIR}`` by changing "WORK_DIR" to the desired location in
+      ``templates/runtime.[jsonc|yml]``
   - Output files:
     - POD output files are written to the following locations by the framework:
-      - Postscript files: ``${WORK_DIR}/[POD NAME]/[model,obs]/PS``
-      - Other files, including PNG plots: ``${WORK_DIR}/[POD NAME]/[model,obs]``
-      - Set the "OUTPUT_DIR" option in default_tests.jsonc to write output files to a different location; "OUTPUT_DIR" defaults to "WORK_DIR" if it is not defined.
+      - Postscript files: ``${WORK_DIR}/MDTF_output[.v#]/[POD NAME]/[model,obs]/PS``
+      - Other files, including PNG plots: ``${WORK_DIR}/MDTF_output[.v#]/[POD NAME]/[model,obs]``
+      - Set the "OUTPUT_DIR" option in default_tests.jsonc to write output files to a different location;
+        "OUTPUT_DIR" defaults to "WORK_DIR" if it is not defined.
   - Output figure locations:
     - PNG files should be placed directly in ``$WORK_DIR/obs/`` and ``$WORK_DIR/model/``
-    - If a POD chooses to save vector-format figures, they should be written into the ``$WORK_DIR/obs/PS`` and ``$WORK_DIR/model/PS`` directories. Files in these locations will be converted by the framework to PNG, so use those names in the html file.
-    - If a POD uses matplotlib, it is recommended to write as figures as EPS instead of PS because of potential bugs
+    - If a POD chooses to save vector-format figures, they should be written into the
+      ``$WORK_DIR/MDTF_output[.v#]/[POD_NAME]/obs/PS`` and
+      ``$WORK_DIR/MDTF_output[.v#]/[POD_NAME]/model/PS`` directories. Files in these locations will be
+      converted by the framework to PNG, so use those names in the html file.
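+
+      For example, a minimal sketch of saving a vector-format figure with matplotlib (this assumes ``WORK_DIR``
+      resolves to the POD's own working directory, as described in the developer guidelines; the figure contents
+      and file name are hypothetical):
+
+      .. code-block:: python
+
+         import os
+         import matplotlib
+         matplotlib.use("Agg")  # non-interactive backend, matching the framework's MPLBACKEND setting
+         import matplotlib.pyplot as plt
+
+         work_dir = os.environ["WORK_DIR"]
+         fig, ax = plt.subplots()
+         ax.plot(range(10))  # placeholder data
+         # EPS files under model/PS are converted to PNG by the framework,
+         # so reference the .png name in the POD's html template
+         fig.savefig(os.path.join(work_dir, "model", "PS", "example_model_plot.eps"))
+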
+    - If a POD uses matplotlib, it is recommended to write figures as EPS instead of PS because of potential
+      bugs
   - Modify html files to point to the figure names
@@ -39,22 +53,33 @@ Creating and submitting a POD
 Notes:
 ------
 
-- **Make sure that WORK_DIR and OUTPUT_DIR have enough space to hold data for your POD(s) AND any PODs included in the package.**
+- **Make sure that WORK_DIR and OUTPUT_DIR have enough space to hold data for your POD(s) AND any PODs included in the
+  package.**
 - Defining POD variables
-  - Add variables to the ``varlist`` block in the ``MDTF-diagnostics/diagnostics/[POD name]/settings.jsonc`` and define the following:
-    - the variable name: the short name that will generate the corresponding ``${ENV_VAR}`` (e.g., "zg500" generates the ``${ENV_VAR}`` "zg500_var")
+  - Add variables to the ``varlist`` block in the ``MDTF-diagnostics/diagnostics/[POD name]/settings.jsonc`` and define
+    the following:
+    - the variable name: the short name that will generate the corresponding ``${ENV_VAR}``
+      (e.g., "zg500" generates the ``${ENV_VAR}`` "zg500_var")
    - the standard name with a corresponding entry in the appropriate fieldlist file(s)
    - variable units
    - variable dimensions (e.g., [time, lat, lon])
    - variable realm (e.g., atmos, ocean, ice, land)
-    - scalar coordinates for variables defined on a specific atmospheric pressure level (e.g. ``{"lev": 250}`` for a field on the 250-hPa p level).
+    - scalar coordinates for variables defined on a specific atmospheric pressure level (e.g. ``{"lev": 250}``
+      for a field on the 250-hPa p level).
 
-  - If your variable is not in the necessary fieldlist file(s), add them to the file(s), or open an issue on GitHub requesting that the framework team add them. Once the files are updated, merge the changes from the main branch into your POD branch.
+  - If your variable is not in the necessary fieldlist file(s), add it to the file(s), or open an issue on GitHub
+    requesting that the framework team add them. Once the files are updated, merge the changes from the main branch
+    into your POD branch.
   - Note that the variable name and the standard name must be unique fieldlist entries
 - Environment variables
-  - To define an environment variable specific to your POD, add a ``"pod_env_vars"`` block to the ``"settings"`` block in your POD's ``settings.jsonc`` file and define the desired variables
-  - Reference an environment variable associated with a specific case in Python by calling ``os.environ[case_env_file]``, reading the file contents into a Python dictionary, and getting value associated with the first case (assuming variable names and coordinates are identical for each case), e.g. ``tas_var = [case['tas_var'] for case in case_list.values()][0]``. See ``example_multicase.py`` for more information.
+  - To define an environment variable specific to your POD, add a ``"pod_env_vars"`` block to the ``"settings"``
+    block in your POD's ``settings.jsonc`` file and define the desired variables
+  - Reference an environment variable associated with a specific case in Python by calling
+    ``os.environ[case_env_file]``, reading the file contents into a Python dictionary, and getting the value associated
+    with the first case (assuming variable names and coordinates are identical for each case), e.g.
+    ``tas_var = [case['tas_var'] for case in case_list.values()][0]``. See ``example_multicase.py`` for more
+    information, and the sketch below.
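+
+    A minimal sketch of this pattern (it assumes the case environment file is YAML, that ``PyYAML`` is available in
+    the POD's conda environment, and a hypothetical top-level ``CASE_LIST`` key; see ``example_multicase.py`` for the
+    working version):
+
+    .. code-block:: python
+
+       import os
+       import yaml
+
+       # Read the case-specific environment variables written by the framework
+       with open(os.environ["case_env_file"], "r") as f:
+           case_info = yaml.safe_load(f)
+
+       case_list = case_info["CASE_LIST"]  # hypothetical key name
+       # Variable names are assumed identical across cases, so take the first one
+       tas_var = [case["tas_var"] for case in case_list.values()][0]
+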
- NCL code can reference environment variables by calling ``getenv("VARIABLE NAME")`` - Framework-specific environment variables include: - case_env_file: path to yaml file with case-specific environment variables: diff --git a/doc/sphinx/dev_coding_tips.rst b/doc/sphinx/dev_coding_tips.rst index e1282c8ca..feecebc7b 100644 --- a/doc/sphinx/dev_coding_tips.rst +++ b/doc/sphinx/dev_coding_tips.rst @@ -45,7 +45,10 @@ Python: General `shutil `__ modules to interact with the filesystem, instead of running unix commands using ``os.system()``, ``commands`` (which is deprecated), or ``subprocess``. - *Why*: Hard-coding unix commands makes code less portable. Calling out to a subprocess introduces overhead and makes error handling and logging more difficult. The main reason, however, is that Python already provides these tools in a portable way. Please see the documentation for the `os `__ and `shutil `__ modules, summarized in this table: + *Why*: Hard-coding unix commands makes code less portable. Calling out to a subprocess introduces overhead and makes + error handling and logging more difficult. The main reason, however, is that Python already provides these tools in a + portable way. Please see the documentation for the `os `__ and + `shutil `__ modules, summarized in this table: .. list-table:: Recommended python functions for filesystem interaction :header-rows: 1 @@ -73,12 +76,16 @@ Python: General * - Delete a directory *dir* and everything inside it - `shutil.rmtree `__\(*dir*) - In particular, using `os.path.join `__ is more verbose than joining strings but eliminates bugs arising from missing or redundant directory separators. + In particular, using `os.path.join `__ + is more verbose than joining strings but eliminates bugs arising from missing or redundant directory separators. Python: Arrays -------------- -To obtain acceptable performance for numerical computation, people use Python interfaces to optimized, compiled code. `NumPy `__ is the standard module for manipulating numerical arrays in Python. `xarray `__ sits on top of NumPy and provides a higher-level interface to its functionality; any advice about NumPy applies to it as well. +To obtain acceptable performance for numerical computation, people use Python interfaces to optimized, compiled code. +`NumPy `__ is the standard module for manipulating numerical arrays in Python. +`xarray `__ sits on top of NumPy and provides a higher-level interface to +its functionality; any advice about NumPy applies to it as well. NumPy and xarray both have extensive documentation and many tutorials, such as: @@ -86,7 +93,8 @@ NumPy and xarray both have extensive documentation and many tutorials, such as: `intermediate `__ tutorials; xarray's `overview `__ and climate and weather `examples `__; - + A `demonstration `__ of the features of xarray using earth science data; + + A `demonstration `__ of the features of xarray using + Earth science data; + The 2020 SciPy conference has open-source, interactive `tutorials `__ you can work through on your own machine or fully online using `Binder `__. @@ -98,75 +106,96 @@ NumPy and xarray both have extensive documentation and many tutorials, such as: to be used. *Why*: For loops in Python are very slow compared to C or Fortran, because Python is an interpreted language. - You can think of the NumPy functions as someone writing those for-loops for you in C, and giving you a way to call it as a Python function. 
+  You can think of the NumPy functions as someone writing those for-loops for you in C, and giving you a way to call it
+  as a Python function.
 
   It's beyond the scope of this document to cover all possible situations, since this is the main use case for NumPy. We refer to the tutorials above for instructions, and to the following blog posts that discuss this specific issue:
 
   + "`Look Ma, no for-loops `__," by Brad Solomon;
-  + "`Turn your conditional loops to Numpy vectors `__," by Tirthajyoti Sarkar;
-  + "`'Vectorized' Operations: Optimized Computations on NumPy Arrays `__", part of "`Python like you mean it `__," a free resource by Ryan Soklaski.
+  + "`Turn your conditional loops to Numpy vectors `__,"
+    by Tirthajyoti Sarkar;
+  + "`'Vectorized' Operations: Optimized Computations on NumPy Arrays `__",
+    part of "`Python like you mean it `__," a free resource by Ryan Soklaski.
 
 - **Use xarray with netCDF data**:
 
-  *Why*: This is xarray's use case. You can think of NumPy as implementing multidimensional matrices in the fully general, mathematical sense, and xarray providing the specialization to the case where the matrix contains data on a lat-lon-time-(etc.) grid.
+  *Why*: This is xarray's use case. You can think of NumPy as implementing multidimensional matrices in the fully
+  general, mathematical sense, and xarray providing the specialization to the case where the matrix contains data on a
+  lat-lon-time-(etc.) grid.
 
-  xarray lets you refer to your data with human-readable labels such as 'latitude,' rather than having to remember that that's the second dimension of your array. This bookkeeping is essential when writing code for the MDTF framework, when your POD will be run on data from models you haven't been able to test on.
+  xarray lets you refer to your data with human-readable labels such as 'latitude,' rather than having to remember
+  that that's the second dimension of your array. This bookkeeping is essential when writing code for the MDTF
+  framework, when your POD will be run on data from models you haven't been able to test on.
 
-  In particular, xarray provides seamless support for `time axes `__, with `support `__ for all CF convention calendars through the ``cftime`` library. You can, eg, subset a range of data between two dates without having to manually convert those dates to array indices.
+  In particular, xarray provides seamless support for `time axes `__,
+  with `support `__ for all CF convention calendars through
+  the ``cftime`` library. You can, e.g., subset a range of data between two dates without having to manually convert those
+  dates to array indices.
 
   See the xarray tutorials linked above for more examples of xarray's features.
 
-- **Memory use and views vs. copies**: Use scalar indexing and `slices `__ (index specifications of the form `start_index`:`stop_index`:`stride`) to get subsets of arrays whenever possible, and only use `advanced indexing `__ features (indexing arrays with other arrays) when necessary.
+- **Memory use and views vs. copies**: Use scalar indexing and
+  `slices `__
+  (index specifications of the form `start_index`:`stop_index`:`stride`) to get subsets of arrays whenever
+  possible, and only use
+  `advanced indexing `__
+  features (indexing arrays with other arrays) when necessary.
 
-  *Why*: When advanced indexing is used, NumPy will need to create a new copy of the array in memory, which can hurt performance if the array contains a large amount of data.
By contrast, slicing or basic indexing is done in-place, without allocating a new array: the NumPy documentation calls this a "view." + *Why*: When advanced indexing is used, NumPy will need to create a new copy of the array in memory, which can hurt + performance if the array contains a large amount of data. By contrast, slicing or basic indexing is done in-place, + without allocating a new array: the NumPy documentation calls this a "view." - Note that array slices are native `Python objects `__, so you can define a slice in a different place from the array you intend to use it on. Both NumPy and xarray arrays recognize slice objects. + Note that array slices are native `Python objects `__, + so you can define a slice in a different place from the array you intend to use it on. Both NumPy and xarray arrays + recognize slice objects. - This is easier to understand if you think about NumPy as a wrapper around C-like functions: array indexing in C is implemented with pointer arithmetic, since the array is implemented as a contiguous block of memory. An array slice is just a pointer to the same block of memory, but with different offsets. More complex indexing isn't guaranteed to follow a regular pattern, so NumPy needs to copy the requested data in that case. + This is easier to understand if you think about NumPy as a wrapper around C-like functions: array indexing in C is + implemented with pointer arithmetic, since the array is implemented as a contiguous block of memory. An array slice is + just a pointer to the same block of memory, but with different offsets. More complex indexing isn't guaranteed to + follow a regular pattern, so NumPy needs to copy the requested data in that case. See the following references for more information: + The NumPy `documentation `__ on indexing; - + "`Numpy Views vs Copies: Avoiding Costly Mistakes `__," by Jessica Yung; - + "`How can I tell if NumPy creates a view or a copy? `__" on stackoverflow. + + "`Numpy Views vs Copies: Avoiding Costly Mistakes `__," + by Jessica Yung; + + "`How can I tell if NumPy creates a view or a copy? `__" + on stackoverflow. -- **MaskedArrays instead of NaNs or sentinel values**: Use NumPy's `MaskedArrays `__ for data that may contain missing or invalid values, instead of setting those entries to NaN or a sentinel value. +- **MaskedArrays instead of NaNs or sentinel values**: Use NumPy's + `MaskedArrays `__ + for data that may contain missing or invalid values, instead of setting those entries to NaN or a sentinel value. - *Why*: One sometimes encounters code which sets array entries to fixed "sentinel values" (such as 1.0e+20 or `NaN `__\) to indicate missing or invalid data. This is a dangerous and error-prone practice, since it's frequently not possible to detect if the invalid entries are being used by mistake. For example, computing the variance of a timeseries with missing elements set to 1e+20 will either result in a floating-point overflow, or return zero. + *Why*: One sometimes encounters code which sets array entries to fixed "sentinel values" (such as 1.0e+20 or + `NaN `__\) to indicate missing or invalid data. This is a dangerous and + error-prone practice, since it's frequently not possible to detect if the invalid entries are being used by mistake. + For example, computing the variance of a timeseries with missing elements set to 1e+20 will either result in a + floating-point overflow, or return zero. 
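+
+  A minimal sketch of the pitfall, and of the masked alternative described below (the values are illustrative):
+
+  .. code-block:: python
+
+     import numpy as np
+
+     # Timeseries with two "missing" entries marked by a sentinel value
+     ts = np.array([288.1, 287.9, 1.0e+20, 288.4, 1.0e+20, 288.0])
+     ts.var()   # ~2e+39: meaningless, and overflows to inf for float32 data
+
+     # A MaskedArray excludes the invalid entries from the calculation automatically
+     masked = np.ma.masked_values(ts, 1.0e+20)
+     masked.var()   # variance of the four valid entries only
+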
- NumPy provides a better solution in the form of `MaskedArrays `__, which behave identically to regular arrays but carry an extra boolean mask to indicate valid/invalid status. All the NumPy mathematical functions will automatically use this mask for error propagation. For `example `__, trying to divide an array element by zero or taking the square root of a negative element will mask it off, indicating that the value is invalid: you don't need to remember to do these sorts of checks explicitly. + NumPy provides a better solution in the form of + `MaskedArrays `__, + which behave identically to regular arrays but carry an extra boolean mask to indicate valid/invalid status. + All the NumPy mathematical functions will automatically use this mask for error propagation. For + `example `__, + trying to divide an array element by zero or taking the square root of a negative element will mask it off, indicating + that the value is invalid: you don't need to remember to do these sorts of checks explicitly. Python: Plotting ---------------- -- **Use the 'Agg' backend when testing your POD**: For reproducibility, set the shell environment variable ``MPLBACKEND`` to ``Agg`` when testing your POD outside of the framework. +- **Use the 'Agg' backend when testing your POD**: For reproducibility, set the shell environment variable +``MPLBACKEND`` to ``Agg`` when testing your POD outside of the framework. - *Why*: Matplotlib can use a variety of `backends `__\: interfaces to low-level graphics libraries. Some of these are platform-dependent, or require additional libraries that the MDTF framework doesn't install. In order to achieve cross-platform portability and reproducibility, the framework specifies the ``'Agg'`` non-interactive (ie, writing files only) backend for all PODs, by setting the ``MPLBACKEND`` environment variable. + *Why*: Matplotlib can use a variety of `backends `__\: + interfaces to low-level graphics libraries. Some of these are platform-dependent, or require additional libraries + that the MDTF framework doesn't install. In order to achieve cross-platform portability and reproducibility, the + framework specifies the ``'Agg'`` non-interactive (ie, writing files only) backend for all PODs, by setting the + ``MPLBACKEND`` environment variable. - When developing your POD, you'll want an interactive backend -- for example, this is automatically set up for you in a Jupyter notebook. When it comes to testing your POD outside of the framework, however, you should be aware of this backend difference. - -- **Pass the cartopy CRS to plotting functions**: See cartopy's `documentation `__. A coordinate reference system (CRS) must be passed as a ``projection`` argument when plot axes are created. This should be passed to subsequent functions that set the plot range (``crs`` argument of ``set_extent``: avoid the use of ``set_xlim``/``set_ylim``) and to plotting functions (``transform`` argument). - -Note that this applies even to simple lat/lon plots, for which the appropriate CRS is ``PlateCarree()``. Not specifying a CRS in this case will give rise to subtle errors, e.g. when trying to set longitude ranges of [-180,180] or [0, 360] in which the bounds map to the same location. - -NCL ---- - -- **Large file support**: By default, NCL cannot read netCDF files larger than 2gb. To drop this limitation, call `setfileoption `__ with the following arguments in every script before any file operations: - - .. 
code-block::
-
-     setfileoption("nc", "Format", getenv("MDTF_NC_FORMAT"))
-
-  ``"netCDF4"`` can also be used as the requested format in the above call.
-
-
-- **Deprecated calendar functions**: Check the `function reference `__ to verify that the functions you use are not deprecated in the current version of `NCL `__. This is especially necessary for `date/calendar functions `__.
-
-  *Why*: The framework uses a current version of `NCL `__ (6.6.x), to avoid plotting bugs that were present in earlier versions. This is especially relevant for calendar functions: the ``ut_*`` set of functions have been deprecated in favor of counterparts beginning with ``cd_`` that take identical arguments (so code can be updated using find/replace). For example, use `cd_calendar `__ instead of the deprecated `ut_calendar `__.
-
-  This change is necessary because only the ``cd_*`` functions support all calendars defined in the CF conventions, which is needed to process data from some models (eg, weather or seasonal models are typically run with a Julian calendar.)
+  When developing your POD, you'll want an interactive backend -- for example, this is automatically set up for you in
+  a Jupyter notebook. When it comes to testing your POD outside of the framework, however, you should be aware of this
+  backend difference.

diff --git a/doc/sphinx/dev_guidelines.rst b/doc/sphinx/dev_guidelines.rst
index 311fdb36a..40a4e7b14 100644
--- a/doc/sphinx/dev_guidelines.rst
+++ b/doc/sphinx/dev_guidelines.rst
@@ -59,17 +59,23 @@ The environment variables most relevant for a POD's operation are:
 
 - ``POD_HOME``: Path to directory containing POD's scripts, e.g., ``diagnostics/convective_transition_diag/``.
 
-- ``OBS_DATA``: Path to directory containing POD's supporting/digested observation data, e.g., ``inputdata/obs_data/convective_transition_diag/``.
+- ``OBS_DATA``: Path to directory containing POD's supporting/digested observation data, e.g.,
+  ``inputdata/obs_data/convective_transition_diag/``.
 
-- ``DATADIR``: Path to directory containing model data files for one case/experiment, e.g., ``inputdata/model/QBOi.EXP1.AMIP.001/``.
+- ``DATADIR`` (deprecated; PODs written for MDTF-diagnostics v3.5 and earlier): Path to directory containing model data files for
+  one case/experiment, e.g., ``inputdata/model/QBOi.EXP1.AMIP.001/``.
 
-- ``WK_DIR``: Path to directory for POD to output files. Note that **this is the only directory a POD is allowed to write its output**. e.g., ``wkdir/MDTF_QBOi.EXP1.AMIP.001_1977_1981/convective_transition_diag/``.
+- ``WORK_DIR``: Path to the directory where a POD writes its output files. Note that **this is the only directory a POD is allowed
+  to write its output to**, e.g., ``wkdir/MDTF_QBOi.EXP1.AMIP.001_1977_1981/convective_transition_diag/``.
 
-  1. Output figures to ``$WK_DIR/obs/`` and ``$WK_DIR/model/`` respectively.
+  1. Output figures to ``$WORK_DIR/obs/`` and ``$WORK_DIR/model/`` respectively.
 
-  2. ``$WK_DIR/obs/PS/`` and ``$WK_DIR/model/PS/``: If a POD chooses to save vector-format figures, save them as ``EPS`` under these two directories. Files in these locations will be converted by the framework to ``PNG`` for HTML output. Caution: avoid using ``PS`` because of potential bugs in recent ``matplotlib`` and converting to PNG.
+  2. ``$WORK_DIR/obs/PS/`` and ``$WORK_DIR/model/PS/``: If a POD chooses to save vector-format figures, save them as
+     ``EPS`` under these two directories. Files in these locations will be converted by the framework to ``PNG`` for HTML
+     output.
Caution: avoid using ``PS`` because of potential bugs in recent ``matplotlib`` and converting to PNG. - 3. ``$WK_DIR/obs/netCDF/`` and ``$WK_DIR/model/netCDF/``: If a POD chooses to save digested data for later analysis/plotting, save them in these two directories in ``NetCDF``. + 3. ``$WORK_DIR/obs/netCDF/`` and ``$WORK_DIR/model/netCDF/``: If a POD chooses to save digested data for later + analysis/plotting, save them in these two directories in ``NetCDF``. Note that (1) values of ``POD_HOME``, ``OBS_DATA``, and ``WK_DIR`` change when the framework executes different PODs; (2) the ``WK_DIR`` directory and subdirectories therein are automatically created by the framework. **Each POD should output files as described here** so that the framework knows where to find what, and also for the ease of code maintenance. diff --git a/doc/sphinx/dev_walkthrough.rst b/doc/sphinx/dev_walkthrough.rst deleted file mode 100644 index f6949c97e..000000000 --- a/doc/sphinx/dev_walkthrough.rst +++ /dev/null @@ -1,170 +0,0 @@ -.. role:: console(code) - :language: console - :class: highlight - -.. _ref-dev-walkthrough: - -Walkthrough of framework operation -================================== - -In this section, we describe the actions that are taken when the framework is run, focusing on aspects that are relevant for the operation of individual PODs. The `Example Diagnostic POD `__ (short name: ``example``) is used as a concrete example here to illustrate how a POD is implemented and integrated into the framework. - -.. figure:: ../img/dev_flowchart.jpg - :align: center - :width: 100 % - -We begin with a reminder that there are 2 essential files for the operation of the framework and POD: - -- ``src/default_tests.jsonc``: configuration input for the framework. -- ``diagnostics/example/settings.jsonc``: settings file for the example POD. - -To setup for running the example POD, (1) download the necessary `supporting `__ and `NCAR-CAM5.timeslice sample data `__ and unzip them under ``inputdata/``, and (2) open ``default_tests.jsonc``, uncomment the whole ``NCAR-CAM5.timeslice`` section in ``case_list``, and comment out the other cases in the list. We also recommend setting both ``save_ps`` and ``save_nc`` to ``true``. - -Step 1: Framework invocation ----------------------------- - -The user runs the framework by executing the framework’s main driver script ``$CODE_ROOT/mdtf``, rather than executing the PODs directly. This is where the user specifies the model run to be analyzed, and chooses which PODs to run via the ``pod_list`` section in ``default_tests.jsonc``. - -- Some of the configuration options can be input through command line, see the :doc:`command line reference ` or run :console:`% $CODE_ROOT/mdtf --help`. - -At this stage, the framework also creates the directory ``$OUTPUT_DIR/`` (default: ``mdtf/wkdir/``) and all subdirectories therein for hosting the output files by the framework and PODs from each run. - -- If you've run the framework with both ``save_ps`` and ``save_nc`` in ``default_tests.jsonc`` set to ``true``, check the output directory structure and files therein. - -Note that when running, the framework will keep collecting the messages relevant to individual PODs, including (1) the status of required data and environment, and (2) texts printed out by PODs during execution, and will save them as log files under each POD's output directory. 
These ``log`` files can be viewed via the top-level results page ``index.html`` and, together with messages printed in the terminal, are useful for debugging. - -Example diagnostic -^^^^^^^^^^^^^^^^^^ - -Run the framework using the ``NCAR-CAM5.timeslice`` case. After successful execution, open the ``index.html`` under the output directory in a web browser. The ``plots`` links to the webpage produced by the example POD with figures, and ``log`` to ``example.log`` including all example-related messages collected by the framework. The messages displayed in the terminal are not identical to those in the log files, but also provide a status update on the framework-POD operation. - -Step 2: Data request --------------------- - -Each POD describes the model data it requires as input in the ``varlist`` section of its ``settings.jsonc``, with each entry in ``varlist`` corresponding to one model data file used by the POD. The framework goes through all the PODs to be run in ``pod_list`` and assembles a list of required model data from their ``varlist``. It then queries the source of the model data (``$DATADIR/``) for the presence of each requested variable with the requested characteristics (e.g., frequency, units, etc.). - -- The most important features of ``settings.jsonc`` are described in the :doc:`settings documentation ` and full detail on the :doc:`reference page `. - -- Variables are specified in ``varlist`` following `CF convention `__ wherever possible. If your POD requires derived quantities that are not part of the standard model output (e.g., column weighted averages), incorporate necessary preprocessing for computing these from standard output variables into your code. PODs are allowed to request variables outside of the CF conventions (by requiring an exact match on the variable name), but this will severely limit the POD's application. - -- Some of the requested variables may be unavailable or without the requested characteristics (e.g., frequency). You can specify a *backup plan* for this situation by designating sets of variables as *alternates* if feasible: when the framework is unable to obtain a variable that has the ``alternates`` attribute in ``varlist``, it will then (and only then) query the model data source for the variables named as alternates. - -- If no alternates are defined or the alternate variables are also unavailable, the framework will skip executing your POD, and an ``error log`` will be presented in ``index.html``. - -Once the framework has determined which PODs are able to run given the model data, it prepares the necessary environment variables, including directory paths and the requested variable names (as defined in ``data/fieldlist_$convention.jsonc``) for PODs' operation. - -- At this step, the framework also checks the PODs' observational/supporting data under ``inputdata/obs_data/``. If the directory of any of the PODs in ``pod_list`` is missing, the framework would terminate with error messages showing on the terminal. Note that the framework only checks the presence of the directory, but not the files therein. - -Example diagnostic -^^^^^^^^^^^^^^^^^^ - -The example POD uses only one model variable in its `varlist `__: surface air temperature, recorded at monthly frequency. - -- In the beginning of ``example.log``, the framework reports finding the requested model data file under ``Found files``. - -- If the framework could not locate the file, the log would instead record ``Skipping execution`` with the reason being missing data. 
- -Step 3: Runtime environment configuration ------------------------------------------ - -The framework reads the other parts of your POD’s ``settings.jsonc``, e.g., ``pod_env_vars``, and generates additional environment variables accordingly (on top of those being defined through ``default_tests.jsonc``). - -Furthermore, in the ``runtime_requirements`` section of ``settings.jsonc``, we request that you provide a list of languages and third-party libraries your POD uses. The framework will check that all these requirements are met by one of the Conda environments under ``$CONDA_ENV_DIR/``. - -- The requirements should be satisfied by one of the existing generic Conda environments (updated by you if necessary), or a new environment you created specifically for your POD. - -- If there isn't a suitable environment, the POD will be skipped. - -Note that the framework's information about the Conda environments all comes from the YAML (.yml) files under ``src/conda/`` (and their contents) by assuming that the corresponding Conda environments have been installed using (thus are consistent with) the YAML files. - -- The framework doesn't directly check files under ``$CONDA_ENV_DIR/``, where the Conda environments locate. - -- Therefore, it's imperative that you keep the Conda environments and the YAML files consistent at all time so the framework can properly function. - -Example diagnostic -^^^^^^^^^^^^^^^^^^ - -In its ``settings.jsonc``, the example POD lists its `requirements `__: Python 3, and the matplotlib, xarray and netCDF4 third-party libraries for Python. In this case, the framework assigns the POD to run in the generic `python3_base `__ environment provided by the framework. - -- In ``example.log``, under ``Env vars:`` is a comprehensive list of environment variables prepared for the POD by the framework. A great part of them are defined as in `data/fieldlist_CMIP.jsonc `__ via setting ``convention`` in ``default_tests.jsonc`` to ``CMIP``. Some of the environment variables are POD-specific as defined under `pod_env_vars `__ in the POD's ``settings.jsonc``, e.g., ``EXAMPLE_FAV_COLOR``. - -- In ``example.log``, after ``--- MDTF.py calling POD example``, the framework verifies the Conda-related paths, and makes sure that the ``runtime_requirements`` in ``settings.jsonc`` are met by the python3_base environment via checking `env_python3_base.yml `__. - -Step 4: POD execution ---------------------- - -At this point, your POD’s requirements have been met, and the environment variables are set. The framework then activates the right Conda environment, and begins execution of your POD’s code by calling the top-level driver script listed in its ``settings.jsonc``. - -- See :ref:`ref-using-env-vars` for most relevant environment variables, and how your POD is expected to output results. - -- All information passed from the framework to your POD is in the form of Unix/Linux shell environment variables; see `reference `__ for a complete list of environment variables (another good source is the log files for individual PODs). - -- For debugging, we encourage that your POD print out messages of its progress as it runs. All text written to stdout or stderr (i.e., displayed in a terminal) will be captured by the framework and added to a log file available to the users via ``index.html``. - -- Properly structure your code/scripts and include *error and exception handling* mechanisms so that simple issues will not completely shut down the POD's operation. Here are a few suggestions: - - A. 
Separate basic and advanced diagnostics. Certain computations (e.g., fitting) may need adjustment or are more likely to fail when model performance out of observed range. Organize your POD scripts so that the basic part can produce results even when the advanced part fails. - - B. If some of the observational data files are missing by accident, the POD should still be able to run analysis and produce figures for *model* data regardless. - - C. Say a POD reads in multiple variable files and computes statistics for individual variables. If some of the files are missing or corrupted, the POD should still produce results for the rest (note that the framework would skip this POD due to missing data, but PODs should have this robustness property for ease of workarounds or running outside the framework). - -- The framework contains additional exception handling so that if a POD experiences a fatal or unrecoverable error, the rest of the tasks and POD-calls by the framework can continue. The error messages, if any, will be included in the POD's log file. - -In case your POD requires derived quantities that are not part of the standard model output, and you've incorporated necessary preprocessing into your code (e.g., compute column average temperature from a vertically-resolved temperature field), one might be interested in saving these derived quantities as intermediate output for later use, and you may include this functionality in your code. - -- Here we are referring to derived quantities gridded in a similar way to model output, instead of highly-digested data that is just enough for making figures. - -- Save these as NetCDF files to the same directory containing the original model files. One file for one variable, following the filename convention spelled out in :doc:`Getting Started `. - -- You *must* provide an option so that users can choose *not* to save the files (e.g., because of write permission, disk space, or files are accessed via soft links). Include this option through ``pod_env_vars`` in your POD's ``settings.jsonc``, with "not to save" as default. You can remind users about this option by printing out messages in the terminal during runtime, or include a reminder in your POD documentation. - -Example diagnostic -^^^^^^^^^^^^^^^^^^ - -The framework activates the ``_MDTF_python3_base`` Conda environment, and calls the driver script `example-diag.py `__ listed in ``settings.jsonc``. Take a look at the script and the comments therein. - -``example-diag.py`` performs tasks roughly in the following order: - - 1) It reads the model surface air temperature data at ``input_path``, - 2) computes the model time average, - 3) saves the model time averages to ``$WK_DIR/model/netCDF/temp_means.nc`` for later use, - 4) plots model figure ``$WK_DIR/model/PS/example_model_plot.eps``, - 5) reads the digested data in time-averaged form at ``$OBS_DATA/example_tas_means.nc``, and plots the figure to ``$WK_DIR/obs/PS/example_obs_plot.eps``. - -Note that these tasks correspond to the code blocks 1) through 5) in the script. - -- When the script is called and running, it prints out messages which are saved in ``example.log``. These are helpful to determine when and how the POD execution is interrupted if there's a problem. - -- The script is organized to deal with model data first, and then to process digested observations. Thus if something goes wrong with the digested data, the script is still able to produce the html page with model figures. 
This won't happen if code block 5) is moved before 4), i.e., well-organized code is more robust and may be able to produce partial results even when it encounters problems. - -In code block 7) of ``example-diag.py``, we include an example of exception handling by trying to access a non-existent file (the final block is just to confirm that the *error* would not interrupt the script's execution because of exception-handling). - -- The last few lines of ``example.log`` demonstrate the script is able to finish execution despite an error having occurred. Exception handling makes code robust. - -.. _ref-output-cleanup: - -Step 5: Output and cleanup --------------------------- - -At this point, your POD has successfully finished running, and all remaining tasks are handled by the framework. The framework converts the postscript plots to bitmaps according to the following rule: - -- ``$WK_DIR/model/PS/filename.eps`` → ``$WK_DIR/model/filename.png`` -- ``$WK_DIR/obs/PS/filename.eps`` → ``$WK_DIR/obs/filename.png`` - -The html template for each POD is then copied to ``$WK_DIR`` by the framework. - -- In writing the template file all plots should be referenced as relative links to this location, e.g., "````". See templates from existing PODs. - -- Values of all environment variables referenced in the html template are substituted by the framework, allowing you to show the run’s ``CASENAME``, date range, etc. Text you'd like to change at runtime must be changed through environment variables (the v3 framework doesn’t allow other ways to alter the text of your POD’s output webpage at runtime). - -- If ``save_ps`` and ``save_nc`` are set to ``false``, the ``.eps`` and ``.nc`` files will be deleted. - -Finally, the framework links your POD’s html page to the top-level ``index.html``, and copies all files to the specified output location (``OUTPUT_DIR`` in ``default_tests.jsonc``; same as ``WK_DIR`` by default). - -- If ``make_variab_tar`` in ``default_tests.jsonc`` is set to ``true``, the framework will create a tar file for the output directory, in case you're working on a server, and have to move the file to a local machine before viewing it. - -Example diagnostic -^^^^^^^^^^^^^^^^^^ - -Open the html template ``diagnostics/example/example.html`` and the output ``$WK_DIR/example.html`` in a text editor, and compare. All the environment variables in the template have been substituted, e.g., ``{EXAMPLE_FAV_COLOR}`` becomes ``blue`` (defined in ``pod_env_vars`` in settings.jsonc). diff --git a/doc/sphinx/pod_dev_toc.rst b/doc/sphinx/pod_dev_toc.rst index 175f7867f..c2b1887d6 100644 --- a/doc/sphinx/pod_dev_toc.rst +++ b/doc/sphinx/pod_dev_toc.rst @@ -9,6 +9,6 @@ Developer information dev_start dev_guidelines pod_settings - dev_walkthrough dev_coding_tips dev_git_intro + dev_cheatsheet diff --git a/doc/tex_all.rst b/doc/tex_all.rst index b2a1d1394..ef265bf91 100644 --- a/doc/tex_all.rst +++ b/doc/tex_all.rst @@ -11,4 +11,4 @@ Documentation for the MDTF diagnostics framework sphinx/pod_toc sphinx/ref_toc sphinx/fmwk_toc - sphinx/tex_acknowledgements + sphinx/tools_toc