2.2.4 (#576)

MontrealCorpusTools · Mar 7, 2023 · 9a10345 · 9a10345
1 parent 415129e
commit 9a10345
Show file tree

Hide file tree

Showing 77 changed files with 2,463 additions and 1,086 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -36,45 +36,56 @@ jobs:
     name: ${{ matrix.label }}
     runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@v2
-
-      - name: Cache MFA models
-        uses: actions/cache@v3
-        env:
-          cache-name: cache-mfa-models
-        with:
-          key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('**/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-build-${{ env.cache-name }}-
-            ${{ runner.os }}-build-
-            ${{ runner.os }}-
-          path: |
-            ~/Documents/MFA
-
-      - name: Setup Mambaforge
-        uses: conda-incubator/setup-miniconda@v2
+      - uses: actions/checkout@main
         with:
-          miniforge-variant: Mambaforge
-          miniforge-version: latest
-          activate-environment: my-env
-          use-mamba: true
-      - name: Set cache date
-        run: echo "DATE=$(date +'%Y%m%d')" >> $GITHUB_ENV
-
-      - uses: actions/cache@v3
+          fetch-depth: 0
+
+      - name: Install Conda environment with Micromamba
+        uses: mamba-org/provision-with-micromamba@main
         with:
-          path: ${{ matrix.prefix }}
-          key: ${{ matrix.label }}-conda-${{ hashFiles('environment.yml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }}
-        id: cache
+          environment-file: environment.yml
+          environment-name: mfa
+          extra-specs: |
+            python=3.9
+
+      - name: Configure mfa
+        shell: bash -l {0}
+        run: python -m montreal_forced_aligner configure --disable_auto_server
+
+      - name: Initialize database
+        shell: bash -l {0}
+        run: python -m montreal_forced_aligner server init -p test
+
+      - name: Check database config
+        shell: bash -l {0}
+        run: cat ~/Documents/MFA/pg_mfa_test/postgresql.conf
 
-      - name: Update environment
-        run: mamba env update -n my-env -f environment.yml
-        if: steps.cache.outputs.cache-hit != 'true'
+      - name: Check init database log
+        shell: bash -l {0}
+        run: cat ~/Documents/MFA/pg_init_log_test.txt
+
+      - name: Check database start log
+        shell: bash -l {0}
+        run: cat ~/Documents/MFA/pg_log_test.txt
 
       - name: Run tests
+        env:
+          GITHUB_TOKEN: ${{ secrets.MFA_GITHUB_TOKEN }}
         shell: bash -l {0}
         run: pytest -x ./tests
 
+      - name: Stop database
+        shell: bash -l {0}
+        run: python -m montreal_forced_aligner server stop -p test
+
+      - name: Check init database log
+        shell: bash -l {0}
+        run: cat ~/Documents/MFA/pg_init_log_test.txt
+
+      - name: Check database start log
+        shell: bash -l {0}
+        run: cat ~/Documents/MFA/pg_log_test.txt
+
       - name: "Upload coverage to Codecov"
         uses: "codecov/codecov-action@v3"
         with:

diff --git a/ci/mfa_publish.yml b/ci/mfa_publish.yml
diff --git a/docs/source/changelog/changelog_2.2.rst b/docs/source/changelog/changelog_2.2.rst
@@ -5,6 +5,24 @@
 2.2 Changelog
 *************
 
+2.2.4
+=====
+
+- Fixes an issue where some directories in Common Voice Japanese were causing FileNotFound errors for sound files
+- Changes PostgreSQL database connections to use socket directories rather than ports
+- Added the ability to manage MFA database servers (:ref:`server`), along with the configuration flag to disable automatic starting/stopping of databases
+- Disabled starting servers for subcommands like ``configure``, ``version``, ``history`` or ``--help`` invocations
+- Added support for handling spaces when running :ref:`mfa g2p <g2p_dictionary_generating>` (though very simple as it just concatenates the outputs, and if ``--num_pronunciations`` is set to something other than 1, it is ignored)
+- Added the ability to pipe words via stdin/stdout when running :ref:`mfa g2p <g2p_dictionary_generating>`
+- Added the ability to generate pronunciations per utterance when running :ref:`mfa g2p <g2p_dictionary_generating>`
+- Added a first pass at providing estimations of alignment quality through the ``alignment_analysis.csv`` file exported with alignments, see :ref:`alignment_analysis` for more details.
+
+2.2.3
+=====
+
+- Update terminal printing to use :mod:`rich` rather than custom logic
+- Prevented the tokenizer utility from processing text files that don't have a corresponding sound file
+
 2.2.2
 =====
 

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -353,7 +353,7 @@
         # "image_dark": "logo-dark.svg",
     },
     "analytics": {
-        "google_analytics_id": "353930198",
+        "google_analytics_id": "G-5NGNLY0CWX",
     },
     # "show_nav_level": 1,
     # "navigation_depth": 4,

diff --git a/docs/source/reference/helper/helper.rst b/docs/source/reference/helper/helper.rst
@@ -3,7 +3,6 @@
    .. autosummary::
       :toctree: generated/
 
-       TerminalPrinter
        comma_join
        make_safe
        make_scp_safe
@@ -15,4 +14,3 @@
        compare_labels
        overlap_scoring
        align_phones
-       CustomFormatter
diff --git a/docs/source/reference/helper/textgrid.rst b/docs/source/reference/helper/textgrid.rst
@@ -5,4 +5,6 @@
 
        process_ctm_line
        export_textgrid
+       construct_output_tiers
+       construct_output_path
        output_textgrid_writing_errors
diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst
@@ -23,4 +23,5 @@ This should (hopefully) make it easier to extend MFA for your own purposes if yo
 
    core_index
    top_level_index
+   server/index
    helper/index
diff --git a/docs/source/reference/server/index.rst b/docs/source/reference/server/index.rst
@@ -0,0 +1,20 @@
+
+.. _server_api:
+
+Managing MFA servers
+====================
+
+Functions
+---------
+
+.. currentmodule:: montreal_forced_aligner.command_line.utils
+
+.. autosummary::
+   :toctree: generated/
+
+   configure_pg
+   initialize_server
+   check_databases
+   start_server
+   stop_server
+   delete_server
diff --git a/docs/source/user_guide/implementations/alignment_analysis.md b/docs/source/user_guide/implementations/alignment_analysis.md
@@ -0,0 +1,31 @@
+
+(alignment_analysis)=
+# Analyzing alignment quality
+
+When exporting textgrids following alignment, an additional file named `alignment_analysis.csv` will be exported.  I am still working to refine which measures are best for analyzing alignments, as it's not as straightforward as taking the overall alignment log-likelihood.
+
+## Alignment log-likelihood
+
+The first measure provided for each utterance is the alignment log-likelihood.  This represents overall the objective measure that was optimized for alignment. However, it is extremely important to note that this log-likelihood is a relative measure for the best path of alignment for this particular utterance compared to other possible alignments.
+
+A primary reason that such heavy caveats come with this metric is due to the use of speaker adaptation.  MFA does two passes of alignment. The first uses a speaker-independent model to generate an initial alignment.  This initial alignment is used to estimate per-speaker feature transforms that try to map the observed features into a common space.  Depending on the amount of data for a particular speaker, and the amount of variability they exhibit (i.e., do they yell, do they get excited, do they whisper, did they have a cold, etc etc), speaker transforms have a variable effect on improving alignment.  This variable improvement directly affects the log likelihood for a given utterance.
+
+Additionally, log-likelihood reflects differences in the training data versus alignment data.  Is the variety of the language the same? Does it have similar gender distribution? Does it have similar styles (conversational, scripted)? Does it have similar noise levels?  All of these can affect the acoustics of phones and skew how "likely" a given phone at a given point in time is.
+
+## Speech log-likelihood
+
+The overall alignment log-likelihood represents the best path including all sections of silence.  In general when we're thinking about how good an alignment is, we don't necessarily care how good of a match the silence intervals in a given utterance are to the trained silence model.  So the speech log-likelihood measure takes out all log-likelihoods from silence intervals and is the average of per-phone log-likelihoods in the utterance.
+
+## Phone duration deviation
+
+Stepping back from log-likelihoods generated by the model, we can take a look at statistics of the duration of phones in the aligned corpus.  By calculating the mean and standard deviation of durations per phone, we can z-score the individual phone's duration to see how unexpected it is relative to the corpus overall.  The phone duration deviation measure is an average of the absolute z-score of each phone's duration.
+
+We use the absolute value of the z-score because often excessive durations due to misalignment will also result in excessively small durations on other phones. The average of raw z-scores in these cases will trend towards zero, when really we want these deviations to aggregate to utterances that clearly had something go wrong.
+
+It is important to note that there are stylistic and speaker influences on duration, and statistics are gathered for the whole corpus, not normalized per speaker, so false positives are likely to pop up when sorting by this metric.  Normalizing per-speaker, however, might minimize the magnitude of duration deviation if a given speaker's utterances are all poorly aligned.  This would increase the likelihood of false negatives, and false positives are more acceptable than false negatives.
+
+
+## Ideas for the future that need a lot more thinking before I implement them
+
+1. Use the alignment best path from the speaker adapted pass with a lattice and scores generated using the speaker-independent first-pass alignment model
+    * This *might* help get around the variable optimizations that are speaker dependent
diff --git a/docs/source/user_guide/implementations/index.md b/docs/source/user_guide/implementations/index.md
@@ -11,6 +11,7 @@ This section is under construction!
 phone_groups
 phonological_rules
 lexicon_probabilities
+alignment_analysis
 alignment_evaluation
 fine_tune
 phone_models

diff --git a/docs/source/user_guide/index.rst b/docs/source/user_guide/index.rst
@@ -135,6 +135,7 @@ We acknowledge funding from Social Sciences and Humanities Research Council (SSH
    workflows/index
    corpus_creation/index
    configuration/index
+   server/index
    models/index
    implementations/index
    concepts/index

diff --git a/docs/source/user_guide/server/index.rst b/docs/source/user_guide/server/index.rst
@@ -0,0 +1,54 @@
+
+.. _server:
+
+***********
+MFA Servers
+***********
+
+MFA database servers
+====================
+
+By default, MFA starts or creates a PostgreSQL server when a command is invoked, and stops the server at the end of processing.  The goal here is to have as unobtrusive of a database server as possible, however there are use cases where you may require more control. To turn off the automatic management of PostgreSQL servers, run :code:`mfa configure --disable_auto_server`.
+
+You can have multiple PostgreSQL servers by using the :code:`--profile` flag, if necessary.  By default the "global" profile is used.  The profile flags are used in :ref:`configure_cli`, as the default options set with :code:`configure` are done on a per-profile basis.
+
+
+PostgreSQL configuration
+------------------------
+
+MFA overrides some default configuration values for its PostgreSQL servers when they are initialized.
+
+.. code-block::
+
+   log_min_duration_statement = 5000
+   enable_partitionwise_join = on
+   enable_partitionwise_aggregate = on
+   unix_socket_directories = '/path/to/current/profile/socket_directory'
+   listen_addresses = ''
+
+   maintenance_work_mem = 500MB
+   work_mem = 128MB
+   shared_buffers = 256MB
+   max_connections = 1000
+
+The goal for MFA is to run on local desktops at reasonable performance on moderate sized corpora (<3k hours).  Depending on your use case, you may need to tune the :code:`postgresql.conf` file further to suit your set up and corpus (see `PostgreSQL's documentation <https://www.postgresql.org/docs/15/runtime-config.html>`_ and `postgresqltuner utility script <https://github.com/jfcoz/postgresqltuner>`_).  Additionally, note that any port listening is turned off by default and connections are handled via socket directories.
+
+.. warning::
+
+   MFA PostgreSQL databases are meant to be on the expendable side. Though they can persist across use cases, it's not really recommended.  Use of :code:`--clean` drops all data in the database to ensure a fresh start state, as various commands perform destructive commands.  As an example :ref:`create_segments` deletes and recreates :class:`~montreal_forced_aligner.db.Utterance` objects, so the original text transcripts are absent in the database following its run.
+
+.. _server_cli:
+
+Managing MFA database servers
+=============================
+
+MFA PostgreSQL servers can be managed via the subcommands in `mfa server`, allowing you to initialize new servers, and start, stop, and delete existing servers.
+
+.. click:: montreal_forced_aligner.command_line.server:server_cli
+   :prog: mfa server
+   :nested: full
+
+API reference
+-------------
+
+- :ref:`server_api`
diff --git a/docs/source/user_guide/workflows/alignment.rst b/docs/source/user_guide/workflows/alignment.rst
@@ -11,6 +11,7 @@ This is the primary workflow of MFA, where you can use pretrained :term:`acousti
    * :ref:`alignment_evaluation` for details on how to evaluate alignments against a gold standard.
    * :ref:`fine_tune_alignments` for implementation details on how alignments are fine tuned.
    * :ref:`phone_models` for implementation details on using phone bigram models for generating alignments.
+   * :ref:`alignment_analysis` for details on the fields generated in the ``alignment_analysis.csv`` file in the output folder
 
 Command reference
 -----------------

diff --git a/docs/source/user_guide/workflows/dictionary_generating.rst b/docs/source/user_guide/workflows/dictionary_generating.rst
@@ -2,8 +2,8 @@
 
 .. _g2p_dictionary_generating:
 
-Generate a new pronunciation dictionary ``(mfa g2p)``
-=====================================================
+Generate pronunciations for words ``(mfa g2p)``
+===============================================
 
 We have trained several G2P models that are available for download (:xref:`pretrained_g2p`).
 
@@ -30,6 +30,26 @@ See :ref:`dict_generating_example` for an example of how to use G2P functionalit
 
    As of version 2.0.6, users on Windows can run this command natively without requiring :xref:`wsl`, see :ref:`installation` for more details.
 
+Piping stdin/stdout
+-------------------
+
+If you specify the input path as ``-`` instead of a file path, the g2p command will run through each line in the stdin and G2P each word with minimal processing.  Words will be lower cased and any graphemes that were not in the model's training data will be removed.
+
+If you specify the output path as ``-`` instead of a file path, the g2p command will send pronunciations as stdout rather than writing to a file.
+
+.. note::
+
+   Using stdin will also bypass database set up (though the database server will still be started and stopped, so be sure to run :code:`mfa configure --disable_auto_server` if speed is a priority).
+
+Per-utterance G2P
+-----------------
+
+The primary use case for G2P is in generating new pronunciation dictionaries, however there is limited support for generating pronunciations over an entire utterance.  If the ``OUTPUT_PATH`` specified for ``mfa g2p`` is a directory (i.e., no periods to mark a file extension), then MFA will generate a pronunciation for each word and then concatenate them together and save the resulting transcript in the output directory.
+
+.. warning::
+
+   This method is largely not recommended as the output is only the top hypothesis per word in isolation as MFA does not have access to necessary higher order information, so homographs may often have the wrong pronunciation (e.g., English present tense :ipa_inline:`read [ɹ iː d]` vs English past tense :ipa_inline:`read [ɹ ɛ d]`). Use at your own risk.
+
 Command reference
 -----------------
 

diff --git a/environment.yml b/environment.yml
@@ -31,6 +31,8 @@ dependencies:
   - setuptools_scm
   - pytest
   - pytest-mypy
+  - pytest-cov
+  - pytest-timeout
   - mock
   - coverage
   - coveralls