Skip to content

Commit

Permalink
3.1.0 (#812)
Browse files Browse the repository at this point in the history
  • Loading branch information
mmcauliffe authored Jun 2, 2024
1 parent 4499f28 commit 76c46a1
Show file tree
Hide file tree
Showing 30 changed files with 1,554 additions and 332 deletions.
18 changes: 18 additions & 0 deletions docs/source/changelog/changelog_3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,24 @@
3.0 Changelog
*************

3.1.0
-----

- Fixed a bug where cutoffs were not properly modelled
- Added an additional filter during subset creation so that utterances containing cutoffs are excluded from smaller subsets
- Added the ability to specify custom HMM topologies for phones
- Fixed issues caused by validators not cleaning up temporary files and databases
- Added support for default and nonnative dictionaries generated from other dictionaries
- Restricted initial training rounds to exclude default and nonnative dictionaries
- Changed clustering of phones to not mix silence and non-silence phones
- Optimized TextGrid export
- Added better memory management for collecting alignments

3.0.8
-----

- Fixed a compatibility issue with models trained under version 1.0 and earlier

3.0.7
-----

Expand Down
5 changes: 3 additions & 2 deletions montreal_forced_aligner/abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def db_engine(self) -> sqlalchemy.engine.Engine:
self._db_engine = self.construct_engine()
return self._db_engine

def get_next_primary_key(self, database_table: MfaSqlBase):
def get_next_primary_key(self, database_table):
with self.session() as session:
pk = session.query(sqlalchemy.func.max(database_table.id)).scalar()
if not pk:
Expand Down Expand Up @@ -634,7 +634,8 @@ def parse_args(
unknown_dict[name] = val
for name, param_type in param_types.items():
if (name.endswith("_directory") and name != "audio_directory") or (
name.endswith("_path") and name not in {"rules_path", "phone_groups_path"}
name.endswith("_path")
and name not in {"rules_path", "phone_groups_path", "topology_path"}
):
continue
if args is not None and name in args and args[name] is not None:
Expand Down
4 changes: 4 additions & 0 deletions montreal_forced_aligner/acoustic_modeling/lda.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ def _run(self):
]
for dict_id in job.dictionary_ids:
ali_path = job.construct_path(self.working_directory, "ali", "ark", dict_id)
if not ali_path.exists():
continue
lda_logger.debug(f"Processing {ali_path}")
feat_path = job.construct_path(
job.corpus.current_subset_directory, "feats", "scp", dictionary_id=dict_id
Expand Down Expand Up @@ -164,6 +166,8 @@ def _run(self) -> typing.Generator[int]:
]
for dict_id in job.dictionary_ids:
ali_path = job.construct_path(self.working_directory, "ali", "ark", dict_id)
if not ali_path.exists():
continue
lda_logger.debug(f"Processing {ali_path}")
feature_archive = job.construct_feature_archive(self.working_directory, dict_id)
alignment_archive = AlignmentArchive(ali_path)
Expand Down
20 changes: 14 additions & 6 deletions montreal_forced_aligner/acoustic_modeling/monophone.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def _run(self):
num_error = 0
tot_like = 0.0
tot_t = 0.0
for d in job.dictionaries:
for d in job.training_dictionaries:
dict_id = d.id
train_logger.debug(f"Aligning for dictionary {d.name} ({d.id})")
train_logger.debug(f"Aligning with model: {self.model_path}")
Expand Down Expand Up @@ -302,14 +302,22 @@ def _trainer_initialization(self) -> None:
tree_path = self.working_directory.joinpath("tree")
init_log_path = self.working_log_directory.joinpath("init.log")
job = self.jobs[0]
dict_id = job.dictionary_ids[0]
feature_archive = job.construct_feature_archive(self.working_directory, dict_id)
feats = []
with kalpy_logger("kalpy.train", init_log_path) as train_logger:
for i, (_, mat) in enumerate(feature_archive):
if i > 10:
dict_index = 0
while len(feats) < 10:
try:
dict_id = job.dictionary_ids[dict_index]
except IndexError:
break
feats.append(mat)
feature_archive = job.construct_feature_archive(self.working_directory, dict_id)
for i, (_, mat) in enumerate(feature_archive):
if i > 10:
break
feats.append(mat)
dict_index += 1
if not feats:
raise Exception("Could not initialize monophone model due to lack of features")
shared_phones = self.worker.shared_phones_set_symbols()
topo = read_topology(self.worker.topo_path)
gmm_init_mono(topo, feats, shared_phones, str(self.model_path), str(tree_path))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -312,8 +312,12 @@ def setup(self):
previous_directory = self.previous_aligner.working_directory
for j in self.jobs:
for p in j.construct_path_dictionary(previous_directory, "ali", "ark").values():
if not p.exists():
continue
shutil.copy(p, wf.working_directory.joinpath(p.name))
for p in j.construct_path_dictionary(previous_directory, "words", "ark").values():
if not p.exists():
continue
shutil.copy(p, wf.working_directory.joinpath(p.name))
for f in ["final.mdl", "final.alimdl", "lda.mat", "tree"]:
p = previous_directory.joinpath(f)
Expand Down Expand Up @@ -384,6 +388,12 @@ def train_pronunciation_probabilities(self) -> None:
)
with mfa_open(silence_info_path, "r") as f:
data = json.load(f)
for k, v in data.items():
if v is None:
if "correction" in k:
data[k] = 1.0
else:
data[k] = 0.5
if self.silence_probabilities:
d.silence_probability = data["silence_probability"]
d.initial_silence_probability = data["initial_silence_probability"]
Expand Down
4 changes: 3 additions & 1 deletion montreal_forced_aligner/acoustic_modeling/sat.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,15 @@ def _run(self):
.filter(Job.id == self.job_name)
.first()
)
for d in job.dictionaries:
for d in job.training_dictionaries:
train_logger.debug(f"Accumulating stats for dictionary {d.name} ({d.id})")
train_logger.debug(f"Accumulating stats for model: {self.model_path}")
dict_id = d.id
accumulator = TwoFeatsStatsAccumulator(self.model_path)

ali_path = job.construct_path(self.working_directory, "ali", "ark", dict_id)
if not ali_path.exists():
continue
fmllr_path = job.construct_path(
job.corpus.current_subset_directory, "trans", "scp", dict_id
)
Expand Down
28 changes: 18 additions & 10 deletions montreal_forced_aligner/acoustic_modeling/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def _run(self) -> typing.Generator[typing.Tuple[int, str]]:
transition_model, acoustic_model = read_gmm_model(self.model_path)
for dict_id in job.dictionary_ids:
ali_path = job.construct_path(self.working_directory, "ali", "ark", dict_id)
if not ali_path.exists():
continue
transition_accs = DoubleVector(transition_model.NumTransitionIds() + 1)
alignment_archive = AlignmentArchive(ali_path)
for alignment in alignment_archive:
Expand Down Expand Up @@ -523,6 +525,8 @@ def quality_check_subset(self):
self.working_directory, "temp_ali", "ark"
)
for dict_id, ali_path in ali_paths.items():
if not ali_path.exists():
continue
new_path = temp_ali_paths[dict_id]
write_specifier = generate_write_specifier(new_path)
writer = Int32VectorWriter(write_specifier)
Expand Down Expand Up @@ -577,15 +581,20 @@ def train(self) -> None:
self.current_acoustic_model = AcousticModel(
previous.exported_model_path, self.working_directory
)
self.align()
with self.session() as session:
session.query(WordInterval).delete()
session.query(PhoneInterval).delete()
session.commit()
self.collect_alignments()
self.analyze_alignments()
if self.current_subset != 0:
self.quality_check_subset()
if (
not self.current_workflow.done
or not self.current_workflow.working_directory.exists()
):
logger.debug(f"Skipping {self.current_aligner.identifier} alignments")
self.align()
with self.session() as session:
session.query(WordInterval).delete()
session.query(PhoneInterval).delete()
session.commit()
self.collect_alignments()
self.analyze_alignments()
if self.current_subset != 0:
self.quality_check_subset()

self.set_current_workflow(trainer.identifier)
if trainer.identifier.startswith("pronunciation_probabilities"):
Expand Down Expand Up @@ -721,7 +730,6 @@ def align_options(self) -> MetaDict:
options = self.current_aligner.align_options
else:
options = super().align_options
options["boost_silence"] = max(1.25, options["boost_silence"])
return options

def align(self) -> None:
Expand Down
33 changes: 29 additions & 4 deletions montreal_forced_aligner/acoustic_modeling/triphone.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,12 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]:
train_logger.debug(f"Previous model path: {self.align_model_path}")
train_logger.debug(f"Model path: {self.model_path}")
train_logger.debug(f"Tree path: {self.tree_path}")
for d in job.dictionaries:
for d in job.training_dictionaries:
dict_id = d.id
train_logger.debug(f"Converting alignments for {d.name}")
ali_path = self.ali_paths[dict_id]
if not ali_path.exists():
continue
new_ali_path = self.new_ali_paths[dict_id]
train_logger.debug(f"Old alignments: {ali_path}")
train_logger.debug(f"New alignments: {new_ali_path}")
Expand Down Expand Up @@ -159,12 +161,14 @@ def _run(self):
.filter(Phone.phone_type.in_([PhoneType.silence, PhoneType.oov]))
.order_by(Phone.mapping_id)
]
for d in job.dictionaries:
for d in job.training_dictionaries:
train_logger.debug(f"Accumulating stats for dictionary {d.name} ({d.id})")
train_logger.debug(f"Accumulating stats for model: {self.model_path}")
dict_id = d.id
feature_archive = job.construct_feature_archive(self.working_directory, dict_id)
ali_path = job.construct_path(self.working_directory, "ali", "ark", dict_id)
if not ali_path.exists():
continue
train_logger.debug("Feature Archive information:")
train_logger.debug(f"File: {feature_archive.file_name}")
train_logger.debug(f"CMVN: {feature_archive.cmvn_read_specifier}")
Expand Down Expand Up @@ -397,8 +401,29 @@ def _setup_tree(self, init_from_previous=False, initial_mix_up=True) -> None:
train_logger.debug(f"Phone sets: {phone_sets}")
questions = automatically_obtain_questions(tree_stats, phone_sets, [1], 1)
train_logger.debug(f"Automatically obtained {len(questions)} questions")
for v in self.worker.extra_questions_mapping.values():
questions.append(sorted([self.phone_mapping[x] for x in v]))
train_logger.debug("Automatic questions:")
for q_set in questions:
train_logger.debug(", ".join([self.reversed_phone_mapping[x] for x in q_set]))

# Remove questions containing silence and other phones
train_logger.debug("Filtering the following sets for containing silence phone:")
silence_phone_id = self.phone_mapping[self.optional_silence_phone]
silence_sets = [
x for x in questions if silence_phone_id in x and x != [silence_phone_id]
]
for q_set in silence_sets:
train_logger.debug(", ".join([self.reversed_phone_mapping[x] for x in q_set]))
questions = [
x for x in questions if silence_phone_id not in x or x == [silence_phone_id]
]

extra_questions = self.worker.extra_questions_mapping
if extra_questions:
train_logger.debug(f"Adding {len(extra_questions)} questions")
train_logger.debug("Extra questions:")
for v in self.worker.extra_questions_mapping.values():
questions.append(sorted([self.phone_mapping[x] for x in v]))
train_logger.debug(", ".join(v))
train_logger.debug(f"{len(questions)} total questions")

build_tree(
Expand Down
Loading

0 comments on commit 76c46a1

Please sign in to comment.