bug fix

Synced with lutianyu2001/TIR-Learner at 2024-01-12 03:42 (CST)
oushujun committed on Jan 12, 2024 · commit e5492b3 · 1 parent 8d18a77
Show file tree

Hide file tree

Showing 5 changed files with 32 additions and 16 deletions.
diff --git a/bin/TIR-Learner3.0/TIR-Learner3.0.py b/bin/TIR-Learner3.0/TIR-Learner3.0.py
@@ -63,8 +63,8 @@
 
     GRF_path = parsed_args.grf_path.replace('"', "")
     gt_path = parsed_args.gt_path.replace('"', "")
-
     additional_args = prog_const.process_additional_args(parsed_args.additional_args.split(" "))
+    print(f"INFO: Additional args: {additional_args} accepted.")
 
     # Transforming the possible relative path into absolute path
     genome_file = os.path.abspath(genome_file)

diff --git a/bin/TIR-Learner3.0/bin/CNN_predict.py b/bin/TIR-Learner3.0/bin/CNN_predict.py
@@ -13,6 +13,7 @@
 
     from sklearn.preprocessing import LabelEncoder
     # Attention: sklearn does not automatically import its subpackages
+    import tensorflow as tf
     from keras.utils import to_categorical
     from keras.models import load_model
 
@@ -40,11 +41,11 @@ def feature_encoding(df_in, flag_verbose):
     feature_int_encoder.fit(voc)
 
     df = df_in.loc[:, ["id", "seq_frag"]].copy()
-    print("  Step 2/6: Label Encoding - Transforming non-numerical labels to numerical labels")
+    print("  Step 2/8: Label Encoding - Transforming non-numerical labels to numerical labels")
     df["int_enc"] = df.swifter.progress_bar(flag_verbose).apply(
         lambda x: np.array(feature_int_encoder.transform(list(x["seq_frag"]))).reshape(-1, 1), axis=1)
     df = df.drop(columns="seq_frag")
-    print("  Step 3/6: One-Hot Encoding - Converting class vectors to binary class matrices")
+    print("  Step 3/8: One-Hot Encoding - Converting class vectors to binary class matrices")
     df["feature"] = df.swifter.progress_bar(flag_verbose).apply(
         lambda x: to_categorical(x["int_enc"], num_classes=num_classes), axis=1)
     df = df.drop(columns="int_enc")
@@ -69,7 +70,11 @@ def predict(df_in, genome_file, path_to_model):
     target_int_encoded = target_int_encoder.transform(l_class)
     d = dict(zip(target_int_encoded, l_class))
 
-    predicted_labels = model.predict(np.stack(pre_feature))
+    print("  Step 5/8: Converting feature to tensor")
+    with tf.device("/cpu:0"):
+        pre_feature_tensor = tf.convert_to_tensor(np.stack(pre_feature), np.float32)
+
+    predicted_labels = model.predict(pre_feature_tensor)
     df["percent"] = pd.Series(predicted_labels.max(axis=-1))
     y_classes = predicted_labels.argmax(axis=-1)
     df["TIR_type"] = pd.Series([d[i] for i in y_classes])
@@ -79,11 +84,11 @@ def predict(df_in, genome_file, path_to_model):
 def postprocessing(df_in, flag_verbose):
     df = df_in.loc[:, ["id", "TIR_type"]]
     df = df[df["TIR_type"] != "NonTIR"].reset_index(drop=True)
-    print("  Step 4/6: Retrieving sequence ID")
+    print("  Step 6/8: Retrieving sequence ID")
     df["seqid"] = df.swifter.progress_bar(flag_verbose).apply(lambda x: x["id"].split(":")[0], axis=1)
-    print("  Step 5/6: Retrieving sequence starting coordinate")
+    print("  Step 7/8: Retrieving sequence starting coordinate")
     df["sstart"] = df.swifter.progress_bar(flag_verbose).apply(lambda x: int(x["id"].split(":")[1]), axis=1)
-    print("  Step 6/6: Retrieving sequence ending coordinate")
+    print("  Step 8/8: Retrieving sequence ending coordinate")
     df["send"] = df.swifter.progress_bar(flag_verbose).apply(lambda x: int(x["id"].split(":")[2]), axis=1)
     df = df.loc[:, ["TIR_type", "id", "seqid", "sstart", "send"]]
     df = df.sort_values(["TIR_type", "seqid", "sstart", "send"], ignore_index=True)
@@ -92,12 +97,15 @@ def postprocessing(df_in, flag_verbose):
 
 def execute(TIRLearner_instance) -> pd.DataFrame:
     df = TIRLearner_instance["base"].copy()
-    print("  Step 1/6: Getting sequence fragment for prediction")
+
+    print("  Step 1/8: Getting sequence fragment for prediction")
     df["seq_frag"] = df.swifter.progress_bar(TIRLearner_instance.flag_verbose).apply(get_sequence_fragment, axis=1)
     df = df.drop(columns="seq")
 
     df = feature_encoding(df, TIRLearner_instance.flag_verbose)
+
+    print("  Step 4/8: CNN prediction")
     df = predict(df, TIRLearner_instance.genome_file_path,
                  os.path.join(prog_const.program_root_dir_path, prog_const.CNN_model_dir_name))
-    df = postprocessing(df, TIRLearner_instance.flag_verbose)
-    return df
+
+    return postprocessing(df, TIRLearner_instance.flag_verbose)
diff --git a/bin/TIR-Learner3.0/bin/check_TIR_TSD.py b/bin/TIR-Learner3.0/bin/check_TIR_TSD.py
@@ -195,7 +195,7 @@ def process_result(df_in, module):
     return df
 
 
-def execute(TIRLearner_instance, module: str) -> pd.DataFrame | None:
+def execute(TIRLearner_instance, module: str) -> pd.DataFrame:
     df = TIRLearner_instance["base"].copy()
     df["len"] = df["end"] - df["start"]
     df = df[df["len"] >= 450].reset_index(drop=True)

diff --git a/bin/TIR-Learner3.0/bin/process_de_novo_result.py b/bin/TIR-Learner3.0/bin/process_de_novo_result.py
@@ -45,6 +45,10 @@ def process_GRF_result(TIRLearner_instance):
     df_in = TIRLearner_instance.working_df_dict["GRF"]
     df = df_in[df_in["len"] >= 50].copy()
 
+    if df.shape[0] == 0:
+        print("NOTICE: No TIR found by GRF.")
+        return None
+
     print("  Step 1/7: Getting TIR")
     df["TIR_len"] = df.swifter.progress_bar(TIRLearner_instance.flag_verbose).apply(
         lambda x: find_digits_sum(x["id"].split(":")[-2]), axis=1)
@@ -95,6 +99,10 @@ def process_TIRvish_result(TIRLearner_instance):
     df_in = TIRLearner_instance["TIRvish"]
     df = df_in[df_in["end"] - df_in["start"] + 1 >= 50].copy()
 
+    if df.shape[0] == 0:
+        print("NOTICE: No TIR found by TIRvish.")
+        return None
+
     print("  Step 1/5: Getting TIR")
     df["TIR1_start"] = df["TIR1_start"] - df["start"]
     df.loc[df["TIR1_start"] < 0, "TIR1_start"] = 0

diff --git a/bin/TIR-Learner3.0/bin/prog_const.py b/bin/TIR-Learner3.0/bin/prog_const.py
@@ -1,10 +1,10 @@
 import os
 
 # Acceptable additional args
-FORCE_GRF_MODE = 1
-CHECKPOINT_OFF = 2
-SKIP_TIRVISH = 3
-SKIP_GRF = 4
+FORCE_GRF_MODE = "FORCE_GRF_MODE"
+CHECKPOINT_OFF = "CHECKPOINT_OFF"
+SKIP_TIRVISH = "SKIP_TIRVISH"
+SKIP_GRF = "SKIP_GRF"
 
 spliter = "-+-"
 TIR_types = ("DTA", "DTC", "DTH", "DTM", "DTT")
@@ -19,7 +19,7 @@
 program_root_dir_path = os.path.abspath(str(os.path.dirname(os.path.dirname(__file__))))
 
 ref_lib_dir_name = "RefLib"
-ref_lib_available_species = ["rice", "maize"]
+ref_lib_available_species = ("rice", "maize")
 ref_lib_file_dict = {species: [f"{species}_{TIR_type}_RefLib" for TIR_type in TIR_types]
                      for species in ref_lib_available_species}
 ref_lib_dir_path = os.path.join(program_root_dir_path, ref_lib_dir_name)