Skip to content

Commit

Permalink
bug fix
Browse files: browse the repository at this point in the history
Synced with lutianyu2001/TIR-Learner at 2024-01-12 03:42 (CST)
  • Loading branch information
lutianyu2001 committed Jan 12, 2024
1 parent 8d18a77 commit e5492b3
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 16 deletions.
2 changes: 1 addition & 1 deletion bin/TIR-Learner3.0/TIR-Learner3.0.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@

GRF_path = parsed_args.grf_path.replace('"', "")
gt_path = parsed_args.gt_path.replace('"', "")

additional_args = prog_const.process_additional_args(parsed_args.additional_args.split(" "))
print(f"INFO: Additional args: {additional_args} accepted.")

# Convert any relative path into an absolute path
genome_file = os.path.abspath(genome_file)
Expand Down
26 changes: 17 additions & 9 deletions bin/TIR-Learner3.0/bin/CNN_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from sklearn.preprocessing import LabelEncoder
# Note: sklearn does not import its subpackages automatically; import each one explicitly
import tensorflow as tf
from keras.utils import to_categorical
from keras.models import load_model

Expand Down Expand Up @@ -40,11 +41,11 @@ def feature_encoding(df_in, flag_verbose):
feature_int_encoder.fit(voc)

df = df_in.loc[:, ["id", "seq_frag"]].copy()
print(" Step 2/6: Label Encoding - Transforming non-numerical labels to numerical labels")
print(" Step 2/8: Label Encoding - Transforming non-numerical labels to numerical labels")
df["int_enc"] = df.swifter.progress_bar(flag_verbose).apply(
lambda x: np.array(feature_int_encoder.transform(list(x["seq_frag"]))).reshape(-1, 1), axis=1)
df = df.drop(columns="seq_frag")
print(" Step 3/6: One-Hot Encoding - Converting class vectors to binary class matrices")
print(" Step 3/8: One-Hot Encoding - Converting class vectors to binary class matrices")
df["feature"] = df.swifter.progress_bar(flag_verbose).apply(
lambda x: to_categorical(x["int_enc"], num_classes=num_classes), axis=1)
df = df.drop(columns="int_enc")
Expand All @@ -69,7 +70,11 @@ def predict(df_in, genome_file, path_to_model):
target_int_encoded = target_int_encoder.transform(l_class)
d = dict(zip(target_int_encoded, l_class))

predicted_labels = model.predict(np.stack(pre_feature))
print(" Step 5/8: Converting feature to tensor")
with tf.device("/cpu:0"):
pre_feature_tensor = tf.convert_to_tensor(np.stack(pre_feature), np.float32)

predicted_labels = model.predict(pre_feature_tensor)
df["percent"] = pd.Series(predicted_labels.max(axis=-1))
y_classes = predicted_labels.argmax(axis=-1)
df["TIR_type"] = pd.Series([d[i] for i in y_classes])
Expand All @@ -79,11 +84,11 @@ def predict(df_in, genome_file, path_to_model):
def postprocessing(df_in, flag_verbose):
df = df_in.loc[:, ["id", "TIR_type"]]
df = df[df["TIR_type"] != "NonTIR"].reset_index(drop=True)
print(" Step 4/6: Retrieving sequence ID")
print(" Step 6/8: Retrieving sequence ID")
df["seqid"] = df.swifter.progress_bar(flag_verbose).apply(lambda x: x["id"].split(":")[0], axis=1)
print(" Step 5/6: Retrieving sequence starting coordinate")
print(" Step 7/8: Retrieving sequence starting coordinate")
df["sstart"] = df.swifter.progress_bar(flag_verbose).apply(lambda x: int(x["id"].split(":")[1]), axis=1)
print(" Step 6/6: Retrieving sequence ending coordinate")
print(" Step 8/8: Retrieving sequence ending coordinate")
df["send"] = df.swifter.progress_bar(flag_verbose).apply(lambda x: int(x["id"].split(":")[2]), axis=1)
df = df.loc[:, ["TIR_type", "id", "seqid", "sstart", "send"]]
df = df.sort_values(["TIR_type", "seqid", "sstart", "send"], ignore_index=True)
Expand All @@ -92,12 +97,15 @@ def postprocessing(df_in, flag_verbose):

def execute(TIRLearner_instance) -> pd.DataFrame:
df = TIRLearner_instance["base"].copy()
print(" Step 1/6: Getting sequence fragment for prediction")

print(" Step 1/8: Getting sequence fragment for prediction")
df["seq_frag"] = df.swifter.progress_bar(TIRLearner_instance.flag_verbose).apply(get_sequence_fragment, axis=1)
df = df.drop(columns="seq")

df = feature_encoding(df, TIRLearner_instance.flag_verbose)

print(" Step 4/8: CNN prediction")
df = predict(df, TIRLearner_instance.genome_file_path,
os.path.join(prog_const.program_root_dir_path, prog_const.CNN_model_dir_name))
df = postprocessing(df, TIRLearner_instance.flag_verbose)
return df

return postprocessing(df, TIRLearner_instance.flag_verbose)
2 changes: 1 addition & 1 deletion bin/TIR-Learner3.0/bin/check_TIR_TSD.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def process_result(df_in, module):
return df


def execute(TIRLearner_instance, module: str) -> pd.DataFrame | None:
def execute(TIRLearner_instance, module: str) -> pd.DataFrame:
df = TIRLearner_instance["base"].copy()
df["len"] = df["end"] - df["start"]
df = df[df["len"] >= 450].reset_index(drop=True)
Expand Down
8 changes: 8 additions & 0 deletions bin/TIR-Learner3.0/bin/process_de_novo_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ def process_GRF_result(TIRLearner_instance):
df_in = TIRLearner_instance.working_df_dict["GRF"]
df = df_in[df_in["len"] >= 50].copy()

if df.shape[0] == 0:
print("NOTICE: No TIR found by GRF.")
return None

print(" Step 1/7: Getting TIR")
df["TIR_len"] = df.swifter.progress_bar(TIRLearner_instance.flag_verbose).apply(
lambda x: find_digits_sum(x["id"].split(":")[-2]), axis=1)
Expand Down Expand Up @@ -95,6 +99,10 @@ def process_TIRvish_result(TIRLearner_instance):
df_in = TIRLearner_instance["TIRvish"]
df = df_in[df_in["end"] - df_in["start"] + 1 >= 50].copy()

if df.shape[0] == 0:
print("NOTICE: No TIR found by TIRvish.")
return None

print(" Step 1/5: Getting TIR")
df["TIR1_start"] = df["TIR1_start"] - df["start"]
df.loc[df["TIR1_start"] < 0, "TIR1_start"] = 0
Expand Down
10 changes: 5 additions & 5 deletions bin/TIR-Learner3.0/bin/prog_const.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import os

# Acceptable additional args
FORCE_GRF_MODE = 1
CHECKPOINT_OFF = 2
SKIP_TIRVISH = 3
SKIP_GRF = 4
FORCE_GRF_MODE = "FORCE_GRF_MODE"
CHECKPOINT_OFF = "CHECKPOINT_OFF"
SKIP_TIRVISH = "SKIP_TIRVISH"
SKIP_GRF = "SKIP_GRF"

spliter = "-+-"
TIR_types = ("DTA", "DTC", "DTH", "DTM", "DTT")
Expand All @@ -19,7 +19,7 @@
program_root_dir_path = os.path.abspath(str(os.path.dirname(os.path.dirname(__file__))))

ref_lib_dir_name = "RefLib"
ref_lib_available_species = ["rice", "maize"]
ref_lib_available_species = ("rice", "maize")
ref_lib_file_dict = {species: [f"{species}_{TIR_type}_RefLib" for TIR_type in TIR_types]
for species in ref_lib_available_species}
ref_lib_dir_path = os.path.join(program_root_dir_path, ref_lib_dir_name)
Expand Down

0 comments on commit e5492b3

Please sign in to comment.