diff --git a/pipit/trace.py b/pipit/trace.py
index f400fdc0..4e320f12 100644
--- a/pipit/trace.py
+++ b/pipit/trace.py
@@ -73,6 +73,7 @@ def from_csv(filename):
         # ensure that ranks are ints
         events_dataframe = events_dataframe.astype({"Process": "int32"})
 
+        # needed when reading back the CSVs generated by pipit.util.test_generator
         # ensure that the attributes are a dict, not a string
         if "Attributes" in events_dataframe.columns:
             # use literal_eval so we're not running a security risk
diff --git a/pipit/util/fake.py b/pipit/util/fake.py
deleted file mode 100644
index 40b02ff3..00000000
--- a/pipit/util/fake.py
+++ /dev/null
@@ -1,87 +0,0 @@
-from pipit import Trace
-import numpy as np
-from faketest import gen_fake_tree, emit_tree_file, gen_forest
-import pandas as pd
-
-
-function_names = ["foo", "bar", "baz", "quux", "grault", "garply", "waldo"]
-
-
-def test_with_fake_data():
-    """
-    Generate a fake test file and ground truth file, read the test file
-    with Pipit, and check it against the ground truth. Tests inclusive and
-    exclusive metrics, and uses time_profile_test_generic.
-    """
-    num_processes = 8
-    # generate one fake tree per process, 2000 functions in the tree
-    seed_tree = gen_fake_tree(200, function_names)
-    trees = gen_forest(seed_tree, num_processes)
-    test_file = open("fake.csv", "w")
-    ground_truth = open("fake_ground.csv", "w")
-    emit_tree_file(trees, test_file, ground_truth)
-    test_file.close()
-    ground_truth.close()
-    trace = Trace.from_csv("fake.csv")
-    # gt_dataframe should hold identical values to the columns of trace.events
-    gt_dataframe = pd.read_csv("fake_ground.csv")
-    trace.calc_exc_metrics()
-    pipit_dataframe = trace.events[["time.inc", "time.exc"]]
-    # adjust for nanoseconds
-    gt_dataframe["time.inc"] *= 1e9
-    gt_dataframe["time.exc"] *= 1e9
-    # NaN values for time won't compare equal, so check ourselves
-    assert (
-        np.isclose(pipit_dataframe["time.inc"], gt_dataframe["time.inc"])
-        | (np.isnan(gt_dataframe["time.inc"]) & np.isnan(pipit_dataframe["time.inc"]))
-    ).all()
-    # likewise, check exclusive metrics
-    assert (
-        np.isclose(pipit_dataframe["time.exc"], gt_dataframe["time.exc"])
-        | (np.isnan(gt_dataframe["time.exc"]) & np.isnan(pipit_dataframe["time.exc"]))
-    ).all()
-    time_profile_test_generic(trace, num_processes)
-
-
-def time_profile_test_generic(trace, num_processes):
-    """
-    Tests universal properties of time_profile, regardless of the trace.
-    Most asserts were taken from pipit/tests/trace.py, except those specific
-    to the ping-pong trace.
-    """
-    trace.calc_exc_metrics(["Timestamp (ns)"])
-
-    time_profile = trace.time_profile(num_bins=62)
-
-    # check length
-    assert len(time_profile) == 62
-
-    # check bin sizes
-    exp_duration = (
-        trace.events["Timestamp (ns)"].max() - trace.events["Timestamp (ns)"].min()
-    )
-    exp_bin_size = exp_duration / 62
-    bin_sizes = time_profile["bin_end"] - time_profile["bin_start"]
-
-    assert np.isclose(bin_sizes, exp_bin_size).all()
-
-    # check that sum of function contributions per bin equals bin duration
-    exp_bin_total_duration = exp_bin_size * num_processes
-    time_profile.drop(columns=["bin_start", "bin_end"], inplace=True)
-
-    assert np.isclose(time_profile.sum(axis=1), exp_bin_total_duration).all()
-
-    # check for each function that sum of exc time per bin equals total exc time
-    total_exc_times = trace.events.groupby("Name")["time.exc"].sum()
-
-    for column in time_profile:
-        if column == "idle_time":
-            continue
-
-        assert np.isclose(time_profile[column].sum(), total_exc_times[column])
-
-    # check normalization
-    norm = trace.time_profile(num_bins=62, normalized=True)
-    norm.drop(columns=["bin_start", "bin_end"], inplace=True)
-
-    assert (time_profile / exp_bin_total_duration).equals(norm)
diff --git a/pipit/util/faketest.py b/pipit/util/test_generator.py
similarity index 89%
rename from pipit/util/faketest.py
rename to pipit/util/test_generator.py
index bfdb8811..be12c455 100644
--- a/pipit/util/faketest.py
+++ b/pipit/util/test_generator.py
@@ -229,9 +229,10 @@ def gen_fake_tree(num_nodes, function_names, copy_subtrees=True):
     """
    Generates a whole tree of FakeNodes by randomly appending children.
     """
-    nodes = [gen_fake_node(function_names) for n in range(num_nodes)]
-    root = nodes[0]
-    for index, node in enumerate(nodes[1:]):
+    root = gen_fake_node(function_names)
+    # continue to add nodes until we've reached the target
+    while root.total_nodes < num_nodes:
+        node = gen_fake_node(function_names)
         # choose a node that's currently in the graph to add child to
         parent = root.choose_random_node()
         # select a random point for that child to run
@@ -243,7 +244,7 @@
         else:
             subtree = random.choice(same_name)
             # larger subtrees are less likely to be copied
-            if random.random() > 0.7 / (subtree.total_nodes**0.5):
+            if random.random() > 4 / (subtree.total_nodes**0.5):
                 parent.add_child(node, run_time)
             else:
                 subtree = subtree.deepcopy()
@@ -318,12 +319,11 @@ def add_fake_mpi_events(trees, num_pairs):
         second_tree.insert_at_time(second_node, second_evt)
 
 
-def emit_tree_file(trees, test_file, ground_truth_file):
+def emit_tree_data(trees):
     """
-    Writes trees (one per process) as a CSV to the File object test_file.
-    At the same time, write ground truth function call information
-    to the File object ground_truth_file.
-    ground_truth_file will contain columns corresponding to Pipit's
+    Converts trees (one per process) into CSV text and returns it, along
+    with ground truth function call information for the same events.
+    The ground truth data contains columns corresponding to Pipit's
     time.inc, time.exc.
""" data = [] @@ -345,7 +345,26 @@ def emit_tree_file(trees, test_file, ground_truth_file): "time.exc", ], ).sort_values("Timestamp (s)") - dataframe[["Timestamp (s)", "Event Type", "Name", "Process", "Attributes"]].to_csv( - test_file, index=False - ) - dataframe[["time.inc", "time.exc"]].to_csv(ground_truth_file, index=False) + data_csv = dataframe[ + ["Timestamp (s)", "Event Type", "Name", "Process", "Attributes"] + ].to_csv(index=False) + ground_csv = dataframe[["time.inc", "time.exc"]].to_csv(index=False) + return data_csv, ground_csv + + +def generate_fake_test( + num_events, + num_processes, + function_names=["foo", "bar", "baz", "quux", "grault", "garply", "waldo"], + num_mpi_events=0, +): + """ + Top level test generation function. Generates test and ground truth datasets with a + minimum of num_events Enter/Leave events per process, of which there are + num_processes. Optionally, MPI events can be added. + """ + seed_tree = gen_fake_tree(num_events // 2, function_names) + print(num_events // 2, seed_tree.total_nodes) + forest = gen_forest(seed_tree, num_processes) + add_fake_mpi_events(forest, num_mpi_events) + return emit_tree_data(forest)