diff --git a/pipit/trace.py b/pipit/trace.py
index f400fdc0..4e320f12 100644
--- a/pipit/trace.py
+++ b/pipit/trace.py
@@ -73,6 +73,7 @@ def from_csv(filename):
         # ensure that ranks are ints
         events_dataframe = events_dataframe.astype({"Process": "int32"})
 
+        # needed when reading back the CSVs generated by pipit.util.test_generator
         # ensure that the attributes are a dict, not a string
         if "Attributes" in events_dataframe.columns:
             # use literal_eval so we're not running a security risk
diff --git a/pipit/util/fake.py b/pipit/util/fake.py
deleted file mode 100644
index 40b02ff3..00000000
--- a/pipit/util/fake.py
+++ /dev/null
@@ -1,87 +0,0 @@
-from pipit import Trace
-import numpy as np
-from faketest import gen_fake_tree, emit_tree_file, gen_forest
-import pandas as pd
-
-
-function_names = ["foo", "bar", "baz", "quux", "grault", "garply", "waldo"]
-
-
-def test_with_fake_data():
-    """
-    Generate a fake test file and ground truth file, read the test file
-    with Pipit, and check it against the ground truth. Tests inclusive and
-    exclusive metrics, and uses time_profile_test_generic.
-    """
-    num_processes = 8
-    # generate one fake tree per process, 2000 functions in the tree
-    seed_tree = gen_fake_tree(200, function_names)
-    trees = gen_forest(seed_tree, num_processes)
-    test_file = open("fake.csv", "w")
-    ground_truth = open("fake_ground.csv", "w")
-    emit_tree_file(trees, test_file, ground_truth)
-    test_file.close()
-    ground_truth.close()
-    trace = Trace.from_csv("fake.csv")
-    # gt_dataframe should hold identical values to the columns of trace.events
-    gt_dataframe = pd.read_csv("fake_ground.csv")
-    trace.calc_exc_metrics()
-    pipit_dataframe = trace.events[["time.inc", "time.exc"]]
-    # adjust for nanoseconds
-    gt_dataframe["time.inc"] *= 1e9
-    gt_dataframe["time.exc"] *= 1e9
-    # NaN values for time won't compare equal, so check ourselves
-    assert (
-        np.isclose(pipit_dataframe["time.inc"], gt_dataframe["time.inc"])
-        | (np.isnan(gt_dataframe["time.inc"]) & np.isnan(pipit_dataframe["time.inc"]))
-    ).all()
-    # likewise, check exclusive metrics
-    assert (
-        np.isclose(pipit_dataframe["time.exc"], gt_dataframe["time.exc"])
-        | (np.isnan(gt_dataframe["time.exc"]) & np.isnan(pipit_dataframe["time.exc"]))
-    ).all()
-    time_profile_test_generic(trace, num_processes)
-
-
-def time_profile_test_generic(trace, num_processes):
-    """
-    Tests universal properties of time_profile, regardless of the trace.
-    Most asserts were taken from pipit/tests/trace.py, except those specific
-    to the ping-pong trace.
-    """
-    trace.calc_exc_metrics(["Timestamp (ns)"])
-
-    time_profile = trace.time_profile(num_bins=62)
-
-    # check length
-    assert len(time_profile) == 62
-
-    # check bin sizes
-    exp_duration = (
-        trace.events["Timestamp (ns)"].max() - trace.events["Timestamp (ns)"].min()
-    )
-    exp_bin_size = exp_duration / 62
-    bin_sizes = time_profile["bin_end"] - time_profile["bin_start"]
-
-    assert np.isclose(bin_sizes, exp_bin_size).all()
-
-    # check that sum of function contributions per bin equals bin duration
-    exp_bin_total_duration = exp_bin_size * num_processes
-    time_profile.drop(columns=["bin_start", "bin_end"], inplace=True)
-
-    assert np.isclose(time_profile.sum(axis=1), exp_bin_total_duration).all()
-
-    # check for each function that sum of exc time per bin equals total exc time
-    total_exc_times = trace.events.groupby("Name")["time.exc"].sum()
-
-    for column in time_profile:
-        if column == "idle_time":
-            continue
-
-        assert np.isclose(time_profile[column].sum(), total_exc_times[column])
-
-    # check normalization
-    norm = trace.time_profile(num_bins=62, normalized=True)
-    norm.drop(columns=["bin_start", "bin_end"], inplace=True)
-
-    assert (time_profile / exp_bin_total_duration).equals(norm)
diff --git a/pipit/util/faketest.py b/pipit/util/test_generator.py
similarity index 89%
rename from pipit/util/faketest.py
rename to pipit/util/test_generator.py
index bfdb8811..be12c455 100644
--- a/pipit/util/faketest.py
+++ b/pipit/util/test_generator.py
@@ -229,9 +229,10 @@ def gen_fake_tree(num_nodes, function_names, copy_subtrees=True):
     """
    Generates a whole tree of FakeNodes by randomly appending children.
     """
-    nodes = [gen_fake_node(function_names) for n in range(num_nodes)]
-    root = nodes[0]
-    for index, node in enumerate(nodes[1:]):
+    root = gen_fake_node(function_names)
+    # continue to add nodes until we've reached the target
+    while root.total_nodes < num_nodes:
+        node = gen_fake_node(function_names)
         # choose a node that's currently in the graph to add child to
         parent = root.choose_random_node()
         # select a random point for that child to run
@@ -243,7 +244,7 @@
         else:
             subtree = random.choice(same_name)
             # larger subtrees are less likely to be copied
-            if random.random() > 0.7 / (subtree.total_nodes**0.5):
+            if random.random() > 4 / (subtree.total_nodes**0.5):
                 parent.add_child(node, run_time)
             else:
                 subtree = subtree.deepcopy()
@@ -318,12 +319,11 @@ def add_fake_mpi_events(trees, num_pairs):
         second_tree.insert_at_time(second_node, second_evt)
 
 
-def emit_tree_file(trees, test_file, ground_truth_file):
+def emit_tree_data(trees):
     """
-    Writes trees (one per process) as a CSV to the File object test_file.
-    At the same time, write ground truth function call information
-    to the File object ground_truth_file.
-    ground_truth_file will contain columns corresponding to Pipit's
+    Converts trees (one per process) into CSV text and returns it, along
+    with ground truth function call information for the same events.
+    The ground truth data contains columns corresponding to Pipit's
     time.inc, time.exc.
""" data = [] @@ -345,7 +345,26 @@ def emit_tree_file(trees, test_file, ground_truth_file): "time.exc", ], ).sort_values("Timestamp (s)") - dataframe[["Timestamp (s)", "Event Type", "Name", "Process", "Attributes"]].to_csv( - test_file, index=False - ) - dataframe[["time.inc", "time.exc"]].to_csv(ground_truth_file, index=False) + data_csv = dataframe[ + ["Timestamp (s)", "Event Type", "Name", "Process", "Attributes"] + ].to_csv(index=False) + ground_csv = dataframe[["time.inc", "time.exc"]].to_csv(index=False) + return data_csv, ground_csv + + +def generate_fake_test( + num_events, + num_processes, + function_names=["foo", "bar", "baz", "quux", "grault", "garply", "waldo"], + num_mpi_events=0, +): + """ + Top level test generation function. Generates test and ground truth datasets with a + minimum of num_events Enter/Leave events per process, of which there are + num_processes. Optionally, MPI events can be added. + """ + seed_tree = gen_fake_tree(num_events // 2, function_names) + print(num_events // 2, seed_tree.total_nodes) + forest = gen_forest(seed_tree, num_processes) + add_fake_mpi_events(forest, num_mpi_events) + return emit_tree_data(forest)