Handle performance reports
vmilosevic committed Dec 4, 2024
1 parent f18e9a2 commit d541806
Showing 10 changed files with 619 additions and 5 deletions.
4 changes: 3 additions & 1 deletion .github/actions/collect_data/action.yml
@@ -52,7 +52,8 @@ runs:
run: |
python3 ${GITHUB_ACTION_PATH}/src/generate_data.py --run_id ${{ inputs.run_id }}
# Workaround: Copy file to avoid GH upload filename limitations
cp pipeline_${{ inputs.run_id }}*.json pipelinecopy_${{ inputs.run_id }}.json
cp pipeline_*.json pipelinecopy_${{ inputs.run_id }}.json
cp benchmark_*.json benchmarkcopy_${{ inputs.run_id }}.json
- name: Create key file
if: ${{ inputs.ssh-private-key != '' }}
@@ -74,6 +75,7 @@ runs:
path: |
if-no-files-found: warn
path: |
benchmarkcopy_${{ inputs.run_id }}.json
pipelinecopy_${{ inputs.run_id }}.json
generated/cicd/${{ inputs.run_id }}/workflow.json
generated/cicd/${{ inputs.run_id }}/workflow_jobs.json
2 changes: 0 additions & 2 deletions .github/actions/collect_data/sftp-csv.txt

This file was deleted.

1 change: 1 addition & 0 deletions .github/actions/collect_data/sftp-json.txt
@@ -1,2 +1,3 @@
put -r pipeline_*.json
put -r benchmark_*.json
ls -hal
117 changes: 117 additions & 0 deletions .github/actions/collect_data/src/benchmark.py
@@ -0,0 +1,117 @@
# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
#
# SPDX-License-Identifier: Apache-2.0
import os
import pathlib
import json
from loguru import logger
from pydantic_models import BenchmarkMeasurement, CompleteBenchmarkRun


def create_json_from_report(pipeline, workflow_outputs_dir):
results = []
reports = _get_model_reports(workflow_outputs_dir, pipeline.github_pipeline_id)

for job_id, report_paths in reports.items():
for report_path in report_paths:
with open(report_path) as report_file:
report_data = json.load(report_file)
results.append(_map_benchmark_data(pipeline, job_id, report_data))
logger.info(f"Created benchmark data for job: {job_id} model: {report_data['model']}")
return results


def get_benchmark_filename(report):
ts = report.run_start_ts.strftime("%Y-%m-%dT%H:%M:%S%z")
return f"benchmark_{report.github_job_id}_{ts}.json"


def _get_model_reports(workflow_outputs_dir, workflow_run_id: int):
"""
This function searches for perf reports in the artifacts directory
and returns a mapping of job IDs to the paths of the perf reports.
We expect the report filename to be in the format `<report_name>_<job_id>.json`.
"""
job_paths_map = {}
artifacts_dir = f"{workflow_outputs_dir}/{workflow_run_id}/artifacts"

logger.info(f"Searching for perf reports in {artifacts_dir}")

for root, _, files in os.walk(artifacts_dir):
for file in files:
if file.endswith(".json"):
logger.debug(f"Found perf report {file}")
file_path = pathlib.Path(root) / file
filename = file_path.name
try:
job_id = int(filename.split(".")[-2].split("_")[-1])
except ValueError:
logger.warning(f"Could not extract job ID from {filename}")
continue
report_paths = job_paths_map.get(job_id, [])
report_paths.append(file_path)
job_paths_map[job_id] = report_paths
return job_paths_map


def _map_benchmark_data(pipeline, job_id, report_data):

# get job information from pipeline
job = next(job for job in pipeline.jobs if job.github_job_id == job_id)

return CompleteBenchmarkRun(
run_start_ts=pipeline.pipeline_start_ts,
run_end_ts=pipeline.pipeline_end_ts,
run_type="",
git_repo_name=None,
git_commit_hash=pipeline.git_commit_hash,
git_commit_ts=None,
git_branch_name=pipeline.git_branch_name,
github_pipeline_id=pipeline.github_pipeline_id,
github_pipeline_link=pipeline.github_pipeline_link,
github_job_id=job.github_job_id,
user_name=pipeline.git_author,
docker_image=job.docker_image,
device_hostname=job.host_name,
device_ip=None,
device_info=None,
ml_model_name=report_data["model"],
ml_model_type=None,
num_layers=None,
batch_size=report_data.get("batch_size", None),
config_params={},
precision=None,
dataset_name=None,
profiler_name=None,
input_sequence_length=None,
output_sequence_length=None,
image_dimension=None,
perf_analysis=None,
training=report_data.get("training", False),
measurements=[
BenchmarkMeasurement(
step_start_ts=job.job_start_ts,
step_end_ts=job.job_end_ts,
iteration=0,
step_name="",
step_warm_up_num_iterations=None,
name="samples_per_sec",
value=report_data["samples_per_sec"],
target=None,
device_power=None,
device_temperature=None,
),
BenchmarkMeasurement(
step_start_ts=job.job_start_ts,
step_end_ts=job.job_end_ts,
iteration=0,
step_name="",
step_warm_up_num_iterations=None,
name="total_time",
value=report_data["total_time"],
target=None,
device_power=None,
device_temperature=None,
),
],
)
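
For reference, here is a minimal sketch of the job-ID extraction that _get_model_reports performs on each discovered report. The filename used below is the one referenced by the sample fixture added in this commit; any other name would be purely illustrative.

# Sketch: how _get_model_reports derives the job ID from a report filename.
# The filename is taken from the sample report fixture in this commit.
filename = "forge-benchmark-e2e-mnist_33854708624.json"
job_id = int(filename.split(".")[-2].split("_")[-1])
print(job_id)  # 33854708624

If the trailing token before ".json" is not an integer, the function logs a warning and skips the file, so reports that do not follow this naming scheme are simply ignored.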
23 changes: 22 additions & 1 deletion .github/actions/collect_data/src/generate_data.py
@@ -2,10 +2,12 @@
#
# SPDX-License-Identifier: Apache-2.0

import os
import argparse
from loguru import logger
from utils import get_github_runner_environment
from cicd import create_cicd_json_for_data_analysis, get_cicd_json_filename
from benchmark import create_json_from_report, get_benchmark_filename


def create_pipeline_json(workflow_filename: str, jobs_filename: str, workflow_outputs_dir):
@@ -27,6 +29,20 @@ def create_pipeline_json(workflow_filename: str, jobs_filename: str, workflow_ou
return pipeline, report_filename


def create_benchmark_jsons(pipeline, workflow_outputs_dir):
results = []
reports = create_json_from_report(pipeline, workflow_outputs_dir)
for report in reports:
report_filename = get_benchmark_filename(
report
) # f"benchmark_{report.github_job_id}_{report.run_start_ts}.json"
logger.info(f"Writing benchmark JSON to {report_filename}")
with open(report_filename, "w") as f:
f.write(report.model_dump_json())
results.append((report, report_filename))
return results


if __name__ == "__main__":

parser = argparse.ArgumentParser()
@@ -41,8 +57,13 @@ def create_pipeline_json(workflow_filename: str, jobs_filename: str, workflow_ou
args = parser.parse_args()

logger.info(f"Creating pipeline JSON for workflow run ID {args.run_id}")
create_pipeline_json(
pipeline, _ = create_pipeline_json(
workflow_filename=f"{args.output_dir}/{args.run_id}/workflow.json",
jobs_filename=f"{args.output_dir}/{args.run_id}/workflow_jobs.json",
workflow_outputs_dir=args.output_dir,
)

create_benchmark_jsons(
pipeline=pipeline,
workflow_outputs_dir=args.output_dir,
)
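
For orientation, a small sketch of the filename that create_benchmark_jsons ends up writing via get_benchmark_filename. The job ID and timestamp below are placeholder values for illustration, not taken from a real run.

from datetime import datetime, timezone

# Hypothetical values, for illustration only.
job_id = 33854708624
run_start_ts = datetime(2024, 12, 3, 12, 0, tzinfo=timezone.utc)

ts = run_start_ts.strftime("%Y-%m-%dT%H:%M:%S%z")
print(f"benchmark_{job_id}_{ts}.json")
# benchmark_33854708624_2024-12-03T12:00:00+0000.json

These files are what the sftp-json.txt batch file picks up with `put -r benchmark_*.json`, and what the action copies to benchmarkcopy_<run_id>.json before the artifact upload.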
114 changes: 114 additions & 0 deletions .github/actions/collect_data/src/pydantic_models.py
@@ -114,3 +114,117 @@ class Pipeline(BaseModel):
git_author: str = Field(description="Author of the Git commit.")
orchestrator: Optional[str] = Field(None, description="CI/CD pipeline orchestration platform.")
jobs: List[Job] = []


class BenchmarkMeasurement(BaseModel):
"""
Contains measurements for each benchmark run, iteration and step.
A run can have multiple iterations, each iteration can have multiple steps and each
step can execute multiple measurements.
"""

step_start_ts: datetime = Field(description="Timestamp with time zone when the step started.")
step_end_ts: datetime = Field(description="Timestamp with time zone when the step ended.")
iteration: int = Field(
description="A benchmark run can comprise a loop that repeats with the same "
"parameters the same sequence of steps and measurements for each. "
"This integer is the repetition number."
)
step_name: str = Field(description="Name of the benchmark step within the run.")
step_warm_up_num_iterations: Optional[int] = Field(
None, description="Number of iterations for device warm-up at each step."
)
name: str = Field(
description="Name of the measurement performed, e.g. tokens_per_sec_per_user, "
"tokens_per_sec, images_per_sec, pearson_correlation, "
"top1/top5 ratios."
)
value: float = Field(description="Measured value.")
target: Optional[float] = Field(None, description="Target value.")
device_power: Optional[float] = Field(
None,
description="Average power consumption in Watts during the benchmark step.",
)
device_temperature: Optional[float] = Field(
None, description="Average temperature of the device during the benchmark."
)


class CompleteBenchmarkRun(BaseModel):
"""
Contains information about each execution of an AI model benchmark, called benchmark
run, composed of steps each of which performs a set of measurements.
The sequence of steps in a run can be iterated in a loop.
"""

run_start_ts: datetime = Field(description="Timestamp with time zone when the benchmark run started.")
run_end_ts: datetime = Field(description="Timestamp with time zone when the benchmark run ended.")
run_type: str = Field(description="Description of the benchmark run, e.g. a100_fp16_experiments.")
git_repo_name: Optional[str] = Field(
None,
description="Name of the Git repository containing the code that executes " "the benchmark.",
)
git_commit_hash: Optional[str] = Field(
None,
description="Git commit hash of the code used to run the benchmark (software " "version info).",
)
git_commit_ts: Optional[datetime] = Field(None, description="Timestamp with timezone of the git commit.")
git_branch_name: Optional[str] = Field(
None, description="Name of the Git branch associated with the benchmark run."
)
github_pipeline_id: Optional[int] = Field(
None,
description="Unique identifier for the pipeline record from GitHub Actions.",
)
github_pipeline_link: Optional[str] = Field(
None,
description="Link to the GitHub job run associated with the benchmark run.",
)
github_job_id: Optional[int] = Field(None, description="Unique GitHub Actions CI job ID.")
user_name: Optional[str] = Field(None, description="Name of the person that executed the benchmark run.")
docker_image: Optional[str] = Field(
None,
description="Name or ID of the Docker image used for benchmarking (software "
"version info), e.g., trt-llm-v080.",
)
device_hostname: str = Field(description="Host name of the device on which the benchmark is performed.")
device_ip: Optional[str] = Field(None, description="Host IP address.")
device_info: Optional[dict] = Field(
None,
description="Device information as JSON, such as manufacturer, card_type, "
"dram_size, num_cores, price, bus_interface, optimal_clock_speed.",
)
ml_model_name: str = Field(description="Name of the benchmarked neural network model.")
ml_model_type: Optional[str] = Field(
None,
description="Model type, such as text generation, classification, question " "answering, etc.",
)
num_layers: Optional[int] = Field(None, description="Number of layers of the model.")
batch_size: Optional[int] = Field(None, description="Batch size.")
config_params: Optional[dict] = Field(None, description="Additional training/inference parameters.")
precision: Optional[str] = Field(
None,
description="Numerical precision, such as bfp8, fp16, or a mix such as " "fp16_act_bfp8_weights, etc.",
)
dataset_name: Optional[str] = Field(None, description="Name of the dataset used for the benchmark.")
profiler_name: Optional[str] = Field(None, description="Profiler to time the benchmark.")
input_sequence_length: Optional[int] = Field(
None,
description="Length of the sequence used as input to the model, applicable " "to sequence models.",
)
output_sequence_length: Optional[int] = Field(
None,
description="Length of the sequence used as output by the model, applicable " "to sequence models.",
)
image_dimension: Optional[str] = Field(
None,
description="Dimension of the image, e.g. 224x224x3, applicable to computer " "vision models.",
)
perf_analysis: Optional[bool] = Field(
None,
description="If the model was run in perf analysis mode. This is " "kernel/operation execution mode.",
)
training: Optional[bool] = Field(None, description="ML model benchmarks for training or inference.")
measurements: List[BenchmarkMeasurement] = Field(description="List of benchmark measurements.")
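
As a quick illustration of the new models, a minimal sketch that builds one BenchmarkMeasurement and serializes it the same way generate_data.py serializes reports. All field values are made up for the example, the class is assumed to be imported from pydantic_models, and every optional field falls back to its None default.

from datetime import datetime, timezone

from pydantic_models import BenchmarkMeasurement

# Illustrative values only; a real measurement is built in _map_benchmark_data.
measurement = BenchmarkMeasurement(
    step_start_ts=datetime(2024, 12, 3, 12, 0, tzinfo=timezone.utc),
    step_end_ts=datetime(2024, 12, 3, 12, 5, tzinfo=timezone.utc),
    iteration=0,
    step_name="",
    name="samples_per_sec",
    value=0.2398,
)
print(measurement.model_dump_json())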
@@ -0,0 +1,22 @@
{
"model": "MNIST Linear",
"config": "",
"date": "24-12-03",
"hash": "c47f41a",
"machine_name": "8cb186cee6d2",
"samples_per_sec": 0.23979727678872859,
"total_samples": 1,
"total_time": 4.170189142227173,
"training": false,
"batch_size": 1,
"output": "forge-benchmark-e2e-mnist_33854708624.json",
"arch": "",
"chips": "",
"device": "",
"galaxy": "",
"perf_analysis": "",
"load_tti": "",
"save_tti": "",
"task": "",
"evaluation_score": ""
}
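
To connect this sample report with _map_benchmark_data above: only a handful of keys are currently consumed, and the remaining fields in the fixture are ignored by the mapping. A minimal sketch of the reads involved, with the dict abridged from the fixture:

# Abridged from the sample report; unused fixture keys omitted.
report_data = {
    "model": "MNIST Linear",
    "samples_per_sec": 0.23979727678872859,
    "total_time": 4.170189142227173,
    "training": False,
    "batch_size": 1,
}

ml_model_name = report_data["model"]               # required key
samples_per_sec = report_data["samples_per_sec"]   # required key
total_time = report_data["total_time"]             # required key
batch_size = report_data.get("batch_size", None)   # optional, defaults to None
training = report_data.get("training", False)      # optional, defaults to False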