Compare commits

...

14 Commits

Author SHA1 Message Date
Peter Allen Webb
ce827c658a Clean up dummy credentials and make types in script semi-respectable 2023-11-27 11:48:58 -05:00
Peter Allen Webb
70afe559c6 Fix artifact path 2023-11-22 16:34:03 -05:00
Peter Allen Webb
4b2a84872b Actually use the append operator as intended 2023-11-22 16:30:14 -05:00
Peter Allen Webb
aafb017d5f Show profile contents 2023-11-22 16:25:14 -05:00
Peter Allen Webb
98746ead28 Script bug fixes 2023-11-22 15:21:50 -05:00
Peter Allen Webb
03aab3ab98 Fix script parameter 2023-11-22 15:16:26 -05:00
Peter Allen Webb
c8b911b400 Add parameter to script 2023-11-22 15:08:33 -05:00
Peter Allen Webb
72696ab176 Add mkdir so dir exists 2023-11-22 14:08:49 -05:00
Peter Allen Webb
b661ad586c Fix indentation 2023-11-22 12:50:00 -05:00
Peter Allen Webb
8bbbf6588b Modify on clause 2023-11-22 12:47:51 -05:00
Peter Allen Webb
b7943f3372 Add changelog entry 2023-11-22 12:47:07 -05:00
Peter Allen Webb
52456b2ff7 Add more options to trigger workflow 2023-11-22 12:32:36 -05:00
Peter Allen Webb
85f9fd7251 Add on clause. 2023-11-22 12:24:57 -05:00
Peter Allen Webb
99ef0dd79c Add a GitHub action for performance checks. 2023-11-22 12:11:40 -05:00
3 changed files with 194 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
kind: Under the Hood
body: Added a GitHub action for checking performance characteristics of dbt.
time: 2023-11-22T12:46:12.16794-05:00
custom:
Author: peterallenwebb
Issue: "8323"

59
.github/workflows/perf-check.yml vendored Normal file
View File

@@ -0,0 +1,59 @@
# **what?**
# This workflow uses a Python script to check the performance of dbt against
# baselines on a set of benchmark projects.

name: Performance Check

on:
  pull_request:
  workflow_dispatch:

jobs:
  build-and-perf-check:
    name: Build dbt and check performance
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - name: Check out the repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'

      - name: Install Python dependencies and install dbt
        run: |
          python -m pip install --user --upgrade pip
          python -m pip --version
          make dev
          mypy --version
          dbt --version

      # Writes a throwaway profile with dummy credentials so that dbt can load
      # a profile for the benchmark projects. The leading spaces inside the
      # echoed strings are significant: dbt expects the connection fields
      # nested under outputs -> dev, with `target` as a sibling of `outputs`.
      - name: Create profile
        run: |
          # -p: do not fail if the directory already exists.
          mkdir -p $HOME/.dbt
          echo "default:" >> $HOME/.dbt/profiles.yml
          echo "  outputs:" >> $HOME/.dbt/profiles.yml
          echo "    dev:" >> $HOME/.dbt/profiles.yml
          echo "      dbname: dummy" >> $HOME/.dbt/profiles.yml
          echo "      host: localhost" >> $HOME/.dbt/profiles.yml
          echo "      password: paswd" >> $HOME/.dbt/profiles.yml
          echo "      port: 5432" >> $HOME/.dbt/profiles.yml
          echo "      schema: dummy" >> $HOME/.dbt/profiles.yml
          echo "      threads: 4" >> $HOME/.dbt/profiles.yml
          echo "      type: postgres" >> $HOME/.dbt/profiles.yml
          echo "      user: dummy" >> $HOME/.dbt/profiles.yml
          echo "  target: dev" >> $HOME/.dbt/profiles.yml
          # Show the generated profile in the job log for debugging.
          cat $HOME/.dbt/profiles.yml

      - name: Run performance script
        run: |
          python ./scripts/perf-check.py baseline

      # The script writes its timings to perf_check.json in the working directory.
      - uses: actions/upload-artifact@v3
        with:
          name: perf-check-result
          path: ./perf_check.json

129
scripts/perf-check.py Normal file
View File

@@ -0,0 +1,129 @@
import json
import os
import pathlib
import subprocess
import sys
import time
from typing import Optional, Union
# Benchmark projects, keyed by project name. The script reads these keys from
# each entry:
#   "path"    - working directory in which the project's jobs are run; when
#               absent, the project name is used as the directory instead.
#   "git_url" - optional; when present the repository is cloned into a
#               directory named after the project before jobs run.
#   "jobs"    - mapping of job name to a dict whose "command" is the argv
#               list that is executed and timed.
projects = {
    # Example of how to use a project hosted in a separate public repo; not used yet.
    # "jaffle_shop": {
    #     "name": "jaffle_shop",
    #     "git_url": "https://github.com/dbt-labs/jaffle_shop.git",
    #     "jobs": {
    #         "jaffle_shop__parse_no_partial": {
    #             "command": ["dbt", "parse", "--no-partial-parse"],
    #         },
    #     }
    # },
    "simple_models": {
        "name": "simple_models",
        "path": "./performance/projects/01_2000_simple_models",
        "jobs": {
            # Full parse with partial parsing disabled.
            "simple_models__parse_no_partial": {
                "command": ["dbt", "parse", "--no-partial-parse"],
            },
            # Second parse of the same project, without the --no-partial-parse flag.
            "simple_models__second_parse": {
                "command": ["dbt", "parse"],
            },
        },
    },
}
def print_usage() -> None:
    """Report on standard output that the command line was not understood."""
    sys.stdout.write("invalid usage\n")
def git_checkout(repo: str, path: pathlib.Path, commit: str = None) -> None:
if not os.path.exists(path):
print(f"Didn't find path {path}. Cloing {repo} into {path}.")
res = subprocess.run(["git", "clone", repo, path], capture_output=True)
res.check_returncode()
else:
print(f"Found path {path}. Skipping clone of {repo}.")
if commit:
print(f"Checking out commit {commit} for repo {repo}")
res = subprocess.run(["git", "checkout", commit], cwd=path, capture_output=True)
res.check_returncode()
def prepare_projects(projects) -> None:
    """Fetch the sources of every project that is hosted in a git repository.

    Projects without a ``"git_url"`` entry are left untouched. The others are
    cloned into a directory named after the project's key in ``projects``.
    """
    for checkout_dir, spec in projects.items():
        if "git_url" not in spec:
            continue
        git_checkout(spec["git_url"], checkout_dir)
def run_jobs(projects):
    """Run every job of every project and record how long each one took.

    Each job's ``"command"`` is executed in the project's ``"path"`` (or, when
    no path is given, in a directory named after the project).

    Returns:
        A dict keyed by job name. A job that exits with status 0 maps to
        ``{"succeeded": True, "time": <elapsed seconds>}``; any other job maps
        to ``{"succeeded": False}``.
    """
    timings = {}
    for project_name, project in projects.items():
        for job_name, job in project["jobs"].items():
            print(f"running job {job_name}")
            workdir = project.get("path", project_name)

            began = time.perf_counter()
            proc = subprocess.run(job["command"], cwd=workdir)
            elapsed = time.perf_counter() - began

            if proc.returncode == 0:
                timings[job_name] = {"succeeded": True, "time": elapsed}
            else:
                timings[job_name] = {"succeeded": False}
    return timings
def compare(baseline_file: str, result_file: str) -> None:
    """Print a table comparing job timings in two result files.

    Both files are JSON documents in the format written by the ``baseline``
    command: a mapping of job name to a record with an optional ``"time"``
    (seconds). One row is printed per job in the baseline file.

    The table is rendered with ``rich`` when it is installed; otherwise a
    plain-text table is printed so the comparison is still usable.

    Args:
        baseline_file: Path of the JSON file holding the reference timings.
        result_file: Path of the JSON file holding the timings to evaluate.
    """

    def format_seconds(seconds) -> str:
        # A record without a time (for instance a failed job) is shown as "?".
        return "{:.1f}s".format(seconds) if seconds is not None else "?"

    with open(baseline_file, "r") as b:
        baseline = json.load(b)
    with open(result_file, "r") as r:
        result = json.load(r)

    headers = ("Job Name", "Baseline", "Result", "Change")
    rows = []  # (job name, baseline, result, change, rich colour tag)
    for job_name, baseline_record in baseline.items():
        baseline_time = baseline_record.get("time")
        # A job in the baseline may be missing from the result file entirely.
        result_time = result.get(job_name, {}).get("time")

        change, colour = "-", ""
        # Also skips a zero baseline, which would otherwise divide by zero.
        if result_time and baseline_time:
            change_pct = 100.0 * (result_time - baseline_time) / baseline_time
            # A positive change means the result is slower than the baseline.
            colour = "[red]" if change_pct > 0.0 else "[green]"
            change = "{:.1f}%".format(change_pct)

        rows.append(
            (
                job_name,
                format_seconds(baseline_time),
                format_seconds(result_time),
                change,
                colour,
            )
        )

    print()
    try:
        from rich.console import Console
        from rich.table import Table
    except ImportError:
        # Plain-text fallback: same columns, padded to align, no colours.
        lines = [headers] + [row[:4] for row in rows]
        widths = [max(len(line[i]) for line in lines) for i in range(len(headers))]
        print("Performance Comparison")
        for line in lines:
            print("  ".join(cell.ljust(w) for cell, w in zip(line, widths)).rstrip())
        return

    table = Table(title="Performance Comparison")
    for header in headers:
        table.add_column(header)
    for job_name, baseline_str, result_str, change, colour in rows:
        table.add_row(job_name, baseline_str, result_str, colour + change)
    Console().print(table)
def baseline(projects) -> None:
    """Prepare and run all benchmark jobs, then save the timings as JSON.

    The results are written to ``perf_check.json`` in the current working
    directory, indented by four spaces.
    """
    prepare_projects(projects)
    timings = run_jobs(projects)

    print("Writing results to 'perf_check.json'.")
    serialized = json.dumps(timings, indent=4)
    pathlib.Path("perf_check.json").write_text(serialized)
# Command-line dispatch:
#   perf-check.py baseline                  run the jobs and write perf_check.json
#   perf-check.py compare BASELINE RESULT   print a comparison of two result files
# Anything else, including `compare` without both file arguments, prints the
# usage message instead of failing with an IndexError.
_cli_args = sys.argv[1:]
if _cli_args and _cli_args[0] == "baseline":
    baseline(projects)
elif len(_cli_args) >= 3 and _cli_args[0] == "compare":
    compare(_cli_args[1], _cli_args[2])
else:
    print_usage()