Compare commits

...

3 Commits

Author SHA1 Message Date
Nathaniel May
0a2dbf3721 add manually modeled 1.0.3 baseline 2022-03-16 12:10:02 -04:00
Nathaniel May
a138f5f42b init sampling action 2022-03-16 11:50:20 -04:00
Nathaniel May
c5672b564b init modeling action 2022-03-16 11:49:09 -04:00
3 changed files with 488 additions and 0 deletions

310
.github/workflows/model_performance.yml vendored Normal file
View File

@@ -0,0 +1,310 @@
# **what?**
# This workflow models the performance characteristics of a point in time in dbt.
# It runs specific dbt commands on committed projects multiple times to create and
# commit information about the distribution to the current branch. For more information
# see the readme in the performance module at /performance/README.md.
#
# **why?**
# When developing new features, we can take quick performance samples and compare
# them against the commited baseline measurements produced by this workflow to detect
# some performance regressions at development time before they reach users.
#
# **when?**
# This is only run once directly after each release. If for some reason the results of
# a run are not satisfactory, it can also be triggered manually.
name: Model Performance Characteristics
on:
# runs after non-prereleases are published.
release:
types: [released]
# run manually from the actions tab
workflow_dispatch:
inputs:
# if we ever want to model pre-releases, we would need to handle pre-release version numbers
# here and in the runner. Even if we use a semver library I suspect it will be rather difficult.
release_id:
description: '(^^ always run from main) dbt version to model (must be non-prerelease in Pypi)'
default: 9.9.9
required: true
open_prs:
description: Open PRs to main and release branch? (branch name inferred from provided version) (yes/no)
default: 'no'
required: true
env:
RUNNER_CACHE_PATH: performance/runner/target/release/runner
# both jobs need to write
permissions:
contents: write
jobs:
latest-runner:
name: Build or Fetch Runner
runs-on: ubuntu-latest
env:
RUSTFLAGS: "-D warnings"
outputs:
cache_key: ${{ steps.variables.outputs.cache_key }}
release_id: ${{ steps.variables.outputs.release_id }}
open_prs: ${{ steps.variables.outputs.open_prs }}
release_branch: ${{ steps.variables.outputs.release_branch }}
steps:
# explicitly checkout the performance runner from main regardless of which
# version we are modeling.
- name: Checkout
uses: actions/checkout@v2
with:
ref: main
# collect all the variables that need to be used in subsequent jobs
- name: Set Variables
id: variables
run: |
# create a cache key that will be used in the next job. without this the
# next job would have to checkout from main an hash the files itself.
echo "::set-output name=cache_key::${{ runner.os }}-${{ hashFiles('performance/runner/Cargo.toml')}}-${{ hashFiles('performance/runner/src/*') }}"
# this value gets used to create other values locally so it gets a local definition too.
local_release_id=''
# users are prompted to input with the correct format
if [[ $GITHUB_EVENT_NAME == "workflow_dispatch" ]]; then
echo "Workflow dispatch event detected"
local_release_id=${{github.event.inputs.release_id}}
echo "::set-output name=release_id::${{github.event.inputs.release_id}}"
echo "::set-output name=open_prs::${{github.event.inputs.open_prs}}"
# release.tag_name has a v prepended. we must remove it.
else
echo "release event detected"
with_v=${{github.event.release.tag_name}}
without_v=${with_v:1}
local_release_id=$without_v
echo "::set-output name=release_id::$without_v"
echo "::set-output name=open_prs::yes"
fi
# string manipulation to get the branch name. It can't be discovered from the github api
# for release triggers so we're stuck with this. If we change our branch naming strategy
# we have to update this code. example: 1.0.0 -> 1.0.latest. the sed command takes into
# account multiple digits like 1.0.999 -> 1.0.latest
no_patch=$(sed "s|\(.*\)\..*|\1|" <<< $local_release_id)
branch_name="${no_patch}.latest"
echo "::set-output name=release_branch::$branch_name"
echo "release branch is inferred to be ${branch_name}"
# attempts to access a previously cached runner
- uses: actions/cache@v2
id: cache
with:
path: ${{ env.RUNNER_CACHE_PATH }}
key: ${{ steps.variables.outputs.cache_key }}
- name: Fetch Rust Toolchain
if: steps.cache.outputs.cache-hit != 'true'
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- name: Add fmt
if: steps.cache.outputs.cache-hit != 'true'
run: rustup component add rustfmt
- name: Cargo fmt
if: steps.cache.outputs.cache-hit != 'true'
uses: actions-rs/cargo@v1
with:
command: fmt
args: --manifest-path performance/runner/Cargo.toml --all -- --check
- name: Test
if: steps.cache.outputs.cache-hit != 'true'
uses: actions-rs/cargo@v1
with:
command: test
args: --manifest-path performance/runner/Cargo.toml
- name: Build (optimized)
if: steps.cache.outputs.cache-hit != 'true'
uses: actions-rs/cargo@v1
with:
command: build
args: --release --manifest-path performance/runner/Cargo.toml
# the cache action automatically caches this binary at the end of the job
model:
# depends on `latest-runner` as a separate job so that failures in this job do not prevent
# a successfully tested and built binary from being cached.
needs: [latest-runner]
name: Model a release
runs-on: ubuntu-latest
steps:
- name: '[DEBUG] print variables'
run: |
echo "all variables defined in latest-runner > Set Variables > outputs"
echo "cache_key: ${{ needs.latest-runner.outputs.cache_key }}"
echo "release_id: ${{ needs.latest-runner.outputs.release_id }}"
echo "open_prs: ${{ needs.latest-runner.outputs.open_prs }}"
echo "release_branch: ${{ needs.latest-runner.outputs.release_branch }}"
- name: Setup Python
uses: actions/setup-python@v2.2.2
with:
python-version: "3.8"
- name: Install dbt
run: pip install dbt-postgres==${{ needs.latest-runner.outputs.release_id }}
- name: Install Hyperfine
run: wget https://github.com/sharkdp/hyperfine/releases/download/v1.11.0/hyperfine_1.11.0_amd64.deb && sudo dpkg -i hyperfine_1.11.0_amd64.deb
# explicitly checkout main to get the latest project definitions
- name: Checkout
uses: actions/checkout@v2
with:
ref: main
# this was built in the previous job so it will be there.
- name: Fetch Runner
uses: actions/cache@v2
id: cache
with:
path: ${{ env.RUNNER_CACHE_PATH }}
key: ${{ needs.latest-runner.outputs.cache_key }}
- name: Move Runner
run: mv performance/runner/target/release/runner performance/app
- name: Change Runner Permissions
run: chmod +x ./performance/app
- name: '[DEBUG] ls baseline directory before run'
run: ls -R performance/baselines/
# `${{ github.workspace }}` is used to pass the absolute path
# TODO CHANGE NUMBER OF RUNS BEFORE MERGING
# TODO this isn't putting the baseline in the right directory. it's putting it one level up.
- name: Run Measurement
run: mkdir ${{ github.workspace }}/performance/tmp/ && mkdir -p performance/baselines/${{ needs.latest-runner.outputs.release_id }}/ && performance/app model -v ${{ needs.latest-runner.outputs.release_id }} -b ${{ github.workspace }}/performance/baselines/ -p ${{ github.workspace }}/performance/projects/ -t ${{ github.workspace }}/performance/tmp/ -n 2
- name: '[DEBUG] ls baseline directory after run'
run: ls -R performance/baselines/
- uses: actions/upload-artifact@v3
with:
name: baseline
path: performance/baselines/${{ needs.latest-runner.outputs.release_id }}/
pr-release-branch:
if: ${{ needs.latest-runner.outputs.open_prs == 'yes' && needs.latest-runner.outputs.release_branch != 'main' }}
# depends on `model` as a separate job so that the baseline can be committed to more than one branch
# i.e. release branch and main
needs: [latest-runner, model]
name: Open PR for release branch (if specified)
runs-on: ubuntu-latest
steps:
- name: '[DEBUG] print variables'
run: |
echo "all variables defined in latest-runner > Set Variables > outputs"
echo "cache_key: ${{ needs.latest-runner.outputs.cache_key }}"
echo "release_id: ${{ needs.latest-runner.outputs.release_id }}"
echo "open_prs: ${{ needs.latest-runner.outputs.open_prs }}"
echo "release_branch: ${{ needs.latest-runner.outputs.release_branch }}"
# explicitly checkout the branch specified during dispatch
- name: Checkout
uses: actions/checkout@v2
with:
ref: ${{ needs.latest-runner.outputs.release_branch }}
- name: '[DEBUG] ls baselines before artifact download'
run: ls -R performance/baselines/
- uses: actions/download-artifact@v3
with:
name: baseline
path: performance/baselines/${{ needs.latest-runner.outputs.release_id }}
- name: '[DEBUG] ls baselines after artifact download'
run: ls -R performance/baselines/
- name: Make Branch
uses: EndBug/add-and-commit@v8
with:
add: performance/baselines
author_name: 'Github Build Bot'
author_email: 'buildbot@fishtownanalytics.com'
message: 'adding performance baseline for ${{ needs.latest-runner.outputs.release_id }}'
new_branch: 'performance-bot/release_${{needs.latest-runner.outputs.release_id}}_${{GITHUB.RUN_ID}}'
push: false
- name: Create Pull Request
uses: peter-evans/create-pull-request@v3
with:
author: 'Github Build Bot <buildbot@fishtownanalytics.com>'
draft: true
base: ${{ needs.latest-runner.outputs.release_branch }}
title: 'Adding performance modeling for ${{needs.latest-runner.outputs.release_id}}'
branch: 'performance-bot/main_${{needs.latest-runner.outputs.release_id}}_${{GITHUB.RUN_ID}}'
pr-main-branch:
if: ${{ needs.latest-runner.outputs.open_prs == 'yes' }}
# depends on `model` as a separate job so that the baseline can be committed to more than one branch
# i.e. release branch and main
needs: [latest-runner, model]
name: Open PR for main (if specified)
runs-on: ubuntu-latest
steps:
- name: '[DEBUG] print variables'
run: |
echo "all variables defined in latest-runner > Set Variables > outputs"
echo "cache_key: ${{ needs.latest-runner.outputs.cache_key }}"
echo "release_id: ${{ needs.latest-runner.outputs.release_id }}"
echo "open_prs: ${{ needs.latest-runner.outputs.open_prs }}"
echo "release_branch: ${{ needs.latest-runner.outputs.release_branch }}"
# explicitly checkout main
- name: Checkout
uses: actions/checkout@v2
with:
ref: main
- name: '[DEBUG] ls baselines before artifact download'
run: ls -R performance/baselines/
- uses: actions/download-artifact@v3
with:
name: baseline
path: performance/baselines/${{ needs.latest-runner.outputs.release_id }}
- name: '[DEBUG] ls baselines after artifact download'
run: ls -R performance/baselines/
- name: Make Branch
uses: EndBug/add-and-commit@v8
with:
add: performance/baselines
author_name: 'Github Build Bot'
author_email: 'buildbot@fishtownanalytics.com'
message: 'adding performance baseline for ${{ needs.latest-runner.outputs.release_id }}'
new_branch: 'performance-bot/main_${{needs.latest-runner.outputs.release_id}}_${{GITHUB.RUN_ID}}'
push: false
- name: Create Pull Request
uses: peter-evans/create-pull-request@v3
with:
author: 'Github Build Bot <buildbot@fishtownanalytics.com>'
draft: true
base: main
title: 'Adding performance modeling for ${{needs.latest-runner.outputs.release_id}}'
branch: 'performance-bot/main_${{needs.latest-runner.outputs.release_id}}_${{GITHUB.RUN_ID}}'

138
.github/workflows/sample_performance.yml vendored Normal file
View File

@@ -0,0 +1,138 @@
# **what?**
# This workflow samples performance characteristics of your commit and compares them to
# the most recent release. If they are significanly off from the previously recorded
# distribution it will trigger a failure. Do not rerun these failures to get them to pass.
# There is more information in the performance readme about how to handle failures.
#
# **why?**
# This will help us potentially catch new performance regressions in development before
# releasing a new version.
#
# **whent?**
# This runs on every commit in PRs.
#
name: Performance Regression Tests
# Schedule triggers
on:
# sampling is fast enough to run on every commit in PRs
pull_request:
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
env:
RUNNER_CACHE_PATH: performance/runner/target/release/runner
jobs:
latest-runner:
name: Build or Fetch Performance Runner
runs-on: ubuntu-latest
env:
RUSTFLAGS: "-D warnings"
steps:
# specifically checksout main so that when we're sampling on commits to
# patch releases, we're using the latest runner code not whatever is in the working branch.
#
# the with clause should be commented out if you're working on the runner
# and want to see output from your code. it's pulling from main, not what you're
# working on.
- name: Checkout
uses: actions/checkout@v2
# TODO uncomment when done developing this.
# with:
# ref: main
# create a cache key that will be used in the next job. without this the
# next job would have to checkout from main an hash the files itself.
- name: Create Cache Key
id: cacheKey
run: echo "::set-output name=key::${{ runner.os }}-${{ hashFiles('performance/runner/Cargo.toml')}}-${{ hashFiles('performance/runner/src/*') }}"
working-directory: ${{ env.RUNNER_CACHE_PATH }}
# attempts to access a previously cached runner
#
# unless you're developing the runner, it should be in the cache.
- uses: actions/cache@v2
id: cache
with:
path: ${{ env.RUNNER_CACHE_PATH }}
key: ${{ steps.cacheKey.outputs.key }}
- name: Fetch Rust Toolchain
if: steps.cache.outputs.cache-hit != 'true'
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- name: Add fmt
if: steps.cache.outputs.cache-hit != 'true'
run: rustup component add rustfmt
- name: Cargo fmt
if: steps.cache.outputs.cache-hit != 'true'
uses: actions-rs/cargo@v1
with:
command: fmt
args: --manifest-path performance/runner/Cargo.toml --all -- --check
- name: Test
if: steps.cache.outputs.cache-hit != 'true'
uses: actions-rs/cargo@v1
with:
command: test
args: --manifest-path performance/runner/Cargo.toml
- name: Build (optimized)
if: steps.cache.outputs.cache-hit != 'true'
uses: actions-rs/cargo@v1
with:
command: build
args: --release --manifest-path performance/runner/Cargo.toml
# the cache action automatically caches this binary at the end of the job
sample:
# depends on `latest-runner` as a separate job so that failures in this job do not prevent
# a successfully tested and built binary from being cached. Also so we can checkout dbt
# from THIS branch not main like we want for the runner.
needs: [latest-runner]
name: Compare Performance Samples
runs-on: ubuntu-latest
steps:
# checkout this branch not main
- name: Checkout
uses: actions/checkout@v2
- name: Setup Python
uses: actions/setup-python@v2.2.2
with:
python-version: "3.8"
- name: Install dbt
run: pip install -r dev-requirements.txt -r editable-requirements.txt
- name: Install Hyperfine
run: wget https://github.com/sharkdp/hyperfine/releases/download/v1.11.0/hyperfine_1.11.0_amd64.deb && sudo dpkg -i hyperfine_1.11.0_amd64.deb
# this was built in the previous job so it will be there.
- name: Fetch Runner
uses: actions/cache@v2
id: cache
with:
path: ${{ env.RUNNER_CACHE_PATH }}
key: ${{ steps.cacheKey.outputs.key }}
- name: Move Runner
run: mv performance/runner/target/release/runner ./performance/app
- name: Change Runner Permissions
run: chmod +x ./performance/app
# `${{ github.workspace }}` is used to pass the absolute path
- name: Run Measurement
run: mkdir tmp && ./app sample -b ${{ github.workspace }}/performance/baselines -p ${{ github.workspace }}/performance/projects -o ${{ github.workspace }}/tmp
working-directory: ${{ github.workspace }}/performance/

View File

@@ -0,0 +1,40 @@
{
"version": "1.0.3",
"metric": {
"name": "parse",
"project_name": "01_2000_simple_models"
},
"ts": "2022-03-04T00:02:52.657727515Z",
"measurement": {
"command": "dbt parse --no-version-check --profiles-dir ../../project_config/",
"mean": 41.224566760615,
"stddev": 0.252468634424254,
"median": 41.182836243915,
"user": 40.70073678499999,
"system": 0.61185062,
"min": 40.89372129691501,
"max": 41.68176405591501,
"times": [
41.397582801915,
41.618822256915,
41.374914350915,
41.68176405591501,
41.255119986915,
41.528348636915,
41.238762892915,
40.950121934915,
41.388716648915,
41.62938069991501,
41.139914502915,
41.114225200915,
41.045012222915,
41.01039839391501,
40.915296414915,
41.006528646915,
40.89372129691501,
40.951454721915,
41.125491559915,
41.225757984915
]
}
}