Use GitHub action caching to speed up test image build #97

Draft: wants to merge 10 commits into main

81 changes: 70 additions & 11 deletions .github/workflows/CI.yml
@@ -4,6 +4,7 @@ on:
push:
branches: main
tags: "*"

jobs:
test:
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
@@ -54,29 +55,80 @@ jobs:
DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}

cluster-test:
name: Cluster Test
name: Cluster Test - Julia ${{ matrix.julia-version }} - K8s ${{ matrix.k8s-version }} - minikube ${{ matrix.minikube-version }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
julia-version:
- "1"
# Support the latest versions of the supported releases: https://kubernetes.io/releases/.
# These must be full version numbers including the patch.
k8s-version:
- "1.24.12"
- "1.25.8"
- "1.26.3"
minikube-version:
- "1.29.0"
env:
# Reference the HEAD commit which triggered this workflow. By default PRs use a merge commit.
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
K8S_CLUSTER_TESTS: "true"
IMAGE_REPO: "k8s-cluster-managers"
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
ref: ${{ env.SHA }} # Always checkout HEAD commit
fetch-depth: 2 # To determine parent commit SHA
- name: Determine SHAs
id: sha
run: |
head_short_sha="$(git rev-parse --short HEAD)"
parent_short_sha="$(git rev-parse --short HEAD^)"
echo "head_short_sha=$head_short_sha" | tee -a "$GITHUB_OUTPUT"
echo "parent_short_sha=$parent_short_sha" | tee -a "$GITHUB_OUTPUT"
echo "HEAD_SHORT_SHA=$head_short_sha" | tee -a "$GITHUB_ENV"
echo "PARENT_SHORT_SHA=$parent_short_sha" | tee -a "$GITHUB_ENV"
- uses: julia-actions/setup-julia@v1
with:
version: "1"
version: ${{ matrix.julia-version }}
- uses: julia-actions/cache@v1
- uses: julia-actions/julia-buildpkg@v1

# https://github.com/marketplace/actions/docker-setup-buildx
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2

# Allow Docker layers to persist between CI workflow runs. Using SHAs as our keys allows
# workflow restarts to pull the correct cache data instead of the most recent cached data.
# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#matching-a-cache-key
- uses: actions/cache/restore@v3
id: restore-docker-image-cache
with:
key: docker-image-cache-${{ steps.sha.outputs.head_short_sha }}
path: ./images
restore-keys: |
docker-image-cache-${{ steps.sha.outputs.parent_short_sha }}
docker-image-cache-
- name: Load Docker image
run: |
parent_image_path="./images/$IMAGE_REPO:$PARENT_SHORT_SHA"
if [ -f "$parent_image_path" ]; then
docker load <"$parent_image_path"
docker tag "$IMAGE_REPO:$PARENT_SHORT_SHA" "$IMAGE_REPO:$HEAD_SHORT_SHA"
rm "$parent_image_path" # Avoid storing the old image in the new cache entry
fi
- name: Build Docker image
run: |
GIT_REV=$(git rev-parse --short HEAD)
IMAGE=k8s-cluster-managers:$GIT_REV
echo "K8S_CLUSTER_MANAGERS_TEST_IMAGE=$IMAGE" >> $GITHUB_ENV
docker build -t $IMAGE .
image="$IMAGE_REPO:$HEAD_SHORT_SHA"
docker build -t "$image" .
echo "K8S_CLUSTER_MANAGERS_TEST_IMAGE=$image" | tee -a "$GITHUB_ENV"
- name: Save Docker image
run: |
image_path="./images/$K8S_CLUSTER_MANAGERS_TEST_IMAGE"
mkdir -p "$(dirname "$image_path")"
docker save "$K8S_CLUSTER_MANAGERS_TEST_IMAGE" >"$image_path"
echo "IMAGE_PATH=$image_path" | tee -a "$GITHUB_ENV"

# Factors influencing the setup of the "local" Kubernetes cluster:
# - Limited resources on GitHub runners only allow running one pod at a time with
@@ -89,9 +141,9 @@ jobs:
uses: manusa/actions-setup-minikube@v2.7.2
with:
# https://github.com/kubernetes/minikube/releases
minikube version: v1.22.0
# https://github.com/kubernetes/kubernetes/releases
kubernetes version: v1.21.4
minikube version: v${{ matrix.minikube-version }}
# Needs to match the tags format: https://github.com/kubernetes/kubernetes/tags
kubernetes version: v${{ matrix.k8s-version }}
driver: docker
start args: --nodes=2 --cni=kindnet

@@ -105,9 +157,16 @@ jobs:
run: |
for node in $(minikube node list | cut -f1); do
echo "Transfering image to node $node..."
docker save $K8S_CLUSTER_MANAGERS_TEST_IMAGE | minikube ssh --node $node --native-ssh=false -- docker load
cat "$IMAGE_PATH" | minikube ssh --node $node --native-ssh=false -- docker load
done

- uses: julia-actions/julia-runtest@v1
- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v3

# https://github.com/actions/cache/tree/main/save#always-save-cache
- uses: actions/cache/save@v3
if: always()
with:
key: ${{ steps.restore-docker-image-cache.outputs.cache-primary-key }}
path: ./images
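
The restore/save pair above yields a three-level fallback: an exact hit on the head SHA covers workflow restarts, a hit on the parent SHA lets a push reuse the image built for the previous commit, and the bare `docker-image-cache-` prefix falls back to the most recent entry. A rough Julia model of that key matching (an illustration of the documented actions/cache semantics, not a real API):

    # Illustrative model of the actions/cache key matching relied on above.
    # Assumption: simplified; the real action additionally orders prefix
    # matches by creation time so that the newest entry wins.
    function select_cache_key(keys::Vector{String}, head_sha, parent_sha)
        exact = "docker-image-cache-$head_sha"
        exact in keys && return exact    # Workflow restart: reuse our own image
        parent = "docker-image-cache-$parent_sha"
        parent in keys && return parent  # Push build: start from the parent's image
        i = findfirst(startswith("docker-image-cache-"), keys)
        return isnothing(i) ? nothing : keys[i]  # Any cached image beats a cold build
    end
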
42 changes: 34 additions & 8 deletions test/cluster.jl
@@ -35,18 +35,20 @@ const POD_OUTPUT_REGEX = r"From worker (?<worker_id>\d+):\s+(?<output>.*?)\r?\n"
# As a convenience we'll automatically build the Docker image when a user runs `Pkg.test()`.
# If the environment variable is set we expect the Docker image has already been built.
if !haskey(ENV, "K8S_CLUSTER_MANAGERS_TEST_IMAGE")
if success(`command -v minikube`) && !haskey(ENV, "MINIKUBE_ACTIVE_DOCKERD")
@warn "minikube users should run `eval \$(minikube docker-env)` before executing " *
"tests. Otherwise you may see pods fail with the reason \"ErrImageNeverPull\""
if readchomp(`$(kubectl) config current-context`) == "minikube" && !haskey(ENV, "MINIKUBE_ACTIVE_DOCKERD")
# When using a minikube cluster we need to build the image within the minikube
# environment; otherwise we'll see pods fail with the reason "ErrImageNeverPull".
withenv(minikube_docker_env()...) do
run(`docker build -t $TEST_IMAGE $PKG_DIR`)
end
else
run(`docker build -t $TEST_IMAGE $PKG_DIR`)
end

run(`docker build -t $TEST_IMAGE $PKG_DIR`)

# Alternate build call which works on Apple Silicon
# run(pipeline(`docker save $TEST_IMAGE`, `minikube ssh --native-ssh=false -- docker load`))
end
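
On multi-node minikube clusters the locally built image additionally has to reach every node's Docker daemon, which is what the workflow's transfer loop handles in CI. A sketch of the same transfer from the Julia side (assuming the tab-separated `name<TAB>ip` output of `minikube node list` that the workflow's `cut -f1` relies on):

    # Sketch: stream the freshly built image into each minikube node.
    for line in readlines(`minikube node list`)
        node = first(split(line, '\t'))
        @info "Transferring image to node $node"
        run(pipeline(`docker save $TEST_IMAGE`,
                     `minikube ssh --node $node --native-ssh=false -- docker load`))
    end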


@testset "pod control" begin
pod_control_manifest = YAML.load_file(joinpath(@__DIR__, "pod-control.yaml"))

@@ -226,6 +228,10 @@ let job_name = "test-success"
# Display details to assist in debugging the failure
if any(r -> !(r isa Test.Pass || r isa Test.Broken), test_results)
report(job_name, "manager" => manager_pod, "worker" => worker_pod)
else
@info "Deleting job/pods for $job_name"
delete_job(job_name; wait=false)
delete_pod(worker_pod; wait=false)
end
end
end
@@ -295,6 +301,10 @@ let job_name = "test-multi-addprocs"
end

report(job_name, "manager" => manager_pod, worker_pairs...)
else
@info "Deleting job/pods for $job_name"
delete_job(job_name; wait=false)
foreach(pod_name -> delete_pod(pod_name; wait=false), worker_pods)
end
end
end
@@ -340,6 +350,10 @@ let job_name = "test-interrupt"
# Display details to assist in debugging the failure
if any(r -> !(r isa Test.Pass || r isa Test.Broken), test_results)
report(job_name, "manager" => manager_pod, "worker" => worker_pod)
else
@info "Deleting job/pods for $job_name"
delete_job(job_name; wait=false)
delete_pod(worker_pod; wait=false)
end
end
end
@@ -409,6 +423,10 @@ let job_name = "test-oom"
# Display details to assist in debugging the failure
if any(r -> !(r isa Test.Pass || r isa Test.Broken), test_results)
report(job_name, "manager" => manager_pod, "worker" => worker_pod)
else
@info "Deleting job/pods for $job_name"
delete_job(job_name; wait=false)
delete_pod(worker_pod; wait=false)
end
end
end
@@ -424,8 +442,13 @@ let job_name = "test-pending-timeout"
return pod
end

# Request 1 exbibyte of memory (should always fail)
mgr = K8sClusterManager(1; configure, pending_timeout=1, memory="1Ei")
# Make a worker memory request so large that it will always fail.
# Previously, with Kubernetes 1.22, we used "1Ei" (1 exbibyte) as this large
# value, but that now fails with Kubernetes 1.26, so we'll use "1Pi" (1 pebibyte)
# instead.
# TODO: Reproduce the "1Ei" failure outside of K8sClusterManagers and file an
# issue for this.
mgr = K8sClusterManager(1; configure, pending_timeout=1, memory="1Pi")
pids = addprocs(mgr)

println("Num Processes: ", nprocs())
Expand Down Expand Up @@ -466,6 +489,9 @@ let job_name = "test-pending-timeout"
# Display details to assist in debugging the failure
if any(r -> !(r isa Test.Pass || r isa Test.Broken), test_results)
report(job_name, "manager" => manager_pod)
else
@info "Deleting job/pods for $job_name"
delete_job(job_name; wait=false)
end
end
end
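
The same cleanup branch now appears in each of the five test blocks above; a small helper could consolidate it. A sketch (hypothetical, not part of this PR):

    # Hypothetical helper to deduplicate the cleanup branches above.
    function cleanup_job(job_name::AbstractString, worker_pods::AbstractString...)
        @info "Deleting job/pods for $job_name"
        delete_job(job_name; wait=false)
        foreach(pod -> delete_pod(pod; wait=false), worker_pods)
        return nothing
    end

Each `else` branch would then collapse to `cleanup_job(job_name, worker_pod)`, or `cleanup_job(job_name, worker_pods...)` in the multi-worker test.
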
6 changes: 4 additions & 2 deletions test/job.template.yaml
@@ -42,8 +42,10 @@ kind: Job
metadata:
name: {{{:job_name}}}
spec:
# Clean up finished jobs
ttlSecondsAfterFinished: 10
# Avoid using `ttlSecondsAfterFinished` as this will cause failed jobs to be cleaned up,
# which makes debugging failures harder. Instead we'll just manually delete the created
# resources.

# Stop the job from creating a new pod when the container exits in error
backoffLimit: 0
template:
36 changes: 36 additions & 0 deletions test/utils.jl
@@ -27,6 +27,15 @@ function wait_job(job_name; condition=!isempty, timeout=60)
end
end

function delete_job(name::AbstractString; wait::Bool=true)
kubectl_cmd = `$(kubectl()) delete job/$name --wait=$wait`
err = IOBuffer()
run(pipeline(ignorestatus(kubectl_cmd), stdout=devnull, stderr=err))

err.size > 0 && throw(KubeError(err))
return nothing
end
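
The tests also call `delete_pod`, whose definition falls outside this diff. Presumably it mirrors `delete_job`; a minimal sketch under that assumption:

    # Assumed shape of `delete_pod` (not shown in this diff); mirrors `delete_job`.
    function delete_pod(name::AbstractString; wait::Bool=true)
        kubectl_cmd = `$(kubectl()) delete pod/$name --wait=$wait`
        err = IOBuffer()
        run(pipeline(ignorestatus(kubectl_cmd), stdout=devnull, stderr=err))
        err.size > 0 && throw(KubeError(err))
        return nothing
    end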

pod_exists(pod_name) = success(`$(kubectl()) get pod/$pod_name`)

# Will fail if called and the job is in state "Waiting"
@@ -83,3 +92,30 @@ function report(job_name, pods::Pair...)
end
end
end

function minikube_docker_env()
env_vars = Pair{String,String}[]
open(`minikube docker-env`) do f
while !eof(f)
line = readline(f)

if startswith(line, "export")
line = replace(line, r"^export " => "")
key, value = split(line, '='; limit=2)
push!(env_vars, key => unquote(value))
end
end
end

return env_vars
end
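
Example usage, as in `test/cluster.jl`: splat the returned pairs into `withenv` so the image is built inside minikube's Docker daemon rather than the host's:

    # Build inside minikube's Docker environment so pods started with
    # `imagePullPolicy: Never` can find the image.
    withenv(minikube_docker_env()...) do
        run(`docker build -t $TEST_IMAGE $PKG_DIR`)
    end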

isquoted(str::AbstractString) = startswith(str, '"') && endswith(str, '"')

function unquote(str::AbstractString)
if isquoted(str)
return replace(SubString(str, 2, lastindex(str) - 1), "\\\"" => "\"")
else
throw(ArgumentError("Passed in string is not quoted"))
end
end
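
Expected behavior of these helpers (a usage sketch):

    isquoted("\"abc\"")                  # true
    unquote("\"/home/user/.minikube\"")  # "/home/user/.minikube"
    unquote("\"say \\\"hi\\\"\"")        # "say \"hi\"" (escaped quotes unescaped)
    unquote("not-quoted")                # throws ArgumentError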