Use GitHub action caching to speed up test image build #97

Draft: wants to merge 10 commits into main

81 changes: 70 additions & 11 deletions .github/workflows/CI.yml
@@ -4,6 +4,7 @@ on:
push:
branches: main
tags: "*"

jobs:
test:
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
@@ -54,29 +55,80 @@ jobs:
DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}

cluster-test:
name: Cluster Test
name: Cluster Test - Julia ${{ matrix.julia-version }} - K8s ${{ matrix.k8s-version }} - minikube ${{ matrix.minikube-version }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
julia-version:
- "1"
# Support the latest versions of the supported releases: https://kubernetes.io/releases/.
# These must be full version numbers including the patch.
k8s-version:
- "1.24.12"
- "1.25.8"
- "1.26.3"
minikube-version:
- "1.29.0"
env:
# Reference the HEAD commit which triggered this workflow. By default PRs use a merge commit.
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
K8S_CLUSTER_TESTS: "true"
IMAGE_REPO: "k8s-cluster-managers"
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
ref: ${{ env.SHA }} # Always checkout HEAD commit
fetch-depth: 2 # To determine parent commit SHA
- name: Determine SHAs
id: sha
run: |
head_short_sha="$(git rev-parse --short HEAD)"
parent_short_sha="$(git rev-parse --short HEAD^)"
echo "head_short_sha=$head_short_sha" | tee -a "$GITHUB_OUTPUT"
echo "parent_short_sha=$parent_short_sha" | tee -a "$GITHUB_OUTPUT"
echo "HEAD_SHORT_SHA=$head_short_sha" | tee -a "$GITHUB_ENV"
echo "PARENT_SHORT_SHA=$parent_short_sha" | tee -a "$GITHUB_ENV"
- uses: julia-actions/setup-julia@v1
with:
version: "1"
version: ${{ matrix.julia-version }}
- uses: julia-actions/cache@v1
- uses: julia-actions/julia-buildpkg@v1

# https://github.com/marketplace/actions/docker-setup-buildx
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2

# Allow Docker layers to persist between CI workflow runs. Using SHAs as our keys allows
# workflow restarts to pull the correct cache data instead of the most recent cached data.
# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#matching-a-cache-key
- uses: actions/cache/restore@v3
id: restore-docker-image-cache
with:
key: docker-image-cache-${{ steps.sha.outputs.head_short_sha }}
path: ./images
restore-keys: |
docker-image-cache-${{ steps.sha.outputs.parent_short_sha }}
docker-image-cache-
- name: Load Docker image
run: |
parent_image_path="./images/$IMAGE_REPO:$PARENT_SHORT_SHA"
if [ -f "$parent_image_path" ]; then
docker load <"$parent_image_path"
docker tag "$IMAGE_REPO:$PARENT_SHORT_SHA" "$IMAGE_REPO:$HEAD_SHORT_SHA"
rm "$parent_image_path" # Avoid storing the old image in the new cache entry
fi
- name: Build Docker image
run: |
GIT_REV=$(git rev-parse --short HEAD)
IMAGE=k8s-cluster-managers:$GIT_REV
echo "K8S_CLUSTER_MANAGERS_TEST_IMAGE=$IMAGE" >> $GITHUB_ENV
docker build -t $IMAGE .
image="$IMAGE_REPO:$HEAD_SHORT_SHA"
docker build -t "$image" .
echo "K8S_CLUSTER_MANAGERS_TEST_IMAGE=$image" | tee -a "$GITHUB_ENV"
- name: Save Docker image
run: |
image_path="./images/$K8S_CLUSTER_MANAGERS_TEST_IMAGE"
mkdir -p "$(dirname "$image_path")"
docker save "$K8S_CLUSTER_MANAGERS_TEST_IMAGE" >"$image_path"
echo "IMAGE_PATH=$image_path" | tee -a "$GITHUB_ENV"

# Factors influencing the setup of the "local" Kubernetes cluster:
# - Limited resources on GitHub runners only allow running one pod at a time with
@@ -89,9 +141,9 @@ jobs:
uses: manusa/actions-setup-minikube@v2.7.2
with:
# https://github.com/kubernetes/minikube/releases
minikube version: v1.22.0
# https://github.com/kubernetes/kubernetes/releases
kubernetes version: v1.21.4
minikube version: v${{ matrix.minikube-version }}
# Needs to match the tags format: https://github.com/kubernetes/kubernetes/tags
kubernetes version: v${{ matrix.k8s-version }}
driver: docker
start args: --nodes=2 --cni=kindnet

@@ -105,9 +157,16 @@ jobs:
run: |
for node in $(minikube node list | cut -f1); do
echo "Transfering image to node $node..."
docker save $K8S_CLUSTER_MANAGERS_TEST_IMAGE | minikube ssh --node $node --native-ssh=false -- docker load
cat "$IMAGE_PATH" | minikube ssh --node $node --native-ssh=false -- docker load
done

- uses: julia-actions/julia-runtest@v1
- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v3

# https://github.com/actions/cache/tree/main/save#always-save-cache
- uses: actions/cache/save@v3
if: always()
with:
key: ${{ steps.restore-docker-image-cache.outputs.cache-primary-key }}
path: ./images
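
The restore/save pair above yields a three-level fallback: an exact hit on the head SHA covers workflow restarts, a hit on the parent SHA lets a push reuse the image built for the previous commit, and the bare `docker-image-cache-` prefix falls back to the most recent entry. A rough Julia model of that key matching (an illustration of the documented actions/cache semantics, not a real API):

    # Illustrative model of the actions/cache key matching relied on above.
    # Assumption: simplified; the real action additionally orders prefix
    # matches by creation time so that the newest entry wins.
    function select_cache_key(keys::Vector{String}, head_sha, parent_sha)
        exact = "docker-image-cache-$head_sha"
        exact in keys && return exact    # Workflow restart: reuse our own image
        parent = "docker-image-cache-$parent_sha"
        parent in keys && return parent  # Push build: start from the parent's image
        i = findfirst(startswith("docker-image-cache-"), keys)
        return isnothing(i) ? nothing : keys[i]  # Any cached image beats a cold build
    end
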
42 changes: 34 additions & 8 deletions test/cluster.jl
@@ -35,18 +35,20 @@ const POD_OUTPUT_REGEX = r"From worker (?<worker_id>\d+):\s+(?<output>.*?)\r?\n"
# As a convenience we'll automatically build the Docker image when a user runs `Pkg.test()`.
# If the environment variable is set we expect the Docker image has already been built.
if !haskey(ENV, "K8S_CLUSTER_MANAGERS_TEST_IMAGE")
if success(`command -v minikube`) && !haskey(ENV, "MINIKUBE_ACTIVE_DOCKERD")
@warn "minikube users should run `eval \$(minikube docker-env)` before executing " *
"tests. Otherwise you may see pods fail with the reason \"ErrImageNeverPull\""
if readchomp(`$(kubectl) config current-context`) == "minikube" && !haskey(ENV, "MINIKUBE_ACTIVE_DOCKERD")
# When using a minikube cluster we need to build the image within the minikube
# environment; otherwise we'll see pods fail with the reason "ErrImageNeverPull".
withenv(minikube_docker_env()...) do
run(`docker build -t $TEST_IMAGE $PKG_DIR`)
end
else
run(`docker build -t $TEST_IMAGE $PKG_DIR`)
end

run(`docker build -t $TEST_IMAGE $PKG_DIR`)

# Alternate build call which works on Apple Silicon
# run(pipeline(`docker save $TEST_IMAGE`, `minikube ssh --native-ssh=false -- docker load`))
end
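
On multi-node minikube clusters the locally built image additionally has to reach every node's Docker daemon, which is what the workflow's transfer loop handles in CI. A sketch of the same transfer from the Julia side (assuming the tab-separated `name<TAB>ip` output of `minikube node list` that the workflow's `cut -f1` relies on):

    # Sketch: stream the freshly built image into each minikube node.
    for line in readlines(`minikube node list`)
        node = first(split(line, '\t'))
        @info "Transferring image to node $node"
        run(pipeline(`docker save $TEST_IMAGE`,
                     `minikube ssh --node $node --native-ssh=false -- docker load`))
    end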


@testset "pod control" begin
pod_control_manifest = YAML.load_file(joinpath(@__DIR__, "pod-control.yaml"))

@@ -226,6 +228,10 @@ let job_name = "test-success"
# Display details to assist in debugging the failure
if any(r -> !(r isa Test.Pass || r isa Test.Broken), test_results)
report(job_name, "manager" => manager_pod, "worker" => worker_pod)
else
@info "Deleting job/pods for $job_name"
delete_job(job_name; wait=false)
delete_pod(worker_pod; wait=false)
end
end
end
@@ -295,6 +301,10 @@ let job_name = "test-multi-addprocs"
end

report(job_name, "manager" => manager_pod, worker_pairs...)
else
@info "Deleting job/pods for $job_name"
delete_job(job_name; wait=false)
foreach(pod_name -> delete_pod(pod_name; wait=false), worker_pods)
end
end
end
@@ -340,6 +350,10 @@ let job_name = "test-interrupt"
# Display details to assist in debugging the failure
if any(r -> !(r isa Test.Pass || r isa Test.Broken), test_results)
report(job_name, "manager" => manager_pod, "worker" => worker_pod)
else
@info "Deleting job/pods for $job_name"
delete_job(job_name; wait=false)
delete_pod(worker_pod; wait=false)
end
end
end
@@ -409,6 +423,10 @@ let job_name = "test-oom"
# Display details to assist in debugging the failure
if any(r -> !(r isa Test.Pass || r isa Test.Broken), test_results)
report(job_name, "manager" => manager_pod, "worker" => worker_pod)
else
@info "Deleting job/pods for $job_name"
delete_job(job_name; wait=false)
delete_pod(worker_pod; wait=false)
end
end
end
@@ -424,8 +442,13 @@ let job_name = "test-pending-timeout"
return pod
end

# Request 1 exbibyte of memory (should always fail)
mgr = K8sClusterManager(1; configure, pending_timeout=1, memory="1Ei")
# Make a worker memory request so large that it will always fail.
# Previously, with Kubernetes 1.22, we used "1Ei" (1 exbibyte) as this large
# value, but that now fails with Kubernetes 1.26, so we'll use "1Pi" (1 pebibyte)
# instead.
# TODO: Reproduce the "1Ei" failure outside of K8sClusterManagers and file an
# issue for this.
mgr = K8sClusterManager(1; configure, pending_timeout=1, memory="1Pi")
pids = addprocs(mgr)

println("Num Processes: ", nprocs())
Expand Down Expand Up @@ -466,6 +489,9 @@ let job_name = "test-pending-timeout"
# Display details to assist in debugging the failure
if any(r -> !(r isa Test.Pass || r isa Test.Broken), test_results)
report(job_name, "manager" => manager_pod)
else
@info "Deleting job/pods for $job_name"
delete_job(job_name; wait=false)
end
end
end
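
The same cleanup branch now appears in each of the five test blocks above; a small helper could consolidate it. A sketch (hypothetical, not part of this PR):

    # Hypothetical helper to deduplicate the cleanup branches above.
    function cleanup_job(job_name::AbstractString, worker_pods::AbstractString...)
        @info "Deleting job/pods for $job_name"
        delete_job(job_name; wait=false)
        foreach(pod -> delete_pod(pod; wait=false), worker_pods)
        return nothing
    end

Each `else` branch would then collapse to `cleanup_job(job_name, worker_pod)`, or `cleanup_job(job_name, worker_pods...)` in the multi-worker test.
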
6 changes: 4 additions & 2 deletions test/job.template.yaml
@@ -42,8 +42,10 @@ kind: Job
metadata:
name: {{{:job_name}}}
spec:
# Clean up finished jobs
ttlSecondsAfterFinished: 10
# Avoid using `ttlSecondsAfterFinished` as this will cause failed jobs to be cleaned up,
# which makes debugging failures harder. Instead we'll just manually delete the created
# resources.

# Stop the job from creating a new pod when the container exits in error
backoffLimit: 0
template:
36 changes: 36 additions & 0 deletions test/utils.jl
@@ -27,6 +27,15 @@ function wait_job(job_name; condition=!isempty, timeout=60)
end
end

function delete_job(name::AbstractString; wait::Bool=true)
kubectl_cmd = `$(kubectl()) delete job/$name --wait=$wait`
err = IOBuffer()
run(pipeline(ignorestatus(kubectl_cmd), stdout=devnull, stderr=err))

err.size > 0 && throw(KubeError(err))
return nothing
end
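
The tests also call `delete_pod`, whose definition falls outside this diff. Presumably it mirrors `delete_job`; a minimal sketch under that assumption:

    # Assumed shape of `delete_pod` (not shown in this diff); mirrors `delete_job`.
    function delete_pod(name::AbstractString; wait::Bool=true)
        kubectl_cmd = `$(kubectl()) delete pod/$name --wait=$wait`
        err = IOBuffer()
        run(pipeline(ignorestatus(kubectl_cmd), stdout=devnull, stderr=err))
        err.size > 0 && throw(KubeError(err))
        return nothing
    end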

pod_exists(pod_name) = success(`$(kubectl()) get pod/$pod_name`)

# Will fail if called and the job is in state "Waiting"
@@ -83,3 +92,30 @@ function report(job_name, pods::Pair...)
end
end
end

function minikube_docker_env()
env_vars = Pair{String,String}[]
open(`minikube docker-env`) do f
while !eof(f)
line = readline(f)

if startswith(line, "export")
line = replace(line, r"^export " => "")
key, value = split(line, '='; limit=2)
push!(env_vars, key => unquote(value))
end
end
end

return env_vars
end
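
Example usage, as in `test/cluster.jl`: splat the returned pairs into `withenv` so the image is built inside minikube's Docker daemon rather than the host's:

    # Build inside minikube's Docker environment so pods started with
    # `imagePullPolicy: Never` can find the image.
    withenv(minikube_docker_env()...) do
        run(`docker build -t $TEST_IMAGE $PKG_DIR`)
    end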

isquoted(str::AbstractString) = startswith(str, '"') && endswith(str, '"')

function unquote(str::AbstractString)
if isquoted(str)
return replace(SubString(str, 2, lastindex(str) - 1), "\\\"" => "\"")
else
throw(ArgumentError("Passed in string is not quoted"))
end
end
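
Expected behavior of these helpers (a usage sketch):

    isquoted("\"abc\"")                  # true
    unquote("\"/home/user/.minikube\"")  # "/home/user/.minikube"
    unquote("\"say \\\"hi\\\"\"")        # "say \"hi\"" (escaped quotes unescaped)
    unquote("not-quoted")                # throws ArgumentError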