Skip to content

Commit 3b779f2

Browse files
committed
Adding SlowRetry on Infeasible Provisioning
1 parent c746941 commit 3b779f2

File tree

3 files changed

+237
-196
lines changed

3 files changed

+237
-196
lines changed

controller/controller.go

+99-1
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,12 @@ import (
3030
"sync"
3131
"time"
3232

33+
"github.com/kubernetes-csi/csi-lib-utils/slowset"
3334
"github.com/prometheus/client_golang/prometheus"
3435
"github.com/prometheus/client_golang/prometheus/promhttp"
3536
"golang.org/x/time/rate"
37+
"google.golang.org/grpc/codes"
38+
"google.golang.org/grpc/status"
3639
v1 "k8s.io/api/core/v1"
3740
storage "k8s.io/api/storage/v1"
3841
storagebeta "k8s.io/api/storage/v1beta1"
@@ -183,6 +186,10 @@ type ProvisionController struct {
183186
volumeStore VolumeStore
184187

185188
volumeNameHook VolumeNameHook
189+
190+
slowSet *slowset.SlowSet
191+
192+
retryIntervalMax time.Duration
186193
}
187194

188195
const (
@@ -216,6 +223,8 @@ const (
216223
DefaultMetricsPath = "/metrics"
217224
// DefaultAddFinalizer is used when option function AddFinalizer is omitted
218225
DefaultAddFinalizer = false
226+
// DefaultRetryIntervalMax is used when option function RetryIntervalMax is omitted
227+
DefaultRetryIntervalMax = 5 * time.Minute
219228
)
220229

221230
var errRuntime = fmt.Errorf("cannot call option functions after controller has Run")
@@ -451,6 +460,18 @@ func RetryPeriod(retryPeriod time.Duration) func(*ProvisionController) error {
451460
}
452461
}
453462

463+
// RetryIntervalMax is the maximum retry interval of failed provisioning or deletion.
464+
// Defaults to 5 minutes.
465+
func RetryIntervalMax(retryIntervalMax time.Duration) func(*ProvisionController) error {
466+
return func(c *ProvisionController) error {
467+
if c.HasRun() {
468+
return errRuntime
469+
}
470+
c.retryIntervalMax = retryIntervalMax
471+
return nil
472+
}
473+
}
474+
454475
// ClaimsInformer sets the informer to use for accessing PersistentVolumeClaims.
455476
// Defaults to using an internal informer.
456477
func ClaimsInformer(informer cache.SharedIndexInformer) func(*ProvisionController) error {
@@ -667,8 +688,11 @@ func NewProvisionController(
667688
hasRun: false,
668689
hasRunLock: &sync.Mutex{},
669690
volumeNameHook: getProvisionedVolumeNameForClaim,
691+
retryIntervalMax: DefaultRetryIntervalMax,
670692
}
671693

694+
controller.slowSet = slowset.NewSlowSet(controller.retryIntervalMax)
695+
672696
for _, option := range options {
673697
err := option(controller)
674698
if err != nil {
@@ -840,6 +864,8 @@ func (ctrl *ProvisionController) Run(ctx context.Context) {
840864
defer ctrl.claimQueue.ShutDown()
841865
defer ctrl.volumeQueue.ShutDown()
842866

867+
go ctrl.slowSet.Run(ctx.Done())
868+
843869
ctrl.hasRunLock.Lock()
844870
ctrl.hasRun = true
845871
ctrl.hasRunLock.Unlock()
@@ -1085,6 +1111,10 @@ func (ctrl *ProvisionController) syncClaim(ctx context.Context, obj interface{})
10851111
return fmt.Errorf("expected claim but got %+v", obj)
10861112
}
10871113

1114+
if err := ctrl.delayProvisioningIfRecentlyInfeasible(claim); err != nil {
1115+
return err
1116+
}
1117+
10881118
should, err := ctrl.shouldProvision(ctx, claim)
10891119
if err != nil {
10901120
ctrl.updateProvisionStats(claim, err, time.Time{})
@@ -1494,7 +1524,20 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
14941524
}
14951525

14961526
ctx2 := klog.NewContext(ctx, logger)
1497-
err = fmt.Errorf("failed to provision volume with StorageClass %q: %v", claimClass, err)
1527+
1528+
if isInfeasibleError(err) {
1529+
logger.V(2).Info("Detected infeasible volume provisioning request",
1530+
"error", err,
1531+
"claim", klog.KObj(claim))
1532+
1533+
ctrl.markForSlowRetry(claim, err)
1534+
1535+
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, "ProvisioningFailed",
1536+
fmt.Sprintf("Volume provisioning failed with infeasible error. Retries will be delayed. %v", err))
1537+
1538+
return ProvisioningFinished, err
1539+
}
1540+
14981541
return ctrl.provisionVolumeErrorHandling(ctx2, result, err, claim)
14991542
}
15001543

@@ -1519,6 +1562,61 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
15191562
return ProvisioningFinished, nil
15201563
}
15211564

1565+
func (ctrl *ProvisionController) delayProvisioningIfRecentlyInfeasible(claim *v1.PersistentVolumeClaim) error {
1566+
key := string(claim.UID)
1567+
1568+
claimClass := util.GetPersistentVolumeClaimClass(claim)
1569+
currentClass, err := ctrl.getStorageClass(claimClass)
1570+
if err != nil {
1571+
return fmt.Errorf("failed to get storage class: %v", err)
1572+
}
1573+
1574+
if info, exists := ctrl.slowSet.Get(key); exists {
1575+
if info.StorageClassUID != string(currentClass.UID) {
1576+
ctrl.slowSet.Remove(key)
1577+
return nil
1578+
}
1579+
}
1580+
if delay := ctrl.slowSet.TimeRemaining(key); delay > 0 {
1581+
return fmt.Errorf("skipping volume provisioning for pvc %s, because provisioning previously failed with infeasible error", key)
1582+
}
1583+
return nil
1584+
}
1585+
1586+
func (ctrl *ProvisionController) markForSlowRetry(claim *v1.PersistentVolumeClaim, err error) {
1587+
if isInfeasibleError(err) {
1588+
key := string(claim.UID)
1589+
1590+
claimClass := util.GetPersistentVolumeClaimClass(claim)
1591+
class, err := ctrl.getStorageClass(claimClass)
1592+
if err != nil {
1593+
klog.ErrorS(err, "Failed to get StorageClass for delay tracking",
1594+
"PVC", klog.KObj(claim))
1595+
return
1596+
}
1597+
1598+
info := slowset.ObjectData{
1599+
Timestamp: time.Now(),
1600+
StorageClassUID: string(class.UID),
1601+
}
1602+
ctrl.slowSet.Add(key, info)
1603+
}
1604+
}
1605+
1606+
func isInfeasibleError(err error) bool {
1607+
1608+
st, ok := status.FromError(err)
1609+
if !ok {
1610+
return false
1611+
}
1612+
1613+
switch st.Code() {
1614+
case codes.InvalidArgument:
1615+
return true
1616+
}
1617+
return false
1618+
}
1619+
15221620
func (ctrl *ProvisionController) provisionVolumeErrorHandling(ctx context.Context, result ProvisioningState, err error, claim *v1.PersistentVolumeClaim) (ProvisioningState, error) {
15231621
logger := klog.FromContext(ctx)
15241622
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, "ProvisioningFailed", err.Error())

go.mod

+42-39
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,65 @@
11
module sigs.k8s.io/sig-storage-lib-external-provisioner/v11
22

3-
go 1.22.0
3+
go 1.23.1
44

5-
toolchain go1.22.2
5+
toolchain go1.23.6
66

77
require (
8+
github.com/kubernetes-csi/csi-lib-utils v0.21.0
89
github.com/miekg/dns v1.1.29
9-
github.com/prometheus/client_golang v1.5.1
10-
github.com/prometheus/client_model v0.2.0
11-
golang.org/x/time v0.3.0
12-
k8s.io/api v0.30.0
13-
k8s.io/apimachinery v0.30.0
14-
k8s.io/client-go v0.30.0
15-
k8s.io/klog/v2 v2.120.1
10+
github.com/prometheus/client_golang v1.20.5
11+
github.com/prometheus/client_model v0.6.1
12+
golang.org/x/time v0.8.0
13+
google.golang.org/grpc v1.69.0
14+
k8s.io/api v0.32.0
15+
k8s.io/apimachinery v0.32.0
16+
k8s.io/client-go v0.32.0
17+
k8s.io/klog/v2 v2.130.1
1618
)
1719

1820
require (
1921
github.com/beorn7/perks v1.0.1 // indirect
20-
github.com/cespare/xxhash/v2 v2.1.1 // indirect
21-
github.com/davecgh/go-spew v1.1.1 // indirect
22-
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
23-
github.com/evanphx/json-patch v5.6.0+incompatible // indirect
24-
github.com/go-logr/logr v1.4.1 // indirect
25-
github.com/go-openapi/jsonpointer v0.19.6 // indirect
26-
github.com/go-openapi/jsonreference v0.20.2 // indirect
27-
github.com/go-openapi/swag v0.22.3 // indirect
22+
github.com/cespare/xxhash/v2 v2.3.0 // indirect
23+
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
24+
github.com/emicklei/go-restful/v3 v3.12.1 // indirect
25+
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
26+
github.com/go-logr/logr v1.4.2 // indirect
27+
github.com/go-openapi/jsonpointer v0.21.0 // indirect
28+
github.com/go-openapi/jsonreference v0.21.0 // indirect
29+
github.com/go-openapi/swag v0.23.0 // indirect
2830
github.com/gogo/protobuf v1.3.2 // indirect
29-
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
3031
github.com/golang/protobuf v1.5.4 // indirect
31-
github.com/google/gnostic-models v0.6.8 // indirect
32+
github.com/google/gnostic-models v0.6.9 // indirect
3233
github.com/google/go-cmp v0.6.0 // indirect
3334
github.com/google/gofuzz v1.2.0 // indirect
34-
github.com/google/uuid v1.3.0 // indirect
35+
github.com/google/uuid v1.6.0 // indirect
3536
github.com/josharian/intern v1.0.0 // indirect
3637
github.com/json-iterator/go v1.1.12 // indirect
37-
github.com/mailru/easyjson v0.7.7 // indirect
38-
github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect
38+
github.com/klauspost/compress v1.17.11 // indirect
39+
github.com/mailru/easyjson v0.9.0 // indirect
3940
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
4041
github.com/modern-go/reflect2 v1.0.2 // indirect
4142
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
4243
github.com/pkg/errors v0.9.1 // indirect
43-
github.com/prometheus/common v0.9.1 // indirect
44-
github.com/prometheus/procfs v0.0.8 // indirect
45-
golang.org/x/crypto v0.21.0 // indirect
46-
golang.org/x/net v0.23.0 // indirect
47-
golang.org/x/oauth2 v0.10.0 // indirect
48-
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 // indirect
49-
golang.org/x/sys v0.18.0 // indirect
50-
golang.org/x/term v0.18.0 // indirect
51-
golang.org/x/text v0.14.0 // indirect
52-
google.golang.org/appengine v1.6.7 // indirect
53-
google.golang.org/protobuf v1.33.0 // indirect
44+
github.com/prometheus/common v0.61.0 // indirect
45+
github.com/prometheus/procfs v0.15.1 // indirect
46+
github.com/x448/float16 v0.8.4 // indirect
47+
golang.org/x/crypto v0.30.0 // indirect
48+
golang.org/x/net v0.32.0 // indirect
49+
golang.org/x/oauth2 v0.24.0 // indirect
50+
golang.org/x/sys v0.28.0 // indirect
51+
golang.org/x/term v0.27.0 // indirect
52+
golang.org/x/text v0.21.0 // indirect
53+
google.golang.org/genproto/googleapis/rpc v0.0.0-20241216192217-9240e9c98484 // indirect
54+
google.golang.org/protobuf v1.36.0 // indirect
55+
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
5456
gopkg.in/inf.v0 v0.9.1 // indirect
55-
gopkg.in/yaml.v2 v2.4.0 // indirect
5657
gopkg.in/yaml.v3 v3.0.1 // indirect
57-
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
58-
k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect
59-
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
60-
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
61-
sigs.k8s.io/yaml v1.3.0 // indirect
58+
k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7 // indirect
59+
k8s.io/utils v0.0.0-20241210054802-24370beab758 // indirect
60+
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect
61+
sigs.k8s.io/structured-merge-diff/v4 v4.5.0 // indirect
62+
sigs.k8s.io/yaml v1.4.0 // indirect
6263
)
64+
65+
// NOTE(review): this replace points at a relative local path, so building this
// module requires a sibling ../csi-lib-utils checkout — confirm it is meant to
// be committed rather than relying on the required csi-lib-utils version.
replace github.com/kubernetes-csi/csi-lib-utils => ../csi-lib-utils

0 commit comments

Comments
 (0)