Skip to content

Commit c0d82f7

Browse files
committed
Adding SlowRetry on Infeasible Provisioning
1 parent c746941 commit c0d82f7

File tree

5 files changed

+386
-198
lines changed

5 files changed

+386
-198
lines changed

controller/controller.go

+100-1
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,12 @@ import (
3030
"sync"
3131
"time"
3232

33+
"github.com/kubernetes-csi/csi-lib-utils/slowset"
3334
"github.com/prometheus/client_golang/prometheus"
3435
"github.com/prometheus/client_golang/prometheus/promhttp"
3536
"golang.org/x/time/rate"
37+
"google.golang.org/grpc/codes"
38+
"google.golang.org/grpc/status"
3639
v1 "k8s.io/api/core/v1"
3740
storage "k8s.io/api/storage/v1"
3841
storagebeta "k8s.io/api/storage/v1beta1"
@@ -183,6 +186,10 @@ type ProvisionController struct {
183186
volumeStore VolumeStore
184187

185188
volumeNameHook VolumeNameHook
189+
190+
slowSet *slowset.SlowSet
191+
192+
retryIntervalMax time.Duration
186193
}
187194

188195
const (
@@ -216,6 +223,8 @@ const (
216223
DefaultMetricsPath = "/metrics"
217224
// DefaultAddFinalizer is used when option function AddFinalizer is omitted
218225
DefaultAddFinalizer = false
226+
// DefaultRetryIntervalMax is used when option function RetryIntervalMax is omitted
227+
DefaultRetryIntervalMax = 5 * time.Minute
219228
)
220229

221230
var errRuntime = fmt.Errorf("cannot call option functions after controller has Run")
@@ -451,6 +460,18 @@ func RetryPeriod(retryPeriod time.Duration) func(*ProvisionController) error {
451460
}
452461
}
453462

463+
// RetryIntervalMax is the maximum retry interval of failed provisioning or deletion.
464+
// Defaults to 5 minutes.
465+
func RetryIntervalMax(retryIntervalMax time.Duration) func(*ProvisionController) error {
466+
return func(c *ProvisionController) error {
467+
if c.HasRun() {
468+
return errRuntime
469+
}
470+
c.retryIntervalMax = retryIntervalMax
471+
return nil
472+
}
473+
}
474+
454475
// ClaimsInformer sets the informer to use for accessing PersistentVolumeClaims.
455476
// Defaults to using a internal informer.
456477
func ClaimsInformer(informer cache.SharedIndexInformer) func(*ProvisionController) error {
@@ -667,8 +688,11 @@ func NewProvisionController(
667688
hasRun: false,
668689
hasRunLock: &sync.Mutex{},
669690
volumeNameHook: getProvisionedVolumeNameForClaim,
691+
retryIntervalMax: DefaultRetryIntervalMax,
670692
}
671693

694+
controller.slowSet = slowset.NewSlowSet(controller.retryIntervalMax)
695+
672696
for _, option := range options {
673697
err := option(controller)
674698
if err != nil {
@@ -840,6 +864,8 @@ func (ctrl *ProvisionController) Run(ctx context.Context) {
840864
defer ctrl.claimQueue.ShutDown()
841865
defer ctrl.volumeQueue.ShutDown()
842866

867+
go ctrl.slowSet.Run(ctx.Done())
868+
843869
ctrl.hasRunLock.Lock()
844870
ctrl.hasRun = true
845871
ctrl.hasRunLock.Unlock()
@@ -1085,6 +1111,10 @@ func (ctrl *ProvisionController) syncClaim(ctx context.Context, obj interface{})
10851111
return fmt.Errorf("expected claim but got %+v", obj)
10861112
}
10871113

1114+
if err := ctrl.delayProvisioningIfRecentlyInfeasible(claim); err != nil {
1115+
return err
1116+
}
1117+
10881118
should, err := ctrl.shouldProvision(ctx, claim)
10891119
if err != nil {
10901120
ctrl.updateProvisionStats(claim, err, time.Time{})
@@ -1494,7 +1524,20 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
14941524
}
14951525

14961526
ctx2 := klog.NewContext(ctx, logger)
1497-
err = fmt.Errorf("failed to provision volume with StorageClass %q: %v", claimClass, err)
1527+
1528+
if isInfeasibleError(err) {
1529+
logger.V(2).Info("Detected infeasible volume provisioning request",
1530+
"error", err,
1531+
"claim", klog.KObj(claim))
1532+
1533+
ctrl.markForSlowRetry(ctx, claim, err)
1534+
1535+
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, "ProvisioningFailed",
1536+
fmt.Sprintf("Volume provisioning failed with infeasible error. Retries will be delayed. %v", err))
1537+
1538+
return ProvisioningFinished, err
1539+
}
1540+
14981541
return ctrl.provisionVolumeErrorHandling(ctx2, result, err, claim)
14991542
}
15001543

@@ -1519,6 +1562,62 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
15191562
return ProvisioningFinished, nil
15201563
}
15211564

1565+
func (ctrl *ProvisionController) delayProvisioningIfRecentlyInfeasible(claim *v1.PersistentVolumeClaim) error {
1566+
key := string(claim.UID)
1567+
1568+
claimClass := util.GetPersistentVolumeClaimClass(claim)
1569+
currentClass, err := ctrl.getStorageClass(claimClass)
1570+
if err != nil {
1571+
return nil
1572+
}
1573+
1574+
if info, exists := ctrl.slowSet.Get(key); exists {
1575+
if info.StorageClassUID != string(currentClass.UID) {
1576+
ctrl.slowSet.Remove(key)
1577+
return nil
1578+
}
1579+
}
1580+
if delay := ctrl.slowSet.TimeRemaining(key); delay > 0 {
1581+
return util.NewDelayRetryError(fmt.Sprintf("skipping volume provisioning for pvc %s, because provisioning previously failed with infeasible error", key))
1582+
}
1583+
return nil
1584+
}
1585+
1586+
func (ctrl *ProvisionController) markForSlowRetry(ctx context.Context, claim *v1.PersistentVolumeClaim, err error) {
1587+
if isInfeasibleError(err) {
1588+
key := string(claim.UID)
1589+
1590+
claimClass := util.GetPersistentVolumeClaimClass(claim)
1591+
class, err := ctrl.getStorageClass(claimClass)
1592+
if err != nil {
1593+
logger := klog.FromContext(ctx)
1594+
logger.Error(err, "Failed to get StorageClass for delay tracking",
1595+
"PVC", klog.KObj(claim))
1596+
return
1597+
}
1598+
1599+
info := slowset.ObjectData{
1600+
Timestamp: time.Now(),
1601+
StorageClassUID: string(class.UID),
1602+
}
1603+
ctrl.slowSet.Add(key, info)
1604+
}
1605+
}
1606+
1607+
func isInfeasibleError(err error) bool {
1608+
1609+
st, ok := status.FromError(err)
1610+
if !ok {
1611+
return false
1612+
}
1613+
1614+
switch st.Code() {
1615+
case codes.InvalidArgument:
1616+
return true
1617+
}
1618+
return false
1619+
}
1620+
15221621
func (ctrl *ProvisionController) provisionVolumeErrorHandling(ctx context.Context, result ProvisioningState, err error, claim *v1.PersistentVolumeClaim) (ProvisioningState, error) {
15231622
logger := klog.FromContext(ctx)
15241623
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, "ProvisioningFailed", err.Error())

controller/controller_test.go

+113-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ import (
2626

2727
"github.com/prometheus/client_golang/prometheus"
2828
dto "github.com/prometheus/client_model/go"
29+
"google.golang.org/grpc/codes"
30+
"google.golang.org/grpc/status"
2931
v1 "k8s.io/api/core/v1"
3032
storage "k8s.io/api/storage/v1"
3133
"k8s.io/apimachinery/pkg/api/resource"
@@ -47,6 +49,7 @@ import (
4749
"k8s.io/klog/v2/ktesting"
4850
_ "k8s.io/klog/v2/ktesting/init"
4951
"sigs.k8s.io/sig-storage-lib-external-provisioner/v11/controller/metrics"
52+
"sigs.k8s.io/sig-storage-lib-external-provisioner/v11/util"
5053
)
5154

5255
const (
@@ -1618,6 +1621,107 @@ func TestControllerSharedInformers(t *testing.T) {
16181621
}
16191622
}
16201623

1624+
// TestInfeasibleRetry tests that sidecar doesn't spam plugin upon infeasible error code (e.g. invalid VAC parameter)
1625+
func TestInfeasibleRetry(t *testing.T) {
1626+
basePVC := newClaim("test-claim", "uid-1-1", "class-1", "foo.bar/baz", "", nil)
1627+
storageClass := newStorageClass("class-1", "foo.bar/baz")
1628+
1629+
tests := []struct {
1630+
name string
1631+
pvc *v1.PersistentVolumeClaim
1632+
expectedProvisionCallCount int
1633+
csiProvisionError error
1634+
eventuallyRemoveFromSlowSet bool
1635+
}{
1636+
{
1637+
name: "Should retry non-infeasible error normally",
1638+
pvc: basePVC,
1639+
expectedProvisionCallCount: 2,
1640+
csiProvisionError: status.Errorf(codes.Internal, "fake non-infeasible error"),
1641+
eventuallyRemoveFromSlowSet: false,
1642+
},
1643+
{
1644+
name: "Should NOT retry infeasible error normally",
1645+
pvc: basePVC,
1646+
expectedProvisionCallCount: 1,
1647+
csiProvisionError: status.Errorf(codes.InvalidArgument, "fake infeasible error"),
1648+
eventuallyRemoveFromSlowSet: false,
1649+
},
1650+
{
1651+
name: "Should EVENTUALLY retry infeasible error",
1652+
pvc: basePVC,
1653+
expectedProvisionCallCount: 2,
1654+
csiProvisionError: status.Errorf(codes.InvalidArgument, "fake infeasible error"),
1655+
eventuallyRemoveFromSlowSet: true,
1656+
},
1657+
}
1658+
1659+
for _, test := range tests {
1660+
t.Run(test.name, func(t *testing.T) {
1661+
// Setup
1662+
_, ctx := ktesting.NewTestContext(t)
1663+
1664+
client := fake.NewSimpleClientset(test.pvc, storageClass)
1665+
1666+
provisioner := newTestProvisioner()
1667+
provisioner.returnError = test.csiProvisionError
1668+
1669+
ctrl := newTestProvisionController(ctx, client, "foo.bar/baz", provisioner)
1670+
1671+
if err := ctrl.classes.Add(storageClass); err != nil {
1672+
t.Fatalf("failed to add StorageClass to cache: %v", err)
1673+
}
1674+
1675+
// First attempt at provision
1676+
err := ctrl.syncClaim(ctx, test.pvc)
1677+
if !errors.Is(err, test.csiProvisionError) {
1678+
t.Errorf("expected error %v but got %v", test.csiProvisionError, err)
1679+
}
1680+
1681+
// For infeasible errors, verify the PVC was added to SlowSet
1682+
if status.Code(test.csiProvisionError) == codes.InvalidArgument {
1683+
key := string(test.pvc.UID)
1684+
if !ctrl.slowSet.Contains(key) {
1685+
t.Error("PVC should have been added to SlowSet after infeasible error")
1686+
}
1687+
}
1688+
1689+
// Fake time passing by removing from SlowSet
1690+
if test.eventuallyRemoveFromSlowSet {
1691+
key := string(test.pvc.UID)
1692+
ctrl.slowSet.Remove(key)
1693+
}
1694+
1695+
// Second attempt at provision
1696+
err2 := ctrl.syncClaim(ctx, test.pvc)
1697+
switch test.expectedProvisionCallCount {
1698+
case 1:
1699+
if !util.IsDelayRetryError(err2) {
1700+
t.Errorf("expected delay retry error but got %v", err2)
1701+
}
1702+
case 2:
1703+
if !errors.Is(err2, test.csiProvisionError) {
1704+
t.Errorf("expected error %v but got %v", test.csiProvisionError, err2)
1705+
}
1706+
default:
1707+
t.Errorf("unexpected provision error in second attempt: %v", err)
1708+
}
1709+
1710+
// Count the number of provision calls from the channel
1711+
provisionCount := 0
1712+
for len(provisioner.provisionCalls) > 0 {
1713+
<-provisioner.provisionCalls
1714+
provisionCount++
1715+
}
1716+
1717+
if test.expectedProvisionCallCount != provisionCount {
1718+
t.Errorf("expected %d provision calls, but got %d",
1719+
test.expectedProvisionCallCount, provisionCount)
1720+
}
1721+
})
1722+
}
1723+
}
1724+
16211725
type testMetrics struct {
16221726
provisioned counts
16231727
deleted counts
@@ -1986,11 +2090,15 @@ type provisionParams struct {
19862090
}
19872091

19882092
func newTestProvisioner() *testProvisioner {
1989-
return &testProvisioner{make(chan provisionParams, 16)}
2093+
return &testProvisioner{
2094+
provisionCalls: make(chan provisionParams, 16),
2095+
returnError: nil,
2096+
}
19902097
}
19912098

19922099
type testProvisioner struct {
19932100
provisionCalls chan provisionParams
2101+
returnError error
19942102
}
19952103

19962104
var _ Provisioner = &testProvisioner{}
@@ -2033,6 +2141,10 @@ func (p *testProvisioner) Provision(ctx context.Context, options ProvisionOption
20332141
allowedTopologies: options.StorageClass.AllowedTopologies,
20342142
}
20352143

2144+
if p.returnError != nil {
2145+
return nil, ProvisioningFinished, p.returnError
2146+
}
2147+
20362148
// Sleep to simulate work done by Provision...for long enough that
20372149
// TestMultipleControllers will consistently fail with lock disabled. If
20382150
// Provision happens too fast, the first controller creates the PV too soon

0 commit comments

Comments
 (0)