@@ -30,9 +30,12 @@ import (
30
30
"sync"
31
31
"time"
32
32
33
+ "github.com/kubernetes-csi/csi-lib-utils/slowset"
33
34
"github.com/prometheus/client_golang/prometheus"
34
35
"github.com/prometheus/client_golang/prometheus/promhttp"
35
36
"golang.org/x/time/rate"
37
+ "google.golang.org/grpc/codes"
38
+ "google.golang.org/grpc/status"
36
39
v1 "k8s.io/api/core/v1"
37
40
storage "k8s.io/api/storage/v1"
38
41
storagebeta "k8s.io/api/storage/v1beta1"
@@ -183,6 +186,10 @@ type ProvisionController struct {
183
186
volumeStore VolumeStore
184
187
185
188
volumeNameHook VolumeNameHook
189
+
190
+ slowSet * slowset.SlowSet
191
+
192
+ retryIntervalMax time.Duration
186
193
}
187
194
188
195
const (
@@ -216,6 +223,8 @@ const (
216
223
DefaultMetricsPath = "/metrics"
217
224
// DefaultAddFinalizer is used when option function AddFinalizer is omitted
218
225
DefaultAddFinalizer = false
226
+ // DefaultRetryIntervalMax is used when option function RetryIntervalMax is omitted
227
+ DefaultRetryIntervalMax = 5 * time .Minute
219
228
)
220
229
221
230
var errRuntime = fmt .Errorf ("cannot call option functions after controller has Run" )
@@ -451,6 +460,18 @@ func RetryPeriod(retryPeriod time.Duration) func(*ProvisionController) error {
451
460
}
452
461
}
453
462
463
+ // RetryIntervalMax is the maximum retry interval of failed provisioning or deletion.
464
+ // Defaults to 5 minutes.
465
+ func RetryIntervalMax (retryIntervalMax time.Duration ) func (* ProvisionController ) error {
466
+ return func (c * ProvisionController ) error {
467
+ if c .HasRun () {
468
+ return errRuntime
469
+ }
470
+ c .retryIntervalMax = retryIntervalMax
471
+ return nil
472
+ }
473
+ }
474
+
454
475
// ClaimsInformer sets the informer to use for accessing PersistentVolumeClaims.
455
476
// Defaults to using a internal informer.
456
477
func ClaimsInformer (informer cache.SharedIndexInformer ) func (* ProvisionController ) error {
@@ -667,8 +688,11 @@ func NewProvisionController(
667
688
hasRun : false ,
668
689
hasRunLock : & sync.Mutex {},
669
690
volumeNameHook : getProvisionedVolumeNameForClaim ,
691
+ retryIntervalMax : DefaultRetryIntervalMax ,
670
692
}
671
693
694
+ controller .slowSet = slowset .NewSlowSet (controller .retryIntervalMax )
695
+
672
696
for _ , option := range options {
673
697
err := option (controller )
674
698
if err != nil {
@@ -840,6 +864,8 @@ func (ctrl *ProvisionController) Run(ctx context.Context) {
840
864
defer ctrl .claimQueue .ShutDown ()
841
865
defer ctrl .volumeQueue .ShutDown ()
842
866
867
+ go ctrl .slowSet .Run (ctx .Done ())
868
+
843
869
ctrl .hasRunLock .Lock ()
844
870
ctrl .hasRun = true
845
871
ctrl .hasRunLock .Unlock ()
@@ -1085,6 +1111,10 @@ func (ctrl *ProvisionController) syncClaim(ctx context.Context, obj interface{})
1085
1111
return fmt .Errorf ("expected claim but got %+v" , obj )
1086
1112
}
1087
1113
1114
+ if err := ctrl .delayProvisioningIfRecentlyInfeasible (claim ); err != nil {
1115
+ return err
1116
+ }
1117
+
1088
1118
should , err := ctrl .shouldProvision (ctx , claim )
1089
1119
if err != nil {
1090
1120
ctrl .updateProvisionStats (claim , err , time.Time {})
@@ -1494,7 +1524,20 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
1494
1524
}
1495
1525
1496
1526
ctx2 := klog .NewContext (ctx , logger )
1497
- err = fmt .Errorf ("failed to provision volume with StorageClass %q: %v" , claimClass , err )
1527
+
1528
+ if isInfeasibleError (err ) {
1529
+ logger .V (2 ).Info ("Detected infeasible volume provisioning request" ,
1530
+ "error" , err ,
1531
+ "claim" , klog .KObj (claim ))
1532
+
1533
+ ctrl .markForSlowRetry (claim , err )
1534
+
1535
+ ctrl .eventRecorder .Event (claim , v1 .EventTypeWarning , "ProvisioningFailed" ,
1536
+ fmt .Sprintf ("Volume provisioning failed with infeasible error. Retries will be delayed. %v" , err ))
1537
+
1538
+ return ProvisioningFinished , err
1539
+ }
1540
+
1498
1541
return ctrl .provisionVolumeErrorHandling (ctx2 , result , err , claim )
1499
1542
}
1500
1543
@@ -1519,6 +1562,61 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
1519
1562
return ProvisioningFinished , nil
1520
1563
}
1521
1564
1565
+ func (ctrl * ProvisionController ) delayProvisioningIfRecentlyInfeasible (claim * v1.PersistentVolumeClaim ) error {
1566
+ key := string (claim .UID )
1567
+
1568
+ claimClass := util .GetPersistentVolumeClaimClass (claim )
1569
+ currentClass , err := ctrl .getStorageClass (claimClass )
1570
+ if err != nil {
1571
+ return fmt .Errorf ("failed to get storage class: %v" , err )
1572
+ }
1573
+
1574
+ if info , exists := ctrl .slowSet .Get (key ); exists {
1575
+ if info .StorageClassUID != string (currentClass .UID ) {
1576
+ ctrl .slowSet .Remove (key )
1577
+ return nil
1578
+ }
1579
+ }
1580
+ if delay := ctrl .slowSet .TimeRemaining (key ); delay > 0 {
1581
+ return fmt .Errorf ("skipping volume provisioning for pvc %s, because provisioning previously failed with infeasible error" , key )
1582
+ }
1583
+ return nil
1584
+ }
1585
+
1586
+ func (ctrl * ProvisionController ) markForSlowRetry (claim * v1.PersistentVolumeClaim , err error ) {
1587
+ if isInfeasibleError (err ) {
1588
+ key := string (claim .UID )
1589
+
1590
+ claimClass := util .GetPersistentVolumeClaimClass (claim )
1591
+ class , err := ctrl .getStorageClass (claimClass )
1592
+ if err != nil {
1593
+ klog .ErrorS (err , "Failed to get StorageClass for delay tracking" ,
1594
+ "PVC" , klog .KObj (claim ))
1595
+ return
1596
+ }
1597
+
1598
+ info := slowset.ObjectData {
1599
+ Timestamp : time .Now (),
1600
+ StorageClassUID : string (class .UID ),
1601
+ }
1602
+ ctrl .slowSet .Add (key , info )
1603
+ }
1604
+ }
1605
+
1606
+ func isInfeasibleError (err error ) bool {
1607
+
1608
+ st , ok := status .FromError (err )
1609
+ if ! ok {
1610
+ return false
1611
+ }
1612
+
1613
+ switch st .Code () {
1614
+ case codes .InvalidArgument :
1615
+ return true
1616
+ }
1617
+ return false
1618
+ }
1619
+
1522
1620
func (ctrl * ProvisionController ) provisionVolumeErrorHandling (ctx context.Context , result ProvisioningState , err error , claim * v1.PersistentVolumeClaim ) (ProvisioningState , error ) {
1523
1621
logger := klog .FromContext (ctx )
1524
1622
ctrl .eventRecorder .Event (claim , v1 .EventTypeWarning , "ProvisioningFailed" , err .Error ())
0 commit comments