From a3128e50be9650a4e4c62b45d0cc4318bbeef84c Mon Sep 17 00:00:00 2001 From: Fraser Waters Date: Fri, 14 Oct 2022 12:09:10 +0100 Subject: [PATCH] Do not execute pending deletes at the start of deployment This removes all the handling of pending deletes from the start of deployments. Instead we allow resources to just be deleted as they usually would at the end of the deployment. There's a big comment in TestPendingDeleteOrder that explains the order of operations in a successful run and how that order differs if we try and do pending deletes up-front. Fixes https://github.com/pulumi/pulumi/issues/2948 --- .../20221014--engine--pending-deletes.yaml | 4 + pkg/engine/lifecycletest/pulumi_test.go | 160 ++++++++++++++++++ pkg/resource/deploy/deployment_executor.go | 50 ------ pkg/resource/deploy/step_generator.go | 20 --- .../double_pending_delete_test.go | 11 +- .../double_pending_delete/step3/index.ts | 6 +- .../double_pending_delete/step4/index.ts | 1 + 7 files changed, 175 insertions(+), 77 deletions(-) create mode 100644 changelog/pending/20221014--engine--pending-deletes.yaml diff --git a/changelog/pending/20221014--engine--pending-deletes.yaml b/changelog/pending/20221014--engine--pending-deletes.yaml new file mode 100644 index 000000000000..adde30fbe7c0 --- /dev/null +++ b/changelog/pending/20221014--engine--pending-deletes.yaml @@ -0,0 +1,4 @@ +changes: +- type: fix + scope: engine + description: Pending deletes are no longer executed before everything else. This correctly handles dependencies for resource graphs that were partially deleted. 
diff --git a/pkg/engine/lifecycletest/pulumi_test.go b/pkg/engine/lifecycletest/pulumi_test.go index d5b57ee64ab6..b5c202b7958b 100644 --- a/pkg/engine/lifecycletest/pulumi_test.go +++ b/pkg/engine/lifecycletest/pulumi_test.go @@ -3956,3 +3956,163 @@ func TestDefaultParents(t *testing.T) { assert.Equal(t, snap.Resources[0].URN, snap.Resources[1].Parent) assert.Equal(t, snap.Resources[0].URN, snap.Resources[2].Parent) } + +func TestPendingDeleteOrder(t *testing.T) { + // Test for https://github.com/pulumi/pulumi/issues/2948 Ensure that if we have resources A and B, and we + // go to replace A but then fail to replace B that we correctly handle everything in the same order when + // we retry the update. + // + // That is normally for this operation we would do the following: + // 1. Create new A + // 2. Create new B + // 3. Delete old B + // 4. Delete old A + // So if step 2 fails to create the new B we want to see: + // 1. Create new A + // 2. Create new B (fail) + // 1. Create new B + // 2. Delete old B + // 3. Delete old A + // Currently (and what #2948 tracks) is that the engine does the following: + // 1. Create new A + // 2. Create new B (fail) + // 3. Delete old A + // 1. Create new B + // 2. Delete old B + // That delete A fails because the delete B needs to happen first. 
+ + t.Parallel() + + cloudState := map[resource.ID]resource.PropertyMap{} + + failCreationOfTypB := false + + loaders := []*deploytest.ProviderLoader{ + deploytest.NewProviderLoader("pkgA", semver.MustParse("1.0.0"), func() (plugin.Provider, error) { + return &deploytest.Provider{ + CreateF: func(urn resource.URN, news resource.PropertyMap, timeout float64, + preview bool) (resource.ID, resource.PropertyMap, resource.Status, error) { + + if strings.Contains(string(urn), "typB") && failCreationOfTypB { + return "", nil, resource.StatusOK, fmt.Errorf("Could not create typB") + } + + id := resource.ID(fmt.Sprintf("%d", len(cloudState))) + if !preview { + cloudState[id] = news + } + return id, news, resource.StatusOK, nil + }, + DeleteF: func(urn resource.URN, + id resource.ID, olds resource.PropertyMap, timeout float64) (resource.Status, error) { + // Fail if anything in cloud state still points to us + for _, res := range cloudState { + for _, v := range res { + if v.IsString() && v.StringValue() == string(id) { + return resource.StatusOK, fmt.Errorf("Can not delete %s", id) + } + } + } + + delete(cloudState, id) + return resource.StatusOK, nil + }, + DiffF: func(urn resource.URN, + id resource.ID, olds, news resource.PropertyMap, ignoreChanges []string) (plugin.DiffResult, error) { + if strings.Contains(string(urn), "typA") { + if !olds["foo"].DeepEquals(news["foo"]) { + return plugin.DiffResult{ + Changes: plugin.DiffSome, + ReplaceKeys: []resource.PropertyKey{"foo"}, + DetailedDiff: map[string]plugin.PropertyDiff{ + "foo": { + Kind: plugin.DiffUpdateReplace, + InputDiff: true, + }, + }, + DeleteBeforeReplace: false, + }, nil + } + } + if strings.Contains(string(urn), "typB") { + if !olds["parent"].DeepEquals(news["parent"]) { + return plugin.DiffResult{ + Changes: plugin.DiffSome, + ReplaceKeys: []resource.PropertyKey{"parent"}, + DetailedDiff: map[string]plugin.PropertyDiff{ + "parent": { + Kind: plugin.DiffUpdateReplace, + InputDiff: true, + }, + }, + 
DeleteBeforeReplace: false, + }, nil + } + } + + return plugin.DiffResult{}, nil + }, + UpdateF: func(urn resource.URN, + id resource.ID, olds, news resource.PropertyMap, timeout float64, + ignoreChanges []string, preview bool) (resource.PropertyMap, resource.Status, error) { + assert.Fail(t, "Didn't expect update to be called") + return nil, resource.StatusOK, nil + }, + }, nil + }, deploytest.WithoutGrpc), + } + + ins := resource.NewPropertyMapFromMap(map[string]interface{}{ + "foo": "bar", + }) + program := deploytest.NewLanguageRuntime(func(info plugin.RunInfo, monitor *deploytest.ResourceMonitor) error { + _, idA, _, err := monitor.RegisterResource("pkgA:m:typA", "resA", true, deploytest.ResourceOptions{ + Inputs: ins, + }) + assert.NoError(t, err) + + _, _, _, err = monitor.RegisterResource("pkgA:m:typB", "resB", true, deploytest.ResourceOptions{ + Inputs: resource.NewPropertyMapFromMap(map[string]interface{}{ + "parent": idA, + }), + }) + assert.NoError(t, err) + + return nil + }) + host := deploytest.NewPluginHost(nil, nil, program, loaders...) 
+ + p := &TestPlan{ + Options: UpdateOptions{Host: host}, + } + + project := p.GetProject() + + // Run an update to create the resources + snap, res := TestOp(Update).Run(project, p.GetTarget(t, nil), p.Options, false, p.BackendClient, nil) + assert.Nil(t, res) + assert.NotNil(t, snap) + assert.Len(t, snap.Resources, 3) + + // Trigger a replacement of A but fail to create B + failCreationOfTypB = true + ins = resource.NewPropertyMapFromMap(map[string]interface{}{ + "foo": "baz", + }) + snap, res = TestOp(Update).Run(project, p.GetTarget(t, snap), p.Options, false, p.BackendClient, nil) + // Assert that this fails, we should have two copies of A now, one new one and one old one pending delete + assert.NotNil(t, res) + assert.NotNil(t, snap) + assert.Len(t, snap.Resources, 4) + assert.Equal(t, snap.Resources[1].Type, tokens.Type("pkgA:m:typA")) + assert.False(t, snap.Resources[1].Delete) + assert.Equal(t, snap.Resources[2].Type, tokens.Type("pkgA:m:typA")) + assert.True(t, snap.Resources[2].Delete) + + // Now allow B to create and try again + failCreationOfTypB = false + snap, res = TestOp(Update).Run(project, p.GetTarget(t, snap), p.Options, false, p.BackendClient, nil) + assert.Nil(t, res) + assert.NotNil(t, snap) + assert.Len(t, snap.Resources, 3) +} diff --git a/pkg/resource/deploy/deployment_executor.go b/pkg/resource/deploy/deployment_executor.go index 6d5b81513700..4ef81b0c98cd 100644 --- a/pkg/resource/deploy/deployment_executor.go +++ b/pkg/resource/deploy/deployment_executor.go @@ -207,11 +207,6 @@ func (ex *deploymentExecutor) Execute(callerCtx context.Context, opts Options, p // Set up a step generator for this deployment. ex.stepGen = newStepGenerator(ex.deployment, opts, updateTargetsOpt, replaceTargetsOpt) - // Retire any pending deletes that are currently present in this deployment. - if res := ex.retirePendingDeletes(callerCtx, opts, preview); res != nil { - return nil, res - } - // Derive a cancellable context for this deployment. 
We will only cancel this context if some piece of the // deployment's execution fails. ctx, cancel := context.WithCancel(callerCtx) @@ -459,51 +454,6 @@ func (ex *deploymentExecutor) handleSingleEvent(event SourceEvent) result.Result return nil } -// retirePendingDeletes deletes all resources that are pending deletion. Run before the start of a deployment, this pass -// ensures that the engine never sees any resources that are pending deletion from a previous deployment. -// -// retirePendingDeletes re-uses the deployment executor's step generator but uses its own step executor. -func (ex *deploymentExecutor) retirePendingDeletes(callerCtx context.Context, opts Options, - preview bool) result.Result { - - contract.Require(ex.stepGen != nil, "ex.stepGen != nil") - steps := ex.stepGen.GeneratePendingDeletes() - if len(steps) == 0 { - logging.V(4).Infoln("deploymentExecutor.retirePendingDeletes(...): no pending deletions") - return nil - } - - logging.V(4).Infof("deploymentExecutor.retirePendingDeletes(...): executing %d steps", len(steps)) - ctx, cancel := context.WithCancel(callerCtx) - - stepExec := newStepExecutor(ctx, cancel, ex.deployment, opts, preview, false) - antichains := ex.stepGen.ScheduleDeletes(steps) - // Submit the deletes for execution and wait for them all to retire. - for _, antichain := range antichains { - for _, step := range antichain { - ex.deployment.Ctx().StatusDiag.Infof(diag.RawMessage(step.URN(), "completing deletion from previous update")) - } - - tok := stepExec.ExecuteParallel(antichain) - tok.Wait(ctx) - } - - stepExec.SignalCompletion() - stepExec.WaitForCompletion() - - // Like Refresh, we use the presence of an error in the caller's context to detect whether or not we have been - // cancelled. 
- canceled := callerCtx.Err() != nil - if stepExec.Errored() { - ex.reportExecResult("failed", preview) - return result.Bail() - } else if canceled { - ex.reportExecResult("canceled", preview) - return result.Bail() - } - return nil -} - // import imports a list of resources into a stack. func (ex *deploymentExecutor) importResources( callerCtx context.Context, diff --git a/pkg/resource/deploy/step_generator.go b/pkg/resource/deploy/step_generator.go index 746d071dc9f8..d3b1706b7123 100644 --- a/pkg/resource/deploy/step_generator.go +++ b/pkg/resource/deploy/step_generator.go @@ -1228,26 +1228,6 @@ func (sg *stepGenerator) determineAllowedResourcesToDeleteFromTargets( return resourcesToDelete, nil } -// GeneratePendingDeletes generates delete steps for all resources that are pending deletion. This function should be -// called at the start of a deployment in order to find all resources that are pending deletion from the previous -// deployment. -func (sg *stepGenerator) GeneratePendingDeletes() []Step { - var dels []Step - if prev := sg.deployment.prev; prev != nil { - logging.V(7).Infof("stepGenerator.GeneratePendingDeletes(): scanning previous snapshot for pending deletes") - for i := len(prev.Resources) - 1; i >= 0; i-- { - res := prev.Resources[i] - if res.Delete { - logging.V(7).Infof( - "stepGenerator.GeneratePendingDeletes(): resource (%v, %v) is pending deletion", res.URN, res.ID) - sg.pendingDeletes[res] = true - dels = append(dels, NewDeleteStep(sg.deployment, res)) - } - } - } - return dels -} - // ScheduleDeletes takes a list of steps that will delete resources and "schedules" them by producing a list of list of // steps, where each list can be executed in parallel but a previous list must be executed to completion before // advancing to the next list. 
diff --git a/tests/integration/double_pending_delete/double_pending_delete_test.go b/tests/integration/double_pending_delete/double_pending_delete_test.go index d555b12d39f8..a74e3e7c85cf 100644 --- a/tests/integration/double_pending_delete/double_pending_delete_test.go +++ b/tests/integration/double_pending_delete/double_pending_delete_test.go @@ -44,7 +44,6 @@ func TestDoublePendingDelete(t *testing.T) { b := stackInfo.Deployment.Resources[4] assert.Equal(t, "b", string(b.URN.Name())) assert.False(t, b.Delete) - }, }, { @@ -52,10 +51,10 @@ func TestDoublePendingDelete(t *testing.T) { Additive: true, ExpectFailure: true, ExtraRuntimeValidation: func(t *testing.T, stackInfo integration.RuntimeValidationStackInfo) { - // There is still only one pending delete resource in this snapshot. + // There are still two pending delete resources in this snapshot. assert.NotNil(t, stackInfo.Deployment) - assert.Equal(t, 5, len(stackInfo.Deployment.Resources)) + assert.Equal(t, 6, len(stackInfo.Deployment.Resources)) stackRes := stackInfo.Deployment.Resources[0] assert.Equal(t, resource.RootStackType, stackRes.URN.Type()) providerRes := stackInfo.Deployment.Resources[1] @@ -69,7 +68,11 @@ func TestDoublePendingDelete(t *testing.T) { assert.Equal(t, "a", string(aCondemned.URN.Name())) assert.True(t, aCondemned.Delete) - b := stackInfo.Deployment.Resources[4] + aSecondCondemned := stackInfo.Deployment.Resources[4] + assert.Equal(t, "a", string(aSecondCondemned.URN.Name())) + assert.True(t, aSecondCondemned.Delete) + + b := stackInfo.Deployment.Resources[5] assert.Equal(t, "b", string(b.URN.Name())) assert.False(t, b.Delete) }, diff --git a/tests/integration/double_pending_delete/step3/index.ts b/tests/integration/double_pending_delete/step3/index.ts index a565590a0803..f7e8d3a514b0 100644 --- a/tests/integration/double_pending_delete/step3/index.ts +++ b/tests/integration/double_pending_delete/step3/index.ts @@ -16,8 +16,7 @@ import { Resource } from "./resource"; // The previous 
plan failed, but we're going to initiate *another* plan that // introduces new changes, while still keeping around the failed state -// from the previous plan. The engine should delete all pending deletes before -// attempting to start the next plan. +// from the previous plan. The engine should handle all pending deletes. // // To do this, we're going to trigger another replacement of A: const a = new Resource("a", { fail: 3 }); @@ -27,9 +26,10 @@ const b = new Resource("b", { fail: 1 }, { dependsOn: a }); // The snapshot now contains: // A: Created // A: Pending Delete +// A: Pending Delete // B: Created -// The A from the previous snapshot should have been deleted. +// The A from the previous snapshot won't have been deleted yet because we try to leave deletes till last. // This plan is interesting because it shows that it is legal to delete the same URN multiple // times in the same plan. This previously triggered an assert in the engine that asserted diff --git a/tests/integration/double_pending_delete/step4/index.ts b/tests/integration/double_pending_delete/step4/index.ts index b29318c541d5..1e985c6c955c 100644 --- a/tests/integration/double_pending_delete/step4/index.ts +++ b/tests/integration/double_pending_delete/step4/index.ts @@ -25,6 +25,7 @@ const a = new Resource("a", { fail: 4 }); // Create B const b = new Resource("b", { fail: 2 }, { dependsOn: a }); +// Delete A // Delete A // Delete B