From fd76b6df7669a44804528c4131e4021d81247de2 Mon Sep 17 00:00:00 2001
From: Andrejs Cunskis <acunskis@gitlab.com>
Date: Wed, 2 Oct 2024 14:28:20 +0300
Subject: [PATCH] Add option to retry failed helm deployment

---
 .../cng/commands/subcommands/deployment.rb    |  6 ++++-
 .../gitlab/cng/lib/deployment/installation.rb | 14 +++++++---
 .../commands/subcommands/deployment_spec.rb   |  3 ++-
 .../cng/deployment/installation_spec.rb       | 27 ++++++++++++++-----
 4 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/qa/gems/gitlab-cng/lib/gitlab/cng/commands/subcommands/deployment.rb b/qa/gems/gitlab-cng/lib/gitlab/cng/commands/subcommands/deployment.rb
index 446ee9ad57878..ca2161b6a3c15 100644
--- a/qa/gems/gitlab-cng/lib/gitlab/cng/commands/subcommands/deployment.rb
+++ b/qa/gems/gitlab-cng/lib/gitlab/cng/commands/subcommands/deployment.rb
@@ -49,6 +49,10 @@ def method_added(name)
                 type: :string,
                 repeatable: true,
                 aliases: "-e"
+              option :retry,
+                desc: "Max number of retries for failed deployment",
+                default: 0,
+                type: :numeric
 
               super(name)
             end
@@ -120,7 +124,7 @@ def kind(name = DEFAULT_HELM_RELEASE_NAME)
           def installation(name, configuration)
             Cng::Deployment::Installation.new(
               name, configuration: configuration,
-              **symbolized_options.slice(:namespace, :set, :ci, :gitlab_domain, :timeout, :chart_sha, :env)
+              **symbolized_options.slice(:namespace, :set, :ci, :gitlab_domain, :timeout, :chart_sha, :env, :retry)
             )
           end
 
diff --git a/qa/gems/gitlab-cng/lib/gitlab/cng/lib/deployment/installation.rb b/qa/gems/gitlab-cng/lib/gitlab/cng/lib/deployment/installation.rb
index 0944e9f41b045..b59ceafd65b90 100644
--- a/qa/gems/gitlab-cng/lib/gitlab/cng/lib/deployment/installation.rb
+++ b/qa/gems/gitlab-cng/lib/gitlab/cng/lib/deployment/installation.rb
@@ -56,6 +56,8 @@ def initialize(name, configuration:, namespace:, ci:, gitlab_domain:, timeout:,
           @set = args[:set] || []
           @extra_env = args[:env] || []
           @chart_sha = args[:chart_sha]
+          @retry_attempts = args[:retry] || 0
+          @deployment_attempts = 0
         end
 
         # Perform deployment with all the additional setup
@@ -88,7 +90,8 @@ def component_version_values
           :gitlab_domain,
           :timeout,
           :chart_sha,
-          :extra_env
+          :extra_env,
+          :retry_attempts
 
         alias_method :cli_values, :set
 
@@ -186,7 +189,12 @@ def run_deploy(chart_reference)
           Helpers::Spinner.spin("running helm deployment") do
             helm.upgrade(name, chart_reference, namespace: namespace, timeout: timeout, values: values, args: args)
           rescue Helm::Client::Error => e
-            handle_deploy_failure(e)
+            @deployment_attempts += 1
+            handle_deploy_failure(e) if @deployment_attempts > retry_attempts
+
+            log("Deployment failed, retrying...", :warn)
+            log("Error: #{e}", :warn)
+            retry
           end
           log("Deployment successful and app is available via: #{configuration.gitlab_url}", :success, bright: true)
         end
@@ -226,7 +234,7 @@ def create_license
         # @param [StandardError] error
         # @return [void]
         def handle_deploy_failure(error)
-          log("Helm upgrade failed!", :error)
+          log("Helm deployment failed!", :error)
           log("For more information on troubleshooting failures, see: '#{TROUBLESHOOTING_LINK}'", :warn)
 
           events = get_warning_events
diff --git a/qa/gems/gitlab-cng/spec/unit/gitlab/cng/commands/subcommands/deployment_spec.rb b/qa/gems/gitlab-cng/spec/unit/gitlab/cng/commands/subcommands/deployment_spec.rb
index af8e6ce154f11..482b517bccb08 100644
--- a/qa/gems/gitlab-cng/spec/unit/gitlab/cng/commands/subcommands/deployment_spec.rb
+++ b/qa/gems/gitlab-cng/spec/unit/gitlab/cng/commands/subcommands/deployment_spec.rb
@@ -46,7 +46,8 @@
         namespace: "gitlab",
         ci: false,
         gitlab_domain: "127.0.0.1.nip.io",
-        timeout: "10m"
+        timeout: "10m",
+        retry: 0
       )
       expect(Gitlab::Cng::Deployment::Configurations::Kind).to have_received(:new).with(
         namespace: "gitlab",
diff --git a/qa/gems/gitlab-cng/spec/unit/gitlab/cng/deployment/installation_spec.rb b/qa/gems/gitlab-cng/spec/unit/gitlab/cng/deployment/installation_spec.rb
index f5310c7482354..d56c0eb5164bb 100644
--- a/qa/gems/gitlab-cng/spec/unit/gitlab/cng/deployment/installation_spec.rb
+++ b/qa/gems/gitlab-cng/spec/unit/gitlab/cng/deployment/installation_spec.rb
@@ -11,7 +11,8 @@
         gitlab_domain: gitlab_domain,
         timeout: "10m",
         chart_sha: chart_sha,
-        env: ["RAILS_ENV_VAR=val"]
+        env: ["RAILS_ENV_VAR=val"],
+        retry: retry_attempts
       )
     end
 
@@ -20,6 +21,7 @@
     let(:chart_sha) { nil }
     let(:chart_reference) { "chart-reference" }
     let(:ci) { false }
+    let(:retry_attempts) { 0 }
 
     let(:kubeclient) do
       instance_double(Gitlab::Cng::Kubectl::Client, create_namespace: "", create_resource: "", execute: "")
@@ -100,12 +102,23 @@
         EVENTS
       end
 
-      it "automatically prints warning events" do
-        expect { expect { installation.create }.to raise_error(SystemExit) }.to output(
-          match("#{warn_event[:involvedObject][:kind]}/#{warn_event[:involvedObject][:name]}")
-          .and(match(warn_event[:message]))
-          .and(match(/For more information on troubleshooting failures, see: \S+/))
-        ).to_stdout
+      context "without retry" do # outer let(:retry_attempts) { 0 } applies here
+        it "automatically prints warning events" do
+          expect { expect { installation.create }.to raise_error(SystemExit) }.to output(
+            match("#{warn_event[:involvedObject][:kind]}/#{warn_event[:involvedObject][:name]}")
+            .and(match(warn_event[:message]))
+            .and(match(/For more information on troubleshooting failures, see: \S+/))
+          ).to_stdout
+        end
+      end
+
+      context "with retry" do
+        let(:retry_attempts) { 1 } # passed to Installation as retry:
+
+        it "retries deployment" do
+          expect { expect { installation.create }.to raise_error(SystemExit) }.to output.to_stdout
+          expect(helmclient).to have_received(:upgrade).twice # initial attempt + 1 retry
+        end
+      end
     end
 
-- 
GitLab