From 2f2f6f227db60a8b2403502716ca21694844766a Mon Sep 17 00:00:00 2001 From: Jennifer Li <jli@gitlab.com> Date: Tue, 4 Feb 2025 07:44:51 +0000 Subject: [PATCH] Add custom error functions into scripts/utils.sh Add custom exit code to internal pipeline doc Ensure script is POSIX-compliant Switch initial_exit_code_file to absolute path Use set +e and set -e to prevent premature exit Wait for pipe to complete before continue --- .gitlab/ci/global.gitlab-ci.yml | 10 ++- doc/development/pipelines/internals.md | 22 +++++ scripts/utils.sh | 111 ++++++++++++++++++++++++- 3 files changed, 138 insertions(+), 5 deletions(-) diff --git a/.gitlab/ci/global.gitlab-ci.yml b/.gitlab/ci/global.gitlab-ci.yml index 33b544ad8fd58..c18320292aee8 100644 --- a/.gitlab/ci/global.gitlab-ci.yml +++ b/.gitlab/ci/global.gitlab-ci.yml @@ -9,8 +9,14 @@ - stuck_or_timeout_failure - unknown_failure exit_codes: - - 110 # known infrastructure error https://gitlab.com/gitlab-org/gitlab/-/issues/501325 - - 111 # low free disk space https://gitlab.com/gitlab-org/gitlab/-/issues/498142 + # https://docs.gitlab.com/ee/development/pipelines/internals.html#custom-exit-codes + - 110 # network connection error + - 111 # low free disk space + - 160 # failed to upload/download job artifact + - 162 # gitaly spawn failed + - 164 # Redis cluster error + - 165 # segmentation fault + - 166 # EEXIST: file already exists .default-utils-before_script: before_script: diff --git a/doc/development/pipelines/internals.md b/doc/development/pipelines/internals.md index 680d3c4ce2d88..063d0adf82218 100644 --- a/doc/development/pipelines/internals.md +++ b/doc/development/pipelines/internals.md @@ -250,6 +250,28 @@ and included in `rules` definitions via [YAML anchors](../../ci/yaml/yaml_optimi | `code-backstage-qa-patterns` | Combination of `code-patterns`, `backstage-patterns`, and `qa-patterns`. | | `static-analysis-patterns` | Only create jobs for Static Analytics configuration-related changes. | +## Custom exit codes + +GitLab CI uses custom exit codes to categorize different types of job failures. This helps with automated failure tracking and retry logic. To see which exit codes trigger automatic retries, check the retry rules in [GitLab global CI configuration](https://gitlab.com/gitlab-org/gitlab/-/blob/master/.gitlab/ci/global.gitlab-ci.yml). + +The table below lists current exit codes and their meanings: + +| exit code | Description | +|-----------|---------------------------------------| +|110 | network connection error | +|111 | low disk space | +|112 | known flaky test failure | +|160 | failed to upload/download job artifact| +|161 | 5XX server error | +|162 | Gitaly spawn failure | +|163 | RSpec job timeout | +|164 | Redis cluster error | +|165 | segmentation fault | +|166 | EEXIST: file already exists | +|167 | `gitlab.com` overloaded | + +This list can be expanded as new failure patterns emerge. To avoid conflicts with standard Bash exit codes, new custom codes must be 160 or higher. + ## Best Practices ### When to use `extends:`, `<<: *xyz` (YAML anchors), or `!reference` diff --git a/scripts/utils.sh b/scripts/utils.sh index 7ca2f0e9aa0a2..33f25bb801db2 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -544,12 +544,117 @@ function log_disk_usage() { fi } +# all functions below are for customizing CI job exit code +export TRACE_FILE=stdout_stderr_log.out + +function run_with_custom_exit_code() { + set +e # temprorarily disable exit on error to prevent premature exit + + # runs command passed in as argument, save standard error and standard output + output=$("$@" 2>&1) + initial_exit_code=$? + echo "$output" | tee "$TRACE_FILE" + + find_custom_exit_code "$initial_exit_code" + new_exit_code=$? + + echo "new_exit_code=$new_exit_code" + set -e + + exit "$new_exit_code" +} + +function find_custom_exit_code() { + local exit_code="$1" + + # Early return if exit code is 0 + [ "$exit_code" -eq 0 ] && return 0 + + # Check if TRACE_FILE is set + if [ -z "$TRACE_FILE" ] || [ ! -f "$TRACE_FILE" ]; then + echoerr "TRACE_FILE is not set or file does not exist" + exit "$exit_code" + fi + + if grep -i -q \ + -e "Failed to connect to 127.0.0.1" \ + -e "Failed to open TCP connection to" \ + -e "connection reset by peer" "$TRACE_FILE"; then + + echoerr "Detected network connection error. Changing exit code to 110." + exit_code=110 + alert_job_in_slack "$exit_code" "Network connection error" + + elif grep -i -q -e "no space left on device" "$TRACE_FILE"; then + echoerr "Detected no space left on device. Changing exit code to 111." + exit_code=111 + alert_job_in_slack "$exit_code" "Low disk space" + + elif grep -i -q \ + -e "error: downloading artifacts from coordinator" \ + -e "error: uploading artifacts as \"archive\" to coordinator" "$TRACE_FILE"; then + echoerr "Detected artifact transit error. Changing exit code to 160." + exit_code=160 + alert_job_in_slack "$exit_code" "Artifact transit error" + + elif grep -i -q \ + -e "500 Internal Server Error" \ + -e "Internal Server Error 500" \ + -e "502 Bad Gateway" \ + -e "503 Service Unavailable" "$TRACE_FILE"; then + echoerr "Detected 5XX error. Changing exit code to 161." + exit_code=161 + alert_job_in_slack "$exit_code" "5XX error" + + elif grep -i -q -e "gitaly spawn failed" "$TRACE_FILE"; then + echoerr "Detected gitaly spawn failure error. Changing exit code to 162." + exit_code=162 + alert_job_in_slack "$exit_code" "Gitaly spawn failure" + + elif grep -i -q -e \ + "Rspec suite is exceeding the 80 minute limit and is forced to exit with error" "$TRACE_FILE"; then + echoerr "Detected rspec timeout risk. Changing exit code to 163." + exit_code=163 + alert_job_in_slack "$exit_code" "RSpec taking longer than 80 minutes and forced to fail." + + elif grep -i -q \ + -e "Redis client could not fetch cluster information: Connection refused" \ + -e "Redis::Cluster::CommandErrorCollection" \ + -e "CLUSTERDOWN The cluster is down" "$TRACE_FILE"; then + echoerr "Detected Redis cluster error. Changing exit code to 164." + exit_code=164 + alert_job_in_slack "$exit_code" "Redis cluster error" + + elif grep -i -q -e "segmentation fault" "$TRACE_FILE"; then + echoerr "Detected segmentation fault. Changing exit code to 165." + exit_code=165 + alert_job_in_slack "$exit_code" "Segmentation fault" + + elif grep -i -q -e "Error: EEXIST: file already exists" "$TRACE_FILE"; then + echoerr "Detected EEXIST error. Changing exit code to 166." + exit_code=166 + alert_job_in_slack "$exit_code" "EEXIST: file already exists" + + elif grep -i -q -e \ + "fatal: remote error: GitLab is currently unable to handle this request due to load" "$TRACE_FILE"; then + echoerr "Detected GitLab overload error in job trace. Changing exit code to 167." + exit_code=167 + alert_job_in_slack "$exit_code" "gitlab.com overload" + + else + echoinfo "not changing exit code" + fi + + echoinfo "will exit with $exit_code" + return "$exit_code" +} + function alert_job_in_slack() { exit_code=$1 alert_reason=$2 local slack_channel="#dx_development-analytics_alerts" - echo "Reporting ${CI_JOB_URL} to Slack channel ${slack_channel}" + echoinfo "Reporting ${CI_JOB_URL} to Slack channel ${slack_channel}" json_payload=$(cat <<JSON { @@ -558,7 +663,7 @@ function alert_job_in_slack() { "type": "section", "text": { "type": "mrkdwn", - "text": "*Job <${CI_JOB_URL}|${CI_JOB_NAME}> in pipeline <${CI_PIPELINE_URL}|#${CI_PIPELINE_ID}> needs attention*" + "text": "*<${CI_PROJECT_URL}|${CI_PROJECT_PATH}> pipeline <${CI_PIPELINE_URL}|#${CI_PIPELINE_ID}> needs attention*" } }, { @@ -570,7 +675,7 @@ function alert_job_in_slack() { }, { "type": "mrkdwn", - "text": "*Project:* \n<${CI_PROJECT_URL}|${CI_PROJECT_PATH}>" + "text": "*Job:* \n<${CI_JOB_URL}|${CI_JOB_NAME}>" }, { "type": "mrkdwn", -- GitLab