0
0
mirror of https://github.com/go-gitea/gitea.git synced 2026-06-28 16:46:17 +02:00
gitea/models/actions/task_test.go
bircni 0f5102427e
fix(actions): ensure all waiting jobs get runners in large workflows (#38200)
## Summary

Fixes two related bugs that cause jobs in large workflows (50+ parallel
jobs) to never get a runner assigned even though runners are free.

### Bug 1 — Concurrent runner race

When N runners all poll `FetchTask` with a stale `tasksVersion`
simultaneously, they all query the same waiting job list sorted by
`(updated, id)` and all pick **job #1**. Only one wins the `UPDATE WHERE
task_id=0` optimistic lock; the rest return empty-handed but still
receive `latestVersion` in the response. They then consider themselves
"up to date" and skip `PickTask` on every subsequent poll, leaving jobs
#2–50 permanently unassigned.

**Fix:** `CreateTaskForRunner` now iterates through all matching waiting
jobs. When the optimistic lock fails on job #1, it immediately tries job
#2, then #3, etc., each in its own independent transaction so a failed
attempt rolls back cleanly before the next candidate is tried.
`PickTask` no longer wraps this call in an outer `db.WithTx` (which
caused `halfCommitter` entanglement that prevented per-attempt
rollbacks).

### Bug 2 — Idle runner doesn't re-check after finishing a task

`tasks_version` only bumps when a job transitions **to** waiting (new
workflow triggered, blocked→unblocked). After a runner finishes its
current task it polls `FetchTask` with `tasksVersion == latestVersion`,
so the server skips `PickTask` entirely — the remaining 45 waiting jobs
are invisible to the now-idle runner.

**Fix:** Also call `IncreaseTaskVersion` in `UpdateRunJob` when a
(non-reusable-caller) job transitions to a **done** state. Idle runners
then see a version mismatch on their next poll and attempt `PickTask`,
picking up the remaining jobs.
2026-06-27 17:56:12 +00:00

374 lines
10 KiB
Go

// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package actions
import (
"strings"
"testing"
runnerv1 "gitea.dev/actions-proto-go/runner/v1"
"gitea.dev/models/db"
"gitea.dev/models/unittest"
"gitea.dev/modules/actions/jobparser"
"gitea.dev/modules/timeutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/types/known/timestamppb"
)
// TestMakeTaskStepDisplayName runs makeTaskStepDisplayName with a limit of 255
// over a table of steps: an explicit name, a `uses` step, single- and
// multi-line `run` scripts, a step that only has an ID, and over-long name/run
// values whose expected result is shortened and ends in "…".
func TestMakeTaskStepDisplayName(t *testing.T) {
	const limit = 255

	// Input shared by both truncation cases.
	oversized := strings.Repeat("a", 300)

	cases := []struct {
		desc string
		step *jobparser.Step
		want string
	}{
		{
			desc: "explicit name",
			step: &jobparser.Step{Name: "Test Step"},
			want: "Test Step",
		},
		{
			desc: "uses step",
			step: &jobparser.Step{Uses: "actions/checkout@v4"},
			want: "Run actions/checkout@v4",
		},
		{
			desc: "single-line run",
			step: &jobparser.Step{Run: "echo hello"},
			want: "Run echo hello",
		},
		{
			desc: "multi-line run block scalar",
			step: &jobparser.Step{Run: "\n echo hello \r\n echo world \n "},
			want: "Run echo hello",
		},
		{
			desc: "fallback to id",
			step: &jobparser.Step{ID: "step-id"},
			want: "Run step-id",
		},
		{
			desc: "very long name truncated",
			step: &jobparser.Step{Name: oversized},
			want: strings.Repeat("a", 252) + "…",
		},
		{
			desc: "very long run truncated",
			step: &jobparser.Step{Run: oversized},
			want: "Run " + strings.Repeat("a", 248) + "…",
		},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {
			assert.Equal(t, tc.want, makeTaskStepDisplayName(tc.step, limit))
		})
	}
}
// TestTaskCancellingFinalizesToCancelled checks that once a running task has
// been stopped with StatusCancelling, a subsequent UpdateTaskByState call
// leaves the returned task, the stored task and the stored job in
// StatusCancelled, for both a success and a failure result from the runner.
func TestTaskCancellingFinalizesToCancelled(t *testing.T) {
	// setupRunningTask inserts a running run, job, a runner that has
	// cancelling support, and a running task, then records the task ID on the job.
	setupRunningTask := func(t *testing.T) (*ActionTask, *ActionRunJob) {
		t.Helper()
		ctx := t.Context()

		actionRun := &ActionRun{
			Title:         "cancelling-test-run",
			WorkflowID:    "test.yaml",
			Index:         999,
			RepoID:        1,
			OwnerID:       2,
			TriggerUserID: 2,
			Ref:           "refs/heads/master",
			CommitSHA:     "c2d72f548424103f01ee1dc02889c1e2bff816b0",
			Event:         "push",
			TriggerEvent:  "push",
			Status:        StatusRunning,
			Started:       timeutil.TimeStampNow(),
		}
		require.NoError(t, db.Insert(ctx, actionRun))

		runJob := &ActionRunJob{
			Name:      "cancelling-finalization-job",
			JobID:     "cancelling-finalization-job",
			Attempt:   1,
			Status:    StatusRunning,
			RunID:     actionRun.ID,
			RepoID:    actionRun.RepoID,
			OwnerID:   actionRun.OwnerID,
			CommitSHA: actionRun.CommitSHA,
		}
		require.NoError(t, db.Insert(ctx, runJob))

		actionRunner := &ActionRunner{
			UUID:                 "runner-cancelling-supported",
			Name:                 "runner-cancelling-supported",
			HasCancellingSupport: true,
		}
		require.NoError(t, db.Insert(ctx, actionRunner))

		runningTask := &ActionTask{
			JobID:     runJob.ID,
			RunnerID:  actionRunner.ID,
			Attempt:   1,
			Status:    StatusRunning,
			Started:   timeutil.TimeStampNow(),
			RepoID:    actionRun.RepoID,
			OwnerID:   actionRun.OwnerID,
			CommitSHA: actionRun.CommitSHA,
		}
		require.NoError(t, db.Insert(ctx, runningTask))

		// Point the job at the task it is now running.
		runJob.TaskID = runningTask.ID
		_, linkErr := UpdateRunJob(ctx, runJob, nil, "task_id")
		require.NoError(t, linkErr)

		return runningTask, runJob
	}

	// checkFinalization resets the test database, moves a fresh task to
	// cancelling and asserts that the runner's reported result ends in cancelled.
	checkFinalization := func(t *testing.T, reported runnerv1.Result) {
		t.Helper()
		require.NoError(t, unittest.PrepareTestDatabase())

		ctx := t.Context()
		runningTask, runJob := setupRunningTask(t)

		require.NoError(t, StopTask(ctx, runningTask.ID, StatusCancelling))
		stoppedTask := unittest.AssertExistsAndLoadBean(t, &ActionTask{ID: runningTask.ID})
		assert.Equal(t, StatusCancelling, stoppedTask.Status)

		state := &runnerv1.TaskState{
			Id:        runningTask.ID,
			Result:    reported,
			StoppedAt: timestamppb.Now(),
		}
		returnedTask, updateErr := UpdateTaskByState(ctx, runningTask.RunnerID, state)
		require.NoError(t, updateErr)
		assert.Equal(t, StatusCancelled, returnedTask.Status)

		storedTask := unittest.AssertExistsAndLoadBean(t, &ActionTask{ID: runningTask.ID})
		assert.Equal(t, StatusCancelled, storedTask.Status)

		storedJob := unittest.AssertExistsAndLoadBean(t, &ActionRunJob{ID: runJob.ID})
		assert.Equal(t, StatusCancelled, storedJob.Status)
	}

	subtests := []struct {
		name     string
		reported runnerv1.Result
	}{
		{name: "runner reports success", reported: runnerv1.Result_RESULT_SUCCESS},
		{name: "runner reports failure", reported: runnerv1.Result_RESULT_FAILURE},
	}
	for _, sub := range subtests {
		t.Run(sub.name, func(t *testing.T) {
			checkFinalization(t, sub.reported)
		})
	}
}
// TestStopTaskCancellingFallsBackForLegacyRunner checks that calling StopTask
// with StatusCancelling on a task whose runner has HasCancellingSupport set to
// false leaves both the task and its job in StatusCancelled with a non-zero
// Stopped value.
func TestStopTaskCancellingFallsBackForLegacyRunner(t *testing.T) {
	require.NoError(t, unittest.PrepareTestDatabase())
	ctx := t.Context()

	actionRun := &ActionRun{
		Title:         "cancelling-test-run",
		WorkflowID:    "test.yaml",
		Index:         999,
		RepoID:        1,
		OwnerID:       2,
		TriggerUserID: 2,
		Ref:           "refs/heads/master",
		CommitSHA:     "c2d72f548424103f01ee1dc02889c1e2bff816b0",
		Event:         "push",
		TriggerEvent:  "push",
		Status:        StatusRunning,
		Started:       timeutil.TimeStampNow(),
	}
	require.NoError(t, db.Insert(ctx, actionRun))

	runJob := &ActionRunJob{
		Name:      "legacy-cancelling-job",
		JobID:     "legacy-cancelling-job",
		Attempt:   1,
		Status:    StatusRunning,
		RunID:     actionRun.ID,
		RepoID:    actionRun.RepoID,
		OwnerID:   actionRun.OwnerID,
		CommitSHA: actionRun.CommitSHA,
	}
	require.NoError(t, db.Insert(ctx, runJob))

	// The runner under test does not advertise cancelling support.
	legacyRunner := &ActionRunner{
		UUID:                 "runner-legacy-no-cancelling",
		Name:                 "runner-legacy-no-cancelling",
		HasCancellingSupport: false,
	}
	require.NoError(t, db.Insert(ctx, legacyRunner))

	runningTask := &ActionTask{
		JobID:     runJob.ID,
		RunnerID:  legacyRunner.ID,
		Attempt:   1,
		Status:    StatusRunning,
		Started:   timeutil.TimeStampNow(),
		RepoID:    actionRun.RepoID,
		OwnerID:   actionRun.OwnerID,
		CommitSHA: actionRun.CommitSHA,
	}
	require.NoError(t, db.Insert(ctx, runningTask))

	// Point the job at the task it is now running.
	runJob.TaskID = runningTask.ID
	_, linkErr := UpdateRunJob(ctx, runJob, nil, "task_id")
	require.NoError(t, linkErr)

	require.NoError(t, StopTask(ctx, runningTask.ID, StatusCancelling))

	storedTask := unittest.AssertExistsAndLoadBean(t, &ActionTask{ID: runningTask.ID})
	assert.Equal(t, StatusCancelled, storedTask.Status)
	assert.NotZero(t, storedTask.Stopped)

	storedJob := unittest.AssertExistsAndLoadBean(t, &ActionRunJob{ID: runJob.ID})
	assert.Equal(t, StatusCancelled, storedJob.Status)
	assert.NotZero(t, storedJob.Stopped)
}
// TestStopTaskCancellingFallsBackForMissingRunner checks that calling StopTask
// with StatusCancelling on a task whose runner row has been deleted leaves
// both the task and its job in StatusCancelled with a non-zero Stopped value.
func TestStopTaskCancellingFallsBackForMissingRunner(t *testing.T) {
	require.NoError(t, unittest.PrepareTestDatabase())
	ctx := t.Context()

	actionRun := &ActionRun{
		Title:         "cancelling-test-run",
		WorkflowID:    "test.yaml",
		Index:         999,
		RepoID:        1,
		OwnerID:       2,
		TriggerUserID: 2,
		Ref:           "refs/heads/master",
		CommitSHA:     "c2d72f548424103f01ee1dc02889c1e2bff816b0",
		Event:         "push",
		TriggerEvent:  "push",
		Status:        StatusRunning,
		Started:       timeutil.TimeStampNow(),
	}
	require.NoError(t, db.Insert(ctx, actionRun))

	runJob := &ActionRunJob{
		Name:      "missing-runner-cancelling-job",
		JobID:     "missing-runner-cancelling-job",
		Attempt:   1,
		Status:    StatusRunning,
		RunID:     actionRun.ID,
		RepoID:    actionRun.RepoID,
		OwnerID:   actionRun.OwnerID,
		CommitSHA: actionRun.CommitSHA,
	}
	require.NoError(t, db.Insert(ctx, runJob))

	// This runner supports cancelling, but is removed before the task is stopped.
	doomedRunner := &ActionRunner{
		UUID:                 "runner-cleaned-up-before-cancel",
		Name:                 "runner-cleaned-up-before-cancel",
		HasCancellingSupport: true,
	}
	require.NoError(t, db.Insert(ctx, doomedRunner))

	runningTask := &ActionTask{
		JobID:     runJob.ID,
		RunnerID:  doomedRunner.ID,
		Attempt:   1,
		Status:    StatusRunning,
		Started:   timeutil.TimeStampNow(),
		RepoID:    actionRun.RepoID,
		OwnerID:   actionRun.OwnerID,
		CommitSHA: actionRun.CommitSHA,
	}
	require.NoError(t, db.Insert(ctx, runningTask))

	// Point the job at the task it is now running.
	runJob.TaskID = runningTask.ID
	_, linkErr := UpdateRunJob(ctx, runJob, nil, "task_id")
	require.NoError(t, linkErr)

	// Drop the runner row so the task refers to a runner that no longer exists.
	_, deleteErr := db.DeleteByID[ActionRunner](ctx, doomedRunner.ID)
	require.NoError(t, deleteErr)

	require.NoError(t, StopTask(ctx, runningTask.ID, StatusCancelling))

	storedTask := unittest.AssertExistsAndLoadBean(t, &ActionTask{ID: runningTask.ID})
	assert.Equal(t, StatusCancelled, storedTask.Status)
	assert.NotZero(t, storedTask.Stopped)

	storedJob := unittest.AssertExistsAndLoadBean(t, &ActionRunJob{ID: runJob.ID})
	assert.Equal(t, StatusCancelled, storedJob.Status)
	assert.NotZero(t, storedJob.Stopped)
}
// TestReleaseTaskForRunner claims a waiting job through CreateTaskForRunner and
// then hands the resulting task to ReleaseTaskForRunner. Afterwards the job
// must be waiting again with no task ID and no start time, and neither the
// task nor any of its steps may remain, so that a failure while assembling the
// runner response cannot strand the job in running state.
func TestReleaseTaskForRunner(t *testing.T) {
	require.NoError(t, unittest.PrepareTestDatabase())
	ctx := t.Context()

	actionRun := &ActionRun{
		Title:         "release-task-test-run",
		WorkflowID:    "test.yaml",
		Index:         9902,
		RepoID:        1,
		OwnerID:       2,
		TriggerUserID: 2,
		Ref:           "refs/heads/main",
		CommitSHA:     "c2d72f548424103f01ee1dc02889c1e2bff816b0",
		Event:         "push",
		TriggerEvent:  "push",
		Status:        StatusWaiting,
	}
	require.NoError(t, db.Insert(ctx, actionRun))

	waitingJob := &ActionRunJob{
		Name:            "release-job",
		JobID:           "release-job",
		Attempt:         1,
		Status:          StatusWaiting,
		RunsOn:          []string{"ubuntu-latest"},
		WorkflowPayload: []byte("on: push\njobs:\n release-job:\n runs-on: ubuntu-latest\n steps:\n - run: echo hi\n"),
		RunID:           actionRun.ID,
		RepoID:          actionRun.RepoID,
		OwnerID:         actionRun.OwnerID,
		CommitSHA:       actionRun.CommitSHA,
	}
	require.NoError(t, db.Insert(ctx, waitingJob))

	// The runner carries the same label the job lists in RunsOn.
	actionRunner := &ActionRunner{
		UUID:        "release-runner-uuid",
		Name:        "release-runner",
		AgentLabels: []string{"ubuntu-latest"},
	}
	actionRunner.GenerateAndFillToken()
	require.NoError(t, db.Insert(ctx, actionRunner))

	claimedTask, claimed, claimErr := CreateTaskForRunner(ctx, actionRunner)
	require.NoError(t, claimErr)
	require.True(t, claimed)
	require.NotNil(t, claimedTask)

	// After the claim the job is running and points at the new task.
	jobAfterClaim := unittest.AssertExistsAndLoadBean(t, &ActionRunJob{ID: waitingJob.ID})
	require.Equal(t, StatusRunning, jobAfterClaim.Status)
	require.Equal(t, claimedTask.ID, jobAfterClaim.TaskID)

	require.NoError(t, ReleaseTaskForRunner(ctx, claimedTask))

	// After the release the job is waiting again with nothing assigned.
	jobAfterRelease := unittest.AssertExistsAndLoadBean(t, &ActionRunJob{ID: waitingJob.ID})
	assert.Equal(t, StatusWaiting, jobAfterRelease.Status)
	assert.Zero(t, jobAfterRelease.TaskID)
	assert.Zero(t, jobAfterRelease.Started)

	// Neither the task row nor any step row for it is left behind.
	unittest.AssertNotExistsBean(t, &ActionTask{ID: claimedTask.ID})
	unittest.AssertNotExistsBean(t, &ActionTaskStep{TaskID: claimedTask.ID})
}