From a1c60ac85437eed80c73f0c772bf792e25848407 Mon Sep 17 00:00:00 2001 From: Pascal Zimmermann Date: Mon, 9 Feb 2026 06:08:52 +0100 Subject: [PATCH 01/12] feat: Add support for dynamic matrix strategies with job outputs Signed-off-by: Pascal Zimmermann # Conflicts: # models/actions/run_job.go # models/migrations/migrations.go # models/migrations/v1_26/v326.go # services/actions/run.go --- routers/web/web.go | 3 + services/actions/job_emitter.go | 83 +++++- services/actions/matrix.go | 268 ++++++++++++++++++ services/actions/matrix_metrics.go | 173 +++++++++++ services/actions/matrix_metrics_prometheus.go | 191 +++++++++++++ services/actions/matrix_metrics_test.go | 82 ++++++ tests/integration/actions_job_test.go | 94 ++++++ 7 files changed, 891 insertions(+), 3 deletions(-) create mode 100644 services/actions/matrix.go create mode 100644 services/actions/matrix_metrics.go create mode 100644 services/actions/matrix_metrics_prometheus.go create mode 100644 services/actions/matrix_metrics_test.go diff --git a/routers/web/web.go b/routers/web/web.go index e3dcf27cc4..40da3b13f4 100644 --- a/routers/web/web.go +++ b/routers/web/web.go @@ -41,6 +41,7 @@ import ( "code.gitea.io/gitea/routers/web/user" user_setting "code.gitea.io/gitea/routers/web/user/setting" "code.gitea.io/gitea/routers/web/user/setting/security" + actions_service "code.gitea.io/gitea/services/actions" auth_service "code.gitea.io/gitea/services/auth" "code.gitea.io/gitea/services/context" "code.gitea.io/gitea/services/forms" @@ -286,6 +287,8 @@ func Routes() *web.Router { if setting.Metrics.Enabled { prometheus.MustRegister(metrics.NewCollector()) + // Register matrix re-evaluation metrics + prometheus.MustRegister(actions_service.NewMatrixMetricsCollector()) routes.Get("/metrics", append(mid, Metrics)...) 
} diff --git a/services/actions/job_emitter.go b/services/actions/job_emitter.go index 20a4f81eab..ed6e2fe5aa 100644 --- a/services/actions/job_emitter.go +++ b/services/actions/job_emitter.go @@ -7,6 +7,7 @@ import ( "context" "errors" "fmt" + "time" actions_model "code.gitea.io/gitea/models/actions" "code.gitea.io/gitea/models/db" @@ -202,6 +203,9 @@ func checkJobsOfRun(ctx context.Context, run *actions_model.ActionRun) (jobs, up if err != nil { return nil, nil, err } + + log.Debug("Checking %d jobs for run %d (status: %s)", len(jobs), run.ID, run.Status) + vars, err := actions_model.GetVariablesOfRun(ctx, run) if err != nil { return nil, nil, err @@ -213,14 +217,18 @@ func checkJobsOfRun(ctx context.Context, run *actions_model.ActionRun) (jobs, up } updates := newJobStatusResolver(jobs, vars).Resolve(ctx) + log.Debug("Job status resolver returned %d job status updates for run %d", len(updates), run.ID) + for _, job := range jobs { if status, ok := updates[job.ID]; ok { + oldStatus := job.Status job.Status = status if n, err := actions_model.UpdateRunJob(ctx, job, builder.Eq{"status": actions_model.StatusBlocked}, "status"); err != nil { return err } else if n != 1 { return fmt.Errorf("no affected for updating blocked job %v", job.ID) } + log.Info("Job %d (JobID: %s) status updated: %s -> %s", job.ID, job.JobID, oldStatus, status) updatedJobs = append(updatedJobs, job) } } @@ -229,6 +237,20 @@ func checkJobsOfRun(ctx context.Context, run *actions_model.ActionRun) (jobs, up return nil, nil, err } + // Reload jobs from the database to pick up any newly created matrix jobs + oldJobCount := len(jobs) + jobs, err = db.Find[actions_model.ActionRunJob](ctx, actions_model.FindRunJobOptions{RunID: run.ID}) + if err != nil { + return nil, nil, err + } + + if len(jobs) > oldJobCount { + log.Info("Matrix re-evaluation created %d new jobs for run %d (was %d, now %d)", + len(jobs)-oldJobCount, run.ID, oldJobCount, len(jobs)) + } + + log.Debug("Job check completed for run %d: 
%d jobs updated, %d total jobs", run.ID, len(updatedJobs), len(jobs)) + return jobs, updatedJobs, nil } @@ -313,47 +335,102 @@ func (r *jobStatusResolver) resolveJobHasIfCondition(actionRunJob *actions_model func (r *jobStatusResolver) resolve(ctx context.Context) map[int64]actions_model.Status { ret := map[int64]actions_model.Status{} + resolveMetrics := struct { + totalBlocked int + matrixReevaluated int + concurrencyUpdated int + jobsStarted int + jobsSkipped int + }{} + for id, status := range r.statuses { actionRunJob := r.jobMap[id] if status != actions_model.StatusBlocked { continue } + + resolveMetrics.totalBlocked++ + log.Debug("Resolving blocked job %d (JobID: %s, RunID: %d)", id, actionRunJob.JobID, actionRunJob.RunID) + allDone, allSucceed := r.resolveCheckNeeds(id) if !allDone { + log.Debug("Job %d: not all dependencies completed yet", id) continue } + log.Debug("Job %d: all dependencies completed (allSucceed: %v), checking matrix re-evaluation", id, allSucceed) + + // Try to re-evaluate the matrix with job outputs if it depends on them + startTime := time.Now() + newMatrixJobs, err := ReEvaluateMatrixForJobWithNeeds(ctx, actionRunJob, r.vars) + duration := time.Since(startTime).Milliseconds() + + if err != nil { + log.Error("Matrix re-evaluation error for job %d (JobID: %s): %v (duration: %dms)", id, actionRunJob.JobID, err, duration) + continue + } + + // If new matrix jobs were created, add them to the resolver and continue + if len(newMatrixJobs) > 0 { + resolveMetrics.matrixReevaluated++ + log.Info("Matrix re-evaluation succeeded for job %d (JobID: %s): created %d new jobs (duration: %dms)", + id, actionRunJob.JobID, len(newMatrixJobs), duration) + // The new jobs will be picked up in the next resolution iteration + continue + } + + log.Debug("Job %d: no matrix re-evaluation needed or result is empty", id) + // update concurrency and check whether the job can run now - err := updateConcurrencyEvaluationForJobWithNeeds(ctx, actionRunJob, r.vars) + 
err = updateConcurrencyEvaluationForJobWithNeeds(ctx, actionRunJob, r.vars) if err != nil { // The err can be caused by different cases: database error, or syntax error, or the needed jobs haven't completed // At the moment there is no way to distinguish them. // Actually, for most cases, the error is caused by "syntax error" / "the needed jobs haven't completed (skipped?)" // TODO: if workflow or concurrency expression has syntax error, there should be a user error message, need to show it to end users - log.Debug("updateConcurrencyEvaluationForJobWithNeeds failed, this job will stay blocked: job: %d, err: %v", id, err) + log.Debug("Concurrency evaluation failed for job %d (JobID: %s): %v (job will stay blocked)", id, actionRunJob.JobID, err) continue } + resolveMetrics.concurrencyUpdated++ + shouldStartJob := true if !allSucceed { // Not all dependent jobs completed successfully: // * if the job has "if" condition, it can be started, then the act_runner will evaluate the "if" condition. // * otherwise, the job should be skipped. shouldStartJob = r.resolveJobHasIfCondition(actionRunJob) + log.Debug("Job %d: not all dependencies succeeded. 
Has if-condition: %v, should start: %v", id, shouldStartJob, shouldStartJob) } newStatus := util.Iif(shouldStartJob, actions_model.StatusWaiting, actions_model.StatusSkipped) if newStatus == actions_model.StatusWaiting { newStatus, err = PrepareToStartJobWithConcurrency(ctx, actionRunJob) if err != nil { - log.Error("ShouldBlockJobByConcurrency failed, this job will stay blocked: job: %d, err: %v", id, err) + log.Error("Concurrency check failed for job %d (JobID: %s): %v (job will stay blocked)", id, actionRunJob.JobID, err) } } if newStatus != actions_model.StatusBlocked { ret[id] = newStatus + switch newStatus { + case actions_model.StatusWaiting: + resolveMetrics.jobsStarted++ + log.Info("Job %d (JobID: %s) transitioned to StatusWaiting", id, actionRunJob.JobID) + case actions_model.StatusSkipped: + resolveMetrics.jobsSkipped++ + log.Info("Job %d (JobID: %s) transitioned to StatusSkipped", id, actionRunJob.JobID) + } } } + + // Log resolution metrics summary + if resolveMetrics.totalBlocked > 0 { + log.Debug("Job resolution summary: total_blocked=%d, matrix_reevaluated=%d, concurrency_updated=%d, jobs_started=%d, jobs_skipped=%d", + resolveMetrics.totalBlocked, resolveMetrics.matrixReevaluated, resolveMetrics.concurrencyUpdated, + resolveMetrics.jobsStarted, resolveMetrics.jobsSkipped) + } + return ret } diff --git a/services/actions/matrix.go b/services/actions/matrix.go new file mode 100644 index 0000000000..10541f214f --- /dev/null +++ b/services/actions/matrix.go @@ -0,0 +1,268 @@ +// Copyright 2025 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package actions + +import ( + "context" + "fmt" + "maps" + "strings" + "time" + + actions_model "code.gitea.io/gitea/models/actions" + "code.gitea.io/gitea/modules/log" + + "github.com/nektos/act/pkg/jobparser" + "gopkg.in/yaml.v3" +) + +// ExtractRawStrategies extracts strategy definitions from the raw workflow content +// Returns a map of jobID to strategy YAML for jobs that have matrix dependencies +func ExtractRawStrategies(content []byte) (map[string]string, error) { + var workflowDef struct { + Jobs map[string]struct { + Strategy any `yaml:"strategy"` + Needs any `yaml:"needs"` + } `yaml:"jobs"` + } + + if err := yaml.Unmarshal(content, &workflowDef); err != nil { + return nil, err + } + + strategies := make(map[string]string) + for jobID, jobDef := range workflowDef.Jobs { + if jobDef.Strategy == nil { + continue + } + + // Check if this job has needs (dependencies) + var needsList []string + switch needs := jobDef.Needs.(type) { + case string: + needsList = append(needsList, needs) + case []any: + for _, need := range needs { + if needStr, ok := need.(string); ok { + needsList = append(needsList, needStr) + } + } + } + + // Only store strategy for jobs with dependencies + if len(needsList) > 0 { + if strategyBytes, err := yaml.Marshal(jobDef.Strategy); err == nil { + strategies[jobID] = string(strategyBytes) + } + } + } + + return strategies, nil +} + +// hasMatrixWithNeeds checks if a job's strategy contains a matrix that depends on job outputs +func hasMatrixWithNeeds(rawStrategy string) bool { + if rawStrategy == "" { + return false + } + + var strategy map[string]any + if err := yaml.Unmarshal([]byte(rawStrategy), &strategy); err != nil { + return false + } + + matrix, ok := strategy["matrix"] + if !ok { + return false + } + + // Check if any matrix value contains "needs." 
reference + matrixStr := fmt.Sprintf("%v", matrix) + return strings.Contains(matrixStr, "needs.") +} + +// ReEvaluateMatrixForJobWithNeeds re-evaluates the matrix strategy of a job using outputs from dependent jobs +// If the matrix depends on job outputs and all dependent jobs are done, it will: +// 1. Evaluate the matrix with the job outputs +// 2. Create new ActionRunJobs for each matrix combination +// 3. Return the newly created jobs +func ReEvaluateMatrixForJobWithNeeds(ctx context.Context, job *actions_model.ActionRunJob, vars map[string]string) ([]*actions_model.ActionRunJob, error) { + startTime := time.Now() + + if job.IsMatrixEvaluated || job.RawStrategy == "" { + return nil, nil + } + + if !hasMatrixWithNeeds(job.RawStrategy) { + // Mark as evaluated since there's no needs-dependent matrix + job.IsMatrixEvaluated = true + log.Debug("Matrix re-evaluation skipped for job %d: no needs-dependent matrix found", job.ID) + return nil, nil + } + + log.Debug("Starting matrix re-evaluation for job %d (JobID: %s)", job.ID, job.JobID) + + // Get the outputs from dependent jobs + taskNeeds, err := FindTaskNeeds(ctx, job) + if err != nil { + errMsg := fmt.Sprintf("failed to find task needs for job %d (JobID: %s): %v", job.ID, job.JobID, err) + log.Error("Matrix re-evaluation error: %s", errMsg) + return nil, fmt.Errorf("find task needs: %w", err) + } + + log.Debug("Found %d task needs for job %d (JobID: %s)", len(taskNeeds), job.ID, job.JobID) + + // If any task needs are not done, we can't evaluate yet + pendingNeeds := []string{} + for jobID, taskNeed := range taskNeeds { + if !taskNeed.Result.IsDone() { + pendingNeeds = append(pendingNeeds, fmt.Sprintf("%s(%s)", jobID, taskNeed.Result)) + } + } + if len(pendingNeeds) > 0 { + log.Debug("Matrix re-evaluation deferred for job %d: pending needs: %v", job.ID, pendingNeeds) + GetMatrixMetrics().RecordDeferred() + return nil, nil + } + + // Merge vars with needs outputs + mergedVars := mergeNeedsIntoVars(vars, taskNeeds) 
+ log.Debug("Merged %d variables with needs outputs for job %d", len(mergedVars), job.ID) + + // Load the original run to get workflow context + if job.Run == nil { + if err := job.LoadRun(ctx); err != nil { + errMsg := fmt.Sprintf("failed to load run for job %d (JobID: %s): %v", job.ID, job.JobID, err) + log.Error("Matrix re-evaluation error: %s", errMsg) + return nil, fmt.Errorf("load run: %w", err) + } + } + + // Create the giteaCtx for expression evaluation + giteaCtx := GenerateGiteaContext(job.Run, job) + + // Parse the job payload with merged vars to expand the matrix + // Note: job.WorkflowPayload already contains just this job's definition + parseStartTime := time.Now() + jobs, err := jobparser.Parse( + job.WorkflowPayload, + jobparser.WithVars(mergedVars), + jobparser.WithGitContext(giteaCtx.ToGitHubContext()), + ) + parseTime := time.Since(parseStartTime) + GetMatrixMetrics().RecordParseTime(parseTime) + + if err != nil { + // If parsing fails, we can't expand the matrix + // Mark as evaluated and skip + job.IsMatrixEvaluated = true + errMsg := fmt.Sprintf("failed to parse workflow payload for job %d (JobID: %s) during matrix expansion. Error: %v. 
RawStrategy: %s", + job.ID, job.JobID, err, job.RawStrategy) + log.Error("Matrix parse error: %s", errMsg) + GetMatrixMetrics().RecordReevaluation(time.Since(startTime), false, 0) + return nil, nil + } + + if len(jobs) == 0 { + job.IsMatrixEvaluated = true + log.Debug("No jobs generated from matrix expansion for job %d (JobID: %s)", job.ID, job.JobID) + return nil, nil + } + + log.Debug("Parsed %d matrix combinations for job %d (JobID: %s)", len(jobs), job.ID, job.JobID) + + // Create new ActionRunJobs for each parsed workflow (each matrix combination) + newJobs := make([]*actions_model.ActionRunJob, 0) + + for i, parsedSingleWorkflow := range jobs { + id, jobDef := parsedSingleWorkflow.Job() + if jobDef == nil { + log.Warn("Skipped nil jobDef at index %d for job %d (JobID: %s)", i, job.ID, job.JobID) + continue + } + + // Skip the original job ID - we only want the matrix-expanded versions + if id == job.JobID { + log.Debug("Skipped original job ID %s in matrix expansion for job %d", id, job.ID) + continue + } + + // Erase needs from the payload before storing + needs := jobDef.Needs() + if err := parsedSingleWorkflow.SetJob(id, jobDef.EraseNeeds()); err != nil { + log.Error("Failed to erase needs from job %s (matrix expansion for job %d): %v", id, job.ID, err) + continue + } + + payload, _ := parsedSingleWorkflow.Marshal() + + newJob := &actions_model.ActionRunJob{ + RunID: job.RunID, + RepoID: job.RepoID, + OwnerID: job.OwnerID, + CommitSHA: job.CommitSHA, + IsForkPullRequest: job.IsForkPullRequest, + Name: jobDef.Name, + WorkflowPayload: payload, + JobID: id, + Needs: needs, + RunsOn: jobDef.RunsOn(), + Status: actions_model.StatusBlocked, + } + + newJobs = append(newJobs, newJob) + } + + // If no new jobs were created, mark as evaluated + if len(newJobs) == 0 { + job.IsMatrixEvaluated = true + log.Warn("No valid jobs created from matrix expansion for job %d (JobID: %s). 
Original jobs: %d", job.ID, job.JobID, len(jobs)) + return nil, nil + } + + // Insert the new jobs into database + insertStartTime := time.Now() + if err := actions_model.InsertActionRunJobs(ctx, newJobs); err != nil { + insertTime := time.Since(insertStartTime) + GetMatrixMetrics().RecordInsertTime(insertTime) + errMsg := fmt.Sprintf("failed to insert %d new matrix jobs for job %d (JobID: %s): %v", len(newJobs), job.ID, job.JobID, err) + log.Error("Matrix insertion error: %s", errMsg) + GetMatrixMetrics().RecordReevaluation(time.Since(startTime), false, 0) + return nil, fmt.Errorf("insert new jobs: %w", err) + } + insertTime := time.Since(insertStartTime) + GetMatrixMetrics().RecordInsertTime(insertTime) + + // Mark the original job as evaluated + job.IsMatrixEvaluated = true + if _, err := actions_model.UpdateRunJob(ctx, job, nil, "is_matrix_evaluated"); err != nil { + log.Error("Failed to update job %d is_matrix_evaluated flag: %v", job.ID, err) + } + + totalTime := time.Since(startTime) + GetMatrixMetrics().RecordReevaluation(totalTime, true, int64(len(newJobs))) + + log.Info("Successfully completed matrix re-evaluation for job %d (JobID: %s): created %d new jobs from %d matrix combinations (total: %dms, parse: %dms, insert: %dms)", + job.ID, job.JobID, len(newJobs), len(jobs), totalTime.Milliseconds(), parseTime.Milliseconds(), insertTime.Milliseconds()) + + return newJobs, nil +} + +// mergeNeedsIntoVars converts task needs outputs into variables for expression evaluation +func mergeNeedsIntoVars(baseVars map[string]string, taskNeeds map[string]*TaskNeed) map[string]string { + merged := make(map[string]string) + + // Copy base vars + maps.Copy(merged, baseVars) + + // Add needs outputs as variables in format: needs..outputs. 
+ for jobID, taskNeed := range taskNeeds { + for outputKey, outputValue := range taskNeed.Outputs { + key := fmt.Sprintf("needs.%s.outputs.%s", jobID, outputKey) + merged[key] = outputValue + } + } + + return merged +} diff --git a/services/actions/matrix_metrics.go b/services/actions/matrix_metrics.go new file mode 100644 index 0000000000..cc232c0b6c --- /dev/null +++ b/services/actions/matrix_metrics.go @@ -0,0 +1,171 @@ +// Copyright 2025 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package actions + +import ( + "sync" + "time" +) + +// MatrixMetrics tracks performance metrics for matrix re-evaluation operations +type MatrixMetrics struct { + mu sync.RWMutex + + // Counters + TotalReevaluations int64 + SuccessfulReevaluations int64 + FailedReevaluations int64 + JobsCreatedTotal int64 + DeferredReevaluations int64 + + // Timing + TotalReevaluationTime time.Duration + TotalParseTime time.Duration + TotalInsertTime time.Duration + + // Histograms (for detailed analysis) + ReevaluationTimes []time.Duration + ParseTimes []time.Duration + InsertTimes []time.Duration +} + +var ( + matrixMetricsInstance *MatrixMetrics + metricsMutex sync.Mutex +) + +// GetMatrixMetrics returns the global matrix metrics instance +func GetMatrixMetrics() *MatrixMetrics { + metricsMutex.Lock() + defer metricsMutex.Unlock() + if matrixMetricsInstance == nil { + matrixMetricsInstance = &MatrixMetrics{ + ReevaluationTimes: make([]time.Duration, 0, 1000), + ParseTimes: make([]time.Duration, 0, 1000), + InsertTimes: make([]time.Duration, 0, 1000), + } + } + return matrixMetricsInstance +} + +// RecordReevaluation records a matrix re-evaluation attempt +func (m *MatrixMetrics) RecordReevaluation(duration time.Duration, success bool, jobsCreated int64) { + m.mu.Lock() + defer m.mu.Unlock() + + m.TotalReevaluations++ + m.TotalReevaluationTime += duration + + if success { + m.SuccessfulReevaluations++ + m.JobsCreatedTotal +=
jobsCreated + } else { + m.FailedReevaluations++ + } + + // Keep a limited history for detailed analysis (keep last 1000) + if len(m.ReevaluationTimes) < 1000 { + m.ReevaluationTimes = append(m.ReevaluationTimes, duration) + } else { + // Shift and add new value + copy(m.ReevaluationTimes, m.ReevaluationTimes[1:]) + m.ReevaluationTimes[len(m.ReevaluationTimes)-1] = duration + } +} + +// RecordDeferred records a deferred matrix re-evaluation +func (m *MatrixMetrics) RecordDeferred() { + m.mu.Lock() + defer m.mu.Unlock() + m.DeferredReevaluations++ +} + +// RecordParseTime records the time taken to parse a workflow +func (m *MatrixMetrics) RecordParseTime(duration time.Duration) { + m.mu.Lock() + defer m.mu.Unlock() + + m.TotalParseTime += duration + + if len(m.ParseTimes) < 1000 { + m.ParseTimes = append(m.ParseTimes, duration) + } else { + copy(m.ParseTimes, m.ParseTimes[1:]) + m.ParseTimes[len(m.ParseTimes)-1] = duration + } +} + +// RecordInsertTime records the time taken to insert matrix jobs +func (m *MatrixMetrics) RecordInsertTime(duration time.Duration) { + m.mu.Lock() + defer m.mu.Unlock() + + m.TotalInsertTime += duration + + if len(m.InsertTimes) < 1000 { + m.InsertTimes = append(m.InsertTimes, duration) + } else { + copy(m.InsertTimes, m.InsertTimes[1:]) + m.InsertTimes[len(m.InsertTimes)-1] = duration + } +} + +// GetStats returns a snapshot of the current metrics +func (m *MatrixMetrics) GetStats() map[string]any { + m.mu.RLock() + defer m.mu.RUnlock() + + avgReevaluationTime := time.Duration(0) + if m.TotalReevaluations > 0 { + avgReevaluationTime = m.TotalReevaluationTime / time.Duration(m.TotalReevaluations) + } + + avgParseTime := time.Duration(0) + if len(m.ParseTimes) > 0 { + avgParseTime = m.TotalParseTime / time.Duration(len(m.ParseTimes)) + } + + avgInsertTime := time.Duration(0) + if len(m.InsertTimes) > 0 { + avgInsertTime = m.TotalInsertTime / time.Duration(len(m.InsertTimes)) + } + + successRate := 0.0 + if m.TotalReevaluations > 0 { + 
successRate = float64(m.SuccessfulReevaluations) / float64(m.TotalReevaluations) * 100 + } + + return map[string]any{ + "total_reevaluations": m.TotalReevaluations, + "successful_reevaluations": m.SuccessfulReevaluations, + "failed_reevaluations": m.FailedReevaluations, + "deferred_reevaluations": m.DeferredReevaluations, + "success_rate_percent": successRate, + "total_jobs_created": m.JobsCreatedTotal, + "total_reevaluation_time_ms": m.TotalReevaluationTime.Milliseconds(), + "avg_reevaluation_time_ms": avgReevaluationTime.Milliseconds(), + "total_parse_time_ms": m.TotalParseTime.Milliseconds(), + "avg_parse_time_ms": avgParseTime.Milliseconds(), + "total_insert_time_ms": m.TotalInsertTime.Milliseconds(), + "avg_insert_time_ms": avgInsertTime.Milliseconds(), + } +} + +// Reset clears all metrics +func (m *MatrixMetrics) Reset() { + m.mu.Lock() + defer m.mu.Unlock() + + m.TotalReevaluations = 0 + m.SuccessfulReevaluations = 0 + m.FailedReevaluations = 0 + m.JobsCreatedTotal = 0 + m.DeferredReevaluations = 0 + m.TotalReevaluationTime = 0 + m.TotalParseTime = 0 + m.TotalInsertTime = 0 + m.ReevaluationTimes = m.ReevaluationTimes[:0] + m.ParseTimes = m.ParseTimes[:0] + m.InsertTimes = m.InsertTimes[:0] +} diff --git a/services/actions/matrix_metrics_prometheus.go b/services/actions/matrix_metrics_prometheus.go new file mode 100644 index 0000000000..e780e8f13f --- /dev/null +++ b/services/actions/matrix_metrics_prometheus.go @@ -0,0 +1,191 @@ +// Copyright 2025 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package actions + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +// MatrixMetricsCollector implements the prometheus.Collector interface +// and exposes matrix re-evaluation metrics for prometheus +type MatrixMetricsCollector struct { + // Counters + totalReevaluations prometheus.Gauge + successfulReevaluations prometheus.Gauge + failedReevaluations prometheus.Gauge + deferredReevaluations prometheus.Gauge + jobsCreatedTotal prometheus.Gauge + + // Timing (in milliseconds) + totalReevaluationTime prometheus.Gauge + avgReevaluationTime prometheus.Gauge + totalParseTime prometheus.Gauge + avgParseTime prometheus.Gauge + totalInsertTime prometheus.Gauge + avgInsertTime prometheus.Gauge + + // Rates + successRate prometheus.Gauge +} + +const ( + namespace = "gitea" + subsystem = "matrix" +) + +// NewMatrixMetricsCollector creates a new MatrixMetricsCollector +func NewMatrixMetricsCollector() *MatrixMetricsCollector { + return &MatrixMetricsCollector{ + totalReevaluations: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "total_reevaluations", + Help: "Total number of matrix re-evaluation attempts", + }, + ), + successfulReevaluations: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "successful_reevaluations", + Help: "Number of successful matrix re-evaluations", + }, + ), + failedReevaluations: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "failed_reevaluations", + Help: "Number of failed matrix re-evaluations", + }, + ), + deferredReevaluations: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "deferred_reevaluations", + Help: "Number of deferred matrix re-evaluations (waiting for dependencies)", + }, + ), + jobsCreatedTotal: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, +
Subsystem: subsystem, + Name: "jobs_created_total", + Help: "Total number of jobs created from matrix expansion", + }, + ), + totalReevaluationTime: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "total_reevaluation_time_ms", + Help: "Total time spent on matrix re-evaluations in milliseconds", + }, + ), + avgReevaluationTime: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "avg_reevaluation_time_ms", + Help: "Average time per matrix re-evaluation in milliseconds", + }, + ), + totalParseTime: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "total_parse_time_ms", + Help: "Total time spent parsing workflow payloads in milliseconds", + }, + ), + avgParseTime: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "avg_parse_time_ms", + Help: "Average time per workflow parse in milliseconds", + }, + ), + totalInsertTime: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "total_insert_time_ms", + Help: "Total time spent inserting jobs into database in milliseconds", + }, + ), + avgInsertTime: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "avg_insert_time_ms", + Help: "Average time per database insert in milliseconds", + }, + ), + successRate: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "success_rate_percent", + Help: "Success rate of matrix re-evaluations as percentage (0-100)", + }, + ), + } +} + +// Describe returns the metrics descriptions +func (c *MatrixMetricsCollector) Describe(ch chan<- *prometheus.Desc) { + c.totalReevaluations.Describe(ch) + c.successfulReevaluations.Describe(ch) + c.failedReevaluations.Describe(ch) + c.deferredReevaluations.Describe(ch) + c.jobsCreatedTotal.Describe(ch) + 
c.totalReevaluationTime.Describe(ch) + c.avgReevaluationTime.Describe(ch) + c.totalParseTime.Describe(ch) + c.avgParseTime.Describe(ch) + c.totalInsertTime.Describe(ch) + c.avgInsertTime.Describe(ch) + c.successRate.Describe(ch) +} + +// Collect collects the current metric values and sends them to the channel +func (c *MatrixMetricsCollector) Collect(ch chan<- prometheus.Metric) { + metrics := GetMatrixMetrics() + stats := metrics.GetStats() + + // Set counter values + c.totalReevaluations.Set(float64(stats["total_reevaluations"].(int64))) + c.successfulReevaluations.Set(float64(stats["successful_reevaluations"].(int64))) + c.failedReevaluations.Set(float64(stats["failed_reevaluations"].(int64))) + c.deferredReevaluations.Set(float64(stats["deferred_reevaluations"].(int64))) + c.jobsCreatedTotal.Set(float64(stats["total_jobs_created"].(int64))) + + // Set timing values (already in milliseconds) + c.totalReevaluationTime.Set(float64(stats["total_reevaluation_time_ms"].(int64))) + c.avgReevaluationTime.Set(float64(stats["avg_reevaluation_time_ms"].(int64))) + c.totalParseTime.Set(float64(stats["total_parse_time_ms"].(int64))) + c.avgParseTime.Set(float64(stats["avg_parse_time_ms"].(int64))) + c.totalInsertTime.Set(float64(stats["total_insert_time_ms"].(int64))) + c.avgInsertTime.Set(float64(stats["avg_insert_time_ms"].(int64))) + + // Set success rate + c.successRate.Set(stats["success_rate_percent"].(float64)) + + // Collect all metrics + c.totalReevaluations.Collect(ch) + c.successfulReevaluations.Collect(ch) + c.failedReevaluations.Collect(ch) + c.deferredReevaluations.Collect(ch) + c.jobsCreatedTotal.Collect(ch) + c.totalReevaluationTime.Collect(ch) + c.avgReevaluationTime.Collect(ch) + c.totalParseTime.Collect(ch) + c.avgParseTime.Collect(ch) + c.totalInsertTime.Collect(ch) + c.avgInsertTime.Collect(ch) + c.successRate.Collect(ch) +} diff --git a/services/actions/matrix_metrics_test.go b/services/actions/matrix_metrics_test.go new file mode 100644 index 
0000000000..54fdf3c0f3 --- /dev/null +++ b/services/actions/matrix_metrics_test.go @@ -0,0 +1,82 @@ +// Copyright 2025 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package actions + +import ( + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" +) + +// Essential Prometheus Collector Tests + +func TestNewMatrixMetricsCollector(t *testing.T) { + collector := NewMatrixMetricsCollector() + assert.NotNil(t, collector) + assert.NotNil(t, collector.totalReevaluations) + assert.NotNil(t, collector.successRate) +} + +func TestMatrixMetricsCollectorDescribe(t *testing.T) { + collector := NewMatrixMetricsCollector() + ch := make(chan *prometheus.Desc, 100) + collector.Describe(ch) + assert.NotEmpty(t, ch) +} + +func TestMatrixMetricsCollectorCollect(t *testing.T) { + matrixMetricsInstance = nil + metrics := GetMatrixMetrics() + metrics.RecordReevaluation(10*time.Millisecond, true, 5) + metrics.RecordParseTime(8 * time.Millisecond) + + collector := NewMatrixMetricsCollector() + ch := make(chan prometheus.Metric, 100) + collector.Collect(ch) + assert.NotEmpty(t, ch) + + matrixMetricsInstance = nil +} + +func TestMatrixMetricsGetStats(t *testing.T) { + metrics := &MatrixMetrics{ + ReevaluationTimes: make([]time.Duration, 0, 1000), + ParseTimes: make([]time.Duration, 0, 1000), + InsertTimes: make([]time.Duration, 0, 1000), + } + + metrics.RecordReevaluation(10*time.Millisecond, true, 3) + metrics.RecordReevaluation(15*time.Millisecond, true, 2) + metrics.RecordReevaluation(5*time.Millisecond, false, 0) + + stats := metrics.GetStats() + assert.Equal(t, int64(3), stats["total_reevaluations"]) + assert.Equal(t, int64(2), stats["successful_reevaluations"]) + assert.Equal(t, int64(1), stats["failed_reevaluations"]) + assert.Greater(t, stats["success_rate_percent"].(float64), 60.0) +} + +func BenchmarkMatrixMetricsCollectorCollect(b *testing.B) { + metrics := &MatrixMetrics{ + ReevaluationTimes: 
make([]time.Duration, 0, 1000), + ParseTimes: make([]time.Duration, 0, 1000), + InsertTimes: make([]time.Duration, 0, 1000), + } + matrixMetricsInstance = metrics + + for range 100 { + metrics.RecordReevaluation(10*time.Millisecond, true, 5) + metrics.RecordParseTime(5 * time.Millisecond) + } + + collector := NewMatrixMetricsCollector() + ch := make(chan prometheus.Metric, 100) + + b.ResetTimer() + for b.Loop() { + collector.Collect(ch) + } +} diff --git a/tests/integration/actions_job_test.go b/tests/integration/actions_job_test.go index 3da290f1d3..d3294fdd11 100644 --- a/tests/integration/actions_job_test.go +++ b/tests/integration/actions_job_test.go @@ -759,3 +759,97 @@ func getTaskJobNameByTaskID(t *testing.T, authToken, ownerName, repoName string, } return "" } + +func TestDynamicMatrixFromJobOutputs(t *testing.T) { + testCases := []struct { + treePath string + fileContent string + outcomes map[string]*mockTaskOutcome + }{ + { + treePath: ".gitea/workflows/dynamic-matrix.yml", + fileContent: `name: Dynamic Matrix from Job Outputs +on: + push: + paths: + - '.gitea/workflows/dynamic-matrix.yml' +jobs: + generate: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.gen_matrix.outputs.matrix }} + steps: + - name: Generate matrix + id: gen_matrix + run: | + echo "matrix=[1,2,3]" >> "$GITHUB_OUTPUT" + + build: + needs: [generate] + runs-on: ubuntu-latest + strategy: + matrix: + version: ${{ fromJson(needs.generate.outputs.matrix) }} + steps: + - run: echo "Building version ${{ matrix.version }}" +`, + outcomes: map[string]*mockTaskOutcome{ + "generate": { + result: runnerv1.Result_RESULT_SUCCESS, + outputs: map[string]string{ + "matrix": "[1,2,3]", + }, + }, + "build (1)": { + result: runnerv1.Result_RESULT_SUCCESS, + }, + "build (2)": { + result: runnerv1.Result_RESULT_SUCCESS, + }, + "build (3)": { + result: runnerv1.Result_RESULT_SUCCESS, + }, + }, + }, + } + onGiteaRun(t, func(t *testing.T, u *url.URL) { + user2 := unittest.AssertExistsAndLoadBean(t, 
&user_model.User{ID: 2}) + session := loginUser(t, user2.Name) + token := getTokenForLoggedInUser(t, session, auth_model.AccessTokenScopeWriteRepository, auth_model.AccessTokenScopeWriteUser) + + apiRepo := createActionsTestRepo(t, token, "actions-dynamic-matrix", false) + runner := newMockRunner() + runner.registerAsRepoRunner(t, user2.Name, apiRepo.Name, "mock-runner", []string{"ubuntu-latest"}, false) + + for _, tc := range testCases { + t.Run("test "+tc.treePath, func(t *testing.T) { + opts := getWorkflowCreateFileOptions(user2, apiRepo.DefaultBranch, "create "+tc.treePath, tc.fileContent) + createWorkflowFile(t, token, user2.Name, apiRepo.Name, tc.treePath, opts) + + // Execute the generate job first + task := runner.fetchTask(t) + jobName := getTaskJobNameByTaskID(t, token, user2.Name, apiRepo.Name, task.Id) + assert.Equal(t, "generate", jobName) + outcome := tc.outcomes[jobName] + assert.NotNil(t, outcome) + runner.execTask(t, task, outcome) + + // Now the build job should be created with matrix expansion from the output + // We expect 3 tasks for build (1), build (2), build (3) + buildTasks := make([]int64, 0) + for range 3 { + buildTask := runner.fetchTask(t) + buildJobName := getTaskJobNameByTaskID(t, token, user2.Name, apiRepo.Name, buildTask.Id) + t.Logf("Fetched task: %s", buildJobName) + assert.Contains(t, []string{"build (1)", "build (2)", "build (3)"}, buildJobName, "Expected a build job with matrix index") + outcome := tc.outcomes[buildJobName] + assert.NotNil(t, outcome) + runner.execTask(t, buildTask, outcome) + buildTasks = append(buildTasks, buildTask.Id) + } + + assert.Len(t, buildTasks, 3, "Expected 3 build tasks from dynamic matrix") + }) + } + }) +} From 57400c725e88776a5ddda72e198b91cad0467de1 Mon Sep 17 00:00:00 2001 From: Pascal Zimmermann Date: Mon, 12 Jan 2026 22:42:05 +0100 Subject: [PATCH 02/12] feat: Add max-parallel implementation inside the Gitea server # Conflicts: # models/migrations/migrations.go # 
models/migrations/v1_26/v325.go # Conflicts: # models/actions/run_job.go # models/actions/runner.go # models/migrations/migrations.go # modules/structs/repo_actions.go # routers/api/actions/runner/runner.go # routers/api/v1/admin/runners.go # routers/api/v1/api.go # routers/api/v1/shared/runners.go # services/actions/run.go # templates/swagger/v1_json.tmpl # Conflicts: # models/migrations/migrations.go --- models/actions/run_job_maxparallel_test.go | 162 ++++++++++ models/actions/runner_capacity_test.go | 95 ++++++ models/actions/task.go | 42 ++- models/actions/task_count_test.go | 206 ++++++++++++ models/migrations/migrations.go | 1 + services/actions/run.go | 9 + services/actions/task_assignment_test.go | 296 ++++++++++++++++++ services/convert/convert.go | 1 + tests/integration/api_runner_capacity_test.go | 149 +++++++++ 9 files changed, 958 insertions(+), 3 deletions(-) create mode 100644 models/actions/run_job_maxparallel_test.go create mode 100644 models/actions/runner_capacity_test.go create mode 100644 models/actions/task_count_test.go create mode 100644 services/actions/task_assignment_test.go create mode 100644 tests/integration/api_runner_capacity_test.go diff --git a/models/actions/run_job_maxparallel_test.go b/models/actions/run_job_maxparallel_test.go new file mode 100644 index 0000000000..423907ce73 --- /dev/null +++ b/models/actions/run_job_maxparallel_test.go @@ -0,0 +1,162 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package actions + +import ( + "context" + "testing" + + "code.gitea.io/gitea/models/db" + "code.gitea.io/gitea/models/unittest" + + "github.com/stretchr/testify/assert" +) + +func TestActionRunJob_MaxParallel(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + ctx := context.Background() + + t.Run("NoMaxParallel", func(t *testing.T) { + job := &ActionRunJob{ + RunID: 1, + RepoID: 1, + OwnerID: 1, + JobID: "test-job-1", + Name: "Test Job", + Status: StatusWaiting, + MaxParallel: 0, // No limit + } + assert.NoError(t, db.Insert(ctx, job)) + + retrieved, err := GetRunJobByID(ctx, job.ID) + assert.NoError(t, err) + assert.Equal(t, 0, retrieved.MaxParallel) + }) + + t.Run("WithMaxParallel", func(t *testing.T) { + job := &ActionRunJob{ + RunID: 1, + RepoID: 1, + OwnerID: 1, + JobID: "test-job-2", + Name: "Matrix Job", + Status: StatusWaiting, + MaxParallel: 3, + } + assert.NoError(t, db.Insert(ctx, job)) + + retrieved, err := GetRunJobByID(ctx, job.ID) + assert.NoError(t, err) + assert.Equal(t, 3, retrieved.MaxParallel) + }) + + t.Run("MatrixID", func(t *testing.T) { + job := &ActionRunJob{ + RunID: 1, + RepoID: 1, + OwnerID: 1, + JobID: "test-job-3", + Name: "Matrix Job with ID", + Status: StatusWaiting, + MaxParallel: 2, + MatrixID: "os:ubuntu,node:16", + } + assert.NoError(t, db.Insert(ctx, job)) + + retrieved, err := GetRunJobByID(ctx, job.ID) + assert.NoError(t, err) + assert.Equal(t, 2, retrieved.MaxParallel) + assert.Equal(t, "os:ubuntu,node:16", retrieved.MatrixID) + }) + + t.Run("UpdateMaxParallel", func(t *testing.T) { + // Create ActionRun first + run := &ActionRun{ + ID: 1, + RepoID: 1, + OwnerID: 1, + Status: StatusRunning, + } + // Note: This might fail if run already exists from previous tests, but that's okay + _ = db.Insert(ctx, run) + + job := &ActionRunJob{ + RunID: 1, + RepoID: 1, + OwnerID: 1, + JobID: "test-job-4", + Name: "Updatable Job", + Status: StatusWaiting, + MaxParallel: 5, + } + 
assert.NoError(t, db.Insert(ctx, job)) + + // Update max parallel + job.MaxParallel = 10 + _, err := UpdateRunJob(ctx, job, nil, "max_parallel") + assert.NoError(t, err) + + retrieved, err := GetRunJobByID(ctx, job.ID) + assert.NoError(t, err) + assert.Equal(t, 10, retrieved.MaxParallel) + }) +} + +func TestActionRunJob_MaxParallelEnforcement(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + ctx := context.Background() + + t.Run("EnforceMaxParallel", func(t *testing.T) { + runID := int64(5000) + jobID := "parallel-enforced-job" + maxParallel := 2 + + // Create ActionRun first + run := &ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 5000, + Status: StatusRunning, + } + assert.NoError(t, db.Insert(ctx, run)) + + // Create jobs simulating matrix execution + jobs := []*ActionRunJob{ + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 1", Status: StatusRunning, MaxParallel: maxParallel, MatrixID: "version:1"}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 2", Status: StatusRunning, MaxParallel: maxParallel, MatrixID: "version:2"}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 3", Status: StatusWaiting, MaxParallel: maxParallel, MatrixID: "version:3"}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 4", Status: StatusWaiting, MaxParallel: maxParallel, MatrixID: "version:4"}, + } + + for _, job := range jobs { + assert.NoError(t, db.Insert(ctx, job)) + } + + // Verify running count + runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, maxParallel, runningCount, "Should have exactly max-parallel jobs running") + + // Simulate job completion + jobs[0].Status = StatusSuccess + _, err = UpdateRunJob(ctx, jobs[0], nil, "status") + assert.NoError(t, err) + + // Now running count should be 1 + runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, 
runningCount) + + // Simulate next job starting + jobs[2].Status = StatusRunning + _, err = UpdateRunJob(ctx, jobs[2], nil, "status") + assert.NoError(t, err) + + // Back to max-parallel + runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, maxParallel, runningCount) + }) +} diff --git a/models/actions/runner_capacity_test.go b/models/actions/runner_capacity_test.go new file mode 100644 index 0000000000..87dc3f0e5a --- /dev/null +++ b/models/actions/runner_capacity_test.go @@ -0,0 +1,95 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package actions + +import ( + "context" + "testing" + + "code.gitea.io/gitea/models/db" + "code.gitea.io/gitea/models/unittest" + + "github.com/stretchr/testify/assert" +) + +func TestActionRunner_Capacity(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + ctx := context.Background() + + t.Run("DefaultCapacity", func(t *testing.T) { + runner := &ActionRunner{ + UUID: "test-uuid-1", + Name: "test-runner", + OwnerID: 0, + RepoID: 0, + TokenHash: "hash1", + Token: "token1", + } + assert.NoError(t, db.Insert(ctx, runner)) + + // Verify in database + retrieved, err := GetRunnerByID(ctx, runner.ID) + assert.NoError(t, err) + assert.Equal(t, 0, retrieved.Capacity, "Default capacity should be 0 (unlimited)") + }) + + t.Run("CustomCapacity", func(t *testing.T) { + runner := &ActionRunner{ + UUID: "test-uuid-2", + Name: "test-runner-2", + OwnerID: 0, + RepoID: 0, + Capacity: 5, + TokenHash: "hash2", + Token: "token2", + } + assert.NoError(t, db.Insert(ctx, runner)) + + assert.Equal(t, 5, runner.Capacity) + + // Verify in database + retrieved, err := GetRunnerByID(ctx, runner.ID) + assert.NoError(t, err) + assert.Equal(t, 5, retrieved.Capacity) + }) + + t.Run("UpdateCapacity", func(t *testing.T) { + runner := &ActionRunner{ + UUID: "test-uuid-3", + Name: "test-runner-3", + OwnerID: 0, + RepoID: 0, + Capacity: 1, + 
TokenHash: "hash3", + Token: "token3", + } + assert.NoError(t, db.Insert(ctx, runner)) + + // Update capacity + runner.Capacity = 10 + assert.NoError(t, UpdateRunner(ctx, runner, "capacity")) + + // Verify update + retrieved, err := GetRunnerByID(ctx, runner.ID) + assert.NoError(t, err) + assert.Equal(t, 10, retrieved.Capacity) + }) + + t.Run("ZeroCapacity", func(t *testing.T) { + runner := &ActionRunner{ + UUID: "test-uuid-4", + Name: "test-runner-4", + OwnerID: 0, + RepoID: 0, + Capacity: 0, // Unlimited + } + assert.NoError(t, db.Insert(ctx, runner)) + + assert.Equal(t, 0, runner.Capacity) + + retrieved, err := GetRunnerByID(ctx, runner.ID) + assert.NoError(t, err) + assert.Equal(t, 0, retrieved.Capacity) + }) +} diff --git a/models/actions/task.go b/models/actions/task.go index e092d6fbbd..e42de06e16 100644 --- a/models/actions/task.go +++ b/models/actions/task.go @@ -260,10 +260,26 @@ func CreateTaskForRunner(ctx context.Context, runner *ActionRunner) (*ActionTask var job *ActionRunJob log.Trace("runner labels: %v", runner.AgentLabels) for _, v := range jobs { - if runner.CanMatchLabels(v.RunsOn) { - job = v - break + if !runner.CanMatchLabels(v.RunsOn) { + continue } + + // Check max-parallel constraint for matrix jobs + if v.MaxParallel > 0 { + runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, v.RunID, v.JobID) + if err != nil { + log.Error("Failed to count running jobs for max-parallel check: %v", err) + continue + } + if runningCount >= v.MaxParallel { + log.Debug("Job %s (run %d) skipped: %d/%d jobs already running (max-parallel)", + v.JobID, v.RunID, runningCount, v.MaxParallel) + continue + } + } + + job = v + break } if job == nil { return nil, false, nil @@ -522,3 +538,23 @@ func getTaskIDFromCache(token string) int64 { } return t } + +// CountRunningTasksByRunner counts the number of running tasks assigned to a specific runner +func CountRunningTasksByRunner(ctx context.Context, runnerID int64) (int, error) { + count, err := 
db.GetEngine(ctx). + Where("runner_id = ?", runnerID). + And("status = ?", StatusRunning). + Count(new(ActionTask)) + return int(count), err +} + +// CountRunningJobsByWorkflowAndRun counts running jobs for a specific workflow/run combo +// Used to enforce max-parallel limits on matrix jobs +func CountRunningJobsByWorkflowAndRun(ctx context.Context, runID int64, jobID string) (int, error) { + count, err := db.GetEngine(ctx). + Where("run_id = ?", runID). + And("job_id = ?", jobID). + And("status = ?", StatusRunning). + Count(new(ActionRunJob)) + return int(count), err +} diff --git a/models/actions/task_count_test.go b/models/actions/task_count_test.go new file mode 100644 index 0000000000..2edcf1b950 --- /dev/null +++ b/models/actions/task_count_test.go @@ -0,0 +1,206 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package actions + +import ( + "context" + "testing" + + "code.gitea.io/gitea/models/db" + "code.gitea.io/gitea/models/unittest" + + "github.com/stretchr/testify/assert" +) + +func TestCountRunningTasksByRunner(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + ctx := context.Background() + + t.Run("NoRunningTasks", func(t *testing.T) { + count, err := CountRunningTasksByRunner(ctx, 999999) + assert.NoError(t, err) + assert.Equal(t, 0, count) + }) + + t.Run("WithRunningTasks", func(t *testing.T) { + // Create a runner + runner := &ActionRunner{ + UUID: "test-runner-tasks", + Name: "Test Runner", + OwnerID: 0, + RepoID: 0, + TokenHash: "test_hash_tasks", + Token: "test_token_tasks", + } + assert.NoError(t, db.Insert(ctx, runner)) + + // Create running tasks + task1 := &ActionTask{ + JobID: 1, + RunnerID: runner.ID, + Status: StatusRunning, + RepoID: 1, + OwnerID: 1, + TokenHash: "task1_hash", + Token: "task1_token", + } + assert.NoError(t, db.Insert(ctx, task1)) + + task2 := &ActionTask{ + JobID: 2, + RunnerID: runner.ID, + Status: StatusRunning, + RepoID: 1, + OwnerID: 1, + TokenHash: 
"task2_hash", + Token: "task2_token", + } + assert.NoError(t, db.Insert(ctx, task2)) + + // Count should be 2 + count, err := CountRunningTasksByRunner(ctx, runner.ID) + assert.NoError(t, err) + assert.Equal(t, 2, count) + }) + + t.Run("MixedStatusTasks", func(t *testing.T) { + runner := &ActionRunner{ + UUID: "test-runner-mixed", + Name: "Mixed Status Runner", + Capacity: 5, + TokenHash: "mixed_runner_hash", + Token: "mixed_runner_token", + } + assert.NoError(t, db.Insert(ctx, runner)) + + // Create tasks with different statuses + statuses := []Status{StatusRunning, StatusSuccess, StatusRunning, StatusFailure, StatusWaiting} + for i, status := range statuses { + task := &ActionTask{ + JobID: int64(100 + i), + RunnerID: runner.ID, + Status: status, + RepoID: 1, + OwnerID: 1, + TokenHash: "mixed_task_hash_" + string(rune('a'+i)), + Token: "mixed_task_token_" + string(rune('a'+i)), + } + assert.NoError(t, db.Insert(ctx, task)) + } + + // Only 2 running tasks + count, err := CountRunningTasksByRunner(ctx, runner.ID) + assert.NoError(t, err) + assert.Equal(t, 2, count) + }) +} + +func TestCountRunningJobsByWorkflowAndRun(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + ctx := context.Background() + + t.Run("NoRunningJobs", func(t *testing.T) { + count, err := CountRunningJobsByWorkflowAndRun(ctx, 999999, "nonexistent") + assert.NoError(t, err) + assert.Equal(t, 0, count) + }) + + t.Run("WithRunningJobs", func(t *testing.T) { + runID := int64(1000) + jobID := "test-job" + + // Create ActionRun first + run := &ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 1000, + Status: StatusRunning, + } + assert.NoError(t, db.Insert(ctx, run)) + + // Create running jobs + for range 3 { + job := &ActionRunJob{ + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Test Job", + Status: StatusRunning, + } + assert.NoError(t, db.Insert(ctx, job)) + } + + count, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, 
err) + assert.Equal(t, 3, count) + }) + + t.Run("DifferentJobIDs", func(t *testing.T) { + runID := int64(2000) + + // Create ActionRun first + run := &ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 2000, + Status: StatusRunning, + } + assert.NoError(t, db.Insert(ctx, run)) + + // Create jobs with different job IDs + for i := range 5 { + job := &ActionRunJob{ + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: "job-" + string(rune('A'+i)), + Name: "Test Job", + Status: StatusRunning, + } + assert.NoError(t, db.Insert(ctx, job)) + } + + // Count for specific job ID should be 1 + count, err := CountRunningJobsByWorkflowAndRun(ctx, runID, "job-A") + assert.NoError(t, err) + assert.Equal(t, 1, count) + }) + + t.Run("MatrixJobsWithMaxParallel", func(t *testing.T) { + runID := int64(3000) + jobID := "matrix-job" + maxParallel := 2 + + // Create ActionRun first + run := &ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 3000, + Status: StatusRunning, + } + assert.NoError(t, db.Insert(ctx, run)) + + // Create matrix jobs + jobs := []*ActionRunJob{ + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 1", Status: StatusRunning, MaxParallel: maxParallel}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 2", Status: StatusRunning, MaxParallel: maxParallel}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 3", Status: StatusWaiting, MaxParallel: maxParallel}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 4", Status: StatusWaiting, MaxParallel: maxParallel}, + } + + for _, job := range jobs { + assert.NoError(t, db.Insert(ctx, job)) + } + + // Count running jobs - should be 2 (matching max-parallel) + count, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 2, count) + assert.Equal(t, maxParallel, count, "Running jobs should equal max-parallel") + }) +} diff --git a/models/migrations/migrations.go b/models/migrations/migrations.go index 
cad4156dee..298ff60467 100644 --- a/models/migrations/migrations.go +++ b/models/migrations/migrations.go @@ -405,6 +405,7 @@ func prepareMigrationTasks() []*migration { newMigration(328, "Add TokenPermissions column to ActionRunJob", v1_26.AddTokenPermissionsToActionRunJob), newMigration(329, "Add unique constraint for user badge", v1_26.AddUniqueIndexForUserBadge), newMigration(330, "Add name column to webhook", v1_26.AddNameToWebhook), + newMigration(331, "Add runner capacity and job max-parallel support", v1_26.AddRunnerCapacityAndJobMaxParallel), } return preparedMigrations } diff --git a/services/actions/run.go b/services/actions/run.go index e9fcdcaf43..2a796ca2f9 100644 --- a/services/actions/run.go +++ b/services/actions/run.go @@ -6,6 +6,7 @@ package actions import ( "context" "fmt" + "strconv" actions_model "code.gitea.io/gitea/models/actions" "code.gitea.io/gitea/models/db" @@ -133,6 +134,14 @@ func InsertRun(ctx context.Context, run *actions_model.ActionRun, jobs []*jobpar runJob.TokenPermissions = perms } + + // Extract max-parallel from strategy if present + if job.Strategy.MaxParallelString != "" { + if maxParallel, err := strconv.Atoi(job.Strategy.MaxParallelString); err == nil && maxParallel > 0 { + runJob.MaxParallel = maxParallel + } + } + // check job concurrency if job.RawConcurrency != nil { rawConcurrency, err := yaml.Marshal(job.RawConcurrency) diff --git a/services/actions/task_assignment_test.go b/services/actions/task_assignment_test.go new file mode 100644 index 0000000000..2ff1d77c7d --- /dev/null +++ b/services/actions/task_assignment_test.go @@ -0,0 +1,296 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package actions + +import ( + "context" + "testing" + + actions_model "code.gitea.io/gitea/models/actions" + "code.gitea.io/gitea/models/db" + "code.gitea.io/gitea/models/unittest" + + "github.com/stretchr/testify/assert" +) + +func TestCreateTaskForRunner_CapacityEnforcement(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + + t.Run("RunnerAtCapacity", func(t *testing.T) { + // Create runner with capacity 2 + runner := &actions_model.ActionRunner{ + UUID: "capacity-test-1", + Name: "Capacity Test Runner", + Capacity: 2, + TokenHash: "capacity_test_hash_1", + Token: "capacity_test_token_1", + } + assert.NoError(t, db.Insert(context.Background(), runner)) + + // Create 2 running tasks + for i := range 2 { + task := &actions_model.ActionTask{ + JobID: int64(1000 + i), + RunnerID: runner.ID, + Status: actions_model.StatusRunning, + RepoID: 1, + OwnerID: 1, + TokenHash: "task_hash_" + string(rune('1'+i)), + Token: "task_token_" + string(rune('1'+i)), + } + assert.NoError(t, db.Insert(context.Background(), task)) + } + + // Verify runner is at capacity + count, err := actions_model.CountRunningTasksByRunner(context.Background(), runner.ID) + assert.NoError(t, err) + assert.Equal(t, 2, count) + + // Try to create another task - should fail due to capacity + // Note: This would be tested in actual CreateTaskForRunner which checks capacity + // For now, verify the count + assert.Equal(t, runner.Capacity, count, "Runner should be at capacity") + }) + + t.Run("RunnerBelowCapacity", func(t *testing.T) { + runner := &actions_model.ActionRunner{ + UUID: "capacity-test-2", + Name: "Below Capacity Runner", + Capacity: 5, + TokenHash: "capacity_test_hash_2", + Token: "capacity_test_token_2", + } + assert.NoError(t, db.Insert(context.Background(), runner)) + + // Create 2 running tasks + for i := range 2 { + task := &actions_model.ActionTask{ + JobID: int64(2000 + i), + RunnerID: runner.ID, + Status: 
actions_model.StatusRunning, + RepoID: 1, + OwnerID: 1, + TokenHash: "task_hash_2_" + string(rune('a'+i)), + Token: "task_token_2_" + string(rune('a'+i)), + } + assert.NoError(t, db.Insert(context.Background(), task)) + } + + count, err := actions_model.CountRunningTasksByRunner(context.Background(), runner.ID) + assert.NoError(t, err) + assert.Equal(t, 2, count) + assert.Less(t, count, runner.Capacity, "Runner should be below capacity") + }) + + t.Run("UnlimitedCapacity", func(t *testing.T) { + runner := &actions_model.ActionRunner{ + UUID: "capacity-test-3", + Name: "Unlimited Runner", + Capacity: 0, // 0 = unlimited + TokenHash: "capacity_test_hash_3", + Token: "capacity_test_token_3", + } + assert.NoError(t, db.Insert(context.Background(), runner)) + + // Create many running tasks + for i := range 10 { + task := &actions_model.ActionTask{ + JobID: int64(3000 + i), + RunnerID: runner.ID, + Status: actions_model.StatusRunning, + RepoID: 1, + OwnerID: 1, + TokenHash: "task_hash_3_" + string(rune('a'+i)), + Token: "task_token_3_" + string(rune('a'+i)), + } + assert.NoError(t, db.Insert(context.Background(), task)) + } + + count, err := actions_model.CountRunningTasksByRunner(context.Background(), runner.ID) + assert.NoError(t, err) + assert.Equal(t, 10, count) + // With capacity 0, there's no limit + }) +} + +func TestCreateTaskForRunner_MaxParallelEnforcement(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + + t.Run("MaxParallelReached", func(t *testing.T) { + runID := int64(10000) + jobID := "max-parallel-job" + maxParallel := 2 + + // Create ActionRun first + run := &actions_model.ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 10000, + Status: actions_model.StatusRunning, + } + assert.NoError(t, db.Insert(context.Background(), run)) + + // Create waiting jobs + for range 4 { + job := &actions_model.ActionRunJob{ + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Test Job", + Status: actions_model.StatusWaiting, + 
MaxParallel: maxParallel, + } + assert.NoError(t, db.Insert(context.Background(), job)) + } + + // Start 2 jobs (max-parallel limit) + jobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) + assert.NoError(t, err) + assert.Len(t, jobs, 4) + + for i := range 2 { + jobs[i].Status = actions_model.StatusRunning + _, err := actions_model.UpdateRunJob(context.Background(), jobs[i], nil, "status") + assert.NoError(t, err) + } + + // Verify max-parallel is enforced + runningCount, err := actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) + assert.NoError(t, err) + assert.Equal(t, maxParallel, runningCount) + + // Remaining jobs should stay in waiting + remainingJobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) + assert.NoError(t, err) + + waitingCount := 0 + for _, job := range remainingJobs { + if job.Status == actions_model.StatusWaiting { + waitingCount++ + } + } + assert.Equal(t, 2, waitingCount) + }) + + t.Run("MaxParallelNotSet", func(t *testing.T) { + runID := int64(20000) + jobID := "no-limit-job" + + // Create ActionRun first + run := &actions_model.ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 20000, + Status: actions_model.StatusRunning, + } + assert.NoError(t, db.Insert(context.Background(), run)) + + // Create jobs without max-parallel + for range 5 { + job := &actions_model.ActionRunJob{ + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Test Job", + Status: actions_model.StatusWaiting, + MaxParallel: 0, // No limit + } + assert.NoError(t, db.Insert(context.Background(), job)) + } + + // All jobs can run simultaneously + jobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) + assert.NoError(t, err) + + for _, job := range jobs { + job.Status = actions_model.StatusRunning + _, err := actions_model.UpdateRunJob(context.Background(), job, nil, "status") + assert.NoError(t, err) + } + + runningCount, err := 
actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 5, runningCount, "All jobs should be able to run without limit") + }) +} + +func TestCreateTaskForRunner_CombinedEnforcement(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + + t.Run("BothRunnerCapacityAndMaxParallel", func(t *testing.T) { + // Create runner with capacity 3 + runner := &actions_model.ActionRunner{ + UUID: "combined-test", + Name: "Combined Test Runner", + Capacity: 3, + TokenHash: "combined_test_hash", + Token: "combined_test_token", + } + assert.NoError(t, db.Insert(context.Background(), runner)) + + runID := int64(30000) + jobID := "combined-job" + + // Create ActionRun first + run := &actions_model.ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 30000, + Status: actions_model.StatusRunning, + } + assert.NoError(t, db.Insert(context.Background(), run)) + + // Create jobs with max-parallel 2 + for range 5 { + job := &actions_model.ActionRunJob{ + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Combined Job", + Status: actions_model.StatusWaiting, + MaxParallel: 2, + } + assert.NoError(t, db.Insert(context.Background(), job)) + } + + // The most restrictive limit should apply + // In this case: max-parallel = 2 (more restrictive than runner capacity = 3) + jobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) + assert.NoError(t, err) + + // Simulate starting jobs + for i, job := range jobs[:2] { + job.Status = actions_model.StatusRunning + _, err := actions_model.UpdateRunJob(context.Background(), job, nil, "status") + assert.NoError(t, err) + + task := &actions_model.ActionTask{ + JobID: job.ID, + RunnerID: runner.ID, + Status: actions_model.StatusRunning, + RepoID: 1, + OwnerID: 1, + TokenHash: "combined_task_hash_" + string(rune('a'+i)), + Token: "combined_task_token_" + string(rune('a'+i)), + } + assert.NoError(t, db.Insert(context.Background(), task)) 
+ } + + // Verify both limits + runningTasks, err := actions_model.CountRunningTasksByRunner(context.Background(), runner.ID) + assert.NoError(t, err) + assert.Equal(t, 2, runningTasks) + assert.Less(t, runningTasks, runner.Capacity, "Should be under runner capacity") + + runningJobs, err := actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 2, runningJobs, "Should respect max-parallel") + }) +} diff --git a/services/convert/convert.go b/services/convert/convert.go index e79cb343e4..829fbdf0eb 100644 --- a/services/convert/convert.go +++ b/services/convert/convert.go @@ -524,6 +524,7 @@ func ToActionRunner(ctx context.Context, runner *actions_model.ActionRunner) *ap Disabled: runner.IsDisabled, Ephemeral: runner.Ephemeral, Labels: labels, + Capacity: runner.Capacity, } } diff --git a/tests/integration/api_runner_capacity_test.go b/tests/integration/api_runner_capacity_test.go new file mode 100644 index 0000000000..f75cecd18d --- /dev/null +++ b/tests/integration/api_runner_capacity_test.go @@ -0,0 +1,149 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package integration + +import ( + "context" + "fmt" + "net/http" + "testing" + + actions_model "code.gitea.io/gitea/models/actions" + auth_model "code.gitea.io/gitea/models/auth" + "code.gitea.io/gitea/models/db" + "code.gitea.io/gitea/models/unittest" + api "code.gitea.io/gitea/modules/structs" + "code.gitea.io/gitea/tests" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestAPIUpdateRunnerCapacity(t *testing.T) { + defer tests.PrepareTestEnv(t)() + + ctx := context.Background() + + // Clean up existing runners + require.NoError(t, db.DeleteAllRecords("action_runner")) + + // Create a test runner + runner := &actions_model.ActionRunner{ + UUID: "test-capacity-runner", + Name: "Test Capacity Runner", + OwnerID: 0, + RepoID: 0, + Capacity: 1, + TokenHash: "test-capacity-hash", + Token: "test-capacity-token", + } + require.NoError(t, actions_model.CreateRunner(ctx, runner)) + + // Load the created runner to get the ID + runner = unittest.AssertExistsAndLoadBean(t, &actions_model.ActionRunner{UUID: "test-capacity-runner"}) + + session := loginUser(t, "user1") + token := getTokenForLoggedInUser(t, session, auth_model.AccessTokenScopeWriteAdmin) + + t.Run("UpdateCapacity", func(t *testing.T) { + req := NewRequestWithJSON(t, "PATCH", + fmt.Sprintf("/api/v1/admin/actions/runners/%d/capacity", runner.ID), + &api.UpdateRunnerCapacityOption{Capacity: 5}). 
+ AddTokenAuth(token) + + resp := MakeRequest(t, req, http.StatusOK) + + var apiRunner api.ActionRunner + DecodeJSON(t, resp, &apiRunner) + + assert.Equal(t, runner.ID, apiRunner.ID) + assert.Equal(t, 5, apiRunner.Capacity) + + // Verify in database + updated, err := actions_model.GetRunnerByID(context.Background(), runner.ID) + assert.NoError(t, err) + assert.Equal(t, 5, updated.Capacity) + }) + + t.Run("UpdateCapacityToZero", func(t *testing.T) { + req := NewRequestWithJSON(t, "PATCH", + fmt.Sprintf("/api/v1/admin/actions/runners/%d/capacity", runner.ID), + &api.UpdateRunnerCapacityOption{Capacity: 0}). + AddTokenAuth(token) + + resp := MakeRequest(t, req, http.StatusOK) + + var apiRunner api.ActionRunner + DecodeJSON(t, resp, &apiRunner) + + assert.Equal(t, 0, apiRunner.Capacity) + }) + + t.Run("InvalidCapacity", func(t *testing.T) { + req := NewRequestWithJSON(t, "PATCH", + fmt.Sprintf("/api/v1/admin/actions/runners/%d/capacity", runner.ID), + &api.UpdateRunnerCapacityOption{Capacity: -1}). + AddTokenAuth(token) + + MakeRequest(t, req, http.StatusBadRequest) + }) + + t.Run("NonExistentRunner", func(t *testing.T) { + req := NewRequestWithJSON(t, "PATCH", + "/api/v1/admin/actions/runners/999999/capacity", + &api.UpdateRunnerCapacityOption{Capacity: 5}). + AddTokenAuth(token) + + MakeRequest(t, req, http.StatusNotFound) + }) + + t.Run("GetRunnerWithCapacity", func(t *testing.T) { + // First set capacity + runner.Capacity = 7 + assert.NoError(t, actions_model.UpdateRunner(context.Background(), runner, "capacity")) + + // Get runner + req := NewRequest(t, "GET", + fmt.Sprintf("/api/v1/admin/actions/runners/%d", runner.ID)). 
+ AddTokenAuth(token) + + resp := MakeRequest(t, req, http.StatusOK) + + var apiRunner api.ActionRunner + DecodeJSON(t, resp, &apiRunner) + + assert.Equal(t, runner.ID, apiRunner.ID) + assert.Equal(t, 7, apiRunner.Capacity) + }) + + t.Run("ListRunnersWithCapacity", func(t *testing.T) { + req := NewRequest(t, "GET", "/api/v1/admin/actions/runners"). + AddTokenAuth(token) + + resp := MakeRequest(t, req, http.StatusOK) + + var response api.ActionRunnersResponse + DecodeJSON(t, resp, &response) + + // Find our test runner + found := false + for _, r := range response.Entries { + if r.ID == runner.ID { + found = true + assert.Equal(t, 7, r.Capacity) + break + } + } + assert.True(t, found, "Test runner should be in list") + }) + + t.Run("UnauthorizedAccess", func(t *testing.T) { + req := NewRequestWithJSON(t, "PATCH", + fmt.Sprintf("/api/v1/admin/actions/runners/%d/capacity", runner.ID), + &api.UpdateRunnerCapacityOption{Capacity: 5}) + // No token + + MakeRequest(t, req, http.StatusUnauthorized) + }) +} From d5823e04314f3f272c320312b8b8378cbe1188e8 Mon Sep 17 00:00:00 2001 From: Pascal Zimmermann Date: Wed, 21 Jan 2026 09:32:59 +0100 Subject: [PATCH 03/12] fix: Remove MatrixID and Capacity functionality --- models/actions/run_job.go | 2 + models/actions/run_job_maxparallel_test.go | 27 +--- models/actions/task.go | 9 -- models/actions/task_count_test.go | 83 ---------- services/actions/task_assignment_test.go | 176 --------------------- 5 files changed, 6 insertions(+), 291 deletions(-) diff --git a/models/actions/run_job.go b/models/actions/run_job.go index 616e298dc9..de16741a82 100644 --- a/models/actions/run_job.go +++ b/models/actions/run_job.go @@ -55,6 +55,8 @@ type ActionRunJob struct { // Org/repo clamps are enforced when the token is used at runtime. // It is JSON-encoded repo_model.ActionsTokenPermissions and may be empty if not specified. 
TokenPermissions *repo_model.ActionsTokenPermissions `xorm:"JSON TEXT"` + // Matrix job support + MaxParallel int // Max parallel jobs from strategy.max-parallel (0 = unlimited) Started timeutil.TimeStamp Stopped timeutil.TimeStamp diff --git a/models/actions/run_job_maxparallel_test.go b/models/actions/run_job_maxparallel_test.go index 423907ce73..275a70fc37 100644 --- a/models/actions/run_job_maxparallel_test.go +++ b/models/actions/run_job_maxparallel_test.go @@ -51,25 +51,6 @@ func TestActionRunJob_MaxParallel(t *testing.T) { assert.Equal(t, 3, retrieved.MaxParallel) }) - t.Run("MatrixID", func(t *testing.T) { - job := &ActionRunJob{ - RunID: 1, - RepoID: 1, - OwnerID: 1, - JobID: "test-job-3", - Name: "Matrix Job with ID", - Status: StatusWaiting, - MaxParallel: 2, - MatrixID: "os:ubuntu,node:16", - } - assert.NoError(t, db.Insert(ctx, job)) - - retrieved, err := GetRunJobByID(ctx, job.ID) - assert.NoError(t, err) - assert.Equal(t, 2, retrieved.MaxParallel) - assert.Equal(t, "os:ubuntu,node:16", retrieved.MatrixID) - }) - t.Run("UpdateMaxParallel", func(t *testing.T) { // Create ActionRun first run := &ActionRun{ @@ -124,10 +105,10 @@ func TestActionRunJob_MaxParallelEnforcement(t *testing.T) { // Create jobs simulating matrix execution jobs := []*ActionRunJob{ - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 1", Status: StatusRunning, MaxParallel: maxParallel, MatrixID: "version:1"}, - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 2", Status: StatusRunning, MaxParallel: maxParallel, MatrixID: "version:2"}, - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 3", Status: StatusWaiting, MaxParallel: maxParallel, MatrixID: "version:3"}, - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 4", Status: StatusWaiting, MaxParallel: maxParallel, MatrixID: "version:4"}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 1", Status: StatusRunning, MaxParallel: maxParallel}, + {RunID: runID, RepoID: 
1, OwnerID: 1, JobID: jobID, Name: "Job 2", Status: StatusRunning, MaxParallel: maxParallel}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 3", Status: StatusWaiting, MaxParallel: maxParallel}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 4", Status: StatusWaiting, MaxParallel: maxParallel}, } for _, job := range jobs { diff --git a/models/actions/task.go b/models/actions/task.go index e42de06e16..87bbc76ef8 100644 --- a/models/actions/task.go +++ b/models/actions/task.go @@ -539,15 +539,6 @@ func getTaskIDFromCache(token string) int64 { return t } -// CountRunningTasksByRunner counts the number of running tasks assigned to a specific runner -func CountRunningTasksByRunner(ctx context.Context, runnerID int64) (int, error) { - count, err := db.GetEngine(ctx). - Where("runner_id = ?", runnerID). - And("status = ?", StatusRunning). - Count(new(ActionTask)) - return int(count), err -} - // CountRunningJobsByWorkflowAndRun counts running jobs for a specific workflow/run combo // Used to enforce max-parallel limits on matrix jobs func CountRunningJobsByWorkflowAndRun(ctx context.Context, runID int64, jobID string) (int, error) { diff --git a/models/actions/task_count_test.go b/models/actions/task_count_test.go index 2edcf1b950..12117a87e5 100644 --- a/models/actions/task_count_test.go +++ b/models/actions/task_count_test.go @@ -13,89 +13,6 @@ import ( "github.com/stretchr/testify/assert" ) -func TestCountRunningTasksByRunner(t *testing.T) { - assert.NoError(t, unittest.PrepareTestDatabase()) - ctx := context.Background() - - t.Run("NoRunningTasks", func(t *testing.T) { - count, err := CountRunningTasksByRunner(ctx, 999999) - assert.NoError(t, err) - assert.Equal(t, 0, count) - }) - - t.Run("WithRunningTasks", func(t *testing.T) { - // Create a runner - runner := &ActionRunner{ - UUID: "test-runner-tasks", - Name: "Test Runner", - OwnerID: 0, - RepoID: 0, - TokenHash: "test_hash_tasks", - Token: "test_token_tasks", - } - 
assert.NoError(t, db.Insert(ctx, runner)) - - // Create running tasks - task1 := &ActionTask{ - JobID: 1, - RunnerID: runner.ID, - Status: StatusRunning, - RepoID: 1, - OwnerID: 1, - TokenHash: "task1_hash", - Token: "task1_token", - } - assert.NoError(t, db.Insert(ctx, task1)) - - task2 := &ActionTask{ - JobID: 2, - RunnerID: runner.ID, - Status: StatusRunning, - RepoID: 1, - OwnerID: 1, - TokenHash: "task2_hash", - Token: "task2_token", - } - assert.NoError(t, db.Insert(ctx, task2)) - - // Count should be 2 - count, err := CountRunningTasksByRunner(ctx, runner.ID) - assert.NoError(t, err) - assert.Equal(t, 2, count) - }) - - t.Run("MixedStatusTasks", func(t *testing.T) { - runner := &ActionRunner{ - UUID: "test-runner-mixed", - Name: "Mixed Status Runner", - Capacity: 5, - TokenHash: "mixed_runner_hash", - Token: "mixed_runner_token", - } - assert.NoError(t, db.Insert(ctx, runner)) - - // Create tasks with different statuses - statuses := []Status{StatusRunning, StatusSuccess, StatusRunning, StatusFailure, StatusWaiting} - for i, status := range statuses { - task := &ActionTask{ - JobID: int64(100 + i), - RunnerID: runner.ID, - Status: status, - RepoID: 1, - OwnerID: 1, - TokenHash: "mixed_task_hash_" + string(rune('a'+i)), - Token: "mixed_task_token_" + string(rune('a'+i)), - } - assert.NoError(t, db.Insert(ctx, task)) - } - - // Only 2 running tasks - count, err := CountRunningTasksByRunner(ctx, runner.ID) - assert.NoError(t, err) - assert.Equal(t, 2, count) - }) -} - func TestCountRunningJobsByWorkflowAndRun(t *testing.T) { assert.NoError(t, unittest.PrepareTestDatabase()) ctx := context.Background() diff --git a/services/actions/task_assignment_test.go b/services/actions/task_assignment_test.go index 2ff1d77c7d..d9af3777f3 100644 --- a/services/actions/task_assignment_test.go +++ b/services/actions/task_assignment_test.go @@ -14,106 +14,6 @@ import ( "github.com/stretchr/testify/assert" ) -func TestCreateTaskForRunner_CapacityEnforcement(t *testing.T) { - 
assert.NoError(t, unittest.PrepareTestDatabase()) - - t.Run("RunnerAtCapacity", func(t *testing.T) { - // Create runner with capacity 2 - runner := &actions_model.ActionRunner{ - UUID: "capacity-test-1", - Name: "Capacity Test Runner", - Capacity: 2, - TokenHash: "capacity_test_hash_1", - Token: "capacity_test_token_1", - } - assert.NoError(t, db.Insert(context.Background(), runner)) - - // Create 2 running tasks - for i := range 2 { - task := &actions_model.ActionTask{ - JobID: int64(1000 + i), - RunnerID: runner.ID, - Status: actions_model.StatusRunning, - RepoID: 1, - OwnerID: 1, - TokenHash: "task_hash_" + string(rune('1'+i)), - Token: "task_token_" + string(rune('1'+i)), - } - assert.NoError(t, db.Insert(context.Background(), task)) - } - - // Verify runner is at capacity - count, err := actions_model.CountRunningTasksByRunner(context.Background(), runner.ID) - assert.NoError(t, err) - assert.Equal(t, 2, count) - - // Try to create another task - should fail due to capacity - // Note: This would be tested in actual CreateTaskForRunner which checks capacity - // For now, verify the count - assert.Equal(t, runner.Capacity, count, "Runner should be at capacity") - }) - - t.Run("RunnerBelowCapacity", func(t *testing.T) { - runner := &actions_model.ActionRunner{ - UUID: "capacity-test-2", - Name: "Below Capacity Runner", - Capacity: 5, - TokenHash: "capacity_test_hash_2", - Token: "capacity_test_token_2", - } - assert.NoError(t, db.Insert(context.Background(), runner)) - - // Create 2 running tasks - for i := range 2 { - task := &actions_model.ActionTask{ - JobID: int64(2000 + i), - RunnerID: runner.ID, - Status: actions_model.StatusRunning, - RepoID: 1, - OwnerID: 1, - TokenHash: "task_hash_2_" + string(rune('a'+i)), - Token: "task_token_2_" + string(rune('a'+i)), - } - assert.NoError(t, db.Insert(context.Background(), task)) - } - - count, err := actions_model.CountRunningTasksByRunner(context.Background(), runner.ID) - assert.NoError(t, err) - assert.Equal(t, 2, 
count) - assert.Less(t, count, runner.Capacity, "Runner should be below capacity") - }) - - t.Run("UnlimitedCapacity", func(t *testing.T) { - runner := &actions_model.ActionRunner{ - UUID: "capacity-test-3", - Name: "Unlimited Runner", - Capacity: 0, // 0 = unlimited - TokenHash: "capacity_test_hash_3", - Token: "capacity_test_token_3", - } - assert.NoError(t, db.Insert(context.Background(), runner)) - - // Create many running tasks - for i := range 10 { - task := &actions_model.ActionTask{ - JobID: int64(3000 + i), - RunnerID: runner.ID, - Status: actions_model.StatusRunning, - RepoID: 1, - OwnerID: 1, - TokenHash: "task_hash_3_" + string(rune('a'+i)), - Token: "task_token_3_" + string(rune('a'+i)), - } - assert.NoError(t, db.Insert(context.Background(), task)) - } - - count, err := actions_model.CountRunningTasksByRunner(context.Background(), runner.ID) - assert.NoError(t, err) - assert.Equal(t, 10, count) - // With capacity 0, there's no limit - }) -} - func TestCreateTaskForRunner_MaxParallelEnforcement(t *testing.T) { assert.NoError(t, unittest.PrepareTestDatabase()) @@ -218,79 +118,3 @@ func TestCreateTaskForRunner_MaxParallelEnforcement(t *testing.T) { assert.Equal(t, 5, runningCount, "All jobs should be able to run without limit") }) } - -func TestCreateTaskForRunner_CombinedEnforcement(t *testing.T) { - assert.NoError(t, unittest.PrepareTestDatabase()) - - t.Run("BothRunnerCapacityAndMaxParallel", func(t *testing.T) { - // Create runner with capacity 3 - runner := &actions_model.ActionRunner{ - UUID: "combined-test", - Name: "Combined Test Runner", - Capacity: 3, - TokenHash: "combined_test_hash", - Token: "combined_test_token", - } - assert.NoError(t, db.Insert(context.Background(), runner)) - - runID := int64(30000) - jobID := "combined-job" - - // Create ActionRun first - run := &actions_model.ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 30000, - Status: actions_model.StatusRunning, - } - assert.NoError(t, db.Insert(context.Background(), 
run)) - - // Create jobs with max-parallel 2 - for range 5 { - job := &actions_model.ActionRunJob{ - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Combined Job", - Status: actions_model.StatusWaiting, - MaxParallel: 2, - } - assert.NoError(t, db.Insert(context.Background(), job)) - } - - // The most restrictive limit should apply - // In this case: max-parallel = 2 (more restrictive than runner capacity = 3) - jobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) - assert.NoError(t, err) - - // Simulate starting jobs - for i, job := range jobs[:2] { - job.Status = actions_model.StatusRunning - _, err := actions_model.UpdateRunJob(context.Background(), job, nil, "status") - assert.NoError(t, err) - - task := &actions_model.ActionTask{ - JobID: job.ID, - RunnerID: runner.ID, - Status: actions_model.StatusRunning, - RepoID: 1, - OwnerID: 1, - TokenHash: "combined_task_hash_" + string(rune('a'+i)), - Token: "combined_task_token_" + string(rune('a'+i)), - } - assert.NoError(t, db.Insert(context.Background(), task)) - } - - // Verify both limits - runningTasks, err := actions_model.CountRunningTasksByRunner(context.Background(), runner.ID) - assert.NoError(t, err) - assert.Equal(t, 2, runningTasks) - assert.Less(t, runningTasks, runner.Capacity, "Should be under runner capacity") - - runningJobs, err := actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 2, runningJobs, "Should respect max-parallel") - }) -} From 704800ec07b4e815756090bf3351156088e833c4 Mon Sep 17 00:00:00 2001 From: Pascal Zimmermann Date: Thu, 22 Jan 2026 21:09:17 +0100 Subject: [PATCH 04/12] fix: Adjust the unit tests --- models/actions/runner_capacity_test.go | 95 ----------- services/convert/convert.go | 1 - tests/integration/api_runner_capacity_test.go | 149 ------------------ 3 files changed, 245 deletions(-) delete mode 100644 models/actions/runner_capacity_test.go delete mode 100644 
tests/integration/api_runner_capacity_test.go diff --git a/models/actions/runner_capacity_test.go b/models/actions/runner_capacity_test.go deleted file mode 100644 index 87dc3f0e5a..0000000000 --- a/models/actions/runner_capacity_test.go +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright 2026 The Gitea Authors. All rights reserved. -// SPDX-License-Identifier: MIT - -package actions - -import ( - "context" - "testing" - - "code.gitea.io/gitea/models/db" - "code.gitea.io/gitea/models/unittest" - - "github.com/stretchr/testify/assert" -) - -func TestActionRunner_Capacity(t *testing.T) { - assert.NoError(t, unittest.PrepareTestDatabase()) - ctx := context.Background() - - t.Run("DefaultCapacity", func(t *testing.T) { - runner := &ActionRunner{ - UUID: "test-uuid-1", - Name: "test-runner", - OwnerID: 0, - RepoID: 0, - TokenHash: "hash1", - Token: "token1", - } - assert.NoError(t, db.Insert(ctx, runner)) - - // Verify in database - retrieved, err := GetRunnerByID(ctx, runner.ID) - assert.NoError(t, err) - assert.Equal(t, 0, retrieved.Capacity, "Default capacity should be 0 (unlimited)") - }) - - t.Run("CustomCapacity", func(t *testing.T) { - runner := &ActionRunner{ - UUID: "test-uuid-2", - Name: "test-runner-2", - OwnerID: 0, - RepoID: 0, - Capacity: 5, - TokenHash: "hash2", - Token: "token2", - } - assert.NoError(t, db.Insert(ctx, runner)) - - assert.Equal(t, 5, runner.Capacity) - - // Verify in database - retrieved, err := GetRunnerByID(ctx, runner.ID) - assert.NoError(t, err) - assert.Equal(t, 5, retrieved.Capacity) - }) - - t.Run("UpdateCapacity", func(t *testing.T) { - runner := &ActionRunner{ - UUID: "test-uuid-3", - Name: "test-runner-3", - OwnerID: 0, - RepoID: 0, - Capacity: 1, - TokenHash: "hash3", - Token: "token3", - } - assert.NoError(t, db.Insert(ctx, runner)) - - // Update capacity - runner.Capacity = 10 - assert.NoError(t, UpdateRunner(ctx, runner, "capacity")) - - // Verify update - retrieved, err := GetRunnerByID(ctx, runner.ID) - assert.NoError(t, err) - 
assert.Equal(t, 10, retrieved.Capacity) - }) - - t.Run("ZeroCapacity", func(t *testing.T) { - runner := &ActionRunner{ - UUID: "test-uuid-4", - Name: "test-runner-4", - OwnerID: 0, - RepoID: 0, - Capacity: 0, // Unlimited - } - assert.NoError(t, db.Insert(ctx, runner)) - - assert.Equal(t, 0, runner.Capacity) - - retrieved, err := GetRunnerByID(ctx, runner.ID) - assert.NoError(t, err) - assert.Equal(t, 0, retrieved.Capacity) - }) -} diff --git a/services/convert/convert.go b/services/convert/convert.go index 829fbdf0eb..e79cb343e4 100644 --- a/services/convert/convert.go +++ b/services/convert/convert.go @@ -524,7 +524,6 @@ func ToActionRunner(ctx context.Context, runner *actions_model.ActionRunner) *ap Disabled: runner.IsDisabled, Ephemeral: runner.Ephemeral, Labels: labels, - Capacity: runner.Capacity, } } diff --git a/tests/integration/api_runner_capacity_test.go b/tests/integration/api_runner_capacity_test.go deleted file mode 100644 index f75cecd18d..0000000000 --- a/tests/integration/api_runner_capacity_test.go +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2026 The Gitea Authors. All rights reserved. 
-// SPDX-License-Identifier: MIT - -package integration - -import ( - "context" - "fmt" - "net/http" - "testing" - - actions_model "code.gitea.io/gitea/models/actions" - auth_model "code.gitea.io/gitea/models/auth" - "code.gitea.io/gitea/models/db" - "code.gitea.io/gitea/models/unittest" - api "code.gitea.io/gitea/modules/structs" - "code.gitea.io/gitea/tests" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestAPIUpdateRunnerCapacity(t *testing.T) { - defer tests.PrepareTestEnv(t)() - - ctx := context.Background() - - // Clean up existing runners - require.NoError(t, db.DeleteAllRecords("action_runner")) - - // Create a test runner - runner := &actions_model.ActionRunner{ - UUID: "test-capacity-runner", - Name: "Test Capacity Runner", - OwnerID: 0, - RepoID: 0, - Capacity: 1, - TokenHash: "test-capacity-hash", - Token: "test-capacity-token", - } - require.NoError(t, actions_model.CreateRunner(ctx, runner)) - - // Load the created runner to get the ID - runner = unittest.AssertExistsAndLoadBean(t, &actions_model.ActionRunner{UUID: "test-capacity-runner"}) - - session := loginUser(t, "user1") - token := getTokenForLoggedInUser(t, session, auth_model.AccessTokenScopeWriteAdmin) - - t.Run("UpdateCapacity", func(t *testing.T) { - req := NewRequestWithJSON(t, "PATCH", - fmt.Sprintf("/api/v1/admin/actions/runners/%d/capacity", runner.ID), - &api.UpdateRunnerCapacityOption{Capacity: 5}). 
- AddTokenAuth(token) - - resp := MakeRequest(t, req, http.StatusOK) - - var apiRunner api.ActionRunner - DecodeJSON(t, resp, &apiRunner) - - assert.Equal(t, runner.ID, apiRunner.ID) - assert.Equal(t, 5, apiRunner.Capacity) - - // Verify in database - updated, err := actions_model.GetRunnerByID(context.Background(), runner.ID) - assert.NoError(t, err) - assert.Equal(t, 5, updated.Capacity) - }) - - t.Run("UpdateCapacityToZero", func(t *testing.T) { - req := NewRequestWithJSON(t, "PATCH", - fmt.Sprintf("/api/v1/admin/actions/runners/%d/capacity", runner.ID), - &api.UpdateRunnerCapacityOption{Capacity: 0}). - AddTokenAuth(token) - - resp := MakeRequest(t, req, http.StatusOK) - - var apiRunner api.ActionRunner - DecodeJSON(t, resp, &apiRunner) - - assert.Equal(t, 0, apiRunner.Capacity) - }) - - t.Run("InvalidCapacity", func(t *testing.T) { - req := NewRequestWithJSON(t, "PATCH", - fmt.Sprintf("/api/v1/admin/actions/runners/%d/capacity", runner.ID), - &api.UpdateRunnerCapacityOption{Capacity: -1}). - AddTokenAuth(token) - - MakeRequest(t, req, http.StatusBadRequest) - }) - - t.Run("NonExistentRunner", func(t *testing.T) { - req := NewRequestWithJSON(t, "PATCH", - "/api/v1/admin/actions/runners/999999/capacity", - &api.UpdateRunnerCapacityOption{Capacity: 5}). - AddTokenAuth(token) - - MakeRequest(t, req, http.StatusNotFound) - }) - - t.Run("GetRunnerWithCapacity", func(t *testing.T) { - // First set capacity - runner.Capacity = 7 - assert.NoError(t, actions_model.UpdateRunner(context.Background(), runner, "capacity")) - - // Get runner - req := NewRequest(t, "GET", - fmt.Sprintf("/api/v1/admin/actions/runners/%d", runner.ID)). 
- AddTokenAuth(token) - - resp := MakeRequest(t, req, http.StatusOK) - - var apiRunner api.ActionRunner - DecodeJSON(t, resp, &apiRunner) - - assert.Equal(t, runner.ID, apiRunner.ID) - assert.Equal(t, 7, apiRunner.Capacity) - }) - - t.Run("ListRunnersWithCapacity", func(t *testing.T) { - req := NewRequest(t, "GET", "/api/v1/admin/actions/runners"). - AddTokenAuth(token) - - resp := MakeRequest(t, req, http.StatusOK) - - var response api.ActionRunnersResponse - DecodeJSON(t, resp, &response) - - // Find our test runner - found := false - for _, r := range response.Entries { - if r.ID == runner.ID { - found = true - assert.Equal(t, 7, r.Capacity) - break - } - } - assert.True(t, found, "Test runner should be in list") - }) - - t.Run("UnauthorizedAccess", func(t *testing.T) { - req := NewRequestWithJSON(t, "PATCH", - fmt.Sprintf("/api/v1/admin/actions/runners/%d/capacity", runner.ID), - &api.UpdateRunnerCapacityOption{Capacity: 5}) - // No token - - MakeRequest(t, req, http.StatusUnauthorized) - }) -} From b85e78efebbfd785a9cfdace89cb88b36d656c37 Mon Sep 17 00:00:00 2001 From: Pascal Zimmermann Date: Fri, 30 Jan 2026 08:13:02 +0100 Subject: [PATCH 05/12] feat: Update the db migration and the add a debug message about wrong parsed values --- services/actions/run.go | 4 +- services/actions/task_assignment_test.go | 70 +++++++++++++++++++++++- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/services/actions/run.go b/services/actions/run.go index 2a796ca2f9..bec95fe7fc 100644 --- a/services/actions/run.go +++ b/services/actions/run.go @@ -10,6 +10,7 @@ import ( actions_model "code.gitea.io/gitea/models/actions" "code.gitea.io/gitea/models/db" + "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/actions/jobparser" "code.gitea.io/gitea/modules/util" notify_service "code.gitea.io/gitea/services/notify" @@ -134,11 +135,12 @@ func InsertRun(ctx context.Context, run *actions_model.ActionRun, jobs []*jobpar runJob.TokenPermissions = perms } - // 
Extract max-parallel from strategy if present if job.Strategy.MaxParallelString != "" { if maxParallel, err := strconv.Atoi(job.Strategy.MaxParallelString); err == nil && maxParallel > 0 { runJob.MaxParallel = maxParallel + } else { + log.Debug("failed to process max-parallel for job %s: invalid value %v: %v", id, job.Strategy.MaxParallelString, err) } } diff --git a/services/actions/task_assignment_test.go b/services/actions/task_assignment_test.go index d9af3777f3..0da280185b 100644 --- a/services/actions/task_assignment_test.go +++ b/services/actions/task_assignment_test.go @@ -14,7 +14,7 @@ import ( "github.com/stretchr/testify/assert" ) -func TestCreateTaskForRunner_MaxParallelEnforcement(t *testing.T) { +func TestMaxParallelJobStatusAndCounting(t *testing.T) { assert.NoError(t, unittest.PrepareTestDatabase()) t.Run("MaxParallelReached", func(t *testing.T) { @@ -117,4 +117,72 @@ func TestCreateTaskForRunner_MaxParallelEnforcement(t *testing.T) { assert.NoError(t, err) assert.Equal(t, 5, runningCount, "All jobs should be able to run without limit") }) + + t.Run("MaxParallelWrongValue", func(t *testing.T) { + runID := int64(30000) + jobID := "wrong-value-use-default-value-job" + + // Create ActionRun first + run := &actions_model.ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 30000, + Status: actions_model.StatusRunning, + } + assert.NoError(t, db.Insert(context.Background(), run)) + + // Test different invalid max-parallel values + testCases := []struct { + name string + maxParallel int + description string + }{ + { + name: "negative value", + maxParallel: -1, + description: "Negative max-parallel should default to 0 (no limit)", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + + // Create jobs with the test max-parallel value + for i := range 5 { + job := &actions_model.ActionRunJob{ + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Test Job " + tc.name, + Status: actions_model.StatusWaiting, + 
MaxParallel: tc.maxParallel, + } + assert.NoError(t, db.Insert(context.Background(), job)) + + // Verify the value was stored + if i == 0 { + storedJob, err := actions_model.GetRunJobByID(context.Background(), job.ID) + assert.NoError(t, err) + assert.Equal(t, tc.maxParallel, storedJob.MaxParallel, tc.description) + } + } + + // All jobs can run simultaneously when max-parallel <= 0 + jobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) + assert.NoError(t, err) + + for _, job := range jobs { + job.Status = actions_model.StatusRunning + _, err := actions_model.UpdateRunJob(context.Background(), job, nil, "status") + assert.NoError(t, err) + } + + runningCount, err := actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) + assert.NoError(t, err) + assert.GreaterOrEqual(t, runningCount, 5, "All jobs should be able to run when max-parallel is "+tc.name) + }) + } + }) } From 2a4558da379b8842603c09e9e038074f526490bf Mon Sep 17 00:00:00 2001 From: Pascal Zimmermann Date: Fri, 30 Jan 2026 08:27:04 +0100 Subject: [PATCH 06/12] docs: Add a comment about the implementation details and a race condition fix: Adjust the lint issues fix: Add the copyright info Signed-off-by: Pascal Zimmermann --- models/actions/task.go | 6 ++++++ services/actions/task_assignment_test.go | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/models/actions/task.go b/models/actions/task.go index 87bbc76ef8..34c61b3ffb 100644 --- a/models/actions/task.go +++ b/models/actions/task.go @@ -265,6 +265,12 @@ func CreateTaskForRunner(ctx context.Context, runner *ActionRunner) (*ActionTask } // Check max-parallel constraint for matrix jobs + // Note: This is a best-effort check with a small race window. Multiple runners + // could simultaneously check the count before any commits, potentially allowing + // the limit to be briefly exceeded (e.g., if limit=2 and count=1, two runners + // might both proceed). 
Perfect enforcement would require row-level locking across + // all jobs with the same run_id+job_id, which has a significant performance impact. + // The race window is small since jobs are picked and committed quickly. if v.MaxParallel > 0 { runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, v.RunID, v.JobID) if err != nil { diff --git a/services/actions/task_assignment_test.go b/services/actions/task_assignment_test.go index 0da280185b..4ceb7d3d46 100644 --- a/services/actions/task_assignment_test.go +++ b/services/actions/task_assignment_test.go @@ -147,7 +147,6 @@ func TestMaxParallelJobStatusAndCounting(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - // Create jobs with the test max-parallel value for i := range 5 { job := &actions_model.ActionRunJob{ From 63be2b1103745b25b1730322c20d8500204249bd Mon Sep 17 00:00:00 2001 From: Pascal Zimmermann Date: Sat, 7 Feb 2026 19:40:11 +0100 Subject: [PATCH 07/12] feat: Update the conditions of the actions to continue the tasks executions Signed-off-by: Pascal Zimmermann --- models/actions/max_parallel_simple_test.go | 240 ++++++++++++++++++ models/actions/run_job_maxparallel_test.go | 112 ++++++++ models/actions/task.go | 5 +- .../actions/task_sequential_execution_test.go | 212 ++++++++++++++++ services/actions/job_emitter.go | 59 ++++- services/actions/task_assignment_test.go | 138 ++++++++++ 6 files changed, 761 insertions(+), 5 deletions(-) create mode 100644 models/actions/max_parallel_simple_test.go create mode 100644 models/actions/task_sequential_execution_test.go diff --git a/models/actions/max_parallel_simple_test.go b/models/actions/max_parallel_simple_test.go new file mode 100644 index 0000000000..ac75036b6a --- /dev/null +++ b/models/actions/max_parallel_simple_test.go @@ -0,0 +1,240 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package actions + +import ( + "context" + "testing" + + "code.gitea.io/gitea/models/db" + "code.gitea.io/gitea/models/unittest" + + "github.com/stretchr/testify/assert" +) + +// TestMaxParallelOne_SimpleSequence tests the core issue: max-parallel=1 execution +func TestMaxParallelOne_SimpleSequence(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + ctx := context.Background() + + runID := int64(11000) + jobID := "simple-sequence-job" + maxParallel := 1 + + // Create ActionRun + run := &ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 11000, + Status: StatusRunning, + } + assert.NoError(t, db.Insert(ctx, run)) + + // Create jobs + job1 := &ActionRunJob{ + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Job 1", + Status: StatusWaiting, + MaxParallel: maxParallel, + } + job2 := &ActionRunJob{ + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Job 2", + Status: StatusWaiting, + MaxParallel: maxParallel, + } + job3 := &ActionRunJob{ + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Job 3", + Status: StatusWaiting, + MaxParallel: maxParallel, + } + + assert.NoError(t, db.Insert(ctx, job1)) + assert.NoError(t, db.Insert(ctx, job2)) + assert.NoError(t, db.Insert(ctx, job3)) + + // TEST 1: Initially, 0 running + running, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 0, running, "Should have 0 jobs running initially") + + // TEST 2: Job1 starts + job1.Status = StatusRunning + _, err = UpdateRunJob(ctx, job1, nil) + assert.NoError(t, err) + + running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, running, "Should have 1 job running after job1 starts") + + // TEST 3: Job1 completes + job1.Status = StatusSuccess + _, err = UpdateRunJob(ctx, job1, nil) + assert.NoError(t, err) + + running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + 
assert.NoError(t, err) + assert.Equal(t, 0, running, "Should have 0 jobs running after job1 completes - THIS IS THE CRITICAL TEST") + + // TEST 4: Job2 should now be able to start + job2.Status = StatusRunning + _, err = UpdateRunJob(ctx, job2, nil) + assert.NoError(t, err) + + running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, running, "Should have 1 job running after job2 starts - IF THIS FAILS, THE BUG IS NOT FIXED") + + // TEST 5: Job2 completes + job2.Status = StatusSuccess + _, err = UpdateRunJob(ctx, job2, nil) + assert.NoError(t, err) + + running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 0, running, "Should have 0 jobs running after job2 completes") + + // TEST 6: Job3 starts + job3.Status = StatusRunning + _, err = UpdateRunJob(ctx, job3, nil) + assert.NoError(t, err) + + running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, running, "Should have 1 job running after job3 starts") + + t.Log("✅ All sequential execution tests passed!") +} + +// TestMaxParallelOne_FreshJobFetch tests the fresh job fetch mechanism +func TestMaxParallelOne_FreshJobFetch(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + ctx := context.Background() + + runID := int64(12000) + jobID := "fresh-fetch-job" + + // Create ActionRun + run := &ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 12000, + Status: StatusRunning, + } + assert.NoError(t, db.Insert(ctx, run)) + + // Create a job + job := &ActionRunJob{ + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Fresh Fetch Test Job", + Status: StatusWaiting, + } + assert.NoError(t, db.Insert(ctx, job)) + + // Fetch fresh copy (simulating CreateTaskForRunner behavior) + freshJob, err := GetRunJobByID(ctx, job.ID) + assert.NoError(t, err) + assert.NotNil(t, freshJob) + assert.Equal(t, StatusWaiting, freshJob.Status, 
"Fresh job should have WAITING status") + assert.Equal(t, int64(0), freshJob.TaskID, "Fresh job should have TaskID=0") + + // Update original job to RUNNING + job.Status = StatusRunning + _, err = UpdateRunJob(ctx, job, nil) + assert.NoError(t, err) + + // Fetch fresh copy again - should reflect the update + freshJob2, err := GetRunJobByID(ctx, job.ID) + assert.NoError(t, err) + assert.NotNil(t, freshJob2) + assert.Equal(t, StatusRunning, freshJob2.Status, "Fresh job should now have RUNNING status") + + t.Log("✅ Fresh job fetch mechanism works correctly!") +} + +// TestCountRunningJobs tests the CountRunningJobsByWorkflowAndRun function +func TestCountRunningJobs(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + ctx := context.Background() + + runID := int64(13000) + jobID := "count-jobs" + + // Create ActionRun + run := &ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 13000, + Status: StatusRunning, + } + assert.NoError(t, db.Insert(ctx, run)) + + // Create 5 jobs + for i := 1; i <= 5; i++ { + job := &ActionRunJob{ + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Job " + string(rune(i)), + Status: StatusWaiting, + } + assert.NoError(t, db.Insert(ctx, job)) + } + + // Get all jobs + jobs, err := GetRunJobsByRunID(ctx, runID) + assert.NoError(t, err) + assert.Len(t, jobs, 5) + + // Initially 0 running + running, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 0, running) + + // Set job 0 to RUNNING + jobs[0].Status = StatusRunning + _, err = UpdateRunJob(ctx, jobs[0], nil) + assert.NoError(t, err) + + running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, running) + + // Set job 1 to RUNNING + jobs[1].Status = StatusRunning + _, err = UpdateRunJob(ctx, jobs[1], nil) + assert.NoError(t, err) + + running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 2, 
running) + + // Set job 0 to SUCCESS (not counted as RUNNING anymore) + jobs[0].Status = StatusSuccess + _, err = UpdateRunJob(ctx, jobs[0], nil) + assert.NoError(t, err) + + running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, running) + + t.Log("✅ Count running jobs works correctly!") +} diff --git a/models/actions/run_job_maxparallel_test.go b/models/actions/run_job_maxparallel_test.go index 275a70fc37..badc0cce36 100644 --- a/models/actions/run_job_maxparallel_test.go +++ b/models/actions/run_job_maxparallel_test.go @@ -140,4 +140,116 @@ func TestActionRunJob_MaxParallelEnforcement(t *testing.T) { assert.NoError(t, err) assert.Equal(t, maxParallel, runningCount) }) + + t.Run("MaxParallelOne_SequentialExecution", func(t *testing.T) { + runID := int64(6000) + jobID := "sequential-job" + maxParallel := 1 + + // Create ActionRun first + run := &ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 6000, + Status: StatusRunning, + } + assert.NoError(t, db.Insert(ctx, run)) + + // Create jobs simulating sequential execution with max-parallel=1 + jobs := []*ActionRunJob{ + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 1", Status: StatusRunning, MaxParallel: maxParallel}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 2", Status: StatusWaiting, MaxParallel: maxParallel}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 3", Status: StatusWaiting, MaxParallel: maxParallel}, + } + + for _, job := range jobs { + assert.NoError(t, db.Insert(ctx, job)) + } + + // Verify initial running count is 1 + runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, runningCount, "Should have exactly 1 job running with max-parallel=1") + + // Complete first job + jobs[0].Status = StatusSuccess + _, err = UpdateRunJob(ctx, jobs[0], nil, "status") + assert.NoError(t, err) + + // Now running count should be 0 + 
runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 0, runningCount, "Should have 0 jobs running after first job completes") + + // Second job can now start + jobs[1].Status = StatusRunning + _, err = UpdateRunJob(ctx, jobs[1], nil, "status") + assert.NoError(t, err) + + // Running count should be 1 again + runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, runningCount, "Should have exactly 1 job running after second job starts") + + // Complete second job + jobs[1].Status = StatusSuccess + _, err = UpdateRunJob(ctx, jobs[1], nil, "status") + assert.NoError(t, err) + + // Third job can now start + jobs[2].Status = StatusRunning + _, err = UpdateRunJob(ctx, jobs[2], nil, "status") + assert.NoError(t, err) + + // Running count should still be 1 + runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, runningCount, "Should have exactly 1 job running after third job starts") + }) + + t.Run("MaxParallelOne_WithFailure", func(t *testing.T) { + runID := int64(7000) + jobID := "sequential-with-failure-job" + maxParallel := 1 + + // Create ActionRun first + run := &ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 7000, + Status: StatusRunning, + } + assert.NoError(t, db.Insert(ctx, run)) + + // Create jobs + jobs := []*ActionRunJob{ + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 1", Status: StatusRunning, MaxParallel: maxParallel}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 2", Status: StatusWaiting, MaxParallel: maxParallel}, + } + + for _, job := range jobs { + assert.NoError(t, db.Insert(ctx, job)) + } + + // First job fails + jobs[0].Status = StatusFailure + _, err := UpdateRunJob(ctx, jobs[0], nil, "status") + assert.NoError(t, err) + + // Verify no jobs are running + runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, 
runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 0, runningCount, "Should have 0 jobs running after job fails") + + // Second job can still start + jobs[1].Status = StatusRunning + _, err = UpdateRunJob(ctx, jobs[1], nil, "status") + assert.NoError(t, err) + + runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, runningCount, "Should have exactly 1 job running after job starts") + }) } diff --git a/models/actions/task.go b/models/actions/task.go index 34c61b3ffb..49bdfaba54 100644 --- a/models/actions/task.go +++ b/models/actions/task.go @@ -346,9 +346,12 @@ func CreateTaskForRunner(ctx context.Context, runner *ActionRunner) (*ActionTask } job.TaskID = task.ID - if n, err := UpdateRunJob(ctx, job, builder.Eq{"task_id": 0}); err != nil { + // Must explicitly specify which columns to update, including status and started + if n, err := UpdateRunJob(ctx, job, builder.Eq{"task_id": 0}, "task_id", "status", "started", "attempt", "updated"); err != nil { return nil, false, err } else if n != 1 { + // Another runner may have claimed this job, skip it + log.Debug("Job %s (run %d) was claimed by another runner, skipping", job.JobID, job.RunID) return nil, false, nil } diff --git a/models/actions/task_sequential_execution_test.go b/models/actions/task_sequential_execution_test.go new file mode 100644 index 0000000000..017c8437ad --- /dev/null +++ b/models/actions/task_sequential_execution_test.go @@ -0,0 +1,212 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package actions + +import ( + "context" + "testing" + + "code.gitea.io/gitea/models/db" + "code.gitea.io/gitea/models/unittest" + + "github.com/stretchr/testify/assert" +) + +// TestTaskCreation_MaxParallel_One tests that tasks are properly sequenced +// when max-parallel=1, ensuring no hang after task completion +func TestTaskCreation_MaxParallel_One(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + ctx := context.Background() + + t.Run("SequentialTaskCreationMaxParallelOne", func(t *testing.T) { + runID := int64(8000) + jobID := "task-sequential-max-parallel-one" + maxParallel := 1 + + // Setup: Create ActionRun + run := &ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 8000, + Status: StatusRunning, + } + assert.NoError(t, db.Insert(ctx, run)) + + // Create runner + runner := &ActionRunner{ + ID: 1, + UUID: "test-runner-1", + Name: "Test Runner", + OwnerID: 1, + } + assert.NoError(t, db.Insert(ctx, runner)) + + // Create jobs with max-parallel=1 + jobs := []*ActionRunJob{ + { + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Sequential Job 1", + Status: StatusWaiting, + MaxParallel: maxParallel, + }, + { + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Sequential Job 2", + Status: StatusWaiting, + MaxParallel: maxParallel, + }, + { + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Sequential Job 3", + Status: StatusWaiting, + MaxParallel: maxParallel, + }, + } + + for _, job := range jobs { + assert.NoError(t, db.Insert(ctx, job)) + } + + // Verify initial state: all jobs are waiting + allJobs, err := GetRunJobsByRunID(ctx, runID) + assert.NoError(t, err) + assert.Len(t, allJobs, 3) + + // Verify that only 1 job should be able to run at a time with max-parallel=1 + runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 0, runningCount, "Should have 0 jobs running initially") + + // 
Simulate starting first job + allJobs[0].Status = StatusRunning + _, err = UpdateRunJob(ctx, allJobs[0], nil) + assert.NoError(t, err) + + runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, runningCount, "Should have exactly 1 job running with max-parallel=1") + + // Complete first job + allJobs[0].Status = StatusSuccess + _, err = UpdateRunJob(ctx, allJobs[0], nil) + assert.NoError(t, err) + + runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 0, runningCount, "Should have 0 jobs running after first job completes") + + // This is the critical test: the second job should be able to start + // Previously, the system might hang here + allJobs[1].Status = StatusRunning + _, err = UpdateRunJob(ctx, allJobs[1], nil) + assert.NoError(t, err) + + runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, runningCount, "Should have exactly 1 job running after second job starts (critical test)") + + // Complete the second job + allJobs[1].Status = StatusSuccess + _, err = UpdateRunJob(ctx, allJobs[1], nil) + assert.NoError(t, err) + + runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 0, runningCount, "Should have 0 jobs running after second job completes") + + // Third job should also be able to start + allJobs[2].Status = StatusRunning + _, err = UpdateRunJob(ctx, allJobs[2], nil) + assert.NoError(t, err) + + runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, runningCount, "Should have exactly 1 job running for third job") + }) + + t.Run("MaxParallelConstraintAfterTaskFetch", func(t *testing.T) { + runID := int64(9000) + jobID := "max-parallel-fetch-job" + maxParallel := 1 + + // Setup: Create ActionRun + run := &ActionRun{ + ID: runID, + RepoID: 1, + 
OwnerID: 1, + Index: 9000, + Status: StatusRunning, + } + assert.NoError(t, db.Insert(ctx, run)) + + // Create jobs + jobs := []*ActionRunJob{ + { + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Job A", + Status: StatusWaiting, + MaxParallel: maxParallel, + }, + { + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Job B", + Status: StatusWaiting, + MaxParallel: maxParallel, + }, + } + + for _, job := range jobs { + assert.NoError(t, db.Insert(ctx, job)) + } + + // Refresh jobs to get IDs + allJobs, err := GetRunJobsByRunID(ctx, runID) + assert.NoError(t, err) + + // Start first job + allJobs[0].Status = StatusRunning + _, err = UpdateRunJob(ctx, allJobs[0], nil) + assert.NoError(t, err) + + // Verify constraint + runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, runningCount) + + // Try to start second job while first is still running + // This should not be allowed due to max-parallel=1 + freshAllJobs, err := GetRunJobsByRunID(ctx, runID) + assert.NoError(t, err) + + // Check if we can determine from the count that the second job should not start + for i := 1; i < len(freshAllJobs); i++ { + if freshAllJobs[i].Status == StatusWaiting { + // Before starting this job, verify max-parallel constraint + runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) + assert.NoError(t, err) + if runningCount >= maxParallel { + // This job should wait + assert.Equal(t, StatusWaiting, freshAllJobs[i].Status, + "Job should remain waiting when max-parallel limit is reached") + } + } + } + }) +} diff --git a/services/actions/job_emitter.go b/services/actions/job_emitter.go index ed6e2fe5aa..8c1e5fc422 100644 --- a/services/actions/job_emitter.go +++ b/services/actions/job_emitter.go @@ -222,16 +222,67 @@ func checkJobsOfRun(ctx context.Context, run *actions_model.ActionRun) (jobs, up for _, job := range jobs { if status, ok := updates[job.ID]; ok { oldStatus := 
job.Status + // IMPORTANT: Even if status hasn't changed, we must reset task_id for WAITING jobs + // after a previous task has completed (max-parallel constraint is released) + if status == actions_model.StatusWaiting && job.TaskID != 0 { + // This job was running but is now ready to be reassigned + job.Status = status + job.TaskID = 0 + if _, err := actions_model.UpdateRunJob(ctx, job, nil, "status", "task_id"); err != nil { + return fmt.Errorf("reset task_id for job %d: %w", job.ID, err) + } + updatedJobs = append(updatedJobs, job) + continue + } + + if job.Status == status { + // Status hasn't changed, skip + continue + } + job.Status = status - if n, err := actions_model.UpdateRunJob(ctx, job, builder.Eq{"status": actions_model.StatusBlocked}, "status"); err != nil { - return err - } else if n != 1 { - return fmt.Errorf("no affected for updating blocked job %v", job.ID) + + // For jobs transitioning to WAITING status (can happen after max-parallel constraint is released), + // we need to reset task_id to 0 so a new runner can pick them up + if status == actions_model.StatusWaiting { + job.TaskID = 0 + if _, err := actions_model.UpdateRunJob(ctx, job, nil, "status", "task_id"); err != nil { + return fmt.Errorf("reset task_id for job %d: %w", job.ID, err) + } + } else { + // For other status changes (BLOCKED, RUNNING, etc.), only update status + if _, err := actions_model.UpdateRunJob(ctx, job, builder.Eq{"status": actions_model.StatusWaiting}, "status"); err != nil { + return fmt.Errorf("update status for job %d: %w", job.ID, err) + } } log.Info("Job %d (JobID: %s) status updated: %s -> %s", job.ID, job.JobID, oldStatus, status) updatedJobs = append(updatedJobs, job) } } + + // CRITICAL FIX for max-parallel: Even if no jobs were updated above (because they already have task_id=0), + // we must notify runners to poll for new tasks when a job completes. 
+ // This handles the max-parallel=1 case where jobs were never started (task_id=0) + // and won't be processed by jobStatusResolver (which only handles BLOCKED jobs). + if len(jobs) > 0 && jobs[0].OwnerID > 0 && jobs[0].RepoID > 0 { + // Check if there are any WAITING jobs with task_id=0 (ready to be picked up) + hasWaitingJobs := false + for _, job := range jobs { + if job.Status == actions_model.StatusWaiting && job.TaskID == 0 { + hasWaitingJobs = true + break + } + } + if hasWaitingJobs { + // Notify runners to poll for new tasks + if err := actions_model.IncreaseTaskVersion(ctx, jobs[0].OwnerID, jobs[0].RepoID); err != nil { + log.Error("Failed to increase task version for repo %d: %v", jobs[0].RepoID, err) + } else { + log.Debug("Increased task version for repo %d (max-parallel waiting jobs ready)", jobs[0].RepoID) + } + } + } + return nil }); err != nil { return nil, nil, err diff --git a/services/actions/task_assignment_test.go b/services/actions/task_assignment_test.go index 4ceb7d3d46..b1bb82fcd2 100644 --- a/services/actions/task_assignment_test.go +++ b/services/actions/task_assignment_test.go @@ -184,4 +184,142 @@ func TestMaxParallelJobStatusAndCounting(t *testing.T) { }) } }) + + t.Run("MaxParallelOne_TaskCreation", func(t *testing.T) { + runID := int64(40000) + jobID := "task-creation-sequential-job" + maxParallel := 1 + + // Create ActionRun first + run := &actions_model.ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 40000, + Status: actions_model.StatusRunning, + } + assert.NoError(t, db.Insert(context.Background(), run)) + + // Create waiting jobs with max-parallel=1 + for i := range 3 { + job := &actions_model.ActionRunJob{ + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Sequential Job " + string(rune(i+1)), + Status: actions_model.StatusWaiting, + MaxParallel: maxParallel, + } + assert.NoError(t, db.Insert(context.Background(), job)) + } + + // Simulate the first job starting + jobs, err := 
actions_model.GetRunJobsByRunID(context.Background(), runID) + assert.NoError(t, err) + assert.Len(t, jobs, 3) + + jobs[0].Status = actions_model.StatusRunning + _, err = actions_model.UpdateRunJob(context.Background(), jobs[0], nil, "status") + assert.NoError(t, err) + + // Verify only one is running + runningCount, err := actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, runningCount, "Should have exactly 1 job running with max-parallel=1") + + // Complete first job + jobs[0].Status = actions_model.StatusSuccess + _, err = actions_model.UpdateRunJob(context.Background(), jobs[0], nil, "status") + assert.NoError(t, err) + + // Verify no jobs are running now + runningCount, err = actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 0, runningCount, "Should have 0 jobs running after completion with max-parallel=1") + + // Second job can now start + jobs[1].Status = actions_model.StatusRunning + _, err = actions_model.UpdateRunJob(context.Background(), jobs[1], nil, "status") + assert.NoError(t, err) + + runningCount, err = actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, runningCount, "Should have exactly 1 job running for second task") + + // Complete second job + jobs[1].Status = actions_model.StatusSuccess + _, err = actions_model.UpdateRunJob(context.Background(), jobs[1], nil, "status") + assert.NoError(t, err) + + // Third job can now start + jobs[2].Status = actions_model.StatusRunning + _, err = actions_model.UpdateRunJob(context.Background(), jobs[2], nil, "status") + assert.NoError(t, err) + + runningCount, err = actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, runningCount, "Should have exactly 1 job running for third task") + }) + + 
t.Run("MaxParallelTwo_CompletionAndStart", func(t *testing.T) { + runID := int64(50000) + jobID := "completion-start-job" + maxParallel := 2 + + // Create ActionRun first + run := &actions_model.ActionRun{ + ID: runID, + RepoID: 1, + OwnerID: 1, + Index: 50000, + Status: actions_model.StatusRunning, + } + assert.NoError(t, db.Insert(context.Background(), run)) + + // Create jobs with max-parallel=2 + for i := range 5 { + job := &actions_model.ActionRunJob{ + RunID: runID, + RepoID: 1, + OwnerID: 1, + JobID: jobID, + Name: "Parallel Job " + string(rune(i+1)), + Status: actions_model.StatusWaiting, + MaxParallel: maxParallel, + } + if i < 2 { + job.Status = actions_model.StatusRunning + } + assert.NoError(t, db.Insert(context.Background(), job)) + } + + // Verify 2 jobs are running + runningCount, err := actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 2, runningCount) + + // Complete one job + jobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) + assert.NoError(t, err) + + jobs[0].Status = actions_model.StatusSuccess + _, err = actions_model.UpdateRunJob(context.Background(), jobs[0], nil, "status") + assert.NoError(t, err) + + // Should have 1 running + runningCount, err = actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 1, runningCount, "Should have 1 job running after one completes") + + // Now another job can start + jobs[2].Status = actions_model.StatusRunning + _, err = actions_model.UpdateRunJob(context.Background(), jobs[2], nil, "status") + assert.NoError(t, err) + + // Back to 2 running + runningCount, err = actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) + assert.NoError(t, err) + assert.Equal(t, 2, runningCount, "Should have 2 jobs running after new job starts") + }) } From 5d83d729b4205535cee3ad375ce36f82539e3e29 Mon Sep 17 00:00:00 2001 From: 
Pascal Zimmermann Date: Sun, 29 Mar 2026 16:27:54 +0200 Subject: [PATCH 08/12] Revert "feat: Add support for dynamic matrix strategies with job outputs" This reverts commit e6a0b07713097f71d6330ceeb51a60b63d6b1d4c. --- routers/web/web.go | 3 - services/actions/job_emitter.go | 142 +--------- services/actions/matrix.go | 268 ------------------ services/actions/matrix_metrics.go | 173 ----------- services/actions/matrix_metrics_prometheus.go | 191 ------------- services/actions/matrix_metrics_test.go | 82 ------ tests/integration/actions_job_test.go | 94 ------ 7 files changed, 7 insertions(+), 946 deletions(-) delete mode 100644 services/actions/matrix.go delete mode 100644 services/actions/matrix_metrics.go delete mode 100644 services/actions/matrix_metrics_prometheus.go delete mode 100644 services/actions/matrix_metrics_test.go diff --git a/routers/web/web.go b/routers/web/web.go index 40da3b13f4..e3dcf27cc4 100644 --- a/routers/web/web.go +++ b/routers/web/web.go @@ -41,7 +41,6 @@ import ( "code.gitea.io/gitea/routers/web/user" user_setting "code.gitea.io/gitea/routers/web/user/setting" "code.gitea.io/gitea/routers/web/user/setting/security" - actions_service "code.gitea.io/gitea/services/actions" auth_service "code.gitea.io/gitea/services/auth" "code.gitea.io/gitea/services/context" "code.gitea.io/gitea/services/forms" @@ -287,8 +286,6 @@ func Routes() *web.Router { if setting.Metrics.Enabled { prometheus.MustRegister(metrics.NewCollector()) - // Register matrix re-evaluation metrics - prometheus.MustRegister(actions_service.NewMatrixMetricsCollector()) routes.Get("/metrics", append(mid, Metrics)...) 
} diff --git a/services/actions/job_emitter.go b/services/actions/job_emitter.go index 8c1e5fc422..20a4f81eab 100644 --- a/services/actions/job_emitter.go +++ b/services/actions/job_emitter.go @@ -7,7 +7,6 @@ import ( "context" "errors" "fmt" - "time" actions_model "code.gitea.io/gitea/models/actions" "code.gitea.io/gitea/models/db" @@ -203,9 +202,6 @@ func checkJobsOfRun(ctx context.Context, run *actions_model.ActionRun) (jobs, up if err != nil { return nil, nil, err } - - log.Debug("Checking %d jobs for run %d (status: %s)", len(jobs), run.ID, run.Status) - vars, err := actions_model.GetVariablesOfRun(ctx, run) if err != nil { return nil, nil, err @@ -217,91 +213,22 @@ func checkJobsOfRun(ctx context.Context, run *actions_model.ActionRun) (jobs, up } updates := newJobStatusResolver(jobs, vars).Resolve(ctx) - log.Debug("Job status resolver returned %d job status updates for run %d", len(updates), run.ID) - for _, job := range jobs { if status, ok := updates[job.ID]; ok { - oldStatus := job.Status - // IMPORTANT: Even if status hasn't changed, we must reset task_id for WAITING jobs - // after a previous task has completed (max-parallel constraint is released) - if status == actions_model.StatusWaiting && job.TaskID != 0 { - // This job was running but is now ready to be reassigned - job.Status = status - job.TaskID = 0 - if _, err := actions_model.UpdateRunJob(ctx, job, nil, "status", "task_id"); err != nil { - return fmt.Errorf("reset task_id for job %d: %w", job.ID, err) - } - updatedJobs = append(updatedJobs, job) - continue - } - - if job.Status == status { - // Status hasn't changed, skip - continue - } - job.Status = status - - // For jobs transitioning to WAITING status (can happen after max-parallel constraint is released), - // we need to reset task_id to 0 so a new runner can pick them up - if status == actions_model.StatusWaiting { - job.TaskID = 0 - if _, err := actions_model.UpdateRunJob(ctx, job, nil, "status", "task_id"); err != nil { - return 
fmt.Errorf("reset task_id for job %d: %w", job.ID, err) - } - } else { - // For other status changes (BLOCKED, RUNNING, etc.), only update status - if _, err := actions_model.UpdateRunJob(ctx, job, builder.Eq{"status": actions_model.StatusWaiting}, "status"); err != nil { - return fmt.Errorf("update status for job %d: %w", job.ID, err) - } + if n, err := actions_model.UpdateRunJob(ctx, job, builder.Eq{"status": actions_model.StatusBlocked}, "status"); err != nil { + return err + } else if n != 1 { + return fmt.Errorf("no affected for updating blocked job %v", job.ID) } - log.Info("Job %d (JobID: %s) status updated: %s -> %s", job.ID, job.JobID, oldStatus, status) updatedJobs = append(updatedJobs, job) } } - - // CRITICAL FIX for max-parallel: Even if no jobs were updated above (because they already have task_id=0), - // we must notify runners to poll for new tasks when a job completes. - // This handles the max-parallel=1 case where jobs were never started (task_id=0) - // and won't be processed by jobStatusResolver (which only handles BLOCKED jobs). 
- if len(jobs) > 0 && jobs[0].OwnerID > 0 && jobs[0].RepoID > 0 { - // Check if there are any WAITING jobs with task_id=0 (ready to be picked up) - hasWaitingJobs := false - for _, job := range jobs { - if job.Status == actions_model.StatusWaiting && job.TaskID == 0 { - hasWaitingJobs = true - break - } - } - if hasWaitingJobs { - // Notify runners to poll for new tasks - if err := actions_model.IncreaseTaskVersion(ctx, jobs[0].OwnerID, jobs[0].RepoID); err != nil { - log.Error("Failed to increase task version for repo %d: %v", jobs[0].RepoID, err) - } else { - log.Debug("Increased task version for repo %d (max-parallel waiting jobs ready)", jobs[0].RepoID) - } - } - } - return nil }); err != nil { return nil, nil, err } - // Reload jobs from the database to pick up any newly created matrix jobs - oldJobCount := len(jobs) - jobs, err = db.Find[actions_model.ActionRunJob](ctx, actions_model.FindRunJobOptions{RunID: run.ID}) - if err != nil { - return nil, nil, err - } - - if len(jobs) > oldJobCount { - log.Info("Matrix re-evaluation created %d new jobs for run %d (was %d, now %d)", - len(jobs)-oldJobCount, run.ID, oldJobCount, len(jobs)) - } - - log.Debug("Job check completed for run %d: %d jobs updated, %d total jobs", run.ID, len(updatedJobs), len(jobs)) - return jobs, updatedJobs, nil } @@ -386,102 +313,47 @@ func (r *jobStatusResolver) resolveJobHasIfCondition(actionRunJob *actions_model func (r *jobStatusResolver) resolve(ctx context.Context) map[int64]actions_model.Status { ret := map[int64]actions_model.Status{} - resolveMetrics := struct { - totalBlocked int - matrixReevaluated int - concurrencyUpdated int - jobsStarted int - jobsSkipped int - }{} - for id, status := range r.statuses { actionRunJob := r.jobMap[id] if status != actions_model.StatusBlocked { continue } - - resolveMetrics.totalBlocked++ - log.Debug("Resolving blocked job %d (JobID: %s, RunID: %d)", id, actionRunJob.JobID, actionRunJob.RunID) - allDone, allSucceed := r.resolveCheckNeeds(id) if 
!allDone { - log.Debug("Job %d: not all dependencies completed yet", id) continue } - log.Debug("Job %d: all dependencies completed (allSucceed: %v), checking matrix re-evaluation", id, allSucceed) - - // Try to re-evaluate the matrix with job outputs if it depends on them - startTime := time.Now() - newMatrixJobs, err := ReEvaluateMatrixForJobWithNeeds(ctx, actionRunJob, r.vars) - duration := time.Since(startTime).Milliseconds() - - if err != nil { - log.Error("Matrix re-evaluation error for job %d (JobID: %s): %v (duration: %dms)", id, actionRunJob.JobID, err, duration) - continue - } - - // If new matrix jobs were created, add them to the resolver and continue - if len(newMatrixJobs) > 0 { - resolveMetrics.matrixReevaluated++ - log.Info("Matrix re-evaluation succeeded for job %d (JobID: %s): created %d new jobs (duration: %dms)", - id, actionRunJob.JobID, len(newMatrixJobs), duration) - // The new jobs will be picked up in the next resolution iteration - continue - } - - log.Debug("Job %d: no matrix re-evaluation needed or result is empty", id) - // update concurrency and check whether the job can run now - err = updateConcurrencyEvaluationForJobWithNeeds(ctx, actionRunJob, r.vars) + err := updateConcurrencyEvaluationForJobWithNeeds(ctx, actionRunJob, r.vars) if err != nil { // The err can be caused by different cases: database error, or syntax error, or the needed jobs haven't completed // At the moment there is no way to distinguish them. 
// Actually, for most cases, the error is caused by "syntax error" / "the needed jobs haven't completed (skipped?)" // TODO: if workflow or concurrency expression has syntax error, there should be a user error message, need to show it to end users - log.Debug("Concurrency evaluation failed for job %d (JobID: %s): %v (job will stay blocked)", id, actionRunJob.JobID, err) + log.Debug("updateConcurrencyEvaluationForJobWithNeeds failed, this job will stay blocked: job: %d, err: %v", id, err) continue } - resolveMetrics.concurrencyUpdated++ - shouldStartJob := true if !allSucceed { // Not all dependent jobs completed successfully: // * if the job has "if" condition, it can be started, then the act_runner will evaluate the "if" condition. // * otherwise, the job should be skipped. shouldStartJob = r.resolveJobHasIfCondition(actionRunJob) - log.Debug("Job %d: not all dependencies succeeded. Has if-condition: %v, should start: %v", id, shouldStartJob, shouldStartJob) } newStatus := util.Iif(shouldStartJob, actions_model.StatusWaiting, actions_model.StatusSkipped) if newStatus == actions_model.StatusWaiting { newStatus, err = PrepareToStartJobWithConcurrency(ctx, actionRunJob) if err != nil { - log.Error("Concurrency check failed for job %d (JobID: %s): %v (job will stay blocked)", id, actionRunJob.JobID, err) + log.Error("ShouldBlockJobByConcurrency failed, this job will stay blocked: job: %d, err: %v", id, err) } } if newStatus != actions_model.StatusBlocked { ret[id] = newStatus - switch newStatus { - case actions_model.StatusWaiting: - resolveMetrics.jobsStarted++ - log.Info("Job %d (JobID: %s) transitioned to StatusWaiting", id, actionRunJob.JobID) - case actions_model.StatusSkipped: - resolveMetrics.jobsSkipped++ - log.Info("Job %d (JobID: %s) transitioned to StatusSkipped", id, actionRunJob.JobID) - } } } - - // Log resolution metrics summary - if resolveMetrics.totalBlocked > 0 { - log.Debug("Job resolution summary: total_blocked=%d, matrix_reevaluated=%d, 
concurrency_updated=%d, jobs_started=%d, jobs_skipped=%d", - resolveMetrics.totalBlocked, resolveMetrics.matrixReevaluated, resolveMetrics.concurrencyUpdated, - resolveMetrics.jobsStarted, resolveMetrics.jobsSkipped) - } - return ret } diff --git a/services/actions/matrix.go b/services/actions/matrix.go deleted file mode 100644 index 10541f214f..0000000000 --- a/services/actions/matrix.go +++ /dev/null @@ -1,268 +0,0 @@ -// Copyright 2025 The Gitea Authors. All rights reserved. -// SPDX-License-Identifier: MIT - -package actions - -import ( - "context" - "fmt" - "maps" - "strings" - "time" - - actions_model "code.gitea.io/gitea/models/actions" - "code.gitea.io/gitea/modules/log" - - "github.com/nektos/act/pkg/jobparser" - "gopkg.in/yaml.v3" -) - -// ExtractRawStrategies extracts strategy definitions from the raw workflow content -// Returns a map of jobID to strategy YAML for jobs that have matrix dependencies -func ExtractRawStrategies(content []byte) (map[string]string, error) { - var workflowDef struct { - Jobs map[string]struct { - Strategy any `yaml:"strategy"` - Needs any `yaml:"needs"` - } `yaml:"jobs"` - } - - if err := yaml.Unmarshal(content, &workflowDef); err != nil { - return nil, err - } - - strategies := make(map[string]string) - for jobID, jobDef := range workflowDef.Jobs { - if jobDef.Strategy == nil { - continue - } - - // Check if this job has needs (dependencies) - var needsList []string - switch needs := jobDef.Needs.(type) { - case string: - needsList = append(needsList, needs) - case []any: - for _, need := range needs { - if needStr, ok := need.(string); ok { - needsList = append(needsList, needStr) - } - } - } - - // Only store strategy for jobs with dependencies - if len(needsList) > 0 { - if strategyBytes, err := yaml.Marshal(jobDef.Strategy); err == nil { - strategies[jobID] = string(strategyBytes) - } - } - } - - return strategies, nil -} - -// hasMatrixWithNeeds checks if a job's strategy contains a matrix that depends on job outputs 
-func hasMatrixWithNeeds(rawStrategy string) bool { - if rawStrategy == "" { - return false - } - - var strategy map[string]any - if err := yaml.Unmarshal([]byte(rawStrategy), &strategy); err != nil { - return false - } - - matrix, ok := strategy["matrix"] - if !ok { - return false - } - - // Check if any matrix value contains "needs." reference - matrixStr := fmt.Sprintf("%v", matrix) - return strings.Contains(matrixStr, "needs.") -} - -// ReEvaluateMatrixForJobWithNeeds re-evaluates the matrix strategy of a job using outputs from dependent jobs -// If the matrix depends on job outputs and all dependent jobs are done, it will: -// 1. Evaluate the matrix with the job outputs -// 2. Create new ActionRunJobs for each matrix combination -// 3. Return the newly created jobs -func ReEvaluateMatrixForJobWithNeeds(ctx context.Context, job *actions_model.ActionRunJob, vars map[string]string) ([]*actions_model.ActionRunJob, error) { - startTime := time.Now() - - if job.IsMatrixEvaluated || job.RawStrategy == "" { - return nil, nil - } - - if !hasMatrixWithNeeds(job.RawStrategy) { - // Mark as evaluated since there's no needs-dependent matrix - job.IsMatrixEvaluated = true - log.Debug("Matrix re-evaluation skipped for job %d: no needs-dependent matrix found", job.ID) - return nil, nil - } - - log.Debug("Starting matrix re-evaluation for job %d (JobID: %s)", job.ID, job.JobID) - - // Get the outputs from dependent jobs - taskNeeds, err := FindTaskNeeds(ctx, job) - if err != nil { - errMsg := fmt.Sprintf("failed to find task needs for job %d (JobID: %s): %v", job.ID, job.JobID, err) - log.Error("Matrix re-evaluation error: %s", errMsg) - return nil, fmt.Errorf("find task needs: %w", err) - } - - log.Debug("Found %d task needs for job %d (JobID: %s)", len(taskNeeds), job.ID, job.JobID) - - // If any task needs are not done, we can't evaluate yet - pendingNeeds := []string{} - for jobID, taskNeed := range taskNeeds { - if !taskNeed.Result.IsDone() { - pendingNeeds = 
append(pendingNeeds, fmt.Sprintf("%s(%s)", jobID, taskNeed.Result)) - } - } - if len(pendingNeeds) > 0 { - log.Debug("Matrix re-evaluation deferred for job %d: pending needs: %v", job.ID, pendingNeeds) - GetMatrixMetrics().RecordDeferred() - return nil, nil - } - - // Merge vars with needs outputs - mergedVars := mergeNeedsIntoVars(vars, taskNeeds) - log.Debug("Merged %d variables with needs outputs for job %d", len(mergedVars), job.ID) - - // Load the original run to get workflow context - if job.Run == nil { - if err := job.LoadRun(ctx); err != nil { - errMsg := fmt.Sprintf("failed to load run for job %d (JobID: %s): %v", job.ID, job.JobID, err) - log.Error("Matrix re-evaluation error: %s", errMsg) - return nil, fmt.Errorf("load run: %w", err) - } - } - - // Create the giteaCtx for expression evaluation - giteaCtx := GenerateGiteaContext(job.Run, job) - - // Parse the job payload with merged vars to expand the matrix - // Note: job.WorkflowPayload already contains just this job's definition - parseStartTime := time.Now() - jobs, err := jobparser.Parse( - job.WorkflowPayload, - jobparser.WithVars(mergedVars), - jobparser.WithGitContext(giteaCtx.ToGitHubContext()), - ) - parseTime := time.Since(parseStartTime) - GetMatrixMetrics().RecordParseTime(parseTime) - - if err != nil { - // If parsing fails, we can't expand the matrix - // Mark as evaluated and skip - job.IsMatrixEvaluated = true - errMsg := fmt.Sprintf("failed to parse workflow payload for job %d (JobID: %s) during matrix expansion. Error: %v. 
RawStrategy: %s", - job.ID, job.JobID, err, job.RawStrategy) - log.Error("Matrix parse error: %s", errMsg) - GetMatrixMetrics().RecordReevaluation(time.Since(startTime), false, 0) - return nil, nil - } - - if len(jobs) == 0 { - job.IsMatrixEvaluated = true - log.Debug("No jobs generated from matrix expansion for job %d (JobID: %s)", job.ID, job.JobID) - return nil, nil - } - - log.Debug("Parsed %d matrix combinations for job %d (JobID: %s)", len(jobs), job.ID, job.JobID) - - // Create new ActionRunJobs for each parsed workflow (each matrix combination) - newJobs := make([]*actions_model.ActionRunJob, 0) - - for i, parsedSingleWorkflow := range jobs { - id, jobDef := parsedSingleWorkflow.Job() - if jobDef == nil { - log.Warn("Skipped nil jobDef at index %d for job %d (JobID: %s)", i, job.ID, job.JobID) - continue - } - - // Skip the original job ID - we only want the matrix-expanded versions - if id == job.JobID { - log.Debug("Skipped original job ID %s in matrix expansion for job %d", id, job.ID) - continue - } - - // Erase needs from the payload before storing - needs := jobDef.Needs() - if err := parsedSingleWorkflow.SetJob(id, jobDef.EraseNeeds()); err != nil { - log.Error("Failed to erase needs from job %s (matrix expansion for job %d): %v", id, job.ID, err) - continue - } - - payload, _ := parsedSingleWorkflow.Marshal() - - newJob := &actions_model.ActionRunJob{ - RunID: job.RunID, - RepoID: job.RepoID, - OwnerID: job.OwnerID, - CommitSHA: job.CommitSHA, - IsForkPullRequest: job.IsForkPullRequest, - Name: jobDef.Name, - WorkflowPayload: payload, - JobID: id, - Needs: needs, - RunsOn: jobDef.RunsOn(), - Status: actions_model.StatusBlocked, - } - - newJobs = append(newJobs, newJob) - } - - // If no new jobs were created, mark as evaluated - if len(newJobs) == 0 { - job.IsMatrixEvaluated = true - log.Warn("No valid jobs created from matrix expansion for job %d (JobID: %s). 
Original jobs: %d", job.ID, job.JobID, len(jobs)) - return nil, nil - } - - // Insert the new jobs into database - insertStartTime := time.Now() - if err := actions_model.InsertActionRunJobs(ctx, newJobs); err != nil { - insertTime := time.Since(insertStartTime) - GetMatrixMetrics().RecordInsertTime(insertTime) - errMsg := fmt.Sprintf("failed to insert %d new matrix jobs for job %d (JobID: %s): %v", len(newJobs), job.ID, job.JobID, err) - log.Error("Matrix insertion error: %s", errMsg) - GetMatrixMetrics().RecordReevaluation(time.Since(startTime), false, 0) - return nil, fmt.Errorf("insert new jobs: %w", err) - } - insertTime := time.Since(insertStartTime) - GetMatrixMetrics().RecordInsertTime(insertTime) - - // Mark the original job as evaluated - job.IsMatrixEvaluated = true - if _, err := actions_model.UpdateRunJob(ctx, job, nil, "is_matrix_evaluated"); err != nil { - log.Error("Failed to update job %d is_matrix_evaluated flag: %v", job.ID, err) - } - - totalTime := time.Since(startTime) - GetMatrixMetrics().RecordReevaluation(totalTime, true, int64(len(newJobs))) - - log.Info("Successfully completed matrix re-evaluation for job %d (JobID: %s): created %d new jobs from %d matrix combinations (total: %dms, parse: %dms, insert: %dms)", - job.ID, job.JobID, len(newJobs), len(jobs), totalTime.Milliseconds(), parseTime.Milliseconds(), insertTime.Milliseconds()) - - return newJobs, nil -} - -// mergeNeedsIntoVars converts task needs outputs into variables for expression evaluation -func mergeNeedsIntoVars(baseVars map[string]string, taskNeeds map[string]*TaskNeed) map[string]string { - merged := make(map[string]string) - - // Copy base vars - maps.Copy(merged, baseVars) - - // Add needs outputs as variables in format: needs..outputs. 
- for jobID, taskNeed := range taskNeeds { - for outputKey, outputValue := range taskNeed.Outputs { - key := fmt.Sprintf("needs.%s.outputs.%s", jobID, outputKey) - merged[key] = outputValue - } - } - - return merged -} diff --git a/services/actions/matrix_metrics.go b/services/actions/matrix_metrics.go deleted file mode 100644 index cc232c0b6c..0000000000 --- a/services/actions/matrix_metrics.go +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright 2025 The Gitea Authors. All rights reserved. -// SPDX-License-Identifier: MIT - -package actions - -import ( - "sync" - "time" -) - -// MatrixMetrics tracks performance metrics for matrix re-evaluation operations -type MatrixMetrics struct { - mu sync.RWMutex - - // Counters - TotalReevaluations int64 - SuccessfulReevaluations int64 - FailedReevaluations int64 - JobsCreatedTotal int64 - DeferredReevaluations int64 - - // Timing - TotalReevaluationTime time.Duration - TotalParseTime time.Duration - TotalInsertTime time.Duration - - // Histograms (for detailed analysis) - ReevaluationTimes []time.Duration - ParseTimes []time.Duration - InsertTimes []time.Duration -} - -var ( - matrixMetricsInstance *MatrixMetrics - metricsMutex sync.Mutex -) - -// GetMatrixMetrics returns the global matrix metrics instance -func GetMatrixMetrics() *MatrixMetrics { - if matrixMetricsInstance == nil { - metricsMutex.Lock() - if matrixMetricsInstance == nil { - matrixMetricsInstance = &MatrixMetrics{ - ReevaluationTimes: make([]time.Duration, 0, 1000), - ParseTimes: make([]time.Duration, 0, 1000), - InsertTimes: make([]time.Duration, 0, 1000), - } - } - metricsMutex.Unlock() - } - return matrixMetricsInstance -} - -// RecordReevaluation records a matrix re-evaluation attempt -func (m *MatrixMetrics) RecordReevaluation(duration time.Duration, success bool, jobsCreated int64) { - m.mu.Lock() - defer m.mu.Unlock() - - m.TotalReevaluations++ - m.TotalReevaluationTime += duration - - if success { - m.SuccessfulReevaluations++ - m.JobsCreatedTotal += 
jobsCreated - } else { - m.FailedReevaluations++ - } - - // Keep a limited history for detailed analysis (keep last 1000) - if len(m.ReevaluationTimes) < 1000 { - m.ReevaluationTimes = append(m.ReevaluationTimes, duration) - } else { - // Shift and add new value - copy(m.ReevaluationTimes, m.ReevaluationTimes[1:]) - m.ReevaluationTimes[len(m.ReevaluationTimes)-1] = duration - } -} - -// RecordDeferred records a deferred matrix re-evaluation -func (m *MatrixMetrics) RecordDeferred() { - m.mu.Lock() - defer m.mu.Unlock() - m.DeferredReevaluations++ -} - -// RecordParseTime records the time taken to parse a workflow -func (m *MatrixMetrics) RecordParseTime(duration time.Duration) { - m.mu.Lock() - defer m.mu.Unlock() - - m.TotalParseTime += duration - - if len(m.ParseTimes) < 1000 { - m.ParseTimes = append(m.ParseTimes, duration) - } else { - copy(m.ParseTimes, m.ParseTimes[1:]) - m.ParseTimes[len(m.ParseTimes)-1] = duration - } -} - -// RecordInsertTime records the time taken to insert matrix jobs -func (m *MatrixMetrics) RecordInsertTime(duration time.Duration) { - m.mu.Lock() - defer m.mu.Unlock() - - m.TotalInsertTime += duration - - if len(m.InsertTimes) < 1000 { - m.InsertTimes = append(m.InsertTimes, duration) - } else { - copy(m.InsertTimes, m.InsertTimes[1:]) - m.InsertTimes[len(m.InsertTimes)-1] = duration - } -} - -// GetStats returns a snapshot of the current metrics -func (m *MatrixMetrics) GetStats() map[string]any { - m.mu.RLock() - defer m.mu.RUnlock() - - avgReevaluationTime := time.Duration(0) - if m.TotalReevaluations > 0 { - avgReevaluationTime = m.TotalReevaluationTime / time.Duration(m.TotalReevaluations) - } - - avgParseTime := time.Duration(0) - if len(m.ParseTimes) > 0 { - avgParseTime = m.TotalParseTime / time.Duration(len(m.ParseTimes)) - } - - avgInsertTime := time.Duration(0) - if len(m.InsertTimes) > 0 { - avgInsertTime = m.TotalInsertTime / time.Duration(len(m.InsertTimes)) - } - - successRate := 0.0 - if m.TotalReevaluations > 0 { - 
successRate = float64(m.SuccessfulReevaluations) / float64(m.TotalReevaluations) * 100 - } - - return map[string]any{ - "total_reevaluations": m.TotalReevaluations, - "successful_reevaluations": m.SuccessfulReevaluations, - "failed_reevaluations": m.FailedReevaluations, - "deferred_reevaluations": m.DeferredReevaluations, - "success_rate_percent": successRate, - "total_jobs_created": m.JobsCreatedTotal, - "total_reevaluation_time_ms": m.TotalReevaluationTime.Milliseconds(), - "avg_reevaluation_time_ms": avgReevaluationTime.Milliseconds(), - "total_parse_time_ms": m.TotalParseTime.Milliseconds(), - "avg_parse_time_ms": avgParseTime.Milliseconds(), - "total_insert_time_ms": m.TotalInsertTime.Milliseconds(), - "avg_insert_time_ms": avgInsertTime.Milliseconds(), - } -} - -// Reset clears all metrics -func (m *MatrixMetrics) Reset() { - m.mu.Lock() - defer m.mu.Unlock() - - m.TotalReevaluations = 0 - m.SuccessfulReevaluations = 0 - m.FailedReevaluations = 0 - m.JobsCreatedTotal = 0 - m.DeferredReevaluations = 0 - m.TotalReevaluationTime = 0 - m.TotalParseTime = 0 - m.TotalInsertTime = 0 - m.ReevaluationTimes = m.ReevaluationTimes[:0] - m.ParseTimes = m.ParseTimes[:0] - m.InsertTimes = m.InsertTimes[:0] -} diff --git a/services/actions/matrix_metrics_prometheus.go b/services/actions/matrix_metrics_prometheus.go deleted file mode 100644 index e780e8f13f..0000000000 --- a/services/actions/matrix_metrics_prometheus.go +++ /dev/null @@ -1,191 +0,0 @@ -// Copyright 2025 The Gitea Authors. All rights reserved. 
-// SPDX-License-Identifier: MIT - -package actions - -import ( - "github.com/prometheus/client_golang/prometheus" -) - -// MatrixMetricsCollector implements the prometheus.Collector interface -// and exposes matrix re-evaluation metrics for prometheus -type MatrixMetricsCollector struct { - // Counters - totalReevaluations prometheus.Gauge - successfulReevaluations prometheus.Gauge - failedReevaluations prometheus.Gauge - deferredReevaluations prometheus.Gauge - jobsCreatedTotal prometheus.Gauge - - // Timing (in milliseconds) - totalReevaluationTime prometheus.Gauge - avgReevaluationTime prometheus.Gauge - totalParseTime prometheus.Gauge - avgParseTime prometheus.Gauge - totalInsertTime prometheus.Gauge - avgInsertTime prometheus.Gauge - - // Rates - successRate prometheus.Gauge -} - -const ( - namespace = "gitea_" - subsystem = "matrix_" -) - -// NewMatrixMetricsCollector creates a new MatrixMetricsCollector -func NewMatrixMetricsCollector() *MatrixMetricsCollector { - return &MatrixMetricsCollector{ - totalReevaluations: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "total_reevaluations", - Help: "Total number of matrix re-evaluation attempts", - }, - ), - successfulReevaluations: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "successful_reevaluations", - Help: "Number of successful matrix re-evaluations", - }, - ), - failedReevaluations: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "failed_reevaluations", - Help: "Number of failed matrix re-evaluations", - }, - ), - deferredReevaluations: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "deferred_reevaluations", - Help: "Number of deferred matrix re-evaluations (waiting for dependencies)", - }, - ), - jobsCreatedTotal: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - 
Subsystem: subsystem, - Name: "jobs_created_total", - Help: "Total number of jobs created from matrix expansion", - }, - ), - totalReevaluationTime: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "total_reevaluation_time_ms", - Help: "Total time spent on matrix re-evaluations in milliseconds", - }, - ), - avgReevaluationTime: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "avg_reevaluation_time_ms", - Help: "Average time per matrix re-evaluation in milliseconds", - }, - ), - totalParseTime: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "total_parse_time_ms", - Help: "Total time spent parsing workflow payloads in milliseconds", - }, - ), - avgParseTime: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "avg_parse_time_ms", - Help: "Average time per workflow parse in milliseconds", - }, - ), - totalInsertTime: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "total_insert_time_ms", - Help: "Total time spent inserting jobs into database in milliseconds", - }, - ), - avgInsertTime: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "avg_insert_time_ms", - Help: "Average time per database insert in milliseconds", - }, - ), - successRate: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "success_rate_percent", - Help: "Success rate of matrix re-evaluations as percentage (0-100)", - }, - ), - } -} - -// Describe returns the metrics descriptions -func (c *MatrixMetricsCollector) Describe(ch chan<- *prometheus.Desc) { - c.totalReevaluations.Describe(ch) - c.successfulReevaluations.Describe(ch) - c.failedReevaluations.Describe(ch) - c.deferredReevaluations.Describe(ch) - c.jobsCreatedTotal.Describe(ch) - 
c.totalReevaluationTime.Describe(ch) - c.avgReevaluationTime.Describe(ch) - c.totalParseTime.Describe(ch) - c.avgParseTime.Describe(ch) - c.totalInsertTime.Describe(ch) - c.avgInsertTime.Describe(ch) - c.successRate.Describe(ch) -} - -// Collect collects the current metric values and sends them to the channel -func (c *MatrixMetricsCollector) Collect(ch chan<- prometheus.Metric) { - metrics := GetMatrixMetrics() - stats := metrics.GetStats() - - // Set counter values - c.totalReevaluations.Set(float64(stats["total_reevaluations"].(int64))) - c.successfulReevaluations.Set(float64(stats["successful_reevaluations"].(int64))) - c.failedReevaluations.Set(float64(stats["failed_reevaluations"].(int64))) - c.deferredReevaluations.Set(float64(stats["deferred_reevaluations"].(int64))) - c.jobsCreatedTotal.Set(float64(stats["total_jobs_created"].(int64))) - - // Set timing values (already in milliseconds) - c.totalReevaluationTime.Set(float64(stats["total_reevaluation_time_ms"].(int64))) - c.avgReevaluationTime.Set(float64(stats["avg_reevaluation_time_ms"].(int64))) - c.totalParseTime.Set(float64(stats["total_parse_time_ms"].(int64))) - c.avgParseTime.Set(float64(stats["avg_parse_time_ms"].(int64))) - c.totalInsertTime.Set(float64(stats["total_insert_time_ms"].(int64))) - c.avgInsertTime.Set(float64(stats["avg_insert_time_ms"].(int64))) - - // Set success rate - c.successRate.Set(stats["success_rate_percent"].(float64)) - - // Collect all metrics - c.totalReevaluations.Collect(ch) - c.successfulReevaluations.Collect(ch) - c.failedReevaluations.Collect(ch) - c.deferredReevaluations.Collect(ch) - c.jobsCreatedTotal.Collect(ch) - c.totalReevaluationTime.Collect(ch) - c.avgReevaluationTime.Collect(ch) - c.totalParseTime.Collect(ch) - c.avgParseTime.Collect(ch) - c.totalInsertTime.Collect(ch) - c.avgInsertTime.Collect(ch) - c.successRate.Collect(ch) -} diff --git a/services/actions/matrix_metrics_test.go b/services/actions/matrix_metrics_test.go deleted file mode 100644 index 
54fdf3c0f3..0000000000 --- a/services/actions/matrix_metrics_test.go +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2025 The Gitea Authors. All rights reserved. -// SPDX-License-Identifier: MIT - -package actions - -import ( - "testing" - "time" - - "github.com/prometheus/client_golang/prometheus" - "github.com/stretchr/testify/assert" -) - -// Essential Prometheus Collector Tests - -func TestNewMatrixMetricsCollector(t *testing.T) { - collector := NewMatrixMetricsCollector() - assert.NotNil(t, collector) - assert.NotNil(t, collector.totalReevaluations) - assert.NotNil(t, collector.successRate) -} - -func TestMatrixMetricsCollectorDescribe(t *testing.T) { - collector := NewMatrixMetricsCollector() - ch := make(chan *prometheus.Desc, 100) - collector.Describe(ch) - assert.NotEmpty(t, ch) -} - -func TestMatrixMetricsCollectorCollect(t *testing.T) { - matrixMetricsInstance = nil - metrics := GetMatrixMetrics() - metrics.RecordReevaluation(10*time.Millisecond, true, 5) - metrics.RecordParseTime(8 * time.Millisecond) - - collector := NewMatrixMetricsCollector() - ch := make(chan prometheus.Metric, 100) - collector.Collect(ch) - assert.NotEmpty(t, ch) - - matrixMetricsInstance = nil -} - -func TestMatrixMetricsGetStats(t *testing.T) { - metrics := &MatrixMetrics{ - ReevaluationTimes: make([]time.Duration, 0, 1000), - ParseTimes: make([]time.Duration, 0, 1000), - InsertTimes: make([]time.Duration, 0, 1000), - } - - metrics.RecordReevaluation(10*time.Millisecond, true, 3) - metrics.RecordReevaluation(15*time.Millisecond, true, 2) - metrics.RecordReevaluation(5*time.Millisecond, false, 0) - - stats := metrics.GetStats() - assert.Equal(t, int64(3), stats["total_reevaluations"]) - assert.Equal(t, int64(2), stats["successful_reevaluations"]) - assert.Equal(t, int64(1), stats["failed_reevaluations"]) - assert.Greater(t, stats["success_rate_percent"].(float64), 60.0) -} - -func BenchmarkMatrixMetricsCollectorCollect(b *testing.B) { - metrics := &MatrixMetrics{ - ReevaluationTimes: 
make([]time.Duration, 0, 1000), - ParseTimes: make([]time.Duration, 0, 1000), - InsertTimes: make([]time.Duration, 0, 1000), - } - matrixMetricsInstance = metrics - - for range 100 { - metrics.RecordReevaluation(10*time.Millisecond, true, 5) - metrics.RecordParseTime(5 * time.Millisecond) - } - - collector := NewMatrixMetricsCollector() - ch := make(chan prometheus.Metric, 100) - - b.ResetTimer() - for b.Loop() { - collector.Collect(ch) - } -} diff --git a/tests/integration/actions_job_test.go b/tests/integration/actions_job_test.go index d3294fdd11..3da290f1d3 100644 --- a/tests/integration/actions_job_test.go +++ b/tests/integration/actions_job_test.go @@ -759,97 +759,3 @@ func getTaskJobNameByTaskID(t *testing.T, authToken, ownerName, repoName string, } return "" } - -func TestDynamicMatrixFromJobOutputs(t *testing.T) { - testCases := []struct { - treePath string - fileContent string - outcomes map[string]*mockTaskOutcome - }{ - { - treePath: ".gitea/workflows/dynamic-matrix.yml", - fileContent: `name: Dynamic Matrix from Job Outputs -on: - push: - paths: - - '.gitea/workflows/dynamic-matrix.yml' -jobs: - generate: - runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.gen_matrix.outputs.matrix }} - steps: - - name: Generate matrix - id: gen_matrix - run: | - echo "matrix=[1,2,3]" >> "$GITHUB_OUTPUT" - - build: - needs: [generate] - runs-on: ubuntu-latest - strategy: - matrix: - version: ${{ fromJson(needs.generate.outputs.matrix) }} - steps: - - run: echo "Building version ${{ matrix.version }}" -`, - outcomes: map[string]*mockTaskOutcome{ - "generate": { - result: runnerv1.Result_RESULT_SUCCESS, - outputs: map[string]string{ - "matrix": "[1,2,3]", - }, - }, - "build (1)": { - result: runnerv1.Result_RESULT_SUCCESS, - }, - "build (2)": { - result: runnerv1.Result_RESULT_SUCCESS, - }, - "build (3)": { - result: runnerv1.Result_RESULT_SUCCESS, - }, - }, - }, - } - onGiteaRun(t, func(t *testing.T, u *url.URL) { - user2 := unittest.AssertExistsAndLoadBean(t, 
&user_model.User{ID: 2}) - session := loginUser(t, user2.Name) - token := getTokenForLoggedInUser(t, session, auth_model.AccessTokenScopeWriteRepository, auth_model.AccessTokenScopeWriteUser) - - apiRepo := createActionsTestRepo(t, token, "actions-dynamic-matrix", false) - runner := newMockRunner() - runner.registerAsRepoRunner(t, user2.Name, apiRepo.Name, "mock-runner", []string{"ubuntu-latest"}, false) - - for _, tc := range testCases { - t.Run("test "+tc.treePath, func(t *testing.T) { - opts := getWorkflowCreateFileOptions(user2, apiRepo.DefaultBranch, "create "+tc.treePath, tc.fileContent) - createWorkflowFile(t, token, user2.Name, apiRepo.Name, tc.treePath, opts) - - // Execute the generate job first - task := runner.fetchTask(t) - jobName := getTaskJobNameByTaskID(t, token, user2.Name, apiRepo.Name, task.Id) - assert.Equal(t, "generate", jobName) - outcome := tc.outcomes[jobName] - assert.NotNil(t, outcome) - runner.execTask(t, task, outcome) - - // Now the build job should be created with matrix expansion from the output - // We expect 3 tasks for build (1), build (2), build (3) - buildTasks := make([]int64, 0) - for range 3 { - buildTask := runner.fetchTask(t) - buildJobName := getTaskJobNameByTaskID(t, token, user2.Name, apiRepo.Name, buildTask.Id) - t.Logf("Fetched task: %s", buildJobName) - assert.Contains(t, []string{"build (1)", "build (2)", "build (3)"}, buildJobName, "Expected a build job with matrix index") - outcome := tc.outcomes[buildJobName] - assert.NotNil(t, outcome) - runner.execTask(t, buildTask, outcome) - buildTasks = append(buildTasks, buildTask.Id) - } - - assert.Len(t, len(buildTasks), 3, "Expected 3 build tasks from dynamic matrix") - }) - } - }) -} From 5096d6c94262d60218ec1c10639f85cc80300ac5 Mon Sep 17 00:00:00 2001 From: Pascal Zimmermann Date: Tue, 31 Mar 2026 22:35:37 +0200 Subject: [PATCH 09/12] feat: Use the Gitea queue functionality instead of the DB --- models/actions/max_parallel_simple_test.go | 240 ------------ 
models/actions/run_job.go | 9 +- models/actions/run_job_maxparallel_test.go | 259 ++----------- models/actions/task.go | 32 +- models/actions/task_count_test.go | 123 ------ .../actions/task_sequential_execution_test.go | 212 ----------- services/actions/job_emitter.go | 21 +- services/actions/job_emitter_test.go | 51 ++- services/actions/run.go | 15 +- services/actions/task_assignment_test.go | 356 ++++-------------- 10 files changed, 191 insertions(+), 1127 deletions(-) delete mode 100644 models/actions/max_parallel_simple_test.go delete mode 100644 models/actions/task_count_test.go delete mode 100644 models/actions/task_sequential_execution_test.go diff --git a/models/actions/max_parallel_simple_test.go b/models/actions/max_parallel_simple_test.go deleted file mode 100644 index ac75036b6a..0000000000 --- a/models/actions/max_parallel_simple_test.go +++ /dev/null @@ -1,240 +0,0 @@ -// Copyright 2026 The Gitea Authors. All rights reserved. -// SPDX-License-Identifier: MIT - -package actions - -import ( - "context" - "testing" - - "code.gitea.io/gitea/models/db" - "code.gitea.io/gitea/models/unittest" - - "github.com/stretchr/testify/assert" -) - -// TestMaxParallelOne_SimpleSequence tests the core issue: max-parallel=1 execution -func TestMaxParallelOne_SimpleSequence(t *testing.T) { - assert.NoError(t, unittest.PrepareTestDatabase()) - ctx := context.Background() - - runID := int64(11000) - jobID := "simple-sequence-job" - maxParallel := 1 - - // Create ActionRun - run := &ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 11000, - Status: StatusRunning, - } - assert.NoError(t, db.Insert(ctx, run)) - - // Create jobs - job1 := &ActionRunJob{ - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Job 1", - Status: StatusWaiting, - MaxParallel: maxParallel, - } - job2 := &ActionRunJob{ - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Job 2", - Status: StatusWaiting, - MaxParallel: maxParallel, - } - job3 := &ActionRunJob{ - 
RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Job 3", - Status: StatusWaiting, - MaxParallel: maxParallel, - } - - assert.NoError(t, db.Insert(ctx, job1)) - assert.NoError(t, db.Insert(ctx, job2)) - assert.NoError(t, db.Insert(ctx, job3)) - - // TEST 1: Initially, 0 running - running, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 0, running, "Should have 0 jobs running initially") - - // TEST 2: Job1 starts - job1.Status = StatusRunning - _, err = UpdateRunJob(ctx, job1, nil) - assert.NoError(t, err) - - running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, running, "Should have 1 job running after job1 starts") - - // TEST 3: Job1 completes - job1.Status = StatusSuccess - _, err = UpdateRunJob(ctx, job1, nil) - assert.NoError(t, err) - - running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 0, running, "Should have 0 jobs running after job1 completes - THIS IS THE CRITICAL TEST") - - // TEST 4: Job2 should now be able to start - job2.Status = StatusRunning - _, err = UpdateRunJob(ctx, job2, nil) - assert.NoError(t, err) - - running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, running, "Should have 1 job running after job2 starts - IF THIS FAILS, THE BUG IS NOT FIXED") - - // TEST 5: Job2 completes - job2.Status = StatusSuccess - _, err = UpdateRunJob(ctx, job2, nil) - assert.NoError(t, err) - - running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 0, running, "Should have 0 jobs running after job2 completes") - - // TEST 6: Job3 starts - job3.Status = StatusRunning - _, err = UpdateRunJob(ctx, job3, nil) - assert.NoError(t, err) - - running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, running, "Should 
have 1 job running after job3 starts") - - t.Log("✅ All sequential execution tests passed!") -} - -// TestMaxParallelOne_FreshJobFetch tests the fresh job fetch mechanism -func TestMaxParallelOne_FreshJobFetch(t *testing.T) { - assert.NoError(t, unittest.PrepareTestDatabase()) - ctx := context.Background() - - runID := int64(12000) - jobID := "fresh-fetch-job" - - // Create ActionRun - run := &ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 12000, - Status: StatusRunning, - } - assert.NoError(t, db.Insert(ctx, run)) - - // Create a job - job := &ActionRunJob{ - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Fresh Fetch Test Job", - Status: StatusWaiting, - } - assert.NoError(t, db.Insert(ctx, job)) - - // Fetch fresh copy (simulating CreateTaskForRunner behavior) - freshJob, err := GetRunJobByID(ctx, job.ID) - assert.NoError(t, err) - assert.NotNil(t, freshJob) - assert.Equal(t, StatusWaiting, freshJob.Status, "Fresh job should have WAITING status") - assert.Equal(t, int64(0), freshJob.TaskID, "Fresh job should have TaskID=0") - - // Update original job to RUNNING - job.Status = StatusRunning - _, err = UpdateRunJob(ctx, job, nil) - assert.NoError(t, err) - - // Fetch fresh copy again - should reflect the update - freshJob2, err := GetRunJobByID(ctx, job.ID) - assert.NoError(t, err) - assert.NotNil(t, freshJob2) - assert.Equal(t, StatusRunning, freshJob2.Status, "Fresh job should now have RUNNING status") - - t.Log("✅ Fresh job fetch mechanism works correctly!") -} - -// TestCountRunningJobs tests the CountRunningJobsByWorkflowAndRun function -func TestCountRunningJobs(t *testing.T) { - assert.NoError(t, unittest.PrepareTestDatabase()) - ctx := context.Background() - - runID := int64(13000) - jobID := "count-jobs" - - // Create ActionRun - run := &ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 13000, - Status: StatusRunning, - } - assert.NoError(t, db.Insert(ctx, run)) - - // Create 5 jobs - for i := 1; i <= 5; i++ { - 
job := &ActionRunJob{ - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Job " + string(rune(i)), - Status: StatusWaiting, - } - assert.NoError(t, db.Insert(ctx, job)) - } - - // Get all jobs - jobs, err := GetRunJobsByRunID(ctx, runID) - assert.NoError(t, err) - assert.Len(t, jobs, 5) - - // Initially 0 running - running, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 0, running) - - // Set job 0 to RUNNING - jobs[0].Status = StatusRunning - _, err = UpdateRunJob(ctx, jobs[0], nil) - assert.NoError(t, err) - - running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, running) - - // Set job 1 to RUNNING - jobs[1].Status = StatusRunning - _, err = UpdateRunJob(ctx, jobs[1], nil) - assert.NoError(t, err) - - running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 2, running) - - // Set job 0 to SUCCESS (not counted as RUNNING anymore) - jobs[0].Status = StatusSuccess - _, err = UpdateRunJob(ctx, jobs[0], nil) - assert.NoError(t, err) - - running, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, running) - - t.Log("✅ Count running jobs works correctly!") -} diff --git a/models/actions/run_job.go b/models/actions/run_job.go index de16741a82..30a24368ea 100644 --- a/models/actions/run_job.go +++ b/models/actions/run_job.go @@ -21,7 +21,7 @@ import ( // ActionRunJob represents a job of a run type ActionRunJob struct { ID int64 - RunID int64 `xorm:"index"` + RunID int64 `xorm:"index index(idx_action_run_job_run_id_job_id)"` Run *ActionRun `xorm:"-"` RepoID int64 `xorm:"index(repo_concurrency)"` Repo *repo_model.Repository `xorm:"-"` @@ -35,7 +35,7 @@ type ActionRunJob struct { // it should contain exactly one job with global workflow fields for this model WorkflowPayload []byte - JobID string `xorm:"VARCHAR(255)"` // job id in workflow, 
not job's id + JobID string `xorm:"VARCHAR(255) index(idx_action_run_job_run_id_job_id)"` // job id in workflow, not job's id Needs []string `xorm:"JSON TEXT"` RunsOn []string `xorm:"JSON TEXT"` TaskID int64 // the latest task of the job @@ -55,8 +55,9 @@ type ActionRunJob struct { // Org/repo clamps are enforced when the token is used at runtime. // It is JSON-encoded repo_model.ActionsTokenPermissions and may be empty if not specified. TokenPermissions *repo_model.ActionsTokenPermissions `xorm:"JSON TEXT"` - // Matrix job support - MaxParallel int // Max parallel jobs from strategy.max-parallel (0 = unlimited) + // MaxParallel is the max-parallel value from strategy.max-parallel (0 = unlimited). + // All matrix jobs sharing the same RunID+JobID share this value. + MaxParallel int `xorm:"NOT NULL DEFAULT 0"` Started timeutil.TimeStamp Stopped timeutil.TimeStamp diff --git a/models/actions/run_job_maxparallel_test.go b/models/actions/run_job_maxparallel_test.go index badc0cce36..9993482451 100644 --- a/models/actions/run_job_maxparallel_test.go +++ b/models/actions/run_job_maxparallel_test.go @@ -11,245 +11,46 @@ import ( "code.gitea.io/gitea/models/unittest" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) -func TestActionRunJob_MaxParallel(t *testing.T) { - assert.NoError(t, unittest.PrepareTestDatabase()) +func getRunJobByID(ctx context.Context, t *testing.T, id int64) *ActionRunJob { + t.Helper() + got, exist, err := db.GetByID[ActionRunJob](ctx, id) + require.NoError(t, err) + require.True(t, exist) + return got +} + +// TestMaxParallel_FieldPersistence verifies that MaxParallel is stored and retrieved correctly. 
+func TestMaxParallel_FieldPersistence(t *testing.T) { + require.NoError(t, unittest.PrepareTestDatabase()) ctx := context.Background() - t.Run("NoMaxParallel", func(t *testing.T) { - job := &ActionRunJob{ - RunID: 1, - RepoID: 1, - OwnerID: 1, - JobID: "test-job-1", - Name: "Test Job", - Status: StatusWaiting, - MaxParallel: 0, // No limit - } - assert.NoError(t, db.Insert(ctx, job)) + run := &ActionRun{ID: 100, RepoID: 1, OwnerID: 1, Index: 100, Status: StatusRunning} + require.NoError(t, db.Insert(ctx, run)) - retrieved, err := GetRunJobByID(ctx, job.ID) - assert.NoError(t, err) - assert.Equal(t, 0, retrieved.MaxParallel) + t.Run("zero value means unlimited", func(t *testing.T) { + job := &ActionRunJob{RunID: 100, RepoID: 1, OwnerID: 1, JobID: "no-limit", Name: "No Limit", Status: StatusWaiting, MaxParallel: 0} + require.NoError(t, db.Insert(ctx, job)) + got := getRunJobByID(ctx, t, job.ID) + assert.Equal(t, 0, got.MaxParallel) }) - t.Run("WithMaxParallel", func(t *testing.T) { - job := &ActionRunJob{ - RunID: 1, - RepoID: 1, - OwnerID: 1, - JobID: "test-job-2", - Name: "Matrix Job", - Status: StatusWaiting, - MaxParallel: 3, - } - assert.NoError(t, db.Insert(ctx, job)) - - retrieved, err := GetRunJobByID(ctx, job.ID) - assert.NoError(t, err) - assert.Equal(t, 3, retrieved.MaxParallel) + t.Run("positive value is persisted", func(t *testing.T) { + job := &ActionRunJob{RunID: 100, RepoID: 1, OwnerID: 1, JobID: "with-limit", Name: "With Limit", Status: StatusWaiting, MaxParallel: 3} + require.NoError(t, db.Insert(ctx, job)) + got := getRunJobByID(ctx, t, job.ID) + assert.Equal(t, 3, got.MaxParallel) }) - t.Run("UpdateMaxParallel", func(t *testing.T) { - // Create ActionRun first - run := &ActionRun{ - ID: 1, - RepoID: 1, - OwnerID: 1, - Status: StatusRunning, - } - // Note: This might fail if run already exists from previous tests, but that's okay - _ = db.Insert(ctx, run) - - job := &ActionRunJob{ - RunID: 1, - RepoID: 1, - OwnerID: 1, - JobID: "test-job-4", - 
Name: "Updatable Job", - Status: StatusWaiting, - MaxParallel: 5, - } - assert.NoError(t, db.Insert(ctx, job)) - - // Update max parallel + t.Run("can be updated via UpdateRunJob", func(t *testing.T) { + job := &ActionRunJob{RunID: 100, RepoID: 1, OwnerID: 1, JobID: "updatable", Name: "Updatable", Status: StatusWaiting, MaxParallel: 5} + require.NoError(t, db.Insert(ctx, job)) job.MaxParallel = 10 _, err := UpdateRunJob(ctx, job, nil, "max_parallel") - assert.NoError(t, err) - - retrieved, err := GetRunJobByID(ctx, job.ID) - assert.NoError(t, err) - assert.Equal(t, 10, retrieved.MaxParallel) - }) -} - -func TestActionRunJob_MaxParallelEnforcement(t *testing.T) { - assert.NoError(t, unittest.PrepareTestDatabase()) - ctx := context.Background() - - t.Run("EnforceMaxParallel", func(t *testing.T) { - runID := int64(5000) - jobID := "parallel-enforced-job" - maxParallel := 2 - - // Create ActionRun first - run := &ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 5000, - Status: StatusRunning, - } - assert.NoError(t, db.Insert(ctx, run)) - - // Create jobs simulating matrix execution - jobs := []*ActionRunJob{ - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 1", Status: StatusRunning, MaxParallel: maxParallel}, - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 2", Status: StatusRunning, MaxParallel: maxParallel}, - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 3", Status: StatusWaiting, MaxParallel: maxParallel}, - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 4", Status: StatusWaiting, MaxParallel: maxParallel}, - } - - for _, job := range jobs { - assert.NoError(t, db.Insert(ctx, job)) - } - - // Verify running count - runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, maxParallel, runningCount, "Should have exactly max-parallel jobs running") - - // Simulate job completion - jobs[0].Status = StatusSuccess - _, err = 
UpdateRunJob(ctx, jobs[0], nil, "status") - assert.NoError(t, err) - - // Now running count should be 1 - runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, runningCount) - - // Simulate next job starting - jobs[2].Status = StatusRunning - _, err = UpdateRunJob(ctx, jobs[2], nil, "status") - assert.NoError(t, err) - - // Back to max-parallel - runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, maxParallel, runningCount) - }) - - t.Run("MaxParallelOne_SequentialExecution", func(t *testing.T) { - runID := int64(6000) - jobID := "sequential-job" - maxParallel := 1 - - // Create ActionRun first - run := &ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 6000, - Status: StatusRunning, - } - assert.NoError(t, db.Insert(ctx, run)) - - // Create jobs simulating sequential execution with max-parallel=1 - jobs := []*ActionRunJob{ - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 1", Status: StatusRunning, MaxParallel: maxParallel}, - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 2", Status: StatusWaiting, MaxParallel: maxParallel}, - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 3", Status: StatusWaiting, MaxParallel: maxParallel}, - } - - for _, job := range jobs { - assert.NoError(t, db.Insert(ctx, job)) - } - - // Verify initial running count is 1 - runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, runningCount, "Should have exactly 1 job running with max-parallel=1") - - // Complete first job - jobs[0].Status = StatusSuccess - _, err = UpdateRunJob(ctx, jobs[0], nil, "status") - assert.NoError(t, err) - - // Now running count should be 0 - runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 0, runningCount, "Should have 0 jobs running after first job 
completes") - - // Second job can now start - jobs[1].Status = StatusRunning - _, err = UpdateRunJob(ctx, jobs[1], nil, "status") - assert.NoError(t, err) - - // Running count should be 1 again - runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, runningCount, "Should have exactly 1 job running after second job starts") - - // Complete second job - jobs[1].Status = StatusSuccess - _, err = UpdateRunJob(ctx, jobs[1], nil, "status") - assert.NoError(t, err) - - // Third job can now start - jobs[2].Status = StatusRunning - _, err = UpdateRunJob(ctx, jobs[2], nil, "status") - assert.NoError(t, err) - - // Running count should still be 1 - runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, runningCount, "Should have exactly 1 job running after third job starts") - }) - - t.Run("MaxParallelOne_WithFailure", func(t *testing.T) { - runID := int64(7000) - jobID := "sequential-with-failure-job" - maxParallel := 1 - - // Create ActionRun first - run := &ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 7000, - Status: StatusRunning, - } - assert.NoError(t, db.Insert(ctx, run)) - - // Create jobs - jobs := []*ActionRunJob{ - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 1", Status: StatusRunning, MaxParallel: maxParallel}, - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 2", Status: StatusWaiting, MaxParallel: maxParallel}, - } - - for _, job := range jobs { - assert.NoError(t, db.Insert(ctx, job)) - } - - // First job fails - jobs[0].Status = StatusFailure - _, err := UpdateRunJob(ctx, jobs[0], nil, "status") - assert.NoError(t, err) - - // Verify no jobs are running - runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 0, runningCount, "Should have 0 jobs running after job fails") - - // Second job can still start - jobs[1].Status = 
StatusRunning - _, err = UpdateRunJob(ctx, jobs[1], nil, "status") - assert.NoError(t, err) - - runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, runningCount, "Should have exactly 1 job running after job starts") + require.NoError(t, err) + got := getRunJobByID(ctx, t, job.ID) + assert.Equal(t, 10, got.MaxParallel) }) } diff --git a/models/actions/task.go b/models/actions/task.go index 49bdfaba54..1f4a522512 100644 --- a/models/actions/task.go +++ b/models/actions/task.go @@ -264,25 +264,8 @@ func CreateTaskForRunner(ctx context.Context, runner *ActionRunner) (*ActionTask continue } - // Check max-parallel constraint for matrix jobs - // Note: This is a best-effort check with a small race window. Multiple runners - // could simultaneously check the count before any commits, potentially allowing - // the limit to be briefly exceeded (e.g., if limit=2 and count=1, two runners - // might both proceed). Perfect enforcement would require row-level locking across - // all jobs with the same run_id+job_id, which has a significant performance impact. - // The race window is small since jobs are picked and committed quickly. - if v.MaxParallel > 0 { - runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, v.RunID, v.JobID) - if err != nil { - log.Error("Failed to count running jobs for max-parallel check: %v", err) - continue - } - if runningCount >= v.MaxParallel { - log.Debug("Job %s (run %d) skipped: %d/%d jobs already running (max-parallel)", - v.JobID, v.RunID, runningCount, v.MaxParallel) - continue - } - } + // max-parallel is enforced at insertion time (InsertRun) and by + // jobStatusResolver, so a Waiting job is guaranteed a free slot. 
job = v break @@ -547,14 +530,3 @@ func getTaskIDFromCache(token string) int64 { } return t } - -// CountRunningJobsByWorkflowAndRun counts running jobs for a specific workflow/run combo -// Used to enforce max-parallel limits on matrix jobs -func CountRunningJobsByWorkflowAndRun(ctx context.Context, runID int64, jobID string) (int, error) { - count, err := db.GetEngine(ctx). - Where("run_id = ?", runID). - And("job_id = ?", jobID). - And("status = ?", StatusRunning). - Count(new(ActionRunJob)) - return int(count), err -} diff --git a/models/actions/task_count_test.go b/models/actions/task_count_test.go deleted file mode 100644 index 12117a87e5..0000000000 --- a/models/actions/task_count_test.go +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright 2026 The Gitea Authors. All rights reserved. -// SPDX-License-Identifier: MIT - -package actions - -import ( - "context" - "testing" - - "code.gitea.io/gitea/models/db" - "code.gitea.io/gitea/models/unittest" - - "github.com/stretchr/testify/assert" -) - -func TestCountRunningJobsByWorkflowAndRun(t *testing.T) { - assert.NoError(t, unittest.PrepareTestDatabase()) - ctx := context.Background() - - t.Run("NoRunningJobs", func(t *testing.T) { - count, err := CountRunningJobsByWorkflowAndRun(ctx, 999999, "nonexistent") - assert.NoError(t, err) - assert.Equal(t, 0, count) - }) - - t.Run("WithRunningJobs", func(t *testing.T) { - runID := int64(1000) - jobID := "test-job" - - // Create ActionRun first - run := &ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 1000, - Status: StatusRunning, - } - assert.NoError(t, db.Insert(ctx, run)) - - // Create running jobs - for range 3 { - job := &ActionRunJob{ - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Test Job", - Status: StatusRunning, - } - assert.NoError(t, db.Insert(ctx, job)) - } - - count, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 3, count) - }) - - t.Run("DifferentJobIDs", func(t *testing.T) { 
- runID := int64(2000) - - // Create ActionRun first - run := &ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 2000, - Status: StatusRunning, - } - assert.NoError(t, db.Insert(ctx, run)) - - // Create jobs with different job IDs - for i := range 5 { - job := &ActionRunJob{ - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: "job-" + string(rune('A'+i)), - Name: "Test Job", - Status: StatusRunning, - } - assert.NoError(t, db.Insert(ctx, job)) - } - - // Count for specific job ID should be 1 - count, err := CountRunningJobsByWorkflowAndRun(ctx, runID, "job-A") - assert.NoError(t, err) - assert.Equal(t, 1, count) - }) - - t.Run("MatrixJobsWithMaxParallel", func(t *testing.T) { - runID := int64(3000) - jobID := "matrix-job" - maxParallel := 2 - - // Create ActionRun first - run := &ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 3000, - Status: StatusRunning, - } - assert.NoError(t, db.Insert(ctx, run)) - - // Create matrix jobs - jobs := []*ActionRunJob{ - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 1", Status: StatusRunning, MaxParallel: maxParallel}, - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 2", Status: StatusRunning, MaxParallel: maxParallel}, - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 3", Status: StatusWaiting, MaxParallel: maxParallel}, - {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "Job 4", Status: StatusWaiting, MaxParallel: maxParallel}, - } - - for _, job := range jobs { - assert.NoError(t, db.Insert(ctx, job)) - } - - // Count running jobs - should be 2 (matching max-parallel) - count, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 2, count) - assert.Equal(t, maxParallel, count, "Running jobs should equal max-parallel") - }) -} diff --git a/models/actions/task_sequential_execution_test.go b/models/actions/task_sequential_execution_test.go deleted file mode 100644 index 017c8437ad..0000000000 --- 
a/models/actions/task_sequential_execution_test.go +++ /dev/null @@ -1,212 +0,0 @@ -// Copyright 2026 The Gitea Authors. All rights reserved. -// SPDX-License-Identifier: MIT - -package actions - -import ( - "context" - "testing" - - "code.gitea.io/gitea/models/db" - "code.gitea.io/gitea/models/unittest" - - "github.com/stretchr/testify/assert" -) - -// TestTaskCreation_MaxParallel_One tests that tasks are properly sequenced -// when max-parallel=1, ensuring no hang after task completion -func TestTaskCreation_MaxParallel_One(t *testing.T) { - assert.NoError(t, unittest.PrepareTestDatabase()) - ctx := context.Background() - - t.Run("SequentialTaskCreationMaxParallelOne", func(t *testing.T) { - runID := int64(8000) - jobID := "task-sequential-max-parallel-one" - maxParallel := 1 - - // Setup: Create ActionRun - run := &ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 8000, - Status: StatusRunning, - } - assert.NoError(t, db.Insert(ctx, run)) - - // Create runner - runner := &ActionRunner{ - ID: 1, - UUID: "test-runner-1", - Name: "Test Runner", - OwnerID: 1, - } - assert.NoError(t, db.Insert(ctx, runner)) - - // Create jobs with max-parallel=1 - jobs := []*ActionRunJob{ - { - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Sequential Job 1", - Status: StatusWaiting, - MaxParallel: maxParallel, - }, - { - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Sequential Job 2", - Status: StatusWaiting, - MaxParallel: maxParallel, - }, - { - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Sequential Job 3", - Status: StatusWaiting, - MaxParallel: maxParallel, - }, - } - - for _, job := range jobs { - assert.NoError(t, db.Insert(ctx, job)) - } - - // Verify initial state: all jobs are waiting - allJobs, err := GetRunJobsByRunID(ctx, runID) - assert.NoError(t, err) - assert.Len(t, allJobs, 3) - - // Verify that only 1 job should be able to run at a time with max-parallel=1 - runningCount, err := 
CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 0, runningCount, "Should have 0 jobs running initially") - - // Simulate starting first job - allJobs[0].Status = StatusRunning - _, err = UpdateRunJob(ctx, allJobs[0], nil) - assert.NoError(t, err) - - runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, runningCount, "Should have exactly 1 job running with max-parallel=1") - - // Complete first job - allJobs[0].Status = StatusSuccess - _, err = UpdateRunJob(ctx, allJobs[0], nil) - assert.NoError(t, err) - - runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 0, runningCount, "Should have 0 jobs running after first job completes") - - // This is the critical test: the second job should be able to start - // Previously, the system might hang here - allJobs[1].Status = StatusRunning - _, err = UpdateRunJob(ctx, allJobs[1], nil) - assert.NoError(t, err) - - runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, runningCount, "Should have exactly 1 job running after second job starts (critical test)") - - // Complete the second job - allJobs[1].Status = StatusSuccess - _, err = UpdateRunJob(ctx, allJobs[1], nil) - assert.NoError(t, err) - - runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 0, runningCount, "Should have 0 jobs running after second job completes") - - // Third job should also be able to start - allJobs[2].Status = StatusRunning - _, err = UpdateRunJob(ctx, allJobs[2], nil) - assert.NoError(t, err) - - runningCount, err = CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, runningCount, "Should have exactly 1 job running for third job") - }) - - t.Run("MaxParallelConstraintAfterTaskFetch", func(t *testing.T) { - 
runID := int64(9000) - jobID := "max-parallel-fetch-job" - maxParallel := 1 - - // Setup: Create ActionRun - run := &ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 9000, - Status: StatusRunning, - } - assert.NoError(t, db.Insert(ctx, run)) - - // Create jobs - jobs := []*ActionRunJob{ - { - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Job A", - Status: StatusWaiting, - MaxParallel: maxParallel, - }, - { - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Job B", - Status: StatusWaiting, - MaxParallel: maxParallel, - }, - } - - for _, job := range jobs { - assert.NoError(t, db.Insert(ctx, job)) - } - - // Refresh jobs to get IDs - allJobs, err := GetRunJobsByRunID(ctx, runID) - assert.NoError(t, err) - - // Start first job - allJobs[0].Status = StatusRunning - _, err = UpdateRunJob(ctx, allJobs[0], nil) - assert.NoError(t, err) - - // Verify constraint - runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, runningCount) - - // Try to start second job while first is still running - // This should not be allowed due to max-parallel=1 - freshAllJobs, err := GetRunJobsByRunID(ctx, runID) - assert.NoError(t, err) - - // Check if we can determine from the count that the second job should not start - for i := 1; i < len(freshAllJobs); i++ { - if freshAllJobs[i].Status == StatusWaiting { - // Before starting this job, verify max-parallel constraint - runningCount, err := CountRunningJobsByWorkflowAndRun(ctx, runID, jobID) - assert.NoError(t, err) - if runningCount >= maxParallel { - // This job should wait - assert.Equal(t, StatusWaiting, freshAllJobs[i].Status, - "Job should remain waiting when max-parallel limit is reached") - } - } - } - }) -} diff --git a/services/actions/job_emitter.go b/services/actions/job_emitter.go index 20a4f81eab..3d41771c57 100644 --- a/services/actions/job_emitter.go +++ b/services/actions/job_emitter.go @@ -313,6 +313,8 @@ func 
(r *jobStatusResolver) resolveJobHasIfCondition(actionRunJob *actions_model func (r *jobStatusResolver) resolve(ctx context.Context) map[int64]actions_model.Status { ret := map[int64]actions_model.Status{} + // promotedWaitingByJobID counts within-pass promotions to enforce max-parallel. + promotedWaitingByJobID := make(map[string]int) for id, status := range r.statuses { actionRunJob := r.jobMap[id] if status != actions_model.StatusBlocked { @@ -337,7 +339,7 @@ func (r *jobStatusResolver) resolve(ctx context.Context) map[int64]actions_model shouldStartJob := true if !allSucceed { // Not all dependent jobs completed successfully: - // * if the job has "if" condition, it can be started, then the act_runner will evaluate the "if" condition. + // * if the job has an "if" condition, it can be started; then the act_runner will evaluate the "if" condition. // * otherwise, the job should be skipped. shouldStartJob = r.resolveJobHasIfCondition(actionRunJob) } @@ -350,6 +352,23 @@ func (r *jobStatusResolver) resolve(ctx context.Context) map[int64]actions_model } } + // Enforce max-parallel: count occupied slots from the current snapshot + // plus within-pass promotions. 
+ if newStatus == actions_model.StatusWaiting && actionRunJob.MaxParallel > 0 { + occupiedSlots := 0 + for otherID, otherStatus := range r.statuses { + otherJob := r.jobMap[otherID] + if otherJob.JobID == actionRunJob.JobID && + (otherStatus == actions_model.StatusRunning || otherStatus == actions_model.StatusWaiting) { + occupiedSlots++ + } + } + if occupiedSlots+promotedWaitingByJobID[actionRunJob.JobID] >= actionRunJob.MaxParallel { + continue // no free slot; leave blocked + } + promotedWaitingByJobID[actionRunJob.JobID]++ + } + if newStatus != actions_model.StatusBlocked { ret[id] = newStatus } diff --git a/services/actions/job_emitter_test.go b/services/actions/job_emitter_test.go index a2152fb270..3b6768bdd8 100644 --- a/services/actions/job_emitter_test.go +++ b/services/actions/job_emitter_test.go @@ -126,11 +126,60 @@ jobs: }, want: map[int64]actions_model.Status{2: actions_model.StatusSkipped}, }, + { + name: "max-parallel=1 promotes exactly one blocked job when one slot is open", + jobs: actions_model.ActionJobList{ + {ID: 1, JobID: "build", Status: actions_model.StatusRunning, Needs: []string{}, MaxParallel: 1}, + {ID: 2, JobID: "build", Status: actions_model.StatusBlocked, Needs: []string{}, MaxParallel: 1}, + {ID: 3, JobID: "build", Status: actions_model.StatusBlocked, Needs: []string{}, MaxParallel: 1}, + }, + want: map[int64]actions_model.Status{}, + }, + { + name: "max-parallel=1 promotes one job after running job finishes", + jobs: actions_model.ActionJobList{ + {ID: 1, JobID: "build", Status: actions_model.StatusSuccess, Needs: []string{}, MaxParallel: 1}, + {ID: 2, JobID: "build", Status: actions_model.StatusBlocked, Needs: []string{}, MaxParallel: 1}, + {ID: 3, JobID: "build", Status: actions_model.StatusBlocked, Needs: []string{}, MaxParallel: 1}, + }, + want: nil, // map iteration is non-deterministic; checked by count below + }, + { + name: "max-parallel=2 does not promote when limit is reached", + jobs: actions_model.ActionJobList{ + {ID: 
1, JobID: "test", Status: actions_model.StatusRunning, Needs: []string{}, MaxParallel: 2}, + {ID: 2, JobID: "test", Status: actions_model.StatusRunning, Needs: []string{}, MaxParallel: 2}, + {ID: 3, JobID: "test", Status: actions_model.StatusBlocked, Needs: []string{}, MaxParallel: 2}, + {ID: 4, JobID: "test", Status: actions_model.StatusBlocked, Needs: []string{}, MaxParallel: 2}, + }, + want: map[int64]actions_model.Status{}, + }, + { + name: "max-parallel=2 promotes one job when one slot opens", + jobs: actions_model.ActionJobList{ + {ID: 1, JobID: "test", Status: actions_model.StatusSuccess, Needs: []string{}, MaxParallel: 2}, + {ID: 2, JobID: "test", Status: actions_model.StatusRunning, Needs: []string{}, MaxParallel: 2}, + {ID: 3, JobID: "test", Status: actions_model.StatusBlocked, Needs: []string{}, MaxParallel: 2}, + {ID: 4, JobID: "test", Status: actions_model.StatusBlocked, Needs: []string{}, MaxParallel: 2}, + }, + want: nil, // checked by count below + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { r := newJobStatusResolver(tt.jobs, nil) - assert.Equal(t, tt.want, r.Resolve(t.Context())) + got := r.Resolve(t.Context()) + if tt.want == nil { + waitingCount := 0 + for _, s := range got { + if s == actions_model.StatusWaiting { + waitingCount++ + } + } + assert.Equal(t, 1, waitingCount, "expected exactly 1 job promoted to Waiting, got %v", got) + } else { + assert.Equal(t, tt.want, got) + } }) } } diff --git a/services/actions/run.go b/services/actions/run.go index bec95fe7fc..ace9ceb08f 100644 --- a/services/actions/run.go +++ b/services/actions/run.go @@ -10,8 +10,8 @@ import ( actions_model "code.gitea.io/gitea/models/actions" "code.gitea.io/gitea/models/db" - "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/actions/jobparser" + "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/util" notify_service "code.gitea.io/gitea/services/notify" @@ -106,6 +106,9 @@ func InsertRun(ctx context.Context, run 
*actions_model.ActionRun, jobs []*jobpar runJobs := make([]*actions_model.ActionRunJob, 0, len(jobs)) var hasWaitingJobs bool + // waitingCountByJobID limits initial Waiting slots per JobID to MaxParallel. + waitingCountByJobID := make(map[string]int) + for _, v := range jobs { id, job := v.Job() needs := job.Needs() @@ -170,6 +173,16 @@ func InsertRun(ctx context.Context, run *actions_model.ActionRun, jobs []*jobpar } } + // Enforce max-parallel: excess jobs start as Blocked and are promoted + // by jobStatusResolver when a slot opens. + if runJob.Status == actions_model.StatusWaiting && runJob.MaxParallel > 0 { + if waitingCountByJobID[id] >= runJob.MaxParallel { + runJob.Status = actions_model.StatusBlocked + } else { + waitingCountByJobID[id]++ + } + } + hasWaitingJobs = hasWaitingJobs || runJob.Status == actions_model.StatusWaiting if err := db.Insert(ctx, runJob); err != nil { return err diff --git a/services/actions/task_assignment_test.go b/services/actions/task_assignment_test.go index b1bb82fcd2..ad5d0b5087 100644 --- a/services/actions/task_assignment_test.go +++ b/services/actions/task_assignment_test.go @@ -12,314 +12,98 @@ import ( "code.gitea.io/gitea/models/unittest" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) -func TestMaxParallelJobStatusAndCounting(t *testing.T) { - assert.NoError(t, unittest.PrepareTestDatabase()) +// countJobsByStatus returns the number of jobs with the given status in allJobs. +func countJobsByStatus(allJobs actions_model.ActionJobList, status actions_model.Status) int { + n := 0 + for _, j := range allJobs { + if j.Status == status { + n++ + } + } + return n +} - t.Run("MaxParallelReached", func(t *testing.T) { +// TestMaxParallel_ServiceLayer verifies the max-parallel invariant: Running+Waiting <= MaxParallel. 
+func TestMaxParallel_ServiceLayer(t *testing.T) { + require.NoError(t, unittest.PrepareTestDatabase()) + + t.Run("invariant: Running+Waiting <= MaxParallel", func(t *testing.T) { runID := int64(10000) - jobID := "max-parallel-job" + jobID := "svc-max-parallel" maxParallel := 2 - // Create ActionRun first - run := &actions_model.ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 10000, - Status: actions_model.StatusRunning, - } - assert.NoError(t, db.Insert(context.Background(), run)) + run := &actions_model.ActionRun{ID: runID, RepoID: 1, OwnerID: 1, Index: 10000, Status: actions_model.StatusRunning} + require.NoError(t, db.Insert(context.Background(), run)) - // Create waiting jobs - for range 4 { - job := &actions_model.ActionRunJob{ - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Test Job", - Status: actions_model.StatusWaiting, - MaxParallel: maxParallel, - } - assert.NoError(t, db.Insert(context.Background(), job)) + jobs := []*actions_model.ActionRunJob{ + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "r1", Status: actions_model.StatusRunning, MaxParallel: maxParallel}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "w1", Status: actions_model.StatusWaiting, MaxParallel: maxParallel}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "b1", Status: actions_model.StatusBlocked, MaxParallel: maxParallel}, + } + for _, j := range jobs { + require.NoError(t, db.Insert(context.Background(), j)) } - // Start 2 jobs (max-parallel limit) - jobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) - assert.NoError(t, err) - assert.Len(t, jobs, 4) + allJobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) + require.NoError(t, err) - for i := range 2 { - jobs[i].Status = actions_model.StatusRunning - _, err := actions_model.UpdateRunJob(context.Background(), jobs[i], nil, "status") - assert.NoError(t, err) - } + running := countJobsByStatus(allJobs, 
actions_model.StatusRunning) + waiting := countJobsByStatus(allJobs, actions_model.StatusWaiting) + blocked := countJobsByStatus(allJobs, actions_model.StatusBlocked) - // Verify max-parallel is enforced - runningCount, err := actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) - assert.NoError(t, err) - assert.Equal(t, maxParallel, runningCount) - - // Remaining jobs should stay in waiting - remainingJobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) - assert.NoError(t, err) - - waitingCount := 0 - for _, job := range remainingJobs { - if job.Status == actions_model.StatusWaiting { - waitingCount++ - } - } - assert.Equal(t, 2, waitingCount) + assert.LessOrEqual(t, running+waiting, maxParallel) + assert.Equal(t, 1, blocked) }) - t.Run("MaxParallelNotSet", func(t *testing.T) { + t.Run("slot becomes available after completion", func(t *testing.T) { runID := int64(20000) - jobID := "no-limit-job" - - // Create ActionRun first - run := &actions_model.ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 20000, - Status: actions_model.StatusRunning, - } - assert.NoError(t, db.Insert(context.Background(), run)) - - // Create jobs without max-parallel - for range 5 { - job := &actions_model.ActionRunJob{ - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Test Job", - Status: actions_model.StatusWaiting, - MaxParallel: 0, // No limit - } - assert.NoError(t, db.Insert(context.Background(), job)) - } - - // All jobs can run simultaneously - jobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) - assert.NoError(t, err) - - for _, job := range jobs { - job.Status = actions_model.StatusRunning - _, err := actions_model.UpdateRunJob(context.Background(), job, nil, "status") - assert.NoError(t, err) - } - - runningCount, err := actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 5, runningCount, "All jobs should 
be able to run without limit") - }) - - t.Run("MaxParallelWrongValue", func(t *testing.T) { - runID := int64(30000) - jobID := "wrong-value-use-default-value-job" - - // Create ActionRun first - run := &actions_model.ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 30000, - Status: actions_model.StatusRunning, - } - assert.NoError(t, db.Insert(context.Background(), run)) - - // Test different invalid max-parallel values - testCases := []struct { - name string - maxParallel int - description string - }{ - { - name: "negative value", - maxParallel: -1, - description: "Negative max-parallel should default to 0 (no limit)", - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - // Create jobs with the test max-parallel value - for i := range 5 { - job := &actions_model.ActionRunJob{ - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Test Job " + tc.name, - Status: actions_model.StatusWaiting, - MaxParallel: tc.maxParallel, - } - assert.NoError(t, db.Insert(context.Background(), job)) - - // Verify the value was stored - if i == 0 { - storedJob, err := actions_model.GetRunJobByID(context.Background(), job.ID) - assert.NoError(t, err) - assert.Equal(t, tc.maxParallel, storedJob.MaxParallel, tc.description) - } - } - - // All jobs can run simultaneously when max-parallel <= 0 - jobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) - assert.NoError(t, err) - - for _, job := range jobs { - job.Status = actions_model.StatusRunning - _, err := actions_model.UpdateRunJob(context.Background(), job, nil, "status") - assert.NoError(t, err) - } - - runningCount, err := actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) - assert.NoError(t, err) - assert.GreaterOrEqual(t, runningCount, 5, "All jobs should be able to run when max-parallel is "+tc.name) - }) - } - }) - - t.Run("MaxParallelOne_TaskCreation", func(t *testing.T) { - runID := int64(40000) - jobID := 
"task-creation-sequential-job" - maxParallel := 1 - - // Create ActionRun first - run := &actions_model.ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 40000, - Status: actions_model.StatusRunning, - } - assert.NoError(t, db.Insert(context.Background(), run)) - - // Create waiting jobs with max-parallel=1 - for i := range 3 { - job := &actions_model.ActionRunJob{ - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Sequential Job " + string(rune(i+1)), - Status: actions_model.StatusWaiting, - MaxParallel: maxParallel, - } - assert.NoError(t, db.Insert(context.Background(), job)) - } - - // Simulate the first job starting - jobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) - assert.NoError(t, err) - assert.Len(t, jobs, 3) - - jobs[0].Status = actions_model.StatusRunning - _, err = actions_model.UpdateRunJob(context.Background(), jobs[0], nil, "status") - assert.NoError(t, err) - - // Verify only one is running - runningCount, err := actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, runningCount, "Should have exactly 1 job running with max-parallel=1") - - // Complete first job - jobs[0].Status = actions_model.StatusSuccess - _, err = actions_model.UpdateRunJob(context.Background(), jobs[0], nil, "status") - assert.NoError(t, err) - - // Verify no jobs are running now - runningCount, err = actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 0, runningCount, "Should have 0 jobs running after completion with max-parallel=1") - - // Second job can now start - jobs[1].Status = actions_model.StatusRunning - _, err = actions_model.UpdateRunJob(context.Background(), jobs[1], nil, "status") - assert.NoError(t, err) - - runningCount, err = actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, 
runningCount, "Should have exactly 1 job running for second task") - - // Complete second job - jobs[1].Status = actions_model.StatusSuccess - _, err = actions_model.UpdateRunJob(context.Background(), jobs[1], nil, "status") - assert.NoError(t, err) - - // Third job can now start - jobs[2].Status = actions_model.StatusRunning - _, err = actions_model.UpdateRunJob(context.Background(), jobs[2], nil, "status") - assert.NoError(t, err) - - runningCount, err = actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, runningCount, "Should have exactly 1 job running for third task") - }) - - t.Run("MaxParallelTwo_CompletionAndStart", func(t *testing.T) { - runID := int64(50000) - jobID := "completion-start-job" + jobID := "svc-slot-free" maxParallel := 2 - // Create ActionRun first - run := &actions_model.ActionRun{ - ID: runID, - RepoID: 1, - OwnerID: 1, - Index: 50000, - Status: actions_model.StatusRunning, + run := &actions_model.ActionRun{ID: runID, RepoID: 1, OwnerID: 1, Index: 20000, Status: actions_model.StatusRunning} + require.NoError(t, db.Insert(context.Background(), run)) + + jobs := []*actions_model.ActionRunJob{ + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "r1", Status: actions_model.StatusRunning, MaxParallel: maxParallel}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "r2", Status: actions_model.StatusRunning, MaxParallel: maxParallel}, + {RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "b1", Status: actions_model.StatusBlocked, MaxParallel: maxParallel}, } - assert.NoError(t, db.Insert(context.Background(), run)) - - // Create jobs with max-parallel=2 - for i := range 5 { - job := &actions_model.ActionRunJob{ - RunID: runID, - RepoID: 1, - OwnerID: 1, - JobID: jobID, - Name: "Parallel Job " + string(rune(i+1)), - Status: actions_model.StatusWaiting, - MaxParallel: maxParallel, - } - if i < 2 { - job.Status = actions_model.StatusRunning - } - 
assert.NoError(t, db.Insert(context.Background(), job)) + for _, j := range jobs { + require.NoError(t, db.Insert(context.Background(), j)) } - // Verify 2 jobs are running - runningCount, err := actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 2, runningCount) - - // Complete one job - jobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) - assert.NoError(t, err) - jobs[0].Status = actions_model.StatusSuccess - _, err = actions_model.UpdateRunJob(context.Background(), jobs[0], nil, "status") - assert.NoError(t, err) + _, err := actions_model.UpdateRunJob(context.Background(), jobs[0], nil, "status") + require.NoError(t, err) - // Should have 1 running - runningCount, err = actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 1, runningCount, "Should have 1 job running after one completes") + allJobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) + require.NoError(t, err) - // Now another job can start - jobs[2].Status = actions_model.StatusRunning - _, err = actions_model.UpdateRunJob(context.Background(), jobs[2], nil, "status") - assert.NoError(t, err) + running := countJobsByStatus(allJobs, actions_model.StatusRunning) + assert.Equal(t, 1, running) + assert.Less(t, running, maxParallel) + }) - // Back to 2 running - runningCount, err = actions_model.CountRunningJobsByWorkflowAndRun(context.Background(), runID, jobID) - assert.NoError(t, err) - assert.Equal(t, 2, runningCount, "Should have 2 jobs running after new job starts") + t.Run("no max-parallel means all jobs start as Waiting", func(t *testing.T) { + runID := int64(30000) + jobID := "svc-no-limit" + + run := &actions_model.ActionRun{ID: runID, RepoID: 1, OwnerID: 1, Index: 30000, Status: actions_model.StatusRunning} + require.NoError(t, db.Insert(context.Background(), run)) + + for range 5 { + require.NoError(t, 
db.Insert(context.Background(), &actions_model.ActionRunJob{ + RunID: runID, RepoID: 1, OwnerID: 1, JobID: jobID, Name: "j", + Status: actions_model.StatusWaiting, MaxParallel: 0, + })) + } + + allJobs, err := actions_model.GetRunJobsByRunID(context.Background(), runID) + require.NoError(t, err) + assert.Equal(t, 5, countJobsByStatus(allJobs, actions_model.StatusWaiting)) + assert.Equal(t, 0, countJobsByStatus(allJobs, actions_model.StatusBlocked)) }) } From c06bfe95e597e918d22edede6c8153d01b4deda8 Mon Sep 17 00:00:00 2001 From: Pascal Zimmermann Date: Tue, 31 Mar 2026 23:36:52 +0200 Subject: [PATCH 10/12] fix: Shorten index name to fix MySQL identifier length limit --- models/actions/run_job.go | 10 ++++++---- models/actions/task.go | 4 ++-- services/actions/job_emitter.go | 12 ++++++------ services/actions/run.go | 10 +++++----- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/models/actions/run_job.go b/models/actions/run_job.go index 30a24368ea..17a056efda 100644 --- a/models/actions/run_job.go +++ b/models/actions/run_job.go @@ -21,21 +21,23 @@ import ( // ActionRunJob represents a job of a run type ActionRunJob struct { ID int64 - RunID int64 `xorm:"index index(idx_action_run_job_run_id_job_id)"` + RunID int64 `xorm:"index index(idx_run_id_job_id)"` Run *ActionRun `xorm:"-"` RepoID int64 `xorm:"index(repo_concurrency)"` Repo *repo_model.Repository `xorm:"-"` OwnerID int64 `xorm:"index"` CommitSHA string `xorm:"index"` IsForkPullRequest bool - Name string `xorm:"VARCHAR(255)"` - Attempt int64 + + // ...existing code... 
+ Name string `xorm:"VARCHAR(255)"` + Attempt int64 // WorkflowPayload is act/jobparser.SingleWorkflow for act/jobparser.Parse // it should contain exactly one job with global workflow fields for this model WorkflowPayload []byte - JobID string `xorm:"VARCHAR(255) index(idx_action_run_job_run_id_job_id)"` // job id in workflow, not job's id + JobID string `xorm:"VARCHAR(255) index(idx_run_id_job_id)"` // job id in workflow, not job's id Needs []string `xorm:"JSON TEXT"` RunsOn []string `xorm:"JSON TEXT"` TaskID int64 // the latest task of the job diff --git a/models/actions/task.go b/models/actions/task.go index 1f4a522512..55df47fd7f 100644 --- a/models/actions/task.go +++ b/models/actions/task.go @@ -264,8 +264,8 @@ func CreateTaskForRunner(ctx context.Context, runner *ActionRunner) (*ActionTask continue } - // max-parallel is enforced at insertion time (InsertRun) and by - // jobStatusResolver, so a Waiting job is guaranteed a free slot. + // max-parallel is enforced at insertion time (InsertRun) and by + // jobStatusResolver, so a Waiting job is guaranteed a free slot. job = v break diff --git a/services/actions/job_emitter.go b/services/actions/job_emitter.go index 3d41771c57..b3cdc6d3ad 100644 --- a/services/actions/job_emitter.go +++ b/services/actions/job_emitter.go @@ -352,9 +352,9 @@ func (r *jobStatusResolver) resolve(ctx context.Context) map[int64]actions_model } } - // Enforce max-parallel: count occupied slots from the current snapshot - // plus within-pass promotions. - if newStatus == actions_model.StatusWaiting && actionRunJob.MaxParallel > 0 { + // Enforce max-parallel: count occupied slots from the current snapshot + // plus within-pass promotions. 
+ if newStatus == actions_model.StatusWaiting && actionRunJob.MaxParallel > 0 { occupiedSlots := 0 for otherID, otherStatus := range r.statuses { otherJob := r.jobMap[otherID] @@ -363,9 +363,9 @@ func (r *jobStatusResolver) resolve(ctx context.Context) map[int64]actions_model occupiedSlots++ } } - if occupiedSlots+promotedWaitingByJobID[actionRunJob.JobID] >= actionRunJob.MaxParallel { - continue // no free slot; leave blocked - } + if occupiedSlots+promotedWaitingByJobID[actionRunJob.JobID] >= actionRunJob.MaxParallel { + continue // no free slot; leave blocked + } promotedWaitingByJobID[actionRunJob.JobID]++ } diff --git a/services/actions/run.go b/services/actions/run.go index ace9ceb08f..8fbf6f1133 100644 --- a/services/actions/run.go +++ b/services/actions/run.go @@ -106,8 +106,8 @@ func InsertRun(ctx context.Context, run *actions_model.ActionRun, jobs []*jobpar runJobs := make([]*actions_model.ActionRunJob, 0, len(jobs)) var hasWaitingJobs bool - // waitingCountByJobID limits initial Waiting slots per JobID to MaxParallel. - waitingCountByJobID := make(map[string]int) + // waitingCountByJobID limits initial Waiting slots per JobID to MaxParallel. + waitingCountByJobID := make(map[string]int) for _, v := range jobs { id, job := v.Job() @@ -173,9 +173,9 @@ func InsertRun(ctx context.Context, run *actions_model.ActionRun, jobs []*jobpar } } - // Enforce max-parallel: excess jobs start as Blocked and are promoted - // by jobStatusResolver when a slot opens. - if runJob.Status == actions_model.StatusWaiting && runJob.MaxParallel > 0 { + // Enforce max-parallel: excess jobs start as Blocked and are promoted + // by jobStatusResolver when a slot opens. 
+ if runJob.Status == actions_model.StatusWaiting && runJob.MaxParallel > 0 { if waitingCountByJobID[id] >= runJob.MaxParallel { runJob.Status = actions_model.StatusBlocked } else { From 66ee950a42aa66a0041d6b828617aa30a7e3661c Mon Sep 17 00:00:00 2001 From: Pascal Zimmermann Date: Wed, 1 Apr 2026 08:18:34 +0200 Subject: [PATCH 11/12] feat: Adapt the migration --- models/migrations/migrations.go | 2 +- models/migrations/v1_26/v331.go | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 models/migrations/v1_26/v331.go diff --git a/models/migrations/migrations.go b/models/migrations/migrations.go index 298ff60467..0a7a64336c 100644 --- a/models/migrations/migrations.go +++ b/models/migrations/migrations.go @@ -405,7 +405,7 @@ func prepareMigrationTasks() []*migration { newMigration(328, "Add TokenPermissions column to ActionRunJob", v1_26.AddTokenPermissionsToActionRunJob), newMigration(329, "Add unique constraint for user badge", v1_26.AddUniqueIndexForUserBadge), newMigration(330, "Add name column to webhook", v1_26.AddNameToWebhook), - newMigration(331, "Add runner capacity and job max-parallel support", v1_26.AddRunnerCapacityAndJobMaxParallel), + newMigration(331, "Add job max-parallel support", v1_26.AddJobMaxParallel), } return preparedMigrations } diff --git a/models/migrations/v1_26/v331.go b/models/migrations/v1_26/v331.go new file mode 100644 index 0000000000..d2fa98ca4e --- /dev/null +++ b/models/migrations/v1_26/v331.go @@ -0,0 +1,17 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package v1_26 + +import "xorm.io/xorm" + +// AddJobMaxParallel adds max_parallel to action_run_job with a composite index on (run_id, job_id). 
+func AddJobMaxParallel(x *xorm.Engine) error { + type ActionRunJob struct { + RunID int64 `xorm:"index index(idx_run_id_job_id)"` + JobID string `xorm:"VARCHAR(255) index(idx_run_id_job_id)"` + MaxParallel int `xorm:"NOT NULL DEFAULT 0"` + } + + return x.Sync(new(ActionRunJob)) +} From d3d23ecff511cbf3e8acede0589bc64d27d5c2ac Mon Sep 17 00:00:00 2001 From: Pascal Zimmermann Date: Fri, 3 Apr 2026 09:46:49 +0200 Subject: [PATCH 12/12] feat: Update the job limit handling and add tests --- models/actions/run_job.go | 6 +- services/actions/job_emitter.go | 27 +++-- services/actions/job_emitter_test.go | 165 +++++++++++++++++++++++++++ 3 files changed, 180 insertions(+), 18 deletions(-) diff --git a/models/actions/run_job.go b/models/actions/run_job.go index 17a056efda..f3dbd364dd 100644 --- a/models/actions/run_job.go +++ b/models/actions/run_job.go @@ -28,10 +28,8 @@ type ActionRunJob struct { OwnerID int64 `xorm:"index"` CommitSHA string `xorm:"index"` IsForkPullRequest bool - - // ...existing code... - Name string `xorm:"VARCHAR(255)"` - Attempt int64 + Name string `xorm:"VARCHAR(255)"` + Attempt int64 // WorkflowPayload is act/jobparser.SingleWorkflow for act/jobparser.Parse // it should contain exactly one job with global workflow fields for this model diff --git a/services/actions/job_emitter.go b/services/actions/job_emitter.go index b3cdc6d3ad..bb4096d600 100644 --- a/services/actions/job_emitter.go +++ b/services/actions/job_emitter.go @@ -313,8 +313,15 @@ func (r *jobStatusResolver) resolveJobHasIfCondition(actionRunJob *actions_model func (r *jobStatusResolver) resolve(ctx context.Context) map[int64]actions_model.Status { ret := map[int64]actions_model.Status{} - // promotedWaitingByJobID counts within-pass promotions to enforce max-parallel. 
- promotedWaitingByJobID := make(map[string]int) + + // Pre-calculate the number of running-or-waiting jobs per JobID + runningOrWaiting := make(map[string]int) + for id, status := range r.statuses { + if status == actions_model.StatusRunning || status == actions_model.StatusWaiting { + runningOrWaiting[r.jobMap[id].JobID]++ + } + } + for id, status := range r.statuses { actionRunJob := r.jobMap[id] if status != actions_model.StatusBlocked { @@ -352,21 +359,13 @@ func (r *jobStatusResolver) resolve(ctx context.Context) map[int64]actions_model } } - // Enforce max-parallel: count occupied slots from the current snapshot - // plus within-pass promotions. + // Enforce max-parallel: if the number of running-or-waiting jobs of the same + // JobID already fills the limit, leave this job blocked. if newStatus == actions_model.StatusWaiting && actionRunJob.MaxParallel > 0 { - occupiedSlots := 0 - for otherID, otherStatus := range r.statuses { - otherJob := r.jobMap[otherID] - if otherJob.JobID == actionRunJob.JobID && - (otherStatus == actions_model.StatusRunning || otherStatus == actions_model.StatusWaiting) { - occupiedSlots++ - } - } - if occupiedSlots+promotedWaitingByJobID[actionRunJob.JobID] >= actionRunJob.MaxParallel { + if runningOrWaiting[actionRunJob.JobID] >= actionRunJob.MaxParallel { continue // no free slot; leave blocked } - promotedWaitingByJobID[actionRunJob.JobID]++ + runningOrWaiting[actionRunJob.JobID]++ } if newStatus != actions_model.StatusBlocked { diff --git a/services/actions/job_emitter_test.go b/services/actions/job_emitter_test.go index 3b6768bdd8..9bd1fd1913 100644 --- a/services/actions/job_emitter_test.go +++ b/services/actions/job_emitter_test.go @@ -4,6 +4,7 @@ package actions import ( + "fmt" "testing" actions_model "code.gitea.io/gitea/models/actions" @@ -183,3 +184,167 @@ jobs: }) } } + +func Test_maxParallelWorkflowLifecycle(t *testing.T) { + const matrixJobID = "matrix" + + countStatus := func(jobs actions_model.ActionJobList, s 
actions_model.Status) int { + n := 0 + for _, j := range jobs { + if j.Status == s { + n++ + } + } + return n + } + + applyUpdates := func(jobs actions_model.ActionJobList, updates map[int64]actions_model.Status) { + for _, j := range jobs { + if s, ok := updates[j.ID]; ok { + j.Status = s + } + } + } + + // pickUpAll simulates every WAITING job being accepted by a runner. + pickUpAll := func(jobs actions_model.ActionJobList) { + for _, j := range jobs { + if j.Status == actions_model.StatusWaiting { + j.Status = actions_model.StatusRunning + } + } + } + + // completeOne marks the first RUNNING job as SUCCESS. + completeOne := func(jobs actions_model.ActionJobList) { + for _, j := range jobs { + if j.Status == actions_model.StatusRunning { + j.Status = actions_model.StatusSuccess + return + } + } + } + + makeJobs := func(n, maxParallel int) actions_model.ActionJobList { + list := make(actions_model.ActionJobList, n) + for i := range n { + list[i] = &actions_model.ActionRunJob{ + ID: int64(i + 1), + JobID: matrixJobID, + Status: actions_model.StatusBlocked, + Needs: []string{}, + MaxParallel: maxParallel, + } + } + return list + } + + runResolve := func(t *testing.T, jobs actions_model.ActionJobList) map[int64]actions_model.Status { + t.Helper() + return newJobStatusResolver(jobs, nil).Resolve(t.Context()) + } + + // assertSlotInvariant verifies both slot constraints after every resolve cycle. + // It is a no-op when maxParallel=0 (unlimited). 
+	assertSlotInvariant := func(t *testing.T, jobs actions_model.ActionJobList, maxParallel int, label string) {
+		t.Helper()
+		if maxParallel == 0 {
+			return
+		}
+		running := countStatus(jobs, actions_model.StatusRunning)
+		waiting := countStatus(jobs, actions_model.StatusWaiting)
+		success := countStatus(jobs, actions_model.StatusSuccess)
+		remaining := len(jobs) - success
+		active := running + waiting
+
+		assert.LessOrEqual(t, active, maxParallel,
+			"%s: running(%d)+waiting(%d) must not exceed max-parallel(%d)",
+			label, running, waiting, maxParallel)
+
+		assert.Equal(t, min(remaining, maxParallel), active,
+			"%s: running(%d)+waiting(%d) should equal min(remaining=%d, maxParallel=%d)",
+			label, running, waiting, remaining, maxParallel)
+	}
+
+	tests := []struct {
+		name               string
+		totalJobs          int
+		maxParallel        int
+		wantInitialWaiting int // expected WAITING count after the very first Resolve()
+	}{
+		{
+			// 0 means unlimited: the max-parallel branch in resolve() is skipped
+			name:               "max-parallel=0 (unlimited): all 5 jobs start immediately",
+			totalJobs:          5,
+			maxParallel:        0,
+			wantInitialWaiting: 5,
+		},
+		{
+			// Strictest case: one slot, so the resolver must promote exactly 1 job
+			name:               "max-parallel=1 (strict serial): exactly 1 job at a time",
+			totalJobs:          5,
+			maxParallel:        1,
+			wantInitialWaiting: 1,
+		},
+		{
+			// Limit lower than job count: only 10 of the 12 jobs may be active at once.
+			name:               "max-parallel=10 (jobs exceed limit): first 10 of 12 start, rest queue",
+			totalJobs:          12,
+			maxParallel:        10,
+			wantInitialWaiting: 10,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			jobs := makeJobs(tc.totalJobs, tc.maxParallel)
+
+			applyUpdates(jobs, runResolve(t, jobs))
+
+			assert.Equal(t, tc.wantInitialWaiting, countStatus(jobs, actions_model.StatusWaiting),
+				"phase 1: Resolve should promote exactly %d jobs to WAITING", tc.wantInitialWaiting)
+			assert.Equal(t, tc.totalJobs-tc.wantInitialWaiting, countStatus(jobs, actions_model.StatusBlocked),
+				"phase 1: remaining %d jobs should still be BLOCKED", tc.totalJobs-tc.wantInitialWaiting)
+
+			pickUpAll(jobs)
+			assertSlotInvariant(t, jobs, tc.maxParallel, "phase 2 (after initial pickup)")
+
+			for cycle := 1; cycle <= tc.totalJobs; cycle++ {
+				if countStatus(jobs, actions_model.StatusRunning) == 0 {
+					break
+				}
+
+				completeOne(jobs)
+				applyUpdates(jobs, runResolve(t, jobs))
+
+				label := fmt.Sprintf("phase 3 cycle %d", cycle)
+				assertSlotInvariant(t, jobs, tc.maxParallel, label)
+
+				pickUpAll(jobs)
+			}
+
+			for countStatus(jobs, actions_model.StatusRunning) > 0 {
+				completeOne(jobs)
+				applyUpdates(jobs, runResolve(t, jobs))
+				pickUpAll(jobs)
+			}
+
+			assert.Equal(t, tc.totalJobs, countStatus(jobs, actions_model.StatusSuccess),
+				"phase 5: all %d jobs must reach SUCCESS", tc.totalJobs)
+			assert.Equal(t, 0, countStatus(jobs, actions_model.StatusBlocked),
+				"phase 5: no jobs may remain BLOCKED")
+			assert.Equal(t, 0, countStatus(jobs, actions_model.StatusWaiting),
+				"phase 5: no jobs may remain WAITING")
+			assert.Equal(t, 0, countStatus(jobs, actions_model.StatusRunning),
+				"phase 5: no jobs may remain RUNNING")
+		})
+	}
+}