├── .envrc.example ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── Rakefile ├── go.mod ├── go.sum ├── migrations ├── 001_pgxjob_setup.sql └── generate_migrations.sql ├── pgxjob.go ├── pgxjob_internal_test.go └── pgxjob_test.go /.envrc.example: -------------------------------------------------------------------------------- 1 | export POSTGRESQL_DATA_DIR=.dev/postgresql 2 | export PGPORT=5019 3 | export PGUSER=postgres 4 | export PGDATABASE=pgxjob_test 5 | export PGXJOB_TEST_DATABASE=pgxjob_test 6 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | env: 4 | PGXJOB_TEST_DATABASE: runner 5 | 6 | on: 7 | push: 8 | branches: [master] 9 | pull_request: 10 | branches: [master] 11 | 12 | jobs: 13 | test: 14 | name: Test 15 | runs-on: ubuntu-22.04 16 | 17 | steps: 18 | - name: Start and set up PostgreSQL 19 | run: | 20 | sudo systemctl start postgresql.service 21 | pg_isready 22 | sudo -u postgres createuser -s runner 23 | createdb runner 24 | 25 | - name: Check out code into the Go module directory 26 | uses: actions/checkout@v3 27 | 28 | - name: Set up Go 1.x 29 | uses: actions/setup-go@v4 30 | with: 31 | go-version: "1.21" 32 | 33 | - name: Setup Ruby for Rake 34 | uses: ruby/setup-ruby@v1 35 | with: 36 | ruby-version: "3.2" 37 | 38 | # - name: Setup upterm session 39 | # uses: lhotari/action-upterm@v1 40 | # with: 41 | ## limits ssh access and adds the ssh public key for the user which triggered the workflow 42 | # limit-access-to-actor: true 43 | 44 | - name: Prepare test database 45 | run: | 46 | rake test:prepare 47 | 48 | - name: Test 49 | run: go test -race -v ./... 
50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.dev 2 | /.envrc 3 | /tmp 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2023 Jack Christensen 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "rake/clean" 2 | 3 | CLOBBER.include("tmp", "generate_migrations.sql") 4 | 5 | directory "tmp" 6 | 7 | def test_database 8 | dbname = ENV["PGXJOB_TEST_DATABASE"] 9 | if dbname.nil? || dbname.empty? 
10 | puts "PGXJOB_TEST_DATABASE not set" 11 | exit 1 12 | end 13 | 14 | dbname 15 | end 16 | 17 | file "migrations/generate_migrations.sql" => FileList["migrations/*.sql"].exclude("migrations/generate_migrations.sql") do 18 | sh "tern", "gengen", "--version-table", "pgxjob_version", "-m", "migrations", "-o", "migrations/generate_migrations.sql" 19 | end 20 | 21 | file "tmp/.test_database_prepared" => FileList["tmp", "migrations/generate_migrations.sql"] do 22 | sh "dropdb", "--if-exists", test_database 23 | sh "createdb", test_database 24 | sh "psql", "--no-psqlrc", "--quiet", "--tuples-only", "--no-align", "-f", "migrations/generate_migrations.sql", "-o", "tmp/migrate.sql", test_database 25 | sh "psql", "--no-psqlrc", "--quiet", "-f", "tmp/migrate.sql", test_database 26 | touch "tmp/.test_database_prepared" 27 | end 28 | 29 | task "test:prepare" => "tmp/.test_database_prepared" 30 | 31 | task "test:full" => "test:prepare" do 32 | sh "go test -race ./..." 33 | end 34 | 35 | task "test:short" => "test:prepare" do 36 | sh "go test -short ./..." 
37 | end 38 | 39 | task default: "test:short" 40 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/jackc/pgxjob 2 | 3 | go 1.21 4 | 5 | require ( 6 | github.com/jackc/pgx/v5 v5.4.2 7 | github.com/jackc/pgxutil v0.0.0-20230722221055-3c9f5efec167 8 | github.com/stretchr/testify v1.8.4 9 | ) 10 | 11 | require ( 12 | github.com/davecgh/go-spew v1.1.1 // indirect 13 | github.com/jackc/pgpassfile v1.0.0 // indirect 14 | github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a // indirect 15 | github.com/jackc/pgxlisten v0.0.0-20230728233309-2632bad3185a // indirect 16 | github.com/jackc/puddle/v2 v2.2.0 // indirect 17 | github.com/pmezard/go-difflib v1.0.0 // indirect 18 | golang.org/x/crypto v0.11.0 // indirect 19 | golang.org/x/sync v0.1.0 // indirect 20 | golang.org/x/text v0.11.0 // indirect 21 | gopkg.in/yaml.v3 v3.0.1 // indirect 22 | ) 23 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 2 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 3 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= 5 | github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= 6 | github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a h1:bbPeKD0xmW/Y25WS6cokEszi5g+S0QxI/d45PkRi7Nk= 7 | github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= 8 | github.com/jackc/pgx/v5 v5.4.2 h1:u1gmGDwbdRUZiwisBm/Ky2M14uQyUP65bG8+20nnyrg= 9 | github.com/jackc/pgx/v5 v5.4.2/go.mod 
h1:q6iHT8uDNXWiFNOlRqJzBTaSH3+2xCXkokxHZC5qWFY= 10 | github.com/jackc/pgxlisten v0.0.0-20230728233309-2632bad3185a h1:Di9X9kIB5WbWxOJLai5MDj0AmA6bKSM0GsHRCqs41rI= 11 | github.com/jackc/pgxlisten v0.0.0-20230728233309-2632bad3185a/go.mod h1:EqjCOzkITPCEI0My7BdE2xm3r0fZ7OZycVDP+ki1ASA= 12 | github.com/jackc/pgxutil v0.0.0-20230722221055-3c9f5efec167 h1:+6z8OAwZnCJEcy0RVqg+Xf/4iyzQeO5mTwTQ2oN0Icc= 13 | github.com/jackc/pgxutil v0.0.0-20230722221055-3c9f5efec167/go.mod h1:/ezerYvY+yygB2Jupuzbgbj+VMX2Idgm0VGIEaSnwsw= 14 | github.com/jackc/puddle/v2 v2.2.0 h1:RdcDk92EJBuBS55nQMMYFXTxwstHug4jkhT5pq8VxPk= 15 | github.com/jackc/puddle/v2 v2.2.0/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= 16 | github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= 17 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 18 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 19 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 20 | github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= 21 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 22 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 23 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 24 | github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= 25 | github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= 26 | golang.org/x/crypto v0.11.0 h1:6Ewdq3tDic1mg5xRO4milcWCfMVQhI4NkqWWvqejpuA= 27 | golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= 28 | golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= 29 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 30 | golang.org/x/text v0.11.0 
h1:LAntKIrcmeSKERyiOh0XMV39LXS8IE9UL2yP7+f5ij4= 31 | golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= 32 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 33 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 34 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 35 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 36 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 37 | -------------------------------------------------------------------------------- /migrations/001_pgxjob_setup.sql: -------------------------------------------------------------------------------- 1 | create table pgxjob_groups ( 2 | id int primary key generated by default as identity, 3 | name text not null unique 4 | ); 5 | 6 | create table pgxjob_types ( 7 | id int primary key generated by default as identity, 8 | name text not null unique 9 | ); 10 | 11 | create table pgxjob_workers ( 12 | id int primary key, 13 | inserted_at timestamptz not null default now(), 14 | heartbeat timestamptz not null, 15 | group_id int not null references pgxjob_groups 16 | ); 17 | 18 | create sequence pgxjob_workers_id_seq as int cycle owned by pgxjob_workers.id; 19 | alter table pgxjob_workers alter column id set default nextval('pgxjob_workers_id_seq'); 20 | 21 | -- pgxjob_asap_jobs and pgxjob_run_at_jobs can be a very hot tables. We want to keep them as small as possible. 22 | -- 23 | -- Columns are carefully ordered to avoid wasted space. See https://www.2ndquadrant.com/en/blog/on-rocks-and-sand/ for 24 | -- more info. 25 | -- 26 | -- Both tables share the same sequence. 27 | 28 | create sequence pgxjob_jobs_id_seq as bigint; 29 | 30 | -- pgxjob_asap_jobs is a queue of jobs that should be run as soon as possible. 
31 | create table pgxjob_asap_jobs ( 32 | id bigint primary key default nextval('pgxjob_jobs_id_seq'), 33 | inserted_at timestamptz not null default now(), 34 | group_id int not null, -- purposely not a foreign key for best insert performance. pgxjob_groups rows are never deleted. 35 | type_id int not null, -- purposely not a foreign key for best insert performance. pgxjob_types rows are never deleted. 36 | worker_id int references pgxjob_workers on delete set null, 37 | params json -- use json instead of jsonb as it is faster for insert. 38 | ); 39 | 40 | -- pgxjob_run_at_jobs is a queue of jobs that should be run at a specific time. 41 | create table pgxjob_run_at_jobs ( 42 | id bigint primary key default nextval('pgxjob_jobs_id_seq'), 43 | inserted_at timestamptz not null default now(), 44 | run_at timestamptz not null, 45 | next_run_at timestamptz not null, 46 | group_id int not null, -- purposely not a foreign key for best insert performance. pgxjob_groups rows are never deleted. 47 | type_id int not null, -- purposely not a foreign key for best insert performance. pgxjob_types rows are never deleted. 48 | worker_id int references pgxjob_workers on delete set null, 49 | error_count int not null, 50 | last_error text, 51 | params json -- use json instead of jsonb as it is faster for insert. 52 | ); 53 | 54 | create index pgxjob_run_at_jobs_next_run_at_idx on pgxjob_run_at_jobs (next_run_at); 55 | 56 | create table pgxjob_job_runs ( 57 | job_id bigint not null, -- no foreign key because original jobs will be deleted 58 | job_inserted_at timestamptz not null, 59 | run_at timestamptz not null, 60 | 61 | -- not using tstzrange because jobs which take less than a microsecond may have the same value for started_at and 62 | -- finished_at because PostgreSQL only has microsecond level precision. A [) range where both ends have the same value 63 | -- is "empty". This special value would lose the time that the job actually ran. 
64 | started_at timestamptz not null, 65 | finished_at timestamptz not null, 66 | 67 | run_number int not null, 68 | group_id int not null, -- purposely not a foreign key for best insert performance. pgxjob_groups rows are never deleted. 69 | type_id int not null, -- purposely not a foreign key for best insert performance. pgxjob_types rows are never deleted. 70 | params json, 71 | error text, 72 | primary key (job_id, run_number) 73 | ); 74 | 75 | create index pgxjob_job_runs_finished_at_idx on pgxjob_job_runs using brin (job_id); 76 | -------------------------------------------------------------------------------- /migrations/generate_migrations.sql: -------------------------------------------------------------------------------- 1 | -- This file was generated by tern gengen v2.2.0-pre. 2 | -- 3 | -- If using psql to execute this script use the --no-psqlrc, --tuples-only, 4 | -- --quiet, and --no-align options to only output the migration SQL. 5 | -- 6 | -- e.g. psql --no-psqlrc --tuples-only --quiet --no-align -f this_file.sql 7 | -- 8 | -- The results can be redirected to a file where the proposed changes can be 9 | -- inspected or the results can be piped back into psql to migrate immediately. 10 | -- 11 | -- e.g. 
psql --no-psqlrc --tuples-only --quiet --no-align -f this_file.sql | psql 12 | 13 | set tern.version = -1; 14 | do $$ 15 | declare 16 | schema_version_table_exists boolean; 17 | begin 18 | select to_regclass('pgxjob_version') is not null into schema_version_table_exists; 19 | if schema_version_table_exists then 20 | perform set_config('tern.version', version::text, false) from pgxjob_version; 21 | end if; 22 | end 23 | $$; 24 | 25 | with migrations(version, up_sql) as ( 26 | values 27 | (0, 28 | $tern_gengen$ 29 | begin; 30 | create table pgxjob_version(version int4 not null); 31 | insert into pgxjob_version(version) values(0); 32 | $tern_gengen$) 33 | 34 | , (1, 35 | $tern_gengen$ 36 | -- 001_pgxjob_setup.sql 37 | begin; 38 | create table pgxjob_groups ( 39 | id int primary key generated by default as identity, 40 | name text not null unique 41 | ); 42 | 43 | create table pgxjob_types ( 44 | id int primary key generated by default as identity, 45 | name text not null unique 46 | ); 47 | 48 | create table pgxjob_workers ( 49 | id int primary key, 50 | inserted_at timestamptz not null default now(), 51 | heartbeat timestamptz not null, 52 | group_id int not null references pgxjob_groups 53 | ); 54 | 55 | create sequence pgxjob_workers_id_seq as int cycle owned by pgxjob_workers.id; 56 | alter table pgxjob_workers alter column id set default nextval('pgxjob_workers_id_seq'); 57 | 58 | -- pgxjob_asap_jobs and pgxjob_run_at_jobs can be a very hot tables. We want to keep them as small as possible. 59 | -- 60 | -- Columns are carefully ordered to avoid wasted space. See https://www.2ndquadrant.com/en/blog/on-rocks-and-sand/ for 61 | -- more info. 62 | -- 63 | -- Both tables share the same sequence. 64 | 65 | create sequence pgxjob_jobs_id_seq as bigint; 66 | 67 | -- pgxjob_asap_jobs is a queue of jobs that should be run as soon as possible. 
68 | create table pgxjob_asap_jobs ( 69 | id bigint primary key default nextval('pgxjob_jobs_id_seq'), 70 | inserted_at timestamptz not null default now(), 71 | group_id int not null, -- purposely not a foreign key for best insert performance. pgxjob_groups rows are never deleted. 72 | type_id int not null, -- purposely not a foreign key for best insert performance. pgxjob_types rows are never deleted. 73 | worker_id int references pgxjob_workers on delete set null, 74 | params json -- use json instead of jsonb as it is faster for insert. 75 | ); 76 | 77 | -- pgxjob_run_at_jobs is a queue of jobs that should be run at a specific time. 78 | create table pgxjob_run_at_jobs ( 79 | id bigint primary key default nextval('pgxjob_jobs_id_seq'), 80 | inserted_at timestamptz not null default now(), 81 | run_at timestamptz not null, 82 | next_run_at timestamptz not null, 83 | group_id int not null, -- purposely not a foreign key for best insert performance. pgxjob_groups rows are never deleted. 84 | type_id int not null, -- purposely not a foreign key for best insert performance. pgxjob_types rows are never deleted. 85 | worker_id int references pgxjob_workers on delete set null, 86 | error_count int not null, 87 | last_error text, 88 | params json -- use json instead of jsonb as it is faster for insert. 89 | ); 90 | 91 | create index pgxjob_run_at_jobs_next_run_at_idx on pgxjob_run_at_jobs (next_run_at); 92 | 93 | create table pgxjob_job_runs ( 94 | job_id bigint not null, -- no foreign key because original jobs will be deleted 95 | job_inserted_at timestamptz not null, 96 | run_at timestamptz not null, 97 | 98 | -- not using tstzrange because jobs which take less than a microsecond may have the same value for started_at and 99 | -- finished_at because PostgreSQL only has microsecond level precision. A [) range where both ends have the same value 100 | -- is "empty". This special value would lose the time that the job actually ran. 
101 | started_at timestamptz not null, 102 | finished_at timestamptz not null, 103 | 104 | run_number int not null, 105 | group_id int not null, -- purposely not a foreign key for best insert performance. pgxjob_groups rows are never deleted. 106 | type_id int not null, -- purposely not a foreign key for best insert performance. pgxjob_types rows are never deleted. 107 | params json, 108 | error text, 109 | primary key (job_id, run_number) 110 | ); 111 | 112 | create index pgxjob_job_runs_finished_at_idx on pgxjob_job_runs using brin (job_id);$tern_gengen$) 113 | 114 | ) 115 | select up_sql || ' 116 | update pgxjob_version set version = ' || version || '; 117 | commit; 118 | ' 119 | from migrations 120 | where version > current_setting('tern.version')::int4 121 | order by version asc; 122 | 123 | -------------------------------------------------------------------------------- /pgxjob.go: -------------------------------------------------------------------------------- 1 | // Package pgxjob provides a job runner using PostgreSQL. 2 | package pgxjob 3 | 4 | import ( 5 | "context" 6 | "encoding/json" 7 | "errors" 8 | "fmt" 9 | "math/rand" 10 | "os" 11 | "runtime/debug" 12 | "slices" 13 | "sync" 14 | "time" 15 | 16 | "github.com/jackc/pgx/v5" 17 | "github.com/jackc/pgx/v5/pgconn" 18 | "github.com/jackc/pgx/v5/pgtype/zeronull" 19 | "github.com/jackc/pgx/v5/pgxpool" 20 | "github.com/jackc/pgxutil" 21 | ) 22 | 23 | const PGNotifyChannel = "pgxjob_job_available" 24 | const defaultGroupName = "default" 25 | 26 | // DefaultContextScheduler is the default scheduler. It is returned by Ctx when no scheduler is set in the context. It 27 | // must be set before use. 28 | var DefaultContextScheduler *Scheduler 29 | 30 | type ctxKey struct{} 31 | 32 | // Scheduler is used to schedule jobs and start workers. 
33 | type Scheduler struct { 34 | acquireConn AcquireConnFunc 35 | 36 | jobGroupsByName map[string]*JobGroup 37 | jobGroupsByID map[int32]*JobGroup 38 | 39 | jobTypesByName map[string]*JobType 40 | jobTypesByID map[int32]*JobType 41 | 42 | handleError func(err error) 43 | 44 | setupDoneChan chan struct{} 45 | 46 | config *SchedulerConfig 47 | } 48 | 49 | type SchedulerConfig struct { 50 | // AcquireConn is used to get a connection to the database. It must be set. 51 | AcquireConn AcquireConnFunc 52 | 53 | // JobGroups is a lists of job groups that can be used by the scheduler. The job group "default" is always available. 54 | JobGroups []string 55 | 56 | // JobTypes is a list of job types that can be used by the scheduler. It must be set. 57 | JobTypes []*JobTypeConfig 58 | 59 | // HandleError is a function that is called when an error occurs that cannot be handled or returned. For example, a 60 | // network outage may cause a worker to be unable to fetch a job or record the outcome of an execution. The worker 61 | // should not be stopped because of this. Instead, it should try again later when the network may have been restored. 62 | // These types of errors are passed to HandleError. If not set errors are logged to stderr. 63 | HandleError func(err error) 64 | } 65 | 66 | // NewScheduler returns a new Scheduler. 
67 | func NewScheduler(config *SchedulerConfig) (*Scheduler, error) { 68 | if len(config.JobTypes) == 0 { 69 | return nil, fmt.Errorf("pgxjob: at least one job type must be registered") 70 | } 71 | 72 | if !slices.Contains(config.JobGroups, defaultGroupName) { 73 | config.JobGroups = append(config.JobGroups, defaultGroupName) 74 | } 75 | 76 | for _, jobGroupName := range config.JobGroups { 77 | if jobGroupName == "" { 78 | return nil, fmt.Errorf("pgxjob: job group name cannot be empty") 79 | } 80 | } 81 | 82 | for _, jobType := range config.JobTypes { 83 | if jobType.Name == "" { 84 | return nil, fmt.Errorf("pgxjob: job type name must be set") 85 | } 86 | if jobType.DefaultGroupName == "" { 87 | jobType.DefaultGroupName = defaultGroupName 88 | } 89 | 90 | if !slices.Contains(config.JobGroups, jobType.DefaultGroupName) { 91 | return nil, fmt.Errorf("pgxjob: job type has default group name %s that is not in job groups", jobType.DefaultGroupName) 92 | } 93 | 94 | if jobType.RunJob == nil { 95 | return nil, fmt.Errorf("params.RunJob must be set") 96 | } 97 | } 98 | 99 | s := &Scheduler{ 100 | acquireConn: config.AcquireConn, 101 | jobGroupsByName: map[string]*JobGroup{}, 102 | jobGroupsByID: map[int32]*JobGroup{}, 103 | jobTypesByName: make(map[string]*JobType), 104 | jobTypesByID: make(map[int32]*JobType), 105 | setupDoneChan: make(chan struct{}), 106 | config: config, 107 | } 108 | 109 | if config.HandleError == nil { 110 | s.handleError = func(err error) { 111 | fmt.Fprintf(os.Stderr, "pgxjob: HandleError: %v\n", err) 112 | } 113 | } else { 114 | s.handleError = config.HandleError 115 | } 116 | 117 | go func() { 118 | for { 119 | err := s.setup() 120 | if err == nil { 121 | return 122 | } 123 | s.handleError(fmt.Errorf("pgxjob: scheduler setup failed: %w", err)) 124 | time.Sleep(5 * time.Second) 125 | } 126 | }() 127 | 128 | return s, nil 129 | } 130 | 131 | // setup makes one attempt to setup the scheduler. 
132 | func (s *Scheduler) setup() error { 133 | ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) 134 | defer cancel() 135 | 136 | conn, release, err := s.acquireConn(ctx) 137 | if err != nil { 138 | return fmt.Errorf("failed to get connection: %w", err) 139 | } 140 | defer release() 141 | 142 | for _, groupName := range s.config.JobGroups { 143 | err := s.registerJobGroup(ctx, conn, groupName) 144 | if err != nil { 145 | return err 146 | } 147 | } 148 | 149 | for _, jobType := range s.config.JobTypes { 150 | err := s.registerJobType(ctx, conn, jobType) 151 | if err != nil { 152 | return err 153 | } 154 | } 155 | 156 | close(s.setupDoneChan) 157 | return nil 158 | } 159 | 160 | // JobType is a type of job. 161 | type JobType struct { 162 | // ID is the ID of the job type. It is set automatically. 163 | ID int32 164 | 165 | // Name is the name of the job type. 166 | Name string 167 | 168 | // DefaultGroup is the default group to use when enqueuing jobs of this type. 169 | DefaultGroup *JobGroup 170 | 171 | // RunJob is the function that will be called when a job of this type is run. 172 | RunJob func(ctx context.Context, job *Job) error 173 | } 174 | 175 | type JobGroup struct { 176 | ID int32 177 | Name string 178 | } 179 | 180 | // registerJobGroup registers a group. It must be called before any jobs are scheduled or workers are started. 
181 | func (s *Scheduler) registerJobGroup(ctx context.Context, conn DB, name string) error { 182 | var jobGroupID int32 183 | selectIDErr := conn.QueryRow(ctx, `select id from pgxjob_groups where name = $1`, name).Scan(&jobGroupID) 184 | if errors.Is(selectIDErr, pgx.ErrNoRows) { 185 | _, insertErr := conn.Exec(ctx, `insert into pgxjob_groups (name) values ($1) on conflict do nothing`, name) 186 | if insertErr != nil { 187 | return fmt.Errorf("failed to insert group %s: %w", name, insertErr) 188 | } 189 | 190 | selectIDErr = conn.QueryRow(ctx, `select id from pgxjob_groups where name = $1`, name).Scan(&jobGroupID) 191 | } 192 | if selectIDErr != nil { 193 | return fmt.Errorf("failed to select id for group %s: %w", name, selectIDErr) 194 | } 195 | 196 | jq := &JobGroup{ 197 | ID: jobGroupID, 198 | Name: name, 199 | } 200 | s.jobGroupsByName[jq.Name] = jq 201 | s.jobGroupsByID[jq.ID] = jq 202 | return nil 203 | } 204 | 205 | type JobTypeConfig struct { 206 | // Name is the name of the job type. It must be set and unique. 207 | Name string 208 | 209 | // DefaultGroupName is the name of the default group to use when enqueuing jobs of this type. If not set "default" is 210 | // used. 211 | DefaultGroupName string 212 | 213 | // RunJob is the function that will be called when a job of this type is run. It must be set. 214 | RunJob RunJobFunc 215 | } 216 | 217 | // registerJobType registers a job type. 
218 | func (s *Scheduler) registerJobType(ctx context.Context, conn DB, jobTypeConfig *JobTypeConfig) error { 219 | var jobTypeID int32 220 | selectIDErr := conn.QueryRow(ctx, `select id from pgxjob_types where name = $1`, jobTypeConfig.Name).Scan(&jobTypeID) 221 | if errors.Is(selectIDErr, pgx.ErrNoRows) { 222 | _, insertErr := conn.Exec(ctx, `insert into pgxjob_types (name) values ($1) on conflict do nothing`, jobTypeConfig.Name) 223 | if insertErr != nil { 224 | return fmt.Errorf("failed to insert job type %s: %w", jobTypeConfig.Name, insertErr) 225 | } 226 | 227 | selectIDErr = conn.QueryRow(ctx, `select id from pgxjob_types where name = $1`, jobTypeConfig.Name).Scan(&jobTypeID) 228 | } 229 | if selectIDErr != nil { 230 | return fmt.Errorf("failed to select id for job type %s: %w", jobTypeConfig.Name, selectIDErr) 231 | } 232 | 233 | jt := &JobType{ 234 | ID: jobTypeID, 235 | Name: jobTypeConfig.Name, 236 | DefaultGroup: s.jobGroupsByName[jobTypeConfig.DefaultGroupName], 237 | RunJob: jobTypeConfig.RunJob, 238 | } 239 | 240 | s.jobTypesByName[jt.Name] = jt 241 | s.jobTypesByID[jt.ID] = jt 242 | 243 | return nil 244 | } 245 | 246 | type RunJobFunc func(ctx context.Context, job *Job) error 247 | 248 | // UnmarshalParams returns a JobType.RunJob function that unmarshals job.Params into a T and calls fn. 249 | func UnmarshalParams[T any](fn func(ctx context.Context, job *Job, params T) error) RunJobFunc { 250 | return func(ctx context.Context, job *Job) error { 251 | var params T 252 | err := json.Unmarshal(job.Params, ¶ms) 253 | if err != nil { 254 | return fmt.Errorf("unmarshal job params failed: %w", err) 255 | } 256 | 257 | return fn(ctx, job, params) 258 | } 259 | } 260 | 261 | // DB is the type pgxjob uses to interact with the database when it does not specifically need a *pgx.Conn. 
262 | type DB interface { 263 | Begin(ctx context.Context) (pgx.Tx, error) 264 | CopyFrom(ctx context.Context, tableName pgx.Identifier, columnNames []string, rowSrc pgx.CopyFromSource) (int64, error) 265 | Exec(ctx context.Context, sql string, arguments ...any) (pgconn.CommandTag, error) 266 | Query(ctx context.Context, sql string, optionsAndArgs ...interface{}) (pgx.Rows, error) 267 | QueryRow(ctx context.Context, sql string, args ...any) pgx.Row 268 | SendBatch(ctx context.Context, b *pgx.Batch) (br pgx.BatchResults) 269 | } 270 | 271 | // JobSchedule is a schedule for a job. 272 | type JobSchedule struct { 273 | // GroupName is the name of the group to use when enqueuing the job. If not set the job type's default group is used. 274 | GroupName string 275 | 276 | // RunAt is the time to run the job. If not set the job is scheduled to run immediately. 277 | RunAt time.Time 278 | } 279 | 280 | // ScheduleNow schedules a job to be run immediately. 281 | func (m *Scheduler) ScheduleNow(ctx context.Context, db DB, jobTypeName string, jobParams any) error { 282 | return m.Schedule(ctx, db, jobTypeName, jobParams, JobSchedule{}) 283 | } 284 | 285 | // Schedule schedules a job to be run according to schedule. 
286 | func (m *Scheduler) Schedule(ctx context.Context, db DB, jobTypeName string, jobParams any, schedule JobSchedule) error { 287 | select { 288 | case <-m.setupDoneChan: 289 | case <-ctx.Done(): 290 | return fmt.Errorf("pgxjob: schedule %w", ctx.Err()) 291 | } 292 | 293 | jobType, ok := m.jobTypesByName[jobTypeName] 294 | if !ok { 295 | return fmt.Errorf("pgxjob: job type with name %s not registered", jobTypeName) 296 | } 297 | 298 | var jobGroup *JobGroup 299 | if schedule.GroupName == "" { 300 | jobGroup = jobType.DefaultGroup 301 | } else { 302 | var ok bool 303 | jobGroup, ok = m.jobGroupsByName[schedule.GroupName] 304 | if !ok { 305 | return fmt.Errorf("pgxjob: group with name %s not registered", schedule.GroupName) 306 | } 307 | } 308 | 309 | if schedule.RunAt.IsZero() { 310 | batch := &pgx.Batch{} 311 | batch.Queue( 312 | `insert into pgxjob_asap_jobs (group_id, type_id, params, worker_id) 313 | values ($1, $2, $3, (select id from pgxjob_workers where group_id = $1 order by random() limit 1))`, 314 | jobGroup.ID, jobType.ID, jobParams, 315 | ) 316 | batch.Queue(`select pg_notify($1, $2)`, PGNotifyChannel, jobGroup.Name) 317 | err := db.SendBatch(ctx, batch).Close() 318 | if err != nil { 319 | return fmt.Errorf("pgxjob: failed to schedule asap job: %w", err) 320 | } 321 | } else { 322 | _, err := db.Exec(ctx, 323 | `insert into pgxjob_run_at_jobs (group_id, type_id, params, run_at, next_run_at, error_count) values ($1, $2, $3, $4, $4, 0)`, 324 | jobGroup.ID, jobType.ID, jobParams, schedule.RunAt, 325 | ) 326 | if err != nil { 327 | return fmt.Errorf("pgxjob: failed to schedule run at job: %w", err) 328 | } 329 | } 330 | 331 | return nil 332 | } 333 | 334 | // Ctx returns the *Scheduler attached to ctx. If ctx does not have a *Scheduler attached then it returns 335 | // DefaultContextScheduler. 
func Ctx(ctx context.Context) *Scheduler {
	if s, ok := ctx.Value(ctxKey{}).(*Scheduler); ok {
		return s
	} else {
		return DefaultContextScheduler
	}
}

// WithContext returns a copy of ctx with s attached.
func (s *Scheduler) WithContext(ctx context.Context) context.Context {
	return context.WithValue(ctx, ctxKey{}, s)
}

// AcquireConnFunc is a function that acquires a database connection for exclusive use. It returns a release function
// that will be called when the connection is no longer needed.
type AcquireConnFunc func(ctx context.Context) (conn *pgxpool.Conn, release func(), err error)

// AcquireConnFuncFromPool returns an AcquireConnFunc that acquires connections from the given *pgxpool.Pool.
func AcquireConnFuncFromPool(pool *pgxpool.Pool) AcquireConnFunc {
	return func(ctx context.Context) (conn *pgxpool.Conn, release func(), err error) {
		conn, err = pool.Acquire(ctx)
		if err != nil {
			return nil, nil, fmt.Errorf("pgxjob: failed to acquire connection: %w", err)
		}

		return conn, func() { conn.Release() }, nil
	}
}

// WorkerConfig configures a Worker started by Scheduler.StartWorker. The zero value is usable: every field has a
// documented default that StartWorker fills in.
type WorkerConfig struct {
	// GroupName is the group to work. If empty, "default" is used.
	GroupName string

	// MaxConcurrentJobs is the maximum number of jobs to work concurrently. If not set 10 is used.
	MaxConcurrentJobs int

	// MaxPrefetchedJobs is the maximum number of prefetched jobs (i.e. jobs that are fetched from the database and
	// locked, but not yet being worked). If not set 1000 is used.
	MaxPrefetchedJobs int

	// PollInterval is the interval between polling for new jobs. If not set 10 seconds is used.
	PollInterval time.Duration

	// MaxBufferedJobResults is the maximum number of job results that can be buffered before the job results must be
	// flushed to the database. If not set 100 is used.
	MaxBufferedJobResults int

	// MaxBufferedJobResultAge is the maximum age of a buffered job result before the job results must be flushed to the
	// database. If not set 1 second is used.
	MaxBufferedJobResultAge time.Duration

	// ShouldLogJobRun is called for every job run. If it returns true then the run is logged to the pgxjob_job_runs
	// table. If it returns false it is not. If not set all job runs are logged.
	ShouldLogJobRun func(worker *Worker, job *Job, startTime, endTime time.Time, err error) bool

	// Heartbeat tuning. Unexported; defaults are set by StartWorker and tests can override via the setters in
	// pgxjob_internal_test.go.
	minHeartbeatDelay                  time.Duration // minimum delay between heartbeats (default 45s)
	heartbeatDelayJitter               time.Duration // random jitter added to each heartbeat delay (default 30s)
	workerDeadWithoutHeartbeatDuration time.Duration // workers without a heartbeat for this long are deleted (default 5 * (delay + jitter))
}

// Worker works jobs for a single job group. Create with Scheduler.StartWorker; stop with Shutdown.
type Worker struct {
	// id is the row ID in pgxjob_workers. Only valid after startup completes.
	id int32

	config *WorkerConfig
	group  *JobGroup

	scheduler  *Scheduler
	signalChan chan struct{} // capacity 1; non-blocking wake-up for the fetch loop

	// cancelCtx is cancelled by Shutdown to stop all worker goroutines.
	cancelCtx context.Context
	cancel    context.CancelFunc

	// startupCompleteChan is closed once the worker row has been inserted into pgxjob_workers.
	startupCompleteChan chan struct{}

	// mux guards jobRunnerGoroutineCount.
	mux                     sync.Mutex
	jobRunnerGoroutineCount int

	// runningJobsMux guards runningJobIDs — the set of jobs fetched/locked but not yet flushed as results.
	runningJobsMux sync.Mutex
	runningJobIDs  map[int64]struct{}

	jobRunnerGoroutineWaitGroup sync.WaitGroup
	jobChan                     chan *Job

	jobResultsChan          chan *jobResult
	writeJobResultsDoneChan chan struct{}

	heartbeatDoneChan         chan struct{}
	fetchAndStartJobsDoneChan chan struct{}
}

// StartWorker starts a worker. The *Worker is returned immediately, but the startup process is run in the background.
// This is to avoid blocking or returning an error if the database is temporarily unavailable. Use StartupComplete if
// it is necessary to wait for the worker to be ready.
func (m *Scheduler) StartWorker(config *WorkerConfig) (*Worker, error) {
	// Fill in defaults. NOTE(review): this mutates the caller's config rather than a copy — presumably intentional so
	// the test setters in pgxjob_internal_test.go take effect; confirm before changing.
	if config.GroupName == "" {
		config.GroupName = defaultGroupName
	}
	if !slices.Contains(m.config.JobGroups, config.GroupName) {
		return nil, fmt.Errorf("pgxjob: group with name %s not registered", config.GroupName)
	}

	if config.MaxConcurrentJobs == 0 {
		config.MaxConcurrentJobs = 10
	}

	if config.MaxPrefetchedJobs == 0 {
		config.MaxPrefetchedJobs = 1000
	}

	if config.PollInterval == 0 {
		config.PollInterval = 10 * time.Second
	}

	if config.MaxBufferedJobResults == 0 {
		config.MaxBufferedJobResults = 100
	}

	if config.MaxBufferedJobResultAge == 0 {
		config.MaxBufferedJobResultAge = 1 * time.Second
	}

	if config.minHeartbeatDelay == 0 {
		config.minHeartbeatDelay = 45 * time.Second
	}
	if config.heartbeatDelayJitter == 0 {
		config.heartbeatDelayJitter = 30 * time.Second
	}
	if config.workerDeadWithoutHeartbeatDuration == 0 {
		config.workerDeadWithoutHeartbeatDuration = 5 * (config.minHeartbeatDelay + config.heartbeatDelayJitter)
	}

	w := &Worker{
		config:                    config,
		scheduler:                 m,
		signalChan:                make(chan struct{}, 1),
		startupCompleteChan:       make(chan struct{}),
		runningJobIDs:             make(map[int64]struct{}, config.MaxConcurrentJobs+config.MaxPrefetchedJobs),
		jobChan:                   make(chan *Job),
		jobResultsChan:            make(chan *jobResult),
		writeJobResultsDoneChan:   make(chan struct{}),
		heartbeatDoneChan:         make(chan struct{}),
		fetchAndStartJobsDoneChan: make(chan struct{}),
	}
	w.cancelCtx, w.cancel = context.WithCancel(context.Background())

	// Background startup loop: retry w.start() every 30 seconds until it succeeds or the worker is shut down. Only
	// after a successful start are the three long-lived worker goroutines launched.
	go func() {
		for {
			select {
			case <-w.cancelCtx.Done():
				return
			default:
			}

			err := w.start()
			if err == nil {
				go w.heartbeat()
				go w.writeJobResults()
				go w.fetchAndStartJobs()
				return
			}
			w.scheduler.handleError(fmt.Errorf("pgxjob: failed to setup worker: %w", err))

			select {
			case <-w.cancelCtx.Done():
				return
			case <-time.After(30 * time.Second):
			}
		}
	}()

	return w, nil
}

// start registers the worker in the pgxjob_workers table and closes startupCompleteChan on success. It waits for the
// scheduler's own setup to finish first. Note that startupCompleteChan is only closed on the success path, so a
// worker that can never reach the database never reports startup complete.
func (w *Worker) start() error {
	ctx, cancel := context.WithTimeout(w.cancelCtx, 30*time.Second)
	defer cancel()

	select {
	case <-w.scheduler.setupDoneChan:
	case <-ctx.Done():
		return ctx.Err()
	}

	// Safe to read without the setup lock: jobGroupsByName is fully populated before setupDoneChan is closed, and
	// GroupName was validated against the registered groups in StartWorker.
	w.group = w.scheduler.jobGroupsByName[w.config.GroupName]

	conn, release, err := w.scheduler.acquireConn(ctx)
	if err != nil {
		return fmt.Errorf("failed to get connection: %w", err)
	}
	defer release()

	w.id, err = pgxutil.SelectRow(ctx, conn,
		`insert into pgxjob_workers (heartbeat, group_id) values (now(), $1) returning id`,
		[]any{w.group.ID},
		pgx.RowTo[int32],
	)
	if err != nil {
		return fmt.Errorf("failed to insert into pgxjob_workers: %w", err)
	}

	close(w.startupCompleteChan)
	return nil
}

// heartbeat periodically updates this worker's heartbeat timestamp and deletes workers whose heartbeat is too old.
// It runs until the worker's cancelCtx is cancelled. The delay between beats is minHeartbeatDelay plus a random
// jitter of up to heartbeatDelayJitter so multiple workers do not beat in lockstep.
func (w *Worker) heartbeat() {
	defer close(w.heartbeatDoneChan)

	for {
		select {
		case <-w.cancelCtx.Done():
			return
		case <-time.After(w.config.minHeartbeatDelay + time.Duration(rand.Int63n(int64(w.config.heartbeatDelayJitter)))):
			// Each beat runs inside a closure so a panic is converted into a reported error instead of killing the
			// heartbeat goroutine (and with it, dead-worker cleanup).
			func() {
				defer func() {
					if r := recover(); r != nil {
						w.handleWorkerError(fmt.Errorf("pgxjob: panic in heartbeat for %d: %v\n%s", w.id, r, debug.Stack()))
					}
				}()

				err := func() error {
					ctx, cancel := context.WithTimeout(w.cancelCtx, 15*time.Second)
					defer cancel()

					conn, release, err := w.scheduler.acquireConn(ctx)
					if err != nil {
						return fmt.Errorf("pgxjob: heartbeat for %d: failed to get connection: %w", w.id, err)
					}
					defer release()

					_, err = conn.Exec(ctx, `update pgxjob_workers set heartbeat = now() where id = $1`, w.id)
					if err != nil {
						return fmt.Errorf("pgxjob: heartbeat for %d: failed to update: %w", w.id, err)
					}

					// Reap workers (crashed or partitioned) that have not beaten recently. Their locked jobs become
					// eligible for other workers again.
					_, err = conn.Exec(ctx, `delete from pgxjob_workers where heartbeat + $1 < now()`, w.config.workerDeadWithoutHeartbeatDuration)
					if err != nil {
						return fmt.Errorf("pgxjob: heartbeat for %d: failed to cleanup dead workers: %w", w.id, err)
					}

					return nil
				}()
				if err != nil {
					w.handleWorkerError(err)
				}
			}()
		}
	}
}

// writeJobResults is the single goroutine that receives finished-job results from jobResultsChan, buffers them, and
// flushes them to the database either when MaxBufferedJobResults results are buffered or when the oldest buffered
// result reaches MaxBufferedJobResultAge. After cancelCtx is cancelled it keeps draining results until all job runner
// goroutines have exited, then performs a final flush.
func (w *Worker) writeJobResults() {
	defer close(w.writeJobResultsDoneChan)

	// Row for the pgxjob_job_runs log table.
	type pgxjobJobRun struct {
		JobID      int64
		InsertedAt time.Time
		RunAt      time.Time
		StartedAt  time.Time
		FinishedAt time.Time
		RunNumber  int32
		GroupID    int32
		TypeID     int32
		Params     []byte
		Error      zeronull.Text
	}

	// Pending retry update for a failed job.
	type pgxjobJobUpdate struct {
		ID        int64
		LastError string
		NextRunAt time.Time
		ASAP      bool
	}

	// Buffers are allocated once and reused across flushes.
	jobResults := make([]*jobResult, 0, w.config.MaxBufferedJobResults)
	asapJobIDsToDelete := make([]int64, 0, w.config.MaxBufferedJobResults)
	runAtJobIDsToDelete := make([]int64, 0, w.config.MaxBufferedJobResults)
	pgxjobJobRunsToInsert := make([]pgxjobJobRun, 0, w.config.MaxBufferedJobResults)

	// flushTimer is armed when the first result is buffered; the initial hour value is immediately stopped.
	flushTimer := time.NewTimer(time.Hour)
	flushTimer.Stop()

	flush := func() {
		// Stop and drain the timer so a stale tick cannot trigger a redundant flush.
		if !flushTimer.Stop() {
			select {
			case <-flushTimer.C:
			default:
			}
		}

		// Always clear the results even if there is an error. In case of error there is nothing that can be done.
		defer func() {
			// Results being flushed are no longer "running", so fetchJobs may pick these IDs up again.
			w.runningJobsMux.Lock()
			for _, jr := range jobResults {
				delete(w.runningJobIDs, jr.job.ID)
			}
			w.runningJobsMux.Unlock()

			// clear zeroes the elements (releasing pointers for GC) before truncating.
			clear(jobResults)
			jobResults = jobResults[:0]
			clear(asapJobIDsToDelete)
			asapJobIDsToDelete = asapJobIDsToDelete[:0]
			clear(runAtJobIDsToDelete)
			runAtJobIDsToDelete = runAtJobIDsToDelete[:0]
			clear(pgxjobJobRunsToInsert)
			pgxjobJobRunsToInsert = pgxjobJobRunsToInsert[:0]
		}()

		// Job errors should be rare. So do not use persistent slice like jobIDsToDelete and pgxjobJobRunsToInsert.
		var pgxjobJobUpdates []pgxjobJobUpdate
		for _, jobResult := range jobResults {
			job := jobResult.job
			var errForInsert zeronull.Text
			if jobResult.err == nil {
				// Success: the job row is simply deleted.
				if job.ASAP {
					asapJobIDsToDelete = append(asapJobIDsToDelete, jobResult.job.ID)
				} else {
					runAtJobIDsToDelete = append(runAtJobIDsToDelete, jobResult.job.ID)
				}
			} else {
				errForInsert = zeronull.Text(jobResult.err.Error())
				var errorWithRetry *ErrorWithRetry
				if errors.As(jobResult.err, &errorWithRetry) {
					// Failed but retryable: reschedule for errorWithRetry.RetryAt.
					pgxjobJobUpdates = append(pgxjobJobUpdates, pgxjobJobUpdate{
						ID:        job.ID,
						LastError: errorWithRetry.Err.Error(),
						NextRunAt: errorWithRetry.RetryAt,
						ASAP:      job.ASAP,
					})
				} else {
					// Failed permanently: delete like a success; the failure is recorded in pgxjob_job_runs below.
					if job.ASAP {
						asapJobIDsToDelete = append(asapJobIDsToDelete, jobResult.job.ID)
					} else {
						runAtJobIDsToDelete = append(runAtJobIDsToDelete, jobResult.job.ID)
					}
				}
			}
			if w.config.ShouldLogJobRun == nil || w.config.ShouldLogJobRun(w, job, jobResult.startTime, jobResult.finishedAt, jobResult.err) {
				pgxjobJobRunsToInsert = append(pgxjobJobRunsToInsert, pgxjobJobRun{
					JobID:      job.ID,
					InsertedAt: job.InsertedAt,
					RunAt:      job.RunAt,
					StartedAt:  jobResult.startTime,
					FinishedAt: jobResult.finishedAt,
					RunNumber:  job.ErrorCount + 1,
					GroupID:    job.Group.ID,
					TypeID:     job.Type.ID,
					Params:     job.Params,
					Error:      errForInsert,
				})
			}
		}

		batch := &pgx.Batch{}
		for _, jobUpdate := range pgxjobJobUpdates {
			if jobUpdate.ASAP {
				// A retried ASAP job is migrated to the run-at table, keeping its ID and using its inserted_at as the
				// original run_at.
				batch.Queue(
					`with t as (
	delete from pgxjob_asap_jobs where id = $1 returning *
)
insert into pgxjob_run_at_jobs (id, inserted_at, run_at, next_run_at, group_id, type_id, error_count, last_error, params)
select id, inserted_at, inserted_at, $2, group_id, type_id, 1, $3, params
from t`,
					jobUpdate.ID, jobUpdate.NextRunAt, jobUpdate.LastError,
				)
			} else {
				// Clearing worker_id releases the job lock so any worker can pick it up at next_run_at.
				batch.Queue(
					`update pgxjob_run_at_jobs set error_count = error_count + 1, last_error = $1, next_run_at = $2, worker_id = null where id = $3`,
					jobUpdate.LastError, jobUpdate.NextRunAt, jobUpdate.ID,
				)
			}
		}
		if len(asapJobIDsToDelete) > 0 {
			batch.Queue(`delete from pgxjob_asap_jobs where id = any($1)`, asapJobIDsToDelete)
		}
		if len(runAtJobIDsToDelete) > 0 {
			batch.Queue(`delete from pgxjob_run_at_jobs where id = any($1)`, runAtJobIDsToDelete)
		}

		// COPY FROM is faster than INSERT for multiple rows. But it has the overhead of an extra round trip and
		// (auto-commit) transaction. So for small numbers of rows it is faster to bundle INSERTs with the batch that is
		// already being used.
		const jobsRunCopyThreshhold = 5

		if len(pgxjobJobRunsToInsert) < jobsRunCopyThreshhold {
			for _, jobRun := range pgxjobJobRunsToInsert {
				batch.Queue(
					`insert into pgxjob_job_runs (job_id, job_inserted_at, run_at, started_at, finished_at, run_number, group_id, type_id, params, error) values ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)`,
					jobRun.JobID, jobRun.InsertedAt, jobRun.RunAt, jobRun.StartedAt, jobRun.FinishedAt, jobRun.RunNumber, jobRun.GroupID, jobRun.TypeID, jobRun.Params, jobRun.Error,
				)
			}
		}

		// The entirety of getting a connection and performing the updates should be very quick. But set a timeout as a
		// failsafe. context.Background() rather than cancelCtx so the final flush still runs during shutdown.
		ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
		defer cancel()

		conn, release, err := w.scheduler.acquireConn(ctx)
		if err != nil {
			w.handleWorkerError(fmt.Errorf("pgxjob: recording job results: failed to get connection: %w", err))
			return
		}
		defer release()

		// Note: purposely not using an explicit transaction. The batch and the copy are each transactional. The only value
		// of the explicit transaction would be to *not* save the batch changes if the copy failed. It is preferable to
		// preserve those changes even if the copy fails. It also saves a round trip for the begin and the commit.

		err = conn.SendBatch(ctx, batch).Close()
		if err != nil {
			w.handleWorkerError(fmt.Errorf("pgxjob: recording job results: failed to send batch: %w", err))
			return
		}

		if len(pgxjobJobRunsToInsert) >= jobsRunCopyThreshhold {
			_, err = conn.CopyFrom(
				ctx,
				pgx.Identifier{"pgxjob_job_runs"},
				[]string{"job_id", "job_inserted_at", "run_at", "started_at", "finished_at", "run_number", "group_id", "type_id", "params", "error"},
				pgx.CopyFromSlice(len(pgxjobJobRunsToInsert), func(i int) ([]any, error) {
					row := &pgxjobJobRunsToInsert[i]
					return []any{row.JobID, row.InsertedAt, row.RunAt, row.StartedAt, row.FinishedAt, row.RunNumber, row.GroupID, row.TypeID, row.Params, row.Error}, nil
				}),
			)
			if err != nil {
				w.handleWorkerError(fmt.Errorf("pgxjob: recording job results: failed to copy pgxjob_job_runs: %w", err))
				return
			}
		}
	}

	// Final flush of whatever is buffered when this goroutine exits.
	defer flush()

	appendJobResult := func(jobResult *jobResult) {
		jobResults = append(jobResults, jobResult)
		if len(jobResults) >= w.config.MaxBufferedJobResults {
			flush()
		} else if len(jobResults) == 1 {
			// First buffered result starts the age-based flush countdown.
			flushTimer.Reset(w.config.MaxBufferedJobResultAge)
		}
	}

	// Phase 1: normal operation until shutdown is requested.
loop1:
	for {
		select {
		case <-w.cancelCtx.Done():
			break loop1
		case jobResult := <-w.jobResultsChan:
			appendJobResult(jobResult)
		case <-flushTimer.C:
			flush()
		}
	}

	doneChan := make(chan struct{})
	go func() {
		// must be done before waiting for w.jobRunnerGoroutineWaitGroup because fetchAndStartJobs can start a job worker
		// which calls w.jobRunnerGoroutineWaitGroup.Add. From the docs for Add: "Note that calls with a positive delta that
		// occur when the counter is zero must happen before a Wait." Violating this rule can cause a race condition.
		<-w.fetchAndStartJobsDoneChan

		w.jobRunnerGoroutineWaitGroup.Wait()
		close(doneChan)
	}()

	// Phase 2: shutdown drain — keep accepting results until every job runner goroutine has exited.
loop2:
	for {
		select {
		case <-doneChan:
			break loop2
		case jobResult := <-w.jobResultsChan:
			appendJobResult(jobResult)
		case <-flushTimer.C:
			flush()
		}
	}
}

// StartupComplete returns a channel that is closed when the worker start is complete.
func (w *Worker) StartupComplete() <-chan struct{} {
	return w.startupCompleteChan
}

// ID gets the worker's ID. This is only valid after the worker startup has completed. This is guaranteed while processing
// a job, but not immediately after StartWorker has returned. Use StartupComplete to wait for the worker to start.
func (w *Worker) ID() int32 {
	return w.id
}

// Shutdown stops the worker. It waits for all jobs to finish before returning. Cancel ctx to force shutdown without
// waiting for jobs to finish or worker to cleanup.
func (w *Worker) Shutdown(ctx context.Context) error {
	w.cancel()

	// Wait for all worker goroutines to finish.
	doneChan := make(chan struct{})
	go func() {
		// must be done before waiting for w.jobRunnerGoroutineWaitGroup because fetchAndStartJobs can start a job worker
		// which calls w.jobRunnerGoroutineWaitGroup.Add. From the docs for Add: "Note that calls with a positive delta that
		// occur when the counter is zero must happen before a Wait." Violating this rule can cause a race condition.
		<-w.fetchAndStartJobsDoneChan

		w.jobRunnerGoroutineWaitGroup.Wait()
		<-w.writeJobResultsDoneChan
		<-w.heartbeatDoneChan
		close(doneChan)
	}()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-doneChan:
	}

	// Cleanup can't be started until all worker goroutines have finished. Otherwise, the cleanup may unlock a job that is
	// still being worked on this worker. Another worker could pick up the job and it could be executed multiple times.
	cleanupErrChan := make(chan error)
	go func() {
		// context.Background() with its own timeout: cancelCtx is already cancelled at this point.
		ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
		defer cancel()

		conn, release, err := w.scheduler.acquireConn(ctx)
		if err != nil {
			cleanupErrChan <- fmt.Errorf("pgxjob: shutdown failed to get connection for cleanup: %w", err)
			return
		}
		defer release()

		// Deleting the worker row releases any jobs still locked by this worker (via foreign key / worker_id).
		_, err = conn.Exec(ctx, `delete from pgxjob_workers where id = $1`, w.id)
		if err != nil {
			cleanupErrChan <- fmt.Errorf("pgxjob: shutdown failed to cleanup worker: %w", err)
			return
		}

		close(cleanupErrChan)
	}()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case err := <-cleanupErrChan:
		return err
	}
}

// Job is a single unit of work fetched from either pgxjob_asap_jobs or pgxjob_run_at_jobs.
type Job struct {
	ID         int64
	Group      *JobGroup
	Type       *JobType
	Params     []byte
	InsertedAt time.Time
	RunAt      time.Time // for ASAP jobs this equals InsertedAt
	LastError  string    // only populated for run-at jobs
	ErrorCount int32     // only populated for run-at jobs
	ASAP       bool      // true if fetched from pgxjob_asap_jobs
}

// jobResult records the outcome of one job run, produced by job runner goroutines and consumed by writeJobResults.
type jobResult struct {
	job        *Job
	startTime  time.Time
	finishedAt time.Time
	err        error
}

// fetchAndStartJobs is the main fetch loop: it repeatedly fetches locked jobs from the database and hands them to job
// runner goroutines, starting new runners up to MaxConcurrentJobs as needed. When the database has no more jobs it
// sleeps until PollInterval elapses or Signal is called. It runs until cancelCtx is cancelled and always returns nil
// (the error return exists for signature symmetry; callers invoke it via `go`).
func (w *Worker) fetchAndStartJobs() error {
	defer close(w.fetchAndStartJobsDoneChan)

	for {
		// Check if the context is done before processing any jobs.
		select {
		case <-w.cancelCtx.Done():
			return nil
		default:
		}

		jobs, err := w.fetchJobs()
		if err != nil {
			// context.Canceled means that w.cancelCtx was cancelled. This happens when shutting down.
			if errors.Is(err, context.Canceled) {
				return nil
			}
			w.handleWorkerError(fmt.Errorf("pgxjob: failed to fetch jobs: %w", err))
		}
		// Fewer jobs than the prefetch limit means the database is (momentarily) drained, so poll instead of
		// immediately fetching again.
		noJobsAvailableInDatabase := len(jobs) < w.config.MaxPrefetchedJobs
		w.runningJobsMux.Lock()
		for _, job := range jobs {
			w.runningJobIDs[job.ID] = struct{}{}
		}
		w.runningJobsMux.Unlock()

		for _, job := range jobs {
			select {
			case w.jobChan <- job:
			case <-w.cancelCtx.Done():
				return nil
			default:
				// No runner was immediately ready: start another runner (up to the limit) and then block until one
				// accepts the job or shutdown begins.
				w.mux.Lock()
				if w.jobRunnerGoroutineCount < w.config.MaxConcurrentJobs {
					w.startJobRunner()
				}
				w.mux.Unlock()
				select {
				case w.jobChan <- job:
				case <-w.cancelCtx.Done():
					return nil
				}
			}
		}

		if noJobsAvailableInDatabase {
			select {
			case <-w.cancelCtx.Done():
				return nil
			case <-time.NewTimer(w.config.PollInterval).C:
			case <-w.signalChan:
			}
		}
	}
}

// fetchPrelockedASAPJobsSQL is used to fetch pgxjob_asap_jobs that were prelocked by Schedule. It takes 4 bound
// parameters. $1 is group id. $2 is worker_id. $3 is an array of job IDs the worker is already working. $4 is the
// maximum number of jobs to fetch.
const fetchPrelockedASAPJobsSQL = `select id, type_id, params, inserted_at
from pgxjob_asap_jobs
where group_id = $1
	and worker_id = $2
	and not id = any ($3)
limit $4`

// fetchAndLockASAPJobsSQL is used to fetch and lock pgxjob_asap_jobs in a single query. It takes 3 bound parameters. $1
// is group id. $2 is the maximum number of jobs to fetch. $3 is worker_id that is locking the job.
//
// Exactly how concurrency and locking work with CTEs can be confusing, but the "for update skip locked" is held for the
// entire statement (actually the lock is held for the entire transaction) per Tom Lane
// (https://www.postgresql.org/message-id/1604.1499787945%40sss.pgh.pa.us).
const fetchAndLockASAPJobsSQL = `with lock_jobs as (
	select id
	from pgxjob_asap_jobs
	where group_id = $1
		and worker_id is null
	limit $2
	for update skip locked
)
update pgxjob_asap_jobs
set worker_id = $3
where id in (select id from lock_jobs)
returning id, type_id, params, inserted_at`

// fetchAndLockRunAtJobsSQL is used to fetch and lock jobs in a single query. It takes 3 bound parameters. $1 is group id. $2
// is the maximum number of jobs to fetch. $3 is worker_id that is locking the job.
//
// Exactly how concurrency and locking work with CTEs can be confusing, but the "for update skip locked" is held for the
// entire statement (actually the lock is held for the entire transaction) per Tom Lane
// (https://www.postgresql.org/message-id/1604.1499787945%40sss.pgh.pa.us).
990 | const fetchAndLockRunAtJobsSQL = `with lock_jobs as ( 991 | select id 992 | from pgxjob_run_at_jobs 993 | where next_run_at < now() 994 | and group_id = $1 995 | and worker_id is null 996 | limit $2 997 | for update skip locked 998 | ) 999 | update pgxjob_run_at_jobs 1000 | set worker_id = $3 1001 | where id in (select id from lock_jobs) 1002 | returning id, type_id, params, inserted_at, run_at, coalesce(last_error, ''), error_count` 1003 | 1004 | func (w *Worker) fetchJobs() ([]*Job, error) { 1005 | conn, release, err := w.scheduler.acquireConn(w.cancelCtx) 1006 | if err != nil { 1007 | return nil, fmt.Errorf("failed to get connection: %w", err) 1008 | } 1009 | defer release() 1010 | 1011 | jobTypeFromID := func(jobTypeID int32) *JobType { 1012 | if jobType, ok := w.scheduler.jobTypesByID[jobTypeID]; ok { 1013 | return jobType 1014 | } else { 1015 | // This should never happen because job types are created and never deleted. But if somehow it does happen then 1016 | // create a fake JobType with a RunJob that returns an error. 1017 | return &JobType{ 1018 | ID: jobTypeID, 1019 | RunJob: func(ctx context.Context, job *Job) error { 1020 | return fmt.Errorf("pgxjob: job type with id %d not registered", jobTypeID) 1021 | }, 1022 | } 1023 | } 1024 | } 1025 | 1026 | runningJobIDs := []int64{} // Important to use empty slice instead of nil because of NULL behavior in SQL. 
1027 | w.runningJobsMux.Lock() 1028 | for jobID := range w.runningJobIDs { 1029 | runningJobIDs = append(runningJobIDs, jobID) 1030 | } 1031 | w.runningJobsMux.Unlock() 1032 | 1033 | rowToASAPJob := func(row pgx.CollectableRow) (*Job, error) { 1034 | var job Job 1035 | var jobTypeID int32 1036 | err := row.Scan( 1037 | &job.ID, &jobTypeID, &job.Params, &job.InsertedAt, 1038 | ) 1039 | if err != nil { 1040 | return nil, err 1041 | } 1042 | job.RunAt = job.InsertedAt 1043 | job.ASAP = true 1044 | 1045 | job.Group = w.group 1046 | job.Type = jobTypeFromID(jobTypeID) 1047 | 1048 | return &job, nil 1049 | } 1050 | 1051 | jobs, err := pgxutil.Select(w.cancelCtx, conn, 1052 | fetchPrelockedASAPJobsSQL, 1053 | []any{w.group.ID, w.id, runningJobIDs, w.config.MaxPrefetchedJobs}, 1054 | rowToASAPJob, 1055 | ) 1056 | if err != nil { 1057 | return nil, fmt.Errorf("failed to fetch prelocked pgxjob_asap_jobs: %w", err) 1058 | } 1059 | 1060 | if len(jobs) < w.config.MaxPrefetchedJobs { 1061 | asapJobs, err := pgxutil.Select(w.cancelCtx, conn, 1062 | fetchAndLockASAPJobsSQL, 1063 | []any{w.group.ID, w.config.MaxPrefetchedJobs, w.id}, 1064 | rowToASAPJob, 1065 | ) 1066 | if err != nil { 1067 | // If some jobs were successfully locked. Return those without an error. 1068 | if len(jobs) > 0 { 1069 | return jobs, nil 1070 | } 1071 | return nil, fmt.Errorf("failed to fetch and lock pgxjob_asap_jobs: %w", err) 1072 | } 1073 | jobs = append(jobs, asapJobs...) 
1074 | } 1075 | 1076 | if len(jobs) < w.config.MaxPrefetchedJobs { 1077 | runAtJobs, err := pgxutil.Select(w.cancelCtx, conn, 1078 | fetchAndLockRunAtJobsSQL, 1079 | []any{w.group.ID, w.config.MaxPrefetchedJobs - len(jobs), w.id}, 1080 | func(row pgx.CollectableRow) (*Job, error) { 1081 | var job Job 1082 | var jobTypeID int32 1083 | err := row.Scan( 1084 | &job.ID, &jobTypeID, &job.Params, &job.InsertedAt, &job.RunAt, &job.LastError, &job.ErrorCount, 1085 | ) 1086 | if err != nil { 1087 | return nil, err 1088 | } 1089 | job.ASAP = false 1090 | 1091 | job.Group = w.group 1092 | job.Type = jobTypeFromID(jobTypeID) 1093 | 1094 | return &job, nil 1095 | }, 1096 | ) 1097 | if err != nil { 1098 | // If some jobs were successfully locked. Return those without an error. 1099 | if len(jobs) > 0 { 1100 | return jobs, nil 1101 | } 1102 | return nil, fmt.Errorf("failed to fetch and lock pgxjob_run_at_jobs: %w", err) 1103 | } 1104 | jobs = append(jobs, runAtJobs...) 1105 | } 1106 | 1107 | return jobs, nil 1108 | } 1109 | 1110 | // startJobRunner starts a new job runner. w.mux must be locked before calling. 
1111 | func (w *Worker) startJobRunner() { 1112 | w.jobRunnerGoroutineWaitGroup.Add(1) 1113 | w.jobRunnerGoroutineCount++ 1114 | 1115 | go func() { 1116 | defer func() { 1117 | w.mux.Lock() 1118 | w.jobRunnerGoroutineCount-- 1119 | w.mux.Unlock() 1120 | w.jobRunnerGoroutineWaitGroup.Done() 1121 | }() 1122 | 1123 | for { 1124 | select { 1125 | case job := <-w.jobChan: 1126 | startedAt := time.Now() 1127 | var err error 1128 | func() { 1129 | defer func() { 1130 | if r := recover(); r != nil { 1131 | stack := debug.Stack() 1132 | err = fmt.Errorf("panic: %v\n%s", r, string(stack)) 1133 | } 1134 | }() 1135 | err = job.Type.RunJob(w.cancelCtx, job) 1136 | }() 1137 | finishedAt := time.Now() 1138 | w.jobResultsChan <- &jobResult{job, startedAt, finishedAt, err} 1139 | 1140 | case <-w.cancelCtx.Done(): 1141 | return 1142 | 1143 | case <-time.NewTimer(5 * time.Second).C: 1144 | return 1145 | } 1146 | } 1147 | }() 1148 | } 1149 | 1150 | func (w *Worker) handleWorkerError(err error) { 1151 | w.scheduler.handleError(fmt.Errorf("worker %v: %w", w.id, err)) 1152 | } 1153 | 1154 | // Signal causes the worker to wake up and process requests. It is safe to call this from multiple goroutines. It does 1155 | // not block. 1156 | func (w *Worker) Signal() { 1157 | select { 1158 | case w.signalChan <- struct{}{}: 1159 | default: 1160 | } 1161 | } 1162 | 1163 | // HandleNotification implements the pgxlisten.Handler interface. This allows a Worker to be used as a 1164 | // pgxlisten.Listener. When it receives a notification for the worker's job group it calls Signal. 
func (w *Worker) HandleNotification(ctx context.Context, notification *pgconn.Notification, conn *pgx.Conn) error {
	// Only react to notifications for this worker's own group; other groups' payloads are ignored.
	if notification.Payload == w.group.Name {
		w.Signal()
	}

	return nil
}

// ErrorWithRetry wraps a job error together with the time the job should be retried. When a job returns an
// ErrorWithRetry the worker reschedules the job for RetryAt instead of discarding it.
type ErrorWithRetry struct {
	Err     error
	RetryAt time.Time
}

// Error implements the error interface by delegating to the wrapped error.
func (e *ErrorWithRetry) Error() string {
	return e.Err.Error()
}

// Unwrap exposes the wrapped error to errors.Is / errors.As.
func (e *ErrorWithRetry) Unwrap() error {
	return e.Err
}

// FilterError returns a RunJobFunc that calls runJob. If runJob returns an error then it calls filterError and returns
// its error. filterError is typically used to determine if the error should be retried or not.
func FilterError(runJob RunJobFunc, errorFilter ErrorFilter) RunJobFunc {
	return func(ctx context.Context, job *Job) error {
		jobErr := runJob(ctx, job)
		if jobErr != nil {
			return errorFilter.FilterError(job, jobErr)
		}

		return nil
	}
}

// ErrorFilter inspects a job error and may replace it (e.g. wrap it in an ErrorWithRetry).
type ErrorFilter interface {
	FilterError(job *Job, jobErr error) error
}

// FilterErrorFunc is a function adapter that implements ErrorFilter.
type FilterErrorFunc func(job *Job, jobErr error) error

func (f FilterErrorFunc) FilterError(job *Job, jobErr error) error {
	return f(job, jobErr)
}

// RetryLinearBackoffErrorFilter is an ErrorFilter that returns an ErrorWithRetry if the job should be retried. It uses
// a linear backoff to determine when to schedule the retries.
type RetryLinearBackoffErrorFilter struct {
	// MaxRetries is the maximum number of times to retry.
	MaxRetries int32

	// BaseDelay is the amount of time to wait before the first retry. The wait time will increase by BaseDelay for each
	// retry.
	BaseDelay time.Duration
}

// FilterError returns an ErrorWithRetry if the job should be retried. If the error is already an ErrorWithRetry then it
// is returned unmodified. If the job should not be retried then the original error is returned.
func (f *RetryLinearBackoffErrorFilter) FilterError(job *Job, jobErr error) error {
	if jobErr == nil {
		return nil
	}

	// Retries exhausted: return the original error so the job is finalized.
	if job.ErrorCount >= f.MaxRetries {
		return jobErr
	}

	// Already carries retry scheduling; don't override it.
	var errorWithRetry *ErrorWithRetry
	if errors.As(jobErr, &errorWithRetry) {
		return jobErr
	}

	// Linear backoff: delay grows by BaseDelay per previous failure.
	return &ErrorWithRetry{
		Err:     jobErr,
		RetryAt: time.Now().Add(time.Duration(job.ErrorCount+1) * f.BaseDelay),
	}
}

// LogFinalJobRuns is a ShouldLogJobRun function that returns true for the final run of a job. That is, the run was
// successful or it failed and will not try again.
func LogFinalJobRuns(worker *Worker, job *Job, startTime, endTime time.Time, err error) bool {
	if err == nil {
		return true
	}

	// A retryable error means another run is coming, so this run is not final.
	var errorWithRetry *ErrorWithRetry
	return !errors.As(err, &errorWithRetry)
}
--------------------------------------------------------------------------------
/pgxjob_internal_test.go:
--------------------------------------------------------------------------------
package pgxjob

import (
	"time"
)

// SetMinHeartbeatDelayForTest overrides the unexported heartbeat delay so tests can run heartbeats quickly.
func (c *WorkerConfig) SetMinHeartbeatDelayForTest(d time.Duration) {
	c.minHeartbeatDelay = d
}

// SetHeartbeatDelayJitterForTest overrides the unexported heartbeat jitter for tests.
func (c *WorkerConfig) SetHeartbeatDelayJitterForTest(d time.Duration) {
	c.heartbeatDelayJitter = d
}

// SetWorkerDeadWithoutHeartbeatDurationForTest overrides the unexported dead-worker threshold for tests.
func (c *WorkerConfig) SetWorkerDeadWithoutHeartbeatDurationForTest(d time.Duration) {
	c.workerDeadWithoutHeartbeatDuration = d
}

// SetupDoneChan exposes the scheduler's internal setup-complete channel to tests.
func (s *Scheduler) SetupDoneChan() chan struct{} {
	return s.setupDoneChan
}
--------------------------------------------------------------------------------
/pgxjob_test.go:
-------------------------------------------------------------------------------- 1 | package pgxjob_test 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "math/rand" 8 | "os" 9 | "testing" 10 | "time" 11 | 12 | "github.com/jackc/pgx/v5" 13 | "github.com/jackc/pgx/v5/pgtype" 14 | "github.com/jackc/pgx/v5/pgxpool" 15 | "github.com/jackc/pgxjob" 16 | "github.com/jackc/pgxlisten" 17 | "github.com/jackc/pgxutil" 18 | "github.com/stretchr/testify/assert" 19 | "github.com/stretchr/testify/require" 20 | ) 21 | 22 | // mustConnect connects to the database specified by the PGXJOB_TEST_DATABASE environment variable. It automatically 23 | // closes the connection when the test is finished. 24 | func mustConnect(t testing.TB) *pgx.Conn { 25 | t.Helper() 26 | 27 | dbname := os.Getenv("PGXJOB_TEST_DATABASE") 28 | if dbname == "" { 29 | t.Fatal("PGXJOB_TEST_DATABASE environment variable must be set") 30 | } 31 | 32 | config, err := pgx.ParseConfig(fmt.Sprintf("dbname=%s", os.Getenv("PGXJOB_TEST_DATABASE"))) 33 | require.NoError(t, err) 34 | 35 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 36 | defer cancel() 37 | conn, err := pgx.ConnectConfig(ctx, config) 38 | require.NoError(t, err) 39 | 40 | t.Cleanup(func() { 41 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 42 | defer cancel() 43 | err := conn.Close(ctx) 44 | if err != nil { 45 | t.Logf("Warning: error closing connection: %v", err) 46 | } 47 | }) 48 | 49 | return conn 50 | } 51 | 52 | func mustNewDBPool(t testing.TB) *pgxpool.Pool { 53 | t.Helper() 54 | 55 | dbname := os.Getenv("PGXJOB_TEST_DATABASE") 56 | if dbname == "" { 57 | t.Fatal("PGXJOB_TEST_DATABASE environment variable must be set") 58 | } 59 | 60 | config, err := pgxpool.ParseConfig(fmt.Sprintf("dbname=%s", os.Getenv("PGXJOB_TEST_DATABASE"))) 61 | require.NoError(t, err) 62 | 63 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 64 | defer cancel() 65 | pool, err := 
pgxpool.NewWithConfig(ctx, config) 66 | require.NoError(t, err) 67 | 68 | t.Cleanup(func() { 69 | // Close pool in a goroutine to avoid blocking forever if there are connections checked out. 70 | go pool.Close() 71 | }) 72 | 73 | return pool 74 | } 75 | // mustCleanDatabase empties every pgxjob table so each test starts from a known-empty state. 76 | func mustCleanDatabase(t testing.TB, conn *pgx.Conn) { 77 | t.Helper() 78 | 79 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 80 | defer cancel() 81 | for _, table := range []string{ 82 | "pgxjob_workers", 83 | "pgxjob_asap_jobs", 84 | "pgxjob_run_at_jobs", 85 | "pgxjob_job_runs", 86 | } { 87 | _, err := conn.Exec(ctx, fmt.Sprintf("delete from %s", table)) 88 | require.NoErrorf(t, err, "error cleaning table %s", table) 89 | } 90 | } 91 | // Row mirrors scanned with pgx.RowToStructByPos; field order must match the tables' column order. 92 | type asapJob struct { 93 | ID int64 94 | InsertedAt time.Time 95 | GroupID int32 96 | TypeID int32 97 | WorkerID pgtype.Int4 98 | Params []byte 99 | } 100 | 101 | type runAtJob struct { 102 | ID int64 103 | InsertedAt time.Time 104 | RunAt pgtype.Timestamptz 105 | NextRunAt pgtype.Timestamptz 106 | GroupID int32 107 | TypeID int32 108 | WorkerID pgtype.Int4 109 | ErrorCount pgtype.Int4 110 | LastError pgtype.Text 111 | Params []byte 112 | } 113 | 114 | type jobRun struct { 115 | JobID int64 116 | InsertedAt time.Time 117 | RunAt time.Time 118 | StartedAt time.Time 119 | FinishedAt time.Time 120 | RunNumber int32 121 | GroupID int32 122 | TypeID int32 123 | Params []byte 124 | LastError pgtype.Text 125 | } 126 | // TestASAPEndToEnd schedules an ASAP job, runs it with a worker, and verifies the recorded job run. 127 | func TestASAPEndToEnd(t *testing.T) { 128 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 129 | defer cancel() 130 | 131 | startTime := time.Now() 132 | 133 | conn := mustConnect(t) 134 | mustCleanDatabase(t, conn) 135 | dbpool := mustNewDBPool(t) 136 | 137 | jobRanChan := make(chan struct{}) 138 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 139 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 140 | JobTypes: []*pgxjob.JobTypeConfig{ 141 | { 142 | Name: "test", 143 | RunJob: 
func(ctx context.Context, job *pgxjob.Job) error { 144 | jobRanChan <- struct{}{} 145 | return nil 146 | }, 147 | }, 148 | }, 149 | HandleError: func(err error) { 150 | t.Errorf("scheduler HandleError: %v", err) 151 | }, 152 | }) 153 | require.NoError(t, err) 154 | // Schedule before the worker starts so the pending job row can be inspected below. 155 | err = scheduler.ScheduleNow(ctx, conn, "test", nil) 156 | require.NoError(t, err) 157 | 158 | afterScheduleNow := time.Now() 159 | 160 | job, err := pgxutil.SelectRow(ctx, conn, `select * from pgxjob_asap_jobs`, nil, pgx.RowToStructByPos[asapJob]) 161 | require.NoError(t, err) 162 | 163 | require.True(t, job.InsertedAt.After(startTime)) 164 | require.True(t, job.InsertedAt.Before(afterScheduleNow)) 165 | 166 | defaultGroupID, err := pgxutil.SelectRow(ctx, conn, `select id from pgxjob_groups where name = 'default'`, nil, pgx.RowTo[int32]) 167 | require.NoError(t, err) 168 | require.Equal(t, defaultGroupID, job.GroupID) 169 | 170 | testJobTypeID, err := pgxutil.SelectRow(ctx, conn, `select id from pgxjob_types where name = 'test'`, nil, pgx.RowTo[int32]) 171 | require.NoError(t, err) 172 | require.Equal(t, testJobTypeID, job.TypeID) 173 | 174 | require.Equal(t, []byte(nil), job.Params) 175 | 176 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{}) 177 | require.NoError(t, err) 178 | 179 | select { 180 | case <-jobRanChan: 181 | case <-time.After(30 * time.Second): 182 | t.Fatal("timed out waiting for job to run") 183 | } 184 | 185 | worker.Shutdown(context.Background()) 186 | 187 | afterRunNow := time.Now() 188 | 189 | jobRun, err := pgxutil.SelectRow(ctx, conn, `select * from pgxjob_job_runs where job_id = $1`, []any{job.ID}, pgx.RowToStructByPos[jobRun]) 190 | require.NoError(t, err) 191 | 192 | require.Equal(t, job.ID, jobRun.JobID) 193 | require.True(t, jobRun.InsertedAt.Equal(job.InsertedAt)) 194 | require.True(t, jobRun.RunAt.Equal(jobRun.InsertedAt)) 195 | require.True(t, jobRun.StartedAt.After(startTime)) 196 | require.True(t, jobRun.StartedAt.Before(afterRunNow)) 197 | 
require.True(t, jobRun.FinishedAt.After(startTime)) 198 | require.True(t, jobRun.FinishedAt.Before(afterRunNow)) 199 | require.EqualValues(t, 1, jobRun.RunNumber) 200 | require.Equal(t, job.GroupID, jobRun.GroupID) 201 | require.Equal(t, job.TypeID, jobRun.TypeID) 202 | require.Equal(t, job.Params, jobRun.Params) 203 | require.False(t, jobRun.LastError.Valid) 204 | } 205 | // TestRunAtEndToEnd exercises the run-at (delayed) job path end to end. 206 | func TestRunAtEndToEnd(t *testing.T) { 207 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 208 | defer cancel() 209 | 210 | startTime := time.Now() 211 | 212 | conn := mustConnect(t) 213 | mustCleanDatabase(t, conn) 214 | dbpool := mustNewDBPool(t) 215 | 216 | jobRanChan := make(chan struct{}) 217 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 218 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 219 | JobTypes: []*pgxjob.JobTypeConfig{ 220 | { 221 | Name: "test", 222 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 223 | jobRanChan <- struct{}{} 224 | return nil 225 | }, 226 | }, 227 | }, 228 | HandleError: func(err error) { 229 | t.Errorf("scheduler HandleError: %v", err) 230 | }, 231 | }) 232 | require.NoError(t, err) 233 | 234 | runAt := time.Now().Add(100 * time.Millisecond).Truncate(time.Millisecond) // Truncate because PostgreSQL only supports microsecond precision. 
235 | err = scheduler.Schedule(ctx, conn, "test", nil, pgxjob.JobSchedule{RunAt: runAt}) 236 | require.NoError(t, err) 237 | 238 | afterScheduleNow := time.Now() 239 | 240 | job, err := pgxutil.SelectRow(ctx, conn, `select * from pgxjob_run_at_jobs`, nil, pgx.RowToStructByPos[runAtJob]) 241 | require.NoError(t, err) 242 | 243 | require.True(t, job.InsertedAt.After(startTime)) 244 | require.True(t, job.InsertedAt.Before(afterScheduleNow)) 245 | require.True(t, job.RunAt.Time.Equal(runAt)) 246 | require.True(t, job.NextRunAt.Time.Equal(runAt)) 247 | 248 | defaultGroupID, err := pgxutil.SelectRow(ctx, conn, `select id from pgxjob_groups where name = 'default'`, nil, pgx.RowTo[int32]) 249 | require.NoError(t, err) 250 | require.Equal(t, defaultGroupID, job.GroupID) 251 | 252 | testJobTypeID, err := pgxutil.SelectRow(ctx, conn, `select id from pgxjob_types where name = 'test'`, nil, pgx.RowTo[int32]) 253 | require.NoError(t, err) 254 | require.Equal(t, testJobTypeID, job.TypeID) 255 | 256 | require.Equal(t, []byte(nil), job.Params) 257 | // Short poll interval so the run-at job is picked up promptly once its time arrives. 258 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{ 259 | PollInterval: 50 * time.Millisecond, 260 | }) 261 | require.NoError(t, err) 262 | 263 | select { 264 | case <-jobRanChan: 265 | case <-time.After(30 * time.Second): 266 | t.Fatal("timed out waiting for job to run") 267 | } 268 | 269 | worker.Shutdown(context.Background()) 270 | 271 | afterRunNow := time.Now() 272 | 273 | jobRun, err := pgxutil.SelectRow(ctx, conn, `select * from pgxjob_job_runs where job_id = $1`, []any{job.ID}, pgx.RowToStructByPos[jobRun]) 274 | require.NoError(t, err) 275 | 276 | require.Equal(t, job.ID, jobRun.JobID) 277 | require.True(t, jobRun.InsertedAt.Equal(job.InsertedAt)) 278 | require.True(t, jobRun.RunAt.Equal(runAt)) 279 | require.True(t, jobRun.StartedAt.After(startTime)) 280 | require.True(t, jobRun.StartedAt.Before(afterRunNow)) 281 | require.True(t, jobRun.FinishedAt.After(startTime)) 282 | require.True(t, 
jobRun.FinishedAt.Before(afterRunNow)) 283 | require.EqualValues(t, 1, jobRun.RunNumber) 284 | require.Equal(t, job.GroupID, jobRun.GroupID) 285 | require.Equal(t, job.TypeID, jobRun.TypeID) 286 | require.Equal(t, job.Params, jobRun.Params) 287 | require.False(t, jobRun.LastError.Valid) 288 | } 289 | // TestConcurrentJobSchedulingAndWorking schedules jobs both before and while a worker runs and verifies all of them complete. 290 | func TestConcurrentJobSchedulingAndWorking(t *testing.T) { 291 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 292 | defer cancel() 293 | 294 | conn := mustConnect(t) 295 | mustCleanDatabase(t, conn) 296 | dbpool := mustNewDBPool(t) 297 | 298 | jobRanChan := make(chan struct{}) 299 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 300 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 301 | JobTypes: []*pgxjob.JobTypeConfig{ 302 | { 303 | Name: "test", 304 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 305 | jobRanChan <- struct{}{} 306 | return nil 307 | }, 308 | }, 309 | }, 310 | HandleError: func(err error) { 311 | t.Errorf("scheduler HandleError: %v", err) 312 | }, 313 | }) 314 | require.NoError(t, err) 315 | 316 | totalJobs := 0 317 | 318 | // Schedule an ASAP job before the worker has started. 319 | err = scheduler.ScheduleNow(ctx, conn, "test", nil) 320 | require.NoError(t, err) 321 | totalJobs++ 322 | 323 | // Schedule a Run At job before the worker has started. 
324 | err = scheduler.Schedule(ctx, conn, "test", nil, pgxjob.JobSchedule{RunAt: time.Now().Add(500 * time.Millisecond)}) 325 | require.NoError(t, err) 326 | totalJobs++ 327 | 328 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{ 329 | PollInterval: 50 * time.Millisecond, 330 | }) 331 | require.NoError(t, err) 332 | // Keep scheduling both job kinds while the worker is running. 333 | for i := 0; i < 10; i++ { 334 | time.Sleep(10 * time.Millisecond) 335 | err = scheduler.ScheduleNow(ctx, conn, "test", nil) 336 | require.NoError(t, err) 337 | totalJobs++ 338 | 339 | err = scheduler.Schedule(ctx, conn, "test", nil, pgxjob.JobSchedule{RunAt: time.Now().Add(500 * time.Millisecond)}) 340 | require.NoError(t, err) 341 | totalJobs++ 342 | } 343 | 344 | for i := 0; i < totalJobs; i++ { 345 | select { 346 | case <-jobRanChan: 347 | case <-time.After(30 * time.Second): 348 | t.Fatal("timed out waiting for job to run") 349 | } 350 | } 351 | 352 | worker.Shutdown(context.Background()) 353 | 354 | pendingASAPJobsCount, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_asap_jobs`, nil, pgx.RowTo[int32]) 355 | require.NoError(t, err) 356 | require.EqualValues(t, 0, pendingASAPJobsCount) 357 | 358 | pendingRunAtJobsCount, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_run_at_jobs`, nil, pgx.RowTo[int32]) 359 | require.NoError(t, err) 360 | require.EqualValues(t, 0, pendingRunAtJobsCount) 361 | 362 | jobRunsCount, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_job_runs`, nil, pgx.RowTo[int32]) 363 | require.NoError(t, err) 364 | require.EqualValues(t, totalJobs, jobRunsCount) 365 | } 366 | // TestJobFailedNoRetry verifies a job whose error is not retryable is removed from the queues and its failure logged. 367 | func TestJobFailedNoRetry(t *testing.T) { 368 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 369 | defer cancel() 370 | 371 | startTime := time.Now() 372 | 373 | conn := mustConnect(t) 374 | mustCleanDatabase(t, conn) 375 | dbpool := mustNewDBPool(t) 376 | 377 | jobRanChan := make(chan struct{}) 378 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 
379 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 380 | JobTypes: []*pgxjob.JobTypeConfig{ 381 | { 382 | Name: "test", 383 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 384 | jobRanChan <- struct{}{} 385 | return fmt.Errorf("test error") 386 | }, 387 | }, 388 | }, 389 | HandleError: func(err error) { 390 | t.Errorf("scheduler HandleError: %v", err) 391 | }, 392 | }) 393 | require.NoError(t, err) 394 | 395 | err = scheduler.ScheduleNow(ctx, conn, "test", nil) 396 | require.NoError(t, err) 397 | 398 | job, err := pgxutil.SelectRow(ctx, conn, `select * from pgxjob_asap_jobs`, nil, pgx.RowToStructByPos[asapJob]) 399 | require.NoError(t, err) 400 | 401 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{}) 402 | require.NoError(t, err) 403 | 404 | select { 405 | case <-jobRanChan: 406 | case <-time.After(30 * time.Second): 407 | t.Fatal("timed out waiting for job to run") 408 | } 409 | 410 | worker.Shutdown(context.Background()) 411 | 412 | afterRunNow := time.Now() 413 | // The failed job must be gone from both queue tables — a plain (non-retryable) error is final. 414 | _, err = pgxutil.SelectRow(ctx, conn, `select * from pgxjob_asap_jobs where id = $1`, []any{job.ID}, pgx.RowToStructByPos[asapJob]) 415 | require.Error(t, err) 416 | require.ErrorIs(t, err, pgx.ErrNoRows) 417 | 418 | _, err = pgxutil.SelectRow(ctx, conn, `select * from pgxjob_run_at_jobs where id = $1`, []any{job.ID}, pgx.RowToStructByPos[runAtJob]) 419 | require.Error(t, err) 420 | require.ErrorIs(t, err, pgx.ErrNoRows) 421 | 422 | jobRun, err := pgxutil.SelectRow(ctx, conn, `select * from pgxjob_job_runs where job_id = $1`, []any{job.ID}, pgx.RowToStructByPos[jobRun]) 423 | require.NoError(t, err) 424 | 425 | require.Equal(t, job.ID, jobRun.JobID) 426 | require.True(t, jobRun.InsertedAt.Equal(job.InsertedAt)) 427 | require.True(t, jobRun.RunAt.Equal(jobRun.InsertedAt)) 428 | require.True(t, jobRun.StartedAt.After(startTime)) 429 | require.True(t, jobRun.StartedAt.Before(afterRunNow)) 430 | require.True(t, jobRun.FinishedAt.After(startTime)) 431 | require.True(t, 
jobRun.FinishedAt.Before(afterRunNow)) 432 | require.EqualValues(t, 1, jobRun.RunNumber) 433 | require.Equal(t, job.GroupID, jobRun.GroupID) 434 | require.Equal(t, job.TypeID, jobRun.TypeID) 435 | require.Equal(t, job.Params, jobRun.Params) 436 | require.Equal(t, "test error", jobRun.LastError.String) 437 | } 438 | // TestUnknownJobType verifies a job row whose type is not registered is consumed and its run logged with an error. 439 | func TestUnknownJobType(t *testing.T) { 440 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 441 | defer cancel() 442 | 443 | conn := mustConnect(t) 444 | mustCleanDatabase(t, conn) 445 | dbpool := mustNewDBPool(t) 446 | 447 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 448 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 449 | JobTypes: []*pgxjob.JobTypeConfig{ 450 | { 451 | Name: "test", 452 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 453 | return nil 454 | }, 455 | }, 456 | }, 457 | HandleError: func(err error) { 458 | t.Errorf("scheduler HandleError: %v", err) 459 | }, 460 | }) 461 | require.NoError(t, err) 462 | 463 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{}) 464 | require.NoError(t, err) 465 | 466 | err = pgxutil.InsertRow(ctx, conn, "pgxjob_asap_jobs", map[string]any{ 467 | "group_id": 1, // 1 should always be the default group 468 | "type_id": -1, // -1 should never exist 469 | }) 470 | require.NoError(t, err) 471 | 472 | require.Eventually(t, func() bool { 473 | n, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_asap_jobs`, nil, pgx.RowTo[int32]) 474 | require.NoError(t, err) 475 | return n == 0 476 | }, 5*time.Second, 100*time.Millisecond) 477 | 478 | worker.Shutdown(context.Background()) 479 | 480 | jobRun, err := pgxutil.SelectRow(ctx, conn, `select * from pgxjob_job_runs`, nil, pgx.RowToStructByPos[jobRun]) 481 | require.NoError(t, err) 482 | 483 | require.EqualValues(t, 1, jobRun.RunNumber) 484 | require.Equal(t, "pgxjob: job type with id -1 not registered", jobRun.LastError.String) 485 | } 486 | 487 | func 
TestJobFailedErrorWithRetry(t *testing.T) { 488 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 489 | defer cancel() 490 | 491 | startTime := time.Now() 492 | 493 | conn := mustConnect(t) 494 | mustCleanDatabase(t, conn) 495 | dbpool := mustNewDBPool(t) 496 | 497 | jobRanChan := make(chan struct{}) 498 | retryAt := time.Now().Add(1 * time.Hour) 499 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 500 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 501 | JobTypes: []*pgxjob.JobTypeConfig{ 502 | { 503 | Name: "test", 504 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 505 | jobRanChan <- struct{}{} 506 | return &pgxjob.ErrorWithRetry{Err: fmt.Errorf("test error"), RetryAt: retryAt} 507 | }, 508 | }, 509 | }, 510 | HandleError: func(err error) { 511 | t.Errorf("scheduler HandleError: %v", err) 512 | }, 513 | }) 514 | require.NoError(t, err) 515 | 516 | err = scheduler.ScheduleNow(ctx, conn, "test", nil) 517 | require.NoError(t, err) 518 | 519 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{}) 520 | require.NoError(t, err) 521 | 522 | select { 523 | case <-jobRanChan: 524 | case <-time.After(30 * time.Second): 525 | t.Fatal("timed out waiting for job to run") 526 | } 527 | 528 | worker.Shutdown(context.Background()) 529 | 530 | afterRunNow := time.Now() 531 | // A retryable failure moves the job into pgxjob_run_at_jobs with the error recorded. 532 | job, err := pgxutil.SelectRow(ctx, conn, `select * from pgxjob_run_at_jobs`, nil, pgx.RowToStructByPos[runAtJob]) 533 | require.NoError(t, err) 534 | require.EqualValues(t, 1, job.ErrorCount.Int32) 535 | require.Equal(t, "test error", job.LastError.String) 536 | 537 | jobRun, err := pgxutil.SelectRow(ctx, conn, `select * from pgxjob_job_runs where job_id = $1`, []any{job.ID}, pgx.RowToStructByPos[jobRun]) 538 | require.NoError(t, err) 539 | 540 | require.Equal(t, job.ID, jobRun.JobID) 541 | require.True(t, jobRun.InsertedAt.Equal(job.InsertedAt)) 542 | require.True(t, jobRun.RunAt.Equal(jobRun.InsertedAt)) 543 | require.True(t, 
jobRun.StartedAt.After(startTime)) 544 | require.True(t, jobRun.StartedAt.Before(afterRunNow)) 545 | require.True(t, jobRun.FinishedAt.After(startTime)) 546 | require.True(t, jobRun.FinishedAt.Before(afterRunNow)) 547 | require.EqualValues(t, 1, jobRun.RunNumber) 548 | require.Equal(t, job.GroupID, jobRun.GroupID) 549 | require.Equal(t, job.TypeID, jobRun.TypeID) 550 | require.Equal(t, job.Params, jobRun.Params) 551 | require.Equal(t, job.LastError, jobRun.LastError) 552 | } 553 | // TestWorkerRunsBacklog verifies a large backlog scheduled before startup is fully drained. 554 | func TestWorkerRunsBacklog(t *testing.T) { 555 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 556 | defer cancel() 557 | 558 | conn := mustConnect(t) 559 | mustCleanDatabase(t, conn) 560 | dbpool := mustNewDBPool(t) 561 | 562 | jobRanChan := make(chan struct{}) 563 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 564 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 565 | JobTypes: []*pgxjob.JobTypeConfig{ 566 | { 567 | Name: "test", 568 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 569 | jobRanChan <- struct{}{} 570 | return nil 571 | }, 572 | }, 573 | }, 574 | HandleError: func(err error) { 575 | t.Errorf("scheduler HandleError: %v", err) 576 | }, 577 | }) 578 | require.NoError(t, err) 579 | 580 | backlogCount := 10000 581 | for i := 0; i < backlogCount; i++ { 582 | err = scheduler.ScheduleNow(ctx, conn, "test", nil) 583 | require.NoError(t, err) 584 | } 585 | 586 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{}) 587 | require.NoError(t, err) 588 | 589 | for i := 0; i < backlogCount; i++ { 590 | select { 591 | case <-jobRanChan: 592 | case <-ctx.Done(): 593 | t.Fatal("timed out waiting for job to run") 594 | } 595 | } 596 | 597 | worker.Shutdown(context.Background()) 598 | 599 | jobsStillPending, err := pgxutil.SelectRow(ctx, conn, 600 | `select (select count(*) from pgxjob_asap_jobs) + (select count(*) from pgxjob_run_at_jobs)`, 601 | nil, 602 | pgx.RowTo[int32], 603 | ) 604 | require.NoError(t, err) 605 
| require.EqualValues(t, 0, jobsStillPending) 606 | 607 | jobsRun, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_job_runs`, nil, pgx.RowTo[int32]) 608 | require.NoError(t, err) 609 | require.EqualValues(t, backlogCount, jobsRun) 610 | } 611 | // TestWorkerIgnoresOtherJobGroups verifies a worker only takes jobs belonging to its own group. 612 | func TestWorkerIgnoresOtherJobGroups(t *testing.T) { 613 | // This test takes a while because it is waiting for something *not* to happen. 614 | if testing.Short() { 615 | t.Skip("skipping test in short mode.") 616 | } 617 | 618 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 619 | defer cancel() 620 | 621 | conn := mustConnect(t) 622 | mustCleanDatabase(t, conn) 623 | dbpool := mustNewDBPool(t) 624 | 625 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 626 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 627 | JobGroups: []string{"other"}, 628 | JobTypes: []*pgxjob.JobTypeConfig{ 629 | { 630 | Name: "test", 631 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 632 | return nil 633 | }, 634 | }, 635 | }, 636 | HandleError: func(err error) { 637 | t.Errorf("scheduler HandleError: %v", err) 638 | }, 639 | }) 640 | require.NoError(t, err) 641 | 642 | err = scheduler.ScheduleNow(ctx, conn, "test", nil) 643 | require.NoError(t, err) 644 | 645 | err = scheduler.Schedule(ctx, conn, "test", nil, pgxjob.JobSchedule{GroupName: "other"}) 646 | require.NoError(t, err) 647 | 648 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{ 649 | PollInterval: 500 * time.Millisecond, 650 | }) 651 | require.NoError(t, err) 652 | 653 | time.Sleep(5 * time.Second) 654 | 655 | worker.Shutdown(context.Background()) 656 | 657 | // Our job did run. 658 | jobsRun, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_job_runs`, nil, pgx.RowTo[int32]) 659 | require.NoError(t, err) 660 | require.EqualValues(t, 1, jobsRun) 661 | 662 | // But the other job did not. 
663 | asapJob, err := pgxutil.SelectRow(ctx, conn, `select * from pgxjob_asap_jobs`, nil, pgx.RowToAddrOfStructByPos[asapJob]) 664 | require.NoError(t, err) 665 | 666 | otherGroupID, err := pgxutil.SelectRow(ctx, conn, `select id from pgxjob_groups where name = $1`, []any{"other"}, pgx.RowTo[int32]) 667 | require.NoError(t, err) 668 | 669 | require.EqualValues(t, otherGroupID, asapJob.GroupID) 670 | } 671 | // TestWorkerShutdown verifies shutdown releases locked jobs and removes the worker registration row.
672 | func TestWorkerShutdown(t *testing.T) { 673 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 674 | defer cancel() 675 | 676 | conn := mustConnect(t) 677 | mustCleanDatabase(t, conn) 678 | dbpool := mustNewDBPool(t) 679 | 680 | jobRanChan := make(chan struct{}) 681 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 682 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 683 | JobTypes: []*pgxjob.JobTypeConfig{ 684 | { 685 | Name: "test", 686 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 687 | select { 688 | case jobRanChan <- struct{}{}: 689 | case <-time.After(10 * time.Millisecond): 690 | } 691 | return nil 692 | }, 693 | }, 694 | }, 695 | HandleError: func(err error) { 696 | t.Errorf("scheduler HandleError: %v", err) 697 | }, 698 | }) 699 | require.NoError(t, err) 700 | 701 | asapBacklogCount := 600 702 | for i := 0; i < asapBacklogCount; i++ { 703 | err = scheduler.ScheduleNow(ctx, conn, "test", nil) 704 | require.NoError(t, err) 705 | } 706 | runAtBacklogCount := 600 707 | for i := 0; i < runAtBacklogCount; i++ { 708 | err = scheduler.Schedule(ctx, conn, "test", nil, pgxjob.JobSchedule{RunAt: time.Now().Add(-1 * time.Second)}) 709 | require.NoError(t, err) 710 | } 711 | 712 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{ 713 | MaxConcurrentJobs: 1, 714 | MaxPrefetchedJobs: 1000, 715 | }) 716 | require.NoError(t, err) 717 | 718 | // Wait for at least 10 jobs to have started/run. 
719 | for i := 0; i < 10; i++ { 720 | select { 721 | case <-jobRanChan: 722 | case <-ctx.Done(): 723 | t.Fatal("timed out waiting for job to run") 724 | } 725 | } 726 | 727 | worker.Shutdown(context.Background()) 728 | // NOTE(review): waits for startup completion so worker.ID() and the checks below are meaningful — confirm StartupComplete closes even after Shutdown. 729 | select { 730 | case <-worker.StartupComplete(): 731 | case <-ctx.Done(): 732 | t.Fatal("timed out waiting for worker startup to complete") 733 | } 734 | 735 | shutdownWorkerExists, err := pgxutil.SelectRow(ctx, conn, `select exists(select id from pgxjob_workers where id = $1)`, []any{worker.ID()}, pgx.RowTo[bool]) 736 | require.NoError(t, err) 737 | require.Falsef(t, shutdownWorkerExists, "shutdown worker still exists") 738 | 739 | lockedASAPJobs, err := pgxutil.SelectRow(ctx, conn, 740 | `select count(*) from pgxjob_asap_jobs where worker_id is not null`, 741 | nil, 742 | pgx.RowTo[int32], 743 | ) 744 | require.NoError(t, err) 745 | require.EqualValues(t, 0, lockedASAPJobs) 746 | 747 | lockedRunAtJobs, err := pgxutil.SelectRow(ctx, conn, 748 | `select count(*) from pgxjob_run_at_jobs where worker_id is not null`, 749 | nil, 750 | pgx.RowTo[int32], 751 | ) 752 | require.NoError(t, err) 753 | require.EqualValues(t, 0, lockedRunAtJobs) 754 | 755 | unlockedASAPJobs, err := pgxutil.SelectRow(ctx, conn, 756 | `select count(*) from pgxjob_asap_jobs where worker_id is null`, 757 | nil, 758 | pgx.RowTo[int32], 759 | ) 760 | require.NoError(t, err) 761 | 762 | unlockedRunAtJobs, err := pgxutil.SelectRow(ctx, conn, 763 | `select count(*) from pgxjob_run_at_jobs where worker_id is null`, 764 | nil, 765 | pgx.RowTo[int32], 766 | ) 767 | require.NoError(t, err) 768 | 769 | jobsRun, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_job_runs`, nil, pgx.RowTo[int32]) 770 | require.NoError(t, err) 771 | // Every scheduled job is either still pending (unlocked) or has been run — none may be lost or left locked. 772 | require.EqualValues(t, asapBacklogCount+runAtBacklogCount, unlockedASAPJobs+unlockedRunAtJobs+jobsRun) 773 | 774 | } 775 | // TestWorkerHeartbeatBeats verifies the worker's heartbeat timestamp advances while it is running. 776 | func TestWorkerHeartbeatBeats(t *testing.T) { 777 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 778 
| defer cancel() 779 | 780 | conn := mustConnect(t) 781 | mustCleanDatabase(t, conn) 782 | dbpool := mustNewDBPool(t) 783 | 784 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 785 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 786 | JobTypes: []*pgxjob.JobTypeConfig{ 787 | { 788 | Name: "test", 789 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 790 | return nil 791 | }, 792 | }, 793 | }, 794 | HandleError: func(err error) { 795 | t.Errorf("scheduler HandleError: %v", err) 796 | }, 797 | }) 798 | require.NoError(t, err) 799 | // Shrink the heartbeat interval so the test can observe multiple beats quickly. 800 | workerConfig := &pgxjob.WorkerConfig{} 801 | workerConfig.SetMinHeartbeatDelayForTest(50 * time.Millisecond) 802 | workerConfig.SetHeartbeatDelayJitterForTest(50 * time.Millisecond) 803 | 804 | worker, err := scheduler.StartWorker(workerConfig) 805 | require.NoError(t, err) 806 | 807 | select { 808 | case <-worker.StartupComplete(): 809 | case <-ctx.Done(): 810 | t.Fatal("timed out waiting for worker to start") 811 | } 812 | 813 | firstHeartbeat, err := pgxutil.SelectRow(ctx, conn, `select heartbeat from pgxjob_workers where id = $1`, []any{worker.ID()}, pgx.RowTo[time.Time]) 814 | require.NoError(t, err) 815 | 816 | require.EventuallyWithT(t, func(c *assert.CollectT) { 817 | heartbeat, err := pgxutil.SelectRow(ctx, conn, `select heartbeat from pgxjob_workers where id = $1`, []any{worker.ID()}, pgx.RowTo[time.Time]) 818 | assert.NoError(c, err) 819 | assert.True(c, heartbeat.After(firstHeartbeat)) 820 | }, 5*time.Second, 100*time.Millisecond) 821 | 822 | worker.Shutdown(context.Background()) 823 | } 824 | // TestWorkerHeartbeatCleansUpDeadWorkers verifies a worker with a stale heartbeat is removed and its locked jobs are recovered and run. 825 | func TestWorkerHeartbeatCleansUpDeadWorkers(t *testing.T) { 826 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 827 | defer cancel() 828 | 829 | conn := mustConnect(t) 830 | mustCleanDatabase(t, conn) 831 | dbpool := mustNewDBPool(t) 832 | 833 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 834 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 
835 | JobTypes: []*pgxjob.JobTypeConfig{ 836 | { 837 | Name: "test", 838 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 839 | return nil 840 | }, 841 | }, 842 | }, 843 | HandleError: func(err error) { 844 | t.Errorf("scheduler HandleError: %v", err) 845 | }, 846 | }) 847 | require.NoError(t, err) 848 | 849 | err = scheduler.ScheduleNow(ctx, conn, "test", nil) 850 | require.NoError(t, err) 851 | 852 | err = scheduler.Schedule(ctx, conn, "test", nil, pgxjob.JobSchedule{RunAt: time.Now().Add(-30 * time.Minute)}) 853 | require.NoError(t, err) 854 | 855 | groupID, err := pgxutil.SelectRow(ctx, conn, `select id from pgxjob_groups limit 1`, nil, pgx.RowTo[int32]) 856 | require.NoError(t, err) 857 | // Fabricate a worker whose heartbeat is an hour stale and lock both jobs to it. 858 | deadWorkerID, err := pgxutil.InsertRowReturning(ctx, conn, 859 | "pgxjob_workers", 860 | map[string]any{"heartbeat": time.Now().Add(-time.Hour), "group_id": groupID}, 861 | "id", 862 | pgx.RowTo[int32], 863 | ) 864 | require.NoError(t, err) 865 | 866 | err = pgxutil.UpdateRow(ctx, conn, 867 | "pgxjob_asap_jobs", 868 | map[string]any{"inserted_at": time.Now().Add(-time.Hour), "worker_id": deadWorkerID}, 869 | nil, 870 | ) 871 | require.NoError(t, err) 872 | 873 | err = pgxutil.UpdateRow(ctx, conn, 874 | "pgxjob_run_at_jobs", 875 | map[string]any{"inserted_at": time.Now().Add(-time.Hour), "worker_id": deadWorkerID}, 876 | nil, 877 | ) 878 | require.NoError(t, err) 879 | 880 | workerConfig := &pgxjob.WorkerConfig{ 881 | PollInterval: 500 * time.Millisecond, 882 | } 883 | workerConfig.SetMinHeartbeatDelayForTest(50 * time.Millisecond) 884 | workerConfig.SetHeartbeatDelayJitterForTest(50 * time.Millisecond) 885 | 886 | worker, err := scheduler.StartWorker(workerConfig) 887 | require.NoError(t, err) 888 | 889 | require.EventuallyWithT(t, func(c *assert.CollectT) { 890 | deadWorkerExists, err := pgxutil.SelectRow(ctx, conn, `select exists(select id from pgxjob_workers where id = $1)`, []any{deadWorkerID}, pgx.RowTo[bool]) 891 | assert.NoError(c, err) 892 | 
assert.Falsef(c, deadWorkerExists, "dead worker still exists") 893 | 894 | asapJobsStillPending, err := pgxutil.SelectRow(ctx, conn, 895 | `select count(*) from pgxjob_asap_jobs`, 896 | nil, 897 | pgx.RowTo[int32], 898 | ) 899 | assert.NoError(c, err) 900 | assert.EqualValuesf(c, 0, asapJobsStillPending, "asap jobs still pending") 901 | 902 | runAtJobsStillPending, err := pgxutil.SelectRow(ctx, conn, 903 | `select count(*) from pgxjob_run_at_jobs`, 904 | nil, 905 | pgx.RowTo[int32], 906 | ) 907 | assert.NoError(c, err) 908 | assert.EqualValuesf(c, 0, runAtJobsStillPending, "run at jobs still pending") 909 | 910 | jobsRun, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_job_runs`, nil, pgx.RowTo[int32]) 911 | assert.NoError(c, err) 912 | assert.EqualValues(c, 2, jobsRun) 913 | }, 5*time.Second, 100*time.Millisecond) 914 | 915 | worker.Shutdown(context.Background()) 916 | } 917 | // TestWorkerShouldLogJobRun verifies that with LogFinalJobRuns only the final (successful) run is logged. 918 | func TestWorkerShouldLogJobRun(t *testing.T) { 919 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 920 | defer cancel() 921 | 922 | conn := mustConnect(t) 923 | mustCleanDatabase(t, conn) 924 | dbpool := mustNewDBPool(t) 925 | 926 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 927 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 928 | JobTypes: []*pgxjob.JobTypeConfig{ 929 | { 930 | Name: "test", 931 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 932 | if job.ErrorCount == 0 { 933 | return &pgxjob.ErrorWithRetry{Err: fmt.Errorf("failed first time"), RetryAt: time.Now().Add(100 * time.Millisecond)} 934 | } 935 | return nil 936 | }, 937 | }, 938 | }, 939 | HandleError: func(err error) { 940 | t.Errorf("scheduler HandleError: %v", err) 941 | }, 942 | }) 943 | require.NoError(t, err) 944 | 945 | err = scheduler.ScheduleNow(ctx, conn, "test", nil) 946 | require.NoError(t, err) 947 | 948 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{ 949 | PollInterval: 50 * time.Millisecond, 950 | 
ShouldLogJobRun: pgxjob.LogFinalJobRuns, 951 | }) 952 | require.NoError(t, err) 953 | 954 | require.EventuallyWithT(t, func(c *assert.CollectT) { 955 | jobsStillPending, err := pgxutil.SelectRow(ctx, conn, 956 | `select (select count(*) from pgxjob_asap_jobs) + (select count(*) from pgxjob_run_at_jobs)`, 957 | nil, 958 | pgx.RowTo[int32], 959 | ) 960 | assert.NoError(c, err) 961 | assert.EqualValues(c, 0, jobsStillPending) 962 | }, 30*time.Second, 100*time.Millisecond) 963 | 964 | worker.Shutdown(context.Background()) 965 | 966 | jobRunsCount, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_job_runs`, nil, pgx.RowTo[int32]) 967 | require.NoError(t, err) 968 | require.EqualValuesf(t, 1, jobRunsCount, "only one job run should have been logged") 969 | 970 | jobRun, err := pgxutil.SelectRow(ctx, conn, `select * from pgxjob_job_runs`, nil, pgx.RowToStructByPos[jobRun]) 971 | require.NoError(t, err) 972 | require.False(t, jobRun.LastError.Valid) 973 | } 974 | // TestStartupWhenDatabaseTemporarilyUnavailable verifies the worker retries startup until the database is reachable. 975 | func TestStartupWhenDatabaseTemporarilyUnavailable(t *testing.T) { 976 | if testing.Short() { 977 | t.Skip("skipping test in short mode.") 978 | } 979 | 980 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 981 | defer cancel() 982 | 983 | conn := mustConnect(t) 984 | mustCleanDatabase(t, conn) 985 | dbpool := mustNewDBPool(t) 986 | // AcquireConn fails for the first five seconds to simulate a database outage during startup. 987 | dbAvailableAt := time.Now().Add(5 * time.Second) 988 | acquireConn := func(ctx context.Context) (conn *pgxpool.Conn, release func(), err error) { 989 | if time.Now().Before(dbAvailableAt) { 990 | return nil, nil, fmt.Errorf("database temporarily unavailable") 991 | } 992 | return pgxjob.AcquireConnFuncFromPool(dbpool)(ctx) 993 | } 994 | 995 | jobDoneChan := make(chan struct{}) 996 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 997 | AcquireConn: acquireConn, 998 | JobTypes: []*pgxjob.JobTypeConfig{ 999 | { 1000 | Name: "test", 1001 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 1002 | jobDoneChan <- 
struct{}{} 1003 | return nil 1004 | }, 1005 | }, 1006 | }, 1007 | HandleError: func(err error) { 1008 | t.Logf("scheduler HandleError: %v", err) 1009 | }, 1010 | }) 1011 | require.NoError(t, err) 1012 | 1013 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{ 1014 | PollInterval: 500 * time.Millisecond, 1015 | }) 1016 | require.NoError(t, err) 1017 | 1018 | err = scheduler.ScheduleNow(ctx, conn, "test", nil) 1019 | require.NoError(t, err) 1020 | 1021 | select { 1022 | case <-worker.StartupComplete(): 1023 | case <-ctx.Done(): 1024 | t.Fatal("timed out waiting for worker to start") 1025 | } 1026 | 1027 | select { 1028 | case <-jobDoneChan: 1029 | case <-ctx.Done(): 1030 | t.Fatal("timed out waiting for worker to start") 1031 | } 1032 | 1033 | worker.Shutdown(context.Background()) 1034 | 1035 | jobsRun, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_job_runs`, nil, pgx.RowTo[int32]) 1036 | require.NoError(t, err) 1037 | require.EqualValues(t, 1, jobsRun) 1038 | } 1039 | 1040 | // This test is actually a benchmark of how much writes the database actually performs. It is used to measure and 1041 | // minimize the number of writes. 1042 | func TestBenchmarkDatabaseWrites(t *testing.T) { 1043 | if os.Getenv("BENCHMARK_DATABASE_WRITES") == "" { 1044 | t.Skip("skipping benchmark") 1045 | } 1046 | 1047 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 1048 | defer cancel() 1049 | 1050 | conn := mustConnect(t) 1051 | mustCleanDatabase(t, conn) 1052 | dbpool := mustNewDBPool(t) 1053 | 1054 | type pgStatWAL struct { 1055 | WALRecords int64 1056 | WALBytes int64 1057 | WALWrite int64 1058 | } 1059 | 1060 | type pgStatTable struct { 1061 | NTupIns int64 1062 | NTupUpd int64 1063 | NTupDel int64 1064 | NDeadTup int64 1065 | } 1066 | 1067 | // Start with a clean database and stats. 
1068 | _, err := conn.Exec(ctx, `vacuum full analyze`) 1069 | require.NoError(t, err) 1070 | 1071 | startStatWAL, err := pgxutil.SelectRow(ctx, conn, `select wal_records, wal_bytes, wal_write from pg_stat_wal`, nil, pgx.RowToStructByPos[pgStatWAL]) 1072 | require.NoError(t, err) 1073 | 1074 | startStatTable, err := pgxutil.SelectRow(ctx, conn, `select n_tup_ins, n_tup_upd, n_tup_del, n_dead_tup from pg_stat_all_tables where relname = 'pgxjob_asap_jobs'`, nil, pgx.RowToStructByPos[pgStatTable]) 1075 | require.NoError(t, err) 1076 | 1077 | jobRanChan := make(chan struct{}, 100) 1078 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 1079 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 1080 | JobTypes: []*pgxjob.JobTypeConfig{ 1081 | { 1082 | Name: "test", 1083 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 1084 | jobRanChan <- struct{}{} 1085 | return nil 1086 | }, 1087 | }, 1088 | }, 1089 | HandleError: func(err error) { 1090 | t.Errorf("scheduler HandleError: %v", err) 1091 | }, 1092 | }) 1093 | require.NoError(t, err) 1094 | 1095 | totalJobs := 0 1096 | 1097 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{ 1098 | ShouldLogJobRun: func(worker *pgxjob.Worker, job *pgxjob.Job, startTime, endTime time.Time, err error) bool { 1099 | return false 1100 | }, 1101 | }) 1102 | require.NoError(t, err) 1103 | 1104 | listener := &pgxlisten.Listener{ 1105 | Connect: func(ctx context.Context) (*pgx.Conn, error) { 1106 | return pgx.Connect(ctx, fmt.Sprintf("dbname=%s", os.Getenv("PGXJOB_TEST_DATABASE"))) 1107 | }, 1108 | } 1109 | 1110 | listener.Handle(pgxjob.PGNotifyChannel, worker) 1111 | 1112 | listenerCtx, listenerCtxCancel := context.WithCancel(ctx) 1113 | defer listenerCtxCancel() 1114 | listenErrChan := make(chan error) 1115 | go func() { 1116 | err := listener.Listen(listenerCtx) 1117 | if err != nil && !errors.Is(err, context.Canceled) { 1118 | listenErrChan <- err 1119 | } 1120 | close(listenErrChan) 1121 | }() 1122 | 1123 
| for i := 0; i < 100_000; i++ { 1124 | err = scheduler.ScheduleNow(ctx, conn, "test", nil) 1125 | require.NoError(t, err) 1126 | totalJobs++ 1127 | } 1128 | 1129 | for i := 0; i < totalJobs; i++ { 1130 | select { 1131 | case <-jobRanChan: 1132 | case <-time.After(30 * time.Second): 1133 | t.Fatal("timed out waiting for job to run") 1134 | } 1135 | } 1136 | 1137 | worker.Shutdown(context.Background()) 1138 | 1139 | pendingASAPJobsCount, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_asap_jobs`, nil, pgx.RowTo[int32]) 1140 | require.NoError(t, err) 1141 | require.EqualValues(t, 0, pendingASAPJobsCount) 1142 | 1143 | pendingRunAtJobsCount, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_run_at_jobs`, nil, pgx.RowTo[int32]) 1144 | require.NoError(t, err) 1145 | require.EqualValues(t, 0, pendingRunAtJobsCount) 1146 | 1147 | jobRunsCount, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_job_runs`, nil, pgx.RowTo[int32]) 1148 | require.NoError(t, err) 1149 | require.EqualValues(t, 0, jobRunsCount) 1150 | 1151 | listenerCtxCancel() 1152 | err = <-listenErrChan 1153 | require.NoError(t, err) 1154 | 1155 | endStatWAL, err := pgxutil.SelectRow(ctx, conn, `select wal_records, wal_bytes, wal_write from pg_stat_wal`, nil, pgx.RowToStructByPos[pgStatWAL]) 1156 | require.NoError(t, err) 1157 | 1158 | endStatTable, err := pgxutil.SelectRow(ctx, conn, `select n_tup_ins, n_tup_upd, n_tup_del, n_dead_tup from pg_stat_all_tables where relname = 'pgxjob_asap_jobs'`, nil, pgx.RowToStructByPos[pgStatTable]) 1159 | require.NoError(t, err) 1160 | 1161 | t.Logf("wal_records: %d", endStatWAL.WALRecords-startStatWAL.WALRecords) 1162 | t.Logf("wal_bytes: %d", endStatWAL.WALBytes-startStatWAL.WALBytes) 1163 | t.Logf("wal_write: %d", endStatWAL.WALWrite-startStatWAL.WALWrite) 1164 | t.Logf("n_tup_ins: %d", endStatTable.NTupIns-startStatTable.NTupIns) 1165 | t.Logf("n_tup_upd: %d", endStatTable.NTupUpd-startStatTable.NTupUpd) 1166 | 
t.Logf("n_tup_del: %d", endStatTable.NTupDel-startStatTable.NTupDel) 1167 | t.Logf("n_dead_tup: %d", endStatTable.NDeadTup-startStatTable.NDeadTup) 1168 | } 1169 | 1170 | // TestStress creates multiple workers and schedules jobs while they are being worked. 1171 | func TestStress(t *testing.T) { 1172 | if testing.Short() { 1173 | t.Skip("skipping test in short mode") 1174 | } 1175 | 1176 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) 1177 | defer cancel() 1178 | 1179 | conn := mustConnect(t) 1180 | mustCleanDatabase(t, conn) 1181 | dbpool := mustNewDBPool(t) 1182 | 1183 | t1JobsQueued := 0 1184 | t1JobsRan := 0 1185 | t1JobRanChan := make(chan struct{}) 1186 | 1187 | t2JobsQueued := 0 1188 | t2JobsRan := 0 1189 | t2JobRanChan := make(chan struct{}) 1190 | 1191 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 1192 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 1193 | JobGroups: []string{"other"}, 1194 | JobTypes: []*pgxjob.JobTypeConfig{ 1195 | { 1196 | Name: "t1", 1197 | RunJob: pgxjob.FilterError(func(ctx context.Context, job *pgxjob.Job) error { 1198 | if rand.Intn(100) == 0 { 1199 | return errors.New("random error") 1200 | } 1201 | select { 1202 | case t1JobRanChan <- struct{}{}: 1203 | case <-time.NewTimer(100 * time.Millisecond).C: 1204 | return errors.New("t1JobRanChan full") 1205 | } 1206 | return nil 1207 | }, 1208 | &pgxjob.RetryLinearBackoffErrorFilter{MaxRetries: 1000, BaseDelay: 10 * time.Millisecond}), 1209 | }, 1210 | { 1211 | Name: "t2", 1212 | RunJob: pgxjob.FilterError(func(ctx context.Context, job *pgxjob.Job) error { 1213 | select { 1214 | case t2JobRanChan <- struct{}{}: 1215 | case <-time.NewTimer(100 * time.Millisecond).C: 1216 | return errors.New("t2JobRanChan full") 1217 | } 1218 | return nil 1219 | }, 1220 | &pgxjob.RetryLinearBackoffErrorFilter{MaxRetries: 1000, BaseDelay: 10 * time.Millisecond}), 1221 | }, 1222 | }, 1223 | HandleError: func(err error) { 1224 | t.Errorf("scheduler 
HandleError: %v", err) 1225 | }, 1226 | }) 1227 | require.NoError(t, err) 1228 | 1229 | w1, err := scheduler.StartWorker(&pgxjob.WorkerConfig{}) 1230 | require.NoError(t, err) 1231 | 1232 | w2, err := scheduler.StartWorker(&pgxjob.WorkerConfig{ 1233 | MaxConcurrentJobs: 5, 1234 | MaxPrefetchedJobs: 10, 1235 | }) 1236 | require.NoError(t, err) 1237 | 1238 | w3, err := scheduler.StartWorker(&pgxjob.WorkerConfig{ 1239 | GroupName: "other", 1240 | }) 1241 | require.NoError(t, err) 1242 | 1243 | // Schedule a bunch of random jobs. 1244 | for i := 0; i < 100_000; i++ { 1245 | n := rand.Intn(100) 1246 | if n < 5 { 1247 | scheduler.Schedule(ctx, conn, "t1", nil, pgxjob.JobSchedule{RunAt: time.Now().Add(time.Duration(rand.Intn(1000)) * time.Millisecond)}) 1248 | t1JobsQueued++ 1249 | } else if n < 10 { 1250 | scheduler.Schedule(ctx, conn, "t1", nil, pgxjob.JobSchedule{GroupName: "other", RunAt: time.Now().Add(time.Duration(rand.Intn(1000)) * time.Millisecond)}) 1251 | t1JobsQueued++ 1252 | } else if n < 15 { 1253 | scheduler.Schedule(ctx, conn, "t2", nil, pgxjob.JobSchedule{GroupName: "other"}) 1254 | t1JobsQueued++ 1255 | } else if n < 50 { 1256 | scheduler.ScheduleNow(ctx, conn, "t1", nil) 1257 | t1JobsQueued++ 1258 | } else { 1259 | scheduler.ScheduleNow(ctx, conn, "t2", nil) 1260 | t2JobsQueued++ 1261 | } 1262 | } 1263 | 1264 | for t1JobsRan+t2JobsRan < (t1JobsQueued+t2JobsQueued)/2 { 1265 | select { 1266 | case <-t1JobRanChan: 1267 | t1JobsRan++ 1268 | case <-t2JobRanChan: 1269 | t2JobsRan++ 1270 | case <-ctx.Done(): 1271 | t.Fatalf("timed out waiting for jobs to finish: %d completed", t1JobsRan+t2JobsRan) 1272 | } 1273 | } 1274 | 1275 | err = w1.Shutdown(ctx) 1276 | require.NoError(t, err) 1277 | 1278 | err = w2.Shutdown(ctx) 1279 | require.NoError(t, err) 1280 | 1281 | err = w3.Shutdown(ctx) 1282 | require.NoError(t, err) 1283 | 1284 | jobsStillPending, err := pgxutil.SelectRow(ctx, conn, 1285 | `select (select count(*) from pgxjob_asap_jobs) + (select count(*) from 
pgxjob_run_at_jobs)`, 1286 | nil, 1287 | pgx.RowTo[int32], 1288 | ) 1289 | require.NoError(t, err) 1290 | 1291 | lockedJobs, err := pgxutil.SelectRow(ctx, conn, 1292 | `select (select count(*) from pgxjob_asap_jobs where worker_id is not null) + (select count(*) from pgxjob_run_at_jobs where worker_id is not null)`, 1293 | nil, 1294 | pgx.RowTo[int32], 1295 | ) 1296 | require.NoError(t, err) 1297 | require.EqualValues(t, 0, lockedJobs) 1298 | 1299 | successfulJobs, err := pgxutil.SelectRow(ctx, conn, `select count(*) from pgxjob_job_runs where error is null`, nil, pgx.RowTo[int32]) 1300 | require.NoError(t, err) 1301 | require.EqualValues(t, (t1JobsQueued + t2JobsQueued), jobsStillPending+successfulJobs) 1302 | } 1303 | 1304 | func TestUnmarshalParams(t *testing.T) { 1305 | type T struct { 1306 | Foo string 1307 | Bar int 1308 | } 1309 | 1310 | fn := pgxjob.UnmarshalParams(func(ctx context.Context, job *pgxjob.Job, params *T) error { 1311 | require.Equal(t, "foo", params.Foo) 1312 | require.Equal(t, 123, params.Bar) 1313 | return nil 1314 | }) 1315 | 1316 | err := fn(context.Background(), &pgxjob.Job{Params: []byte(`{"foo":"foo","bar":123}`)}) 1317 | require.NoError(t, err) 1318 | } 1319 | 1320 | func TestFilterError(t *testing.T) { 1321 | var originalError error 1322 | fn := pgxjob.FilterError(func(ctx context.Context, job *pgxjob.Job) error { 1323 | return originalError 1324 | }, pgxjob.FilterErrorFunc(func(job *pgxjob.Job, jobErr error) error { 1325 | return fmt.Errorf("filtered error") 1326 | })) 1327 | 1328 | err := fn(context.Background(), nil) 1329 | require.NoError(t, err) 1330 | 1331 | originalError = fmt.Errorf("original error") 1332 | err = fn(context.Background(), nil) 1333 | require.EqualError(t, err, "filtered error") 1334 | } 1335 | 1336 | func TestRetryLinearBackoffErrorFilter(t *testing.T) { 1337 | errorFilter := pgxjob.RetryLinearBackoffErrorFilter{MaxRetries: 3, BaseDelay: 1 * time.Hour} 1338 | 1339 | for i, tt := range []struct { 1340 | 
originalError error 1341 | errorCount int32 1342 | retryDelay time.Duration 1343 | }{ 1344 | {nil, 0, 0}, 1345 | {fmt.Errorf("original error"), 0, 1 * time.Hour}, 1346 | {fmt.Errorf("original error"), 1, 2 * time.Hour}, 1347 | {fmt.Errorf("original error"), 2, 3 * time.Hour}, 1348 | {fmt.Errorf("original error"), 3, 0}, 1349 | } { 1350 | t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { 1351 | job := &pgxjob.Job{ErrorCount: tt.errorCount} 1352 | earliestRetryTime := time.Now().Add(tt.retryDelay) 1353 | err := errorFilter.FilterError(job, tt.originalError) 1354 | latestRetryTime := time.Now().Add(tt.retryDelay) 1355 | if tt.originalError == nil { 1356 | require.NoError(t, err) 1357 | } else { 1358 | var errorWithRetry *pgxjob.ErrorWithRetry 1359 | if tt.retryDelay == 0 { 1360 | require.False(t, errors.As(err, &errorWithRetry)) 1361 | } else { 1362 | require.EqualError(t, err, "original error") 1363 | require.ErrorAs(t, err, &errorWithRetry) 1364 | require.Truef(t, errorWithRetry.RetryAt.After(earliestRetryTime), "RetryAt: %v, earliestRetryTime: %v", errorWithRetry.RetryAt, earliestRetryTime) 1365 | require.Truef(t, errorWithRetry.RetryAt.Before(latestRetryTime), "RetryAt: %v, latestRetryTime: %v", errorWithRetry.RetryAt, latestRetryTime) 1366 | } 1367 | } 1368 | }) 1369 | } 1370 | } 1371 | 1372 | func TestSchedulerContext(t *testing.T) { 1373 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 1374 | defer cancel() 1375 | 1376 | conn := mustConnect(t) 1377 | mustCleanDatabase(t, conn) 1378 | dbpool := mustNewDBPool(t) 1379 | 1380 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 1381 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 1382 | JobTypes: []*pgxjob.JobTypeConfig{ 1383 | { 1384 | Name: "test", 1385 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 1386 | return nil 1387 | }, 1388 | }, 1389 | }, 1390 | HandleError: func(err error) { 1391 | t.Errorf("scheduler HandleError: %v", err) 1392 | }, 1393 | }) 
1394 | require.NoError(t, err) 1395 | 1396 | select { 1397 | case <-scheduler.SetupDoneChan(): 1398 | case <-ctx.Done(): 1399 | t.Fatal("timed out waiting for scheduler to start") 1400 | } 1401 | 1402 | require.Nil(t, pgxjob.Ctx(ctx)) 1403 | pgxjob.DefaultContextScheduler = scheduler 1404 | require.Equal(t, scheduler, pgxjob.Ctx(ctx)) 1405 | 1406 | otherScheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 1407 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 1408 | JobTypes: []*pgxjob.JobTypeConfig{ 1409 | { 1410 | Name: "test", 1411 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 1412 | return nil 1413 | }, 1414 | }, 1415 | }, 1416 | HandleError: func(err error) { 1417 | t.Errorf("scheduler HandleError: %v", err) 1418 | }, 1419 | }) 1420 | require.NoError(t, err) 1421 | 1422 | select { 1423 | case <-otherScheduler.SetupDoneChan(): 1424 | case <-ctx.Done(): 1425 | t.Fatal("timed out waiting for scheduler to start") 1426 | } 1427 | 1428 | otherCtx := otherScheduler.WithContext(ctx) 1429 | require.Equal(t, otherScheduler, pgxjob.Ctx(otherCtx)) 1430 | } 1431 | 1432 | func BenchmarkRunBackloggedJobs(b *testing.B) { 1433 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 1434 | defer cancel() 1435 | 1436 | conn := mustConnect(b) 1437 | mustCleanDatabase(b, conn) 1438 | dbpool := mustNewDBPool(b) 1439 | 1440 | runJobChan := make(chan struct{}, 100) 1441 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 1442 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 1443 | JobTypes: []*pgxjob.JobTypeConfig{ 1444 | { 1445 | Name: "test", 1446 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 1447 | runJobChan <- struct{}{} 1448 | return nil 1449 | }, 1450 | }, 1451 | }, 1452 | HandleError: func(err error) { 1453 | b.Errorf("scheduler HandleError: %v", err) 1454 | }, 1455 | }) 1456 | require.NoError(b, err) 1457 | 1458 | for i := 0; i < b.N; i++ { 1459 | err = scheduler.ScheduleNow(ctx, conn, "test", 
nil) 1460 | require.NoError(b, err) 1461 | } 1462 | 1463 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{}) 1464 | require.NoError(b, err) 1465 | defer worker.Shutdown(context.Background()) 1466 | 1467 | b.ResetTimer() 1468 | 1469 | for i := 0; i < b.N; i++ { 1470 | select { 1471 | case <-runJobChan: 1472 | case <-ctx.Done(): 1473 | b.Fatalf("timed out waiting for jobs to finish: %d", i) 1474 | } 1475 | } 1476 | 1477 | err = worker.Shutdown(context.Background()) 1478 | require.NoError(b, err) 1479 | } 1480 | 1481 | func BenchmarkRunConcurrentlyInsertedJobs(b *testing.B) { 1482 | ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 1483 | defer cancel() 1484 | 1485 | conn := mustConnect(b) 1486 | mustCleanDatabase(b, conn) 1487 | dbpool := mustNewDBPool(b) 1488 | 1489 | runJobChan := make(chan struct{}, 100) 1490 | scheduler, err := pgxjob.NewScheduler(&pgxjob.SchedulerConfig{ 1491 | AcquireConn: pgxjob.AcquireConnFuncFromPool(dbpool), 1492 | JobTypes: []*pgxjob.JobTypeConfig{ 1493 | { 1494 | Name: "test", 1495 | RunJob: func(ctx context.Context, job *pgxjob.Job) error { 1496 | runJobChan <- struct{}{} 1497 | return nil 1498 | }, 1499 | }, 1500 | }, 1501 | HandleError: func(err error) { 1502 | b.Errorf("scheduler HandleError: %v", err) 1503 | }, 1504 | }) 1505 | require.NoError(b, err) 1506 | 1507 | worker, err := scheduler.StartWorker(&pgxjob.WorkerConfig{}) 1508 | require.NoError(b, err) 1509 | defer worker.Shutdown(context.Background()) 1510 | 1511 | listener := &pgxlisten.Listener{ 1512 | Connect: func(ctx context.Context) (*pgx.Conn, error) { 1513 | return pgx.Connect(ctx, fmt.Sprintf("dbname=%s", os.Getenv("PGXJOB_TEST_DATABASE"))) 1514 | }, 1515 | } 1516 | 1517 | listener.Handle(pgxjob.PGNotifyChannel, worker) 1518 | 1519 | listenerCtx, listenerCtxCancel := context.WithCancel(ctx) 1520 | defer listenerCtxCancel() 1521 | listenErrChan := make(chan error) 1522 | go func() { 1523 | err := listener.Listen(listenerCtx) 1524 | if err 
!= nil && !errors.Is(err, context.Canceled) { 1525 | listenErrChan <- err 1526 | } 1527 | close(listenErrChan) 1528 | }() 1529 | 1530 | // Wait a little for the listener to start. Otherwise we might miss the first notification which would the first 1531 | // benchmark where b.N == 1 to take PollDuration time. 1532 | time.Sleep(time.Second) 1533 | 1534 | b.ResetTimer() 1535 | 1536 | for i := 0; i < b.N; i++ { 1537 | err = scheduler.ScheduleNow(ctx, conn, "test", nil) 1538 | require.NoError(b, err) 1539 | } 1540 | 1541 | for i := 0; i < b.N; i++ { 1542 | select { 1543 | case <-runJobChan: 1544 | case <-ctx.Done(): 1545 | b.Fatalf("timed out waiting for jobs to finish: %d", i) 1546 | } 1547 | } 1548 | 1549 | err = worker.Shutdown(context.Background()) 1550 | require.NoError(b, err) 1551 | 1552 | listenerCtxCancel() 1553 | err = <-listenErrChan 1554 | require.NoError(b, err) 1555 | } 1556 | 1557 | func benchmarkPostgreSQLParamsInsert(b *testing.B, params_type string) { 1558 | ctx := context.Background() 1559 | conn := mustConnect(b) 1560 | 1561 | _, err := conn.Exec(ctx, fmt.Sprintf(`create temporary table benchmark_params ( 1562 | id bigint primary key generated by default as identity, 1563 | params %s 1564 | )`, params_type)) 1565 | require.NoError(b, err) 1566 | 1567 | params := []byte(`{"id":"1234567890","foo":"bar","baz":"quz"}`) 1568 | 1569 | b.ResetTimer() 1570 | 1571 | for i := 0; i < b.N; i++ { 1572 | _, err := conn.Exec(ctx, "insert into benchmark_params (params) values ($1)", params) 1573 | require.NoError(b, err) 1574 | } 1575 | } 1576 | 1577 | func BenchmarkPostgreSQLParamsInsertJSON(b *testing.B) { 1578 | benchmarkPostgreSQLParamsInsert(b, "json") 1579 | } 1580 | 1581 | func BenchmarkPostgreSQLParamsInsertJSONB(b *testing.B) { 1582 | benchmarkPostgreSQLParamsInsert(b, "jsonb") 1583 | } 1584 | 1585 | func BenchmarkPostgreSQLParamsInsertText(b *testing.B) { 1586 | benchmarkPostgreSQLParamsInsert(b, "text") 1587 | } 1588 | 1589 | func 
benchmarkPostgreSQLParamsSelect(b *testing.B, params_type string) { 1590 | ctx := context.Background() 1591 | conn := mustConnect(b) 1592 | 1593 | _, err := conn.Exec(ctx, fmt.Sprintf(`create temporary table benchmark_params ( 1594 | id bigint primary key generated by default as identity, 1595 | params %s 1596 | )`, params_type)) 1597 | require.NoError(b, err) 1598 | 1599 | params := []byte(`{"id":"1234567890","foo":"bar","baz":"quz"}`) 1600 | _, err = conn.Exec(ctx, "insert into benchmark_params (params) values ($1)", params) 1601 | require.NoError(b, err) 1602 | 1603 | b.ResetTimer() 1604 | 1605 | for i := 0; i < b.N; i++ { 1606 | _, err := conn.Exec(ctx, "select * from benchmark_params") 1607 | require.NoError(b, err) 1608 | } 1609 | } 1610 | 1611 | func BenchmarkPostgreSQLParamsSelectJSON(b *testing.B) { 1612 | benchmarkPostgreSQLParamsSelect(b, "json") 1613 | } 1614 | 1615 | func BenchmarkPostgreSQLParamsSelectJSONB(b *testing.B) { 1616 | benchmarkPostgreSQLParamsSelect(b, "jsonb") 1617 | } 1618 | 1619 | func BenchmarkPostgreSQLParamsSelectText(b *testing.B) { 1620 | benchmarkPostgreSQLParamsSelect(b, "text") 1621 | } 1622 | --------------------------------------------------------------------------------