├── examples ├── crawler │ └── crawler.go ├── eternal │ └── eternal.go └── simple │ └── simple.go ├── queue.go ├── queue_test.go └── readme.md /examples/crawler/crawler.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "net/http" 7 | "net/url" 8 | "os" 9 | "os/signal" 10 | "strings" 11 | "syscall" 12 | "time" 13 | 14 | "github.com/patrickmn/go-cache" 15 | "github.com/xeoncross/goworkqueue" 16 | "golang.org/x/net/html" 17 | ) 18 | 19 | /* 20 | * Complex example of using goworkqueue to crawl a domain. 21 | * Based on https://schier.co/blog/2015/04/26/a-simple-web-scraper-in-go.html 22 | * 23 | * Run: 24 | * $ go run -race crawler.go https://httpbin.org/links/5 25 | * 26 | * @todo: Make foundUrls a concurrent-safe map 27 | */ 28 | 29 | var foundUrls = make(map[string]bool) 30 | var foundDomains = make(map[string]bool) 31 | var queue *goworkqueue.Queue 32 | var domainBackoff *cache.Cache 33 | 34 | func main() { 35 | 36 | // We want to only hit the same domain at *most* every X minutes 37 | domainBackoff = cache.New(1*time.Second, 2*time.Second) 38 | // foundUrls = make(map[string]bool) 39 | seedUrls := os.Args[1:] 40 | 41 | jobQueueSize := 1000 42 | numberOfWorkers := 3 43 | 44 | queue = goworkqueue.NewQueue(jobQueueSize, numberOfWorkers, crawlWorker) 45 | 46 | // Abort when we press CTRL+C (go run...) or send a kill -9 (go build...) 47 | c := make(chan os.Signal, 1) 48 | signal.Notify(c, os.Interrupt, syscall.SIGTERM) 49 | go func() { 50 | for _ = range c { 51 | queue.Close() 52 | fmt.Println("ABORTING!") 53 | } 54 | }() 55 | 56 | // Add our urls to the job list 57 | for _, url := range seedUrls { 58 | queue.Add(url) 59 | } 60 | 61 | // Blocks until queue.Close() 62 | queue.Run() 63 | 64 | // Optional, callback for emptying the queue *if* anything remains 65 | queue.Drain(func(job interface{}) { 66 | fmt.Printf("'%v' wasn't fetched\n", job) 67 | }) 68 | 69 | // We're done! 
Print the results... 70 | fmt.Println("\nFound", len(foundUrls), "unique urls:") 71 | for url := range foundUrls { 72 | fmt.Println(" - " + url) 73 | } 74 | 75 | fmt.Println("\nFound", len(foundDomains), "unique domains:") 76 | for url := range foundDomains { 77 | fmt.Println(" - " + url) 78 | } 79 | 80 | } 81 | 82 | // Helper function to pull the href attribute from a Token 83 | func getHref(t html.Token) (ok bool, href string) { 84 | // Iterate over all of the Token's attributes until we find an "href" 85 | for _, a := range t.Attr { 86 | if a.Key == "href" { 87 | href = a.Val 88 | ok = true 89 | } 90 | } 91 | 92 | // "bare" return will return the variables (ok, href) as defined in 93 | // the function definition 94 | return 95 | } 96 | 97 | // Extract all http** links from a given webpage 98 | func crawlWorker(job interface{}, workerID int) { 99 | 100 | var urlString string 101 | 102 | switch v := job.(type) { 103 | case string: 104 | urlString = v 105 | default: 106 | log.Fatal("Unknown job: ", v) 107 | } 108 | 109 | domain := domainOfURL(urlString) 110 | 111 | // Too soon 112 | if _, found := domainBackoff.Get(domain); found { 113 | // fmt.Println("WAIT:", domain, "->", url) 114 | queue.Add(urlString) 115 | return 116 | } 117 | 118 | // Set the value of the key "foo" to "bar", with the default expiration time 119 | domainBackoff.Set(domain, true, cache.DefaultExpiration) 120 | 121 | fmt.Println("fetching", urlString) 122 | resp, err := http.Get(urlString) 123 | 124 | if err != nil { 125 | fmt.Println("ERROR: Failed to crawl \"" + urlString + "\"") 126 | return 127 | } 128 | 129 | b := resp.Body 130 | defer b.Close() // close Body when the function returns 131 | 132 | z := html.NewTokenizer(b) 133 | 134 | for { 135 | tt := z.Next() 136 | 137 | switch { 138 | case tt == html.ErrorToken: 139 | // End of the document, we're done 140 | return 141 | case tt == html.StartTagToken: 142 | t := z.Token() 143 | 144 | // Check if the token is an tag 145 | isAnchor := 
t.Data == "a" 146 | if !isAnchor { 147 | continue 148 | } 149 | 150 | // Extract the href value, if there is one 151 | ok, urlString := getHref(t) 152 | if !ok { 153 | continue 154 | } 155 | 156 | urlString = toAbsURL(resp.Request.URL, urlString) 157 | 158 | if _, ok := foundUrls[urlString]; ok { 159 | fmt.Println("ALREADY PARSED:", urlString) 160 | return 161 | } 162 | 163 | // Make sure the url begines in http** 164 | hasProto := strings.Index(urlString, "http") == 0 165 | if hasProto { 166 | domain := domainOfURL(urlString) 167 | foundDomains[domain] = true 168 | foundUrls[urlString] = true 169 | queue.Add(urlString) 170 | } 171 | } 172 | } 173 | } 174 | 175 | func toAbsURL(baseurl *url.URL, weburl string) string { 176 | relurl, err := url.Parse(weburl) 177 | if err != nil { 178 | return "" 179 | } 180 | absurl := baseurl.ResolveReference(relurl) 181 | return absurl.String() 182 | } 183 | 184 | func domainOfURL(weburl string) string { 185 | parsedURL, err := url.Parse(weburl) 186 | if err != nil { 187 | return "" 188 | } 189 | return parsedURL.Host 190 | } 191 | -------------------------------------------------------------------------------- /examples/eternal/eternal.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "os/signal" 8 | "syscall" 9 | "time" 10 | 11 | "github.com/xeoncross/goworkqueue" 12 | ) 13 | 14 | /* 15 | * Work queue that runs forever generating and processing work 16 | */ 17 | 18 | func main() { 19 | 20 | workerFunc := func(job interface{}, workerId int) { 21 | if id, ok := job.(int64); ok { 22 | fmt.Printf("Processing ID %d\n", id) 23 | time.Sleep(time.Millisecond * 500) 24 | } 25 | } 26 | 27 | jobQueueSize := 10 28 | numberOfWorkers := 3 29 | 30 | queue := goworkqueue.NewQueue(jobQueueSize, numberOfWorkers, workerFunc) 31 | 32 | // Abort when we press CTRL+C (go run...) or send a kill -9 (go build...) 
33 | c := make(chan os.Signal, 1) 34 | signal.Notify(c, os.Interrupt, syscall.SIGTERM) 35 | go func() { 36 | for _ = range c { 37 | queue.Close() 38 | log.Println("Interrupt / SIGTERM received. Stopping...") 39 | } 40 | }() 41 | 42 | // Forever, we add work to the queue to be processed. 43 | // If queue.Jobs is full, this will halt until the workers 44 | // make more room in the queue - so our backlog is under control. 45 | go func() { 46 | var id int64 47 | for { 48 | // Here you could fetch data from a queue or database 49 | id++ 50 | 51 | // If we can't add a job, the queue must be closed/closing 52 | if ok := queue.Add(id); !ok { 53 | return 54 | } 55 | } 56 | 57 | }() 58 | 59 | // Blocks until queue.Close() 60 | queue.Run() 61 | 62 | // Optional, callback for emptying the queue *if* anything remains 63 | queue.Drain(func(job interface{}) { 64 | fmt.Printf("'%v' wasn't finished\n", job) 65 | }) 66 | 67 | } 68 | -------------------------------------------------------------------------------- /examples/simple/simple.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/xeoncross/goworkqueue" 8 | ) 9 | 10 | /* 11 | * Simple example of creating a work queue that gracefully handles shutdowns 12 | * or failure 13 | */ 14 | 15 | // A real worker would be parsing a web page or crunching numbers 16 | func workerFunc(job interface{}, workerID int) { 17 | 18 | fmt.Println("worker", workerID, "processing job", job) 19 | time.Sleep(1 * time.Second) 20 | fmt.Println("worker", workerID, "saving job", job) 21 | 22 | // switch v := job.(type) { 23 | // case string: 24 | // fmt.Println("string:", v) 25 | // case int, int32, int64: 26 | // fmt.Println("int:", v) 27 | // case float32: 28 | // fmt.Println("float32:", v) 29 | // default: 30 | // fmt.Println("unknown") 31 | // } 32 | } 33 | 34 | func main() { 35 | 36 | jobQueueSize := 100 37 | numberOfWorkers := 3 38 | 39 | queue 
:= goworkqueue.NewQueue(jobQueueSize, numberOfWorkers, workerFunc) 40 | 41 | // Pretend we suddenly need to stop the workers. 42 | // This might be a SIGTERM or perhaps the workerFunc() called queue.Close() 43 | go func() { 44 | time.Sleep(1 * time.Second) 45 | queue.Close() 46 | fmt.Println("ABORT!") 47 | }() 48 | 49 | // We can optionally prefill the work queue 50 | for j := 1; j <= 20; j++ { 51 | if ok := queue.Add(fmt.Sprintf("Job %d", j)); !ok { 52 | break 53 | } 54 | } 55 | 56 | // Blocks until queue.Close() 57 | queue.Run() 58 | 59 | // It's easy to check on the status of the queue 60 | if queue.Closed() { 61 | // Always true in this case since it's below queue.Run() 62 | } 63 | 64 | // Optional, callback for emptying the queue *if* anything remains 65 | queue.Drain(func(job interface{}) { 66 | fmt.Printf("'%v' wasn't finished\n", job) 67 | }) 68 | 69 | } 70 | -------------------------------------------------------------------------------- /queue.go: -------------------------------------------------------------------------------- 1 | package goworkqueue 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | ) 7 | 8 | // Queue struct 9 | type Queue struct { 10 | jobs chan interface{} 11 | done chan bool 12 | workers chan chan int 13 | once sync.Once 14 | } 15 | 16 | // NewQueue work queue 17 | func NewQueue(size int, workers int, callback func(interface{}, int)) (q *Queue) { 18 | 19 | q = &Queue{} 20 | 21 | q.jobs = make(chan interface{}, size) 22 | q.done = make(chan bool) 23 | q.workers = make(chan chan int, workers) 24 | 25 | for w := 1; w <= workers; w++ { 26 | q.workers <- q.worker(w, callback) 27 | } 28 | 29 | close(q.workers) 30 | return 31 | } 32 | 33 | func (q *Queue) worker(id int, callback func(interface{}, int)) (done chan int) { 34 | done = make(chan int) 35 | 36 | go func() { 37 | work: 38 | for { 39 | select { 40 | case <-q.done: 41 | break work 42 | case j := <-q.jobs: 43 | callback(j, id) 44 | } 45 | } 46 | 47 | close(done) 48 | }() 49 | 50 | return done 
51 | } 52 | 53 | // Run blocks until the queue is closed 54 | func (q *Queue) Run() { 55 | 56 | // Wait for all workers to be halted 57 | for w := range q.workers { 58 | <-w 59 | } 60 | 61 | // TODO? 62 | // There seems to be a theoretical chance of a race condition by Add() 63 | // checking q.done before Close() is called and then trying to send on q.jobs 64 | // *after* Close() has been called. By closing q.jobs here, instead of in 65 | // Close(), we avoid this(?) because between these two events all the workers 66 | // have to stop working which is a much greater timespan then the time 67 | // between checking q.done and sending on q.jobs 68 | close(q.jobs) 69 | } 70 | 71 | // Drain queue of jobs 72 | func (q *Queue) Drain(callback func(interface{})) { 73 | for j := range q.jobs { 74 | callback(j) 75 | } 76 | } 77 | 78 | // Close the work queue 79 | func (q *Queue) Close() { 80 | q.once.Do(func() { 81 | close(q.done) 82 | }) 83 | } 84 | 85 | // Closed reports if this queue is already closed 86 | func (q *Queue) Closed() bool { 87 | select { 88 | case <-q.done: 89 | return true 90 | default: 91 | return false 92 | } 93 | } 94 | 95 | // Add jobs to the queue as long as it hasn't be closed 96 | func (q *Queue) Add(job interface{}) bool { 97 | // Check the queue is open first 98 | select { 99 | case <-q.done: 100 | return false 101 | default: 102 | // While the jobs queue send is blocking, we might shutdown the queue 103 | select { 104 | case q.jobs <- job: 105 | return true 106 | case <-q.done: 107 | return false 108 | } 109 | } 110 | } 111 | 112 | // SleepUntilTimeOrChanActivity (whichever comes first) 113 | func SleepUntilTimeOrChanActivity(t time.Duration, c chan interface{}) { 114 | select { 115 | case <-time.After(t): 116 | case <-c: 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /queue_test.go: -------------------------------------------------------------------------------- 1 | package goworkqueue 2 | 3 
| import ( 4 | "sync" 5 | "sync/atomic" 6 | "testing" 7 | "time" 8 | ) 9 | 10 | // func TestCloseSend(t *testing.T) { 11 | // 12 | // c := make(chan struct{}) 13 | // 14 | // go func() { 15 | // time.Sleep(time.Microsecond) 16 | // close(c) 17 | // }() 18 | // 19 | // // What happens? 20 | // c <- struct{}{} 21 | // 22 | // } 23 | 24 | func TestQueue(t *testing.T) { 25 | 26 | // 1000 job queue with 100 workers 27 | workers := 100 28 | 29 | queue := NewQueue(1000, workers, func(job interface{}, workerID int) { 30 | // time.Sleep(time.Millisecond) 31 | }) 32 | 33 | // Pretend we suddenly need to stop the workers. 34 | // This might be a SIGTERM or perhaps the workerFunc() called queue.Close() 35 | go func() { 36 | time.Sleep(5 * time.Millisecond) 37 | queue.Close() 38 | }() 39 | 40 | var jobs int64 41 | 42 | var group sync.WaitGroup 43 | 44 | group.Add(workers) 45 | 46 | for j := 0; j < workers; j++ { 47 | 48 | go func(id int) { 49 | var i int 50 | for { 51 | i++ 52 | atomic.AddInt64(&jobs, 1) 53 | if ok := queue.Add(i); !ok { 54 | break 55 | } 56 | } 57 | 58 | group.Done() 59 | // fmt.Printf("%d: %d jobs\n", id, i) 60 | }(j) 61 | } 62 | 63 | // Blocks until queue.Close() 64 | queue.Run() 65 | 66 | // ensure all goroutines are finished 67 | group.Wait() 68 | 69 | // fmt.Printf("%d jobs processed\n", atomic.LoadInt64(&jobs)) 70 | 71 | } 72 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # GoWorkQueue 2 | 3 | Super simple single-node job queue with managed workers. Perfect for small jobs like digesting streams or simple crawl jobs. 4 | 5 | No race conditions and has a graceful shutdown. 6 | 7 | ## Install 8 | 9 | go get github.com/xeoncross/goworkqueue 10 | 11 | ## Usage 12 | 13 | Create a new queue instance with a callback for each job you want run. 
    queue := goworkqueue.NewQueue(1000, 5, func(job interface{}, workerID int) {
        fmt.Println("Processing", job)
    })
    queue.Add("one") // anything can add "jobs" to process
    queue.Run()      // Blocks until queue.Close() is called

See the examples/ directory (examples/simple, examples/eternal, examples/crawler) for more information.

Released free under the MIT license. http://davidpennington.me
--------------------------------------------------------------------------------