├── .github
└── workflows
│ └── tests.yml
├── .gitignore
├── LICENSE
├── README.md
├── _examples
├── books.toscrape.com
│ ├── books_to_scrape
│ │ ├── constants.go
│ │ ├── errors.go
│ │ ├── job.go
│ │ ├── record.go
│ │ ├── settings.go
│ │ └── spider.go
│ ├── go.mod
│ ├── go.sum
│ ├── itstimeitsnowornever.csv
│ └── main.go
├── scrapejsp
│ ├── go.mod
│ ├── go.sum
│ ├── main.go
│ ├── scrapejsp
│ │ ├── constants.go
│ │ ├── errors.go
│ │ ├── job.go
│ │ ├── record.go
│ │ ├── settings.go
│ │ └── spider.go
│ └── utils
│ │ └── helper.go
└── scrapejsp_method2
│ ├── go.mod
│ ├── go.sum
│ ├── itstimeitsnowornever.csv
│ ├── main.go
│ └── scrapejsp
│ ├── constants.go
│ ├── errors.go
│ ├── job.go
│ ├── record.go
│ ├── settings.go
│ └── spider.go
├── assets
├── demo.gif
└── logo.webp
├── cmd
├── cli
│ ├── cli.go
│ ├── pipeline.go
│ ├── startproject.go
│ └── templates
│ │ ├── constants.tmpl
│ │ ├── errors.tmpl
│ │ ├── job.tmpl
│ │ ├── main.tmpl
│ │ ├── pipeline.tmpl
│ │ ├── record.tmpl
│ │ ├── settings.tmpl
│ │ └── spider.tmpl
└── gos
│ ├── client.go
│ ├── constants.go
│ ├── gos.go
│ ├── ports.go
│ └── types.go
├── go.mod
├── go.sum
├── internal
├── cmap
│ ├── cmap.go
│ ├── cmap_test.go
│ ├── cmaph.go
│ └── types.go
├── fsm
│ ├── fsm.go
│ └── fsm_test.go
├── resource_pool
│ ├── pool_builder.go
│ └── resource_pool.go
└── types
│ └── option.go
├── main.go
└── pkg
├── builtin
├── middlewares
│ ├── dupefilter.go
│ ├── dupefilter_test.go
│ ├── multi_cookiejar.go
│ ├── multi_cookiejar_test.go
│ ├── retry.go
│ └── retry_test.go
└── pipelines
│ ├── dummy.go
│ ├── export_to_csv.go
│ ├── export_to_csv_test.go
│ ├── export_to_firebase.go
│ ├── export_to_gsheet.go
│ ├── export_to_json.go
│ ├── export_to_json_test.go
│ ├── export_to_mongodb.go
│ └── type.go
├── core
├── core.go
└── ports.go
├── engine
├── engine.go
└── ports.go
├── executor
├── executor.go
└── ports.go
├── executor_adapters
└── http_native
│ ├── adapter.go
│ ├── adapter_test.go
│ └── helper.go
├── middlewaremanager
└── middlewaremanager.go
├── pipeline_manager
├── constants.go
├── group.go
├── options.go
├── pipeline_manager.go
├── pipeline_manager_test.go
└── ports.go
└── scheduler
├── constants.go
├── options.go
├── ports.go
├── request.go
├── response.go
├── scheduler.go
├── scheduler_work.go
├── selectors.go
├── selectors_test.go
├── types.go
├── worker.go
└── worker_test.go
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Test on Pull Request
2 |
3 | on:
4 | push:
5 | branches:
6 | - 'main'
7 | pull_request:
8 | branches:
9 | - 'main'
10 |
11 | jobs:
12 | test:
13 | name: Run Go Tests
14 | runs-on: ubuntu-latest
15 |
16 | strategy:
17 | matrix:
18 | go-version: ['1.22']
19 |
20 | steps:
21 | - name: Checkout code
22 | uses: actions/checkout@v3
23 |
24 | - name: Set up Go
25 | uses: actions/setup-go@v4
26 | with:
27 | go-version: ${{ matrix.go-version }}
28 |
29 | - name: Run tests
30 | run: go test -race -v ./...
31 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # If you prefer the allow list template instead of the deny list, see community template:
2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
3 | #
4 | # Binaries for programs and plugins
5 | *.exe
6 | *.exe~
7 | *.dll
8 | *.so
9 | *.dylib
10 | *.psd
11 |
12 | # Test binary, built with `go test -c`
13 | *.test
14 |
15 | # Output of the go coverage tool, specifically when used with LiteIDE
16 | *.out
17 |
18 | # Dependency directories (remove the comment below to include it)
19 | # vendor/
20 |
21 | # Go workspace file
22 | go.work
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | License text copyright (c) 2020 MariaDB Corporation Ab, All Rights Reserved.
2 | “Business Source License” is a trademark of MariaDB Corporation Ab.
3 |
4 | Parameters
5 |
6 | Licensor: Techengine(goscrapy dot dev AT gmail dot com).
7 | Licensed Work: Goscrapy.
8 | Additional Use Grant: You may make production use of the Licensed Work,
9 | provided such use does not include offering the Licensed Work
10 | to third parties on a hosted or embedded basis which is
11 | competitive with any services offered or will be offered
12 | in future by the licensor.
13 |
14 | For information about alternative licensing arrangements for the Licensed Work,
15 | please contact goscrapy dot dev AT gmail dot com.
16 |
17 | Notice
18 |
19 | Business Source License 1.1
20 |
21 | Terms
22 |
23 | The Licensor hereby grants you the right to copy, modify, create derivative
24 | works, redistribute, and make non-production use of the Licensed Work. The
25 | Licensor may make an Additional Use Grant, above, permitting limited production use.
26 |
27 | Effective on the Change Date, or the fourth anniversary of the first publicly
28 | available distribution of a specific version of the Licensed Work under this
29 | License, whichever comes first, the Licensor hereby grants you rights under
30 | the terms of the Change License, and the rights granted in the paragraph
31 | above terminate.
32 |
33 | If your use of the Licensed Work does not comply with the requirements
34 | currently in effect as described in this License, you must purchase a
35 | commercial license from the Licensor, its affiliated entities, or authorized
36 | resellers, or you must refrain from using the Licensed Work.
37 |
38 | All copies of the original and modified Licensed Work, and derivative works
39 | of the Licensed Work, are subject to this License. This License applies
40 | separately for each version of the Licensed Work and the Change Date may vary
41 | for each version of the Licensed Work released by Licensor.
42 |
43 | You must conspicuously display this License on each original or modified copy
44 | of the Licensed Work. If you receive the Licensed Work in original or
45 | modified form from a third party, the terms and conditions set forth in this
46 | License apply to your use of that work.
47 |
48 | Any use of the Licensed Work in violation of this License will automatically
49 | terminate your rights under this License for the current and all other
50 | versions of the Licensed Work.
51 |
52 | This License does not grant you any right in any trademark or logo of
53 | Licensor or its affiliates (provided that you may use a trademark or logo of
54 | Licensor as expressly required by this License).
55 |
56 | TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON
57 | AN “AS IS” BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS,
58 | EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF
59 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND
60 | TITLE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GoScrapy: Web Scraping Framework in Go
2 | [](https://github.com/tech-engine/goscrapy)
3 |
4 |
5 |
6 |
7 | **GoScrapy** aims to be a powerful web scraping framework in Go, inspired by Python's Scrapy framework. It offers an easy-to-use Scrapy-like experience for extracting data from websites, making it an ideal tool for various data collection and analysis tasks, especially for those coming from Python and wanting to try scraping in Golang.
8 |
9 | ## Getting Started
10 |
11 | Goscrapy requires **Go version 1.22** or higher to run.
12 |
13 | ### 1. Project Initialization
14 |
15 | ```sh
16 | go mod init books_to_scrape
17 | ```
18 |
19 | ### 2. Install goscrapy cli
20 |
21 | ```sh
22 | go install github.com/tech-engine/goscrapy@latest
23 | ```
24 | **Note**: make sure to always keep your goscrapy cli updated.
25 |
26 | ### 3. Verify Installation
27 |
28 | ```sh
29 | goscrapy -v
30 | ```
31 | ### 4. Create a New Project
32 |
33 | ```sh
34 | goscrapy startproject books_to_scrape
35 | ```
36 | This will create a new project directory with all the files necessary to begin working with **GoScrapy**.
37 |
38 | ```sh
39 | \iyuioy\go\go-test-scrapy> goscrapy startproject books_to_scrape
40 |
41 | 🚀 GoScrapy generating project files. Please wait!
42 |
43 | ✔️ books_to_scrape\constants.go
44 | ✔️ books_to_scrape\errors.go
45 | ✔️ books_to_scrape\job.go
46 | ✔️ main.go
47 | ✔️ books_to_scrape\record.go
48 | ✔️ books_to_scrape\spider.go
49 |
50 | ✨ Congrates. books_to_scrape created successfully.
51 | ```
52 |
53 | ### spider.go
54 | In your __`spider.go`__ file, set up and execute your spider.
55 |
56 | For detailed code, please refer to the [sample code here](./_examples/scrapejsp_method2/scrapejsp/spider.go).
57 |
58 | ```go
59 | package scrapejsp
60 |
61 | import (
62 | "context"
63 | "encoding/json"
64 | "fmt"
65 | "log"
66 |
67 | "github.com/tech-engine/goscrapy/cmd/gos"
68 | "github.com/tech-engine/goscrapy/pkg/core"
69 | )
70 |
71 | type Spider struct {
72 | gos.ICoreSpider[*Record]
73 | }
74 |
75 | func NewSpider(ctx context.Context) (*Spider, <-chan error) {
76 |
77 | // use proxies
78 | // proxies := core.WithProxies("proxy_url1", "proxy_url2", ...)
79 | // core := gos.New[*Record]().WithClient(
80 | // gos.DefaultClient(proxies),
81 | // )
82 |
83 | core := gos.New[*Record]()
84 |
85 | // Add middlewares
86 | core.MiddlewareManager.Add(MIDDLEWARES...)
87 | // Add pipelines
88 | core.PipelineManager.Add(PIPELINES...)
89 |
90 | errCh := make(chan error)
91 |
92 | go func() {
93 | errCh <- core.Start(ctx)
94 | }()
95 |
96 | return &Spider{
97 | core,
98 | }, errCh
99 | }
100 |
101 | // This is the entrypoint to the spider
102 | func (s *Spider) StartRequest(ctx context.Context, job *Job) {
103 |
104 | req := s.NewRequest()
105 | // req.Meta("JOB", job)
106 | req.Url("https://jsonplaceholder.typicode.com/todos/1")
107 |
108 | s.Request(req, s.parse)
109 | }
110 |
111 | func (s *Spider) Close(ctx context.Context) {
112 | }
113 |
114 | func (s *Spider) parse(ctx context.Context, resp core.IResponseReader) {
115 | fmt.Printf("status: %d", resp.StatusCode())
116 |
117 | var data Record
118 | err := json.Unmarshal(resp.Bytes(), &data)
119 | if err != nil {
120 | log.Fatalln(err)
121 | }
122 |
123 | // to push to pipelines
124 | s.Yield(&data)
125 | }
126 | ```
127 |
128 |
129 |
130 |
131 |
132 | ## Wiki
133 | Please follow the [wiki](https://github.com/tech-engine/goscrapy/wiki) docs for details.
134 |
135 | ### Note
136 |
137 | **GoScrapy** is not stable, so its API may change drastically. Please exercise caution when using it in production.
138 |
139 | ## License
140 |
141 | **GoScrapy** is available under the BSL with an additional usage grant that allows free internal use. Please ensure that you agree with the license before contributing to **GoScrapy**, as by contributing to the GoScrapy project, you agree to the terms of the license.
142 |
143 | ## Roadmap
144 |
145 | - ~~Cookie management~~
146 | - ~~Builtin & Custom Middlewares support~~
147 | - ~~Css & Xpath Selectors~~
148 | - Logging
149 | - Triggers
150 | - Tests(work in progress)
151 |
152 | ## Partners
153 |
154 |
155 |
156 |
157 |
158 | ## Get in touch
159 | [Discord](https://discord.gg/FPvxETjYPH)
160 |
--------------------------------------------------------------------------------
/_examples/books.toscrape.com/books_to_scrape/constants.go:
--------------------------------------------------------------------------------
1 | package books_to_scrape
2 |
3 | // you can define your constants here
4 |
--------------------------------------------------------------------------------
/_examples/books.toscrape.com/books_to_scrape/errors.go:
--------------------------------------------------------------------------------
1 | package books_to_scrape
2 |
3 | // you can define your errors here
4 |
--------------------------------------------------------------------------------
/_examples/books.toscrape.com/books_to_scrape/job.go:
--------------------------------------------------------------------------------
1 | package books_to_scrape
2 |
// Job identifies a unit of scraping work. The id field is compulsory in a Job
// definition; you can add your own custom fields next to it.
type Job struct {
	id string
}

// NewJob returns a Job carrying the given id.
// do not delete/edit
func NewJob(id string) *Job {
	return &Job{
		id: id,
	}
}

// Id returns the job's identifier.
//
// It is safe to call on a nil *Job, in which case it returns the empty
// string instead of panicking on a nil dereference.
// do not delete/edit
func (j *Job) Id() string {
	if j == nil {
		return ""
	}
	return j.id
}

// Reset clears the job's identifier. Calling it on a nil *Job is a no-op.
// do not delete
func (j *Job) Reset() {
	if j == nil {
		return
	}
	j.id = ""
}
24 |
25 | // add your custom receiver functions below
26 |
--------------------------------------------------------------------------------
/_examples/books.toscrape.com/books_to_scrape/record.go:
--------------------------------------------------------------------------------
1 | package books_to_scrape
2 |
3 | import (
4 | "reflect"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/core"
7 | )
8 |
9 | /*
10 | json and csv struct field tags are required, if you want the Record to be exported
11 | or processed by builtin pipelines
12 | */
13 |
14 | type Record struct {
15 | J *Job `json:"-" csv:"-"` // JobId is required
16 | // add you custom fields here
17 | Title string `json:"title" csv:"title"`
18 | Price string `json:"price" csv:"price"`
19 | Stock string `json:"stock" csv:"stock"`
20 | Rating string `json:"rating" csv:"rating"`
21 | Description string `json:"description" csv:"description"`
22 | Upc string `json:"upc" csv:"upc"`
23 | ProductType string `json:"product_type" csv:"product_type"`
24 | Reviews string `json:"reviews" csv:"reviews"`
25 | }
26 |
27 | // modify below code only if you know what you are doing
28 | func (r *Record) Record() *Record {
29 | return r
30 | }
31 |
32 | func (r *Record) RecordKeys() []string {
33 | dataType := reflect.TypeOf(*r)
34 | if dataType.Kind() != reflect.Struct {
35 | panic("Record is not a struct")
36 | }
37 |
38 | numFields := dataType.NumField()
39 | keys := make([]string, numFields)
40 |
41 | for i := 0; i < numFields; i++ {
42 | field := dataType.Field(i)
43 | csvTag := field.Tag.Get("csv")
44 | keys[i] = csvTag
45 | }
46 |
47 | return keys
48 | }
49 |
50 | func (r *Record) RecordFlat() []any {
51 |
52 | inputType := reflect.TypeOf(*r)
53 |
54 | if inputType.Kind() != reflect.Struct {
55 | panic("Record is not a struct")
56 | }
57 |
58 | inputValue := reflect.ValueOf(*r)
59 |
60 | slice := make([]any, inputType.NumField())
61 |
62 | for i := 0; i < inputType.NumField(); i++ {
63 | slice[i] = inputValue.Field(i).Interface()
64 | }
65 | return slice
66 | }
67 |
68 | func (r *Record) Job() core.IJob {
69 | return r.J
70 | }
71 |
--------------------------------------------------------------------------------
/_examples/books.toscrape.com/books_to_scrape/settings.go:
--------------------------------------------------------------------------------
1 | package books_to_scrape
2 |
3 | import (
4 | "os"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/builtin/middlewares"
7 | "github.com/tech-engine/goscrapy/pkg/builtin/pipelines"
8 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager"
9 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager"
10 | )
11 |
12 | // HTTP Transport settings
13 |
14 | // Default: 10000
15 | const MIDDLEWARE_HTTP_TIMEOUT_MS = ""
16 |
17 | // Default: 100
18 | const MIDDLEWARE_HTTP_MAX_IDLE_CONN = ""
19 |
20 | // Default: 100
21 | const MIDDLEWARE_HTTP_MAX_CONN_PER_HOST = ""
22 |
23 | // Default: 100
24 | const MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST = ""
25 |
26 | // Inbuilt Retry middleware settings
27 |
28 | // Default: 3
29 | const MIDDLEWARE_HTTP_RETRY_MAX_RETRIES = ""
30 |
31 | // Default: 500, 502, 503, 504, 522, 524, 408, 429
32 | const MIDDLEWARE_HTTP_RETRY_CODES = ""
33 |
34 | // Default: 1s
35 | const MIDDLEWARE_HTTP_RETRY_BASE_DELAY = ""
36 |
37 | // Default: 1000000
38 | const SCHEDULER_REQ_RES_POOL_SIZE = ""
39 |
40 | // Default: num of CPU * 3
41 | const SCHEDULER_CONCURRENCY = ""
42 |
43 | // Default: 1000000
44 | const SCHEDULER_WORK_QUEUE_SIZE = ""
45 |
46 | // Pipeline Manager settings
47 |
48 | // Default: 10000
49 | const PIPELINEMANAGER_ITEMPOOL_SIZE = ""
50 |
51 | // Default: 24
52 | const PIPELINEMANAGER_ITEM_SIZE = ""
53 |
54 | // Default: 0
55 | const PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE = ""
56 |
57 | // Default: 1000
58 | const PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY = ""
59 |
60 | // Middlewares here
61 | // Executed in reverse order from bottom to top.
62 | var MIDDLEWARES = []middlewaremanager.Middleware{
63 | middlewares.Retry(),
64 | middlewares.MultiCookieJar,
65 | middlewares.DupeFilter,
66 | }
67 |
68 | var export2CSV = pipelines.Export2CSV[*Record](pipelines.Export2CSVOpts{
69 | Filename: "itstimeitsnowornever.csv",
70 | })
71 |
72 | // use export 2 json pipeline
73 | // var export2Json = pipelines.Export2JSON[*Record](pipelines.Export2JSONOpts{
74 | // Filename: "itstimeitsnowornever.json",
75 | // Immediate: true,
76 | // })
77 |
78 | // add pipeline to group
79 | //func myCustomPipelineGroup() *pm.Group[*Record] {
80 | // pipelineGroup := pm.NewGroup[*Record]()
81 | // pipelineGroup.Add(export2CSV)
82 | // // pipelineGroup.Add(export2Json)
83 | // return pipelineGroup
84 | //}
85 |
86 | // Pipelines here
87 | // Executed in the order they appear.
88 | var PIPELINES = []pm.IPipeline[*Record]{
89 | export2CSV,
90 | // export2Json,
91 | // myCustomPipelineGroup(),
92 | }
93 |
94 | func init() {
95 | var settings = map[string]string{
96 | "MIDDLEWARE_HTTP_TIMEOUT_MS": MIDDLEWARE_HTTP_TIMEOUT_MS,
97 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN": MIDDLEWARE_HTTP_MAX_IDLE_CONN,
98 | "MIDDLEWARE_HTTP_MAX_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_CONN_PER_HOST,
99 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST,
100 | "MIDDLEWARE_HTTP_RETRY_MAX_RETRIES": MIDDLEWARE_HTTP_RETRY_MAX_RETRIES,
101 | "MIDDLEWARE_HTTP_RETRY_CODES": MIDDLEWARE_HTTP_RETRY_CODES,
102 | "MIDDLEWARE_HTTP_RETRY_BASE_DELAY": MIDDLEWARE_HTTP_RETRY_BASE_DELAY,
103 | "SCHEDULER_REQ_RES_POOL_SIZE": SCHEDULER_REQ_RES_POOL_SIZE,
104 | "SCHEDULER_CONCURRENCY": SCHEDULER_CONCURRENCY,
105 | "SCHEDULER_WORK_QUEUE_SIZE": SCHEDULER_WORK_QUEUE_SIZE,
106 | "PIPELINEMANAGER_ITEMPOOL_SIZE": PIPELINEMANAGER_ITEMPOOL_SIZE,
107 | "PIPELINEMANAGER_ITEM_SIZE": PIPELINEMANAGER_ITEM_SIZE,
108 | "PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE": PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE,
109 | "PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY": PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY,
110 | }
111 |
112 | for key, value := range settings {
113 | if value != "" {
114 | os.Setenv(key, value)
115 | }
116 | }
117 | }
118 |
--------------------------------------------------------------------------------
/_examples/books.toscrape.com/books_to_scrape/spider.go:
--------------------------------------------------------------------------------
1 | package books_to_scrape
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "regexp"
7 | "strings"
8 |
9 | "github.com/tech-engine/goscrapy/cmd/gos"
10 | "github.com/tech-engine/goscrapy/pkg/core"
11 | )
12 |
13 | type Spider struct {
14 | gos.ICoreSpider[*Record]
15 | baseUrl string
16 | }
17 |
18 | func New(ctx context.Context) (*Spider, <-chan error) {
19 |
20 | // use proxies
21 | // proxies := core.WithProxies("proxy_url1", "proxy_url2", ...)
22 | // core := gos.New[*Record]().WithClient(
23 | // gos.DefaultClient(proxies),
24 | // )
25 |
26 | core := gos.New[*Record]()
27 |
28 | // Add middlewares
29 | core.MiddlewareManager.Add(MIDDLEWARES...)
30 | // Add pipelines
31 | core.PipelineManager.Add(PIPELINES...)
32 |
33 | errCh := make(chan error)
34 |
35 | spider := &Spider{
36 | core,
37 | "https://books.toscrape.com",
38 | }
39 |
40 | go func() {
41 | errCh <- core.Start(ctx)
42 | spider.Close(ctx)
43 | }()
44 |
45 | return spider, errCh
46 | }
47 |
48 | func (s *Spider) StartRequest(ctx context.Context, job *Job) {
49 |
50 | // for each request we must call NewRequest() and never reuse it
51 | req := s.NewRequest()
52 |
53 | // GET is the request method
54 | req.Url(s.baseUrl)
55 |
56 | s.Request(req, s.parse)
57 | }
58 |
59 | // can be called when spider is about to close
60 | func (s *Spider) Close(ctx context.Context) {
61 | fmt.Println("closing")
62 | }
63 |
64 | func (s *Spider) parse(ctx context.Context, resp core.IResponseReader) {
65 | fmt.Printf("GET: %d %s\n", resp.StatusCode(), resp.Request().URL.String())
66 | for _, productUrl := range resp.Css("article.product_pod h3 a").Attr("href") {
67 | req := s.NewRequest()
68 |
69 | if strings.HasPrefix(productUrl, "catalogue/") {
70 | productUrl = fmt.Sprintf("%s/%s", s.baseUrl, productUrl)
71 | } else {
72 | productUrl = fmt.Sprintf("%s/catalogue/%s", s.baseUrl, productUrl)
73 | }
74 |
75 | req.Url(productUrl)
76 | s.Request(req, s.parseProduct)
77 | fmt.Printf("GET: %s\n", productUrl)
78 | }
79 |
80 | // pagination
81 | nextUrls := resp.Css("li.next a").Attr("href")
82 |
83 | if len(nextUrls) <= 0 {
84 | return
85 | }
86 |
87 | nextUrl := fmt.Sprintf("%s/%s", s.baseUrl, nextUrls[0])
88 |
89 | if !strings.HasPrefix(nextUrls[0], "catalogue/") {
90 | nextUrl = fmt.Sprintf("%s/catalogue/%s", s.baseUrl, nextUrls[0])
91 | }
92 |
93 | req := s.NewRequest()
94 | req.Url(nextUrl)
95 | s.Request(req, s.parse)
96 | }
97 |
98 | func (s *Spider) parseProduct(ctx context.Context, resp core.IResponseReader) {
99 | product := resp.Css("article.product_page")
100 |
101 | var title string
102 | if titles := product.Css(".product_main h1").Text(); len(titles) > 0 {
103 | title = titles[0]
104 | }
105 |
106 | var price string
107 | if prices := product.Css(".price_color").Text(); len(prices) > 0 {
108 | price = prices[0]
109 | }
110 |
111 | var stock string
112 | if stocks := product.Css(".availability").Text(); len(stocks) > 0 {
113 | match := regexp.MustCompile(`\((\d+) available\)`).FindStringSubmatch(strings.TrimSpace(stocks[0]))
114 |
115 | if len(match) > 0 {
116 | stock = match[1]
117 | }
118 | }
119 |
120 | var rating string
121 | if ratingClassAttrs := product.Css(".star-rating").Attr("class"); len(ratingClassAttrs) > 0 {
122 | rating = strings.Split(ratingClassAttrs[0], " ")[1]
123 |
124 | }
125 |
126 | var productDescription string
127 | if productDescriptions := product.Css("#product_description + *").Text(); len(productDescriptions) > 0 {
128 | productDescription = productDescriptions[0]
129 | }
130 |
131 | var upc string
132 | if upcs := product.Css("table tr:nth-child(1) td").Text(); len(upcs) > 0 {
133 | upc = upcs[0]
134 | }
135 |
136 | var productType string
137 | if productTypes := product.Css("table tr:nth-child(2) td").Text(); len(productTypes) > 0 {
138 | productType = productTypes[0]
139 | }
140 |
141 | var reviewCount string
142 | if reviewCounts := product.Css("table tr:nth-child(7) td").Text(); len(reviewCounts) > 0 {
143 | reviewCount = reviewCounts[0]
144 | }
145 |
146 | s.Yield(&Record{
147 | Title: title,
148 | Price: price,
149 | Stock: stock,
150 | Rating: rating,
151 | Description: productDescription,
152 | Upc: upc,
153 | ProductType: productType,
154 | Reviews: reviewCount,
155 | })
156 | }
157 |
--------------------------------------------------------------------------------
/_examples/books.toscrape.com/go.mod:
--------------------------------------------------------------------------------
1 | module books_to_scrape
2 |
3 | go 1.21.0
4 |
5 | require github.com/tech-engine/goscrapy v0.13.1
6 |
7 | require (
8 | cloud.google.com/go v0.110.6 // indirect
9 | cloud.google.com/go/compute v1.23.0 // indirect
10 | cloud.google.com/go/compute/metadata v0.2.3 // indirect
11 | cloud.google.com/go/firestore v1.13.0 // indirect
12 | cloud.google.com/go/iam v1.1.1 // indirect
13 | cloud.google.com/go/longrunning v0.5.1 // indirect
14 | cloud.google.com/go/storage v1.33.0 // indirect
15 | firebase.google.com/go v3.13.0+incompatible // indirect
16 | github.com/andybalholm/cascadia v1.3.2 // indirect
17 | github.com/antchfx/htmlquery v1.3.2 // indirect
18 | github.com/antchfx/xpath v1.3.1 // indirect
19 | github.com/gocarina/gocsv v0.0.0-20230616125104-99d496ca653d // indirect
20 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
21 | github.com/golang/protobuf v1.5.3 // indirect
22 | github.com/golang/snappy v0.0.4 // indirect
23 | github.com/google/go-cmp v0.5.9 // indirect
24 | github.com/google/s2a-go v0.1.4 // indirect
25 | github.com/google/uuid v1.3.0 // indirect
26 | github.com/googleapis/enterprise-certificate-proxy v0.2.5 // indirect
27 | github.com/googleapis/gax-go/v2 v2.12.0 // indirect
28 | github.com/inconshreveable/mousetrap v1.1.0 // indirect
29 | github.com/klauspost/compress v1.13.6 // indirect
30 | github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe // indirect
31 | github.com/segmentio/fasthash v1.0.3 // indirect
32 | github.com/spf13/cobra v1.8.1 // indirect
33 | github.com/spf13/pflag v1.0.5 // indirect
34 | github.com/xdg-go/pbkdf2 v1.0.0 // indirect
35 | github.com/xdg-go/scram v1.1.2 // indirect
36 | github.com/xdg-go/stringprep v1.0.4 // indirect
37 | github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect
38 | go.mongodb.org/mongo-driver v1.12.1 // indirect
39 | go.opencensus.io v0.24.0 // indirect
40 | golang.org/x/crypto v0.21.0 // indirect
41 | golang.org/x/net v0.23.0 // indirect
42 | golang.org/x/oauth2 v0.11.0 // indirect
43 | golang.org/x/sync v0.3.0 // indirect
44 | golang.org/x/sys v0.18.0 // indirect
45 | golang.org/x/text v0.14.0 // indirect
46 | golang.org/x/time v0.3.0 // indirect
47 | golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
48 | google.golang.org/api v0.136.0 // indirect
49 | google.golang.org/appengine v1.6.7 // indirect
50 | google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 // indirect
51 | google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5 // indirect
52 | google.golang.org/genproto/googleapis/rpc v0.0.0-20230807174057-1744710a1577 // indirect
53 | google.golang.org/grpc v1.57.1 // indirect
54 | google.golang.org/protobuf v1.33.0 // indirect
55 | )
56 |
--------------------------------------------------------------------------------
/_examples/books.toscrape.com/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "fmt"
7 | "os"
8 | "os/signal"
9 | "sync"
10 | "syscall"
11 |
12 | // replace with your own project name
13 | "books_to_scrape/books_to_scrape"
14 | )
15 |
// OnTerminate is a sample terminate function to demonstrate spider
// termination. It blocks until the process receives an interrupt or
// termination signal, stops listening for further signals and then runs fn.
func OnTerminate(fn func()) {
	sigCtx, release := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGINT, syscall.SIGTERM)

	// Wait here until one of the registered signals arrives.
	<-sigCtx.Done()

	release()
	fn()
}
23 |
24 | func main() {
25 | ctx, cancel := context.WithCancel(context.Background())
26 |
27 | var wg sync.WaitGroup
28 | wg.Add(1)
29 |
30 | spider, errCh := books_to_scrape.New(ctx)
31 | go func() {
32 | defer wg.Done()
33 |
34 | err := <-errCh
35 |
36 | if err != nil && errors.Is(err, context.Canceled) {
37 | return
38 | }
39 |
40 | fmt.Printf("failed: %q", err)
41 | }()
42 |
43 | // start the scraper with a job, currently nil is passed but you can pass your job here
44 | spider.StartRequest(ctx, nil)
45 |
46 | OnTerminate(func() {
47 | fmt.Println("exit signal received: shutting down gracefully")
48 | cancel()
49 | wg.Wait()
50 | })
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/_examples/scrapejsp/go.mod:
--------------------------------------------------------------------------------
1 | module scrapejsp
2 |
3 | go 1.21.0
4 |
5 | require github.com/tech-engine/goscrapy v0.12.0
6 |
7 | require (
8 | cloud.google.com/go v0.110.6 // indirect
9 | cloud.google.com/go/compute v1.23.0 // indirect
10 | cloud.google.com/go/compute/metadata v0.2.3 // indirect
11 | cloud.google.com/go/firestore v1.13.0 // indirect
12 | cloud.google.com/go/iam v1.1.1 // indirect
13 | cloud.google.com/go/longrunning v0.5.1 // indirect
14 | cloud.google.com/go/storage v1.33.0 // indirect
15 | firebase.google.com/go v3.13.0+incompatible // indirect
16 | github.com/gocarina/gocsv v0.0.0-20230616125104-99d496ca653d // indirect
17 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
18 | github.com/golang/protobuf v1.5.3 // indirect
19 | github.com/golang/snappy v0.0.4 // indirect
20 | github.com/google/go-cmp v0.5.9 // indirect
21 | github.com/google/s2a-go v0.1.4 // indirect
22 | github.com/google/uuid v1.3.0 // indirect
23 | github.com/googleapis/enterprise-certificate-proxy v0.2.5 // indirect
24 | github.com/googleapis/gax-go/v2 v2.12.0 // indirect
25 | github.com/klauspost/compress v1.13.6 // indirect
26 | github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe // indirect
27 | github.com/segmentio/fasthash v1.0.3 // indirect
28 | github.com/xdg-go/pbkdf2 v1.0.0 // indirect
29 | github.com/xdg-go/scram v1.1.2 // indirect
30 | github.com/xdg-go/stringprep v1.0.4 // indirect
31 | github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect
32 | go.mongodb.org/mongo-driver v1.12.1 // indirect
33 | go.opencensus.io v0.24.0 // indirect
34 | golang.org/x/crypto v0.21.0 // indirect
35 | golang.org/x/net v0.23.0 // indirect
36 | golang.org/x/oauth2 v0.11.0 // indirect
37 | golang.org/x/sync v0.3.0 // indirect
38 | golang.org/x/sys v0.18.0 // indirect
39 | golang.org/x/text v0.14.0 // indirect
40 | golang.org/x/time v0.3.0 // indirect
41 | golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
42 | google.golang.org/api v0.136.0 // indirect
43 | google.golang.org/appengine v1.6.7 // indirect
44 | google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 // indirect
45 | google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5 // indirect
46 | google.golang.org/genproto/googleapis/rpc v0.0.0-20230807174057-1744710a1577 // indirect
47 | google.golang.org/grpc v1.57.1 // indirect
48 | google.golang.org/protobuf v1.33.0 // indirect
49 | )
50 |
--------------------------------------------------------------------------------
/_examples/scrapejsp/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "fmt"
7 | "os"
8 | "os/signal"
9 | "scrapejsp/scrapejsp"
10 | "sync"
11 | "syscall"
12 |
13 | // replace with your own project name
14 |
15 | "github.com/tech-engine/goscrapy/cmd/gos"
16 | )
17 |
// OnTerminate is a sample terminate function to demonstrate spider
// termination. It blocks until the process receives an interrupt or
// termination signal, stops listening for further signals and then runs fn.
func OnTerminate(fn func()) {
	sigCtx, release := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGINT, syscall.SIGTERM)

	// Wait here until one of the registered signals arrives.
	<-sigCtx.Done()

	release()
	fn()
}
25 |
26 | func main() {
27 | ctx, cancel := context.WithCancel(context.Background())
28 |
29 | var wg sync.WaitGroup
30 | wg.Add(1)
31 |
32 | // get core spider
33 | gos := gos.New[*scrapejsp.Record]()
34 |
35 | // use proxies
36 | // proxies := gos.WithProxies("proxy_url1", "proxy_url2", ...)
37 |
38 | // get core spider
39 | // gos := gos.New[*scrapejsp.Record]().WithClient(
40 | // gos.DefaultClient(proxies),
41 | // )
42 |
43 | // use middlewares
44 | gos.MiddlewareManager.Add(scrapejsp.MIDDLEWARES...)
45 |
46 | // use pipelines
47 | gos.PipelineManager.Add(scrapejsp.PIPELINES...)
48 |
49 | // export2Csv := pipelines.Export2CSV[*scrapejsp.Record](pipelines.Export2CSVOpts{
50 | // Filename: "itstimeitsnowornever.csv",
51 | // })
52 |
53 | // // use export 2 json pipeline
54 | // export2Json := pipelines.Export2JSON[*scrapejsp.Record](pipelines.Export2JSONOpts{
55 | // Filename: "itstimeitsnowornever.json",
56 | // Immediate: true,
57 | // })
58 |
59 | // add pipeline to group
60 | // pipelineGroup := pm.NewGroup[*scrapejsp.Record]()
61 | // pipelineGroup.Add(export2Csv)
62 | // pipelineGroup.Add(export2Json)
63 | // gos.PipelineManager.Add(
64 | // pipelineGroup,
65 | // )
66 |
67 | go func() {
68 | defer wg.Done()
69 |
70 | err := gos.Start(ctx)
71 |
72 | if err != nil && errors.Is(err, context.Canceled) {
73 | return
74 | }
75 |
76 | fmt.Printf("failed: %q", err)
77 | }()
78 |
79 | spider := scrapejsp.NewSpider(gos)
80 |
81 | // start the scraper with a job, currently nil is passed but you can pass your job here
82 | spider.StartRequest(ctx, nil)
83 |
84 | OnTerminate(func() {
85 | fmt.Println("exit signal received: shutting down gracefully")
86 | cancel()
87 | wg.Wait()
88 | })
89 |
90 | }
91 |
--------------------------------------------------------------------------------
/_examples/scrapejsp/scrapejsp/constants.go:
--------------------------------------------------------------------------------
1 | package scrapejsp
2 |
3 | // you can define your constants here
4 |
--------------------------------------------------------------------------------
/_examples/scrapejsp/scrapejsp/errors.go:
--------------------------------------------------------------------------------
1 | package scrapejsp
2 |
3 | // you can define your errors here
4 |
--------------------------------------------------------------------------------
/_examples/scrapejsp/scrapejsp/job.go:
--------------------------------------------------------------------------------
1 | package scrapejsp
2 |
// Job describes one unit of scraping work. The id field is compulsory in a
// Job definition; you can add your own custom fields next to it.
type Job struct {
	id string
	// query string
}

// NewJob returns a Job carrying the given id.
// do not delete/edit
func NewJob(id string) *Job {
	job := new(Job)
	job.id = id
	return job
}

// Id reports the identifier stored in the job.
// do not delete/edit
func (j *Job) Id() string { return j.id }

// Reset blanks the job's identifier.
// do not delete
func (j *Job) Reset() { j.id = "" }
25 |
26 | // add your custom receiver functions below
27 | // func (j *Job) SetQuery(query string) {
28 | // j.query = query
29 | // return
30 | // }
31 |
--------------------------------------------------------------------------------
/_examples/scrapejsp/scrapejsp/record.go:
--------------------------------------------------------------------------------
1 | package scrapejsp
2 |
3 | import (
4 | "reflect"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/core"
7 | )
8 |
9 | // do not modify this file
10 |
11 | type Record struct {
12 | J *Job `json:"-" csv:"-"` // JobId is required
13 | UserId int `csv:"userId" json:"userId"`
14 | Id int `csv:"id" json:"id"`
15 | Title string `csv:"title" json:"title"`
16 | Completed bool `csv:"completed" json:"completed"`
17 | }
18 |
19 | func (r *Record) Record() *Record {
20 | return r
21 | }
22 |
23 | func (r *Record) RecordKeys() []string {
24 | dataType := reflect.TypeOf(*r)
25 | if dataType.Kind() != reflect.Struct {
26 | panic("Record is not a struct")
27 | }
28 |
29 | numFields := dataType.NumField()
30 | keys := make([]string, numFields)
31 |
32 | for i := 0; i < numFields; i++ {
33 | field := dataType.Field(i)
34 | csvTag := field.Tag.Get("csv")
35 | keys[i] = csvTag
36 | }
37 |
38 | return keys
39 | }
40 |
41 | func (r *Record) RecordFlat() []any {
42 |
43 | inputType := reflect.TypeOf(*r)
44 |
45 | if inputType.Kind() != reflect.Struct {
46 | panic("Record is not a struct")
47 | }
48 |
49 | inputValue := reflect.ValueOf(*r)
50 |
51 | slice := make([]any, inputType.NumField())
52 |
53 | for i := 0; i < inputType.NumField(); i++ {
54 | slice[i] = inputValue.Field(i).Interface()
55 | }
56 | return slice
57 | }
58 |
59 | func (r *Record) Job() core.IJob {
60 | return r.J
61 | }
62 |
--------------------------------------------------------------------------------
/_examples/scrapejsp/scrapejsp/settings.go:
--------------------------------------------------------------------------------
1 | package scrapejsp
2 |
3 | import (
4 | "os"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/builtin/middlewares"
7 | "github.com/tech-engine/goscrapy/pkg/builtin/pipelines"
8 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager"
9 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager"
10 | )
11 |
// Project settings. Every value is a string; an empty string leaves the
// setting unset, because init only exports non-empty values to the
// environment.
const (
	// HTTP Transport settings

	// Default: 10000
	MIDDLEWARE_HTTP_TIMEOUT_MS = ""

	// Default: 100
	MIDDLEWARE_HTTP_MAX_IDLE_CONN = ""

	// Default: 100
	MIDDLEWARE_HTTP_MAX_CONN_PER_HOST = ""

	// Default: 100
	MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST = ""

	// Inbuilt Retry middleware settings

	// Default: 3
	MIDDLEWARE_HTTP_RETRY_MAX_RETRIES = ""

	// Default: 500, 502, 503, 504, 522, 524, 408, 429
	MIDDLEWARE_HTTP_RETRY_CODES = ""

	// Default: 1s
	MIDDLEWARE_HTTP_RETRY_BASE_DELAY = ""

	// Default: 1000000
	SCHEDULER_REQ_RES_POOL_SIZE = ""

	// Default: num of CPU * 3
	SCHEDULER_CONCURRENCY = ""

	// Default: 1000000
	SCHEDULER_WORK_QUEUE_SIZE = ""

	// Pipeline Manager settings

	// Default: 10000
	PIPELINEMANAGER_ITEMPOOL_SIZE = ""

	// Default: 24
	PIPELINEMANAGER_ITEM_SIZE = ""

	// Default: 0
	PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE = ""

	// Default: 1000
	PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY = ""
)
59 |
// Middlewares here
// MIDDLEWARES lists the middlewares this project registers with the core
// spider's MiddlewareManager.
// Executed in reverse order from bottom to top.
var MIDDLEWARES = []middlewaremanager.Middleware{
	middlewares.Retry(),
	middlewares.MultiCookieJar,
	middlewares.DupeFilter,
}
67 |
// export2CSV is the built-in CSV export pipeline, configured with the output
// filename "itstimeitsnowornever.csv".
var export2CSV = pipelines.Export2CSV[*Record](pipelines.Export2CSVOpts{
	Filename: "itstimeitsnowornever.csv",
})
71 |
72 | // use export 2 json pipeline
73 | // var export2Json = pipelines.Export2JSON[*Record](pipelines.Export2JSONOpts{
74 | // Filename: "itstimeitsnowornever.json",
75 | // Immediate: true,
76 | // })
77 |
78 | // add pipeline to group
79 | //func myCustomPipelineGroup() *pm.Group[*Record] {
80 | // pipelineGroup := pm.NewGroup[*Record]()
81 | // pipelineGroup.Add(export2CSV)
82 | // // pipelineGroup.Add(export2Json)
83 | // return pipelineGroup
84 | //}
85 |
// Pipelines here
// PIPELINES lists the pipelines this project registers with the core
// spider's PipelineManager.
// Executed in the order they appear.
var PIPELINES = []pm.IPipeline[*Record]{
	export2CSV,
	// export2Json,
	// myCustomPipelineGroup(),
}
93 |
94 | func init() {
95 | var settings = map[string]string{
96 | "MIDDLEWARE_HTTP_TIMEOUT_MS": MIDDLEWARE_HTTP_TIMEOUT_MS,
97 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN": MIDDLEWARE_HTTP_MAX_IDLE_CONN,
98 | "MIDDLEWARE_HTTP_MAX_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_CONN_PER_HOST,
99 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST,
100 | "MIDDLEWARE_HTTP_RETRY_MAX_RETRIES": MIDDLEWARE_HTTP_RETRY_MAX_RETRIES,
101 | "MIDDLEWARE_HTTP_RETRY_CODES": MIDDLEWARE_HTTP_RETRY_CODES,
102 | "MIDDLEWARE_HTTP_RETRY_BASE_DELAY": MIDDLEWARE_HTTP_RETRY_BASE_DELAY,
103 | "SCHEDULER_REQ_RES_POOL_SIZE": SCHEDULER_REQ_RES_POOL_SIZE,
104 | "SCHEDULER_CONCURRENCY": SCHEDULER_CONCURRENCY,
105 | "SCHEDULER_WORK_QUEUE_SIZE": SCHEDULER_WORK_QUEUE_SIZE,
106 | "PIPELINEMANAGER_ITEMPOOL_SIZE": PIPELINEMANAGER_ITEMPOOL_SIZE,
107 | "PIPELINEMANAGER_ITEM_SIZE": PIPELINEMANAGER_ITEM_SIZE,
108 | "PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE": PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE,
109 | "PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY": PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY,
110 | }
111 |
112 | for key, value := range settings {
113 | if value != "" {
114 | os.Setenv(key, value)
115 | }
116 | }
117 | }
118 |
--------------------------------------------------------------------------------
/_examples/scrapejsp/scrapejsp/spider.go:
--------------------------------------------------------------------------------
1 | package scrapejsp
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "fmt"
7 | "log"
8 |
9 | "github.com/tech-engine/goscrapy/cmd/gos"
10 | "github.com/tech-engine/goscrapy/pkg/core"
11 | )
12 |
13 | type Spider struct {
14 | gos.ICoreSpider[*Record]
15 | }
16 |
17 | func NewSpider(core gos.ICoreSpider[*Record]) *Spider {
18 | return &Spider{
19 | core,
20 | }
21 | }
22 |
23 | // This is the entrypoint to the spider
24 | func (s *Spider) StartRequest(ctx context.Context, job *Job) {
25 |
26 | req := s.NewRequest()
27 | // req.Meta("JOB", job)
28 | req.Url("https://jsonplaceholder.typicode.com/todos/1")
29 |
30 | s.Request(req, s.parse)
31 | }
32 |
// Close is a no-op in this example.
// NOTE(review): presumably a shutdown hook called by the framework — confirm.
func (s *Spider) Close(ctx context.Context) {
}
35 |
36 | func (s *Spider) parse(ctx context.Context, resp core.IResponseReader) {
37 | fmt.Printf("status: %d", resp.StatusCode())
38 |
39 | var data Record
40 | err := json.Unmarshal(resp.Bytes(), &data)
41 | if err != nil {
42 | log.Fatalln(err)
43 | }
44 |
45 | // to push to pipelines
46 | s.Yield(&data)
47 | }
48 |
--------------------------------------------------------------------------------
/_examples/scrapejsp/utils/helper.go:
--------------------------------------------------------------------------------
1 | package utils
2 |
3 | import (
4 | "context"
5 | "os"
6 | "os/signal"
7 | "syscall"
8 | )
9 |
// OnTerminate blocks until the process receives an interrupt or termination
// signal and then invokes fn.
func OnTerminate(fn func()) {
	sigCtx, release := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGINT, syscall.SIGTERM)

	// Park here until one of the registered signals arrives.
	<-sigCtx.Done()

	// Drop the signal registration before handing control to the callback.
	release()
	fn()
}
16 |
--------------------------------------------------------------------------------
/_examples/scrapejsp_method2/go.mod:
--------------------------------------------------------------------------------
1 | module scrapejsp
2 |
3 | go 1.21.0
4 |
5 | require github.com/tech-engine/goscrapy v0.12.1
6 |
7 | require (
8 | cloud.google.com/go v0.110.6 // indirect
9 | cloud.google.com/go/compute v1.23.0 // indirect
10 | cloud.google.com/go/compute/metadata v0.2.3 // indirect
11 | cloud.google.com/go/firestore v1.13.0 // indirect
12 | cloud.google.com/go/iam v1.1.1 // indirect
13 | cloud.google.com/go/longrunning v0.5.1 // indirect
14 | cloud.google.com/go/storage v1.33.0 // indirect
15 | firebase.google.com/go v3.13.0+incompatible // indirect
16 | github.com/gocarina/gocsv v0.0.0-20230616125104-99d496ca653d // indirect
17 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
18 | github.com/golang/protobuf v1.5.3 // indirect
19 | github.com/golang/snappy v0.0.4 // indirect
20 | github.com/google/go-cmp v0.5.9 // indirect
21 | github.com/google/s2a-go v0.1.4 // indirect
22 | github.com/google/uuid v1.3.0 // indirect
23 | github.com/googleapis/enterprise-certificate-proxy v0.2.5 // indirect
24 | github.com/googleapis/gax-go/v2 v2.12.0 // indirect
25 | github.com/klauspost/compress v1.13.6 // indirect
26 | github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe // indirect
27 | github.com/segmentio/fasthash v1.0.3 // indirect
28 | github.com/xdg-go/pbkdf2 v1.0.0 // indirect
29 | github.com/xdg-go/scram v1.1.2 // indirect
30 | github.com/xdg-go/stringprep v1.0.4 // indirect
31 | github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect
32 | go.mongodb.org/mongo-driver v1.12.1 // indirect
33 | go.opencensus.io v0.24.0 // indirect
34 | golang.org/x/crypto v0.31.0 // indirect
35 | golang.org/x/net v0.23.0 // indirect
36 | golang.org/x/oauth2 v0.11.0 // indirect
37 | golang.org/x/sync v0.10.0 // indirect
38 | golang.org/x/sys v0.28.0 // indirect
39 | golang.org/x/text v0.21.0 // indirect
40 | golang.org/x/time v0.3.0 // indirect
41 | golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
42 | google.golang.org/api v0.136.0 // indirect
43 | google.golang.org/appengine v1.6.7 // indirect
44 | google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 // indirect
45 | google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5 // indirect
46 | google.golang.org/genproto/googleapis/rpc v0.0.0-20230807174057-1744710a1577 // indirect
47 | google.golang.org/grpc v1.57.1 // indirect
48 | google.golang.org/protobuf v1.33.0 // indirect
49 | )
50 |
--------------------------------------------------------------------------------
/_examples/scrapejsp_method2/itstimeitsnowornever.csv:
--------------------------------------------------------------------------------
1 | userId,id,title,completed
2 | 1,1,delectus aut autem,false
3 | 1,1,delectus aut autem,false
4 |
--------------------------------------------------------------------------------
/_examples/scrapejsp_method2/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "fmt"
7 | "os"
8 | "os/signal"
9 | "sync"
10 | "syscall"
11 |
12 | // replace with your own project name
13 | "scrapejsp/scrapejsp"
14 | )
15 |
// OnTerminate is a sample helper demonstrating spider termination: it blocks
// until the process receives an interrupt or termination signal and then
// invokes fn.
func OnTerminate(fn func()) {
	sigCtx, release := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGINT, syscall.SIGTERM)

	// Park here until one of the registered signals arrives.
	<-sigCtx.Done()

	// Drop the signal registration before handing control to the callback.
	release()
	fn()
}
23 |
24 | func main() {
25 | ctx, cancel := context.WithCancel(context.Background())
26 |
27 | var wg sync.WaitGroup
28 | wg.Add(1)
29 |
30 | spider, errCh := scrapejsp.NewSpider(ctx)
31 |
32 | go func() {
33 | defer wg.Done()
34 |
35 | err := <-errCh
36 |
37 | if err != nil && errors.Is(err, context.Canceled) {
38 | return
39 | }
40 |
41 | fmt.Printf("failed: %q", err)
42 | }()
43 |
44 | // start the scraper with a job, currently nil is passed but you can pass your job here
45 | spider.StartRequest(ctx, nil)
46 |
47 | OnTerminate(func() {
48 | fmt.Println("exit signal received: shutting down gracefully")
49 | cancel()
50 | wg.Wait()
51 | })
52 |
53 | }
54 |
--------------------------------------------------------------------------------
/_examples/scrapejsp_method2/scrapejsp/constants.go:
--------------------------------------------------------------------------------
1 | package scrapejsp
2 |
3 | // you can define your constants here
4 |
--------------------------------------------------------------------------------
/_examples/scrapejsp_method2/scrapejsp/errors.go:
--------------------------------------------------------------------------------
1 | package scrapejsp
2 |
3 | // you can define your errors here
4 |
--------------------------------------------------------------------------------
/_examples/scrapejsp_method2/scrapejsp/job.go:
--------------------------------------------------------------------------------
1 | package scrapejsp
2 |
// Job describes one unit of scraping work. The id field is compulsory in a
// Job definition; you can add your own custom fields next to it.
type Job struct {
	id string
	// query string
}

// NewJob returns a Job carrying the given id.
// do not delete/edit
func NewJob(id string) *Job {
	job := new(Job)
	job.id = id
	return job
}

// Id reports the identifier stored in the job.
// do not delete/edit
func (j *Job) Id() string { return j.id }

// Reset blanks the job's identifier.
// do not delete
func (j *Job) Reset() { j.id = "" }
25 |
26 | // add your custom receiver functions below
27 | // func (j *Job) SetQuery(query string) {
28 | // j.query = query
29 | // return
30 | // }
31 |
--------------------------------------------------------------------------------
/_examples/scrapejsp_method2/scrapejsp/record.go:
--------------------------------------------------------------------------------
1 | package scrapejsp
2 |
3 | import (
4 | "reflect"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/core"
7 | )
8 |
9 | // do not modify this file
10 |
11 | type Record struct {
12 | J *Job `json:"-" csv:"-"` // JobId is required
13 | UserId int `csv:"userId" json:"userId"`
14 | Id int `csv:"id" json:"id"`
15 | Title string `csv:"title" json:"title"`
16 | Completed bool `csv:"completed" json:"completed"`
17 | }
18 |
19 | func (r *Record) Record() *Record {
20 | return r
21 | }
22 |
23 | func (r *Record) RecordKeys() []string {
24 | dataType := reflect.TypeOf(*r)
25 | if dataType.Kind() != reflect.Struct {
26 | panic("Record is not a struct")
27 | }
28 |
29 | numFields := dataType.NumField()
30 | keys := make([]string, numFields)
31 |
32 | for i := 0; i < numFields; i++ {
33 | field := dataType.Field(i)
34 | csvTag := field.Tag.Get("csv")
35 | keys[i] = csvTag
36 | }
37 |
38 | return keys
39 | }
40 |
41 | func (r *Record) RecordFlat() []any {
42 |
43 | inputType := reflect.TypeOf(*r)
44 |
45 | if inputType.Kind() != reflect.Struct {
46 | panic("Record is not a struct")
47 | }
48 |
49 | inputValue := reflect.ValueOf(*r)
50 |
51 | slice := make([]any, inputType.NumField())
52 |
53 | for i := 0; i < inputType.NumField(); i++ {
54 | slice[i] = inputValue.Field(i).Interface()
55 | }
56 | return slice
57 | }
58 |
59 | func (r *Record) Job() core.IJob {
60 | return r.J
61 | }
62 |
--------------------------------------------------------------------------------
/_examples/scrapejsp_method2/scrapejsp/settings.go:
--------------------------------------------------------------------------------
1 | package scrapejsp
2 |
3 | import (
4 | "os"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/builtin/middlewares"
7 | "github.com/tech-engine/goscrapy/pkg/builtin/pipelines"
8 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager"
9 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager"
10 | )
11 |
// Project settings. Every value is a string; an empty string leaves the
// setting unset, because init only exports non-empty values to the
// environment.
const (
	// HTTP Transport settings

	// Default: 10000
	MIDDLEWARE_HTTP_TIMEOUT_MS = ""

	// Default: 100
	MIDDLEWARE_HTTP_MAX_IDLE_CONN = ""

	// Default: 100
	MIDDLEWARE_HTTP_MAX_CONN_PER_HOST = ""

	// Default: 100
	MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST = ""

	// Inbuilt Retry middleware settings

	// Default: 3
	MIDDLEWARE_HTTP_RETRY_MAX_RETRIES = ""

	// Default: 500, 502, 503, 504, 522, 524, 408, 429
	MIDDLEWARE_HTTP_RETRY_CODES = ""

	// Default: 1s
	MIDDLEWARE_HTTP_RETRY_BASE_DELAY = ""

	// Default: 1000000
	SCHEDULER_REQ_RES_POOL_SIZE = ""

	// Default: num of CPU * 3
	SCHEDULER_CONCURRENCY = ""

	// Default: 1000000
	SCHEDULER_WORK_QUEUE_SIZE = ""

	// Pipeline Manager settings

	// Default: 10000
	PIPELINEMANAGER_ITEMPOOL_SIZE = ""

	// Default: 24
	PIPELINEMANAGER_ITEM_SIZE = ""

	// Default: 0
	PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE = ""

	// Default: 1000
	PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY = ""
)
59 |
// Middlewares here
// MIDDLEWARES lists the middlewares that NewSpider registers with the core
// spider's MiddlewareManager.
// Executed in reverse order from bottom to top.
var MIDDLEWARES = []middlewaremanager.Middleware{
	middlewares.Retry(),
	middlewares.MultiCookieJar,
	middlewares.DupeFilter,
}
67 |
// export2CSV is the built-in CSV export pipeline, configured with the output
// filename "itstimeitsnowornever.csv".
var export2CSV = pipelines.Export2CSV[*Record](pipelines.Export2CSVOpts{
	Filename: "itstimeitsnowornever.csv",
})
71 |
72 | // use export 2 json pipeline
73 | // var export2Json = pipelines.Export2JSON[*Record](pipelines.Export2JSONOpts{
74 | // Filename: "itstimeitsnowornever.json",
75 | // Immediate: true,
76 | // })
77 |
78 | // add pipeline to group
79 | func myCustomPipelineGroup() *pm.Group[*Record] {
80 | pipelineGroup := pm.NewGroup[*Record]()
81 | pipelineGroup.Add(export2CSV)
82 | // pipelineGroup.Add(export2Json)
83 | return pipelineGroup
84 | }
85 |
// Pipelines here
// PIPELINES lists the pipelines that NewSpider registers with the core
// spider's PipelineManager.
// Executed in the order they appear.
// NOTE(review): export2CSV is registered twice here — once directly and once
// through myCustomPipelineGroup(). Presumably every record is therefore
// exported twice; confirm this is intended.
var PIPELINES = []pm.IPipeline[*Record]{
	export2CSV,
	// export2Json,
	myCustomPipelineGroup(),
}
93 |
94 | func init() {
95 | var settings = map[string]string{
96 | "MIDDLEWARE_HTTP_TIMEOUT_MS": MIDDLEWARE_HTTP_TIMEOUT_MS,
97 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN": MIDDLEWARE_HTTP_MAX_IDLE_CONN,
98 | "MIDDLEWARE_HTTP_MAX_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_CONN_PER_HOST,
99 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST,
100 | "MIDDLEWARE_HTTP_RETRY_MAX_RETRIES": MIDDLEWARE_HTTP_RETRY_MAX_RETRIES,
101 | "MIDDLEWARE_HTTP_RETRY_CODES": MIDDLEWARE_HTTP_RETRY_CODES,
102 | "MIDDLEWARE_HTTP_RETRY_BASE_DELAY": MIDDLEWARE_HTTP_RETRY_BASE_DELAY,
103 | "SCHEDULER_REQ_RES_POOL_SIZE": SCHEDULER_REQ_RES_POOL_SIZE,
104 | "SCHEDULER_CONCURRENCY": SCHEDULER_CONCURRENCY,
105 | "SCHEDULER_WORK_QUEUE_SIZE": SCHEDULER_WORK_QUEUE_SIZE,
106 | "PIPELINEMANAGER_ITEMPOOL_SIZE": PIPELINEMANAGER_ITEMPOOL_SIZE,
107 | "PIPELINEMANAGER_ITEM_SIZE": PIPELINEMANAGER_ITEM_SIZE,
108 | "PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE": PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE,
109 | "PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY": PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY,
110 | }
111 |
112 | for key, value := range settings {
113 | if value != "" {
114 | os.Setenv(key, value)
115 | }
116 | }
117 | }
118 |
--------------------------------------------------------------------------------
/_examples/scrapejsp_method2/scrapejsp/spider.go:
--------------------------------------------------------------------------------
1 | package scrapejsp
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "fmt"
7 | "log"
8 |
9 | "github.com/tech-engine/goscrapy/cmd/gos"
10 | "github.com/tech-engine/goscrapy/pkg/core"
11 | )
12 |
13 | type Spider struct {
14 | gos.ICoreSpider[*Record]
15 | }
16 |
17 | func NewSpider(ctx context.Context) (*Spider, <-chan error) {
18 |
19 | // use proxies
20 | // proxies := core.WithProxies("proxy_url1", "proxy_url2", ...)
21 | // core := gos.New[*Record]().WithClient(
22 | // gos.DefaultClient(proxies),
23 | // )
24 |
25 | core := gos.New[*Record]()
26 |
27 | // Add middlewares
28 | core.MiddlewareManager.Add(MIDDLEWARES...)
29 | // Add pipelines
30 | core.PipelineManager.Add(PIPELINES...)
31 |
32 | errCh := make(chan error)
33 |
34 | go func() {
35 | errCh <- core.Start(ctx)
36 | }()
37 |
38 | return &Spider{
39 | core,
40 | }, errCh
41 | }
42 |
43 | // This is the entrypoint to the spider
44 | func (s *Spider) StartRequest(ctx context.Context, job *Job) {
45 |
46 | req := s.NewRequest()
47 | // req.Meta("JOB", job)
48 | req.Url("https://jsonplaceholder.typicode.com/todos/1")
49 |
50 | s.Request(req, s.parse)
51 | }
52 |
// Close is a no-op in this example.
// NOTE(review): presumably a shutdown hook called by the framework — confirm.
func (s *Spider) Close(ctx context.Context) {
}
55 |
56 | func (s *Spider) parse(ctx context.Context, resp core.IResponseReader) {
57 | fmt.Printf("status: %d", resp.StatusCode())
58 |
59 | var data Record
60 | err := json.Unmarshal(resp.Bytes(), &data)
61 | if err != nil {
62 | log.Fatalln(err)
63 | }
64 |
65 | // to push to pipelines
66 | s.Yield(&data)
67 | }
68 |
--------------------------------------------------------------------------------
/assets/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tech-engine/goscrapy/bc83234e737e68850fe1c8ac4a0d36fdb5d32e85/assets/demo.gif
--------------------------------------------------------------------------------
/assets/logo.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tech-engine/goscrapy/bc83234e737e68850fe1c8ac4a0d36fdb5d32e85/assets/logo.webp
--------------------------------------------------------------------------------
/cmd/cli/cli.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2023 Tech Engine
3 | */
4 | package cli
5 |
6 | import (
7 | "fmt"
8 | "os"
9 |
10 | "github.com/spf13/cobra"
11 | )
12 |
// VERSION is the CLI version string; rootCmd exposes it through its Version field.
const VERSION = "0.11.1"
14 |
15 | const BANNER_MSG = `
16 | _________ ________
17 | __ ____/______ __ ___/_____________________ _________ _____ __
18 | _ / __ _ __ \_____ \ _ ___/__ ___/_ __ '/___ __ \__ / / /
19 | / /_/ / / /_/ /____/ / / /__ _ / / /_/ / __ /_/ /_ /_/ /
20 | \____/ \____/ /____/ \___/ /_/ \__,_/ _ .___/ _\__, /
21 | /_/ /____/
22 |
23 | GoScrapy: Harnessing Go's power for efficient web scraping, inspired by Python's Scrapy framework.`
24 |
// rootCmd represents the base command when called without any subcommands.
// It carries the banner as its long description and VERSION as its version;
// subcommands attach themselves to it from their own init functions.
var rootCmd = &cobra.Command{
	Use:     "goscrapy [command]",
	Short:   "A command line tool to everything related to GoScrapy.",
	Long:    BANNER_MSG,
	Version: VERSION,
}
32 |
33 | // Execute adds all child commands to the root command and sets flags appropriately.
34 | // This is called by main.main(). It only needs to happen once to the rootCmd.
35 | func Execute() {
36 | if err := rootCmd.Execute(); err != nil {
37 | fmt.Printf("Whoops :( !!! There was an error '%s'", err.Error())
38 | os.Exit(1)
39 | }
40 | }
41 |
// init is intentionally empty: each subcommand registers itself on rootCmd
// from the init function of its own file.
func init() {
}
44 |
--------------------------------------------------------------------------------
/cmd/cli/pipeline.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2023 Tech Engine
3 | */
4 | package cli
5 |
6 | import (
7 | "bytes"
8 | "fmt"
9 | "go/format"
10 | "path/filepath"
11 | "regexp"
12 | "strings"
13 | "text/template"
14 | "unicode"
15 |
16 | "github.com/spf13/cobra"
17 | )
18 |
19 | var pipelineCmd = &cobra.Command{
20 | Use: "pipeline [pipeline-name]",
21 | Short: "Creates a new GoScrapy pipeline with the specified name",
22 | Args: cobra.ExactArgs(1),
23 | Run: func(cmd *cobra.Command, args []string) {
24 | var pipelineName = strings.TrimSpace(args[0])
25 |
26 | if pipelineName == "" {
27 | fmt.Printf("⚠️ please provide a pipeline-name")
28 | return
29 | }
30 |
31 | // read template
32 | tmplContent, err := templatesFS.ReadFile("templates/pipeline.tmpl")
33 |
34 | if err != nil {
35 | fmt.Printf("❌ Error reading template: %v", err)
36 | return
37 | }
38 |
39 | tmpl, err := template.New(pipelineName).
40 | Funcs(template.FuncMap{
41 | "capitalizeFirstLetter": capitalizeFirstLetter,
42 | }).Parse(string(tmplContent))
43 |
44 | if err != nil {
45 | fmt.Printf("❌ Error parsing template: %v", err)
46 | return
47 | }
48 |
49 | buffer := &bytes.Buffer{}
50 |
51 | err = tmpl.Execute(buffer, removeSpecialChars(pipelineName))
52 |
53 | if err != nil {
54 | fmt.Printf("❌ Error executing template: '%s', %v", tmpl.Name(), err)
55 | return
56 | }
57 |
58 | // formate golang code
59 | formattedCode, err := format.Source(buffer.Bytes())
60 |
61 | if err != nil {
62 | fmt.Printf("❌ Error formatting sourcecode '%s', %v", tmpl.Name(), err)
63 | return
64 | }
65 |
66 | sourceFilename := filepath.Join("pipelines", strings.TrimSuffix(tmpl.Name(), ".tmpl")+".go")
67 |
68 | // write go file
69 | err = writeToFile(sourceFilename, formattedCode)
70 |
71 | if err != nil {
72 | fmt.Printf("❌ Error creating %s.", sourceFilename)
73 | return
74 | }
75 |
76 | fmt.Printf("✔️ %s\n", sourceFilename)
77 |
78 | fmt.Printf("\n✨ Congrates, %s created successfully.", pipelineName)
79 | },
80 | }
81 |
// init registers the pipeline subcommand on the root command.
func init() {
	rootCmd.AddCommand(pipelineCmd)
}
85 |
// capitalizeFirstLetter returns s with its first rune converted to upper
// case. An empty string is returned unchanged.
func capitalizeFirstLetter(s string) string {
	runes := []rune(s)
	if len(runes) == 0 {
		return s
	}

	runes[0] = unicode.ToUpper(runes[0])
	return string(runes)
}
96 |
// nonAlphanumericRe matches runs of characters outside a-z, A-Z and 0-9.
// It is compiled once at package scope instead of on every call.
var nonAlphanumericRe = regexp.MustCompile("[^a-zA-Z0-9]+")

// removeSpecialChars returns input with every character that is not an ASCII
// letter or digit removed. The result may be empty.
func removeSpecialChars(input string) string {
	return nonAlphanumericRe.ReplaceAllString(input, "")
}
106 |
--------------------------------------------------------------------------------
/cmd/cli/startproject.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2023 Tech Engine
3 | */
4 | package cli
5 |
6 | import (
7 | "bytes"
8 | "embed"
9 | "fmt"
10 | "go/format"
11 | "io/fs"
12 | "os"
13 | "path"
14 | "path/filepath"
15 | "strings"
16 | "text/template"
17 |
18 | "github.com/spf13/cobra"
19 | )
20 |
// templatesFS holds the files under templates/, embedded into the binary at
// build time by the go:embed directive below.
//
//go:embed templates/*
var templatesFS embed.FS

// templateDir evaluates to the relative path "templates".
// NOTE(review): no use of this variable is visible in this file — confirm it
// is still needed.
var templateDir = filepath.Join(filepath.Dir("."), "/templates")
25 |
26 | // startprojectCmd represents the startproject command
27 | var startprojectCmd = &cobra.Command{
28 | Use: "startproject [projectname]",
29 | Short: "Creates a new GoScrapy project with the specified name",
30 | Args: cobra.ExactArgs(1),
31 | Run: func(cmd *cobra.Command, args []string) {
32 | var projectName = strings.TrimSpace(args[0])
33 |
34 | if projectName == "" {
35 | fmt.Printf("⚠️ please provide a projectname")
36 | return
37 | }
38 |
39 | templateFiles, err := fs.Glob(templatesFS, "templates/*.tmpl")
40 |
41 | if err != nil {
42 | fmt.Printf("❌ Error finding template: %v", err)
43 | return
44 | }
45 |
46 | fmt.Printf("\n🚀 GoScrapy generating project files. Please wait!\n\n")
47 |
48 | // create [projectName] dir where we will put spider code & pipelines
49 | err = createDirIfNotExist(projectName)
50 |
51 | if err != nil {
52 | fmt.Printf("❌ Error creating dir '%s', %v", projectName, err)
53 | return
54 | }
55 |
56 | // create [projectName]/pipelines dir
57 | err = createDirIfNotExist(path.Join(projectName, "pipelines"))
58 |
59 | if err != nil {
60 | fmt.Printf("❌ Error creating dir %s/pipelines, %v", projectName, err)
61 | return
62 | }
63 |
64 | var sourceFilename string
65 |
66 | // Parse and execute each template
67 | for _, templateFile := range templateFiles {
68 |
69 | if templateFile == "templates/pipeline.tmpl" {
70 | continue
71 | }
72 |
73 | tmplContent, err := templatesFS.ReadFile(templateFile)
74 |
75 | if err != nil {
76 | fmt.Printf("❌ Error reading template: %v", err)
77 | return
78 | }
79 |
80 | tmplName := filepath.Base(templateFile)
81 | tmpl, err := template.New(tmplName).Parse(string(tmplContent))
82 |
83 | if err != nil {
84 | fmt.Printf("❌ Error parsing template: %v", err)
85 | return
86 | }
87 |
88 | buffer := &bytes.Buffer{}
89 |
90 | err = tmpl.Execute(buffer, projectName)
91 |
92 | if err != nil {
93 | fmt.Printf("❌ Error executing template: '%s', %v", tmpl.Name(), err)
94 | return
95 | }
96 |
97 | formattedCode, err := format.Source(buffer.Bytes())
98 |
99 | if err != nil {
100 | fmt.Printf("❌ Error formatting sourcecode '%s', %v", tmpl.Name(), err)
101 | return
102 | }
103 |
104 | filename := strings.TrimSuffix(tmpl.Name(), ".tmpl") + ".go"
105 |
106 | if templateFile == "templates/main.tmpl" {
107 | sourceFilename = filename
108 | } else {
109 | sourceFilename = filepath.Join(projectName, filename)
110 | }
111 |
112 | err = writeToFile(sourceFilename, formattedCode)
113 |
114 | if err != nil {
115 | fmt.Printf("❌ Error creating %s.", sourceFilename)
116 | return
117 | }
118 |
119 | fmt.Printf("✔️ %s\n", sourceFilename)
120 |
121 | }
122 | fmt.Printf("\n✨ Congrates, %s created successfully.", projectName)
123 | },
124 | }
125 |
// init registers the startproject subcommand on the root command.
func init() {
	rootCmd.AddCommand(startprojectCmd)
}
129 |
// writeToFile writes data to filename, creating the file if needed and
// truncating it if it already exists.
//
// It uses os.WriteFile so that an error from closing the file is returned
// instead of being silently dropped. The 0o666 mode (before umask) matches
// what os.Create uses.
func writeToFile(filename string, data []byte) error {
	return os.WriteFile(filename, data, 0o666)
}
140 |
// createDirIfNotExist makes sure dir exists, creating it and any missing
// parents.
//
// If dir already exists, the user is asked on stdin to confirm. Any answer
// other than "y"/"Y" returns an error so the caller aborts instead of
// carrying on and overwriting files. A stat failure other than "does not
// exist" (for example permission denied) is returned as-is rather than being
// treated as an existing directory.
func createDirIfNotExist(dir string) error {
	_, statErr := os.Stat(dir)

	switch {
	case statErr == nil:
		// Directory exists, prompt user for confirmation
		fmt.Printf("Directory '%s' already exists. Continue? (Y/N): ", dir)

		var input string
		if _, err := fmt.Scan(&input); err != nil {
			return err
		}

		if !strings.EqualFold(input, "y") {
			return fmt.Errorf("directory %q already exists and overwrite was declined", dir)
		}
	case !os.IsNotExist(statErr):
		return statErr
	}

	return os.MkdirAll(dir, os.ModePerm)
}
159 |
--------------------------------------------------------------------------------
/cmd/cli/templates/constants.tmpl:
--------------------------------------------------------------------------------
1 | package {{.}}
2 |
3 | // you can define your constants here
--------------------------------------------------------------------------------
/cmd/cli/templates/errors.tmpl:
--------------------------------------------------------------------------------
1 | package {{.}}
2 |
3 | // you can define your errors here
--------------------------------------------------------------------------------
/cmd/cli/templates/job.tmpl:
--------------------------------------------------------------------------------
1 | package {{.}}
2 |
// Job describes one unit of scraping work. The id field is compulsory in a
// Job definition; you can add your own custom fields next to it.
type Job struct {
	id string
}

// NewJob returns a Job carrying the given id.
// do not delete/edit
func NewJob(id string) *Job {
	job := new(Job)
	job.id = id
	return job
}

// Id reports the identifier stored in the job.
// do not delete/edit
func (j *Job) Id() string { return j.id }

// Reset blanks the job's identifier.
// do not delete
func (j *Job) Reset() { j.id = "" }
24 |
25 |
26 | // add your custom receiver functions below
--------------------------------------------------------------------------------
/cmd/cli/templates/main.tmpl:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "fmt"
7 | "sync"
8 | "os"
9 | "os/signal"
10 | "syscall"
11 | // replace with your own project name
12 | "{{.}}/{{.}}"
13 | )
14 | // sample terminate function to demonstrate spider termination.
15 | func OnTerminate(fn func()) {
16 | ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGINT, syscall.SIGTERM)
17 | <-ctx.Done()
18 | stop()
19 | fn()
20 | }
21 |
22 | func main() {
23 | ctx, cancel := context.WithCancel(context.Background())
24 |
25 | var wg sync.WaitGroup
26 | wg.Add(1)
27 |
28 | spider, errCh := {{.}}.New(ctx)
29 | go func() {
30 | defer wg.Done()
31 |
32 | err := <-errCh
33 |
34 | if err != nil && errors.Is(err, context.Canceled) {
35 | return
36 | }
37 |
38 | fmt.Printf("failed: %q", err)
39 | }()
40 |
41 | // start the scraper with a job, currently nil is passed but you can pass your job here
42 | spider.StartRequest(ctx, nil)
43 |
44 | OnTerminate(func() {
45 | fmt.Println("exit signal received: shutting down gracefully")
46 | cancel()
47 | wg.Wait()
48 | })
49 |
50 | }
51 |
--------------------------------------------------------------------------------
/cmd/cli/templates/pipeline.tmpl:
--------------------------------------------------------------------------------
1 | package pipelines
2 |
3 | import (
4 | "context"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/core"
7 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager"
8 | )
9 |
10 | type {{.}}[OUT any] struct {
11 | // add your custom fields here
12 | }
13 |
14 | func {{ capitalizeFirstLetter .}}[OUT any](args ...string) *{{.}}[OUT] {
15 | // your custom initialization code goes here
16 | return &{{.}}[OUT]{}
17 | }
18 |
19 | // Open runs when we start the corespider engine.
20 | func (p *{{.}}[OUT]) Open(ctx context.Context) error {
21 | return nil
22 | }
23 |
24 | // Close runs just before the corespider engine exits.
25 | func (p *{{.}}[OUT]) Close() {
26 | }
27 |
28 | // your custom pipeline processing code goes here
29 | func (p *{{.}}[OUT]) ProcessItem(item pm.IPipelineItem, original core.IOutput[OUT]) error {
30 |
31 | // original is the output yield from spider
32 | // original.Job() - access Job
33 | // original.Record() - access Record
34 | // original.RecordKeys() - access Record keys in a slice
35 | // original.RecordFlat() - access Record in []any format
36 |
37 | return nil
38 | }
39 |
--------------------------------------------------------------------------------
/cmd/cli/templates/record.tmpl:
--------------------------------------------------------------------------------
1 | package {{.}}
2 |
3 | import (
4 | "reflect"
5 | "github.com/tech-engine/goscrapy/pkg/core"
6 | )
7 |
8 | /*
9 | json and csv struct field tags are required, if you want the Record to be exported
10 | or processed by builtin pipelines
11 | */
12 |
13 | type Record struct {
14 | J *Job `json:"-" csv:"-"` // JobId is required
15 | // add your custom fields here
16 | Title string `json:"title" csv:"title"`
17 | }
18 |
19 | // modify below code only if you know what you are doing
20 | func (r *Record) Record() *Record {
21 | return r
22 | }
23 |
24 | func (r *Record) RecordKeys() []string {
25 | dataType := reflect.TypeOf(*r)
26 | if dataType.Kind() != reflect.Struct {
27 | panic("Record is not a struct")
28 | }
29 |
30 | numFields := dataType.NumField()
31 | keys := make([]string, numFields)
32 |
33 | for i := 0; i < numFields; i++ {
34 | field := dataType.Field(i)
35 | csvTag := field.Tag.Get("csv")
36 | keys[i] = csvTag
37 | }
38 |
39 | return keys
40 | }
41 |
42 | func (r *Record) RecordFlat() []any {
43 |
44 | inputType := reflect.TypeOf(*r)
45 |
46 | if inputType.Kind() != reflect.Struct {
47 | panic("Record is not a struct")
48 | }
49 |
50 | inputValue := reflect.ValueOf(*r)
51 |
52 | slice := make([]any, inputType.NumField())
53 |
54 | for i := 0; i < inputType.NumField(); i++ {
55 | slice[i] = inputValue.Field(i).Interface()
56 | }
57 | return slice
58 | }
59 |
60 | func (r *Record) Job() core.IJob {
61 | return r.J
62 | }
63 |
--------------------------------------------------------------------------------
/cmd/cli/templates/settings.tmpl:
--------------------------------------------------------------------------------
1 | package {{.}}
2 |
3 | import (
4 | "os"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/builtin/middlewares"
7 | "github.com/tech-engine/goscrapy/pkg/builtin/pipelines"
8 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager"
9 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager"
10 | )
11 |
12 | // HTTP Transport settings
13 |
14 | // Default: 10000
15 | const MIDDLEWARE_HTTP_TIMEOUT_MS = ""
16 |
17 | // Default: 100
18 | const MIDDLEWARE_HTTP_MAX_IDLE_CONN = ""
19 |
20 | // Default: 100
21 | const MIDDLEWARE_HTTP_MAX_CONN_PER_HOST = ""
22 |
23 | // Default: 100
24 | const MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST = ""
25 |
26 | // Inbuilt Retry middleware settings
27 |
28 | // Default: 3
29 | const MIDDLEWARE_HTTP_RETRY_MAX_RETRIES = ""
30 |
31 | // Default: 500, 502, 503, 504, 522, 524, 408, 429
32 | const MIDDLEWARE_HTTP_RETRY_CODES = ""
33 |
34 | // Default: 1s
35 | const MIDDLEWARE_HTTP_RETRY_BASE_DELAY = ""
36 |
37 | // Default: 1000000
38 | const SCHEDULER_REQ_RES_POOL_SIZE = ""
39 |
40 | // Default: num of CPU * 3
41 | const SCHEDULER_CONCURRENCY = ""
42 |
43 | // Default: 1000000
44 | const SCHEDULER_WORK_QUEUE_SIZE = ""
45 |
46 | // Pipeline Manager settings
47 |
48 | // Default: 10000
49 | const PIPELINEMANAGER_ITEMPOOL_SIZE = ""
50 |
51 | // Default: 24
52 | const PIPELINEMANAGER_ITEM_SIZE = ""
53 |
54 | // Default: 0
55 | const PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE = ""
56 |
57 | // Default: 1000
58 | const PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY = ""
59 |
60 | // Middlewares here
61 | // Executed in reverse order from bottom to top.
62 | var MIDDLEWARES = []middlewaremanager.Middleware{
63 | middlewares.Retry(),
64 | middlewares.MultiCookieJar,
65 | middlewares.DupeFilter,
66 | }
67 |
68 | var export2CSV = pipelines.Export2CSV[*Record](pipelines.Export2CSVOpts{
69 | Filename: "itstimeitsnowornever.csv",
70 | })
71 |
72 | // use export 2 json pipeline
73 | // var export2Json = pipelines.Export2JSON[*Record](pipelines.Export2JSONOpts{
74 | // Filename: "itstimeitsnowornever.json",
75 | // Immediate: true,
76 | // })
77 |
78 | // add pipeline to group
79 | //func myCustomPipelineGroup() *pm.Group[*Record] {
80 | // pipelineGroup := pm.NewGroup[*Record]()
81 | // pipelineGroup.Add(export2CSV)
82 | // // pipelineGroup.Add(export2Json)
83 | // return pipelineGroup
84 | //}
85 |
86 | // Pipelines here
87 | // Executed in the order they appear.
88 | var PIPELINES = []pm.IPipeline[*Record]{
89 | export2CSV,
90 | // export2Json,
91 | // myCustomPipelineGroup(),
92 | }
93 |
94 | func init() {
95 | var settings = map[string]string{
96 | "MIDDLEWARE_HTTP_TIMEOUT_MS": MIDDLEWARE_HTTP_TIMEOUT_MS,
97 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN": MIDDLEWARE_HTTP_MAX_IDLE_CONN,
98 | "MIDDLEWARE_HTTP_MAX_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_CONN_PER_HOST,
99 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST,
100 | "MIDDLEWARE_HTTP_RETRY_MAX_RETRIES": MIDDLEWARE_HTTP_RETRY_MAX_RETRIES,
101 | "MIDDLEWARE_HTTP_RETRY_CODES": MIDDLEWARE_HTTP_RETRY_CODES,
102 | "MIDDLEWARE_HTTP_RETRY_BASE_DELAY": MIDDLEWARE_HTTP_RETRY_BASE_DELAY,
103 | "SCHEDULER_REQ_RES_POOL_SIZE": SCHEDULER_REQ_RES_POOL_SIZE,
104 | "SCHEDULER_CONCURRENCY": SCHEDULER_CONCURRENCY,
105 | "SCHEDULER_WORK_QUEUE_SIZE": SCHEDULER_WORK_QUEUE_SIZE,
106 | "PIPELINEMANAGER_ITEMPOOL_SIZE": PIPELINEMANAGER_ITEMPOOL_SIZE,
107 | "PIPELINEMANAGER_ITEM_SIZE": PIPELINEMANAGER_ITEM_SIZE,
108 | "PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE": PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE,
109 | "PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY": PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY,
110 | }
111 |
112 | for key, value := range settings {
113 | if value != "" {
114 | os.Setenv(key, value)
115 | }
116 | }
117 | }
118 |
--------------------------------------------------------------------------------
/cmd/cli/templates/spider.tmpl:
--------------------------------------------------------------------------------
1 | package {{.}}
2 |
3 | import (
4 | "context"
5 | "net/http"
6 | "encoding/json"
7 | "log"
8 |
9 | "github.com/tech-engine/goscrapy/cmd/gos"
10 | "github.com/tech-engine/goscrapy/pkg/core"
11 | )
12 |
13 | type Spider struct {
14 | gos.ICoreSpider[*Record]
15 | }
16 |
17 | func New(ctx context.Context) (*Spider, <-chan error) {
18 |
19 | // use proxies
20 | // proxies := core.WithProxies("proxy_url1", "proxy_url2", ...)
21 | // core := gos.New[*Record]().WithClient(
22 | // gos.DefaultClient(proxies),
23 | // )
24 |
25 | core := gos.New[*Record]()
26 |
27 | // Add middlewares
28 | core.MiddlewareManager.Add(MIDDLEWARES...)
29 | // Add pipelines
30 | core.PipelineManager.Add(PIPELINES...)
31 |
32 | errCh := make(chan error)
33 |
34 | go func() {
35 | errCh <- core.Start(ctx)
36 | }()
37 |
38 | return &Spider{
39 | core,
40 | }, errCh
41 | }
42 |
43 | func (s *Spider) StartRequest(ctx context.Context, job *Job) {
44 |
45 | // for each request we must call NewRequest() and never reuse it
46 | req := s.NewRequest()
47 |
48 | var headers http.Header
49 |
50 | // GET is the request method, method chaining possible
51 | req.Url("").
52 | Meta("MY_KEY1", "MY_VALUE").
53 | Meta("MY_KEY2", true).
54 | Header(headers)
55 |
56 | /* POST
57 | req.Url()
58 | req.Method("POST")
59 | req.Body()
60 | */
61 |
62 | // call the next parse method
63 | s.Request(req, s.parse)
64 | }
65 |
66 | // can be called when spider is about to close
67 | func (s *Spider) Close(ctx context.Context) {
68 | }
69 |
70 | func (s *Spider) parse(ctx context.Context, resp core.IResponseReader) {
71 | // response.Body()
72 | // response.StatusCode()
73 | // response.Header()
74 | // response.Bytes()
75 | // response.Meta("MY_KEY1")
76 |
77 | // yielding output pushes output to be processed by pipelines, also check output.go for the fields
78 | var data Record
79 |
80 | err := json.Unmarshal(resp.Bytes(), &data)
81 | if err != nil {
82 | log.Panicln(err)
83 | }
84 |
85 | // s.Yield(&data)
86 | }
87 |
--------------------------------------------------------------------------------
/cmd/gos/client.go:
--------------------------------------------------------------------------------
1 | package gos
2 |
3 | import (
4 | "log"
5 | "net/http"
6 | "net/url"
7 | "os"
8 | "strconv"
9 | "strings"
10 | "sync/atomic"
11 | "time"
12 |
13 | "github.com/tech-engine/goscrapy/internal/types"
14 | )
15 |
// clientOpts collects the settings DefaultClient uses to build an
// *http.Client.
type clientOpts struct {
	// timeout is assigned to http.Client.Timeout.
	timeout time.Duration
	// transportOpts holds the settings copied onto the client's transport.
	transportOpts
}
20 |
// transportOpts holds the http.Transport settings applied by DefaultClient.
type transportOpts struct {
	// proxyFn is assigned to http.Transport.Proxy.
	proxyFn func(*http.Request) (*url.URL, error)
	// Connection limits, assigned to the http.Transport fields MaxIdleConns,
	// MaxConnsPerHost and MaxIdleConnsPerHost respectively.
	maxIdleConns, maxConnsPerHost, maxIdleConnsPerHost int
}
25 |
26 | func defaultClientOpts() clientOpts {
27 | opts := clientOpts{
28 | timeout: MIDDLEWARE_DEFAULT_HTTP_TIMEOUT_MS * time.Millisecond,
29 | transportOpts: transportOpts{
30 | proxyFn: nil,
31 | maxIdleConns: MIDDLEWARE_DEFAULT_HTTP_MAX_IDLE_CONN,
32 | maxConnsPerHost: MIDDLEWARE_DEFAULT_HTTP_MAX_CONN_PER_HOST,
33 | maxIdleConnsPerHost: MIDDLEWARE_DEFAULT_HTTP_MAX_IDLE_CONN_PER_HOST,
34 | },
35 | }
36 |
37 | value, ok := os.LookupEnv("MIDDLEWARE_HTTP_MAX_IDLE_CONN")
38 |
39 | if ok {
40 | maxIdleConn, err := strconv.Atoi(value)
41 | if err == nil {
42 | opts.maxIdleConns = maxIdleConn
43 | }
44 | }
45 |
46 | value, ok = os.LookupEnv("MIDDLEWARE_HTTP_MAX_CONN_PER_HOST")
47 |
48 | if ok {
49 | maxConnPerHost, err := strconv.Atoi(value)
50 | if err == nil {
51 | opts.maxConnsPerHost = maxConnPerHost
52 | }
53 | }
54 |
55 | value, ok = os.LookupEnv("MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST")
56 |
57 | if ok {
58 | maxIdleConnPerHost, err := strconv.Atoi(value)
59 | if err == nil {
60 | opts.maxConnsPerHost = maxIdleConnPerHost
61 | }
62 | }
63 |
64 | value, ok = os.LookupEnv("MIDDLEWARE_HTTP_TIMEOUT_MS")
65 |
66 | if ok {
67 | timeoutMs, err := strconv.Atoi(value)
68 | if err == nil {
69 | opts.timeout = time.Duration(timeoutMs) * time.Millisecond
70 | }
71 | }
72 |
73 | return opts
74 | }
75 |
76 | func WithTimeout(t time.Duration) types.OptFunc[clientOpts] {
77 | return func(opts *clientOpts) {
78 | opts.timeout = t
79 | }
80 | }
81 |
82 | func WithMaxIdleConns(maxIdleConns int) types.OptFunc[clientOpts] {
83 | return func(opts *clientOpts) {
84 | opts.maxIdleConns = maxIdleConns
85 | }
86 | }
87 |
88 | func WithMaxConnsPerHost(maxConnsPerHost int) types.OptFunc[clientOpts] {
89 | return func(opts *clientOpts) {
90 | opts.maxConnsPerHost = maxConnsPerHost
91 | }
92 | }
93 |
94 | func WithMaxIdleConnsPerHost(maxIdleConnsPerHost int) types.OptFunc[clientOpts] {
95 | return func(opts *clientOpts) {
96 | opts.maxIdleConnsPerHost = maxIdleConnsPerHost
97 | }
98 | }
99 |
100 | func WithProxyFn(fn func(*http.Request) (*url.URL, error)) types.OptFunc[clientOpts] {
101 | return func(opts *clientOpts) {
102 | opts.proxyFn = fn
103 | }
104 | }
105 |
106 | func WithProxies(proxies ...string) types.OptFunc[clientOpts] {
107 | return func(opts *clientOpts) {
108 | proxyUrls := make([]*url.URL, 0, len(proxies))
109 |
110 | for _, proxy := range proxies {
111 | u, err := url.Parse(strings.TrimSpace(proxy))
112 | if err != nil {
113 | log.Panic(err)
114 | return
115 | }
116 | proxyUrls = append(proxyUrls, u)
117 | }
118 | opts.proxyFn = roundRobin(proxyUrls)
119 | }
120 | }
121 |
// roundRobin returns a proxy selector that hands out urls in order, wrapping
// around after the last one. The returned function is safe for concurrent
// use: the position is advanced with an atomic counter.
//
// With an empty urls slice the selector always returns (nil, nil), which
// http.Transport interprets as "use no proxy", instead of panicking on a
// modulo by zero.
func roundRobin(urls []*url.URL) func(*http.Request) (*url.URL, error) {
	n := uint32(len(urls))
	if n == 0 {
		return func(*http.Request) (*url.URL, error) {
			return nil, nil
		}
	}

	var next atomic.Uint32
	return func(*http.Request) (*url.URL, error) {
		// Add returns the incremented value; subtract one to start at index 0
		i := next.Add(1) - 1
		return urls[i%n], nil
	}
}
132 |
133 | // createDefaultHTTPClient creates a default http client with defaults.
134 | // If default values are set in the env it will pick the defaults from the env.
135 | func DefaultClient(opts ...types.OptFunc[clientOpts]) *http.Client {
136 | cli := &http.Client{}
137 |
138 | // load in default options
139 | cliOpts := defaultClientOpts()
140 |
141 | for _, opt := range opts {
142 | opt(&cliOpts)
143 | }
144 |
145 | t := http.DefaultTransport.(*http.Transport).Clone()
146 |
147 | // set all value from transport options
148 | t.MaxIdleConns = cliOpts.maxIdleConns
149 | t.MaxConnsPerHost = cliOpts.maxConnsPerHost
150 | t.MaxIdleConnsPerHost = cliOpts.maxIdleConnsPerHost
151 | t.Proxy = cliOpts.proxyFn
152 |
153 | // set client options
154 | cli.Timeout = cliOpts.timeout
155 |
156 | cli.Transport = t
157 |
158 | return cli
159 | }
160 |
--------------------------------------------------------------------------------
/cmd/gos/constants.go:
--------------------------------------------------------------------------------
1 | package gos
2 |
// Defaults for the HTTP client built by DefaultClient.
const (
	// MIDDLEWARE_DEFAULT_HTTP_MAX_IDLE_CONN is the default idle connection limit.
	MIDDLEWARE_DEFAULT_HTTP_MAX_IDLE_CONN = 100

	// MIDDLEWARE_DEFAULT_HTTP_MAX_CONN_PER_HOST is the default per-host connection limit.
	MIDDLEWARE_DEFAULT_HTTP_MAX_CONN_PER_HOST = 100

	// MIDDLEWARE_DEFAULT_HTTP_MAX_IDLE_CONN_PER_HOST is the default per-host idle connection limit.
	MIDDLEWARE_DEFAULT_HTTP_MAX_IDLE_CONN_PER_HOST = 100

	// MIDDLEWARE_DEFAULT_HTTP_TIMEOUT_MS is the default client timeout in milliseconds.
	MIDDLEWARE_DEFAULT_HTTP_TIMEOUT_MS = 10000
)
10 |
--------------------------------------------------------------------------------
/cmd/gos/gos.go:
--------------------------------------------------------------------------------
1 | package gos
2 |
3 | import (
4 | "context"
5 | "net/http"
6 |
7 | "github.com/tech-engine/goscrapy/pkg/core"
8 | "github.com/tech-engine/goscrapy/pkg/engine"
9 | "github.com/tech-engine/goscrapy/pkg/executor"
10 | httpnative "github.com/tech-engine/goscrapy/pkg/executor_adapters/http_native"
11 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager"
12 | pipelinemanager "github.com/tech-engine/goscrapy/pkg/pipeline_manager"
13 | "github.com/tech-engine/goscrapy/pkg/scheduler"
14 | )
15 |
// New builds a gosBuilder with the default component chain. Each component is
// constructed from the one before it: http client -> middleware manager ->
// executor adapter -> executor -> scheduler. The engine is then built from
// the scheduler and a fresh pipeline manager, and the core from the engine.
func New[OUT any]() *gosBuilder[OUT] {
	c := &gosBuilder[OUT]{
		httpClient: DefaultClient(),
	}

	c.MiddlewareManager = middlewaremanager.New(c.httpClient)

	// the adapter is given the client exposed by the middleware manager
	// NOTE(review): the meaning of the second argument (0) is not visible here — confirm.
	c.ExecutorAdapter = httpnative.NewHTTPClientAdapter(c.MiddlewareManager.HTTPClient(), 0)

	c.Executor = executor.New(c.ExecutorAdapter)

	c.Scheduler = scheduler.New(c.Executor)

	c.PipelineManager = pipelinemanager.New[OUT]()

	c.Engine = engine.New(c.Scheduler, c.PipelineManager)

	c.Core = core.New(c.Engine)
	return c
}
36 |
37 | func (c *gosBuilder[OUT]) WithClient(cli *http.Client) *gosBuilder[OUT] {
38 | c.httpClient = cli
39 | return c
40 | }
41 |
// Start runs the engine with ctx and returns whatever error the engine's
// Start reports.
func (c *gosBuilder[OUT]) Start(ctx context.Context) error {
	return c.Engine.Start(ctx)
}
45 |
--------------------------------------------------------------------------------
/cmd/gos/ports.go:
--------------------------------------------------------------------------------
1 | package gos
2 |
3 | import (
4 | "net/http"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/core"
7 | "github.com/tech-engine/goscrapy/pkg/engine"
8 | "github.com/tech-engine/goscrapy/pkg/executor"
9 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager"
10 | pipelinemanager "github.com/tech-engine/goscrapy/pkg/pipeline_manager"
11 | "github.com/tech-engine/goscrapy/pkg/scheduler"
12 | )
13 |
14 | // Any custom spider created using GoScrapy Framework must implement ICoreSpider[OUT any] interface
15 | type ICoreSpider[OUT any] interface {
16 | Request(req core.IRequestReader, cb core.ResponseCallback)
17 | NewRequest() core.IRequestRW
18 | Yield(core.IOutput[OUT])
19 | }
20 |
21 | // Separate interface created for configuration purposes
22 |
23 | // engine.*Engine[OUT] accepts a pipeline manager that implements the engine.IPipelineManager[OUT]
24 | // interface, which doesn't have an Add function because engine.IPipelineManager[OUT]
25 | // is not responsible for adding pipelines.
26 | // But pipelinemanager.*PipelineManager[OUT] does expose an Add function for external configuration
27 | // purposes, and to access it we have created the IPipelineManagerAdder[OUT] interface.
28 | type IPipelineManagerAdder[OUT any] interface {
29 | engine.IPipelineManager[OUT]
30 | Add(...pipelinemanager.IPipeline[OUT])
31 | }
32 |
33 | // core.*Core[OUT] accepts an engine that implements the core.IEngine[OUT] interface, which
34 | // doesn't have a WithScheduler function because core.IEngine[OUT] is not responsible for
35 | // setting the Scheduler. But engine.*Engine[OUT] does expose a WithScheduler function for external
36 | // configuration purposes, and to access it we have created the IEngineConfigurer[OUT] interface.
37 | // The same is the case for the WithPipelineManager function.
38 | type IEngineConfigurer[OUT any] interface {
39 | core.IEngine[OUT]
40 | WithScheduler(engine.IScheduler)
41 | WithPipelineManager(engine.IPipelineManager[OUT])
42 | }
43 |
44 | // engine.*Engine[OUT] accepts a scheduler that implements the engine.IScheduler[OUT] interface, which
45 | // doesn't have a WithExecutor function because engine.IScheduler[OUT] is not responsible for
46 | // setting an Executor. But engine.IScheduler does expose a WithExecutor function for external
47 | // configuration purposes, and to access it we have created the ISchedulerConfigurer[OUT] interface.
48 | type ISchedulerConfigurer[OUT any] interface {
49 | engine.IScheduler
50 | WithExecutor(scheduler.IExecutor)
51 | }
52 |
53 | // scheduler.*Scheduler accepts an executor that implements the scheduler.IExecutor interface, which
54 | // doesn't have a WithAdapter function because scheduler.IExecutor is not responsible for
55 | // setting an adapter. But scheduler.*Scheduler does expose a WithAdapter function for external
56 | // configuration purposes, and to access it we have created the IExecutorConfigurer[OUT] interface.
57 | type IExecutorConfigurer[OUT any] interface {
58 | scheduler.IExecutor
59 | WithAdapter(executor.IExecutorAdapter)
60 | }
61 |
62 | // executor.*Executor accepts an adapter that implements the executor.IExecutorAdapter interface, which
63 | // doesn't have a WithClient function because executor.IExecutorAdapter is not responsible for
64 | // setting an http client. But executoradapter.*HTTPAdapter does expose a WithClient function for external
65 | // configuration purposes, and to access it we have created the IExecutorAdapterConfigurer[OUT] interface.
66 | type IExecutorAdapterConfigurer interface {
67 | executor.IExecutorAdapter
68 | WithClient(*http.Client)
69 | }
70 |
71 | type IMiddlewareManager interface {
72 | HTTPClient() *http.Client
73 | Add(...middlewaremanager.Middleware)
74 | }
75 |
--------------------------------------------------------------------------------
/cmd/gos/types.go:
--------------------------------------------------------------------------------
1 | package gos
2 |
3 | import (
4 | "net/http"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/core"
7 | )
8 |
// gosBuilder bundles the components wired together by New so that each one
// stays reachable for configuration. It embeds *core.Core[OUT].
type gosBuilder[OUT any] struct {
	*core.Core[OUT]
	Engine            IEngineConfigurer[OUT]
	PipelineManager   IPipelineManagerAdder[OUT]
	Scheduler         ISchedulerConfigurer[OUT]
	Executor          IExecutorConfigurer[OUT]
	ExecutorAdapter   IExecutorAdapterConfigurer
	MiddlewareManager IMiddlewareManager
	// httpClient is the client New passes to the middleware manager.
	httpClient *http.Client
}
19 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/tech-engine/goscrapy
2 |
3 | go 1.22
4 |
5 | require (
6 | firebase.google.com/go v3.13.0+incompatible
7 | github.com/gocarina/gocsv v0.0.0-20240520201108-78e41c74b4b1
8 | github.com/segmentio/fasthash v1.0.3
9 | github.com/spf13/cobra v1.8.1
10 | github.com/stretchr/testify v1.9.0
11 | go.mongodb.org/mongo-driver v1.17.0
12 | golang.org/x/crypto v0.31.0
13 | golang.org/x/net v0.29.0
14 | golang.org/x/sync v0.10.0
15 | google.golang.org/api v0.198.0
16 | )
17 |
18 | require (
19 | cloud.google.com/go/auth v0.9.4 // indirect
20 | cloud.google.com/go/auth/oauth2adapt v0.2.4 // indirect
21 | github.com/antchfx/xpath v1.3.1 // indirect
22 | github.com/felixge/httpsnoop v1.0.4 // indirect
23 | github.com/go-logr/logr v1.4.2 // indirect
24 | github.com/go-logr/stdr v1.2.2 // indirect
25 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.55.0 // indirect
26 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.55.0 // indirect
27 | go.opentelemetry.io/otel v1.30.0 // indirect
28 | go.opentelemetry.io/otel/metric v1.30.0 // indirect
29 | go.opentelemetry.io/otel/trace v1.30.0 // indirect
30 | )
31 |
32 | require (
33 | cloud.google.com/go v0.115.1 // indirect
34 | cloud.google.com/go/compute/metadata v0.5.1 // indirect
35 | cloud.google.com/go/firestore v1.17.0 // indirect
36 | cloud.google.com/go/iam v1.2.1 // indirect
37 | cloud.google.com/go/longrunning v0.6.1 // indirect
38 | cloud.google.com/go/storage v1.43.0 // indirect
39 | github.com/andybalholm/cascadia v1.3.2
40 | github.com/antchfx/htmlquery v1.3.2
41 | github.com/davecgh/go-spew v1.1.1 // indirect
42 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
43 | github.com/golang/protobuf v1.5.4 // indirect
44 | github.com/golang/snappy v0.0.4 // indirect
45 | github.com/google/s2a-go v0.1.8 // indirect
46 | github.com/google/uuid v1.6.0 // indirect
47 | github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect
48 | github.com/googleapis/gax-go/v2 v2.13.0 // indirect
49 | github.com/inconshreveable/mousetrap v1.1.0 // indirect
50 | github.com/klauspost/compress v1.17.9 // indirect
51 | github.com/montanaflynn/stats v0.7.1 // indirect
52 | github.com/pmezard/go-difflib v1.0.0 // indirect
53 | github.com/spf13/pflag v1.0.5 // indirect
54 | github.com/xdg-go/pbkdf2 v1.0.0 // indirect
55 | github.com/xdg-go/scram v1.1.2 // indirect
56 | github.com/xdg-go/stringprep v1.0.4 // indirect
57 | github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect
58 | go.opencensus.io v0.24.0 // indirect
59 | golang.org/x/oauth2 v0.23.0 // indirect
60 | golang.org/x/sys v0.28.0 // indirect
61 | golang.org/x/text v0.21.0 // indirect
62 | golang.org/x/time v0.6.0 // indirect
63 | google.golang.org/appengine v1.6.8 // indirect
64 | google.golang.org/genproto v0.0.0-20240903143218-8af14fe29dc1 // indirect
65 | google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 // indirect
66 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect
67 | google.golang.org/grpc v1.67.0 // indirect
68 | google.golang.org/protobuf v1.34.2 // indirect
69 | gopkg.in/yaml.v3 v3.0.1 // indirect
70 | )
71 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
2 | cloud.google.com/go v0.115.1 h1:Jo0SM9cQnSkYfp44+v+NQXHpcHqlnRJk2qxh6yvxxxQ=
3 | cloud.google.com/go v0.115.1/go.mod h1:DuujITeaufu3gL68/lOFIirVNJwQeyf5UXyi+Wbgknc=
4 | cloud.google.com/go/auth v0.9.4 h1:DxF7imbEbiFu9+zdKC6cKBko1e8XeJnipNqIbWZ+kDI=
5 | cloud.google.com/go/auth v0.9.4/go.mod h1:SHia8n6//Ya940F1rLimhJCjjx7KE17t0ctFEci3HkA=
6 | cloud.google.com/go/auth/oauth2adapt v0.2.4 h1:0GWE/FUsXhf6C+jAkWgYm7X9tK8cuEIfy19DBn6B6bY=
7 | cloud.google.com/go/auth/oauth2adapt v0.2.4/go.mod h1:jC/jOpwFP6JBxhB3P5Rr0a9HLMC/Pe3eaL4NmdvqPtc=
8 | cloud.google.com/go/compute/metadata v0.5.1 h1:NM6oZeZNlYjiwYje+sYFjEpP0Q0zCan1bmQW/KmIrGs=
9 | cloud.google.com/go/compute/metadata v0.5.1/go.mod h1:C66sj2AluDcIqakBq/M8lw8/ybHgOZqin2obFxa/E5k=
10 | cloud.google.com/go/firestore v1.17.0 h1:iEd1LBbkDZTFsLw3sTH50eyg4qe8eoG6CjocmEXO9aQ=
11 | cloud.google.com/go/firestore v1.17.0/go.mod h1:69uPx1papBsY8ZETooc71fOhoKkD70Q1DwMrtKuOT/Y=
12 | cloud.google.com/go/iam v1.2.1 h1:QFct02HRb7H12J/3utj0qf5tobFh9V4vR6h9eX5EBRU=
13 | cloud.google.com/go/iam v1.2.1/go.mod h1:3VUIJDPpwT6p/amXRC5GY8fCCh70lxPygguVtI0Z4/g=
14 | cloud.google.com/go/longrunning v0.6.1 h1:lOLTFxYpr8hcRtcwWir5ITh1PAKUD/sG2lKrTSYjyMc=
15 | cloud.google.com/go/longrunning v0.6.1/go.mod h1:nHISoOZpBcmlwbJmiVk5oDRz0qG/ZxPynEGs1iZ79s0=
16 | cloud.google.com/go/storage v1.43.0 h1:CcxnSohZwizt4LCzQHWvBf1/kvtHUn7gk9QERXPyXFs=
17 | cloud.google.com/go/storage v1.43.0/go.mod h1:ajvxEa7WmZS1PxvKRq4bq0tFT3vMd502JwstCcYv0Q0=
18 | firebase.google.com/go v3.13.0+incompatible h1:3TdYC3DDi6aHn20qoRkxwGqNgdjtblwVAyRLQwGn/+4=
19 | firebase.google.com/go v3.13.0+incompatible/go.mod h1:xlah6XbEyW6tbfSklcfe5FHJIwjt8toICdV5Wh9ptHs=
20 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
21 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
22 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
23 | github.com/antchfx/htmlquery v1.3.2 h1:85YdttVkR1rAY+Oiv/nKI4FCimID+NXhDn82kz3mEvs=
24 | github.com/antchfx/htmlquery v1.3.2/go.mod h1:1mbkcEgEarAokJiWhTfr4hR06w/q2ZZjnYLrDt6CTUk=
25 | github.com/antchfx/xpath v1.3.1 h1:PNbFuUqHwWl0xRjvUPjJ95Agbmdj2uzzIwmQKgu4oCk=
26 | github.com/antchfx/xpath v1.3.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
27 | github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
28 | github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
29 | github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
30 | github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
31 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
32 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
33 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
34 | github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
35 | github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
36 | github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
37 | github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
38 | github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
39 | github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
40 | github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
41 | github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
42 | github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
43 | github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
44 | github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
45 | github.com/gocarina/gocsv v0.0.0-20240520201108-78e41c74b4b1 h1:FWNFq4fM1wPfcK40yHE5UO3RUdSNPaBC+j3PokzA6OQ=
46 | github.com/gocarina/gocsv v0.0.0-20240520201108-78e41c74b4b1/go.mod h1:5YoVOkjYAQumqlV356Hj3xeYh4BdZuLE0/nRkf2NKkI=
47 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
48 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
49 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
50 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
51 | github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
52 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
53 | github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
54 | github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
55 | github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
56 | github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
57 | github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
58 | github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
59 | github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
60 | github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
61 | github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
62 | github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
63 | github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
64 | github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
65 | github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
66 | github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
67 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
68 | github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
69 | github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
70 | github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
71 | github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
72 | github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
73 | github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
74 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
75 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
76 | github.com/google/martian/v3 v3.3.3 h1:DIhPTQrbPkgs2yJYdXU/eNACCG5DVQjySNRNlflZ9Fc=
77 | github.com/google/martian/v3 v3.3.3/go.mod h1:iEPrYcgCF7jA9OtScMFQyAlZZ4YXTKEtJ1E6RWzmBA0=
78 | github.com/google/s2a-go v0.1.8 h1:zZDs9gcbt9ZPLV0ndSyQk6Kacx2g/X+SKYovpnz3SMM=
79 | github.com/google/s2a-go v0.1.8/go.mod h1:6iNWHTpQ+nfNRN5E00MSdfDwVesa8hhS32PhPO8deJA=
80 | github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
81 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
82 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
83 | github.com/googleapis/enterprise-certificate-proxy v0.3.4 h1:XYIDZApgAnrN1c855gTgghdIA6Stxb52D5RnLI1SLyw=
84 | github.com/googleapis/enterprise-certificate-proxy v0.3.4/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA=
85 | github.com/googleapis/gax-go/v2 v2.13.0 h1:yitjD5f7jQHhyDsnhKEBU52NdvvdSeGzlAnDPT0hH1s=
86 | github.com/googleapis/gax-go/v2 v2.13.0/go.mod h1:Z/fvTZXF8/uw7Xu5GuslPw+bplx6SS338j1Is2S+B7A=
87 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
88 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
89 | github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA=
90 | github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
91 | github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE=
92 | github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow=
93 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
94 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
95 | github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
96 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
97 | github.com/segmentio/fasthash v1.0.3 h1:EI9+KE1EwvMLBWwjpRDc+fEM+prwxDYbslddQGtrmhM=
98 | github.com/segmentio/fasthash v1.0.3/go.mod h1:waKX8l2N8yckOgmSsXJi7x1ZfdKZ4x7KRMzBtS3oedY=
99 | github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
100 | github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
101 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
102 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
103 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
104 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
105 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
106 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
107 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
108 | github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
109 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
110 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
111 | github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c=
112 | github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI=
113 | github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY=
114 | github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3kKLN4=
115 | github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8=
116 | github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM=
117 | github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM=
118 | github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI=
119 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
120 | go.mongodb.org/mongo-driver v1.17.0 h1:Hp4q2MCjvY19ViwimTs00wHi7G4yzxh4/2+nTx8r40k=
121 | go.mongodb.org/mongo-driver v1.17.0/go.mod h1:wwWm/+BuOddhcq3n68LKRmgk2wXzmF6s0SFOa0GINL4=
122 | go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
123 | go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
124 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.55.0 h1:hCq2hNMwsegUvPzI7sPOvtO9cqyy5GbWt/Ybp2xrx8Q=
125 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.55.0/go.mod h1:LqaApwGx/oUmzsbqxkzuBvyoPpkxk3JQWnqfVrJ3wCA=
126 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.55.0 h1:ZIg3ZT/aQ7AfKqdwp7ECpOK6vHqquXXuyTjIO8ZdmPs=
127 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.55.0/go.mod h1:DQAwmETtZV00skUwgD6+0U89g80NKsJE3DCKeLLPQMI=
128 | go.opentelemetry.io/otel v1.30.0 h1:F2t8sK4qf1fAmY9ua4ohFS/K+FUuOPemHUIXHtktrts=
129 | go.opentelemetry.io/otel v1.30.0/go.mod h1:tFw4Br9b7fOS+uEao81PJjVMjW/5fvNCbpsDIXqP0pc=
130 | go.opentelemetry.io/otel/metric v1.30.0 h1:4xNulvn9gjzo4hjg+wzIKG7iNFEaBMX00Qd4QIZs7+w=
131 | go.opentelemetry.io/otel/metric v1.30.0/go.mod h1:aXTfST94tswhWEb+5QjlSqG+cZlmyXy/u8jFpor3WqQ=
132 | go.opentelemetry.io/otel/sdk v1.29.0 h1:vkqKjk7gwhS8VaWb0POZKmIEDimRCMsopNYnriHyryo=
133 | go.opentelemetry.io/otel/sdk v1.29.0/go.mod h1:pM8Dx5WKnvxLCb+8lG1PRNIDxu9g9b9g59Qr7hfAAok=
134 | go.opentelemetry.io/otel/trace v1.30.0 h1:7UBkkYzeg3C7kQX8VAidWh2biiQbtAKjyIML8dQ9wmc=
135 | go.opentelemetry.io/otel/trace v1.30.0/go.mod h1:5EyKqTzzmyqB9bwtCCq6pDLktPK6fmGf/Dph+8VI02o=
136 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
137 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
138 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
139 | golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U=
140 | golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
141 | golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
142 | golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
143 | golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
144 | golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
145 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
146 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
147 | golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
148 | golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
149 | golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
150 | golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
151 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
152 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
153 | golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
154 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
155 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
156 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
157 | golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
158 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
159 | golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo=
160 | golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0=
161 | golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
162 | golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs=
163 | golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
164 | golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
165 | golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
166 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
167 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
168 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
169 | golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
170 | golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
171 | golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
172 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
173 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
174 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
175 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
176 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
177 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
178 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
179 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
180 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
181 | golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
182 | golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
183 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
184 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
185 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
186 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
187 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
188 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
189 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
190 | golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
191 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
192 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
193 | golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
194 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
195 | golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U=
196 | golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
197 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
198 | golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
199 | golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
200 | golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
201 | golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
202 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
203 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
204 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
205 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
206 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
207 | google.golang.org/api v0.198.0 h1:OOH5fZatk57iN0A7tjJQzt6aPfYQ1JiWkt1yGseazks=
208 | google.golang.org/api v0.198.0/go.mod h1:/Lblzl3/Xqqk9hw/yS97TImKTUwnf1bv89v7+OagJzc=
209 | google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
210 | google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
211 | google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM=
212 | google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds=
213 | google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
214 | google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
215 | google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
216 | google.golang.org/genproto v0.0.0-20240903143218-8af14fe29dc1 h1:BulPr26Jqjnd4eYDVe+YvyR7Yc2vJGkO5/0UxD0/jZU=
217 | google.golang.org/genproto v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:hL97c3SYopEHblzpxRL4lSs523++l8DYxGM1FQiYmb4=
218 | google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 h1:hjSy6tcFQZ171igDaN5QHOw2n6vx40juYbC/x67CEhc=
219 | google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:qpvKtACPCQhAdu3PyQgV4l3LMXZEtft7y8QcarRsp9I=
220 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 h1:pPJltXNxVzT4pK9yD8vR9X75DaWYYmLGMsEvBfFQZzQ=
221 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU=
222 | google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
223 | google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
224 | google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
225 | google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
226 | google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc=
227 | google.golang.org/grpc v1.67.0 h1:IdH9y6PF5MPSdAntIcpjQ+tXO41pcQsfZV2RxtQgVcw=
228 | google.golang.org/grpc v1.67.0/go.mod h1:1gLDyUQU7CTLJI90u3nXZ9ekeghjeM7pTDZlqFNg2AA=
229 | google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
230 | google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
231 | google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
232 | google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
233 | google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
234 | google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
235 | google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
236 | google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
237 | google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
238 | google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
239 | google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
240 | google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
241 | google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
242 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
243 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
244 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
245 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
246 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
247 | honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
248 | honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
249 |
--------------------------------------------------------------------------------
/internal/cmap/cmap.go:
--------------------------------------------------------------------------------
1 | package cmap
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 | "sync"
7 |
8 | "github.com/tech-engine/goscrapy/internal/types"
9 | )
10 |
// ERR_MAX_ITEM_EXCEEDED is the sentinel error that CMap.Set wraps when adding a
// new key would exceed the configured size. Match it with errors.Is.
var ERR_MAX_ITEM_EXCEEDED = errors.New("CMAP max item exceeded")
12 |
13 | type CMap[K comparable, V any] struct {
14 | opts
15 | lock sync.RWMutex
16 | data map[K]Void[V]
17 | }
18 |
19 | func NewCMap[K comparable, V any](optFuncs ...types.OptFunc[opts]) *CMap[K, V] {
20 |
21 | opts := defaultOpts()
22 |
23 | for _, fn := range optFuncs {
24 | fn(&opts)
25 | }
26 |
27 | return &CMap[K, V]{
28 | opts: opts,
29 | data: make(map[K]Void[V], opts.size),
30 | }
31 | }
32 |
33 | func (cm *CMap[K, V]) Get(key K) (V, bool) {
34 |
35 | cm.lock.RLock()
36 | defer cm.lock.RUnlock()
37 |
38 | val, ok := cm.data[key]
39 |
40 | return val.Data, ok
41 | }
42 |
43 | func (cm *CMap[K, V]) Set(key K, val V) error {
44 |
45 | cm.lock.Lock()
46 | defer cm.lock.Unlock()
47 |
48 | _, ok := cm.data[key]
49 |
50 | if (len(cm.data) >= cm.size) && !ok {
51 | return fmt.Errorf("Set: %w: max allowed=[%d]", ERR_MAX_ITEM_EXCEEDED, cm.size)
52 | }
53 |
54 | cm.data[key] = Void[V]{val}
55 |
56 | return nil
57 | }
58 |
59 | func (cm *CMap[K, V]) Del(key K) {
60 | cm.lock.Lock()
61 | delete(cm.data, key)
62 | cm.lock.Unlock()
63 | }
64 |
65 | func (cm *CMap[K, V]) Clear() {
66 | clear(cm.data)
67 | }
68 |
69 | func (cm *CMap[K, V]) Keys() []any {
70 | keys := make([]any, cm.size)
71 |
72 | var i = 0
73 | for key := range cm.data {
74 | keys[i] = key
75 | i++
76 | }
77 |
78 | return keys
79 | }
80 |
81 | func (cm *CMap[K, V]) Len() int {
82 |
83 | cm.lock.RLock()
84 | defer cm.lock.RUnlock()
85 |
86 | return len(cm.data)
87 | }
88 |
--------------------------------------------------------------------------------
/internal/cmap/cmap_test.go:
--------------------------------------------------------------------------------
1 | package cmap
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 |
7 | "github.com/stretchr/testify/assert"
8 | )
9 |
// testCase is one step of the concurrency test: op selects the operation
// ("READ" or "WRITE") applied to key, and val is the value written by a WRITE.
type testCase struct {
	key string
	val string
	op  string
}
15 |
16 | func TestCMapGet(t *testing.T) {
17 | cmap := NewCMap[string, string]()
18 | cmap.Set("KEY_1", "VAL_1")
19 |
20 | val, ok := cmap.Get("KEY_1")
21 |
22 | assert.True(t, ok, "ok=true expected for key=%s", "KEY_1")
23 |
24 | assert.Equalf(t, "VAL_1", val, "val=%s expected for key=%s but got %s", "VAL_1", "KEY_1", val)
25 |
26 | val, ok = cmap.Get("KEY_2")
27 |
28 | assert.Falsef(t, ok, "ok=false expected for key=%s", "KEY_2")
29 |
30 | assert.Emptyf(t, val, "empty string expected for key=%s but got %s")
31 | }
32 |
33 | func TestCMapSet(t *testing.T) {
34 | cmap := NewCMap[string, string]()
35 | err := cmap.Set("KEY_1", "VAL_1")
36 |
37 | assert.NoErrorf(t, err, "nil error expected but got %s", err)
38 | }
39 |
40 | func TestCMapLimit(t *testing.T) {
41 | var err error
42 | cmap := NewCMap[string, string](WithSize(2))
43 |
44 | err = cmap.Set("KEY_1", "VAL_1")
45 | assert.NoErrorf(t, err, "nil error expected for key=%s but got %s", "KEY_1", err)
46 |
47 | err = cmap.Set("KEY_2", "VAL_2")
48 | assert.NoErrorf(t, err, "nil error expected for key=%s but got %s", "KEY_2", err)
49 |
50 | err = cmap.Set("KEY_3", "VAL_3")
51 |
52 | if assert.Errorf(t, err, "error is expected for key=%s", "KEY_3") {
53 | assert.ErrorIs(t, err, ERR_MAX_ITEM_EXCEEDED)
54 | }
55 |
56 | cmap.Del("KEY_2")
57 |
58 | err = cmap.Set("KEY_3", "VAL_3")
59 |
60 | assert.NoErrorf(t, err, "nil error expected for key=%s but got %s", "KEY_3", err)
61 | }
62 |
63 | func TestCMapDel(t *testing.T) {
64 | cmap := NewCMap[string, string]()
65 |
66 | cmap.Set("KEY_1", "VAL_1")
67 |
68 | cmap.Del("KEY_1")
69 |
70 | assert.Equalf(t, 0, cmap.Len(), "len=%d expected but has %d", 0, cmap.Len())
71 |
72 | val, ok := cmap.Get("KEY_1")
73 |
74 | assert.Falsef(t, ok, "false expected for key=%s", "KEY_1")
75 | assert.Equalf(t, "", val, "empty string expected but got %s", val)
76 | }
77 |
78 | func TestCMapConcurrency(t *testing.T) {
79 |
80 | testCases := []testCase{
81 | {
82 | key: "KEY_1",
83 | val: "VAL_1",
84 | op: "WRITE",
85 | },
86 | {
87 | key: "KEY_2",
88 | val: "VAL_2",
89 | op: "WRITE",
90 | },
91 | {
92 | key: "KEY_2",
93 | val: "VAL_2",
94 | op: "READ",
95 | },
96 | {
97 | key: "KEY_3",
98 | val: "VAL_3",
99 | op: "WRITE",
100 | },
101 | {
102 | key: "KEY_4",
103 | val: "VAL_4",
104 | op: "WRITE",
105 | },
106 | {
107 | key: "KEY_1",
108 | val: "VAL_1",
109 | op: "READ",
110 | },
111 | {
112 | key: "KEY_1",
113 | val: "VAL_1",
114 | op: "WRITE",
115 | },
116 | }
117 |
118 | cmap := NewCMap[string, string](WithSize(5))
119 |
120 | for i := 0; i < 100; i++ {
121 | t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
122 | t.Parallel()
123 | for _, tc := range testCases {
124 | if tc.op == "READ" {
125 | cmap.Get(tc.key)
126 | } else {
127 | err := cmap.Set(tc.key, tc.val)
128 |
129 | assert.NoError(t, err)
130 | }
131 | }
132 | })
133 | }
134 | }
135 |
--------------------------------------------------------------------------------
/internal/cmap/cmaph.go:
--------------------------------------------------------------------------------
1 | package cmap
2 |
3 | import (
4 | "fmt"
5 | "sync"
6 |
7 | "github.com/segmentio/fasthash/fnv1a"
8 | "github.com/tech-engine/goscrapy/internal/types"
9 | )
10 |
11 | type CMapH struct {
12 | opts
13 | lock sync.RWMutex
14 | data map[uint64]Void[any]
15 | keys []any
16 | lastKeyIndex int
17 | }
18 |
19 | func defaultOpts() opts {
20 | return opts{
21 | size: 24,
22 | hashFn: fnv1a.HashString64,
23 | }
24 | }
25 |
26 | func WithSize(size int) types.OptFunc[opts] {
27 | return func(opts *opts) {
28 | opts.size = size
29 | }
30 | }
31 |
32 | func WithHashFn(fn hashFn) types.OptFunc[opts] {
33 | return func(opts *opts) {
34 | opts.hashFn = fn
35 | }
36 | }
37 |
38 | func NewHCMap(optFuncs ...types.OptFunc[opts]) *CMapH {
39 |
40 | opts := defaultOpts()
41 |
42 | for _, fn := range optFuncs {
43 | fn(&opts)
44 | }
45 |
46 | return &CMapH{
47 | opts: opts,
48 | data: make(map[uint64]Void[any], opts.size),
49 | keys: make([]any, opts.size),
50 | }
51 | }
52 |
53 | func (cm *CMapH) Get(key string) (any, bool) {
54 |
55 | hkey := cm.hashFn(key)
56 |
57 | cm.lock.RLock()
58 | defer cm.lock.RUnlock()
59 |
60 | val, ok := cm.data[hkey]
61 |
62 | return val.Data, ok
63 | }
64 |
65 | func (cm *CMapH) Set(key string, val any) error {
66 |
67 | hkey := cm.hashFn(key)
68 |
69 | cm.lock.Lock()
70 | defer cm.lock.Unlock()
71 |
72 | _, ok := cm.data[hkey]
73 |
74 | if (len(cm.data) > cm.size) && !ok {
75 | return fmt.Errorf("Set: max items of %d exceeded", cm.size)
76 | }
77 |
78 | cm.data[hkey] = Void[any]{val}
79 |
80 | if cm.lastKeyIndex < cm.opts.size {
81 | cm.keys[cm.lastKeyIndex] = key
82 | cm.lastKeyIndex++
83 | }
84 |
85 | return nil
86 | }
87 |
88 | func (cm *CMapH) Len() int {
89 |
90 | cm.lock.RLock()
91 | defer cm.lock.RUnlock()
92 |
93 | return len(cm.data)
94 | }
95 |
96 | func (cm *CMapH) Del(key string) {
97 | hkey := cm.hashFn(key)
98 | cm.lock.Lock()
99 | delete(cm.data, hkey)
100 | cm.lock.Unlock()
101 | }
102 |
103 | func (cm *CMapH) Clear() {
104 | clear(cm.data)
105 | }
106 |
107 | func (cm *CMapH) Keys() []any {
108 | return cm.keys
109 | }
110 |
--------------------------------------------------------------------------------
/internal/cmap/types.go:
--------------------------------------------------------------------------------
1 | package cmap
2 |
3 | type opts struct {
4 | size int
5 | hashFn hashFn
6 | }
7 |
// hashFn maps a string key to a 64-bit hash.
type hashFn func(key string) uint64
9 |
// Void is a single-field wrapper around a value of type V; the maps in this
// package store their values inside it.
type Void[V any] struct {
	// Data is the wrapped value.
	Data V
}
13 |
--------------------------------------------------------------------------------
/internal/fsm/fsm.go:
--------------------------------------------------------------------------------
1 | package fsm
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 |
7 | "github.com/tech-engine/goscrapy/internal/cmap"
8 | )
9 |
// ERR_MAX_ITEM_EXCEEDED is the sentinel error that FixedSizeMap.Set wraps when
// adding a new key would exceed the map's size. Match it with errors.Is.
var ERR_MAX_ITEM_EXCEEDED = errors.New("FSM max item exceeded")
11 |
12 | type FixedSizeMap[K comparable, V any] struct {
13 | size uint64
14 | data map[K]cmap.Void[V]
15 | }
16 |
17 | func New[K comparable, V any](size uint64) *FixedSizeMap[K, V] {
18 | return &FixedSizeMap[K, V]{
19 | size: size,
20 | data: make(map[K]cmap.Void[V], size),
21 | }
22 | }
23 |
24 | func (fsm *FixedSizeMap[K, V]) Get(key K) (V, bool) {
25 |
26 | val, ok := fsm.data[key]
27 |
28 | return val.Data, ok
29 | }
30 |
31 | func (fsm *FixedSizeMap[K, V]) Set(key K, val V) error {
32 |
33 | if _, ok := fsm.data[key]; !ok && (len(fsm.data) >= int(fsm.size)) {
34 | return fmt.Errorf("Set:fixedsizemap.go: %w: max allowed=[%d]", ERR_MAX_ITEM_EXCEEDED, fsm.size)
35 | }
36 |
37 | fsm.data[key] = cmap.Void[V]{Data: val}
38 |
39 | return nil
40 | }
41 |
42 | func (fsm *FixedSizeMap[K, V]) Clear() {
43 | clear(fsm.data)
44 | }
45 |
--------------------------------------------------------------------------------
/internal/fsm/fsm_test.go:
--------------------------------------------------------------------------------
1 | package fsm
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/stretchr/testify/assert"
7 | )
8 |
// testCase bundles a key, a value and an operation name for table-style tests.
type testCase struct {
	key string
	val string
	op  string
}
14 |
15 | func TestFsmGet(t *testing.T) {
16 | fsm := New[string, string](5)
17 | fsm.Set("KEY_1", "VAL_1")
18 |
19 | val, ok := fsm.Get("KEY_1")
20 |
21 | assert.True(t, ok, "ok=true expected for key=%s", "KEY_1")
22 |
23 | assert.Equalf(t, "VAL_1", val, "val=%s expected for key=%s but got %s", "VAL_1", "KEY_1", val)
24 |
25 | val, ok = fsm.Get("KEY_2")
26 |
27 | assert.Falsef(t, ok, "ok=false expected for key=%s", "KEY_2")
28 |
29 | assert.Emptyf(t, val, "empty string expected for key=%s but got %s")
30 | }
31 |
32 | func TestFsmSet(t *testing.T) {
33 | fsm := New[string, string](5)
34 | err := fsm.Set("KEY_1", "VAL_1")
35 |
36 | assert.NoErrorf(t, err, "nil error expected but got %s", err)
37 | }
38 |
39 | func TestFsmLimit(t *testing.T) {
40 | var err error
41 | fsm := New[string, string](2)
42 |
43 | err = fsm.Set("KEY_1", "VAL_1")
44 | assert.NoErrorf(t, err, "nil error expected for key=%s but got %s", "KEY_1", err)
45 |
46 | err = fsm.Set("KEY_2", "VAL_2")
47 | assert.NoErrorf(t, err, "nil error expected for key=%s but got %s", "KEY_2", err)
48 |
49 | err = fsm.Set("KEY_3", "VAL_3")
50 |
51 | if assert.Errorf(t, err, "error is expected for key=%s", "KEY_3") {
52 | assert.ErrorIs(t, err, ERR_MAX_ITEM_EXCEEDED)
53 | }
54 | }
55 |
56 | func TestFsmClear(t *testing.T) {
57 | fsm := New[string, string](5)
58 |
59 | fsm.Set("KEY_1", "VAL_1")
60 |
61 | fsm.Clear()
62 |
63 | val, ok := fsm.Get("KEY_1")
64 |
65 | assert.Falsef(t, ok, "ok=false expected for key=%s", "KEY_1")
66 |
67 | assert.Emptyf(t, val, "empty string expected for key=%s but got %s")
68 | }
69 |
--------------------------------------------------------------------------------
/internal/resource_pool/pool_builder.go:
--------------------------------------------------------------------------------
1 | package rp
2 |
3 | func WithSize[T any](size uint64) PoolOption[T] {
4 | return func(p *Pooler[T]) {
5 | p.pool = NewPool[T](size)
6 | }
7 | }
8 |
9 | type Pooler[T any] struct {
10 | pool Pool[T]
11 | }
12 |
13 | type PoolOption[T any] func(*Pooler[T])
14 |
15 | func NewPooler[T any](options ...PoolOption[T]) *Pooler[T] {
16 | pool := &Pooler[T]{}
17 |
18 | for _, option := range options {
19 | option(pool)
20 | }
21 |
22 | return pool
23 | }
24 |
25 | func (p *Pooler[T]) Acquire() *T {
26 | if p.pool != nil {
27 | return p.pool.Acquire()
28 | }
29 | return new(T)
30 | }
31 |
32 | func (p *Pooler[T]) Release(item *T) {
33 | if p.pool != nil {
34 | p.pool.Release(item)
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/internal/resource_pool/resource_pool.go:
--------------------------------------------------------------------------------
1 | package rp
2 |
// Pool is a channel of *T used as a free list of reusable items.
type Pool[T any] chan *T
4 |
5 | func (p Pool[T]) Acquire() (_p *T) {
6 | select {
7 | case item := <-p:
8 | return item
9 | default:
10 | return _p
11 | }
12 | }
13 |
14 | func (p Pool[T]) Release(item *T) {
15 | select {
16 | case p <- item:
17 | default:
18 | }
19 | }
20 |
21 | func NewPool[K any](max uint64) Pool[K] {
22 | itemPool := make(Pool[K], max)
23 | return itemPool
24 | }
25 |
--------------------------------------------------------------------------------
/internal/types/option.go:
--------------------------------------------------------------------------------
1 | package types
2 |
// OptFunc is a functional option: a function that modifies a configuration
// value of type T in place.
type OptFunc[T any] func(target *T)
4 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import "github.com/tech-engine/goscrapy/cmd/cli"
4 |
5 | func main() {
6 | cli.Execute()
7 | }
8 |
--------------------------------------------------------------------------------
/pkg/builtin/middlewares/dupefilter.go:
--------------------------------------------------------------------------------
1 | package middlewares
2 |
3 | import (
4 | "encoding/hex"
5 | "errors"
6 | "fmt"
7 | "hash"
8 | "io"
9 | "net/http"
10 | "sort"
11 | "strings"
12 | "sync"
13 |
14 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager"
15 | "golang.org/x/crypto/blake2b"
16 | )
17 |
// ERR_DUPEFILTER_BLOCKED is the sentinel error wrapped by the DupeFilter
// middleware when it rejects a request whose fingerprint was already seen.
// Match it with errors.Is.
var ERR_DUPEFILTER_BLOCKED = errors.New("duplicate request")
19 |
// RequestMap is the set of request fingerprints that DupeFilter has already
// let through.
type RequestMap struct {
	// seen holds one entry per fingerprint.
	seen map[string]struct{}
	// mu guards seen.
	mu sync.RWMutex
}
24 |
25 | func NewRequestMap() *RequestMap {
26 | return &RequestMap{
27 | seen: make(map[string]struct{}),
28 | }
29 | }
30 |
31 | func generateSHA1FingerprintFromReq(r *http.Request) (string, error) {
32 |
33 | var (
34 | err error
35 | body io.ReadCloser
36 | hash hash.Hash
37 | )
38 |
39 | if r.GetBody != nil {
40 | body, err = r.GetBody()
41 | if err != nil {
42 | return "", err
43 | }
44 | defer body.Close()
45 | }
46 |
47 | var combinedBuf strings.Builder
48 |
49 | hash, err = blake2b.New256(nil)
50 | if err != nil {
51 | return "", err
52 | }
53 |
54 | if body != nil {
55 |
56 | if _, err = io.Copy(hash, body); err != nil {
57 | return "", err
58 | }
59 | }
60 |
61 | combinedBuf.WriteString(r.Method)
62 | combinedBuf.WriteString(r.URL.String())
63 |
64 | headerKeys := make([]string, 0, len(r.Header))
65 | for key := range r.Header {
66 | headerKeys = append(headerKeys, key)
67 | }
68 |
69 | sort.Strings(headerKeys)
70 |
71 | // added sorted headers
72 | for _, key := range headerKeys {
73 | for _, value := range r.Header[key] {
74 | combinedBuf.WriteString(key)
75 | combinedBuf.WriteString(value)
76 | }
77 | }
78 |
79 | if _, err = hash.Write([]byte(combinedBuf.String())); err != nil {
80 | return "", err
81 | }
82 |
83 | finalHash := hash.Sum(nil)
84 |
85 | return hex.EncodeToString(finalHash[:]), nil
86 |
87 | }
88 |
89 | func DupeFilter(next http.RoundTripper) http.RoundTripper {
90 | requestMap := NewRequestMap()
91 | return middlewaremanager.MiddlewareFunc(func(req *http.Request) (*http.Response, error) {
92 | signature, err := generateSHA1FingerprintFromReq(req)
93 |
94 | if err != nil {
95 | return nil, fmt.Errorf("duplicatefilter.go:DupeFilterMiddleware: error generating request signature %w", err)
96 | }
97 |
98 | requestMap.mu.Lock()
99 |
100 | // we have already seen this signature so we skip
101 | if _, ok := requestMap.seen[signature]; ok {
102 | requestMap.mu.Unlock()
103 | return nil, fmt.Errorf("duplicatefilter.go:DupeFilterMiddleware: %w", ERR_DUPEFILTER_BLOCKED)
104 | }
105 |
106 | requestMap.seen[signature] = struct{}{}
107 | requestMap.mu.Unlock()
108 |
109 | return next.RoundTrip(req)
110 | })
111 | }
112 |
--------------------------------------------------------------------------------
/pkg/builtin/middlewares/dupefilter_test.go:
--------------------------------------------------------------------------------
1 | package middlewares
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "net/http"
7 | "net/http/httptest"
8 | "strings"
9 | "sync"
10 | "testing"
11 |
12 | "github.com/stretchr/testify/assert"
13 | )
14 |
// testCase describes one request sent through the DupeFilter transport.
type testCase struct {
	// Name labels the case in assertion messages; Method is the HTTP verb
	// used for the request (and for the subtest name).
	Name,
	Method string
	// Header is assigned to the request's Header field as-is.
	Header http.Header
	// Body is the optional request payload; nil means no body.
	Body io.Reader
	// mayBlock marks cases that share a fingerprint with another case, so
	// the filter is allowed to reject this one with ERR_DUPEFILTER_BLOCKED.
	mayBlock bool
}
22 |
// handler answers every request with an empty 200 OK response.
func handler(rw http.ResponseWriter, _ *http.Request) {
	rw.WriteHeader(http.StatusOK)
}
26 |
27 | func TestDupeFilter(t *testing.T) {
28 |
29 | // Set our custom transport middleware
30 | client := &http.Client{
31 | Transport: DupeFilter(http.DefaultTransport),
32 | }
33 |
34 | testServer := httptest.NewServer(http.HandlerFunc(handler))
35 |
36 | testCases := []testCase{
37 | {
38 | Name: "test1",
39 | Method: "GET",
40 | Header: http.Header{
41 | "X-test": []string{"test_val_1"},
42 | },
43 | mayBlock: true,
44 | },
45 | {
46 | Name: "test2",
47 | Method: "GET",
48 | Header: http.Header{
49 | "X-test": []string{"test_val_2"},
50 | },
51 | Body: nil,
52 | },
53 | {
54 | Name: "test3",
55 | Method: "GET",
56 | Header: http.Header{
57 | "X-test": []string{"test_val_1"},
58 | },
59 | Body: nil,
60 | mayBlock: true,
61 | },
62 | {
63 | Name: "test4",
64 | Method: "POST",
65 | Header: http.Header{
66 | "X-test-another": []string{"test_val_1"},
67 | },
68 | Body: strings.NewReader("hello"),
69 | mayBlock: true,
70 | },
71 | {
72 | Name: "test5",
73 | Method: "PATCH",
74 | Header: http.Header{
75 | "X-test-another": []string{"test_val_1"},
76 | },
77 | Body: strings.NewReader("hello"),
78 | },
79 | {
80 | Name: "test6",
81 | Method: "POST",
82 | Header: http.Header{
83 | "X-test-another": []string{"test_val_1"},
84 | },
85 | Body: strings.NewReader("hello"),
86 | mayBlock: true,
87 | },
88 | {
89 | Name: "test7",
90 | Method: "POST",
91 | Header: http.Header{
92 | "X-test-another": []string{"test_val_1"},
93 | },
94 | Body: strings.NewReader("hello1"),
95 | },
96 | }
97 |
98 | expectedPassCases := 5
99 | actualPassCases := 0
100 |
101 | var m sync.Mutex
102 | for _, tc := range testCases {
103 | func() {
104 |
105 | t.Run(tc.Method, func(t *testing.T) {
106 | t.Parallel()
107 | req, err := http.NewRequest(tc.Method, testServer.URL, tc.Body)
108 |
109 | assert.Nil(t, err, "error creating http request", tc.Name)
110 |
111 | req.Header = tc.Header
112 |
113 | resp, err := client.Do(req)
114 | if tc.mayBlock && err != nil {
115 | assert.ErrorIs(t, err, ERR_DUPEFILTER_BLOCKED, fmt.Sprintf("http request %s not blocked", tc.Name))
116 | }
117 |
118 | if resp != nil {
119 | assert.Equal(t, 200, resp.StatusCode, "statuscode 200 expected")
120 | m.Lock()
121 | defer m.Unlock()
122 | actualPassCases++
123 | }
124 |
125 | })
126 | }()
127 |
128 | }
129 |
130 | t.Cleanup(func() {
131 | assert.Equal(t, expectedPassCases, actualPassCases, fmt.Sprintf("expected pass cases %d not equal to actual pass cases %d", expectedPassCases, actualPassCases))
132 | testServer.Close()
133 | })
134 |
135 | }
136 |
--------------------------------------------------------------------------------
/pkg/builtin/middlewares/multi_cookiejar.go:
--------------------------------------------------------------------------------
1 | package middlewares
2 |
3 | import (
4 | "net/http"
5 | "net/http/cookiejar"
6 | "sync"
7 |
8 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager"
9 | )
10 |
// multiCookieJar keeps one http.CookieJar per string key.
type multiCookieJar struct {
	// jars maps a key to its cookie jar.
	jars map[string]http.CookieJar
	// mu is held by GetCookieJar while jars is read or written.
	mu sync.RWMutex
}
15 |
16 | // NewMultiCookieJar creates a new MultiCookieJar.
17 | func NewMultiCookieJar() *multiCookieJar {
18 | return &multiCookieJar{
19 | jars: make(map[string]http.CookieJar),
20 | }
21 | }
22 |
23 | // GetCookieJar returns a CookieJar corresponding to a key or create one if key doesn't exist
24 | func (m *multiCookieJar) GetCookieJar(key string) http.CookieJar {
25 |
26 | m.mu.Lock()
27 | defer m.mu.Unlock()
28 | jar, ok := m.jars[key]
29 |
30 | // in case we don't have a cookie jar based on the key, we create a new one
31 | if !ok {
32 | jar, _ = cookiejar.New(nil)
33 | }
34 | m.jars[key] = jar
35 | return jar
36 | }
37 |
38 | func MultiCookieJar(next http.RoundTripper) http.RoundTripper {
39 | mCookieJar := NewMultiCookieJar()
40 | return middlewaremanager.MiddlewareFunc(func(req *http.Request) (*http.Response, error) {
41 | // Header keys are received as the client sends them and not normalized.
42 |
43 | jar := mCookieJar.GetCookieJar(req.Header.Get("X-Goscrapy-Cookie-Jar-Key"))
44 |
45 | reqCookies := jar.Cookies(req.URL)
46 |
47 | for _, rc := range reqCookies {
48 | req.AddCookie(rc)
49 | }
50 |
51 | // remove X-Goscrapy-CookieJar-Key header
52 | req.Header.Del("X-Goscrapy-Cookie-Jar-Key")
53 |
54 | // It is in this step Headers are normalized and sent out.
55 | resp, err := next.RoundTrip(req)
56 |
57 | if resp != nil {
58 | // update cookies
59 | jar.SetCookies(req.URL, resp.Cookies())
60 | }
61 |
62 | return resp, err
63 | })
64 | }
65 |
--------------------------------------------------------------------------------
/pkg/builtin/middlewares/multi_cookiejar_test.go:
--------------------------------------------------------------------------------
1 | package middlewares
2 |
3 | import (
4 | "fmt"
5 | "net/http"
6 | "net/http/httptest"
7 | "strings"
8 | "testing"
9 |
10 | "slices"
11 |
12 | "github.com/stretchr/testify/assert"
13 | )
14 |
// client is the package-level HTTP client used by the cookie-jar tests; its
// transport is http.DefaultTransport wrapped by the MultiCookieJar middleware.
var client = &http.Client{
	Transport: MultiCookieJar(http.DefaultTransport),
}
19 |
// filteredHeaders returns a new header set holding, for every header in h
// that is not one of the transport/server defaults listed below, only the
// header's first value. Names are matched exactly as they appear in h.
// Headers with no values are skipped rather than indexed, so an empty value
// slice cannot cause a panic.
func filteredHeaders(h http.Header) http.Header {
	skipHeaders := []string{"User-Agent", "Accept-Encoding", "Cookie", "Content-Length", "Date"}
	kept := make(http.Header)

	for name, values := range h {
		// nothing to copy for a header without values
		if len(values) == 0 {
			continue
		}

		// we skip the default headers
		if slices.Contains(skipHeaders, name) {
			continue
		}

		kept.Add(name, values[0])
	}

	return kept
}
34 |
// makeTestRequestWithClient returns a helper bound to client. The helper
// builds a body-less request for the given method and url, replaces the
// request's header map with header when header is non-nil, and executes it
// with client. A request that cannot be constructed yields a wrapped error
// and a nil response.
func makeTestRequestWithClient(client *http.Client) func(string, string, http.Header) (*http.Response, error) {
	send := func(method, url string, header http.Header) (*http.Response, error) {
		req, err := http.NewRequest(method, url, nil)

		switch {
		case err != nil:
			return nil, fmt.Errorf("makeRequestWithClient: error creating http request %w", err)
		case header != nil:
			req.Header = header
		}

		return client.Do(req)
	}

	return send
}
51 |
// handlerGetCookieJar builds the mux served by the dummy test server.
//
//   - /set-cookie turns every request header (apart from a few client
//     defaults) into a response cookie with the same name, the header's
//     first value and Path "/".
//   - /verify copies every cookie found on the request into a response
//     header of the same name.
//
// Both routes answer 200 OK. The t parameter is currently unused.
func handlerGetCookieJar(t *testing.T) *http.ServeMux {
	ignored := []string{"User-Agent", "Accept-Encoding", "Cookie"}

	// setCookie mirrors request headers back as response cookies.
	setCookie := func(w http.ResponseWriter, r *http.Request) {
		for name, values := range r.Header {
			// skip headers without a value and the default client headers
			if len(values) == 0 || slices.Contains(ignored, name) {
				continue
			}

			http.SetCookie(w, &http.Cookie{
				Name:   name,
				Value:  values[0],
				Domain: r.URL.Host,
				Path:   "/",
			})
		}

		w.WriteHeader(http.StatusOK)
	}

	// verify mirrors request cookies (injected by the middleware under
	// test) back as response headers.
	verify := func(w http.ResponseWriter, r *http.Request) {
		for _, c := range r.Cookies() {
			w.Header().Set(c.Name, c.Value)
		}

		w.WriteHeader(http.StatusOK)
	}

	mux := http.NewServeMux()
	mux.HandleFunc("/set-cookie", setCookie)
	mux.HandleFunc("/verify", verify)
	return mux
}
96 |
97 | // There are 2 stages.
98 | //
99 | // Stage 1: We send a request with a few headers to our test server, and get the exact same headers back
100 | // as response cookies.
101 | //
102 | // Stage 2: To verify if our cookie middleware worked as expected, we will send another request to /verify.
103 | // If we get the exact same headers we sent in Stage 1, as response headers, our middleware worked as expected.
104 | func RunWithCookieJar(t *testing.T, key string) {
105 | // Create a test server with the custom RoundTripper
106 | key = strings.ToLower(key)
107 | testServer := httptest.NewServer(handlerGetCookieJar(t))
108 | defer testServer.Close()
109 |
110 | requester := makeTestRequestWithClient(client)
111 |
112 | // Stage 1
113 | headerOne := http.Header{
114 | "X-Goscrapy-Server-Req-" + key: []string{"single_host_req_" + key},
115 | }
116 |
117 | if key != "" {
118 | headerOne.Add("X-Goscrapy-Cookie-Jar-Key", key)
119 | }
120 |
121 | respOne, err := requester("GET", testServer.URL+"/set-cookie", headerOne)
122 |
123 | assert.Nil(t, err, "error making http request 1")
124 |
125 | defer respOne.Body.Close()
126 |
127 | // we verify if we have received the same cookies that we have set in "X-Goscrapy-Server-Req-1" header
128 | respOneCookies := respOne.Cookies()
129 |
130 | assert.Lenf(t, respOneCookies, 1, "expected only %d cookie but got %d", 1, len(respOneCookies))
131 |
132 | found := false
133 |
134 | for _, cookie := range respOneCookies {
135 | if strings.ToLower(cookie.Name) == "x-goscrapy-server-req-"+key && strings.ToLower(cookie.Value) == "single_host_req_"+key {
136 | found = true
137 | break
138 | }
139 | }
140 |
141 | assert.Truef(t, found, "expected response cookies [X-Goscrapy-Server-Req-%s=single_host_req_%s] not found", key, key)
142 |
143 | // second stage 2:
144 | headerTwo := http.Header{
145 | "X-Goscrapy-Cookie-Jar-Key": []string{key},
146 | }
147 | respTwo, err := requester("GET", testServer.URL+"/verify", headerTwo)
148 |
149 | assert.Nil(t, err, "error making http request 2")
150 |
151 | defer respTwo.Body.Close()
152 |
153 | respTwoHeader := filteredHeaders(respTwo.Header)
154 |
155 | assert.Lenf(t, respTwoHeader, 1, "expected only %d header but got %d", 1, len(respTwoHeader))
156 |
157 | assert.Equal(t, "single_host_req_"+key, respTwoHeader.Get("X-Goscrapy-Server-Req-"+key))
158 | }
159 |
160 | func TestMultiCookierJar(t *testing.T) {
161 |
162 | testCases := []struct {
163 | Name,
164 | SessionKey string
165 | }{
166 | {
167 | Name: "DEFAULT_COOKIEJAR",
168 | SessionKey: "",
169 | },
170 | {
171 | Name: "SINGLE_COOKIEJAR",
172 | SessionKey: "jar1",
173 | },
174 | {
175 | Name: "SINGLE_COOKIEJAR",
176 | SessionKey: "jar2",
177 | },
178 | {
179 | Name: "SINGLE_COOKIEJAR",
180 | SessionKey: "jar3",
181 | },
182 | }
183 |
184 | for _, tc := range testCases {
185 | t.Run(tc.Name, func(t *testing.T) {
186 | t.Parallel()
187 | RunWithCookieJar(t, tc.SessionKey)
188 | })
189 | }
190 |
191 | for _, tc := range testCases {
192 | t.Run(tc.Name, func(t *testing.T) {
193 | RunWithCookieJar(t, tc.SessionKey)
194 | })
195 | }
196 | }
197 |
--------------------------------------------------------------------------------
/pkg/builtin/middlewares/retry.go:
--------------------------------------------------------------------------------
1 | package middlewares
2 |
3 | import (
4 | "math"
5 | "net/http"
6 | "os"
7 | "slices"
8 | "strconv"
9 | "strings"
10 | "time"
11 |
12 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager"
13 | )
14 |
// MIDDLEWARE_HTTP_RETRY_MAX_RETRIES is the built-in default that
// defaultRetryOpts assigns to RetryOpts.MaxRetries.
const MIDDLEWARE_HTTP_RETRY_MAX_RETRIES = 3

// MIDDLEWARE_HTTP_RETRY_CODES is the built-in default list of HTTP status
// codes that defaultRetryOpts assigns to RetryOpts.Codes.
var MIDDLEWARE_HTTP_RETRY_CODES = []int{500, 502, 503, 504, 522, 524, 408, 429}

// RetryCb is the callback type stored in RetryOpts.Cb. Retry passes it the
// request and a uint8 loop counter, and stops retrying when it returns false.
type RetryCb func(*http.Request, uint8) bool
20 |
// RetryOpts configures the Retry middleware. Retry only applies a field when
// it is set to a non-zero value (non-nil for Codes and Cb); otherwise the
// value from defaultRetryOpts is kept.
type RetryOpts struct {
	// MaxRetries is the number of retries allowed after the first attempt.
	MaxRetries uint8
	// Codes lists the HTTP status codes that trigger a retry.
	Codes []int
	// BaseDelay is the base wait between attempts; later waits are
	// power-of-two multiples of it.
	BaseDelay time.Duration
	// Cb, when non-nil, is consulted by Retry and can stop the retry loop by
	// returning false.
	Cb RetryCb
}
27 |
28 | func defaultRetryOpts() *RetryOpts {
29 | opts := &RetryOpts{
30 | MaxRetries: MIDDLEWARE_HTTP_RETRY_MAX_RETRIES,
31 | Codes: MIDDLEWARE_HTTP_RETRY_CODES,
32 | BaseDelay: 1 * time.Second,
33 | }
34 |
35 | value, ok := os.LookupEnv("MIDDLEWARE_HTTP_RETRY_MAX_RETRIES")
36 |
37 | if ok {
38 | maxRetries, err := strconv.Atoi(value)
39 | if err == nil {
40 | opts.MaxRetries = uint8(maxRetries)
41 | }
42 | }
43 |
44 | value, ok = os.LookupEnv("MIDDLEWARE_HTTP_RETRY_CODES")
45 |
46 | if ok {
47 | codesStr := strings.Split(value, ",")
48 | codes := make([]int, 0, len(codesStr))
49 |
50 | for _, codeStr := range codesStr {
51 | if code, err := strconv.Atoi(strings.TrimSpace(codeStr)); err == nil {
52 | codes = append(codes, code)
53 | }
54 | }
55 |
56 | if len(codes) > 0 {
57 | opts.Codes = codes[:]
58 | }
59 |
60 | }
61 |
62 | value, ok = os.LookupEnv("MIDDLEWARE_HTTP_RETRY_BASE_DELAY")
63 |
64 | if ok {
65 | baseDelay, err := time.ParseDuration(value)
66 | if err == nil {
67 | opts.BaseDelay = baseDelay
68 | }
69 | }
70 |
71 | return opts
72 | }
73 |
74 | func Retry(opts ...RetryOpts) func(http.RoundTripper) http.RoundTripper {
75 |
76 | retryOpts := defaultRetryOpts()
77 |
78 | // overwrite defaults
79 | if len(opts) > 0 {
80 | if opts[0].MaxRetries > 0 {
81 | retryOpts.MaxRetries = opts[0].MaxRetries
82 | }
83 |
84 | if opts[0].Codes != nil {
85 | retryOpts.Codes = opts[0].Codes[:]
86 | }
87 |
88 | if opts[0].BaseDelay > 0 {
89 | retryOpts.BaseDelay = opts[0].BaseDelay
90 | }
91 |
92 | if opts[0].Cb != nil {
93 | retryOpts.Cb = opts[0].Cb
94 | }
95 | }
96 |
97 | return func(next http.RoundTripper) http.RoundTripper {
98 | return middlewaremanager.MiddlewareFunc(func(req *http.Request) (*http.Response, error) {
99 |
100 | var (
101 | resp *http.Response
102 | err error
103 | retries uint8 = retryOpts.MaxRetries
104 | i uint8
105 | )
106 |
107 | retryHeader := req.Header.Get("X-Goscrapy-Middleware-Max-Retry")
108 |
109 | if retryHeader != "" {
110 | r, _ := strconv.Atoi(retryHeader)
111 | retries = uint8(r)
112 | req.Header.Del("X-Goscrapy-Middleware-Max-Retry")
113 | }
114 |
115 | retries += 1
116 |
117 | timer := time.NewTimer(retryOpts.BaseDelay)
118 |
119 | for i = 0; i < retries; i++ {
120 | resp, err = next.RoundTrip(req)
121 |
122 | // call retry callback, if present
123 | if i > 0 && retryOpts.Cb != nil && !retryOpts.Cb(req, i) {
124 | break
125 | }
126 |
127 | if err != nil {
128 | select {
129 | case <-timer.C:
130 | // calculate next delay
131 | timer.Reset(time.Duration(math.Pow(2, float64(i))) * retryOpts.BaseDelay)
132 | continue
133 | }
134 | }
135 |
136 | if !slices.Contains(retryOpts.Codes, resp.StatusCode) {
137 | break
138 | }
139 |
140 | select {
141 | case <-timer.C:
142 | // calculate next delay
143 | timer.Reset(time.Duration(math.Pow(2, float64(i))) * retryOpts.BaseDelay)
144 | }
145 | }
146 |
147 | if !timer.Stop() {
148 | <-timer.C
149 | }
150 |
151 | return resp, err
152 | })
153 | }
154 | }
155 |
--------------------------------------------------------------------------------
/pkg/builtin/middlewares/retry_test.go:
--------------------------------------------------------------------------------
1 | package middlewares
2 |
3 | import (
4 | "net/http"
5 | "net/http/httptest"
6 | "testing"
7 |
8 | "github.com/stretchr/testify/assert"
9 | )
10 |
// retry500Handler answers every request with an empty 500 Internal Server
// Error response.
func retry500Handler(rw http.ResponseWriter, _ *http.Request) {
	rw.WriteHeader(http.StatusInternalServerError)
}
14 |
15 | func TestRetry(t *testing.T) {
16 |
17 | var (
18 | expectedRetryCnt uint8 = 3
19 | actualRetryCnt uint8
20 | )
21 |
22 | client := &http.Client{
23 | Transport: Retry(RetryOpts{
24 | MaxRetries: expectedRetryCnt,
25 | Cb: func(r *http.Request, retry uint8) bool {
26 | actualRetryCnt = retry
27 | return true
28 | },
29 | })(http.DefaultTransport),
30 | }
31 |
32 | testServer := httptest.NewServer(http.HandlerFunc(retry500Handler))
33 |
34 | req, err := http.NewRequest("GET", testServer.URL, nil)
35 |
36 | assert.Nil(t, err, "error creating http request")
37 |
38 | resp, err := client.Do(req)
39 |
40 | assert.Nil(t, err, "error making request")
41 |
42 | resp.Body.Close()
43 |
44 | assert.Equal(t, expectedRetryCnt, actualRetryCnt)
45 | testServer.Close()
46 | }
47 |
48 | func TestRetryWithCb(t *testing.T) {
49 |
50 | var (
51 | retryCnt uint8 = 3
52 | expectedRetryCnt uint8 = 3
53 | actualRetryCnt uint8
54 | )
55 |
56 | client := &http.Client{
57 | Transport: Retry(RetryOpts{
58 | MaxRetries: retryCnt,
59 | Cb: func(r *http.Request, retry uint8) bool {
60 | actualRetryCnt = retry
61 | return retry <= 1
62 | },
63 | })(http.DefaultTransport),
64 | }
65 |
66 | testServer := httptest.NewServer(http.HandlerFunc(retry500Handler))
67 |
68 | req, err := http.NewRequest("GET", testServer.URL, nil)
69 |
70 | assert.Nil(t, err, "error creating http request")
71 |
72 | resp, err := client.Do(req)
73 |
74 | assert.Nil(t, err, "error making request")
75 |
76 | resp.Body.Close()
77 |
78 | assert.Less(t, actualRetryCnt, expectedRetryCnt)
79 | testServer.Close()
80 | }
81 |
82 | func TestRetryWithHttpCodes(t *testing.T) {
83 |
84 | var (
85 | retryCnt uint8 = 3
86 | expectedRetryCnt uint8 = 0
87 | actualRetryCnt uint8
88 | )
89 |
90 | client := &http.Client{
91 | Transport: Retry(RetryOpts{
92 | MaxRetries: retryCnt,
93 | Codes: []int{467},
94 | Cb: func(r *http.Request, retry uint8) bool {
95 | actualRetryCnt = retry
96 | return true
97 | },
98 | })(http.DefaultTransport),
99 | }
100 |
101 | testServer := httptest.NewServer(http.HandlerFunc(retry500Handler))
102 |
103 | req, err := http.NewRequest("GET", testServer.URL, nil)
104 |
105 | assert.Nil(t, err, "error creating http request")
106 |
107 | resp, err := client.Do(req)
108 |
109 | assert.Nil(t, err, "error making request")
110 |
111 | resp.Body.Close()
112 |
113 | assert.Equal(t, expectedRetryCnt, actualRetryCnt)
114 | testServer.Close()
115 | }
116 |
--------------------------------------------------------------------------------
/pkg/builtin/pipelines/dummy.go:
--------------------------------------------------------------------------------
1 | package pipelines
2 |
3 | import (
4 | "reflect"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/core"
7 | )
8 |
// dummyJob is a minimal job type used by the pipeline tests.
type dummyJob struct {
	// id is not read by Id, which returns a fixed string instead.
	id string
}
12 |
// Id returns the constant identifier "dummyJob"; the receiver's id field is
// not consulted.
func (j *dummyJob) Id() string {
	return "dummyJob"
}
16 |
// Record returns the receiver itself.
func (o *dummyRecord) Record() *dummyRecord {
	return o
}
20 |
21 | func (o *dummyRecord) RecordKeys() []string {
22 | dataType := reflect.TypeOf(dummyRecord{})
23 | if dataType.Kind() != reflect.Struct {
24 | panic("Record is not a struct")
25 | }
26 |
27 | numFields := dataType.NumField()
28 | keys := make([]string, numFields)
29 |
30 | for i := 0; i < numFields; i++ {
31 | field := dataType.Field(i)
32 | csvTag := field.Tag.Get("csv")
33 | keys[i] = csvTag
34 | }
35 |
36 | return keys
37 | }
38 |
39 | func (o *dummyRecord) RecordFlat() []any {
40 |
41 | inputType := reflect.TypeOf(o)
42 |
43 | if inputType.Kind() != reflect.Struct {
44 | panic("Record is not a struct")
45 | }
46 |
47 | inputValue := reflect.ValueOf(o)
48 |
49 | slice := make([]any, inputType.NumField())
50 |
51 | for i := 0; i < inputType.NumField(); i++ {
52 | slice[i] = inputValue.Field(i).Interface()
53 | }
54 | return slice
55 | }
56 |
// Job returns the record's J field as a core.IJob.
// NOTE(review): when J is nil the returned interface still carries the
// *dummyJob type, so it does not compare equal to nil — confirm callers do
// not rely on a nil check.
func (o *dummyRecord) Job() core.IJob {
	return o.J
}
60 |
// dummyRecord is the sample record type used by the pipeline tests. Id and
// Name are exported under the json/csv keys "id" and "name"; J is tagged "-"
// for both encodings.
type dummyRecord struct {
	Id   string    `json:"id" csv:"id"`
	Name string    `json:"name" csv:"name"`
	J    *dummyJob `json:"-" csv:"-"`
}
66 |
--------------------------------------------------------------------------------
/pkg/builtin/pipelines/export_to_csv.go:
--------------------------------------------------------------------------------
1 | package pipelines
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "time"
7 |
8 | "context"
9 |
10 | "github.com/gocarina/gocsv"
11 | "github.com/tech-engine/goscrapy/pkg/core"
12 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager"
13 | )
14 |
// Export2CSVOpts is the configuration struct for Export2CSV.
// The File field takes precedence over the Filename field.
type Export2CSVOpts struct {
	// Filename is the path opened by Open when no File is supplied.
	Filename string
	// File is an already opened file to write to; when set, Open does not
	// open anything itself.
	File *os.File
}
21 |
// export2CSV is the pipeline stage that appends records of type OUT to a CSV
// file.
type export2CSV[OUT any] struct {
	// filename is the path Open opens when file is nil.
	filename string
	// file is the destination every ProcessItem call writes to.
	file *os.File
}
26 |
27 | func Export2CSV[OUT any](opts ...Export2CSVOpts) *export2CSV[OUT] {
28 | e := &export2CSV[OUT]{
29 | filename: fmt.Sprintf("JOB_%s.csv", time.Now().UTC().Format("2006-01-02-15-04-05")),
30 | }
31 |
32 | if len(opts) > 0 {
33 | if opts[0].Filename != "" {
34 | e.filename = opts[0].Filename
35 | }
36 |
37 | if opts[0].File != nil {
38 | e.file = opts[0].File
39 | }
40 | }
41 |
42 | return e
43 | }
44 |
45 | func (p *export2CSV[OUT]) Open(ctx context.Context) error {
46 | if p.file != nil {
47 | p.filename = ""
48 | return nil
49 | }
50 |
51 | file, err := os.OpenFile(p.filename, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0640)
52 |
53 | if err != nil {
54 | return err
55 | }
56 |
57 | p.file = file
58 | return nil
59 | }
60 |
// Close closes the destination file. The error returned by the file's Close
// is discarded.
func (p *export2CSV[OUT]) Close() {
	p.file.Close()
}
64 |
65 | func (p *export2CSV[OUT]) ProcessItem(item pm.IPipelineItem, original core.IOutput[OUT]) error {
66 |
67 | fileInfo, err := p.file.Stat()
68 |
69 | if err != nil {
70 | return err
71 | }
72 |
73 | size := fileInfo.Size()
74 |
75 | data := []OUT{original.Record()}
76 |
77 | if size > 0 {
78 | err = gocsv.MarshalWithoutHeaders(data, p.file)
79 | } else {
80 | err = gocsv.MarshalFile(data, p.file)
81 | }
82 |
83 | return err
84 | }
85 |
--------------------------------------------------------------------------------
/pkg/builtin/pipelines/export_to_csv_test.go:
--------------------------------------------------------------------------------
1 | package pipelines
2 |
3 | import (
4 | "context"
5 | "encoding/csv"
6 | "os"
7 | "testing"
8 |
9 | "github.com/stretchr/testify/assert"
10 | )
11 |
12 | func TestExport2CSV(t *testing.T) {
13 |
14 | f, err := os.CreateTemp(".", "export_2_csv.csv")
15 |
16 | assert.NoError(t, err)
17 |
18 | defer os.Remove(f.Name())
19 |
20 | pipeline := Export2CSV[*dummyRecord](Export2CSVOpts{
21 | File: f,
22 | })
23 |
24 | defer pipeline.Close()
25 |
26 | err = pipeline.Open(context.Background())
27 |
28 | assert.NoError(t, err)
29 |
30 | record := &dummyRecord{Id: "1", Name: "rick"}
31 |
32 | err = pipeline.ProcessItem(nil, record)
33 |
34 | assert.NoError(t, err)
35 |
36 | f.Seek(0, 0)
37 |
38 | reader := csv.NewReader(f)
39 |
40 | csvRecords, err := reader.ReadAll()
41 |
42 | assert.NoError(t, err)
43 |
44 | assert.Equal(t, convertToSliceOfStrings(record), csvRecords[1])
45 |
46 | }
47 |
48 | func convertToSliceOfStrings(record *dummyRecord) []string {
49 | return []string{record.Id, record.Name}
50 | }
51 |
--------------------------------------------------------------------------------
/pkg/builtin/pipelines/export_to_firebase.go:
--------------------------------------------------------------------------------
1 | package pipelines
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "log"
7 |
8 | firebase "firebase.google.com/go"
9 | "firebase.google.com/go/db"
10 | "github.com/tech-engine/goscrapy/pkg/core"
11 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager"
12 | "google.golang.org/api/option"
13 | )
14 |
// export2FIREBASE is the pipeline stage that pushes records of type OUT to a
// Firebase database reference.
type export2FIREBASE[OUT any] struct {
	// ctx is the context created by the constructor and reused for every push.
	ctx context.Context
	// ref is the database reference that records are pushed to.
	ref *db.Ref
}
19 |
20 | func Export2FIREBASE[OUT any](_url, filePath, collName string) *export2FIREBASE[OUT] {
21 | ctx := context.Background()
22 |
23 | conf := &firebase.Config{
24 | DatabaseURL: _url,
25 | }
26 |
27 | opt := option.WithCredentialsFile(filePath)
28 |
29 | app, err := firebase.NewApp(ctx, conf, opt)
30 |
31 | if err != nil {
32 | log.Printf("Export2FIREBASE: Error initializing app %s", err)
33 | return nil
34 | }
35 |
36 | client, err := app.Database(ctx)
37 |
38 | if err != nil {
39 | log.Printf("Export2FIREBASE: Error initializing Firebase client %s", err)
40 | return nil
41 | }
42 |
43 | return &export2FIREBASE[OUT]{
44 | ctx: ctx,
45 | ref: client.NewRef(collName),
46 | }
47 | }
48 |
// Open is a no-op that always returns nil; the database reference is already
// set up by the constructor.
func (p *export2FIREBASE[OUT]) Open(ctx context.Context) error {
	return nil
}
52 |
// Close is a no-op.
func (p *export2FIREBASE[OUT]) Close() {
}
55 |
56 | // your custom pipeline processing code goes here
57 | func (p *export2FIREBASE[OUT]) ProcessItem(item pm.IPipelineItem, original core.IOutput[OUT]) error {
58 |
59 | if _, err := p.ref.Push(p.ctx, original.Record()); err != nil {
60 | return fmt.Errorf("Export2FIREBASE: error inserting data to DB %w", err)
61 | }
62 |
63 | return nil
64 | }
65 |
--------------------------------------------------------------------------------
/pkg/builtin/pipelines/export_to_gsheet.go:
--------------------------------------------------------------------------------
1 | package pipelines
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "log"
7 |
8 | "github.com/tech-engine/goscrapy/pkg/core"
9 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager"
10 | "google.golang.org/api/option"
11 | "google.golang.org/api/sheets/v4"
12 | )
13 |
// export2GSHEET is the pipeline stage that appends records of type OUT as
// rows to a Google Sheets sheet.
type export2GSHEET[OUT any] struct {
	// service is the Sheets API client built by the constructor.
	service *sheets.Service
	// sheetName is the title of the sheet whose id is sheetId; it is passed
	// as the range argument when appending rows.
	sheetName string
	// spreadSheetId identifies the spreadsheet document.
	spreadSheetId string
	// sheetId identifies the sheet inside the spreadsheet.
	sheetId int64
}
20 |
21 | func Export2GSHEET[OUT any](keyFilePath, spreadSheetId string, sheetId int64) *export2GSHEET[OUT] {
22 | ctx := context.Background()
23 |
24 | service, err := sheets.NewService(ctx, option.WithCredentialsFile(keyFilePath))
25 |
26 | if err != nil {
27 | log.Printf("Export2GSHEET: error creating a service using provided creds %s", err)
28 | return nil
29 | }
30 |
31 | response, err := service.Spreadsheets.Get(spreadSheetId).Fields("sheets(properties(sheetId,title))").Do()
32 |
33 | if err != nil {
34 | log.Printf("Export2GSHEET: error getting spreadsheet by id %s %s", spreadSheetId, err)
35 | return nil
36 | }
37 |
38 | if response.HTTPStatusCode != 200 {
39 | log.Printf(fmt.Sprintf("Export2GSHEET: %d status code received", response.HTTPStatusCode))
40 | return nil
41 | }
42 |
43 | sheetName := ""
44 |
45 | for _, sheet := range response.Sheets {
46 | if sheet.Properties.SheetId == sheetId {
47 | sheetName = sheet.Properties.Title
48 | break
49 | }
50 | }
51 |
52 | if sheetName == "" {
53 | log.Printf("Export2GSHEET: %d status code received", response.HTTPStatusCode)
54 | return nil
55 | }
56 |
57 | return &export2GSHEET[OUT]{
58 | service: service,
59 | sheetName: sheetName,
60 | spreadSheetId: spreadSheetId,
61 | sheetId: sheetId,
62 | }
63 | }
64 |
// Open is a no-op that always returns nil; the Sheets service is already set
// up by the constructor.
func (p *export2GSHEET[OUT]) Open(ctx context.Context) error {
	return nil
}
68 |
// Close is a no-op.
func (p *export2GSHEET[OUT]) Close() {
}
71 |
72 | func (p *export2GSHEET[OUT]) ProcessItem(item pm.IPipelineItem, original core.IOutput[OUT]) error {
73 |
74 | records := original.RecordFlat()
75 | row := &sheets.ValueRange{
76 | Values: [][]any{records},
77 | }
78 |
79 | response, err := p.service.Spreadsheets.Values.Append(p.spreadSheetId, p.sheetName, row).
80 | ValueInputOption("USER_ENTERED").
81 | InsertDataOption("INSERT_ROWS").
82 | Context(context.Background()).
83 | Do()
84 |
85 | if err != nil || response.HTTPStatusCode != 200 {
86 | return err
87 | }
88 |
89 | return nil
90 | }
91 |
--------------------------------------------------------------------------------
/pkg/builtin/pipelines/export_to_json.go:
--------------------------------------------------------------------------------
1 | package pipelines
2 |
3 | import (
4 | "bufio"
5 | "encoding/json"
6 | "fmt"
7 | "io"
8 | "os"
9 | "time"
10 |
11 | "github.com/tech-engine/goscrapy/pkg/core"
12 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager"
13 | "golang.org/x/net/context"
14 | )
15 |
// Export2JSONOpts configures Export2JSON.
//
// Immediate: export2JSON internally creates a bufio.Writer from the provided
// io.Writer. Immediate=true flushes that bufio.Writer right after each item
// is processed.
type Export2JSONOpts struct {
	// Filename is the path opened by Open when no File is supplied.
	Filename string
	// File is an already opened destination; when set, Open does not open
	// anything itself.
	File io.WriteCloser
	// Immediate turns on the flush after every processed item.
	Immediate bool
}
23 |
// export2JSON is the pipeline stage that writes records of type OUT as JSON
// values to a buffered destination.
type export2JSON[OUT any] struct {
	// filename is the path Open opens when file is nil.
	filename string
	// file is the underlying destination, closed by Close.
	file io.WriteCloser
	// buff wraps file; it is created by Open and receives the encoded JSON.
	buff *bufio.Writer
	// immediateFlush requests a flush of buff after every processed item.
	immediateFlush bool
}
30 |
31 | func Export2JSON[OUT any](opts ...Export2JSONOpts) *export2JSON[OUT] {
32 | e := &export2JSON[OUT]{
33 | filename: fmt.Sprintf("JOB_%s.json", time.Now().UTC().Format("2006-01-02-15-04-05")),
34 | }
35 |
36 | if len(opts) > 0 {
37 | opt := opts[0]
38 |
39 | if opt.Filename != "" {
40 | e.filename = opt.Filename
41 | }
42 |
43 | if opt.File != nil {
44 | e.file = opt.File
45 | }
46 |
47 | e.immediateFlush = opt.Immediate
48 | }
49 |
50 | return e
51 | }
52 |
53 | func (p *export2JSON[OUT]) Open(ctx context.Context) error {
54 | if p.file != nil {
55 | p.buff = bufio.NewWriter(p.file)
56 | return nil
57 | }
58 |
59 | file, err := os.OpenFile(p.filename, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0640)
60 |
61 | if err != nil {
62 | return err
63 | }
64 |
65 | p.file = file
66 | p.buff = bufio.NewWriter(p.file)
67 | return nil
68 | }
69 |
70 | func (p *export2JSON[OUT]) Close() {
71 | // flushed data to writer
72 | p.buff.Flush()
73 | p.file.Close()
74 | }
75 |
76 | func (p *export2JSON[OUT]) ProcessItem(item pm.IPipelineItem, original core.IOutput[OUT]) error {
77 |
78 | jsonEncoder := json.NewEncoder(p.buff)
79 |
80 | // Encode and write the JSON data
81 | if err := jsonEncoder.Encode(original.Record()); err != nil {
82 | return err
83 | }
84 |
85 | if p.immediateFlush {
86 | p.buff.Flush()
87 | }
88 |
89 | return nil
90 | }
91 |
--------------------------------------------------------------------------------
/pkg/builtin/pipelines/export_to_json_test.go:
--------------------------------------------------------------------------------
1 | package pipelines
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "os"
7 | "testing"
8 |
9 | "github.com/stretchr/testify/assert"
10 | )
11 |
12 | func TestExport2JSON(t *testing.T) {
13 |
14 | f, err := os.CreateTemp(".", "export_2_json")
15 |
16 | assert.NoError(t, err)
17 |
18 | defer os.Remove(f.Name())
19 |
20 | pipeline := Export2JSON[*dummyRecord](Export2JSONOpts{
21 | File: f,
22 | Immediate: true,
23 | })
24 |
25 | defer pipeline.Close()
26 |
27 | err = pipeline.Open(context.Background())
28 |
29 | assert.NoError(t, err)
30 |
31 | record := &dummyRecord{Id: "1", Name: "rick"}
32 |
33 | err = pipeline.ProcessItem(nil, record)
34 |
35 | assert.NoError(t, err)
36 |
37 | f.Seek(0, 0)
38 |
39 | d := json.NewDecoder(f)
40 |
41 | var out dummyRecord
42 | err = d.Decode(&out)
43 |
44 | assert.NoError(t, err)
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/pkg/builtin/pipelines/export_to_mongodb.go:
--------------------------------------------------------------------------------
1 | package pipelines
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "log"
7 |
8 | "github.com/tech-engine/goscrapy/pkg/core"
9 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager"
10 | "go.mongodb.org/mongo-driver/bson"
11 | "go.mongodb.org/mongo-driver/bson/primitive"
12 | "go.mongodb.org/mongo-driver/mongo"
13 | "go.mongodb.org/mongo-driver/mongo/options"
14 | )
15 |
// export2MONGODB is a pipeline that writes records into a MongoDB collection.
type export2MONGODB[OUT any] struct {
	// ctx is the context passed to the driver calls made by this pipeline.
	ctx context.Context
	// client is the connected MongoDB client that owns collection.
	client *mongo.Client
	// collection is the destination collection for inserted documents.
	collection *mongo.Collection
}
21 |
22 | func Export2MONGODB[OUT any](_url string, dbName string, collName string) *export2MONGODB[OUT] {
23 |
24 | ctx := context.Background()
25 |
26 | serverAPI := options.ServerAPI(options.ServerAPIVersion1)
27 | opts := options.Client().ApplyURI(_url).SetServerAPIOptions(serverAPI)
28 |
29 | client, err := mongo.Connect(ctx, opts)
30 |
31 | if err != nil {
32 | log.Printf("Export2MONGODB: error connecting to DB %s", err)
33 | return nil
34 | }
35 |
36 | var result bson.M
37 |
38 | if err := client.Database(dbName).RunCommand(ctx, bson.D{{Key: "ping", Value: 1}}).Decode(&result); err != nil {
39 | log.Printf("Export2MONGODB: error connecting to DB %s", err)
40 | return nil
41 | }
42 |
43 | collection := client.Database(dbName).Collection(collName)
44 |
45 | return &export2MONGODB[OUT]{
46 | ctx: ctx,
47 | client: client,
48 | collection: collection,
49 | }
50 | }
51 |
52 | func (p *export2MONGODB[OUT]) Open(ctx context.Context) error {
53 | return nil
54 | }
55 |
56 | func (p *export2MONGODB[OUT]) Close() {
57 | }
58 |
59 | func (p *export2MONGODB[OUT]) ProcessItem(item pm.IPipelineItem, original core.IOutput[OUT]) error {
60 |
61 | doc := primitive.D{}
62 | recordFlat := original.RecordFlat()
63 |
64 | for i, key := range original.RecordKeys() {
65 | doc = append(doc, primitive.E{Key: key, Value: recordFlat[i]})
66 | }
67 |
68 | _, err := p.collection.InsertMany(p.ctx, []any{doc})
69 |
70 | if err != nil {
71 | return fmt.Errorf("Export2MONGODB: error inserting data to DB %w", err)
72 | }
73 |
74 | return nil
75 | }
76 |
--------------------------------------------------------------------------------
/pkg/builtin/pipelines/type.go:
--------------------------------------------------------------------------------
1 | package pipelines
2 |
--------------------------------------------------------------------------------
/pkg/core/core.go:
--------------------------------------------------------------------------------
1 | package core
2 |
// Core is a thin facade over an IEngine; every method forwards to the engine.
type Core[OUT any] struct {
	// engine receives all calls made through Core.
	engine IEngine[OUT]
}
6 |
7 | func New[OUT any](engine IEngine[OUT]) *Core[OUT] {
8 | return &Core[OUT]{
9 | engine: engine,
10 | }
11 | }
12 |
13 | func (c *Core[OUT]) Request(req IRequestReader, cb ResponseCallback) {
14 | c.engine.Schedule(req, cb)
15 | }
16 |
17 | func (c *Core[OUT]) NewRequest() IRequestRW {
18 | return c.engine.NewRequest()
19 | }
20 |
21 | func (c *Core[OUT]) Yield(out IOutput[OUT]) {
22 | c.engine.Yield(out)
23 | }
24 |
--------------------------------------------------------------------------------
/pkg/core/ports.go:
--------------------------------------------------------------------------------
1 | package core
2 |
3 | import (
4 | "context"
5 | "io"
6 | "net/http"
7 | "net/url"
8 |
9 | "github.com/tech-engine/goscrapy/internal/fsm"
10 | "golang.org/x/net/html"
11 | )
12 |
// IEngine is the set of engine operations that Core depends on.
type IEngine[OUT any] interface {
	// Start runs the engine with the given context and reports its error.
	Start(context.Context) error
	// NewRequest returns a readable and writable request.
	NewRequest() IRequestRW
	// Schedule submits a request together with the callback for its response.
	Schedule(IRequestReader, ResponseCallback)
	// Yield submits an output produced by a spider.
	Yield(IOutput[OUT])
}
19 |
// IRequestReader exposes the read accessors of a request.
type IRequestReader interface {
	// ReadContext returns the request's context.
	ReadContext() context.Context
	// ReadUrl returns the request's parsed URL.
	ReadUrl() *url.URL
	// ReadHeader returns the request's HTTP headers.
	ReadHeader() http.Header
	// ReadMethod returns the request's HTTP method.
	ReadMethod() string
	// ReadBody returns the request's body.
	ReadBody() io.ReadCloser
	// ReadMeta returns the key/value metadata attached to the request.
	ReadMeta() *fsm.FixedSizeMap[string, any]
	// ReadCookieJar returns the request's cookie jar key.
	ReadCookieJar() string
}
29 |
// IRequestWriter exposes the setters of a request. Every setter returns an
// IRequestWriter, which allows calls to be chained.
type IRequestWriter interface {
	// WithContext sets the request's context.
	WithContext(context.Context) IRequestWriter
	// Url sets the request's URL from a string.
	Url(string) IRequestWriter
	// Header sets the request's HTTP headers.
	Header(http.Header) IRequestWriter
	// Method sets the request's HTTP method.
	Method(string) IRequestWriter
	// Body sets the request's body.
	Body(any) IRequestWriter
	// Meta attaches a key/value pair to the request.
	Meta(string, any) IRequestWriter
	// CookieJar sets the request's cookie jar key.
	CookieJar(string) IRequestWriter
}
39 |
// IRequestRW combines the read and write views of a request and adds Reset.
type IRequestRW interface {
	IRequestReader
	IRequestWriter
	// Reset resets the request.
	// NOTE(review): presumably clears it for reuse; confirm against the implementation.
	Reset()
}
45 |
// IResponseReader exposes the read accessors of a response, plus the selector
// methods of ISelector.
type IResponseReader interface {
	// Header returns the response's HTTP headers.
	Header() http.Header
	// Body returns the response body as a stream.
	Body() io.ReadCloser
	// Bytes returns the response body as bytes.
	Bytes() []byte
	// StatusCode returns the HTTP status code.
	StatusCode() int
	// Cookies returns the cookies of the response.
	Cookies() []*http.Cookie
	// Request returns the HTTP request associated with the response.
	Request() *http.Request
	// Meta looks up a metadata value by key; the bool reports whether it was found.
	Meta(string) (any, bool)
	ISelector
}
56 |
// IJob identifies a job.
type IJob interface {
	// Id returns the job's identifier.
	Id() string
}
60 |
// IOutput is a yielded result handed to the pipelines.
type IOutput[OUT any] interface {
	// Record returns the typed record.
	Record() OUT
	// RecordKeys returns the record's field keys.
	RecordKeys() []string
	// RecordFlat returns the record's field values as a flat slice.
	// NOTE(review): consumers pair RecordKeys()[i] with RecordFlat()[i], so
	// both slices are expected to have the same order and length.
	RecordFlat() []any
	// Job returns the job associated with the output.
	Job() IJob
}
67 |
// ResponseCallback is the function type scheduled together with a request; it
// receives a context and the response reader.
type ResponseCallback func(context.Context, IResponseReader)
// ISelectorGetter extracts results from a selection of HTML nodes.
type ISelectorGetter interface {
	// Get returns a single HTML node.
	Get() *html.Node
	// GetAll returns all HTML nodes of the selection.
	GetAll() []*html.Node
	// Text returns text values of the selection.
	// NOTE(review): the meaning of the variadic string arguments is not visible here.
	Text(...string) []string
	// Attr returns the values of the named attribute.
	Attr(string) []string
}
75 |
// ISelector narrows a selection with CSS or XPath expressions. Both methods
// return an ISelector, so selections can be chained before extracting results
// through the embedded ISelectorGetter.
type ISelector interface {
	// Css applies a CSS selector expression.
	Css(string) ISelector
	// Xpath applies an XPath expression.
	Xpath(string) ISelector
	ISelectorGetter
}
81 |
--------------------------------------------------------------------------------
/pkg/engine/engine.go:
--------------------------------------------------------------------------------
1 | package engine
2 |
3 | import (
4 | "context"
5 | "sync"
6 |
7 | "github.com/tech-engine/goscrapy/pkg/core"
8 | )
9 |
// Engine ties a scheduler and a pipeline manager together.
type Engine[OUT any] struct {
	// scheduler receives scheduled requests and creates new ones.
	scheduler IScheduler
	// pipelineManager receives yielded outputs.
	pipelineManager IPipelineManager[OUT]
	// NOTE(review): outputCh is not assigned or read by any method in this file.
	outputCh chan core.IOutput[OUT]
}
15 |
16 | func New[OUT any](schd IScheduler, pm IPipelineManager[OUT]) *Engine[OUT] {
17 |
18 | engine := &Engine[OUT]{
19 | scheduler: schd,
20 | pipelineManager: pm,
21 | }
22 |
23 | return engine
24 | }
25 |
26 | // start the core
27 | func (m *Engine[OUT]) Start(ctx context.Context) error {
28 |
29 | var (
30 | wg sync.WaitGroup
31 | errCh = make(chan error, 2)
32 | )
33 |
34 | wg.Add(2)
35 |
36 | pmCtx, pmCancel := context.WithCancel(context.Background())
37 |
38 | go func() {
39 |
40 | defer wg.Done()
41 | defer pmCancel()
42 |
43 | errCh <- m.scheduler.Start(ctx)
44 |
45 | }()
46 |
47 | go func() {
48 |
49 | defer wg.Done()
50 |
51 | errCh <- m.pipelineManager.Start(pmCtx)
52 |
53 | }()
54 |
55 | wg.Wait()
56 |
57 | close(errCh)
58 |
59 | for err := range errCh {
60 | if err != nil {
61 | return err
62 | }
63 | }
64 | return nil
65 | }
66 |
67 | func (m *Engine[OUT]) Schedule(req core.IRequestReader, cb core.ResponseCallback) {
68 | m.scheduler.Schedule(req, cb)
69 | }
70 |
71 | func (m *Engine[OUT]) Yield(out core.IOutput[OUT]) {
72 | m.pipelineManager.Push(out)
73 | }
74 |
75 | func (m *Engine[OUT]) NewRequest() core.IRequestRW {
76 | return m.scheduler.NewRequest()
77 | }
78 |
79 | func (m *Engine[OUT]) WithScheduler(schd IScheduler) {
80 | m.scheduler = schd
81 | }
82 |
83 | func (m *Engine[OUT]) WithPipelineManager(pm IPipelineManager[OUT]) {
84 | m.pipelineManager = pm
85 | }
86 |
--------------------------------------------------------------------------------
/pkg/engine/ports.go:
--------------------------------------------------------------------------------
1 | package engine
2 |
3 | import (
4 | "context"
5 | "io"
6 | "net/http"
7 |
8 | "github.com/tech-engine/goscrapy/internal/fsm"
9 | "github.com/tech-engine/goscrapy/pkg/core"
10 | )
11 |
// IPipelineManager is the pipeline manager as seen by the engine.
type IPipelineManager[OUT any] interface {
	// Start runs the pipeline manager with the given context.
	Start(context.Context) error
	// Push hands an output to the pipeline manager.
	Push(core.IOutput[OUT])
}
16 |
// Resetter is implemented by values that can be reset.
type Resetter interface {
	// Reset resets the value.
	// NOTE(review): presumably returns it to a reusable state; confirm with implementers.
	Reset()
}
20 |
// IResponseWriter exposes the setters used to populate a response.
type IResponseWriter interface {
	// WriteHeader stores the response's HTTP headers.
	WriteHeader(http.Header)
	// WriteBody stores the response body stream.
	WriteBody(io.ReadCloser)
	// WriteStatusCode stores the HTTP status code.
	WriteStatusCode(int)
	// WriteCookies stores the response cookies.
	WriteCookies([]*http.Cookie)
	// WriteRequest stores the HTTP request associated with the response.
	WriteRequest(*http.Request)
	// WriteMeta stores the key/value metadata.
	WriteMeta(*fsm.FixedSizeMap[string, any])
}
29 |
// IResponse combines the read view (core.IResponseReader) and the write view
// (IResponseWriter) of a response.
type IResponse interface {
	core.IResponseReader
	IResponseWriter
}
34 |
// IScheduler is the scheduler as seen by the engine.
type IScheduler interface {
	// Start runs the scheduler with the given context.
	Start(context.Context) error
	// Schedule submits a request together with the callback for its response.
	Schedule(core.IRequestReader, core.ResponseCallback)
	// NewRequest returns a readable and writable request.
	NewRequest() core.IRequestRW
}
40 |
--------------------------------------------------------------------------------
/pkg/executor/executor.go:
--------------------------------------------------------------------------------
1 | package executor
2 |
3 | import (
4 | "github.com/tech-engine/goscrapy/pkg/core"
5 | "github.com/tech-engine/goscrapy/pkg/engine"
6 | )
7 |
// Executor turns a core request into an *http.Request and dispatches it
// through its adapter.
type Executor struct {
	// adapter supplies *http.Request values and performs the dispatch.
	adapter IExecutorAdapter
}
11 |
12 | func New(adapter IExecutorAdapter) *Executor {
13 | return &Executor{
14 | adapter: adapter,
15 | }
16 | }
17 |
18 | func (e *Executor) Execute(req core.IRequestReader, res engine.IResponseWriter) error {
19 |
20 | request := e.adapter.Acquire()
21 |
22 | if req.ReadContext() != nil {
23 | request.WithContext(req.ReadContext())
24 | }
25 |
26 | headers := req.ReadHeader()
27 | // we inject a header for cookiejar implementation
28 | headers.Add("X-Goscrapy-Cookie-Jar-Key", req.ReadCookieJar())
29 |
30 | request.URL = req.ReadUrl()
31 | request.Method = "GET"
32 |
33 | if req.ReadMethod() != "" {
34 | request.Method = req.ReadMethod()
35 | }
36 |
37 | request.Header = headers
38 |
39 | request.Body = req.ReadBody()
40 |
41 | return e.adapter.Do(res, request)
42 | }
43 |
44 | func (e *Executor) WithAdapter(adapter IExecutorAdapter) {
45 | e.adapter = adapter
46 | }
47 |
--------------------------------------------------------------------------------
/pkg/executor/ports.go:
--------------------------------------------------------------------------------
1 | package executor
2 |
3 | import (
4 | "net/http"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/engine"
7 | )
8 |
// IExecutorAdapter is the HTTP backend used by Executor.
type IExecutorAdapter interface {
	// Do dispatches the request and writes the outcome into the response writer.
	Do(engine.IResponseWriter, *http.Request) error
	// Acquire returns an *http.Request for the caller to fill in.
	Acquire() *http.Request
	// WithClient sets the HTTP client used by the adapter.
	WithClient(*http.Client)
}
14 |
--------------------------------------------------------------------------------
/pkg/executor_adapters/http_native/adapter.go:
--------------------------------------------------------------------------------
1 | package httpnative
2 |
3 | import (
4 | "fmt"
5 | "net/http"
6 | "os"
7 | "strconv"
8 |
9 | rp "github.com/tech-engine/goscrapy/internal/resource_pool"
10 | "github.com/tech-engine/goscrapy/pkg/engine"
11 | )
12 |
// EX_ADAPTER_DEFAULT_REQ_RES_POOL_SIZE is the request pool size used when no
// explicit size is supplied to the adapter constructor.
const EX_ADAPTER_DEFAULT_REQ_RES_POOL_SIZE = 1e6
14 |
// HTTPAdapter is an executor adapter backed by net/http. It provides the Do,
// Acquire and WithClient methods.
type HTTPAdapter struct {
	// client performs the HTTP round trips.
	client *http.Client
	// reqpool is the pool *http.Request values are acquired from and released to.
	reqpool *rp.Pooler[http.Request]
}
20 |
21 | func NewHTTPClientAdapter(client *http.Client, poolSize uint64) *HTTPAdapter {
22 | if client == nil {
23 | client = http.DefaultClient
24 | }
25 |
26 | if poolSize == 0 {
27 | poolSize = EX_ADAPTER_DEFAULT_REQ_RES_POOL_SIZE
28 | value, ok := os.LookupEnv("SCHEDULER_REQ_RES_POOL_SIZE")
29 |
30 | if ok {
31 | parsedPoolSize, err := strconv.ParseUint(value, 10, 64)
32 | if err == nil {
33 | poolSize = parsedPoolSize
34 | }
35 | }
36 | }
37 |
38 | return &HTTPAdapter{
39 | client: client,
40 | reqpool: rp.NewPooler(rp.WithSize[http.Request](poolSize)),
41 | }
42 | }
43 |
44 | func (r *HTTPAdapter) Acquire() *http.Request {
45 | req := r.reqpool.Acquire()
46 | if req == nil {
47 | req = &http.Request{}
48 | }
49 | return req
50 | }
51 |
52 | func (r *HTTPAdapter) WithClient(client *http.Client) {
53 | r.client = client
54 | }
55 |
56 | func (r *HTTPAdapter) Do(res engine.IResponseWriter, req *http.Request) error {
57 | defer r.reqpool.Release(req)
58 |
59 | source, err := r.client.Do(req)
60 |
61 | if err != nil {
62 | return fmt.Errorf("Do: error dispatching request %w", err)
63 | }
64 |
65 | res.WriteRequest(req)
66 | HTTPRequestAdapterResponse(res, source)
67 | return nil
68 | }
69 |
--------------------------------------------------------------------------------
/pkg/executor_adapters/http_native/adapter_test.go:
--------------------------------------------------------------------------------
1 | package httpnative
2 |
3 | import (
4 | "context"
5 | "io"
6 | "net/http"
7 | "net/http/httptest"
8 | "net/url"
9 | "strconv"
10 | "strings"
11 | "testing"
12 | "time"
13 |
14 | "github.com/stretchr/testify/assert"
15 | "github.com/tech-engine/goscrapy/internal/fsm"
16 | )
17 |
// testCase describes one request sent to the test server and the body
// expected back.
type testCase struct {
	// name labels the case; method is the HTTP method to use.
	name,
	method string
	// body is the request body; nil for methods that send none.
	body io.ReadCloser
	// expected is the response body the test server should return.
	expected []byte
}
24 |
// testServer is the HTTP test server shared by the tests in this file; it
// serves the routes registered by handler.
var testServer = httptest.NewServer(handler())
26 |
27 | type testResponseWriter struct {
28 | statuscode int
29 | body io.ReadCloser
30 | }
31 |
32 | func (r *testResponseWriter) WriteHeader(h http.Header) {
33 | }
34 |
35 | func (r *testResponseWriter) WriteBody(b io.ReadCloser) {
36 | r.body = b
37 | }
38 |
39 | func (r *testResponseWriter) WriteStatusCode(s int) {
40 | r.statuscode = s
41 | }
42 |
43 | func (r *testResponseWriter) WriteCookies(c []*http.Cookie) {
44 | }
45 |
46 | func (r *testResponseWriter) WriteRequest(req *http.Request) {
47 | }
48 |
49 | func (r *testResponseWriter) WriteMeta(m *fsm.FixedSizeMap[string, any]) {
50 | }
51 |
// handler returns the mux served by the test server. Its single "/" route
// behaves as follows:
//
//   - GET, DELETE: optionally sleeps for the number of seconds given in the
//     "delay" request header, then answers 200 with the body "ok".
//   - POST, PATCH, PUT: answers 200 and echoes the request body.
//   - anything else: answers 405.
func handler() *http.ServeMux {
	serve := func(w http.ResponseWriter, r *http.Request) {
		switch r.Method {
		case http.MethodGet, http.MethodDelete:
			// Sleep selectively so tests can exercise request contexts.
			if raw := r.Header.Get("delay"); raw != "" {
				secs, _ := strconv.Atoi(raw)
				time.Sleep(time.Duration(secs) * time.Second)
			}
			w.WriteHeader(http.StatusOK)
			w.Write([]byte("ok"))
		case http.MethodPost, http.MethodPatch, http.MethodPut:
			w.WriteHeader(http.StatusOK)
			payload, _ := io.ReadAll(r.Body)
			w.Write(payload)
		default:
			w.WriteHeader(http.StatusMethodNotAllowed)
		}
	}

	mux := http.NewServeMux()
	mux.HandleFunc("/", serve)
	return mux
}
76 |
77 | func run(t *testing.T, adapter *HTTPAdapter, method string, body io.ReadCloser, expected []byte) {
78 |
79 | var err error
80 | resp := &testResponseWriter{}
81 |
82 | urlParsed, err := url.Parse(testServer.URL)
83 |
84 | assert.NoError(t, err)
85 |
86 | req := adapter.Acquire()
87 |
88 | req.URL = urlParsed
89 |
90 | req.Method = method
91 | req.Body = body
92 | err = adapter.Do(resp, req)
93 |
94 | assert.NoError(t, err)
95 |
96 | defer resp.body.Close()
97 |
98 | assert.Equal(t, 200, resp.statuscode)
99 |
100 | respB, err := io.ReadAll(resp.body)
101 |
102 | assert.NoError(t, err)
103 |
104 | assert.Equalf(t, expected, respB, "expected %s, got %s", string(expected), string(respB))
105 |
106 | }
107 |
108 | func TestAdapterRequest(t *testing.T) {
109 |
110 | adapter := NewHTTPClientAdapter(&http.Client{}, 10)
111 | testCases := []testCase{
112 | {
113 | name: "GET",
114 | method: "GET",
115 | expected: []byte("ok"),
116 | },
117 | {
118 | name: "DELETE",
119 | method: "DELETE",
120 | expected: []byte("ok"),
121 | },
122 | {
123 | name: "POST",
124 | method: "POST",
125 | body: io.NopCloser(strings.NewReader("post")),
126 | expected: []byte("post"),
127 | },
128 | {
129 | name: "PATCH",
130 | method: "PATCH",
131 | body: io.NopCloser(strings.NewReader("patch")),
132 | expected: []byte("patch"),
133 | },
134 | {
135 | name: "PUT",
136 | method: "PUT",
137 | body: io.NopCloser(strings.NewReader("put")),
138 | expected: []byte("put"),
139 | },
140 | }
141 | for _, tc := range testCases {
142 | t.Run(tc.method, func(t *testing.T) {
143 | t.Parallel()
144 | run(t, adapter, tc.method, tc.body, tc.expected)
145 | })
146 | }
147 | }
148 |
149 | func TestAdapterRequestCtx(t *testing.T) {
150 | adapter := NewHTTPClientAdapter(&http.Client{}, 10)
151 |
152 | resp := &testResponseWriter{}
153 |
154 | urlParsed, err := url.Parse(testServer.URL)
155 |
156 | assert.NoError(t, err)
157 |
158 | ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
159 | defer cancel()
160 |
161 | // added so that we can distinguise this request and sleep selectively for 3 seconds in our test server
162 | // which will cause the context to expire before we get a response from server
163 | headers := http.Header{}
164 | headers.Add("delay", "3")
165 |
166 | req := adapter.Acquire()
167 |
168 | req.URL = urlParsed
169 | req = req.WithContext(ctx)
170 | req.Header = headers
171 |
172 | err = adapter.Do(resp, req)
173 |
174 | assert.ErrorIs(t, err, context.DeadlineExceeded)
175 | }
176 |
--------------------------------------------------------------------------------
/pkg/executor_adapters/http_native/helper.go:
--------------------------------------------------------------------------------
1 | package httpnative
2 |
3 | import (
4 | "net/http"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/engine"
7 | )
8 |
9 | func HTTPRequestAdapterResponse(res engine.IResponseWriter, source *http.Response) {
10 |
11 | res.WriteHeader(source.Header)
12 | res.WriteStatusCode(source.StatusCode)
13 | res.WriteCookies(source.Cookies())
14 | res.WriteBody(source.Body)
15 | }
16 |
--------------------------------------------------------------------------------
/pkg/middlewaremanager/middlewaremanager.go:
--------------------------------------------------------------------------------
1 | package middlewaremanager
2 |
3 | import "net/http"
4 |
// Middleware wraps an http.RoundTripper: it receives the next round tripper
// in the chain and returns the one that should be used in its place.
type Middleware func(next http.RoundTripper) http.RoundTripper

// MiddlewareFunc adapts a plain function to the http.RoundTripper interface.
type MiddlewareFunc func(req *http.Request) (*http.Response, error)

// RoundTrip calls mf(req), which makes MiddlewareFunc an http.RoundTripper.
func (mf MiddlewareFunc) RoundTrip(req *http.Request) (*http.Response, error) {
	return mf(req)
}

// MiddlewareManager installs middlewares on the Transport of an http.Client.
type MiddlewareManager struct {
	httpClient *http.Client
}

// New returns a MiddlewareManager that manages the Transport of cli.
func New(cli *http.Client) *MiddlewareManager {
	return &MiddlewareManager{
		httpClient: cli,
	}
}

// HTTPClient returns the client whose Transport is being managed.
func (m *MiddlewareManager) HTTPClient() *http.Client {
	return m.httpClient
}

// Add wraps the client's Transport with each middleware in turn, so the
// middleware added last becomes the outermost round tripper.
//
// A client without a Transport uses http.DefaultTransport, so that is what
// the first middleware receives as next in that case; handing it nil would
// make any middleware that calls next.RoundTrip panic. Nil middlewares are
// skipped.
func (m *MiddlewareManager) Add(middlewares ...Middleware) {
	for _, middleware := range middlewares {
		if middleware == nil {
			continue
		}

		next := m.httpClient.Transport
		if next == nil {
			next = http.DefaultTransport
		}

		m.httpClient.Transport = middleware(next)
	}
}
32 |
--------------------------------------------------------------------------------
/pkg/pipeline_manager/constants.go:
--------------------------------------------------------------------------------
1 | package pipelinemanager
2 |
// If true, all pipelines' Open methods must complete without an error;
// otherwise, the pipeline manager won't start and returns the error of the
// first pipeline to return a non-nil error.

// const PIPELINEMANAGER_PIPELINES_MUST_OPEN = false

// Reusable pipeline item pool size.
const PIPELINEMANAGER_ITEMPOOL_SIZE = 10000

// Max key-value pairs a pipeline item can have.
const PIPELINEMANAGER_ITEM_SIZE = 24

// Output queue buffer size. Yielded items are pushed to this queue
// before being fed into the start of the pipelines.
const PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE = 0

// Max number of outputs that are allowed to be processed concurrently in the pipeline.
const PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY = 1000
21 |
--------------------------------------------------------------------------------
/pkg/pipeline_manager/group.go:
--------------------------------------------------------------------------------
1 | package pipelinemanager
2 |
3 | import (
4 | "context"
5 | "sync"
6 |
7 | "github.com/tech-engine/goscrapy/pkg/core"
8 | "golang.org/x/sync/errgroup"
9 | )
10 |
// Group is a set of pipelines that are opened, closed and fed items
// concurrently, while presenting itself as a single pipeline.
type Group[OUT any] struct {
	// nodes are the member pipelines of the group.
	nodes []IPipeline[OUT]
	// ignoreErrors makes Open and ProcessItem discard member errors.
	ignoreErrors bool
}
15 |
16 | // Create a Group which is a collection of pipelines intended to run concurrently
17 | // as opposed to sequentially. Group implements IPipeline interface, meaning it behaves
18 | // like a single pipeline.
19 | //
20 | // Common usecase: Since each pipeline in a Group runs concurrently it is not meant
21 | // for data transformation but only for data export or other similar independent tasks.
22 | // A Group must not modify
23 | func NewGroup[OUT any]() *Group[OUT] {
24 | return &Group[OUT]{
25 | nodes: make([]IPipeline[OUT], 0),
26 | }
27 | }
28 |
29 | func (g *Group[OUT]) Open(ctx context.Context) error {
30 | if g.ignoreErrors {
31 | var wg sync.WaitGroup
32 | wg.Add(len(g.nodes))
33 |
34 | for _, p := range g.nodes {
35 | go func() {
36 | defer wg.Done()
37 | p.Open(ctx)
38 | }()
39 | }
40 |
41 | wg.Wait()
42 | return nil
43 | }
44 |
45 | group, groupCtx := errgroup.WithContext(ctx)
46 | for _, p := range g.nodes {
47 | group.Go(func() error {
48 | return p.Open(groupCtx)
49 | })
50 | }
51 | return group.Wait()
52 | }
53 |
54 | func (g *Group[OUT]) Close() {
55 | var wg sync.WaitGroup
56 | wg.Add(len(g.nodes))
57 |
58 | for _, p := range g.nodes {
59 | go func() {
60 | defer wg.Done()
61 | p.Close()
62 | }()
63 | }
64 |
65 | wg.Wait()
66 | }
67 |
68 | // WithIgnoreError sets ignoreErrors = true
69 | //
70 | // When ignoreErrors = true, Group's ProcessItem & Open function will always return
71 | // a nil error.
72 | //
73 | // When ignoreErrors = false(default), Group's ProcessItem & Open will return the first non-nil
74 | // error. In addition to that context passed to Open function is also cancelled.
75 | func (g *Group[OUT]) WithIgnoreError() {
76 | g.ignoreErrors = true
77 | }
78 |
79 | func (g *Group[OUT]) Add(p ...IPipeline[OUT]) {
80 | g.nodes = append(g.nodes, p...)
81 | }
82 |
83 | func (g *Group[OUT]) ProcessItem(pi IPipelineItem, out core.IOutput[OUT]) error {
84 |
85 | if g.ignoreErrors {
86 | var wg sync.WaitGroup
87 | wg.Add(len(g.nodes))
88 |
89 | for _, p := range g.nodes {
90 | go func() {
91 | defer wg.Done()
92 | p.ProcessItem(pi, out)
93 | }()
94 | }
95 |
96 | wg.Wait()
97 | return nil
98 | }
99 |
100 | errGroup := errgroup.Group{}
101 | for _, p := range g.nodes {
102 | errGroup.Go(func() error {
103 | return p.ProcessItem(pi, out)
104 | })
105 | }
106 | return errGroup.Wait()
107 | }
108 |
--------------------------------------------------------------------------------
/pkg/pipeline_manager/options.go:
--------------------------------------------------------------------------------
1 | package pipelinemanager
2 |
3 | import (
4 | "os"
5 | "strconv"
6 |
7 | "github.com/tech-engine/goscrapy/internal/types"
8 | )
9 |
// opts holds the tunables of the pipeline manager: the size of the reusable
// item pool, the capacity of a single item, the buffer size of the output
// queue, and the maximum number of items processed concurrently.
type opts struct {
	itemPoolSize, itemSize, outputQueueBuffSize, maxProcessItemConcurrency uint64
}
13 |
14 | // Setup all the default pipelinemanger options.
15 | func defaultOpts() opts {
16 | opts := opts{}
17 | opts.itemPoolSize = PIPELINEMANAGER_ITEMPOOL_SIZE
18 | envVal, ok := os.LookupEnv("PIPELINEMANAGER_ITEMPOOL_SIZE")
19 |
20 | if ok {
21 | parsedPoolSize, err := strconv.ParseUint(envVal, 10, 64)
22 | if err == nil {
23 | opts.itemPoolSize = parsedPoolSize
24 | }
25 | }
26 |
27 | opts.itemSize = PIPELINEMANAGER_ITEM_SIZE
28 | envVal, ok = os.LookupEnv("PIPELINEMANAGER_ITEM_SIZE")
29 |
30 | if ok {
31 | parsedSize, err := strconv.ParseUint(envVal, 10, 64)
32 | if err == nil {
33 | opts.itemSize = parsedSize
34 | }
35 | }
36 |
37 | opts.outputQueueBuffSize = PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE
38 | envVal, ok = os.LookupEnv("PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE")
39 |
40 | if ok {
41 | parsedOutputBufSize, err := strconv.ParseUint(envVal, 10, 64)
42 | if err == nil {
43 | opts.outputQueueBuffSize = parsedOutputBufSize
44 | }
45 | }
46 |
47 | opts.maxProcessItemConcurrency = PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY
48 | envVal, ok = os.LookupEnv("PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY")
49 |
50 | if ok {
51 | parsedMaxItem, err := strconv.ParseUint(envVal, 10, 64)
52 | if err == nil {
53 | opts.maxProcessItemConcurrency = parsedMaxItem
54 | }
55 | }
56 |
57 | return opts
58 | }
59 |
60 | func WithItemPoolSize(val uint64) types.OptFunc[opts] {
61 | return func(opts *opts) {
62 | opts.itemPoolSize = val
63 | }
64 | }
65 |
66 | func WithItemSize(val uint64) types.OptFunc[opts] {
67 | return func(opts *opts) {
68 | opts.itemSize = val
69 | }
70 | }
71 |
72 | func WithOutputQueueSize(val uint64) types.OptFunc[opts] {
73 | return func(opts *opts) {
74 | opts.outputQueueBuffSize = val
75 | }
76 | }
77 |
78 | func WithProcessItemConcurrency(val uint64) types.OptFunc[opts] {
79 | return func(opts *opts) {
80 | opts.maxProcessItemConcurrency = val
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/pkg/pipeline_manager/pipeline_manager.go:
--------------------------------------------------------------------------------
1 | package pipelinemanager
2 |
3 | import (
4 | "context"
5 | "sync"
6 |
7 | "github.com/tech-engine/goscrapy/internal/cmap"
8 | rp "github.com/tech-engine/goscrapy/internal/resource_pool"
9 | "github.com/tech-engine/goscrapy/internal/types"
10 | "github.com/tech-engine/goscrapy/pkg/core"
11 | "golang.org/x/sync/errgroup"
12 | )
13 |
14 | type PipelineManager[OUT any] struct {
15 | opts
16 | itemPool *rp.Pooler[cmap.CMap[string, any]]
17 | outputQueue chan core.IOutput[OUT]
18 | pipelines []IPipeline[OUT]
19 | }
20 |
21 | func New[OUT any](optFuncs ...types.OptFunc[opts]) *PipelineManager[OUT] {
22 |
23 | // set default options
24 | opts := defaultOpts()
25 |
26 | // set custom options
27 | for _, fn := range optFuncs {
28 | fn(&opts)
29 | }
30 |
31 | return &PipelineManager[OUT]{
32 | opts: opts,
33 | outputQueue: make(chan core.IOutput[OUT], opts.outputQueueBuffSize),
34 | pipelines: make([]IPipeline[OUT], 0),
35 | itemPool: rp.NewPooler(rp.WithSize[cmap.CMap[string, any]](opts.itemPoolSize)),
36 | }
37 | }
38 |
39 | func (pm *PipelineManager[OUT]) Add(pipeline ...IPipeline[OUT]) {
40 | pm.pipelines = append(pm.pipelines, pipeline...)
41 | }
42 |
// Start opens all pipelines and then processes outputs from the output queue
// until ctx is cancelled.
//
// It returns immediately with ctx's error when ctx is already done, and with
// the first Open error when any pipeline fails to open. Otherwise it blocks
// until ctx is cancelled, waits for the in-flight worker goroutines, closes
// all pipelines and returns ctx.Err().
func (pm *PipelineManager[OUT]) Start(ctx context.Context) error {

	var (
		group    *errgroup.Group
		groupCtx context.Context
		err      error
	)

	if err = ctx.Err(); err != nil {
		return err
	}

	// Open every pipeline concurrently. groupCtx is cancelled as soon as one
	// of the Open calls returns an error.

	group, groupCtx = errgroup.WithContext(ctx)

	for _, pipeline := range pm.pipelines {
		group.Go(func() error {
			return pipeline.Open(groupCtx)
		})
	}

	// Return early: there is no point in processing items when a pipeline
	// could not be opened.
	// NOTE(review): pipelines that did open successfully are not closed on
	// this path, because stop is only deferred further down.
	if err = group.Wait(); err != nil {
		return err
	}

	// upon exiting, stop pipeline manager
	defer pm.stop()

	// Below we listen on the outputQueue for newly yielded outputs.

	var wg sync.WaitGroup
	// Deferred after stop, so it runs before it: workers finish first, then
	// the pipelines are closed.
	defer wg.Wait()

	// This semaphore makes sure only a fixed number of goroutines
	// are spun up to process items from the output queue.
	semaphone := make(chan struct{}, pm.opts.maxProcessItemConcurrency)

	for {
		select {
		case semaphone <- struct{}{}:

			wg.Add(1)
			go func() {

				defer wg.Done()
				// Free the semaphore slot taken for this worker.
				defer func() { <-semaphone }()

				// This select makes sure the goroutine does not stay blocked
				// waiting for items on the queue and gets a chance to exit on
				// context cancellation.
				select {
				case item := <-pm.outputQueue:
					// An item received after cancellation is dropped.
					if ctx.Err() != nil {
						return
					}
					pm.processItem(item)
				case <-ctx.Done():
					// currently not needed but we also consider closing pm.outputQueue channel in future
					return
				}

			}()

		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
114 |
115 | // Stoping manager would call the close function of every pipeline
116 | func (pm *PipelineManager[OUT]) stop() {
117 | var wg sync.WaitGroup
118 |
119 | wg.Add(len(pm.pipelines))
120 | defer wg.Wait()
121 |
122 | for _, p := range pm.pipelines {
123 | go func() {
124 | defer wg.Done()
125 | p.Close()
126 | }()
127 |
128 | }
129 | }
130 |
131 | func (pm *PipelineManager[OUT]) Push(original core.IOutput[OUT]) {
132 | if len(pm.pipelines) <= 0 {
133 | return
134 | }
135 | pm.outputQueue <- original
136 | }
137 |
138 | // Below function passes each yield output through our pipelines
139 | func (pm *PipelineManager[OUT]) processItem(original core.IOutput[OUT]) {
140 |
141 | // call sync pipelines
142 | var (
143 | pItem *cmap.CMap[string, any] // pipeline item
144 | err error
145 | )
146 |
147 | pItem = pm.itemPool.Acquire()
148 |
149 | defer func() {
150 | pItem.Clear()
151 | pm.itemPool.Release(pItem)
152 | }()
153 |
154 | if pItem == nil {
155 | pItem = cmap.NewCMap[string, any](cmap.WithSize(int(pm.itemSize)))
156 | }
157 |
158 | for _, pipeline := range pm.pipelines {
159 |
160 | // we check if pipeline is a group by checking
161 | if err = pipeline.ProcessItem(IPipelineItem(pItem), original); err != nil {
162 | return
163 | }
164 | }
165 | }
166 |
--------------------------------------------------------------------------------
/pkg/pipeline_manager/pipeline_manager_test.go:
--------------------------------------------------------------------------------
1 | package pipelinemanager
2 |
3 | import (
4 | "context"
5 | "reflect"
6 | "sync"
7 | "testing"
8 |
9 | "github.com/stretchr/testify/assert"
10 | "github.com/tech-engine/goscrapy/pkg/core"
11 | )
12 |
// safeDummyRecord stores an id/age pair behind a mutex so that it can be
// written and read from different goroutines.
type safeDummyRecord struct {
	mu  sync.Mutex
	id  int
	age int
}

// Set stores id and age under the lock.
func (s *safeDummyRecord) Set(id, age int) {
	s.mu.Lock()
	s.id, s.age = id, age
	s.mu.Unlock()
}

// GetVal returns the stored pair as [id, age].
func (s *safeDummyRecord) GetVal() [2]int {
	s.mu.Lock()
	defer s.mu.Unlock()

	var snapshot [2]int
	snapshot[0], snapshot[1] = s.id, s.age
	return snapshot
}
30 |
// dummyRecord is the record type pushed through the pipelines in these tests.
type dummyRecord struct {
	Id, Age int
}
34 |
// dummyJob is a stand-in job for tests.
type dummyJob struct {
	id string
}

// Id always returns the fixed string "dummyJob"; the id field is not consulted.
func (j *dummyJob) Id() string {
	const fixedID = "dummyJob"
	return fixedID
}
42 |
43 | func (o *dummyRecord) Record() *dummyRecord {
44 | return o
45 | }
46 |
47 | func (o *dummyRecord) RecordKeys() []string {
48 | dataType := reflect.TypeOf(*o)
49 | if dataType.Kind() != reflect.Struct {
50 | panic("Record is not a struct")
51 | }
52 |
53 | numFields := dataType.NumField()
54 | keys := make([]string, numFields)
55 |
56 | for i := 0; i < numFields; i++ {
57 | field := dataType.Field(i)
58 | csvTag := field.Tag.Get("csv")
59 | keys[i] = csvTag
60 | }
61 |
62 | return keys
63 | }
64 |
65 | func (o *dummyRecord) RecordFlat() []any {
66 |
67 | inputType := reflect.TypeOf(*o)
68 |
69 | if inputType.Kind() != reflect.Struct {
70 | panic("Record is not a struct")
71 | }
72 |
73 | inputValue := reflect.ValueOf(*o)
74 |
75 | slice := make([]any, inputType.NumField())
76 |
77 | for i := 0; i < inputType.NumField(); i++ {
78 | slice[i] = inputValue.Field(i).Interface()
79 | }
80 | return slice
81 | }
82 |
83 | func (o *dummyRecord) Job() core.IJob {
84 | return nil
85 | }
86 |
87 | // dummy pipeline 1
88 | type doublePipeline[OUT any] struct {
89 | }
90 |
91 | func newDoublePipeline[OUT any]() *doublePipeline[OUT] {
92 | return &doublePipeline[OUT]{}
93 | }
94 |
95 | func (p *doublePipeline[OUT]) Open(ctx context.Context) error {
96 | return nil
97 | }
98 |
99 | func (p *doublePipeline[OUT]) Close() {
100 | }
101 |
102 | func (p *doublePipeline[OUT]) ProcessItem(item IPipelineItem, original core.IOutput[OUT]) error {
103 | rec := original.RecordFlat()
104 | item.Set("id", rec[0])
105 | item.Set("age", rec[1].(int)*2)
106 | return nil
107 | }
108 |
109 | // dummy pipeline 2
110 | type dummyPipeline2[OUT any] struct {
111 | safeRecord safeDummyRecord
112 | }
113 |
114 | func newDummyPipeline2[OUT any]() *dummyPipeline2[OUT] {
115 | return &dummyPipeline2[OUT]{
116 | safeRecord: safeDummyRecord{},
117 | }
118 | }
119 |
120 | func (p *dummyPipeline2[OUT]) Open(ctx context.Context) error {
121 | return nil
122 | }
123 |
124 | func (p *dummyPipeline2[OUT]) Close() {
125 | }
126 |
127 | func (p *dummyPipeline2[OUT]) ProcessItem(item IPipelineItem, original core.IOutput[OUT]) error {
128 | id, _ := item.Get("id")
129 | age, _ := item.Get("age")
130 | p.safeRecord.Set(id.(int), age.(int))
131 |
132 | return nil
133 | }
134 |
135 | func TestPipelineManager(t *testing.T) {
136 | // create a pipeline manager
137 | var wg sync.WaitGroup
138 | pipelineManager := New[*dummyRecord]()
139 | // add a dummy test pipeline
140 | readPipeline := newDummyPipeline2[*dummyRecord]()
141 | pipelineManager.Add(
142 | newDoublePipeline[*dummyRecord](),
143 | readPipeline,
144 | )
145 | // start the pipeline
146 | wg.Add(1)
147 | go func() {
148 | wg.Done()
149 | pipelineManager.Start(context.Background())
150 | }()
151 | // push item to pipeline
152 | pipelineManager.Push(&dummyRecord{Id: 1, Age: 19})
153 | // verify what we pushed is what we get
154 | safeRecord := readPipeline.safeRecord.GetVal()
155 | assert.Equalf(t, 1, safeRecord[0], "expected id=1, got=%s", safeRecord[0])
156 | assert.Equalf(t, 38, safeRecord[1], "expected age=1, got=%s", safeRecord[1])
157 | wg.Wait()
158 | }
159 |
--------------------------------------------------------------------------------
/pkg/pipeline_manager/ports.go:
--------------------------------------------------------------------------------
1 | package pipelinemanager
2 |
3 | import (
4 | "context"
5 |
6 | "github.com/tech-engine/goscrapy/pkg/core"
7 | )
8 |
// IPipelineItem is the key/value item handed to every pipeline stage.
//
// It is declared in this package because the pipeline manager is the component
// that passes items to pipelines and therefore has to know about them.
type IPipelineItem interface {
	Get(key string) (any, bool)
	Set(key string, val any) error
	Del(key string)
	Keys() []any
	Clear()
}
18 |
19 | type IPipeline[OUT any] interface {
20 | Open(context.Context) error
21 | Close()
22 | ProcessItem(IPipelineItem, core.IOutput[OUT]) error
23 | }
24 |
--------------------------------------------------------------------------------
/pkg/scheduler/constants.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
// Scheduler defaults. defaultOpts uses them whenever the matching environment
// variable (SCHEDULER_REQ_RES_POOL_SIZE, SCHEDULER_CONCURRENCY,
// SCHEDULER_WORK_QUEUE_SIZE) is missing or cannot be parsed.
const (
	// SCHEDULER_DEFAULT_REQ_RES_POOL_SIZE is the default request/response pool size.
	SCHEDULER_DEFAULT_REQ_RES_POOL_SIZE uint64 = 1e6
	// SCHEDULER_DEFAULT_WORKER_MULTIPLIER is multiplied by GOMAXPROCS to get
	// the default number of workers.
	SCHEDULER_DEFAULT_WORKER_MULTIPLIER uint16 = 3
	// SCHEDULER_DEFAULT_WORK_QUEUE_SIZE is the default capacity of the work queue.
	SCHEDULER_DEFAULT_WORK_QUEUE_SIZE = 1e6
)
7 |
--------------------------------------------------------------------------------
/pkg/scheduler/options.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
3 | import (
4 | "os"
5 | "runtime"
6 | "strconv"
7 |
8 | "github.com/tech-engine/goscrapy/internal/types"
9 | )
10 |
// opts holds the tunable scheduler settings.
type opts struct {
	// numWorkers is the number of workers the scheduler starts.
	numWorkers uint16
	// reqResPoolSize sizes the request, response and scheduler-work pools.
	reqResPoolSize uint64
	// workQueueSize is the capacity of the scheduler's work queue.
	workQueueSize uint64
}
16 |
17 | func defaultOpts() opts {
18 | opts := opts{}
19 | opts.reqResPoolSize = SCHEDULER_DEFAULT_REQ_RES_POOL_SIZE
20 | value, ok := os.LookupEnv("SCHEDULER_REQ_RES_POOL_SIZE")
21 |
22 | if ok {
23 | parsedPoolSize, err := strconv.ParseUint(value, 10, 64)
24 | if err == nil {
25 | opts.reqResPoolSize = parsedPoolSize
26 | }
27 | }
28 |
29 | opts.numWorkers = uint16(runtime.GOMAXPROCS(0)) * SCHEDULER_DEFAULT_WORKER_MULTIPLIER
30 | value, ok = os.LookupEnv("SCHEDULER_CONCURRENCY")
31 |
32 | if ok {
33 | multiplier, err := strconv.ParseUint(value, 10, 16)
34 | if err == nil {
35 | opts.numWorkers = uint16(multiplier)
36 | }
37 | }
38 |
39 | opts.workQueueSize = SCHEDULER_DEFAULT_WORK_QUEUE_SIZE
40 | value, ok = os.LookupEnv("SCHEDULER_WORK_QUEUE_SIZE")
41 |
42 | if ok {
43 | workQueueSize, err := strconv.ParseUint(value, 10, 64)
44 | if err == nil {
45 | opts.workQueueSize = workQueueSize
46 | }
47 | }
48 | return opts
49 | }
50 |
51 | func WithReqResPoolSize(n uint64) types.OptFunc[opts] {
52 | return func(opts *opts) {
53 | opts.reqResPoolSize = n
54 | }
55 | }
56 |
57 | func WithWorkers(n uint16) types.OptFunc[opts] {
58 | return func(opts *opts) {
59 | opts.numWorkers = n
60 | }
61 | }
62 |
63 | func WithWorkQueueSize(n uint64) types.OptFunc[opts] {
64 | return func(opts *opts) {
65 | opts.workQueueSize = n
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/pkg/scheduler/ports.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
3 | import (
4 | "github.com/tech-engine/goscrapy/pkg/core"
5 | "github.com/tech-engine/goscrapy/pkg/engine"
6 | )
7 |
8 | // An executor must implement the IExecutor interface to be used by the scheduler.*Scheduler
9 | type IExecutor interface {
10 | Execute(core.IRequestReader, engine.IResponseWriter) error
11 | }
12 |
--------------------------------------------------------------------------------
/pkg/scheduler/request.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
3 | import (
4 | "bytes"
5 | "context"
6 | "encoding/json"
7 | "fmt"
8 | "io"
9 | "net/http"
10 | "net/url"
11 | "strings"
12 |
13 | "github.com/tech-engine/goscrapy/internal/fsm"
14 | "github.com/tech-engine/goscrapy/pkg/core"
15 | )
16 |
17 | type request struct {
18 | ctx context.Context
19 | url *url.URL
20 | method string
21 | body io.ReadCloser
22 | header http.Header
23 | meta *fsm.FixedSizeMap[string, any]
24 | cookieJarKey string
25 | }
26 |
27 | // Request inplements core.IRequestReader
28 | func (r *request) ReadMethod() string {
29 | return r.method
30 | }
31 |
32 | func (r *request) ReadUrl() *url.URL {
33 | return r.url
34 | }
35 |
36 | func (r *request) ReadHeader() http.Header {
37 | return r.header
38 | }
39 |
40 | func (r *request) ReadBody() io.ReadCloser {
41 | return r.body
42 | }
43 |
44 | func (r *request) ReadContext() context.Context {
45 | return r.ctx
46 | }
47 |
48 | // ReadMeta give us a shallow copy of meta.
49 | func (r *request) ReadMeta() *fsm.FixedSizeMap[string, any] {
50 | return r.meta
51 | }
52 |
53 | // Read the cookie jar key associated with a request
54 | func (r *request) ReadCookieJar() string {
55 | return r.cookieJarKey
56 | }
57 |
58 | // Request inplements core.IRequestWriter
59 | func (r *request) Url(_url string) core.IRequestWriter {
60 | __url, err := url.Parse(_url)
61 |
62 | if err != nil {
63 | panic(fmt.Sprintf("SetUrl: error parsing url"))
64 | }
65 |
66 | r.url = __url
67 | return r
68 | }
69 |
70 | func (r *request) Method(method string) core.IRequestWriter {
71 | r.method = strings.ToUpper(method)
72 | return r
73 | }
74 |
75 | func (r *request) Body(body any) core.IRequestWriter {
76 | switch v := body.(type) {
77 | case io.Reader:
78 | r.body = io.NopCloser(v)
79 | case io.ReadCloser:
80 | r.body = v
81 | case string:
82 | r.body = io.NopCloser(strings.NewReader(v))
83 | case []byte:
84 | r.body = io.NopCloser(bytes.NewReader(v))
85 | default:
86 | var buf *bytes.Buffer
87 | _ = json.NewEncoder(buf).Encode(v)
88 | r.body = io.NopCloser(buf)
89 | }
90 |
91 | return r
92 | }
93 |
94 | func (r *request) Header(header http.Header) core.IRequestWriter {
95 | r.header = header
96 | return r
97 | }
98 |
99 | func (r *request) CookieJar(key string) core.IRequestWriter {
100 | r.cookieJarKey = key
101 | return r
102 | }
103 |
104 | // Pass meta data as key/value pair to be available in callback response.
105 | func (r *request) Meta(key string, val any) core.IRequestWriter {
106 | if r.meta == nil {
107 | r.meta = fsm.New[string, any](24)
108 | }
109 | r.meta.Set(key, val)
110 | return r
111 | }
112 |
113 | func (r *request) WithContext(ctx context.Context) core.IRequestWriter {
114 | r.ctx = ctx
115 | return r
116 | }
117 |
118 | // func (r *request) MetaDataKey(key string) (any, bool) {
119 | // if r.meta == nil {
120 | // return nil, false
121 | // }
122 |
123 | // val, ok := r.meta[key]
124 | // return val, ok
125 | // }
126 |
127 | func (r *request) Reset() {
128 | r.method = ""
129 | r.url = nil
130 | if r.header != nil {
131 | for key := range r.header {
132 | r.header.Del(key)
133 | }
134 | }
135 | r.body = nil
136 | r.cookieJarKey = ""
137 | if r.meta != nil {
138 | r.meta.Clear()
139 | }
140 | }
141 |
--------------------------------------------------------------------------------
/pkg/scheduler/response.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
3 | import (
4 | "bytes"
5 | "io"
6 | "net/http"
7 |
8 | "github.com/tech-engine/goscrapy/internal/fsm"
9 | "github.com/tech-engine/goscrapy/pkg/core"
10 | "golang.org/x/net/html"
11 | )
12 |
13 | func NewResponse() *response {
14 | return &response{}
15 | }
16 |
17 | type response struct {
18 | statusCode int
19 | body io.ReadCloser
20 | header http.Header
21 | cookies []*http.Cookie
22 | request *http.Request
23 | meta *fsm.FixedSizeMap[string, any]
24 | nodes Selectors
25 | }
26 |
27 | // response implementing core.ResponseReader
28 | func (r *response) Request() *http.Request {
29 | return r.request
30 | }
31 |
32 | func (r *response) StatusCode() int {
33 | return r.statusCode
34 | }
35 |
36 | func (r *response) Body() io.ReadCloser {
37 | return r.body
38 | }
39 |
40 | func (r *response) Header() http.Header {
41 | return r.header
42 | }
43 |
44 | func (r *response) Cookies() []*http.Cookie {
45 | return r.cookies
46 | }
47 |
48 | func (r *response) Meta(key string) (any, bool) {
49 | return r.meta.Get(key)
50 | }
51 |
52 | func (r *response) Bytes() []byte {
53 | buff := new(bytes.Buffer)
54 | buff.ReadFrom(r.body)
55 | return buff.Bytes()
56 | }
57 |
58 | func (r *response) Reset() {
59 | r.statusCode = 0
60 | r.body = nil
61 | r.header = nil
62 | r.cookies = nil
63 | r.request = nil
64 | // because we there isn't guarantee that we will have the same pair for req-res from the pools,
65 | // we must set it meta=nil upon releasing req-res to their respective pools, otherwise we will have corrupt data.
66 | r.meta = nil
67 | r.nodes = nil
68 | }
69 |
70 | // response implementing engine.ResponseWriter
71 | func (r *response) WriteRequest(request *http.Request) {
72 | r.request = request
73 | }
74 |
75 | func (r *response) WriteHeader(header http.Header) {
76 | r.header = header
77 | }
78 |
79 | func (r *response) WriteBody(body io.ReadCloser) {
80 | r.body = body
81 | }
82 |
83 | func (r *response) WriteStatusCode(statuscode int) {
84 | r.statusCode = statuscode
85 | }
86 |
87 | func (r *response) WriteCookies(cookies []*http.Cookie) {
88 | r.cookies = cookies
89 | }
90 |
91 | func (r *response) WriteMeta(meta *fsm.FixedSizeMap[string, any]) {
92 | r.meta = meta
93 | }
94 |
95 | func (r *response) Css(selector string) core.ISelector {
96 |
97 | if r.nodes == nil {
98 | if nodes, err := NewSelector(r.body); err == nil {
99 | r.nodes = nodes
100 | }
101 | }
102 |
103 | return r.nodes.Css(selector)
104 | }
105 |
106 | func (r *response) Xpath(xpath string) core.ISelector {
107 |
108 | if r.nodes == nil {
109 | if nodes, err := NewSelector(r.body); err == nil {
110 | r.nodes = nodes
111 | }
112 | }
113 | return r.nodes.Xpath(xpath)
114 | }
115 |
116 | func (r *response) Text(def ...string) []string {
117 | return r.nodes.Text(def...)
118 | }
119 |
120 | func (r *response) Attr(attrName string) []string {
121 | return r.nodes.Attr(attrName)
122 | }
123 |
124 | func (r *response) Get() *html.Node {
125 | return r.nodes.Get()
126 | }
127 |
128 | func (r *response) GetAll() []*html.Node {
129 | return r.nodes.GetAll()
130 | }
131 |
--------------------------------------------------------------------------------
/pkg/scheduler/scheduler.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
3 | import (
4 | "context"
5 | "net/http"
6 | "sync"
7 |
8 | rp "github.com/tech-engine/goscrapy/internal/resource_pool"
9 | "github.com/tech-engine/goscrapy/internal/types"
10 | "github.com/tech-engine/goscrapy/pkg/core"
11 | )
12 |
13 | type scheduler struct {
14 | opts
15 | executor IExecutor
16 | schedulerWorkPool *rp.Pooler[schedulerWork]
17 | requestPool *rp.Pooler[request]
18 | workerQueue WorkerQueue
19 | workQueue WorkQueue
20 | }
21 |
22 | // NewScheduler creates a new scheduler.
23 | func New(executor IExecutor, optFuncs ...types.OptFunc[opts]) *scheduler {
24 |
25 | // set default options
26 | opts := defaultOpts()
27 |
28 | // set custom options
29 | for _, fn := range optFuncs {
30 | fn(&opts)
31 | }
32 |
33 | return &scheduler{
34 | opts: opts,
35 | executor: executor,
36 | schedulerWorkPool: rp.NewPooler(rp.WithSize[schedulerWork](opts.reqResPoolSize)),
37 | requestPool: rp.NewPooler(rp.WithSize[request](opts.reqResPoolSize)),
38 | workerQueue: make(WorkerQueue, opts.numWorkers),
39 | workQueue: make(WorkQueue, opts.workQueueSize),
40 | }
41 | }
42 |
43 | func (s *scheduler) WithExecutor(executor IExecutor) {
44 | s.executor = executor
45 | }
46 |
47 | // Handles creating workers and listening on the work queue
48 | func (s *scheduler) Start(ctx context.Context) error {
49 |
50 | if ctx.Err() != nil {
51 | return ctx.Err()
52 | }
53 |
54 | var (
55 | i uint16
56 | err error
57 | wg sync.WaitGroup
58 | )
59 |
60 | defer wg.Wait()
61 | wg.Add(int(s.opts.numWorkers))
62 |
63 | // this is to make sure that we close the scheduler and after that close all the workers
64 | wCtx, wCancel := context.WithCancel(context.Background())
65 |
66 | for i = 0; i < s.opts.numWorkers; i++ {
67 | go func() {
68 | defer wg.Done()
69 | worker := NewWorker(i+1, s.executor, s.workerQueue, s.schedulerWorkPool, s.requestPool, s.opts.reqResPoolSize)
70 |
71 | // blocking
72 | _ = worker.Start(wCtx)
73 | }()
74 | }
75 |
76 | // below will trigger context cancellation for the worker after scheduler is done.
77 | defer wCancel()
78 |
79 | for {
80 | select {
81 | case work := <-s.workQueue:
82 |
83 | // the below check ensures our scheduler don't pick any worker once context has been cancelled
84 | if err = ctx.Err(); err != nil {
85 | return err
86 | }
87 |
88 | wg.Add(1)
89 | go s.push(&wg, work)
90 | case <-ctx.Done():
91 | return ctx.Err()
92 | }
93 | }
94 | }
95 |
96 | func (s *scheduler) Schedule(req core.IRequestReader, next core.ResponseCallback) {
97 |
98 | work := s.schedulerWorkPool.Acquire()
99 |
100 | if work == nil {
101 | work = &schedulerWork{}
102 | }
103 |
104 | work.request = req
105 | work.next = next
106 |
107 | s.workQueue <- work
108 | }
109 |
110 | func (s *scheduler) NewRequest() core.IRequestRW {
111 | req := s.requestPool.Acquire()
112 | if req == nil {
113 | req = &request{
114 | method: "GET",
115 | header: make(http.Header),
116 | }
117 | }
118 | return req
119 | }
120 |
121 | // push a *schedulerWork unit to a worker
122 | func (s *scheduler) push(wg *sync.WaitGroup, work *schedulerWork) {
123 | defer wg.Done()
124 |
125 | // pull a worker and push a task in the worker's queue
126 | worker := <-s.workerQueue
127 | worker <- work
128 | }
129 |
--------------------------------------------------------------------------------
/pkg/scheduler/scheduler_work.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
3 | import "github.com/tech-engine/goscrapy/pkg/core"
4 |
5 | type schedulerWork struct {
6 | next core.ResponseCallback
7 | request core.IRequestReader
8 | }
9 |
10 | func (s *schedulerWork) Reset() {
11 | s.next = nil
12 | s.request = nil
13 | }
14 |
--------------------------------------------------------------------------------
/pkg/scheduler/selectors.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
3 | import (
4 | "io"
5 | "strings"
6 |
7 | "github.com/andybalholm/cascadia"
8 | "github.com/antchfx/htmlquery"
9 | "github.com/tech-engine/goscrapy/pkg/core"
10 | "golang.org/x/net/html"
11 | )
12 |
13 | type Selectors []*html.Node
14 |
15 | func NewSelector(r io.Reader) (Selectors, error) {
16 | root, err := html.Parse(r)
17 | if err != nil {
18 | return nil, err
19 | }
20 | return Selectors([]*html.Node{root}), nil
21 | }
22 |
23 | // Css selector - select element by id, class, nodename etc.
24 | func (nodes Selectors) Css(selector string) core.ISelector {
25 | sel, err := cascadia.ParseWithPseudoElement(selector)
26 | if err != nil {
27 | return Selectors([]*html.Node{})
28 | }
29 |
30 | selected := make(Selectors, 0, len(nodes))
31 | for _, node := range nodes {
32 | selected = append(selected, cascadia.QueryAll(node, sel)...)
33 | }
34 |
35 | return selected
36 | }
37 |
38 | // Xpath selector - select element using an xpath expression.
39 | func (nodes Selectors) Xpath(xpath string) core.ISelector {
40 | selected := make(Selectors, 0, len(nodes))
41 | for _, node := range nodes {
42 | matches, err := htmlquery.QueryAll(node, xpath)
43 | if err != nil {
44 | continue
45 | }
46 | selected = append(selected, matches...)
47 | }
48 | return selected
49 | }
50 |
51 | // Extracts all the text of a node and it's descendents.
52 | func (nodes Selectors) Text(def ...string) []string {
53 | texts := make([]string, 0, len(nodes))
54 | for _, node := range nodes {
55 | text := strings.TrimSpace(htmlquery.InnerText(node))
56 | if text == "" && len(def) > 0 {
57 | texts = append(texts, def[0])
58 | continue
59 | }
60 | texts = append(texts, text)
61 | }
62 | return texts
63 | }
64 |
65 | // Extracts attribute values
66 | func (nodes Selectors) Attr(attrName string) []string {
67 | attrs := make([]string, 0, len(nodes))
68 | for _, node := range nodes {
69 | for _, attr := range node.Attr {
70 | if attr.Key == attrName {
71 | attrs = append(attrs, attr.Val)
72 | }
73 | }
74 | }
75 | return attrs
76 | }
77 |
78 | // Get the first matched node
79 | func (nodes Selectors) Get() *html.Node {
80 | if len(nodes) <= 0 {
81 | return nil
82 | }
83 | return nodes[0]
84 | }
85 |
86 | // Gets all the matched nodes
87 | func (nodes Selectors) GetAll() []*html.Node {
88 | return nodes
89 | }
90 |
--------------------------------------------------------------------------------
/pkg/scheduler/selectors_test.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
3 | import (
4 | "strings"
5 | "testing"
6 |
7 | "github.com/stretchr/testify/assert"
8 | )
9 |
10 | func TestSelectors(t *testing.T) {
11 | html := `
12 |
13 |
14 |
15 |
Title
16 |
Introduction paragraph 1
17 |
Example Link
18 |
This is test paragraph
19 |
Introduction paragraph 3
20 |
21 |
22 |
23 | `
24 |
25 | selector, err := NewSelector(strings.NewReader(html))
26 |
27 | assert.NoError(t, err)
28 |
29 | cssSelector := selector.Css("p.intro")
30 |
31 | cssNodes := cssSelector.GetAll()
32 | assert.Len(t, cssNodes, 2, "expected nodes=2, got=%s", len(cssNodes))
33 |
34 | cssNodesTexts := cssSelector.Text()
35 | assert.Equal(t, "Introduction paragraph 1", cssNodesTexts[0], "expected paragraph text=Introduction paragraph 1, got=%s", cssNodesTexts[0])
36 |
37 | xpathSelector := selector.Xpath("//p[@data-mg='test']")
38 |
39 | xpathNodes := xpathSelector.GetAll()
40 | assert.Len(t, xpathNodes, 1, "expected xpath nodes=1, got=%s", len(xpathNodes))
41 |
42 | xpathNodesTexts := xpathSelector.Text()
43 | assert.Len(t, xpathNodesTexts, 1, "expected xpathNodesTexts=1, got=%s", len(xpathNodesTexts))
44 | assert.Equal(t, "Introduction paragraph 3", xpathNodesTexts[0], "expected paragraph text=Introduction paragraph 3, got=%s", xpathNodesTexts[0])
45 |
46 | attrValues := selector.Css("a").Attr("href")
47 | assert.Len(t, xpathNodesTexts, 1, "expected attrValues=1, got=%s", len(attrValues))
48 | assert.Equal(t, "http://example.com", attrValues[0], "expected href=http://example.com, got=%s", attrValues[0])
49 |
50 | noCssElements := selector.Css("p.box").GetAll()
51 | assert.Empty(t, noCssElements, "expected element=0, got=%s", len(noCssElements))
52 |
53 | noXpathElements := selector.Xpath("//p[@class='test']").GetAll()
54 | assert.Empty(t, noXpathElements, "expected element=0, got=%s", len(noXpathElements))
55 |
56 | }
57 |
--------------------------------------------------------------------------------
/pkg/scheduler/types.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
3 | type WorkQueue chan *schedulerWork
4 | type WorkerQueue chan WorkQueue
5 |
--------------------------------------------------------------------------------
/pkg/scheduler/worker.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
3 | import (
4 | "context"
5 | "io"
6 | "sync"
7 |
8 | rp "github.com/tech-engine/goscrapy/internal/resource_pool"
9 | )
10 |
11 | // Worker will handle the execution of a Work unit
12 | type Worker struct {
13 | ID uint16
14 | executor IExecutor
15 | workerQueue WorkerQueue
16 | workQueue WorkQueue
17 | schedulerWorkPool *rp.Pooler[schedulerWork]
18 | responsePool *rp.Pooler[response]
19 | requestPool *rp.Pooler[request]
20 | }
21 |
22 | func NewWorker(id uint16, executor IExecutor, workerQueue WorkerQueue, schedulerWorkPool *rp.Pooler[schedulerWork], requestPool *rp.Pooler[request], respPoolSize uint64) *Worker {
23 |
24 | return &Worker{
25 | ID: id,
26 | workerQueue: workerQueue,
27 | executor: executor,
28 | workQueue: make(WorkQueue),
29 | schedulerWorkPool: schedulerWorkPool,
30 | requestPool: requestPool,
31 | responsePool: rp.NewPooler(rp.WithSize[response](respPoolSize)),
32 | }
33 | }
34 |
35 | // Handles listen for any incoming work in workQueue
36 | func (w *Worker) Start(ctx context.Context) error {
37 | var err error
38 |
39 | if err = ctx.Err(); err != nil {
40 | return err
41 | }
42 |
43 | var wg sync.WaitGroup
44 |
45 | // we wait for all worker jobs to be completed finished/fail afer context cancellation
46 | defer wg.Wait()
47 |
48 | for {
49 |
50 | // make this worker available again
51 | w.workerQueue <- w.workQueue
52 |
53 | select {
54 | case work := <-w.workQueue:
55 |
56 | if err = ctx.Err(); err != nil {
57 | return err
58 | }
59 |
60 | wg.Add(1)
61 |
62 | // we don't want the workers to crash, so we ignore the error from execute
63 | _ = w.execute(ctx, work)
64 | wg.Done()
65 |
66 | case <-ctx.Done():
67 | return ctx.Err()
68 | }
69 | }
70 | }
71 |
72 | // Handles executing a scheduler work and calling the next callback of with the result as response
73 | func (w *Worker) execute(ctx context.Context, work *schedulerWork) error {
74 |
75 | res := w.responsePool.Acquire()
76 |
77 | if res == nil {
78 | res = &response{}
79 | }
80 |
81 | // we do some cleanup here on the response object
82 | defer func() {
83 | w.resetAndRelease(work)
84 |
85 | // discard unread body
86 | if res.body != nil {
87 | io.Copy(io.Discard, res.body)
88 | res.body.Close()
89 | }
90 |
91 | res.Reset()
92 | w.responsePool.Release(res)
93 | }()
94 |
95 | if err := w.executor.Execute(work.request, res); err != nil {
96 | // resetAndRelease(work)
97 | return err
98 | }
99 |
100 | next := (*work).next
101 | pCtx := work.request.ReadContext()
102 |
103 | // next==nil means this is the last callback of the spider
104 | if next == nil {
105 | return nil
106 | }
107 |
108 | // call to callback must me blocking so that the callback can read from the response
109 | // before the response is resetted and returned to pool
110 | if pCtx == nil {
111 | pCtx = context.Background()
112 | }
113 |
114 | // we copy meta from our request to our response to be accessible to the spider
115 | res.WriteMeta(work.request.ReadMeta())
116 |
117 | next(context.WithValue(pCtx, "WORKER_ID", w.ID), res)
118 | return nil
119 | }
120 |
121 | func (w *Worker) resetAndRelease(work *schedulerWork) {
122 | // release *request to pool
123 | req, ok := work.request.(*request)
124 |
125 | if !ok {
126 | return
127 | }
128 |
129 | req.Reset()
130 |
131 | w.requestPool.Release(req)
132 |
133 | // release *schedulerWork to pool
134 | work.Reset()
135 |
136 | w.schedulerWorkPool.Release(work)
137 | }
138 |
--------------------------------------------------------------------------------
/pkg/scheduler/worker_test.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
3 | import (
4 | "context"
5 | "net/http"
6 | "testing"
7 |
8 | rp "github.com/tech-engine/goscrapy/internal/resource_pool"
9 | "github.com/tech-engine/goscrapy/pkg/core"
10 | "github.com/tech-engine/goscrapy/pkg/engine"
11 | )
12 |
13 | type dummyExecutor struct {
14 | }
15 |
16 | func (e *dummyExecutor) Execute(reader core.IRequestReader, writer engine.IResponseWriter) error {
17 | return nil
18 | }
19 |
20 | func TestWorker(t *testing.T) {
21 | // create a worker
22 | var workerId uint16 = 1
23 | var respPoolSize uint64 = 1
24 |
25 | executor := &dummyExecutor{}
26 | workerQueue := make(WorkerQueue, 1)
27 | schedulerWorkPool := rp.NewPooler(rp.WithSize[schedulerWork](1))
28 | requestPool := rp.NewPooler(rp.WithSize[request](1))
29 |
30 | worker := NewWorker(
31 | workerId,
32 | executor,
33 | workerQueue,
34 | schedulerWorkPool,
35 | requestPool,
36 | respPoolSize,
37 | )
38 |
39 | ctx, cancel := context.WithCancel(context.Background())
40 |
41 | // start the worker
42 | go func() {
43 | worker.Start(ctx)
44 | }()
45 |
46 | // create a scheduler work
47 | work := &schedulerWork{
48 | next: func(ctx context.Context, resp core.IResponseReader) {
49 | },
50 | request: &request{
51 | method: "GET",
52 | header: make(http.Header),
53 | },
54 | }
55 | // execute a task
56 | worker.execute(ctx, work)
57 | cancel()
58 | }
59 |
--------------------------------------------------------------------------------