├── .github └── workflows │ └── tests.yml ├── .gitignore ├── LICENSE ├── README.md ├── _examples ├── books.toscrape.com │ ├── books_to_scrape │ │ ├── constants.go │ │ ├── errors.go │ │ ├── job.go │ │ ├── record.go │ │ ├── settings.go │ │ └── spider.go │ ├── go.mod │ ├── go.sum │ ├── itstimeitsnowornever.csv │ └── main.go ├── scrapejsp │ ├── go.mod │ ├── go.sum │ ├── main.go │ ├── scrapejsp │ │ ├── constants.go │ │ ├── errors.go │ │ ├── job.go │ │ ├── record.go │ │ ├── settings.go │ │ └── spider.go │ └── utils │ │ └── helper.go └── scrapejsp_method2 │ ├── go.mod │ ├── go.sum │ ├── itstimeitsnowornever.csv │ ├── main.go │ └── scrapejsp │ ├── constants.go │ ├── errors.go │ ├── job.go │ ├── record.go │ ├── settings.go │ └── spider.go ├── assets ├── demo.gif └── logo.webp ├── cmd ├── cli │ ├── cli.go │ ├── pipeline.go │ ├── startproject.go │ └── templates │ │ ├── constants.tmpl │ │ ├── errors.tmpl │ │ ├── job.tmpl │ │ ├── main.tmpl │ │ ├── pipeline.tmpl │ │ ├── record.tmpl │ │ ├── settings.tmpl │ │ └── spider.tmpl └── gos │ ├── client.go │ ├── constants.go │ ├── gos.go │ ├── ports.go │ └── types.go ├── go.mod ├── go.sum ├── internal ├── cmap │ ├── cmap.go │ ├── cmap_test.go │ ├── cmaph.go │ └── types.go ├── fsm │ ├── fsm.go │ └── fsm_test.go ├── resource_pool │ ├── pool_builder.go │ └── resource_pool.go └── types │ └── option.go ├── main.go └── pkg ├── builtin ├── middlewares │ ├── dupefilter.go │ ├── dupefilter_test.go │ ├── multi_cookiejar.go │ ├── multi_cookiejar_test.go │ ├── retry.go │ └── retry_test.go └── pipelines │ ├── dummy.go │ ├── export_to_csv.go │ ├── export_to_csv_test.go │ ├── export_to_firebase.go │ ├── export_to_gsheet.go │ ├── export_to_json.go │ ├── export_to_json_test.go │ ├── export_to_mongodb.go │ └── type.go ├── core ├── core.go └── ports.go ├── engine ├── engine.go └── ports.go ├── executor ├── executor.go └── ports.go ├── executor_adapters └── http_native │ ├── adapter.go │ ├── adapter_test.go │ └── helper.go ├── middlewaremanager └── 
middlewaremanager.go ├── pipeline_manager ├── constants.go ├── group.go ├── options.go ├── pipeline_manager.go ├── pipeline_manager_test.go └── ports.go └── scheduler ├── constants.go ├── options.go ├── ports.go ├── request.go ├── response.go ├── scheduler.go ├── scheduler_work.go ├── selectors.go ├── selectors_test.go ├── types.go ├── worker.go └── worker_test.go /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Test on Pull Request 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'main' 7 | pull_request: 8 | branches: 9 | - 'main' 10 | 11 | jobs: 12 | test: 13 | name: Run Go Tests 14 | runs-on: ubuntu-latest 15 | 16 | strategy: 17 | matrix: 18 | go-version: ['1.22'] 19 | 20 | steps: 21 | - name: Checkout code 22 | uses: actions/checkout@v3 23 | 24 | - name: Set up Go 25 | uses: actions/setup-go@v4 26 | with: 27 | go-version: ${{ matrix.go-version }} 28 | 29 | - name: Run tests 30 | run: go test -race -v ./... 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | *.psd 11 | 12 | # Test binary, built with `go test -c` 13 | *.test 14 | 15 | # Output of the go coverage tool, specifically when used with LiteIDE 16 | *.out 17 | 18 | # Dependency directories (remove the comment below to include it) 19 | # vendor/ 20 | 21 | # Go workspace file 22 | go.work -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | License text copyright (c) 2020 MariaDB Corporation Ab, All Rights 
Reserved. 2 | “Business Source License” is a trademark of MariaDB Corporation Ab. 3 | 4 | Parameters 5 | 6 | Licensor: Techengine(goscrapy dot dev AT gmail dot com). 7 | Licensed Work: Goscrapy. 8 | Additional Use Grant: You may make production use of the Licensed Work, 9 | provided such use does not include offering the Licensed Work 10 | to third parties on a hosted or embedded basis which is 11 | competitive with any services offered or will be offered 12 | in future by the licensor. 13 | 14 | For information about alternative licensing arrangements for the Licensed Work, 15 | please contact goscrapy dot dev AT gmail dot com. 16 | 17 | Notice 18 | 19 | Business Source License 1.1 20 | 21 | Terms 22 | 23 | The Licensor hereby grants you the right to copy, modify, create derivative 24 | works, redistribute, and make non-production use of the Licensed Work. The 25 | Licensor may make an Additional Use Grant, above, permitting limited production use. 26 | 27 | Effective on the Change Date, or the fourth anniversary of the first publicly 28 | available distribution of a specific version of the Licensed Work under this 29 | License, whichever comes first, the Licensor hereby grants you rights under 30 | the terms of the Change License, and the rights granted in the paragraph 31 | above terminate. 32 | 33 | If your use of the Licensed Work does not comply with the requirements 34 | currently in effect as described in this License, you must purchase a 35 | commercial license from the Licensor, its affiliated entities, or authorized 36 | resellers, or you must refrain from using the Licensed Work. 37 | 38 | All copies of the original and modified Licensed Work, and derivative works 39 | of the Licensed Work, are subject to this License. This License applies 40 | separately for each version of the Licensed Work and the Change Date may vary 41 | for each version of the Licensed Work released by Licensor. 
42 | 43 | You must conspicuously display this License on each original or modified copy 44 | of the Licensed Work. If you receive the Licensed Work in original or 45 | modified form from a third party, the terms and conditions set forth in this 46 | License apply to your use of that work. 47 | 48 | Any use of the Licensed Work in violation of this License will automatically 49 | terminate your rights under this License for the current and all other 50 | versions of the Licensed Work. 51 | 52 | This License does not grant you any right in any trademark or logo of 53 | Licensor or its affiliates (provided that you may use a trademark or logo of 54 | Licensor as expressly required by this License). 55 | 56 | TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON 57 | AN “AS IS” BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, 58 | EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF 59 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND 60 | TITLE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GoScrapy: Web Scraping Framework in Go 2 | [![Alt Text](https://goreportcard.com/badge/github.com/tech-engine/goscrapy)](https://github.com/tech-engine/goscrapy) 3 |

4 | 5 |

6 | 7 | **GoScrapy** aims to be a powerful web scraping framework in Go, inspired by Python's Scrapy framework. It offers an easy-to-use Scrapy-like experience for extracting data from websites, making it an ideal tool for various data collection and analysis tasks, especially for those coming from Python and wanting to try scraping in Golang. 8 | 9 | ## Getting Started 10 | 11 | GoScrapy requires **Go version 1.22** or higher to run. 12 | 13 | ### 1. Project Initialization 14 | 15 | ```sh 16 | go mod init books_to_scrape 17 | ``` 18 | 19 | ### 2. Install goscrapy cli 20 | 21 | ```sh 22 | go install github.com/tech-engine/goscrapy@latest 23 | ``` 24 | **Note**: make sure to always keep your goscrapy cli updated. 25 | 26 | ### 3. Verify Installation 27 | 28 | ```sh 29 | goscrapy -v 30 | ``` 31 | ### 4. Create a New Project 32 | 33 | ```sh 34 | goscrapy startproject books_to_scrape 35 | ``` 36 | This will create a new project directory with all the files necessary to begin working with **GoScrapy**. 37 | 38 | ```sh 39 | \iyuioy\go\go-test-scrapy> goscrapy startproject books_to_scrape 40 | 41 | 🚀 GoScrapy generating project files. Please wait! 42 | 43 | ✔️ books_to_scrape\constants.go 44 | ✔️ books_to_scrape\errors.go 45 | ✔️ books_to_scrape\job.go 46 | ✔️ main.go 47 | ✔️ books_to_scrape\record.go 48 | ✔️ books_to_scrape\spider.go 49 | 50 | ✨ Congrates. books_to_scrape created successfully. 51 | ``` 52 | 53 | ### spider.go 54 | In your __`spider.go`__ file, set up and execute your spider. 55 | 56 | For detailed code, please refer to the [sample code here](./_examples/scrapejsp_method2/scrapejsp/spider.go). 
57 | 58 | ```go 59 | package scrapejsp 60 | 61 | import ( 62 | "context" 63 | "encoding/json" 64 | "fmt" 65 | "log" 66 | 67 | "github.com/tech-engine/goscrapy/cmd/gos" 68 | "github.com/tech-engine/goscrapy/pkg/core" 69 | ) 70 | 71 | type Spider struct { 72 | gos.ICoreSpider[*Record] 73 | } 74 | 75 | func NewSpider(ctx context.Context) (*Spider, <-chan error) { 76 | 77 | // use proxies 78 | // proxies := core.WithProxies("proxy_url1", "proxy_url2", ...) 79 | // core := gos.New[*Record]().WithClient( 80 | // gos.DefaultClient(proxies), 81 | // ) 82 | 83 | core := gos.New[*Record]() 84 | 85 | // Add middlewares 86 | core.MiddlewareManager.Add(MIDDLEWARES...) 87 | // Add pipelines 88 | core.PipelineManager.Add(PIPELINES...) 89 | 90 | errCh := make(chan error) 91 | 92 | go func() { 93 | errCh <- core.Start(ctx) 94 | }() 95 | 96 | return &Spider{ 97 | core, 98 | }, errCh 99 | } 100 | 101 | // This is the entrypoint to the spider 102 | func (s *Spider) StartRequest(ctx context.Context, job *Job) { 103 | 104 | req := s.NewRequest() 105 | // req.Meta("JOB", job) 106 | req.Url("https://jsonplaceholder.typicode.com/todos/1") 107 | 108 | s.Request(req, s.parse) 109 | } 110 | 111 | func (s *Spider) Close(ctx context.Context) { 112 | } 113 | 114 | func (s *Spider) parse(ctx context.Context, resp core.IResponseReader) { 115 | fmt.Printf("status: %d", resp.StatusCode()) 116 | 117 | var data Record 118 | err := json.Unmarshal(resp.Bytes(), &data) 119 | if err != nil { 120 | log.Fatalln(err) 121 | } 122 | 123 | // to push to pipelines 124 | s.Yield(&data) 125 | } 126 | ``` 127 | 128 |

129 | 130 |

131 | 132 | ## Wiki 133 | Please follow the [wiki](https://github.com/tech-engine/goscrapy/wiki) docs for details. 134 | 135 | ### Note 136 | 137 | **GoScrapy** is not stable, so its API may change drastically. Please exercise caution when using it in production. 138 | 139 | ## License 140 | 141 | **GoScrapy** is available under the BSL with an additional usage grant that allows free internal use. Please ensure that you agree with the license before contributing to **GoScrapy**, as by contributing to the GoScrapy project, you agree to the terms of the license. 142 | 143 | ## Roadmap 144 | 145 | - ~~Cookie management~~ 146 | - ~~Builtin & Custom Middlewares support~~ 147 | - ~~CSS & XPath Selectors~~ 148 | - Logging 149 | - Triggers 150 | - Tests (work in progress) 151 | 152 | ## Partners 153 | 154 | 155 | 156 | 157 | 158 | ## Get in touch 159 | [Discord](https://discord.gg/FPvxETjYPH) 160 | -------------------------------------------------------------------------------- /_examples/books.toscrape.com/books_to_scrape/constants.go: -------------------------------------------------------------------------------- 1 | package books_to_scrape 2 | 3 | // you can define your constants here 4 | -------------------------------------------------------------------------------- /_examples/books.toscrape.com/books_to_scrape/errors.go: -------------------------------------------------------------------------------- 1 | package books_to_scrape 2 | 3 | // you can define your errors here 4 | -------------------------------------------------------------------------------- /_examples/books.toscrape.com/books_to_scrape/job.go: -------------------------------------------------------------------------------- 1 | package books_to_scrape 2 | 3 | // id field is compulsory in a Job definition. 
You can add your custom to Job 4 | type Job struct { 5 | id string 6 | } 7 | 8 | // do not delete/edit 9 | func NewJob(id string) *Job { 10 | return &Job{ 11 | id: id, 12 | } 13 | } 14 | 15 | // do not delete/edit 16 | func (j *Job) Id() string { 17 | return j.id 18 | } 19 | 20 | // do not delete 21 | func (j *Job) Reset() { 22 | j.id = "" 23 | } 24 | 25 | // add your custom receiver functions below 26 | -------------------------------------------------------------------------------- /_examples/books.toscrape.com/books_to_scrape/record.go: -------------------------------------------------------------------------------- 1 | package books_to_scrape 2 | 3 | import ( 4 | "reflect" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/core" 7 | ) 8 | 9 | /* 10 | json and csv struct field tags are required, if you want the Record to be exported 11 | or processed by builtin pipelines 12 | */ 13 | 14 | type Record struct { 15 | J *Job `json:"-" csv:"-"` // JobId is required 16 | // add you custom fields here 17 | Title string `json:"title" csv:"title"` 18 | Price string `json:"price" csv:"price"` 19 | Stock string `json:"stock" csv:"stock"` 20 | Rating string `json:"rating" csv:"rating"` 21 | Description string `json:"description" csv:"description"` 22 | Upc string `json:"upc" csv:"upc"` 23 | ProductType string `json:"product_type" csv:"product_type"` 24 | Reviews string `json:"reviews" csv:"reviews"` 25 | } 26 | 27 | // modify below code only if you know what you are doing 28 | func (r *Record) Record() *Record { 29 | return r 30 | } 31 | 32 | func (r *Record) RecordKeys() []string { 33 | dataType := reflect.TypeOf(*r) 34 | if dataType.Kind() != reflect.Struct { 35 | panic("Record is not a struct") 36 | } 37 | 38 | numFields := dataType.NumField() 39 | keys := make([]string, numFields) 40 | 41 | for i := 0; i < numFields; i++ { 42 | field := dataType.Field(i) 43 | csvTag := field.Tag.Get("csv") 44 | keys[i] = csvTag 45 | } 46 | 47 | return keys 48 | } 49 | 50 | func (r *Record) 
RecordFlat() []any { 51 | 52 | inputType := reflect.TypeOf(*r) 53 | 54 | if inputType.Kind() != reflect.Struct { 55 | panic("Record is not a struct") 56 | } 57 | 58 | inputValue := reflect.ValueOf(*r) 59 | 60 | slice := make([]any, inputType.NumField()) 61 | 62 | for i := 0; i < inputType.NumField(); i++ { 63 | slice[i] = inputValue.Field(i).Interface() 64 | } 65 | return slice 66 | } 67 | 68 | func (r *Record) Job() core.IJob { 69 | return r.J 70 | } 71 | -------------------------------------------------------------------------------- /_examples/books.toscrape.com/books_to_scrape/settings.go: -------------------------------------------------------------------------------- 1 | package books_to_scrape 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/builtin/middlewares" 7 | "github.com/tech-engine/goscrapy/pkg/builtin/pipelines" 8 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager" 9 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager" 10 | ) 11 | 12 | // HTTP Transport settings 13 | 14 | // Default: 10000 15 | const MIDDLEWARE_HTTP_TIMEOUT_MS = "" 16 | 17 | // Default: 100 18 | const MIDDLEWARE_HTTP_MAX_IDLE_CONN = "" 19 | 20 | // Default: 100 21 | const MIDDLEWARE_HTTP_MAX_CONN_PER_HOST = "" 22 | 23 | // Default: 100 24 | const MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST = "" 25 | 26 | // Inbuilt Retry middleware settings 27 | 28 | // Default: 3 29 | const MIDDLEWARE_HTTP_RETRY_MAX_RETRIES = "" 30 | 31 | // Default: 500, 502, 503, 504, 522, 524, 408, 429 32 | const MIDDLEWARE_HTTP_RETRY_CODES = "" 33 | 34 | // Default: 1s 35 | const MIDDLEWARE_HTTP_RETRY_BASE_DELAY = "" 36 | 37 | // Default: 1000000 38 | const SCHEDULER_REQ_RES_POOL_SIZE = "" 39 | 40 | // Default: num of CPU * 3 41 | const SCHEDULER_CONCURRENCY = "" 42 | 43 | // Default: 1000000 44 | const SCHEDULER_WORK_QUEUE_SIZE = "" 45 | 46 | // Pipeline Manager settings 47 | 48 | // Default: 10000 49 | const PIPELINEMANAGER_ITEMPOOL_SIZE = "" 50 | 51 | // Default: 24 52 | const 
PIPELINEMANAGER_ITEM_SIZE = "" 53 | 54 | // Default: 0 55 | const PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE = "" 56 | 57 | // Default: 1000 58 | const PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY = "" 59 | 60 | // Middlewares here 61 | // Executed in reverse order from bottom to top. 62 | var MIDDLEWARES = []middlewaremanager.Middleware{ 63 | middlewares.Retry(), 64 | middlewares.MultiCookieJar, 65 | middlewares.DupeFilter, 66 | } 67 | 68 | var export2CSV = pipelines.Export2CSV[*Record](pipelines.Export2CSVOpts{ 69 | Filename: "itstimeitsnowornever.csv", 70 | }) 71 | 72 | // use export 2 json pipeline 73 | // var export2Json = pipelines.Export2JSON[*Record](pipelines.Export2JSONOpts{ 74 | // Filename: "itstimeitsnowornever.json", 75 | // Immediate: true, 76 | // }) 77 | 78 | // add pipeline to group 79 | //func myCustomPipelineGroup() *pm.Group[*Record] { 80 | // pipelineGroup := pm.NewGroup[*Record]() 81 | // pipelineGroup.Add(export2CSV) 82 | // // pipelineGroup.Add(export2Json) 83 | // return pipelineGroup 84 | //} 85 | 86 | // Pipelines here 87 | // Executed in the order they appear. 
88 | var PIPELINES = []pm.IPipeline[*Record]{ 89 | export2CSV, 90 | // export2Json, 91 | // myCustomPipelineGroup(), 92 | } 93 | 94 | func init() { 95 | var settings = map[string]string{ 96 | "MIDDLEWARE_HTTP_TIMEOUT_MS": MIDDLEWARE_HTTP_TIMEOUT_MS, 97 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN": MIDDLEWARE_HTTP_MAX_IDLE_CONN, 98 | "MIDDLEWARE_HTTP_MAX_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_CONN_PER_HOST, 99 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST, 100 | "MIDDLEWARE_HTTP_RETRY_MAX_RETRIES": MIDDLEWARE_HTTP_RETRY_MAX_RETRIES, 101 | "MIDDLEWARE_HTTP_RETRY_CODES": MIDDLEWARE_HTTP_RETRY_CODES, 102 | "MIDDLEWARE_HTTP_RETRY_BASE_DELAY": MIDDLEWARE_HTTP_RETRY_BASE_DELAY, 103 | "SCHEDULER_REQ_RES_POOL_SIZE": SCHEDULER_REQ_RES_POOL_SIZE, 104 | "SCHEDULER_CONCURRENCY": SCHEDULER_CONCURRENCY, 105 | "SCHEDULER_WORK_QUEUE_SIZE": SCHEDULER_WORK_QUEUE_SIZE, 106 | "PIPELINEMANAGER_ITEMPOOL_SIZE": PIPELINEMANAGER_ITEMPOOL_SIZE, 107 | "PIPELINEMANAGER_ITEM_SIZE": PIPELINEMANAGER_ITEM_SIZE, 108 | "PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE": PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE, 109 | "PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY": PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY, 110 | } 111 | 112 | for key, value := range settings { 113 | if value != "" { 114 | os.Setenv(key, value) 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /_examples/books.toscrape.com/books_to_scrape/spider.go: -------------------------------------------------------------------------------- 1 | package books_to_scrape 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "regexp" 7 | "strings" 8 | 9 | "github.com/tech-engine/goscrapy/cmd/gos" 10 | "github.com/tech-engine/goscrapy/pkg/core" 11 | ) 12 | 13 | type Spider struct { 14 | gos.ICoreSpider[*Record] 15 | baseUrl string 16 | } 17 | 18 | func New(ctx context.Context) (*Spider, <-chan error) { 19 | 20 | // use proxies 21 | // proxies := core.WithProxies("proxy_url1", 
"proxy_url2", ...) 22 | // core := gos.New[*Record]().WithClient( 23 | // gos.DefaultClient(proxies), 24 | // ) 25 | 26 | core := gos.New[*Record]() 27 | 28 | // Add middlewares 29 | core.MiddlewareManager.Add(MIDDLEWARES...) 30 | // Add pipelines 31 | core.PipelineManager.Add(PIPELINES...) 32 | 33 | errCh := make(chan error) 34 | 35 | spider := &Spider{ 36 | core, 37 | "https://books.toscrape.com", 38 | } 39 | 40 | go func() { 41 | errCh <- core.Start(ctx) 42 | spider.Close(ctx) 43 | }() 44 | 45 | return spider, errCh 46 | } 47 | 48 | func (s *Spider) StartRequest(ctx context.Context, job *Job) { 49 | 50 | // for each request we must call NewRequest() and never reuse it 51 | req := s.NewRequest() 52 | 53 | // GET is the request method 54 | req.Url(s.baseUrl) 55 | 56 | s.Request(req, s.parse) 57 | } 58 | 59 | // can be called when spider is about to close 60 | func (s *Spider) Close(ctx context.Context) { 61 | fmt.Println("closing") 62 | } 63 | 64 | func (s *Spider) parse(ctx context.Context, resp core.IResponseReader) { 65 | fmt.Printf("GET: %d %s\n", resp.StatusCode(), resp.Request().URL.String()) 66 | for _, productUrl := range resp.Css("article.product_pod h3 a").Attr("href") { 67 | req := s.NewRequest() 68 | 69 | if strings.HasPrefix(productUrl, "catalogue/") { 70 | productUrl = fmt.Sprintf("%s/%s", s.baseUrl, productUrl) 71 | } else { 72 | productUrl = fmt.Sprintf("%s/catalogue/%s", s.baseUrl, productUrl) 73 | } 74 | 75 | req.Url(productUrl) 76 | s.Request(req, s.parseProduct) 77 | fmt.Printf("GET: %s\n", productUrl) 78 | } 79 | 80 | // pagination 81 | nextUrls := resp.Css("li.next a").Attr("href") 82 | 83 | if len(nextUrls) <= 0 { 84 | return 85 | } 86 | 87 | nextUrl := fmt.Sprintf("%s/%s", s.baseUrl, nextUrls[0]) 88 | 89 | if !strings.HasPrefix(nextUrls[0], "catalogue/") { 90 | nextUrl = fmt.Sprintf("%s/catalogue/%s", s.baseUrl, nextUrls[0]) 91 | } 92 | 93 | req := s.NewRequest() 94 | req.Url(nextUrl) 95 | s.Request(req, s.parse) 96 | } 97 | 98 | func (s 
*Spider) parseProduct(ctx context.Context, resp core.IResponseReader) { 99 | product := resp.Css("article.product_page") 100 | 101 | var title string 102 | if titles := product.Css(".product_main h1").Text(); len(titles) > 0 { 103 | title = titles[0] 104 | } 105 | 106 | var price string 107 | if prices := product.Css(".price_color").Text(); len(prices) > 0 { 108 | price = prices[0] 109 | } 110 | 111 | var stock string 112 | if stocks := product.Css(".availability").Text(); len(stocks) > 0 { 113 | match := regexp.MustCompile(`\((\d+) available\)`).FindStringSubmatch(strings.TrimSpace(stocks[0])) 114 | 115 | if len(match) > 0 { 116 | stock = match[1] 117 | } 118 | } 119 | 120 | var rating string 121 | if ratingClassAttrs := product.Css(".star-rating").Attr("class"); len(ratingClassAttrs) > 0 { 122 | rating = strings.Split(ratingClassAttrs[0], " ")[1] 123 | 124 | } 125 | 126 | var productDescription string 127 | if productDescriptions := product.Css("#product_description + *").Text(); len(productDescriptions) > 0 { 128 | productDescription = productDescriptions[0] 129 | } 130 | 131 | var upc string 132 | if upcs := product.Css("table tr:nth-child(1) td").Text(); len(upcs) > 0 { 133 | upc = upcs[0] 134 | } 135 | 136 | var productType string 137 | if productTypes := product.Css("table tr:nth-child(2) td").Text(); len(productTypes) > 0 { 138 | productType = productTypes[0] 139 | } 140 | 141 | var reviewCount string 142 | if reviewCounts := product.Css("table tr:nth-child(7) td").Text(); len(reviewCounts) > 0 { 143 | reviewCount = reviewCounts[0] 144 | } 145 | 146 | s.Yield(&Record{ 147 | Title: title, 148 | Price: price, 149 | Stock: stock, 150 | Rating: rating, 151 | Description: productDescription, 152 | Upc: upc, 153 | ProductType: productType, 154 | Reviews: reviewCount, 155 | }) 156 | } 157 | -------------------------------------------------------------------------------- /_examples/books.toscrape.com/go.mod: 
-------------------------------------------------------------------------------- 1 | module books_to_scrape 2 | 3 | go 1.21.0 4 | 5 | require github.com/tech-engine/goscrapy v0.13.1 6 | 7 | require ( 8 | cloud.google.com/go v0.110.6 // indirect 9 | cloud.google.com/go/compute v1.23.0 // indirect 10 | cloud.google.com/go/compute/metadata v0.2.3 // indirect 11 | cloud.google.com/go/firestore v1.13.0 // indirect 12 | cloud.google.com/go/iam v1.1.1 // indirect 13 | cloud.google.com/go/longrunning v0.5.1 // indirect 14 | cloud.google.com/go/storage v1.33.0 // indirect 15 | firebase.google.com/go v3.13.0+incompatible // indirect 16 | github.com/andybalholm/cascadia v1.3.2 // indirect 17 | github.com/antchfx/htmlquery v1.3.2 // indirect 18 | github.com/antchfx/xpath v1.3.1 // indirect 19 | github.com/gocarina/gocsv v0.0.0-20230616125104-99d496ca653d // indirect 20 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 21 | github.com/golang/protobuf v1.5.3 // indirect 22 | github.com/golang/snappy v0.0.4 // indirect 23 | github.com/google/go-cmp v0.5.9 // indirect 24 | github.com/google/s2a-go v0.1.4 // indirect 25 | github.com/google/uuid v1.3.0 // indirect 26 | github.com/googleapis/enterprise-certificate-proxy v0.2.5 // indirect 27 | github.com/googleapis/gax-go/v2 v2.12.0 // indirect 28 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 29 | github.com/klauspost/compress v1.13.6 // indirect 30 | github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe // indirect 31 | github.com/segmentio/fasthash v1.0.3 // indirect 32 | github.com/spf13/cobra v1.8.1 // indirect 33 | github.com/spf13/pflag v1.0.5 // indirect 34 | github.com/xdg-go/pbkdf2 v1.0.0 // indirect 35 | github.com/xdg-go/scram v1.1.2 // indirect 36 | github.com/xdg-go/stringprep v1.0.4 // indirect 37 | github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect 38 | go.mongodb.org/mongo-driver v1.12.1 // indirect 39 | go.opencensus.io v0.24.0 // indirect 40 | 
golang.org/x/crypto v0.21.0 // indirect 41 | golang.org/x/net v0.23.0 // indirect 42 | golang.org/x/oauth2 v0.11.0 // indirect 43 | golang.org/x/sync v0.3.0 // indirect 44 | golang.org/x/sys v0.18.0 // indirect 45 | golang.org/x/text v0.14.0 // indirect 46 | golang.org/x/time v0.3.0 // indirect 47 | golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect 48 | google.golang.org/api v0.136.0 // indirect 49 | google.golang.org/appengine v1.6.7 // indirect 50 | google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 // indirect 51 | google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5 // indirect 52 | google.golang.org/genproto/googleapis/rpc v0.0.0-20230807174057-1744710a1577 // indirect 53 | google.golang.org/grpc v1.57.1 // indirect 54 | google.golang.org/protobuf v1.33.0 // indirect 55 | ) 56 | -------------------------------------------------------------------------------- /_examples/books.toscrape.com/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "os" 8 | "os/signal" 9 | "sync" 10 | "syscall" 11 | 12 | // replace with your own project name 13 | "books_to_scrape/books_to_scrape" 14 | ) 15 | 16 | // sample terminate function to demostrate spider termination. 
17 | func OnTerminate(fn func()) { 18 | ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGINT, syscall.SIGTERM) 19 | <-ctx.Done() 20 | stop() 21 | fn() 22 | } 23 | 24 | func main() { 25 | ctx, cancel := context.WithCancel(context.Background()) 26 | 27 | var wg sync.WaitGroup 28 | wg.Add(1) 29 | 30 | spider, errCh := books_to_scrape.New(ctx) 31 | go func() { 32 | defer wg.Done() 33 | 34 | err := <-errCh 35 | 36 | if err != nil && errors.Is(err, context.Canceled) { 37 | return 38 | } 39 | 40 | fmt.Printf("failed: %q", err) 41 | }() 42 | 43 | // start the scraper with a job, currently nil is passed but you can pass your job here 44 | spider.StartRequest(ctx, nil) 45 | 46 | OnTerminate(func() { 47 | fmt.Println("exit signal received: shutting down gracefully") 48 | cancel() 49 | wg.Wait() 50 | }) 51 | 52 | } 53 | -------------------------------------------------------------------------------- /_examples/scrapejsp/go.mod: -------------------------------------------------------------------------------- 1 | module scrapejsp 2 | 3 | go 1.21.0 4 | 5 | require github.com/tech-engine/goscrapy v0.12.0 6 | 7 | require ( 8 | cloud.google.com/go v0.110.6 // indirect 9 | cloud.google.com/go/compute v1.23.0 // indirect 10 | cloud.google.com/go/compute/metadata v0.2.3 // indirect 11 | cloud.google.com/go/firestore v1.13.0 // indirect 12 | cloud.google.com/go/iam v1.1.1 // indirect 13 | cloud.google.com/go/longrunning v0.5.1 // indirect 14 | cloud.google.com/go/storage v1.33.0 // indirect 15 | firebase.google.com/go v3.13.0+incompatible // indirect 16 | github.com/gocarina/gocsv v0.0.0-20230616125104-99d496ca653d // indirect 17 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 18 | github.com/golang/protobuf v1.5.3 // indirect 19 | github.com/golang/snappy v0.0.4 // indirect 20 | github.com/google/go-cmp v0.5.9 // indirect 21 | github.com/google/s2a-go v0.1.4 // indirect 22 | github.com/google/uuid v1.3.0 // indirect 23 | 
github.com/googleapis/enterprise-certificate-proxy v0.2.5 // indirect 24 | github.com/googleapis/gax-go/v2 v2.12.0 // indirect 25 | github.com/klauspost/compress v1.13.6 // indirect 26 | github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe // indirect 27 | github.com/segmentio/fasthash v1.0.3 // indirect 28 | github.com/xdg-go/pbkdf2 v1.0.0 // indirect 29 | github.com/xdg-go/scram v1.1.2 // indirect 30 | github.com/xdg-go/stringprep v1.0.4 // indirect 31 | github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect 32 | go.mongodb.org/mongo-driver v1.12.1 // indirect 33 | go.opencensus.io v0.24.0 // indirect 34 | golang.org/x/crypto v0.21.0 // indirect 35 | golang.org/x/net v0.23.0 // indirect 36 | golang.org/x/oauth2 v0.11.0 // indirect 37 | golang.org/x/sync v0.3.0 // indirect 38 | golang.org/x/sys v0.18.0 // indirect 39 | golang.org/x/text v0.14.0 // indirect 40 | golang.org/x/time v0.3.0 // indirect 41 | golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect 42 | google.golang.org/api v0.136.0 // indirect 43 | google.golang.org/appengine v1.6.7 // indirect 44 | google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 // indirect 45 | google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5 // indirect 46 | google.golang.org/genproto/googleapis/rpc v0.0.0-20230807174057-1744710a1577 // indirect 47 | google.golang.org/grpc v1.57.1 // indirect 48 | google.golang.org/protobuf v1.33.0 // indirect 49 | ) 50 | -------------------------------------------------------------------------------- /_examples/scrapejsp/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "os" 8 | "os/signal" 9 | "scrapejsp/scrapejsp" 10 | "sync" 11 | "syscall" 12 | 13 | // replace with your own project name 14 | 15 | "github.com/tech-engine/goscrapy/cmd/gos" 16 | ) 17 | 18 | // sample terminate function to demostrate 
spider termination. 19 | func OnTerminate(fn func()) { 20 | ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGINT, syscall.SIGTERM) 21 | <-ctx.Done() 22 | stop() 23 | fn() 24 | } 25 | 26 | func main() { 27 | ctx, cancel := context.WithCancel(context.Background()) 28 | 29 | var wg sync.WaitGroup 30 | wg.Add(1) 31 | 32 | // get core spider 33 | gos := gos.New[*scrapejsp.Record]() 34 | 35 | // use proxies 36 | // proxies := gos.WithProxies("proxy_url1", "proxy_url2", ...) 37 | 38 | // get core spider 39 | // gos := gos.New[*scrapejsp.Record]().WithClient( 40 | // gos.DefaultClient(proxies), 41 | // ) 42 | 43 | // use middlewares 44 | gos.MiddlewareManager.Add(scrapejsp.MIDDLEWARES...) 45 | 46 | // use pipelines 47 | gos.PipelineManager.Add(scrapejsp.PIPELINES...) 48 | 49 | // export2Csv := pipelines.Export2CSV[*scrapejsp.Record](pipelines.Export2CSVOpts{ 50 | // Filename: "itstimeitsnowornever.csv", 51 | // }) 52 | 53 | // // use export 2 json pipeline 54 | // export2Json := pipelines.Export2JSON[*scrapejsp.Record](pipelines.Export2JSONOpts{ 55 | // Filename: "itstimeitsnowornever.json", 56 | // Immediate: true, 57 | // }) 58 | 59 | // add pipeline to group 60 | // pipelineGroup := pm.NewGroup[*scrapejsp.Record]() 61 | // pipelineGroup.Add(export2Csv) 62 | // pipelineGroup.Add(export2Json) 63 | // gos.PipelineManager.Add( 64 | // pipelineGroup, 65 | // ) 66 | 67 | go func() { 68 | defer wg.Done() 69 | 70 | err := gos.Start(ctx) 71 | 72 | if err != nil && errors.Is(err, context.Canceled) { 73 | return 74 | } 75 | 76 | fmt.Printf("failed: %q", err) 77 | }() 78 | 79 | spider := scrapejsp.NewSpider(gos) 80 | 81 | // start the scraper with a job, currently nil is passed but you can pass your job here 82 | spider.StartRequest(ctx, nil) 83 | 84 | OnTerminate(func() { 85 | fmt.Println("exit signal received: shutting down gracefully") 86 | cancel() 87 | wg.Wait() 88 | }) 89 | 90 | } 91 | 
-------------------------------------------------------------------------------- /_examples/scrapejsp/scrapejsp/constants.go: -------------------------------------------------------------------------------- 1 | package scrapejsp 2 | 3 | // you can define your constants here 4 | -------------------------------------------------------------------------------- /_examples/scrapejsp/scrapejsp/errors.go: -------------------------------------------------------------------------------- 1 | package scrapejsp 2 | 3 | // you can define your errors here 4 | -------------------------------------------------------------------------------- /_examples/scrapejsp/scrapejsp/job.go: -------------------------------------------------------------------------------- 1 | package scrapejsp 2 | 3 | // id field is compulsory in a Job defination. You can add your custom to Job 4 | type Job struct { 5 | id string 6 | // query string 7 | } 8 | 9 | // do not delete/edit 10 | func NewJob(id string) *Job { 11 | return &Job{ 12 | id: id, 13 | } 14 | } 15 | 16 | // do not delete/edit 17 | func (j *Job) Id() string { 18 | return j.id 19 | } 20 | 21 | // do not delete 22 | func (j *Job) Reset() { 23 | j.id = "" 24 | } 25 | 26 | // add your custom receiver functions below 27 | // func (j *Job) SetQuery(query string) { 28 | // j.query = query 29 | // return 30 | // } 31 | -------------------------------------------------------------------------------- /_examples/scrapejsp/scrapejsp/record.go: -------------------------------------------------------------------------------- 1 | package scrapejsp 2 | 3 | import ( 4 | "reflect" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/core" 7 | ) 8 | 9 | // do not modify this file 10 | 11 | type Record struct { 12 | J *Job `json:"-" csv:"-"` // JobId is required 13 | UserId int `csv:"userId" json:"userId"` 14 | Id int `csv:"id" json:"id"` 15 | Title string `csv:"title" json:"title"` 16 | Completed bool `csv:"completed" json:"completed"` 17 | } 18 | 19 | func (r *Record) 
Record() *Record { 20 | return r 21 | } 22 | 23 | func (r *Record) RecordKeys() []string { 24 | dataType := reflect.TypeOf(*r) 25 | if dataType.Kind() != reflect.Struct { 26 | panic("Record is not a struct") 27 | } 28 | 29 | numFields := dataType.NumField() 30 | keys := make([]string, numFields) 31 | 32 | for i := 0; i < numFields; i++ { 33 | field := dataType.Field(i) 34 | csvTag := field.Tag.Get("csv") 35 | keys[i] = csvTag 36 | } 37 | 38 | return keys 39 | } 40 | 41 | func (r *Record) RecordFlat() []any { 42 | 43 | inputType := reflect.TypeOf(*r) 44 | 45 | if inputType.Kind() != reflect.Struct { 46 | panic("Record is not a struct") 47 | } 48 | 49 | inputValue := reflect.ValueOf(*r) 50 | 51 | slice := make([]any, inputType.NumField()) 52 | 53 | for i := 0; i < inputType.NumField(); i++ { 54 | slice[i] = inputValue.Field(i).Interface() 55 | } 56 | return slice 57 | } 58 | 59 | func (r *Record) Job() core.IJob { 60 | return r.J 61 | } 62 | -------------------------------------------------------------------------------- /_examples/scrapejsp/scrapejsp/settings.go: -------------------------------------------------------------------------------- 1 | package scrapejsp 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/builtin/middlewares" 7 | "github.com/tech-engine/goscrapy/pkg/builtin/pipelines" 8 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager" 9 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager" 10 | ) 11 | 12 | // HTTP Transport settings 13 | 14 | // Default: 10000 15 | const MIDDLEWARE_HTTP_TIMEOUT_MS = "" 16 | 17 | // Default: 100 18 | const MIDDLEWARE_HTTP_MAX_IDLE_CONN = "" 19 | 20 | // Default: 100 21 | const MIDDLEWARE_HTTP_MAX_CONN_PER_HOST = "" 22 | 23 | // Default: 100 24 | const MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST = "" 25 | 26 | // Inbuilt Retry middleware settings 27 | 28 | // Default: 3 29 | const MIDDLEWARE_HTTP_RETRY_MAX_RETRIES = "" 30 | 31 | // Default: 500, 502, 503, 504, 522, 524, 408, 429 32 | const 
MIDDLEWARE_HTTP_RETRY_CODES = "" 33 | 34 | // Default: 1s 35 | const MIDDLEWARE_HTTP_RETRY_BASE_DELAY = "" 36 | 37 | // Default: 1000000 38 | const SCHEDULER_REQ_RES_POOL_SIZE = "" 39 | 40 | // Default: num of CPU * 3 41 | const SCHEDULER_CONCURRENCY = "" 42 | 43 | // Default: 1000000 44 | const SCHEDULER_WORK_QUEUE_SIZE = "" 45 | 46 | // Pipeline Manager settings 47 | 48 | // Default: 10000 49 | const PIPELINEMANAGER_ITEMPOOL_SIZE = "" 50 | 51 | // Default: 24 52 | const PIPELINEMANAGER_ITEM_SIZE = "" 53 | 54 | // Default: 0 55 | const PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE = "" 56 | 57 | // Default: 1000 58 | const PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY = "" 59 | 60 | // Middlewares here 61 | // Executed in reverse order from bottom to top. 62 | var MIDDLEWARES = []middlewaremanager.Middleware{ 63 | middlewares.Retry(), 64 | middlewares.MultiCookieJar, 65 | middlewares.DupeFilter, 66 | } 67 | 68 | var export2CSV = pipelines.Export2CSV[*Record](pipelines.Export2CSVOpts{ 69 | Filename: "itstimeitsnowornever.csv", 70 | }) 71 | 72 | // use export 2 json pipeline 73 | // var export2Json = pipelines.Export2JSON[*Record](pipelines.Export2JSONOpts{ 74 | // Filename: "itstimeitsnowornever.json", 75 | // Immediate: true, 76 | // }) 77 | 78 | // add pipeline to group 79 | //func myCustomPipelineGroup() *pm.Group[*Record] { 80 | // pipelineGroup := pm.NewGroup[*Record]() 81 | // pipelineGroup.Add(export2CSV) 82 | // // pipelineGroup.Add(export2Json) 83 | // return pipelineGroup 84 | //} 85 | 86 | // Pipelines here 87 | // Executed in the order they appear. 
88 | var PIPELINES = []pm.IPipeline[*Record]{ 89 | export2CSV, 90 | // export2Json, 91 | // myCustomPipelineGroup(), 92 | } 93 | 94 | func init() { 95 | var settings = map[string]string{ 96 | "MIDDLEWARE_HTTP_TIMEOUT_MS": MIDDLEWARE_HTTP_TIMEOUT_MS, 97 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN": MIDDLEWARE_HTTP_MAX_IDLE_CONN, 98 | "MIDDLEWARE_HTTP_MAX_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_CONN_PER_HOST, 99 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST, 100 | "MIDDLEWARE_HTTP_RETRY_MAX_RETRIES": MIDDLEWARE_HTTP_RETRY_MAX_RETRIES, 101 | "MIDDLEWARE_HTTP_RETRY_CODES": MIDDLEWARE_HTTP_RETRY_CODES, 102 | "MIDDLEWARE_HTTP_RETRY_BASE_DELAY": MIDDLEWARE_HTTP_RETRY_BASE_DELAY, 103 | "SCHEDULER_REQ_RES_POOL_SIZE": SCHEDULER_REQ_RES_POOL_SIZE, 104 | "SCHEDULER_CONCURRENCY": SCHEDULER_CONCURRENCY, 105 | "SCHEDULER_WORK_QUEUE_SIZE": SCHEDULER_WORK_QUEUE_SIZE, 106 | "PIPELINEMANAGER_ITEMPOOL_SIZE": PIPELINEMANAGER_ITEMPOOL_SIZE, 107 | "PIPELINEMANAGER_ITEM_SIZE": PIPELINEMANAGER_ITEM_SIZE, 108 | "PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE": PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE, 109 | "PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY": PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY, 110 | } 111 | 112 | for key, value := range settings { 113 | if value != "" { 114 | os.Setenv(key, value) 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /_examples/scrapejsp/scrapejsp/spider.go: -------------------------------------------------------------------------------- 1 | package scrapejsp 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "log" 8 | 9 | "github.com/tech-engine/goscrapy/cmd/gos" 10 | "github.com/tech-engine/goscrapy/pkg/core" 11 | ) 12 | 13 | type Spider struct { 14 | gos.ICoreSpider[*Record] 15 | } 16 | 17 | func NewSpider(core gos.ICoreSpider[*Record]) *Spider { 18 | return &Spider{ 19 | core, 20 | } 21 | } 22 | 23 | // This is the entrypoint to the spider 24 | func (s *Spider) 
StartRequest(ctx context.Context, job *Job) { 25 | 26 | req := s.NewRequest() 27 | // req.Meta("JOB", job) 28 | req.Url("https://jsonplaceholder.typicode.com/todos/1") 29 | 30 | s.Request(req, s.parse) 31 | } 32 | 33 | func (s *Spider) Close(ctx context.Context) { 34 | } 35 | 36 | func (s *Spider) parse(ctx context.Context, resp core.IResponseReader) { 37 | fmt.Printf("status: %d", resp.StatusCode()) 38 | 39 | var data Record 40 | err := json.Unmarshal(resp.Bytes(), &data) 41 | if err != nil { 42 | log.Fatalln(err) 43 | } 44 | 45 | // to push to pipelines 46 | s.Yield(&data) 47 | } 48 | -------------------------------------------------------------------------------- /_examples/scrapejsp/utils/helper.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "context" 5 | "os" 6 | "os/signal" 7 | "syscall" 8 | ) 9 | 10 | func OnTerminate(fn func()) { 11 | ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGINT, syscall.SIGTERM) 12 | <-ctx.Done() 13 | stop() 14 | fn() 15 | } 16 | -------------------------------------------------------------------------------- /_examples/scrapejsp_method2/go.mod: -------------------------------------------------------------------------------- 1 | module scrapejsp 2 | 3 | go 1.21.0 4 | 5 | require github.com/tech-engine/goscrapy v0.12.1 6 | 7 | require ( 8 | cloud.google.com/go v0.110.6 // indirect 9 | cloud.google.com/go/compute v1.23.0 // indirect 10 | cloud.google.com/go/compute/metadata v0.2.3 // indirect 11 | cloud.google.com/go/firestore v1.13.0 // indirect 12 | cloud.google.com/go/iam v1.1.1 // indirect 13 | cloud.google.com/go/longrunning v0.5.1 // indirect 14 | cloud.google.com/go/storage v1.33.0 // indirect 15 | firebase.google.com/go v3.13.0+incompatible // indirect 16 | github.com/gocarina/gocsv v0.0.0-20230616125104-99d496ca653d // indirect 17 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 18 | 
github.com/golang/protobuf v1.5.3 // indirect 19 | github.com/golang/snappy v0.0.4 // indirect 20 | github.com/google/go-cmp v0.5.9 // indirect 21 | github.com/google/s2a-go v0.1.4 // indirect 22 | github.com/google/uuid v1.3.0 // indirect 23 | github.com/googleapis/enterprise-certificate-proxy v0.2.5 // indirect 24 | github.com/googleapis/gax-go/v2 v2.12.0 // indirect 25 | github.com/klauspost/compress v1.13.6 // indirect 26 | github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe // indirect 27 | github.com/segmentio/fasthash v1.0.3 // indirect 28 | github.com/xdg-go/pbkdf2 v1.0.0 // indirect 29 | github.com/xdg-go/scram v1.1.2 // indirect 30 | github.com/xdg-go/stringprep v1.0.4 // indirect 31 | github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect 32 | go.mongodb.org/mongo-driver v1.12.1 // indirect 33 | go.opencensus.io v0.24.0 // indirect 34 | golang.org/x/crypto v0.31.0 // indirect 35 | golang.org/x/net v0.23.0 // indirect 36 | golang.org/x/oauth2 v0.11.0 // indirect 37 | golang.org/x/sync v0.10.0 // indirect 38 | golang.org/x/sys v0.28.0 // indirect 39 | golang.org/x/text v0.21.0 // indirect 40 | golang.org/x/time v0.3.0 // indirect 41 | golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect 42 | google.golang.org/api v0.136.0 // indirect 43 | google.golang.org/appengine v1.6.7 // indirect 44 | google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 // indirect 45 | google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5 // indirect 46 | google.golang.org/genproto/googleapis/rpc v0.0.0-20230807174057-1744710a1577 // indirect 47 | google.golang.org/grpc v1.57.1 // indirect 48 | google.golang.org/protobuf v1.33.0 // indirect 49 | ) 50 | -------------------------------------------------------------------------------- /_examples/scrapejsp_method2/itstimeitsnowornever.csv: -------------------------------------------------------------------------------- 1 | userId,id,title,completed 2 | 
1,1,delectus aut autem,false 3 | 1,1,delectus aut autem,false 4 | -------------------------------------------------------------------------------- /_examples/scrapejsp_method2/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "os" 8 | "os/signal" 9 | "sync" 10 | "syscall" 11 | 12 | // replace with your own project name 13 | "scrapejsp/scrapejsp" 14 | ) 15 | 16 | // sample terminate function to demostrate spider termination. 17 | func OnTerminate(fn func()) { 18 | ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGINT, syscall.SIGTERM) 19 | <-ctx.Done() 20 | stop() 21 | fn() 22 | } 23 | 24 | func main() { 25 | ctx, cancel := context.WithCancel(context.Background()) 26 | 27 | var wg sync.WaitGroup 28 | wg.Add(1) 29 | 30 | spider, errCh := scrapejsp.NewSpider(ctx) 31 | 32 | go func() { 33 | defer wg.Done() 34 | 35 | err := <-errCh 36 | 37 | if err != nil && errors.Is(err, context.Canceled) { 38 | return 39 | } 40 | 41 | fmt.Printf("failed: %q", err) 42 | }() 43 | 44 | // start the scraper with a job, currently nil is passed but you can pass your job here 45 | spider.StartRequest(ctx, nil) 46 | 47 | OnTerminate(func() { 48 | fmt.Println("exit signal received: shutting down gracefully") 49 | cancel() 50 | wg.Wait() 51 | }) 52 | 53 | } 54 | -------------------------------------------------------------------------------- /_examples/scrapejsp_method2/scrapejsp/constants.go: -------------------------------------------------------------------------------- 1 | package scrapejsp 2 | 3 | // you can define your constants here 4 | -------------------------------------------------------------------------------- /_examples/scrapejsp_method2/scrapejsp/errors.go: -------------------------------------------------------------------------------- 1 | package scrapejsp 2 | 3 | // you can define your errors here 4 | 
-------------------------------------------------------------------------------- /_examples/scrapejsp_method2/scrapejsp/job.go: -------------------------------------------------------------------------------- 1 | package scrapejsp 2 | 3 | // id field is compulsory in a Job defination. You can add your custom to Job 4 | type Job struct { 5 | id string 6 | // query string 7 | } 8 | 9 | // do not delete/edit 10 | func NewJob(id string) *Job { 11 | return &Job{ 12 | id: id, 13 | } 14 | } 15 | 16 | // do not delete/edit 17 | func (j *Job) Id() string { 18 | return j.id 19 | } 20 | 21 | // do not delete 22 | func (j *Job) Reset() { 23 | j.id = "" 24 | } 25 | 26 | // add your custom receiver functions below 27 | // func (j *Job) SetQuery(query string) { 28 | // j.query = query 29 | // return 30 | // } 31 | -------------------------------------------------------------------------------- /_examples/scrapejsp_method2/scrapejsp/record.go: -------------------------------------------------------------------------------- 1 | package scrapejsp 2 | 3 | import ( 4 | "reflect" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/core" 7 | ) 8 | 9 | // do not modify this file 10 | 11 | type Record struct { 12 | J *Job `json:"-" csv:"-"` // JobId is required 13 | UserId int `csv:"userId" json:"userId"` 14 | Id int `csv:"id" json:"id"` 15 | Title string `csv:"title" json:"title"` 16 | Completed bool `csv:"completed" json:"completed"` 17 | } 18 | 19 | func (r *Record) Record() *Record { 20 | return r 21 | } 22 | 23 | func (r *Record) RecordKeys() []string { 24 | dataType := reflect.TypeOf(*r) 25 | if dataType.Kind() != reflect.Struct { 26 | panic("Record is not a struct") 27 | } 28 | 29 | numFields := dataType.NumField() 30 | keys := make([]string, numFields) 31 | 32 | for i := 0; i < numFields; i++ { 33 | field := dataType.Field(i) 34 | csvTag := field.Tag.Get("csv") 35 | keys[i] = csvTag 36 | } 37 | 38 | return keys 39 | } 40 | 41 | func (r *Record) RecordFlat() []any { 42 | 43 | inputType := 
reflect.TypeOf(*r) 44 | 45 | if inputType.Kind() != reflect.Struct { 46 | panic("Record is not a struct") 47 | } 48 | 49 | inputValue := reflect.ValueOf(*r) 50 | 51 | slice := make([]any, inputType.NumField()) 52 | 53 | for i := 0; i < inputType.NumField(); i++ { 54 | slice[i] = inputValue.Field(i).Interface() 55 | } 56 | return slice 57 | } 58 | 59 | func (r *Record) Job() core.IJob { 60 | return r.J 61 | } 62 | -------------------------------------------------------------------------------- /_examples/scrapejsp_method2/scrapejsp/settings.go: -------------------------------------------------------------------------------- 1 | package scrapejsp 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/builtin/middlewares" 7 | "github.com/tech-engine/goscrapy/pkg/builtin/pipelines" 8 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager" 9 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager" 10 | ) 11 | 12 | // HTTP Transport settings 13 | 14 | // Default: 10000 15 | const MIDDLEWARE_HTTP_TIMEOUT_MS = "" 16 | 17 | // Default: 100 18 | const MIDDLEWARE_HTTP_MAX_IDLE_CONN = "" 19 | 20 | // Default: 100 21 | const MIDDLEWARE_HTTP_MAX_CONN_PER_HOST = "" 22 | 23 | // Default: 100 24 | const MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST = "" 25 | 26 | // Inbuilt Retry middleware settings 27 | 28 | // Default: 3 29 | const MIDDLEWARE_HTTP_RETRY_MAX_RETRIES = "" 30 | 31 | // Default: 500, 502, 503, 504, 522, 524, 408, 429 32 | const MIDDLEWARE_HTTP_RETRY_CODES = "" 33 | 34 | // Default: 1s 35 | const MIDDLEWARE_HTTP_RETRY_BASE_DELAY = "" 36 | 37 | // Default: 1000000 38 | const SCHEDULER_REQ_RES_POOL_SIZE = "" 39 | 40 | // Default: num of CPU * 3 41 | const SCHEDULER_CONCURRENCY = "" 42 | 43 | // Default: 1000000 44 | const SCHEDULER_WORK_QUEUE_SIZE = "" 45 | 46 | // Pipeline Manager settings 47 | 48 | // Default: 10000 49 | const PIPELINEMANAGER_ITEMPOOL_SIZE = "" 50 | 51 | // Default: 24 52 | const PIPELINEMANAGER_ITEM_SIZE = "" 53 | 54 | // Default: 0 55 
| const PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE = "" 56 | 57 | // Default: 1000 58 | const PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY = "" 59 | 60 | // Middlewares here 61 | // Executed in reverse order from bottom to top. 62 | var MIDDLEWARES = []middlewaremanager.Middleware{ 63 | middlewares.Retry(), 64 | middlewares.MultiCookieJar, 65 | middlewares.DupeFilter, 66 | } 67 | 68 | var export2CSV = pipelines.Export2CSV[*Record](pipelines.Export2CSVOpts{ 69 | Filename: "itstimeitsnowornever.csv", 70 | }) 71 | 72 | // use export 2 json pipeline 73 | // var export2Json = pipelines.Export2JSON[*Record](pipelines.Export2JSONOpts{ 74 | // Filename: "itstimeitsnowornever.json", 75 | // Immediate: true, 76 | // }) 77 | 78 | // add pipeline to group 79 | func myCustomPipelineGroup() *pm.Group[*Record] { 80 | pipelineGroup := pm.NewGroup[*Record]() 81 | pipelineGroup.Add(export2CSV) 82 | // pipelineGroup.Add(export2Json) 83 | return pipelineGroup 84 | } 85 | 86 | // Pipelines here 87 | // Executed in the order they appear. 
88 | var PIPELINES = []pm.IPipeline[*Record]{ 89 | export2CSV, 90 | // export2Json, 91 | myCustomPipelineGroup(), 92 | } 93 | 94 | func init() { 95 | var settings = map[string]string{ 96 | "MIDDLEWARE_HTTP_TIMEOUT_MS": MIDDLEWARE_HTTP_TIMEOUT_MS, 97 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN": MIDDLEWARE_HTTP_MAX_IDLE_CONN, 98 | "MIDDLEWARE_HTTP_MAX_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_CONN_PER_HOST, 99 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST, 100 | "MIDDLEWARE_HTTP_RETRY_MAX_RETRIES": MIDDLEWARE_HTTP_RETRY_MAX_RETRIES, 101 | "MIDDLEWARE_HTTP_RETRY_CODES": MIDDLEWARE_HTTP_RETRY_CODES, 102 | "MIDDLEWARE_HTTP_RETRY_BASE_DELAY": MIDDLEWARE_HTTP_RETRY_BASE_DELAY, 103 | "SCHEDULER_REQ_RES_POOL_SIZE": SCHEDULER_REQ_RES_POOL_SIZE, 104 | "SCHEDULER_CONCURRENCY": SCHEDULER_CONCURRENCY, 105 | "SCHEDULER_WORK_QUEUE_SIZE": SCHEDULER_WORK_QUEUE_SIZE, 106 | "PIPELINEMANAGER_ITEMPOOL_SIZE": PIPELINEMANAGER_ITEMPOOL_SIZE, 107 | "PIPELINEMANAGER_ITEM_SIZE": PIPELINEMANAGER_ITEM_SIZE, 108 | "PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE": PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE, 109 | "PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY": PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY, 110 | } 111 | 112 | for key, value := range settings { 113 | if value != "" { 114 | os.Setenv(key, value) 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /_examples/scrapejsp_method2/scrapejsp/spider.go: -------------------------------------------------------------------------------- 1 | package scrapejsp 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "log" 8 | 9 | "github.com/tech-engine/goscrapy/cmd/gos" 10 | "github.com/tech-engine/goscrapy/pkg/core" 11 | ) 12 | 13 | type Spider struct { 14 | gos.ICoreSpider[*Record] 15 | } 16 | 17 | func NewSpider(ctx context.Context) (*Spider, <-chan error) { 18 | 19 | // use proxies 20 | // proxies := core.WithProxies("proxy_url1", "proxy_url2", ...) 
21 | // core := gos.New[*Record]().WithClient( 22 | // gos.DefaultClient(proxies), 23 | // ) 24 | 25 | core := gos.New[*Record]() 26 | 27 | // Add middlewares 28 | core.MiddlewareManager.Add(MIDDLEWARES...) 29 | // Add pipelines 30 | core.PipelineManager.Add(PIPELINES...) 31 | 32 | errCh := make(chan error) 33 | 34 | go func() { 35 | errCh <- core.Start(ctx) 36 | }() 37 | 38 | return &Spider{ 39 | core, 40 | }, errCh 41 | } 42 | 43 | // This is the entrypoint to the spider 44 | func (s *Spider) StartRequest(ctx context.Context, job *Job) { 45 | 46 | req := s.NewRequest() 47 | // req.Meta("JOB", job) 48 | req.Url("https://jsonplaceholder.typicode.com/todos/1") 49 | 50 | s.Request(req, s.parse) 51 | } 52 | 53 | func (s *Spider) Close(ctx context.Context) { 54 | } 55 | 56 | func (s *Spider) parse(ctx context.Context, resp core.IResponseReader) { 57 | fmt.Printf("status: %d", resp.StatusCode()) 58 | 59 | var data Record 60 | err := json.Unmarshal(resp.Bytes(), &data) 61 | if err != nil { 62 | log.Fatalln(err) 63 | } 64 | 65 | // to push to pipelines 66 | s.Yield(&data) 67 | } 68 | -------------------------------------------------------------------------------- /assets/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-engine/goscrapy/bc83234e737e68850fe1c8ac4a0d36fdb5d32e85/assets/demo.gif -------------------------------------------------------------------------------- /assets/logo.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-engine/goscrapy/bc83234e737e68850fe1c8ac4a0d36fdb5d32e85/assets/logo.webp -------------------------------------------------------------------------------- /cmd/cli/cli.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2023 Tech Engine 3 | */ 4 | package cli 5 | 6 | import ( 7 | "fmt" 8 | "os" 9 | 10 | "github.com/spf13/cobra" 11 | ) 
12 | 13 | const VERSION = "0.11.1" 14 | 15 | const BANNER_MSG = ` 16 | _________ ________ 17 | __ ____/______ __ ___/_____________________ _________ _____ __ 18 | _ / __ _ __ \_____ \ _ ___/__ ___/_ __ '/___ __ \__ / / / 19 | / /_/ / / /_/ /____/ / / /__ _ / / /_/ / __ /_/ /_ /_/ / 20 | \____/ \____/ /____/ \___/ /_/ \__,_/ _ .___/ _\__, / 21 | /_/ /____/ 22 | 23 | GoScrapy: Harnessing Go's power for efficient web scraping, inspired by Python's Scrapy framework.` 24 | 25 | // rootCmd represents the base command when called without any subcommands 26 | var rootCmd = &cobra.Command{ 27 | Use: "goscrapy [command]", 28 | Short: "A command line tool to everything related to GoScrapy.", 29 | Long: BANNER_MSG, 30 | Version: VERSION, 31 | } 32 | 33 | // Execute adds all child commands to the root command and sets flags appropriately. 34 | // This is called by main.main(). It only needs to happen once to the rootCmd. 35 | func Execute() { 36 | if err := rootCmd.Execute(); err != nil { 37 | fmt.Printf("Whoops :( !!! 
There was an error '%s'", err.Error()) 38 | os.Exit(1) 39 | } 40 | } 41 | 42 | func init() { 43 | } 44 | -------------------------------------------------------------------------------- /cmd/cli/pipeline.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2023 Tech Engine 3 | */ 4 | package cli 5 | 6 | import ( 7 | "bytes" 8 | "fmt" 9 | "go/format" 10 | "path/filepath" 11 | "regexp" 12 | "strings" 13 | "text/template" 14 | "unicode" 15 | 16 | "github.com/spf13/cobra" 17 | ) 18 | 19 | var pipelineCmd = &cobra.Command{ 20 | Use: "pipeline [pipeline-name]", 21 | Short: "Creates a new GoScrapy pipeline with the specified name", 22 | Args: cobra.ExactArgs(1), 23 | Run: func(cmd *cobra.Command, args []string) { 24 | var pipelineName = strings.TrimSpace(args[0]) 25 | 26 | if pipelineName == "" { 27 | fmt.Printf("⚠️ please provide a pipeline-name") 28 | return 29 | } 30 | 31 | // read template 32 | tmplContent, err := templatesFS.ReadFile("templates/pipeline.tmpl") 33 | 34 | if err != nil { 35 | fmt.Printf("❌ Error reading template: %v", err) 36 | return 37 | } 38 | 39 | tmpl, err := template.New(pipelineName). 
40 | Funcs(template.FuncMap{ 41 | "capitalizeFirstLetter": capitalizeFirstLetter, 42 | }).Parse(string(tmplContent)) 43 | 44 | if err != nil { 45 | fmt.Printf("❌ Error parsing template: %v", err) 46 | return 47 | } 48 | 49 | buffer := &bytes.Buffer{} 50 | 51 | err = tmpl.Execute(buffer, removeSpecialChars(pipelineName)) 52 | 53 | if err != nil { 54 | fmt.Printf("❌ Error executing template: '%s', %v", tmpl.Name(), err) 55 | return 56 | } 57 | 58 | // formate golang code 59 | formattedCode, err := format.Source(buffer.Bytes()) 60 | 61 | if err != nil { 62 | fmt.Printf("❌ Error formatting sourcecode '%s', %v", tmpl.Name(), err) 63 | return 64 | } 65 | 66 | sourceFilename := filepath.Join("pipelines", strings.TrimSuffix(tmpl.Name(), ".tmpl")+".go") 67 | 68 | // write go file 69 | err = writeToFile(sourceFilename, formattedCode) 70 | 71 | if err != nil { 72 | fmt.Printf("❌ Error creating %s.", sourceFilename) 73 | return 74 | } 75 | 76 | fmt.Printf("✔️ %s\n", sourceFilename) 77 | 78 | fmt.Printf("\n✨ Congrates, %s created successfully.", pipelineName) 79 | }, 80 | } 81 | 82 | func init() { 83 | rootCmd.AddCommand(pipelineCmd) 84 | } 85 | 86 | func capitalizeFirstLetter(s string) string { 87 | if s == "" { 88 | return s 89 | } 90 | 91 | r := []rune(s) 92 | r[0] = unicode.ToUpper(r[0]) 93 | 94 | return string(r) 95 | } 96 | 97 | func removeSpecialChars(input string) string { 98 | // Define a regular expression to match non-alphanumeric characters 99 | reg := regexp.MustCompile("[^a-zA-Z0-9]+") 100 | 101 | // Replace matched characters with an empty string 102 | result := reg.ReplaceAllString(input, "") 103 | 104 | return result 105 | } 106 | -------------------------------------------------------------------------------- /cmd/cli/startproject.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2023 Tech Engine 3 | */ 4 | package cli 5 | 6 | import ( 7 | "bytes" 8 | "embed" 9 | "fmt" 10 | "go/format" 11 | "io/fs" 12 | 
"os" 13 | "path" 14 | "path/filepath" 15 | "strings" 16 | "text/template" 17 | 18 | "github.com/spf13/cobra" 19 | ) 20 | 21 | //go:embed templates/* 22 | var templatesFS embed.FS 23 | 24 | var templateDir = filepath.Join(filepath.Dir("."), "/templates") 25 | 26 | // startprojectCmd represents the startproject command 27 | var startprojectCmd = &cobra.Command{ 28 | Use: "startproject [projectname]", 29 | Short: "Creates a new GoScrapy project with the specified name", 30 | Args: cobra.ExactArgs(1), 31 | Run: func(cmd *cobra.Command, args []string) { 32 | var projectName = strings.TrimSpace(args[0]) 33 | 34 | if projectName == "" { 35 | fmt.Printf("⚠️ please provide a projectname") 36 | return 37 | } 38 | 39 | templateFiles, err := fs.Glob(templatesFS, "templates/*.tmpl") 40 | 41 | if err != nil { 42 | fmt.Printf("❌ Error finding template: %v", err) 43 | return 44 | } 45 | 46 | fmt.Printf("\n🚀 GoScrapy generating project files. Please wait!\n\n") 47 | 48 | // create [projectName] dir where we will put spider code & pipelines 49 | err = createDirIfNotExist(projectName) 50 | 51 | if err != nil { 52 | fmt.Printf("❌ Error creating dir '%s', %v", projectName, err) 53 | return 54 | } 55 | 56 | // create [projectName]/pipelines dir 57 | err = createDirIfNotExist(path.Join(projectName, "pipelines")) 58 | 59 | if err != nil { 60 | fmt.Printf("❌ Error creating dir %s/pipelines, %v", projectName, err) 61 | return 62 | } 63 | 64 | var sourceFilename string 65 | 66 | // Parse and execute each template 67 | for _, templateFile := range templateFiles { 68 | 69 | if templateFile == "templates/pipeline.tmpl" { 70 | continue 71 | } 72 | 73 | tmplContent, err := templatesFS.ReadFile(templateFile) 74 | 75 | if err != nil { 76 | fmt.Printf("❌ Error reading template: %v", err) 77 | return 78 | } 79 | 80 | tmplName := filepath.Base(templateFile) 81 | tmpl, err := template.New(tmplName).Parse(string(tmplContent)) 82 | 83 | if err != nil { 84 | fmt.Printf("❌ Error parsing template: %v", err) 
85 | return 86 | } 87 | 88 | buffer := &bytes.Buffer{} 89 | 90 | err = tmpl.Execute(buffer, projectName) 91 | 92 | if err != nil { 93 | fmt.Printf("❌ Error executing template: '%s', %v", tmpl.Name(), err) 94 | return 95 | } 96 | 97 | formattedCode, err := format.Source(buffer.Bytes()) 98 | 99 | if err != nil { 100 | fmt.Printf("❌ Error formatting sourcecode '%s', %v", tmpl.Name(), err) 101 | return 102 | } 103 | 104 | filename := strings.TrimSuffix(tmpl.Name(), ".tmpl") + ".go" 105 | 106 | if templateFile == "templates/main.tmpl" { 107 | sourceFilename = filename 108 | } else { 109 | sourceFilename = filepath.Join(projectName, filename) 110 | } 111 | 112 | err = writeToFile(sourceFilename, formattedCode) 113 | 114 | if err != nil { 115 | fmt.Printf("❌ Error creating %s.", sourceFilename) 116 | return 117 | } 118 | 119 | fmt.Printf("✔️ %s\n", sourceFilename) 120 | 121 | } 122 | fmt.Printf("\n✨ Congrates, %s created successfully.", projectName) 123 | }, 124 | } 125 | 126 | func init() { 127 | rootCmd.AddCommand(startprojectCmd) 128 | } 129 | 130 | func writeToFile(filename string, data []byte) error { 131 | 132 | file, err := os.Create(filename) 133 | if err != nil { 134 | return err 135 | } 136 | defer file.Close() 137 | _, err = file.Write(data) 138 | return err 139 | } 140 | 141 | func createDirIfNotExist(dir string) error { 142 | if _, err := os.Stat(dir); !os.IsNotExist(err) { 143 | // Directory exists, prompt user for confirmation 144 | fmt.Printf("Directory '%s' already exists. Continue? 
(Y/N): ", dir) 145 | var input string 146 | _, err := fmt.Scan(&input) 147 | 148 | if err != nil { 149 | return err 150 | } 151 | 152 | if strings.ToLower(input) != "y" { 153 | return nil 154 | } 155 | } 156 | 157 | return os.MkdirAll(dir, os.ModePerm) 158 | } 159 | -------------------------------------------------------------------------------- /cmd/cli/templates/constants.tmpl: -------------------------------------------------------------------------------- 1 | package {{.}} 2 | 3 | // you can define your constants here -------------------------------------------------------------------------------- /cmd/cli/templates/errors.tmpl: -------------------------------------------------------------------------------- 1 | package {{.}} 2 | 3 | // you can define your errors here -------------------------------------------------------------------------------- /cmd/cli/templates/job.tmpl: -------------------------------------------------------------------------------- 1 | package {{.}} 2 | 3 | // id field is compulsory in a Job defination. You can add your custom to Job 4 | type Job struct { 5 | id string 6 | } 7 | 8 | // do not delete/edit 9 | func NewJob(id string) *Job { 10 | return &Job{ 11 | id: id, 12 | } 13 | } 14 | 15 | // do not delete/edit 16 | func (j *Job) Id() string { 17 | return j.id 18 | } 19 | 20 | // do not delete 21 | func (j *Job) Reset() { 22 | j.id = "" 23 | } 24 | 25 | 26 | // add your custom receiver functions below -------------------------------------------------------------------------------- /cmd/cli/templates/main.tmpl: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "sync" 8 | "os" 9 | "os/signal" 10 | "syscall" 11 | // replace with your own project name 12 | "{{.}}/{{.}}" 13 | ) 14 | // sample terminate function to demostrate spider termination. 
15 | func OnTerminate(fn func()) { 16 | ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGINT, syscall.SIGTERM) 17 | <-ctx.Done() 18 | stop() 19 | fn() 20 | } 21 | 22 | func main() { 23 | ctx, cancel := context.WithCancel(context.Background()) 24 | 25 | var wg sync.WaitGroup 26 | wg.Add(1) 27 | 28 | spider, errCh := {{.}}.New(ctx) 29 | go func() { 30 | defer wg.Done() 31 | 32 | err := <-errCh 33 | 34 | if err == nil || errors.Is(err, context.Canceled) { // a clean exit or a cancellation is not a failure 35 | return 36 | } 37 | 38 | fmt.Printf("failed: %q", err) 39 | }() 40 | 41 | // start the scraper with a job, currently nil is passed but you can pass your job here 42 | spider.StartRequest(ctx, nil) 43 | 44 | OnTerminate(func() { 45 | fmt.Println("exit signal received: shutting down gracefully") 46 | cancel() 47 | wg.Wait() 48 | }) 49 | 50 | } 51 | -------------------------------------------------------------------------------- /cmd/cli/templates/pipeline.tmpl: -------------------------------------------------------------------------------- 1 | package pipelines 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/core" 7 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager" 8 | ) 9 | 10 | type {{.}}[OUT any] struct { 11 | // add your custom fields here 12 | } 13 | 14 | func {{ capitalizeFirstLetter .}}[OUT any](args ...string) *{{.}}[OUT] { 15 | // your custom initialization code goes here 16 | return &{{.}}[OUT]{} 17 | } 18 | 19 | // Open runs when we start the corespider engine. 20 | func (p *{{.}}[OUT]) Open(ctx context.Context) error { 21 | return nil 22 | } 23 | 24 | // Close runs just before the corespider engine exits. 
25 | func (p *{{.}}[OUT]) Close() { 26 | } 27 | 28 | // your custom pipeline processing code goes here 29 | func (p *{{.}}[OUT]) ProcessItem(item pm.IPipelineItem, original core.IOutput[OUT]) error { 30 | 31 | // original is the output yielded by the spider 32 | // original.Job() - access Job 33 | // original.Record() - access Record 34 | // original.RecordKeys() - access Record keys in a slice 35 | // original.RecordFlat() - access Record in []any format 36 | 37 | return nil 38 | } 39 | -------------------------------------------------------------------------------- /cmd/cli/templates/record.tmpl: -------------------------------------------------------------------------------- 1 | package {{.}} 2 | 3 | import ( 4 | "reflect" 5 | "github.com/tech-engine/goscrapy/pkg/core" 6 | ) 7 | 8 | /* 9 | json and csv struct field tags are required, if you want the Record to be exported 10 | or processed by builtin pipelines 11 | */ 12 | 13 | type Record struct { 14 | J *Job `json:"-" csv:"-"` // JobId is required 15 | // add your custom fields here 16 | Title string `json:"title" csv:"title"` 17 | } 18 | 19 | // modify below code only if you know what you are doing 20 | func (r *Record) Record() *Record { 21 | return r 22 | } 23 | 24 | func (r *Record) RecordKeys() []string { 25 | dataType := reflect.TypeOf(*r) 26 | if dataType.Kind() != reflect.Struct { 27 | panic("Record is not a struct") 28 | } 29 | 30 | numFields := dataType.NumField() 31 | keys := make([]string, numFields) 32 | 33 | for i := 0; i < numFields; i++ { 34 | field := dataType.Field(i) 35 | csvTag := field.Tag.Get("csv") 36 | keys[i] = csvTag 37 | } 38 | 39 | return keys 40 | } 41 | 42 | func (r *Record) RecordFlat() []any { 43 | 44 | inputType := reflect.TypeOf(*r) 45 | 46 | if inputType.Kind() != reflect.Struct { 47 | panic("Record is not a struct") 48 | } 49 | 50 | inputValue := reflect.ValueOf(*r) 51 | 52 | slice := make([]any, inputType.NumField()) 53 | 54 | for i := 0; i < inputType.NumField(); i++ { 55 | 
slice[i] = inputValue.Field(i).Interface() 56 | } 57 | return slice 58 | } 59 | 60 | func (r *Record) Job() core.IJob { 61 | return r.J 62 | } 63 | -------------------------------------------------------------------------------- /cmd/cli/templates/settings.tmpl: -------------------------------------------------------------------------------- 1 | package {{.}} 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/builtin/middlewares" 7 | "github.com/tech-engine/goscrapy/pkg/builtin/pipelines" 8 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager" 9 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager" 10 | ) 11 | 12 | // HTTP Transport settings 13 | 14 | // Default: 10000 15 | const MIDDLEWARE_HTTP_TIMEOUT_MS = "" 16 | 17 | // Default: 100 18 | const MIDDLEWARE_HTTP_MAX_IDLE_CONN = "" 19 | 20 | // Default: 100 21 | const MIDDLEWARE_HTTP_MAX_CONN_PER_HOST = "" 22 | 23 | // Default: 100 24 | const MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST = "" 25 | 26 | // Inbuilt Retry middleware settings 27 | 28 | // Default: 3 29 | const MIDDLEWARE_HTTP_RETRY_MAX_RETRIES = "" 30 | 31 | // Default: 500, 502, 503, 504, 522, 524, 408, 429 32 | const MIDDLEWARE_HTTP_RETRY_CODES = "" 33 | 34 | // Default: 1s 35 | const MIDDLEWARE_HTTP_RETRY_BASE_DELAY = "" 36 | 37 | // Default: 1000000 38 | const SCHEDULER_REQ_RES_POOL_SIZE = "" 39 | 40 | // Default: num of CPU * 3 41 | const SCHEDULER_CONCURRENCY = "" 42 | 43 | // Default: 1000000 44 | const SCHEDULER_WORK_QUEUE_SIZE = "" 45 | 46 | // Pipeline Manager settings 47 | 48 | // Default: 10000 49 | const PIPELINEMANAGER_ITEMPOOL_SIZE = "" 50 | 51 | // Default: 24 52 | const PIPELINEMANAGER_ITEM_SIZE = "" 53 | 54 | // Default: 0 55 | const PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE = "" 56 | 57 | // Default: 1000 58 | const PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY = "" 59 | 60 | // Middlewares here 61 | // Executed in reverse order from bottom to top. 
62 | var MIDDLEWARES = []middlewaremanager.Middleware{ 63 | middlewares.Retry(), 64 | middlewares.MultiCookieJar, 65 | middlewares.DupeFilter, 66 | } 67 | 68 | var export2CSV = pipelines.Export2CSV[*Record](pipelines.Export2CSVOpts{ 69 | Filename: "itstimeitsnowornever.csv", 70 | }) 71 | 72 | // use export 2 json pipeline 73 | // var export2Json = pipelines.Export2JSON[*Record](pipelines.Export2JSONOpts{ 74 | // Filename: "itstimeitsnowornever.json", 75 | // Immediate: true, 76 | // }) 77 | 78 | // add pipeline to group 79 | //func myCustomPipelineGroup() *pm.Group[*Record] { 80 | // pipelineGroup := pm.NewGroup[*Record]() 81 | // pipelineGroup.Add(export2CSV) 82 | // // pipelineGroup.Add(export2Json) 83 | // return pipelineGroup 84 | //} 85 | 86 | // Pipelines here 87 | // Executed in the order they appear. 88 | var PIPELINES = []pm.IPipeline[*Record]{ 89 | export2CSV, 90 | // export2Json, 91 | // myCustomPipelineGroup(), 92 | } 93 | 94 | func init() { 95 | var settings = map[string]string{ 96 | "MIDDLEWARE_HTTP_TIMEOUT_MS": MIDDLEWARE_HTTP_TIMEOUT_MS, 97 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN": MIDDLEWARE_HTTP_MAX_IDLE_CONN, 98 | "MIDDLEWARE_HTTP_MAX_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_CONN_PER_HOST, 99 | "MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST": MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST, 100 | "MIDDLEWARE_HTTP_RETRY_MAX_RETRIES": MIDDLEWARE_HTTP_RETRY_MAX_RETRIES, 101 | "MIDDLEWARE_HTTP_RETRY_CODES": MIDDLEWARE_HTTP_RETRY_CODES, 102 | "MIDDLEWARE_HTTP_RETRY_BASE_DELAY": MIDDLEWARE_HTTP_RETRY_BASE_DELAY, 103 | "SCHEDULER_REQ_RES_POOL_SIZE": SCHEDULER_REQ_RES_POOL_SIZE, 104 | "SCHEDULER_CONCURRENCY": SCHEDULER_CONCURRENCY, 105 | "SCHEDULER_WORK_QUEUE_SIZE": SCHEDULER_WORK_QUEUE_SIZE, 106 | "PIPELINEMANAGER_ITEMPOOL_SIZE": PIPELINEMANAGER_ITEMPOOL_SIZE, 107 | "PIPELINEMANAGER_ITEM_SIZE": PIPELINEMANAGER_ITEM_SIZE, 108 | "PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE": PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE, 109 | "PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY": 
PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY, 110 | } 111 | 112 | for key, value := range settings { 113 | if value != "" { 114 | os.Setenv(key, value) 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /cmd/cli/templates/spider.tmpl: -------------------------------------------------------------------------------- 1 | package {{.}} 2 | 3 | import ( 4 | "context" 5 | "net/http" 6 | "encoding/json" 7 | "log" 8 | 9 | "github.com/tech-engine/goscrapy/cmd/gos" 10 | "github.com/tech-engine/goscrapy/pkg/core" 11 | ) 12 | 13 | type Spider struct { 14 | gos.ICoreSpider[*Record] 15 | } 16 | 17 | func New(ctx context.Context) (*Spider, <-chan error) { 18 | 19 | // use proxies 20 | // proxies := core.WithProxies("proxy_url1", "proxy_url2", ...) 21 | // core := gos.New[*Record]().WithClient( 22 | // gos.DefaultClient(proxies), 23 | // ) 24 | 25 | core := gos.New[*Record]() 26 | 27 | // Add middlewares 28 | core.MiddlewareManager.Add(MIDDLEWARES...) 29 | // Add pipelines 30 | core.PipelineManager.Add(PIPELINES...) 31 | 32 | errCh := make(chan error) 33 | 34 | go func() { 35 | errCh <- core.Start(ctx) 36 | }() 37 | 38 | return &Spider{ 39 | core, 40 | }, errCh 41 | } 42 | 43 | func (s *Spider) StartRequest(ctx context.Context, job *Job) { 44 | 45 | // for each request we must call NewRequest() and never reuse it 46 | req := s.NewRequest() 47 | 48 | var headers http.Header 49 | 50 | // GET is the request method, method chaining possible 51 | req.Url(""). 52 | Meta("MY_KEY1", "MY_VALUE"). 53 | Meta("MY_KEY2", true). 
54 | Header(headers) 55 | 56 | /* POST 57 | req.Url() 58 | req.Method("POST") 59 | req.Body() 60 | */ 61 | 62 | // call the next parse method 63 | s.Request(req, s.parse) 64 | } 65 | 66 | // can be called when spider is about to close 67 | func (s *Spider) Close(ctx context.Context) { 68 | } 69 | 70 | func (s *Spider) parse(ctx context.Context, resp core.IResponseReader) { 71 | // response.Body() 72 | // response.StatusCode() 73 | // response.Header() 74 | // response.Bytes() 75 | // response.Meta("MY_KEY1") 76 | 77 | // yielding output pushes output to be processed by pipelines, also check output.go for the fields 78 | var data Record 79 | 80 | err := json.Unmarshal(resp.Bytes(), &data) 81 | if err != nil { 82 | log.Panicln(err) 83 | } 84 | 85 | // s.Yield(&data) 86 | } 87 | -------------------------------------------------------------------------------- /cmd/gos/client.go: -------------------------------------------------------------------------------- 1 | package gos 2 | 3 | import ( 4 | "log" 5 | "net/http" 6 | "net/url" 7 | "os" 8 | "strconv" 9 | "strings" 10 | "sync/atomic" 11 | "time" 12 | 13 | "github.com/tech-engine/goscrapy/internal/types" 14 | ) 15 | 16 | type clientOpts struct { 17 | timeout time.Duration 18 | transportOpts 19 | } 20 | 21 | type transportOpts struct { 22 | proxyFn func(*http.Request) (*url.URL, error) 23 | maxIdleConns, maxConnsPerHost, maxIdleConnsPerHost int 24 | } 25 | 26 | func defaultClientOpts() clientOpts { 27 | opts := clientOpts{ 28 | timeout: MIDDLEWARE_DEFAULT_HTTP_TIMEOUT_MS * time.Millisecond, 29 | transportOpts: transportOpts{ 30 | proxyFn: nil, 31 | maxIdleConns: MIDDLEWARE_DEFAULT_HTTP_MAX_IDLE_CONN, 32 | maxConnsPerHost: MIDDLEWARE_DEFAULT_HTTP_MAX_CONN_PER_HOST, 33 | maxIdleConnsPerHost: MIDDLEWARE_DEFAULT_HTTP_MAX_IDLE_CONN_PER_HOST, 34 | }, 35 | } 36 | 37 | value, ok := os.LookupEnv("MIDDLEWARE_HTTP_MAX_IDLE_CONN") 38 | 39 | if ok { 40 | maxIdleConn, err := strconv.Atoi(value) 41 | if err == nil { 42 | 
opts.maxIdleConns = maxIdleConn 43 | } 44 | } 45 | 46 | value, ok = os.LookupEnv("MIDDLEWARE_HTTP_MAX_CONN_PER_HOST") 47 | 48 | if ok { 49 | maxConnPerHost, err := strconv.Atoi(value) 50 | if err == nil { 51 | opts.maxConnsPerHost = maxConnPerHost 52 | } 53 | } 54 | 55 | value, ok = os.LookupEnv("MIDDLEWARE_HTTP_MAX_IDLE_CONN_PER_HOST") 56 | 57 | if ok { 58 | maxIdleConnPerHost, err := strconv.Atoi(value) 59 | if err == nil { 60 | opts.maxIdleConnsPerHost = maxIdleConnPerHost // idle-per-host limit; must not overwrite the per-host connection cap 61 | } 62 | } 63 | 64 | value, ok = os.LookupEnv("MIDDLEWARE_HTTP_TIMEOUT_MS") 65 | 66 | if ok { 67 | timeoutMs, err := strconv.Atoi(value) 68 | if err == nil { 69 | opts.timeout = time.Duration(timeoutMs) * time.Millisecond 70 | } 71 | } 72 | 73 | return opts 74 | } 75 | 76 | func WithTimeout(t time.Duration) types.OptFunc[clientOpts] { 77 | return func(opts *clientOpts) { 78 | opts.timeout = t 79 | } 80 | } 81 | 82 | func WithMaxIdleConns(maxIdleConns int) types.OptFunc[clientOpts] { 83 | return func(opts *clientOpts) { 84 | opts.maxIdleConns = maxIdleConns 85 | } 86 | } 87 | 88 | func WithMaxConnsPerHost(maxConnsPerHost int) types.OptFunc[clientOpts] { 89 | return func(opts *clientOpts) { 90 | opts.maxConnsPerHost = maxConnsPerHost 91 | } 92 | } 93 | 94 | func WithMaxIdleConnsPerHost(maxIdleConnsPerHost int) types.OptFunc[clientOpts] { 95 | return func(opts *clientOpts) { 96 | opts.maxIdleConnsPerHost = maxIdleConnsPerHost 97 | } 98 | } 99 | 100 | func WithProxyFn(fn func(*http.Request) (*url.URL, error)) types.OptFunc[clientOpts] { 101 | return func(opts *clientOpts) { 102 | opts.proxyFn = fn 103 | } 104 | } 105 | 106 | func WithProxies(proxies ...string) types.OptFunc[clientOpts] { 107 | return func(opts *clientOpts) { 108 | proxyUrls := make([]*url.URL, 0, len(proxies)) 109 | 110 | for _, proxy := range proxies { 111 | u, err := url.Parse(strings.TrimSpace(proxy)) 112 | if err != nil { 113 | log.Panic(err) 114 | return 115 | } 116 | proxyUrls = append(proxyUrls, u) 117 | } 118 | opts.proxyFn
= roundRobin(proxyUrls) 119 | } 120 | } 121 | 122 | // roundRobin returns a proxy selector that hands out urls in rotation, one per call. 123 | // With no urls it selects no proxy (nil URL) instead of panicking on a modulo by zero. 124 | func roundRobin(urls []*url.URL) func(*http.Request) (*url.URL, error) { 125 | n := uint32(len(urls)) 126 | if n == 0 { 127 | return func(*http.Request) (*url.URL, error) { 128 | return nil, nil 129 | } 130 | } 131 | var next uint32 132 | return func(*http.Request) (*url.URL, error) { 133 | i := atomic.AddUint32(&next, 1) - 1 134 | return urls[i%n], nil 135 | } 136 | } 137 | 138 | // DefaultClient creates an http client with defaults. 139 | // If default values are set in the env it will pick the defaults from the env. 140 | func DefaultClient(opts ...types.OptFunc[clientOpts]) *http.Client { 141 | cli := &http.Client{} 142 | 143 | // load in default options 144 | cliOpts := defaultClientOpts() 145 | 146 | for _, opt := range opts { 147 | opt(&cliOpts) 148 | } 149 | 150 | t := http.DefaultTransport.(*http.Transport).Clone() 151 | 152 | // set all value from transport options 153 | t.MaxIdleConns = cliOpts.maxIdleConns 154 | t.MaxConnsPerHost = cliOpts.maxConnsPerHost 155 | t.MaxIdleConnsPerHost = cliOpts.maxIdleConnsPerHost 156 | t.Proxy = cliOpts.proxyFn 157 | 158 | // set client options 159 | cli.Timeout = cliOpts.timeout 160 | 161 | cli.Transport = t 162 | 163 | return cli 164 | } 165 | -------------------------------------------------------------------------------- /cmd/gos/constants.go: -------------------------------------------------------------------------------- 1 | package gos 2 | 3 | const MIDDLEWARE_DEFAULT_HTTP_MAX_IDLE_CONN = 100 4 | 5 | const MIDDLEWARE_DEFAULT_HTTP_MAX_CONN_PER_HOST = 100 6 | 7 | const MIDDLEWARE_DEFAULT_HTTP_MAX_IDLE_CONN_PER_HOST = 100 8 | 9 | const MIDDLEWARE_DEFAULT_HTTP_TIMEOUT_MS = 10000 10 | -------------------------------------------------------------------------------- /cmd/gos/gos.go: -------------------------------------------------------------------------------- 1 | package gos 2 | 3 | import ( 4 | "context" 5 | "net/http" 6 | 7 | "github.com/tech-engine/goscrapy/pkg/core" 8 | 
"github.com/tech-engine/goscrapy/pkg/engine" 9 | "github.com/tech-engine/goscrapy/pkg/executor" 10 | httpnative "github.com/tech-engine/goscrapy/pkg/executor_adapters/http_native" 11 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager" 12 | pipelinemanager "github.com/tech-engine/goscrapy/pkg/pipeline_manager" 13 | "github.com/tech-engine/goscrapy/pkg/scheduler" 14 | ) 15 | 16 | func New[OUT any]() *gosBuilder[OUT] { 17 | c := &gosBuilder[OUT]{ 18 | httpClient: DefaultClient(), 19 | } 20 | 21 | c.MiddlewareManager = middlewaremanager.New(c.httpClient) 22 | 23 | c.ExecutorAdapter = httpnative.NewHTTPClientAdapter(c.MiddlewareManager.HTTPClient(), 0) 24 | 25 | c.Executor = executor.New(c.ExecutorAdapter) 26 | 27 | c.Scheduler = scheduler.New(c.Executor) 28 | 29 | c.PipelineManager = pipelinemanager.New[OUT]() 30 | 31 | c.Engine = engine.New(c.Scheduler, c.PipelineManager) 32 | 33 | c.Core = core.New(c.Engine) 34 | return c 35 | } 36 | 37 | func (c *gosBuilder[OUT]) WithClient(cli *http.Client) *gosBuilder[OUT] { 38 | c.httpClient = cli 39 | return c 40 | } 41 | 42 | func (c *gosBuilder[OUT]) Start(ctx context.Context) error { 43 | return c.Engine.Start(ctx) 44 | } 45 | -------------------------------------------------------------------------------- /cmd/gos/ports.go: -------------------------------------------------------------------------------- 1 | package gos 2 | 3 | import ( 4 | "net/http" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/core" 7 | "github.com/tech-engine/goscrapy/pkg/engine" 8 | "github.com/tech-engine/goscrapy/pkg/executor" 9 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager" 10 | pipelinemanager "github.com/tech-engine/goscrapy/pkg/pipeline_manager" 11 | "github.com/tech-engine/goscrapy/pkg/scheduler" 12 | ) 13 | 14 | // Any custom spider created using GoScrapy Framework must implement ICoreSpider[OUT any] interface 15 | type ICoreSpider[OUT any] interface { 16 | Request(req core.IRequestReader, cb core.ResponseCallback) 17 | 
NewRequest() core.IRequestRW 18 | Yield(core.IOutput[OUT]) 19 | } 20 | 21 | // Separate interface created for configuration purposes 22 | 23 | // engine.*Engine[OUT] accepts a pipeline manager that implements engine.IPipelineManager[OUT] 24 | // interface which doesn't have the Add function as engine.IPipelineManager[OUT] 25 | // is not responsible for adding pipelines. 26 | // But pipelinemanager.*PipelineManager[OUT] does expose an Add function for external configuration 27 | // purposes and to access it we have created the IPipelineManagerAdder[OUT] interface. 28 | type IPipelineManagerAdder[OUT any] interface { 29 | engine.IPipelineManager[OUT] 30 | Add(...pipelinemanager.IPipeline[OUT]) 31 | } 32 | 33 | // core.*Core[OUT] accepts an engine that implements core.IEngine[OUT] interface which 34 | // doesn't have the WithScheduler function as core.IEngine[OUT] is not responsible for 35 | // setting Scheduler. But engine.*Engine[OUT] does expose a WithScheduler function for external 36 | // configuration purposes and to access it we have created the IEngineConfigurer[OUT] interface. 37 | // Same is the case for WithPipelineManager function. 38 | type IEngineConfigurer[OUT any] interface { 39 | core.IEngine[OUT] 40 | WithScheduler(engine.IScheduler) 41 | WithPipelineManager(engine.IPipelineManager[OUT]) 42 | } 43 | 44 | // engine.*Engine[OUT] accepts a scheduler that implements engine.IScheduler interface which 45 | // doesn't have the WithExecutor function as engine.IScheduler is not responsible for 46 | // setting an Executor. But the concrete scheduler returned by scheduler.New does expose a WithExecutor function for external 47 | // configuration purposes and to access it we have created the ISchedulerConfigurer[OUT] interface. 
48 | type ISchedulerConfigurer[OUT any] interface { 49 | engine.IScheduler 50 | WithExecutor(scheduler.IExecutor) 51 | } 52 | 53 | // scheduler.*Scheduler accepts an executor that implements scheduler.IExecutor interface which 54 | // doesn't have the WithAdapter function as scheduler.IExecutor is not responsible for 55 | // setting an adapter. But executor.*Executor does expose a WithAdapter function for external 56 | // configuration purposes and to access it we have created the IExecutorConfigurer[OUT] interface. 57 | type IExecutorConfigurer[OUT any] interface { 58 | scheduler.IExecutor 59 | WithAdapter(executor.IExecutorAdapter) 60 | } 61 | 62 | // executor.*Executor accepts an adapter that implements executor.IExecutorAdapter interface which 63 | // doesn't have the WithClient function as executor.IExecutorAdapter is not responsible for 64 | // setting an http client. But executoradapter.*HTTPAdapter does expose a WithClient function for external 65 | // configuration purposes and to access it we have created the IExecutorAdapterConfigurer interface. 
66 | type IExecutorAdapterConfigurer interface { 67 | executor.IExecutorAdapter 68 | WithClient(*http.Client) 69 | } 70 | 71 | type IMiddlewareManager interface { 72 | HTTPClient() *http.Client 73 | Add(...middlewaremanager.Middleware) 74 | } 75 | -------------------------------------------------------------------------------- /cmd/gos/types.go: -------------------------------------------------------------------------------- 1 | package gos 2 | 3 | import ( 4 | "net/http" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/core" 7 | ) 8 | 9 | type gosBuilder[OUT any] struct { 10 | *core.Core[OUT] 11 | Engine IEngineConfigurer[OUT] 12 | PipelineManager IPipelineManagerAdder[OUT] 13 | Scheduler ISchedulerConfigurer[OUT] 14 | Executor IExecutorConfigurer[OUT] 15 | ExecutorAdapter IExecutorAdapterConfigurer 16 | MiddlewareManager IMiddlewareManager 17 | httpClient *http.Client 18 | } 19 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/tech-engine/goscrapy 2 | 3 | go 1.22 4 | 5 | require ( 6 | firebase.google.com/go v3.13.0+incompatible 7 | github.com/gocarina/gocsv v0.0.0-20240520201108-78e41c74b4b1 8 | github.com/segmentio/fasthash v1.0.3 9 | github.com/spf13/cobra v1.8.1 10 | github.com/stretchr/testify v1.9.0 11 | go.mongodb.org/mongo-driver v1.17.0 12 | golang.org/x/crypto v0.31.0 13 | golang.org/x/net v0.29.0 14 | golang.org/x/sync v0.10.0 15 | google.golang.org/api v0.198.0 16 | ) 17 | 18 | require ( 19 | cloud.google.com/go/auth v0.9.4 // indirect 20 | cloud.google.com/go/auth/oauth2adapt v0.2.4 // indirect 21 | github.com/antchfx/xpath v1.3.1 // indirect 22 | github.com/felixge/httpsnoop v1.0.4 // indirect 23 | github.com/go-logr/logr v1.4.2 // indirect 24 | github.com/go-logr/stdr v1.2.2 // indirect 25 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.55.0 // indirect 26 | 
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.55.0 // indirect 27 | go.opentelemetry.io/otel v1.30.0 // indirect 28 | go.opentelemetry.io/otel/metric v1.30.0 // indirect 29 | go.opentelemetry.io/otel/trace v1.30.0 // indirect 30 | ) 31 | 32 | require ( 33 | cloud.google.com/go v0.115.1 // indirect 34 | cloud.google.com/go/compute/metadata v0.5.1 // indirect 35 | cloud.google.com/go/firestore v1.17.0 // indirect 36 | cloud.google.com/go/iam v1.2.1 // indirect 37 | cloud.google.com/go/longrunning v0.6.1 // indirect 38 | cloud.google.com/go/storage v1.43.0 // indirect 39 | github.com/andybalholm/cascadia v1.3.2 40 | github.com/antchfx/htmlquery v1.3.2 41 | github.com/davecgh/go-spew v1.1.1 // indirect 42 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 43 | github.com/golang/protobuf v1.5.4 // indirect 44 | github.com/golang/snappy v0.0.4 // indirect 45 | github.com/google/s2a-go v0.1.8 // indirect 46 | github.com/google/uuid v1.6.0 // indirect 47 | github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect 48 | github.com/googleapis/gax-go/v2 v2.13.0 // indirect 49 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 50 | github.com/klauspost/compress v1.17.9 // indirect 51 | github.com/montanaflynn/stats v0.7.1 // indirect 52 | github.com/pmezard/go-difflib v1.0.0 // indirect 53 | github.com/spf13/pflag v1.0.5 // indirect 54 | github.com/xdg-go/pbkdf2 v1.0.0 // indirect 55 | github.com/xdg-go/scram v1.1.2 // indirect 56 | github.com/xdg-go/stringprep v1.0.4 // indirect 57 | github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect 58 | go.opencensus.io v0.24.0 // indirect 59 | golang.org/x/oauth2 v0.23.0 // indirect 60 | golang.org/x/sys v0.28.0 // indirect 61 | golang.org/x/text v0.21.0 // indirect 62 | golang.org/x/time v0.6.0 // indirect 63 | google.golang.org/appengine v1.6.8 // indirect 64 | google.golang.org/genproto v0.0.0-20240903143218-8af14fe29dc1 // indirect 65 | 
google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 // indirect 66 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect 67 | google.golang.org/grpc v1.67.0 // indirect 68 | google.golang.org/protobuf v1.34.2 // indirect 69 | gopkg.in/yaml.v3 v3.0.1 // indirect 70 | ) 71 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= 2 | cloud.google.com/go v0.115.1 h1:Jo0SM9cQnSkYfp44+v+NQXHpcHqlnRJk2qxh6yvxxxQ= 3 | cloud.google.com/go v0.115.1/go.mod h1:DuujITeaufu3gL68/lOFIirVNJwQeyf5UXyi+Wbgknc= 4 | cloud.google.com/go/auth v0.9.4 h1:DxF7imbEbiFu9+zdKC6cKBko1e8XeJnipNqIbWZ+kDI= 5 | cloud.google.com/go/auth v0.9.4/go.mod h1:SHia8n6//Ya940F1rLimhJCjjx7KE17t0ctFEci3HkA= 6 | cloud.google.com/go/auth/oauth2adapt v0.2.4 h1:0GWE/FUsXhf6C+jAkWgYm7X9tK8cuEIfy19DBn6B6bY= 7 | cloud.google.com/go/auth/oauth2adapt v0.2.4/go.mod h1:jC/jOpwFP6JBxhB3P5Rr0a9HLMC/Pe3eaL4NmdvqPtc= 8 | cloud.google.com/go/compute/metadata v0.5.1 h1:NM6oZeZNlYjiwYje+sYFjEpP0Q0zCan1bmQW/KmIrGs= 9 | cloud.google.com/go/compute/metadata v0.5.1/go.mod h1:C66sj2AluDcIqakBq/M8lw8/ybHgOZqin2obFxa/E5k= 10 | cloud.google.com/go/firestore v1.17.0 h1:iEd1LBbkDZTFsLw3sTH50eyg4qe8eoG6CjocmEXO9aQ= 11 | cloud.google.com/go/firestore v1.17.0/go.mod h1:69uPx1papBsY8ZETooc71fOhoKkD70Q1DwMrtKuOT/Y= 12 | cloud.google.com/go/iam v1.2.1 h1:QFct02HRb7H12J/3utj0qf5tobFh9V4vR6h9eX5EBRU= 13 | cloud.google.com/go/iam v1.2.1/go.mod h1:3VUIJDPpwT6p/amXRC5GY8fCCh70lxPygguVtI0Z4/g= 14 | cloud.google.com/go/longrunning v0.6.1 h1:lOLTFxYpr8hcRtcwWir5ITh1PAKUD/sG2lKrTSYjyMc= 15 | cloud.google.com/go/longrunning v0.6.1/go.mod h1:nHISoOZpBcmlwbJmiVk5oDRz0qG/ZxPynEGs1iZ79s0= 16 | cloud.google.com/go/storage v1.43.0 h1:CcxnSohZwizt4LCzQHWvBf1/kvtHUn7gk9QERXPyXFs= 17 | 
cloud.google.com/go/storage v1.43.0/go.mod h1:ajvxEa7WmZS1PxvKRq4bq0tFT3vMd502JwstCcYv0Q0= 18 | firebase.google.com/go v3.13.0+incompatible h1:3TdYC3DDi6aHn20qoRkxwGqNgdjtblwVAyRLQwGn/+4= 19 | firebase.google.com/go v3.13.0+incompatible/go.mod h1:xlah6XbEyW6tbfSklcfe5FHJIwjt8toICdV5Wh9ptHs= 20 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 21 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= 22 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= 23 | github.com/antchfx/htmlquery v1.3.2 h1:85YdttVkR1rAY+Oiv/nKI4FCimID+NXhDn82kz3mEvs= 24 | github.com/antchfx/htmlquery v1.3.2/go.mod h1:1mbkcEgEarAokJiWhTfr4hR06w/q2ZZjnYLrDt6CTUk= 25 | github.com/antchfx/xpath v1.3.1 h1:PNbFuUqHwWl0xRjvUPjJ95Agbmdj2uzzIwmQKgu4oCk= 26 | github.com/antchfx/xpath v1.3.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= 27 | github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= 28 | github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= 29 | github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= 30 | github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= 31 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 32 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 33 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 34 | github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= 35 | github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= 36 | github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= 37 | 
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= 38 | github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= 39 | github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= 40 | github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= 41 | github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= 42 | github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 43 | github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= 44 | github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= 45 | github.com/gocarina/gocsv v0.0.0-20240520201108-78e41c74b4b1 h1:FWNFq4fM1wPfcK40yHE5UO3RUdSNPaBC+j3PokzA6OQ= 46 | github.com/gocarina/gocsv v0.0.0-20240520201108-78e41c74b4b1/go.mod h1:5YoVOkjYAQumqlV356Hj3xeYh4BdZuLE0/nRkf2NKkI= 47 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= 48 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= 49 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= 50 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= 51 | github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= 52 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 53 | github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 54 | github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= 55 | github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= 56 | github.com/golang/protobuf 
v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= 57 | github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= 58 | github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= 59 | github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= 60 | github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= 61 | github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= 62 | github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= 63 | github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= 64 | github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= 65 | github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= 66 | github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= 67 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= 68 | github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= 69 | github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= 70 | github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 71 | github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 72 | github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 73 | github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 74 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 75 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 76 | github.com/google/martian/v3 v3.3.3 h1:DIhPTQrbPkgs2yJYdXU/eNACCG5DVQjySNRNlflZ9Fc= 77 | github.com/google/martian/v3 v3.3.3/go.mod 
h1:iEPrYcgCF7jA9OtScMFQyAlZZ4YXTKEtJ1E6RWzmBA0= 78 | github.com/google/s2a-go v0.1.8 h1:zZDs9gcbt9ZPLV0ndSyQk6Kacx2g/X+SKYovpnz3SMM= 79 | github.com/google/s2a-go v0.1.8/go.mod h1:6iNWHTpQ+nfNRN5E00MSdfDwVesa8hhS32PhPO8deJA= 80 | github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 81 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 82 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 83 | github.com/googleapis/enterprise-certificate-proxy v0.3.4 h1:XYIDZApgAnrN1c855gTgghdIA6Stxb52D5RnLI1SLyw= 84 | github.com/googleapis/enterprise-certificate-proxy v0.3.4/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA= 85 | github.com/googleapis/gax-go/v2 v2.13.0 h1:yitjD5f7jQHhyDsnhKEBU52NdvvdSeGzlAnDPT0hH1s= 86 | github.com/googleapis/gax-go/v2 v2.13.0/go.mod h1:Z/fvTZXF8/uw7Xu5GuslPw+bplx6SS338j1Is2S+B7A= 87 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= 88 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= 89 | github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= 90 | github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= 91 | github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE= 92 | github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= 93 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 94 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 95 | github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= 96 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 97 | github.com/segmentio/fasthash v1.0.3 h1:EI9+KE1EwvMLBWwjpRDc+fEM+prwxDYbslddQGtrmhM= 98 
| github.com/segmentio/fasthash v1.0.3/go.mod h1:waKX8l2N8yckOgmSsXJi7x1ZfdKZ4x7KRMzBtS3oedY= 99 | github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= 100 | github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= 101 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= 102 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 103 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 104 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 105 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 106 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 107 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 108 | github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= 109 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= 110 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 111 | github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= 112 | github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= 113 | github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= 114 | github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3kKLN4= 115 | github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= 116 | github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= 117 | github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= 118 | github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= 119 | 
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 120 | go.mongodb.org/mongo-driver v1.17.0 h1:Hp4q2MCjvY19ViwimTs00wHi7G4yzxh4/2+nTx8r40k= 121 | go.mongodb.org/mongo-driver v1.17.0/go.mod h1:wwWm/+BuOddhcq3n68LKRmgk2wXzmF6s0SFOa0GINL4= 122 | go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= 123 | go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= 124 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.55.0 h1:hCq2hNMwsegUvPzI7sPOvtO9cqyy5GbWt/Ybp2xrx8Q= 125 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.55.0/go.mod h1:LqaApwGx/oUmzsbqxkzuBvyoPpkxk3JQWnqfVrJ3wCA= 126 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.55.0 h1:ZIg3ZT/aQ7AfKqdwp7ECpOK6vHqquXXuyTjIO8ZdmPs= 127 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.55.0/go.mod h1:DQAwmETtZV00skUwgD6+0U89g80NKsJE3DCKeLLPQMI= 128 | go.opentelemetry.io/otel v1.30.0 h1:F2t8sK4qf1fAmY9ua4ohFS/K+FUuOPemHUIXHtktrts= 129 | go.opentelemetry.io/otel v1.30.0/go.mod h1:tFw4Br9b7fOS+uEao81PJjVMjW/5fvNCbpsDIXqP0pc= 130 | go.opentelemetry.io/otel/metric v1.30.0 h1:4xNulvn9gjzo4hjg+wzIKG7iNFEaBMX00Qd4QIZs7+w= 131 | go.opentelemetry.io/otel/metric v1.30.0/go.mod h1:aXTfST94tswhWEb+5QjlSqG+cZlmyXy/u8jFpor3WqQ= 132 | go.opentelemetry.io/otel/sdk v1.29.0 h1:vkqKjk7gwhS8VaWb0POZKmIEDimRCMsopNYnriHyryo= 133 | go.opentelemetry.io/otel/sdk v1.29.0/go.mod h1:pM8Dx5WKnvxLCb+8lG1PRNIDxu9g9b9g59Qr7hfAAok= 134 | go.opentelemetry.io/otel/trace v1.30.0 h1:7UBkkYzeg3C7kQX8VAidWh2biiQbtAKjyIML8dQ9wmc= 135 | go.opentelemetry.io/otel/trace v1.30.0/go.mod h1:5EyKqTzzmyqB9bwtCCq6pDLktPK6fmGf/Dph+8VI02o= 136 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 137 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 138 | golang.org/x/crypto 
v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 139 | golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= 140 | golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= 141 | golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 142 | golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= 143 | golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= 144 | golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= 145 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 146 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 147 | golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 148 | golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 149 | golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 150 | golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 151 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 152 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 153 | golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 154 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 155 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 156 | golang.org/x/net v0.6.0/go.mod 
h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 157 | golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 158 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= 159 | golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo= 160 | golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= 161 | golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= 162 | golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs= 163 | golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= 164 | golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 165 | golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 166 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 167 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 168 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 169 | golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= 170 | golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 171 | golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 172 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 173 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 174 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 175 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 176 | golang.org/x/sys 
v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 177 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 178 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 179 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 180 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 181 | golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= 182 | golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 183 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 184 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 185 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 186 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= 187 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 188 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 189 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 190 | golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= 191 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 192 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 193 | golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= 194 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= 195 | golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U= 196 | golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= 197 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod 
h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 198 | golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 199 | golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= 200 | golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= 201 | golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= 202 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 203 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 204 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 205 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 206 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 207 | google.golang.org/api v0.198.0 h1:OOH5fZatk57iN0A7tjJQzt6aPfYQ1JiWkt1yGseazks= 208 | google.golang.org/api v0.198.0/go.mod h1:/Lblzl3/Xqqk9hw/yS97TImKTUwnf1bv89v7+OagJzc= 209 | google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= 210 | google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= 211 | google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= 212 | google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= 213 | google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= 214 | google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= 215 | google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= 216 | google.golang.org/genproto 
v0.0.0-20240903143218-8af14fe29dc1 h1:BulPr26Jqjnd4eYDVe+YvyR7Yc2vJGkO5/0UxD0/jZU= 217 | google.golang.org/genproto v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:hL97c3SYopEHblzpxRL4lSs523++l8DYxGM1FQiYmb4= 218 | google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 h1:hjSy6tcFQZ171igDaN5QHOw2n6vx40juYbC/x67CEhc= 219 | google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:qpvKtACPCQhAdu3PyQgV4l3LMXZEtft7y8QcarRsp9I= 220 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 h1:pPJltXNxVzT4pK9yD8vR9X75DaWYYmLGMsEvBfFQZzQ= 221 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= 222 | google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= 223 | google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= 224 | google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= 225 | google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= 226 | google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= 227 | google.golang.org/grpc v1.67.0 h1:IdH9y6PF5MPSdAntIcpjQ+tXO41pcQsfZV2RxtQgVcw= 228 | google.golang.org/grpc v1.67.0/go.mod h1:1gLDyUQU7CTLJI90u3nXZ9ekeghjeM7pTDZlqFNg2AA= 229 | google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= 230 | google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= 231 | google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= 232 | google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= 233 | google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= 234 | 
google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 235 | google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 236 | google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 237 | google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= 238 | google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= 239 | google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= 240 | google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= 241 | google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= 242 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 243 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 244 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 245 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 246 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 247 | honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= 248 | honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= 249 | -------------------------------------------------------------------------------- /internal/cmap/cmap.go: -------------------------------------------------------------------------------- 1 | package cmap 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "sync" 7 | 8 | "github.com/tech-engine/goscrapy/internal/types" 9 | ) 10 | 11 | var ERR_MAX_ITEM_EXCEEDED = errors.New("CMAP max item exceeded") 12 | 13 | type CMap[K comparable, V any] struct { 14 | opts 15 | lock sync.RWMutex 
16 | data map[K]Void[V] 17 | } 18 | 19 | func NewCMap[K comparable, V any](optFuncs ...types.OptFunc[opts]) *CMap[K, V] { 20 | 21 | opts := defaultOpts() 22 | 23 | for _, fn := range optFuncs { 24 | fn(&opts) 25 | } 26 | 27 | return &CMap[K, V]{ 28 | opts: opts, 29 | data: make(map[K]Void[V], opts.size), 30 | } 31 | } 32 | 33 | func (cm *CMap[K, V]) Get(key K) (V, bool) { 34 | 35 | cm.lock.RLock() 36 | defer cm.lock.RUnlock() 37 | 38 | val, ok := cm.data[key] 39 | 40 | return val.Data, ok 41 | } 42 | 43 | func (cm *CMap[K, V]) Set(key K, val V) error { 44 | 45 | cm.lock.Lock() 46 | defer cm.lock.Unlock() 47 | 48 | _, ok := cm.data[key] 49 | 50 | if (len(cm.data) >= cm.size) && !ok { 51 | return fmt.Errorf("Set: %w: max allowed=[%d]", ERR_MAX_ITEM_EXCEEDED, cm.size) 52 | } 53 | 54 | cm.data[key] = Void[V]{val} 55 | 56 | return nil 57 | } 58 | 59 | func (cm *CMap[K, V]) Del(key K) { 60 | cm.lock.Lock() 61 | delete(cm.data, key) 62 | cm.lock.Unlock() 63 | } 64 | 65 | func (cm *CMap[K, V]) Clear() { 66 | clear(cm.data) 67 | } 68 | 69 | func (cm *CMap[K, V]) Keys() []any { 70 | keys := make([]any, cm.size) 71 | 72 | var i = 0 73 | for key := range cm.data { 74 | keys[i] = key 75 | i++ 76 | } 77 | 78 | return keys 79 | } 80 | 81 | func (cm *CMap[K, V]) Len() int { 82 | 83 | cm.lock.RLock() 84 | defer cm.lock.RUnlock() 85 | 86 | return len(cm.data) 87 | } 88 | -------------------------------------------------------------------------------- /internal/cmap/cmap_test.go: -------------------------------------------------------------------------------- 1 | package cmap 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | type testCase struct { 11 | key, 12 | val, 13 | op string 14 | } 15 | 16 | func TestCMapGet(t *testing.T) { 17 | cmap := NewCMap[string, string]() 18 | cmap.Set("KEY_1", "VAL_1") 19 | 20 | val, ok := cmap.Get("KEY_1") 21 | 22 | assert.True(t, ok, "ok=true expected for key=%s", "KEY_1") 23 | 24 | assert.Equalf(t, 
"VAL_1", val, "val=%s expected for key=%s but got %s", "VAL_1", "KEY_1", val) 25 | 26 | val, ok = cmap.Get("KEY_2") 27 | 28 | assert.Falsef(t, ok, "ok=false expected for key=%s", "KEY_2") 29 | 30 | assert.Emptyf(t, val, "empty string expected for key=%s but got %s") 31 | } 32 | 33 | func TestCMapSet(t *testing.T) { 34 | cmap := NewCMap[string, string]() 35 | err := cmap.Set("KEY_1", "VAL_1") 36 | 37 | assert.NoErrorf(t, err, "nil error expected but got %s", err) 38 | } 39 | 40 | func TestCMapLimit(t *testing.T) { 41 | var err error 42 | cmap := NewCMap[string, string](WithSize(2)) 43 | 44 | err = cmap.Set("KEY_1", "VAL_1") 45 | assert.NoErrorf(t, err, "nil error expected for key=%s but got %s", "KEY_1", err) 46 | 47 | err = cmap.Set("KEY_2", "VAL_2") 48 | assert.NoErrorf(t, err, "nil error expected for key=%s but got %s", "KEY_2", err) 49 | 50 | err = cmap.Set("KEY_3", "VAL_3") 51 | 52 | if assert.Errorf(t, err, "error is expected for key=%s", "KEY_3") { 53 | assert.ErrorIs(t, err, ERR_MAX_ITEM_EXCEEDED) 54 | } 55 | 56 | cmap.Del("KEY_2") 57 | 58 | err = cmap.Set("KEY_3", "VAL_3") 59 | 60 | assert.NoErrorf(t, err, "nil error expected for key=%s but got %s", "KEY_3", err) 61 | } 62 | 63 | func TestCMapDel(t *testing.T) { 64 | cmap := NewCMap[string, string]() 65 | 66 | cmap.Set("KEY_1", "VAL_1") 67 | 68 | cmap.Del("KEY_1") 69 | 70 | assert.Equalf(t, 0, cmap.Len(), "len=%d expected but has %d", 0, cmap.Len()) 71 | 72 | val, ok := cmap.Get("KEY_1") 73 | 74 | assert.Falsef(t, ok, "false expected for key=%s", "KEY_1") 75 | assert.Equalf(t, "", val, "empty string expected but got %s", val) 76 | } 77 | 78 | func TestCMapConcurrency(t *testing.T) { 79 | 80 | testCases := []testCase{ 81 | { 82 | key: "KEY_1", 83 | val: "VAL_1", 84 | op: "WRITE", 85 | }, 86 | { 87 | key: "KEY_2", 88 | val: "VAL_2", 89 | op: "WRITE", 90 | }, 91 | { 92 | key: "KEY_2", 93 | val: "VAL_2", 94 | op: "READ", 95 | }, 96 | { 97 | key: "KEY_3", 98 | val: "VAL_3", 99 | op: "WRITE", 100 | }, 101 | { 102 
| key: "KEY_4", 103 | val: "VAL_4", 104 | op: "WRITE", 105 | }, 106 | { 107 | key: "KEY_1", 108 | val: "VAL_1", 109 | op: "READ", 110 | }, 111 | { 112 | key: "KEY_1", 113 | val: "VAL_1", 114 | op: "WRITE", 115 | }, 116 | } 117 | 118 | cmap := NewCMap[string, string](WithSize(5)) 119 | 120 | for i := 0; i < 100; i++ { 121 | t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { 122 | t.Parallel() 123 | for _, tc := range testCases { 124 | if tc.op == "READ" { 125 | cmap.Get(tc.key) 126 | } else { 127 | err := cmap.Set(tc.key, tc.val) 128 | 129 | assert.NoError(t, err) 130 | } 131 | } 132 | }) 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /internal/cmap/cmaph.go: -------------------------------------------------------------------------------- 1 | package cmap 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | 7 | "github.com/segmentio/fasthash/fnv1a" 8 | "github.com/tech-engine/goscrapy/internal/types" 9 | ) 10 | 11 | type CMapH struct { 12 | opts 13 | lock sync.RWMutex 14 | data map[uint64]Void[any] 15 | keys []any 16 | lastKeyIndex int 17 | } 18 | 19 | func defaultOpts() opts { 20 | return opts{ 21 | size: 24, 22 | hashFn: fnv1a.HashString64, 23 | } 24 | } 25 | 26 | func WithSize(size int) types.OptFunc[opts] { 27 | return func(opts *opts) { 28 | opts.size = size 29 | } 30 | } 31 | 32 | func WithHashFn(fn hashFn) types.OptFunc[opts] { 33 | return func(opts *opts) { 34 | opts.hashFn = fn 35 | } 36 | } 37 | 38 | func NewHCMap(optFuncs ...types.OptFunc[opts]) *CMapH { 39 | 40 | opts := defaultOpts() 41 | 42 | for _, fn := range optFuncs { 43 | fn(&opts) 44 | } 45 | 46 | return &CMapH{ 47 | opts: opts, 48 | data: make(map[uint64]Void[any], opts.size), 49 | keys: make([]any, opts.size), 50 | } 51 | } 52 | 53 | func (cm *CMapH) Get(key string) (any, bool) { 54 | 55 | hkey := cm.hashFn(key) 56 | 57 | cm.lock.RLock() 58 | defer cm.lock.RUnlock() 59 | 60 | val, ok := cm.data[hkey] 61 | 62 | return val.Data, ok 63 | } 64 | 65 | func 
(cm *CMapH) Set(key string, val any) error { 66 | 67 | hkey := cm.hashFn(key) 68 | 69 | cm.lock.Lock() 70 | defer cm.lock.Unlock() 71 | 72 | _, ok := cm.data[hkey] 73 | 74 | if (len(cm.data) > cm.size) && !ok { 75 | return fmt.Errorf("Set: max items of %d exceeded", cm.size) 76 | } 77 | 78 | cm.data[hkey] = Void[any]{val} 79 | 80 | if cm.lastKeyIndex < cm.opts.size { 81 | cm.keys[cm.lastKeyIndex] = key 82 | cm.lastKeyIndex++ 83 | } 84 | 85 | return nil 86 | } 87 | 88 | func (cm *CMapH) Len() int { 89 | 90 | cm.lock.RLock() 91 | defer cm.lock.RUnlock() 92 | 93 | return len(cm.data) 94 | } 95 | 96 | func (cm *CMapH) Del(key string) { 97 | hkey := cm.hashFn(key) 98 | cm.lock.Lock() 99 | delete(cm.data, hkey) 100 | cm.lock.Unlock() 101 | } 102 | 103 | func (cm *CMapH) Clear() { 104 | clear(cm.data) 105 | } 106 | 107 | func (cm *CMapH) Keys() []any { 108 | return cm.keys 109 | } 110 | -------------------------------------------------------------------------------- /internal/cmap/types.go: -------------------------------------------------------------------------------- 1 | package cmap 2 | 3 | type opts struct { 4 | size int 5 | hashFn hashFn 6 | } 7 | 8 | type hashFn func(string) uint64 9 | 10 | type Void[V any] struct { 11 | Data V 12 | } 13 | -------------------------------------------------------------------------------- /internal/fsm/fsm.go: -------------------------------------------------------------------------------- 1 | package fsm 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | 7 | "github.com/tech-engine/goscrapy/internal/cmap" 8 | ) 9 | 10 | var ERR_MAX_ITEM_EXCEEDED = errors.New("FSM max item exceeded") 11 | 12 | type FixedSizeMap[K comparable, V any] struct { 13 | size uint64 14 | data map[K]cmap.Void[V] 15 | } 16 | 17 | func New[K comparable, V any](size uint64) *FixedSizeMap[K, V] { 18 | return &FixedSizeMap[K, V]{ 19 | size: size, 20 | data: make(map[K]cmap.Void[V], size), 21 | } 22 | } 23 | 24 | func (fsm *FixedSizeMap[K, V]) Get(key K) (V, bool) { 25 | 26 
| val, ok := fsm.data[key] 27 | 28 | return val.Data, ok 29 | } 30 | 31 | func (fsm *FixedSizeMap[K, V]) Set(key K, val V) error { 32 | 33 | if _, ok := fsm.data[key]; !ok && (len(fsm.data) >= int(fsm.size)) { 34 | return fmt.Errorf("Set:fixedsizemap.go: %w: max allowed=[%d]", ERR_MAX_ITEM_EXCEEDED, fsm.size) 35 | } 36 | 37 | fsm.data[key] = cmap.Void[V]{Data: val} 38 | 39 | return nil 40 | } 41 | 42 | func (fsm *FixedSizeMap[K, V]) Clear() { 43 | clear(fsm.data) 44 | } 45 | -------------------------------------------------------------------------------- /internal/fsm/fsm_test.go: -------------------------------------------------------------------------------- 1 | package fsm 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | type testCase struct { 10 | key, 11 | val, 12 | op string 13 | } 14 | 15 | func TestFsmGet(t *testing.T) { 16 | fsm := New[string, string](5) 17 | fsm.Set("KEY_1", "VAL_1") 18 | 19 | val, ok := fsm.Get("KEY_1") 20 | 21 | assert.True(t, ok, "ok=true expected for key=%s", "KEY_1") 22 | 23 | assert.Equalf(t, "VAL_1", val, "val=%s expected for key=%s but got %s", "VAL_1", "KEY_1", val) 24 | 25 | val, ok = fsm.Get("KEY_2") 26 | 27 | assert.Falsef(t, ok, "ok=false expected for key=%s", "KEY_2") 28 | 29 | assert.Emptyf(t, val, "empty string expected for key=%s but got %s") 30 | } 31 | 32 | func TestFsmSet(t *testing.T) { 33 | fsm := New[string, string](5) 34 | err := fsm.Set("KEY_1", "VAL_1") 35 | 36 | assert.NoErrorf(t, err, "nil error expected but got %s", err) 37 | } 38 | 39 | func TestFsmLimit(t *testing.T) { 40 | var err error 41 | fsm := New[string, string](2) 42 | 43 | err = fsm.Set("KEY_1", "VAL_1") 44 | assert.NoErrorf(t, err, "nil error expected for key=%s but got %s", "KEY_1", err) 45 | 46 | err = fsm.Set("KEY_2", "VAL_2") 47 | assert.NoErrorf(t, err, "nil error expected for key=%s but got %s", "KEY_2", err) 48 | 49 | err = fsm.Set("KEY_3", "VAL_3") 50 | 51 | if assert.Errorf(t, err, "error is 
// WithSize returns an option that gives the Pooler a backing Pool able to hold
// up to size idle items.
func WithSize[T any](size uint64) PoolOption[T] {
	return func(target *Pooler[T]) {
		target.pool = NewPool[T](size)
	}
}

// Pooler hands out *T values, optionally recycling them through a Pool. With
// no pool configured every Acquire allocates and Release does nothing.
type Pooler[T any] struct {
	pool Pool[T]
}

// PoolOption configures a Pooler during NewPooler.
type PoolOption[T any] func(*Pooler[T])

// NewPooler creates a Pooler and applies options in the order given.
func NewPooler[T any](options ...PoolOption[T]) *Pooler[T] {
	pooler := new(Pooler[T])

	for i := range options {
		options[i](pooler)
	}

	return pooler
}

// Acquire returns an item from the backing pool when one is configured, or a
// freshly allocated zero T when it is not.
//
// NOTE(review): with a configured but empty pool this returns nil (see
// Pool.Acquire), unlike the no-pool path — confirm callers handle nil.
func (p *Pooler[T]) Acquire() *T {
	if p.pool == nil {
		return new(T)
	}
	return p.pool.Acquire()
}

// Release hands item back to the backing pool; without a pool it is a no-op.
func (p *Pooler[T]) Release(item *T) {
	if p.pool == nil {
		return
	}
	p.pool.Release(item)
}

// Pool is a bounded free list of *T implemented as a buffered channel.
type Pool[T any] chan *T

// Acquire takes an idle item from the pool without blocking. It returns nil
// when the pool is empty.
func (p Pool[T]) Acquire() (_p *T) {
	select {
	case _p = <-p:
	default:
	}
	return _p
}

// Release puts item back into the pool without blocking. When the pool is
// already full the item is dropped.
func (p Pool[T]) Release(item *T) {
	select {
	case p <- item:
		return
	default:
		return
	}
}

// NewPool creates a Pool that can hold up to max idle items.
func NewPool[K any](max uint64) Pool[K] {
	return make(Pool[K], max)
}
-------------------------------------------------------------------------------- /internal/types/option.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | type OptFunc[T any] func(*T) 4 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "github.com/tech-engine/goscrapy/cmd/cli" 4 | 5 | func main() { 6 | cli.Execute() 7 | } 8 | -------------------------------------------------------------------------------- /pkg/builtin/middlewares/dupefilter.go: -------------------------------------------------------------------------------- 1 | package middlewares 2 | 3 | import ( 4 | "encoding/hex" 5 | "errors" 6 | "fmt" 7 | "hash" 8 | "io" 9 | "net/http" 10 | "sort" 11 | "strings" 12 | "sync" 13 | 14 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager" 15 | "golang.org/x/crypto/blake2b" 16 | ) 17 | 18 | var ERR_DUPEFILTER_BLOCKED = errors.New("duplicate request") 19 | 20 | type RequestMap struct { 21 | seen map[string]struct{} 22 | mu sync.RWMutex 23 | } 24 | 25 | func NewRequestMap() *RequestMap { 26 | return &RequestMap{ 27 | seen: make(map[string]struct{}), 28 | } 29 | } 30 | 31 | func generateSHA1FingerprintFromReq(r *http.Request) (string, error) { 32 | 33 | var ( 34 | err error 35 | body io.ReadCloser 36 | hash hash.Hash 37 | ) 38 | 39 | if r.GetBody != nil { 40 | body, err = r.GetBody() 41 | if err != nil { 42 | return "", err 43 | } 44 | defer body.Close() 45 | } 46 | 47 | var combinedBuf strings.Builder 48 | 49 | hash, err = blake2b.New256(nil) 50 | if err != nil { 51 | return "", err 52 | } 53 | 54 | if body != nil { 55 | 56 | if _, err = io.Copy(hash, body); err != nil { 57 | return "", err 58 | } 59 | } 60 | 61 | combinedBuf.WriteString(r.Method) 62 | combinedBuf.WriteString(r.URL.String()) 63 | 64 | headerKeys := make([]string, 0, len(r.Header)) 65 | 
for key := range r.Header { 66 | headerKeys = append(headerKeys, key) 67 | } 68 | 69 | sort.Strings(headerKeys) 70 | 71 | // added sorted headers 72 | for _, key := range headerKeys { 73 | for _, value := range r.Header[key] { 74 | combinedBuf.WriteString(key) 75 | combinedBuf.WriteString(value) 76 | } 77 | } 78 | 79 | if _, err = hash.Write([]byte(combinedBuf.String())); err != nil { 80 | return "", err 81 | } 82 | 83 | finalHash := hash.Sum(nil) 84 | 85 | return hex.EncodeToString(finalHash[:]), nil 86 | 87 | } 88 | 89 | func DupeFilter(next http.RoundTripper) http.RoundTripper { 90 | requestMap := NewRequestMap() 91 | return middlewaremanager.MiddlewareFunc(func(req *http.Request) (*http.Response, error) { 92 | signature, err := generateSHA1FingerprintFromReq(req) 93 | 94 | if err != nil { 95 | return nil, fmt.Errorf("duplicatefilter.go:DupeFilterMiddleware: error generating request signature %w", err) 96 | } 97 | 98 | requestMap.mu.Lock() 99 | 100 | // we have already seen this signature so we skip 101 | if _, ok := requestMap.seen[signature]; ok { 102 | requestMap.mu.Unlock() 103 | return nil, fmt.Errorf("duplicatefilter.go:DupeFilterMiddleware: %w", ERR_DUPEFILTER_BLOCKED) 104 | } 105 | 106 | requestMap.seen[signature] = struct{}{} 107 | requestMap.mu.Unlock() 108 | 109 | return next.RoundTrip(req) 110 | }) 111 | } 112 | -------------------------------------------------------------------------------- /pkg/builtin/middlewares/dupefilter_test.go: -------------------------------------------------------------------------------- 1 | package middlewares 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "net/http" 7 | "net/http/httptest" 8 | "strings" 9 | "sync" 10 | "testing" 11 | 12 | "github.com/stretchr/testify/assert" 13 | ) 14 | 15 | type testCase struct { 16 | Name, 17 | Method string 18 | Header http.Header 19 | Body io.Reader 20 | mayBlock bool 21 | } 22 | 23 | func handler(w http.ResponseWriter, r *http.Request) { 24 | w.WriteHeader(http.StatusOK) 25 | } 26 | 27 | 
func TestDupeFilter(t *testing.T) { 28 | 29 | // Set our custom transport middleware 30 | client := &http.Client{ 31 | Transport: DupeFilter(http.DefaultTransport), 32 | } 33 | 34 | testServer := httptest.NewServer(http.HandlerFunc(handler)) 35 | 36 | testCases := []testCase{ 37 | { 38 | Name: "test1", 39 | Method: "GET", 40 | Header: http.Header{ 41 | "X-test": []string{"test_val_1"}, 42 | }, 43 | mayBlock: true, 44 | }, 45 | { 46 | Name: "test2", 47 | Method: "GET", 48 | Header: http.Header{ 49 | "X-test": []string{"test_val_2"}, 50 | }, 51 | Body: nil, 52 | }, 53 | { 54 | Name: "test3", 55 | Method: "GET", 56 | Header: http.Header{ 57 | "X-test": []string{"test_val_1"}, 58 | }, 59 | Body: nil, 60 | mayBlock: true, 61 | }, 62 | { 63 | Name: "test4", 64 | Method: "POST", 65 | Header: http.Header{ 66 | "X-test-another": []string{"test_val_1"}, 67 | }, 68 | Body: strings.NewReader("hello"), 69 | mayBlock: true, 70 | }, 71 | { 72 | Name: "test5", 73 | Method: "PATCH", 74 | Header: http.Header{ 75 | "X-test-another": []string{"test_val_1"}, 76 | }, 77 | Body: strings.NewReader("hello"), 78 | }, 79 | { 80 | Name: "test6", 81 | Method: "POST", 82 | Header: http.Header{ 83 | "X-test-another": []string{"test_val_1"}, 84 | }, 85 | Body: strings.NewReader("hello"), 86 | mayBlock: true, 87 | }, 88 | { 89 | Name: "test7", 90 | Method: "POST", 91 | Header: http.Header{ 92 | "X-test-another": []string{"test_val_1"}, 93 | }, 94 | Body: strings.NewReader("hello1"), 95 | }, 96 | } 97 | 98 | expectedPassCases := 5 99 | actualPassCases := 0 100 | 101 | var m sync.Mutex 102 | for _, tc := range testCases { 103 | func() { 104 | 105 | t.Run(tc.Method, func(t *testing.T) { 106 | t.Parallel() 107 | req, err := http.NewRequest(tc.Method, testServer.URL, tc.Body) 108 | 109 | assert.Nil(t, err, "error creating http request", tc.Name) 110 | 111 | req.Header = tc.Header 112 | 113 | resp, err := client.Do(req) 114 | if tc.mayBlock && err != nil { 115 | assert.ErrorIs(t, err, 
ERR_DUPEFILTER_BLOCKED, fmt.Sprintf("http request %s not blocked", tc.Name)) 116 | } 117 | 118 | if resp != nil { 119 | assert.Equal(t, 200, resp.StatusCode, "statuscode 200 expected") 120 | m.Lock() 121 | defer m.Unlock() 122 | actualPassCases++ 123 | } 124 | 125 | }) 126 | }() 127 | 128 | } 129 | 130 | t.Cleanup(func() { 131 | assert.Equal(t, expectedPassCases, actualPassCases, fmt.Sprintf("expected pass cases %d not equal to actual pass cases %d", expectedPassCases, actualPassCases)) 132 | testServer.Close() 133 | }) 134 | 135 | } 136 | -------------------------------------------------------------------------------- /pkg/builtin/middlewares/multi_cookiejar.go: -------------------------------------------------------------------------------- 1 | package middlewares 2 | 3 | import ( 4 | "net/http" 5 | "net/http/cookiejar" 6 | "sync" 7 | 8 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager" 9 | ) 10 | 11 | type multiCookieJar struct { 12 | jars map[string]http.CookieJar 13 | mu sync.RWMutex 14 | } 15 | 16 | // NewMultiCookieJar creates a new MultiCookieJar. 17 | func NewMultiCookieJar() *multiCookieJar { 18 | return &multiCookieJar{ 19 | jars: make(map[string]http.CookieJar), 20 | } 21 | } 22 | 23 | // GetCookieJar returns a CookieJar corresponding to a key or create one if key doesn't exist 24 | func (m *multiCookieJar) GetCookieJar(key string) http.CookieJar { 25 | 26 | m.mu.Lock() 27 | defer m.mu.Unlock() 28 | jar, ok := m.jars[key] 29 | 30 | // in case we don't have a cookie jar based on the key, we create a new one 31 | if !ok { 32 | jar, _ = cookiejar.New(nil) 33 | } 34 | m.jars[key] = jar 35 | return jar 36 | } 37 | 38 | func MultiCookieJar(next http.RoundTripper) http.RoundTripper { 39 | mCookieJar := NewMultiCookieJar() 40 | return middlewaremanager.MiddlewareFunc(func(req *http.Request) (*http.Response, error) { 41 | // Header keys are received as the client sends them and not normalized. 
42 | 43 | jar := mCookieJar.GetCookieJar(req.Header.Get("X-Goscrapy-Cookie-Jar-Key")) 44 | 45 | reqCookies := jar.Cookies(req.URL) 46 | 47 | for _, rc := range reqCookies { 48 | req.AddCookie(rc) 49 | } 50 | 51 | // remove X-Goscrapy-CookieJar-Key header 52 | req.Header.Del("X-Goscrapy-Cookie-Jar-Key") 53 | 54 | // It is in this step Headers are normalized and sent out. 55 | resp, err := next.RoundTrip(req) 56 | 57 | if resp != nil { 58 | // update cookies 59 | jar.SetCookies(req.URL, resp.Cookies()) 60 | } 61 | 62 | return resp, err 63 | }) 64 | } 65 | -------------------------------------------------------------------------------- /pkg/builtin/middlewares/multi_cookiejar_test.go: -------------------------------------------------------------------------------- 1 | package middlewares 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "net/http/httptest" 7 | "strings" 8 | "testing" 9 | 10 | "slices" 11 | 12 | "github.com/stretchr/testify/assert" 13 | ) 14 | 15 | // Set our custom transport middleware 16 | var client = &http.Client{ 17 | Transport: MultiCookieJar(http.DefaultTransport), 18 | } 19 | 20 | func filteredHeaders(h http.Header) http.Header { 21 | var newHeader = make(http.Header) 22 | skipHeaders := []string{"User-Agent", "Accept-Encoding", "Cookie", "Content-Length", "Date"} 23 | 24 | for name, value := range h { 25 | // we skip the default headers 26 | if slices.Contains(skipHeaders, name) { 27 | continue 28 | } 29 | newHeader.Add(name, value[0]) 30 | } 31 | 32 | return newHeader 33 | } 34 | 35 | func makeTestRequestWithClient(client *http.Client) func(string, string, http.Header) (*http.Response, error) { 36 | return func(method, url string, header http.Header) (*http.Response, error) { 37 | // Create a first GET request without any cookie 38 | req, err := http.NewRequest(method, url, nil) 39 | 40 | if err != nil { 41 | return nil, fmt.Errorf("makeRequestWithClient: error creating http request %w", err) 42 | } 43 | 44 | if header != nil { 45 | req.Header = 
// handlerGetCookieJar builds the routes of the dummy server used by the cookie
// jar tests.
//
//   - /set-cookie echoes every non-default request header back as a response
//     cookie (first value only).
//   - /verify echoes every received cookie back as a response header, which
//     lets a test confirm that the middleware injected the stored cookies.
func handlerGetCookieJar(t *testing.T) *http.ServeMux {
	// Headers the HTTP client adds on its own; they are not part of the test data.
	ignored := []string{"User-Agent", "Accept-Encoding", "Cookie"}

	setCookie := func(w http.ResponseWriter, r *http.Request) {
		for header, values := range r.Header {
			if len(values) <= 0 || slices.Contains(ignored, header) {
				continue
			}

			cookie := http.Cookie{
				Name:   header,
				Value:  values[0],
				Domain: r.URL.Host,
				Path:   "/",
			}
			http.SetCookie(w, &cookie)
		}

		w.WriteHeader(http.StatusOK)
	}

	verify := func(w http.ResponseWriter, r *http.Request) {
		out := w.Header()
		for _, cookie := range r.Cookies() {
			out.Set(cookie.Name, cookie.Value)
		}

		w.WriteHeader(http.StatusOK)
	}

	mux := http.NewServeMux()
	mux.HandleFunc("/set-cookie", setCookie)
	mux.HandleFunc("/verify", verify)
	return mux
}
104 | func RunWithCookieJar(t *testing.T, key string) { 105 | // Create a test server with the custom RoundTripper 106 | key = strings.ToLower(key) 107 | testServer := httptest.NewServer(handlerGetCookieJar(t)) 108 | defer testServer.Close() 109 | 110 | requester := makeTestRequestWithClient(client) 111 | 112 | // Stage 1 113 | headerOne := http.Header{ 114 | "X-Goscrapy-Server-Req-" + key: []string{"single_host_req_" + key}, 115 | } 116 | 117 | if key != "" { 118 | headerOne.Add("X-Goscrapy-Cookie-Jar-Key", key) 119 | } 120 | 121 | respOne, err := requester("GET", testServer.URL+"/set-cookie", headerOne) 122 | 123 | assert.Nil(t, err, "error making http request 1") 124 | 125 | defer respOne.Body.Close() 126 | 127 | // we verify if we have received the same cookies that we have set in "X-Goscrapy-Server-Req-1" header 128 | respOneCookies := respOne.Cookies() 129 | 130 | assert.Lenf(t, respOneCookies, 1, "expected only %d cookie but got %d", 1, len(respOneCookies)) 131 | 132 | found := false 133 | 134 | for _, cookie := range respOneCookies { 135 | if strings.ToLower(cookie.Name) == "x-goscrapy-server-req-"+key && strings.ToLower(cookie.Value) == "single_host_req_"+key { 136 | found = true 137 | break 138 | } 139 | } 140 | 141 | assert.Truef(t, found, "expected response cookies [X-Goscrapy-Server-Req-%s=single_host_req_%s] not found", key, key) 142 | 143 | // second stage 2: 144 | headerTwo := http.Header{ 145 | "X-Goscrapy-Cookie-Jar-Key": []string{key}, 146 | } 147 | respTwo, err := requester("GET", testServer.URL+"/verify", headerTwo) 148 | 149 | assert.Nil(t, err, "error making http request 2") 150 | 151 | defer respTwo.Body.Close() 152 | 153 | respTwoHeader := filteredHeaders(respTwo.Header) 154 | 155 | assert.Lenf(t, respTwoHeader, 1, "expected only %d header but got %d", 1, len(respTwoHeader)) 156 | 157 | assert.Equal(t, "single_host_req_"+key, respTwoHeader.Get("X-Goscrapy-Server-Req-"+key)) 158 | } 159 | 160 | func TestMultiCookierJar(t *testing.T) { 161 | 
162 | testCases := []struct { 163 | Name, 164 | SessionKey string 165 | }{ 166 | { 167 | Name: "DEFAULT_COOKIEJAR", 168 | SessionKey: "", 169 | }, 170 | { 171 | Name: "SINGLE_COOKIEJAR", 172 | SessionKey: "jar1", 173 | }, 174 | { 175 | Name: "SINGLE_COOKIEJAR", 176 | SessionKey: "jar2", 177 | }, 178 | { 179 | Name: "SINGLE_COOKIEJAR", 180 | SessionKey: "jar3", 181 | }, 182 | } 183 | 184 | for _, tc := range testCases { 185 | t.Run(tc.Name, func(t *testing.T) { 186 | t.Parallel() 187 | RunWithCookieJar(t, tc.SessionKey) 188 | }) 189 | } 190 | 191 | for _, tc := range testCases { 192 | t.Run(tc.Name, func(t *testing.T) { 193 | RunWithCookieJar(t, tc.SessionKey) 194 | }) 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /pkg/builtin/middlewares/retry.go: -------------------------------------------------------------------------------- 1 | package middlewares 2 | 3 | import ( 4 | "math" 5 | "net/http" 6 | "os" 7 | "slices" 8 | "strconv" 9 | "strings" 10 | "time" 11 | 12 | "github.com/tech-engine/goscrapy/pkg/middlewaremanager" 13 | ) 14 | 15 | const MIDDLEWARE_HTTP_RETRY_MAX_RETRIES = 3 16 | 17 | var MIDDLEWARE_HTTP_RETRY_CODES = []int{500, 502, 503, 504, 522, 524, 408, 429} 18 | 19 | type RetryCb func(*http.Request, uint8) bool 20 | 21 | type RetryOpts struct { 22 | MaxRetries uint8 23 | Codes []int 24 | BaseDelay time.Duration 25 | Cb RetryCb 26 | } 27 | 28 | func defaultRetryOpts() *RetryOpts { 29 | opts := &RetryOpts{ 30 | MaxRetries: MIDDLEWARE_HTTP_RETRY_MAX_RETRIES, 31 | Codes: MIDDLEWARE_HTTP_RETRY_CODES, 32 | BaseDelay: 1 * time.Second, 33 | } 34 | 35 | value, ok := os.LookupEnv("MIDDLEWARE_HTTP_RETRY_MAX_RETRIES") 36 | 37 | if ok { 38 | maxRetries, err := strconv.Atoi(value) 39 | if err == nil { 40 | opts.MaxRetries = uint8(maxRetries) 41 | } 42 | } 43 | 44 | value, ok = os.LookupEnv("MIDDLEWARE_HTTP_RETRY_CODES") 45 | 46 | if ok { 47 | codesStr := strings.Split(value, ",") 48 | codes := make([]int, 0, 
len(codesStr)) 49 | 50 | for _, codeStr := range codesStr { 51 | if code, err := strconv.Atoi(strings.TrimSpace(codeStr)); err == nil { 52 | codes = append(codes, code) 53 | } 54 | } 55 | 56 | if len(codes) > 0 { 57 | opts.Codes = codes[:] 58 | } 59 | 60 | } 61 | 62 | value, ok = os.LookupEnv("MIDDLEWARE_HTTP_RETRY_BASE_DELAY") 63 | 64 | if ok { 65 | baseDelay, err := time.ParseDuration(value) 66 | if err == nil { 67 | opts.BaseDelay = baseDelay 68 | } 69 | } 70 | 71 | return opts 72 | } 73 | 74 | func Retry(opts ...RetryOpts) func(http.RoundTripper) http.RoundTripper { 75 | 76 | retryOpts := defaultRetryOpts() 77 | 78 | // overwrite defaults 79 | if len(opts) > 0 { 80 | if opts[0].MaxRetries > 0 { 81 | retryOpts.MaxRetries = opts[0].MaxRetries 82 | } 83 | 84 | if opts[0].Codes != nil { 85 | retryOpts.Codes = opts[0].Codes[:] 86 | } 87 | 88 | if opts[0].BaseDelay > 0 { 89 | retryOpts.BaseDelay = opts[0].BaseDelay 90 | } 91 | 92 | if opts[0].Cb != nil { 93 | retryOpts.Cb = opts[0].Cb 94 | } 95 | } 96 | 97 | return func(next http.RoundTripper) http.RoundTripper { 98 | return middlewaremanager.MiddlewareFunc(func(req *http.Request) (*http.Response, error) { 99 | 100 | var ( 101 | resp *http.Response 102 | err error 103 | retries uint8 = retryOpts.MaxRetries 104 | i uint8 105 | ) 106 | 107 | retryHeader := req.Header.Get("X-Goscrapy-Middleware-Max-Retry") 108 | 109 | if retryHeader != "" { 110 | r, _ := strconv.Atoi(retryHeader) 111 | retries = uint8(r) 112 | req.Header.Del("X-Goscrapy-Middleware-Max-Retry") 113 | } 114 | 115 | retries += 1 116 | 117 | timer := time.NewTimer(retryOpts.BaseDelay) 118 | 119 | for i = 0; i < retries; i++ { 120 | resp, err = next.RoundTrip(req) 121 | 122 | // call retry callback, if present 123 | if i > 0 && retryOpts.Cb != nil && !retryOpts.Cb(req, i) { 124 | break 125 | } 126 | 127 | if err != nil { 128 | select { 129 | case <-timer.C: 130 | // calculate next delay 131 | timer.Reset(time.Duration(math.Pow(2, float64(i))) * 
retryOpts.BaseDelay) 132 | continue 133 | } 134 | } 135 | 136 | if !slices.Contains(retryOpts.Codes, resp.StatusCode) { 137 | break 138 | } 139 | 140 | select { 141 | case <-timer.C: 142 | // calculate next delay 143 | timer.Reset(time.Duration(math.Pow(2, float64(i))) * retryOpts.BaseDelay) 144 | } 145 | } 146 | 147 | if !timer.Stop() { 148 | <-timer.C 149 | } 150 | 151 | return resp, err 152 | }) 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /pkg/builtin/middlewares/retry_test.go: -------------------------------------------------------------------------------- 1 | package middlewares 2 | 3 | import ( 4 | "net/http" 5 | "net/http/httptest" 6 | "testing" 7 | 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func retry500Handler(w http.ResponseWriter, r *http.Request) { 12 | w.WriteHeader(http.StatusInternalServerError) 13 | } 14 | 15 | func TestRetry(t *testing.T) { 16 | 17 | var ( 18 | expectedRetryCnt uint8 = 3 19 | actualRetryCnt uint8 20 | ) 21 | 22 | client := &http.Client{ 23 | Transport: Retry(RetryOpts{ 24 | MaxRetries: expectedRetryCnt, 25 | Cb: func(r *http.Request, retry uint8) bool { 26 | actualRetryCnt = retry 27 | return true 28 | }, 29 | })(http.DefaultTransport), 30 | } 31 | 32 | testServer := httptest.NewServer(http.HandlerFunc(retry500Handler)) 33 | 34 | req, err := http.NewRequest("GET", testServer.URL, nil) 35 | 36 | assert.Nil(t, err, "error creating http request") 37 | 38 | resp, err := client.Do(req) 39 | 40 | assert.Nil(t, err, "error making request") 41 | 42 | resp.Body.Close() 43 | 44 | assert.Equal(t, expectedRetryCnt, actualRetryCnt) 45 | testServer.Close() 46 | } 47 | 48 | func TestRetryWithCb(t *testing.T) { 49 | 50 | var ( 51 | retryCnt uint8 = 3 52 | expectedRetryCnt uint8 = 3 53 | actualRetryCnt uint8 54 | ) 55 | 56 | client := &http.Client{ 57 | Transport: Retry(RetryOpts{ 58 | MaxRetries: retryCnt, 59 | Cb: func(r *http.Request, retry uint8) bool { 60 | actualRetryCnt 
= retry 61 | return retry <= 1 62 | }, 63 | })(http.DefaultTransport), 64 | } 65 | 66 | testServer := httptest.NewServer(http.HandlerFunc(retry500Handler)) 67 | 68 | req, err := http.NewRequest("GET", testServer.URL, nil) 69 | 70 | assert.Nil(t, err, "error creating http request") 71 | 72 | resp, err := client.Do(req) 73 | 74 | assert.Nil(t, err, "error making request") 75 | 76 | resp.Body.Close() 77 | 78 | assert.Less(t, actualRetryCnt, expectedRetryCnt) 79 | testServer.Close() 80 | } 81 | 82 | func TestRetryWithHttpCodes(t *testing.T) { 83 | 84 | var ( 85 | retryCnt uint8 = 3 86 | expectedRetryCnt uint8 = 0 87 | actualRetryCnt uint8 88 | ) 89 | 90 | client := &http.Client{ 91 | Transport: Retry(RetryOpts{ 92 | MaxRetries: retryCnt, 93 | Codes: []int{467}, 94 | Cb: func(r *http.Request, retry uint8) bool { 95 | actualRetryCnt = retry 96 | return true 97 | }, 98 | })(http.DefaultTransport), 99 | } 100 | 101 | testServer := httptest.NewServer(http.HandlerFunc(retry500Handler)) 102 | 103 | req, err := http.NewRequest("GET", testServer.URL, nil) 104 | 105 | assert.Nil(t, err, "error creating http request") 106 | 107 | resp, err := client.Do(req) 108 | 109 | assert.Nil(t, err, "error making request") 110 | 111 | resp.Body.Close() 112 | 113 | assert.Equal(t, expectedRetryCnt, actualRetryCnt) 114 | testServer.Close() 115 | } 116 | -------------------------------------------------------------------------------- /pkg/builtin/pipelines/dummy.go: -------------------------------------------------------------------------------- 1 | package pipelines 2 | 3 | import ( 4 | "reflect" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/core" 7 | ) 8 | 9 | type dummyJob struct { 10 | id string 11 | } 12 | 13 | func (j *dummyJob) Id() string { 14 | return "dummyJob" 15 | } 16 | 17 | func (o *dummyRecord) Record() *dummyRecord { 18 | return o 19 | } 20 | 21 | func (o *dummyRecord) RecordKeys() []string { 22 | dataType := reflect.TypeOf(dummyRecord{}) 23 | if dataType.Kind() != reflect.Struct 
{ 24 | panic("Record is not a struct") 25 | } 26 | 27 | numFields := dataType.NumField() 28 | keys := make([]string, numFields) 29 | 30 | for i := 0; i < numFields; i++ { 31 | field := dataType.Field(i) 32 | csvTag := field.Tag.Get("csv") 33 | keys[i] = csvTag 34 | } 35 | 36 | return keys 37 | } 38 | 39 | func (o *dummyRecord) RecordFlat() []any { 40 | 41 | inputType := reflect.TypeOf(o) 42 | 43 | if inputType.Kind() != reflect.Struct { 44 | panic("Record is not a struct") 45 | } 46 | 47 | inputValue := reflect.ValueOf(o) 48 | 49 | slice := make([]any, inputType.NumField()) 50 | 51 | for i := 0; i < inputType.NumField(); i++ { 52 | slice[i] = inputValue.Field(i).Interface() 53 | } 54 | return slice 55 | } 56 | 57 | func (o *dummyRecord) Job() core.IJob { 58 | return o.J 59 | } 60 | 61 | type dummyRecord struct { 62 | Id string `json:"id" csv:"id"` 63 | Name string `json:"name" csv:"name"` 64 | J *dummyJob `json:"-" csv:"-"` 65 | } 66 | -------------------------------------------------------------------------------- /pkg/builtin/pipelines/export_to_csv.go: -------------------------------------------------------------------------------- 1 | package pipelines 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "time" 7 | 8 | "context" 9 | 10 | "github.com/gocarina/gocsv" 11 | "github.com/tech-engine/goscrapy/pkg/core" 12 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager" 13 | ) 14 | 15 | // Export2CSV configuration struct. 16 | // File Field will take precedence over Filename field. 
17 | type Export2CSVOpts struct { 18 | Filename string 19 | File *os.File 20 | } 21 | 22 | type export2CSV[OUT any] struct { 23 | filename string 24 | file *os.File 25 | } 26 | 27 | func Export2CSV[OUT any](opts ...Export2CSVOpts) *export2CSV[OUT] { 28 | e := &export2CSV[OUT]{ 29 | filename: fmt.Sprintf("JOB_%s.csv", time.Now().UTC().Format("2006-01-02-15-04-05")), 30 | } 31 | 32 | if len(opts) > 0 { 33 | if opts[0].Filename != "" { 34 | e.filename = opts[0].Filename 35 | } 36 | 37 | if opts[0].File != nil { 38 | e.file = opts[0].File 39 | } 40 | } 41 | 42 | return e 43 | } 44 | 45 | func (p *export2CSV[OUT]) Open(ctx context.Context) error { 46 | if p.file != nil { 47 | p.filename = "" 48 | return nil 49 | } 50 | 51 | file, err := os.OpenFile(p.filename, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0640) 52 | 53 | if err != nil { 54 | return err 55 | } 56 | 57 | p.file = file 58 | return nil 59 | } 60 | 61 | func (p *export2CSV[OUT]) Close() { 62 | p.file.Close() 63 | } 64 | 65 | func (p *export2CSV[OUT]) ProcessItem(item pm.IPipelineItem, original core.IOutput[OUT]) error { 66 | 67 | fileInfo, err := p.file.Stat() 68 | 69 | if err != nil { 70 | return err 71 | } 72 | 73 | size := fileInfo.Size() 74 | 75 | data := []OUT{original.Record()} 76 | 77 | if size > 0 { 78 | err = gocsv.MarshalWithoutHeaders(data, p.file) 79 | } else { 80 | err = gocsv.MarshalFile(data, p.file) 81 | } 82 | 83 | return err 84 | } 85 | -------------------------------------------------------------------------------- /pkg/builtin/pipelines/export_to_csv_test.go: -------------------------------------------------------------------------------- 1 | package pipelines 2 | 3 | import ( 4 | "context" 5 | "encoding/csv" 6 | "os" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func TestExport2CSV(t *testing.T) { 13 | 14 | f, err := os.CreateTemp(".", "export_2_csv.csv") 15 | 16 | assert.NoError(t, err) 17 | 18 | defer os.Remove(f.Name()) 19 | 20 | pipeline := 
Export2CSV[*dummyRecord](Export2CSVOpts{ 21 | File: f, 22 | }) 23 | 24 | defer pipeline.Close() 25 | 26 | err = pipeline.Open(context.Background()) 27 | 28 | assert.NoError(t, err) 29 | 30 | record := &dummyRecord{Id: "1", Name: "rick"} 31 | 32 | err = pipeline.ProcessItem(nil, record) 33 | 34 | assert.NoError(t, err) 35 | 36 | f.Seek(0, 0) 37 | 38 | reader := csv.NewReader(f) 39 | 40 | csvRecords, err := reader.ReadAll() 41 | 42 | assert.NoError(t, err) 43 | 44 | assert.Equal(t, convertToSliceOfStrings(record), csvRecords[1]) 45 | 46 | } 47 | 48 | func convertToSliceOfStrings(record *dummyRecord) []string { 49 | return []string{record.Id, record.Name} 50 | } 51 | -------------------------------------------------------------------------------- /pkg/builtin/pipelines/export_to_firebase.go: -------------------------------------------------------------------------------- 1 | package pipelines 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | 8 | firebase "firebase.google.com/go" 9 | "firebase.google.com/go/db" 10 | "github.com/tech-engine/goscrapy/pkg/core" 11 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager" 12 | "google.golang.org/api/option" 13 | ) 14 | 15 | type export2FIREBASE[OUT any] struct { 16 | ctx context.Context 17 | ref *db.Ref 18 | } 19 | 20 | func Export2FIREBASE[OUT any](_url, filePath, collName string) *export2FIREBASE[OUT] { 21 | ctx := context.Background() 22 | 23 | conf := &firebase.Config{ 24 | DatabaseURL: _url, 25 | } 26 | 27 | opt := option.WithCredentialsFile(filePath) 28 | 29 | app, err := firebase.NewApp(ctx, conf, opt) 30 | 31 | if err != nil { 32 | log.Printf("Export2FIREBASE: Error initializing app %s", err) 33 | return nil 34 | } 35 | 36 | client, err := app.Database(ctx) 37 | 38 | if err != nil { 39 | log.Printf("Export2FIREBASE: Error initializing Firebase client %s", err) 40 | return nil 41 | } 42 | 43 | return &export2FIREBASE[OUT]{ 44 | ctx: ctx, 45 | ref: client.NewRef(collName), 46 | } 47 | } 48 | 49 | func (p 
*export2FIREBASE[OUT]) Open(ctx context.Context) error { 50 | return nil 51 | } 52 | 53 | func (p *export2FIREBASE[OUT]) Close() { 54 | } 55 | 56 | // your custom pipeline processing code goes here 57 | func (p *export2FIREBASE[OUT]) ProcessItem(item pm.IPipelineItem, original core.IOutput[OUT]) error { 58 | 59 | if _, err := p.ref.Push(p.ctx, original.Record()); err != nil { 60 | return fmt.Errorf("Export2FIREBASE: error inserting data to DB %w", err) 61 | } 62 | 63 | return nil 64 | } 65 | -------------------------------------------------------------------------------- /pkg/builtin/pipelines/export_to_gsheet.go: -------------------------------------------------------------------------------- 1 | package pipelines 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | 8 | "github.com/tech-engine/goscrapy/pkg/core" 9 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager" 10 | "google.golang.org/api/option" 11 | "google.golang.org/api/sheets/v4" 12 | ) 13 | 14 | type export2GSHEET[OUT any] struct { 15 | service *sheets.Service 16 | sheetName string 17 | spreadSheetId string 18 | sheetId int64 19 | } 20 | 21 | func Export2GSHEET[OUT any](keyFilePath, spreadSheetId string, sheetId int64) *export2GSHEET[OUT] { 22 | ctx := context.Background() 23 | 24 | service, err := sheets.NewService(ctx, option.WithCredentialsFile(keyFilePath)) 25 | 26 | if err != nil { 27 | log.Printf("Export2GSHEET: error creating a service using provided creds %s", err) 28 | return nil 29 | } 30 | 31 | response, err := service.Spreadsheets.Get(spreadSheetId).Fields("sheets(properties(sheetId,title))").Do() 32 | 33 | if err != nil { 34 | log.Printf("Export2GSHEET: error getting spreadsheet by id %s %s", spreadSheetId, err) 35 | return nil 36 | } 37 | 38 | if response.HTTPStatusCode != 200 { 39 | log.Printf(fmt.Sprintf("Export2GSHEET: %d status code received", response.HTTPStatusCode)) 40 | return nil 41 | } 42 | 43 | sheetName := "" 44 | 45 | for _, sheet := range response.Sheets { 46 | if 
sheet.Properties.SheetId == sheetId { 47 | sheetName = sheet.Properties.Title 48 | break 49 | } 50 | } 51 | 52 | if sheetName == "" { 53 | log.Printf("Export2GSHEET: %d status code received", response.HTTPStatusCode) 54 | return nil 55 | } 56 | 57 | return &export2GSHEET[OUT]{ 58 | service: service, 59 | sheetName: sheetName, 60 | spreadSheetId: spreadSheetId, 61 | sheetId: sheetId, 62 | } 63 | } 64 | 65 | func (p *export2GSHEET[OUT]) Open(ctx context.Context) error { 66 | return nil 67 | } 68 | 69 | func (p *export2GSHEET[OUT]) Close() { 70 | } 71 | 72 | func (p *export2GSHEET[OUT]) ProcessItem(item pm.IPipelineItem, original core.IOutput[OUT]) error { 73 | 74 | records := original.RecordFlat() 75 | row := &sheets.ValueRange{ 76 | Values: [][]any{records}, 77 | } 78 | 79 | response, err := p.service.Spreadsheets.Values.Append(p.spreadSheetId, p.sheetName, row). 80 | ValueInputOption("USER_ENTERED"). 81 | InsertDataOption("INSERT_ROWS"). 82 | Context(context.Background()). 83 | Do() 84 | 85 | if err != nil || response.HTTPStatusCode != 200 { 86 | return err 87 | } 88 | 89 | return nil 90 | } 91 | -------------------------------------------------------------------------------- /pkg/builtin/pipelines/export_to_json.go: -------------------------------------------------------------------------------- 1 | package pipelines 2 | 3 | import ( 4 | "bufio" 5 | "encoding/json" 6 | "fmt" 7 | "io" 8 | "os" 9 | "time" 10 | 11 | "github.com/tech-engine/goscrapy/pkg/core" 12 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager" 13 | "golang.org/x/net/context" 14 | ) 15 | 16 | // Immediate: export2JSON internally creates a bufio.Writer from provided io.Writer. 17 | // Immediate=true, flushes bufio.Writer immediately after processing. 
18 | type Export2JSONOpts struct { 19 | Filename string 20 | File io.WriteCloser 21 | Immediate bool 22 | } 23 | 24 | type export2JSON[OUT any] struct { 25 | filename string 26 | file io.WriteCloser 27 | buff *bufio.Writer 28 | immediateFlush bool 29 | } 30 | 31 | func Export2JSON[OUT any](opts ...Export2JSONOpts) *export2JSON[OUT] { 32 | e := &export2JSON[OUT]{ 33 | filename: fmt.Sprintf("JOB_%s.json", time.Now().UTC().Format("2006-01-02-15-04-05")), 34 | } 35 | 36 | if len(opts) > 0 { 37 | opt := opts[0] 38 | 39 | if opt.Filename != "" { 40 | e.filename = opt.Filename 41 | } 42 | 43 | if opt.File != nil { 44 | e.file = opt.File 45 | } 46 | 47 | e.immediateFlush = opt.Immediate 48 | } 49 | 50 | return e 51 | } 52 | 53 | func (p *export2JSON[OUT]) Open(ctx context.Context) error { 54 | if p.file != nil { 55 | p.buff = bufio.NewWriter(p.file) 56 | return nil 57 | } 58 | 59 | file, err := os.OpenFile(p.filename, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0640) 60 | 61 | if err != nil { 62 | return err 63 | } 64 | 65 | p.file = file 66 | p.buff = bufio.NewWriter(p.file) 67 | return nil 68 | } 69 | 70 | func (p *export2JSON[OUT]) Close() { 71 | // flushed data to writer 72 | p.buff.Flush() 73 | p.file.Close() 74 | } 75 | 76 | func (p *export2JSON[OUT]) ProcessItem(item pm.IPipelineItem, original core.IOutput[OUT]) error { 77 | 78 | jsonEncoder := json.NewEncoder(p.buff) 79 | 80 | // Encode and write the JSON data 81 | if err := jsonEncoder.Encode(original.Record()); err != nil { 82 | return err 83 | } 84 | 85 | if p.immediateFlush { 86 | p.buff.Flush() 87 | } 88 | 89 | return nil 90 | } 91 | -------------------------------------------------------------------------------- /pkg/builtin/pipelines/export_to_json_test.go: -------------------------------------------------------------------------------- 1 | package pipelines 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "os" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func TestExport2JSON(t 
*testing.T) { 13 | 14 | f, err := os.CreateTemp(".", "export_2_json") 15 | 16 | assert.NoError(t, err) 17 | 18 | defer os.Remove(f.Name()) 19 | 20 | pipeline := Export2JSON[*dummyRecord](Export2JSONOpts{ 21 | File: f, 22 | Immediate: true, 23 | }) 24 | 25 | defer pipeline.Close() 26 | 27 | err = pipeline.Open(context.Background()) 28 | 29 | assert.NoError(t, err) 30 | 31 | record := &dummyRecord{Id: "1", Name: "rick"} 32 | 33 | err = pipeline.ProcessItem(nil, record) 34 | 35 | assert.NoError(t, err) 36 | 37 | f.Seek(0, 0) 38 | 39 | d := json.NewDecoder(f) 40 | 41 | var out dummyRecord 42 | err = d.Decode(&out) 43 | 44 | assert.NoError(t, err) 45 | 46 | } 47 | -------------------------------------------------------------------------------- /pkg/builtin/pipelines/export_to_mongodb.go: -------------------------------------------------------------------------------- 1 | package pipelines 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | 8 | "github.com/tech-engine/goscrapy/pkg/core" 9 | pm "github.com/tech-engine/goscrapy/pkg/pipeline_manager" 10 | "go.mongodb.org/mongo-driver/bson" 11 | "go.mongodb.org/mongo-driver/bson/primitive" 12 | "go.mongodb.org/mongo-driver/mongo" 13 | "go.mongodb.org/mongo-driver/mongo/options" 14 | ) 15 | 16 | type export2MONGODB[OUT any] struct { 17 | ctx context.Context 18 | client *mongo.Client 19 | collection *mongo.Collection 20 | } 21 | 22 | func Export2MONGODB[OUT any](_url string, dbName string, collName string) *export2MONGODB[OUT] { 23 | 24 | ctx := context.Background() 25 | 26 | serverAPI := options.ServerAPI(options.ServerAPIVersion1) 27 | opts := options.Client().ApplyURI(_url).SetServerAPIOptions(serverAPI) 28 | 29 | client, err := mongo.Connect(ctx, opts) 30 | 31 | if err != nil { 32 | log.Printf("Export2MONGODB: error connecting to DB %s", err) 33 | return nil 34 | } 35 | 36 | var result bson.M 37 | 38 | if err := client.Database(dbName).RunCommand(ctx, bson.D{{Key: "ping", Value: 1}}).Decode(&result); err != nil { 39 | 
log.Printf("Export2MONGODB: error connecting to DB %s", err) 40 | return nil 41 | } 42 | 43 | collection := client.Database(dbName).Collection(collName) 44 | 45 | return &export2MONGODB[OUT]{ 46 | ctx: ctx, 47 | client: client, 48 | collection: collection, 49 | } 50 | } 51 | 52 | func (p *export2MONGODB[OUT]) Open(ctx context.Context) error { 53 | return nil 54 | } 55 | 56 | func (p *export2MONGODB[OUT]) Close() { 57 | } 58 | 59 | func (p *export2MONGODB[OUT]) ProcessItem(item pm.IPipelineItem, original core.IOutput[OUT]) error { 60 | 61 | doc := primitive.D{} 62 | recordFlat := original.RecordFlat() 63 | 64 | for i, key := range original.RecordKeys() { 65 | doc = append(doc, primitive.E{Key: key, Value: recordFlat[i]}) 66 | } 67 | 68 | _, err := p.collection.InsertMany(p.ctx, []any{doc}) 69 | 70 | if err != nil { 71 | return fmt.Errorf("Export2MONGODB: error inserting data to DB %w", err) 72 | } 73 | 74 | return nil 75 | } 76 | -------------------------------------------------------------------------------- /pkg/builtin/pipelines/type.go: -------------------------------------------------------------------------------- 1 | package pipelines 2 | -------------------------------------------------------------------------------- /pkg/core/core.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | type Core[OUT any] struct { 4 | engine IEngine[OUT] 5 | } 6 | 7 | func New[OUT any](engine IEngine[OUT]) *Core[OUT] { 8 | return &Core[OUT]{ 9 | engine: engine, 10 | } 11 | } 12 | 13 | func (c *Core[OUT]) Request(req IRequestReader, cb ResponseCallback) { 14 | c.engine.Schedule(req, cb) 15 | } 16 | 17 | func (c *Core[OUT]) NewRequest() IRequestRW { 18 | return c.engine.NewRequest() 19 | } 20 | 21 | func (c *Core[OUT]) Yield(out IOutput[OUT]) { 22 | c.engine.Yield(out) 23 | } 24 | -------------------------------------------------------------------------------- /pkg/core/ports.go: 
--------------------------------------------------------------------------------
package core

import (
	"context"
	"io"
	"net/http"
	"net/url"

	"github.com/tech-engine/goscrapy/internal/fsm"
	"golang.org/x/net/html"
)

// IEngine is the engine contract Core depends on: it can be started, hands
// out new requests, schedules a request together with its response callback
// and accepts yielded outputs.
type IEngine[OUT any] interface {
	Start(context.Context) error
	NewRequest() IRequestRW
	Schedule(IRequestReader, ResponseCallback)
	Yield(IOutput[OUT])
}

// IRequestReader is the read-only view of a request: context, URL, headers,
// method, body, meta map and cookie jar key.
type IRequestReader interface {
	ReadContext() context.Context
	ReadUrl() *url.URL
	ReadHeader() http.Header
	ReadMethod() string
	ReadBody() io.ReadCloser
	ReadMeta() *fsm.FixedSizeMap[string, any]
	ReadCookieJar() string
}

// IRequestWriter is the write side of a request. Every setter returns an
// IRequestWriter so calls can be chained.
type IRequestWriter interface {
	WithContext(context.Context) IRequestWriter
	Url(string) IRequestWriter
	Header(http.Header) IRequestWriter
	Method(string) IRequestWriter
	Body(any) IRequestWriter
	Meta(string, any) IRequestWriter
	CookieJar(string) IRequestWriter
}

// IRequestRW combines the read and write views of a request and adds Reset.
type IRequestRW interface {
	IRequestReader
	IRequestWriter
	Reset()
}

// IResponseReader is the read-only view of a response handed to a
// ResponseCallback. It embeds ISelector, so selector queries can be issued
// directly on the response.
type IResponseReader interface {
	Header() http.Header
	Body() io.ReadCloser
	Bytes() []byte
	StatusCode() int
	Cookies() []*http.Cookie
	Request() *http.Request
	Meta(string) (any, bool)
	ISelector
}

// IJob identifies a job by its id.
type IJob interface {
	Id() string
}

// IOutput is a yielded result: the record itself, its keys, its values as a
// flat slice and the job it belongs to.
type IOutput[OUT any] interface {
	Record() OUT
	RecordKeys() []string
	RecordFlat() []any
	Job() IJob
}

// ResponseCallback is the function signature scheduled alongside a request;
// it receives a context and the response.
type ResponseCallback func(context.Context, IResponseReader)

// ISelectorGetter extracts results from a selection as HTML nodes, text or
// attribute values.
type ISelectorGetter interface {
	Get() *html.Node
	GetAll() []*html.Node
	Text(...string) []string
	Attr(string) []string
}

// ISelector narrows a selection with a CSS or XPath expression. Both return an
// ISelector, so queries can be chained before reading results through
// ISelectorGetter.
type ISelector interface {
	Css(string) ISelector
	Xpath(string) ISelector
	ISelectorGetter
}

--------------------------------------------------------------------------------
/pkg/engine/engine.go: -------------------------------------------------------------------------------- 1 | package engine 2 | 3 | import ( 4 | "context" 5 | "sync" 6 | 7 | "github.com/tech-engine/goscrapy/pkg/core" 8 | ) 9 | 10 | type Engine[OUT any] struct { 11 | scheduler IScheduler 12 | pipelineManager IPipelineManager[OUT] 13 | outputCh chan core.IOutput[OUT] 14 | } 15 | 16 | func New[OUT any](schd IScheduler, pm IPipelineManager[OUT]) *Engine[OUT] { 17 | 18 | engine := &Engine[OUT]{ 19 | scheduler: schd, 20 | pipelineManager: pm, 21 | } 22 | 23 | return engine 24 | } 25 | 26 | // start the core 27 | func (m *Engine[OUT]) Start(ctx context.Context) error { 28 | 29 | var ( 30 | wg sync.WaitGroup 31 | errCh = make(chan error, 2) 32 | ) 33 | 34 | wg.Add(2) 35 | 36 | pmCtx, pmCancel := context.WithCancel(context.Background()) 37 | 38 | go func() { 39 | 40 | defer wg.Done() 41 | defer pmCancel() 42 | 43 | errCh <- m.scheduler.Start(ctx) 44 | 45 | }() 46 | 47 | go func() { 48 | 49 | defer wg.Done() 50 | 51 | errCh <- m.pipelineManager.Start(pmCtx) 52 | 53 | }() 54 | 55 | wg.Wait() 56 | 57 | close(errCh) 58 | 59 | for err := range errCh { 60 | if err != nil { 61 | return err 62 | } 63 | } 64 | return nil 65 | } 66 | 67 | func (m *Engine[OUT]) Schedule(req core.IRequestReader, cb core.ResponseCallback) { 68 | m.scheduler.Schedule(req, cb) 69 | } 70 | 71 | func (m *Engine[OUT]) Yield(out core.IOutput[OUT]) { 72 | m.pipelineManager.Push(out) 73 | } 74 | 75 | func (m *Engine[OUT]) NewRequest() core.IRequestRW { 76 | return m.scheduler.NewRequest() 77 | } 78 | 79 | func (m *Engine[OUT]) WithScheduler(schd IScheduler) { 80 | m.scheduler = schd 81 | } 82 | 83 | func (m *Engine[OUT]) WithPipelineManager(pm IPipelineManager[OUT]) { 84 | m.pipelineManager = pm 85 | } 86 | -------------------------------------------------------------------------------- /pkg/engine/ports.go: -------------------------------------------------------------------------------- 1 | package engine 2 | 3 
| import ( 4 | "context" 5 | "io" 6 | "net/http" 7 | 8 | "github.com/tech-engine/goscrapy/internal/fsm" 9 | "github.com/tech-engine/goscrapy/pkg/core" 10 | ) 11 | 12 | type IPipelineManager[OUT any] interface { 13 | Start(context.Context) error 14 | Push(core.IOutput[OUT]) 15 | } 16 | 17 | type Resetter interface { 18 | Reset() 19 | } 20 | 21 | type IResponseWriter interface { 22 | WriteHeader(http.Header) 23 | WriteBody(io.ReadCloser) 24 | WriteStatusCode(int) 25 | WriteCookies([]*http.Cookie) 26 | WriteRequest(*http.Request) 27 | WriteMeta(*fsm.FixedSizeMap[string, any]) 28 | } 29 | 30 | type IResponse interface { 31 | core.IResponseReader 32 | IResponseWriter 33 | } 34 | 35 | type IScheduler interface { 36 | Start(context.Context) error 37 | Schedule(core.IRequestReader, core.ResponseCallback) 38 | NewRequest() core.IRequestRW 39 | } 40 | -------------------------------------------------------------------------------- /pkg/executor/executor.go: -------------------------------------------------------------------------------- 1 | package executor 2 | 3 | import ( 4 | "github.com/tech-engine/goscrapy/pkg/core" 5 | "github.com/tech-engine/goscrapy/pkg/engine" 6 | ) 7 | 8 | type Executor struct { 9 | adapter IExecutorAdapter 10 | } 11 | 12 | func New(adapter IExecutorAdapter) *Executor { 13 | return &Executor{ 14 | adapter: adapter, 15 | } 16 | } 17 | 18 | func (e *Executor) Execute(req core.IRequestReader, res engine.IResponseWriter) error { 19 | 20 | request := e.adapter.Acquire() 21 | 22 | if req.ReadContext() != nil { 23 | request.WithContext(req.ReadContext()) 24 | } 25 | 26 | headers := req.ReadHeader() 27 | // we inject a header for cookiejar implementation 28 | headers.Add("X-Goscrapy-Cookie-Jar-Key", req.ReadCookieJar()) 29 | 30 | request.URL = req.ReadUrl() 31 | request.Method = "GET" 32 | 33 | if req.ReadMethod() != "" { 34 | request.Method = req.ReadMethod() 35 | } 36 | 37 | request.Header = headers 38 | 39 | request.Body = req.ReadBody() 40 | 41 | return 
e.adapter.Do(res, request) 42 | } 43 | 44 | func (e *Executor) WithAdapter(adapter IExecutorAdapter) { 45 | e.adapter = adapter 46 | } 47 | -------------------------------------------------------------------------------- /pkg/executor/ports.go: -------------------------------------------------------------------------------- 1 | package executor 2 | 3 | import ( 4 | "net/http" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/engine" 7 | ) 8 | 9 | type IExecutorAdapter interface { 10 | Do(engine.IResponseWriter, *http.Request) error 11 | Acquire() *http.Request 12 | WithClient(*http.Client) 13 | } 14 | -------------------------------------------------------------------------------- /pkg/executor_adapters/http_native/adapter.go: -------------------------------------------------------------------------------- 1 | package httpnative 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "os" 7 | "strconv" 8 | 9 | rp "github.com/tech-engine/goscrapy/internal/resource_pool" 10 | "github.com/tech-engine/goscrapy/pkg/engine" 11 | ) 12 | 13 | const EX_ADAPTER_DEFAULT_REQ_RES_POOL_SIZE = 1e6 14 | 15 | // HTTPAdapter implements Executor's ExecAdapter interface 16 | type HTTPAdapter struct { 17 | client *http.Client 18 | reqpool *rp.Pooler[http.Request] 19 | } 20 | 21 | func NewHTTPClientAdapter(client *http.Client, poolSize uint64) *HTTPAdapter { 22 | if client == nil { 23 | client = http.DefaultClient 24 | } 25 | 26 | if poolSize == 0 { 27 | poolSize = EX_ADAPTER_DEFAULT_REQ_RES_POOL_SIZE 28 | value, ok := os.LookupEnv("SCHEDULER_REQ_RES_POOL_SIZE") 29 | 30 | if ok { 31 | parsedPoolSize, err := strconv.ParseUint(value, 10, 64) 32 | if err == nil { 33 | poolSize = parsedPoolSize 34 | } 35 | } 36 | } 37 | 38 | return &HTTPAdapter{ 39 | client: client, 40 | reqpool: rp.NewPooler(rp.WithSize[http.Request](poolSize)), 41 | } 42 | } 43 | 44 | func (r *HTTPAdapter) Acquire() *http.Request { 45 | req := r.reqpool.Acquire() 46 | if req == nil { 47 | req = &http.Request{} 48 | } 49 | return req 50 
| } 51 | 52 | func (r *HTTPAdapter) WithClient(client *http.Client) { 53 | r.client = client 54 | } 55 | 56 | func (r *HTTPAdapter) Do(res engine.IResponseWriter, req *http.Request) error { 57 | defer r.reqpool.Release(req) 58 | 59 | source, err := r.client.Do(req) 60 | 61 | if err != nil { 62 | return fmt.Errorf("Do: error dispatching request %w", err) 63 | } 64 | 65 | res.WriteRequest(req) 66 | HTTPRequestAdapterResponse(res, source) 67 | return nil 68 | } 69 | -------------------------------------------------------------------------------- /pkg/executor_adapters/http_native/adapter_test.go: -------------------------------------------------------------------------------- 1 | package httpnative 2 | 3 | import ( 4 | "context" 5 | "io" 6 | "net/http" 7 | "net/http/httptest" 8 | "net/url" 9 | "strconv" 10 | "strings" 11 | "testing" 12 | "time" 13 | 14 | "github.com/stretchr/testify/assert" 15 | "github.com/tech-engine/goscrapy/internal/fsm" 16 | ) 17 | 18 | type testCase struct { 19 | name, 20 | method string 21 | body io.ReadCloser 22 | expected []byte 23 | } 24 | 25 | var testServer = httptest.NewServer(handler()) 26 | 27 | type testResponseWriter struct { 28 | statuscode int 29 | body io.ReadCloser 30 | } 31 | 32 | func (r *testResponseWriter) WriteHeader(h http.Header) { 33 | } 34 | 35 | func (r *testResponseWriter) WriteBody(b io.ReadCloser) { 36 | r.body = b 37 | } 38 | 39 | func (r *testResponseWriter) WriteStatusCode(s int) { 40 | r.statuscode = s 41 | } 42 | 43 | func (r *testResponseWriter) WriteCookies(c []*http.Cookie) { 44 | } 45 | 46 | func (r *testResponseWriter) WriteRequest(req *http.Request) { 47 | } 48 | 49 | func (r *testResponseWriter) WriteMeta(m *fsm.FixedSizeMap[string, any]) { 50 | } 51 | 52 | func handler() *http.ServeMux { 53 | mux := http.NewServeMux() 54 | // /get-cookie receives headers from client and set those headers as response cookies 55 | mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { 56 | switch r.Method { 57 | 
case "GET", "DELETE": 58 | // selectively sleep based of delay header to test context in request 59 | if delay := r.Header.Get("delay"); delay != "" { 60 | d, _ := strconv.Atoi(delay) 61 | time.Sleep(time.Duration(d) * time.Second) 62 | } 63 | w.WriteHeader(http.StatusOK) 64 | w.Write([]byte("ok")) 65 | case "POST", "PATCH", "PUT": 66 | w.WriteHeader(http.StatusOK) 67 | b, _ := io.ReadAll(r.Body) 68 | w.Write(b) 69 | default: 70 | w.WriteHeader(http.StatusMethodNotAllowed) 71 | } 72 | }) 73 | 74 | return mux 75 | } 76 | 77 | func run(t *testing.T, adapter *HTTPAdapter, method string, body io.ReadCloser, expected []byte) { 78 | 79 | var err error 80 | resp := &testResponseWriter{} 81 | 82 | urlParsed, err := url.Parse(testServer.URL) 83 | 84 | assert.NoError(t, err) 85 | 86 | req := adapter.Acquire() 87 | 88 | req.URL = urlParsed 89 | 90 | req.Method = method 91 | req.Body = body 92 | err = adapter.Do(resp, req) 93 | 94 | assert.NoError(t, err) 95 | 96 | defer resp.body.Close() 97 | 98 | assert.Equal(t, 200, resp.statuscode) 99 | 100 | respB, err := io.ReadAll(resp.body) 101 | 102 | assert.NoError(t, err) 103 | 104 | assert.Equalf(t, expected, respB, "expected %s, got %s", string(expected), string(respB)) 105 | 106 | } 107 | 108 | func TestAdapterRequest(t *testing.T) { 109 | 110 | adapter := NewHTTPClientAdapter(&http.Client{}, 10) 111 | testCases := []testCase{ 112 | { 113 | name: "GET", 114 | method: "GET", 115 | expected: []byte("ok"), 116 | }, 117 | { 118 | name: "DELETE", 119 | method: "DELETE", 120 | expected: []byte("ok"), 121 | }, 122 | { 123 | name: "POST", 124 | method: "POST", 125 | body: io.NopCloser(strings.NewReader("post")), 126 | expected: []byte("post"), 127 | }, 128 | { 129 | name: "PATCH", 130 | method: "PATCH", 131 | body: io.NopCloser(strings.NewReader("patch")), 132 | expected: []byte("patch"), 133 | }, 134 | { 135 | name: "PUT", 136 | method: "PUT", 137 | body: io.NopCloser(strings.NewReader("put")), 138 | expected: []byte("put"), 139 | }, 
140 | } 141 | for _, tc := range testCases { 142 | t.Run(tc.method, func(t *testing.T) { 143 | t.Parallel() 144 | run(t, adapter, tc.method, tc.body, tc.expected) 145 | }) 146 | } 147 | } 148 | 149 | func TestAdapterRequestCtx(t *testing.T) { 150 | adapter := NewHTTPClientAdapter(&http.Client{}, 10) 151 | 152 | resp := &testResponseWriter{} 153 | 154 | urlParsed, err := url.Parse(testServer.URL) 155 | 156 | assert.NoError(t, err) 157 | 158 | ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) 159 | defer cancel() 160 | 161 | // added so that we can distinguise this request and sleep selectively for 3 seconds in our test server 162 | // which will cause the context to expire before we get a response from server 163 | headers := http.Header{} 164 | headers.Add("delay", "3") 165 | 166 | req := adapter.Acquire() 167 | 168 | req.URL = urlParsed 169 | req = req.WithContext(ctx) 170 | req.Header = headers 171 | 172 | err = adapter.Do(resp, req) 173 | 174 | assert.ErrorIs(t, err, context.DeadlineExceeded) 175 | } 176 | -------------------------------------------------------------------------------- /pkg/executor_adapters/http_native/helper.go: -------------------------------------------------------------------------------- 1 | package httpnative 2 | 3 | import ( 4 | "net/http" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/engine" 7 | ) 8 | 9 | func HTTPRequestAdapterResponse(res engine.IResponseWriter, source *http.Response) { 10 | 11 | res.WriteHeader(source.Header) 12 | res.WriteStatusCode(source.StatusCode) 13 | res.WriteCookies(source.Cookies()) 14 | res.WriteBody(source.Body) 15 | } 16 | -------------------------------------------------------------------------------- /pkg/middlewaremanager/middlewaremanager.go: -------------------------------------------------------------------------------- 1 | package middlewaremanager 2 | 3 | import "net/http" 4 | 5 | type Middleware func(next http.RoundTripper) http.RoundTripper 6 | 7 | type MiddlewareFunc 
func(req *http.Request) (*http.Response, error) 8 | 9 | func (mf MiddlewareFunc) RoundTrip(req *http.Request) (*http.Response, error) { 10 | return mf(req) 11 | } 12 | 13 | type MiddlewareManager struct { 14 | httpClient *http.Client 15 | } 16 | 17 | func New(cli *http.Client) *MiddlewareManager { 18 | return &MiddlewareManager{ 19 | httpClient: cli, 20 | } 21 | } 22 | 23 | func (m *MiddlewareManager) HTTPClient() *http.Client { 24 | return m.httpClient 25 | } 26 | 27 | func (m *MiddlewareManager) Add(middlewares ...Middleware) { 28 | for _, middleware := range middlewares { 29 | m.httpClient.Transport = middleware(m.httpClient.Transport) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /pkg/pipeline_manager/constants.go: -------------------------------------------------------------------------------- 1 | package pipelinemanager 2 | 3 | // If true, all pipelines' Open method complete without an error, 4 | // otherwise, pipeline manager won't start and return an error corresponding 5 | // to the first pipeline to return an non-nil error. 6 | 7 | // const PIPELINEMANAGER_PIPELINES_MUST_OPEN = false 8 | 9 | // Reuseable Pipeline Item pool size 10 | const PIPELINEMANAGER_ITEMPOOL_SIZE = 10000 11 | 12 | // Max key-value pairs a Pipiline Item can have 13 | const PIPELINEMANAGER_ITEM_SIZE = 24 14 | 15 | // Output queue buffer size. Yield items are pushed to this queue, 16 | // before being feed into the start of the pipelines. 
const PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE = 0

// Max number of Outputs that will be allowed to be processed concurrently in the pipeline
const PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY = 1000

--------------------------------------------------------------------------------
/pkg/pipeline_manager/group.go:
--------------------------------------------------------------------------------
package pipelinemanager

import (
	"context"
	"sync"

	"github.com/tech-engine/goscrapy/pkg/core"
	"golang.org/x/sync/errgroup"
)

// Group holds a set of pipelines (nodes) plus a flag telling whether errors
// returned by those pipelines are ignored.
type Group[OUT any] struct {
	nodes        []IPipeline[OUT]
	ignoreErrors bool
}

// Create a Group which is a collection of pipelines intended to run concurrently
// as opposed to sequentially. Group implements IPipeline interface, meaning it behaves
// like a single pipeline.
//
// Common usecase: Since each pipeline in a Group runs concurrently it is not meant
// for data transformation but only for data export or other similar independent tasks.
22 | // A Group must not modify 23 | func NewGroup[OUT any]() *Group[OUT] { 24 | return &Group[OUT]{ 25 | nodes: make([]IPipeline[OUT], 0), 26 | } 27 | } 28 | 29 | func (g *Group[OUT]) Open(ctx context.Context) error { 30 | if g.ignoreErrors { 31 | var wg sync.WaitGroup 32 | wg.Add(len(g.nodes)) 33 | 34 | for _, p := range g.nodes { 35 | go func() { 36 | defer wg.Done() 37 | p.Open(ctx) 38 | }() 39 | } 40 | 41 | wg.Wait() 42 | return nil 43 | } 44 | 45 | group, groupCtx := errgroup.WithContext(ctx) 46 | for _, p := range g.nodes { 47 | group.Go(func() error { 48 | return p.Open(groupCtx) 49 | }) 50 | } 51 | return group.Wait() 52 | } 53 | 54 | func (g *Group[OUT]) Close() { 55 | var wg sync.WaitGroup 56 | wg.Add(len(g.nodes)) 57 | 58 | for _, p := range g.nodes { 59 | go func() { 60 | defer wg.Done() 61 | p.Close() 62 | }() 63 | } 64 | 65 | wg.Wait() 66 | } 67 | 68 | // WithIgnoreError sets ignoreErrors = true 69 | // 70 | // When ignoreErrors = true, Group's ProcessItem & Open function will always return 71 | // a nil error. 72 | // 73 | // When ignoreErrors = false(default), Group's ProcessItem & Open will return the first non-nil 74 | // error. In addition to that context passed to Open function is also cancelled. 75 | func (g *Group[OUT]) WithIgnoreError() { 76 | g.ignoreErrors = true 77 | } 78 | 79 | func (g *Group[OUT]) Add(p ...IPipeline[OUT]) { 80 | g.nodes = append(g.nodes, p...) 
81 | } 82 | 83 | func (g *Group[OUT]) ProcessItem(pi IPipelineItem, out core.IOutput[OUT]) error { 84 | 85 | if g.ignoreErrors { 86 | var wg sync.WaitGroup 87 | wg.Add(len(g.nodes)) 88 | 89 | for _, p := range g.nodes { 90 | go func() { 91 | defer wg.Done() 92 | p.ProcessItem(pi, out) 93 | }() 94 | } 95 | 96 | wg.Wait() 97 | return nil 98 | } 99 | 100 | errGroup := errgroup.Group{} 101 | for _, p := range g.nodes { 102 | errGroup.Go(func() error { 103 | return p.ProcessItem(pi, out) 104 | }) 105 | } 106 | return errGroup.Wait() 107 | } 108 | -------------------------------------------------------------------------------- /pkg/pipeline_manager/options.go: -------------------------------------------------------------------------------- 1 | package pipelinemanager 2 | 3 | import ( 4 | "os" 5 | "strconv" 6 | 7 | "github.com/tech-engine/goscrapy/internal/types" 8 | ) 9 | 10 | type opts struct { 11 | itemPoolSize, itemSize, outputQueueBuffSize, maxProcessItemConcurrency uint64 12 | } 13 | 14 | // Setup all the default pipelinemanger options. 
15 | func defaultOpts() opts { 16 | opts := opts{} 17 | opts.itemPoolSize = PIPELINEMANAGER_ITEMPOOL_SIZE 18 | envVal, ok := os.LookupEnv("PIPELINEMANAGER_ITEMPOOL_SIZE") 19 | 20 | if ok { 21 | parsedPoolSize, err := strconv.ParseUint(envVal, 10, 64) 22 | if err == nil { 23 | opts.itemPoolSize = parsedPoolSize 24 | } 25 | } 26 | 27 | opts.itemSize = PIPELINEMANAGER_ITEM_SIZE 28 | envVal, ok = os.LookupEnv("PIPELINEMANAGER_ITEM_SIZE") 29 | 30 | if ok { 31 | parsedSize, err := strconv.ParseUint(envVal, 10, 64) 32 | if err == nil { 33 | opts.itemSize = parsedSize 34 | } 35 | } 36 | 37 | opts.outputQueueBuffSize = PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE 38 | envVal, ok = os.LookupEnv("PIPELINEMANAGER_OUTPUT_QUEUE_BUF_SIZE") 39 | 40 | if ok { 41 | parsedOutputBufSize, err := strconv.ParseUint(envVal, 10, 64) 42 | if err == nil { 43 | opts.outputQueueBuffSize = parsedOutputBufSize 44 | } 45 | } 46 | 47 | opts.maxProcessItemConcurrency = PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY 48 | envVal, ok = os.LookupEnv("PIPELINEMANAGER_MAX_PROCESS_ITEM_CONCURRENCY") 49 | 50 | if ok { 51 | parsedMaxItem, err := strconv.ParseUint(envVal, 10, 64) 52 | if err == nil { 53 | opts.maxProcessItemConcurrency = parsedMaxItem 54 | } 55 | } 56 | 57 | return opts 58 | } 59 | 60 | func WithItemPoolSize(val uint64) types.OptFunc[opts] { 61 | return func(opts *opts) { 62 | opts.itemPoolSize = val 63 | } 64 | } 65 | 66 | func WithItemSize(val uint64) types.OptFunc[opts] { 67 | return func(opts *opts) { 68 | opts.itemSize = val 69 | } 70 | } 71 | 72 | func WithOutputQueueSize(val uint64) types.OptFunc[opts] { 73 | return func(opts *opts) { 74 | opts.outputQueueBuffSize = val 75 | } 76 | } 77 | 78 | func WithProcessItemConcurrency(val uint64) types.OptFunc[opts] { 79 | return func(opts *opts) { 80 | opts.maxProcessItemConcurrency = val 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /pkg/pipeline_manager/pipeline_manager.go: 
--------------------------------------------------------------------------------
package pipelinemanager

import (
	"context"
	"sync"

	"github.com/tech-engine/goscrapy/internal/cmap"
	rp "github.com/tech-engine/goscrapy/internal/resource_pool"
	"github.com/tech-engine/goscrapy/internal/types"
	"github.com/tech-engine/goscrapy/pkg/core"
	"golang.org/x/sync/errgroup"
)

// PipelineManager receives yielded outputs on a queue and passes each of them
// through its pipelines in the order they were added.
type PipelineManager[OUT any] struct {
	opts
	itemPool    *rp.Pooler[cmap.CMap[string, any]]
	outputQueue chan core.IOutput[OUT]
	pipelines   []IPipeline[OUT]
}

// New returns a PipelineManager configured with the default options, each of
// which can be overridden through optFuncs.
func New[OUT any](optFuncs ...types.OptFunc[opts]) *PipelineManager[OUT] {

	// set default options
	opts := defaultOpts()

	// set custom options
	for _, fn := range optFuncs {
		fn(&opts)
	}

	return &PipelineManager[OUT]{
		opts:        opts,
		outputQueue: make(chan core.IOutput[OUT], opts.outputQueueBuffSize),
		pipelines:   make([]IPipeline[OUT], 0),
		itemPool:    rp.NewPooler(rp.WithSize[cmap.CMap[string, any]](opts.itemPoolSize)),
	}
}

// Add appends the given pipelines to the manager.
func (pm *PipelineManager[OUT]) Add(pipeline ...IPipeline[OUT]) {
	pm.pipelines = append(pm.pipelines, pipeline...)
}

// Start opens all pipelines and then processes items from the output queue
// until ctx is cancelled.
//
// It returns ctx.Err() if ctx is already done, the first error returned by a
// pipeline's Open, or ctx.Err() once ctx is cancelled. Before returning from
// the processing loop it waits for in-flight workers and then closes every
// pipeline.
func (pm *PipelineManager[OUT]) Start(ctx context.Context) error {

	var (
		group    *errgroup.Group
		groupCtx context.Context
		err      error
	)

	if err = ctx.Err(); err != nil {
		return err
	}

	// Open every pipeline concurrently; the first Open error cancels groupCtx
	// and is returned below.

	group, groupCtx = errgroup.WithContext(ctx)

	for _, pipeline := range pm.pipelines {
		group.Go(func() error {
			return pipeline.Open(groupCtx)
		})
	}

	// we return early as there would be no point in processing items when a
	// pipeline failed to open
	if err = group.Wait(); err != nil {
		return err
	}

	// upon exiting, stop pipeline manager
	// (deferred calls run last-in first-out: wg.Wait below runs before stop)
	defer pm.stop()

	// Below we listen on the outputQueue for new yield outputs

	var wg sync.WaitGroup
	defer wg.Wait()

	// This semaphore will make sure only a fixed number of goroutines
	// are spun up to process items from output queue
	semaphone := make(chan struct{}, pm.opts.maxProcessItemConcurrency)

	for {
		select {
		case semaphone <- struct{}{}:

			wg.Add(1)
			go func() {

				defer wg.Done()
				defer func() { <-semaphone }()

				// this select makes sure this goroutine doesn't get blocked
				// waiting for items on the queue and gets a chance to exit on context cancellation
				select {
				case item := <-pm.outputQueue:
					if ctx.Err() != nil {
						return
					}
					pm.processItem(item)
				case <-ctx.Done():
					// currently not needed, but closing the pm.outputQueue channel may be considered in future
					return
				}

			}()

		case <-ctx.Done():
			return ctx.Err()
		}
	}
}

// Stopping the manager calls the Close function of every pipeline, concurrently,
// and waits for all of them to return.
func (pm *PipelineManager[OUT]) stop() {
	var wg sync.WaitGroup

	wg.Add(len(pm.pipelines))
	defer wg.Wait()

	for _, p := range pm.pipelines {
		go func() {
			defer wg.Done()
			p.Close()
		}()

	}
}

// Push sends original to the output queue. It is a no-op when no pipeline has
// been added; otherwise the send blocks until the queue accepts the item.
func (pm *PipelineManager[OUT]) Push(original core.IOutput[OUT]) {
	if len(pm.pipelines) <= 0 {
		return
	}
	pm.outputQueue <- original
}

// processItem passes a yielded output through the pipelines in order, sharing
// one pipeline item between them, and stops at the first pipeline that returns
// an error.
func (pm *PipelineManager[OUT]) processItem(original core.IOutput[OUT]) {

	// call sync pipelines
	var (
		pItem *cmap.CMap[string, any] // pipeline item
		err   error
	)

	pItem = pm.itemPool.Acquire()

	// the closure reads pItem when it runs, so it also clears and releases an
	// item allocated by the fallback below
	defer func() {
		pItem.Clear()
		pm.itemPool.Release(pItem)
	}()

	// the pool returned nothing: allocate a new item
	if pItem == nil {
		pItem = cmap.NewCMap[string, any](cmap.WithSize(int(pm.itemSize)))
	}

	for _, pipeline := range pm.pipelines {

		// a Group is handled like any other pipeline through ProcessItem
		if err = pipeline.ProcessItem(IPipelineItem(pItem), original); err != nil {
			return
		}
	}
}

--------------------------------------------------------------------------------
/pkg/pipeline_manager/pipeline_manager_test.go:
--------------------------------------------------------------------------------
package pipelinemanager

import (
	"context"
	"reflect"
	"sync"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/tech-engine/goscrapy/pkg/core"
)

// safeDummyRecord is a mutex-guarded (id, age) pair.
type safeDummyRecord struct {
	mu      sync.Mutex
	id, age int
}

func (s *safeDummyRecord) Set(id, age int) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.id = id
	s.age = age
}

func (s *safeDummyRecord) GetVal() [2]int {
	s.mu.Lock()
	defer s.mu.Unlock()
	return [2]int{s.id, s.age}
}

// dummyRecord is the output type used by the tests.
type dummyRecord struct {
	Id, Age int
}

type dummyJob struct {
	id string
}

// Id always returns the constant "dummyJob"; the id field is not consulted.
func (j *dummyJob) Id() string {
	return "dummyJob"
}

func (o *dummyRecord) Record() *dummyRecord {
	return o
}

// RecordKeys returns the `csv` struct tag of every field, in declaration order.
func (o *dummyRecord) RecordKeys() []string {
	dataType := reflect.TypeOf(*o)
	if dataType.Kind() != reflect.Struct {
		panic("Record is not a struct")
	}

	numFields := dataType.NumField()
	keys := make([]string, numFields)

	for i := 0; i < numFields; i++ {
		field := dataType.Field(i)
		csvTag := field.Tag.Get("csv")
		keys[i] = csvTag
	}

	return keys
}

// RecordFlat returns the field values in declaration order.
func (o *dummyRecord) RecordFlat() []any {

	inputType := reflect.TypeOf(*o)

	if inputType.Kind() != reflect.Struct {
		panic("Record is not a struct")
	}

	inputValue := reflect.ValueOf(*o)

	slice := make([]any, inputType.NumField())

	for i := 0; i < inputType.NumField(); i++ {
		slice[i] = inputValue.Field(i).Interface()
	}
	return slice
}

func (o *dummyRecord) Job() core.IJob {
	return nil
}

// dummy pipeline 1
type doublePipeline[OUT any] struct {
}

func newDoublePipeline[OUT any]() *doublePipeline[OUT] {
	return &doublePipeline[OUT]{}
}

func (p *doublePipeline[OUT]) Open(ctx context.Context) error {
	return nil
}

func (p *doublePipeline[OUT]) Close() {
}

// ProcessItem stores the record's first value under "id" and twice its second
// value under "age" on the pipeline item.
func (p *doublePipeline[OUT]) ProcessItem(item IPipelineItem, original core.IOutput[OUT]) error {
	rec := original.RecordFlat()
	item.Set("id", rec[0])
	item.Set("age", rec[1].(int)*2)
	return nil
}

// dummy pipeline 2
type dummyPipeline2[OUT any] struct {
	safeRecord safeDummyRecord
}

func newDummyPipeline2[OUT any]() *dummyPipeline2[OUT] {
	return &dummyPipeline2[OUT]{
		safeRecord: safeDummyRecord{},
	}
}

119
| 120 | func (p *dummyPipeline2[OUT]) Open(ctx context.Context) error { 121 | return nil 122 | } 123 | 124 | func (p *dummyPipeline2[OUT]) Close() { 125 | } 126 | 127 | func (p *dummyPipeline2[OUT]) ProcessItem(item IPipelineItem, original core.IOutput[OUT]) error { 128 | id, _ := item.Get("id") 129 | age, _ := item.Get("age") 130 | p.safeRecord.Set(id.(int), age.(int)) 131 | 132 | return nil 133 | } 134 | 135 | func TestPipelineManager(t *testing.T) { 136 | // create a pipeline manager 137 | var wg sync.WaitGroup 138 | pipelineManager := New[*dummyRecord]() 139 | // add a dummy test pipeline 140 | readPipeline := newDummyPipeline2[*dummyRecord]() 141 | pipelineManager.Add( 142 | newDoublePipeline[*dummyRecord](), 143 | readPipeline, 144 | ) 145 | // start the pipeline 146 | wg.Add(1) 147 | go func() { 148 | wg.Done() 149 | pipelineManager.Start(context.Background()) 150 | }() 151 | // push item to pipeline 152 | pipelineManager.Push(&dummyRecord{Id: 1, Age: 19}) 153 | // verify what we pushed is what we get 154 | safeRecord := readPipeline.safeRecord.GetVal() 155 | assert.Equalf(t, 1, safeRecord[0], "expected id=1, got=%s", safeRecord[0]) 156 | assert.Equalf(t, 38, safeRecord[1], "expected age=1, got=%s", safeRecord[1]) 157 | wg.Wait() 158 | } 159 | -------------------------------------------------------------------------------- /pkg/pipeline_manager/ports.go: -------------------------------------------------------------------------------- 1 | package pipelinemanager 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/tech-engine/goscrapy/pkg/core" 7 | ) 8 | 9 | // We have added it here as PipelineManager is the one that passes IPipelineItems to pipelines 10 | // and so must be aware of IPipelineItem. 
11 | type IPipelineItem interface { 12 | Get(string) (any, bool) 13 | Set(string, any) error 14 | Del(string) 15 | Keys() []any 16 | Clear() 17 | } 18 | 19 | type IPipeline[OUT any] interface { 20 | Open(context.Context) error 21 | Close() 22 | ProcessItem(IPipelineItem, core.IOutput[OUT]) error 23 | } 24 | -------------------------------------------------------------------------------- /pkg/scheduler/constants.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | // These constants will be overwritten by enviroment variables 4 | const SCHEDULER_DEFAULT_REQ_RES_POOL_SIZE uint64 = 1e6 5 | const SCHEDULER_DEFAULT_WORKER_MULTIPLIER uint16 = 3 6 | const SCHEDULER_DEFAULT_WORK_QUEUE_SIZE = 1e6 7 | -------------------------------------------------------------------------------- /pkg/scheduler/options.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import ( 4 | "os" 5 | "runtime" 6 | "strconv" 7 | 8 | "github.com/tech-engine/goscrapy/internal/types" 9 | ) 10 | 11 | type opts struct { 12 | numWorkers uint16 13 | reqResPoolSize uint64 14 | workQueueSize uint64 15 | } 16 | 17 | func defaultOpts() opts { 18 | opts := opts{} 19 | opts.reqResPoolSize = SCHEDULER_DEFAULT_REQ_RES_POOL_SIZE 20 | value, ok := os.LookupEnv("SCHEDULER_REQ_RES_POOL_SIZE") 21 | 22 | if ok { 23 | parsedPoolSize, err := strconv.ParseUint(value, 10, 64) 24 | if err == nil { 25 | opts.reqResPoolSize = parsedPoolSize 26 | } 27 | } 28 | 29 | opts.numWorkers = uint16(runtime.GOMAXPROCS(0)) * SCHEDULER_DEFAULT_WORKER_MULTIPLIER 30 | value, ok = os.LookupEnv("SCHEDULER_CONCURRENCY") 31 | 32 | if ok { 33 | multiplier, err := strconv.ParseUint(value, 10, 16) 34 | if err == nil { 35 | opts.numWorkers = uint16(multiplier) 36 | } 37 | } 38 | 39 | opts.workQueueSize = SCHEDULER_DEFAULT_WORK_QUEUE_SIZE 40 | value, ok = os.LookupEnv("SCHEDULER_WORK_QUEUE_SIZE") 41 | 42 | if ok { 43 | 
workQueueSize, err := strconv.ParseUint(value, 10, 64) 44 | if err == nil { 45 | opts.workQueueSize = workQueueSize 46 | } 47 | } 48 | return opts 49 | } 50 | 51 | func WithReqResPoolSize(n uint64) types.OptFunc[opts] { 52 | return func(opts *opts) { 53 | opts.reqResPoolSize = n 54 | } 55 | } 56 | 57 | func WithWorkers(n uint16) types.OptFunc[opts] { 58 | return func(opts *opts) { 59 | opts.numWorkers = n 60 | } 61 | } 62 | 63 | func WithWorkQueueSize(n uint64) types.OptFunc[opts] { 64 | return func(opts *opts) { 65 | opts.workQueueSize = n 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /pkg/scheduler/ports.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import ( 4 | "github.com/tech-engine/goscrapy/pkg/core" 5 | "github.com/tech-engine/goscrapy/pkg/engine" 6 | ) 7 | 8 | // An executor must implement the IExecutor interface to be used by the scheduler.*Scheduler 9 | type IExecutor interface { 10 | Execute(core.IRequestReader, engine.IResponseWriter) error 11 | } 12 | -------------------------------------------------------------------------------- /pkg/scheduler/request.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "encoding/json" 7 | "fmt" 8 | "io" 9 | "net/http" 10 | "net/url" 11 | "strings" 12 | 13 | "github.com/tech-engine/goscrapy/internal/fsm" 14 | "github.com/tech-engine/goscrapy/pkg/core" 15 | ) 16 | 17 | type request struct { 18 | ctx context.Context 19 | url *url.URL 20 | method string 21 | body io.ReadCloser 22 | header http.Header 23 | meta *fsm.FixedSizeMap[string, any] 24 | cookieJarKey string 25 | } 26 | 27 | // Request inplements core.IRequestReader 28 | func (r *request) ReadMethod() string { 29 | return r.method 30 | } 31 | 32 | func (r *request) ReadUrl() *url.URL { 33 | return r.url 34 | } 35 | 36 | func (r *request) 
ReadHeader() http.Header { 37 | return r.header 38 | } 39 | 40 | func (r *request) ReadBody() io.ReadCloser { 41 | return r.body 42 | } 43 | 44 | func (r *request) ReadContext() context.Context { 45 | return r.ctx 46 | } 47 | 48 | // ReadMeta give us a shallow copy of meta. 49 | func (r *request) ReadMeta() *fsm.FixedSizeMap[string, any] { 50 | return r.meta 51 | } 52 | 53 | // Read the cookie jar key associated with a request 54 | func (r *request) ReadCookieJar() string { 55 | return r.cookieJarKey 56 | } 57 | 58 | // Request inplements core.IRequestWriter 59 | func (r *request) Url(_url string) core.IRequestWriter { 60 | __url, err := url.Parse(_url) 61 | 62 | if err != nil { 63 | panic(fmt.Sprintf("SetUrl: error parsing url")) 64 | } 65 | 66 | r.url = __url 67 | return r 68 | } 69 | 70 | func (r *request) Method(method string) core.IRequestWriter { 71 | r.method = strings.ToUpper(method) 72 | return r 73 | } 74 | 75 | func (r *request) Body(body any) core.IRequestWriter { 76 | switch v := body.(type) { 77 | case io.Reader: 78 | r.body = io.NopCloser(v) 79 | case io.ReadCloser: 80 | r.body = v 81 | case string: 82 | r.body = io.NopCloser(strings.NewReader(v)) 83 | case []byte: 84 | r.body = io.NopCloser(bytes.NewReader(v)) 85 | default: 86 | var buf *bytes.Buffer 87 | _ = json.NewEncoder(buf).Encode(v) 88 | r.body = io.NopCloser(buf) 89 | } 90 | 91 | return r 92 | } 93 | 94 | func (r *request) Header(header http.Header) core.IRequestWriter { 95 | r.header = header 96 | return r 97 | } 98 | 99 | func (r *request) CookieJar(key string) core.IRequestWriter { 100 | r.cookieJarKey = key 101 | return r 102 | } 103 | 104 | // Pass meta data as key/value pair to be available in callback response. 
105 | func (r *request) Meta(key string, val any) core.IRequestWriter { 106 | if r.meta == nil { 107 | r.meta = fsm.New[string, any](24) 108 | } 109 | r.meta.Set(key, val) 110 | return r 111 | } 112 | 113 | func (r *request) WithContext(ctx context.Context) core.IRequestWriter { 114 | r.ctx = ctx 115 | return r 116 | } 117 | 118 | // func (r *request) MetaDataKey(key string) (any, bool) { 119 | // if r.meta == nil { 120 | // return nil, false 121 | // } 122 | 123 | // val, ok := r.meta[key] 124 | // return val, ok 125 | // } 126 | 127 | func (r *request) Reset() { 128 | r.method = "" 129 | r.url = nil 130 | if r.header != nil { 131 | for key := range r.header { 132 | r.header.Del(key) 133 | } 134 | } 135 | r.body = nil 136 | r.cookieJarKey = "" 137 | if r.meta != nil { 138 | r.meta.Clear() 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /pkg/scheduler/response.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "net/http" 7 | 8 | "github.com/tech-engine/goscrapy/internal/fsm" 9 | "github.com/tech-engine/goscrapy/pkg/core" 10 | "golang.org/x/net/html" 11 | ) 12 | 13 | func NewResponse() *response { 14 | return &response{} 15 | } 16 | 17 | type response struct { 18 | statusCode int 19 | body io.ReadCloser 20 | header http.Header 21 | cookies []*http.Cookie 22 | request *http.Request 23 | meta *fsm.FixedSizeMap[string, any] 24 | nodes Selectors 25 | } 26 | 27 | // response implementing core.ResponseReader 28 | func (r *response) Request() *http.Request { 29 | return r.request 30 | } 31 | 32 | func (r *response) StatusCode() int { 33 | return r.statusCode 34 | } 35 | 36 | func (r *response) Body() io.ReadCloser { 37 | return r.body 38 | } 39 | 40 | func (r *response) Header() http.Header { 41 | return r.header 42 | } 43 | 44 | func (r *response) Cookies() []*http.Cookie { 45 | return r.cookies 46 | } 47 | 48 | func (r 
*response) Meta(key string) (any, bool) { 49 | return r.meta.Get(key) 50 | } 51 | 52 | func (r *response) Bytes() []byte { 53 | buff := new(bytes.Buffer) 54 | buff.ReadFrom(r.body) 55 | return buff.Bytes() 56 | } 57 | 58 | func (r *response) Reset() { 59 | r.statusCode = 0 60 | r.body = nil 61 | r.header = nil 62 | r.cookies = nil 63 | r.request = nil 64 | // because we there isn't guarantee that we will have the same pair for req-res from the pools, 65 | // we must set it meta=nil upon releasing req-res to their respective pools, otherwise we will have corrupt data. 66 | r.meta = nil 67 | r.nodes = nil 68 | } 69 | 70 | // response implementing engine.ResponseWriter 71 | func (r *response) WriteRequest(request *http.Request) { 72 | r.request = request 73 | } 74 | 75 | func (r *response) WriteHeader(header http.Header) { 76 | r.header = header 77 | } 78 | 79 | func (r *response) WriteBody(body io.ReadCloser) { 80 | r.body = body 81 | } 82 | 83 | func (r *response) WriteStatusCode(statuscode int) { 84 | r.statusCode = statuscode 85 | } 86 | 87 | func (r *response) WriteCookies(cookies []*http.Cookie) { 88 | r.cookies = cookies 89 | } 90 | 91 | func (r *response) WriteMeta(meta *fsm.FixedSizeMap[string, any]) { 92 | r.meta = meta 93 | } 94 | 95 | func (r *response) Css(selector string) core.ISelector { 96 | 97 | if r.nodes == nil { 98 | if nodes, err := NewSelector(r.body); err == nil { 99 | r.nodes = nodes 100 | } 101 | } 102 | 103 | return r.nodes.Css(selector) 104 | } 105 | 106 | func (r *response) Xpath(xpath string) core.ISelector { 107 | 108 | if r.nodes == nil { 109 | if nodes, err := NewSelector(r.body); err == nil { 110 | r.nodes = nodes 111 | } 112 | } 113 | return r.nodes.Xpath(xpath) 114 | } 115 | 116 | func (r *response) Text(def ...string) []string { 117 | return r.nodes.Text(def...) 
118 | } 119 | 120 | func (r *response) Attr(attrName string) []string { 121 | return r.nodes.Attr(attrName) 122 | } 123 | 124 | func (r *response) Get() *html.Node { 125 | return r.nodes.Get() 126 | } 127 | 128 | func (r *response) GetAll() []*html.Node { 129 | return r.nodes.GetAll() 130 | } 131 | -------------------------------------------------------------------------------- /pkg/scheduler/scheduler.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import ( 4 | "context" 5 | "net/http" 6 | "sync" 7 | 8 | rp "github.com/tech-engine/goscrapy/internal/resource_pool" 9 | "github.com/tech-engine/goscrapy/internal/types" 10 | "github.com/tech-engine/goscrapy/pkg/core" 11 | ) 12 | 13 | type scheduler struct { 14 | opts 15 | executor IExecutor 16 | schedulerWorkPool *rp.Pooler[schedulerWork] 17 | requestPool *rp.Pooler[request] 18 | workerQueue WorkerQueue 19 | workQueue WorkQueue 20 | } 21 | 22 | // NewScheduler creates a new scheduler. 
23 | func New(executor IExecutor, optFuncs ...types.OptFunc[opts]) *scheduler { 24 | 25 | // set default options 26 | opts := defaultOpts() 27 | 28 | // set custom options 29 | for _, fn := range optFuncs { 30 | fn(&opts) 31 | } 32 | 33 | return &scheduler{ 34 | opts: opts, 35 | executor: executor, 36 | schedulerWorkPool: rp.NewPooler(rp.WithSize[schedulerWork](opts.reqResPoolSize)), 37 | requestPool: rp.NewPooler(rp.WithSize[request](opts.reqResPoolSize)), 38 | workerQueue: make(WorkerQueue, opts.numWorkers), 39 | workQueue: make(WorkQueue, opts.workQueueSize), 40 | } 41 | } 42 | 43 | func (s *scheduler) WithExecutor(executor IExecutor) { 44 | s.executor = executor 45 | } 46 | 47 | // Handles creating workers and listening on the work queue 48 | func (s *scheduler) Start(ctx context.Context) error { 49 | 50 | if ctx.Err() != nil { 51 | return ctx.Err() 52 | } 53 | 54 | var ( 55 | i uint16 56 | err error 57 | wg sync.WaitGroup 58 | ) 59 | 60 | defer wg.Wait() 61 | wg.Add(int(s.opts.numWorkers)) 62 | 63 | // this is to make sure that we close the scheduler and after that close all the workers 64 | wCtx, wCancel := context.WithCancel(context.Background()) 65 | 66 | for i = 0; i < s.opts.numWorkers; i++ { 67 | go func() { 68 | defer wg.Done() 69 | worker := NewWorker(i+1, s.executor, s.workerQueue, s.schedulerWorkPool, s.requestPool, s.opts.reqResPoolSize) 70 | 71 | // blocking 72 | _ = worker.Start(wCtx) 73 | }() 74 | } 75 | 76 | // below will trigger context cancellation for the worker after scheduler is done. 
77 | defer wCancel() 78 | 79 | for { 80 | select { 81 | case work := <-s.workQueue: 82 | 83 | // the below check ensures our scheduler don't pick any worker once context has been cancelled 84 | if err = ctx.Err(); err != nil { 85 | return err 86 | } 87 | 88 | wg.Add(1) 89 | go s.push(&wg, work) 90 | case <-ctx.Done(): 91 | return ctx.Err() 92 | } 93 | } 94 | } 95 | 96 | func (s *scheduler) Schedule(req core.IRequestReader, next core.ResponseCallback) { 97 | 98 | work := s.schedulerWorkPool.Acquire() 99 | 100 | if work == nil { 101 | work = &schedulerWork{} 102 | } 103 | 104 | work.request = req 105 | work.next = next 106 | 107 | s.workQueue <- work 108 | } 109 | 110 | func (s *scheduler) NewRequest() core.IRequestRW { 111 | req := s.requestPool.Acquire() 112 | if req == nil { 113 | req = &request{ 114 | method: "GET", 115 | header: make(http.Header), 116 | } 117 | } 118 | return req 119 | } 120 | 121 | // push a *schedulerWork unit to a worker 122 | func (s *scheduler) push(wg *sync.WaitGroup, work *schedulerWork) { 123 | defer wg.Done() 124 | 125 | // pull a worker and push a task in the worker's queue 126 | worker := <-s.workerQueue 127 | worker <- work 128 | } 129 | -------------------------------------------------------------------------------- /pkg/scheduler/scheduler_work.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import "github.com/tech-engine/goscrapy/pkg/core" 4 | 5 | type schedulerWork struct { 6 | next core.ResponseCallback 7 | request core.IRequestReader 8 | } 9 | 10 | func (s *schedulerWork) Reset() { 11 | s.next = nil 12 | s.request = nil 13 | } 14 | -------------------------------------------------------------------------------- /pkg/scheduler/selectors.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import ( 4 | "io" 5 | "strings" 6 | 7 | "github.com/andybalholm/cascadia" 8 | "github.com/antchfx/htmlquery" 9 | 
"github.com/tech-engine/goscrapy/pkg/core" 10 | "golang.org/x/net/html" 11 | ) 12 | 13 | type Selectors []*html.Node 14 | 15 | func NewSelector(r io.Reader) (Selectors, error) { 16 | root, err := html.Parse(r) 17 | if err != nil { 18 | return nil, err 19 | } 20 | return Selectors([]*html.Node{root}), nil 21 | } 22 | 23 | // Css selector - select element by id, class, nodename etc. 24 | func (nodes Selectors) Css(selector string) core.ISelector { 25 | sel, err := cascadia.ParseWithPseudoElement(selector) 26 | if err != nil { 27 | return Selectors([]*html.Node{}) 28 | } 29 | 30 | selected := make(Selectors, 0, len(nodes)) 31 | for _, node := range nodes { 32 | selected = append(selected, cascadia.QueryAll(node, sel)...) 33 | } 34 | 35 | return selected 36 | } 37 | 38 | // Xpath selector - select element using an xpath expression. 39 | func (nodes Selectors) Xpath(xpath string) core.ISelector { 40 | selected := make(Selectors, 0, len(nodes)) 41 | for _, node := range nodes { 42 | matches, err := htmlquery.QueryAll(node, xpath) 43 | if err != nil { 44 | continue 45 | } 46 | selected = append(selected, matches...) 47 | } 48 | return selected 49 | } 50 | 51 | // Extracts all the text of a node and it's descendents. 
52 | func (nodes Selectors) Text(def ...string) []string { 53 | texts := make([]string, 0, len(nodes)) 54 | for _, node := range nodes { 55 | text := strings.TrimSpace(htmlquery.InnerText(node)) 56 | if text == "" && len(def) > 0 { 57 | texts = append(texts, def[0]) 58 | continue 59 | } 60 | texts = append(texts, text) 61 | } 62 | return texts 63 | } 64 | 65 | // Extracts attribute values 66 | func (nodes Selectors) Attr(attrName string) []string { 67 | attrs := make([]string, 0, len(nodes)) 68 | for _, node := range nodes { 69 | for _, attr := range node.Attr { 70 | if attr.Key == attrName { 71 | attrs = append(attrs, attr.Val) 72 | } 73 | } 74 | } 75 | return attrs 76 | } 77 | 78 | // Get the first matched node 79 | func (nodes Selectors) Get() *html.Node { 80 | if len(nodes) <= 0 { 81 | return nil 82 | } 83 | return nodes[0] 84 | } 85 | 86 | // Gets all the matched nodes 87 | func (nodes Selectors) GetAll() []*html.Node { 88 | return nodes 89 | } 90 | -------------------------------------------------------------------------------- /pkg/scheduler/selectors_test.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func TestSelectors(t *testing.T) { 11 | html := ` 12 | 13 | 14 |
15 |

Title

16 |

Introduction paragraph 1

17 | Example Link 18 |

This is test paragraph

19 |

Introduction paragraph 3

20 |
21 | 22 | 23 | ` 24 | 25 | selector, err := NewSelector(strings.NewReader(html)) 26 | 27 | assert.NoError(t, err) 28 | 29 | cssSelector := selector.Css("p.intro") 30 | 31 | cssNodes := cssSelector.GetAll() 32 | assert.Len(t, cssNodes, 2, "expected nodes=2, got=%s", len(cssNodes)) 33 | 34 | cssNodesTexts := cssSelector.Text() 35 | assert.Equal(t, "Introduction paragraph 1", cssNodesTexts[0], "expected paragraph text=Introduction paragraph 1, got=%s", cssNodesTexts[0]) 36 | 37 | xpathSelector := selector.Xpath("//p[@data-mg='test']") 38 | 39 | xpathNodes := xpathSelector.GetAll() 40 | assert.Len(t, xpathNodes, 1, "expected xpath nodes=1, got=%s", len(xpathNodes)) 41 | 42 | xpathNodesTexts := xpathSelector.Text() 43 | assert.Len(t, xpathNodesTexts, 1, "expected xpathNodesTexts=1, got=%s", len(xpathNodesTexts)) 44 | assert.Equal(t, "Introduction paragraph 3", xpathNodesTexts[0], "expected paragraph text=Introduction paragraph 3, got=%s", xpathNodesTexts[0]) 45 | 46 | attrValues := selector.Css("a").Attr("href") 47 | assert.Len(t, xpathNodesTexts, 1, "expected attrValues=1, got=%s", len(attrValues)) 48 | assert.Equal(t, "http://example.com", attrValues[0], "expected href=http://example.com, got=%s", attrValues[0]) 49 | 50 | noCssElements := selector.Css("p.box").GetAll() 51 | assert.Empty(t, noCssElements, "expected element=0, got=%s", len(noCssElements)) 52 | 53 | noXpathElements := selector.Xpath("//p[@class='test']").GetAll() 54 | assert.Empty(t, noXpathElements, "expected element=0, got=%s", len(noXpathElements)) 55 | 56 | } 57 | -------------------------------------------------------------------------------- /pkg/scheduler/types.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | type WorkQueue chan *schedulerWork 4 | type WorkerQueue chan WorkQueue 5 | -------------------------------------------------------------------------------- /pkg/scheduler/worker.go: 
-------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import ( 4 | "context" 5 | "io" 6 | "sync" 7 | 8 | rp "github.com/tech-engine/goscrapy/internal/resource_pool" 9 | ) 10 | 11 | // Worker will handle the execution of a Work unit 12 | type Worker struct { 13 | ID uint16 14 | executor IExecutor 15 | workerQueue WorkerQueue 16 | workQueue WorkQueue 17 | schedulerWorkPool *rp.Pooler[schedulerWork] 18 | responsePool *rp.Pooler[response] 19 | requestPool *rp.Pooler[request] 20 | } 21 | 22 | func NewWorker(id uint16, executor IExecutor, workerQueue WorkerQueue, schedulerWorkPool *rp.Pooler[schedulerWork], requestPool *rp.Pooler[request], respPoolSize uint64) *Worker { 23 | 24 | return &Worker{ 25 | ID: id, 26 | workerQueue: workerQueue, 27 | executor: executor, 28 | workQueue: make(WorkQueue), 29 | schedulerWorkPool: schedulerWorkPool, 30 | requestPool: requestPool, 31 | responsePool: rp.NewPooler(rp.WithSize[response](respPoolSize)), 32 | } 33 | } 34 | 35 | // Handles listen for any incoming work in workQueue 36 | func (w *Worker) Start(ctx context.Context) error { 37 | var err error 38 | 39 | if err = ctx.Err(); err != nil { 40 | return err 41 | } 42 | 43 | var wg sync.WaitGroup 44 | 45 | // we wait for all worker jobs to be completed finished/fail afer context cancellation 46 | defer wg.Wait() 47 | 48 | for { 49 | 50 | // make this worker available again 51 | w.workerQueue <- w.workQueue 52 | 53 | select { 54 | case work := <-w.workQueue: 55 | 56 | if err = ctx.Err(); err != nil { 57 | return err 58 | } 59 | 60 | wg.Add(1) 61 | 62 | // we don't want the workers to crash, so we ignore the error from execute 63 | _ = w.execute(ctx, work) 64 | wg.Done() 65 | 66 | case <-ctx.Done(): 67 | return ctx.Err() 68 | } 69 | } 70 | } 71 | 72 | // Handles executing a scheduler work and calling the next callback of with the result as response 73 | func (w *Worker) execute(ctx context.Context, work *schedulerWork) error { 74 
| 75 | res := w.responsePool.Acquire() 76 | 77 | if res == nil { 78 | res = &response{} 79 | } 80 | 81 | // we do some cleanup here on the response object 82 | defer func() { 83 | w.resetAndRelease(work) 84 | 85 | // discard unread body 86 | if res.body != nil { 87 | io.Copy(io.Discard, res.body) 88 | res.body.Close() 89 | } 90 | 91 | res.Reset() 92 | w.responsePool.Release(res) 93 | }() 94 | 95 | if err := w.executor.Execute(work.request, res); err != nil { 96 | // resetAndRelease(work) 97 | return err 98 | } 99 | 100 | next := (*work).next 101 | pCtx := work.request.ReadContext() 102 | 103 | // next==nil means this is the last callback of the spider 104 | if next == nil { 105 | return nil 106 | } 107 | 108 | // call to callback must me blocking so that the callback can read from the response 109 | // before the response is resetted and returned to pool 110 | if pCtx == nil { 111 | pCtx = context.Background() 112 | } 113 | 114 | // we copy meta from our request to our response to be accessible to the spider 115 | res.WriteMeta(work.request.ReadMeta()) 116 | 117 | next(context.WithValue(pCtx, "WORKER_ID", w.ID), res) 118 | return nil 119 | } 120 | 121 | func (w *Worker) resetAndRelease(work *schedulerWork) { 122 | // release *request to pool 123 | req, ok := work.request.(*request) 124 | 125 | if !ok { 126 | return 127 | } 128 | 129 | req.Reset() 130 | 131 | w.requestPool.Release(req) 132 | 133 | // release *schedulerWork to pool 134 | work.Reset() 135 | 136 | w.schedulerWorkPool.Release(work) 137 | } 138 | -------------------------------------------------------------------------------- /pkg/scheduler/worker_test.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import ( 4 | "context" 5 | "net/http" 6 | "testing" 7 | 8 | rp "github.com/tech-engine/goscrapy/internal/resource_pool" 9 | "github.com/tech-engine/goscrapy/pkg/core" 10 | "github.com/tech-engine/goscrapy/pkg/engine" 11 | ) 12 | 13 | type 
dummyExecutor struct { 14 | } 15 | 16 | func (e *dummyExecutor) Execute(reader core.IRequestReader, writer engine.IResponseWriter) error { 17 | return nil 18 | } 19 | 20 | func TestWorker(t *testing.T) { 21 | // create a worker 22 | var workerId uint16 = 1 23 | var respPoolSize uint64 = 1 24 | 25 | executor := &dummyExecutor{} 26 | workerQueue := make(WorkerQueue, 1) 27 | schedulerWorkPool := rp.NewPooler(rp.WithSize[schedulerWork](1)) 28 | requestPool := rp.NewPooler(rp.WithSize[request](1)) 29 | 30 | worker := NewWorker( 31 | workerId, 32 | executor, 33 | workerQueue, 34 | schedulerWorkPool, 35 | requestPool, 36 | respPoolSize, 37 | ) 38 | 39 | ctx, cancel := context.WithCancel(context.Background()) 40 | 41 | // start the worker 42 | go func() { 43 | worker.Start(ctx) 44 | }() 45 | 46 | // create a scheduler work 47 | work := &schedulerWork{ 48 | next: func(ctx context.Context, resp core.IResponseReader) { 49 | }, 50 | request: &request{ 51 | method: "GET", 52 | header: make(http.Header), 53 | }, 54 | } 55 | // execute a task 56 | worker.execute(ctx, work) 57 | cancel() 58 | } 59 | --------------------------------------------------------------------------------