├── server ├── public │ └── build │ │ └── .gitkeep ├── assets │ ├── css │ │ └── tailwind.css │ └── img │ │ ├── favicon.ico │ │ ├── favicon-16x16.png │ │ ├── favicon-32x32.png │ │ ├── apple-touch-icon.png │ │ ├── android-chrome-192x192.png │ │ ├── android-chrome-512x512.png │ │ └── till-logo.svg ├── proxy.go ├── stats.go ├── ui.go ├── templates │ ├── index.html │ ├── layouts │ │ └── master.html │ └── requests │ │ └── index.html ├── handlers │ ├── request_handlers.go │ └── handlers.go └── server.go ├── examples ├── python │ └── scrapy │ │ ├── tutorial │ │ ├── tutorial │ │ │ ├── __init__.py │ │ │ ├── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── quotes_spider.py │ │ │ ├── items.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ ├── middlewares.py │ │ │ ├── quotes-1.html │ │ │ └── quotes-2.html │ │ └── scrapy.cfg │ │ └── README.md ├── ruby │ └── kimurai │ │ ├── Gemfile │ │ ├── README.md │ │ ├── example.rb │ │ └── Gemfile.lock ├── go │ ├── standard │ │ ├── go.mod │ │ ├── README.md │ │ └── main.go │ └── colly │ │ ├── go.mod │ │ ├── README.md │ │ ├── main.go │ │ └── go.sum └── nodejs │ ├── puppeteer │ ├── package.json │ ├── README.md │ └── example.js │ └── plain │ ├── package.json │ ├── get.js │ ├── post.js │ └── README.md ├── img ├── stat-ui.png ├── how-it-works.png ├── integrations │ ├── go.png │ ├── colly.png │ ├── java.png │ ├── ruby.png │ ├── kimurai.png │ ├── nodejs.png │ ├── python.png │ ├── scrapy.png │ ├── selenium.png │ └── puppeteer.png ├── request-log-ui.png ├── request-log-detail-ui.png ├── http-caching-flowchart.png ├── icons8-spade.svg └── till-logo.svg ├── postcss.config.js ├── internal └── tillclient │ ├── errors.go │ ├── timestamp.go │ ├── tillclient.go │ ├── instances.go │ └── instance_stats.go ├── .gitignore ├── tailwind.config.js ├── cmd ├── version.go ├── root.go └── serve.go ├── package.json ├── main.go ├── .github └── workflows │ └── release.yml ├── proxy ├── http.go ├── utils.go ├── config.go ├── tunneling.go ├── cert.go └── proxy.go ├── 
.goreleaser.yml ├── go.mod ├── LICENSE └── README.md /server/public/build/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/python/scrapy/tutorial/tutorial/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /img/stat-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/stat-ui.png -------------------------------------------------------------------------------- /examples/ruby/kimurai/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | gem 'kimurai', '~> 1.4' -------------------------------------------------------------------------------- /img/how-it-works.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/how-it-works.png -------------------------------------------------------------------------------- /img/integrations/go.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/integrations/go.png -------------------------------------------------------------------------------- /img/request-log-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/request-log-ui.png -------------------------------------------------------------------------------- /server/assets/css/tailwind.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 
-------------------------------------------------------------------------------- /examples/go/standard/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/DataHenHQ/till/examples/go/standard 2 | 3 | go 1.16 4 | -------------------------------------------------------------------------------- /img/integrations/colly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/integrations/colly.png -------------------------------------------------------------------------------- /img/integrations/java.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/integrations/java.png -------------------------------------------------------------------------------- /img/integrations/ruby.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/integrations/ruby.png -------------------------------------------------------------------------------- /img/integrations/kimurai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/integrations/kimurai.png -------------------------------------------------------------------------------- /img/integrations/nodejs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/integrations/nodejs.png -------------------------------------------------------------------------------- /img/integrations/python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/integrations/python.png -------------------------------------------------------------------------------- 
/img/integrations/scrapy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/integrations/scrapy.png -------------------------------------------------------------------------------- /img/integrations/selenium.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/integrations/selenium.png -------------------------------------------------------------------------------- /img/request-log-detail-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/request-log-detail-ui.png -------------------------------------------------------------------------------- /server/assets/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/server/assets/img/favicon.ico -------------------------------------------------------------------------------- /img/http-caching-flowchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/http-caching-flowchart.png -------------------------------------------------------------------------------- /img/integrations/puppeteer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/img/integrations/puppeteer.png -------------------------------------------------------------------------------- /server/assets/img/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/server/assets/img/favicon-16x16.png -------------------------------------------------------------------------------- /server/assets/img/favicon-32x32.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/server/assets/img/favicon-32x32.png -------------------------------------------------------------------------------- /server/assets/img/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/server/assets/img/apple-touch-icon.png -------------------------------------------------------------------------------- /postcss.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | plugins: [ 3 | require('tailwindcss'), 4 | require('autoprefixer') 5 | ] 6 | } -------------------------------------------------------------------------------- /server/assets/img/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/server/assets/img/android-chrome-192x192.png -------------------------------------------------------------------------------- /server/assets/img/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHenHQ/till/HEAD/server/assets/img/android-chrome-512x512.png -------------------------------------------------------------------------------- /examples/python/scrapy/tutorial/tutorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /internal/tillclient/errors.go: -------------------------------------------------------------------------------- 1 | package tillclient 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | ) 7 | 8 | type CustomError struct { 9 | StatusCode int 10 | 11 | Err error 12 | } 13 | 14 | func (r *CustomError) Error() string { 15 | return fmt.Sprintf("status %v", r.Err) 16 | } 17 | 18 | var ErrNotFound = errors.New("not found") 19 | -------------------------------------------------------------------------------- /examples/python/scrapy/tutorial/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tutorial.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tutorial 12 | -------------------------------------------------------------------------------- /examples/nodejs/puppeteer/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pupeteer", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "example.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "puppeteer": "^10.2.0" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /examples/python/scrapy/tutorial/tutorial/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class TutorialItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # 
name = scrapy.Field() 12 | pass 13 | -------------------------------------------------------------------------------- /examples/nodejs/plain/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "node", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "main.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "got": "^11.8.2", 13 | "hpagent": "^0.1.2" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /examples/go/colly/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/DataHenHQ/till/examples/go/colly 2 | 3 | go 1.16 4 | 5 | require ( 6 | github.com/PuerkitoBio/goquery v1.7.1 // indirect 7 | github.com/antchfx/xmlquery v1.3.6 // indirect 8 | github.com/gocolly/colly/v2 v2.1.0 9 | github.com/temoto/robotstxt v1.1.2 // indirect 10 | golang.org/x/net v0.0.0-20210813160813-60bc85c4be6d // indirect 11 | google.golang.org/appengine v1.6.7 // indirect 12 | ) 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | 16 | .vscode 17 | __debug_bin 18 | dist 19 | 20 | .env.* 21 | 22 | node_modules 23 | __pycache__ 24 | 25 | server/public/build/* -------------------------------------------------------------------------------- /examples/python/scrapy/tutorial/tutorial/pipelines.py: -------------------------------------------------------------------------------- 1 | 
# Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class TutorialPipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /examples/go/colly/README.md: -------------------------------------------------------------------------------- 1 | # Integrating Till with Go Colly 2 | 3 | You can integrate Till with [Colly](https://github.com/gocolly/colly) 4 | 5 | ## 1. Install Till 6 | Follow the instructions to [install Till](https://till.datahen.com/docs/installation) 7 | 8 | ## 2. Run the example file 9 | 10 | To run the example file: 11 | ```bash 12 | $ go run main.go 13 | ``` 14 | 15 | ### 3. Verify that it works 16 | 17 | Visit the Till UI at [http://localhost:2980/requests](http://localhost:2980/requests) to see that your new requests are shown. -------------------------------------------------------------------------------- /examples/go/standard/README.md: -------------------------------------------------------------------------------- 1 | # Integrating Till with Go's net/http standard library 2 | 3 | You can integrate Till with net/http library 4 | 5 | ## 1. Install Till 6 | Follow the instructions to [install Till](https://till.datahen.com/docs/installation) 7 | 8 | ## 2. Run the example file 9 | 10 | To run the example file: 11 | ```bash 12 | $ go run main.go 13 | ``` 14 | 15 | ### 3. Verify that it works 16 | 17 | Visit the Till UI at [http://localhost:2980/requests](http://localhost:2980/requests) to see that your new requests are shown. 
-------------------------------------------------------------------------------- /examples/nodejs/plain/get.js: -------------------------------------------------------------------------------- 1 | const got = require('got'); 2 | const {HttpsProxyAgent} = require('hpagent'); 3 | 4 | (async function main() { 5 | const response = await got.get('https://fetchtest.datahen.com/echo/request', { 6 | agent: { 7 | https: new HttpsProxyAgent({ 8 | proxy: 'http://localhost:2933', 9 | }), 10 | }, 11 | https: { 12 | rejectUnauthorized: false, 13 | }, 14 | headers: { 15 | 'X-DH-Cache-Freshness': 'now' // Forces a cache miss. 16 | } 17 | }); 18 | 19 | console.log({ response }); 20 | })(); -------------------------------------------------------------------------------- /examples/nodejs/puppeteer/README.md: -------------------------------------------------------------------------------- 1 | # Integrating Till with Puppeteer 2 | 3 | You can integrate Till with Puppeteer. 4 | 5 | ## 1. Install Till 6 | Follow the instructions to [install Till](https://till.datahen.com/docs/installation) 7 | 8 | ## 2. Run the example files 9 | 10 | The [example.js](example.js) file shows an example of a Puppeteer integration. 11 | ```bash 12 | $ node example.js 13 | ``` 14 | ### 3. Verify that it works 15 | 16 | Visit the Till UI at [http://localhost:2980/requests](http://localhost:2980/requests) to see that your new requests are shown. 
-------------------------------------------------------------------------------- /tailwind.config.js: -------------------------------------------------------------------------------- 1 | const defaultTheme = require('tailwindcss/defaultTheme') 2 | 3 | module.exports = { 4 | purge: [ 5 | './server/templates/**/*.html', 6 | './server/templates/**/*.vue', 7 | ], 8 | darkMode: false, // or 'media' or 'class' 9 | theme: { 10 | extend: { 11 | fontFamily: { 12 | sans: ['Inter var', ...defaultTheme.fontFamily.sans], 13 | }, 14 | }, 15 | }, 16 | variants: { 17 | extend: {}, 18 | }, 19 | plugins: [ 20 | require('@tailwindcss/forms'), 21 | require('@tailwindcss/typography') 22 | ], 23 | } 24 | -------------------------------------------------------------------------------- /examples/nodejs/plain/post.js: -------------------------------------------------------------------------------- 1 | const got = require('got'); 2 | const {HttpsProxyAgent} = require('hpagent'); 3 | 4 | (async function main() { 5 | const response = await got.post('https://postman-echo.com/post', { 6 | agent: { 7 | https: new HttpsProxyAgent({ 8 | proxy: 'http://localhost:2933', 9 | }), 10 | }, 11 | https: { 12 | rejectUnauthorized: false, 13 | }, 14 | headers: { 15 | 'X-DH-Cache-Freshness': 'now' // Forces a cache miss. 16 | }, 17 | json: { 18 | hello: 'world' 19 | } 20 | }); 21 | 22 | console.log({ response }); 23 | })(); 24 | -------------------------------------------------------------------------------- /examples/nodejs/plain/README.md: -------------------------------------------------------------------------------- 1 | # Integrating Till with Node.js 2 | 3 | You can integrate Till with Node.js based scrapers. 4 | 5 | ## 1. Install Till 6 | Follow the instructions to [install Till](https://till.datahen.com/docs/installation) 7 | 8 | ## 2. Run the example files 9 | 10 | The [get.js](get.js) file shows an example of a GET request. 
class QuotesSpider(scrapy.Spider):
    """Spider that fetches two fixed quotes pages and stores each one on disk."""

    name = "quotes"

    def start_requests(self):
        """Yield one request per seed URL, each handled by ``parse``."""
        for page_url in (
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ):
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        """Write the raw response body to ``quotes-<page>.html`` and log it."""
        # The page identifier is the second-to-last "/"-separated piece of the
        # URL (the URLs above end with a trailing slash).
        url_parts = response.url.split("/")
        page_id = url_parts[-2]
        target = f'quotes-{page_id}.html'
        with open(target, 'wb') as out:
            out.write(response.body)
        self.log(f'Saved file {target}')
const puppeteer = require('puppeteer');

// Launches a headless browser that sends its traffic through the proxy at
// localhost:2933, loads a test page and prints its HTML.
(async () => {
  const browser = await puppeteer.launch(
    {
      headless: true,
      // The proxy re-signs HTTPS traffic, so certificate errors are ignored.
      ignoreHTTPSErrors: true,
      acceptInsecureCerts: true,
      args: [
        '--proxy-server=http://localhost:2933',
        '--ignore-certificate-errors',
        // NOTE: the former '--ignore-certificate-errors-spki-list ' entry was
        // dropped: it had a trailing space and no hash list.
      ],
    }
  );

  try {
    const page = await browser.newPage();

    await page.setExtraHTTPHeaders({
      // Add the header to force a Cache Miss on Till
      'X-DH-Cache-Freshness': 'now'
    })

    await page.goto('https://fetchtest.datahen.com/echo/request');

    const txt = await page.content()
    console.log(txt);
  } finally {
    // Always shut the browser down, even when navigation fails, so no
    // orphaned browser process is left behind.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
-------------------------------------------------------------------------------- 1 | { 2 | "name": "till", 3 | "version": "1.0.0", 4 | "main": "index.js", 5 | "repository": "git@github.com:DataHenHQ/till.git", 6 | "author": "Parama Danoesubroto ", 7 | "license": "MIT", 8 | "devDependencies": { 9 | "@tailwindcss/forms": "^0.2.1", 10 | "autoprefixer": "^10.1.0", 11 | "postcss-cli": "^8.3.1", 12 | "tailwindcss": "^2.0.2" 13 | }, 14 | "scripts": { 15 | "build": "postcss server/assets/css/tailwind.css -o server/public/build/css/tailwind.css && rm -rf server/public/build/img && cp -r server/assets/img server/public/build/img", 16 | "build:production": "NODE_ENV=production postcss server/assets/css/tailwind.css -o server/public/build/css/tailwind.css && cp -r server/assets/img server/public/build/img" 17 | }, 18 | "dependencies": { 19 | "@tailwindcss/typography": "^0.4.0" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /examples/python/scrapy/README.md: -------------------------------------------------------------------------------- 1 | # Integrating Till with Scrapy 2 | 3 | The example in [this directory](tutorial/) was taken from Scrapy's [tutorial page](https://docs.scrapy.org/en/latest/intro/tutorial.html) and modified to integrate with Till. 4 | 5 | To integrate with Till, we only need to do two things: 6 | 7 | 1. On [middlewares.py](tutorial/tutorial/middlewares.py#L14) file, add the `TillMiddleware` class. 8 | 2. On [settings.py](tutorial/tutorial/settings.py#L54) file, enable the downloader middlewares and add the `tutorial.middlewares.TillMiddleware` key. 9 | 10 | ## 1. Install Till 11 | Follow the instructions to [install Till](https://till.datahen.com/docs/installation) 12 | 13 | ## 2. Run the example 14 | 15 | ```bash 16 | # Install Scrapy 17 | $ pip install scrapy 18 | # On the tutorial directory, run: 19 | $ scrapy crawl quotes 20 | ``` 21 | 22 | ### 3. 
Verify that it works 23 | 24 | Visit the Till UI at [http://localhost:2980/requests](http://localhost:2980/requests) to see that your new requests are shown. -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2021 DataHen Canada Inc 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package main 16 | 17 | import ( 18 | "github.com/DataHenHQ/till/cmd" 19 | ) 20 | 21 | var ( 22 | version = "" 23 | commit = "" 24 | date = "" 25 | baseurl = "" 26 | pubkey = "" 27 | ) 28 | 29 | func main() { 30 | if version != "" { 31 | cmd.ReleaseVersion = version 32 | } 33 | if commit != "" { 34 | cmd.ReleaseCommit = commit 35 | } 36 | if date != "" { 37 | cmd.ReleaseDate = date 38 | } 39 | if baseurl != "" { 40 | cmd.BaseURL = baseurl 41 | } 42 | if pubkey != "" { 43 | cmd.PubKey = pubkey 44 | } 45 | cmd.Execute() 46 | } 47 | -------------------------------------------------------------------------------- /server/proxy.go: -------------------------------------------------------------------------------- 1 | package server 2 | 3 | import ( 4 | "crypto/tls" 5 | "fmt" 6 | "log" 7 | "net/http" 8 | "time" 9 | 10 | "github.com/DataHenHQ/till/internal/tillclient" 11 | "github.com/DataHenHQ/till/proxy" 12 | ) 13 | 14 | type ProxyServer struct { 15 | server *http.Server 16 | port string 17 | instance 
*tillclient.Instance 18 | } 19 | 20 | func NewProxyServer(port string, i *tillclient.Instance) (s *ProxyServer, err error) { 21 | s = &ProxyServer{ 22 | server: &http.Server{ 23 | 24 | Addr: fmt.Sprintf(":%v", port), 25 | ReadTimeout: 1 * time.Minute, 26 | WriteTimeout: 1 * time.Minute, 27 | Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 28 | if r.Method == http.MethodConnect { 29 | proxy.HandleTunneling(w, r) 30 | } else { 31 | proxy.HandleHTTP(w, r) 32 | } 33 | }), 34 | // Disable HTTP/2. 35 | TLSNextProto: make(map[string]func(*http.Server, *tls.Conn, http.Handler)), 36 | }, 37 | port: port, 38 | instance: i, 39 | } 40 | 41 | return s, nil 42 | } 43 | 44 | func (s *ProxyServer) ListenAndServe() { 45 | fmt.Printf("Starting Till on http://localhost:%v\n", s.port) 46 | if err := s.server.ListenAndServe(); err != nil { 47 | log.Println("shutting down TIll Proxy server:", err) 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /internal/tillclient/timestamp.go: -------------------------------------------------------------------------------- 1 | // this file is adapted from by go-github package https://github.com/google/go-github/blob/master/github/timestamp.go 2 | package tillclient 3 | 4 | import ( 5 | "strconv" 6 | "time" 7 | ) 8 | 9 | // Timestamp represents a time that can be unmarshalled from a JSON string 10 | // formatted as either an RFC3339 or Unix timestamp. This is necessary for some 11 | // fields since the GitHub API is inconsistent in how it represents times. All 12 | // exported methods of time.Time can be called on Timestamp. 13 | type Timestamp struct { 14 | time.Time 15 | } 16 | 17 | func (t Timestamp) String() string { 18 | return t.Time.String() 19 | } 20 | 21 | // UnmarshalJSON implements the json.Unmarshaler interface. 22 | // Time is expected in RFC3339 or Unix format. 
23 | func (t *Timestamp) UnmarshalJSON(data []byte) (err error) { 24 | str := string(data) 25 | i, err := strconv.ParseInt(str, 10, 64) 26 | if err == nil { 27 | t.Time = time.Unix(i, 0) 28 | if t.Time.Year() > 3000 { 29 | t.Time = time.Unix(0, i*1e6) 30 | } 31 | } else { 32 | t.Time, err = time.Parse(`"`+time.RFC3339+`"`, str) 33 | } 34 | return 35 | } 36 | 37 | // Equal reports whether t and u are equal based on time.Equal 38 | func (t Timestamp) Equal(u Timestamp) bool { 39 | return t.Time.Equal(u.Time) 40 | } 41 | -------------------------------------------------------------------------------- /server/stats.go: -------------------------------------------------------------------------------- 1 | package server 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "sync" 8 | "time" 9 | 10 | "github.com/DataHenHQ/till/internal/tillclient" 11 | ) 12 | 13 | func newInstanceStatMutex() tillclient.InstanceStatMutex { 14 | return tillclient.InstanceStatMutex{ 15 | Mutex: &sync.Mutex{}, 16 | InstanceStat: tillclient.InstanceStat{ 17 | Requests: newZeroStat(), 18 | SuccessfulRequests: newZeroStat(), 19 | FailedRequests: newZeroStat(), 20 | InterceptedRequests: newZeroStat(), 21 | CacheHits: newZeroStat(), 22 | CacheSets: newZeroStat(), 23 | Name: &InstanceName, 24 | }, 25 | } 26 | } 27 | 28 | func startRecurringStatUpdate() { 29 | client, err := tillclient.NewClient(Token) 30 | if err != nil { 31 | log.Fatal(err) 32 | } 33 | 34 | for { 35 | time.Sleep(time.Second * 5) 36 | // Take a snapshot of the state of the instate stat by doing deep copy 37 | is := StatMu.InstanceStat.DeepCopy() 38 | 39 | // if instance stat is zero then skip this step 40 | if is.IsZero() { 41 | continue 42 | } 43 | 44 | // Update the stat on the cloud 45 | i, _, err := client.InstanceStats.Update(context.Background(), is) 46 | if err != nil { 47 | fmt.Printf("gotten error: %v\n", err) 48 | continue 49 | } 50 | 51 | // set the current instance global var 52 | curri = *i 53 | 54 | 
resetInstanceStatDelta(is) 55 | 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: goreleaser 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | pull_request: 8 | tags: 9 | - 'v*' 10 | jobs: 11 | goreleaser: 12 | name: Release 13 | runs-on: ubuntu-latest 14 | container: 15 | image: bepsays/ci-goreleaser:1.17.2 16 | steps: 17 | - 18 | name: Checkout 19 | uses: actions/checkout@v2 20 | with: 21 | fetch-depth: 0 22 | 23 | - 24 | name: Set up Go 25 | uses: actions/setup-go@v2 26 | with: 27 | go-version: 1.17 28 | - 29 | name: Granting private modules access 30 | run: | 31 | git config --global url."https://${{ secrets.GO_MODULES_TOKEN }}:x-oauth-basic@github.com/DataHenHQ".insteadOf "https://github.com/DataHenHQ" 32 | 33 | - 34 | name: Setup Node 35 | uses: actions/setup-node@v1 36 | with: 37 | node-version: '12.x' 38 | 39 | - 40 | name: Install Yarn 41 | run: npm install -g yarn 42 | 43 | - 44 | name: Yarn install 45 | run: yarn install --frozen-lockfile 46 | 47 | - 48 | name: Yarn build production 49 | run: yarn run build:production 50 | 51 | - 52 | name: Run GoReleaser 53 | uses: goreleaser/goreleaser-action@v2 54 | with: 55 | version: latest 56 | args: release --rm-dist 57 | env: 58 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /proxy/http.go: -------------------------------------------------------------------------------- 1 | package proxy 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "net/http" 7 | 8 | "github.com/DataHenHQ/tillup/features" 9 | "github.com/DataHenHQ/tillup/interceptions" 10 | ) 11 | 12 | // HandleHTTP proxies the request from source to target 13 | func HandleHTTP(sw http.ResponseWriter, sreq *http.Request) error { 14 | 15 | // Hijack the source connection 16 | sconn, _, err := sw.(http.Hijacker).Hijack() 
17 | if err != nil { 18 | http.Error(sw, "no upstream", 503) 19 | e := errors.New(fmt.Sprint("unable to hijack the source connection", sreq.Host, err)) 20 | return e 21 | } 22 | defer sconn.Close() 23 | 24 | // Generate the Page 25 | pconf := generatePageConfig(sreq) 26 | scheme := sreq.URL.Scheme 27 | p, err := NewPageFromRequest(sreq, scheme, pconf) 28 | if err != nil { 29 | return err 30 | } 31 | 32 | // If Interception is allowed and it matches 33 | if features.Allow(features.Interceptions) { 34 | if ok, in := interceptions.Matches(sreq); ok && in != nil { 35 | resp, err := in.CreateResponse() 36 | if err != nil { 37 | return err 38 | } 39 | 40 | writeToSource(sconn, resp, p) 41 | 42 | // Increment intercepted reqs stats 43 | incrInterceptedRequestStatDelta() 44 | return nil 45 | } 46 | } 47 | 48 | // Send request to target server 49 | tresp, err := sendToTarget(sreq.Context(), sconn, sreq, scheme, p, pconf) 50 | if err != nil { 51 | return err 52 | } 53 | defer tresp.Body.Close() 54 | 55 | // Write response back to the source connection 56 | writeToSource(sconn, tresp, p) 57 | return nil 58 | } 59 | -------------------------------------------------------------------------------- /proxy/utils.go: -------------------------------------------------------------------------------- 1 | package proxy 2 | 3 | import ( 4 | "bufio" 5 | "errors" 6 | "io/ioutil" 7 | "math/rand" 8 | "net" 9 | "os" 10 | "path/filepath" 11 | "time" 12 | ) 13 | 14 | func createDirIfNotExist(dirpath string) (err error) { 15 | if _, err := os.Stat(dirpath); os.IsNotExist(err) { 16 | return os.MkdirAll(dirpath, os.ModeDir|0755) 17 | } 18 | return nil 19 | } 20 | 21 | // write the full filepath, also creates non existent directory if not exist 22 | func writeFullFilePath(fullpath string, data []byte, perm os.FileMode) (err error) { 23 | dir := filepath.Dir(fullpath) 24 | err = createDirIfNotExist(dir) 25 | if err != nil { 26 | return err 27 | } 28 | 29 | return ioutil.WriteFile(fullpath, data, 
perm) 30 | } 31 | 32 | func getRandom(s []string) string { 33 | r := rand.New(rand.NewSource(time.Now().UnixNano())) 34 | i := r.Intn(len(s)) 35 | 36 | return s[i] 37 | } 38 | 39 | // dnsName returns the DNS name in addr, if any. 40 | func dnsName(addr string) string { 41 | host, _, err := net.SplitHostPort(addr) 42 | if err != nil { 43 | return "" 44 | } 45 | return host 46 | } 47 | 48 | // LoadProxyFile will load the file 49 | func LoadProxyFile(path string) (count int, urls []string, err error) { 50 | if path == "" { 51 | return 0, nil, errors.New("proxy file path cannot be blank") 52 | } 53 | 54 | f, err := os.Open(path) 55 | if err != nil { 56 | return 0, nil, err 57 | } 58 | defer f.Close() 59 | 60 | s := bufio.NewScanner(f) 61 | 62 | for s.Scan() { 63 | urls = append(urls, s.Text()) 64 | } 65 | 66 | count = len(urls) 67 | 68 | return count, urls, nil 69 | } 70 | -------------------------------------------------------------------------------- /examples/go/colly/main.go: -------------------------------------------------------------------------------- 1 | // Tutorial from from https://github.com/gocolly/colly/blob/master/_examples/basic/basic.go 2 | // modified to integrate with Till 3 | package main 4 | 5 | import ( 6 | "crypto/tls" 7 | "fmt" 8 | "log" 9 | "net/http" 10 | "net/url" 11 | 12 | "github.com/gocolly/colly/v2" 13 | ) 14 | 15 | func main() { 16 | // Instantiate default collector 17 | c := colly.NewCollector( 18 | // Visit only domains: hackerspaces.org, wiki.hackerspaces.org 19 | colly.AllowedDomains("hackerspaces.org", "wiki.hackerspaces.org"), 20 | ) 21 | 22 | // Integration with Till 23 | proxyUrl, err := url.Parse("http://localhost:2933") 24 | if err != nil { 25 | log.Fatal(err) 26 | } 27 | tilltransport := http.Transport{ 28 | Proxy: http.ProxyURL(proxyUrl), 29 | TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, 30 | } 31 | c.WithTransport(&tilltransport) 32 | 33 | // Add custom headers to tell Till what to do 34 | c.OnRequest(func(req 
*colly.Request) { 35 | // Add the header to force a Cache Miss on Till 36 | req.Headers.Add("X-DH-Cache-Freshness", "now") 37 | }) 38 | 39 | // On every a element which has href attribute call callback 40 | c.OnHTML("a[href]", func(e *colly.HTMLElement) { 41 | link := e.Attr("href") 42 | // Print link 43 | fmt.Printf("Link found: %q -> %s\n", e.Text, link) 44 | // Visit link found on page 45 | // Only those links are visited which are in AllowedDomains 46 | c.Visit(e.Request.AbsoluteURL(link)) 47 | }) 48 | 49 | // Before making a request print "Visiting ..." 50 | c.OnRequest(func(r *colly.Request) { 51 | fmt.Println("Visiting", r.URL.String()) 52 | }) 53 | 54 | // Start scraping on https://hackerspaces.org 55 | c.Visit("https://hackerspaces.org/") 56 | } 57 | -------------------------------------------------------------------------------- /server/ui.go: -------------------------------------------------------------------------------- 1 | package server 2 | 3 | import ( 4 | "fmt" 5 | "io/fs" 6 | "log" 7 | "net/http" 8 | "time" 9 | 10 | "github.com/DataHenHQ/till/internal/tillclient" 11 | "github.com/DataHenHQ/till/server/handlers" 12 | "github.com/gorilla/mux" 13 | ) 14 | 15 | type UIServer struct { 16 | server *http.Server 17 | port string 18 | instance *tillclient.Instance 19 | } 20 | 21 | func NewUIServer(port string, i *tillclient.Instance) (s *UIServer, err error) { 22 | 23 | r := mux.NewRouter() 24 | 25 | r.HandleFunc("/", handlers.IndexHandler) 26 | 27 | if !LoggerConfig.Disabled { 28 | r.HandleFunc("/requests", handlers.RequestIndexHandler) 29 | r.HandleFunc("/requests/{rid}", handlers.RequestShowHandler) 30 | r.HandleFunc("/requests/{rid}/content", handlers.RequestContentShowHandler) 31 | } 32 | 33 | // wildcard for content, so that URL path is similar to original request 34 | r.PathPrefix("/requests/{rid}/content/").HandlerFunc(handlers.RequestContentShowHandler) 35 | 36 | // serve static assets 37 | var rawPublicFs = fs.FS(embeddedAssets) 38 | assetsFs, 
err := fs.Sub(rawPublicFs, "public/build") 39 | fs := http.FileServer(http.FS(assetsFs)) 40 | r.PathPrefix("/assets/").Handler(http.StripPrefix("/assets/", fs)) 41 | 42 | s = &UIServer{ 43 | server: &http.Server{ 44 | Addr: fmt.Sprintf(":%v", port), 45 | ReadTimeout: 1 * time.Minute, 46 | WriteTimeout: 1 * time.Minute, 47 | Handler: r, 48 | }, 49 | port: port, 50 | instance: i, 51 | } 52 | 53 | return s, nil 54 | } 55 | 56 | func (s *UIServer) ListenAndServe() { 57 | fmt.Printf("Starting Till UI on http://localhost:%v\n", s.port) 58 | if err := s.server.ListenAndServe(); err != nil { 59 | log.Println("shutting down DataHen TIll UI:", err) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /img/icons8-spade.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.goreleaser.yml: -------------------------------------------------------------------------------- 1 | # This is an example .goreleaser.yml file with some sane defaults. 2 | # Make sure to check the documentation at http://goreleaser.com 3 | before: 4 | hooks: 5 | # You may remove this if you don't use go modules. 6 | - go mod download 7 | # you may remove this if you don't need go generate 8 | - go generate ./... 
9 | builds: 10 | - id: till_darwin 11 | binary: till 12 | main: main.go 13 | goos: 14 | - darwin 15 | goarch: 16 | - amd64 17 | - arm64 18 | env: 19 | - CGO_ENABLED=1 20 | - CC=o64-clang 21 | - CXX=o64-clang++ 22 | flags: 23 | - -tags 24 | - sqlite 25 | 26 | - id: till_linux 27 | binary: till 28 | main: main.go 29 | env: 30 | - CGO_ENABLED=1 31 | flags: 32 | - -tags 33 | - sqlite 34 | goos: 35 | - linux 36 | goarch: 37 | - amd64 38 | - 386 39 | 40 | - id: till_windows_i686 41 | binary: till 42 | main: main.go 43 | ldflags: 44 | - "-extldflags '-static'" 45 | env: 46 | - CGO_ENABLED=1 47 | - CC=i686-w64-mingw32-gcc 48 | - CXX=i686-w64-mingw32-g++ 49 | flags: 50 | - -tags 51 | - sqlite 52 | goos: 53 | - windows 54 | goarch: 55 | - 386 56 | 57 | - id: till_windows_x64 58 | binary: till 59 | main: main.go 60 | ldflags: 61 | - "-extldflags '-static'" 62 | env: 63 | - CGO_ENABLED=1 64 | - CC=x86_64-w64-mingw32-gcc 65 | - CXX=x86_64-w64-mingw32-g++ 66 | flags: 67 | - -tags 68 | - sqlite 69 | goos: 70 | - windows 71 | goarch: 72 | - amd64 73 | 74 | archives: 75 | - replacements: 76 | darwin: Darwin_MacOS 77 | linux: Linux 78 | windows: Windows 79 | 386: i386 80 | amd64: x86_64 81 | checksum: 82 | name_template: 'checksums.txt' 83 | snapshot: 84 | name_template: "{{ .Tag }}-next" 85 | changelog: 86 | sort: asc 87 | filters: 88 | exclude: 89 | - '^docs:' 90 | - '^test:' 91 | release: 92 | prerelease: auto -------------------------------------------------------------------------------- /examples/ruby/kimurai/example.rb: -------------------------------------------------------------------------------- 1 | # Code example from https://github.com/gitter-badger/kimurai 2 | 3 | require 'kimurai' 4 | 5 | class GithubSpider < Kimurai::Base 6 | @name = "github_spider" 7 | @engine = :mechanize 8 | @start_urls = ["https://github.com/search?q=Ruby%20Web%20Scraping"] 9 | @config = { 10 | 11 | # Integrate with Till 12 | proxy: "localhost:2933:http", 13 | ignore_ssl_errors: true, 14 | 15 | # 
IMPORTANT: Custom headers in Kimurai only works on :mechanize and :poltergeist_phantomjs drivers 16 | # (Selenium don't allow to set/get headers) 17 | headers: { 18 | # Add the header to force a Cache Miss on Till 19 | "X-DH-Cache-Freshness" => "now", 20 | }, 21 | 22 | 23 | } 24 | 25 | def parse(response, url:, data: {}) 26 | response.xpath("//ul[@class='repo-list']/div//h3/a").each do |a| 27 | request_to :parse_repo_page, url: absolute_url(a[:href], base: url) 28 | end 29 | 30 | if next_page = response.at_xpath("//a[@class='next_page']") 31 | request_to :parse, url: absolute_url(next_page[:href], base: url) 32 | end 33 | end 34 | 35 | def parse_repo_page(response, url:, data: {}) 36 | item = {} 37 | 38 | item[:owner] = response.xpath("//h1//a[@rel='author']").text 39 | item[:repo_name] = response.xpath("//h1/strong[@itemprop='name']/a").text 40 | item[:repo_url] = url 41 | item[:description] = response.xpath("//span[@itemprop='about']").text.squish 42 | item[:tags] = response.xpath("//div[@id='topics-list-container']/div/a").map { |a| a.text.squish } 43 | item[:watch_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Watch')]/a[2]").text.squish 44 | item[:star_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Star')]/a[2]").text.squish 45 | item[:fork_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Fork')]/a[2]").text.squish 46 | item[:last_commit] = response.xpath("//span[@itemprop='dateModified']/*").text 47 | 48 | save_to "results.json", item, format: :pretty_json 49 | end 50 | end 51 | 52 | GithubSpider.crawl! -------------------------------------------------------------------------------- /server/templates/index.html: -------------------------------------------------------------------------------- 1 | {{define "content"}} 2 |
3 |
4 |
5 |
6 | Total Requests 7 |
8 |
9 | {{ localizeInt .Requests}} 10 |
11 |
12 | 13 |
14 |
15 | Successful Requests 16 |
17 |
18 | {{localizeInt .SuccessfulRequests}} 19 |
20 |
21 | 22 |
23 |
24 | Failed Requests 25 |
26 |
27 | {{localizeInt .FailedRequests}} 28 |
29 |
30 | 31 |
32 |
33 | Intercepted Requests 34 |
35 |
36 | {{localizeInt .InterceptedRequests}} 37 |
38 |
39 | 40 |
41 |
42 | Cache Hits 43 |
44 |
45 | {{localizeInt .CacheHits}} 46 |
47 |
48 | 49 | 50 |
51 |
52 | Cache Sets 53 |
54 |
55 | {{localizeInt .CacheSets}} 56 |
57 |
58 |
59 |
// main shows how to send requests through Till from a plain net/http client:
// one GET and one POST, each carrying the X-DH-Cache-Freshness header, with
// the full response dumped to stdout. Any failure ends the program.
func main() {

	// setup the proxy connection to Till
	proxyUrl, err := url.Parse("http://localhost:2933")
	if err != nil {
		// This error used to be overwritten by the next assignment to err.
		log.Fatal(err)
	}
	myClient := &http.Client{Transport: &http.Transport{
		Proxy:           http.ProxyURL(proxyUrl),
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	}}

	//
	// Example 1: GET request
	//

	// create a new GET request
	greq, err := http.NewRequest("GET", "https://fetchtest.datahen.com/echo/request", nil)
	if err != nil {
		log.Fatal(err)
	}

	// Add the header to force a Cache Miss on Till
	greq.Header.Add("X-DH-Cache-Freshness", "now")

	// Do the actual request
	gresp, err := myClient.Do(greq)
	if err != nil {
		log.Fatal(err)
	}

	// print out the response
	grout, err := httputil.DumpResponse(gresp, true)
	// The dump has consumed the body; close it so the connection is released.
	gresp.Body.Close()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("-------------------")
	fmt.Println("GET RESPONSE:")
	fmt.Println("-------------------")
	fmt.Println(string(grout))

	//
	// Example 2: Post request
	//

	// create a new POST request
	jsonData := `{"hello":"world"}`

	preq, err := http.NewRequest("POST", "https://postman-echo.com/post", bytes.NewBufferString(jsonData))
	if err != nil {
		log.Fatal(err)
	}

	// Add the header to force a Cache Miss on Till
	preq.Header.Add("X-DH-Cache-Freshness", "now")

	preq.Header.Set("Content-Type", "application/json")
	preq.Header.Set("Accept", "*/*")

	// Do the actual request
	presp, err := myClient.Do(preq)
	if err != nil {
		log.Fatal(err)
	}

	// print out the response
	prout, err := httputil.DumpResponse(presp, true)
	presp.Body.Close()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("-------------------")
	fmt.Println("POST RESPONSE:")
	fmt.Println("-------------------")
	fmt.Println(string(prout))

}
35 | github.com/oklog/ulid/v2 v2.0.2 // indirect 36 | github.com/pelletier/go-toml v1.2.0 // indirect 37 | github.com/spf13/afero v1.4.1 // indirect 38 | github.com/spf13/cast v1.3.1 // indirect 39 | github.com/spf13/jwalterweatherman v1.0.0 // indirect 40 | github.com/spf13/pflag v1.0.5 // indirect 41 | github.com/subosito/gotenv v1.2.0 // indirect 42 | github.com/volatiletech/inflect v0.0.1 // indirect 43 | github.com/volatiletech/randomize v0.0.1 // indirect 44 | github.com/volatiletech/sqlboiler/v4 v4.6.0 // indirect 45 | github.com/volatiletech/strmangle v0.0.1 // indirect 46 | golang.org/x/sys v0.0.0-20210603125802-9665404d3644 // indirect 47 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect 48 | gopkg.in/asaskevich/govalidator.v9 v9.0.0-20180315120708-ccb8e960c48f // indirect 49 | gopkg.in/ini.v1 v1.51.0 // indirect 50 | gopkg.in/yaml.v2 v2.4.0 // indirect 51 | ) 52 | -------------------------------------------------------------------------------- /internal/tillclient/tillclient.go: -------------------------------------------------------------------------------- 1 | package tillclient 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "errors" 7 | "fmt" 8 | "io/ioutil" 9 | "time" 10 | 11 | "github.com/DataHenHQ/license" 12 | "github.com/go-resty/resty/v2" 13 | ) 14 | 15 | var BaseURL string 16 | 17 | type service struct { 18 | client *Client 19 | } 20 | 21 | type Client struct { 22 | *resty.Client 23 | Token string 24 | 25 | common service // Reuse a single struct instead of allocating one for each service on the heap. 
26 | 27 | Instances *InstancesService 28 | InstanceStats *InstanceStatsService 29 | } 30 | 31 | func NewClient(token string) (c *Client, err error) { 32 | 33 | c = &Client{ 34 | Client: resty.New(), 35 | } 36 | c.SetTimeout(1 * time.Minute) 37 | c.SetHostURL(BaseURL) 38 | c.Token = token 39 | 40 | c.OnAfterResponse(verifySignature) 41 | 42 | c.common.client = c 43 | 44 | // assigns the common service 45 | c.Instances = (*InstancesService)(&c.common) 46 | c.InstanceStats = (*InstanceStatsService)(&c.common) 47 | 48 | return c, nil 49 | } 50 | 51 | // a middleware that verifies the signature from the api response, and then replace it with the actual content data 52 | func verifySignature(c *resty.Client, resp *resty.Response) error { 53 | 54 | // if error = 404 then return ErrNotFound 55 | if resp.StatusCode() == 404 { 56 | return ErrNotFound 57 | } 58 | 59 | // if error status code is more than 399, return an error 60 | if resp.StatusCode() > 399 { 61 | return &CustomError{ 62 | StatusCode: resp.StatusCode(), 63 | Err: errors.New(resp.Status()), 64 | } 65 | } 66 | 67 | // verify the response body and extract the data 68 | data, err := license.Verify(resp.Body()) 69 | if err != nil { 70 | return err 71 | } 72 | 73 | // replace the raw response body with the new content data. 
74 | // NOTE: we can't use resp.Body() anymore in downstream, because it still refers to old body 75 | nbody := ioutil.NopCloser(bytes.NewReader(data)) 76 | resp.RawResponse.Body = nbody 77 | 78 | return nil // if its success otherwise return error 79 | } 80 | 81 | func (c *Client) NewRequest(ctx context.Context, urlStr string, body interface{}) (*resty.Request, error) { 82 | if c.Token == "" { 83 | return nil, errors.New("token required") 84 | } 85 | 86 | req := c.R() 87 | req.SetContext(ctx) 88 | 89 | req.SetHeader("Authorization", fmt.Sprintf("Bearer %s", c.Token)) 90 | 91 | return req, nil 92 | } 93 | -------------------------------------------------------------------------------- /proxy/config.go: -------------------------------------------------------------------------------- 1 | package proxy 2 | 3 | import ( 4 | "net/http" 5 | "strconv" 6 | 7 | "github.com/DataHenHQ/tillup/cache" 8 | "github.com/DataHenHQ/tillup/cache/freshness" 9 | "github.com/DataHenHQ/tillup/sessions" 10 | ) 11 | 12 | // PageConfig is where the page configuration is set 13 | type PageConfig struct { 14 | ForceUA bool // if true, overrides the User-Agent header 15 | UaType string // default to "desktop" 16 | UseProxy bool 17 | 18 | // StickySession features 19 | SessionID string 20 | StickyCookies bool 21 | StickyUA bool 22 | 23 | // Interceptions feature 24 | IgnoreInterceptions []string 25 | IgnoreAllInterceptions bool 26 | 27 | // Cache feature 28 | CacheFreshness freshness.Type 29 | CacheServeFailures bool 30 | } 31 | 32 | // UATypeHeader is the custom header that the scraper calls till to set the user agent type 33 | const UATypeHeader = "X-DH-UA-Type" 34 | 35 | func generatePageConfig(req *http.Request) (pconf *PageConfig) { 36 | useProxy := false 37 | if ProxyCount > 0 { 38 | useProxy = true 39 | } 40 | 41 | pconf = &PageConfig{ 42 | ForceUA: ForceUA, 43 | UaType: UAType, 44 | UseProxy: useProxy, 45 | 46 | // StickySessions feature defaults to true for sticky cookies and sticky ua 
47 | StickyCookies: true, 48 | StickyUA: true, 49 | 50 | // Interceptions feature 51 | IgnoreInterceptions: []string{}, 52 | IgnoreAllInterceptions: false, 53 | 54 | // Cache feature 55 | CacheFreshness: CacheConfig.Freshness, 56 | CacheServeFailures: CacheConfig.ServeFailures, 57 | } 58 | 59 | if uatype := req.Header.Get(UATypeHeader); uatype != "" { 60 | pconf.UaType = uatype 61 | req.Header.Del(UATypeHeader) 62 | } 63 | 64 | // Get the session ID header 65 | if sessid := req.Header.Get(sessions.SessionIDHeader); sessid != "" { 66 | pconf.SessionID = sessid 67 | req.Header.Del(sessions.SessionIDHeader) 68 | } 69 | 70 | // Get the Sticky UA header 71 | if val := req.Header.Get(sessions.StickyUAHeader); val != "" { 72 | pconf.StickyUA, _ = strconv.ParseBool(val) 73 | req.Header.Del(sessions.StickyUAHeader) 74 | } 75 | 76 | // Get the Sticky Cookies header 77 | if val := req.Header.Get(sessions.StickyCookiesHeader); val != "" { 78 | pconf.StickyCookies, _ = strconv.ParseBool(val) 79 | req.Header.Del(sessions.StickyCookiesHeader) 80 | } 81 | 82 | // Get the Cache Freshness header 83 | if val := req.Header.Get(cache.FreshnessHeader); val != "" { 84 | pconf.CacheFreshness = freshness.ConvToType(val) 85 | req.Header.Del(cache.FreshnessHeader) 86 | } 87 | 88 | // Get the Cache Serve Failures 89 | if val := req.Header.Get(cache.ServeFailures); val != "" { 90 | pconf.CacheServeFailures, _ = strconv.ParseBool(val) 91 | req.Header.Del(cache.ServeFailures) 92 | } 93 | 94 | return pconf 95 | } 96 | -------------------------------------------------------------------------------- /img/till-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 7 | 8 | 10 | 13 | 18 | 20 | 25 | 28 | 31 | 34 | 44 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /server/assets/img/till-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 7 | 8 | 10 | 13 | 
18 | 20 | 25 | 28 | 31 | 34 | 44 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /cmd/root.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "encoding/hex" 5 | "fmt" 6 | "log" 7 | "os" 8 | "path/filepath" 9 | 10 | "github.com/DataHenHQ/license" 11 | "github.com/DataHenHQ/till/internal/tillclient" 12 | homedir "github.com/mitchellh/go-homedir" 13 | "github.com/spf13/cobra" 14 | "github.com/spf13/viper" 15 | ) 16 | 17 | var cfgFile string 18 | var tillHomeDir string 19 | var BaseURL = "https://till.datahen.com/api/v1" 20 | var PubKey = "ca60c6f94f2ff9f030e4636e66e018fe4f930a16e8915920f390b9bcff9adf9f" 21 | 22 | // rootCmd represents the base command when called without any subcommands 23 | var rootCmd = &cobra.Command{ 24 | Use: "till", 25 | Short: "DataHen Till empowers your existing web scraper to be unblockable, scalable and maintainable without code changes", 26 | Long: `DataHen Till is a standalone tool that runs alongside your web scraper, 27 | and instantly makes your existing web scraper unblockable, scalable, and maintainable, 28 | without requiring any code changes on your scraper code.`, 29 | // Uncomment the following line if your bare application 30 | // has an action associated with it: 31 | // Run: func(cmd *cobra.Command, args []string) { }, 32 | } 33 | 34 | // Execute adds all child commands to the root command and sets flags appropriately. 35 | // This is called by main.main(). It only needs to happen once to the rootCmd. 
36 | func Execute() { 37 | // set the base url 38 | tillclient.BaseURL = BaseURL 39 | 40 | // set the license's public key 41 | decpubkey, err := hex.DecodeString(string(PubKey)) 42 | if err != nil { 43 | log.Fatalln("could not decode public key:", PubKey) 44 | } 45 | license.PublicKey = decpubkey 46 | 47 | if err := rootCmd.Execute(); err != nil { 48 | fmt.Println(err) 49 | os.Exit(1) 50 | } 51 | } 52 | 53 | func init() { 54 | initTillHomeDir() 55 | cobra.OnInitialize(initConfig) 56 | 57 | // Here you will define your flags and configuration settings. 58 | // Cobra supports persistent flags, which, if defined here, 59 | // will be global for your application. 60 | rootCmd.PersistentFlags().StringVar(&cfgFile, "config", "", fmt.Sprintf("config file (default is %v)", filepath.Join(tillHomeDir, "config.yaml"))) 61 | 62 | // Cobra also supports local flags, which will only run 63 | // when this action is called directly. 64 | // rootCmd.Flags().BoolP("toggle", "t", false, "Help message for toggle") 65 | } 66 | 67 | func initTillHomeDir() { 68 | userhome, err := homedir.Dir() 69 | if err != nil { 70 | fmt.Println(err) 71 | os.Exit(1) 72 | } 73 | 74 | // Search config in home directory with name ".till" (without extension). 75 | tillHomeDir = filepath.Join(userhome, ".config", "datahen", "till") 76 | } 77 | 78 | // initConfig reads in config file and ENV variables if set. 79 | func initConfig() { 80 | if cfgFile != "" { 81 | // Use config file from the flag. 82 | viper.SetConfigFile(cfgFile) 83 | } else { 84 | // Find home directory. 85 | viper.AddConfigPath(tillHomeDir) 86 | viper.SetConfigName("config") 87 | } 88 | 89 | viper.AutomaticEnv() // read in environment variables that match 90 | 91 | // If a config file is found, read it in. 
92 | if err := viper.ReadInConfig(); err == nil { 93 | fmt.Println("Using config file:", viper.ConfigFileUsed()) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /proxy/tunneling.go: -------------------------------------------------------------------------------- 1 | package proxy 2 | 3 | import ( 4 | "bufio" 5 | "crypto/tls" 6 | "log" 7 | "net" 8 | "net/http" 9 | 10 | "github.com/DataHenHQ/tillup/features" 11 | "github.com/DataHenHQ/tillup/interceptions" 12 | ) 13 | 14 | func HandleTunneling(sw http.ResponseWriter, sreq *http.Request) error { 15 | 16 | // get the hostname based on the source request's target host 17 | hostname := dnsName(sreq.Host) 18 | if hostname == "" { 19 | log.Println("cannot determine cert name for " + sreq.Host) 20 | http.Error(sw, "cannot determine cert name for "+sreq.Host, 503) 21 | return nil 22 | } 23 | 24 | // Generate provisional cert to be used to respond to the source request, by pretending to be target certificate 25 | provisionalCert, err := GenCert([]string{hostname}) 26 | if err != nil { 27 | log.Println("cert", err) 28 | http.Error(sw, "no upstream", 503) 29 | return nil 30 | } 31 | sConfig := tls.Config{ 32 | MinVersion: tls.VersionTLS12, 33 | //CipherSuites: []uint16{tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA}, 34 | Certificates: []tls.Certificate{*provisionalCert}, 35 | } 36 | 37 | // Does TLS Handshake to the source connection 38 | // tlsHandshakeSource(sconn, provisionalCert) 39 | sconn, err := Handshake(sw, &sConfig) 40 | if err != nil { 41 | return err 42 | } 43 | defer sconn.Close() 44 | 45 | // Reads the source's connection into a new request 46 | reader := bufio.NewReader(sconn) 47 | treq, err := http.ReadRequest(reader) 48 | if err != nil { 49 | log.Println(err) 50 | } 51 | 52 | // Generate the PageConfig 53 | pconf := generatePageConfig(treq) 54 | scheme := "https" 55 | 56 | // create new page from request 57 | p, err := NewPageFromRequest(treq, scheme, pconf) 58 | 
if err != nil { 59 | return err 60 | } 61 | 62 | // If Interception is allowed and it matches 63 | if features.Allow(features.Interceptions) { 64 | if ok, in := interceptions.Matches(treq); ok && in != nil { 65 | resp, err := in.CreateResponse() 66 | if err != nil { 67 | return err 68 | } 69 | 70 | writeToSource(sconn, resp, p) 71 | 72 | // Increment intercepted reqs stats 73 | incrInterceptedRequestStatDelta() 74 | return nil 75 | } 76 | } 77 | 78 | // Send request to target server 79 | tresp, err := sendToTarget(sreq.Context(), sconn, treq, scheme, p, pconf) 80 | if err != nil { 81 | return err 82 | } 83 | defer tresp.Body.Close() 84 | 85 | // Write response back to the source connection 86 | writeToSource(sconn, tresp, p) 87 | return nil 88 | } 89 | 90 | // Handshake hijacks w's underlying net.Conn, responds to the CONNECT request 91 | // and manually performs the TLS handshake. It returns the net.Conn or and 92 | // error if any. 93 | func Handshake(w http.ResponseWriter, config *tls.Config) (net.Conn, error) { 94 | raw, _, err := w.(http.Hijacker).Hijack() 95 | if err != nil { 96 | http.Error(w, "no upstream", 503) 97 | return nil, err 98 | } 99 | if _, err = raw.Write(okHeader); err != nil { 100 | raw.Close() 101 | return nil, err 102 | } 103 | conn := tls.Server(raw, config) 104 | err = conn.Handshake() 105 | if err != nil { 106 | conn.Close() 107 | raw.Close() 108 | return nil, err 109 | } 110 | return conn, nil 111 | } 112 | -------------------------------------------------------------------------------- /examples/ruby/kimurai/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | activesupport (6.1.4) 5 | concurrent-ruby (~> 1.0, >= 1.0.2) 6 | i18n (>= 1.6, < 2) 7 | minitest (>= 5.1) 8 | tzinfo (~> 2.0) 9 | zeitwerk (~> 2.3) 10 | addressable (2.8.0) 11 | public_suffix (>= 2.0.2, < 5.0) 12 | capybara (3.35.3) 13 | addressable 14 | mini_mime (>= 0.1.3) 15 | 
nokogiri (~> 1.8) 16 | rack (>= 1.6.0) 17 | rack-test (>= 0.6.3) 18 | regexp_parser (>= 1.5, < 3.0) 19 | xpath (~> 3.2) 20 | capybara-mechanize (1.11.0) 21 | capybara (>= 2.4.4, < 4) 22 | mechanize (~> 2.7.0) 23 | childprocess (3.0.0) 24 | chronic (0.10.2) 25 | cliver (0.3.2) 26 | coderay (1.1.3) 27 | concurrent-ruby (1.1.9) 28 | connection_pool (2.2.5) 29 | domain_name (0.5.20190701) 30 | unf (>= 0.0.5, < 1.0.0) 31 | headless (2.3.1) 32 | http-cookie (1.0.4) 33 | domain_name (~> 0.5) 34 | i18n (1.8.10) 35 | concurrent-ruby (~> 1.0) 36 | kimurai (1.4.0) 37 | activesupport 38 | capybara (>= 2.15, < 4.0) 39 | capybara-mechanize 40 | cliver 41 | headless 42 | murmurhash3 43 | nokogiri 44 | pmap 45 | poltergeist 46 | pry 47 | rbcat (~> 0.2) 48 | selenium-webdriver 49 | thor 50 | whenever 51 | mechanize (2.7.7) 52 | domain_name (~> 0.5, >= 0.5.1) 53 | http-cookie (~> 1.0) 54 | mime-types (>= 1.17.2) 55 | net-http-digest_auth (~> 1.1, >= 1.1.1) 56 | net-http-persistent (>= 2.5.2) 57 | nokogiri (~> 1.6) 58 | ntlm-http (~> 0.1, >= 0.1.1) 59 | webrick (~> 1.7) 60 | webrobots (>= 0.0.9, < 0.2) 61 | method_source (1.0.0) 62 | mime-types (3.3.1) 63 | mime-types-data (~> 3.2015) 64 | mime-types-data (3.2021.0704) 65 | mini_mime (1.1.0) 66 | mini_portile2 (2.6.1) 67 | minitest (5.14.4) 68 | murmurhash3 (0.1.6) 69 | net-http-digest_auth (1.4.1) 70 | net-http-persistent (4.0.1) 71 | connection_pool (~> 2.2) 72 | nokogiri (1.12.3) 73 | mini_portile2 (~> 2.6.1) 74 | racc (~> 1.4) 75 | ntlm-http (0.1.1) 76 | pmap (1.1.1) 77 | poltergeist (1.18.1) 78 | capybara (>= 2.1, < 4) 79 | cliver (~> 0.3.1) 80 | websocket-driver (>= 0.2.0) 81 | pry (0.14.1) 82 | coderay (~> 1.1) 83 | method_source (~> 1.0) 84 | public_suffix (4.0.6) 85 | racc (1.5.2) 86 | rack (2.2.3) 87 | rack-test (1.1.0) 88 | rack (>= 1.0, < 3) 89 | rbcat (0.2.1) 90 | regexp_parser (2.1.1) 91 | rubyzip (2.3.2) 92 | selenium-webdriver (3.142.7) 93 | childprocess (>= 0.5, < 4.0) 94 | rubyzip (>= 1.2.2) 95 | thor (1.1.0) 96 | 
tzinfo (2.0.4) 97 | concurrent-ruby (~> 1.0) 98 | unf (0.1.4) 99 | unf_ext 100 | unf_ext (0.0.7.7) 101 | webrick (1.7.0) 102 | webrobots (0.1.2) 103 | websocket-driver (0.7.5) 104 | websocket-extensions (>= 0.1.0) 105 | websocket-extensions (0.1.5) 106 | whenever (1.0.0) 107 | chronic (>= 0.6.3) 108 | xpath (3.2.0) 109 | nokogiri (~> 1.8) 110 | zeitwerk (2.4.2) 111 | 112 | PLATFORMS 113 | ruby 114 | 115 | DEPENDENCIES 116 | kimurai (~> 1.4) 117 | 118 | BUNDLED WITH 119 | 2.1.4 120 | -------------------------------------------------------------------------------- /examples/python/scrapy/tutorial/tutorial/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for tutorial project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'tutorial' 11 | 12 | SPIDER_MODULES = ['tutorial.spiders'] 13 | NEWSPIDER_MODULE = 'tutorial.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = 'tutorial (+http://www.yourdomain.com)' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = True 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | #CONCURRENT_REQUESTS = 32 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | #DOWNLOAD_DELAY = 3 29 | # The download delay setting will honor only one of: 30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | #CONCURRENT_REQUESTS_PER_IP = 16 32 | 33 | # Disable cookies (enabled by default) 34 | 
#COOKIES_ENABLED = False 35 | 36 | # Disable Telnet Console (enabled by default) 37 | #TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | #DEFAULT_REQUEST_HEADERS = { 41 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 42 | # 'Accept-Language': 'en', 43 | #} 44 | 45 | # Enable or disable spider middlewares 46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 47 | #SPIDER_MIDDLEWARES = { 48 | # 'tutorial.middlewares.TutorialSpiderMiddleware': 543, 49 | #} 50 | 51 | # Enable or disable downloader middlewares 52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 53 | DOWNLOADER_MIDDLEWARES = { 54 | 'tutorial.middlewares.TillMiddleware': 350, # Add the Till Middleware 55 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400, 56 | } 57 | 58 | # Enable or disable extensions 59 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 60 | #EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | #} 63 | 64 | # Configure item pipelines 65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 66 | #ITEM_PIPELINES = { 67 | # 'tutorial.pipelines.TutorialPipeline': 300, 68 | #} 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 72 | #AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | #AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | #AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | #AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See 
https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | #HTTPCACHE_ENABLED = True 86 | #HTTPCACHE_EXPIRATION_SECS = 0 87 | #HTTPCACHE_DIR = 'httpcache' 88 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | -------------------------------------------------------------------------------- /server/handlers/request_handlers.go: -------------------------------------------------------------------------------- 1 | package handlers 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "strconv" 7 | "time" 8 | 9 | "github.com/DataHenHQ/tillup/logger" 10 | "github.com/gorilla/mux" 11 | ) 12 | 13 | func RequestIndexHandler(w http.ResponseWriter, r *http.Request) { 14 | 15 | var ( 16 | f = logger.Filter{} 17 | err error 18 | perPage = 100 19 | startAfter string 20 | endBefore string 21 | ) 22 | 23 | if q, ok := r.URL.Query()["from_content_length"]; ok && len(q) == 1 { 24 | v, _ := strconv.ParseInt(q[0], 10, 64) 25 | if err == nil { 26 | f.FromResponseContentLength = &v 27 | } 28 | } 29 | if q, ok := r.URL.Query()["to_content_length"]; ok && len(q) == 1 { 30 | v, _ := strconv.ParseInt(q[0], 10, 64) 31 | if err == nil { 32 | f.ToResponseContentLength = &v 33 | } 34 | } 35 | if q, ok := r.URL.Query()["from_time"]; ok && len(q) == 1 { 36 | v, err := time.Parse(time.RFC3339, q[0]) 37 | if err == nil { 38 | f.FromTime = v 39 | } 40 | } 41 | if q, ok := r.URL.Query()["to_time"]; ok && len(q) == 1 { 42 | v, err := time.Parse(time.RFC3339, q[0]) 43 | if err == nil { 44 | f.ToTime = v 45 | } 46 | } 47 | if q, ok := r.URL.Query()["url"]; ok && len(q) == 1 { 48 | f.RequestURL = q[0] 49 | } 50 | if q, ok := r.URL.Query()["code"]; ok && len(q) == 1 { 51 | f.ResponseStatusCode = q[0] 52 | } 53 | if q, ok := r.URL.Query()["gid"]; ok && len(q) == 1 { 54 | f.Gid = q[0] 55 | } 56 | if q, ok := r.URL.Query()["session_id"]; ok && len(q) == 1 { 57 | f.SessionID = q[0] 58 | } 59 | if q, ok := 
r.URL.Query()["cache"]; ok && len(q) == 1 { 60 | val := false 61 | switch q[0] { 62 | case "HIT": 63 | val = true 64 | case "MISS": 65 | val = false 66 | } 67 | 68 | f.CacheHit = &val 69 | } 70 | if q, ok := r.URL.Query()["method"]; ok && len(q) == 1 { 71 | f.RequestMethod = q[0] 72 | } 73 | if q, ok := r.URL.Query()["start_after"]; ok && len(q) == 1 { 74 | startAfter = q[0] 75 | } 76 | if q, ok := r.URL.Query()["end_before"]; ok && len(q) == 1 { 77 | endBefore = q[0] 78 | } 79 | 80 | is, p, err := logger.GetItems(r.Context(), f, perPage, startAfter, endBefore) 81 | if err != nil { 82 | fmt.Println("error on requests", err) 83 | } 84 | 85 | Renderer.Render(w, http.StatusOK, "requests/index", map[string]interface{}{ 86 | "title": "Request Log", 87 | "tab": "requests", 88 | "Items": is, 89 | "Pagination": p, 90 | "CurrentURL": r.URL.RequestURI(), 91 | "Filter": f, 92 | }) 93 | } 94 | 95 | func RequestShowHandler(w http.ResponseWriter, r *http.Request) { 96 | vars := mux.Vars(r) 97 | rid := vars["rid"] 98 | 99 | i, err := logger.GetItem(r.Context(), rid) 100 | if err != nil { 101 | fmt.Println("error on requests", err) 102 | } 103 | 104 | Renderer.Render(w, http.StatusOK, "requests/show", map[string]interface{}{ 105 | "tab": "requests", 106 | "Item": i, 107 | }) 108 | } 109 | 110 | func RequestContentShowHandler(w http.ResponseWriter, r *http.Request) { 111 | vars := mux.Vars(r) 112 | rid := vars["rid"] 113 | 114 | i, err := logger.GetItem(r.Context(), rid) 115 | if err != nil { 116 | fmt.Println("error on requests", err) 117 | } 118 | 119 | // hijack the response writer, to get the raw Connection 120 | rawConn, _, err := w.(http.Hijacker).Hijack() 121 | if err != nil { 122 | http.Error(w, "error writing content", 500) 123 | return 124 | } 125 | defer rawConn.Close() 126 | 127 | // build the HTTP response 128 | resp := logger.BuildHTTPResponse(*i) 129 | 130 | // does a raw write of the response into the connection 131 | resp.Write(rawConn) 132 | 133 | } 134 | 
# Till middleware
class TillMiddleware(object):
    """Downloader middleware that routes every request through Till."""

    def process_request(self, request, spider):
        """Point the request at the Till proxy and ask Till for a fresh copy."""
        till_proxy = "http://localhost:2933"
        freshness = "now"
        # Connect to Till
        request.meta["proxy"] = till_proxy
        # Add the header to force a Cache Miss on Till
        request.headers["X-DH-Cache-Freshness"] = freshness


class TutorialSpiderMiddleware:
    # Scrapy treats any hook that is left undefined as a no-op, so only the
    # hooks below take part in spider processing.

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy; also subscribes to the spider_opened signal.
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened
        )
        return middleware

    def process_spider_input(self, response, spider):
        # Runs for every response on its way into the spider.
        # Returning None lets processing continue; raise to abort.
        return None

    def process_spider_output(self, response, result, spider):
        # Runs on whatever the spider produced for a response.
        # Passes every Request / item through untouched.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Runs when the spider or another middleware's
        # process_spider_input() raises.
        # None means "not handled here"; an iterable of Requests / items
        # could be returned instead.
        return None

    def process_start_requests(self, start_requests, spider):
        # Same idea as process_spider_output(), but for the spider's start
        # requests, which have no response attached.
        # Only requests (never items) may be yielded.
        yield from start_requests

    def spider_opened(self, spider):
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)


class TutorialDownloaderMiddleware:
    # Scrapy treats any hook that is left undefined as a no-op, so only the
    # hooks below take part in downloading.

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy; also subscribes to the spider_opened signal.
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened
        )
        return middleware

    def process_request(self, request, spider):
        # Runs for every request passing through the downloader middleware.
        # Allowed outcomes:
        #   - None: keep processing this request
        #   - a Response object
        #   - a Request object
        #   - raise IgnoreRequest: the process_exception() hooks of the
        #     installed downloader middleware are called
        return None

    def process_response(self, request, response, spider):
        # Runs on the response coming back from the downloader.
        # Allowed outcomes:
        #   - a Response object
        #   - a Request object
        #   - raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Runs when a download handler or another middleware's
        # process_request() raises.
        # Allowed outcomes:
        #   - None: keep processing this exception
        #   - a Response object: stops the process_exception() chain
        #   - a Request object: stops the process_exception() chain
        return None

    def spider_opened(self, spider):
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
102 | 103 | # Must either: 104 | # - return None: continue processing this exception 105 | # - return a Response object: stops process_exception() chain 106 | # - return a Request object: stops process_exception() chain 107 | pass 108 | 109 | def spider_opened(self, spider): 110 | spider.logger.info('Spider opened: %s' % spider.name) 111 | -------------------------------------------------------------------------------- /internal/tillclient/instances.go: -------------------------------------------------------------------------------- 1 | package tillclient 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "io/ioutil" 8 | 9 | "github.com/DataHenHQ/tillup/features" 10 | "github.com/go-resty/resty/v2" 11 | ) 12 | 13 | type InstancesService service 14 | 15 | type Instance struct { 16 | ID *int64 `json:"id,omitempty"` 17 | Name *string `json:"name,omitempty"` 18 | Description *string `json:"description,omitempty"` 19 | Requests *int64 `json:"requests,omitempty"` 20 | InterceptedRequests *int64 `json:"intercepted_requests,omitempty"` 21 | SuccessfulRequests *int64 `json:"successful_requests,omitempty"` 22 | FailedRequests *int64 `json:"failed_requests,omitempty"` 23 | CacheHits *int64 `json:"cache_hits,omitempty"` 24 | CacheSets *int64 `json:"cache_sets,omitempty"` 25 | Features *[]features.Feature `json:"features,omitempty"` 26 | 27 | CreatedAt *Timestamp `json:"created_at,omitempty"` 28 | UpdatedAt *Timestamp `json:"updated_at,omitempty"` 29 | } 30 | 31 | func (s *InstancesService) Get(ctx context.Context, name string) (*Instance, *resty.Response, error) { 32 | u := fmt.Sprintf("instances/%v", name) 33 | req, err := s.client.NewRequest(ctx, u, nil) 34 | 35 | instance := new(Instance) 36 | 37 | resp, err := req.Get(u) 38 | if err != nil { 39 | return nil, resp, err 40 | } 41 | defer resp.RawResponse.Body.Close() 42 | b, err := ioutil.ReadAll(resp.RawResponse.Body) 43 | if err != nil { 44 | return nil, nil, err 45 | } 46 | 47 | json.Unmarshal(b, &instance) 
48 | 49 | return instance, resp, nil 50 | } 51 | 52 | // GetID returns the ID field if it's non-nil, zero value otherwise. 53 | func (i *Instance) GetID() int64 { 54 | if i == nil || i.ID == nil { 55 | return 0 56 | } 57 | return *i.ID 58 | } 59 | 60 | // GetName returns the Name field if it's non-nil, zero value otherwise. 61 | func (i *Instance) GetName() string { 62 | if i == nil || i.Name == nil { 63 | return "" 64 | } 65 | return *i.Name 66 | } 67 | 68 | // GetDescription returns the Description field if it's non-nil, zero value otherwise. 69 | func (i *Instance) GetDescription() string { 70 | if i == nil || i.Description == nil { 71 | return "" 72 | } 73 | return *i.Description 74 | } 75 | 76 | // GetFeatures returns the Features field if it's non-nil, zero value otherwise. 77 | func (i *Instance) GetFeatures() []features.Feature { 78 | if i == nil || i.Features == nil { 79 | return []features.Feature{} 80 | } 81 | return *i.Features 82 | } 83 | 84 | // GetCreatedAt returns the CreatedAt field if it's non-nil, zero value otherwise. 85 | func (i *Instance) GetCreatedAt() Timestamp { 86 | if i == nil || i.CreatedAt == nil { 87 | return Timestamp{} 88 | } 89 | return *i.CreatedAt 90 | } 91 | 92 | // GetRequests returns the Requests field if it's non-nil, zero value otherwise. 93 | func (i *Instance) GetRequests() int64 { 94 | if i == nil || i.Requests == nil { 95 | return 0 96 | } 97 | return *i.Requests 98 | } 99 | 100 | // GetSuccessfulRequests returns the SuccessfulRequests field if it's non-nil, zero value otherwise. 101 | func (i *Instance) GetSuccessfulRequests() int64 { 102 | if i == nil || i.SuccessfulRequests == nil { 103 | return 0 104 | } 105 | return *i.SuccessfulRequests 106 | } 107 | 108 | // GetFailedRequests returns the FailedRequests field if it's non-nil, zero value otherwise. 
109 | func (i *Instance) GetFailedRequests() int64 { 110 | if i == nil || i.FailedRequests == nil { 111 | return 0 112 | } 113 | return *i.FailedRequests 114 | } 115 | 116 | // GetInterceptedRequests returns the InterceptedRequests field if it's non-nil, zero value otherwise. 117 | func (i *Instance) GetInterceptedRequests() int64 { 118 | if i == nil || i.InterceptedRequests == nil { 119 | return 0 120 | } 121 | return *i.InterceptedRequests 122 | } 123 | 124 | // GetCacheHits returns the CacheHits field if it's non-nil, zero value otherwise. 125 | func (i *Instance) GetCacheHits() int64 { 126 | if i == nil || i.CacheHits == nil { 127 | return 0 128 | } 129 | return *i.CacheHits 130 | } 131 | 132 | // GetCacheSets returns the CacheSets field if it's non-nil, zero value otherwise. 133 | func (i *Instance) GetCacheSets() int64 { 134 | if i == nil || i.CacheSets == nil { 135 | return 0 136 | } 137 | return *i.CacheSets 138 | } 139 | 140 | // GetUpdatedAt returns the UpdatedAt field if it's non-nil, zero value otherwise. 
141 | func (i *Instance) GetUpdatedAt() Timestamp { 142 | if i == nil || i.UpdatedAt == nil { 143 | return Timestamp{} 144 | } 145 | return *i.UpdatedAt 146 | } 147 | -------------------------------------------------------------------------------- /internal/tillclient/instance_stats.go: -------------------------------------------------------------------------------- 1 | package tillclient 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "errors" 7 | "fmt" 8 | "io/ioutil" 9 | "sync" 10 | 11 | "github.com/go-resty/resty/v2" 12 | ) 13 | 14 | type InstanceStatsService service 15 | 16 | type InstanceStat struct { 17 | Name *string `json:"name,omitempty"` 18 | Requests *uint64 `json:"requests,omitempty"` 19 | InterceptedRequests *uint64 `json:"intercepted_requests,omitempty"` 20 | SuccessfulRequests *uint64 `json:"successful_requests,omitempty"` 21 | FailedRequests *uint64 `json:"failed_requests,omitempty"` 22 | CacheHits *uint64 `json:"cache_hits,omitempty"` 23 | CacheSets *uint64 `json:"cache_sets,omitempty"` 24 | } 25 | 26 | // InstanceStatMutex is used for struct to for atomic incr and decr of counters 27 | type InstanceStatMutex struct { 28 | InstanceStat 29 | Mutex *sync.Mutex 30 | } 31 | 32 | func (s *InstanceStatsService) Update(ctx context.Context, is InstanceStat) (*Instance, *resty.Response, error) { 33 | if is.GetName() == "" { 34 | return nil, nil, errors.New("instance name is required") 35 | } 36 | 37 | u := fmt.Sprintf("instances/%v/stats", is.GetName()) 38 | req, err := s.client.NewRequest(ctx, u, nil) 39 | 40 | req.SetBody(is) 41 | 42 | instance := new(Instance) 43 | 44 | resp, err := req.Put(u) 45 | if err != nil { 46 | return nil, resp, err 47 | } 48 | defer resp.RawResponse.Body.Close() 49 | b, err := ioutil.ReadAll(resp.RawResponse.Body) 50 | if err != nil { 51 | return nil, nil, err 52 | } 53 | 54 | json.Unmarshal(b, &instance) 55 | 56 | return instance, resp, nil 57 | } 58 | 59 | // GetRequests returns the Requests field if it's non-nil, zero 
value otherwise. 60 | func (is *InstanceStat) GetRequests() uint64 { 61 | if is == nil || is.Requests == nil { 62 | return 0 63 | } 64 | return *is.Requests 65 | } 66 | 67 | // GetInterceptedRequests returns the InterceptedRequests field if it's non-nil, zero value otherwise. 68 | func (is *InstanceStat) GetInterceptedRequests() uint64 { 69 | if is == nil || is.InterceptedRequests == nil { 70 | return 0 71 | } 72 | return *is.InterceptedRequests 73 | } 74 | 75 | // GetFailedRequests returns the FailedRequests field if it's non-nil, zero value otherwise. 76 | func (is *InstanceStat) GetFailedRequests() uint64 { 77 | if is == nil || is.FailedRequests == nil { 78 | return 0 79 | } 80 | return *is.FailedRequests 81 | } 82 | 83 | // GetSuccessfulRequests returns the SuccessfulRequests field if it's non-nil, zero value otherwise. 84 | func (is *InstanceStat) GetSuccessfulRequests() uint64 { 85 | if is == nil || is.SuccessfulRequests == nil { 86 | return 0 87 | } 88 | return *is.SuccessfulRequests 89 | } 90 | 91 | // GetCacheHits returns the CacheHits field if it's non-nil, zero value otherwise. 92 | func (is *InstanceStat) GetCacheHits() uint64 { 93 | if is == nil || is.CacheHits == nil { 94 | return 0 95 | } 96 | return *is.CacheHits 97 | } 98 | 99 | // GetCacheSets returns the CacheSets field if it's non-nil, zero value otherwise. 100 | func (is *InstanceStat) GetCacheSets() uint64 { 101 | if is == nil || is.CacheSets == nil { 102 | return 0 103 | } 104 | return *is.CacheSets 105 | } 106 | 107 | // GetName returns the Name field if it's non-nil, zero value otherwise. 
108 | func (is *InstanceStat) GetName() string { 109 | if is == nil || is.Name == nil { 110 | return "" 111 | } 112 | return *is.Name 113 | } 114 | 115 | // IsZero checks if it is zero value 116 | func (is *InstanceStat) IsZero() bool { 117 | if is.GetRequests() == 0 && 118 | is.GetInterceptedRequests() == 0 && 119 | is.GetSuccessfulRequests() == 0 && 120 | is.GetFailedRequests() == 0 && 121 | is.GetCacheHits() == 0 && 122 | is.GetCacheSets() == 0 { 123 | return true 124 | } 125 | return false 126 | } 127 | 128 | // DeepCopy goes through the fields and copy them, so that all values are copied, and all pointer don't point to the same address 129 | func (is *InstanceStat) DeepCopy() (nis InstanceStat) { 130 | if is.Name != nil { 131 | name := is.GetName() 132 | nis.Name = &name 133 | } 134 | 135 | if is.Requests != nil { 136 | val := is.GetRequests() 137 | nis.Requests = &val 138 | } 139 | 140 | if is.SuccessfulRequests != nil { 141 | val := is.GetSuccessfulRequests() 142 | nis.SuccessfulRequests = &val 143 | } 144 | 145 | if is.FailedRequests != nil { 146 | val := is.GetFailedRequests() 147 | nis.FailedRequests = &val 148 | } 149 | 150 | if is.InterceptedRequests != nil { 151 | val := is.GetInterceptedRequests() 152 | nis.InterceptedRequests = &val 153 | } 154 | 155 | if is.CacheHits != nil { 156 | val := is.GetCacheHits() 157 | nis.CacheHits = &val 158 | } 159 | 160 | if is.CacheSets != nil { 161 | val := is.GetCacheSets() 162 | nis.CacheSets = &val 163 | } 164 | 165 | return nis 166 | } 167 | -------------------------------------------------------------------------------- /server/server.go: -------------------------------------------------------------------------------- 1 | package server 2 | 3 | import ( 4 | "context" 5 | "embed" 6 | "errors" 7 | "fmt" 8 | "log" 9 | "os" 10 | "os/signal" 11 | "syscall" 12 | "time" 13 | 14 | "github.com/DataHenHQ/till/internal/tillclient" 15 | "github.com/DataHenHQ/till/proxy" 16 | "github.com/DataHenHQ/till/server/handlers" 17 | 
"github.com/DataHenHQ/tillup/cache" 18 | "github.com/DataHenHQ/tillup/interceptions" 19 | "github.com/DataHenHQ/tillup/logger" 20 | "github.com/DataHenHQ/tillup/sessions" 21 | 22 | "github.com/DataHenHQ/tillup" 23 | ) 24 | 25 | var ( 26 | Token string 27 | InstanceName string 28 | StatMu tillclient.InstanceStatMutex 29 | ProxyURLs = []string{} 30 | ProxyCount = 0 31 | DBPath string 32 | Interceptions []interceptions.Interception 33 | CacheConfig cache.Config 34 | LoggerConfig logger.Config 35 | SessionsConfig sessions.Config 36 | 37 | // current instance from the server 38 | curri tillclient.Instance 39 | 40 | // content holds our static web server content. 41 | //go:embed templates/* 42 | embeddedTemplates embed.FS 43 | 44 | // content holds our assets content 45 | //go:embed public/build/* 46 | embeddedAssets embed.FS 47 | ) 48 | 49 | func validateInstance() (ok bool, i *tillclient.Instance) { 50 | if Token == "" { 51 | fmt.Println("You need to specify the Till auth token. To get your auth token, sign up for free at https://till.datahen.com") 52 | return false, nil 53 | } 54 | 55 | // init the client 56 | client, err := tillclient.NewClient(Token) 57 | if err != nil { 58 | log.Fatal(err) 59 | } 60 | 61 | i, _, err = client.Instances.Get(context.Background(), InstanceName) 62 | if err != nil { 63 | if errors.Is(err, tillclient.ErrNotFound) { 64 | log.Fatalf("Instance with the name '%v' is not found. 
Please create the instance at https://till.datahen.com/instances\n", InstanceName) 65 | } else { 66 | log.Fatal(err) 67 | } 68 | log.Fatal(err) 69 | } 70 | 71 | // set the current instance global var 72 | curri = *i 73 | 74 | // Set the features, etc for this instance 75 | if err := tillup.Init(i.GetFeatures(), ProxyURLs, DBPath, Interceptions, SessionsConfig, CacheConfig, LoggerConfig); err != nil { 76 | log.Fatal(err) 77 | } 78 | 79 | return true, i 80 | } 81 | 82 | // Serve runs the Till server to start accepting the proxy requests 83 | func Serve(port string, uiport string) { 84 | defer tillup.Close() 85 | 86 | // Pass necessary vars to the handlers 87 | handlers.SetEmbeddedTemplates(&embeddedTemplates) 88 | handlers.InstanceName = InstanceName 89 | handlers.CurrentInstance = &curri 90 | handlers.StatMu = &StatMu 91 | handlers.LoggerConfig = LoggerConfig 92 | 93 | // Wait for os signal to gracefully shutdown the server 94 | // 95 | quit := make(chan os.Signal) 96 | signal.Notify(quit, os.Interrupt, os.Kill, syscall.SIGTERM, syscall.SIGINT) 97 | 98 | // Validates this instance with the cloud 99 | ok, i := validateInstance() 100 | if !ok { 101 | return 102 | } 103 | 104 | // Start recurning stats update to cloud 105 | // 106 | StatMu = newInstanceStatMutex() 107 | proxy.StatMu = &StatMu 108 | go startRecurringStatUpdate() 109 | 110 | // Starts the Proxy server 111 | // 112 | prox, err := NewProxyServer(port, i) 113 | if err != nil { 114 | log.Fatal("Unable to start Till Proxy Server") 115 | } 116 | go prox.ListenAndServe() 117 | 118 | // Starts the UI server 119 | // 120 | ui, err := NewUIServer(uiport, i) 121 | if err != nil { 122 | log.Fatal("Unable to start Till UI Server") 123 | } 124 | go ui.ListenAndServe() 125 | 126 | // waits for quit signal from OS 127 | <-quit 128 | 129 | // create context for graceful shutdown with timeout 130 | ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 131 | defer cancel() 132 | 133 | // Shutdown ui 
server 134 | if err := ui.server.Shutdown(ctx); err != nil { 135 | log.Println("error shutting down DataHen TIll UI server:", err) 136 | } 137 | 138 | // Shuts down proxy server 139 | if err := prox.server.Shutdown(ctx); err != nil { 140 | log.Println("error shutting down DataHen TIll server:", err) 141 | } 142 | } 143 | 144 | // Resets the instant stats delta based on what was uploaded 145 | func resetInstanceStatDelta(is tillclient.InstanceStat) { 146 | // Lock the mutex first, to prevent edits by other concurent processes 147 | StatMu.Mutex.Lock() 148 | 149 | // resets the delta by decreasing it by the uploaded stat 150 | // 151 | *(StatMu.InstanceStat.Requests) = *(StatMu.InstanceStat.Requests) - is.GetRequests() 152 | *(StatMu.InstanceStat.SuccessfulRequests) = *(StatMu.InstanceStat.SuccessfulRequests) - is.GetSuccessfulRequests() 153 | *(StatMu.InstanceStat.FailedRequests) = *(StatMu.InstanceStat.FailedRequests) - is.GetFailedRequests() 154 | *(StatMu.InstanceStat.InterceptedRequests) = *(StatMu.InstanceStat.InterceptedRequests) - is.GetInterceptedRequests() 155 | *(StatMu.InstanceStat.CacheHits) = *(StatMu.InstanceStat.CacheHits) - is.GetCacheHits() 156 | *(StatMu.InstanceStat.CacheSets) = *(StatMu.InstanceStat.CacheSets) - is.GetCacheSets() 157 | 158 | // Unlock the mutex 159 | StatMu.Mutex.Unlock() 160 | } 161 | 162 | func newZeroStat() *uint64 { 163 | i := uint64(0) 164 | return &i 165 | } 166 | -------------------------------------------------------------------------------- /proxy/cert.go: -------------------------------------------------------------------------------- 1 | // cert functions inspired by https://github.com/kr/mitm/cert 2 | 3 | package proxy 4 | 5 | import ( 6 | "crypto/ecdsa" 7 | "crypto/elliptic" 8 | "crypto/rand" 9 | "crypto/tls" 10 | "crypto/x509" 11 | "crypto/x509/pkix" 12 | "encoding/pem" 13 | "errors" 14 | "fmt" 15 | "log" 16 | "math/big" 17 | "os" 18 | "time" 19 | ) 20 | 21 | const ( 22 | caMaxAge = 5 * 365 * 24 * time.Hour 23 | 
leafMaxAge = 24 * time.Hour 24 | caUsage = x509.KeyUsageDigitalSignature | 25 | x509.KeyUsageContentCommitment | 26 | x509.KeyUsageKeyEncipherment | 27 | x509.KeyUsageDataEncipherment | 28 | x509.KeyUsageKeyAgreement | 29 | x509.KeyUsageCertSign | 30 | x509.KeyUsageCRLSign 31 | leafUsage = caUsage 32 | ) 33 | 34 | // GenCert generates cert 35 | func GenCert(names []string) (*tls.Certificate, error) { 36 | now := time.Now().Add(-1 * time.Hour).UTC() 37 | if !ca.Leaf.IsCA { 38 | return nil, errors.New("CA cert is not a CA") 39 | } 40 | serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) 41 | serialNumber, err := rand.Int(rand.Reader, serialNumberLimit) 42 | if err != nil { 43 | return nil, fmt.Errorf("failed to generate serial number: %s", err) 44 | } 45 | tmpl := &x509.Certificate{ 46 | SerialNumber: serialNumber, 47 | Subject: pkix.Name{CommonName: names[0]}, 48 | NotBefore: now, 49 | NotAfter: now.Add(leafMaxAge), 50 | KeyUsage: leafUsage, 51 | BasicConstraintsValid: true, 52 | DNSNames: names, 53 | SignatureAlgorithm: x509.ECDSAWithSHA512, 54 | } 55 | key, err := genKeyPair() 56 | if err != nil { 57 | return nil, err 58 | } 59 | x, err := x509.CreateCertificate(rand.Reader, tmpl, ca.Leaf, key.Public(), ca.PrivateKey) 60 | if err != nil { 61 | return nil, err 62 | } 63 | cert := new(tls.Certificate) 64 | cert.Certificate = append(cert.Certificate, x) 65 | cert.PrivateKey = key 66 | cert.Leaf, _ = x509.ParseCertificate(x) 67 | return cert, nil 68 | } 69 | 70 | func genKeyPair() (*ecdsa.PrivateKey, error) { 71 | return ecdsa.GenerateKey(elliptic.P256(), rand.Reader) 72 | } 73 | 74 | func GenCA(name string) (certPEM, keyPEM []byte, err error) { 75 | now := time.Now().UTC() 76 | tmpl := &x509.Certificate{ 77 | SerialNumber: big.NewInt(1), 78 | Subject: pkix.Name{CommonName: name}, 79 | NotBefore: now, 80 | NotAfter: now.Add(caMaxAge), 81 | KeyUsage: caUsage, 82 | BasicConstraintsValid: true, 83 | IsCA: true, 84 | MaxPathLen: 2, 85 | SignatureAlgorithm: 
x509.ECDSAWithSHA512, 86 | } 87 | key, err := genKeyPair() 88 | if err != nil { 89 | return 90 | } 91 | certDER, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, key.Public(), key) 92 | if err != nil { 93 | return 94 | } 95 | keyDER, err := x509.MarshalECPrivateKey(key) 96 | if err != nil { 97 | return 98 | } 99 | certPEM = pem.EncodeToMemory(&pem.Block{ 100 | Type: "CERTIFICATE", 101 | Bytes: certDER, 102 | }) 103 | keyPEM = pem.EncodeToMemory(&pem.Block{ 104 | Type: "ECDSA PRIVATE KEY", 105 | Bytes: keyDER, 106 | }) 107 | 108 | return 109 | } 110 | 111 | // LoadOrGenCAFiles loads CA from file, or generates it into a file and use it 112 | func LoadOrGenCAFiles(caCertFile, caKeyFile string) (err error) { 113 | var ( 114 | caCertExists bool 115 | caKeyExists bool 116 | ) 117 | 118 | // check existense of the cert and key files 119 | if _, err := os.Stat(caCertFile); err == nil { 120 | caCertExists = true 121 | } 122 | if _, err := os.Stat(caKeyFile); err == nil { 123 | caKeyExists = true 124 | } 125 | 126 | // if both files exist, load from file 127 | if caCertExists && caKeyExists { 128 | err = loadCAVarFromFile(caCertFile, caKeyFile) 129 | if err != nil { 130 | return err 131 | } 132 | // loading certs message 133 | fmt.Println("Using the following Certificate Authority(CA) certificates:") 134 | fmt.Println("-", caCertFile) 135 | fmt.Println("-", caKeyFile) 136 | 137 | return nil 138 | } 139 | 140 | // if both does not exist, generate the key pair 141 | if !caCertExists && !caKeyExists { 142 | err = genCAToFile(caCertFile, caKeyFile) 143 | if err != nil { 144 | return err 145 | } 146 | err = loadCAVarFromFile(caCertFile, caKeyFile) 147 | if err != nil { 148 | 149 | } 150 | // generated certs message 151 | fmt.Println("Generated a new Certificate Authority(CA) certificates:") 152 | fmt.Println("-", caCertFile) 153 | fmt.Println("-", caKeyFile) 154 | return nil 155 | } 156 | 157 | // if one exist and not the other, then raise error 158 | if !caCertExists { 159 
| log.Fatalln("ca-cert does not exist") 160 | } 161 | if !caKeyExists { 162 | log.Fatalln("ca-key does not exist") 163 | } 164 | 165 | return nil 166 | } 167 | 168 | // loadCAVarFromFile loads the keypair from file 169 | func loadCAVarFromFile(caCertFile, caKeyFile string) (err error) { 170 | 171 | ca, err = tls.LoadX509KeyPair(caCertFile, caKeyFile) 172 | if err != nil { 173 | return err 174 | } 175 | 176 | ca.Leaf, err = x509.ParseCertificate(ca.Certificate[0]) 177 | if err != nil { 178 | return err 179 | } 180 | 181 | return nil 182 | } 183 | 184 | // GenCAToFile generates the CA, and save it to files, and then use the ca 185 | func genCAToFile(caCertFile string, caKeyFile string) (err error) { 186 | var hostname, _ = os.Hostname() 187 | 188 | certPEM, keyPEM, err := GenCA(hostname) 189 | if err != nil { 190 | log.Fatalln("Unable to generate CA", err) 191 | } 192 | 193 | if err := writeFullFilePath(caCertFile, certPEM, 0644); err != nil { 194 | log.Fatalln("Unable to write ca cert file to ", caCertFile) 195 | } 196 | 197 | if err := writeFullFilePath(caKeyFile, keyPEM, 0644); err != nil { 198 | log.Fatalln("Unable to write ca cert file to ", caCertFile) 199 | } 200 | 201 | return nil 202 | } 203 | -------------------------------------------------------------------------------- /server/templates/layouts/master.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | {{ .title }} 10 | {{template "head" .}} 11 | {{define "head"}}{{end}} 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
21 | 97 | 98 |
99 | {{ if .title}} 100 |
101 |
102 |

103 | {{.title}} 104 |

105 |
106 |
107 | {{end}} 108 |
109 |
110 | 111 | 112 | {{template "content" .}} 113 | {{define "content"}}{{end}} 114 | 115 |
116 |
117 | {{template "footer"}} 118 |
119 |
120 | 121 | 122 | 123 | {{define "footer"}} 124 | 125 | 132 | 133 | {{end}} -------------------------------------------------------------------------------- /server/handlers/handlers.go: -------------------------------------------------------------------------------- 1 | package handlers 2 | 3 | import ( 4 | "embed" 5 | "encoding/json" 6 | "fmt" 7 | "html/template" 8 | "net/http" 9 | "net/url" 10 | "path" 11 | "path/filepath" 12 | "regexp" 13 | "strings" 14 | "time" 15 | 16 | "github.com/DataHenHQ/till/internal/tillclient" 17 | "github.com/DataHenHQ/tillup/logger" 18 | "github.com/DataHenHQ/tillup/sessions" 19 | "github.com/foolin/goview" 20 | "github.com/volatiletech/null/v8" 21 | "golang.org/x/text/language" 22 | "golang.org/x/text/message" 23 | ) 24 | 25 | var ( 26 | Renderer *goview.ViewEngine 27 | 28 | CurrentInstance *tillclient.Instance 29 | 30 | StatMu *tillclient.InstanceStatMutex 31 | 32 | InstanceName string 33 | 34 | LoggerConfig logger.Config 35 | 36 | lp = message.NewPrinter(language.English) 37 | ) 38 | 39 | func SetEmbeddedTemplates(e *embed.FS) { 40 | GIDShaRe := regexp.MustCompile(`-([a-zA-Z0-9]+)$`) 41 | 42 | templateFunc := template.FuncMap{ 43 | "LoggerConfig": func() logger.Config { 44 | return LoggerConfig 45 | }, 46 | "InstanceName": func() string { 47 | return InstanceName 48 | }, 49 | "jsonToHeader": func(nb null.Bytes) (h http.Header) { 50 | 51 | if nb.IsZero() { 52 | return nil 53 | } 54 | 55 | if err := json.Unmarshal([]byte(nb.Bytes), &h); err != nil { 56 | return nil 57 | } 58 | 59 | return h 60 | }, 61 | "jsonToPageConfig": func(nb null.Bytes) (pconf sessions.PageConfig) { 62 | 63 | if nb.IsZero() { 64 | return pconf 65 | } 66 | 67 | if err := json.Unmarshal([]byte(nb.Bytes), &pconf); err != nil { 68 | return pconf 69 | } 70 | 71 | return pconf 72 | }, 73 | "jsonToSession": func(nb null.Bytes) (sess sessions.Session) { 74 | 75 | if nb.IsZero() { 76 | return sess 77 | } 78 | 79 | if err := json.Unmarshal([]byte(nb.Bytes), &sess); err 
!= nil { 80 | return sess 81 | } 82 | 83 | return sess 84 | }, 85 | "jsonToSlice": func(nb null.Bytes) (ss []string) { 86 | 87 | if nb.IsZero() { 88 | return nil 89 | } 90 | 91 | if err := json.Unmarshal([]byte(nb.Bytes), &ss); err != nil { 92 | return nil 93 | } 94 | 95 | return ss 96 | }, 97 | "nullGt": func(i null.Int64, exp int) bool { 98 | if i.IsZero() { 99 | return false 100 | } 101 | return i.Int64 > int64(exp) 102 | }, 103 | "intToTime": func(i int64) time.Time { 104 | return time.Unix(0, i) 105 | }, 106 | "intToBytes": func(b int64) string { 107 | const unit = 1000 108 | if b < unit { 109 | return fmt.Sprintf("%d B", b) 110 | } 111 | div, exp := int64(unit), 0 112 | for n := b / unit; n >= unit; n /= unit { 113 | div *= unit 114 | exp++ 115 | } 116 | return fmt.Sprintf("%.2f %cB", 117 | float64(b)/float64(div), "kMGTPE"[exp]) 118 | }, 119 | "ifThenElse": func(navinf interface{}, currentNav string, truecond string, falsecond string) string { 120 | nav, _ := navinf.(string) 121 | if nav == currentNav { 122 | return truecond 123 | } 124 | return falsecond 125 | }, 126 | "localizeInt": func(i int64) string { 127 | return lp.Sprintf("%d\n", i) 128 | }, 129 | "boolval": func(b *bool) bool { 130 | return *b 131 | }, 132 | "unescape": func(s string) template.HTML { 133 | return template.HTML(s) 134 | }, 135 | "shortGID": func(gid string) string { 136 | sha := GIDShaRe.FindStringSubmatch(gid) 137 | if len(sha) < 1 { 138 | return "" 139 | } 140 | 141 | return sha[1][0:5] 142 | }, 143 | "isReqSuccess": func(code int64) (success bool) { 144 | switch { 145 | case code >= 200 && code <= 299: 146 | return true 147 | case code >= 300 && code <= 399: 148 | return true 149 | case code == 404: 150 | return true 151 | default: 152 | return false 153 | } 154 | }, 155 | // converts url into relative path 156 | "relativepath": func(urlstr string) string { 157 | u, err := url.Parse(urlstr) 158 | if err != nil { 159 | return "" 160 | } 161 | u.Scheme = "" 162 | u.Host = "" 163 | 
u.User = nil 164 | return u.String() 165 | }, 166 | // get the basepath of a url, basically the last item on the url 167 | "basepath": func(urlstr string) string { 168 | u, err := url.Parse(urlstr) 169 | if err != nil { 170 | return "" 171 | } 172 | 173 | return path.Base(u.Path) 174 | }, 175 | // get the part of the relative url after its last "/" (normally the last path segment plus the query string), or "/" when that part is empty 176 | "basepathPlusQ": func(urlstr string) string { 177 | u, err := url.Parse(urlstr) 178 | if err != nil { 179 | return "" 180 | } 181 | u.Scheme = "" 182 | u.Host = "" 183 | u.User = nil 184 | rp := u.String() 185 | ss := strings.Split(rp, "/") 186 | out := ss[len(ss)-1] 187 | if len(out) < 1 { 188 | out = "/" 189 | } 190 | return out 191 | }, 192 | "basepathPlusQOrHost": func(urlstr string) string { 193 | u, err := url.Parse(urlstr) 194 | if err != nil { 195 | return "" 196 | } 197 | u.Scheme = "" 198 | host := u.Host 199 | u.Host = "" 200 | u.User = nil 201 | rp := u.String() 202 | ss := strings.Split(rp, "/") 203 | out := ss[len(ss)-1] 204 | if len(out) < 1 { 205 | out = host 206 | } 207 | return out 208 | }, 209 | // get the hostname, without the port 210 | "hostname": func(urlstr string) string { 211 | u, err := url.Parse(urlstr) 212 | if err != nil { 213 | return "" 214 | } 215 | 216 | return u.Hostname() 217 | }, 218 | "appendQueryString": func(currurl string, keyvals ...interface{}) string { 219 | u, err := url.Parse(currurl) 220 | if err != nil { 221 | return "" 222 | } 223 | u.Scheme = "" 224 | u.Host = "" 225 | u.User = nil 226 | 227 | // TODO: Need to make this into key pairs, and set or delete based on whether the value is blank or not 228 | // get the queries 229 | qs := u.Query() 230 | 231 | kvs := map[string]string{} 232 | 233 | // create a map from keyvals 234 | cur := "" 235 | for i, v := range keyvals { 236 | // cast the types into the correct one 237 | var s string 238 | switch v.(type) { 239 | case string: 240 | s, _ = v.(string) 241 | case int, int64: 242 | s = fmt.Sprintf("%d", v) 243 | } 244 | 245 | // set it as the key or value based 
on mod 2 246 | if i%2 == 0 { 247 | cur = s 248 | kvs[cur] = "" 249 | } else { 250 | kvs[cur] = s 251 | } 252 | } 253 | 254 | // delete the key if the value is empty, otherwise set it 255 | for k, v := range kvs { 256 | if v == "" { 257 | qs.Del(k) 258 | } else { 259 | qs.Set(k, v) 260 | } 261 | } 262 | 263 | // Assign it back to the url 264 | u.RawQuery = qs.Encode() 265 | 266 | return u.String() 267 | }, 268 | } 269 | 270 | gvConfig := goview.Config{ 271 | Root: "templates", 272 | Extension: ".html", 273 | Master: "layouts/master", 274 | DisableCache: false, 275 | Funcs: templateFunc, 276 | } 277 | 278 | Renderer = goview.New(gvConfig) 279 | 280 | // set the filehandler for goview to use embedded FS 281 | Renderer.SetFileHandler(func(config goview.Config, tmpl string) (string, error) { 282 | path := filepath.Join(config.Root, tmpl) 283 | bytes, err := e.ReadFile(path + config.Extension) 284 | return string(bytes), err 285 | }) 286 | 287 | } 288 | 289 | func IndexHandler(w http.ResponseWriter, r *http.Request) { 290 | Renderer.Render(w, http.StatusOK, "index", map[string]interface{}{ 291 | "title": "Home", 292 | "tab": "home", 293 | "Requests": CurrentInstance.GetRequests() + int64(*StatMu.Requests), 294 | "FailedRequests": CurrentInstance.GetFailedRequests() + int64(*StatMu.FailedRequests), 295 | "SuccessfulRequests": CurrentInstance.GetSuccessfulRequests() + int64(*StatMu.SuccessfulRequests), 296 | "InterceptedRequests": CurrentInstance.GetInterceptedRequests() + int64(*StatMu.InterceptedRequests), 297 | "CacheHits": CurrentInstance.GetCacheHits() + int64(*StatMu.CacheHits), 298 | "CacheSets": CurrentInstance.GetCacheSets() + int64(*StatMu.CacheSets), 299 | }) 300 | 301 | } 302 | -------------------------------------------------------------------------------- /cmd/serve.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "path/filepath" 8 | 9 | 
"github.com/DataHenHQ/till/proxy" 10 | "github.com/DataHenHQ/till/server" 11 | "github.com/DataHenHQ/tillup/cache" 12 | "github.com/DataHenHQ/tillup/interceptions" 13 | "github.com/DataHenHQ/tillup/logger" 14 | "github.com/DataHenHQ/tillup/sessions" 15 | "github.com/DataHenHQ/useragent" 16 | homedir "github.com/mitchellh/go-homedir" 17 | "github.com/spf13/cobra" 18 | "github.com/spf13/viper" 19 | ) 20 | 21 | // serveCmd represents the serve command 22 | var serveCmd = &cobra.Command{ 23 | Use: "serve", 24 | Short: "Starts the DataHen Till server", 25 | Long: `Starts the DataHen Till server in order to listen to and receive HTTP requests and proxy them.`, 26 | Run: func(cmd *cobra.Command, args []string) { 27 | 28 | port := viper.GetString("port") 29 | uiport := viper.GetString("uiport") 30 | proxy.ReleaseVersion = ReleaseVersion 31 | // Load or generate a new CA cert files 32 | caCertFile := viper.GetString("ca-cert") 33 | caKeyFile := viper.GetString("ca-key") 34 | setCaFileDefaults(&caCertFile, &caKeyFile) 35 | proxy.LoadOrGenCAFiles(caCertFile, caKeyFile) 36 | 37 | // Set UserAgent related settings 38 | proxy.ForceUA = viper.GetBool("force-user-agent") 39 | proxy.UAType = viper.GetString("ua-type") 40 | if proxy.ForceUA { 41 | fmt.Printf("Till is currently configured to override all User-Agent headers with random %v browsers\n", proxy.UAType) 42 | } 43 | uaConfigFile := viper.GetString("ua-config-file") 44 | if uaConfigFile != "" { 45 | err := useragent.LoadUAConfig(uaConfigFile) 46 | if err != nil { 47 | fmt.Println("Problem loading the ua-config-file:", err) 48 | fmt.Println("aborting server") 49 | return 50 | } 51 | fmt.Printf("Using ua-config-file to generate user-agent: %v\n", uaConfigFile) 52 | } 53 | 54 | // set the Token 55 | token := viper.GetString("token") 56 | if token == "" { 57 | fmt.Println("You need to specify the Till auth token. 
To get your token, sign up for free at https://till.datahen.com") 58 | fmt.Println("aborting server") 59 | return 60 | } 61 | server.Token = token 62 | proxy.Token = token 63 | 64 | // set the instance name 65 | instance := viper.GetString("instance") 66 | if instance == "" { 67 | fmt.Println("You need to specify the name of this Till instance.") 68 | fmt.Println("aborting server") 69 | return 70 | } 71 | server.InstanceName = instance 72 | proxy.InstanceName = instance 73 | 74 | // set the proxy-file 75 | proxyFile := viper.GetString("proxy-file") 76 | if proxyFile != "" { 77 | count, urls, err := proxy.LoadProxyFile(proxyFile) 78 | if err != nil { 79 | fmt.Println("Problem loading the proxy-file:", err) 80 | fmt.Println("aborting server") 81 | return 82 | } 83 | if count == 0 { 84 | fmt.Println("The supplied proxy-file does not contain any proxies. Please supply a correct proxy-file") 85 | fmt.Println("aborting server") 86 | return 87 | } 88 | 89 | // set the proxy urls and counts 90 | server.ProxyURLs = urls 91 | proxy.ProxyURLs = urls 92 | server.ProxyCount = count 93 | proxy.ProxyCount = count 94 | 95 | fmt.Printf("Using proxy-file to randomize through %d proxies: %v\n", count, proxyFile) 96 | } else { 97 | fmt.Println("Warning! No proxy-file supplied. 
You will be exposing your own IP address if you don't use Till with a proxy", proxyFile) 98 | } 99 | 100 | // sets the DB path 101 | datadir := viper.GetString("datadir") 102 | if datadir == "" { 103 | datadir = filepath.Join(tillHomeDir, fmt.Sprintf("%v.data", instance)) 104 | } 105 | server.DBPath = datadir 106 | 107 | // sets the interceptions 108 | var rs []interceptions.Interception 109 | viper.UnmarshalKey("interceptions", &rs) 110 | 111 | // NOTE(review): re-reads the same "interceptions" key when the first read yields nothing; the fallback was presumably meant for a different (legacy) key name - confirm 112 | if len(rs) == 0 { 113 | viper.UnmarshalKey("interceptions", &rs) 114 | } 115 | 116 | if rs != nil { 117 | // validates the interceptions 118 | if ok, errs := interceptions.ValidateAll(rs); !ok || len(errs) > 0 { 119 | log.Fatal("Your config file has invalid interceptions:", errs) 120 | } 121 | 122 | server.Interceptions = rs 123 | } 124 | 125 | // Sets sessions related configurations 126 | var sessionsconf sessions.Config 127 | viper.UnmarshalKey("sessions", &sessionsconf) 128 | if _, err := sessionsconf.Validate(); err != nil { 129 | log.Fatal("Your config file has invalid sessions settings:", err) 130 | } 131 | sessionsconf.SetDefaults() 132 | server.SessionsConfig = sessionsconf 133 | proxy.SessionsConfig = sessionsconf 134 | 135 | // Sets cache related configurations 136 | var cacheconf cache.Config 137 | viper.UnmarshalKey("cache", &cacheconf) 138 | if _, err := cacheconf.Validate(); err != nil { 139 | log.Fatal("Your config file has invalid cache settings:", err) 140 | } 141 | cacheconf.SetDefaults() 142 | server.CacheConfig = cacheconf 143 | proxy.CacheConfig = cacheconf 144 | 145 | // Sets logger related configurations 146 | var loggerconf logger.Config 147 | viper.UnmarshalKey("logger", &loggerconf) 148 | if _, err := loggerconf.Validate(); err != nil { 149 | log.Fatal("Your config file has invalid logger settings:", err) 150 | } 151 | loggerconf.SetDefaults() 152 | proxy.LoggerConfig = loggerconf 153 | server.LoggerConfig = loggerconf 154 | 155 | // start the 
server 156 | server.Serve(port, uiport) 157 | }, 158 | } 159 | 160 | func init() { 161 | rootCmd.AddCommand(serveCmd) 162 | 163 | serveCmd.Flags().StringP("token", "t", "", "Specify the Till auth token. To get your token, sign up for free at https://www.datahen.com/till") 164 | if err := viper.BindPFlag("token", serveCmd.Flags().Lookup("token")); err != nil { 165 | log.Fatal("Unable to bind flag:", err) 166 | } 167 | 168 | serveCmd.Flags().StringP("instance", "i", "default", "Specify the name of the Till instance.") 169 | if err := viper.BindPFlag("instance", serveCmd.Flags().Lookup("instance")); err != nil { 170 | log.Fatal("Unable to bind flag:", err) 171 | } 172 | 173 | serveCmd.Flags().StringP("port", "p", "2933", "Specify the port to run") 174 | if err := viper.BindPFlag("port", serveCmd.Flags().Lookup("port")); err != nil { 175 | log.Fatal("Unable to bind flag:", err) 176 | } 177 | 178 | serveCmd.Flags().String("uiport", "2980", "Specify the port to run the UI server") 179 | if err := viper.BindPFlag("uiport", serveCmd.Flags().Lookup("uiport")); err != nil { 180 | log.Fatal("Unable to bind flag:", err) 181 | } 182 | 183 | serveCmd.Flags().String("ca-cert", "", fmt.Sprintf("Specify the CA certificate file (default is %v)", filepath.Join(tillHomeDir, "till-ca-cert.pem"))) 184 | if err := viper.BindPFlag("ca-cert", serveCmd.Flags().Lookup("ca-cert")); err != nil { 185 | log.Fatal("Unable to bind flag:", err) 186 | } 187 | 188 | serveCmd.Flags().String("ca-key", "", fmt.Sprintf("Specify the CA certificate file (default is %v)", filepath.Join(tillHomeDir, "till-ca-key.pem"))) 189 | if err := viper.BindPFlag("ca-key", serveCmd.Flags().Lookup("ca-key")); err != nil { 190 | log.Fatal("Unable to bind flag:", err) 191 | } 192 | 193 | serveCmd.Flags().Bool("force-user-agent", true, "When set to true, will override any user-agent header with a random value based on ua-type") 194 | if err := viper.BindPFlag("force-user-agent", serveCmd.Flags().Lookup("force-user-agent")); 
err != nil { 195 | log.Fatal("Unable to bind flag:", err) 196 | } 197 | 198 | serveCmd.Flags().String("ua-type", "desktop", "Specify what kind of browser user-agent to generate. Values can either be \"desktop\" or \"mobile\"") 199 | if err := viper.BindPFlag("ua-type", serveCmd.Flags().Lookup("ua-type")); err != nil { 200 | log.Fatal("Unable to bind flag:", err) 201 | } 202 | 203 | serveCmd.Flags().String("ua-config-file", "", "Specify the path to a JSON file that contains a custom user-agent configuration.") 204 | if err := viper.BindPFlag("ua-config-file", serveCmd.Flags().Lookup("ua-config-file")); err != nil { 205 | log.Fatal("Unable to bind flag:", err) 206 | } 207 | 208 | serveCmd.Flags().String("proxy-file", "", "Specify the path to a txt file that contains a list of proxies") 209 | if err := viper.BindPFlag("proxy-file", serveCmd.Flags().Lookup("proxy-file")); err != nil { 210 | log.Fatal("Unable to bind flag:", err) 211 | } 212 | 213 | datadirDesc := fmt.Sprintf("Specify the path to the data directory that this instance uses (default is %v)", filepath.Join(tillHomeDir, "default.data")) 214 | serveCmd.Flags().String("datadir", "", datadirDesc) 215 | if err := viper.BindPFlag("datadir", serveCmd.Flags().Lookup("datadir")); err != nil { 216 | log.Fatal("Unable to bind flag:", err) 217 | } 218 | 219 | } 220 | 221 | func setCaFileDefaults(caCertFile *string, caKeyFile *string) { 222 | home, err := homedir.Dir() 223 | if err != nil { 224 | fmt.Println(err) 225 | os.Exit(1) 226 | } 227 | 228 | if *caCertFile == "" { 229 | *caCertFile = filepath.Join(home, ".config", "datahen", "till", "till-ca-cert.pem") 230 | } 231 | 232 | if *caKeyFile == "" { 233 | *caKeyFile = filepath.Join(home, ".config", "datahen", "till", "till-ca-key.pem") 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, 
January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 DataHen Canada Inc 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /examples/python/scrapy/tutorial/tutorial/quotes-1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Quotes to Scrape 6 | 7 | 8 | 9 | 10 |
11 |
12 |
13 |

14 | Quotes to Scrape 15 |

16 |
17 |
18 |

19 | 20 | Login 21 | 22 |

23 |
24 |
25 | 26 | 27 |
28 |
29 | 30 |
31 | “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” 32 | by 33 | (about) 34 | 35 |
36 | Tags: 37 | 38 | 39 | change 40 | 41 | deep-thoughts 42 | 43 | thinking 44 | 45 | world 46 | 47 |
48 |
49 | 50 |
51 | “It is our choices, Harry, that show what we truly are, far more than our abilities.” 52 | by 53 | (about) 54 | 55 |
56 | Tags: 57 | 58 | 59 | abilities 60 | 61 | choices 62 | 63 |
64 |
65 | 66 |
67 | “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.” 68 | by 69 | (about) 70 | 71 |
72 | Tags: 73 | 74 | 75 | inspirational 76 | 77 | life 78 | 79 | live 80 | 81 | miracle 82 | 83 | miracles 84 | 85 |
86 |
87 | 88 |
89 | “The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.” 90 | by 91 | (about) 92 | 93 |
94 | Tags: 95 | 96 | 97 | aliteracy 98 | 99 | books 100 | 101 | classic 102 | 103 | humor 104 | 105 |
106 |
107 | 108 |
109 | “Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.” 110 | by 111 | (about) 112 | 113 |
114 | Tags: 115 | 116 | 117 | be-yourself 118 | 119 | inspirational 120 | 121 |
122 |
123 | 124 |
125 | “Try not to become a man of success. Rather become a man of value.” 126 | by 127 | (about) 128 | 129 |
130 | Tags: 131 | 132 | 133 | adulthood 134 | 135 | success 136 | 137 | value 138 | 139 |
140 |
141 | 142 |
143 | “It is better to be hated for what you are than to be loved for what you are not.” 144 | by 145 | (about) 146 | 147 |
148 | Tags: 149 | 150 | 151 | life 152 | 153 | love 154 | 155 |
156 |
157 | 158 |
159 | “I have not failed. I've just found 10,000 ways that won't work.” 160 | by 161 | (about) 162 | 163 |
164 | Tags: 165 | 166 | 167 | edison 168 | 169 | failure 170 | 171 | inspirational 172 | 173 | paraphrased 174 | 175 |
176 |
177 | 178 |
179 | “A woman is like a tea bag; you never know how strong it is until it's in hot water.” 180 | by 181 | (about) 182 | 183 |
184 | Tags: 185 | 186 | 187 | misattributed-eleanor-roosevelt 188 | 189 |
190 |
191 | 192 |
193 | “A day without sunshine is like, you know, night.” 194 | by 195 | (about) 196 | 197 |
198 | Tags: 199 | 200 | 201 | humor 202 | 203 | obvious 204 | 205 | simile 206 | 207 |
208 |
209 | 210 | 220 |
221 |
222 | 223 |

Top Ten tags

224 | 225 | 226 | love 227 | 228 | 229 | 230 | inspirational 231 | 232 | 233 | 234 | life 235 | 236 | 237 | 238 | humor 239 | 240 | 241 | 242 | books 243 | 244 | 245 | 246 | reading 247 | 248 | 249 | 250 | friendship 251 | 252 | 253 | 254 | friends 255 | 256 | 257 | 258 | truth 259 | 260 | 261 | 262 | simile 263 | 264 | 265 | 266 |
267 |
268 | 269 |
270 | 280 | 281 | -------------------------------------------------------------------------------- /examples/go/colly/go.sum: -------------------------------------------------------------------------------- 1 | cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= 2 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 3 | github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= 4 | github.com/PuerkitoBio/goquery v1.7.1 h1:oE+T06D+1T7LNrn91B4aERsRIeCLJ/oPSa6xB9FPnz4= 5 | github.com/PuerkitoBio/goquery v1.7.1/go.mod h1:XY0pP4kfraEmmV1O7Uf6XyjoslwsneBbgeDjLYuN8xY= 6 | github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= 7 | github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE= 8 | github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= 9 | github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M= 10 | github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= 11 | github.com/antchfx/xmlquery v1.2.4/go.mod h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM= 12 | github.com/antchfx/xmlquery v1.3.6 h1:kaEVzH1mNo/2AJZrhZjAaAUTy2Nn2zxGfYYU8jWfXOo= 13 | github.com/antchfx/xmlquery v1.3.6/go.mod h1:64w0Xesg2sTaawIdNqMB+7qaW/bSqkQm+ssPaCMWNnc= 14 | github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= 15 | github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= 16 | github.com/antchfx/xpath v1.1.10 h1:cJ0pOvEdN/WvYXxvRrzQH9x5QWKpzHacYO8qzCcDYAg= 17 | github.com/antchfx/xpath v1.1.10/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= 18 | github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= 19 | github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= 20 
| github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 21 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 22 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 23 | github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= 24 | github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= 25 | github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= 26 | github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= 27 | github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= 28 | github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= 29 | github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs= 30 | github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0= 31 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= 32 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY= 33 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= 34 | github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= 35 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 36 | github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 37 | github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 38 | github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= 39 | github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod 
h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= 40 | github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= 41 | github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= 42 | github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= 43 | github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= 44 | github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0= 45 | github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= 46 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= 47 | github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= 48 | github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= 49 | github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4= 50 | github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 51 | github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg= 52 | github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= 53 | github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= 54 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 55 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 56 | github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= 57 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= 58 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= 59 | github.com/stretchr/objx v0.1.0/go.mod 
h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 60 | github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= 61 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= 62 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 63 | github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= 64 | github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= 65 | github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= 66 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 67 | golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 68 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 69 | golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 70 | golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= 71 | golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= 72 | golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= 73 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 74 | golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 75 | golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 76 | golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 77 | golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 78 | golang.org/x/net 
v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 79 | golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= 80 | golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 81 | golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= 82 | golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= 83 | golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= 84 | golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 85 | golang.org/x/net v0.0.0-20210813160813-60bc85c4be6d h1:LO7XpTYMwTqxjLcGWPijK3vRXg1aWdlNOVOHRq45d7c= 86 | golang.org/x/net v0.0.0-20210813160813-60bc85c4be6d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 87 | golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= 88 | golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 89 | golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 90 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 91 | golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 92 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 93 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 94 | golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 95 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 96 | 
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 97 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 98 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 99 | golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= 100 | golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M= 101 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 102 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 103 | golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 104 | golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= 105 | golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= 106 | golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= 107 | golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= 108 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= 109 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 110 | google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= 111 | google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= 112 | google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= 113 | google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= 114 | google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= 115 | google.golang.org/genproto 
v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= 116 | google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= 117 | google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= 118 | google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= 119 | google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= 120 | google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= 121 | google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= 122 | google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= 123 | google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= 124 | google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= 125 | google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= 126 | google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 127 | google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 128 | google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 129 | google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEGA= 130 | google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= 131 | honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= 132 | honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= 133 | 
-------------------------------------------------------------------------------- /proxy/proxy.go: -------------------------------------------------------------------------------- 1 | package proxy 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "crypto/tls" 7 | "errors" 8 | "fmt" 9 | "io/ioutil" 10 | "net" 11 | "net/http" 12 | "net/http/cookiejar" 13 | "net/url" 14 | "regexp" 15 | "strings" 16 | "time" 17 | 18 | "github.com/DataHenHQ/datahen/pages" 19 | "github.com/DataHenHQ/till/internal/tillclient" 20 | "github.com/DataHenHQ/tillup/cache" 21 | "github.com/DataHenHQ/tillup/features" 22 | "github.com/DataHenHQ/tillup/logger" 23 | "github.com/DataHenHQ/tillup/sessions" 24 | "github.com/DataHenHQ/useragent" 25 | "golang.org/x/net/publicsuffix" 26 | ) 27 | 28 | var ( 29 | // Token is the Till auth token 30 | Token string 31 | 32 | // InstanceName is the name of this till instance 33 | InstanceName string 34 | 35 | ca tls.Certificate 36 | okHeader = []byte("HTTP/1.1 200 OK\r\n\r\n") 37 | 38 | // ForceUA indicates whether to overwrite all incoming user-agent with a random one 39 | ForceUA = true 40 | 41 | // UAType specifies what kind of user-agent to generate 42 | UAType = "desktop" 43 | 44 | dhHeadersRe = regexp.MustCompile(`(?i)^X-DH`) 45 | 46 | // ProxyFile points to the path of the txt file that contains a list of proxies 47 | ProxyFile = "" 48 | 49 | // ProxyURLs are external proxies that will be randomized 50 | ProxyURLs = []string{} 51 | 52 | // ProxyCount is the total count of proxies used. 
53 | ProxyCount int 54 | 55 | // ReleaseVersion is the version of Till release 56 | ReleaseVersion = "dev" 57 | 58 | StatMu *tillclient.InstanceStatMutex 59 | 60 | // Cache is the cache specific config 61 | CacheConfig cache.Config 62 | 63 | // LoggerConfig is the logger specific config 64 | LoggerConfig logger.Config 65 | 66 | // SessionsConfig is the sessions specific config 67 | SessionsConfig sessions.Config 68 | ) 69 | 70 | func NewPageFromRequest(r *http.Request, scheme string, pconf *PageConfig) (p *pages.Page, err error) { 71 | p = new(pages.Page) 72 | 73 | u := r.URL 74 | u.Host = r.Host 75 | u.Scheme = scheme 76 | p.SetURL(u.String()) 77 | 78 | p.SetMethod(r.Method) 79 | 80 | // build the page headers 81 | nh := map[string]interface{}{} 82 | for name, values := range r.Header { 83 | nh[name] = strings.Join(values, ",") 84 | } 85 | 86 | // remove User-Agent header if we force-user agent 87 | if pconf.ForceUA { 88 | delete(nh, "User-Agent") 89 | } 90 | 91 | // delete any other proxy related header 92 | delete(nh, "Proxy-Connection") 93 | 94 | // finally set the header 95 | p.SetHeaders(nh) 96 | 97 | // fetch type will always be "standard" for Till 98 | p.FetchType = "standard" 99 | p.UaType = pconf.UaType 100 | 101 | // read the request body, save it and set it back to the request body 102 | rBody, _ := ioutil.ReadAll(r.Body) 103 | r.Body = ioutil.NopCloser(bytes.NewReader(rBody)) 104 | p.SetBody(string(rBody)) 105 | 106 | // set defaults 107 | p.SetUaType(pconf.UaType) 108 | p.SetFetchType("standard") 109 | p.SetPageType("default") 110 | 111 | // set the GID 112 | gid, err := pages.GenerateGID(p) 113 | if err != nil { 114 | return nil, err 115 | } 116 | p.SetGID(gid) 117 | 118 | return p, nil 119 | } 120 | 121 | func logReqSummary(gid, method, url string, respStatus int, cachehit bool) { 122 | cacheType := "MISS" 123 | if cachehit { 124 | cacheType = "HIT " 125 | } 126 | fmt.Println(cacheType, gid, method, url, respStatus) 127 | } 128 | 129 | func 
sendToTarget(ctx context.Context, sconn net.Conn, sreq *http.Request, scheme string, p *pages.Page, pconf *PageConfig) (tresp *http.Response, err error) { 130 | var sess *sessions.Session 131 | 132 | if features.Allow(features.Cache) && !CacheConfig.Disabled { 133 | 134 | // check if past response exist in the cache. if so, then return it. 135 | cresp, err := cache.GetResponse(ctx, p.GID, pconf.CacheFreshness, pconf.CacheServeFailures) 136 | if err != nil { 137 | return nil, err 138 | } 139 | // if cachehit then return the cached response 140 | if cresp != nil { 141 | // Increment the CacheHits stats 142 | incrCacheHitStatDelta() 143 | 144 | // Increment the successful or failed requests, and total requests 145 | if sessions.IsSuccess(cresp.StatusCode) { 146 | incrSuccessfulRequestStatDelta() 147 | } else { 148 | incrFailedRequestStatDelta() 149 | } 150 | incrRequestStatDelta() 151 | 152 | logReqSummary(p.GID, sreq.Method, sreq.URL.String(), cresp.StatusCode, true) 153 | 154 | // Build the target req and resp specifically for logging. 155 | _, treq, terr := buildTargetRequest(scheme, sreq, pconf, sess, p) 156 | // defer treq.Body.Close() 157 | if terr == nil && treq != nil { 158 | // record request and response to the logger 159 | _, tlerr := logger.StoreItem(ctx, p.GID, treq, cresp, time.Now(), true, (sessions.PageConfig)(*pconf), sess) 160 | if tlerr != nil { 161 | return nil, tlerr 162 | } 163 | 164 | } 165 | 166 | return cresp, nil 167 | } 168 | 169 | } 170 | 171 | // If StickySession is allowed, then set the sticky session 172 | if features.Allow(features.StickySessions) && pconf.SessionID != "" { 173 | 174 | // get a session, or a create a new one if it doesn't exist yet. 
175 | sess, err = sessions.GetOrCreateStickySession(ctx, pconf.SessionID, (sessions.PageConfig)(*pconf)) 176 | if err != nil { 177 | return nil, err 178 | } 179 | 180 | } 181 | 182 | // build the target request from the source request 183 | tclient, treq, err := buildTargetRequest(scheme, sreq, pconf, sess, p) 184 | if err != nil { 185 | return nil, err 186 | } 187 | 188 | // record request now, and the logger.Response will be set later once the response comes back. 189 | rid, tlerr := logger.StoreItem(ctx, p.GID, treq, nil, time.Now(), false, (sessions.PageConfig)(*pconf), sess) 190 | if tlerr != nil { 191 | return nil, tlerr 192 | } 193 | 194 | // send the actual request to target server 195 | tresp, err = tclient.Do(treq) 196 | if err != nil { 197 | return nil, err 198 | } 199 | 200 | if sessions.IsSuccess(tresp.StatusCode) { 201 | incrSuccessfulRequestStatDelta() 202 | } else { 203 | incrFailedRequestStatDelta() 204 | } 205 | incrRequestStatDelta() 206 | 207 | // save the cookies from cookiejar to the session 208 | if sess != nil && !sess.IsZero() { 209 | if pconf.StickyCookies { 210 | if sess.Cookies == nil { 211 | sess.Cookies = sessions.CookieMap{} 212 | } 213 | sess.Cookies.Set(treq.URL, tclient.Jar.Cookies(treq.URL)) 214 | } 215 | sessions.SaveSession(ctx, sess) 216 | } 217 | 218 | if features.Allow(features.Cache) && !CacheConfig.Disabled { 219 | // Store the response to cache 220 | err := cache.StoreResponse(ctx, p.GID, tresp, nil) 221 | if err != nil { 222 | return nil, err 223 | } 224 | 225 | // Increment the CacheSets stats 226 | incrCacheSetStatDelta() 227 | 228 | } 229 | 230 | // log the request summary 231 | logReqSummary(p.GID, sreq.Method, sreq.URL.String(), tresp.StatusCode, false) 232 | 233 | // update response on the logger 234 | tlerr = logger.UpdateItemResponse(ctx, rid, tresp, sess) 235 | if tlerr != nil { 236 | return nil, tlerr 237 | } 238 | 239 | return tresp, err 240 | } 241 | 242 | // buildTargetRequest builds a target request from 
source request, and etc. 243 | func buildTargetRequest(scheme string, sreq *http.Request, pconf *PageConfig, sess *sessions.Session, p *pages.Page) (*http.Client, *http.Request, error) { 244 | // create transport for client 245 | t := &http.Transport{ 246 | Dial: (&net.Dialer{ 247 | Timeout: 30 * time.Second, 248 | KeepAlive: 30 * time.Second, 249 | }).Dial, 250 | DisableCompression: false, 251 | TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, 252 | TLSHandshakeTimeout: 10 * time.Second, 253 | ResponseHeaderTimeout: 60 * time.Second, 254 | ExpectContinueTimeout: 1 * time.Second, 255 | MaxIdleConns: 1, 256 | MaxIdleConnsPerHost: 1, 257 | IdleConnTimeout: 1 * time.Millisecond, 258 | MaxConnsPerHost: 1, 259 | } 260 | defer t.CloseIdleConnections() 261 | 262 | // set proxy if specified 263 | if pconf.UseProxy { 264 | 265 | // using till session's proxy URL, or generate random proxy 266 | var u string 267 | if sess != nil { 268 | u = sess.ProxyURL 269 | } 270 | if u == "" { 271 | u = getRandom(ProxyURLs) 272 | } 273 | 274 | // set the proxy 275 | p, err := url.Parse(u) 276 | if err != nil { 277 | return nil, nil, err 278 | } 279 | t.Proxy = http.ProxyURL(p) 280 | } 281 | 282 | // create cookiejar 283 | jar, err := cookiejar.New(&cookiejar.Options{PublicSuffixList: publicsuffix.List}) 284 | if err != nil { 285 | return nil, nil, err 286 | } 287 | 288 | // create target client 289 | tclient := &http.Client{ 290 | Timeout: 120 * time.Second, 291 | Transport: t, 292 | Jar: jar, 293 | } 294 | 295 | // copy the body as *bytes.Reader to properly set the treq's body and content-length 296 | srBody, _ := ioutil.ReadAll(sreq.Body) 297 | sreq.Body = ioutil.NopCloser(bytes.NewReader(srBody)) 298 | p.SetBody(string(srBody)) 299 | 300 | // create target request 301 | treq, err := http.NewRequestWithContext(sreq.Context(), sreq.Method, sreq.RequestURI, bytes.NewReader(srBody)) 302 | if err != nil { 303 | return nil, nil, err 304 | } 305 | // build the target request 306 | u := 
sreq.URL 307 | u.Host = sreq.Host 308 | u.Scheme = scheme 309 | treq.URL = u 310 | treq.Host = u.Host 311 | 312 | // if there are cookies on the session, set it in the cookiejar 313 | if sess != nil && len(sess.Cookies) > 0 { 314 | if pconf.StickyCookies { 315 | tclient.Jar.SetCookies(treq.URL, sess.Cookies.Get(u)) 316 | } 317 | } 318 | 319 | // copy source headers into target headers 320 | th := copySourceHeaders(sreq.Header) 321 | if th != nil { 322 | treq.Header = th 323 | } 324 | 325 | // Delete headers related to proxy usage 326 | treq.Header.Del("Proxy-Connection") 327 | 328 | // if ForceUA is true, then override User-Agent header with a random UA 329 | if ForceUA { 330 | 331 | // using till session's user agent, or generate random one 332 | var ua string 333 | if sess != nil { 334 | ua = sess.UserAgent 335 | } 336 | if ua == "" { 337 | ua, err = generateRandomUA(UAType) 338 | if err != nil { 339 | return nil, nil, err 340 | } 341 | } 342 | 343 | // Set the ua on the target header 344 | th.Set("User-Agent", ua) 345 | } 346 | 347 | return tclient, treq, nil 348 | 349 | } 350 | 351 | // copy source headers other than those that starts with X-DH* into target headers 352 | func copySourceHeaders(sh http.Header) (th http.Header) { 353 | th = make(http.Header) 354 | 355 | if sh == nil { 356 | return nil 357 | } 358 | 359 | for key, values := range sh { 360 | if dhHeadersRe.MatchString(key) { 361 | continue 362 | } 363 | 364 | for _, val := range values { 365 | th.Add(key, val) 366 | } 367 | } 368 | 369 | return th 370 | } 371 | 372 | // Overrides User-Agent header with a random one 373 | func generateRandomUA(uaType string) (ua string, err error) { 374 | switch uaType { 375 | case "desktop": 376 | ua, err = useragent.Desktop() 377 | if err != nil { 378 | return "", err 379 | } 380 | case "mobile": 381 | ua = useragent.Mobile() 382 | } 383 | 384 | if ua == "" { 385 | return "", errors.New(fmt.Sprint("generated empty user agent string for", uaType)) 386 | } 387 | 388 
| return ua, nil 389 | } 390 | 391 | func writeToSource(sconn net.Conn, tresp *http.Response, p *pages.Page) (err error) { 392 | // add X-DH-GID to the response 393 | if p != nil { 394 | tresp.Header.Set("X-DH-GID", p.GetGID()) 395 | } 396 | 397 | tresp.Write(sconn) 398 | 399 | return nil 400 | } 401 | 402 | // Atomically increments request delta in the instance stat 403 | func incrRequestStatDelta() { 404 | StatMu.Mutex.Lock() 405 | 406 | // increment the requests counter 407 | *(StatMu.InstanceStat.Requests) = *(StatMu.InstanceStat.Requests) + uint64(1) 408 | StatMu.Mutex.Unlock() 409 | 410 | } 411 | 412 | // Atomically increments intercepted request delta in the instance stat 413 | func incrInterceptedRequestStatDelta() { 414 | StatMu.Mutex.Lock() 415 | 416 | // increment the requests counter 417 | *(StatMu.InstanceStat.InterceptedRequests) = *(StatMu.InstanceStat.InterceptedRequests) + uint64(1) 418 | StatMu.Mutex.Unlock() 419 | 420 | } 421 | 422 | // Atomically increments failed request delta in the instance stat 423 | func incrFailedRequestStatDelta() { 424 | StatMu.Mutex.Lock() 425 | 426 | // increment the requests counter 427 | *(StatMu.InstanceStat.FailedRequests) = *(StatMu.InstanceStat.FailedRequests) + uint64(1) 428 | StatMu.Mutex.Unlock() 429 | 430 | } 431 | 432 | // Atomically increments successful request delta in the instance stat 433 | func incrSuccessfulRequestStatDelta() { 434 | StatMu.Mutex.Lock() 435 | 436 | // increment the requests counter 437 | *(StatMu.InstanceStat.SuccessfulRequests) = *(StatMu.InstanceStat.SuccessfulRequests) + uint64(1) 438 | StatMu.Mutex.Unlock() 439 | 440 | } 441 | 442 | // Atomically increments request delta in the instance stat 443 | func incrCacheHitStatDelta() { 444 | StatMu.Mutex.Lock() 445 | 446 | // increment the CacheHits counter 447 | *(StatMu.InstanceStat.CacheHits) = *(StatMu.InstanceStat.CacheHits) + uint64(1) 448 | StatMu.Mutex.Unlock() 449 | 450 | } 451 | 452 | // Atomically increments request delta in 
the instance stat 453 | func incrCacheSetStatDelta() { 454 | StatMu.Mutex.Lock() 455 | 456 | // increment the CacheSets counter 457 | *(StatMu.InstanceStat.CacheSets) = *(StatMu.InstanceStat.CacheSets) + uint64(1) 458 | StatMu.Mutex.Unlock() 459 | 460 | } 461 | -------------------------------------------------------------------------------- /examples/python/scrapy/tutorial/tutorial/quotes-2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Quotes to Scrape 6 | 7 | 8 | 9 | 10 |
11 |
12 |
13 |

14 | Quotes to Scrape 15 |

16 |
17 |
18 |

19 | 20 | Login 21 | 22 |

23 |
24 |
25 | 26 | 27 |
28 |
29 | 30 |
31 | “This life is what you make it. No matter what, you're going to mess up sometimes, it's a universal truth. But the good part is you get to decide how you're going to mess it up. Girls will be your friends - they'll act like it anyway. But just remember, some come, some go. The ones that stay with you through everything - they're your true best friends. Don't let go of them. Also remember, sisters make the best friends in the world. As for lovers, well, they'll come and go too. And baby, I hate to say it, most of them - actually pretty much all of them are going to break your heart, but you can't give up because if you give up, you'll never find your soulmate. You'll never find that half who makes you whole and that goes for everything. Just because you fail once, doesn't mean you're gonna fail at everything. Keep trying, hold on, and always, always, always believe in yourself, because if you don't, then who will, sweetie? So keep your head high, keep your chin up, and most importantly, keep smiling, because life's a beautiful thing and there's so much to smile about.” 32 | by 33 | (about) 34 | 35 |
36 | Tags: 37 | 38 | 39 | friends 40 | 41 | heartbreak 42 | 43 | inspirational 44 | 45 | life 46 | 47 | love 48 | 49 | sisters 50 | 51 |
52 |
53 | 54 |
55 | “It takes a great deal of bravery to stand up to our enemies, but just as much to stand up to our friends.” 56 | by 57 | (about) 58 | 59 |
60 | Tags: 61 | 62 | 63 | courage 64 | 65 | friends 66 | 67 |
68 |
69 | 70 |
71 | “If you can't explain it to a six year old, you don't understand it yourself.” 72 | by 73 | (about) 74 | 75 |
76 | Tags: 77 | 78 | 79 | simplicity 80 | 81 | understand 82 | 83 |
84 |
85 | 86 |
87 | “You may not be her first, her last, or her only. She loved before she may love again. But if she loves you now, what else matters? She's not perfect—you aren't either, and the two of you may never be perfect together but if she can make you laugh, cause you to think twice, and admit to being human and making mistakes, hold onto her and give her the most you can. She may not be thinking about you every second of the day, but she will give you a part of her that she knows you can break—her heart. So don't hurt her, don't change her, don't analyze and don't expect more than she can give. Smile when she makes you happy, let her know when she makes you mad, and miss her when she's not there.” 88 | by 89 | (about) 90 | 91 |
92 | Tags: 93 | 94 | 95 | love 96 | 97 |
98 |
99 | 100 |
101 | “I like nonsense, it wakes up the brain cells. Fantasy is a necessary ingredient in living.” 102 | by 103 | (about) 104 | 105 |
106 | Tags: 107 | 108 | 109 | fantasy 110 | 111 |
112 |
113 | 114 |
115 | “I may not have gone where I intended to go, but I think I have ended up where I needed to be.” 116 | by 117 | (about) 118 | 119 |
120 | Tags: 121 | 122 | 123 | life 124 | 125 | navigation 126 | 127 |
128 |
129 | 130 |
131 | “The opposite of love is not hate, it's indifference. The opposite of art is not ugliness, it's indifference. The opposite of faith is not heresy, it's indifference. And the opposite of life is not death, it's indifference.” 132 | by 133 | (about) 134 | 135 |
136 | Tags: 137 | 138 | 139 | activism 140 | 141 | apathy 142 | 143 | hate 144 | 145 | indifference 146 | 147 | inspirational 148 | 149 | love 150 | 151 | opposite 152 | 153 | philosophy 154 | 155 |
156 |
157 | 158 |
159 | “It is not a lack of love, but a lack of friendship that makes unhappy marriages.” 160 | by 161 | (about) 162 | 163 |
164 | Tags: 165 | 166 | 167 | friendship 168 | 169 | lack-of-friendship 170 | 171 | lack-of-love 172 | 173 | love 174 | 175 | marriage 176 | 177 | unhappy-marriage 178 | 179 |
180 |
181 | 182 |
183 | “Good friends, good books, and a sleepy conscience: this is the ideal life.” 184 | by 185 | (about) 186 | 187 |
188 | Tags: 189 | 190 | 191 | books 192 | 193 | contentment 194 | 195 | friends 196 | 197 | friendship 198 | 199 | life 200 | 201 |
202 |
203 | 204 |
205 | “Life is what happens to us while we are making other plans.” 206 | by 207 | (about) 208 | 209 |
210 | Tags: 211 | 212 | 213 | fate 214 | 215 | life 216 | 217 | misattributed-john-lennon 218 | 219 | planning 220 | 221 | plans 222 | 223 |
224 |
225 | 226 | 240 |
241 |
242 | 243 |

Top Ten tags

244 | 245 | 246 | love 247 | 248 | 249 | 250 | inspirational 251 | 252 | 253 | 254 | life 255 | 256 | 257 | 258 | humor 259 | 260 | 261 | 262 | books 263 | 264 | 265 | 266 | reading 267 | 268 | 269 | 270 | friendship 271 | 272 | 273 | 274 | friends 275 | 276 | 277 | 278 | truth 279 | 280 | 281 | 282 | simile 283 | 284 | 285 | 286 |
287 |
288 | 289 |
290 | 300 | 301 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | **DataHen Till** is a companion tool to your existing web scraper that instantly makes it scalable, maintainable, and more unblockable, with minimal code changes on your scraper. Integrates with any scraper in 5 minutes. 3 | 4 | [![Alt text](https://img.youtube.com/vi/D1VBVYTRo8g/0.jpg)](https://www.youtube.com/watch?v=D1VBVYTRo8g) 5 | 6 | Till was architected to follow best practices that [DataHen](https://www.datahen.com) has accumulated over the years of scraping at a massive scale. 7 | 8 | ![How it works](img/how-it-works.png) 9 | 10 | ### Till easily integrates with your existing scrapers... 11 | written in languages such as: 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | or frameworks such as: 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
32 | 33 | 34 | and many more... 35 | 36 | 37 | # Table of Contents 38 | 39 | * [Problems with Web Scraping](#problems-with-web-scraping) 40 | * [Scaling Your Scraper](#scaling-your-scraper) 41 | * [Blocked scraper](#blocked-scraper) 42 | * [Scraper Maintenance](#scraper-maintenance) 43 | * [Postmortem analysis & reproducability](#postmortem-analysis--reproducability) 44 | * [Starting over from scratch when it fails mid-way](#starting-over-from-scratch-when-it-fails-mid-way) 45 | * [Features](#features) 46 | * [User-Agent randomizer](#user-agent-randomizer) 47 | * [Proxy IP address rotation](#proxy-ip-address-rotation) 48 | * [Sticky Sessions](#sticky-sessions) 49 | * [Managing Cookies](#managing-cookies) 50 | * [Request Logging](#request-logging) 51 | * [HTTP Caching](#http-caching) 52 | * [Global ID (GID)](#global-id-gid) 53 | * [Request Interceptions](#request-interceptions) 54 | * [How DataHen Till works](#how-datahen-till-works) 55 | * [Installation](#installation) 56 | * [Certificate Authority (CA) Certificates](#certificate-authority-ca-certificates) 57 | * [Till Integrations](#till-integrations) 58 | * [Python](#python) 59 | * Scrapy 60 | * [Node.js](#nodejs) 61 | * Plain 62 | * Puppeteer 63 | * [Go](#go) 64 | * net/http 65 | * Colly 66 | * [Ruby](#ruby) 67 | * Kimurai 68 | 69 | # Problems with Web Scraping 70 | 71 | 72 | Web scraping is usually easy to get started with, especially on a small scale. However, as you try to scale it up, it gets exponentially more difficult. Scraping 10,000 records can easily be done with simple web scraper scripts in any programming language, but as you try to scrape millions of pages, you need to architect and build features into your web scraping script that allow you to scale, maintain and unblock your scrapers. 73 | 74 | 75 | **DataHen Till** solves the following problems: 76 | 77 | 78 | ## Scaling your scraper 79 | Scraping millions or even billions of records requires much more pre-planning.
It's not simply a matter of running your existing web scraper script on a machine with more CPU/RAM. 80 | More thought is needed, such as: 81 | 82 | - How to log massive amounts of HTTP requests. 83 | - How to troubleshoot HTTP requests when they fail at scale. 84 | - How to minimize bandwidth usage. 85 | - How to rotate proxy IPs. 86 | - How to handle anti-scrapers. 87 | - What happens when a scraper fails. 88 | - How to resume scrapers after they are fixed. 89 | - etc. 90 | 91 | 92 | Till provides a plug-and-play method of making your web scrapers scalable and maintainable, following the best practices at [DataHen](https://www.datahen.com) that make web scraping a pleasant experience. 93 | 94 | ## Blocked scraper 95 | As you try to scale up the number of requests, quite often, the target websites will detect your scraper and try to block your requests using Captcha, or throttling, or denying your request completely. 96 | 97 | Till helps you avoid being detected as a web scraper by identifying your scraper as a real web browser. It does this by generating random `user-agent` headers and randomizing proxy IPs (that you supply) on every HTTP request. 98 | 99 | Till also makes it easy for you to troubleshoot why the target website blocks your scraper. 100 | 101 | ## Scraper Maintenance 102 | Maintaining high-scale scrapers is challenging due to the massive volume of requests and interactions between your scrapers and the target websites. To keep operations smooth, you need to think through how to maintain your scrapers regularly. 103 | 104 | You need to know how to raise and triage errors as they occur on your scrapers, because not all web scraping errors should be treated equally. Some are ignorable, and some are urgent. So, you will need to know what the details of your "development-deployment-maintenance" process will be.
105 | 106 | Till solves this by logging all your HTTP requests and categorizing them as successful (2XX statuses) or failed (non-2XX statuses). Till also provides a Web UI to analyze the request history and make sense of what happened during your scraping process. 107 | 108 | Till makes scraper maintenance even easier by assigning each request a unique Global ID (GID) that is derived from the request's URL, method, body, etc. You can then use this GID to troubleshoot where your scrapers went wrong. 109 | 110 | ## Postmortem analysis & reproducability 111 | The biggest difficulty facing any web scraper developer is when there are scraping failures. Your scraper fails when fetching or parsing certain URLs, but when you look at the target website and URLs, everything looks fine. How do you troubleshoot what already happened in that scenario? How do you reproduce that failed scrape so that you can fix the issue? 112 | 113 | Till stores all HTTP requests and the responses (including the response body/content) in a local cache. If at any time your scraper encounters an error, you can then use the request's GID (Till assigns a Global ID, also called GID, to every request) to find the request and the actual response and content in the cache. In this way, you can analyze what went wrong with that particular request. 114 | 115 | ## Starting over from scratch when it fails mid-way 116 | Websites change all the time and without notice. Imagine running your web scraper for a week and then suddenly, somewhere along the way, it fails. It is frustrating that once you've fixed the scraper, there is a high chance that you'd need to start over from scratch again. And, on top of this, there are additional consequences, such as time delays and further charges related to proxy usage, bandwidth, storage, VM costs, etc. 
117 | 118 | Till solves this by allowing you to replay your scrapers without actually needing to resend the HTTP requests to the target server. 119 | Till does this by assigning each HTTP request its own unique Global ID (GID) that is generated from the request's URL, method, headers, etc. It then stores all HTTP responses in the Cache based on their GID. 120 | 121 | When you restart your scraper, the scraping process can go blazingly fast because Till now serves the cached version of the HTTP responses. All of this happens without any code changes to your existing web scraper. 122 | 123 | # Features 124 | 125 | 126 | 127 | ## [User-Agent randomizer](https://till.datahen.com/docs/user-agent-randomizer) 128 | Till automatically generates a random user-agent on every request. Choose to identify your scraper as a desktop browser or a mobile browser, or you can even override it with your custom user-agent. 129 | 130 | ## [Proxy IP address rotation](https://till.datahen.com/docs/proxy-ip-address-rotation) 131 | Supply a list of proxy IPs, and Till will randomly use them on every request. This saves you the time of setting up a separate proxy rotation service. 132 | 133 | ## [Sticky Sessions](https://till.datahen.com/docs/sticky-sessions) 134 | Your scraper can selectively reuse the same user-agent, proxy IP, and cookie jar for multiple requests. This allows you to easily group your requests based on a certain workflow, and allows you to avoid detection by anti-scraping systems. 135 | 136 | ## [Managing Cookies](https://till.datahen.com/docs/sticky-sessions#manage-cookies) 137 | No need to build your cookie management logic in your scraper code. Till can store the cookies for you so that you can easily reuse them on subsequent requests. 138 | 139 | 140 | ## [Request Logging](https://till.datahen.com/docs/request-log) 141 | Till will log your requests and classify each one as successful (2XX status code) or failed (non-2XX status code). 
This will allow you to easily troubleshoot your scraper later. 142 | 143 | The Till UI allows you to make sense of HTTP request history, and troubleshoot what happens during a scraping session. 144 | 145 | 146 | ## [HTTP Caching](https://till.datahen.com/docs/http-caching) 147 | Till caches all of your HTTP responses (and their contents), so that as needed, your web scraper will reuse the cache without needing to do another HTTP request to the target server. 148 | 149 | You can selectively choose whether to use a particular cached content or not by specifying how fresh you want Till to serve the cache. For example: If Till holds an existing cached content that is 1 week old, but your web scraper only wants 1-day old content, Till will then only serve cached contents that are 1 day old. 150 | 151 | ![HTTP Caching Flowchart](img/http-caching-flowchart.png) 152 | 153 | ## [Global ID (GID)](https://till.datahen.com/docs/http-caching#gid) 154 | Till uses [DataHen Platform](https://www.datahen.com/platform)'s convention of marking every unique request with a signature (we call this the Global ID or GID for short). Think of it like a Checksum of the actual request. 155 | 156 | Anytime your scraper sends a request through Till, it will return a response with the header `X-DH-GID` that contains the GID. This GID allows you to easily troubleshoot requests when you need to look up specific requests in the log, or contents in the cache. 157 | 158 | ## [Request Interceptions](https://till.datahen.com/docs/request-interception) 159 | Till can intercept any HTTP request of your choice, and replace with any HTTP response. 160 | 161 | The following are some examples of useful scenarios: 162 | 163 | - Ignoring Google Analytics javascript 164 | - Ignoring images or other files 165 | - Replacing (stubbing) an API call with a different response 166 | - Restricting your scraper to only certain URL patterns. 
167 | 168 | 169 | # How DataHen Till works 170 | 171 | Till works as a Man In The Middle (MITM) proxy that listens for incoming HTTP(S) requests and forwards those requests to the target server as needed. While it does so, it enhances each request to avoid being detected by anti-scrapers. It also logs and caches the responses to make your scraper maintainable and scalable. 172 | 173 | Connect your scraper to Till via the `proxy` protocol that is commonly supported in any programming language. 174 | 175 | Your scraper will then continue to run as-is and it will instantly become more unblockable, scalable, and maintainable. 176 | 177 | ![How it works](img/how-it-works.png) 178 | 179 | # Installation 180 | 181 | ## Step 1: Download Till 182 | 183 | The recommended way to install DataHen Till is by downloading one of the [standalone binaries](https://github.com/DataHenHQ/till/releases) according to your OS. 184 | 185 | 186 | ## Step 2: Get your auth Token 187 | 188 | You need to get your auth token to run Till. 189 | 190 | Get your token for FREE by signing up for an account at [till.datahen.com](https://till.datahen.com). 191 | 192 | 193 | ## Step 3: Start Till 194 | 195 | Start the Till server with the following command, replacing `YOUR_AUTH_TOKEN` with the token from Step 2: 196 | ```bash 197 | $ till serve -t YOUR_AUTH_TOKEN 198 | ``` 199 | The above will start the proxy on [http://localhost:2933](http://localhost:2933) 200 | and the Till UI on [http://localhost:2980](http://localhost:2980). 201 | 202 | ![Request Log UI](img/request-log-ui.png) 203 | 204 | ## Step 4: Connect to Till 205 | 206 | You can connect your scraper to Till without many code changes. 207 | 208 | If you want to connect to Till using curl, this is how: 209 | 210 | 211 | ```bash 212 | $ curl -k --proxy http://localhost:2933 https://fetchtest.datahen.com/echo/request 213 | ``` 214 | 215 | 216 | 217 | # Certificate Authority (CA) Certificates 218 | Till decrypts and encrypts HTTPS traffic on the fly between your scraper and the target websites. 
In order to do so, your scraper (or browser) must be able to trust the built-in Certificate Authority (CA). This means the CA certificate that Till generates for you needs to be installed on the computer where the scraper is running. 219 | 220 | **Note:** If you do not wish to install the CA certificate, you can still have your scraper connect to the Till server by disabling/ignoring security checks in your scraper. Please refer to the documentation of the programming language/framework/tool that your scraper uses. 221 | 222 | ## Installing the generated CA certificates onto your computer 223 | The first time Till runs as a server, Till generates the CA certificates in the following directory: 224 | 225 | Linux or MacOS: 226 | ``` 227 | ~/.config/datahen/till/ 228 | ``` 229 | 230 | Windows (replace `YOUR_USERNAME` with your Windows user name): 231 | ``` 232 | C:\Users\YOUR_USERNAME\.config\datahen\till\ 233 | ``` 234 | Then, follow the instructions below to install the CA certificates: 235 | ### MacOS 236 | 237 | [Add certificates to a keychain using Keychain Access on Mac](https://support.apple.com/en-ca/guide/keychain-access/kyca2431/mac) 238 | 239 | ### Ubuntu/Debian 240 | [How do I install a root certificate](https://askubuntu.com/questions/73287/how-do-i-install-a-root-certificate/94861#94861) 241 | 242 | ### Mozilla Firefox 243 | [How to import the Mozilla Root Certificate into your Firefox web browser](https://wiki.mozilla.org/MozillaRootCertificate#Mozilla_Firefox) 244 | 245 | ### Chrome 246 | [Getting Chrome to accept self-signed localhost certificate](https://stackoverflow.com/questions/7580508/getting-chrome-to-accept-self-signed-localhost-certificate/15076602#15076602) 247 | 248 | ### Windows 249 | Use `certutil` with the following command, replacing `PATH_TO_CA_CERTIFICATE` with the path to the CA certificate file that Till generated: 250 | 251 | ``` 252 | certutil -addstore root PATH_TO_CA_CERTIFICATE 253 | ``` 254 | 255 | Read more about [certutil](https://web.archive.org/web/20160612045445/http://windows.microsoft.com/en-ca/windows/import-export-certificates-private-keys#1TC=windows-7) 256 | 257 | 258 | 259 | # Till Integrations 260 | 261 | 
## Python 262 | 263 | ### Scrapy 264 | The [Scrapy example](examples/python/scrapy/) demonstrates how to integrate Till with Python's [Scrapy framework](https://github.com/scrapy/scrapy). 265 | 266 | 267 | ## Node.js 268 | 269 | ### Plain 270 | The [Node.js example](examples/nodejs/plain/) demonstrates how to integrate Till with Node.js based scrapers. 271 | 272 | ### Puppeteer 273 | The [Puppeteer example](examples/nodejs/puppeteer/) demonstrates how to integrate Till with Puppeteer. 274 | 275 | ## Go 276 | 277 | ### net/http 278 | The [Go net/http example](examples/go/standard) demonstrates how to integrate Till with Go's net/http standard library. 279 | 280 | ### Colly 281 | The [Go Colly example](examples/go/colly) demonstrates how to integrate Till with [Colly](https://github.com/gocolly/colly). 282 | 283 | ## Ruby 284 | 285 | ### Kimurai 286 | The [Ruby's Kimurai framework example](examples/ruby/kimurai) demonstrates how to integrate Till with Ruby's [Kimurai framework](https://github.com/vifreefly/kimuraframework). 287 | -------------------------------------------------------------------------------- /server/templates/requests/index.html: -------------------------------------------------------------------------------- 1 | 2 | {{define "content"}} 3 |
4 | {{template "filter_list" .}} 5 | 6 | {{template "table_nav" .}} 7 | 8 |
9 |
10 |
11 | 12 | 13 | 14 | 17 | 20 | 23 | 26 | 29 | 32 | 33 | 36 | 39 | 40 | 41 | 42 | {{range $i, $item := .Items}} 43 | 44 | 45 | 51 | 52 | 55 | 64 | 67 | 70 | 73 | 80 | 81 | 82 | 90 | 95 | 96 | {{end}} 97 | 98 |
15 | STATUS 16 | 18 | Method 19 | 21 | Name 22 | 24 | GID 25 | 27 | SID 28 | 30 | CACHE 31 | 34 | Size 35 | 37 | Timestamp 38 |
46 | {{if $item.ResponseStatusCode.Valid }} 47 | {{$item.ResponseStatusCode.Int64}} 48 | ! 49 | {{end}} 50 | 53 | {{$item.RequestMethod}} 54 | 56 | {{if $item.ResponseStatusCode.Valid }} 57 | 58 | 59 | 60 | 61 | 62 | {{end}} 63 | 65 | {{printf "%.120s" (basepathPlusQOrHost $item.RequestURL)}} 66 | 68 | {{shortGID .Gid}} 69 | 71 | {{.SessionID.String}} 72 | 74 | {{if $item.CacheHit -}} 75 | HIT 76 | {{- else -}} 77 | MISS 78 | {{- end}} 79 | 83 | {{if $item.ResponseContentLength.Valid }} 84 | {{intToBytes $item.ResponseContentLength.Int64}} 85 | F 86 | 87 | T 88 | {{end}} 89 | 91 | {{(intToTime $item.Timestamp).Format "2006-01-02 15:04:05"}} 92 | F 93 | T 94 |
99 |
100 |
101 |
102 | {{template "table_nav" .}} 103 | 104 |
105 | {{end}} 106 | 107 | {{define "table_nav"}} 108 | 109 | 134 | 135 | {{end}} 136 | 137 | 138 | {{define "filter_list"}} 139 | {{if not .Filter.IsZero}} 140 |
141 |
FILTERS
142 |
|
143 | {{if .Filter.Gid }} 144 | 145 | GID: {{.Filter.Gid}} 146 | 147 | Remove 148 | 149 | 150 | 151 | 152 | 153 | {{end}} 154 | {{if .Filter.SessionID }} 155 | 156 | Session ID: {{.Filter.SessionID}} 157 | 158 | Remove 159 | 160 | 161 | 162 | 163 | 164 | {{end}} 165 | {{if .Filter.CacheHit }} 166 | 167 | Cache: {{if (boolval .Filter.CacheHit)}}HIT{{else}}MISS{{end}} 168 | 169 | Remove 170 | 171 | 172 | 173 | 174 | 175 | {{end}} 176 | {{if .Filter.RequestMethod }} 177 | 178 | Method: {{.Filter.RequestMethod}} 179 | 180 | Remove 181 | 182 | 183 | 184 | 185 | 186 | {{end}} 187 | {{if .Filter.ResponseStatusCode }} 188 | 189 | Status: {{.Filter.ResponseStatusCode}} 190 | 191 | Remove 192 | 193 | 194 | 195 | 196 | 197 | {{end}} 198 | 199 | {{if .Filter.FromResponseContentLength }} 200 | 201 | From Size: {{intToBytes .Filter.FromResponseContentLength}} 202 | 203 | Remove 204 | 205 | 206 | 207 | 208 | 209 | {{end}} 210 | {{if .Filter.ToResponseContentLength }} 211 | 212 | To Size: {{intToBytes .Filter.ToResponseContentLength}} 213 | 214 | Remove 215 | 216 | 217 | 218 | 219 | 220 | {{end}} 221 | {{if not .Filter.FromTime.IsZero }} 222 | 223 | From Time: {{.Filter.FromTime}} 224 | 225 | Remove 226 | 227 | 228 | 229 | 230 | 231 | {{end}} 232 | {{if not .Filter.ToTime.IsZero }} 233 | 234 | To Time: {{.Filter.ToTime}} 235 | 236 | Remove 237 | 238 | 239 | 240 | 241 | 242 | {{end}} 243 | 244 |
245 | {{end}} 246 | {{end}} --------------------------------------------------------------------------------