├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── config.go ├── config.yml ├── crawl.go ├── crawl_apache2_test.go ├── crawl_nginx_test.go ├── crawl_test.go ├── ds └── redblackhash │ └── redblack.go ├── errors.go ├── fasturl ├── url.go └── url_test.go ├── go.mod ├── go.sum ├── help.go ├── main.go ├── model.go ├── queue.go ├── release.sh ├── scheduler.go ├── server.go ├── stats.go ├── util.go └── worker.go /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea/ 2 | .DS_Store 3 | /od-database-crawler 4 | *.log 5 | /queue/ 6 | /crawled/ 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - "1.11.x" 5 | - master 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:alpine as builder 2 | ADD . /go/src/github.com/terorie/od-database-crawler 3 | RUN apk add git \ 4 | && go get -d -v github.com/terorie/od-database-crawler \ 5 | && CGO_ENABLED=0 go install -a \ 6 | -installsuffix cgo \ 7 | -ldflags="-s -w" \ 8 | github.com/terorie/od-database-crawler 9 | 10 | FROM scratch 11 | COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ 12 | COPY --from=builder /go/bin/od-database-crawler /bin/ 13 | WORKDIR /oddb 14 | VOLUME [ "/oddb" ] 15 | CMD ["/bin/od-database-crawler", "server"] 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 
32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. 
You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. 
You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. 
If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OD-Database Crawler 🕷 2 | [![Build Status](https://travis-ci.org/terorie/od-database-crawler.svg?branch=master)](https://travis-ci.org/terorie/od-database-crawler) 3 | [![](https://tokei.rs/b1/github/terorie/od-database-crawler)](https://github.com/terorie/od-database-crawler) 4 | [![CodeFactor](https://www.codefactor.io/repository/github/terorie/od-database-crawler/badge/master)](https://www.codefactor.io/repository/github/terorie/od-database-crawler/overview/master) 5 | 6 | * Crawler for [__OD-Database__](https://github.com/simon987/od-database) 7 | * In production at https://od-db.the-eye.eu/ 8 | * Over 880 TB actively crawled 9 | * Crawls HTTP open directories (standard Web Server Listings) 10 | * Gets name, path, size and modification time of all files 11 | * Lightweight and fast 12 | 13 | https://od-db.the-eye.eu/ 14 | 15 | ## Usage 16 | 17 | ### Deploys 18 | 19 | 1. With Config File (if `config.yml` found in working dir) 20 | - Download [default config](https://github.com/terorie/od-database-crawler/blob/master/config.yml) 21 | - Set `server.url` and `server.token` 22 | - Start with `./od-database-crawler server --config ` 23 | 24 | 2. With Flags or env 25 | - Override config file if it exists 26 | - `--help` for list of flags 27 | - Every flag is available as an environment variable: 28 | `--server.crawl_stats` ➡️ `OD_SERVER_CRAWL_STATS` 29 | - Start with `./od-database-crawler server ` 30 | 31 | 3. 
With Docker 32 | ```bash 33 | docker run \ 34 | -e OD_SERVER_URL=xxx \ 35 | -e OD_SERVER_TOKEN=xxx \ 36 | terorie/od-database-crawler 37 | ``` 38 | 39 | ### Flag reference 40 | 41 | Here are the most important config flags. For more fine control, take a look at `/config.yml`. 42 | 43 | | Flag/Environment | Description | Example | 44 | | ------------------------------------------------------- | ------------------------------------------------------------ | ----------------------------------- | 45 | | `server.url`
`OD_SERVER_URL` | OD-DB Server URL | `https://od-db.mine.the-eye.eu/api` | 46 | | `server.token`
`OD_SERVER_TOKEN` | OD-DB Server Access Token | _Ask Hexa **TM**_ | 47 | | `server.recheck`
`OD_SERVER_RECHECK` | Job Fetching Interval | `3s` | 48 | | `output.crawl_stats`
`OD_OUTPUT_CRAWL_STATS` | Crawl Stats Logging Interval (0 = disabled) | `500ms` | 49 | | `output.resource_stats`
`OD_OUTPUT_RESOURCE_STATS` | Resource Stats Logging Interval (0 = disabled) | `8s` | 50 | | `output.log`
`OD_OUTPUT_LOG` | Log File (none = disabled) | `crawler.log` | 51 | | `crawl.tasks`
`OD_CRAWL_TASKS` | Max number of sites to crawl concurrently | `500` | 52 | | `crawl.connections`
`OD_CRAWL_CONNECTIONS` | HTTP connections per site | `1` | 53 | | `crawl.retries`
`OD_CRAWL_RETRIES` | How often to retry after a temporary failure (e.g. `HTTP 429` or timeouts) | `5` | 54 | | `crawl.dial_timeout`
`OD_CRAWL_DIAL_TIMEOUT` | TCP Connect timeout | `5s` | 55 | | `crawl.timeout`
`OD_CRAWL_TIMEOUT` | HTTP request timeout | `20s` | 56 | | `crawl.user-agent`
`OD_CRAWL_USER_AGENT` | HTTP Crawler User-Agent | `googlebot/1.2.3` | 57 | | `crawl.job_buffer`
`OD_CRAWL_JOB_BUFFER` | Number of URLs to keep in memory/cache, per job. The rest is offloaded to disk. Decrease this value if the crawler uses too much RAM. (0 = Disable Cache, -1 = Only use Cache) | `5000` | 58 | -------------------------------------------------------------------------------- /config.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "github.com/sirupsen/logrus" 7 | "github.com/spf13/pflag" 8 | "github.com/spf13/viper" 9 | "io" 10 | "os" 11 | "strings" 12 | "time" 13 | ) 14 | 15 | var config struct { 16 | ServerUrl string 17 | Token string 18 | ServerTimeout time.Duration 19 | Recheck time.Duration 20 | ChunkSize int64 21 | Retries int 22 | Workers int 23 | UserAgent string 24 | Tasks int32 25 | Verbose bool 26 | PrintHTTP bool 27 | JobBufferSize int 28 | } 29 | 30 | var onlineMode bool 31 | 32 | const ( 33 | ConfServerUrl = "server.url" 34 | ConfToken = "server.token" 35 | ConfServerTimeout = "server.timeout" 36 | ConfRecheck = "server.recheck" 37 | ConfCooldown = "server.cooldown" 38 | ConfChunkSize = "server.upload_chunk" 39 | ConfUploadRetries = "server.upload_retries" 40 | ConfUploadRetryInterval = "server.upload_retry_interval" 41 | 42 | ConfTasks = "crawl.tasks" 43 | ConfRetries = "crawl.retries" 44 | ConfWorkers = "crawl.connections" 45 | ConfUserAgent = "crawl.user-agent" 46 | ConfDialTimeout = "crawl.dial_timeout" 47 | ConfTimeout = "crawl.timeout" 48 | ConfJobBufferSize = "crawl.job_buffer" 49 | 50 | ConfCrawlStats = "output.crawl_stats" 51 | ConfAllocStats = "output.resource_stats" 52 | ConfVerbose = "output.verbose" 53 | ConfPrintHTTP = "output.http" 54 | ConfLogFile = "output.log" 55 | ) 56 | 57 | func prepareConfig() { 58 | pf := rootCmd.PersistentFlags() 59 | 60 | pf.SortFlags = false 61 | pf.StringVar(&configFile, "config", "", "Config file") 62 | configFile = os.Getenv("OD_CONFIG") 63 | 64 | pf.String(ConfServerUrl, "http://od-db.the-eye.eu/api", "OD-DB server URL") 65 | 66 | pf.String(ConfToken, "", "OD-DB access token (env OD_SERVER_TOKEN)") 67 | 68 | pf.Duration(ConfServerTimeout, 60 * time.Second, "OD-DB request timeout") 69 | 70 | pf.Duration(ConfRecheck, 1 * time.Second, "OD-DB: Poll interval for new jobs") 71 | 72 | pf.Duration(ConfCooldown, 30 * time.Second, "OD-DB: Time to wait after a server-side error") 73 | 74 | pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size") 75 | 76 | pf.Uint(ConfUploadRetries, 10, "OD-DB: Max upload retries") 77 | 78 | pf.Duration(ConfUploadRetryInterval, 30 * time.Second, "OD-DB: Time to wait between upload retries") 79 | 80 | pf.Uint(ConfTasks, 100, "Crawler: Max concurrent tasks") 81 | 82 | pf.Uint(ConfWorkers, 4, "Crawler: Connections per server") 83 | 84 | pf.Uint(ConfRetries, 5, "Crawler: Request retries") 85 | 86 | pf.Duration(ConfDialTimeout, 10 * time.Second, "Crawler: Handshake timeout") 87 | 88 | pf.Duration(ConfTimeout, 30 * time.Second, "Crawler: Request timeout") 89 | 90 | pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent") 91 | 92 | pf.Uint(ConfJobBufferSize, 5000, "Crawler: Task queue cache size") 93 | 94 | pf.Duration(ConfCrawlStats, time.Second, "Log: Crawl stats interval") 95 | 96 | pf.Duration(ConfAllocStats, 10 * time.Second, "Log: Resource stats interval") 97 | 98 | pf.Bool(ConfVerbose, false, "Log: Print every listed dir") 99 | 100 | pf.Bool(ConfPrintHTTP, false, "Log: Print HTTP client errors") 101 | 102 | 
pf.String(ConfLogFile, "crawler.log", "Log file") 103 | 104 | // Bind all flags to Viper 105 | pf.VisitAll(func(flag *pflag.Flag) { 106 | s := flag.Name 107 | s = strings.TrimLeft(s, "-") 108 | 109 | if err := viper.BindPFlag(s, flag); err != nil { 110 | panic(err) 111 | } 112 | var envKey string 113 | envKey = strings.Replace(s, ".", "_", -1) 114 | envKey = strings.ToUpper(envKey) 115 | envKey = "OD_" + envKey 116 | if err := viper.BindEnv(s, envKey); err != nil { 117 | panic(err) 118 | } 119 | }) 120 | } 121 | 122 | func readConfig() { 123 | // If config.yml in working dir, use it 124 | if configFile == "" { 125 | _, err := os.Stat("config.yml") 126 | if err == nil { 127 | configFile = "config.yml" 128 | } 129 | } 130 | 131 | if configFile != "" { 132 | confF, err := os.Open(configFile) 133 | if err != nil { 134 | fmt.Fprintln(os.Stderr, err) 135 | os.Exit(1) 136 | } 137 | defer confF.Close() 138 | 139 | viper.SetConfigType("yml") 140 | err = viper.ReadConfig(confF) 141 | if err != nil { 142 | fmt.Fprintln(os.Stderr, err) 143 | os.Exit(1) 144 | } 145 | } 146 | 147 | if onlineMode { 148 | config.ServerUrl = viper.GetString(ConfServerUrl) 149 | if config.ServerUrl == "" { 150 | configMissing(ConfServerUrl) 151 | } 152 | config.ServerUrl = strings.TrimRight(config.ServerUrl, "/") 153 | 154 | config.Token = viper.GetString(ConfToken) 155 | if config.Token == "" { 156 | configMissing(ConfToken) 157 | } 158 | } 159 | 160 | config.ServerTimeout = viper.GetDuration(ConfServerTimeout) 161 | 162 | config.Recheck = viper.GetDuration(ConfRecheck) 163 | 164 | config.ChunkSize = int64(viper.GetSizeInBytes(ConfChunkSize)) 165 | if config.ChunkSize < 100 { 166 | configOOB(ConfChunkSize, config.ChunkSize) 167 | } 168 | 169 | config.Retries = viper.GetInt(ConfRetries) 170 | if config.Retries < 0 { 171 | config.Retries = 1 << 31 172 | } 173 | 174 | config.Workers = viper.GetInt(ConfWorkers) 175 | if config.Workers <= 0 { 176 | configOOB(ConfWorkers, config.Workers) 177 | } 178 | 179 | config.Tasks = viper.GetInt32(ConfTasks) 180 | if config.Tasks <= 0 { 181 | configOOB(ConfTasks, int(config.Tasks)) 182 | } 183 | 184 | config.UserAgent = viper.GetString(ConfUserAgent) 185 | 186 | setDialTimeout(viper.GetDuration(ConfDialTimeout)) 187 | 188 | setTimeout(viper.GetDuration(ConfTimeout)) 189 | 190 | config.JobBufferSize = viper.GetInt(ConfJobBufferSize) 191 | 192 | config.Verbose = viper.GetBool(ConfVerbose) 193 | if config.Verbose { 194 | logrus.SetLevel(logrus.DebugLevel) 195 | } 196 | 197 | if filePath := viper.GetString(ConfLogFile); filePath != "" { 198 | f, err := os.OpenFile(filePath, os.O_CREATE | os.O_WRONLY | os.O_APPEND, 0644) 199 | bufWriter := bufio.NewWriter(f) 200 | if err != nil { panic(err) } 201 | exitHooks.Add(func() { 202 | bufWriter.Flush() 203 | f.Close() 204 | }) 205 | logrus.SetOutput(io.MultiWriter(os.Stdout, bufWriter)) 206 | } 207 | 208 | config.PrintHTTP = viper.GetBool(ConfPrintHTTP) 209 | } 210 | 211 | func configMissing(key string) { 212 | fmt.Fprintf(os.Stderr, "config: %s not set!\n", key) 213 | os.Exit(1) 214 | } 215 | 216 | func configOOB(key string, v interface{}) { 217 | fmt.Fprintf(os.Stderr, "config: illegal value %v for key %s!\n", v, key) 218 | os.Exit(1) 219 | } 220 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | # OD-Database server settings 2 | server: 3 | # Connection URL 4 | url: http://od-db.mine.terorie.com/api 5 | 6 | # Server auth 
token 7 | token: 8 | 9 | # Request timeout 10 | timeout: 60s 11 | 12 | # Recheck interval 13 | # The crawler periodically asks the server 14 | # for new jobs. Sets the minimum wait time 15 | # between /task/get requests to the server. 16 | recheck: 1s 17 | 18 | # Time to wait after receiving an error 19 | # from the server. Doesn't apply to uploads. 20 | cooldown: 30s 21 | 22 | # Upload chunk size 23 | # If the value is too high, the upload fails. 24 | upload_chunk: 1 MB 25 | 26 | upload_retries: 10 27 | upload_retry_interval: 30s 28 | 29 | # Log output settings 30 | output: 31 | # Crawl statistics 32 | crawl_stats: 1s 33 | 34 | # CPU/RAM/Job queue stats 35 | resource_stats: 10s 36 | 37 | # More output? (Every listed dir) 38 | verbose: false 39 | 40 | # Print HTTP errors (Super spammy) 41 | http: false 42 | 43 | # Log file 44 | # If empty, no log file is created. 45 | log: crawler.log 46 | 47 | # Crawler settings 48 | crawl: 49 | # Number of sites that can be processed at once 50 | tasks: 25 51 | 52 | # Number of connections per site 53 | # Please be careful with this setting! 54 | # The crawler fires fast and more than 55 | # ten connections can overwhelm a server. 56 | connections: 1 57 | 58 | # How often to retry getting data 59 | # from the site before giving up 60 | retries: 5 61 | 62 | # Time before discarding a failed connection attempt 63 | dial_timeout: 10s 64 | 65 | # Time before discarding a network request 66 | timeout: 30s 67 | 68 | # Crawler User-Agent 69 | # If empty, no User-Agent header is sent. 70 | user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0" 71 | 72 | # Job buffer size (per task) 73 | # Higher values cause less disk writes 74 | # but require more memory. 75 | # 76 | # The job queue contains all URLs 77 | # that should be crawled next. 78 | # As it grows very large over time, 79 | # it's kept mainly on disk. 80 | # This sets how many jobs are kept 81 | # in memory. 82 | # A negative value will cause all jobs 83 | # to be stored in memory. 
(Don't do this) 84 | job_buffer: -1 85 | -------------------------------------------------------------------------------- /crawl.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "crypto/tls" 6 | "github.com/terorie/od-database-crawler/ds/redblackhash" 7 | "github.com/terorie/od-database-crawler/fasturl" 8 | "github.com/valyala/fasthttp" 9 | "golang.org/x/crypto/blake2b" 10 | "golang.org/x/net/html" 11 | "net" 12 | "path" 13 | "strconv" 14 | "strings" 15 | "time" 16 | ) 17 | 18 | var client = fasthttp.Client { 19 | TLSConfig: &tls.Config{ 20 | InsecureSkipVerify: true, 21 | }, 22 | } 23 | 24 | func setDialTimeout(d time.Duration) { 25 | client.Dial = func(addr string) (net.Conn, error) { 26 | return fasthttp.DialTimeout(addr, d) 27 | } 28 | } 29 | 30 | func setTimeout(d time.Duration) { 31 | client.ReadTimeout = d 32 | client.WriteTimeout = d / 2 33 | } 34 | 35 | func GetDir(j *Job, f *File) (links []fasturl.URL, err error) { 36 | f.IsDir = true 37 | f.Name = path.Base(j.Uri.Path) 38 | 39 | req := fasthttp.AcquireRequest() 40 | if config.UserAgent != "" { 41 | req.Header.SetUserAgent(config.UserAgent) 42 | } 43 | req.SetRequestURI(j.UriStr) 44 | 45 | res := fasthttp.AcquireResponse() 46 | defer fasthttp.ReleaseResponse(res) 47 | 48 | err = client.Do(req, res) 49 | fasthttp.ReleaseRequest(req) 50 | 51 | if err != nil { 52 | return 53 | } 54 | 55 | err = checkStatusCode(res.StatusCode()) 56 | if err != nil { 57 | return 58 | } 59 | 60 | body := res.Body() 61 | return ParseDir(body, &j.Uri) 62 | } 63 | 64 | func ParseDir(body []byte, baseUri *fasturl.URL) (links []fasturl.URL, err error) { 65 | doc := html.NewTokenizer(bytes.NewReader(body)) 66 | 67 | var linkHref string 68 | for { 69 | err = nil 70 | 71 | tokenType := doc.Next() 72 | if tokenType == html.ErrorToken { 73 | break 74 | } 75 | 76 | switch tokenType { 77 | case html.StartTagToken: 78 | name, hasAttr := doc.TagName() 79 | if len(name) == 1 && name[0] == 'a' { 80 | for hasAttr { 81 | var ks, vs []byte 82 | ks, vs, hasAttr = doc.TagAttr() 83 | if bytes.Equal(ks, []byte("href")) { 84 | // TODO Check escape 85 | linkHref = string(vs) 86 | break 87 | } 88 | } 89 | } 90 | 91 | case html.EndTagToken: 92 | name, _ := doc.TagName() 93 | if len(name) == 1 && name[0] == 'a' { 94 | // Copy params 95 | href := linkHref 96 | 97 | // Reset params 98 | linkHref = "" 99 | 100 | if strings.LastIndexByte(href, '?') != -1 { 101 | continue 102 | } 103 | 104 | switch href { 105 | case "", " ", ".", "..", "/": 106 | continue 107 | } 108 | 109 | if strings.Contains(href, "../") { 110 | continue 111 | } 112 | 113 | var link fasturl.URL 114 | err = baseUri.ParseRel(&link, href) 115 | if err != nil { 116 | continue 117 | } 118 | 119 | if link.Scheme != baseUri.Scheme || 120 | link.Host != baseUri.Host || 121 | link.Path == baseUri.Path || 122 | !strings.HasPrefix(link.Path, baseUri.Path) { 123 | continue 124 | } 125 | 126 | links = append(links, link) 127 | } 128 | } 129 | } 130 | 131 | return 132 | } 133 | 134 | func GetFile(u fasturl.URL, f *File) (err error) { 135 | f.IsDir = false 136 | u.Path = path.Clean(u.Path) 137 | f.Name = path.Base(u.Path) 138 | f.Path = strings.Trim(path.Dir(u.Path), "/") 139 | 140 | req := fasthttp.AcquireRequest() 141 | req.Header.SetMethod("HEAD") 142 | if config.UserAgent != "" { 143 | req.Header.SetUserAgent(config.UserAgent) 144 | } 145 | req.SetRequestURI(u.String()) 146 | 147 | res := fasthttp.AcquireResponse() 148 | res.SkipBody = true 
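	// HEAD responses carry no body; SkipBody keeps fasthttp from trying to read one.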
149 | defer fasthttp.ReleaseResponse(res) 150 | 151 | err = client.Do(req, res) 152 | fasthttp.ReleaseRequest(req) 153 | 154 | if err != nil { 155 | return 156 | } 157 | 158 | err = checkStatusCode(res.StatusCode()) 159 | if err != nil { 160 | return 161 | } 162 | 163 | f.applyContentLength(string(res.Header.Peek("content-length"))) 164 | f.applyLastModified(string(res.Header.Peek("last-modified"))) 165 | 166 | return nil 167 | } 168 | 169 | func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) { 170 | h, _ := blake2b.New256(nil) 171 | h.Write([]byte(f.Name)) 172 | for _, link := range links { 173 | fileName := path.Base(link.Path) 174 | h.Write([]byte(fileName)) 175 | } 176 | sum := h.Sum(nil) 177 | copy(o[:redblackhash.KeySize], sum) 178 | return 179 | } 180 | 181 | func (f *File) applyContentLength(v string) { 182 | if v == "" { 183 | return 184 | } 185 | size, err := strconv.ParseInt(v, 10, 64) 186 | if err != nil { 187 | return 188 | } 189 | if size < 0 { 190 | return 191 | } 192 | f.Size = size 193 | } 194 | 195 | // TODO Cleanup 196 | func (f *File) applyLastModified(v string) { 197 | if v == "" { 198 | return 199 | } 200 | var t time.Time 201 | var err error 202 | t, err = time.Parse(time.RFC1123, v) 203 | if err == nil { 204 | f.MTime = t.Unix() 205 | return 206 | } 207 | t, err = time.Parse(time.RFC850, v) 208 | if err == nil { 209 | f.MTime = t.Unix() 210 | return 211 | } 212 | // TODO Parse asctime 213 | t, err = time.Parse("2006-01-02", v[:10]) 214 | if err == nil { 215 | f.MTime = t.Unix() 216 | return 217 | } 218 | } 219 | 220 | func checkStatusCode(status int) error { 221 | switch status { 222 | case fasthttp.StatusOK: 223 | return nil 224 | default: 225 | return &HttpError{status} 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /crawl_nginx_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/terorie/od-database-crawler/fasturl" 5 | "testing" 6 | ) 7 | 8 | func TestParseDirNginx(t *testing.T) { 9 | var u fasturl.URL 10 | err := u.Parse("https://the-eye.eu/public/") 11 | if err != nil { 12 | t.Fatal("Failed to parse URL", err) 13 | } 14 | 15 | links, err := ParseDir([]byte(nginxListing), &u) 16 | if err != nil { 17 | t.Fatal("Failed to extract links", err) 18 | } 19 | 20 | if len(links) != len(nginxLinks) { 21 | t.Fatalf("Expected %d links, got %d", 22 | len(nginxLinks), len(links)) 23 | } 24 | 25 | for i := 0; i < len(links); i++ { 26 | gotLink := links[i].String() 27 | expLink := nginxLinks[i] 28 | 29 | if gotLink != expLink { 30 | t.Errorf(`Expected "%s" got "%s"`, 31 | expLink, gotLink) 32 | } 33 | } 34 | } 35 | 36 | var nginxLinks = []string { 37 | "https://the-eye.eu/public/AppleArchive/", 38 | "https://the-eye.eu/public/AudioBooks/", 39 | "https://the-eye.eu/public/Books/", 40 | "https://the-eye.eu/public/Comics/", 41 | "https://the-eye.eu/public/Games/", 42 | "https://the-eye.eu/public/Icons/", 43 | "https://the-eye.eu/public/Images/", 44 | "https://the-eye.eu/public/JFK_Files/", 45 | "https://the-eye.eu/public/MSDN/", 46 | "https://the-eye.eu/public/Music/", 47 | "https://the-eye.eu/public/Operating%20Systems/", 48 | "https://the-eye.eu/public/Posters/", 49 | "https://the-eye.eu/public/Psychedelics/", 50 | "https://the-eye.eu/public/Psychoactives/", 51 | "https://the-eye.eu/public/Radio/", 52 | "https://the-eye.eu/public/Random/", 53 | "https://the-eye.eu/public/Site-Dumps/", 54 | 
"https://the-eye.eu/public/Software/", 55 | "https://the-eye.eu/public/Strategic%20Intelligence%20Network/", 56 | "https://the-eye.eu/public/WorldTracker.org/", 57 | "https://the-eye.eu/public/concen.org/", 58 | "https://the-eye.eu/public/freenrg.info/", 59 | "https://the-eye.eu/public/murdercube.com/", 60 | "https://the-eye.eu/public/parazite/", 61 | "https://the-eye.eu/public/ripreddit/", 62 | "https://the-eye.eu/public/rom/", 63 | "https://the-eye.eu/public/touhou/", 64 | "https://the-eye.eu/public/vns/", 65 | "https://the-eye.eu/public/xbins/", 66 | "https://the-eye.eu/public/xbins.diodematrix/", 67 | "https://the-eye.eu/public/Rclone_for_Scrubs.pdf", 68 | "https://the-eye.eu/public/Wget_Linux_Guide.pdf", 69 | "https://the-eye.eu/public/Wget_Windows_Guide.pdf", 70 | "https://the-eye.eu/public/rclone_guide.pdf", 71 | "https://the-eye.eu/public/wget-noobs-guide.pdf", 72 | "https://the-eye.eu/public/xbox-scene_Aug2014.7z", 73 | } 74 | 75 | const nginxListing = 76 | ` 77 | Index of /public/ 78 | 79 |

Index of /public/


../
 80 | AppleArchive/                                      03-Nov-2017 18:13       -
 81 | AudioBooks/                                        29-Sep-2018 19:47       -
 82 | Books/                                             27-Nov-2018 17:50       -
 83 | Comics/                                            05-Nov-2018 21:37       -
 84 | Games/                                             28-Nov-2018 11:54       -
 85 | Icons/                                             22-May-2018 07:47       -
 86 | Images/                                            21-Jan-2018 03:21       -
 87 | JFK_Files/                                         03-Nov-2017 17:03       -
 88 | MSDN/                                              03-Nov-2017 15:48       -
 89 | Music/                                             02-Mar-2018 15:47       -
 90 | Operating Systems/                                 25-Apr-2018 07:18       -
 91 | Posters/                                           07-Jul-2018 01:12       -
 92 | Psychedelics/                                      11-Apr-2018 05:45       -
 93 | Psychoactives/                                     18-May-2018 02:58       -
 94 | Radio/                                             09-Jun-2018 15:49       -
 95 | Random/                                            04-Dec-2018 12:33       -
 96 | Site-Dumps/                                        15-Dec-2018 11:04       -
 97 | Software/                                          27-Nov-2017 00:22       -
 98 | Strategic Intelligence Network/                    17-Nov-2017 16:35       -
 99 | WorldTracker.org/                                  12-Apr-2018 04:16       -
100 | concen.org/                                        08-Oct-2018 14:08       -
101 | freenrg.info/                                      19-Dec-2017 10:59       -
102 | murdercube.com/                                    06-Dec-2017 10:45       -
103 | parazite/                                          20-Nov-2017 21:25       -
104 | ripreddit/                                         04-Aug-2018 14:30       -
105 | rom/                                               28-Nov-2018 14:15       -
106 | touhou/                                            03-Nov-2017 11:07       -
107 | vns/                                               03-Nov-2017 11:36       -
108 | xbins/                                             03-Nov-2017 17:23       -
109 | xbins.diodematrix/                                 21-Sep-2018 22:33       -
110 | Rclone_for_Scrubs.pdf                              04-Sep-2018 13:31    315K
111 | Wget_Linux_Guide.pdf                               21-Dec-2017 20:28    168K
112 | Wget_Windows_Guide.pdf                             25-Nov-2017 17:59    867K
113 | rclone_guide.pdf                                   03-Sep-2018 23:37    315K
114 | wget-noobs-guide.pdf                               21-Dec-2017 20:29    168K
115 | xbox-scene_Aug2014.7z                              26-Oct-2017 23:09      1G
116 | 

117 | ` 118 | -------------------------------------------------------------------------------- /crawl_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "github.com/PuerkitoBio/goquery" 6 | "github.com/terorie/od-database-crawler/fasturl" 7 | "net/url" 8 | "strings" 9 | "testing" 10 | ) 11 | 12 | func BenchmarkParseDir(b *testing.B) { 13 | for n := 0; n < b.N; n++ { 14 | var u fasturl.URL 15 | err := u.Parse("http://archive.ubuntu.com/ubuntu/indices/") 16 | if err != nil { 17 | b.Fatal("Failed to parse URL", err) 18 | } 19 | 20 | _, err = ParseDir([]byte(apache2Listing), &u) 21 | if err != nil { 22 | b.Fatal("Failed to extract links", err) 23 | } 24 | } 25 | } 26 | 27 | func BenchmarkParseDirReference(b *testing.B) { 28 | for n := 0; n < b.N; n++ { 29 | u, err := url.Parse("http://archive.ubuntu.com/ubuntu/indices/") 30 | if err != nil { 31 | b.Fatal("Failed to parse URL", err) 32 | } 33 | 34 | _, err = referenceParseDir([]byte(apache2Listing), u) 35 | if err != nil { 36 | b.Fatal("Failed to extract links", err) 37 | } 38 | } 39 | } 40 | 41 | func referenceParseDir(body []byte, baseUri *url.URL) (links []*url.URL, err error) { 42 | doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) 43 | if err != nil { return nil, err } 44 | 45 | doc.Find("a[href]").Each(func(i int, s *goquery.Selection) { 46 | href, _ := s.Attr("href") 47 | 48 | sub, err := baseUri.Parse(href) 49 | if err != nil { return } // continue 50 | 51 | if !strings.HasPrefix(sub.String(), baseUri.String()) { 52 | return // continue 53 | } 54 | 55 | links = append(links, sub) 56 | }) 57 | 58 | return 59 | } 60 | -------------------------------------------------------------------------------- /ds/redblackhash/redblack.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2015, Emir Pasic. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Modifications by terorie 6 | 7 | // Package redblacktree implements a red-black tree. 8 | // 9 | // Used by TreeSet and TreeMap. 10 | // 11 | // Structure is not thread safe. 12 | // 13 | // References: http://en.wikipedia.org/wiki/Red%E2%80%93black_tree 14 | package redblackhash 15 | 16 | import ( 17 | "bytes" 18 | "fmt" 19 | "sync" 20 | ) 21 | 22 | const ( 23 | black, red color = true, false 24 | KeySize = 64 25 | ) 26 | 27 | type color bool 28 | type Key [KeySize]byte 29 | 30 | // Tree holds elements of the red-black tree 31 | type Tree struct { 32 | sync.Mutex 33 | Root *Node 34 | size int 35 | } 36 | 37 | // Node is a single element within the tree 38 | type Node struct { 39 | Key Key 40 | color color 41 | Left *Node 42 | Right *Node 43 | Parent *Node 44 | } 45 | 46 | func (k *Key) Compare(o *Key) int { 47 | return bytes.Compare(k[:], o[:]) 48 | } 49 | 50 | // Put inserts node into the tree. 51 | // Key should adhere to the comparator's type assertion, otherwise method panics. 
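// (In this fork the key is a fixed 64-byte hash compared with bytes.Compare, so no type assertion is involved.)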
52 | func (tree *Tree) Put(key *Key) { 53 | var insertedNode *Node 54 | if tree.Root == nil { 55 | // Assert key is of comparator's type for initial tree 56 | tree.Root = &Node{Key: *key, color: red} 57 | insertedNode = tree.Root 58 | } else { 59 | node := tree.Root 60 | loop := true 61 | for loop { 62 | compare := key.Compare(&node.Key) 63 | switch { 64 | case compare == 0: 65 | node.Key = *key 66 | return 67 | case compare < 0: 68 | if node.Left == nil { 69 | node.Left = &Node{Key: *key, color: red} 70 | insertedNode = node.Left 71 | loop = false 72 | } else { 73 | node = node.Left 74 | } 75 | case compare > 0: 76 | if node.Right == nil { 77 | node.Right = &Node{Key: *key, color: red} 78 | insertedNode = node.Right 79 | loop = false 80 | } else { 81 | node = node.Right 82 | } 83 | } 84 | } 85 | insertedNode.Parent = node 86 | } 87 | tree.insertCase1(insertedNode) 88 | tree.size++ 89 | } 90 | 91 | // Get searches the node in the tree by key and returns its value or nil if key is not found in tree. 92 | // Second return parameter is true if key was found, otherwise false. 93 | // Key should adhere to the comparator's type assertion, otherwise method panics. 94 | func (tree *Tree) Get(key *Key) (found bool) { 95 | node := tree.lookup(key) 96 | return node != nil 97 | } 98 | 99 | // Remove remove the node from the tree by key. 100 | // Key should adhere to the comparator's type assertion, otherwise method panics. 101 | func (tree *Tree) Remove(key *Key) { 102 | var child *Node 103 | node := tree.lookup(key) 104 | if node == nil { 105 | return 106 | } 107 | if node.Left != nil && node.Right != nil { 108 | pred := node.Left.maximumNode() 109 | node.Key = pred.Key 110 | node = pred 111 | } 112 | if node.Left == nil || node.Right == nil { 113 | if node.Right == nil { 114 | child = node.Left 115 | } else { 116 | child = node.Right 117 | } 118 | if node.color == black { 119 | node.color = nodeColor(child) 120 | tree.deleteCase1(node) 121 | } 122 | tree.replaceNode(node, child) 123 | if node.Parent == nil && child != nil { 124 | child.color = black 125 | } 126 | } 127 | tree.size-- 128 | } 129 | 130 | // Empty returns true if tree does not contain any nodes 131 | func (tree *Tree) Empty() bool { 132 | return tree.size == 0 133 | } 134 | 135 | // Size returns number of nodes in the tree. 136 | func (tree *Tree) Size() int { 137 | return tree.size 138 | } 139 | 140 | // Left returns the left-most (min) node or nil if tree is empty. 141 | func (tree *Tree) Left() *Node { 142 | var parent *Node 143 | current := tree.Root 144 | for current != nil { 145 | parent = current 146 | current = current.Left 147 | } 148 | return parent 149 | } 150 | 151 | // Right returns the right-most (max) node or nil if tree is empty. 152 | func (tree *Tree) Right() *Node { 153 | var parent *Node 154 | current := tree.Root 155 | for current != nil { 156 | parent = current 157 | current = current.Right 158 | } 159 | return parent 160 | } 161 | 162 | // Floor Finds floor node of the input key, return the floor node or nil if no floor is found. 163 | // Second return parameter is true if floor was found, otherwise false. 164 | // 165 | // Floor node is defined as the largest node that is smaller than or equal to the given node. 166 | // A floor node may not be found, either because the tree is empty, or because 167 | // all nodes in the tree are larger than the given node. 168 | // 169 | // Key should adhere to the comparator's type assertion, otherwise method panics. 
170 | func (tree *Tree) Floor(key *Key) (floor *Node, found bool) { 171 | found = false 172 | node := tree.Root 173 | for node != nil { 174 | compare := key.Compare(&node.Key) 175 | switch { 176 | case compare == 0: 177 | return node, true 178 | case compare < 0: 179 | node = node.Left 180 | case compare > 0: 181 | floor, found = node, true 182 | node = node.Right 183 | } 184 | } 185 | if found { 186 | return floor, true 187 | } 188 | return nil, false 189 | } 190 | 191 | // Ceiling finds ceiling node of the input key, return the ceiling node or nil if no ceiling is found. 192 | // Second return parameter is true if ceiling was found, otherwise false. 193 | // 194 | // Ceiling node is defined as the smallest node that is larger than or equal to the given node. 195 | // A ceiling node may not be found, either because the tree is empty, or because 196 | // all nodes in the tree are smaller than the given node. 197 | // 198 | // Key should adhere to the comparator's type assertion, otherwise method panics. 199 | func (tree *Tree) Ceiling(key *Key) (ceiling *Node, found bool) { 200 | found = false 201 | node := tree.Root 202 | for node != nil { 203 | compare := key.Compare(&node.Key) 204 | switch { 205 | case compare == 0: 206 | return node, true 207 | case compare < 0: 208 | ceiling, found = node, true 209 | node = node.Left 210 | case compare > 0: 211 | node = node.Right 212 | } 213 | } 214 | if found { 215 | return ceiling, true 216 | } 217 | return nil, false 218 | } 219 | 220 | // Clear removes all nodes from the tree. 221 | func (tree *Tree) Clear() { 222 | tree.Root = nil 223 | tree.size = 0 224 | } 225 | 226 | // String returns a string representation of container 227 | func (tree *Tree) String() string { 228 | str := "RedBlackTree\n" 229 | if !tree.Empty() { 230 | output(tree.Root, "", true, &str) 231 | } 232 | return str 233 | } 234 | 235 | func (node *Node) String() string { 236 | return fmt.Sprintf("%v", node.Key) 237 | } 238 | 239 | func output(node *Node, prefix string, isTail bool, str *string) { 240 | if node.Right != nil { 241 | newPrefix := prefix 242 | if isTail { 243 | newPrefix += "│ " 244 | } else { 245 | newPrefix += " " 246 | } 247 | output(node.Right, newPrefix, false, str) 248 | } 249 | *str += prefix 250 | if isTail { 251 | *str += "└── " 252 | } else { 253 | *str += "┌── " 254 | } 255 | *str += node.String() + "\n" 256 | if node.Left != nil { 257 | newPrefix := prefix 258 | if isTail { 259 | newPrefix += " " 260 | } else { 261 | newPrefix += "│ " 262 | } 263 | output(node.Left, newPrefix, true, str) 264 | } 265 | } 266 | 267 | func (tree *Tree) lookup(key *Key) *Node { 268 | node := tree.Root 269 | for node != nil { 270 | compare := key.Compare(&node.Key) 271 | switch { 272 | case compare == 0: 273 | return node 274 | case compare < 0: 275 | node = node.Left 276 | case compare > 0: 277 | node = node.Right 278 | } 279 | } 280 | return nil 281 | } 282 | 283 | func (node *Node) grandparent() *Node { 284 | if node != nil && node.Parent != nil { 285 | return node.Parent.Parent 286 | } 287 | return nil 288 | } 289 | 290 | func (node *Node) uncle() *Node { 291 | if node == nil || node.Parent == nil || node.Parent.Parent == nil { 292 | return nil 293 | } 294 | return node.Parent.sibling() 295 | } 296 | 297 | func (node *Node) sibling() *Node { 298 | if node == nil || node.Parent == nil { 299 | return nil 300 | } 301 | if node == node.Parent.Left { 302 | return node.Parent.Right 303 | } 304 | return node.Parent.Left 305 | } 306 | 307 | func (tree *Tree) rotateLeft(node 
*Node) { 308 | right := node.Right 309 | tree.replaceNode(node, right) 310 | node.Right = right.Left 311 | if right.Left != nil { 312 | right.Left.Parent = node 313 | } 314 | right.Left = node 315 | node.Parent = right 316 | } 317 | 318 | func (tree *Tree) rotateRight(node *Node) { 319 | left := node.Left 320 | tree.replaceNode(node, left) 321 | node.Left = left.Right 322 | if left.Right != nil { 323 | left.Right.Parent = node 324 | } 325 | left.Right = node 326 | node.Parent = left 327 | } 328 | 329 | func (tree *Tree) replaceNode(old *Node, new *Node) { 330 | if old.Parent == nil { 331 | tree.Root = new 332 | } else { 333 | if old == old.Parent.Left { 334 | old.Parent.Left = new 335 | } else { 336 | old.Parent.Right = new 337 | } 338 | } 339 | if new != nil { 340 | new.Parent = old.Parent 341 | } 342 | } 343 | 344 | func (tree *Tree) insertCase1(node *Node) { 345 | if node.Parent == nil { 346 | node.color = black 347 | } else { 348 | tree.insertCase2(node) 349 | } 350 | } 351 | 352 | func (tree *Tree) insertCase2(node *Node) { 353 | if nodeColor(node.Parent) == black { 354 | return 355 | } 356 | tree.insertCase3(node) 357 | } 358 | 359 | func (tree *Tree) insertCase3(node *Node) { 360 | uncle := node.uncle() 361 | if nodeColor(uncle) == red { 362 | node.Parent.color = black 363 | uncle.color = black 364 | node.grandparent().color = red 365 | tree.insertCase1(node.grandparent()) 366 | } else { 367 | tree.insertCase4(node) 368 | } 369 | } 370 | 371 | func (tree *Tree) insertCase4(node *Node) { 372 | grandparent := node.grandparent() 373 | if node == node.Parent.Right && node.Parent == grandparent.Left { 374 | tree.rotateLeft(node.Parent) 375 | node = node.Left 376 | } else if node == node.Parent.Left && node.Parent == grandparent.Right { 377 | tree.rotateRight(node.Parent) 378 | node = node.Right 379 | } 380 | tree.insertCase5(node) 381 | } 382 | 383 | func (tree *Tree) insertCase5(node *Node) { 384 | node.Parent.color = black 385 | grandparent := node.grandparent() 386 | grandparent.color = red 387 | if node == node.Parent.Left && node.Parent == grandparent.Left { 388 | tree.rotateRight(grandparent) 389 | } else if node == node.Parent.Right && node.Parent == grandparent.Right { 390 | tree.rotateLeft(grandparent) 391 | } 392 | } 393 | 394 | func (node *Node) maximumNode() *Node { 395 | if node == nil { 396 | return nil 397 | } 398 | for node.Right != nil { 399 | node = node.Right 400 | } 401 | return node 402 | } 403 | 404 | func (tree *Tree) deleteCase1(node *Node) { 405 | if node.Parent == nil { 406 | return 407 | } 408 | tree.deleteCase2(node) 409 | } 410 | 411 | func (tree *Tree) deleteCase2(node *Node) { 412 | sibling := node.sibling() 413 | if nodeColor(sibling) == red { 414 | node.Parent.color = red 415 | sibling.color = black 416 | if node == node.Parent.Left { 417 | tree.rotateLeft(node.Parent) 418 | } else { 419 | tree.rotateRight(node.Parent) 420 | } 421 | } 422 | tree.deleteCase3(node) 423 | } 424 | 425 | func (tree *Tree) deleteCase3(node *Node) { 426 | sibling := node.sibling() 427 | if nodeColor(node.Parent) == black && 428 | nodeColor(sibling) == black && 429 | nodeColor(sibling.Left) == black && 430 | nodeColor(sibling.Right) == black { 431 | sibling.color = red 432 | tree.deleteCase1(node.Parent) 433 | } else { 434 | tree.deleteCase4(node) 435 | } 436 | } 437 | 438 | func (tree *Tree) deleteCase4(node *Node) { 439 | sibling := node.sibling() 440 | if nodeColor(node.Parent) == red && 441 | nodeColor(sibling) == black && 442 | nodeColor(sibling.Left) == black && 443 | 
nodeColor(sibling.Right) == black { 444 | sibling.color = red 445 | node.Parent.color = black 446 | } else { 447 | tree.deleteCase5(node) 448 | } 449 | } 450 | 451 | func (tree *Tree) deleteCase5(node *Node) { 452 | sibling := node.sibling() 453 | if node == node.Parent.Left && 454 | nodeColor(sibling) == black && 455 | nodeColor(sibling.Left) == red && 456 | nodeColor(sibling.Right) == black { 457 | sibling.color = red 458 | sibling.Left.color = black 459 | tree.rotateRight(sibling) 460 | } else if node == node.Parent.Right && 461 | nodeColor(sibling) == black && 462 | nodeColor(sibling.Right) == red && 463 | nodeColor(sibling.Left) == black { 464 | sibling.color = red 465 | sibling.Right.color = black 466 | tree.rotateLeft(sibling) 467 | } 468 | tree.deleteCase6(node) 469 | } 470 | 471 | func (tree *Tree) deleteCase6(node *Node) { 472 | sibling := node.sibling() 473 | sibling.color = nodeColor(node.Parent) 474 | node.Parent.color = black 475 | if node == node.Parent.Left && nodeColor(sibling.Right) == red { 476 | sibling.Right.color = black 477 | tree.rotateLeft(node.Parent) 478 | } else if nodeColor(sibling.Left) == red { 479 | sibling.Left.color = black 480 | tree.rotateRight(node.Parent) 481 | } 482 | } 483 | 484 | func nodeColor(node *Node) color { 485 | if node == nil { 486 | return black 487 | } 488 | return node.color 489 | } 490 | -------------------------------------------------------------------------------- /errors.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "github.com/valyala/fasthttp" 7 | "net" 8 | ) 9 | 10 | var ErrRateLimit = errors.New("too many requests") 11 | var ErrKnown = errors.New("already crawled") 12 | 13 | type HttpError struct { 14 | code int 15 | } 16 | 17 | func (e HttpError) Error() string { 18 | return fmt.Sprintf("http status %d", e.code) 19 | } 20 | 21 | func shouldRetry(err error) bool { 22 | // HTTP errors 23 | if httpErr, ok := err.(*HttpError); ok { 24 | switch httpErr.code { 25 | case fasthttp.StatusTooManyRequests: 26 | return true 27 | default: 28 | // Don't retry HTTP error codes 29 | return false 30 | } 31 | } 32 | 33 | if dnsError, ok := err.(*net.DNSError); ok { 34 | // Don't retry permanent DNS errors 35 | return dnsError.IsTemporary 36 | } 37 | 38 | if netErr, ok := err.(*net.OpError); ok { 39 | // Don't retry permanent network errors 40 | return netErr.Temporary() 41 | } 42 | 43 | // Retry by default 44 | return true 45 | } 46 | -------------------------------------------------------------------------------- /fasturl/url.go: -------------------------------------------------------------------------------- 1 | // Copyright 2009 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package fasturl parses URLs and implements query escaping. 6 | package fasturl 7 | 8 | // Modifications by terorie 9 | 10 | // See RFC 3986. This package generally follows RFC 3986, except where 11 | // it deviates for compatibility reasons. When sending changes, first 12 | // search old issues for history on decisions. Unit tests should also 13 | // contain references to issue numbers with details. 
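// In this fork, schemes are modeled by the Scheme enum below; only http and https appear in Schemes.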
14 | 15 | import ( 16 | "errors" 17 | "fmt" 18 | "strconv" 19 | "strings" 20 | ) 21 | 22 | type Scheme int 23 | const ( 24 | SchemeInvalid = iota 25 | SchemeHTTP 26 | SchemeHTTPS 27 | SchemeCount 28 | ) 29 | 30 | var Schemes = [SchemeCount]string { 31 | "", 32 | "http", 33 | "https", 34 | } 35 | 36 | var ErrUnknownScheme = errors.New("unknown protocol scheme") 37 | 38 | // Error reports an error and the operation and URL that caused it. 39 | type Error struct { 40 | Op string 41 | URL string 42 | Err error 43 | } 44 | 45 | func (e *Error) Error() string { return e.Op + " " + e.URL + ": " + e.Err.Error() } 46 | 47 | type timeout interface { 48 | Timeout() bool 49 | } 50 | 51 | func (e *Error) Timeout() bool { 52 | t, ok := e.Err.(timeout) 53 | return ok && t.Timeout() 54 | } 55 | 56 | type temporary interface { 57 | Temporary() bool 58 | } 59 | 60 | func (e *Error) Temporary() bool { 61 | t, ok := e.Err.(temporary) 62 | return ok && t.Temporary() 63 | } 64 | 65 | func ishex(c byte) bool { 66 | switch { 67 | case '0' <= c && c <= '9': 68 | return true 69 | case 'a' <= c && c <= 'f': 70 | return true 71 | case 'A' <= c && c <= 'F': 72 | return true 73 | } 74 | return false 75 | } 76 | 77 | func unhex(c byte) byte { 78 | switch { 79 | case '0' <= c && c <= '9': 80 | return c - '0' 81 | case 'a' <= c && c <= 'f': 82 | return c - 'a' + 10 83 | case 'A' <= c && c <= 'F': 84 | return c - 'A' + 10 85 | } 86 | return 0 87 | } 88 | 89 | type encoding int 90 | 91 | const ( 92 | encodePath encoding = 1 + iota 93 | encodePathSegment 94 | encodeHost 95 | encodeZone 96 | encodeUserPassword 97 | encodeQueryComponent 98 | encodeFragment 99 | ) 100 | 101 | type EscapeError string 102 | 103 | func (e EscapeError) Error() string { 104 | return "invalid URL escape " + strconv.Quote(string(e)) 105 | } 106 | 107 | type InvalidHostError string 108 | 109 | func (e InvalidHostError) Error() string { 110 | return "invalid character " + strconv.Quote(string(e)) + " in host name" 111 | } 112 | 113 | // Return true if the specified character should be escaped when 114 | // appearing in a URL string, according to RFC 3986. 115 | // 116 | // Please be informed that for now shouldEscape does not check all 117 | // reserved characters correctly. See golang.org/issue/5684. 118 | func shouldEscape(c byte, mode encoding) bool { 119 | // §2.3 Unreserved characters (alphanum) 120 | if 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' { 121 | return false 122 | } 123 | 124 | if mode == encodeHost || mode == encodeZone { 125 | // §3.2.2 Host allows 126 | // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" 127 | // as part of reg-name. 128 | // We add : because we include :port as part of host. 129 | // We add [ ] because we include [ipv6]:port as part of host. 130 | // We add < > because they're the only characters left that 131 | // we could possibly allow, and Parse will reject them if we 132 | // escape them (because hosts can't use %-encoding for 133 | // ASCII bytes). 134 | switch c { 135 | case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"': 136 | return false 137 | } 138 | } 139 | 140 | switch c { 141 | case '-', '_', '.', '~': // §2.3 Unreserved characters (mark) 142 | return false 143 | 144 | case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved) 145 | // Different sections of the URL allow a few of 146 | // the reserved characters to appear unescaped. 
147 | switch mode { 148 | case encodePath: // §3.3 149 | // The RFC allows : @ & = + $ but saves / ; , for assigning 150 | // meaning to individual path segments. This package 151 | // only manipulates the path as a whole, so we allow those 152 | // last three as well. That leaves only ? to escape. 153 | return c == '?' 154 | 155 | case encodePathSegment: // §3.3 156 | // The RFC allows : @ & = + $ but saves / ; , for assigning 157 | // meaning to individual path segments. 158 | return c == '/' || c == ';' || c == ',' || c == '?' 159 | 160 | case encodeUserPassword: // §3.2.1 161 | // The RFC allows ';', ':', '&', '=', '+', '$', and ',' in 162 | // userinfo, so we must escape only '@', '/', and '?'. 163 | // The parsing of userinfo treats ':' as special so we must escape 164 | // that too. 165 | return c == '@' || c == '/' || c == '?' || c == ':' 166 | 167 | case encodeQueryComponent: // §3.4 168 | // The RFC reserves (so we must escape) everything. 169 | return true 170 | 171 | case encodeFragment: // §4.1 172 | // The RFC text is silent but the grammar allows 173 | // everything, so escape nothing. 174 | return false 175 | } 176 | } 177 | 178 | if mode == encodeFragment { 179 | // RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are 180 | // included in reserved from RFC 2396 §2.2. The remaining sub-delims do not 181 | // need to be escaped. To minimize potential breakage, we apply two restrictions: 182 | // (1) we always escape sub-delims outside of the fragment, and (2) we always 183 | // escape single quote to avoid breaking callers that had previously assumed that 184 | // single quotes would be escaped. See issue #19917. 185 | switch c { 186 | case '!', '(', ')', '*': 187 | return false 188 | } 189 | } 190 | 191 | // Everything else must be escaped. 192 | return true 193 | } 194 | 195 | // unescape unescapes a string; the mode specifies 196 | // which section of the URL string is being unescaped. 197 | func unescape(s string, mode encoding) (string, error) { 198 | // Count %, check that they're well-formed. 199 | n := 0 200 | hasPlus := false 201 | for i := 0; i < len(s); { 202 | switch s[i] { 203 | case '%': 204 | n++ 205 | if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) { 206 | s = s[i:] 207 | if len(s) > 3 { 208 | s = s[:3] 209 | } 210 | return "", EscapeError(s) 211 | } 212 | // Per https://tools.ietf.org/html/rfc3986#page-21 213 | // in the host component %-encoding can only be used 214 | // for non-ASCII bytes. 215 | // But https://tools.ietf.org/html/rfc6874#section-2 216 | // introduces %25 being allowed to escape a percent sign 217 | // in IPv6 scoped-address literals. Yay. 218 | if mode == encodeHost && unhex(s[i+1]) < 8 && s[i:i+3] != "%25" { 219 | return "", EscapeError(s[i : i+3]) 220 | } 221 | if mode == encodeZone { 222 | // RFC 6874 says basically "anything goes" for zone identifiers 223 | // and that even non-ASCII can be redundantly escaped, 224 | // but it seems prudent to restrict %-escaped bytes here to those 225 | // that are valid host name bytes in their unescaped form. 226 | // That is, you can use escaping in the zone identifier but not 227 | // to introduce bytes you couldn't just write directly. 228 | // But Windows puts spaces here! Yay. 
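			// (editorial examples from TestParseErrors) over-escaping inside the
			// zone is tolerated, so "http://[::1%25%41]" parses, while escapes in
			// the address itself, as in "http://[%10::1]", are rejected by the
			// encodeHost check above.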
229 | v := unhex(s[i+1])<<4 | unhex(s[i+2]) 230 | if s[i:i+3] != "%25" && v != ' ' && shouldEscape(v, encodeHost) { 231 | return "", EscapeError(s[i : i+3]) 232 | } 233 | } 234 | i += 3 235 | case '+': 236 | hasPlus = mode == encodeQueryComponent 237 | i++ 238 | default: 239 | if (mode == encodeHost || mode == encodeZone) && s[i] < 0x80 && shouldEscape(s[i], mode) { 240 | return "", InvalidHostError(s[i : i+1]) 241 | } 242 | i++ 243 | } 244 | } 245 | 246 | if n == 0 && !hasPlus { 247 | return s, nil 248 | } 249 | 250 | t := make([]byte, len(s)-2*n) 251 | j := 0 252 | for i := 0; i < len(s); { 253 | switch s[i] { 254 | case '%': 255 | t[j] = unhex(s[i+1])<<4 | unhex(s[i+2]) 256 | j++ 257 | i += 3 258 | case '+': 259 | if mode == encodeQueryComponent { 260 | t[j] = ' ' 261 | } else { 262 | t[j] = '+' 263 | } 264 | j++ 265 | i++ 266 | default: 267 | t[j] = s[i] 268 | j++ 269 | i++ 270 | } 271 | } 272 | return string(t), nil 273 | } 274 | 275 | func escape(s string, mode encoding) string { 276 | spaceCount, hexCount := 0, 0 277 | for i := 0; i < len(s); i++ { 278 | c := s[i] 279 | if shouldEscape(c, mode) { 280 | if c == ' ' && mode == encodeQueryComponent { 281 | spaceCount++ 282 | } else { 283 | hexCount++ 284 | } 285 | } 286 | } 287 | 288 | if spaceCount == 0 && hexCount == 0 { 289 | return s 290 | } 291 | 292 | t := make([]byte, len(s)+2*hexCount) 293 | j := 0 294 | for i := 0; i < len(s); i++ { 295 | switch c := s[i]; { 296 | case c == ' ' && mode == encodeQueryComponent: 297 | t[j] = '+' 298 | j++ 299 | case shouldEscape(c, mode): 300 | t[j] = '%' 301 | t[j+1] = "0123456789ABCDEF"[c>>4] 302 | t[j+2] = "0123456789ABCDEF"[c&15] 303 | j += 3 304 | default: 305 | t[j] = s[i] 306 | j++ 307 | } 308 | } 309 | return string(t) 310 | } 311 | 312 | // A URL represents a parsed URL (technically, a URI reference). 313 | // 314 | // The general form represented is: 315 | // 316 | // [scheme:][//[userinfo@]host][/]path[?query][#fragment] 317 | // 318 | // URLs that do not start with a slash after the scheme are interpreted as: 319 | // 320 | // scheme:opaque[?query][#fragment] 321 | // 322 | // Note that the Path field is stored in decoded form: /%47%6f%2f becomes /Go/. 323 | // A consequence is that it is impossible to tell which slashes in the Path were 324 | // slashes in the raw URL and which were %2f. This distinction is rarely important, 325 | // but when it is, code must not use Path directly. 326 | // The Parse function sets both Path and RawPath in the URL it returns, 327 | // and URL's String method uses RawPath if it is a valid encoding of Path, 328 | // by calling the EscapedPath method. 329 | type URL struct { 330 | Scheme Scheme 331 | Host string // host or host:port 332 | Path string // path (relative paths may omit leading slash) 333 | } 334 | 335 | // Maybe rawurl is of the form scheme:path. 336 | // (Scheme must be [a-zA-Z][a-zA-Z0-9+-.]*) 337 | // If so, return scheme, path; else return "", rawurl. 
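// (editorial note) In this fork the scheme comes back as a Scheme constant
// rather than a string: getscheme("https://example.com/x") yields SchemeHTTPS
// and "//example.com/x", an unrecognized scheme such as "ftp:" yields
// ErrUnknownScheme, and an absent scheme yields SchemeInvalid with rawurl
// returned unchanged.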
338 | func getscheme(rawurl string) (scheme Scheme, path string, err error) { 339 | for i := 0; i < len(rawurl); i++ { 340 | c := rawurl[i] 341 | switch { 342 | case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z': 343 | // do nothing 344 | case '0' <= c && c <= '9' || c == '+' || c == '-' || c == '.': 345 | if i == 0 { 346 | return SchemeInvalid, rawurl, nil 347 | } 348 | case c == ':': 349 | if i == 0 { 350 | return SchemeInvalid, "", errors.New("missing protocol scheme") 351 | } 352 | switch rawurl[:i] { 353 | case "http": 354 | scheme = SchemeHTTP 355 | case "https": 356 | scheme = SchemeHTTPS 357 | default: 358 | return SchemeInvalid, "", ErrUnknownScheme 359 | } 360 | 361 | path = rawurl[i+1:] 362 | return 363 | default: 364 | // we have encountered an invalid character, 365 | // so there is no valid scheme 366 | return SchemeInvalid, rawurl, nil 367 | } 368 | } 369 | return SchemeInvalid, rawurl, nil 370 | } 371 | 372 | // Maybe s is of the form t c u. 373 | // If so, return t, c u (or t, u if cutc == true). 374 | // If not, return s, "". 375 | func split(s string, c string, cutc bool) (string, string) { 376 | i := strings.Index(s, c) 377 | if i < 0 { 378 | return s, "" 379 | } 380 | if cutc { 381 | return s[:i], s[i+len(c):] 382 | } 383 | return s[:i], s[i:] 384 | } 385 | 386 | // Parse parses rawurl into a URL structure. 387 | // 388 | // The rawurl may be relative (a path, without a host) or absolute 389 | // (starting with a scheme). Trying to parse a hostname and path 390 | // without a scheme is invalid but may not necessarily return an 391 | // error, due to parsing ambiguities. 392 | func (u *URL) Parse(rawurl string) error { 393 | // Cut off #frag 394 | s, frag := split(rawurl, "#", true) 395 | err := u.parse(s, false) 396 | if err != nil { 397 | return &Error{"parse", s, err} 398 | } 399 | if frag == "" { 400 | return nil 401 | } 402 | return nil 403 | } 404 | 405 | // ParseRequestURI parses rawurl into a URL structure. It assumes that 406 | // rawurl was received in an HTTP request, so the rawurl is interpreted 407 | // only as an absolute URI or an absolute path. 408 | // The string rawurl is assumed not to have a #fragment suffix. 409 | // (Web browsers strip #fragment before sending the URL to a web server.) 410 | func (u *URL) ParseRequestURI(rawurl string) error { 411 | err := u.parse(rawurl, true) 412 | if err != nil { 413 | return &Error{"parse", rawurl, err} 414 | } 415 | return nil 416 | } 417 | 418 | // parse parses a URL from a string in one of two contexts. If 419 | // viaRequest is true, the URL is assumed to have arrived via an HTTP request, 420 | // in which case only absolute URLs or path-absolute relative URLs are allowed. 421 | // If viaRequest is false, all forms of relative URLs are allowed. 422 | func (u *URL) parse(rawurl string, viaRequest bool) error { 423 | var rest string 424 | var err error 425 | 426 | if rawurl == "" && viaRequest { 427 | return errors.New("empty url") 428 | } 429 | 430 | if rawurl == "*" { 431 | u.Path = "*" 432 | return nil 433 | } 434 | 435 | // Split off possible leading "http:", "mailto:", etc. 436 | // Cannot contain escaped characters. 
437 | if u.Scheme, rest, err = getscheme(rawurl); err != nil { 438 | return err 439 | } 440 | 441 | if strings.HasSuffix(rest, "?") && strings.Count(rest, "?") == 1 { 442 | rest = rest[:len(rest)-1] 443 | } else { 444 | rest, _ = split(rest, "?", true) 445 | } 446 | 447 | if !strings.HasPrefix(rest, "/") { 448 | if u.Scheme != SchemeInvalid { 449 | // We consider rootless paths per RFC 3986 as opaque. 450 | return nil 451 | } 452 | if viaRequest { 453 | return errors.New("invalid URI for request") 454 | } 455 | 456 | // Avoid confusion with malformed schemes, like cache_object:foo/bar. 457 | // See golang.org/issue/16822. 458 | // 459 | // RFC 3986, §3.3: 460 | // In addition, a URI reference (Section 4.1) may be a relative-path reference, 461 | // in which case the first path segment cannot contain a colon (":") character. 462 | colon := strings.Index(rest, ":") 463 | slash := strings.Index(rest, "/") 464 | if colon >= 0 && (slash < 0 || colon < slash) { 465 | // First path segment has colon. Not allowed in relative URL. 466 | return errors.New("first path segment in URL cannot contain colon") 467 | } 468 | } 469 | 470 | if (u.Scheme != SchemeInvalid || !viaRequest && !strings.HasPrefix(rest, "///")) && strings.HasPrefix(rest, "//") { 471 | var authority string 472 | authority, rest = split(rest[2:], "/", false) 473 | u.Host, err = parseAuthority(authority) 474 | if err != nil { 475 | return err 476 | } 477 | } 478 | u.Path = rest 479 | return nil 480 | } 481 | 482 | func parseAuthority(authority string) (host string, err error) { 483 | i := strings.LastIndex(authority, "@") 484 | if i < 0 { 485 | host, err = parseHost(authority) 486 | } else { 487 | host, err = parseHost(authority[i+1:]) 488 | } 489 | if err != nil { 490 | return "", err 491 | } 492 | if i < 0 { 493 | return host, nil 494 | } 495 | userinfo := authority[:i] 496 | if !validUserinfo(userinfo) { 497 | return "", errors.New("fasturl: invalid userinfo") 498 | } 499 | return host, nil 500 | } 501 | 502 | // parseHost parses host as an authority without user 503 | // information. That is, as host[:port]. 504 | func parseHost(host string) (string, error) { 505 | if strings.HasPrefix(host, "[") { 506 | // Parse an IP-Literal in RFC 3986 and RFC 6874. 507 | // E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80". 508 | i := strings.LastIndex(host, "]") 509 | if i < 0 { 510 | return "", errors.New("missing ']' in host") 511 | } 512 | colonPort := host[i+1:] 513 | if !validOptionalPort(colonPort) { 514 | return "", fmt.Errorf("invalid port %q after host", colonPort) 515 | } 516 | 517 | // RFC 6874 defines that %25 (%-encoded percent) introduces 518 | // the zone identifier, and the zone identifier can use basically 519 | // any %-encoding it likes. That's different from the host, which 520 | // can only %-encode non-ASCII bytes. 521 | // We do impose some restrictions on the zone, to avoid stupidity 522 | // like newlines. 
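		// (editorial example) "[fe80::1%25en0]:8080" therefore becomes the host
		// "[fe80::1%en0]:8080": the "%25" introducing the zone decodes to "%",
		// and the zone text itself may carry further percent-escapes (see the
		// RFC 6874 cases in url_test.go).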
523 | zone := strings.Index(host[:i], "%25") 524 | if zone >= 0 { 525 | host1, err := unescape(host[:zone], encodeHost) 526 | if err != nil { 527 | return "", err 528 | } 529 | host2, err := unescape(host[zone:i], encodeZone) 530 | if err != nil { 531 | return "", err 532 | } 533 | host3, err := unescape(host[i:], encodeHost) 534 | if err != nil { 535 | return "", err 536 | } 537 | return host1 + host2 + host3, nil 538 | } 539 | } 540 | 541 | var err error 542 | if host, err = unescape(host, encodeHost); err != nil { 543 | return "", err 544 | } 545 | return host, nil 546 | } 547 | 548 | // validOptionalPort reports whether port is either an empty string 549 | // or matches /^:\d*$/ 550 | func validOptionalPort(port string) bool { 551 | if port == "" { 552 | return true 553 | } 554 | if port[0] != ':' { 555 | return false 556 | } 557 | for _, b := range port[1:] { 558 | if b < '0' || b > '9' { 559 | return false 560 | } 561 | } 562 | return true 563 | } 564 | 565 | // String reassembles the URL into a valid URL string. 566 | // The general form of the result is one of: 567 | // 568 | // scheme:opaque?query#fragment 569 | // scheme://userinfo@host/path?query#fragment 570 | // 571 | // If u.Opaque is non-empty, String uses the first form; 572 | // otherwise it uses the second form. 573 | // To obtain the path, String uses u.EscapedPath(). 574 | // 575 | // In the second form, the following rules apply: 576 | // - if u.Scheme is empty, scheme: is omitted. 577 | // - if u.User is nil, userinfo@ is omitted. 578 | // - if u.Host is empty, host/ is omitted. 579 | // - if u.Scheme and u.Host are empty and u.User is nil, 580 | // the entire scheme://userinfo@host/ is omitted. 581 | // - if u.Host is non-empty and u.Path begins with a /, 582 | // the form host/path does not add its own /. 583 | // - if u.RawQuery is empty, ?query is omitted. 584 | // - if u.Fragment is empty, #fragment is omitted. 585 | func (u *URL) String() string { 586 | var buf strings.Builder 587 | if u.Scheme != SchemeInvalid { 588 | buf.WriteString(Schemes[u.Scheme]) 589 | buf.WriteByte(':') 590 | } 591 | if u.Scheme != SchemeInvalid || u.Host != "" { 592 | if u.Host != "" || u.Path != "" { 593 | buf.WriteString("//") 594 | } 595 | if h := u.Host; h != "" { 596 | buf.WriteString(escape(h, encodeHost)) 597 | } 598 | } 599 | path := u.Path 600 | if path != "" && path[0] != '/' && u.Host != "" { 601 | buf.WriteByte('/') 602 | } 603 | if buf.Len() == 0 { 604 | // RFC 3986 §4.2 605 | // A path segment that contains a colon character (e.g., "this:that") 606 | // cannot be used as the first segment of a relative-path reference, as 607 | // it would be mistaken for a scheme name. Such a segment must be 608 | // preceded by a dot-segment (e.g., "./this:that") to make a relative- 609 | // path reference. 610 | if i := strings.IndexByte(path, ':'); i > -1 && strings.IndexByte(path[:i], '/') == -1 { 611 | buf.WriteString("./") 612 | } 613 | } 614 | buf.WriteString(path) 615 | return buf.String() 616 | } 617 | 618 | func isRunesDot(r []rune) bool { 619 | return len(r) == 1 && r[0] == '.' 620 | } 621 | 622 | func isRunesDoubleDot(r []rune) bool { 623 | return len(r) == 2 && r[0] == '.' && r[1] == '.' 624 | } 625 | 626 | // resolvePath applies special path segments from refs and applies 627 | // them to base, per RFC 3986. 
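// (editorial examples, taken from resolvePathTests in url_test.go)
//	resolvePath("a/b/c", "..")   == "/a/"
//	resolvePath("a/b/c", "../d") == "/a/d"
//	resolvePath("a/.././b", "c") == "/c"
// Note that the result is rooted with a leading slash even when the base path
// was relative.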
628 | func resolvePath(base, ref string) string { 629 | var full string 630 | if ref == "" { 631 | full = base 632 | } else if ref[0] != '/' { 633 | i := strings.LastIndex(base, "/") 634 | full = base[:i+1] + ref 635 | } else { 636 | full = ref 637 | } 638 | if full == "" { 639 | return "" 640 | } else if full == "/" { 641 | return "/" 642 | } 643 | 644 | dst := make([]rune, len(full)) 645 | dst = dst[0:0] 646 | 647 | start := 0 648 | rs := []rune(full) 649 | if len(rs) != 0 && rs[0] == '/' { 650 | rs = rs[1:] 651 | } 652 | var stack []int 653 | stack = append(stack, 0) 654 | for i, c := range rs { 655 | if i == len(rs) - 1 { 656 | closingSlash := false 657 | part := rs[start:] 658 | if len(part) == 0 { 659 | dst = append(dst, '/') 660 | } else if part[len(part)-1] == '/' { 661 | part = part[:len(part)-1] 662 | closingSlash = true 663 | } 664 | switch { 665 | case isRunesDot(part): 666 | dst = append(dst, '/') 667 | case isRunesDoubleDot(part): 668 | // Cut to the last slash 669 | start = i+1 670 | dst = dst[:stack[len(stack)-1]] 671 | if len(stack) != 1 { 672 | stack = stack[:len(stack)-1] 673 | } 674 | dst = append(dst, '/') 675 | default: 676 | dst = append(dst, '/') 677 | dst = append(dst, part...) 678 | } 679 | if closingSlash && len(dst) != 0 && dst[len(dst)-1] != '/' { 680 | dst = append(dst, '/') 681 | } 682 | } else if c == '/' { 683 | part := rs[start:i] 684 | switch { 685 | case isRunesDot(part): 686 | start = i+1 687 | case isRunesDoubleDot(part): 688 | // Cut to the last slash 689 | start = i+1 690 | dst = dst[:stack[len(stack)-1]] 691 | if len(stack) != 1 { 692 | stack = stack[:len(stack)-1] 693 | } 694 | default: 695 | start = i+1 696 | stack = append(stack, len(dst)) 697 | dst = append(dst, '/') 698 | dst = append(dst, part...) 699 | } 700 | } 701 | } 702 | return string(dst) 703 | 704 | /*var dst []string 705 | src := strings.Split(full, "/") 706 | for _, elem := range src { 707 | switch elem { 708 | case ".": 709 | // drop 710 | case "..": 711 | if len(dst) > 0 { 712 | dst = dst[:len(dst)-1] 713 | } 714 | default: 715 | dst = append(dst, elem) 716 | } 717 | } 718 | if last := src[len(src)-1]; last == "." || last == ".." { 719 | // Add final slash to the joined path. 720 | dst = append(dst, "") 721 | } 722 | return "/" + strings.TrimPrefix(strings.Join(dst, "/"), "/")*/ 723 | } 724 | 725 | // IsAbs reports whether the URL is absolute. 726 | // Absolute means that it has a non-empty scheme. 727 | func (u *URL) IsAbs() bool { 728 | return u.Scheme != SchemeInvalid 729 | } 730 | 731 | // ParseRel parses a URL in the context of the receiver. The provided URL 732 | // may be relative or absolute. Parse returns nil, err on parse 733 | // failure, otherwise its return value is the same as ResolveReference. 734 | func (u *URL) ParseRel(out *URL, ref string) error { 735 | var refurl URL 736 | 737 | err := refurl.Parse(ref) 738 | if err != nil { 739 | return err 740 | } 741 | 742 | u.ResolveReference(out, &refurl) 743 | return nil 744 | } 745 | 746 | // ResolveReference resolves a URI reference to an absolute URI from 747 | // an absolute base URI u, per RFC 3986 Section 5.2. The URI reference 748 | // may be relative or absolute. ResolveReference always returns a new 749 | // URL instance, even if the returned URL is identical to either the 750 | // base or reference. If ref is an absolute URL, then ResolveReference 751 | // ignores base and returns a copy of ref. 
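// (editorial note) Unlike net/url, this variant does not return a new *URL:
// the result is written into the url out-parameter. Resolving "../quux"
// against "http://foo.com/bar/baz" gives "http://foo.com/quux", and an
// absolute reference such as "https://bar.com/" ignores the base entirely
// (see resolveReferenceTests in url_test.go).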
752 | func (u *URL) ResolveReference(url *URL, ref *URL) { 753 | *url = *ref 754 | if ref.Scheme == SchemeInvalid { 755 | url.Scheme = u.Scheme 756 | } 757 | if ref.Scheme != SchemeInvalid || ref.Host != "" { 758 | // The "absoluteURI" or "net_path" cases. 759 | // We can ignore the error from setPath since we know we provided a 760 | // validly-escaped path. 761 | url.Path = resolvePath(ref.Path, "") 762 | return 763 | } 764 | // The "abs_path" or "rel_path" cases. 765 | url.Host = u.Host 766 | url.Path = resolvePath(u.Path, ref.Path) 767 | return 768 | } 769 | 770 | // Marshaling interface implementations. 771 | // Would like to implement MarshalText/UnmarshalText but that will change the JSON representation of URLs. 772 | 773 | func (u *URL) MarshalBinary() (text []byte, err error) { 774 | return []byte(u.String()), nil 775 | } 776 | 777 | func (u *URL) UnmarshalBinary(text []byte) error { 778 | var u1 URL 779 | err := u1.Parse(string(text)) 780 | if err != nil { 781 | return err 782 | } 783 | *u = u1 784 | return nil 785 | } 786 | 787 | // validUserinfo reports whether s is a valid userinfo string per RFC 3986 788 | // Section 3.2.1: 789 | // userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) 790 | // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" 791 | // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" 792 | // / "*" / "+" / "," / ";" / "=" 793 | // 794 | // It doesn't validate pct-encoded. The caller does that via func unescape. 795 | func validUserinfo(s string) bool { 796 | for _, r := range s { 797 | if 'A' <= r && r <= 'Z' { 798 | continue 799 | } 800 | if 'a' <= r && r <= 'z' { 801 | continue 802 | } 803 | if '0' <= r && r <= '9' { 804 | continue 805 | } 806 | switch r { 807 | case '-', '.', '_', ':', '~', '!', '$', '&', '\'', 808 | '(', ')', '*', '+', ',', ';', '=', '%', '@': 809 | continue 810 | default: 811 | return false 812 | } 813 | } 814 | return true 815 | } 816 | 817 | func PathUnescape(s string) string { 818 | newStr, err := pathUnescape(s) 819 | if err != nil { 820 | return s 821 | } else { 822 | return newStr 823 | } 824 | } 825 | 826 | func pathUnescape(s string) (string, error) { 827 | // Count %, check that they're well-formed. 828 | n := 0 829 | for i := 0; i < len(s); { 830 | switch s[i] { 831 | case '%': 832 | n++ 833 | if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) { 834 | s = s[i:] 835 | if len(s) > 3 { 836 | s = s[:3] 837 | } 838 | return "", EscapeError(s) 839 | } 840 | i += 3 841 | default: 842 | i++ 843 | } 844 | } 845 | 846 | if n == 0 { 847 | return s, nil 848 | } 849 | 850 | t := make([]byte, len(s)-2*n) 851 | j := 0 852 | for i := 0; i < len(s); { 853 | switch s[i] { 854 | case '%': 855 | t[j] = unhex(s[i+1])<<4 | unhex(s[i+2]) 856 | j++ 857 | i += 3 858 | case '+': 859 | t[j] = '+' 860 | j++ 861 | i++ 862 | default: 863 | t[j] = s[i] 864 | j++ 865 | i++ 866 | } 867 | } 868 | return string(t), nil 869 | } 870 | -------------------------------------------------------------------------------- /fasturl/url_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2009 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
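A minimal, editorial usage sketch (not part of the repository) of the fasturl API defined above: it resolves a relative link found on a listing page against the page's own URL. The import path follows go.mod; the URLs are illustrative and error handling is deliberately terse.

package main

import (
	"fmt"

	"github.com/terorie/od-database-crawler/fasturl"
)

func main() {
	// Parse the URL of the page that was crawled.
	var base fasturl.URL
	if err := base.Parse("http://example.com/pub/linux/"); err != nil {
		panic(err)
	}

	// ParseRel parses the href and resolves it against base in one step.
	var link fasturl.URL
	if err := base.ParseRel(&link, "../iso/"); err != nil {
		panic(err)
	}

	fmt.Println(link.String()) // http://example.com/pub/iso/
}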
4 | 5 | package fasturl 6 | 7 | import ( 8 | "bytes" 9 | encodingPkg "encoding" 10 | "encoding/gob" 11 | "encoding/json" 12 | "fmt" 13 | "io" 14 | "net" 15 | "reflect" 16 | "testing" 17 | ) 18 | 19 | type URLTest struct { 20 | in string 21 | out *URL // expected parse; RawPath="" means same as Path 22 | roundtrip string // expected result of reserializing the URL; empty means same as "in". 23 | } 24 | 25 | var urltests = []URLTest{ 26 | // no path 27 | { 28 | "http://www.google.com", 29 | &URL{ 30 | Scheme: SchemeHTTP, 31 | Host: "www.google.com", 32 | }, 33 | "", 34 | }, 35 | // path 36 | { 37 | "http://www.google.com/", 38 | &URL{ 39 | Scheme: SchemeHTTP, 40 | Host: "www.google.com", 41 | Path: "/", 42 | }, 43 | "", 44 | }, 45 | // %20 outside query 46 | { 47 | "http://www.google.com/a%20b", 48 | &URL{ 49 | Scheme: SchemeHTTP, 50 | Host: "www.google.com", 51 | Path: "/a%20b", 52 | }, 53 | "", 54 | }, 55 | // leading // without scheme should create an authority 56 | { 57 | "//foo", 58 | &URL{ 59 | Host: "foo", 60 | }, 61 | "", 62 | }, 63 | // Three leading slashes isn't an authority, but doesn't return an error. 64 | // (We can't return an error, as this code is also used via 65 | // ServeHTTP -> ReadRequest -> Parse, which is arguably a 66 | // different URL parsing context, but currently shares the 67 | // same codepath) 68 | { 69 | "///threeslashes", 70 | &URL{ 71 | Path: "///threeslashes", 72 | }, 73 | "", 74 | }, 75 | // unescaped @ in username should not confuse host 76 | { 77 | "http://j@ne:password@google.com", 78 | &URL{ 79 | Scheme: SchemeHTTP, 80 | Host: "google.com", 81 | }, 82 | "http://google.com", 83 | }, 84 | // unescaped @ in password should not confuse host 85 | { 86 | "http://jane:p@ssword@google.com", 87 | &URL{ 88 | Scheme: SchemeHTTP, 89 | Host: "google.com", 90 | }, 91 | "http://google.com", 92 | }, 93 | // Relative path 94 | { 95 | "a/b/c", 96 | &URL{ 97 | Path: "a/b/c", 98 | }, 99 | "a/b/c", 100 | }, 101 | // host subcomponent; IPv4 address in RFC 3986 102 | { 103 | "http://192.168.0.1/", 104 | &URL{ 105 | Scheme: SchemeHTTP, 106 | Host: "192.168.0.1", 107 | Path: "/", 108 | }, 109 | "", 110 | }, 111 | // host and port subcomponents; IPv4 address in RFC 3986 112 | { 113 | "http://192.168.0.1:8080/", 114 | &URL{ 115 | Scheme: SchemeHTTP, 116 | Host: "192.168.0.1:8080", 117 | Path: "/", 118 | }, 119 | "", 120 | }, 121 | // host subcomponent; IPv6 address in RFC 3986 122 | { 123 | "http://[fe80::1]/", 124 | &URL{ 125 | Scheme: SchemeHTTP, 126 | Host: "[fe80::1]", 127 | Path: "/", 128 | }, 129 | "", 130 | }, 131 | // host and port subcomponents; IPv6 address in RFC 3986 132 | { 133 | "http://[fe80::1]:8080/", 134 | &URL{ 135 | Scheme: SchemeHTTP, 136 | Host: "[fe80::1]:8080", 137 | Path: "/", 138 | }, 139 | "", 140 | }, 141 | // host subcomponent; IPv6 address with zone identifier in RFC 6874 142 | { 143 | "http://[fe80::1%25en0]/", // alphanum zone identifier 144 | &URL{ 145 | Scheme: SchemeHTTP, 146 | Host: "[fe80::1%en0]", 147 | Path: "/", 148 | }, 149 | "", 150 | }, 151 | // host and port subcomponents; IPv6 address with zone identifier in RFC 6874 152 | { 153 | "http://[fe80::1%25en0]:8080/", // alphanum zone identifier 154 | &URL{ 155 | Scheme: SchemeHTTP, 156 | Host: "[fe80::1%en0]:8080", 157 | Path: "/", 158 | }, 159 | "", 160 | }, 161 | // host subcomponent; IPv6 address with zone identifier in RFC 6874 162 | { 163 | "http://[fe80::1%25%65%6e%301-._~]/", // percent-encoded+unreserved zone identifier 164 | &URL{ 165 | Scheme: SchemeHTTP, 166 | Host: 
"[fe80::1%en01-._~]", 167 | Path: "/", 168 | }, 169 | "http://[fe80::1%25en01-._~]/", 170 | }, 171 | // host and port subcomponents; IPv6 address with zone identifier in RFC 6874 172 | { 173 | "http://[fe80::1%25%65%6e%301-._~]:8080/", // percent-encoded+unreserved zone identifier 174 | &URL{ 175 | Scheme: SchemeHTTP, 176 | Host: "[fe80::1%en01-._~]:8080", 177 | Path: "/", 178 | }, 179 | "http://[fe80::1%25en01-._~]:8080/", 180 | }, 181 | // golang.org/issue/12200 (colon with empty port) 182 | { 183 | "http://192.168.0.2:8080/foo", 184 | &URL{ 185 | Scheme: SchemeHTTP, 186 | Host: "192.168.0.2:8080", 187 | Path: "/foo", 188 | }, 189 | "", 190 | }, 191 | { 192 | "http://192.168.0.2:/foo", 193 | &URL{ 194 | Scheme: SchemeHTTP, 195 | Host: "192.168.0.2:", 196 | Path: "/foo", 197 | }, 198 | "", 199 | }, 200 | { 201 | // Malformed IPv6 but still accepted. 202 | "http://2b01:e34:ef40:7730:8e70:5aff:fefe:edac:8080/foo", 203 | &URL{ 204 | Scheme: SchemeHTTP, 205 | Host: "2b01:e34:ef40:7730:8e70:5aff:fefe:edac:8080", 206 | Path: "/foo", 207 | }, 208 | "", 209 | }, 210 | { 211 | // Malformed IPv6 but still accepted. 212 | "http://2b01:e34:ef40:7730:8e70:5aff:fefe:edac:/foo", 213 | &URL{ 214 | Scheme: SchemeHTTP, 215 | Host: "2b01:e34:ef40:7730:8e70:5aff:fefe:edac:", 216 | Path: "/foo", 217 | }, 218 | "", 219 | }, 220 | { 221 | "http://[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:8080/foo", 222 | &URL{ 223 | Scheme: SchemeHTTP, 224 | Host: "[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:8080", 225 | Path: "/foo", 226 | }, 227 | "", 228 | }, 229 | { 230 | "http://[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:/foo", 231 | &URL{ 232 | Scheme: SchemeHTTP, 233 | Host: "[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:", 234 | Path: "/foo", 235 | }, 236 | "", 237 | }, 238 | // golang.org/issue/7991 and golang.org/issue/12719 (non-ascii %-encoded in host) 239 | { 240 | "http://hello.世界.com/foo", 241 | &URL{ 242 | Scheme: SchemeHTTP, 243 | Host: "hello.世界.com", 244 | Path: "/foo", 245 | }, 246 | "http://hello.%E4%B8%96%E7%95%8C.com/foo", 247 | }, 248 | { 249 | "http://hello.%e4%b8%96%e7%95%8c.com/foo", 250 | &URL{ 251 | Scheme: SchemeHTTP, 252 | Host: "hello.世界.com", 253 | Path: "/foo", 254 | }, 255 | "http://hello.%E4%B8%96%E7%95%8C.com/foo", 256 | }, 257 | { 258 | "http://hello.%E4%B8%96%E7%95%8C.com/foo", 259 | &URL{ 260 | Scheme: SchemeHTTP, 261 | Host: "hello.世界.com", 262 | Path: "/foo", 263 | }, 264 | "", 265 | }, 266 | // golang.org/issue/10433 (path beginning with //) 267 | { 268 | "http://example.com//foo", 269 | &URL{ 270 | Scheme: SchemeHTTP, 271 | Host: "example.com", 272 | Path: "//foo", 273 | }, 274 | "", 275 | }, 276 | // test that we can reparse the host names we accept. 
277 | { 278 | "http://authority<\"hi\">/foo", 279 | &URL{ 280 | Scheme: SchemeHTTP, 281 | Host: "authority<\"hi\">", 282 | Path: "/foo", 283 | }, 284 | "", 285 | }, 286 | } 287 | 288 | // more useful string for debugging than fmt's struct printer 289 | func ufmt(u *URL) string { 290 | return fmt.Sprintf("scheme=%q, host=%q, path=%q", 291 | Schemes[u.Scheme], u.Host, u.Path) 292 | } 293 | 294 | func BenchmarkString(b *testing.B) { 295 | b.StopTimer() 296 | b.ReportAllocs() 297 | for _, tt := range urltests { 298 | var u URL 299 | err := u.Parse(tt.in) 300 | if err != nil { 301 | b.Errorf("Parse(%q) returned error %s", tt.in, err) 302 | continue 303 | } 304 | if tt.roundtrip == "" { 305 | continue 306 | } 307 | b.StartTimer() 308 | var g string 309 | for i := 0; i < b.N; i++ { 310 | g = u.String() 311 | } 312 | b.StopTimer() 313 | if w := tt.roundtrip; b.N > 0 && g != w { 314 | b.Errorf("Parse(%q).String() == %q, want %q", tt.in, g, w) 315 | } 316 | } 317 | } 318 | 319 | func TestParse(t *testing.T) { 320 | for _, tt := range urltests { 321 | var u URL 322 | err := u.Parse(tt.in) 323 | if err != nil { 324 | t.Errorf("Parse(%q) returned error %v", tt.in, err) 325 | continue 326 | } 327 | if !reflect.DeepEqual(&u, tt.out) { 328 | t.Errorf("Parse(%q):\n\tgot %v\n\twant %v\n", tt.in, ufmt(&u), ufmt(tt.out)) 329 | } 330 | } 331 | } 332 | 333 | const pathThatLooksSchemeRelative = "//not.a.user@not.a.host/just/a/path" 334 | 335 | var parseRequestURLTests = []struct { 336 | url string 337 | expectedValid bool 338 | }{ 339 | {"http://foo.com", true}, 340 | {"http://foo.com/", true}, 341 | {"http://foo.com/path", true}, 342 | {"/", true}, 343 | {pathThatLooksSchemeRelative, true}, 344 | {"//not.a.user@%66%6f%6f.com/just/a/path/also", true}, 345 | {"*", true}, 346 | {"http://192.168.0.1/", true}, 347 | {"http://192.168.0.1:8080/", true}, 348 | {"http://[fe80::1]/", true}, 349 | {"http://[fe80::1]:8080/", true}, 350 | 351 | // Tests exercising RFC 6874 compliance: 352 | {"http://[fe80::1%25en0]/", true}, // with alphanum zone identifier 353 | {"http://[fe80::1%25en0]:8080/", true}, // with alphanum zone identifier 354 | {"http://[fe80::1%25%65%6e%301-._~]/", true}, // with percent-encoded+unreserved zone identifier 355 | {"http://[fe80::1%25%65%6e%301-._~]:8080/", true}, // with percent-encoded+unreserved zone identifier 356 | 357 | {"foo.html", false}, 358 | {"../dir/", false}, 359 | {"http://192.168.0.%31/", false}, 360 | {"http://192.168.0.%31:8080/", false}, 361 | {"http://[fe80::%31]/", false}, 362 | {"http://[fe80::%31]:8080/", false}, 363 | {"http://[fe80::%31%25en0]/", false}, 364 | {"http://[fe80::%31%25en0]:8080/", false}, 365 | 366 | // These two cases are valid as textual representations as 367 | // described in RFC 4007, but are not valid as address 368 | // literals with IPv6 zone identifiers in URIs as described in 369 | // RFC 6874. 
370 | {"http://[fe80::1%en0]/", false}, 371 | {"http://[fe80::1%en0]:8080/", false}, 372 | } 373 | 374 | func TestParseRequestURI(t *testing.T) { 375 | for _, test := range parseRequestURLTests { 376 | var u URL 377 | err := u.ParseRequestURI(test.url) 378 | if test.expectedValid && err != nil { 379 | t.Errorf("ParseRequestURI(%q) gave err %v; want no error", test.url, err) 380 | } else if !test.expectedValid && err == nil { 381 | t.Errorf("ParseRequestURI(%q) gave nil error; want some error", test.url) 382 | } 383 | } 384 | 385 | var url URL 386 | err := url.ParseRequestURI(pathThatLooksSchemeRelative) 387 | if err != nil { 388 | t.Fatalf("Unexpected error %v", err) 389 | } 390 | if url.Path != pathThatLooksSchemeRelative { 391 | t.Errorf("ParseRequestURI path:\ngot %q\nwant %q", url.Path, pathThatLooksSchemeRelative) 392 | } 393 | } 394 | 395 | var stringURLTests = []struct { 396 | url URL 397 | want string 398 | }{ 399 | // No leading slash on path should prepend slash on String() call 400 | { 401 | url: URL{ 402 | Scheme: SchemeHTTP, 403 | Host: "www.google.com", 404 | Path: "search", 405 | }, 406 | want: "http://www.google.com/search", 407 | }, 408 | // Relative path with first element containing ":" should be prepended with "./", golang.org/issue/17184 409 | { 410 | url: URL{ 411 | Path: "this:that", 412 | }, 413 | want: "./this:that", 414 | }, 415 | // Relative path with second element containing ":" should not be prepended with "./" 416 | { 417 | url: URL{ 418 | Path: "here/this:that", 419 | }, 420 | want: "here/this:that", 421 | }, 422 | // Non-relative path with first element containing ":" should not be prepended with "./" 423 | { 424 | url: URL{ 425 | Scheme: SchemeHTTP, 426 | Host: "www.google.com", 427 | Path: "this:that", 428 | }, 429 | want: "http://www.google.com/this:that", 430 | }, 431 | } 432 | 433 | func TestURLString(t *testing.T) { 434 | for _, tt := range urltests { 435 | var u URL 436 | err := u.Parse(tt.in) 437 | if err != nil { 438 | t.Errorf("Parse(%q) returned error %s", tt.in, err) 439 | continue 440 | } 441 | expected := tt.in 442 | if tt.roundtrip != "" { 443 | expected = tt.roundtrip 444 | } 445 | s := u.String() 446 | if s != expected { 447 | t.Errorf("Parse(%q).String() == %q (expected %q)", tt.in, s, expected) 448 | } 449 | } 450 | 451 | for _, tt := range stringURLTests { 452 | if got := tt.url.String(); got != tt.want { 453 | t.Errorf("%+v.String() = %q; want %q", tt.url, got, tt.want) 454 | } 455 | } 456 | } 457 | 458 | var resolvePathTests = []struct { 459 | base, ref, expected string 460 | }{ 461 | {"a/b", ".", "/a/"}, 462 | {"a/b", "c", "/a/c"}, 463 | {"a/b", "..", "/"}, 464 | {"a/", "..", "/"}, 465 | {"a/", "../..", "/"}, 466 | {"a/b/c", "..", "/a/"}, 467 | {"a/b/c", "../d", "/a/d"}, 468 | {"a/b/c", ".././d", "/a/d"}, 469 | {"a/b", "./..", "/"}, 470 | {"a/./b", ".", "/a/"}, 471 | {"a/../", ".", "/"}, 472 | {"a/.././b", "c", "/c"}, 473 | } 474 | 475 | func TestResolvePath(t *testing.T) { 476 | for _, test := range resolvePathTests { 477 | got := resolvePath(test.base, test.ref) 478 | if got != test.expected { 479 | t.Errorf("For %q + %q got %q; expected %q", test.base, test.ref, got, test.expected) 480 | } 481 | } 482 | } 483 | 484 | var resolveReferenceTests = []struct { 485 | base, rel, expected string 486 | }{ 487 | // Absolute URL references 488 | {"http://foo.com?a=b", "https://bar.com/", "https://bar.com/"}, 489 | {"http://foo.com/", "https://bar.com/?a=b", "https://bar.com/"}, 490 | {"http://foo.com/", "https://bar.com/?", 
"https://bar.com/"}, 491 | 492 | // Path-absolute references 493 | {"http://foo.com/bar", "/baz", "http://foo.com/baz"}, 494 | {"http://foo.com/bar?a=b#f", "/baz", "http://foo.com/baz"}, 495 | {"http://foo.com/bar?a=b", "/baz?", "http://foo.com/baz"}, 496 | {"http://foo.com/bar?a=b", "/baz?c=d", "http://foo.com/baz"}, 497 | 498 | // Multiple slashes 499 | {"http://foo.com/bar", "http://foo.com//baz", "http://foo.com//baz"}, 500 | {"http://foo.com/bar", "http://foo.com///baz/quux", "http://foo.com///baz/quux"}, 501 | 502 | // Scheme-relative 503 | {"https://foo.com/bar?a=b", "//bar.com/quux", "https://bar.com/quux"}, 504 | 505 | // Path-relative references: 506 | 507 | // ... current directory 508 | {"http://foo.com", ".", "http://foo.com/"}, 509 | {"http://foo.com/bar", ".", "http://foo.com/"}, 510 | {"http://foo.com/bar/", ".", "http://foo.com/bar/"}, 511 | 512 | // ... going down 513 | {"http://foo.com", "bar", "http://foo.com/bar"}, 514 | {"http://foo.com/", "bar", "http://foo.com/bar"}, 515 | {"http://foo.com/bar/baz", "quux", "http://foo.com/bar/quux"}, 516 | 517 | // ... going up 518 | {"http://foo.com/bar/baz", "../quux", "http://foo.com/quux"}, 519 | {"http://foo.com/bar/baz", "../../../../../quux", "http://foo.com/quux"}, 520 | {"http://foo.com/bar", "..", "http://foo.com/"}, 521 | {"http://foo.com/bar/baz", "./..", "http://foo.com/"}, 522 | // ".." in the middle (issue 3560) 523 | {"http://foo.com/bar/baz", "quux/dotdot/../tail", "http://foo.com/bar/quux/tail"}, 524 | {"http://foo.com/bar/baz", "quux/./dotdot/../tail", "http://foo.com/bar/quux/tail"}, 525 | {"http://foo.com/bar/baz", "quux/./dotdot/.././tail", "http://foo.com/bar/quux/tail"}, 526 | {"http://foo.com/bar/baz", "quux/./dotdot/./../tail", "http://foo.com/bar/quux/tail"}, 527 | {"http://foo.com/bar/baz", "quux/./dotdot/dotdot/././../../tail", "http://foo.com/bar/quux/tail"}, 528 | {"http://foo.com/bar/baz", "quux/./dotdot/dotdot/./.././../tail", "http://foo.com/bar/quux/tail"}, 529 | {"http://foo.com/bar/baz", "quux/./dotdot/dotdot/dotdot/./../../.././././tail", "http://foo.com/bar/quux/tail"}, 530 | {"http://foo.com/bar/baz", "quux/./dotdot/../dotdot/../dot/./tail/..", "http://foo.com/bar/quux/dot/"}, 531 | 532 | // Remove any dot-segments prior to forming the target URI. 533 | // http://tools.ietf.org/html/rfc3986#section-5.2.4 534 | {"http://foo.com/dot/./dotdot/../foo/bar", "../baz", "http://foo.com/dot/baz"}, 535 | 536 | // Triple dot isn't special 537 | {"http://foo.com/bar", "...", "http://foo.com/..."}, 538 | 539 | // Fragment 540 | {"http://foo.com/bar", ".#frag", "http://foo.com/"}, 541 | {"http://example.org/", "#!$&%27()*+,;=", "http://example.org/"}, 542 | 543 | // Paths with escaping (issue 16947). 
544 | {"http://foo.com/foo%2fbar/", "../baz", "http://foo.com/baz"}, 545 | {"http://foo.com/1/2%2f/3%2f4/5", "../../a/b/c", "http://foo.com/1/a/b/c"}, 546 | {"http://foo.com/1/2/3", "./a%2f../../b/..%2fc", "http://foo.com/1/2/b/..%2fc"}, 547 | {"http://foo.com/1/2%2f/3%2f4/5", "./a%2f../b/../c", "http://foo.com/1/2%2f/3%2f4/a%2f../c"}, 548 | {"http://foo.com/foo%20bar/", "../baz", "http://foo.com/baz"}, 549 | {"http://foo.com/foo", "../bar%2fbaz", "http://foo.com/bar%2fbaz"}, 550 | {"http://foo.com/foo%2dbar/", "./baz-quux", "http://foo.com/foo%2dbar/baz-quux"}, 551 | 552 | // RFC 3986: Normal Examples 553 | // http://tools.ietf.org/html/rfc3986#section-5.4.1 554 | {"http://a/b/c/d;p?q", "g", "http://a/b/c/g"}, 555 | {"http://a/b/c/d;p?q", "./g", "http://a/b/c/g"}, 556 | {"http://a/b/c/d;p?q", "g/", "http://a/b/c/g/"}, 557 | {"http://a/b/c/d;p?q", "/g", "http://a/g"}, 558 | {"http://a/b/c/d;p?q", "//g", "http://g"}, 559 | {"http://a/b/c/d;p?q", "?y", "http://a/b/c/d;p"}, 560 | {"http://a/b/c/d;p?q", "g?y", "http://a/b/c/g"}, 561 | {"http://a/b/c/d;p?q", "#s", "http://a/b/c/d;p"}, 562 | {"http://a/b/c/d;p?q", "g#s", "http://a/b/c/g"}, 563 | {"http://a/b/c/d;p?q", "g?y#s", "http://a/b/c/g"}, 564 | {"http://a/b/c/d;p?q", ";x", "http://a/b/c/;x"}, 565 | {"http://a/b/c/d;p?q", "g;x", "http://a/b/c/g;x"}, 566 | {"http://a/b/c/d;p?q", "g;x?y#s", "http://a/b/c/g;x"}, 567 | {"http://a/b/c/d;p?q", "", "http://a/b/c/d;p"}, 568 | {"http://a/b/c/d;p?q", ".", "http://a/b/c/"}, 569 | {"http://a/b/c/d;p?q", "./", "http://a/b/c/"}, 570 | {"http://a/b/c/d;p?q", "..", "http://a/b/"}, 571 | {"http://a/b/c/d;p?q", "../", "http://a/b/"}, 572 | {"http://a/b/c/d;p?q", "../g", "http://a/b/g"}, 573 | {"http://a/b/c/d;p?q", "../..", "http://a/"}, 574 | {"http://a/b/c/d;p?q", "../../", "http://a/"}, 575 | {"http://a/b/c/d;p?q", "../../g", "http://a/g"}, 576 | 577 | // RFC 3986: Abnormal Examples 578 | // http://tools.ietf.org/html/rfc3986#section-5.4.2 579 | {"http://a/b/c/d;p?q", "../../../g", "http://a/g"}, 580 | {"http://a/b/c/d;p?q", "../../../../g", "http://a/g"}, 581 | {"http://a/b/c/d;p?q", "/./g", "http://a/g"}, 582 | {"http://a/b/c/d;p?q", "/../g", "http://a/g"}, 583 | {"http://a/b/c/d;p?q", "g.", "http://a/b/c/g."}, 584 | {"http://a/b/c/d;p?q", ".g", "http://a/b/c/.g"}, 585 | {"http://a/b/c/d;p?q", "g..", "http://a/b/c/g.."}, 586 | {"http://a/b/c/d;p?q", "..g", "http://a/b/c/..g"}, 587 | {"http://a/b/c/d;p?q", "./../g", "http://a/b/g"}, 588 | {"http://a/b/c/d;p?q", "./g/.", "http://a/b/c/g/"}, 589 | {"http://a/b/c/d;p?q", "g/./h", "http://a/b/c/g/h"}, 590 | {"http://a/b/c/d;p?q", "g/../h", "http://a/b/c/h"}, 591 | {"http://a/b/c/d;p?q", "g;x=1/./y", "http://a/b/c/g;x=1/y"}, 592 | {"http://a/b/c/d;p?q", "g;x=1/../y", "http://a/b/c/y"}, 593 | {"http://a/b/c/d;p?q", "g?y/./x", "http://a/b/c/g"}, 594 | {"http://a/b/c/d;p?q", "g?y/../x", "http://a/b/c/g"}, 595 | {"http://a/b/c/d;p?q", "g#s/./x", "http://a/b/c/g"}, 596 | {"http://a/b/c/d;p?q", "g#s/../x", "http://a/b/c/g"}, 597 | 598 | // Extras. 
599 | {"https://a/b/c/d;p?q", "//g?q", "https://g"}, 600 | {"https://a/b/c/d;p?q", "//g#s", "https://g"}, 601 | {"https://a/b/c/d;p?q", "//g/d/e/f?y#s", "https://g/d/e/f"}, 602 | {"https://a/b/c/d;p#s", "?y", "https://a/b/c/d;p"}, 603 | {"https://a/b/c/d;p?q#s", "?y", "https://a/b/c/d;p"}, 604 | } 605 | 606 | func TestResolveReference(t *testing.T) { 607 | mustParse := func(url string) *URL { 608 | u := new(URL) 609 | err := u.Parse(url) 610 | if err != nil { 611 | t.Fatalf("Parse(%q) got err %v", url, err) 612 | } 613 | return u 614 | } 615 | for _, test := range resolveReferenceTests { 616 | base := mustParse(test.base) 617 | rel := mustParse(test.rel) 618 | var url URL 619 | base.ResolveReference(&url, rel) 620 | if got := url.String(); got != test.expected { 621 | t.Errorf("URL(%q).ResolveReference(%q)\ngot %q\nwant %q", test.base, test.rel, got, test.expected) 622 | } 623 | } 624 | } 625 | 626 | type RequestURITest struct { 627 | url *URL 628 | out string 629 | } 630 | 631 | var requritests = []RequestURITest{ 632 | { 633 | &URL{ 634 | Scheme: SchemeHTTP, 635 | Host: "example.com", 636 | Path: "", 637 | }, 638 | "/", 639 | }, 640 | { 641 | &URL{ 642 | Scheme: SchemeHTTP, 643 | Host: "example.com", 644 | Path: "/a b", 645 | }, 646 | "/a%20b", 647 | }, 648 | { 649 | &URL{ 650 | Scheme: SchemeHTTP, 651 | Host: "example.com", 652 | Path: "//foo", 653 | }, 654 | "//foo", 655 | }, 656 | } 657 | 658 | func TestParseErrors(t *testing.T) { 659 | tests := []struct { 660 | in string 661 | wantErr bool 662 | }{ 663 | {"http://[::1]", false}, 664 | {"http://[::1]:80", false}, 665 | {"http://[::1]:namedport", true}, // rfc3986 3.2.3 666 | {"http://[::1]/", false}, 667 | {"http://[::1]a", true}, 668 | {"http://[::1]%23", true}, 669 | {"http://[::1%25en0]", false}, // valid zone id 670 | {"http://[::1]:", false}, // colon, but no port OK 671 | {"http://[::1]:%38%30", true}, // not allowed: % encoding only for non-ASCII 672 | {"http://[::1%25%41]", false}, // RFC 6874 allows over-escaping in zone 673 | {"http://[%10::1]", true}, // no %xx escapes in IP address 674 | {"http://[::1]/%48", false}, // %xx in path is fine 675 | {"http://%41:8080/", true}, // not allowed: % encoding only for non-ASCII 676 | 677 | {"http://[]%20%48%54%54%50%2f%31%2e%31%0a%4d%79%48%65%61%64%65%72%3a%20%31%32%33%0a%0a/", true}, // golang.org/issue/11208 678 | {"http://a b.com/", true}, // no space in host name please 679 | } 680 | for _, tt := range tests { 681 | var u URL 682 | err := u.Parse(tt.in) 683 | if tt.wantErr { 684 | if err == nil { 685 | t.Errorf("Parse(%q) = %#v; want an error", tt.in, u) 686 | } 687 | continue 688 | } 689 | if err != nil { 690 | t.Logf("Parse(%q) = %v; want no error", tt.in, err) 691 | } 692 | } 693 | } 694 | 695 | type shouldEscapeTest struct { 696 | in byte 697 | mode encoding 698 | escape bool 699 | } 700 | 701 | var shouldEscapeTests = []shouldEscapeTest{ 702 | // Unreserved characters (§2.3) 703 | {'a', encodePath, false}, 704 | {'a', encodeUserPassword, false}, 705 | {'a', encodeQueryComponent, false}, 706 | {'a', encodeFragment, false}, 707 | {'a', encodeHost, false}, 708 | {'z', encodePath, false}, 709 | {'A', encodePath, false}, 710 | {'Z', encodePath, false}, 711 | {'0', encodePath, false}, 712 | {'9', encodePath, false}, 713 | {'-', encodePath, false}, 714 | {'-', encodeUserPassword, false}, 715 | {'-', encodeQueryComponent, false}, 716 | {'-', encodeFragment, false}, 717 | {'.', encodePath, false}, 718 | {'_', encodePath, false}, 719 | {'~', encodePath, false}, 720 | 721 | // User 
information (§3.2.1) 722 | {':', encodeUserPassword, true}, 723 | {'/', encodeUserPassword, true}, 724 | {'?', encodeUserPassword, true}, 725 | {'@', encodeUserPassword, true}, 726 | {'$', encodeUserPassword, false}, 727 | {'&', encodeUserPassword, false}, 728 | {'+', encodeUserPassword, false}, 729 | {',', encodeUserPassword, false}, 730 | {';', encodeUserPassword, false}, 731 | {'=', encodeUserPassword, false}, 732 | 733 | // Host (IP address, IPv6 address, registered name, port suffix; §3.2.2) 734 | {'!', encodeHost, false}, 735 | {'$', encodeHost, false}, 736 | {'&', encodeHost, false}, 737 | {'\'', encodeHost, false}, 738 | {'(', encodeHost, false}, 739 | {')', encodeHost, false}, 740 | {'*', encodeHost, false}, 741 | {'+', encodeHost, false}, 742 | {',', encodeHost, false}, 743 | {';', encodeHost, false}, 744 | {'=', encodeHost, false}, 745 | {':', encodeHost, false}, 746 | {'[', encodeHost, false}, 747 | {']', encodeHost, false}, 748 | {'0', encodeHost, false}, 749 | {'9', encodeHost, false}, 750 | {'A', encodeHost, false}, 751 | {'z', encodeHost, false}, 752 | {'_', encodeHost, false}, 753 | {'-', encodeHost, false}, 754 | {'.', encodeHost, false}, 755 | } 756 | 757 | func TestShouldEscape(t *testing.T) { 758 | for _, tt := range shouldEscapeTests { 759 | if shouldEscape(tt.in, tt.mode) != tt.escape { 760 | t.Errorf("shouldEscape(%q, %v) returned %v; expected %v", tt.in, tt.mode, !tt.escape, tt.escape) 761 | } 762 | } 763 | } 764 | 765 | type timeoutError struct { 766 | timeout bool 767 | } 768 | 769 | func (e *timeoutError) Error() string { return "timeout error" } 770 | func (e *timeoutError) Timeout() bool { return e.timeout } 771 | 772 | type temporaryError struct { 773 | temporary bool 774 | } 775 | 776 | func (e *temporaryError) Error() string { return "temporary error" } 777 | func (e *temporaryError) Temporary() bool { return e.temporary } 778 | 779 | type timeoutTemporaryError struct { 780 | timeoutError 781 | temporaryError 782 | } 783 | 784 | func (e *timeoutTemporaryError) Error() string { return "timeout/temporary error" } 785 | 786 | var netErrorTests = []struct { 787 | err error 788 | timeout bool 789 | temporary bool 790 | }{{ 791 | err: &Error{"Get", "http://google.com/", &timeoutError{timeout: true}}, 792 | timeout: true, 793 | temporary: false, 794 | }, { 795 | err: &Error{"Get", "http://google.com/", &timeoutError{timeout: false}}, 796 | timeout: false, 797 | temporary: false, 798 | }, { 799 | err: &Error{"Get", "http://google.com/", &temporaryError{temporary: true}}, 800 | timeout: false, 801 | temporary: true, 802 | }, { 803 | err: &Error{"Get", "http://google.com/", &temporaryError{temporary: false}}, 804 | timeout: false, 805 | temporary: false, 806 | }, { 807 | err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: true}, temporaryError{temporary: true}}}, 808 | timeout: true, 809 | temporary: true, 810 | }, { 811 | err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: false}, temporaryError{temporary: true}}}, 812 | timeout: false, 813 | temporary: true, 814 | }, { 815 | err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: true}, temporaryError{temporary: false}}}, 816 | timeout: true, 817 | temporary: false, 818 | }, { 819 | err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: false}, temporaryError{temporary: false}}}, 820 | timeout: false, 821 | temporary: false, 822 | }, { 823 | err: &Error{"Get", "http://google.com/", 
io.EOF}, 824 | timeout: false, 825 | temporary: false, 826 | }} 827 | 828 | // Test that url.Error implements net.Error and that it forwards 829 | func TestURLErrorImplementsNetError(t *testing.T) { 830 | for i, tt := range netErrorTests { 831 | err, ok := tt.err.(net.Error) 832 | if !ok { 833 | t.Errorf("%d: %T does not implement net.Error", i+1, tt.err) 834 | continue 835 | } 836 | if err.Timeout() != tt.timeout { 837 | t.Errorf("%d: err.Timeout(): got %v, want %v", i+1, err.Timeout(), tt.timeout) 838 | continue 839 | } 840 | if err.Temporary() != tt.temporary { 841 | t.Errorf("%d: err.Temporary(): got %v, want %v", i+1, err.Temporary(), tt.temporary) 842 | } 843 | } 844 | } 845 | 846 | var _ encodingPkg.BinaryMarshaler = (*URL)(nil) 847 | var _ encodingPkg.BinaryUnmarshaler = (*URL)(nil) 848 | 849 | func TestJSON(t *testing.T) { 850 | var u URL 851 | err := u.Parse("https://www.google.com/x?y=z") 852 | if err != nil { 853 | t.Fatal(err) 854 | } 855 | js, err := json.Marshal(&u) 856 | if err != nil { 857 | t.Fatal(err) 858 | } 859 | 860 | // If only we could implement TextMarshaler/TextUnmarshaler, 861 | // this would work: 862 | // 863 | // if string(js) != strconv.Quote(u.String()) { 864 | // t.Errorf("json encoding: %s\nwant: %s\n", js, strconv.Quote(u.String())) 865 | // } 866 | 867 | u1 := new(URL) 868 | err = json.Unmarshal(js, u1) 869 | if err != nil { 870 | t.Fatal(err) 871 | } 872 | if u1.String() != u.String() { 873 | t.Errorf("json decoded to: %s\nwant: %s\n", u1, &u) 874 | } 875 | } 876 | 877 | func TestGob(t *testing.T) { 878 | var u URL 879 | err := u.Parse("https://www.google.com/x?y=z") 880 | if err != nil { 881 | t.Fatal(err) 882 | } 883 | var w bytes.Buffer 884 | err = gob.NewEncoder(&w).Encode(&u) 885 | if err != nil { 886 | t.Fatal(err) 887 | } 888 | 889 | u1 := new(URL) 890 | err = gob.NewDecoder(&w).Decode(u1) 891 | if err != nil { 892 | t.Fatal(err) 893 | } 894 | if u1.String() != u.String() { 895 | t.Errorf("json decoded to: %s\nwant: %s\n", u1, &u) 896 | } 897 | } 898 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/terorie/od-database-crawler 2 | 3 | require ( 4 | github.com/beeker1121/goque v2.0.1+incompatible 5 | github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect 6 | github.com/sirupsen/logrus v1.4.0 7 | github.com/spf13/cobra v0.0.3 8 | github.com/spf13/viper v1.3.2 9 | github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2 // indirect 10 | github.com/valyala/fasthttp v1.2.0 11 | golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613 12 | golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3 13 | ) 14 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= 2 | github.com/beeker1121/goque v2.0.1+incompatible h1:5nJHPMqQLxUvGFc8m/NW2QzxKyc0zICmqs/JUsmEjwE= 3 | github.com/beeker1121/goque v2.0.1+incompatible/go.mod h1:L6dOWBhDOnxUVQsb0wkLve0VCnt2xJW/MI8pdRX4ANw= 4 | github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= 5 | github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk= 6 | github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= 
7 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 8 | github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I= 9 | github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= 10 | github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w= 11 | github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= 12 | github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= 13 | github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= 14 | github.com/klauspost/compress v1.4.0 h1:8nsMz3tWa9SWWPL60G1V6CUsf4lLjWLTNEtibhe8gh8= 15 | github.com/klauspost/compress v1.4.0/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= 16 | github.com/klauspost/cpuid v0.0.0-20180405133222-e7e905edc00e h1:+lIPJOWl+jSiJOc70QXJ07+2eg2Jy2EC7Mi11BWujeM= 17 | github.com/klauspost/cpuid v0.0.0-20180405133222-e7e905edc00e/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= 18 | github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= 19 | github.com/magiconair/properties v1.8.0 h1:LLgXmsheXeRoUOBOjtwPQCWIYqM/LU1ayDtDePerRcY= 20 | github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= 21 | github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQzvN1EDeE= 22 | github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= 23 | github.com/pelletier/go-toml v1.2.0 h1:T5zMGML61Wp+FlcbWjRDT7yAxhJNAiPPLOFECq181zc= 24 | github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= 25 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 26 | github.com/sirupsen/logrus v1.3.0 h1:hI/7Q+DtNZ2kINb6qt/lS+IyXnHQe9e90POfeewL/ME= 27 | github.com/sirupsen/logrus v1.3.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= 28 | github.com/sirupsen/logrus v1.4.0 h1:yKenngtzGh+cUSSh6GWbxW2abRqhYUSR/t/6+2QqNvE= 29 | github.com/sirupsen/logrus v1.4.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= 30 | github.com/spf13/afero v1.1.2 h1:m8/z1t7/fwjysjQRYbP0RD+bUIF/8tJwPdEZsI83ACI= 31 | github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= 32 | github.com/spf13/cast v1.3.0 h1:oget//CVOEoFewqQxwr0Ej5yjygnqGkvggSE/gB35Q8= 33 | github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= 34 | github.com/spf13/cobra v0.0.3 h1:ZlrZ4XsMRm04Fr5pSFxBgfND2EBVa1nLpiy1stUsX/8= 35 | github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ= 36 | github.com/spf13/jwalterweatherman v1.0.0 h1:XHEdyB+EcvlqZamSM4ZOMGlc93t6AcsBEu9Gc1vn7yk= 37 | github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= 38 | github.com/spf13/pflag v1.0.3 h1:zPAT6CGy6wXeQ7NtTnaTerfKOsV6V6F8agHXFiazDkg= 39 | github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= 40 | github.com/spf13/viper v1.3.1 h1:5+8j8FTpnFV4nEImW/ofkzEt8VoOiLXxdYIDsB73T38= 41 | github.com/spf13/viper v1.3.1/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s= 42 | github.com/spf13/viper v1.3.2 h1:VUFqw5KcqRf7i70GOzW7N+Q7+gxVBkSSqiXB12+JQ4M= 43 | github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s= 44 | github.com/stretchr/objx v0.1.1/go.mod 
h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 45 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 46 | github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2 h1:GnOzE5fEFN3b2zDhJJABEofdb51uMRNb8eqIVtdducs= 47 | github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2/go.mod h1:Z4AUp2Km+PwemOoO/VB5AOx9XSsIItzFjoJlOSiYmn0= 48 | github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= 49 | github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= 50 | github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= 51 | github.com/valyala/fasthttp v1.1.0 h1:3BohG7mqwj4lq7PTX//7gLbUlzNvZSPmuHFnloXT0lw= 52 | github.com/valyala/fasthttp v1.1.0/go.mod h1:4vX61m6KN+xDduDNwXrhIAVZaZaZiQ1luJk8LWSxF3s= 53 | github.com/valyala/fasthttp v1.2.0 h1:dzZJf2IuMiclVjdw0kkT+f9u4YdrapbNyGAN47E/qnk= 54 | github.com/valyala/fasthttp v1.2.0/go.mod h1:4vX61m6KN+xDduDNwXrhIAVZaZaZiQ1luJk8LWSxF3s= 55 | github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio= 56 | github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= 57 | golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= 58 | golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= 59 | golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613 h1:MQ/ZZiDsUapFFiMS+vzwXkCTeEKaum+Do5rINYJDmxc= 60 | golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= 61 | golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3 h1:czFLhve3vsQetD6JOJ8NZZvGQIXlnN3/yXxbT6/awxI= 62 | golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 63 | golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 64 | golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a h1:1n5lsVfiQW3yfsRGu98756EH1YthsFqr/5mxHduZW2A= 65 | golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 66 | golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= 67 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 68 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 69 | gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= 70 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 71 | -------------------------------------------------------------------------------- /help.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | const helpText = 4 | `HTTP crawler for the OD-Database 5 | DB >> https://od-db.the-eye.eu << 6 | Crawler >> https://github.com/terorie/od-database-crawler << 7 | Server >> https://github.com/simon987/od-database << 8 | 9 | Quick start: 10 | - get config file (config.yml in working dir) 11 | - get OD-DB server ("server.url": Database URL + /api) 12 | - get access token ("server.token": e.g. c010b6dd-20...) 13 | - ./od-database-crawler server 14 | 15 | Questions? 
Discord @terorie#2664 / Telegram @terorie` 16 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "github.com/sirupsen/logrus" 7 | "github.com/spf13/cobra" 8 | "github.com/spf13/viper" 9 | "github.com/terorie/od-database-crawler/fasturl" 10 | "os" 11 | "os/signal" 12 | "strings" 13 | "sync/atomic" 14 | "time" 15 | ) 16 | 17 | var configFile string 18 | 19 | var rootCmd = cobra.Command { 20 | Use: "od-database-crawler", 21 | Version: "1.2.2", 22 | Short: "OD-Database Go crawler", 23 | Long: helpText, 24 | PersistentPreRunE: preRun, 25 | PersistentPostRun: func(cmd *cobra.Command, args []string) { 26 | exitHooks.Execute() 27 | }, 28 | } 29 | 30 | var serverCmd = cobra.Command { 31 | Use: "server", 32 | Short: "Start crawl server", 33 | Long: "Connect to the OD-Database and contribute to the database\n" + 34 | "by crawling the web for open directories!", 35 | Run: cmdBase, 36 | } 37 | 38 | var crawlCmd = cobra.Command { 39 | Use: "crawl", 40 | Short: "Crawl an URL", 41 | Long: "Crawl the URL specified.\n" + 42 | "Results will not be uploaded to the database,\n" + 43 | "they're saved under crawled/0.json instead.\n" + 44 | "Primarily used for testing and benchmarking.", 45 | RunE: cmdCrawler, 46 | Args: cobra.ExactArgs(1), 47 | } 48 | 49 | var exitHooks Hooks 50 | 51 | func init() { 52 | rootCmd.AddCommand(&crawlCmd) 53 | rootCmd.AddCommand(&serverCmd) 54 | 55 | prepareConfig() 56 | } 57 | 58 | func preRun(cmd *cobra.Command, args []string) error { 59 | if err := os.MkdirAll("crawled", 0755); 60 | err != nil { panic(err) } 61 | 62 | if err := os.MkdirAll("queue", 0755); 63 | err != nil { panic(err) } 64 | 65 | return nil 66 | } 67 | 68 | func main() { 69 | err := rootCmd.Execute() 70 | if err != nil { 71 | fmt.Fprintln(os.Stderr, err) 72 | os.Exit(1) 73 | } 74 | } 75 | 76 | func cmdBase(_ *cobra.Command, _ []string) { 77 | onlineMode = true 78 | readConfig() 79 | 80 | appCtx, soft := context.WithCancel(context.Background()) 81 | forceCtx, hard := context.WithCancel(context.Background()) 82 | go hardShutdown(forceCtx) 83 | go listenCtrlC(soft, hard) 84 | 85 | inRemotes := make(chan *OD) 86 | go Schedule(appCtx, inRemotes) 87 | 88 | ticker := time.NewTicker(config.Recheck) 89 | defer ticker.Stop() 90 | for { 91 | select { 92 | case <-appCtx.Done(): 93 | goto shutdown 94 | case <-ticker.C: 95 | t, err := FetchTask() 96 | if err != nil { 97 | logrus.WithError(err). 98 | Error("Failed to get new task") 99 | if !sleep(viper.GetDuration(ConfCooldown), appCtx) { 100 | goto shutdown 101 | } 102 | continue 103 | } 104 | if t == nil { 105 | // No new task 106 | if atomic.LoadInt32(&numActiveTasks) == 0 { 107 | logrus.Info("Waiting …") 108 | } 109 | continue 110 | } 111 | 112 | var baseUri fasturl.URL 113 | err = baseUri.Parse(t.Url) 114 | if urlErr, ok := err.(*fasturl.Error); ok && urlErr.Err == fasturl.ErrUnknownScheme { 115 | // Not an error 116 | err = nil 117 | // TODO FTP crawler 118 | continue 119 | } else if err != nil { 120 | logrus.WithError(err). 
121 | Error("Failed to parse task URL") 122 | time.Sleep(viper.GetDuration(ConfCooldown)) 123 | continue 124 | } 125 | ScheduleTask(inRemotes, t, &baseUri) 126 | } 127 | } 128 | 129 | shutdown: 130 | globalWait.Wait() 131 | } 132 | 133 | func cmdCrawler(_ *cobra.Command, args []string) error { 134 | onlineMode = false 135 | readConfig() 136 | 137 | arg := args[0] 138 | // https://github.com/golang/go/issues/19779 139 | if !strings.Contains(arg, "://") { 140 | arg = "http://" + arg 141 | } 142 | var u fasturl.URL 143 | err := u.Parse(arg) 144 | if err != nil { return err } 145 | if !strings.HasSuffix(u.Path, "/") { 146 | u.Path += "/" 147 | } 148 | 149 | // TODO Graceful shutdown 150 | forceCtx := context.Background() 151 | 152 | inRemotes := make(chan *OD) 153 | go Schedule(forceCtx, inRemotes) 154 | 155 | ticker := time.NewTicker(3 * time.Second) 156 | defer ticker.Stop() 157 | 158 | task := Task { 159 | WebsiteId: 0, 160 | Url: u.String(), 161 | } 162 | ScheduleTask(inRemotes, &task, &u) 163 | 164 | // Wait for all jobs to finish 165 | globalWait.Wait() 166 | 167 | return nil 168 | } 169 | 170 | func listenCtrlC(soft, hard context.CancelFunc) { 171 | c := make(chan os.Signal, 1) 172 | signal.Notify(c, os.Interrupt) 173 | 174 | <-c 175 | logrus.Info(">>> Shutting down crawler... <<<") 176 | soft() 177 | 178 | <-c 179 | logrus.Warning(">>> Force shutdown! <<<") 180 | hard() 181 | } 182 | 183 | func hardShutdown(c context.Context) { 184 | <-c.Done() 185 | os.Exit(1) 186 | } 187 | 188 | func sleep(d time.Duration, c context.Context) bool { 189 | select { 190 | case <-time.After(d): 191 | return true 192 | case <-c.Done(): 193 | return false 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /model.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/terorie/od-database-crawler/ds/redblackhash" 5 | "github.com/terorie/od-database-crawler/fasturl" 6 | "sync" 7 | "time" 8 | ) 9 | 10 | type Task struct { 11 | WebsiteId uint64 `json:"website_id"` 12 | Url string `json:"url"` 13 | } 14 | 15 | type TaskResult struct { 16 | StatusCode string `json:"status_code"` 17 | FileCount uint64 `json:"file_count"` 18 | ErrorCount uint64 `json:"-"` 19 | StartTime time.Time `json:"-"` 20 | StartTimeUnix int64 `json:"start_time"` 21 | EndTimeUnix int64 `json:"end_time"` 22 | WebsiteId uint64 `json:"website_id"` 23 | } 24 | 25 | type Job struct { 26 | Uri fasturl.URL 27 | UriStr string 28 | Fails int 29 | LastError error 30 | } 31 | 32 | type OD struct { 33 | Task Task 34 | Result TaskResult 35 | Wait sync.WaitGroup 36 | BaseUri fasturl.URL 37 | WCtx WorkerContext 38 | Scanned redblackhash.Tree 39 | } 40 | 41 | type File struct { 42 | Name string `json:"name"` 43 | Size int64 `json:"size"` 44 | MTime int64 `json:"mtime"` 45 | Path string `json:"path"` 46 | IsDir bool `json:"-"` 47 | } 48 | 49 | func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) { 50 | o.Scanned.Lock() 51 | defer o.Scanned.Unlock() 52 | 53 | exists = o.Scanned.Get(k) 54 | if exists { return true } 55 | 56 | o.Scanned.Put(k) 57 | return false 58 | } 59 | 60 | type errorString string 61 | func (e errorString) Error() string { 62 | return string(e) 63 | } 64 | -------------------------------------------------------------------------------- /queue.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/beeker1121/goque" 5 | "os" 6 | 
"sync" 7 | "sync/atomic" 8 | ) 9 | 10 | type BufferedQueue struct { 11 | dataDir string 12 | q *goque.Queue 13 | buf []Job 14 | m sync.Mutex 15 | } 16 | 17 | func OpenQueue(dataDir string) (bq *BufferedQueue, err error) { 18 | bq = new(BufferedQueue) 19 | if config.JobBufferSize < 0 { 20 | return 21 | } 22 | bq.dataDir = dataDir 23 | bq.q, err = goque.OpenQueue(dataDir) 24 | if err != nil { return nil, err } 25 | return 26 | } 27 | 28 | func (q *BufferedQueue) Enqueue(job *Job) error { 29 | atomic.AddInt64(&totalQueued, 1) 30 | if q.directEnqueue(job) { 31 | return nil 32 | } 33 | 34 | var gob JobGob 35 | gob.ToGob(job) 36 | _, err := q.q.EnqueueObject(gob) 37 | return err 38 | } 39 | 40 | func (q *BufferedQueue) Dequeue() (job Job, err error) { 41 | if q.directDequeue(&job) { 42 | atomic.AddInt64(&totalQueued, -1) 43 | return job, nil 44 | } 45 | 46 | if config.JobBufferSize < 0 { 47 | err = goque.ErrEmpty 48 | return 49 | } 50 | 51 | var item *goque.Item 52 | item, err = q.q.Dequeue() 53 | if err != nil { return } 54 | 55 | atomic.AddInt64(&totalQueued, -1) 56 | 57 | var gob JobGob 58 | err = item.ToObject(&gob) 59 | if err != nil { return } 60 | gob.FromGob(&job) 61 | 62 | return 63 | } 64 | 65 | func (q *BufferedQueue) directEnqueue(job *Job) bool { 66 | q.m.Lock() 67 | defer q.m.Unlock() 68 | 69 | bs := config.JobBufferSize 70 | if len(q.buf) < bs || bs < 0 { 71 | q.buf = append(q.buf, *job) 72 | return true 73 | } else { 74 | return false 75 | } 76 | } 77 | 78 | func (q *BufferedQueue) directDequeue(job *Job) bool { 79 | q.m.Lock() 80 | defer q.m.Unlock() 81 | 82 | if len(q.buf) > 0 { 83 | *job = q.buf[0] 84 | q.buf = q.buf[1:] 85 | return true 86 | } else { 87 | return false 88 | } 89 | } 90 | 91 | // Always returns nil (But implements io.Closer) 92 | func (q *BufferedQueue) Close() error { 93 | if config.JobBufferSize < 0 { 94 | return nil 95 | } 96 | 97 | // Close ignoring errors 98 | q.q.Close() 99 | 100 | // Delete files 101 | if err := os.RemoveAll(q.dataDir); 102 | err != nil { panic(err) } 103 | 104 | return nil 105 | } 106 | 107 | type JobGob struct { 108 | Uri string 109 | Fails int 110 | LastError string 111 | } 112 | 113 | func (g *JobGob) ToGob(j *Job) { 114 | g.Uri = j.UriStr 115 | g.Fails = j.Fails 116 | if j.LastError != nil { 117 | g.LastError = j.LastError.Error() 118 | } 119 | } 120 | 121 | func (g *JobGob) FromGob(j *Job) { 122 | if err := j.Uri.Parse(g.Uri); 123 | err != nil { panic(err) } 124 | j.UriStr = g.Uri 125 | j.Fails = g.Fails 126 | if g.LastError != "" { 127 | j.LastError = errorString(g.LastError) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | appname="od-database-crawler" 4 | tag=$1 5 | [ -z "$tag" ] && echo "Usage: build " && exit 1 6 | 7 | name=${appname}-${tag}-windows.exe 8 | GOOS="windows" GOARCH="amd64" go build -ldflags="-s -w" -o $name 9 | gzip -f $name 10 | echo $name 11 | 12 | name=${appname}-${tag}-linux 13 | GOOS="linux" GOARCH="amd64" go build -ldflags="-s -w" -o $name 14 | gzip -f $name 15 | echo $name 16 | 17 | name=${appname}-${tag}-mac 18 | GOOS="darwin" GOARCH="amd64" go build -ldflags="-s -w" -o $name 19 | gzip -f $name 20 | echo $name 21 | 22 | name=${appname}-${tag}-freebsd 23 | GOOS="freebsd" GOARCH="amd64" go build -ldflags="-s -w" -o $name 24 | gzip -f $name 25 | echo $name 26 | 
-------------------------------------------------------------------------------- /scheduler.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "github.com/sirupsen/logrus" 8 | "github.com/terorie/od-database-crawler/fasturl" 9 | "os" 10 | "path" 11 | "sync" 12 | "sync/atomic" 13 | "time" 14 | ) 15 | 16 | var activeTasksLock sync.Mutex 17 | var activeTasks = make(map[uint64]bool) 18 | var numActiveTasks int32 19 | var totalQueued int64 20 | 21 | func Schedule(c context.Context, remotes <-chan *OD) { 22 | go Stats(c) 23 | 24 | for remote := range remotes { 25 | logrus.WithField("url", remote.BaseUri.String()). 26 | Info("Starting crawler") 27 | 28 | // Collect results 29 | results := make(chan File) 30 | 31 | remote.WCtx.OD = remote 32 | 33 | // Get queue path 34 | queuePath := path.Join("queue", fmt.Sprintf("%d", remote.Task.WebsiteId)) 35 | 36 | // Delete existing queue 37 | if err := os.RemoveAll(queuePath); 38 | err != nil { panic(err) } 39 | 40 | // Start new queue 41 | var err error 42 | remote.WCtx.Queue, err = OpenQueue(queuePath) 43 | if err != nil { panic(err) } 44 | 45 | // Spawn workers 46 | for i := 0; i < config.Workers; i++ { 47 | go remote.WCtx.Worker(results) 48 | } 49 | 50 | // Enqueue initial job 51 | atomic.AddInt32(&numActiveTasks, 1) 52 | remote.WCtx.queueJob(Job{ 53 | Uri: remote.BaseUri, 54 | UriStr: remote.BaseUri.String(), 55 | Fails: 0, 56 | }) 57 | 58 | // Upload result when ready 59 | go remote.Watch(results) 60 | 61 | // Sleep if max number of tasks are active 62 | for atomic.LoadInt32(&numActiveTasks) > config.Tasks { 63 | select { 64 | case <-c.Done(): 65 | return 66 | case <-time.After(time.Second): 67 | continue 68 | } 69 | } 70 | } 71 | } 72 | 73 | func ScheduleTask(remotes chan<- *OD, t *Task, u *fasturl.URL) { 74 | if !t.register() { 75 | return 76 | } 77 | 78 | globalWait.Add(1) 79 | now := time.Now() 80 | od := &OD { 81 | Task: *t, 82 | BaseUri: *u, 83 | Result: TaskResult { 84 | WebsiteId: t.WebsiteId, 85 | StartTime: now, 86 | StartTimeUnix: now.Unix(), 87 | }, 88 | } 89 | remotes <- od 90 | } 91 | 92 | func (t *Task) register() bool { 93 | activeTasksLock.Lock() 94 | defer activeTasksLock.Unlock() 95 | 96 | if _, known := activeTasks[t.WebsiteId]; known { 97 | return false 98 | } else { 99 | activeTasks[t.WebsiteId] = true 100 | return true 101 | } 102 | } 103 | 104 | func (t *Task) unregister() { 105 | activeTasksLock.Lock() 106 | delete(activeTasks, t.WebsiteId) 107 | activeTasksLock.Unlock() 108 | } 109 | 110 | func (o *OD) Watch(results chan File) { 111 | // Mark job as completely done 112 | defer globalWait.Done() 113 | defer o.Task.unregister() 114 | 115 | filePath := path.Join("crawled", fmt.Sprintf("%d.json", o.Task.WebsiteId)) 116 | 117 | // Open crawl results file 118 | f, err := os.OpenFile( 119 | filePath, 120 | os.O_CREATE | os.O_RDWR | os.O_TRUNC, 121 | 0644, 122 | ) 123 | if err != nil { 124 | logrus.WithError(err). 125 | Error("Failed saving crawl results") 126 | return 127 | } 128 | defer f.Close() 129 | defer os.Remove(filePath) 130 | 131 | // Listen for exit code of Collect() 132 | collectErrC := make(chan error) 133 | 134 | // Block until all results are written 135 | // (closes results channel) 136 | o.handleCollect(results, f, collectErrC) 137 | 138 | // Exit code of Collect() 139 | err = <-collectErrC 140 | close(collectErrC) 141 | if err != nil { 142 | logrus.WithError(err). 
143 | Error("Failed saving crawl results") 144 | return 145 | } 146 | 147 | // Upload results 148 | err = PushResult(&o.Result, f) 149 | if err != nil { 150 | logrus.WithError(err). 151 | Error("Failed uploading crawl results") 152 | return 153 | } 154 | } 155 | 156 | func (o *OD) handleCollect(results chan File, f *os.File, collectErrC chan error) { 157 | // Begin collecting results 158 | go o.Task.Collect(results, f, collectErrC) 159 | defer close(results) 160 | 161 | // Wait for all jobs on remote to finish 162 | o.Wait.Wait() 163 | 164 | // Close queue 165 | if err := o.WCtx.Queue.Close(); err != nil { 166 | panic(err) 167 | } 168 | atomic.AddInt32(&numActiveTasks, -1) 169 | 170 | // Log finish 171 | 172 | logrus.WithFields(logrus.Fields{ 173 | "id": o.Task.WebsiteId, 174 | "url": o.BaseUri.String(), 175 | "duration": time.Since(o.Result.StartTime), 176 | }).Info("Crawler finished") 177 | 178 | // Set status code 179 | now := time.Now() 180 | o.Result.EndTimeUnix = now.Unix() 181 | fileCount := atomic.LoadUint64(&o.Result.FileCount) 182 | if fileCount == 0 { 183 | errorCount := atomic.LoadUint64(&o.Result.ErrorCount) 184 | if errorCount == 0 { 185 | o.Result.StatusCode = "empty" 186 | } else { 187 | o.Result.StatusCode = "directory listing failed" 188 | } 189 | } else { 190 | o.Result.StatusCode = "success" 191 | } 192 | } 193 | 194 | func (t *Task) Collect(results chan File, f *os.File, errC chan<- error) { 195 | err := t.collect(results, f) 196 | if err != nil { 197 | logrus.WithError(err). 198 | Error("Failed saving crawl results") 199 | } 200 | errC <- err 201 | } 202 | 203 | func (t *Task) collect(results chan File, f *os.File) error { 204 | for result := range results { 205 | result.Path = fasturl.PathUnescape(result.Path) 206 | result.Name = fasturl.PathUnescape(result.Name) 207 | resJson, err := json.Marshal(result) 208 | if err != nil { panic(err) } 209 | _, err = f.Write(resJson) 210 | if err != nil { return err } 211 | _, err = f.Write([]byte{'\n'}) 212 | if err != nil { return err } 213 | } 214 | 215 | return nil 216 | } 217 | -------------------------------------------------------------------------------- /server.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "fmt" 7 | "github.com/sirupsen/logrus" 8 | "github.com/spf13/viper" 9 | "io" 10 | "mime/multipart" 11 | "net/http" 12 | "net/url" 13 | "os" 14 | "strconv" 15 | "time" 16 | ) 17 | 18 | var serverClient = http.Client { 19 | Timeout: config.ServerTimeout, 20 | Transport: new(ServerTripper), 21 | } 22 | 23 | var serverUserAgent = "od-database-crawler/" + rootCmd.Version 24 | 25 | func FetchTask() (t *Task, err error) { 26 | res, err := serverClient.PostForm( 27 | config.ServerUrl + "/task/get", 28 | url.Values{ "token": {config.Token} }) 29 | if err != nil { return } 30 | defer res.Body.Close() 31 | 32 | switch res.StatusCode { 33 | case 200: 34 | break 35 | case 404, 500: 36 | return nil, nil 37 | default: 38 | return nil, fmt.Errorf("http %s", res.Status) 39 | } 40 | 41 | t = new(Task) 42 | err = json.NewDecoder(res.Body).Decode(t) 43 | if _, ok := err.(*json.SyntaxError); ok { 44 | return nil, fmt.Errorf("/task/get returned invalid JSON") 45 | } else if err != nil { return } 46 | 47 | return 48 | } 49 | 50 | func PushResult(result *TaskResult, f *os.File) (err error) { 51 | if result.WebsiteId == 0 { 52 | // Not a real result, don't push 53 | return nil 54 | } 55 | 56 | // Rewind to the beginning of the file 57 | 
_, err = f.Seek(0, 0) 58 | if err != nil { 59 | return 60 | } 61 | 62 | err = uploadChunks(result.WebsiteId, f) 63 | if err != nil { 64 | logrus.Errorf("Failed to upload file list: %s", err) 65 | err2 := CancelTask(result.WebsiteId) 66 | if err2 != nil { 67 | logrus.Error(err2) 68 | } 69 | return 70 | } 71 | 72 | // Upload result ignoring errors 73 | uploadResult(result) 74 | 75 | return 76 | } 77 | 78 | func uploadChunks(websiteId uint64, f *os.File) error { 79 | eof := false 80 | for iter := 1; !eof; iter++ { 81 | // TODO Stream with io.Pipe? 82 | var b bytes.Buffer 83 | 84 | multi := multipart.NewWriter(&b) 85 | 86 | // Set upload fields 87 | var err error 88 | err = multi.WriteField("token", config.Token) 89 | if err != nil { return err } 90 | err = multi.WriteField("website_id", fmt.Sprintf("%d", websiteId)) 91 | if err != nil { return err } 92 | 93 | // Copy chunk to file_list 94 | formFile, err := multi.CreateFormFile("file_list", "file_list") 95 | if err != nil { return err } 96 | n, err := io.CopyN(formFile, f, config.ChunkSize) 97 | if err != io.EOF && err != nil { 98 | return err 99 | } 100 | if n == 0 { 101 | // Don't upload, no content 102 | return nil 103 | } else if n < config.ChunkSize { 104 | err = nil 105 | // Break at end of iteration 106 | eof = true 107 | } 108 | 109 | multi.Close() 110 | 111 | for retries := 0; retries < viper.GetInt(ConfUploadRetries); retries++ { 112 | if retries > 0 { 113 | // Error occurred, retry upload 114 | time.Sleep(viper.GetDuration(ConfUploadRetryInterval)) 115 | } 116 | 117 | req, err := http.NewRequest( 118 | http.MethodPost, 119 | config.ServerUrl + "/task/upload", 120 | &b) 121 | if err != nil { continue } 122 | req.Header.Set("content-type", multi.FormDataContentType()) 123 | 124 | res, err := serverClient.Do(req) 125 | if err != nil { continue } 126 | res.Body.Close() 127 | 128 | if res.StatusCode != http.StatusOK { 129 | logrus.WithField("status", res.Status). 130 | WithField("part", iter). 131 | Errorf("Upload failed") 132 | continue 133 | } 134 | 135 | // Upload successful 136 | break 137 | } 138 | 139 | logrus.WithField("id", websiteId). 140 | WithField("part", iter). 
141 | Infof("Uploaded files chunk") 142 | } 143 | return nil 144 | } 145 | 146 | func uploadResult(result *TaskResult) (err error) { 147 | resultEnc, err := json.Marshal(result) 148 | if err != nil { panic(err) } 149 | 150 | res, err := serverClient.PostForm( 151 | config.ServerUrl + "/task/complete", 152 | url.Values { 153 | "token": {config.Token}, 154 | "result": {string(resultEnc)}, 155 | }, 156 | ) 157 | if err != nil { return } 158 | res.Body.Close() 159 | 160 | if res.StatusCode != http.StatusOK { 161 | return HttpError{res.StatusCode} 162 | } 163 | 164 | return 165 | } 166 | 167 | func CancelTask(websiteId uint64) (err error) { 168 | res, err := serverClient.PostForm( 169 | config.ServerUrl + "/task/cancel", 170 | url.Values{ 171 | "token": {config.Token}, 172 | "website_id": {strconv.FormatUint(websiteId, 10)}, 173 | }, 174 | ) 175 | if err != nil { return } 176 | res.Body.Close() 177 | 178 | if res.StatusCode != http.StatusOK { 179 | return fmt.Errorf("failed to cancel task: %s", res.Status) 180 | } 181 | 182 | return 183 | } 184 | 185 | type ServerTripper struct{} 186 | 187 | func (t *ServerTripper) RoundTrip(req *http.Request) (res *http.Response, err error) { 188 | req.Header.Set("User-Agent", serverUserAgent) 189 | return http.DefaultTransport.RoundTrip(req) 190 | } 191 | -------------------------------------------------------------------------------- /stats.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "github.com/sirupsen/logrus" 6 | "github.com/spf13/viper" 7 | "math" 8 | "runtime" 9 | "sync/atomic" 10 | "time" 11 | ) 12 | 13 | var totalStarted uint64 14 | var totalDone uint64 15 | var totalRetries uint64 16 | var totalAborted uint64 17 | 18 | func Stats(c context.Context) { 19 | var startedLast uint64 = 0 20 | var crawlTicker <-chan time.Time 21 | var allocTicker <-chan time.Time 22 | 23 | crawlInterval := viper.GetDuration(ConfCrawlStats) 24 | allocInterval := viper.GetDuration(ConfAllocStats) 25 | 26 | if crawlInterval != 0 { 27 | crawlTicker = time.Tick(crawlInterval) 28 | } 29 | if allocInterval != 0 { 30 | allocTicker = time.Tick(allocInterval) 31 | } 32 | 33 | for { 34 | select { 35 | case <-crawlTicker: 36 | startedNow := atomic.LoadUint64(&totalStarted) 37 | 38 | perSecond := float64(startedNow - startedLast) / 39 | crawlInterval.Seconds() 40 | 41 | // Round to .5 42 | perSecond *= 2 43 | perSecond = math.Round(perSecond) 44 | perSecond /= 2 45 | 46 | if perSecond <= 0 { 47 | continue 48 | } 49 | 50 | logrus.WithFields(logrus.Fields{ 51 | "per_second": perSecond, 52 | "done": atomic.LoadUint64(&totalDone), 53 | "retries": atomic.LoadUint64(&totalRetries), 54 | "aborted": atomic.LoadUint64(&totalAborted), 55 | }).Info("Crawl Stats") 56 | 57 | startedLast = startedNow 58 | 59 | case <-allocTicker: 60 | var mem runtime.MemStats 61 | runtime.ReadMemStats(&mem) 62 | 63 | logrus.WithFields(logrus.Fields{ 64 | "queue_count": atomic.LoadInt64(&totalQueued), 65 | "heap": FormatByteCount(mem.Alloc), 66 | "objects": mem.HeapObjects, 67 | "num_gc": mem.NumGC, 68 | }).Info("Resource Stats") 69 | 70 | case <-c.Done(): 71 | return 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | ) 7 | 8 | // https://programming.guide/go/formatting-byte-size-to-human-readable-format.html 9 | func 
FormatByteCount(b uint64) string { 10 | const unit = 1024 11 | if b < unit { 12 | return fmt.Sprintf("%d B", b) 13 | } else { 14 | div, exp := int64(unit), 0 15 | for n := b / unit; n >= unit; n /= unit { 16 | div *= unit 17 | exp++ 18 | } 19 | return fmt.Sprintf("%.1f %ciB", float64(b)/float64(div), "KMGTPE"[exp]) 20 | } 21 | } 22 | 23 | type Hooks struct { 24 | m sync.Mutex 25 | l []func() 26 | } 27 | 28 | func (h *Hooks) Add(hook func()) { 29 | h.m.Lock() 30 | h.l = append(h.l, hook) 31 | h.m.Unlock() 32 | } 33 | 34 | func (h *Hooks) Execute() { 35 | for _, hook := range h.l { 36 | hook() 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /worker.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/beeker1121/goque" 5 | "github.com/sirupsen/logrus" 6 | "math" 7 | "sort" 8 | "strings" 9 | "sync" 10 | "sync/atomic" 11 | "time" 12 | ) 13 | 14 | var globalWait sync.WaitGroup 15 | 16 | type WorkerContext struct { 17 | OD *OD 18 | Queue *BufferedQueue 19 | lastRateLimit time.Time 20 | numRateLimits int 21 | } 22 | 23 | func (w *WorkerContext) Worker(results chan<- File) { 24 | for { 25 | job, err := w.Queue.Dequeue() 26 | switch err { 27 | case goque.ErrEmpty: 28 | time.Sleep(500 * time.Millisecond) 29 | continue 30 | 31 | case goque.ErrDBClosed: 32 | return 33 | 34 | case nil: 35 | w.step(results, job) 36 | 37 | default: 38 | panic(err) 39 | } 40 | } 41 | } 42 | 43 | func (w *WorkerContext) step(results chan<- File, job Job) { 44 | defer w.finishJob() 45 | 46 | var f File 47 | 48 | newJobs, err := w.DoJob(&job, &f) 49 | atomic.AddUint64(&totalStarted, 1) 50 | if err == ErrKnown { 51 | return 52 | } 53 | 54 | if err != nil { 55 | job.Fails++ 56 | 57 | if !shouldRetry(err) { 58 | atomic.AddUint64(&totalAborted, 1) 59 | logrus.WithField("url", job.UriStr). 60 | WithError(err). 61 | Error("Giving up after failure") 62 | return 63 | } 64 | 65 | if job.Fails > config.Retries { 66 | atomic.AddUint64(&totalAborted, 1) 67 | logrus.WithField("url", job.UriStr). 68 | Errorf("Giving up after %d fails", job.Fails) 69 | } else { 70 | atomic.AddUint64(&totalRetries, 1) 71 | if err == ErrRateLimit { 72 | w.lastRateLimit = time.Now() 73 | w.numRateLimits++ 74 | } 75 | w.queueJob(job) 76 | } 77 | return 78 | } 79 | 80 | atomic.AddUint64(&totalDone, 1) 81 | for _, job := range newJobs { 82 | w.queueJob(job) 83 | } 84 | 85 | if !f.IsDir { 86 | results <- f 87 | } 88 | } 89 | 90 | func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) { 91 | if len(job.Uri.Path) == 0 { return } 92 | if job.Uri.Path[len(job.Uri.Path)-1] == '/' { 93 | // Load directory 94 | links, err := GetDir(job, f) 95 | if err != nil { 96 | if !isErrSilent(err) { 97 | logrus.WithError(err). 98 | WithField("url", job.UriStr). 
99 | Error("Failed to crawl dir") 100 | } 101 | return nil, err 102 | } 103 | 104 | // Hash directory 105 | hash := f.HashDir(links) 106 | 107 | // Skip symlinked dirs 108 | if w.OD.LoadOrStoreKey(&hash) { 109 | return nil, ErrKnown 110 | } 111 | 112 | // Sort by path 113 | sort.Slice(links, func(i, j int) bool { 114 | return strings.Compare(links[i].Path, links[j].Path) < 0 115 | }) 116 | 117 | var newJobCount int 118 | var lastLink string 119 | for _, link := range links { 120 | uriStr := link.String() 121 | 122 | // Ignore dupes 123 | if uriStr == lastLink { 124 | continue 125 | } 126 | lastLink = uriStr 127 | 128 | newJobs = append(newJobs, Job{ 129 | Uri: link, 130 | UriStr: uriStr, 131 | Fails: 0, 132 | }) 133 | 134 | newJobCount++ 135 | } 136 | if config.Verbose { 137 | logrus.WithFields(logrus.Fields{ 138 | "url": job.UriStr, 139 | "files": newJobCount, 140 | }).Debug("Listed") 141 | } 142 | } else { 143 | // Load file 144 | err := GetFile(job.Uri, f) 145 | if err != nil { 146 | if !isErrSilent(err) { 147 | logrus.WithError(err). 148 | WithField("url", job.UriStr). 149 | Error("Failed to crawl file") 150 | } 151 | return nil, err 152 | } 153 | atomic.AddUint64(&w.OD.Result.FileCount, 1) 154 | } 155 | return 156 | } 157 | 158 | func (w *WorkerContext) queueJob(job Job) { 159 | w.OD.Wait.Add(1) 160 | 161 | if w.numRateLimits > 0 { 162 | if time.Since(w.lastRateLimit) > 5 * time.Second { 163 | w.numRateLimits = 0 164 | } else { 165 | time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) * 166 | 100 * time.Millisecond) 167 | } 168 | } 169 | 170 | if err := w.Queue.Enqueue(&job); err != nil { 171 | panic(err) 172 | } 173 | } 174 | 175 | func (w *WorkerContext) finishJob() { 176 | w.OD.Wait.Done() 177 | } 178 | 179 | func isErrSilent(err error) bool { 180 | if !config.PrintHTTP { 181 | if _, ok := err.(*HttpError); ok { 182 | return true 183 | } 184 | } 185 | return false 186 | } 187 | --------------------------------------------------------------------------------
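Closing note on the rate-limit backoff in WorkerContext.queueJob above: once a worker has hit ErrRateLimit within the last five seconds, each enqueue sleeps for sqrt(50*numRateLimits), truncated to an integer, times 100 ms. The standalone program below is a minimal sketch that only prints that schedule so the growth is easy to inspect; it is not part of the crawler itself.

package main

import (
	"fmt"
	"math"
	"time"
)

// backoff mirrors the delay computed in WorkerContext.queueJob:
// sqrt(50*n), truncated by the time.Duration conversion, times 100ms.
func backoff(numRateLimits int) time.Duration {
	return time.Duration(math.Sqrt(float64(50*numRateLimits))) * 100 * time.Millisecond
}

func main() {
	// Prints 700ms, 1s, 1.2s, 1.4s, 1.5s for n = 1..5.
	for n := 1; n <= 5; n++ {
		fmt.Printf("%d recent rate limit(s): sleep %v\n", n, backoff(n))
	}
}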