├── .gitignore ├── LICENSE ├── README.md ├── build.sh ├── clean.sh ├── input └── example.json ├── main.go ├── output └── example.json └── scraper ├── config.go ├── scraper.go └── utilities.go /.gitignore: -------------------------------------------------------------------------------- 1 | jsonscraper_linux 2 | jsonscraper_darwin 3 | jsonscraper_windows.exe -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # jsonscraper 2 | 3 | JSON configurable concurrent scraper. Written in Go. 4 | 5 | For given JSON config file(s), produces JSON file(s) with results. 6 | 7 | ## Instructions 8 | 9 | `$ jsonscraper configPath` 10 | 11 | Output will be saved to the file path that is provided in the configuration. 12 | 13 | You can also run it with multiple configs at once: 14 | `$ jsonscraper configPath1 configPath2 configPath3 ...` 15 | 16 | ## Config documentation 17 | Configuration is an object which consists of `urls`, `targets` and `output`. 18 | 19 | ### `urls` (Array) 20 | Array of URLs which will be scraped. 21 | 22 | ### `targets` (Array) 23 | Each object in this array should have at least: 24 | * `selector` is the selection query to be performed 25 | * `type` can be inner HTML (`html`), plain text (`text`) or attribute (for example `attr:href`) 26 | * `tag` is the field key which appears in the output file 27 | 28 | `submatch` is optional and can contain a regular expression. 
`selector` can be omitted if `submatch` is present, then whole document will be used for lookups. 29 | 30 | ### `output` (Object) 31 | * `path` is an output path where data will be saved. `$FILENAME` will be replaced with input file name. 32 | 33 | 34 | ## Example 35 | 36 | #### Input file: `input/example.json` 37 | 38 | ```json 39 | { 40 | "urls": [ 41 | "https://news.ycombinator.com/" 42 | ], 43 | "targets": [ 44 | { 45 | "selector": ".storylink", 46 | "type": "text", 47 | "tag": "storyTitleText" 48 | }, 49 | { 50 | "selector": ".title", 51 | "type": "html", 52 | "tag": "storyTitleHtml" 53 | }, 54 | { 55 | "selector": ".storylink", 56 | "type": "text", 57 | "tag": "storyTitleWords", 58 | "submatch": "([a-zA-Z]+)+" 59 | }, 60 | { 61 | "selector": ".storylink", 62 | "type": "attr:href", 63 | "tag": "storyTitleLinks" 64 | } 65 | ], 66 | "output": { 67 | "path": "output/$FILENAME" 68 | } 69 | } 70 | ``` 71 | 72 | Above configuration will produce following data: 73 | 74 | #### Output file: `output/example.json` 75 | 76 | ```json 77 | { 78 | "storyTitleHtml":[ 79 | "1.", 80 | "How We Built r/Place (redditblog.com)", 81 | "2.", 82 | ... 83 | ], 84 | "storyTitleLinks":[ 85 | "https://redditblog.com/2017/04/13/how-we-built-rplace/", 86 | "http://www.bbc.com/news/science-environment-39592059", 87 | "https://stripe.com/blog/increment", 88 | ... 89 | ], 90 | "storyTitleText":[ 91 | "How We Built r/Place", 92 | "Saturn moon 'able to support life'", 93 | "Introducing Increment", 94 | ... 95 | ], 96 | "storyTitleWords":[ 97 | "How", 98 | "We", 99 | "Built", 100 | ... 
101 | ] 102 | } 103 | ``` -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Building for Linux" 4 | GOARCH="amd64" GOOS="linux" go build -o jsonscraper_linux 5 | 6 | echo "Building for Darwin" 7 | GOARCH="amd64" GOOS="darwin" go build -o jsonscraper_darwin 8 | 9 | echo "Building for Windows" 10 | GOARCH="amd64" GOOS="windows" go build -o jsonscraper_windows.exe -------------------------------------------------------------------------------- /clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm jsonscraper_linux 4 | rm jsonscraper_darwin 5 | rm jsonscraper_windows.exe -------------------------------------------------------------------------------- /input/example.json: -------------------------------------------------------------------------------- 1 | { 2 | "urls": [ 3 | "https://news.ycombinator.com/" 4 | ], 5 | "targets": [ 6 | { 7 | "selector": ".storylink", 8 | "type": "text", 9 | "tag": "storyTitleText" 10 | }, 11 | { 12 | "selector": ".title", 13 | "type": "html", 14 | "tag": "storyTitleHtml" 15 | }, 16 | { 17 | "selector": ".storylink", 18 | "type": "text", 19 | "tag": "storyTitleWords", 20 | "submatch": "([a-zA-Z]+)+" 21 | }, 22 | { 23 | "selector": ".storylink", 24 | "type": "attr:href", 25 | "tag": "storyTitleLinks" 26 | } 27 | ], 28 | "output": { 29 | "path": "output/$FILENAME" 30 | } 31 | } -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "sync" 8 | "time" 9 | 10 | "github.com/ssimunic/jsonscraper/scraper" 11 | ) 12 | 13 | var wg sync.WaitGroup 14 | 15 | func init() { 16 | if len(os.Args) < 2 { 17 | fmt.Fprintln(os.Stderr, "Missing config path(s).") 18 | 
os.Exit(1) 19 | } 20 | } 21 | 22 | func main() { 23 | start := time.Now() 24 | configPaths := os.Args[1:] 25 | wg.Add(len(configPaths)) 26 | 27 | for _, configPath := range configPaths { 28 | go func(configPath string) { 29 | defer wg.Done() 30 | 31 | s, err := scraper.New(configPath) 32 | if err != nil { 33 | fmt.Fprintln(os.Stderr, err) 34 | return 35 | } 36 | 37 | s.Start() 38 | }(configPath) 39 | } 40 | 41 | wg.Wait() 42 | log.Printf("Time elapsed: %vs", time.Since(start).Seconds()) 43 | } 44 | -------------------------------------------------------------------------------- /output/example.json: -------------------------------------------------------------------------------- 1 | {"storyTitleHtml":["1.","Building a r/place in a weekend (josephg.com)","2.","Solid – Re-decentralizing the web (github.com)","3.","Show HN: Record and share Police/Fire radio systems (openmhz.com)","4.","On OpenBSD CDs: “Most things come to an end, sorry” (marc.info)","5.","Pixels Don’t Care (2013) (hackerfall.com)","6.","Explain mathematically, a video from a space station of the “Dzhanibekov effect” (mathoverflow.net)","7.","Kazakhstan spells out plans for alphabet swap (dw.com)","8.","Show HN: 'Hack' the user cursor (javier.xyz)","9.","The Experimental Layout Lab of Jen Simmons (jensimmons.com)","10.","Why ML/OCaml are good for writing compilers (1998) (yale.edu)","11.","How not to monetise a popular blog (greig.cc)","12.","Building a simple e-ink display from scratch (hackaday.com)","13.","World's oldest person Emma Morano dies at 117 (bbc.com)","14.","Arturo Di Modica has a point (gregfallis.com)","15.","LandHere (galois.com)","16.","Show HN: UrlRoulette – Pass a URL to the next visitor (urlroulette.net)","17.","HP Labs developed a new rewritable printing technology (hp.com)","18.","Preventing AMI’s BiOS from interfering with coreboot flashing on the Librem 13 (puri.sm)","19.","Golang SSH Security (grumpy-troll.org)","20.","Show HN: Positive News Reader based on sentiment analysis 
(sentinewsmob.ml)","21.","Optimizing Rust Struct Size (camlorn.net)","22.","Strikingly (YC W13) is hiring in our Shanghai office (strikingly.com)","23.","Bob Taylor Has Died (nytimes.com)","24.","Brain in a Jar (stephaniehurlburt.com)","25.","Game Tales: Cray YMP (2010) (rome.ro)","26.","Tim Berners-Lee envisions a better web (wired.com)","27.","Chrome 55-57 showed “download” button for all HTML5 media (chromium.org)","28.","Apple's cash hoard swells to $246B (cnbc.com)","29.","The flexi disc audio format (arstechnica.com)","30.","Ex-Professor Says Dismissed Racketeering Case Is Still ‘Devastating’ (nytimes.com)","More"],"storyTitleLinks":["https://josephg.com/blog/rplace-in-a-weekend/","https://github.com/solid/solid","https://openmhz.com/","http://marc.info/?l=openbsd-misc&m=149232307018311&w=2","https://hackerfall.com/story/pixels-dont-care","https://mathoverflow.net/questions/81960/the-dzhanibekov-effect-an-exercise-in-mechanics-or-fiction-explain-mathemat","http://www.dw.com/en/kazakhstan-spells-out-plans-for-alphabet-swap/a-38407769","http://javier.xyz/control-user-cursor/","http://labs.jensimmons.com/","http://flint.cs.yale.edu/cs421/case-for-ml.html","http://greig.cc/literally-a-shipment-of-fail-dot-com/","http://hackaday.com/2017/04/12/can-you-build-an-e-ink-display-from-scratch/","http://www.bbc.com/news/world-europe-39610937","https://gregfallis.com/2017/04/14/seriously-the-guy-has-a-point/","http://landhere.galois.com/","https://urlroulette.net/","https://newsblog.ext.hp.com/t5/HP-newsroom-blog/HP-Labs-developed-a-new-rewritable-printing-technology-set-to/ba-p/849","https://puri.sm/posts/preventing-interference-from-the-old-bios-while-flashing-coreboot/","https://bridge.grumpy-troll.org/2017/04/golang-ssh-security/","http://www.sentinewsmob.ml/","http://camlorn.net/posts/April%202017/rust-struct-field-reordering.html","http://www.strikingly.com/s/careers?utm_source=hn&utm_content=sh","https://www.nytimes.com/2017/04/14/technology/robert-taylor-innovator-
who-shaped-modern-computing-dies-at-85.html","http://stephaniehurlburt.com/blog/2017/4/15/brain-in-a-jar","http://rome.ro/news/2015/12/13/gametales-cray-ymp","https://www.wired.com/2017/04/tim-berners-lee-inventor-web-plots-radical-overhaul-creation/","https://bugs.chromium.org/p/chromium/issues/detail?id=675596","https://www.cnbc.com/2017/01/31/apples-cash-hoard-swells-to-record-24609-billion.html","https://arstechnica.com/gadgets/2017/04/forgotten-audio-formats-flexi-disc/","https://www.nytimes.com/2017/04/09/business/joy-laskar-georgia-tech-racketeering-case.html"],"storyTitleText":["Building a r/place in a weekend","Solid – Re-decentralizing the web","Show HN: Record and share Police/Fire radio systems","On OpenBSD CDs: “Most things come to an end, sorry”","Pixels Don’t Care (2013)","Explain mathematically, a video from a space station of the “Dzhanibekov effect”","Kazakhstan spells out plans for alphabet swap","Show HN: 'Hack' the user cursor","The Experimental Layout Lab of Jen Simmons","Why ML/OCaml are good for writing compilers (1998)","How not to monetise a popular blog","Building a simple e-ink display from scratch","World's oldest person Emma Morano dies at 117","Arturo Di Modica has a point","LandHere","Show HN: UrlRoulette – Pass a URL to the next visitor","HP Labs developed a new rewritable printing technology","Preventing AMI’s BiOS from interfering with coreboot flashing on the Librem 13","Golang SSH Security","Show HN: Positive News Reader based on sentiment analysis","Optimizing Rust Struct Size","Strikingly (YC W13) is hiring in our Shanghai office","Bob Taylor Has Died","Brain in a Jar","Game Tales: Cray YMP (2010)","Tim Berners-Lee envisions a better web","Chrome 55-57 showed “download” button for all HTML5 media","Apple's cash hoard swells to $246B","The flexi disc audio format","Ex-Professor Says Dismissed Racketeering Case Is Still 
‘Devastating’"],"storyTitleWords":["Building","a","r","place","in","a","weekend","Solid","Re","decentralizing","the","web","Show","HN","Record","and","share","Police","Fire","radio","systems","On","OpenBSD","CDs","Most","things","come","to","an","end","sorry","Pixels","Don","t","Care","Explain","mathematically","a","video","from","a","space","station","of","the","Dzhanibekov","effect","Kazakhstan","spells","out","plans","for","alphabet","swap","Show","HN","Hack","the","user","cursor","The","Experimental","Layout","Lab","of","Jen","Simmons","Why","ML","OCaml","are","good","for","writing","compilers","How","not","to","monetise","a","popular","blog","Building","a","simple","e","ink","display","from","scratch","World","s","oldest","person","Emma","Morano","dies","at","Arturo","Di","Modica","has","a","point","LandHere","Show","HN","UrlRoulette","Pass","a","URL","to","the","next","visitor","HP","Labs","developed","a","new","rewritable","printing","technology","Preventing","AMI","s","BiOS","from","interfering","with","coreboot","flashing","on","the","Librem","Golang","SSH","Security","Show","HN","Positive","News","Reader","based","on","sentiment","analysis","Optimizing","Rust","Struct","Size","Strikingly","YC","W","is","hiring","in","our","Shanghai","office","Bob","Taylor","Has","Died","Brain","in","a","Jar","Game","Tales","Cray","YMP","Tim","Berners","Lee","envisions","a","better","web","Chrome","showed","download","button","for","all","HTML","media","Apple","s","cash","hoard","swells","to","B","The","flexi","disc","audio","format","Ex","Professor","Says","Dismissed","Racketeering","Case","Is","Still","Devastating"]} -------------------------------------------------------------------------------- /scraper/config.go: -------------------------------------------------------------------------------- 1 | package scraper 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io/ioutil" 7 | "path" 8 | "regexp" 9 | "strings" 10 | ) 11 | 12 | // Config holds information from config 
// Config holds information from config file.
//
// URLs, Targets and Output are decoded from the JSON document; filePath and
// fileName are filled in by newConfig from the path the config was read from.
type Config struct {
	URLs     []string  `json:"urls"`
	Targets  []*target `json:"targets"`
	Output   *output   `json:"output"`
	filePath string
	fileName string
}

// target describes one extraction rule of the config file.
type target struct {
	Selector string `json:"selector"`
	Submatch string `json:"submatch"`
	Tag      string `json:"tag"`
	Type     string `json:"type"`
	// attrv is the attribute name taken from a Type of the form "attr:<name>".
	attrv string
	// submatchRe is the compiled form of Submatch; nil when Submatch is empty.
	submatchRe *regexp.Regexp
}

// output describes where results are written.
type output struct {
	Path string `json:"path"`
}

// newConfig reads the JSON config at filePath, validates it and prepares the
// derived fields (compiled submatch expressions, attribute names).
//
// It returns an error when the file cannot be read or decoded, when the output
// path is missing, when a target is null or lacks a required property, or when
// a submatch expression does not compile.
func newConfig(filePath string) (*Config, error) {
	data, err := ioutil.ReadFile(filePath)
	if err != nil {
		return nil, err
	}

	c := new(Config)
	if err := json.Unmarshal(data, c); err != nil {
		return nil, err
	}

	// Without an output path the results cannot be saved, and outputPath would
	// dereference a nil Output. Fail here instead of after scraping.
	if c.Output == nil || c.Output.Path == "" {
		return nil, fmt.Errorf("missing output path")
	}

	c.filePath = filePath
	c.fileName = path.Base(filePath)

	for _, t := range c.Targets {
		// A JSON null inside "targets" decodes to a nil pointer.
		if t == nil {
			return nil, fmt.Errorf("missing target property")
		}
		if t.Submatch != "" {
			t.submatchRe, err = regexp.Compile(t.Submatch)
			if err != nil {
				return nil, err
			}
		}
		if !c.isValidTarget(t) {
			return nil, fmt.Errorf("missing target property")
		}
		if strings.HasPrefix(t.Type, "attr:") {
			// Keep everything after the prefix so attribute names that
			// themselves contain a colon (e.g. "xlink:href") stay intact.
			t.attrv = strings.TrimPrefix(t.Type, "attr:")
		}
	}

	return c, nil
}

// outputPath returns the configured output path with every "$FILENAME"
// replaced by the base name of the config file. It returns "" when the config
// has no output section.
func (c *Config) outputPath() string {
	if c.Output == nil {
		return ""
	}
	return strings.Replace(c.Output.Path, "$FILENAME", c.fileName, -1)
}

// isValidTarget reports whether t has a tag, a type, and at least one of
// selector or submatch.
func (c *Config) isValidTarget(t *target) bool {
	return t.Tag != "" && t.Type != "" && (t.Selector != "" || t.Submatch != "")
}

// ---- /scraper/scraper.go (package scraper) ----
// Imports listed for this file: "io/ioutil", "log", "fmt", "github.com/PuerkitoBio/goquery".

// Scraper struct used for scraping activity.
12 | type Scraper struct { 13 | Config *Config 14 | Results results 15 | } 16 | 17 | // results key is tag, value is list of scraped data 18 | type results map[string][]string 19 | 20 | type resultsURL struct { 21 | url string 22 | results results 23 | } 24 | 25 | // New creates new Scraper and returns pointer to it, with error (if occurred). 26 | func New(configPath string) (*Scraper, error) { 27 | c, err := newConfig(configPath) 28 | if err != nil { 29 | return nil, err 30 | } 31 | 32 | s := &Scraper{ 33 | Config: c, 34 | Results: make(results), 35 | } 36 | 37 | return s, nil 38 | } 39 | 40 | // Start will start scraping in separate goroutine and save results 41 | // when it is done in file that is defined in Config. 42 | func (s *Scraper) Start() { 43 | resultsURLCh := s.scrapeURLs(s.Config.URLs) 44 | 45 | for range s.Config.URLs { 46 | resultsURL := <-resultsURLCh 47 | mergeResults(s.Results, resultsURL.results) 48 | log.Println("Received results from", resultsURL.url) 49 | } 50 | 51 | log.Println("Done scraping.") 52 | 53 | err := s.save() 54 | if err != nil { 55 | log.Println(err) 56 | return 57 | } 58 | 59 | log.Println("Saved to", s.Config.outputPath()) 60 | } 61 | 62 | func (s *Scraper) scrapeURLs(urls []string) <-chan *resultsURL { 63 | resultsURLCh := make(chan *resultsURL) 64 | 65 | for _, url := range urls { 66 | go func(url string) { 67 | results := make(results) 68 | 69 | // Once individual URL scraping is done, send results back through channel 70 | defer func() { 71 | resultsURLCh <- &resultsURL{url: url, results: results} 72 | }() 73 | 74 | // Construct document for manipulation 75 | doc, err := goquery.NewDocument(url) 76 | if err != nil { 77 | log.Println(err) 78 | return 79 | } 80 | 81 | // Process targets 82 | for _, target := range s.Config.Targets { 83 | err := s.processTarget(doc, target, results) 84 | if err != nil { 85 | log.Println("Error processing target:", err) 86 | } 87 | } 88 | }(url) 89 | } 90 | 91 | return resultsURLCh 92 | } 93 | 
94 | func (s *Scraper) processTarget(doc *goquery.Document, target *target, results results) error { 95 | var selector string 96 | // If there was no selector given, whole document will be used 97 | if target.Selector == "" { 98 | selector = "html" 99 | } else { 100 | selector = target.Selector 101 | } 102 | 103 | var retErr error 104 | doc.Find(selector).Each(func(i int, sel *goquery.Selection) { 105 | var value string 106 | 107 | // Handling different types 108 | switch { 109 | // Sets value to inner HTML of the node 110 | case target.Type == "html": 111 | html, err := sel.Html() 112 | if err != nil { 113 | retErr = err 114 | return 115 | } 116 | value = html 117 | // Sets value to text of the node 118 | case target.Type == "text": 119 | value = sel.Text() 120 | // Sets value to attribute of the node, for example attr:href for href value 121 | // If attribute value doesn't exist, target is skipped 122 | case target.attrv != "": 123 | if attrv, exists := sel.Attr(target.attrv); exists { 124 | value = attrv 125 | } else { 126 | return 127 | } 128 | } 129 | 130 | // Submatch regex 131 | if target.submatchRe != nil { 132 | matches := target.submatchRe.FindAllStringSubmatch(value, -1) 133 | for _, match := range matches { 134 | results[target.Tag] = append(results[target.Tag], match[0]) 135 | } 136 | return 137 | } 138 | 139 | results[target.Tag] = append(results[target.Tag], value) 140 | }) 141 | 142 | return retErr 143 | } 144 | 145 | func (s *Scraper) save() error { 146 | data, err := JSONMarshalUnescaped(s.Results) 147 | if err != nil { 148 | return fmt.Errorf("error marshaling to json: %v", err) 149 | } 150 | 151 | err = ioutil.WriteFile(s.Config.outputPath(), data, 0644) 152 | if err != nil { 153 | return fmt.Errorf("error writing file: %v", err) 154 | } 155 | 156 | return nil 157 | } 158 | -------------------------------------------------------------------------------- /scraper/utilities.go: 
-------------------------------------------------------------------------------- 1 | package scraper 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | ) 7 | 8 | func mergeResults(dest results, src results) { 9 | for k, v := range src { 10 | for _, text := range v { 11 | dest[k] = append(dest[k], text) 12 | } 13 | } 14 | } 15 | 16 | // JSONMarshalUnescaped does what json.Marshal does without escaping &, <, >. 17 | func JSONMarshalUnescaped(v interface{}) ([]byte, error) { 18 | data, err := json.Marshal(v) 19 | data = bytes.Replace(data, []byte("\\u0026"), []byte("&"), -1) 20 | data = bytes.Replace(data, []byte("\\u003c"), []byte("<"), -1) 21 | data = bytes.Replace(data, []byte("\\u003e"), []byte(">"), -1) 22 | return data, err 23 | } 24 | --------------------------------------------------------------------------------