├── .gitattributes ├── .travis.yml ├── EXAMPLES.md ├── LICENSE ├── Makefile ├── README.md ├── go.mod ├── goreleaser.yml ├── main.go └── pluck ├── plucker.go ├── plucker_test.go ├── striphtml ├── striphtml.go └── striphtml_test.go └── test ├── config.toml ├── config2.toml ├── food.toml ├── logo.png ├── main.py ├── song.html ├── song.toml ├── test.txt └── test2.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | vendor/* linguist-vendored 2 | pluck/test/* linguist-vendored 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - tip 5 | 6 | before_install: cd pluck -------------------------------------------------------------------------------- /EXAMPLES.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | ## Get headlines from news.google.com 4 | 5 | ``` 6 | $ pluck -a 'role="heading"' -a '>' -d '<' -t -s -u 'https://news.google.com/news/?ned=us&hl=en' 7 | ``` 8 | 9 | ## Get latest tweets from Donald Trump 10 | 11 | ``` 12 | $ wget https://twitter.com/search\?f\=tweets\&vertical\=default\&q\=from%3ArealDonaldTrump\&src\=typd -O twitter.html 13 | $ pluck -a '

' -d '

' -l -1 -s -f twitter.html 14 | ``` 15 | 16 | ## Read comments from Hacker News page 17 | 18 | ``` 19 | $ pluck -s -t -a 'class="c00"' -a '>' -d '"] 35 | deactivator = "<" 36 | name = "critic_ratings" 37 | 38 | [[pluck]] 39 | activators = ["all-critics-numbers","Average Rating:",">"] 40 | deactivator = "/" 41 | name = "average_critic_rating" 42 | 43 | [[pluck]] 44 | activators = ["audience-score","Average Rating:",">"] 45 | deactivator = "/" 46 | name = "average_user_rating" 47 | 48 | [[pluck]] 49 | activators = ["User Ratings:",">"] 50 | deactivator = "<" 51 | name = "user_ratings" 52 | ``` 53 | 54 | ```bash 55 | $ pluck -s -c rt.toml -u https://www.rottentomatoes.com/m/spider_man_homecoming/ 56 | ``` 57 | 58 | Returns: 59 | 60 | ```json 61 | { 62 | "average_critic_rating": "7.6", 63 | "average_user_rating": "4.3", 64 | "critic_ratings": "276", 65 | "name": "Spider-Man: Homecoming", 66 | "user_ratings": "85,131" 67 | } 68 | ``` 69 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Zack 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Make a release with 2 | # make -j4 release 3 | 4 | VERSION=$(shell git describe) 5 | LDFLAGS=-ldflags "-s -w -X main.version=${VERSION}" 6 | 7 | .PHONY: build 8 | build: 9 | go build ${LDFLAGS} -o dist/pluck 10 | 11 | .PHONY: linuxarm 12 | linuxarm: 13 | env GOOS=linux GOARCH=arm go build ${LDFLAGS} -o dist/pluck_linux_arm 14 | # cd dist && upx --brute pluck_linux_arm 15 | 16 | .PHONY: linux64 17 | linux64: 18 | env GOOS=linux GOARCH=amd64 go build ${LDFLAGS} -o dist/pluck_linux_amd64 19 | cd dist && upx --brute pluck_linux_amd64 20 | 21 | .PHONY: windows 22 | windows: 23 | env GOOS=windows GOARCH=amd64 go build ${LDFLAGS} -o dist/pluck_windows_amd64.exe 24 | # cd dist && upx --brute pluck_windows_amd64.exe 25 | 26 | .PHONY: osx 27 | osx: 28 | env GOOS=darwin GOARCH=amd64 go build ${LDFLAGS} -o dist/pluck_osx_amd64 29 | # cd dist && upx --brute pluck_osx_amd64 30 | 31 | .PHONY: release 32 | release: osx windows linux64 linuxarm 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | pluck 5 |
6 | Version 7 | Code Coverage 8 | Code Coverage 9 |

10 | 11 |

Pluck text in a fast and intuitive way. :rooster:

12 | 13 | *pluck* makes text extraction intuitive and [fast](https://github.com/schollz/pluck#current-benchmark). You can specify an extraction in nearly the same way you'd tell a person trying to extract the text by hand: "OK Bob, every time you find *X* and then *Y*, copy down everything you see until you encounter *Z*." 14 | 15 | In *pluck*, *X* and *Y* are called *activators* and *Z* is called the *deactivator*. The file/URL being plucked is parsed (or streamed) byte-by-byte into a finite state machine. Once all *activators* are found, the following bytes are saved to a buffer, which is added to a list of results once the *deactivator* is found. Multiple queries are extracted simultaneously and there is no requirement on the file format (e.g. XML/HTML), as long as it's text. 16 | 17 | 18 | # Why? 19 | 20 | *pluck* was made as a simple alternative to xpath and regexp. Through simple declarations, *pluck* allows complex procedures like [extracting text in nested HTML tags](https://github.com/schollz/pluck#use-config-file), or [extracting the content of an attribute of an HTML tag](https://github.com/schollz/pluck#basic-usage). *pluck* may not work in all scenarios, so do not consider it a replacement for xpath or regexp. 21 | 22 | ### Doesn't regex already do this? 23 | 24 | Yes, basically. Here is [a (simple) example](https://regex101.com/r/xt7fVr/1): 25 | 26 | ``` 27 | (?:(?:X.*Y)|(?:Y.*X))(.*)(?:Z) 28 | ``` 29 | 30 | Basically, this should try and match everything before a `Z` and after we've seen both `X` and `Y`, in any order. This is not a complete example, but it shows the similarity. 31 | 32 | The benefit with *pluck* is simplicity. You don't have to worry about escaping the right characters, nor do you need to know any regex syntax (which is not simple). Also *pluck* is hard-coded for matching this specific kind of pattern simultaneously, so there is no cost for generating a new deterministic finite automaton from multiple regex. 
33 | 34 | ### Doesn't cascadia already do this? 35 | 36 | Yes, there is already [a command-line tool](https://github.com/suntong/cascadia) to extract structured information from XML/HTML. There are many benefits to *cascadia*, namely you can do a lot more complex things with structured data. If you don't have highly structured data, *pluck* is advantageous (it extracts from any file). Also, with *pluck* you don't need to learn CSS selection. 37 | 38 | # Getting Started 39 | 40 | ## Install 41 | 42 | If you have Go 1.7+ 43 | 44 | ``` 45 | go get github.com/schollz/pluck 46 | ``` 47 | 48 | or just download from the [latest releases](https://github.com/schollz/pluck/releases/latest). 49 | 50 | ## Basic usage 51 | 52 | Let's say you want to find URLs in an HTML file. 53 | 54 | ```bash 55 | $ wget nytimes.com -O nytimes.html 56 | $ pluck -a '<' -a 'href' -a '"' -d '"' -l 10 -f nytimes.html 57 | { 58 | "0": [ 59 | "https://static01.nyt.com/favicon.ico", 60 | "https://static01.nyt.com/images/icons/ios-ipad-144x144.png", 61 | "https://static01.nyt.com/images/icons/ios-iphone-114x144.png", 62 | "https://static01.nyt.com/images/icons/ios-default-homescreen-57x57.png", 63 | "https://www.nytimes.com", 64 | "http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml", 65 | "http://mobile.nytimes.com", 66 | "http://mobile.nytimes.com", 67 | "https://typeface.nyt.com/css/zam5nzz.css", 68 | "https://a1.nyt.com/assets/homepage/20170731-135831/css/homepage/styles.css" 69 | ] 70 | } 71 | ``` 72 | 73 | The `-a` specifies *activators* and can be specified multiple times. Once all *activators* are found, in order, the bytes are captured. The `-d` specifies a *deactivator*. Once a *deactivator* is found, then it terminates capturing and resets and begins searching again. The `-l` specifies the limit (optional); after reaching the limit (`10` in this example) it stops searching. 
74 | 75 | 76 | ## Advanced usage 77 | 78 | ### Parse URLs or Files 79 | 80 | Files can be parsed with `-f FILE` and URLs can be parsed by instead using `-u URL`. 81 | 82 | ```bash 83 | $ pluck -a '<' -a 'href' -a '"' -d '"' -l 10 -u https://nytimes.com 84 | ``` 85 | 86 | ### Use Config file 87 | 88 | You can also specify multiple things to pluck, simultaneously, by listing the *activators* and the *deactivator* in a TOML file. For example, let's say we want to parse ingredients and the title of [a recipe](https://goo.gl/DHmqmv). Make a file `config.toml`: 89 | 90 | ```toml 91 | [[pluck]] 92 | name = "title" 93 | activators = [""] 94 | deactivator = "" 95 | 96 | [[pluck]] 97 | name = "ingredients" 98 | activators = [""] 99 | deactivator = "<" 100 | limit = -1 101 | ``` 102 | 103 | The title follows normal HTML and the ingredients were determined by quickly inspecting the HTML source code of the target site. Then, pluck it with, 104 | 105 | ```bash 106 | $ pluck -c config.toml -u https://goo.gl/DHmqmv 107 | { 108 | "ingredients": [ 109 | "1 pound medium (26/30) peeled and deveined shrimp, tails removed", 110 | "2 teaspoons chili powder", 111 | "Kosher salt", 112 | "2 tablespoons canola oil", 113 | "4 scallions, thinly sliced", 114 | "One 15-ounce can black beans, drained and rinsed well", 115 | "1/3 cup prepared chipotle mayonnaise ", 116 | "2 limes, 1 zested and juiced and 1 cut into wedges ", 117 | "One 14-ounce bag store-bought coleslaw mix (about 6 cups)", 118 | "1 bunch fresh cilantro, leaves and soft stems roughly chopped", 119 | "Sour cream or Mexican crema, for serving", 120 | "8 corn tortillas, warmed " 121 | ], 122 | "title": "15-Minute Shrimp Tacos with Spicy Chipotle Slaw Recipe | Food Network Kitchen | Food Network" 123 | } 124 | ``` 125 | 126 | ### Extract structured data 127 | 128 | Let's say you want to tell Bob "OK Bob, first look for *W*. Then, every time you find *X* and then *Y*, copy down everything you see until you encounter *Z*. 
Also, stop if you see *U*, even if you are not at the end." In this case, *W*, *X*, and *Y* are activators but *W* is a "Permanent" activator. Once *W* is found, Bob forgets about looking for it anymore. *U* is a "Finisher" which tells Bob to stop looking for anything and return whatever result was found. 129 | 130 | You can extract information from blocks in *pluck* by using these two keywords: "*permanent*" and "*finisher*". The *permanent* number determines how many of the activators (from the left to right) will stay activated forever, once activated. The *finisher* keyword is a new string that will retire the current plucker when found and not capture anything in the buffer. 131 | 132 | For example, suppose you want to only extract `link3` and `link4` from the following: 133 | 134 | ```html 135 |

Section 1

136 | 1 137 | 2 138 |

Section 2

139 | 3 140 | 4 141 |

Section 3

142 | 5 143 | 6 144 | ``` 145 | 146 | You can add "Section 2" as an activator and set permanent to `1` so that only the first activator ("Section 2") will continue to remain activated after finding the deactivator. Then you want to finish the plucker when it hits "Section 3", so we set the finisher keyword to that string. Then `config.toml` is 147 | 148 | ``` 149 | [[pluck]] 150 | activators = ["Section 2","a","href",'"'] 151 | permanent = 1 # designates that the first 1 activators will persist 152 | deactivator = '"' 153 | finisher = "Section 3" 154 | ``` 155 | 156 | This will result in the following: 157 | 158 | ```json 159 | { 160 | "0": [ 161 | "link3", 162 | "link4" 163 | ] 164 | } 165 | ``` 166 | 167 | 168 | ### More examples 169 | 170 | See [EXAMPLES.md](https://github.com/schollz/pluck/blob/master/EXAMPLES.md) for more examples. 171 | 172 | ### Use as a Go package 173 | 174 | Import pluck as `"github.com/schollz/pluck/pluck"` and you can use it in your own project. See the tests for more info. 175 | 176 | 177 | 178 | # Development 179 | 180 | ``` 181 | $ go get -u github.com/schollz/pluck/... 182 | $ cd $GOPATH/src/github.com/schollz/pluck/pluck 183 | $ go test -cover 184 | ``` 185 | 186 | ## Current benchmark 187 | 188 | The [state of the art for xpath is `lxml`, based on libxml2](http://lxml.de/performance.html). Here is a comparison for plucking the same data from the same file, run on Intel i5-4310U CPU @ 2.00GHz × 4. (Run Python benchmark `cd pluck/test && python3 main.py`). 
189 | 190 | | Language | Rate | 191 | | ------------- | ------------- | 192 | | `lxml` (Python3.5) | 300 / s | 193 | | pluck | 1270 / s | 194 | 195 | A real-world example I use *pluck* for is processing 1,200 HTML files in parallel, compared to running `lxml` in parallel: 196 | 197 | | Language | Rate | 198 | | ------------- | ------------- | 199 | | `lxml` (Python3.6) | 25 / s | 200 | | pluck | 430 / s | 201 | 202 | I'd like to benchmark a Perl regex, although I don't know how to write this kind of regex! Send a PR if you do :) 203 | 204 | ## To Do 205 | 206 | - [ ] Allow OR statements (e.g `'|"`). 207 | - [ ] Quotes match to quotes (single or double)? 208 | - [ ] Allow piping from standard in? 209 | - [x] API to handle strings, e.g. `PluckString(s string)` 210 | - [x] Add parallelism 211 | 212 | # License 213 | 214 | MIT 215 | 216 | # Acknowledgements 217 | 218 | Graphics by: www.vecteezy.com 219 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/schollz/pluck 2 | 3 | require ( 4 | github.com/BurntSushi/toml v0.3.1 5 | github.com/davecgh/go-spew v1.1.1 6 | github.com/pkg/errors v0.8.1 7 | github.com/pmezard/go-difflib v1.0.0 8 | github.com/sirupsen/logrus v1.3.0 9 | github.com/stretchr/testify v1.3.0 10 | github.com/urfave/cli v1.20.0 11 | golang.org/x/crypto v0.0.0-20190123085648-057139ce5d2b 12 | golang.org/x/sys v0.0.0-20190124100055-b90733256f2e 13 | ) 14 | -------------------------------------------------------------------------------- /goreleaser.yml: -------------------------------------------------------------------------------- 1 | build: 2 | binary: pluck 3 | goos: 4 | - windows 5 | - darwin 6 | - linux 7 | goarch: 8 | - amd64 9 | archive: 10 | format: zip -------------------------------------------------------------------------------- /main.go: 
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | "os" 7 | "strings" 8 | "time" 9 | 10 | "github.com/schollz/pluck/pluck" 11 | "github.com/urfave/cli" 12 | ) 13 | 14 | var version string 15 | 16 | func main() { 17 | app := cli.NewApp() 18 | app.Version = version 19 | app.Compiled = time.Now() 20 | app.Name = "pluck" 21 | app.Usage = "" 22 | app.UsageText = ` 23 | 1) Pluck all URLs from a website 24 | $ pluck -a '<' -a 'href' -a '"' -d '"' -l -1 -u https://nytimes.com 25 | 26 | 2) Pluck title from a HTML file 27 | $ pluck -a '' -d '<' -f test.html 28 | 29 | 3) Pluck using a configuration file. 30 | $ # Example config file 31 | $ cat config.toml 32 | [[pluck]] 33 | activators = ["<title>"] 34 | deactivator = "" 35 | name = "title" 36 | 37 | [[pluck]] 38 | activators = [""] 39 | deactivator = "<" 40 | limit = -1 41 | name = "ingredients" 42 | $ pluck -c config.toml -u https://goo.gl/DHmqmv 43 | 44 | 4) Get headlines from news.google.com 45 | $ pluck -a 'role="heading"' -a '>' -d '<' -t -s -u 'https://news.google.com/news/?ned=us&hl=en' 46 | 47 | 5) Pluck items from a block 48 | $ pluck -a 'Section 2' -a ' 0 { 119 | p.Load(c.GlobalString("config")) 120 | } else { 121 | if len(c.GlobalStringSlice("activator")) == 0 { 122 | fmt.Println("Must specify at least one activator. For example -a 'start'.\nSee help and usage with -h") 123 | return nil 124 | } 125 | if len(c.GlobalString("deactivator")) == 0 { 126 | fmt.Println("Must specify at deactivator. 
For example -d 'end'.\nSee help and usage with -h") 127 | return nil 128 | } 129 | p.Add(pluck.Config{ 130 | Activators: c.GlobalStringSlice("activator"), 131 | Deactivator: c.GlobalString("deactivator"), 132 | Limit: c.GlobalInt("limit"), 133 | Sanitize: c.GlobalBool("sanitize"), 134 | Finisher: c.GlobalString("finisher"), 135 | Permanent: c.GlobalInt("permanent"), 136 | }) 137 | } 138 | 139 | if len(c.GlobalString("file")) > 0 { 140 | err = p.PluckFile(c.GlobalString("file")) 141 | } else { 142 | err = p.PluckURL(c.GlobalString("url")) 143 | } 144 | if err != nil { 145 | return err 146 | } 147 | var result string 148 | if c.GlobalBool("text") { 149 | results, ok := p.Result()["0"].([]string) 150 | if !ok { 151 | results2, ok2 := p.Result()["0"].(string) 152 | if !ok2 { 153 | fmt.Println("Error?") 154 | os.Exit(-1) 155 | } else { 156 | result = results2 157 | } 158 | } else { 159 | result = strings.Join(results, "\n\n") 160 | } 161 | } else { 162 | result = p.ResultJSON(true) 163 | } 164 | if c.GlobalString("output") != "" { 165 | return ioutil.WriteFile(c.GlobalString("output"), []byte(result), 0644) 166 | } else { 167 | fmt.Println(result) 168 | } 169 | 170 | return nil 171 | } 172 | 173 | err := app.Run(os.Args) 174 | if err != nil { 175 | fmt.Print(err) 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /pluck/plucker.go: -------------------------------------------------------------------------------- 1 | package pluck 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "encoding/json" 7 | "html" 8 | "io" 9 | "io/ioutil" 10 | "net/http" 11 | "os" 12 | "strconv" 13 | "strings" 14 | "sync" 15 | 16 | "github.com/BurntSushi/toml" 17 | "github.com/pkg/errors" 18 | "github.com/schollz/pluck/pluck/striphtml" 19 | log "github.com/sirupsen/logrus" 20 | ) 21 | 22 | // Config specifies parameters for plucking 23 | type Config struct { 24 | Activators []string // must be found in order, before capturing commences 25 | Permanent 
int // number of activators that stay permanently (counted from left to right) 26 | Deactivator string // restarts capturing 27 | Finisher string // finishes capturing this pluck 28 | Limit int // specifies the number of times capturing can occur 29 | Name string // the key in the returned map, after completion 30 | Sanitize bool 31 | Maximum int // maximum number of characters for a capture 32 | } 33 | 34 | type configs struct { 35 | Pluck []Config 36 | } 37 | 38 | // Plucker stores the result and the types of things to pluck 39 | type Plucker struct { 40 | pluckers []pluckUnit 41 | result map[string]interface{} 42 | } 43 | 44 | type pluckUnit struct { 45 | config Config 46 | activators [][]byte 47 | permanent int 48 | maximum int 49 | deactivator []byte 50 | finisher []byte 51 | captured [][]byte 52 | numActivated int 53 | captureByte []byte 54 | captureI int 55 | activeI int 56 | deactiveI int 57 | finisherI int 58 | isFinished bool 59 | } 60 | 61 | // New returns a new plucker 62 | // which can later have items added to it 63 | // or can load a config file 64 | // and then can be used to parse. 65 | func New() (*Plucker, error) { 66 | log.SetLevel(log.WarnLevel) 67 | p := new(Plucker) 68 | p.pluckers = []pluckUnit{} 69 | return p, nil 70 | } 71 | 72 | // Verbose toggles debug mode 73 | func (p *Plucker) Verbose(makeVerbose bool) { 74 | if makeVerbose { 75 | log.SetLevel(log.DebugLevel) 76 | } else { 77 | log.SetLevel(log.WarnLevel) 78 | } 79 | } 80 | 81 | // Configuration returns an array of the current 82 | // Config for each plucker. 
83 | func (p *Plucker) Configuration() (c []Config) { 84 | c = make([]Config, len(p.pluckers)) 85 | for i, unit := range p.pluckers { 86 | c[i] = unit.config 87 | } 88 | return 89 | } 90 | 91 | // Add adds a unit 92 | // to pluck with specified parameters 93 | func (p *Plucker) Add(c Config) { 94 | var u pluckUnit 95 | u.config = c 96 | if u.config.Limit == 0 { 97 | u.config.Limit = -1 98 | } 99 | if u.config.Name == "" { 100 | u.config.Name = strconv.Itoa(len(p.pluckers)) 101 | } 102 | u.activators = make([][]byte, len(c.Activators)) 103 | for i := range c.Activators { 104 | u.activators[i] = []byte(c.Activators[i]) 105 | } 106 | u.permanent = c.Permanent 107 | u.deactivator = []byte(c.Deactivator) 108 | if len(c.Finisher) > 0 { 109 | u.finisher = []byte(c.Finisher) 110 | } else { 111 | u.finisher = nil 112 | } 113 | u.maximum = -1 114 | if c.Maximum > 0 { 115 | u.maximum = c.Maximum 116 | } 117 | u.captureByte = make([]byte, 100000) 118 | u.captured = [][]byte{} 119 | p.pluckers = append(p.pluckers, u) 120 | log.Infof("Added plucker %+v", c) 121 | } 122 | 123 | // Load will load a YAML configuration file of untis 124 | // to pluck with specified parameters 125 | func (p *Plucker) Load(f string) (err error) { 126 | tomlData, err := ioutil.ReadFile(f) 127 | if err != nil { 128 | return errors.Wrap(err, "problem opening "+f) 129 | } 130 | log.Debugf("toml string: %s", string(tomlData)) 131 | p.LoadFromString(string(tomlData)) 132 | return 133 | } 134 | 135 | // LoadFromString will load a YAML configuration file of untis 136 | // to pluck with specified parameters 137 | func (p *Plucker) LoadFromString(tomlString string) (err error) { 138 | var conf configs 139 | _, err = toml.Decode(tomlString, &conf) 140 | log.Debugf("Loaded toml: %+v", conf) 141 | for i := range conf.Pluck { 142 | var c Config 143 | c.Activators = conf.Pluck[i].Activators 144 | c.Deactivator = conf.Pluck[i].Deactivator 145 | c.Finisher = conf.Pluck[i].Finisher 146 | c.Limit = conf.Pluck[i].Limit 
147 | c.Name = conf.Pluck[i].Name 148 | c.Permanent = conf.Pluck[i].Permanent 149 | c.Sanitize = conf.Pluck[i].Sanitize 150 | c.Maximum = conf.Pluck[i].Maximum 151 | p.Add(c) 152 | } 153 | return 154 | } 155 | 156 | // PluckString takes a string as input 157 | // and uses the specified parameters and generates 158 | // a map (p.result) with the finished results. 159 | // The streaming can be enabled by setting it to true. 160 | func (p *Plucker) PluckString(s string, stream ...bool) (err error) { 161 | r := bufio.NewReader(strings.NewReader(s)) 162 | if len(stream) > 0 && stream[0] { 163 | return p.PluckStream(r) 164 | } 165 | return p.Pluck(r) 166 | } 167 | 168 | // PluckFile takes a file as input 169 | // and uses the specified parameters and generates 170 | // a map (p.result) with the finished results. The streaming 171 | // can be enabled by setting it to true. 172 | func (p *Plucker) PluckFile(f string, stream ...bool) (err error) { 173 | r1, err := os.Open(f) 174 | defer r1.Close() 175 | if err != nil { 176 | return 177 | } 178 | r := bufio.NewReader(r1) 179 | if len(stream) > 0 && stream[0] { 180 | return p.PluckStream(r) 181 | } 182 | return p.Pluck(r) 183 | } 184 | 185 | // PluckURL takes a URL as input 186 | // and uses the specified parameters and generates 187 | // a map (p.result) with the finished results 188 | func (p *Plucker) PluckURL(url string, stream ...bool) (err error) { 189 | client := &http.Client{} 190 | request, err := http.NewRequest("GET", url, nil) 191 | if err != nil { 192 | return 193 | } 194 | request.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0") 195 | resp, err := client.Do(request) 196 | if err != nil { 197 | return 198 | } 199 | defer resp.Body.Close() 200 | r := bufio.NewReader(resp.Body) 201 | if len(stream) > 0 && stream[0] { 202 | return p.PluckStream(r) 203 | } 204 | return p.Pluck(r) 205 | } 206 | 207 | // Pluck takes a buffered reader stream and 208 | // extracts the text 
from it. This spawns a thread for 209 | // each plucker and copies the entire buffer to memory, 210 | // so that each plucker works in parallel. 211 | func (p *Plucker) Pluck(r *bufio.Reader) (err error) { 212 | allBytes, _ := r.ReadBytes(0) 213 | var wg sync.WaitGroup 214 | wg.Add(len(p.pluckers)) 215 | for i := 0; i < len(p.pluckers); i++ { 216 | go func(i int, allBytes []byte) { 217 | defer wg.Done() 218 | for _, curByte := range allBytes { 219 | if p.pluckers[i].numActivated < len(p.pluckers[i].activators) { 220 | // look for activators 221 | if curByte == p.pluckers[i].activators[p.pluckers[i].numActivated][p.pluckers[i].activeI] { 222 | p.pluckers[i].activeI++ 223 | if p.pluckers[i].activeI == len(p.pluckers[i].activators[p.pluckers[i].numActivated]) { 224 | log.Info(string(curByte), "Activated") 225 | p.pluckers[i].numActivated++ 226 | p.pluckers[i].activeI = 0 227 | } 228 | } else { 229 | p.pluckers[i].activeI = 0 230 | } 231 | } else { 232 | // add to capture 233 | p.pluckers[i].captureByte[p.pluckers[i].captureI] = curByte 234 | p.pluckers[i].captureI++ 235 | // look for deactivators 236 | if curByte == p.pluckers[i].deactivator[p.pluckers[i].deactiveI] { 237 | p.pluckers[i].deactiveI++ 238 | if p.pluckers[i].deactiveI == len(p.pluckers[i].deactivator) { 239 | log.Info(string(curByte), "Deactivated") 240 | // add capture 241 | log.Info(string(p.pluckers[i].captureByte[:p.pluckers[i].captureI-len(p.pluckers[i].deactivator)])) 242 | tempByte := make([]byte, p.pluckers[i].captureI-len(p.pluckers[i].deactivator)) 243 | copy(tempByte, p.pluckers[i].captureByte[:p.pluckers[i].captureI-len(p.pluckers[i].deactivator)]) 244 | if p.pluckers[i].config.Sanitize { 245 | tempByte = bytes.Replace(tempByte, []byte("\\u003c"), []byte("<"), -1) 246 | tempByte = bytes.Replace(tempByte, []byte("\\u003e"), []byte(">"), -1) 247 | tempByte = bytes.Replace(tempByte, []byte("\\u0026"), []byte("&"), -1) 248 | tempByte = 
[]byte(striphtml.StripTags(html.UnescapeString(string(tempByte)))) 249 | } 250 | tempByte = bytes.TrimSpace(tempByte) 251 | if p.pluckers[i].maximum < 1 || len(tempByte) < p.pluckers[i].maximum { 252 | p.pluckers[i].captured = append(p.pluckers[i].captured, tempByte) 253 | } 254 | // reset 255 | p.pluckers[i].numActivated = p.pluckers[i].permanent 256 | p.pluckers[i].deactiveI = 0 257 | p.pluckers[i].captureI = 0 258 | } 259 | } else { 260 | p.pluckers[i].activeI = 0 261 | p.pluckers[i].deactiveI = 0 262 | } 263 | } 264 | 265 | // look for finisher 266 | if p.pluckers[i].finisher != nil && len(p.pluckers[i].captured) > 0 { 267 | if curByte == p.pluckers[i].finisher[p.pluckers[i].finisherI] { 268 | p.pluckers[i].finisherI++ 269 | if p.pluckers[i].finisherI == len(p.pluckers[i].finisher) { 270 | log.Info(string(curByte), "Finished") 271 | p.pluckers[i].isFinished = true 272 | } 273 | } else { 274 | p.pluckers[i].finisherI = 0 275 | } 276 | } 277 | 278 | if len(p.pluckers[i].captured) == p.pluckers[i].config.Limit { 279 | p.pluckers[i].isFinished = true 280 | } 281 | if p.pluckers[i].isFinished { 282 | break 283 | } 284 | } 285 | log.Infof("plucker %d finished", i) 286 | }(i, allBytes) 287 | } 288 | wg.Wait() 289 | p.generateResult() 290 | return 291 | } 292 | 293 | // PluckStream takes a buffered reader stream and streams one 294 | // byte at a time and processes all pluckers serially and 295 | // simultaneously. 
296 | func (p *Plucker) PluckStream(r *bufio.Reader) (err error) { 297 | var finished bool 298 | for { 299 | curByte, errRead := r.ReadByte() 300 | if errRead == io.EOF || finished { 301 | break 302 | } 303 | finished = true 304 | for i := range p.pluckers { 305 | if p.pluckers[i].isFinished { 306 | continue 307 | } 308 | finished = false 309 | if p.pluckers[i].numActivated < len(p.pluckers[i].activators) { 310 | // look for activators 311 | if curByte == p.pluckers[i].activators[p.pluckers[i].numActivated][p.pluckers[i].activeI] { 312 | p.pluckers[i].activeI++ 313 | if p.pluckers[i].activeI == len(p.pluckers[i].activators[p.pluckers[i].numActivated]) { 314 | log.Info(string(curByte), "Activated") 315 | p.pluckers[i].numActivated++ 316 | p.pluckers[i].activeI = 0 317 | } 318 | } else { 319 | p.pluckers[i].activeI = 0 320 | } 321 | } else { 322 | // add to capture 323 | p.pluckers[i].captureByte[p.pluckers[i].captureI] = curByte 324 | p.pluckers[i].captureI++ 325 | // look for deactivators 326 | if curByte == p.pluckers[i].deactivator[p.pluckers[i].deactiveI] { 327 | p.pluckers[i].deactiveI++ 328 | if p.pluckers[i].deactiveI == len(p.pluckers[i].deactivator) { 329 | log.Info(string(curByte), "Deactivated") 330 | // add capture 331 | log.Info(string(p.pluckers[i].captureByte[:p.pluckers[i].captureI-len(p.pluckers[i].deactivator)])) 332 | tempByte := make([]byte, p.pluckers[i].captureI-len(p.pluckers[i].deactivator)) 333 | copy(tempByte, p.pluckers[i].captureByte[:p.pluckers[i].captureI-len(p.pluckers[i].deactivator)]) 334 | if p.pluckers[i].config.Sanitize { 335 | tempByte = bytes.Replace(tempByte, []byte("\\u003c"), []byte("<"), -1) 336 | tempByte = bytes.Replace(tempByte, []byte("\\u003e"), []byte(">"), -1) 337 | tempByte = bytes.Replace(tempByte, []byte("\\u0026"), []byte("&"), -1) 338 | tempByte = []byte(striphtml.StripTags(html.UnescapeString(string(tempByte)))) 339 | } 340 | tempByte = bytes.TrimSpace(tempByte) 341 | p.pluckers[i].captured = 
append(p.pluckers[i].captured, tempByte) 342 | // reset 343 | p.pluckers[i].numActivated = p.pluckers[i].permanent 344 | p.pluckers[i].deactiveI = 0 345 | p.pluckers[i].captureI = 0 346 | } 347 | } else { 348 | p.pluckers[i].activeI = 0 349 | p.pluckers[i].deactiveI = 0 350 | } 351 | } 352 | 353 | // look for finisher 354 | if p.pluckers[i].finisher != nil { 355 | if curByte == p.pluckers[i].finisher[p.pluckers[i].finisherI] { 356 | p.pluckers[i].finisherI++ 357 | if p.pluckers[i].finisherI == len(p.pluckers[i].finisher) { 358 | log.Info(string(curByte), "Finished") 359 | p.pluckers[i].isFinished = true 360 | } 361 | } else { 362 | p.pluckers[i].finisherI = 0 363 | } 364 | } 365 | 366 | if len(p.pluckers[i].captured) == p.pluckers[i].config.Limit { 367 | p.pluckers[i].isFinished = true 368 | } 369 | } 370 | } 371 | p.generateResult() 372 | return 373 | } 374 | 375 | func (p *Plucker) generateResult() { 376 | p.result = make(map[string]interface{}) 377 | for i := range p.pluckers { 378 | if len(p.pluckers[i].captured) == 0 { 379 | p.result[p.pluckers[i].config.Name] = "" 380 | } else if len(p.pluckers[i].captured) == 1 { 381 | p.result[p.pluckers[i].config.Name] = string(p.pluckers[i].captured[0]) 382 | } else { 383 | results := make([]string, len(p.pluckers[i].captured)) 384 | for j, r := range p.pluckers[i].captured { 385 | results[j] = string(r) 386 | } 387 | if len(results) == 0 { 388 | p.result[p.pluckers[i].config.Name] = "" 389 | } else { 390 | p.result[p.pluckers[i].config.Name] = results 391 | } 392 | } 393 | } 394 | } 395 | 396 | // ResultJSON returns the result, formatted as JSON. 397 | // If their are no results, it returns an empty string. 
398 | func (p *Plucker) ResultJSON(indent ...bool) string { 399 | totalResults := 0 400 | for key := range p.result { 401 | b, _ := json.Marshal(p.result[key]) 402 | totalResults += len(b) 403 | } 404 | if totalResults == len(p.result)*2 { // results == 2 because its just [] 405 | return "" 406 | } 407 | var err error 408 | var resultJSON []byte 409 | if len(indent) > 0 && indent[0] { 410 | resultJSON, err = json.MarshalIndent(p.result, "", " ") 411 | } else { 412 | resultJSON, err = json.Marshal(p.result) 413 | } 414 | if err != nil { 415 | log.Error(errors.Wrap(err, "result marshalling failed")) 416 | } 417 | return string(resultJSON) 418 | } 419 | 420 | // Result returns the raw result 421 | func (p *Plucker) Result() map[string]interface{} { 422 | return p.result 423 | } 424 | -------------------------------------------------------------------------------- /pluck/plucker_test.go: -------------------------------------------------------------------------------- 1 | package pluck 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func BenchmarkParseFile(b *testing.B) { 10 | for n := 0; n < b.N; n++ { 11 | p, _ := New() 12 | p.Verbose(false) 13 | p.Load("test/config.toml") 14 | p.PluckFile("test/test.txt") 15 | } 16 | } 17 | 18 | func BenchmarkParseFileStream(b *testing.B) { 19 | for n := 0; n < b.N; n++ { 20 | p, _ := New() 21 | p.Verbose(false) 22 | p.Load("test/config.toml") 23 | p.PluckFile("test/test.txt", true) 24 | } 25 | } 26 | 27 | func TestPluck0(t *testing.T) { 28 | p, _ := New() 29 | p.Verbose(false) 30 | err := p.Load("test/config.toml") 31 | if err != nil { 32 | t.Error(err) 33 | } 34 | err = p.PluckFile("test/test.txt") 35 | if err != nil { 36 | t.Error(err) 37 | } 38 | } 39 | func TestPluck1(t *testing.T) { 40 | p, err := New() 41 | p.Verbose(false) 42 | if err != nil { 43 | t.Error(err) 44 | } 45 | err = p.Load("test/config.toml") 46 | if err != nil { 47 | t.Error(err) 48 | } 49 | assert.Equal(t, 3, 
len(p.pluckers)) 50 | assert.Equal(t, "0", p.pluckers[0].config.Name) 51 | assert.Equal(t, -1, p.pluckers[0].config.Limit) 52 | assert.Equal(t, "options", p.pluckers[1].config.Name) 53 | assert.Equal(t, "songs", p.pluckers[2].config.Name) 54 | 55 | p.PluckFile("test/test.txt") 56 | assert.Equal(t, `{ 57 | "0": "Category Archives: Song of the Day Podcast", 58 | "options": [ 59 | "2009 Countdown", 60 | "2010 Countdown", 61 | "2011 Countdown" 62 | ], 63 | "songs": [ 64 | "Juana Molina \u0026#8211; Cosoco", 65 | "Charms – Siren", 66 | "Daddy Issues \u0026#8211; Locked Out", 67 | "Cloud Control \u0026#8211; Rainbow City", 68 | "Kevin Morby \u0026#8211; Come to Me Now", 69 | "Les Big Byrd \u0026#8211; Two Man Gang", 70 | "Thunderpussy – Velvet Noose", 71 | "Captain, We\u0026#8217;re Sinking \u0026#8211; Trying Year", 72 | "Mammút – “The Moon Will Never Turn On Me”", 73 | "Songhoy Blues \u0026#8211; Voter" 74 | ] 75 | }`, p.ResultJSON(true)) 76 | 77 | p, _ = New() 78 | p.Load("test/food.toml") 79 | p.PluckURL("http://www.foodnetwork.com/recipes/food-network-kitchen/15-minute-shrimp-tacos-with-spicy-chipotle-slaw-3676441") 80 | assert.Equal(t, `15-Minute Shrimp Tacos with Spicy Chipotle Slaw Recipe | Food Network Kitchen | Food Network`, p.Result()["title"]) 81 | 82 | p, _ = New() 83 | p.Add(Config{ 84 | Activators: []string{"X", "Y"}, 85 | Deactivator: "Z", 86 | }) 87 | p.PluckString("XaZbYcZd") 88 | assert.Equal(t, `c`, p.Result()["0"]) 89 | } 90 | 91 | func TestPluck1Stream(t *testing.T) { 92 | p, err := New() 93 | p.Verbose(false) 94 | if err != nil { 95 | t.Error(err) 96 | } 97 | err = p.Load("test/config.toml") 98 | if err != nil { 99 | t.Error(err) 100 | } 101 | assert.Equal(t, 3, len(p.pluckers)) 102 | assert.Equal(t, "0", p.pluckers[0].config.Name) 103 | assert.Equal(t, -1, p.pluckers[0].config.Limit) 104 | assert.Equal(t, "options", p.pluckers[1].config.Name) 105 | assert.Equal(t, "songs", p.pluckers[2].config.Name) 106 | 107 | p.PluckFile("test/test.txt", true) 
108 | assert.Equal(t, `{ 109 | "0": "Category Archives: Song of the Day Podcast", 110 | "options": [ 111 | "2009 Countdown", 112 | "2010 Countdown", 113 | "2011 Countdown" 114 | ], 115 | "songs": [ 116 | "Juana Molina \u0026#8211; Cosoco", 117 | "Charms – Siren", 118 | "Daddy Issues \u0026#8211; Locked Out", 119 | "Cloud Control \u0026#8211; Rainbow City", 120 | "Kevin Morby \u0026#8211; Come to Me Now", 121 | "Les Big Byrd \u0026#8211; Two Man Gang", 122 | "Thunderpussy – Velvet Noose", 123 | "Captain, We\u0026#8217;re Sinking \u0026#8211; Trying Year", 124 | "Mammút – “The Moon Will Never Turn On Me”", 125 | "Songhoy Blues \u0026#8211; Voter" 126 | ] 127 | }`, p.ResultJSON(true)) 128 | 129 | } 130 | 131 | func TestPluck2(t *testing.T) { 132 | p, err := New() 133 | p.Verbose(false) 134 | if err != nil { 135 | t.Error(err) 136 | } 137 | err = p.Load("test/config2.toml") 138 | if err != nil { 139 | t.Error(err) 140 | } 141 | 142 | p.PluckFile("test/test.txt") 143 | assert.Equal(t, `{ 144 | "0": "Category Archives: Song of the Day Podcast", 145 | "options": [ 146 | "2009 Countdown", 147 | "2010 Countdown", 148 | "2011 Countdown" 149 | ], 150 | "songs": [ 151 | "Juana Molina \u0026#8211; Cosoco", 152 | "Charms – Siren", 153 | "Daddy Issues \u0026#8211; Locked Out", 154 | "Cloud Control \u0026#8211; Rainbow City", 155 | "Kevin Morby \u0026#8211; Come to Me Now" 156 | ] 157 | }`, p.ResultJSON(true)) 158 | } 159 | 160 | func TestPluckSongs(t *testing.T) { 161 | p, err := New() 162 | p.Verbose(false) 163 | if err != nil { 164 | t.Error(err) 165 | } 166 | err = p.Load("test/song.toml") 167 | if err != nil { 168 | t.Error(err) 169 | } 170 | 171 | p.PluckFile("test/song.html") 172 | assert.Equal(t, `{ 173 | "songs": [ 174 | "/music/The+War+on+Drugs/_/An+Ocean+in+Between+the+Waves", 175 | "/music/The+War+on+Drugs/_/Suffering", 176 | "/music/Spoon/_/Inside+Out", 177 | "/music/Real+Estate/_/Crime" 178 | ] 179 | }`, p.ResultJSON(true)) 180 | } 181 | 182 | func 
TestPluckSkipSection(t *testing.T) { 183 | p, err := New() 184 | p.Verbose(false) 185 | if err != nil { 186 | t.Error(err) 187 | } 188 | p.Add(Config{ 189 | Activators: []string{"Section 2", "a", "href", `"`}, 190 | Permanent: 1, 191 | Deactivator: `"`, 192 | }) 193 | err = p.PluckString(` 194 |

Section 1

195 | 1 196 | 2 197 |

Section 2

198 | 3 199 | 4 200 | `) 201 | assert.Nil(t, err) 202 | assert.Equal(t, `{ 203 | "0": [ 204 | "link3", 205 | "link4" 206 | ] 207 | }`, p.ResultJSON(true)) 208 | } 209 | 210 | func TestPluckCutSection(t *testing.T) { 211 | p, err := New() 212 | p.Verbose(false) 213 | if err != nil { 214 | t.Error(err) 215 | } 216 | p.Add(Config{ 217 | Activators: []string{"Section 2", "a", "href", `"`}, 218 | Permanent: 1, 219 | Deactivator: `"`, 220 | Finisher: "Section 3", 221 | Maximum: 6, 222 | }) 223 | err = p.PluckString(`

Section 1

224 | 1 225 | 2 226 |

Section 2

227 | 3 228 | 4 229 |

Section 3

230 | 5 231 | 6`) 232 | assert.Nil(t, err) 233 | assert.Equal(t, `{"0":"link3"}`, p.ResultJSON()) 234 | 235 | assert.Equal(t, []Config{Config{ 236 | Activators: []string{"Section 2", "a", "href", `"`}, 237 | Permanent: 1, 238 | Deactivator: `"`, 239 | Finisher: "Section 3", 240 | Limit: -1, 241 | Name: "0", 242 | Maximum: 6, 243 | }}, p.Configuration()) 244 | } 245 | -------------------------------------------------------------------------------- /pluck/striphtml/striphtml_test.go: -------------------------------------------------------------------------------- 1 | package striphtml 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestStripTags(t *testing.T) { 10 | assert.Equal(t, "This is some text", StripTags("This is
some text
")) 11 | } 12 | -------------------------------------------------------------------------------- /pluck/test/config.toml: -------------------------------------------------------------------------------- 1 | [[pluck]] 2 | activators = [""] 3 | deactivator = "" 4 | 5 | [[pluck]] 6 | activators = ["""