├── LICENSE ├── lua-spider-1.0 ├── LICENSE ├── lua-spider │ ├── filter.lua │ ├── bones.lua │ ├── crawler.lua │ ├── init.lua │ └── extractor.lua └── README.md ├── lua-spider-1.0.tar.gz ├── lua-spider-1.0-1.src.rock ├── lua-spider-1.0-1.rockspec └── README.md /LICENSE: -------------------------------------------------------------------------------- 1 | GPLv3 2 | -------------------------------------------------------------------------------- /lua-spider-1.0/LICENSE: -------------------------------------------------------------------------------- 1 | GPLv3 2 | -------------------------------------------------------------------------------- /lua-spider-1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/okpanic/lua-spider/HEAD/lua-spider-1.0.tar.gz -------------------------------------------------------------------------------- /lua-spider-1.0-1.src.rock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/okpanic/lua-spider/HEAD/lua-spider-1.0-1.src.rock -------------------------------------------------------------------------------- /lua-spider-1.0-1.rockspec: -------------------------------------------------------------------------------- 1 | rockspec_format = "1.0" 2 | package = "lua-spider" 3 | version = "1.0-1" 4 | 5 | source = { 6 | url = "https://github.com/okpanic/lua-spider/raw/master/lua-spider-1.0.tar.gz", 7 | tag = "1.0" 8 | } 9 | 10 | description = { 11 | summary = "web scraper", 12 | detailed = "A web scraper for lua based on the gumbo HTML5 parser and xpath like content extraction.", 13 | homepage = "https://github.com/okpanic/lua-spider", 14 | license = "GPLv3" 15 | } 16 | 17 | dependencies = { 18 | "lua ~> 5.1" 19 | } 20 | 21 | build = { 22 | type = "builtin", 23 | modules = { 24 | ["lua-spider"] = "lua-spider/init.lua", 25 | ["lua-spider.bones"] = "lua-spider/bones.lua", 26 | ["lua-spider.filter"] = 
"lua-spider/filter.lua", 27 | ["lua-spider.extractor"] = "lua-spider/extractor.lua", 28 | ["lua-spider.crawler"] = "lua-spider/crawler.lua" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /lua-spider-1.0/lua-spider/filter.lua: -------------------------------------------------------------------------------- 1 | local bones = require("lua-spider.bones") 2 | local filter = {} 3 | 4 | function filter._containing( text, query ) 5 | local out 6 | if type(text) == "table" then 7 | out = {} 8 | for _, v in ipairs(text) do 9 | if string.match(v, query) then 10 | out[#out+1] = v 11 | end 12 | end 13 | else 14 | if string.match(text, query) then 15 | out = text 16 | end 17 | end 18 | if out then 19 | return out 20 | else 21 | return nil 22 | end 23 | end 24 | 25 | function filter._cutout( text, cut ) 26 | local out 27 | if type(text) == "table" then 28 | out = {} 29 | for _, v in ipairs(text) do 30 | out[#out+1] = v:gsub(cut, "") 31 | end 32 | else 33 | out = text:gsub(cut, "") 34 | end 35 | if out then 36 | return out 37 | else 38 | return nil 39 | end 40 | end 41 | 42 | function filter._trimwhitespace( text ) 43 | return bones.trimwhitespace(text) 44 | end 45 | 46 | function filter._rootlink( url, root ) 47 | local t 48 | if type(url) == "table" then 49 | t = {} 50 | for k, v in ipairs(url) do 51 | t[k] = filter._cutout(v, "[%&|%?]ie%=UTF8.*") 52 | if not string.match(t[k], root) then 53 | t[k] = root .. "/" .. t[k]:gsub("^/", "") 54 | end 55 | end 56 | else 57 | t = filter._cutout(url, "[%&|%?]ie%=UTF8.*") 58 | if not string.match(t, root) then 59 | t = root .. "/" .. 
t:gsub("^/", "") 60 | end 61 | end 62 | return t 63 | end 64 | 65 | function filter._gsub( s, g ) 66 | return s:gsub(g[1], g[2]) or nil 67 | end 68 | 69 | function filter._pricefix( price ) 70 | local t 71 | if type(price) == "table" then 72 | t = {} 73 | for k, v in ipairs(price) do 74 | t[k] = filter._cutout(v, "%$") 75 | t[k] = filter._cutout(t[k], ",") 76 | end 77 | else 78 | t = filter._cutout(price, "%$") 79 | t = filter._cutout(t, ",") 80 | end 81 | return tonumber(t) 82 | end 83 | 84 | function filter._justext( html ) 85 | local out = {} 86 | local htmlf = bones.temppath() .. ".html" 87 | bones.writefile(htmlf, html) 88 | local cmd = [[python -m justext --no-headings -s English "$htmlfile"]] 89 | cmd = cmd:gsub("$htmlfile", htmlf) 90 | local ex = assert(io.popen(cmd)) 91 | for line in ex:lines() do 92 | out[#out+1] = line:gsub("^%b
", "") 93 | end 94 | return out or nil 95 | end 96 | 97 | return filter 98 | -------------------------------------------------------------------------------- /lua-spider-1.0/lua-spider/bones.lua: -------------------------------------------------------------------------------- 1 | local stringx = require ("pl.stringx") 2 | local paths = require("paths") 3 | local file = require("pl.file") 4 | 5 | local bones = {} 6 | 7 | function bones.split( s, d ) 8 | return stringx.split(s, d) 9 | end 10 | 11 | function bones.trimwhitespace( s ) 12 | -- https://github.com/craigbarnes/lua-gumbo/blob/master/gumbo/dom/Document.lua 13 | local whitespace = "[ \t\n\f\r]+" 14 | local _trim = "^[ \t\n\f\r]*(.-)[ \t\n\f\r]*$" 15 | s = tostring(s) 16 | s = s:gsub(whitespace, " ") 17 | s = s:gsub(_trim, "%1") 18 | return s 19 | end 20 | 21 | function bones.temppath() 22 | return(paths.tmpname()) 23 | end 24 | 25 | function bones.writefile( f, c ) 26 | file.write(f, c) 27 | return nil 28 | end 29 | 30 | function bones.readfile( f ) 31 | return file.read(f) 32 | end 33 | 34 | ---+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 35 | -- 36 | -- log.lua 37 | -- 38 | -- Copyright (c) 2016 rxi 39 | -- 40 | -- This library is free software; you can redistribute it and/or modify it 41 | -- under the terms of the MIT license. See LICENSE for details. 
--
---+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
local log = { _version = "0.1.0" }

log.usecolor = true
log.outfile = nil
log.level = "trace"

local modes = {
  { name = "trace", color = "\27[34m", },
  { name = "debug", color = "\27[36m", },
  { name = "info",  color = "\27[32m", },
  { name = "warn",  color = "\27[33m", },
  { name = "error", color = "\27[31m", },
  { name = "fatal", color = "\27[35m", },
}

-- level name -> severity rank (position in `modes`)
local levels = {}
for i, v in ipairs(modes) do
  levels[v.name] = i
end

-- Round x to the nearest multiple of `increment` (default 1).
local round = function(x, increment)
  increment = increment or 1
  x = x / increment
  return (x > 0 and math.floor(x + .5) or math.ceil(x - .5)) * increment
end

local _tostring = tostring

-- Joins all arguments with spaces; numbers are rounded to two decimals.
-- Shadows the global tostring for the rest of this file only.
local tostring = function(...)
  local t = {}
  for i = 1, select('#', ...) do
    local x = select(i, ...)
    if type(x) == "number" then
      x = round(x, .01)
    end
    t[#t + 1] = _tostring(x)
  end
  return table.concat(t, " ")
end

-- Create log.trace, log.debug, log.info, log.warn, log.error, log.fatal.
for i, x in ipairs(modes) do
  log[x.name] = function(...)
    -- Return early if we're below the log level. An unknown log.level is
    -- treated as the lowest level instead of raising a nil comparison error.
    if i < (levels[log.level] or 1) then
      return
    end
    local msg = tostring(...)
    local info = debug.getinfo(2, "Sl")
    local lineinfo = info.short_src .. ":" .. info.currentline
    -- Output to console
    print(string.format("%s[%-6s%s] %s",
                        log.usecolor and x.color or "",
                        lineinfo,
                        log.usecolor and "\27[0m" or "",
                        msg))
    -- Output to log file
    if log.outfile then
      local fp, err = io.open(log.outfile, "a")
      if fp then
        fp:write(string.format("[%s] %s\n", lineinfo, msg))
        fp:close()
      else
        -- an unwritable log file must not take the caller down
        io.stderr:write("log: cannot open log file: ", _tostring(err), "\n")
      end
    end
  end
end

bones.log = log

return bones

--------------------------------------------------------------------------------
/lua-spider-1.0/lua-spider/crawler.lua:
--------------------------------------------------------------------------------
local bones = require("lua-spider.bones")

--- Crawler prototype: downloads pages, parses them and runs extraction.
-- `html` and `doc` are caches keyed by URL. They live on the prototype, so
-- every object created with :new() that does not define its own shares them.
local _crawler = {
  filter = require("lua-spider.filter"),
  extract = require("lua-spider.extractor"),
  cURL = require("cURL"),
  gumbo = require("gumbo"),
  _chrome = require("lua-chrome"),
  redis = { port = 9000, uri = '127.0.0.1' },
  log = bones.log,
  html = {},
  doc = {}
}

--- Run the filters of one filter table over extracted data.
-- @param _doc    parsed document
-- @param _v      template node (supplies `xpath` and `selection`)
-- @param _filter table mapping filter name -> filter argument
-- @param npass   optional input; when absent the data is extracted from `_doc`
-- @return the filtered value, or nil when a filter rejects a single value
--
-- Each filter now receives the output of the previous one. Before, every
-- filter was run on the original input and only the last result survived.
-- pairs() order is unspecified, so use an array of single-key tables (see
-- scrape) when order matters.
function _crawler.prefilter(_doc, _v, _filter, npass)
  if not npass then
    npass = _crawler.extract(_doc, _v.xpath, _v.selection) or ""
  end
  local temp
  for k, v in pairs(_filter) do
    local apply = _crawler.filter["_" .. k]
    if type(apply) ~= "function" then
      error("unknown filter '" .. tostring(k) .. "'", 2)
    end
    if type(npass) == "table" then
      temp = {}
      for _, j in ipairs(npass) do
        local kept = apply(j, v)
        -- append instead of assigning by index: a nil result would leave a
        -- hole and hide every later entry from ipairs / #
        if kept ~= nil then
          temp[#temp+1] = kept
        end
      end
    else
      temp = apply(npass, v)
    end
    if temp == nil then
      return nil
    end
    npass = temp
  end
  return temp
end

--- Extract the data described by a template node and apply its filters.
-- `_v.filter` is either one filter table ({ name = argument }) or an array
-- of such tables, applied in order.
-- @return the extracted (and filtered) value; "" when nothing was extracted
--         and no filter is set; nil when a filter rejects the value
function _crawler.scrape(_doc, _v)
  local extracted = _crawler.extract(_doc, _v.xpath, _v.selection) or ""
  if not _v.filter then
    return extracted
  end
  -- ">= 1": a one-element list used to fall into the single-table branch,
  -- which then looked up the non-existent filter "_1".
  if #_v.filter >= 1 then
    local f = extracted
    for _, ifilter in ipairs(_v.filter) do
      f = _crawler.prefilter(_doc, _v, ifilter, f)
      if f == nil then
        -- stop here; carrying on would silently re-extract the unfiltered data
        break
      end
    end
    return f
  end
  return _crawler.prefilter(_doc, _v, _v.filter, extracted)
end

--- Download a URL with cURL.
-- @param uri     address to fetch
-- @param headers optional array of header lines (default: a browser User-Agent)
-- @return response body as one string
function _crawler.curl( uri, headers )
  local dat = {}
  local con = _crawler.cURL.easy()
  headers = headers or {"User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleCrawlerKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
  -- NOTE(review): certificate verification is switched off here, so HTTPS
  -- responses are not authenticated. Kept for compatibility.
  con:setopt_ssl_verifypeer(false)
  con:setopt_httpheader(headers)
  con:setopt_url(uri)
  local ok, err = pcall(con.perform, con,
    {writefunction = function(str) dat[#dat+1] = str end})
  con:close() -- release the handle on failure too; it was never closed before
  if not ok then
    error(err, 0)
  end
  return table.concat(dat, "")
end

--- Render a URL with headless chrome (lua-chrome) and return its DOM dump.
-- @return the dumped DOM, or nil when chrome fails (the error is printed)
function _crawler.chrome( uri )
  local chrome = _crawler._chrome:new()
  chrome.url = uri
  local ok, err = pcall(chrome.dump, chrome)
  if not ok then
    print(err)
    return nil
  else
    return chrome.dom
  end
end

--- Parse raw HTML into a gumbo document.
function _crawler.parse( rawhtml )
  return _crawler.gumbo.parse(rawhtml)
end

--- Create an object that inherits from this crawler.
function _crawler:new(g)
  g = g or {}
  setmetatable(g, self)
  self.__index = self
  return g
end

--- Run justext over the cached HTML of `url`.
function _crawler:fulltext(url)
  return _crawler.filter["_justext"](self.html[url])
end

--- Fetch one URL and process it as the template node asks.
-- `node.xpath`    : extract with xpath and filters ("file://" reads local files)
-- `node.fulltext` : return the justext main text
-- `node.file`     : return the raw download (optional `node.header`)
-- @return the result, or nil
function _crawler:crawl(url, node)
  if type(url) ~= "string" then
    self.log.error("URL field given to crawler is not a string")
    return nil
  end
  local out
  if node.xpath then
    if string.match(url, "^file://") then
      url = url:gsub("^file://", "")
      self.html[url] = bones.readfile(url)
    else
      self.html[url] = self.html[url] or self.chrome(url) or self.curl(url)
    end
    self.doc[url] = self.doc[url] or self.parse(self.html[url])
    out = self.scrape(self.doc[url], node)
  elseif node.fulltext then
    self.html[url] = self.html[url] or self.curl(url)
    -- method call with the URL: `self.fulltext(html)` passed the page source
    -- as `self` and failed on `self.html[url]`
    out = self:fulltext(url)
  elseif node.file then
    -- dlfile downloads by itself; it used to receive the downloaded body as
    -- `self` and nil as the URL
    out = self:dlfile(url, node.header)
  end
  return out or nil
end

--- Download the content at `url`.
-- @param chead optional array of header lines
-- @return response body, or nil when `url` is not a string
function _crawler:dlfile(url, chead)
  if type(url) ~= "string" then
    self.log.error("URL field given to crawler is not a string")
    return nil
  end
  self.log.info("Downloading content at " .. url)
  return self.curl(url, chead) or nil
end

local crawler = _crawler:new()

return crawler

--------------------------------------------------------------------------------
/lua-spider-1.0/lua-spider/init.lua:
--------------------------------------------------------------------------------
local bones = require("lua-spider.bones")

local _spider = {
  crawler = require("lua-spider.crawler"),
  extractor = require("lua-spider.extractor")
}

-- True when `t` looks like a leaf template node: the first key other than
-- "filter"/"url" that pairs() yields holds a non-table value.
-- NOTE(review): pairs() order is unspecified, so a table mixing scalar and
-- table fields can be classified either way - confirm the intended rule.
local function isparam(t)
  if type(t) ~= "table" then return false end
  for k, v in pairs(t) do
    if k ~= "filter" and k ~= "url" then
      return type(v) ~= "table"
    end
  end
  return false -- used to fall through and return nil
end

-- True when `l` is a dotted path ("a.b.c") whose first segment is a key of
-- `cake`. Anything containing "http" is treated as a real URL instead.
local function islink(l, cake)
  if not l then return false end
  if type(l) == "table" then return false end
  if string.match(l, "http") then return false end
  if string.match(l, "%.") then
    l = bones.split(l, ".")
  else
    l = {l}
  end
  if cake[l[1]] then
    return true
  else
    return false
  end
end

-- Flattens nested arrays into one new array; non-tables are returned as is.
local function melttable(arr)
  if type(arr) == "table" then
    local result = {}
    local function toflat(_arr)
      for _, v in ipairs(_arr) do
        if type(v) == "table" then
          toflat(v)
        else
          table.insert(result, v)
        end
      end
    end
    toflat(arr)
    return result
  else
    return arr
  end
end

-- Follows the dotted path `l` through `outputcake` and returns what it finds;
-- raises "Link path incorrect" when a segment is missing.
local function getlink(l, outputcake)
  assert(type(l) == "string")
  assert(type(outputcake) == "table")
  if string.match(l, "%.") then
    l = bones.split(l, ".")
  else
    l = {l}
  end
  for _, j in ipairs(l) do
    if not outputcake[j] then
      error("Link path incorrect")
    end
    outputcake = outputcake[j]
  end
  return outputcake
end

-- True when `t` holds (under a key other than "filter"/"url") at least one
-- table for which isparam() is true, i.e. `t` is a nested layer.
local function isnewlayer(t)
  if not t then return false end
  if type(t) ~= "table" then return false end
  for k, v in pairs(t) do
    if k ~= "filter" and k ~= "url" then
      if type(v) == "table" then
        if isparam(v) then
          return true
        end
      end
    end
  end
  return false
end

-- Crawler shared by every _crawl call, created on first use. It used to be
-- assigned without `local`, which leaked a global named `aspider`.
local aspider

-- Crawls one URL, or every URL of an array, with template node `_node`.
-- Returns a single result, or the flattened array of all results.
local function _crawl(_url, _node)
  aspider = aspider or _spider.crawler:new()
  if type(_url) == "table" then
    local out = {}
    for _, j in ipairs(_url) do
      local result = aspider:crawl(j, _node)
      -- append: a nil stored by index would leave a hole, and melttable
      -- (ipairs) would drop everything after it
      if result ~= nil then
        out[#out+1] = result
      end
    end
    return melttable(out)
  else
    return aspider:crawl(_url, _node)
  end
end

-- Normalises a crawl result to an array: nil/false -> {""}, a string or
-- number -> {value}, a table is returned unchanged, anything else is an error.
local function wraptable(st)
  if not st then return {""} end
  if (type(st) == "string")
    or (type(st) == "number")
  then
    return {st}
  elseif type(st) == "table" then
    return st
  else
    error("wtf " .. type(st))
  end
end

-- Walks one layer of the template and builds the matching output table.
--   cake     : the whole template
--   layer    : the layer being processed (defaults to `cake`)
--   parent   : output of the enclosing layer, used to resolve link-style urls
--   _metaout : accumulator shared through the recursion
-- Returns `output, metaout`. `metaout[k]` collects a record
-- { template, output, thing } for every crawled parameter named k.
-- NOTE(review): this function writes to the template itself (`layer.url`,
-- `layer.drillcount`), so crawling the same template twice starts from the
-- modified values - confirm whether that is intended.
local function copytable(cake, layer, parent, _metaout)
  local output = {}
  local metaout = _metaout or {}
  layer = layer or cake
  if layer.url then
    -- a url like "links.next" refers to data already collected in `parent`
    if parent and islink(layer.url, parent) then
      layer.url = getlink(layer.url, parent)
    end
  end

  -- crawl every leaf parameter of this layer
  for k, v in pairs(layer) do
    if isparam(v) and k ~= "url" then
      output[k] = _crawl(layer.url, v) or {}
      output[k] = melttable(output[k])
      output[k] = wraptable(output[k])
      output[k] = melttable(output[k])
      metaout[k] = metaout[k] or {}
      metaout[k][#metaout[k]+1] = { template = v, output = output[k], thing = layer }
    end
  end

  -- "drill": keep crawling from the last entry of output.drill
  -- NOTE(review): assumes `layer.drill` is itself a leaf parameter so that
  -- output.drill exists - confirm.
  if layer.drill then
    layer.drillcount = layer.drillcount or 3
    layer.drillcount = tonumber(layer.drillcount)-1
    for _ = 1, layer.drillcount do
      for k, v in pairs(layer) do
        if isparam(v) and k ~= "url" then
          output[k] = output[k] or {}
          table.insert(output[k], _crawl(output.drill[#output.drill], v) or {})
          output[k] = melttable(output[k])
          output[k] = wraptable(output[k])
          output[k] = melttable(output[k])
          metaout[k] = metaout[k] or {}
          metaout[k][#metaout[k]+1] = { template = v, output = output[k], thing = layer }
        end
      end
    end
    table.remove(output.drill)
    table.insert(output.drill, 1, layer.url)
  end
  -- recurse into nested layers, handing them this layer's output as parent
  for k, v in pairs(layer) do
    if isnewlayer(v) then
      output[k] = copytable(cake, v, output, metaout)
    end
  end
  return output, metaout
end

--- Create an object that inherits from the spider.
function _spider:new(c)
  c = c or {}
  setmetatable(c, self)
  self.__index = self
  return c
end

--- Crawl everything a template describes.
-- @return output table mirroring the template, and the metaout record table
--         (two empty tables when no template is given)
function _spider:crawl(template)
  local out = {}
  local metaout = {}
  if template then
    out, metaout = copytable(template)
  end
  return out, metaout
end

--- Dump the DOM of `_url` with headless chrome.
-- @return the dumped DOM, or nil when chrome fails (the error is printed)
function _spider:dump(_url)
  local chrome = require("lua-chrome"):new()
  chrome.url = _url
  local ok, err = pcall(chrome.dump, chrome)
  if not ok then
    print(err)
    return nil
  else
    return chrome.dom
  end
end

-- Accessors for using the three parts (crawler, parser, extractor) on their own.
_spider.assign = {}
_spider.assign.crawler = function(_type)
  if _type and _type == "curl" then
    return _spider.crawler.curl
  else
    return _spider.crawler.chrome
  end
end
_spider.assign.parser = function() return _spider.crawler.parse end
_spider.assign.extractor = function() return _spider.extractor end

local spider = _spider:new()

return spider

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | A web scraper for lua. Content is downloaded with either curl or headless-chrome, and HTML is parsed with gumbo. Xpath like expressions are used to extract content from parsed documents. 2 | 3 | Requires [torch paths](https://github.com/torch/paths) rock, penlight [stringx](https://stevedonovan.github.io/Penlight/api/libraries/pl.stringx.html) and [file](https://stevedonovan.github.io/Penlight/api/libraries/pl.file.html), and either curl or chrome with the [lua-chrome](https://github.com/okpanic/lua-chrome) rock, and [lua-gumbo](https://github.com/craigbarnes/lua-gumbo). 4 | 5 | Spider has three parts, the crawler, parser, and extractor. For simple websites curl works well enough, but to scrape some js heavy sites I recommend switching to headless-chrome. 6 | 7 | Install gumbo 8 | 9 | ```bash 10 | git clone https://github.com/google/gumbo-parser 11 | cd gumbo* 12 | sudo ./gumbo.sh 13 | sudo cp ./etc/profile.d/gumbolib.sh /etc/profile.d/ 14 | source /etc/profile 15 | luarocks install gumbo 16 | ``` 17 | 18 | Install Lua-cURL, and lua-chrome.
19 | 20 | ```lua 21 | luarocks install Lua-cURL 22 | luarocks install https://github.com/okpanic/lua-chrome/raw/master/lua-chrome-1.0-1.src.rock 23 | 24 | ``` 25 | 26 | Simple example scraping a blog. 27 | 28 | ```lua 29 | url = "http://quotes.toscrape.com/" 30 | 31 | spider = require'lua-spider':new() 32 | crawl = spider.assign.crawler("chrome") 33 | parse = spider.assign.parser() 34 | xpath = spider.assign.extractor() 35 | 36 | html = crawl(url) 37 | doc = parse(html) --this is the gumbo document tree 38 | 39 | posts = xpath(doc, "//div[@class=quote]") --by default xpath returns a document tree 40 | 41 | post = {} 42 | for k, v in ipairs(posts) do 43 | post[#post+1] = { content = xpath(v, "//span[@class=text]", "text")[1], --extract text 44 | author = xpath(v, "//small[@class=author]", "text")[1], 45 | tags = xpath(v, "//div[@class=tag]//a", "text") } --or 'href' for link 46 | end 47 | ``` 48 | 49 | Output will look like: 50 | ```lua 51 | { 52 | 1 : 53 | { 54 | author : "Albert Einstein" 55 | content : "“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”" 56 | tags : 57 | { 58 | 1 : "change" 59 | 2 : "deep-thoughts" 60 | 3 : "thinking" 61 | 4 : "world" 62 | } 63 | } 64 | 2 : 65 | { 66 | author : "J.K. Rowling" 67 | content : "“It is our choices, Harry, that show what we truly are, far more than our abilities.”" 68 | tags : 69 | { 70 | 1 : "abilities" 71 | 2 : "choices" 72 | } 73 | } 74 | 3 : 75 | { 76 | author : "Albert Einstein" 77 | content : "“There are only two ways to live your life. One is as though nothing is a miracle. 
The other is as though everything is a miracle.”" 78 | tags : 79 | { 80 | 1 : "inspirational" 81 | 2 : "life" 82 | 3 : "live" 83 | 4 : "miracle" 84 | 5 : "miracles" 85 | } 86 | } 87 | 4 : 88 | { 89 | author : "Jane Austen" 90 | content : "“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”" 91 | tags : 92 | { 93 | 1 : "aliteracy" 94 | 2 : "books" 95 | 3 : "classic" 96 | 4 : "humor" 97 | } 98 | } 99 | 5 : 100 | { 101 | author : "Marilyn Monroe" 102 | content : "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”" 103 | tags : 104 | { 105 | 1 : "be-yourself" 106 | 2 : "inspirational" 107 | } 108 | } 109 | 6 : 110 | { 111 | author : "Albert Einstein" 112 | content : "“Try not to become a man of success. Rather become a man of value.”" 113 | tags : 114 | { 115 | 1 : "adulthood" 116 | 2 : "success" 117 | 3 : "value" 118 | } 119 | } 120 | 7 : 121 | { 122 | author : "André Gide" 123 | content : "“It is better to be hated for what you are than to be loved for what you are not.”" 124 | tags : 125 | { 126 | 1 : "life" 127 | 2 : "love" 128 | } 129 | } 130 | 8 : 131 | { 132 | author : "Thomas A. Edison" 133 | content : "“I have not failed. I've just found 10,000 ways that won't work.”" 134 | tags : 135 | { 136 | 1 : 137 | { 138 | author : "Albert Einstein" 139 | content : "“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”" 140 | tags : 141 | { 142 | 1 : "change" 143 | 2 : "deep-thoughts" 144 | 3 : "thinking" 145 | 4 : "world" 146 | } 147 | } 148 | 2 : 149 | { 150 | author : "J.K. Rowling" 151 | content : "“It is our choices, Harry, that show what we truly are, far more than our abilities.”" 152 | tags : 153 | { 154 | 1 : "abilities" 155 | 2 : "choices" 156 | } 157 | } 158 | 3 : 159 | { 160 | author : "Albert Einstein" 161 | content : "“There are only two ways to live your life. 
One is as though nothing is a miracle. The other is as though everything is a miracle.”" 162 | tags : 163 | { 164 | 1 : "inspirational" 165 | 2 : "life" 166 | 3 : "live" 167 | 4 : "miracle" 168 | 5 : "miracles" 169 | } 170 | } 171 | 4 : 172 | { 173 | author : "Jane Austen" 174 | content : "“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”" 175 | tags : 176 | { 177 | 1 : "aliteracy" 178 | 2 : "books" 179 | 3 : "classic" 180 | 4 : "humor" 181 | } 182 | } 183 | 5 : 184 | { 185 | author : "Marilyn Monroe" 186 | content : "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”" 187 | tags : 188 | { 189 | 1 : "be-yourself" 190 | 2 : "inspirational" 191 | } 192 | } 193 | 6 : 194 | { 195 | author : "Albert Einstein" 196 | content : "“Try not to become a man of success. Rather become a man of value.”" 197 | tags : 198 | { 199 | 1 : "adulthood" 200 | 2 : "success" 201 | 3 : "value" 202 | } 203 | } 204 | 7 : 205 | { 206 | author : "André Gide" 207 | content : "“It is better to be hated for what you are than to be loved for what you are not.”" 208 | tags : 209 | { 210 | 1 : "life" 211 | 2 : "love" 212 | } 213 | } 214 | 8 : 215 | { 216 | author : "Thomas A. Edison" 217 | content : "“I have not failed. 
I've just found 10,000 ways that won't work.”" 218 | tags : 219 | { 220 | 1 : "edison" 221 | 2 : "failure" 222 | 3 : "inspirational" 223 | 4 : "paraphrased" 224 | } 225 | } 226 | 9 : 227 | { 228 | author : "Eleanor Roosevelt" 229 | content : "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”" 230 | tags : 231 | { 232 | 1 : "misattributed-eleanor-roosevelt" 233 | } 234 | } 235 | 10 : 236 | { 237 | author : "Steve Martin" 238 | content : "“A day without sunshine is like, you know, night.”" 239 | tags : 240 | { 241 | 1 : "humor" 242 | 2 : "obvious" 243 | 3 : "simile" 244 | } 245 | } 246 | } 247 | { 248 | 1 : "edison" 249 | 2 : "failure" 250 | 3 : "inspirational" 251 | 4 : "paraphrased" 252 | } 253 | } 254 | 9 : 255 | { 256 | author : "Eleanor Roosevelt" 257 | content : "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”" 258 | tags : 259 | { 260 | 1 : "misattributed-eleanor-roosevelt" 261 | } 262 | } 263 | 10 : 264 | { 265 | author : "Steve Martin" 266 | content : "“A day without sunshine is like, you know, night.”" 267 | tags : 268 | { 269 | 1 : "humor" 270 | 2 : "obvious" 271 | 3 : "simile" 272 | } 273 | } 274 | } 275 | ``` 276 | -------------------------------------------------------------------------------- /lua-spider-1.0/README.md: -------------------------------------------------------------------------------- 1 | A web scraper for lua. Content is downloaded with either curl or headless-chrome, and HTML is parsed with gumbo. Xpath like expressions are used to extract content from parsed documents. 2 | 3 | Requires [torch paths](https://github.com/torch/paths) rock, penlight [stringx](https://stevedonovan.github.io/Penlight/api/libraries/pl.stringx.html) and [file](https://stevedonovan.github.io/Penlight/api/libraries/pl.file.html), and either curl or chrome with the [lua-chrome](https://github.com/okpanic/lua-chrome) rock, and [lua-gumbo](https://github.com/craigbarnes/lua-gumbo). 
4 | 5 | Spider has three parts, the crawler, parser, and extractor. For simple websites curl works well enough, but to scrape some js heavy sites I recommend switching to headless-chrome. 6 | 7 | Install gumbo 8 | 9 | ```bash 10 | git clone https://github.com/google/gumbo-parser 11 | cd gumbo* 12 | sudo ./gumbo.sh 13 | sudo cp ./etc/profile.d/gumbolib.sh /etc/profile.d/ 14 | source /etc/profile 15 | luarocks install gumbo 16 | ``` 17 | 18 | Install Lua-cURL, and lua-chrome. 19 | 20 | ```lua 21 | luarocks install Lua-cURL 22 | luarocks install https://github.com/okpanic/lua-chrome/raw/master/lua-chrome-1.0-1.src.rock 23 | 24 | ``` 25 | 26 | Simple example scraping a blog. 27 | 28 | ```lua 29 | url = "http://quotes.toscrape.com/" 30 | 31 | spider = require'lua-spider':new() 32 | crawl = spider.assign.crawler("chrome") 33 | parse = spider.assign.parser() 34 | xpath = spider.assign.extractor() 35 | 36 | html = crawl(url) 37 | doc = parse(html) --this is the gumbo document tree 38 | 39 | posts = xpath(doc, "//div[@class=quote]") --by default xpath returns a document tree 40 | 41 | post = {} 42 | for k, v in ipairs(posts) do 43 | post[#post+1] = { content = xpath(v, "//span[@class=text]", "text")[1], --extract text 44 | author = xpath(v, "//small[@class=author]", "text")[1], 45 | tags = xpath(v, "//div[@class=tag]//a", "text") } --or 'href' for link 46 | end 47 | ``` 48 | 49 | Output will look like: 50 | ```lua 51 | { 52 | 1 : 53 | { 54 | author : "Albert Einstein" 55 | content : "“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”" 56 | tags : 57 | { 58 | 1 : "change" 59 | 2 : "deep-thoughts" 60 | 3 : "thinking" 61 | 4 : "world" 62 | } 63 | } 64 | 2 : 65 | { 66 | author : "J.K. 
Rowling" 67 | content : "“It is our choices, Harry, that show what we truly are, far more than our abilities.”" 68 | tags : 69 | { 70 | 1 : "abilities" 71 | 2 : "choices" 72 | } 73 | } 74 | 3 : 75 | { 76 | author : "Albert Einstein" 77 | content : "“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”" 78 | tags : 79 | { 80 | 1 : "inspirational" 81 | 2 : "life" 82 | 3 : "live" 83 | 4 : "miracle" 84 | 5 : "miracles" 85 | } 86 | } 87 | 4 : 88 | { 89 | author : "Jane Austen" 90 | content : "“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”" 91 | tags : 92 | { 93 | 1 : "aliteracy" 94 | 2 : "books" 95 | 3 : "classic" 96 | 4 : "humor" 97 | } 98 | } 99 | 5 : 100 | { 101 | author : "Marilyn Monroe" 102 | content : "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”" 103 | tags : 104 | { 105 | 1 : "be-yourself" 106 | 2 : "inspirational" 107 | } 108 | } 109 | 6 : 110 | { 111 | author : "Albert Einstein" 112 | content : "“Try not to become a man of success. Rather become a man of value.”" 113 | tags : 114 | { 115 | 1 : "adulthood" 116 | 2 : "success" 117 | 3 : "value" 118 | } 119 | } 120 | 7 : 121 | { 122 | author : "André Gide" 123 | content : "“It is better to be hated for what you are than to be loved for what you are not.”" 124 | tags : 125 | { 126 | 1 : "life" 127 | 2 : "love" 128 | } 129 | } 130 | 8 : 131 | { 132 | author : "Thomas A. Edison" 133 | content : "“I have not failed. I've just found 10,000 ways that won't work.”" 134 | tags : 135 | { 136 | 1 : 137 | { 138 | author : "Albert Einstein" 139 | content : "“The world as we have created it is a process of our thinking. 
It cannot be changed without changing our thinking.”" 140 | tags : 141 | { 142 | 1 : "change" 143 | 2 : "deep-thoughts" 144 | 3 : "thinking" 145 | 4 : "world" 146 | } 147 | } 148 | 2 : 149 | { 150 | author : "J.K. Rowling" 151 | content : "“It is our choices, Harry, that show what we truly are, far more than our abilities.”" 152 | tags : 153 | { 154 | 1 : "abilities" 155 | 2 : "choices" 156 | } 157 | } 158 | 3 : 159 | { 160 | author : "Albert Einstein" 161 | content : "“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”" 162 | tags : 163 | { 164 | 1 : "inspirational" 165 | 2 : "life" 166 | 3 : "live" 167 | 4 : "miracle" 168 | 5 : "miracles" 169 | } 170 | } 171 | 4 : 172 | { 173 | author : "Jane Austen" 174 | content : "“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”" 175 | tags : 176 | { 177 | 1 : "aliteracy" 178 | 2 : "books" 179 | 3 : "classic" 180 | 4 : "humor" 181 | } 182 | } 183 | 5 : 184 | { 185 | author : "Marilyn Monroe" 186 | content : "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”" 187 | tags : 188 | { 189 | 1 : "be-yourself" 190 | 2 : "inspirational" 191 | } 192 | } 193 | 6 : 194 | { 195 | author : "Albert Einstein" 196 | content : "“Try not to become a man of success. Rather become a man of value.”" 197 | tags : 198 | { 199 | 1 : "adulthood" 200 | 2 : "success" 201 | 3 : "value" 202 | } 203 | } 204 | 7 : 205 | { 206 | author : "André Gide" 207 | content : "“It is better to be hated for what you are than to be loved for what you are not.”" 208 | tags : 209 | { 210 | 1 : "life" 211 | 2 : "love" 212 | } 213 | } 214 | 8 : 215 | { 216 | author : "Thomas A. Edison" 217 | content : "“I have not failed. 
I've just found 10,000 ways that won't work.”" 218 | tags : 219 | { 220 | 1 : "edison" 221 | 2 : "failure" 222 | 3 : "inspirational" 223 | 4 : "paraphrased" 224 | } 225 | } 226 | 9 : 227 | { 228 | author : "Eleanor Roosevelt" 229 | content : "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”" 230 | tags : 231 | { 232 | 1 : "misattributed-eleanor-roosevelt" 233 | } 234 | } 235 | 10 : 236 | { 237 | author : "Steve Martin" 238 | content : "“A day without sunshine is like, you know, night.”" 239 | tags : 240 | { 241 | 1 : "humor" 242 | 2 : "obvious" 243 | 3 : "simile" 244 | } 245 | } 246 | } 247 | { 248 | 1 : "edison" 249 | 2 : "failure" 250 | 3 : "inspirational" 251 | 4 : "paraphrased" 252 | } 253 | } 254 | 9 : 255 | { 256 | author : "Eleanor Roosevelt" 257 | content : "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”" 258 | tags : 259 | { 260 | 1 : "misattributed-eleanor-roosevelt" 261 | } 262 | } 263 | 10 : 264 | { 265 | author : "Steve Martin" 266 | content : "“A day without sunshine is like, you know, night.”" 267 | tags : 268 | { 269 | 1 : "humor" 270 | 2 : "obvious" 271 | 3 : "simile" 272 | } 273 | } 274 | } 275 | ``` 276 | -------------------------------------------------------------------------------- /lua-spider-1.0/lua-spider/extractor.lua: -------------------------------------------------------------------------------- 1 | local bones = require("lua-spider.bones") 2 | 3 | local function istable(t) 4 | if type(t) == "table" then return true else return false end 5 | end 6 | 7 | local function iselement(e) 8 | if not istable(e) then return false end 9 | if istable(e) then 10 | if e.type and e.type == "element" then 11 | return true 12 | else 13 | return false 14 | end 15 | end 16 | end 17 | 18 | local function isdocument(d) 19 | if not istable(d) then return false else 20 | if d.type and d.type == "document" then 21 | return true 22 | else 23 | return false 24 | end 25 | end 26 | end 27 
--- Flatten an arbitrarily nested list of nodes into a flat list of elements.
-- Tables that are not elements are walked by numeric index (1..#t); every
-- element found is appended to the accumulator. Non-table values are ignored.
-- @param t element, (nested) list of elements, or any other value
-- @param o optional accumulator list to append to
-- @return the accumulator, or nil when it holds no element
local function melt(t, o)
  o = o or {}
  if istable(t) then
    if iselement(t) then
      o[#o+1] = t
    else
      for i = 1, #t do
        -- melt returns nil while nothing has been collected yet; the
        -- `o or {}` default above re-creates the accumulator in that case.
        o = melt(t[i], o)
      end
    end
  end
  if not o or not o[1] then
    return nil
  end
  return o
end

--- Flatten nested array tables into a single flat array of non-table values.
-- @param arr table to flatten; any non-table value is returned unchanged
-- @return a new flat array, or `arr` itself when it is not a table
local function melttable(arr)
  if type(arr) ~= "table" then
    return arr
  end
  local result = {}
  local function toflat(_arr)
    for _, v in ipairs(_arr) do
      if type(v) == "table" then
        toflat(v)
      else
        result[#result+1] = v
      end
    end
  end
  toflat(arr)
  return result
end

--- Report whether node `e` carries an attribute named `a`.
-- @param e node exposing `hasAttributes()` and an `attributes` map
-- @param a attribute name, used as a literal key into `e.attributes`
-- @treturn boolean
local function hasattribute(e, a)
  if e:hasAttributes() and e.attributes and e.attributes[a] then
    return true
  end
  return false
end

--- Report whether attribute `a` of node `e` has a value matching `v`.
-- @param e node exposing `hasAttributes()` and an `attributes` map
-- @param a attribute name (literal key)
-- @param v Lua pattern matched against the attribute's `value`
-- @treturn boolean
local function hasattributevalue(e, a, v)
  if not hasattribute(e, a) then
    return false
  end
  return string.match(e.attributes[a].value, v) ~= nil
end

--- Shared tail of the match functions: nil for an empty result list,
-- otherwise either the single entry at `_ind` or the whole list.
local function pick(out, _ind)
  if not out[1] then
    return nil
  end
  if _ind then
    return out[_ind]
  end
  return out
end

local match = {}

--- Collect `getElementsByTagName(tag)` for a document or for every element
-- found in `doc`.
-- @param doc document, element, or (nested) list of elements
-- @param tag tag name handed to `getElementsByTagName`
-- @param _ind optional index; selects one entry of the result list, where
--   each entry is the collection returned for one source node
-- @return list of collections, a single collection, or nil when empty
function match.getAllByTag(doc, tag, _ind)
  if not isdocument(doc) then
    doc = melt(doc)
  else
    doc = {doc}
  end
  if not doc then return nil end
  local out = {}
  for i = 1, #doc do
    out[#out+1] = doc[i]:getElementsByTagName(tag)
  end
  return pick(out, _ind)
end

--- Keep the elements of `doc` that carry attribute `attr`, optionally only
-- those whose value matches `attrval`.
-- @param doc element or (nested) list of elements
-- @param attr attribute name
-- @param attrval optional Lua pattern for the attribute value; nil disables
--   the value filter
-- @param _ind optional index into the filtered list
-- @return list of elements, a single element, or nil when nothing matched
function match.getByAttribute(doc, attr, attrval, _ind)
  if not isdocument(doc) then
    doc = melt(doc)
  end
  if not doc then return nil end
  local out = {}
  for i = 1, #doc do
    local node = doc[i]
    if hasattribute(node, attr) then
      if not attrval or hasattributevalue(node, attr, attrval) then
        out[#out+1] = node
      end
    end
  end
  return pick(out, _ind)
end

--- Keep the elements of `doc` that expose `textContent`, optionally only
-- those whose text matches `txt`.
-- @param doc element or (nested) list of elements
-- @param txt optional Lua pattern for `textContent`; nil disables the filter
-- @param _ind optional index into the filtered list
-- @return list of elements, a single element, or nil when nothing matched
function match.getByText(doc, txt, _ind)
  if not isdocument(doc) then
    doc = melt(doc)
  end
  if not doc then return nil end
  local out = {}
  for _, v in ipairs(doc) do
    if type(v) ~= "number" and v.textContent then
      if not txt or string.match(v.textContent, txt) then
        out[#out+1] = v
      end
    end
  end
  return pick(out, _ind)
end

--- Collect the `textContent` of the nodes in `doc`.
-- Accepts a (nested) list of elements, a document (its `childNodes` are
-- visited) or a single element.
-- @param doc node, list of nodes, or nil
-- @return flat list of strings; nil when `doc` is nil or nothing was
--   collected; the string "nil" when `doc` is none of the supported shapes
local function extractText(doc)
  local out
  if not doc then return nil end
  if type(doc) == "table" and #doc >= 1 then
    for i = 1, #doc do
      local node = doc[i]
      if type(node) ~= "number" then
        if node.type == "element" then
          if node.textContent then
            out = out or {}
            out[#out+1] = node.textContent
          end
        elseif type(node) == "table" then
          out = out or {}
          out[#out+1] = extractText(node)
        end
      end
    end
  elseif isdocument(doc) then
    if doc.childNodes then
      for j = 1, #doc.childNodes do
        out = out or {}
        out[#out+1] = extractText(doc.childNodes[j])
      end
    end
  elseif iselement(doc) then
    -- A lone element is what the indexed match steps return; treat it like
    -- a one-element list instead of reporting "nil" (extractAttr already
    -- accepts this shape through melt).
    if doc.textContent then
      out = { doc.textContent }
    end
  else
    return "nil"
  end
  if type(out) == "table" then
    -- Always flatten: nested results may appear at any position, not only
    -- in the first slot.
    return melttable(out)
  end
  return out
end

--- Collect the value of attribute `attr` from every element in `doc`.
-- @param doc element or (nested) list of elements
-- @param attr attribute name
-- @return flat list of attribute values; nil when `doc` holds no element;
--   the string "nil" when no element carries the attribute
local function extractAttr(doc, attr)
  if not isdocument(doc) then
    doc = melt(doc)
  end
  if not doc then return nil end
  local out = {}
  for i = 1, #doc do
    if hasattribute(doc[i], attr) then
      out[#out+1] = doc[i].attributes[attr].value
    end
  end
  if not out[1] then
    return "nil"
  end
  return melttable(out)
end

--- Translate an XPath-like string into a list of steps for `xpathiter`.
-- Each step is a table `{ how = <name>, what = <argument> }` where `how` is
-- one of "getAllByTag", "getByIndex", "getByAttribute",
-- "getByAttributeValue", "getByText" or "getByTextContent".
-- @param xp XPath-like string; must be a string (asserted)
-- @return list of steps; empty when `xp` contains no "/"
local function splitXpath(xp)
  local doubleslash = "!dblslash!"
  local procedure = {}
  assert(type(xp) == "string")

  local function push(how, what)
    procedure[#procedure+1] = { how = how, what = what }
  end

  -- The extra parentheses around gsub drop its second result (the match
  -- count), which tonumber would otherwise interpret as a base.
  local function pushindex(j)
    push("getByIndex", tonumber((j:gsub("^.+%[(%d+)%].*", "%1"))))
  end

  local function pushtext(j)
    push("getByText")
    if string.match(j, "^.+%b[text%b()%s*=.+].*") then
      push("getByTextContent", (j:gsub('^.+%b[text%b()%s*=(.-)].*', '%1')))
    end
  end

  if string.match(xp, "/") then
    local path = xp:gsub("//", "/" .. doubleslash)
    path = path:gsub("^/", "")
    for _, j in ipairs(bones.split(path, "/")) do
      if j ~= "" then
        -- "//" and "/" segments are handled the same way: strip the marker
        -- and look the tag up below the current nodes.
        if string.match(j, "^" .. doubleslash) then
          j = j:gsub("^" .. doubleslash, "")
        end
        push("getAllByTag", (j:gsub('^(%w+).*','%1')))
        if string.match(j, "%b[]") then
          -- tag[n]...: index directly after the tag name
          if string.match(j, "[^%]]%[%d+%]") then
            pushindex(j)
          end
          if string.match(j, "^.+%b[@[^=]+.*].*") then
            push("getByAttribute", (j:gsub('^.+%b[@([^=]+).*].*','%1')))
            if string.match(j, "^.+%b[@.+=.+].*") then
              push("getByAttributeValue", (j:gsub('^.+%b[@.+=(.-)].*','%1')))
            end
          elseif string.match(j, "^.+%b[text%b()%s*.*].*") then
            pushtext(j)
          end
          -- ...][n]: index following another predicate
          if string.match(j, "%]%[%d+%]") then
            pushindex(j)
          end
          if string.match(j, "^.+%b[text%b()%s*.*].*") then
            pushtext(j)
          end
        end
      end
    end
  end

  -- Escape "-" only in the arguments that are later used as Lua patterns
  -- (attribute value and text content). Indexes are numbers, getByText has
  -- no argument, and attribute names are literal table keys, so they must
  -- stay untouched.
  for _, step in ipairs(procedure) do
    if (step.how == "getByAttributeValue" or step.how == "getByTextContent")
       and type(step.what) == "string" then
      step.what = step.what:gsub("%-", "%%-")
    end
  end
  return procedure
end

--- Apply the steps produced by `splitXpath` to `doc`, one after another.
-- A filter step (attribute or text) absorbs the value/content step and the
-- index step that follow it; a tag step absorbs a following index step.
-- @param doc document, element, or list of elements
-- @param xp list of steps as returned by `splitXpath`
-- @return whatever the last applied match function returned (may be nil)
local function xpathiter(doc, xp)
  local counter = 1
  while counter <= #xp do
    local step = xp[counter]
    local how = step.how
    if how == "getAllByTag" then
      local nxt = xp[counter+1]
      if nxt and nxt.how == "getByIndex" then
        doc = match.getAllByTag(doc, step.what, nxt.what)
        counter = counter + 2
      else
        doc = match.getAllByTag(doc, step.what)
        counter = counter + 1
      end
    elseif how == "getByAttribute" or how == "getByText" then
      local valuehow = "getByTextContent"
      if how == "getByAttribute" then
        valuehow = "getByAttributeValue"
      end
      local consumed = 1
      local filter, index
      local nxt = xp[counter+consumed]
      if nxt and nxt.how == valuehow then
        filter = nxt.what
        consumed = consumed + 1
        nxt = xp[counter+consumed]
      end
      if nxt and nxt.how == "getByIndex" then
        index = nxt.what
        consumed = consumed + 1
      end
      -- A missing value filter is passed as nil so it is not applied.
      if how == "getByAttribute" then
        doc = match.getByAttribute(doc, step.what, filter, index)
      else
        doc = match.getByText(doc, filter, index)
      end
      counter = counter + consumed
    else
      -- A step nothing above consumes (e.g. a second index predicate) is
      -- skipped so the loop always makes progress.
      counter = counter + 1
    end
  end
  return doc
end

--- Module entry point: select nodes with an XPath-like expression and
-- optionally extract text or an attribute from them.
-- @param doc parsed document or node table (asserted to be a table)
-- @param xp XPath-like string (asserted to be a string)
-- @param ext nil to return the matched nodes, "text" to return their text,
--   any other value is taken as the name of the attribute to extract
-- @return the selection or extracted values; the string "nil" when empty
local function extractor(doc, xp, ext)
  assert(type(doc) == "table")
  assert(type(xp) == "string")
  doc = xpathiter(doc, splitXpath(xp))
  if not ext then
    return doc or "nil"
  elseif ext == "text" then
    return extractText(doc) or "nil"
  else
    return extractAttr(doc, ext) or "nil"
  end
end

return extractor
--------------------------------------------------------------------------------