├── Gemfile ├── reddit.sample.json ├── scrape.rb └── reddit-pca.rb /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | gem 'nokogiri' 3 | gem 'httparty' 4 | -------------------------------------------------------------------------------- /reddit.sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "auth" : { "username" : "redditbot", 3 | "password" : "pass", 4 | "httpuser" : "abc1234", 5 | "httppass" : "pass4321xyz" 6 | }, 7 | "limit" : 100 8 | } 9 | 10 | -------------------------------------------------------------------------------- /scrape.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'bundler/setup' 3 | require 'nokogiri' 4 | require 'open-uri' 5 | require 'json' 6 | require 'time' 7 | 8 | url = ARGV[0] || "https://www.portland.gov/council/agenda" 9 | Uri = URI::parse(url) 10 | STDERR.puts Uri 11 | 12 | def parse_bill(agenda_row) 13 | bill = {} 14 | agenda_row.css('h4').each do |id| 15 | number = id.text.gsub("\n","").gsub(/\s+/,"").strip 16 | bill['number'] = number 17 | break 18 | end 19 | agenda_row.css('div.council-document__title').each do |node| 20 | parts = node.text.split("\n") 21 | title = parts[0].strip.delete_prefix("*") 22 | bill['title'] = title 23 | kind = parts[1].strip.delete_prefix("(").delete_suffix(")") 24 | bill['kind'] = kind 25 | path = node.css('a').attr('href') 26 | link = "https://#{Uri.host}#{path}" 27 | bill['link'] = link 28 | end 29 | agenda_row.css('div.field--name-field-agenda-item-disposition div.field__item').each do |node| 30 | bill['disposition'] = node.text 31 | end 32 | agenda_row.css('div.field--name-field-bureau div.field__item').each do |node| 33 | bill['bureau'] = node.text 34 | end 35 | votes = agenda_row.css('div.field--name-field-votes div.field__item').map do |node| 36 | parts = node.text.split(" ").map{|n| n.strip} 37 | title = parts.shift 38 | vote = parts.pop 39 | { :title => title, :name => parts.join(" "), :vote => vote } 40 | end 41 | bill['votes'] = votes 42 | 43 | # bill.merge!(:time_certain => item_date) 44 | # bill.merge!(:link => "https://#{uri.host}/#{citypdf.attributes['href']}") if citypdf 45 | # bill.merge!({:emergency => true}) if emergency 46 | bill 47 | end 48 | 49 | def tableread(tablerow, agenda_date) 50 | items = [] 51 | tablerow.css("div.view-admin-agenda-items").each do |row| 52 | row.css('div.relation--type-agenda-item').each do |item| 53 | bill = parse_bill(item) 54 | bill['session'] = Time.parse(agenda_date).localtime.strftime("%Y-%m-%d %-l:%M%p") 55 | items << bill if bill 56 | end 57 | end 58 | items 59 | end 60 | 61 | doc = Nokogiri::HTML(URI.open(url).read) 62 | 63 | items = [] 64 | doc.css("div.relation--type-council-session").each do |row| 65 | agenda_date = row.css('div.session-meta time').attr('datetime').text 66 | if agenda_date then 67 | STDERR.puts "Section: #{agenda_date}" 68 | tableitems = tableread(row, agenda_date) 69 | STDERR.puts "parsing section #{agenda_date} -> #{tableitems.length} items found" 70 | items += tableitems 71 | end 72 | end 73 | 74 | agenda = { 75 | :source => url, 76 | :scrape_date => Time.now, 77 | :items => items 78 | } 79 | puts JSON.pretty_generate(agenda) 80 | -------------------------------------------------------------------------------- /reddit-pca.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'bundler/setup' 3 | require "httparty" 4 | require 'json' 5 | 6 | def access_token 7 | HTTParty.post('https://ssl.reddit.com/api/v1/access_token', 8 | {:body => ["grant_type=password", 9 | "username=#{@reddit['auth']['username']}", 10 | "password=#{@reddit['auth']['password']}"].join('&'), 11 | :basic_auth => {:username => @reddit['auth']['httpuser'], 12 | :password => @reddit['auth']['httppass']}, 13 | :headers => {"User-Agent" => "pdxcitycouncil-scraper"} 14 | }).parsed_response["access_token"] 15 | end 16 | 17 | def make_title(item) 18 | #["TOO_LONG", "this is too long (max: 300)", "title"] 19 | title = item['title'].match(/([^(]+)/).captures.first 20 | title = title[0,294]+" [#{item['number']}]" 21 | end 22 | 23 | def make_text(item) 24 | emergency = item['emergency'] ? "(this item takes effect immediately if passed)" : "" 25 | "Session #{item['session']} " + "|" + "Item \\##{item['number']} " + "|" + "#{item['bureau']}" + "|" + "#{item['disposition']}" + "\n\n" + 26 | item['title'] + "\n\n" + emergency + "\n\n" + 27 | item['link'] + "\n" 28 | end 29 | 30 | def make_vote_comment(item) 31 | comment = "Disposition: #{item['disposition']}\n\n" 32 | comment += item['votes'].map do |v| 33 | " * #{v['title']} #{v['name']} #{v['vote']}" 34 | end.join("\n\n") 35 | comment 36 | end 37 | 38 | def api(path, token, params) 39 | HTTParty.post("https://oauth.reddit.com#{path}", 40 | {:headers => { 'Content-Type' => 'application/json', 41 | 'Authorization' => "bearer #{token}", 42 | 'User-Agent' => 'pdxcitycouncil-scraper'}, 43 | :query => params }) 44 | end 45 | 46 | def add_story(token, post) 47 | api('/api/submit', token, 48 | {'api_type' => "json", 49 | 'kind' => 'self', 50 | 'sr' => "pdxcouncilagenda", 51 | 'title' => make_title(post), 52 | 'text' => make_text(post)} ).parsed_response 53 | #puts "#{post.parsed_response.inspect}" 54 | #{"json"=>{"errors"=>[], "data"=>{"url"=>"https://oauth.reddit.com/r/pdxcouncilagenda/comments/274cvy/portland_city_council_agenda/", "id"=>"274cvy", "name"=>"t3_274cvy"}}} 55 | end 56 | 57 | def add_comment(token, post, comment) 58 | api('/api/comment', token, 59 | {'api_type' => "json", 60 | 'thing_id' => post['name'], 61 | 'text' => comment}).parsed_response 62 | end 63 | 64 | 65 | def load_posts 66 | posts = [] 67 | url = "https://www.reddit.com/r/pdxcouncilagenda/new.json?limit=#{@reddit['limit']}" 68 | puts url 69 | data = HTTParty.get(url, { 70 | :headers => {"User-Agent" => "pdxcitycouncil-scraper"} 71 | }).parsed_response 72 | if data['error'] 73 | puts "reddit error: #{data['message']}" 74 | else 75 | posts = data['data']['children'] 76 | posts.each do |p| 77 | match = p['data']['title'].match(/\[([0-9 -]+)\]/) 78 | p['data']['agenda_number'] = match.captures.first if match 79 | end 80 | end 81 | posts.map{|p| p['data']} 82 | end 83 | 84 | def load_comments(post) 85 | url = "https://www.reddit.com/r/pdxcouncilagenda/comments/#{post['id']}.json" 86 | puts url 87 | data = HTTParty.get(url, { 88 | :headers => {"User-Agent" => "pdxcitycouncil-scraper"} 89 | }).parsed_response 90 | if !data.is_a?(Array) 91 | puts "reddit error: #{data['message']}" 92 | else 93 | data[1]['data']['children'].map do |comment| 94 | if comment['kind'] == 't1' 95 | comment['data'] 96 | end 97 | end 98 | end 99 | #{"subreddit_id"=>"t5_31utx", "approved_at_utc"=>nil, "author_is_blocked"=>false, "comment_type"=>nil, "awarders"=>[], "mod_reason_by"=>nil, "banned_by"=>nil, "author_fl 100 | #air_type"=>"text", "total_awards_received"=>0, "subreddit"=>"pdxcouncilagenda", "author_flair_template_id"=>nil, "likes"=>nil, "replies"=>"", "user_reports"=>[], "saved" 101 | #=>false, "id"=>"i5n3nhp", "banned_at_utc"=>nil, "mod_reason_title"=>nil, "gilded"=>0, "archived"=>false, "collapsed_reason_code"=>nil, "no_follow"=>true, "author"=>"donp 102 | #donp", "can_mod_post"=>false, "created_utc"=>1650561868.0, "send_replies"=>true, "parent_id"=>"t3_u4bwbn", "score"=>1, "author_fullname"=>"t2_1zlsy", "approved_by"=>nil, 103 | # "mod_note"=>nil, "all_awardings"=>[], "collapsed"=>false, "body"=>"comment", "edited"=>false, "top_awarded_type"=>nil, "author_flair_css_class"=>nil, "name"=>"t1_i5n3nh 104 | #p", "is_submitter"=>false, "downs"=>0, "author_flair_richtext"=>[], "author_patreon_flair"=>false, "body_html"=>"<div class=\"md\"><p>comment</p>\n< 105 | #/div>", "removal_reason"=>nil, "collapsed_reason"=>nil, "distinguished"=>nil, "associated_award"=>nil, "stickied"=>false, "author_premium"=>false, "can_gild"=>true, " 106 | #gildings"=>{}, "unrepliable_reason"=>nil, "author_flair_text_color"=>nil, "score_hidden"=>false, "permalink"=>"/r/pdxcouncilagenda/comments/u4bwbn/accept_the_2021_annual 107 | #_report_of_the_north_and/i5n3nhp/", "subreddit_type"=>"restricted", "locked"=>false, "report_reasons"=>nil, "created"=>1650561868.0, "author_flair_text"=>nil, "treatment 108 | #_tags"=>[], "link_id"=>"t3_u4bwbn", "subreddit_name_prefixed"=>"r/pdxcouncilagenda", "controversiality"=>0, "depth"=>0, "author_flair_background_color"=>nil, "collapsed_ 109 | #because_crowd_control"=>nil, "mod_reports"=>[], "num_reports"=>nil, "ups"=>1} 110 | end 111 | 112 | 113 | @reddit = JSON.parse(File.read("reddit.json")) 114 | clean = ARGV[0] == 'clean' if ARGV[0] 115 | do_post = ARGV[0] == 'post' if ARGV[0] 116 | 117 | puts "clean mode ON" if clean 118 | puts "LIVE POST" if do_post 119 | 120 | puts "loading r/pdxcouncilagenda posts" 121 | posts = load_posts 122 | puts "loaded #{posts.length} reddit posts" 123 | story_ids = {} 124 | posts.each do |p| 125 | if p['agenda_number'] 126 | story_ids[p['agenda_number']] = p 127 | end 128 | end 129 | 130 | if story_ids.empty? 131 | puts "reddit posts dont match agenda numbers. aborting early." 132 | exit 133 | end 134 | 135 | # agenda 136 | puts "loading scraped council agenda items" 137 | agenda_json = "https://donp.org/pdxapi/pdx-council-agenda.json" 138 | puts agenda_json 139 | agenda = HTTParty.get(agenda_json).parsed_response 140 | puts "loaded #{agenda['items'].size} agenda items" 141 | 142 | unposted = agenda['items'].reject{|item| story_ids.include?(item['number'])} 143 | puts "#{unposted.size} unposted #{unposted.map{|p|p['number']}.sort}" 144 | 145 | begin 146 | token = access_token() 147 | rescue e 148 | puts "reddit access token request failed: #{e}" 149 | exit 150 | end 151 | 152 | if clean 153 | story_ids.each do |post| 154 | kind = 't3' #always t3 155 | rid = "#{kind}_#{post['id']}" 156 | puts "Deleting agenda #{id} reddit post #{rid}" 157 | del = api('/api/del', token, {"id" => rid}) 158 | puts del.body 159 | end 160 | end 161 | 162 | if do_post 163 | unposted.each do |post| 164 | result = add_story(token, post) 165 | puts "Posting #{post['number']} #{result}" 166 | end 167 | end 168 | 169 | voted = agenda['items'].select{|item| item['votes'].length > 0} 170 | voted.each do |v| 171 | post = story_ids[v['number']] 172 | if post 173 | comments = load_comments(post) 174 | puts "##{v['number']} #{v['votes'].length} council votes. #{comments.length} reddit comments." 175 | vote_comment = comments.select do |comment| 176 | yn = comment['body'].match(/^Disposition/) && comment['author'] == 'pdxapibot' 177 | puts "#{post['id']} #{comment['author']} #{comment['body']} #{yn}" 178 | yn 179 | end 180 | if vote_comment.empty? 181 | if do_post 182 | result = add_comment(token, post, make_vote_comment(v)) 183 | if result['json']['errors'].length > 0 184 | puts "add_comment error #{result['json']['errors']}" 185 | end 186 | end 187 | end 188 | else 189 | puts "Warning: no reddit post for agenda #{v['number']}" 190 | end 191 | end 192 | 193 | 194 | --------------------------------------------------------------------------------