├── Gemfile
├── reddit.sample.json
├── scrape.rb
└── reddit-pca.rb


/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | gem 'nokogiri'
3 | gem 'httparty'
4 | 


--------------------------------------------------------------------------------
/reddit.sample.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "auth" : { "username" : "redditbot",
 3 |              "password" : "pass",
 4 |              "httpuser" : "abc1234",
 5 |              "httppass" : "pass4321xyz"
 6 |             },
 7 |   "limit" : 100
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/scrape.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | require 'bundler/setup'
 3 | require 'nokogiri'
 4 | require 'open-uri'
 5 | require 'json'
 6 | require 'time'
 7 | 
 8 | url = ARGV[0] || "https://www.portland.gov/council/agenda" 
 9 | Uri = URI::parse(url)
10 | STDERR.puts Uri
11 | 
# Builds a bill Hash from one agenda-item node.
#
# Relies on the top-level Uri constant for the host used in document links.
#
# @param agenda_row [#css] agenda-item node (a Nokogiri node in this script)
# @return [Hash] 'number', 'title', 'kind', 'link', 'disposition' and 'bureau'
#   are set only when the matching markup is present; 'votes' is always an
#   Array of { :title, :name, :vote } hashes (empty when nothing matched).
def parse_bill(agenda_row)
  bill = {}

  # Only the first <h4> is used for the item number; all whitespace is removed.
  heading = agenda_row.css('h4').first
  bill['number'] = heading.text.gsub(/\s+/, "") if heading

  agenda_row.css('div.council-document__title').each do |node|
    # Line one of the text is the title, line two the kind wrapped in parentheses.
    title_line, kind_line = node.text.split("\n")
    bill['title'] = title_line.to_s.strip.delete_prefix("*")
    # A title block without a second line has no kind; skip it instead of
    # raising NoMethodError on nil and aborting the whole scrape.
    bill['kind'] = kind_line.strip.delete_prefix("(").delete_suffix(")") if kind_line
    path = node.css('a').attr('href')
    bill['link'] = "https://#{Uri.host}#{path}"
  end

  # When several nodes match, the last one wins (same as assigning in a loop).
  agenda_row.css('div.field--name-field-agenda-item-disposition div.field__item').each do |node|
    bill['disposition'] = node.text
  end
  agenda_row.css('div.field--name-field-bureau div.field__item').each do |node|
    bill['bureau'] = node.text
  end

  bill['votes'] = agenda_row.css('div.field--name-field-votes div.field__item').map do |node|
    # Text is split on whitespace: first word is the title, last word the vote,
    # and everything in between is the name.
    words = node.text.split(" ")
    title = words.shift
    vote = words.pop
    { :title => title, :name => words.join(" "), :vote => vote }
  end

  # Disabled fields kept from an earlier version of the scraper:
  # bill.merge!(:time_certain => item_date)
  # bill.merge!(:link => "https://#{uri.host}/#{citypdf.attributes['href']}") if citypdf
  # bill.merge!({:emergency => true}) if emergency
  bill
end
48 | 
# Collects every agenda item found under one council-session node.
#
# @param tablerow [#css] council-session node
# @param agenda_date [String] session timestamp, parsed with Time.parse
# @return [Array<Hash>] bills from parse_bill, each with 'session' set to the
#   timestamp rendered in local time as "%Y-%m-%d %-l:%M%p"
def tableread(tablerow, agenda_date)
  tablerow.css("div.view-admin-agenda-items").flat_map do |view|
    view.css('div.relation--type-agenda-item').map do |agenda_item|
      parsed = parse_bill(agenda_item)
      session_time = Time.parse(agenda_date).localtime
      parsed['session'] = session_time.strftime("%Y-%m-%d %-l:%M%p")
      parsed
    end
  end
end
60 | 
# Fetch and parse the agenda page. The block form of URI.open closes the
# connection handle once the body has been read.
doc = Nokogiri::HTML(URI.open(url, &:read))

# Gather the bills of every council session found on the page.
items = []
doc.css("div.relation--type-council-session").each do |row|
  # NodeSet#attr returns nil when no <time> element matched, so the nil check
  # has to happen before #text is called; such sessions are skipped.
  datetime_attr = row.css('div.session-meta time').attr('datetime')
  agenda_date = datetime_attr&.text
  next unless agenda_date

  STDERR.puts "Section: #{agenda_date}"
  tableitems = tableread(row, agenda_date)
  STDERR.puts "parsing section #{agenda_date} -> #{tableitems.length} items found"
  items.concat(tableitems)
end

# Emit the result as JSON on stdout; progress messages above go to stderr.
agenda = {
  :source => url,
  :scrape_date => Time.now,
  :items => items
}
puts JSON.pretty_generate(agenda)
80 | 


--------------------------------------------------------------------------------
/reddit-pca.rb:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env ruby
  2 | require 'bundler/setup'
  3 | require "httparty"
  4 | require 'json'
  5 | 
  6 | def access_token
  7 |   HTTParty.post('https://ssl.reddit.com/api/v1/access_token',
  8 |               {:body => ["grant_type=password",
  9 |                          "username=#{@reddit['auth']['username']}",
 10 |                          "password=#{@reddit['auth']['password']}"].join('&'),
 11 |                :basic_auth => {:username => @reddit['auth']['httpuser'], 
 12 |                                :password => @reddit['auth']['httppass']},
 13 |                :headers => {"User-Agent" => "pdxcitycouncil-scraper"}
 14 |               }).parsed_response["access_token"]
 15 | end
 16 | 
 17 | def make_title(item)
 18 |   #["TOO_LONG", "this is too long (max: 300)", "title"]
 19 |   title = item['title'].match(/([^(]+)/).captures.first
 20 |   title = title[0,294]+" [#{item['number']}]"
 21 | end
 22 | 
 23 | def make_text(item)
 24 |   emergency = item['emergency'] ? "(this item takes effect immediately if passed)" : ""
 25 |   "Session #{item['session']} " + "|" + "Item \\##{item['number']} " + "|" + "#{item['bureau']}" + "|" + "#{item['disposition']}" + "\n\n"  +
 26 |   item['title'] + "\n\n" + emergency + "\n\n" + 
 27 |   item['link'] + "\n"
 28 | end
 29 | 
 30 | def make_vote_comment(item)
 31 |   comment = "Disposition: #{item['disposition']}\n\n"
 32 |   comment += item['votes'].map do |v|
 33 |     "  * #{v['title']} #{v['name']} #{v['vote']}"
 34 |   end.join("\n\n")
 35 |   comment
 36 | end
 37 | 
 38 | def api(path, token, params)
 39 |   HTTParty.post("https://oauth.reddit.com#{path}",
 40 |                 {:headers => { 'Content-Type' => 'application/json',
 41 |                                'Authorization' => "bearer #{token}",
 42 |                                'User-Agent' => 'pdxcitycouncil-scraper'},
 43 |                  :query => params })
 44 | end
 45 | 
 46 | def add_story(token, post)
 47 |   api('/api/submit', token,
 48 |                                   {'api_type' => "json",
 49 |                                    'kind' => 'self',
 50 |                                    'sr' => "pdxcouncilagenda",
 51 |                                    'title' => make_title(post),
 52 |                                    'text' => make_text(post)} ).parsed_response
 53 |   #puts "#{post.parsed_response.inspect}"
 54 |   #{"json"=>{"errors"=>[], "data"=>{"url"=>"https://oauth.reddit.com/r/pdxcouncilagenda/comments/274cvy/portland_city_council_agenda/", "id"=>"274cvy", "name"=>"t3_274cvy"}}}
 55 | end
 56 | 
 57 | def add_comment(token, post, comment)
 58 |   api('/api/comment', token,
 59 |                                   {'api_type' => "json",
 60 |                                    'thing_id' => post['name'],
 61 |                                    'text' => comment}).parsed_response
 62 | end
 63 |  
 64 | 
 65 | def load_posts
 66 |   posts = []
 67 |   url = "https://www.reddit.com/r/pdxcouncilagenda/new.json?limit=#{@reddit['limit']}"
 68 |   puts url
 69 |   data = HTTParty.get(url, {
 70 |                :headers => {"User-Agent" => "pdxcitycouncil-scraper"}
 71 |            }).parsed_response
 72 |   if data['error']
 73 |     puts "reddit error: #{data['message']}"
 74 |   else
 75 |     posts = data['data']['children']
 76 |     posts.each do |p| 
 77 |       match = p['data']['title'].match(/\[([0-9 -]+)\]/)
 78 |       p['data']['agenda_number'] = match.captures.first if match
 79 |     end
 80 |   end
 81 |   posts.map{|p| p['data']}
 82 | end
 83 | 
 84 | def load_comments(post)
 85 |   url = "https://www.reddit.com/r/pdxcouncilagenda/comments/#{post['id']}.json"
 86 |   puts url
 87 |   data = HTTParty.get(url, {
 88 |                :headers => {"User-Agent" => "pdxcitycouncil-scraper"}
 89 |            }).parsed_response
 90 |   if !data.is_a?(Array)
 91 |     puts "reddit error: #{data['message']}"
 92 |   else
 93 |     data[1]['data']['children'].map do |comment|
 94 |       if comment['kind'] == 't1'
 95 |         comment['data']
 96 |       end
 97 |     end
 98 |   end
 99 | #{"subreddit_id"=>"t5_31utx", "approved_at_utc"=>nil, "author_is_blocked"=>false, "comment_type"=>nil, "awarders"=>[], "mod_reason_by"=>nil, "banned_by"=>nil, "author_fl
100 | #air_type"=>"text", "total_awards_received"=>0, "subreddit"=>"pdxcouncilagenda", "author_flair_template_id"=>nil, "likes"=>nil, "replies"=>"", "user_reports"=>[], "saved"
101 | #=>false, "id"=>"i5n3nhp", "banned_at_utc"=>nil, "mod_reason_title"=>nil, "gilded"=>0, "archived"=>false, "collapsed_reason_code"=>nil, "no_follow"=>true, "author"=>"donp
102 | #donp", "can_mod_post"=>false, "created_utc"=>1650561868.0, "send_replies"=>true, "parent_id"=>"t3_u4bwbn", "score"=>1, "author_fullname"=>"t2_1zlsy", "approved_by"=>nil,
103 | # "mod_note"=>nil, "all_awardings"=>[], "collapsed"=>false, "body"=>"comment", "edited"=>false, "top_awarded_type"=>nil, "author_flair_css_class"=>nil, "name"=>"t1_i5n3nh
104 | #p", "is_submitter"=>false, "downs"=>0, "author_flair_richtext"=>[], "author_patreon_flair"=>false, "body_html"=>"&lt;div class=\"md\"&gt;&lt;p&gt;comment&lt;/p&gt;\n&lt;
105 | #/div&gt;", "removal_reason"=>nil, "collapsed_reason"=>nil, "distinguished"=>nil, "associated_award"=>nil, "stickied"=>false, "author_premium"=>false, "can_gild"=>true, "
106 | #gildings"=>{}, "unrepliable_reason"=>nil, "author_flair_text_color"=>nil, "score_hidden"=>false, "permalink"=>"/r/pdxcouncilagenda/comments/u4bwbn/accept_the_2021_annual
107 | #_report_of_the_north_and/i5n3nhp/", "subreddit_type"=>"restricted", "locked"=>false, "report_reasons"=>nil, "created"=>1650561868.0, "author_flair_text"=>nil, "treatment
108 | #_tags"=>[], "link_id"=>"t3_u4bwbn", "subreddit_name_prefixed"=>"r/pdxcouncilagenda", "controversiality"=>0, "depth"=>0, "author_flair_background_color"=>nil, "collapsed_
109 | #because_crowd_control"=>nil, "mod_reports"=>[], "num_reports"=>nil, "ups"=>1}
110 | end
111 | 
112 | 
# Credentials and options; reddit.sample.json shows the expected keys.
@reddit = JSON.parse(File.read("reddit.json"))

# Mode comes from the first argument: 'clean' deletes the reddit posts that
# carry an agenda number, 'post' submits for real; anything else is a dry run.
clean = ARGV[0] == 'clean'
do_post = ARGV[0] == 'post'

puts "clean mode ON" if clean
puts "LIVE POST" if do_post

puts "loading r/pdxcouncilagenda posts"
posts = load_posts
puts "loaded #{posts.length} reddit posts"

# Index existing reddit posts by the agenda number found in their title.
story_ids = {}
posts.each do |p|
  story_ids[p['agenda_number']] = p if p['agenda_number']
end

if story_ids.empty?
  puts "reddit posts dont match agenda numbers. aborting early."
  exit
end

# agenda
puts "loading scraped council agenda items"
agenda_json = "https://donp.org/pdxapi/pdx-council-agenda.json"
puts agenda_json
agenda = HTTParty.get(agenda_json).parsed_response
puts "loaded #{agenda['items'].size} agenda items"

unposted = agenda['items'].reject{|item| story_ids.include?(item['number'])}
puts "#{unposted.size} unposted #{unposted.map{|p|p['number']}.sort}"

begin
  token = access_token()
rescue => e
  # "rescue e" would evaluate e as an exception class and raise NameError;
  # "rescue => e" binds the raised error so it can be reported.
  puts "reddit access token request failed: #{e}"
  exit
end

if clean
  # story_ids is a Hash, so each yields an agenda number and its post.
  story_ids.each do |number, post|
    kind = 't3' #always t3
    rid = "#{kind}_#{post['id']}"
    puts "Deleting agenda #{number} reddit post #{rid}"
    del = api('/api/del', token, {"id" => rid})
    puts del.body
  end
end

if do_post
  unposted.each do |post|
    result = add_story(token, post)
    puts "Posting #{post['number']} #{result}"
  end
end

# For every agenda item that has votes, make sure its reddit post carries a
# "Disposition" comment written by the bot account.
voted = agenda['items'].select{|item| item['votes'].length > 0}
voted.each do |v|
  post = story_ids[v['number']]
  unless post
    puts "Warning: no reddit post for agenda #{v['number']}"
    next
  end

  # load_comments may hand back nil on a reddit error and nil entries for
  # children that are not comments; normalise both before use.
  comments = Array(load_comments(post)).compact
  puts "##{v['number']} #{v['votes'].length} council votes. #{comments.length} reddit comments."
  vote_comment = comments.select do |comment|
    yn = comment['body'].match(/^Disposition/) && comment['author'] == 'pdxapibot'
    puts "#{post['id']} #{comment['author']} #{comment['body']} #{yn}"
    yn
  end
  next unless vote_comment.empty? && do_post

  result = add_comment(token, post, make_vote_comment(v))
  if result['json']['errors'].length > 0
    puts "add_comment error #{result['json']['errors']}"
  end
end
192 | 
193 | 
194 | 


--------------------------------------------------------------------------------