├── script
│   ├── .gitignore
│   ├── council-docs-to-text
│   ├── reddit.sample.json
│   ├── extract-data
│   ├── parse-cached-archives
│   ├── fetch-council-docs
│   ├── download-archives
│   ├── council-agenda
│   ├── process-upcoming-agenda
│   └── reddit-pdxcouncilagenda
├── .gitignore
├── Gemfile
├── readme.md
└── Gemfile.lock

/script/.gitignore:
--------------------------------------------------------------------------------
reddit.json
--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
raw
archives
text
test
.env
--------------------------------------------------------------------------------

/Gemfile:
--------------------------------------------------------------------------------
source 'https://rubygems.org'

gem 'nokogiri', '~> 1.6.2'
gem 'aws-sdk', '~> 1.40.3'
gem 'dotenv', '~> 0.11.1'
gem 'rack'
gem 'httparty'
--------------------------------------------------------------------------------

/script/council-docs-to-text:
--------------------------------------------------------------------------------
#!/bin/sh

mkdir -p ./text

for file in ./raw/*.pdf; do
  fname=$(basename "$file")
  name=${fname%.*}
  pdftotext "$file" "./text/${name}.txt"
done
--------------------------------------------------------------------------------

/script/reddit.sample.json:
--------------------------------------------------------------------------------
{
  "auth" : { "username" : "redditbot",
             "password" : "pass",
             "httpuser" : "abc1234",
             "httppass" : "pass4321xyz"
           },
  "limit" : 100
}
--------------------------------------------------------------------------------

/script/extract-data:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

require 'pathname'

# Print the text between agenda items 5) and 6) in each extracted document.
Pathname.glob('./text/*.txt').each do |file|
  text = File.read(file)
  context = /^5\)(.+?)\)\n(.+?)^6\)/m.match(text)
  puts "\n\n#{file.basename} ==============================="
  puts context
end
--------------------------------------------------------------------------------

/script/parse-cached-archives:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
# encoding: utf-8

require 'pathname'
require 'nokogiri'

# Collect every <item><description> from the cached XML archives, then print
# the ones that mention a dollar amount.
summaries = []
Pathname.glob('./archives/*.xml').each do |file|
  xml = Nokogiri::XML(File.read(file))
  xml.xpath('//item/description').each do |item|
    summaries << { :file => File.basename(file), :title => item.content.strip }
  end
end

summaries.sort_by { |s| s[:title] }.each do |s|
  puts "#{s[:file]} - #{s[:title][0..150]}" if s[:title].include?('$')
end
--------------------------------------------------------------------------------

/script/fetch-council-docs:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

require 'nokogiri'
require 'open-uri'

doc = Nokogiri::HTML(open('http://www.portlandonline.com/auditor/index.cfm?c=50265').read)
doc.css('.contentHeader a').each do |link|
  label = link.text.strip
  next if label !~ /item\s([0-9]+)$/i

  id = link.attr('href').split('a=').last
  # $1 is the item number captured by the match on the line above.
  process = fork { exec "curl --progress-bar --create-dirs --output ./raw/#{$1}.pdf 'http://www.portlandonline.com/auditor/index.cfm?c=50265&a=#{id}'" }
  Process.waitpid(process)
end
--------------------------------------------------------------------------------
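The `fork`/`exec` line above interpolates the Perl-style `$1` global inside a shell string. As a point of comparison, here is a minimal sketch of the same download step that captures the match into a local and passes argv-style arguments to `system`, sidestepping shell quoting (same URL, selector, and paths as the script above):

```
#!/usr/bin/env ruby
require 'nokogiri'
require 'open-uri'

doc = Nokogiri::HTML(open('http://www.portlandonline.com/auditor/index.cfm?c=50265').read)
doc.css('.contentHeader a').each do |link|
  match = link.text.strip.match(/item\s([0-9]+)$/i)
  next unless match

  id = link.attr('href').split('a=').last
  # system with multiple arguments bypasses the shell entirely, so the
  # filename and URL never need quoting.
  system('curl', '--progress-bar', '--create-dirs',
         '--output', "./raw/#{match[1]}.pdf",
         "http://www.portlandonline.com/auditor/index.cfm?c=50265&a=#{id}")
end
```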
/readme.md:
--------------------------------------------------------------------------------
### Process Portland City Council data

### Setup and run

Create a `.env` file in the root of the project and add the following information:

```
AWS_ACCESS_KEY_ID=YOUR AWS ID
AWS_SECRET_ACCESS_KEY=YOUR AWS KEY
AWS_S3_BUCKET=YOUR AWS BUCKET
```

Then install the dependencies and run the script. This will process the upcoming
agenda items from the [City Council website](http://www.portlandonline.com/Auditor/Index.cfm?c=26997)
and upload the results as JSON to S3.

```
bundle install
dotenv script/process-upcoming-agenda
```
--------------------------------------------------------------------------------

/Gemfile.lock:
--------------------------------------------------------------------------------
GEM
  remote: https://rubygems.org/
  specs:
    aws-sdk (1.40.3)
      json (~> 1.4)
      nokogiri (>= 1.4.4)
    dotenv (0.11.1)
      dotenv-deployment (~> 0.0.2)
    dotenv-deployment (0.0.2)
    httparty (0.13.1)
      json (~> 1.8)
      multi_xml (>= 0.5.2)
    json (1.8.1)
    mini_portile (0.6.0)
    multi_xml (0.5.5)
    nokogiri (1.6.2.1)
      mini_portile (= 0.6.0)
    rack (1.5.2)

PLATFORMS
  ruby

DEPENDENCIES
  aws-sdk (~> 1.40.3)
  dotenv (~> 0.11.1)
  httparty
  nokogiri (~> 1.6.2)
  rack
--------------------------------------------------------------------------------

/script/download-archives:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

require 'nokogiri'
require 'open-uri'
require 'date'

DOC_INDEX_URL = "http://www.portlandonline.com/index.cfm?&a=481868&c=49508"

doc = Nokogiri::HTML(open(DOC_INDEX_URL).read)
doc.css('[data-category_tree_id="49508"] a').each do |link|
  label = link.text

  id = link.attr('href').split('/').last

  # Pull an m-d-y date out of the link label, if there is one. Using a local
  # match avoids a stale $1 carrying over from a previous iteration.
  match = /([0-9]+-[0-9]+-[0-9]+)/.match(label)
  date = match && (Date.strptime(match[1], "%m-%d-%y") rescue nil)

  xml_url = "http://www.portlandonline.com/shared/js/TinyMCE/jscripts/tiny_mce/plugins/video/getPlaylist.cfm?content_id=#{id}"
  puts "Downloading #{xml_url}"
  xml_data = open(xml_url).read

  if xml_data !~ /^error/i
    file_name = date ? "archives/#{date}-#{id}.xml" : "archives/#{id}.xml"
    File.write(file_name, xml_data)
  end
end
--------------------------------------------------------------------------------

/script/council-agenda:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
require 'bundler/setup'
require 'nokogiri'
require 'open-uri'
require 'json'
require 'time'

URL_HOST = 'http://www.portlandonline.com'

def parse_bill(agenda_row, agenda_date)
  bill = {}
  link = agenda_row.css('a')[0]
  number_match = link.text.match(/(\*)?(\d+)/)
  # A leading asterisk marks an emergency item.
  emergency = !number_match.captures[0].nil?
  number = number_match.captures[1]
  link.remove
  title = agenda_row.text.gsub(/^[[:space:]]+/, '')

  time_certain_match = title.match(/^(TIME.CERTAIN:.(\d+:\d+.\w\w).)/)
  if time_certain_match
    item_date = Time.parse("#{agenda_date.to_date} #{time_certain_match[2]}")
    title = title[time_certain_match[1].length + 2, title.length]
    bill.merge!(:time_certain => item_date)
  end

  bill.merge!(:link => URL_HOST + link['href'],
              :number => number,
              :session => agenda_date,
              :title => title)
  bill.merge!(:emergency => true) if emergency
  bill
end

url = URL_HOST + '/auditor/index.cfm?c=26997'
doc = Nokogiri::HTML(open(url).read)

items = []
agenda_date = nil
doc.css('div.wysiwyg font table tbody td').each do |row|
  row.css('p>strong').each do |head|
    date_match = head.text.match(/\d+:\d+ \w\w, \w+ [\d-]+, 20\d\d/)
    if date_match
      clean_date = date_match[0].gsub(/-\d+/, '')
      agenda_date = Time.parse(clean_date)
    end
  end

  if agenda_date
    p = row.css('p').first
    if p && (p.css('strong>a').length > 0 || p.css('a>strong').length > 0)
      bill = parse_bill(p, agenda_date)
      items << bill
    end
  end
end

agenda = {
  :source => url,
  :scrape_date => Time.now,
  :items => items
}

puts JSON.pretty_generate(agenda)
--------------------------------------------------------------------------------
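For reference, `script/council-agenda` prints JSON of roughly this shape. The values below are illustrative, not real agenda data; `time_certain` and `emergency` appear only when detected:

```
{
  "source": "http://www.portlandonline.com/auditor/index.cfm?c=26997",
  "scrape_date": "2014-06-01 12:00:00 -0700",
  "items": [
    {
      "time_certain": "2014-06-04 09:30:00 -0700",
      "link": "http://www.portlandonline.com/auditor/index.cfm?c=50265&a=12345",
      "number": "512",
      "session": "2014-06-04 09:30:00 -0700",
      "title": "Example agenda item title",
      "emergency": true
    }
  ]
}
```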
/script/process-upcoming-agenda:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
# encoding: UTF-8

require 'nokogiri'
require 'open-uri'
require 'json'
require 'aws-sdk'

# The city council agenda items contain some crazy invisible UTF-8 bytes, so we
# need to re-encode in ASCII and clean up the garbage.
encoding_options = {
  :invalid => :replace,        # Replace invalid byte sequences
  :undef => :replace,          # Replace anything not defined in ASCII
  :replace => '',              # Use a blank for those replacements
  :universal_newline => true   # Always break lines with \n
}

s3 = AWS::S3.new

url = 'http://www.portlandonline.com/Auditor/Index.cfm?c=26997'
base_url = 'http://www.portlandonline.com'

doc = Nokogiri::HTML(open(url).read)
target_column = doc.css('.pagecolumnmiddle .wysiwyg')
title = target_column.css('p[style="text-align: center;"] strong').text

items = []

# Because the HTML on this page is so poorly structured, we need to find all the
# proper links to agenda items and traverse upwards to the description container.
target_column.css('.wysiwyg table tr a').each do |link|
  container = link.parent.parent

  href = "#{base_url}#{link.attr('href')}"
  number = link.text.strip
  # The emergency status of the agenda item. Emergency items take effect
  # immediately and aren't required to go through the normal 30-day waiting period.
  emergency = !!(number =~ /\*/)
  # The description of the agenda item
  description = container.text.sub(number, '').gsub("\r\n", '').encode('ascii', encoding_options).strip

  items << {
    :url => href,
    :number => number,
    :emergency => emergency,
    :description => description
  }
end

bucket = s3.buckets[ENV['AWS_S3_BUCKET']]
bucket.objects['latest.json'].write(items.to_json, :acl => :public_read)
--------------------------------------------------------------------------------
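A minimal sketch for spot-checking what `process-upcoming-agenda` uploaded, assuming the same `.env` variables; `read` is the aws-sdk v1 counterpart of the `write` call above:

```
#!/usr/bin/env ruby
require 'json'
require 'aws-sdk'
require 'dotenv'
Dotenv.load

# Read latest.json back from the bucket the script writes to.
s3 = AWS::S3.new
raw = s3.buckets[ENV['AWS_S3_BUCKET']].objects['latest.json'].read
items = JSON.parse(raw)
puts "#{items.length} items; first: #{items.first && items.first['number']}"
```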
/script/reddit-pdxcouncilagenda:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
require 'bundler/setup'
require 'httparty'
require 'json'

def access_token
  HTTParty.post('https://ssl.reddit.com/api/v1/access_token',
                {:body => ["grant_type=password",
                           "username=#{@reddit['auth']['username']}",
                           "password=#{@reddit['auth']['password']}"].join('&'),
                 :basic_auth => {:username => @reddit['auth']['httpuser'],
                                 :password => @reddit['auth']['httppass']},
                 :headers => {"User-Agent" => "pdxcitycouncil-scraper"}
                }).parsed_response["access_token"]
end

def make_title(item)
  # Reddit rejects titles over 300 characters:
  # ["TOO_LONG", "this is too long (max: 300)", "title"]
  "[#{item['number']}] #{item['title']}"[0, 300]
end

def make_text(item)
  "Session: #{item['session']}\n\n" +
    "##{item['number']}\n\n" +
    "#{item['title']}\n\n" +
    "PDF: #{item['link']}\n\n"
end

def api(path, token, params)
  HTTParty.post("https://oauth.reddit.com#{path}",
                {:headers => {'Content-Type' => 'application/json',
                              'Authorization' => "bearer #{token}",
                              'User-Agent' => 'pdxcitycouncil-scraper'},
                 :query => params})
end

def add_story(token, item)
  puts "Submitting..."
  response = api('/api/submit', token,
                 {'api_type' => "json",
                  'kind' => 'self',
                  'sr' => "pdxcouncilagenda",
                  'title' => make_title(item),
                  'text' => make_text(item)})
  puts response.parsed_response.inspect
  # Example response:
  # {"json"=>{"errors"=>[], "data"=>{"url"=>"https://oauth.reddit.com/r/pdxcouncilagenda/comments/274cvy/portland_city_council_agenda/", "id"=>"274cvy", "name"=>"t3_274cvy"}}}
end

@reddit = JSON.parse(File.read("reddit.json"))
clean = ARGV[0] == 'clean'
do_post = ARGV[0] == 'post'

puts "clean mode ON" if clean
puts "LIVE POST" if do_post

# Previously posted stories
puts "loading reddit posts"
posts = HTTParty.get("http://www.reddit.com/r/pdxcouncilagenda/new.json?limit=#{@reddit['limit']}").parsed_response['data']['children']
puts "loaded #{posts.length} reddit posts"
posts.each do |p|
  match = p['data']['title'].match(/\[(\d+)\]/)
  p['data']['agenda_number'] = match.captures.first if match
end
story_ids = posts.map { |p| p['data']['agenda_number'] }.compact
puts "#{story_ids.size} story ids #{story_ids.sort_by { |a| a.to_i }.reverse}"

# Upcoming agenda
puts "loading council agenda"
agenda = JSON.parse(HTTParty.get('http://donpark.org/pdxapi/citycouncilagenda.json?count=50').parsed_response)
puts "loaded #{agenda['items'].size} agenda items"
unposted = agenda['items'].reject { |item| story_ids.include?(item['number']) }
puts "#{unposted.size} unposted #{unposted.map { |p| p['number'] }.sort}"

puts "Reddit access token request"
token = access_token()
puts "Reddit access token #{token}"

if clean
  story_ids.each do |id|
    post = posts.select { |p| p['data']['agenda_number'] == id }.first
    rid = "#{post['kind']}_#{post['data']['id']}"
    puts "Deleting agenda #{id} reddit post #{rid}"
    del = api('/api/del', token, {"id" => rid})
    puts del.body
  end
end

if do_post
  unposted.each do |post|
    puts "Posting #{post['number']}"
    add_story(token, post)
  end
end
--------------------------------------------------------------------------------
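Usage, based on the `ARGV` handling in `script/reddit-pdxcouncilagenda` (expects a `reddit.json` in the working directory, modeled on `script/reddit.sample.json`):

```
script/reddit-pdxcouncilagenda        # dry run: report posted vs. unposted items
script/reddit-pdxcouncilagenda post   # submit each unposted agenda item
script/reddit-pdxcouncilagenda clean  # delete the bot's previously posted items
```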