├── script
│   ├── .gitignore
│   ├── council-docs-to-text
│   ├── reddit.sample.json
│   ├── extract-data
│   ├── parse-cached-archives
│   ├── fetch-council-docs
│   ├── download-archives
│   ├── council-agenda
│   ├── process-upcoming-agenda
│   └── reddit-pdxcouncilagenda
├── .gitignore
├── Gemfile
├── readme.md
└── Gemfile.lock

/script/.gitignore:
--------------------------------------------------------------------------------
reddit.json
--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
raw
archives
text
test
.env
--------------------------------------------------------------------------------

/Gemfile:
--------------------------------------------------------------------------------
source 'https://rubygems.org'

gem 'nokogiri', '~> 1.6.2'
gem 'aws-sdk', '~> 1.40.3'
gem 'dotenv', '~> 0.11.1'
gem 'rack'
gem 'httparty'
--------------------------------------------------------------------------------

/script/council-docs-to-text:
--------------------------------------------------------------------------------
#!/bin/sh

mkdir -p ./text

for file in ./raw/*.pdf; do
  fname=$(basename "$file")
  name=${fname%.*}
  pdftotext "$file" "./text/${name}.txt"
done
--------------------------------------------------------------------------------

/script/reddit.sample.json:
--------------------------------------------------------------------------------
{
  "auth" : { "username" : "redditbot",
             "password" : "pass",
             "httpuser" : "abc1234",
             "httppass" : "pass4321xyz"
           },
  "limit" : 100
}
--------------------------------------------------------------------------------

/script/extract-data:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

require 'pathname'

# Print the text between agenda items 5) and 6) in each extracted document.
Pathname.glob('./text/*.txt').each do |file|
  text = File.read(file)
  context = /^5\)(.+?)\)\n(.+?)^6\)/m.match(text)
  puts "\n\n#{file.basename} ==============================="
  puts context
end
--------------------------------------------------------------------------------

/script/parse-cached-archives:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
# encoding: utf-8

require 'pathname'
require 'nokogiri'

# Collect every <item><description> from the cached XML archives, then print
# the ones that mention a dollar amount.
summaries = []
Pathname.glob('./archives/*.xml').each do |file|
  xml = Nokogiri::XML(File.read(file))
  xml.xpath('//item/description').each do |item|
    summaries << { :file => File.basename(file), :title => item.content.strip }
  end
end

summaries.sort_by { |s| s[:title] }.each do |s|
  puts "#{s[:file]} - #{s[:title][0..150]}" if s[:title].include?('$')
end
--------------------------------------------------------------------------------

/script/fetch-council-docs:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

require 'nokogiri'
require 'open-uri'

doc = Nokogiri::HTML(open('http://www.portlandonline.com/auditor/index.cfm?c=50265').read)
doc.css('.contentHeader a').each do |link|
  label = link.text.strip
  next if label !~ /item\s([0-9]+)$/i

  id = link.attr('href').split('a=').last
  # $1 is the item number captured by the match on the line above.
  process = fork { exec "curl --progress-bar --create-dirs --output ./raw/#{$1}.pdf 'http://www.portlandonline.com/auditor/index.cfm?c=50265&a=#{id}'" }
  Process.waitpid(process)
end
--------------------------------------------------------------------------------
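The `fork`/`exec` line above interpolates the Perl-style `$1` global inside a shell string. As a point of comparison, here is a minimal sketch of the same download step that captures the match into a local and passes argv-style arguments to `system`, sidestepping shell quoting (same URL, selector, and paths as the script above):

```
#!/usr/bin/env ruby
require 'nokogiri'
require 'open-uri'

doc = Nokogiri::HTML(open('http://www.portlandonline.com/auditor/index.cfm?c=50265').read)
doc.css('.contentHeader a').each do |link|
  match = link.text.strip.match(/item\s([0-9]+)$/i)
  next unless match

  id = link.attr('href').split('a=').last
  # system with multiple arguments bypasses the shell entirely, so the
  # filename and URL never need quoting.
  system('curl', '--progress-bar', '--create-dirs',
         '--output', "./raw/#{match[1]}.pdf",
         "http://www.portlandonline.com/auditor/index.cfm?c=50265&a=#{id}")
end
```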
/readme.md:
--------------------------------------------------------------------------------
### Process Portland City Council data

### Setup and run

Create a `.env` file in the root of the project and add the following information:

```
AWS_ACCESS_KEY_ID=YOUR AWS ID
AWS_SECRET_ACCESS_KEY=YOUR AWS KEY
AWS_S3_BUCKET=YOUR AWS BUCKET
```

Then install the dependencies and run the script. This will process the upcoming
agenda items from the [City Council website](http://www.portlandonline.com/Auditor/Index.cfm?c=26997)
and upload the results as JSON to S3.

```
bundle install
dotenv script/process-upcoming-agenda
```
--------------------------------------------------------------------------------

/Gemfile.lock:
--------------------------------------------------------------------------------
GEM
  remote: https://rubygems.org/
  specs:
    aws-sdk (1.40.3)
      json (~> 1.4)
      nokogiri (>= 1.4.4)
    dotenv (0.11.1)
      dotenv-deployment (~> 0.0.2)
    dotenv-deployment (0.0.2)
    httparty (0.13.1)
      json (~> 1.8)
      multi_xml (>= 0.5.2)
    json (1.8.1)
    mini_portile (0.6.0)
    multi_xml (0.5.5)
    nokogiri (1.6.2.1)
      mini_portile (= 0.6.0)
    rack (1.5.2)

PLATFORMS
  ruby

DEPENDENCIES
  aws-sdk (~> 1.40.3)
  dotenv (~> 0.11.1)
  httparty
  nokogiri (~> 1.6.2)
  rack
--------------------------------------------------------------------------------

/script/download-archives:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

require 'nokogiri'
require 'open-uri'
require 'date'

DOC_INDEX_URL = "http://www.portlandonline.com/index.cfm?&a=481868&c=49508"

doc = Nokogiri::HTML(open(DOC_INDEX_URL).read)
doc.css('[data-category_tree_id="49508"] a').each do |link|
  label = link.text

  id = link.attr('href').split('/').last

  # Pull an m-d-y date out of the link label, if there is one. Using a local
  # match avoids a stale $1 carrying over from a previous iteration.
  match = /([0-9]+-[0-9]+-[0-9]+)/.match(label)
  date = match && (Date.strptime(match[1], "%m-%d-%y") rescue nil)

  xml_url = "http://www.portlandonline.com/shared/js/TinyMCE/jscripts/tiny_mce/plugins/video/getPlaylist.cfm?content_id=#{id}"
  puts "Downloading #{xml_url}"
  xml_data = open(xml_url).read

  if xml_data !~ /^error/i
    file_name = date ? "archives/#{date}-#{id}.xml" : "archives/#{id}.xml"
    File.write(file_name, xml_data)
  end
end
--------------------------------------------------------------------------------

/script/council-agenda:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
require 'bundler/setup'
require 'nokogiri'
require 'open-uri'
require 'json'
require 'time'

URL_HOST = 'http://www.portlandonline.com'

def parse_bill(agenda_row, agenda_date)
  bill = {}
  link = agenda_row.css('a')[0]
  number_match = link.text.match(/(\*)?(\d+)/)
  # A leading asterisk marks an emergency item.
  emergency = !number_match.captures[0].nil?
  number = number_match.captures[1]
  link.remove
  title = agenda_row.text.gsub(/^[[:space:]]+/, '')

  time_certain_match = title.match(/^(TIME.CERTAIN:.(\d+:\d+.\w\w).)/)
  if time_certain_match
    item_date = Time.parse("#{agenda_date.to_date} #{time_certain_match[2]}")
    title = title[time_certain_match[1].length + 2, title.length]
    bill.merge!(:time_certain => item_date)
  end

  bill.merge!(:link => URL_HOST + link['href'],
              :number => number,
              :session => agenda_date,
              :title => title)
  bill.merge!(:emergency => true) if emergency
  bill
end

url = URL_HOST + '/auditor/index.cfm?c=26997'
doc = Nokogiri::HTML(open(url).read)

items = []
agenda_date = nil
doc.css('div.wysiwyg font table tbody td').each do |row|
  row.css('p>strong').each do |head|
    date_match = head.text.match(/\d+:\d+ \w\w, \w+ [\d-]+, 20\d\d/)
    if date_match
      clean_date = date_match[0].gsub(/-\d+/, '')
      agenda_date = Time.parse(clean_date)
    end
  end

  if agenda_date
    p = row.css('p').first
    if p && (p.css('strong>a').length > 0 || p.css('a>strong').length > 0)
      bill = parse_bill(p, agenda_date)
      items << bill
    end
  end
end

agenda = {
  :source => url,
  :scrape_date => Time.now,
  :items => items
}

puts JSON.pretty_generate(agenda)
--------------------------------------------------------------------------------
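For reference, `script/council-agenda` prints JSON of roughly this shape. The values below are illustrative, not real agenda data; `time_certain` and `emergency` appear only when detected:

```
{
  "source": "http://www.portlandonline.com/auditor/index.cfm?c=26997",
  "scrape_date": "2014-06-01 12:00:00 -0700",
  "items": [
    {
      "time_certain": "2014-06-04 09:30:00 -0700",
      "link": "http://www.portlandonline.com/auditor/index.cfm?c=50265&a=12345",
      "number": "512",
      "session": "2014-06-04 09:30:00 -0700",
      "title": "Example agenda item title",
      "emergency": true
    }
  ]
}
```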
/script/process-upcoming-agenda:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
# encoding: UTF-8

require 'nokogiri'
require 'open-uri'
require 'json'
require 'aws-sdk'

# The city council agenda items contain some crazy invisible UTF-8 bytes, so we
# need to re-encode in ASCII and clean up the garbage.
encoding_options = {
  :invalid => :replace,        # Replace invalid byte sequences
  :undef => :replace,          # Replace anything not defined in ASCII
  :replace => '',              # Use a blank for those replacements
  :universal_newline => true   # Always break lines with \n
}

s3 = AWS::S3.new

url = 'http://www.portlandonline.com/Auditor/Index.cfm?c=26997'
base_url = 'http://www.portlandonline.com'

doc = Nokogiri::HTML(open(url).read)
target_column = doc.css('.pagecolumnmiddle .wysiwyg')
title = target_column.css('p[style="text-align: center;"] strong').text

items = []

# Because the HTML on this page is so poorly structured, we need to find all the
# proper links to agenda items and traverse upwards to the description container.
target_column.css('.wysiwyg table tr a').each do |link|
  container = link.parent.parent

  href = "#{base_url}#{link.attr('href')}"
  number = link.text.strip
  # The emergency status of the agenda item. Emergency items take effect
  # immediately and aren't required to go through the normal 30-day waiting period.
  emergency = !!(number =~ /\*/)
  # The description of the agenda item
  description = container.text.sub(number, '').gsub("\r\n", '').encode('ascii', encoding_options).strip

  items << {
    :url => href,
    :number => number,
    :emergency => emergency,
    :description => description
  }
end

bucket = s3.buckets[ENV['AWS_S3_BUCKET']]
bucket.objects['latest.json'].write(items.to_json, :acl => :public_read)
--------------------------------------------------------------------------------
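A minimal sketch for spot-checking what `process-upcoming-agenda` uploaded, assuming the same `.env` variables; `read` is the aws-sdk v1 counterpart of the `write` call above:

```
#!/usr/bin/env ruby
require 'json'
require 'aws-sdk'
require 'dotenv'
Dotenv.load

# Read latest.json back from the bucket the script writes to.
s3 = AWS::S3.new
raw = s3.buckets[ENV['AWS_S3_BUCKET']].objects['latest.json'].read
items = JSON.parse(raw)
puts "#{items.length} items; first: #{items.first && items.first['number']}"
```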
/script/reddit-pdxcouncilagenda:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
require 'bundler/setup'
require 'httparty'
require 'json'

def access_token
  HTTParty.post('https://ssl.reddit.com/api/v1/access_token',
                {:body => ["grant_type=password",
                           "username=#{@reddit['auth']['username']}",
                           "password=#{@reddit['auth']['password']}"].join('&'),
                 :basic_auth => {:username => @reddit['auth']['httpuser'],
                                 :password => @reddit['auth']['httppass']},
                 :headers => {"User-Agent" => "pdxcitycouncil-scraper"}
                }).parsed_response["access_token"]
end

def make_title(item)
  # Reddit rejects titles over 300 characters:
  # ["TOO_LONG", "this is too long (max: 300)", "title"]
  "[#{item['number']}] #{item['title']}"[0, 300]
end

def make_text(item)
  "Session: #{item['session']}\n\n" +
    "##{item['number']}\n\n" +
    "#{item['title']}\n\n" +
    "PDF: #{item['link']}\n\n"
end

def api(path, token, params)
  HTTParty.post("https://oauth.reddit.com#{path}",
                {:headers => {'Content-Type' => 'application/json',
                              'Authorization' => "bearer #{token}",
                              'User-Agent' => 'pdxcitycouncil-scraper'},
                 :query => params})
end

def add_story(token, item)
  puts "Submitting..."
  response = api('/api/submit', token,
                 {'api_type' => "json",
                  'kind' => 'self',
                  'sr' => "pdxcouncilagenda",
                  'title' => make_title(item),
                  'text' => make_text(item)})
  puts response.parsed_response.inspect
  # Example response:
  # {"json"=>{"errors"=>[], "data"=>{"url"=>"https://oauth.reddit.com/r/pdxcouncilagenda/comments/274cvy/portland_city_council_agenda/", "id"=>"274cvy", "name"=>"t3_274cvy"}}}
end

@reddit = JSON.parse(File.read("reddit.json"))
clean = ARGV[0] == 'clean'
do_post = ARGV[0] == 'post'

puts "clean mode ON" if clean
puts "LIVE POST" if do_post

# Previously posted stories
puts "loading reddit posts"
posts = HTTParty.get("http://www.reddit.com/r/pdxcouncilagenda/new.json?limit=#{@reddit['limit']}").parsed_response['data']['children']
puts "loaded #{posts.length} reddit posts"
posts.each do |p|
  match = p['data']['title'].match(/\[(\d+)\]/)
  p['data']['agenda_number'] = match.captures.first if match
end
story_ids = posts.map { |p| p['data']['agenda_number'] }.compact
puts "#{story_ids.size} story ids #{story_ids.sort_by { |a| a.to_i }.reverse}"

# Upcoming agenda
puts "loading council agenda"
agenda = JSON.parse(HTTParty.get('http://donpark.org/pdxapi/citycouncilagenda.json?count=50').parsed_response)
puts "loaded #{agenda['items'].size} agenda items"
unposted = agenda['items'].reject { |item| story_ids.include?(item['number']) }
puts "#{unposted.size} unposted #{unposted.map { |p| p['number'] }.sort}"

puts "Reddit access token request"
token = access_token()
puts "Reddit access token #{token}"

if clean
  story_ids.each do |id|
    post = posts.select { |p| p['data']['agenda_number'] == id }.first
    rid = "#{post['kind']}_#{post['data']['id']}"
    puts "Deleting agenda #{id} reddit post #{rid}"
    del = api('/api/del', token, {"id" => rid})
    puts del.body
  end
end

if do_post
  unposted.each do |post|
    puts "Posting #{post['number']}"
    add_story(token, post)
  end
end
--------------------------------------------------------------------------------
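Usage, based on the `ARGV` handling in `script/reddit-pdxcouncilagenda` (expects a `reddit.json` in the working directory, modeled on `script/reddit.sample.json`):

```
script/reddit-pdxcouncilagenda        # dry run: report posted vs. unposted items
script/reddit-pdxcouncilagenda post   # submit each unposted agenda item
script/reddit-pdxcouncilagenda clean  # delete the bot's previously posted items
```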