├── .gitignore ├── .rvmrc ├── Gemfile ├── Gemfile.lock ├── README.md ├── Rakefile ├── app.rb ├── config.rb ├── config.ru ├── config.yml ├── helpers.rb ├── lib ├── category.rb ├── classifier.rb ├── classifier │ └── stats_classifier.rb ├── clusterer.rb ├── clusterer │ └── stats_cluster.rb ├── clusters.rb ├── core_ext │ └── string.rb ├── crawler.rb ├── data │ └── stop_words.txt ├── item.rb ├── medium.rb ├── newsagg.rb ├── parser.rb ├── parser │ ├── cleaner.rb │ ├── html.rb │ └── rss.rb ├── tasks │ └── scheduler.rake ├── trainer.rb └── training_set.rb ├── script ├── crawl.rb └── train.rb └── views ├── category.haml ├── cluster.haml ├── layout.haml └── style.sass /.gitignore: -------------------------------------------------------------------------------- 1 | .sass-cache 2 | -------------------------------------------------------------------------------- /.rvmrc: -------------------------------------------------------------------------------- 1 | rvm ruby-1.9.2-p290@newsagg 2 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'http://rubygems.org' 2 | 3 | gem 'sinatra' 4 | gem 'haml' 5 | gem 'sass' 6 | gem 'redis' 7 | gem 'nokogiri' 8 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: http://rubygems.org/ 3 | specs: 4 | haml (3.1.3) 5 | nokogiri (1.5.0) 6 | rack (1.3.5) 7 | rack-protection (1.1.4) 8 | rack 9 | redis (2.2.2) 10 | sass (3.1.10) 11 | sinatra (1.3.1) 12 | rack (~> 1.3, >= 1.3.4) 13 | rack-protection (~> 1.1, >= 1.1.2) 14 | tilt (~> 1.3, >= 1.3.3) 15 | tilt (1.3.3) 16 | 17 | PLATFORMS 18 | ruby 19 | 20 | DEPENDENCIES 21 | haml 22 | nokogiri 23 | redis 24 | sass 25 | sinatra 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NewsAgg 2 | 3 | ## DESCRIPTION: 4 | 5 | News Aggregator that classifies and clusterifies news from different sources 6 | 7 | ## INSTALLATION: 8 | 9 | bundle install 10 | 11 | ## SETUP: 12 | 13 | ### Configure categories, training sets and RSS feeds 14 | 15 | (edit: "config.yml" file) 16 | 17 | ## USAGE: 18 | 19 | ### Create training sets 20 | 21 | ruby script/train.rb 22 | 23 | ### Collect & classify articles from newspapers 24 | 25 | ruby script/crawl.rb 26 | 27 | ### Read news (localhost:9292) 28 | 29 | rackup config.ru 30 | 31 | ## LICENSE: 32 | 33 | (The MIT License) 34 | 35 | Copyright (c) 2011 Dalibor Nasevic 36 | 37 | Permission is hereby granted, free of charge, to any person obtaining 38 | a copy of this software and associated documentation files (the 39 | 'Software'), to deal in the Software without restriction, including 40 | without limitation the rights to use, copy, modify, merge, publish, 41 | distribute, sublicense, and/or sell copies of the Software, and to 42 | permit persons to whom the Software is furnished to do so, subject to 43 | the following conditions: 44 | 45 | The above copyright notice and this permission notice shall be 46 | included in all copies or substantial portions of the Software. 47 | 48 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 49 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 50 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 51 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 52 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 53 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 54 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 55 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rake' 2 | require 'rake/testtask' 3 | require 'rake/rdoctask' 4 | 5 | Dir["#{File.dirname(__FILE__)}/lib/tasks/**/*.rake"].sort.each { |t| load t } 6 | -------------------------------------------------------------------------------- /app.rb: -------------------------------------------------------------------------------- 1 | require './config' 2 | 3 | include NewsAgg 4 | 5 | get '/' do 6 | @categories = Category.all 7 | @category = Category.find(params[:c]) 8 | 9 | if @category 10 | @items = @category.recent_items 11 | haml :category, :layout => :layout 12 | else 13 | @clusters = Clusters.load 14 | haml :cluster, :layout => :layout 15 | end 16 | end 17 | 18 | get '/style.css' do 19 | sass :style 20 | end 21 | -------------------------------------------------------------------------------- /config.rb: -------------------------------------------------------------------------------- 1 | require "rubygems" 2 | require "bundler/setup" 3 | Bundler.require 4 | 5 | require_relative 'helpers' 6 | require_relative 'lib/newsagg' 7 | require 'json' 8 | 9 | 10 | Dir["#{File.dirname(__FILE__)}/lib/core_ext/*.rb"].sort.each do |path| 11 | require_relative "lib/core_ext/#{File.basename(path, '.rb')}" 12 | end 13 | 14 | 15 | # development environment 16 | ENV['RACK_ENV'] ||= 'development' 17 | configure :development do 18 | ENV["REDISTOGO_URL"] = 'redis://localhost:6379' 19 | end 20 | 21 | 22 | # production environment 23 | # Heroku sets: ENV['RACK_ENV'] and ENV["REDISTOGO_URL"] 24 | 25 | 26 | # all environments 27 | configure do 28 | uri = URI.parse(ENV["REDISTOGO_URL"]) 29 | R = Redis.new(:host => uri.host, :port => uri.port, :password => uri.password) 30 | end 31 | 32 | 33 | CONFIG = YAML::load_file(File.join(File.dirname(__FILE__), 'config.yml')) 34 | -------------------------------------------------------------------------------- /config.ru: -------------------------------------------------------------------------------- 1 | require './app' 2 | 3 | run Sinatra::Application 4 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | :categories: 2 | - :name: 'world' 3 | :seeds: 4 | - :url: 'http://en.wikipedia.org/wiki/World' 5 | :selector: '.mw-content-ltr' 6 | - :url: 'http://en.wikipedia.org/wiki/List_of_sovereign_states' 7 | :selector: '.mw-content-ltr' 8 | - :url: 'http://en.wikipedia.org/wiki/Universe' 9 | :selector: '.mw-content-ltr' 10 | - :url: 'http://en.wikipedia.org/wiki/Globe' 11 | :selector: '.mw-content-ltr' 12 | - :name: 'africa' 13 | :seeds: 14 | - :url: 'http://en.wikipedia.org/wiki/Africa' 15 | :selector: '.mw-content-ltr' 16 | - :url: 'http://en.wikipedia.org/wiki/List_of_African_countries_and_territories' 17 | :selector: '.mw-content-ltr' 18 | - :name: 'economy' 19 | :seeds: 20 | - :url: 'http://en.wikipedia.org/wiki/Economy' 21 | :selector: '.mw-content-ltr' 22 | - :url: 'http://en.wikipedia.org/wiki/Finance' 23 | :selector: '.mw-content-ltr' 24 | - :url: 'http://en.wikipedia.org/wiki/Business' 25 | :selector: '.mw-content-ltr' 26 | - :name: 'science' 27 | :seeds: 28 | - :url: 'http://en.wikipedia.org/wiki/Science' 29 | :selector: '.mw-content-ltr' 30 | - :url: 'http://en.wikipedia.org/wiki/Technology' 31 | :selector: '.mw-content-ltr' 32 | - :name: 'sport' 33 | :seeds: 34 | - :url: 'http://en.wikipedia.org/wiki/Sport' 35 | :selector: '.mw-content-ltr' 36 | - :url: 'http://en.wikipedia.org/wiki/Outline_of_sports' 37 | :selector: '.mw-content-ltr' 38 | - :name: 'health' 39 | :seeds: 40 | - :url: 'http://en.wikipedia.org/wiki/Health' 41 | :selector: '.mw-content-ltr' 42 | - :url: 'http://en.wikipedia.org/wiki/Health_care' 43 | :selector: '.mw-content-ltr' 44 | - :name: 'entertainment' 45 | :seeds: 46 | - :url: 'http://en.wikipedia.org/wiki/Entertainment' 47 | :selector: '.mw-content-ltr' 48 | - :url: 'http://en.wikipedia.org/wiki/Film' 49 | :selector: '.mw-content-ltr' 50 | - :url: 'http://en.wikipedia.org/wiki/Radio_programming' 51 | :selector: '.mw-content-ltr' 52 | - :url: 'http://en.wikipedia.org/wiki/Concert' 53 | :selector: '.mw-content-ltr' 54 | 55 | :media: 56 | - :key: 'timeslive.co.za' 57 | :url: 'http://www.timeslive.co.za' 58 | :feeds: 59 | - 'http://avusa.feedsportal.com/c/33051/f/534658/index.rss' 60 | :selector: '#article .area > h3, #article .area > p, #article > h3' 61 | 62 | - :key: 'thenewage.co.za' 63 | :url: 'http://www.sowetanlive.co.za' 64 | :feeds: 65 | - 'http://thenewage.co.za/rss.aspx?cat_id=9' # business 66 | - 'http://thenewage.co.za/rss.aspx?cat_id=1021' # science and technology 67 | - 'http://thenewage.co.za/rss.aspx?cat_id=1020' # world 68 | - 'http://thenewage.co.za/rss.aspx?cat_id=1022' # entertainment 69 | - 'http://thenewage.co.za/rss.aspx?cat_id=1019' # afrika 70 | - 'http://thenewage.co.za/rss.aspx?cat_id=11' # sport 71 | :selector: '#dv_story_dtls' 72 | 73 | - :key: 'sowetanlive.co.za' 74 | :url: 'http://www.sowetanlive.co.za' 75 | :feeds: 76 | - 'http://www.sowetanlive.co.za/?service=rss' 77 | :selector: '#content .articleheader h3, #content p' 78 | 79 | - :key: 'news24.com' 80 | :url: 'http://www.news24.com' 81 | :feeds: 82 | - 'http://feeds.news24.com/articles/Kenya/TopStories/rss' # top stories 83 | - 'http://feeds.news24.com/articles/Kenya/Africa/rss' # afrika 84 | - 'http://feeds.news24.com/articles/Kenya/World/rss' # world 85 | - 'http://feeds.news24.com/articles/Kenya/SciTech/rss' # sci-tech 86 | - 'http://feeds.24.com/articles/sport/featured/topstories/rss' # sport 87 | - 'http://feeds.news24.com/articles/Kenya/Entertainment/rss' # entertainment 88 | :selector: '.article_body p:first' 89 | 90 | - :key: 'iol.co.za' 91 | :url: 'http://www.iol.co.za' 92 | :feeds: 93 | - 'http://iol.co.za/cmlink/1.640' # news 94 | - 'http://www.iol.co.za/cmlink/1.730910' # business 95 | - 'http://iol.co.za/cmlink/sport-category-rss-1.704' # sport 96 | :selector: '.aticle_column > p' 97 | 98 | - :key: 'ewn.co.za' 99 | :url: 'http://www.ewn.co.za' 100 | :feeds: 101 | - 'http://www.ewn.co.za/Feeds/Local.aspx' # local 102 | - 'http://www.ewn.co.za/Feeds/World.aspx' # world 103 | - 'http://www.ewn.co.za/Feeds/Sport.aspx' # sport 104 | - 'http://www.ewn.co.za/Feeds/Entertainment.aspx' # entertainment 105 | - 'http://www.ewn.co.za/Feeds/Breaking.aspx' # breaking news 106 | - 'http://www.ewn.co.za/Feeds/LatestNews.aspx' # latest news (all categories) 107 | :selector: '.storybodytext > p' 108 | 109 | - :key: 'mg.co.za' 110 | :url: 'http://mg.co.za' 111 | :feeds: 112 | - 'http://mg.co.za/rss' # top stories 113 | - 'http://mg.co.za/rss/national' # national 114 | - 'http://mg.co.za/rss/sport' # sport 115 | - 'http://mg.co.za/rss/business' # business 116 | - 'http://mg.co.za/rss/world' # world 117 | - 'http://mg.co.za/rss/africa' # africa 118 | - 'http://mg.co.za/rss/and-in-other-news' # other 119 | :selector: '#storycontainer .article_lead, #storycontainer .article_body' 120 | -------------------------------------------------------------------------------- /helpers.rb: -------------------------------------------------------------------------------- 1 | helpers do 2 | def link_to(text, url, opts = {}) 3 | attributes = "" 4 | opts.each { |key,value| attributes << key.to_s << "=\"" << value << "\" "} 5 | "#{text}" 6 | end 7 | 8 | def score_percentages(scores) 9 | scores.sort{ |a, b| b[1] <=> a[1] }.map do |s| 10 | "#{s[0].capitalize}: #{'%.2f' % (s[1] * 100)}%" 11 | end.join(', ') 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /lib/category.rb: -------------------------------------------------------------------------------- 1 | module NewsAgg 2 | class Category 3 | attr_accessor :name, :seeds 4 | CATEGORY_LIMIT = 15 # max number of items per category 5 | 6 | def initialize(params) 7 | @name = params[:name] 8 | @seeds = params[:seeds] 9 | end 10 | 11 | # last items ordered by timestamp (redis: ordered set) 12 | def recent_items(limit = CATEGORY_LIMIT) 13 | keys = R.zrevrange("category:#{name}", 0, limit - 1) 14 | @items = Item.find(keys) 15 | end 16 | 17 | # find older category items (0,1,2,3,[4,5,6,7,8]) 18 | def old_items(limit = CATEGORY_LIMIT) 19 | keys = R.zrevrange("category:#{name}", limit, -1) 20 | @items = Item.find(keys) 21 | end 22 | 23 | def add_item(item, score) 24 | # redis sorted set: category:name 25 | R.zadd("category:#{name}", item.timestamp, item.key) 26 | end 27 | 28 | def remove_item(item) 29 | R.zrem("category:#{name}", item.key) 30 | end 31 | 32 | def self.all 33 | @all ||= CONFIG[:categories].map { |params| new(params) } 34 | end 35 | 36 | def self.find(name) 37 | all.detect{ |c| c.name == name } 38 | end 39 | 40 | # Redis on Heroku is free up to 5 mb 41 | # keep only fews records per category 42 | def self.clean_old_items! 43 | # DEBUG 44 | p "cleaning old items..." 45 | 46 | all.each do |category| 47 | category.old_items.each { |item| item.destroy } 48 | end 49 | end 50 | end 51 | end 52 | -------------------------------------------------------------------------------- /lib/classifier.rb: -------------------------------------------------------------------------------- 1 | path = File.expand_path('../../lib/classifier', __FILE__) 2 | $:.unshift(path) if File.directory?(path) && !$:.include?(path) 3 | 4 | require 'json' 5 | 6 | module NewsAgg 7 | module Classifier 8 | autoload :StatsClassifier, 'stats_classifier' 9 | 10 | # TODO: classify item 11 | def self.classify(item) 12 | classifier = StatsClassifier.new(training_data) 13 | 14 | scores = classifier.scores(item.content) 15 | category_name, score = scores.max_by{ |k,v| v } 16 | 17 | category = Category.find(category_name) 18 | category.add_item(item, score) 19 | item.add_scores(scores) 20 | 21 | # DEBUG: classified object 22 | p "classifying... medium => #{item.medium_key}, key => #{item.key}, title => #{item.title}, :category => #{category.name}" 23 | end 24 | 25 | private 26 | def self.training_data 27 | return @training_data if @training_data 28 | 29 | @training_data = {} 30 | Category.all.each do |category| 31 | @training_data[category.name] = TrainingSet.find(category.name).content 32 | end 33 | @training_data 34 | end 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /lib/classifier/stats_classifier.rb: -------------------------------------------------------------------------------- 1 | module NewsAgg 2 | module Classifier 3 | class StatsClassifier 4 | attr_accessor :training_sets 5 | 6 | def initialize(data) 7 | @training_sets = {} 8 | filename = File.join(File.dirname(__FILE__), '..', 'data', 'stop_words.txt') 9 | @noise_words = File.new(filename).readlines.map(&:chomp) 10 | 11 | train(data) 12 | end 13 | 14 | def scores(text) 15 | words = text.downcase.scan(/[a-z]+/) 16 | 17 | scores = {} 18 | training_sets.each_pair do |category, word_weights| 19 | scores[category] = score(word_weights, words) 20 | end 21 | 22 | scores 23 | end 24 | 25 | def train(data) 26 | data.each_pair do |category, text| 27 | words = text.downcase.scan(/[a-z]+/) 28 | word_weights = Hash.new(0) 29 | 30 | words.each {|word| word_weights[word] += 1 unless @noise_words.index(word)} 31 | 32 | ratio = 1.0 / words.length 33 | word_weights.keys.each {|key| word_weights[key] *= ratio} 34 | 35 | training_sets[category] = word_weights 36 | end 37 | end 38 | 39 | private 40 | def score(word_weights, words) 41 | score = words.inject(0) {|acc, word| acc + word_weights[word]} 42 | 1000.0 * score / words.size 43 | end 44 | end 45 | end 46 | end 47 | -------------------------------------------------------------------------------- /lib/clusterer.rb: -------------------------------------------------------------------------------- 1 | path = File.expand_path('../../lib/clusterer', __FILE__) 2 | $:.unshift(path) if File.directory?(path) && !$:.include?(path) 3 | 4 | require 'json' 5 | 6 | module NewsAgg 7 | module Clusterer 8 | autoload :StatsCluster, 'stats_cluster' 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /lib/clusterer/stats_cluster.rb: -------------------------------------------------------------------------------- 1 | module NewsAgg 2 | module Clusterer 3 | # Word-Use Intersections 4 | class StatsCluster 5 | THRESHOLD = 0.3 6 | attr_accessor :noise_words 7 | 8 | def initialize 9 | filename = File.join(File.dirname(__FILE__), '..', 'data', 'stop_words.txt') 10 | @noise_words = File.new(filename).readlines.map(&:chomp) 11 | end 12 | 13 | def clusterize(texts) 14 | words_matrix = texts.map { |text| text.downcase.scan(/[a-z]+/) - noise_words } 15 | similarity_matrix = calculate_similarity_matrix(words_matrix) 16 | calculate_clusters(similarity_matrix) 17 | end 18 | 19 | # similarity score between two texts 20 | # used only for debugging 21 | def texts_score(text1, text2) 22 | words1 = text1.downcase.scan(/[a-zA-Z]+/) - noise_words 23 | words2 = text2.downcase.scan(/[a-zA-Z]+/) - noise_words 24 | common_words = words1 & words2 25 | p common_words 26 | common_words.length.to_f / (words1.length + words2.length) 27 | end 28 | 29 | private 30 | 31 | def similarity_score(words1, words2) 32 | common_words = words1 & words2 33 | 2.0 * common_words.length.to_f / (words1.length + words2.length) 34 | end 35 | 36 | def calculate_similarity_matrix(words_matrix) 37 | # initialize similarity matrix with 0 38 | size = words_matrix.length 39 | similarity_matrix = size.times.map { Array.new(size, 0) } 40 | 41 | # calculate similarity matrix between all texts 42 | size.times do |i| 43 | size.times do |j| 44 | similarity_matrix[i][j] = similarity_score(words_matrix[i], words_matrix[j]) 45 | end 46 | end 47 | 48 | similarity_matrix 49 | end 50 | 51 | def calculate_clusters(similarity_matrix) 52 | clusters = [] 53 | size = similarity_matrix.length 54 | 55 | size.times do |i| 56 | similar = [] 57 | 58 | # find similar texts to i 59 | size.times do |j| 60 | similar << j if j > i && similarity_matrix[i][j] > THRESHOLD 61 | end 62 | 63 | # add i to array of similar texts 64 | if similar.length > 0 65 | similar << i 66 | clusters << similar.sort # sort the array 67 | end 68 | end 69 | 70 | # remove redundent clusters: 71 | clusters.size.times do |i| 72 | clusters.size.times do |j| 73 | if clusters[j].length < clusters[i].length 74 | clusters[j] = [] if (clusters[j] & clusters[i]) == clusters[j] 75 | end 76 | end 77 | end 78 | 79 | clusters.select{ |c| c.length > 1 } 80 | end 81 | 82 | end 83 | end 84 | end 85 | -------------------------------------------------------------------------------- /lib/clusters.rb: -------------------------------------------------------------------------------- 1 | require 'json' 2 | module NewsAgg 3 | module Clusters 4 | class << self 5 | 6 | def create! 7 | # DEBUG 8 | p "clustering items..." 9 | 10 | items = NewsAgg::Item.find(R.keys('item*')) 11 | texts = items.map{|item| item.content} 12 | clusterer = NewsAgg::Clusterer::StatsCluster.new 13 | clusters_ids = clusterer.clusterize(texts) 14 | 15 | clusters_items = [] 16 | clusters_ids.each do |cluster_ids| 17 | clusters_items << cluster_ids.map{|id| items[id].key} 18 | end 19 | 20 | R['clusters'] = clusters_items.to_json 21 | end 22 | 23 | def load 24 | clusters_data = R['clusters'] 25 | if clusters_data 26 | clusters_ids = JSON.parse(clusters_data) 27 | clusters_ids.map{|cluster_ids| Item.find(cluster_ids)} 28 | else 29 | [] 30 | end 31 | end 32 | end 33 | end 34 | end 35 | 36 | -------------------------------------------------------------------------------- /lib/core_ext/string.rb: -------------------------------------------------------------------------------- 1 | class String 2 | 3 | if defined?(Encoding) && "".respond_to?(:encode) 4 | def encoding_aware? 5 | true 6 | end 7 | else 8 | def encoding_aware? 9 | false 10 | end 11 | end 12 | 13 | # 0x3000: fullwidth whitespace 14 | NON_WHITESPACE_REGEXP = %r![^\s#{[0x3000].pack("U")}]! 15 | 16 | # A string is blank if it's empty or contains whitespaces only: 17 | # 18 | # "".blank? # => true 19 | # " ".blank? # => true 20 | # " ".blank? # => true 21 | # " something here ".blank? # => false 22 | # 23 | def blank? 24 | # 1.8 does not takes [:space:] properly 25 | if encoding_aware? 26 | self !~ /[^[:space:]]/ 27 | else 28 | self !~ NON_WHITESPACE_REGEXP 29 | end 30 | end 31 | 32 | def truncate(length=300) 33 | text = self.dup 34 | text[0...length] + '...' 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /lib/crawler.rb: -------------------------------------------------------------------------------- 1 | module NewsAgg 2 | class Crawler 3 | attr_accessor :medium 4 | 5 | def initialize(medium) 6 | @medium = medium 7 | end 8 | 9 | def process 10 | feed_items.each do |feed_item| 11 | item = Item.new(feed_item) 12 | 13 | unless item.exists? 14 | parser = NewsAgg::Parser::Html.new(item.url, medium.selector) 15 | item.content = parser.content 16 | item.save 17 | end 18 | end 19 | 20 | Category.clean_old_items! 21 | end 22 | 23 | def self.start 24 | Medium.all.each do |medium| 25 | crawler = Crawler.new(medium) 26 | crawler.process 27 | end 28 | 29 | Clusters.create! 30 | end 31 | 32 | private 33 | def feed_items 34 | parser = NewsAgg::Parser::Rss.new(medium.key, medium.feeds) 35 | parser.items 36 | end 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /lib/data/stop_words.txt: -------------------------------------------------------------------------------- 1 | a 2 | about 3 | above 4 | across 5 | after 6 | afterwards 7 | again 8 | against 9 | all 10 | almost 11 | alone 12 | along 13 | already 14 | also 15 | although 16 | always 17 | am 18 | among 19 | amongst 20 | amoungst 21 | amount 22 | an 23 | and 24 | another 25 | any 26 | anyhow 27 | anyone 28 | anything 29 | anyway 30 | anywhere 31 | are 32 | around 33 | as 34 | at 35 | back 36 | be 37 | became 38 | because 39 | become 40 | becomes 41 | becoming 42 | been 43 | before 44 | beforehand 45 | behind 46 | being 47 | below 48 | beside 49 | besides 50 | between 51 | beyond 52 | bill 53 | both 54 | bottom 55 | but 56 | by 57 | call 58 | can 59 | cannot 60 | cant 61 | co 62 | computer 63 | con 64 | could 65 | couldnt 66 | cry 67 | de 68 | describe 69 | detail 70 | do 71 | done 72 | down 73 | due 74 | during 75 | each 76 | eg 77 | eight 78 | either 79 | eleven 80 | else 81 | elsewhere 82 | empty 83 | enough 84 | etc 85 | even 86 | ever 87 | every 88 | everyone 89 | everything 90 | everywhere 91 | except 92 | few 93 | fifteen 94 | fify 95 | fill 96 | find 97 | fire 98 | first 99 | five 100 | for 101 | former 102 | formerly 103 | forty 104 | found 105 | four 106 | from 107 | front 108 | full 109 | further 110 | get 111 | give 112 | go 113 | had 114 | has 115 | hasnt 116 | have 117 | he 118 | hence 119 | her 120 | here 121 | hereafter 122 | hereby 123 | herein 124 | hereupon 125 | hers 126 | herself 127 | him 128 | himself 129 | his 130 | how 131 | however 132 | hundred 133 | i 134 | ie 135 | if 136 | in 137 | inc 138 | indeed 139 | interest 140 | into 141 | is 142 | it 143 | its 144 | itself 145 | keep 146 | last 147 | latter 148 | latterly 149 | least 150 | less 151 | ltd 152 | made 153 | many 154 | may 155 | me 156 | meanwhile 157 | might 158 | mill 159 | mine 160 | more 161 | moreover 162 | most 163 | mostly 164 | move 165 | much 166 | must 167 | my 168 | myself 169 | name 170 | namely 171 | neither 172 | never 173 | nevertheless 174 | next 175 | nine 176 | no 177 | nobody 178 | none 179 | noone 180 | nor 181 | not 182 | nothing 183 | now 184 | nowhere 185 | of 186 | off 187 | often 188 | on 189 | once 190 | one 191 | only 192 | onto 193 | or 194 | other 195 | others 196 | otherwise 197 | our 198 | ours 199 | ourselves 200 | out 201 | over 202 | own 203 | part 204 | per 205 | perhaps 206 | please 207 | put 208 | rather 209 | re 210 | same 211 | see 212 | seem 213 | seemed 214 | seeming 215 | seems 216 | serious 217 | several 218 | she 219 | should 220 | show 221 | side 222 | since 223 | sincere 224 | six 225 | sixty 226 | so 227 | some 228 | somehow 229 | someone 230 | something 231 | sometime 232 | sometimes 233 | somewhere 234 | still 235 | such 236 | system 237 | take 238 | ten 239 | than 240 | that 241 | the 242 | their 243 | them 244 | themselves 245 | then 246 | thence 247 | there 248 | thereafter 249 | thereby 250 | therefore 251 | therein 252 | thereupon 253 | these 254 | they 255 | thick 256 | thin 257 | third 258 | this 259 | those 260 | though 261 | three 262 | through 263 | throughout 264 | thru 265 | thus 266 | to 267 | together 268 | too 269 | top 270 | toward 271 | towards 272 | twelve 273 | twenty 274 | two 275 | un 276 | under 277 | until 278 | up 279 | upon 280 | us 281 | very 282 | via 283 | was 284 | we 285 | well 286 | were 287 | what 288 | whatever 289 | when 290 | whence 291 | whenever 292 | where 293 | whereafter 294 | whereas 295 | whereby 296 | wherein 297 | whereupon 298 | wherever 299 | whether 300 | which 301 | while 302 | whither 303 | who 304 | whoever 305 | whole 306 | whom 307 | whose 308 | why 309 | will 310 | with 311 | within 312 | without 313 | would 314 | yet 315 | you 316 | your 317 | yours 318 | yourself 319 | yourselves 320 | -------------------------------------------------------------------------------- /lib/item.rb: -------------------------------------------------------------------------------- 1 | module NewsAgg 2 | class Item 3 | attr_accessor :medium_key, :title, :timestamp, 4 | :url, :content, :scores, :category 5 | 6 | def initialize(params) 7 | @medium_key = params['medium_key'] 8 | @title = params['title'] 9 | @timestamp = params['timestamp'] 10 | @url = params['url'] 11 | # @description = params['description'] 12 | @content = params['content'] 13 | 14 | load_associations 15 | end 16 | 17 | def save 18 | # persist the item only if it has content. some media items are 19 | # displayed in their feeds but they cannot be accessed on the site (!?) 20 | return if content.blank? 21 | 22 | unless exists? 23 | save_item 24 | Classifier.classify(self) 25 | else 26 | # use different timestamp if other item exists with the same timestamp 27 | unless title_same? 28 | timestamp += 1 29 | # save_item 30 | save 31 | end 32 | end 33 | end 34 | 35 | def exists? 36 | R.exists(key) 37 | end 38 | 39 | def title_same? 40 | R.hget(key, 'title') == title 41 | end 42 | 43 | def key 44 | # don't cache this string, 45 | # timestamp change is used to produce original key 46 | "item:#{medium_key}:#{timestamp}" 47 | end 48 | 49 | def add_scores(scores) 50 | # redis: string 51 | R.set("scores:#{key}", scores.to_json) 52 | end 53 | 54 | def destroy 55 | R.multi do 56 | category.remove_item(self) 57 | remove_scores 58 | delete_item 59 | # TODO: remove item from clusters 60 | end 61 | end 62 | 63 | def self.find(key) 64 | if key.is_a?(Array) 65 | key.map { |k| Item.find(k) } 66 | else 67 | new(R.hgetall(key)) 68 | end 69 | end 70 | 71 | private 72 | 73 | def save_item 74 | # DEBUG: saved object 75 | p "saving... medium => #{medium_key}, key => #{key}, title => #{title}" 76 | 77 | k = key 78 | R.hmset(k, 'medium_key', medium_key) 79 | R.hmset(k, 'title', title) 80 | R.hmset(k, 'timestamp', timestamp) 81 | R.hmset(k, 'url', url) 82 | # R.hmset(k, 'description', description) 83 | R.hmset(k, 'content', content) 84 | end 85 | 86 | def delete_item 87 | R.del(key) 88 | end 89 | 90 | def remove_scores 91 | R.del("scores:#{key}") 92 | end 93 | 94 | def load_associations 95 | @scores = load_scores 96 | @category = load_category 97 | end 98 | 99 | def load_scores 100 | scores = R.get("scores:#{key}") 101 | if scores 102 | JSON.parse(scores) 103 | else 104 | [] 105 | end 106 | end 107 | 108 | def load_category 109 | category_name, score = scores.max_by{ |k,v| v } 110 | Category.find(category_name) 111 | end 112 | end 113 | end 114 | -------------------------------------------------------------------------------- /lib/medium.rb: -------------------------------------------------------------------------------- 1 | module NewsAgg 2 | class Medium 3 | attr_accessor :key, :url, :feeds, :selector 4 | 5 | def initialize(params) 6 | @key = params[:key] 7 | @url = params[:url] 8 | @feeds = params[:feeds] 9 | @selector = params[:selector] 10 | end 11 | 12 | def self.all 13 | @all ||= CONFIG[:media].map { |params| new(params) } 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /lib/newsagg.rb: -------------------------------------------------------------------------------- 1 | path = File.expand_path('../../lib', __FILE__) 2 | $:.unshift(path) if File.directory?(path) && !$:.include?(path) 3 | 4 | require_relative '../config' 5 | 6 | module NewsAgg 7 | autoload :Category, 'category' 8 | autoload :Medium, 'medium' 9 | autoload :Item, 'item' 10 | autoload :TrainingSet, 'training_set' 11 | autoload :Clusters, 'clusters' 12 | autoload :Trainer, 'trainer' 13 | autoload :Crawler, 'crawler' 14 | autoload :Parser, 'parser' 15 | autoload :Classifier, 'classifier' 16 | autoload :Clusterer, 'clusterer' 17 | end 18 | -------------------------------------------------------------------------------- /lib/parser.rb: -------------------------------------------------------------------------------- 1 | path = File.expand_path('../../lib/parser', __FILE__) 2 | $:.unshift(path) if File.directory?(path) && !$:.include?(path) 3 | 4 | module NewsAgg 5 | module Parser 6 | autoload :Rss, 'rss' 7 | autoload :Html, 'html' 8 | autoload :Cleaner, 'cleaner' 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /lib/parser/cleaner.rb: -------------------------------------------------------------------------------- 1 | module NewsAgg 2 | module Parser 3 | module Cleaner 4 | 5 | private 6 | def clean_whitespace(text) 7 | text.gsub(/\s{2,}|\t|\n/, ' ').strip 8 | end 9 | end 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /lib/parser/html.rb: -------------------------------------------------------------------------------- 1 | require 'open-uri' 2 | require 'nokogiri' 3 | 4 | module NewsAgg 5 | module Parser 6 | class Html 7 | include Cleaner 8 | attr_accessor :url, :selector 9 | 10 | def initialize(url, selector) 11 | @url = url 12 | @selector = selector 13 | end 14 | 15 | def content 16 | content = [] 17 | html_elements = fetch_html_elements(url) 18 | html_elements.each { |element| content << clean_whitespace(element.text) } 19 | content.join(' ') 20 | end 21 | 22 | private 23 | 24 | def fetch_html_elements(url) 25 | if url =~ URI::regexp 26 | # TODO: handle exceptions properly 27 | begin 28 | # DEBUG: URL 29 | # p url 30 | doc = Nokogiri::HTML(open(url)) 31 | doc.search(selector) 32 | rescue OpenURI::HTTPError 33 | [] 34 | end 35 | else 36 | [] 37 | end 38 | end 39 | end 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /lib/parser/rss.rb: -------------------------------------------------------------------------------- 1 | require 'net/http' 2 | require 'rss/2.0' 3 | 4 | module NewsAgg 5 | module Parser 6 | class Rss 7 | FEED_LIMIT = 15 # max number of items per medium 8 | 9 | include Cleaner 10 | attr_accessor :medium_key, :feed_urls 11 | 12 | def initialize(medium_key, feed_urls) 13 | @medium_key = medium_key 14 | @feed_urls = feed_urls 15 | end 16 | 17 | def items 18 | items = [] 19 | feed_urls.each do |feed_url| 20 | rss_items = fetch_rss_items(feed_url) 21 | rss_items.each { |item| items << parse(item) } 22 | end 23 | items 24 | 25 | # limit the items per medium to the number 26 | # of max items that can be displayed 27 | items = items.sort{ |a, b| b['timestamp'] <=> a['timestamp'] }.first(FEED_LIMIT) 28 | end 29 | 30 | private 31 | def parse(item) 32 | object = {} 33 | object['medium_key'] = medium_key 34 | object['title'] = clean_whitespace(item.title) 35 | object['timestamp'] = item.date.to_i || item.pubDate.to_i 36 | # TODO: clean description from HTML tags 37 | # object['description'] = clean_whitespace(item.description) 38 | object['url'] = clean_whitespace(item.link) 39 | 40 | # DEBUG: feed_url object 41 | # p object['url'] 42 | 43 | object 44 | end 45 | 46 | def fetch_rss_items(feed_url) 47 | if feed_url =~ URI::regexp 48 | begin 49 | # TODO: handle exceptions properly 50 | uri = URI.parse(feed_url) 51 | response = Net::HTTP.get_response(uri) 52 | RSS::Parser.parse(response.body, false).items 53 | rescue OpenURI::HTTPError 54 | [] 55 | end 56 | else 57 | [] 58 | end 59 | end 60 | end 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /lib/tasks/scheduler.rake: -------------------------------------------------------------------------------- 1 | require_relative '../../lib/newsagg' 2 | 3 | desc "Crawl web pages" 4 | task :crawl do 5 | puts "!! Crawling start..." 6 | NewsAgg::Crawler.start 7 | puts "!! Crawling end." 8 | end 9 | 10 | require_relative '../../lib/newsagg' 11 | 12 | desc "Training sets for classifier" 13 | task :train do 14 | puts "!! Trainer start..." 15 | NewsAgg::Trainer.train 16 | puts "!! Trainer end." 17 | end 18 | -------------------------------------------------------------------------------- /lib/trainer.rb: -------------------------------------------------------------------------------- 1 | module NewsAgg 2 | module Trainer 3 | def self.train 4 | Category.all.each do |category| 5 | contents = [] 6 | 7 | category.seeds.each do |seed| 8 | parser = NewsAgg::Parser::Html.new(seed[:url], seed[:selector]) 9 | contents << parser.content 10 | end 11 | 12 | training_set = TrainingSet.new(category.name, contents.join(' ')) 13 | training_set.save 14 | end 15 | end 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /lib/training_set.rb: -------------------------------------------------------------------------------- 1 | module NewsAgg 2 | class TrainingSet 3 | attr_accessor :category, :content 4 | 5 | def initialize(category, content) 6 | @category = category 7 | @content = content 8 | end 9 | 10 | def save 11 | # DEBUG: training set 12 | p "saving training set... category => #{category}" 13 | k = self.class.key(category) 14 | R.hmset(k, 'category', category) 15 | R.hmset(k, 'content', content) 16 | end 17 | 18 | def self.find(category) 19 | params = R.hgetall(key(category)) 20 | TrainingSet.new(params['category'], params['content']) 21 | end 22 | 23 | private 24 | def self.key(category) 25 | "training_set:#{category}" 26 | end 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /script/crawl.rb: -------------------------------------------------------------------------------- 1 | require_relative '../lib/newsagg' 2 | NewsAgg::Crawler.start 3 | -------------------------------------------------------------------------------- /script/train.rb: -------------------------------------------------------------------------------- 1 | require_relative '../lib/newsagg' 2 | NewsAgg::Trainer.train 3 | -------------------------------------------------------------------------------- /views/category.haml: -------------------------------------------------------------------------------- 1 | - if @items.empty? 2 | No news is good news. 3 | - else 4 | %ul 5 | - @items.each do |item| 6 | %li 7 | %h2 8 | = link_to item.title, item.url 9 | %span{:title => score_percentages(item.scores)}= "(#{item.medium_key})" 10 | 11 | %p= item.content.truncate(300) 12 | -------------------------------------------------------------------------------- /views/cluster.haml: -------------------------------------------------------------------------------- 1 | - if @clusters.empty? 2 | Sorry, we don't have any clusters yet. See news categories. 3 | - else 4 | %ul 5 | - @clusters.each do |cluster| 6 | - item = cluster[0] 7 | %li 8 | %h2 9 | = link_to "#{item.title}", item.url 10 | %span{:title => score_percentages(item.scores)}= "(#{item.medium_key})" 11 | - cluster[1..-1].each do |item| 12 | %h3 13 | = link_to item.title, item.url 14 | %span{:title => score_percentages(item.scores)}= "(#{item.medium_key})" 15 | 16 | %p= item.content.truncate(300) 17 | -------------------------------------------------------------------------------- /views/layout.haml: -------------------------------------------------------------------------------- 1 | !!! 2 | %html(xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en") 3 | %head 4 | %meta{"http-equiv" => "content-type", :content => "text/html;charset=UTF-8"} 5 | %title= "NewsAgg" 6 | 7 | 8 | 9 | 10 | 11 | %body 12 | #container 13 | %header 14 | %h1= link_to 'NewsAgg', '/' 15 | 16 | .content 17 | .menu 18 | %ul 19 | - @categories.each do |category| 20 | %li{:class => (category == @category ? 'active' : nil)}= link_to category.name.capitalize, "/?c=#{category.name}" 21 | .news 22 | = yield 23 | 24 | -------------------------------------------------------------------------------- /views/style.sass: -------------------------------------------------------------------------------- 1 | /* http://meyerweb.com/eric/tools/css/reset/ 2 | v2.0 | 20110126 3 | License: none (public domain) */ 4 | 5 | html, body, div, span, applet, object, iframe, 6 | h1, h2, h3, h4, h5, h6, p, blockquote, pre, 7 | a, abbr, acronym, address, big, cite, code, 8 | del, dfn, em, img, ins, kbd, q, s, samp, 9 | small, strike, strong, sub, sup, tt, var, 10 | b, u, i, center, 11 | dl, dt, dd, ol, ul, li, 12 | fieldset, form, label, legend, 13 | table, caption, tbody, tfoot, thead, tr, th, td, 14 | article, aside, canvas, details, embed, 15 | figure, figcaption, footer, header, hgroup, 16 | menu, nav, output, ruby, section, summary, 17 | time, mark, audio, video 18 | margin: 0 19 | padding: 0 20 | border: 0 21 | font-size: 100% 22 | font: inherit 23 | vertical-align: baseline 24 | 25 | /* HTML5 display-role reset for older browsers */ 26 | article, aside, details, figcaption, figure, 27 | footer, header, hgroup, menu, nav, section 28 | display: block 29 | 30 | body 31 | line-height: 1 32 | 33 | ol, ul 34 | list-style: none 35 | 36 | blockquote, q 37 | quotes: none 38 | 39 | blockquote:before, blockquote:after, 40 | q:before, q:after 41 | content: '' 42 | content: none 43 | 44 | table 45 | border-collapse: collapse 46 | border-spacing: 0 47 | 48 | body 49 | color: #333 50 | font-family: "Helvetica Neue",Helvetica,Arial,sans-serif 51 | 52 | a, a:active, a:visited 53 | color: #607890 54 | text-decoration: none 55 | 56 | a:hover 57 | color: #003366 58 | text-decoration: underline 59 | 60 | p 61 | line-height: 18px 62 | margin-bottom: 10px 63 | 64 | 65 | 66 | .content 67 | width: 960px 68 | margin: 0 auto 69 | 70 | header 71 | margin-bottom: 30px 72 | h1 73 | font-size: 30px 74 | margin-top: 20px 75 | 76 | 77 | ul 78 | li 79 | margin: 5px 0 80 | 81 | #container 82 | width: 960px 83 | margin: 0 auto 84 | overflow: hidden 85 | 86 | .menu 87 | width: 200px 88 | float: left 89 | 90 | ul 91 | li 92 | border-left: 5px solid #CCC 93 | margin: 12px 0 94 | padding: 0 6px 95 | 96 | &.active 97 | border-left: 5px solid #000 !important 98 | 99 | li:nth-child(1) 100 | border-left: 5px solid #F3BD18 101 | li:nth-child(2) 102 | border-left: 5px solid #DE6E26 103 | li:nth-child(3) 104 | border-left: 5px solid #A71D3F 105 | li:nth-child(4) 106 | border-left: 5px solid #70629A 107 | li:nth-child(5) 108 | border-left: 5px solid #0988C5 109 | li:nth-child(6) 110 | border-left: 5px solid #3A9F47 111 | li:nth-child(7) 112 | border-left: 5px solid #FF0099 113 | li:nth-child(8) 114 | border-left: 5px solid #412A1A 115 | li:nth-child(9) 116 | border-left: 5px solid #C5B8BF 117 | 118 | 119 | .news 120 | width: 760px 121 | float: left 122 | 123 | ul 124 | li 125 | margin-bottom: 20px 126 | h2 127 | margin: 4px 0 128 | 129 | h3 130 | font-size: 13px 131 | margin: 4px 0 132 | 133 | p 134 | font-size: 13px 135 | 136 | span 137 | color: #666 138 | font-size: 80% 139 | 140 | ul.cluster 141 | overflow: hidden 142 | li 143 | width: 250px 144 | float: left 145 | --------------------------------------------------------------------------------