├── .gitignore
├── .rvmrc
├── Gemfile
├── Gemfile.lock
├── README.md
├── Rakefile
├── app.rb
├── config.rb
├── config.ru
├── config.yml
├── helpers.rb
├── lib
├── category.rb
├── classifier.rb
├── classifier
│ └── stats_classifier.rb
├── clusterer.rb
├── clusterer
│ └── stats_cluster.rb
├── clusters.rb
├── core_ext
│ └── string.rb
├── crawler.rb
├── data
│ └── stop_words.txt
├── item.rb
├── medium.rb
├── newsagg.rb
├── parser.rb
├── parser
│ ├── cleaner.rb
│ ├── html.rb
│ └── rss.rb
├── tasks
│ └── scheduler.rake
├── trainer.rb
└── training_set.rb
├── script
├── crawl.rb
└── train.rb
└── views
├── category.haml
├── cluster.haml
├── layout.haml
└── style.sass
/.gitignore:
--------------------------------------------------------------------------------
1 | .sass-cache
2 |
--------------------------------------------------------------------------------
/.rvmrc:
--------------------------------------------------------------------------------
1 | rvm ruby-1.9.2-p290@newsagg
2 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'http://rubygems.org'
2 |
3 | gem 'sinatra'
4 | gem 'haml'
5 | gem 'sass'
6 | gem 'redis'
7 | gem 'nokogiri'
8 |
--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
1 | GEM
2 | remote: http://rubygems.org/
3 | specs:
4 | haml (3.1.3)
5 | nokogiri (1.5.0)
6 | rack (1.3.5)
7 | rack-protection (1.1.4)
8 | rack
9 | redis (2.2.2)
10 | sass (3.1.10)
11 | sinatra (1.3.1)
12 | rack (~> 1.3, >= 1.3.4)
13 | rack-protection (~> 1.1, >= 1.1.2)
14 | tilt (~> 1.3, >= 1.3.3)
15 | tilt (1.3.3)
16 |
17 | PLATFORMS
18 | ruby
19 |
20 | DEPENDENCIES
21 | haml
22 | nokogiri
23 | redis
24 | sass
25 | sinatra
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NewsAgg
2 |
3 | ## DESCRIPTION:
4 |
5 | News aggregator that classifies and clusters news from different sources
6 |
7 | ## INSTALLATION:
8 |
9 | bundle install
10 |
11 | ## SETUP:
12 |
13 | ### Configure categories, training sets and RSS feeds
14 |
15 | (edit: "config.yml" file)
16 |
17 | ## USAGE:
18 |
19 | ### Create training sets
20 |
21 | ruby script/train.rb
22 |
23 | ### Collect & classify articles from newspapers
24 |
25 | ruby script/crawl.rb
26 |
27 | ### Read news (localhost:9292)
28 |
29 | rackup config.ru
30 |
31 | ## LICENSE:
32 |
33 | (The MIT License)
34 |
35 | Copyright (c) 2011 Dalibor Nasevic
36 |
37 | Permission is hereby granted, free of charge, to any person obtaining
38 | a copy of this software and associated documentation files (the
39 | 'Software'), to deal in the Software without restriction, including
40 | without limitation the rights to use, copy, modify, merge, publish,
41 | distribute, sublicense, and/or sell copies of the Software, and to
42 | permit persons to whom the Software is furnished to do so, subject to
43 | the following conditions:
44 |
45 | The above copyright notice and this permission notice shall be
46 | included in all copies or substantial portions of the Software.
47 |
48 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
49 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
50 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
51 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
52 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
53 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
54 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
55 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require 'rake'
2 | require 'rake/testtask'
3 | require 'rake/rdoctask'
4 |
5 | Dir["#{File.dirname(__FILE__)}/lib/tasks/**/*.rake"].sort.each { |t| load t }
6 |
--------------------------------------------------------------------------------
/app.rb:
--------------------------------------------------------------------------------
1 | require './config'
2 |
3 | include NewsAgg
4 |
# Front page. When the `c` query parameter names a known category, renders
# that category's recent items; otherwise renders the stored clusters.
get '/' do
  @categories = Category.all
  @category = Category.find(params[:c])

  template =
    if @category
      @items = @category.recent_items
      :category
    else
      @clusters = Clusters.load
      :cluster
    end

  haml template, :layout => :layout
end
17 |
# Stylesheet, compiled from views/style.sass.
get('/style.css') { sass :style }
21 |
--------------------------------------------------------------------------------
/config.rb:
--------------------------------------------------------------------------------
1 | require "rubygems"
2 | require "bundler/setup"
3 | Bundler.require
4 |
5 | require_relative 'helpers'
6 | require_relative 'lib/newsagg'
7 | require 'json'
8 |
9 |
# Load every core extension under lib/core_ext (sorted for a stable order).
Dir["#{File.dirname(__FILE__)}/lib/core_ext/*.rb"].sort.each do |path|
  require_relative "lib/core_ext/#{File.basename(path, '.rb')}"
end


# development environment
ENV['RACK_ENV'] ||= 'development'
configure :development do
  # Default to a local Redis, but keep a URL the developer exported instead
  # of silently overwriting it.
  ENV["REDISTOGO_URL"] ||= 'redis://localhost:6379'
end


# production environment
# Heroku sets: ENV['RACK_ENV'] and ENV["REDISTOGO_URL"]


# all environments
configure do
  # R is the Redis connection used throughout lib/.
  uri = URI.parse(ENV["REDISTOGO_URL"])
  R = Redis.new(:host => uri.host, :port => uri.port, :password => uri.password)
end


# Application settings (categories with their seeds, media with their feeds)
# read from config.yml next to this file.
CONFIG = YAML::load_file(File.join(File.dirname(__FILE__), 'config.yml'))
34 |
--------------------------------------------------------------------------------
/config.ru:
--------------------------------------------------------------------------------
1 | require './app'
2 |
3 | run Sinatra::Application
4 |
--------------------------------------------------------------------------------
/config.yml:
--------------------------------------------------------------------------------
1 | :categories:
2 | - :name: 'world'
3 | :seeds:
4 | - :url: 'http://en.wikipedia.org/wiki/World'
5 | :selector: '.mw-content-ltr'
6 | - :url: 'http://en.wikipedia.org/wiki/List_of_sovereign_states'
7 | :selector: '.mw-content-ltr'
8 | - :url: 'http://en.wikipedia.org/wiki/Universe'
9 | :selector: '.mw-content-ltr'
10 | - :url: 'http://en.wikipedia.org/wiki/Globe'
11 | :selector: '.mw-content-ltr'
12 | - :name: 'africa'
13 | :seeds:
14 | - :url: 'http://en.wikipedia.org/wiki/Africa'
15 | :selector: '.mw-content-ltr'
16 | - :url: 'http://en.wikipedia.org/wiki/List_of_African_countries_and_territories'
17 | :selector: '.mw-content-ltr'
18 | - :name: 'economy'
19 | :seeds:
20 | - :url: 'http://en.wikipedia.org/wiki/Economy'
21 | :selector: '.mw-content-ltr'
22 | - :url: 'http://en.wikipedia.org/wiki/Finance'
23 | :selector: '.mw-content-ltr'
24 | - :url: 'http://en.wikipedia.org/wiki/Business'
25 | :selector: '.mw-content-ltr'
26 | - :name: 'science'
27 | :seeds:
28 | - :url: 'http://en.wikipedia.org/wiki/Science'
29 | :selector: '.mw-content-ltr'
30 | - :url: 'http://en.wikipedia.org/wiki/Technology'
31 | :selector: '.mw-content-ltr'
32 | - :name: 'sport'
33 | :seeds:
34 | - :url: 'http://en.wikipedia.org/wiki/Sport'
35 | :selector: '.mw-content-ltr'
36 | - :url: 'http://en.wikipedia.org/wiki/Outline_of_sports'
37 | :selector: '.mw-content-ltr'
38 | - :name: 'health'
39 | :seeds:
40 | - :url: 'http://en.wikipedia.org/wiki/Health'
41 | :selector: '.mw-content-ltr'
42 | - :url: 'http://en.wikipedia.org/wiki/Health_care'
43 | :selector: '.mw-content-ltr'
44 | - :name: 'entertainment'
45 | :seeds:
46 | - :url: 'http://en.wikipedia.org/wiki/Entertainment'
47 | :selector: '.mw-content-ltr'
48 | - :url: 'http://en.wikipedia.org/wiki/Film'
49 | :selector: '.mw-content-ltr'
50 | - :url: 'http://en.wikipedia.org/wiki/Radio_programming'
51 | :selector: '.mw-content-ltr'
52 | - :url: 'http://en.wikipedia.org/wiki/Concert'
53 | :selector: '.mw-content-ltr'
54 |
55 | :media:
56 | - :key: 'timeslive.co.za'
57 | :url: 'http://www.timeslive.co.za'
58 | :feeds:
59 | - 'http://avusa.feedsportal.com/c/33051/f/534658/index.rss'
60 | :selector: '#article .area > h3, #article .area > p, #article > h3'
61 |
62 | - :key: 'thenewage.co.za'
63 | :url: 'http://www.thenewage.co.za'
64 | :feeds:
65 | - 'http://thenewage.co.za/rss.aspx?cat_id=9' # business
66 | - 'http://thenewage.co.za/rss.aspx?cat_id=1021' # science and technology
67 | - 'http://thenewage.co.za/rss.aspx?cat_id=1020' # world
68 | - 'http://thenewage.co.za/rss.aspx?cat_id=1022' # entertainment
69 | - 'http://thenewage.co.za/rss.aspx?cat_id=1019' # afrika
70 | - 'http://thenewage.co.za/rss.aspx?cat_id=11' # sport
71 | :selector: '#dv_story_dtls'
72 |
73 | - :key: 'sowetanlive.co.za'
74 | :url: 'http://www.sowetanlive.co.za'
75 | :feeds:
76 | - 'http://www.sowetanlive.co.za/?service=rss'
77 | :selector: '#content .articleheader h3, #content p'
78 |
79 | - :key: 'news24.com'
80 | :url: 'http://www.news24.com'
81 | :feeds:
82 | - 'http://feeds.news24.com/articles/Kenya/TopStories/rss' # top stories
83 | - 'http://feeds.news24.com/articles/Kenya/Africa/rss' # afrika
84 | - 'http://feeds.news24.com/articles/Kenya/World/rss' # world
85 | - 'http://feeds.news24.com/articles/Kenya/SciTech/rss' # sci-tech
86 | - 'http://feeds.24.com/articles/sport/featured/topstories/rss' # sport
87 | - 'http://feeds.news24.com/articles/Kenya/Entertainment/rss' # entertainment
88 | :selector: '.article_body p:first'
89 |
90 | - :key: 'iol.co.za'
91 | :url: 'http://www.iol.co.za'
92 | :feeds:
93 | - 'http://iol.co.za/cmlink/1.640' # news
94 | - 'http://www.iol.co.za/cmlink/1.730910' # business
95 | - 'http://iol.co.za/cmlink/sport-category-rss-1.704' # sport
96 | :selector: '.aticle_column > p'
97 |
98 | - :key: 'ewn.co.za'
99 | :url: 'http://www.ewn.co.za'
100 | :feeds:
101 | - 'http://www.ewn.co.za/Feeds/Local.aspx' # local
102 | - 'http://www.ewn.co.za/Feeds/World.aspx' # world
103 | - 'http://www.ewn.co.za/Feeds/Sport.aspx' # sport
104 | - 'http://www.ewn.co.za/Feeds/Entertainment.aspx' # entertainment
105 | - 'http://www.ewn.co.za/Feeds/Breaking.aspx' # breaking news
106 | - 'http://www.ewn.co.za/Feeds/LatestNews.aspx' # latest news (all categories)
107 | :selector: '.storybodytext > p'
108 |
109 | - :key: 'mg.co.za'
110 | :url: 'http://mg.co.za'
111 | :feeds:
112 | - 'http://mg.co.za/rss' # top stories
113 | - 'http://mg.co.za/rss/national' # national
114 | - 'http://mg.co.za/rss/sport' # sport
115 | - 'http://mg.co.za/rss/business' # business
116 | - 'http://mg.co.za/rss/world' # world
117 | - 'http://mg.co.za/rss/africa' # africa
118 | - 'http://mg.co.za/rss/and-in-other-news' # other
119 | :selector: '#storycontainer .article_lead, #storycontainer .article_body'
120 |
--------------------------------------------------------------------------------
/helpers.rb:
--------------------------------------------------------------------------------
1 | helpers do
# Builds an HTML anchor tag.
#
# text - link label, inserted as-is (not escaped)
# url  - value of the href attribute
# opts - extra HTML attributes, e.g. :class => 'nav'
#
# Returns the '<a href="...">text</a>' markup as a String.
def link_to(text, url, opts = {})
  # Interpolate rather than String#<< so Symbol/Integer values render as
  # text (<< would append an Integer as a character code).
  attributes = opts.map { |key, value| " #{key}=\"#{value}\"" }.join
  "<a href=\"#{url}\"#{attributes}>#{text}</a>"
end
7 |
# Formats category scores as "Name: 12.34%" entries, highest score first,
# separated by ", ".
#
# scores - enumerable of [category_name, score] pairs (e.g. a Hash)
def score_percentages(scores)
  ranked = scores.sort { |(_, left), (_, right)| right <=> left }
  labels = ranked.map do |category, value|
    format('%s: %.2f%%', category.capitalize, value * 100)
  end
  labels.join(', ')
end
13 | end
14 |
--------------------------------------------------------------------------------
/lib/category.rb:
--------------------------------------------------------------------------------
module NewsAgg
  # A news category from config.yml. Its items are tracked in the Redis
  # sorted set "category:<name>", scored by item timestamp.
  class Category
    CATEGORY_LIMIT = 15 # max number of items per category

    attr_accessor :name, :seeds

    class << self
      # Every configured category, built once from CONFIG[:categories].
      def all
        @all ||= CONFIG[:categories].map { |params| new(params) }
      end

      # The category called +name+, or nil when there is none.
      def find(name)
        all.detect { |category| category.name == name }
      end

      # Redis on Heroku is free up to 5 mb,
      # so keep only a few records per category.
      def clean_old_items!
        # DEBUG
        p "cleaning old items..."

        all.each do |category|
          category.old_items.each(&:destroy)
        end
      end
    end

    def initialize(params)
      @name = params[:name]
      @seeds = params[:seeds]
    end

    # Newest items first (redis: ordered set), at most +limit+ of them.
    def recent_items(limit = CATEGORY_LIMIT)
      @items = Item.find(R.zrevrange(redis_key, 0, limit - 1))
    end

    # Everything after the newest +limit+ items (0,1,2,3,[4,5,6,7,8]).
    def old_items(limit = CATEGORY_LIMIT)
      @items = Item.find(R.zrevrange(redis_key, limit, -1))
    end

    # Files +item+ under this category; +score+ is accepted but not stored.
    def add_item(item, score)
      R.zadd(redis_key, item.timestamp, item.key)
    end

    def remove_item(item)
      R.zrem(redis_key, item.key)
    end

    private

    # redis sorted set: category:name
    def redis_key
      "category:#{name}"
    end
  end
end
52 |
--------------------------------------------------------------------------------
/lib/classifier.rb:
--------------------------------------------------------------------------------
1 | path = File.expand_path('../../lib/classifier', __FILE__)
2 | $:.unshift(path) if File.directory?(path) && !$:.include?(path)
3 |
4 | require 'json'
5 |
module NewsAgg
  # Assigns items to the category whose training set scores their text best.
  module Classifier
    autoload :StatsClassifier, 'stats_classifier'

    # Scores +item+ against every category, files it under the best-scoring
    # category and stores the full score table on the item.
    def self.classify(item)
      scores = classifier.scores(item.content)
      category_name, score = scores.max_by { |_name, value| value }

      category = Category.find(category_name)
      category.add_item(item, score)
      item.add_scores(scores)

      # DEBUG: classified object
      p "classifying... medium => #{item.medium_key}, key => #{item.key}, title => #{item.title}, :category => #{category.name}"
    end

    # The trained classifier, built on first use and reused afterwards so the
    # stop-word file is not re-read and the data re-trained for every item.
    def self.classifier
      @classifier ||= StatsClassifier.new(training_data)
    end

    # Category name => training text, loaded once. The hash is cached only
    # after every category has been read, so a failure cannot leave a
    # partial cache behind.
    def self.training_data
      @training_data ||= Category.all.each_with_object({}) do |category, data|
        data[category.name] = TrainingSet.find(category.name).content
      end
    end

    # A bare `private` has no effect on `def self.` methods.
    private_class_method :classifier, :training_data
  end
end
37 |
--------------------------------------------------------------------------------
/lib/classifier/stats_classifier.rb:
--------------------------------------------------------------------------------
module NewsAgg
  module Classifier
    # Word-frequency classifier. Training turns each category's text into
    # per-word weights (occurrences / total words, stop words excluded);
    # scoring averages those weights over the words of a text.
    class StatsClassifier
      attr_accessor :training_sets

      # data - Hash of category name => training text
      def initialize(data)
        @training_sets = {}
        filename = File.join(File.dirname(__FILE__), '..', 'data', 'stop_words.txt')
        # File.readlines closes the file; File.new(...).readlines leaked it.
        @noise_words = File.readlines(filename).map(&:chomp)

        train(data)
      end

      # Returns a Hash of category name => score for +text+.
      def scores(text)
        words = tokenize(text)

        training_sets.each_with_object({}) do |(category, word_weights), result|
          result[category] = score(word_weights, words)
        end
      end

      # Adds (or replaces) the word weights of every category in +data+.
      def train(data)
        data.each_pair do |category, text|
          words = tokenize(text)
          word_weights = Hash.new(0)

          # Array#- drops the stop words with hash lookups instead of a
          # linear Array#index scan per word.
          (words - @noise_words).each { |word| word_weights[word] += 1 }

          # Normalise by the total word count, stop words included.
          ratio = 1.0 / words.length
          word_weights.keys.each { |key| word_weights[key] *= ratio }

          training_sets[category] = word_weights
        end
      end

      private

      # Lower-cased runs of the letters a-z.
      def tokenize(text)
        text.downcase.scan(/[a-z]+/)
      end

      # Average weight of +words+, scaled by 1000. A text without words
      # scores 0.0 instead of NaN (0 / 0).
      def score(word_weights, words)
        return 0.0 if words.empty?

        total = words.inject(0) { |acc, word| acc + word_weights[word] }
        1000.0 * total / words.size
      end
    end
  end
end
47 |
--------------------------------------------------------------------------------
/lib/clusterer.rb:
--------------------------------------------------------------------------------
1 | path = File.expand_path('../../lib/clusterer', __FILE__)
2 | $:.unshift(path) if File.directory?(path) && !$:.include?(path)
3 |
4 | require 'json'
5 |
module NewsAgg
  # Namespace for clustering strategies; the implementation is loaded from
  # the load path on first use.
  module Clusterer
    autoload(:StatsCluster, 'stats_cluster')
  end
end
11 |
--------------------------------------------------------------------------------
/lib/clusterer/stats_cluster.rb:
--------------------------------------------------------------------------------
module NewsAgg
  module Clusterer
    # Word-Use Intersections: texts are clustered when the share of words
    # they have in common exceeds THRESHOLD.
    class StatsCluster
      THRESHOLD = 0.3
      attr_accessor :noise_words

      def initialize
        filename = File.join(File.dirname(__FILE__), '..', 'data', 'stop_words.txt')
        # File.readlines closes the file; File.new(...).readlines leaked it.
        @noise_words = File.readlines(filename).map(&:chomp)
      end

      # Returns an Array of clusters, each a sorted Array of indexes into
      # +texts+. Texts that are not similar to any other are left out.
      def clusterize(texts)
        words_matrix = texts.map { |text| text.downcase.scan(/[a-z]+/) - noise_words }
        similarity_matrix = calculate_similarity_matrix(words_matrix)
        calculate_clusters(similarity_matrix)
      end

      # similarity score between two texts
      # used only for debugging (prints the common words)
      def texts_score(text1, text2)
        words1 = text1.downcase.scan(/[a-zA-Z]+/) - noise_words
        words2 = text2.downcase.scan(/[a-zA-Z]+/) - noise_words
        common_words = words1 & words2
        p common_words
        common_words.length.to_f / (words1.length + words2.length)
      end

      private

      # 2 * |distinct common words| / (|words1| + |words2|).
      # Two empty word lists score 0.0 rather than NaN (0 / 0).
      def similarity_score(words1, words2)
        total = words1.length + words2.length
        return 0.0 if total.zero?

        2.0 * (words1 & words2).length / total
      end

      def calculate_similarity_matrix(words_matrix)
        # initialize similarity matrix with 0
        size = words_matrix.length
        similarity_matrix = size.times.map { Array.new(size, 0) }

        # The score is symmetric, so compute each pair once and mirror it.
        size.times do |i|
          (i...size).each do |j|
            score = similarity_score(words_matrix[i], words_matrix[j])
            similarity_matrix[i][j] = score
            similarity_matrix[j][i] = score
          end
        end

        similarity_matrix
      end

      def calculate_clusters(similarity_matrix)
        clusters = []
        size = similarity_matrix.length

        size.times do |i|
          # texts after i that are similar to i (already in ascending order)
          similar = ((i + 1)...size).select { |j| similarity_matrix[i][j] > THRESHOLD }
          clusters << ([i] + similar) unless similar.empty?
        end

        # remove redundant clusters: blank out any cluster that is fully
        # contained in a larger one
        clusters.size.times do |i|
          clusters.size.times do |j|
            next unless clusters[j].length < clusters[i].length

            clusters[j] = [] if (clusters[j] & clusters[i]) == clusters[j]
          end
        end

        clusters.select { |cluster| cluster.length > 1 }
      end
    end
  end
end
85 |
--------------------------------------------------------------------------------
/lib/clusters.rb:
--------------------------------------------------------------------------------
require 'json'
module NewsAgg
  # Stores and restores item clusters as JSON under the Redis key "clusters".
  module Clusters
    class << self

      # Clusters every stored item by content and saves the result as a JSON
      # array of arrays of item keys.
      def create!
        # DEBUG
        p "clustering items..."

        items = NewsAgg::Item.find(R.keys('item*'))
        clusterer = NewsAgg::Clusterer::StatsCluster.new
        groups = clusterer.clusterize(items.map(&:content))

        clusters_items = groups.map do |indexes|
          indexes.map { |index| items[index].key }
        end

        R['clusters'] = clusters_items.to_json
      end

      # Returns the saved clusters as arrays of items, or [] when nothing
      # has been saved yet.
      def load
        clusters_data = R['clusters']
        return [] unless clusters_data

        JSON.parse(clusters_data).map { |cluster_keys| Item.find(cluster_keys) }
      end
    end
  end
end
35 |
36 |
--------------------------------------------------------------------------------
/lib/core_ext/string.rb:
--------------------------------------------------------------------------------
# Core extensions used by the aggregator: String#blank? and String#truncate.
class String

  # True on Rubies whose strings carry an encoding (1.9+).
  if defined?(Encoding) && "".respond_to?(:encode)
    def encoding_aware?
      true
    end
  else
    def encoding_aware?
      false
    end
  end

  # 0x3000: fullwidth whitespace
  NON_WHITESPACE_REGEXP = %r![^\s#{[0x3000].pack("U")}]!

  # A string is blank if it's empty or contains whitespaces only:
  #
  #   "".blank?                 # => true
  #   "   ".blank?              # => true
  #   " something here ".blank? # => false
  #
  def blank?
    # 1.8 does not takes [:space:] properly
    if encoding_aware?
      self !~ /[^[:space:]]/
    else
      self !~ NON_WHITESPACE_REGEXP
    end
  end

  # Returns a copy cut to +length+ characters followed by '...'.
  # A string that already fits is returned unchanged (as a copy), without
  # the misleading ellipsis.
  def truncate(length=300)
    return dup if self.length <= length

    self[0...length] + '...'
  end
end
37 |
--------------------------------------------------------------------------------
/lib/crawler.rb:
--------------------------------------------------------------------------------
module NewsAgg
  # Reads one medium's feeds, downloads the article text of every item that
  # is not stored yet and saves it.
  class Crawler
    attr_accessor :medium

    # Crawls every configured medium, then rebuilds the clusters.
    def self.start
      Medium.all.each { |medium| new(medium).process }

      Clusters.create!
    end

    def initialize(medium)
      @medium = medium
    end

    # Saves the new feed items of this medium, then trims old items from
    # every category.
    def process
      feed_items.each do |feed_item|
        item = Item.new(feed_item)
        next if item.exists?

        item.content = NewsAgg::Parser::Html.new(item.url, medium.selector).content
        item.save
      end

      Category.clean_old_items!
    end

    private

    # Parsed entries from all feeds of the medium.
    def feed_items
      NewsAgg::Parser::Rss.new(medium.key, medium.feeds).items
    end
  end
end
39 |
--------------------------------------------------------------------------------
/lib/data/stop_words.txt:
--------------------------------------------------------------------------------
1 | a
2 | about
3 | above
4 | across
5 | after
6 | afterwards
7 | again
8 | against
9 | all
10 | almost
11 | alone
12 | along
13 | already
14 | also
15 | although
16 | always
17 | am
18 | among
19 | amongst
20 | amoungst
21 | amount
22 | an
23 | and
24 | another
25 | any
26 | anyhow
27 | anyone
28 | anything
29 | anyway
30 | anywhere
31 | are
32 | around
33 | as
34 | at
35 | back
36 | be
37 | became
38 | because
39 | become
40 | becomes
41 | becoming
42 | been
43 | before
44 | beforehand
45 | behind
46 | being
47 | below
48 | beside
49 | besides
50 | between
51 | beyond
52 | bill
53 | both
54 | bottom
55 | but
56 | by
57 | call
58 | can
59 | cannot
60 | cant
61 | co
62 | computer
63 | con
64 | could
65 | couldnt
66 | cry
67 | de
68 | describe
69 | detail
70 | do
71 | done
72 | down
73 | due
74 | during
75 | each
76 | eg
77 | eight
78 | either
79 | eleven
80 | else
81 | elsewhere
82 | empty
83 | enough
84 | etc
85 | even
86 | ever
87 | every
88 | everyone
89 | everything
90 | everywhere
91 | except
92 | few
93 | fifteen
94 | fify
95 | fill
96 | find
97 | fire
98 | first
99 | five
100 | for
101 | former
102 | formerly
103 | forty
104 | found
105 | four
106 | from
107 | front
108 | full
109 | further
110 | get
111 | give
112 | go
113 | had
114 | has
115 | hasnt
116 | have
117 | he
118 | hence
119 | her
120 | here
121 | hereafter
122 | hereby
123 | herein
124 | hereupon
125 | hers
126 | herself
127 | him
128 | himself
129 | his
130 | how
131 | however
132 | hundred
133 | i
134 | ie
135 | if
136 | in
137 | inc
138 | indeed
139 | interest
140 | into
141 | is
142 | it
143 | its
144 | itself
145 | keep
146 | last
147 | latter
148 | latterly
149 | least
150 | less
151 | ltd
152 | made
153 | many
154 | may
155 | me
156 | meanwhile
157 | might
158 | mill
159 | mine
160 | more
161 | moreover
162 | most
163 | mostly
164 | move
165 | much
166 | must
167 | my
168 | myself
169 | name
170 | namely
171 | neither
172 | never
173 | nevertheless
174 | next
175 | nine
176 | no
177 | nobody
178 | none
179 | noone
180 | nor
181 | not
182 | nothing
183 | now
184 | nowhere
185 | of
186 | off
187 | often
188 | on
189 | once
190 | one
191 | only
192 | onto
193 | or
194 | other
195 | others
196 | otherwise
197 | our
198 | ours
199 | ourselves
200 | out
201 | over
202 | own
203 | part
204 | per
205 | perhaps
206 | please
207 | put
208 | rather
209 | re
210 | same
211 | see
212 | seem
213 | seemed
214 | seeming
215 | seems
216 | serious
217 | several
218 | she
219 | should
220 | show
221 | side
222 | since
223 | sincere
224 | six
225 | sixty
226 | so
227 | some
228 | somehow
229 | someone
230 | something
231 | sometime
232 | sometimes
233 | somewhere
234 | still
235 | such
236 | system
237 | take
238 | ten
239 | than
240 | that
241 | the
242 | their
243 | them
244 | themselves
245 | then
246 | thence
247 | there
248 | thereafter
249 | thereby
250 | therefore
251 | therein
252 | thereupon
253 | these
254 | they
255 | thick
256 | thin
257 | third
258 | this
259 | those
260 | though
261 | three
262 | through
263 | throughout
264 | thru
265 | thus
266 | to
267 | together
268 | too
269 | top
270 | toward
271 | towards
272 | twelve
273 | twenty
274 | two
275 | un
276 | under
277 | until
278 | up
279 | upon
280 | us
281 | very
282 | via
283 | was
284 | we
285 | well
286 | were
287 | what
288 | whatever
289 | when
290 | whence
291 | whenever
292 | where
293 | whereafter
294 | whereas
295 | whereby
296 | wherein
297 | whereupon
298 | wherever
299 | whether
300 | which
301 | while
302 | whither
303 | who
304 | whoever
305 | whole
306 | whom
307 | whose
308 | why
309 | will
310 | with
311 | within
312 | without
313 | would
314 | yet
315 | you
316 | your
317 | yours
318 | yourself
319 | yourselves
320 |
--------------------------------------------------------------------------------
/lib/item.rb:
--------------------------------------------------------------------------------
module NewsAgg
  # A news article, stored as the Redis hash "item:<medium_key>:<timestamp>"
  # with its category scores kept as JSON under "scores:<item key>".
  class Item
    attr_accessor :medium_key, :title, :timestamp,
                  :url, :content, :scores, :category

    # Loads one item for a key, or an Array of items for an Array of keys.
    def self.find(key)
      if key.is_a?(Array)
        key.map { |k| find(k) }
      else
        new(R.hgetall(key))
      end
    end

    # params - Hash with the String keys 'medium_key', 'title', 'timestamp',
    #          'url' and 'content'
    def initialize(params)
      @medium_key = params['medium_key']
      @title = params['title']
      @timestamp = params['timestamp']
      @url = params['url']
      # @description = params['description']
      @content = params['content']

      load_associations
    end

    # Stores and classifies the item unless it is already stored.
    def save
      # persist the item only if it has content. some media items are
      # displayed in their feeds but they cannot be accessed on the site (!?)
      return if content.nil? || content.blank?

      # use different timestamp if other item exists with the same timestamp.
      # `self.` is required: a bare `timestamp += 1` creates a nil local
      # variable and raises NoMethodError.
      while exists?
        return if title_same?

        self.timestamp += 1
      end

      save_item
      Classifier.classify(self)
    end

    def exists?
      R.exists(key)
    end

    # True when the stored item under this key has the same title.
    def title_same?
      R.hget(key, 'title') == title
    end

    def key
      # don't cache this string,
      # timestamp change is used to produce original key
      "item:#{medium_key}:#{timestamp}"
    end

    def add_scores(scores)
      # redis: string
      R.set("scores:#{key}", scores.to_json)
    end

    # Removes the item, its scores and its category membership.
    def destroy
      R.multi do
        # category is nil when the item had no stored scores at load time
        category.remove_item(self) if category
        remove_scores
        delete_item
        # TODO: remove item from clusters
      end
    end

    private

    def save_item
      # DEBUG: saved object
      p "saving... medium => #{medium_key}, key => #{key}, title => #{title}"

      # one HMSET writes all fields in a single round trip
      R.hmset(key,
              'medium_key', medium_key,
              'title', title,
              'timestamp', timestamp,
              'url', url,
              # 'description', description,
              'content', content)
    end

    def delete_item
      R.del(key)
    end

    def remove_scores
      R.del("scores:#{key}")
    end

    # scores must be loaded first: the category is derived from them
    def load_associations
      @scores = load_scores
      @category = load_category
    end

    def load_scores
      scores = R.get("scores:#{key}")
      if scores
        JSON.parse(scores)
      else
        []
      end
    end

    # The category with the highest score, or nil without scores.
    def load_category
      category_name, _score = scores.max_by { |_name, value| value }
      Category.find(category_name)
    end
  end
end
114 |
--------------------------------------------------------------------------------
/lib/medium.rb:
--------------------------------------------------------------------------------
module NewsAgg
  # A news source from config.yml: identifying key, site url, feed urls and
  # the CSS selector of its article text.
  class Medium
    attr_accessor :key, :url, :feeds, :selector

    class << self
      # Every configured medium, built once from CONFIG[:media].
      def all
        @all ||= CONFIG[:media].map { |params| new(params) }
      end
    end

    # params - Hash with the Symbol keys :key, :url, :feeds and :selector
    def initialize(params)
      @key, @url, @feeds, @selector = params.values_at(:key, :url, :feeds, :selector)
    end
  end
end
17 |
--------------------------------------------------------------------------------
/lib/newsagg.rb:
--------------------------------------------------------------------------------
1 | path = File.expand_path('../../lib', __FILE__)
2 | $:.unshift(path) if File.directory?(path) && !$:.include?(path)
3 |
4 | require_relative '../config'
5 |
module NewsAgg
  # Constant => file name; each file is loaded from the load path the first
  # time its constant is referenced.
  {
    :Category    => 'category',
    :Medium      => 'medium',
    :Item        => 'item',
    :TrainingSet => 'training_set',
    :Clusters    => 'clusters',
    :Trainer     => 'trainer',
    :Crawler     => 'crawler',
    :Parser      => 'parser',
    :Classifier  => 'classifier',
    :Clusterer   => 'clusterer'
  }.each { |const_name, file| autoload const_name, file }
end
18 |
--------------------------------------------------------------------------------
/lib/parser.rb:
--------------------------------------------------------------------------------
1 | path = File.expand_path('../../lib/parser', __FILE__)
2 | $:.unshift(path) if File.directory?(path) && !$:.include?(path)
3 |
module NewsAgg
  # Feed and page parsers, each loaded from the load path on first use.
  module Parser
    autoload(:Rss, 'rss')
    autoload(:Html, 'html')
    autoload(:Cleaner, 'cleaner')
  end
end
11 |
--------------------------------------------------------------------------------
/lib/parser/cleaner.rb:
--------------------------------------------------------------------------------
module NewsAgg
  module Parser
    # Text clean-up mixed into the parsers.
    module Cleaner

      private

      # Replaces every tab, newline and run of two or more whitespace
      # characters with a single space, then trims both ends.
      def clean_whitespace(text)
        collapsed = text.gsub(/\s{2,}|\t|\n/, ' ')
        collapsed.strip
      end
    end
  end
end
12 |
--------------------------------------------------------------------------------
/lib/parser/html.rb:
--------------------------------------------------------------------------------
1 | require 'open-uri'
2 | require 'nokogiri'
3 |
module NewsAgg
  module Parser
    # Downloads a page and extracts the text of the elements matching a CSS
    # selector.
    class Html
      include Cleaner
      attr_accessor :url, :selector

      def initialize(url, selector)
        @url = url
        @selector = selector
      end

      # Whitespace-cleaned text of every matching element, joined by single
      # spaces. Returns "" when the page cannot be fetched.
      def content
        fetch_html_elements(url).map { |element| clean_whitespace(element.text) }.join(' ')
      end

      private

      # Returns the matching elements, or [] when +url+ is not an http(s)
      # URL or the download fails.
      def fetch_html_elements(url)
        uri = http_uri(url)
        return [] unless uri

        # DEBUG: URL
        # p url

        # URI#open (open-uri) only ever performs an HTTP request; Kernel#open
        # behind an unanchored URI::regexp check would also run "|command"
        # strings or open local files. The block form closes the response.
        doc = uri.open { |page| Nokogiri::HTML(page) }
        doc.search(selector)
      rescue OpenURI::HTTPError, SocketError, Timeout::Error, SystemCallError
        # best effort: an unreachable page yields no content instead of
        # aborting the whole crawl
        []
      end

      # Parses +url+ and returns the URI when its scheme is http or https,
      # nil otherwise (including unparsable input).
      def http_uri(url)
        uri = URI.parse(url.to_s.strip)
        uri if %w[http https].include?(uri.scheme)
      rescue URI::InvalidURIError
        nil
      end
    end
  end
end
42 |
--------------------------------------------------------------------------------
/lib/parser/rss.rb:
--------------------------------------------------------------------------------
1 | require 'net/http'
2 | require 'rss/2.0'
3 |
module NewsAgg
  module Parser
    # Reads the RSS feeds of one medium and turns their entries into the
    # attribute hashes Item.new expects.
    class Rss
      FEED_LIMIT = 15 # max number of items per medium

      include Cleaner
      attr_accessor :medium_key, :feed_urls

      def initialize(medium_key, feed_urls)
        @medium_key = medium_key
        @feed_urls = feed_urls
      end

      # Entries of all feeds, newest first, limited to the number of max
      # items that can be displayed (FEED_LIMIT).
      def items
        parsed = feed_urls.flat_map do |feed_url|
          fetch_rss_items(feed_url).map { |item| parse(item) }
        end

        parsed.sort { |a, b| b['timestamp'] <=> a['timestamp'] }.first(FEED_LIMIT)
      end

      private

      def parse(item)
        object = {}
        object['medium_key'] = medium_key
        object['title'] = clean_whitespace(item.title)
        # Fall back before converting: nil.to_i is 0, which is truthy, so
        # `date.to_i || pubDate.to_i` could never reach pubDate.
        object['timestamp'] = (item.date || item.pubDate).to_i
        # TODO: clean description from HTML tags
        # object['description'] = clean_whitespace(item.description)
        object['url'] = clean_whitespace(item.link)

        # DEBUG: feed_url object
        # p object['url']

        object
      end

      # Returns the feed's entries, or [] when the URL is not http(s), the
      # request fails or the body is not a parsable feed.
      def fetch_rss_items(feed_url)
        uri = URI.parse(feed_url.to_s.strip)
        return [] unless %w[http https].include?(uri.scheme)

        response = Net::HTTP.get_response(uri)
        # redirects and error pages carry no feed
        return [] unless response.is_a?(Net::HTTPSuccess)

        feed = RSS::Parser.parse(response.body, false)
        feed ? feed.items : []
      rescue RSS::Error, URI::InvalidURIError, SocketError, Timeout::Error, SystemCallError
        # best effort: one broken feed must not abort the whole crawl
        []
      end
    end
  end
end
63 |
--------------------------------------------------------------------------------
/lib/tasks/scheduler.rake:
--------------------------------------------------------------------------------
# Rake tasks that run the batch jobs of the application.
# The library only needs to be loaded once for both tasks.
require_relative '../../lib/newsagg'

# Invokes NewsAgg::Crawler.start, printing a marker line before and after.
desc "Crawl web pages"
task :crawl do
  puts "!! Crawling start..."
  NewsAgg::Crawler.start
  puts "!! Crawling end."
end

# Invokes NewsAgg::Trainer.train, printing a marker line before and after.
desc "Training sets for classifier"
task :train do
  puts "!! Trainer start..."
  NewsAgg::Trainer.train
  puts "!! Trainer end."
end
18 |
--------------------------------------------------------------------------------
/lib/trainer.rb:
--------------------------------------------------------------------------------
module NewsAgg
  # Builds one training set per category from the category's seed pages.
  module Trainer
    # For every category, downloads the text of each seed (a Hash with the
    # keys :url and :selector), joins the texts with a single space and saves
    # the result as that category's TrainingSet.
    #
    # A category whose seeds produced no text at all is skipped, so that a
    # failed download does not replace previously stored training data with
    # blank content.
    #
    # Returns the collection produced by Category.all.
    def self.train
      Category.all.each do |category|
        contents = category.seeds.map do |seed|
          NewsAgg::Parser::Html.new(seed[:url], seed[:selector]).content
        end
        corpus = contents.join(' ')

        # Nothing but separators means no seed yielded any text.
        next if corpus.strip.empty?

        TrainingSet.new(category.name, corpus).save
      end
    end
  end
end
18 |
--------------------------------------------------------------------------------
/lib/training_set.rb:
--------------------------------------------------------------------------------
module NewsAgg
  # Training text for one category, stored as a hash under the key
  # "training_set:<category>" through the global client R.
  # NOTE(review): R is defined outside this file; it is only used here via
  # #hmset and #hgetall, presumably a Redis connection - confirm.
  class TrainingSet
    attr_accessor :category, :content

    # category - name of the category the text belongs to.
    # content  - the training text.
    def initialize(category, content)
      @category = category
      @content = content
    end

    # Writes both fields in a single hmset call, so the stored hash is never
    # observed with only one of the two fields updated.
    #
    # Returns whatever R.hmset returns.
    def save
      # DEBUG: training set
      p "saving training set... category => #{category}"
      R.hmset(self.class.key(category), 'category', category, 'content', content)
    end

    # Loads the training set stored for +category+.
    #
    # Returns a TrainingSet built from the stored 'category' and 'content'
    # fields; fields that are absent from the stored hash come back as nil.
    def self.find(category)
      params = R.hgetall(key(category))
      new(params['category'], params['content'])
    end

    # Storage key for +category+. This stays public: #save calls it with an
    # explicit receiver, and a bare `private` keyword does not apply to
    # `def self.` methods anyway.
    def self.key(category)
      "training_set:#{category}"
    end
  end
end
--------------------------------------------------------------------------------
/script/crawl.rb:
--------------------------------------------------------------------------------
# Stand-alone entry point: loads the NewsAgg library and invokes
# NewsAgg::Crawler.start.
require_relative '../lib/newsagg'

NewsAgg::Crawler.start
3 |
--------------------------------------------------------------------------------
/script/train.rb:
--------------------------------------------------------------------------------
# Stand-alone entry point: loads the NewsAgg library and invokes
# NewsAgg::Trainer.train.
require_relative '../lib/newsagg'

NewsAgg::Trainer.train
3 |
--------------------------------------------------------------------------------
/views/category.haml:
--------------------------------------------------------------------------------
1 | - if @items.empty?
2 | No news is good news.
3 | - else
4 | %ul
5 | - @items.each do |item|
6 | %li
7 | %h2
8 | = link_to item.title, item.url
9 | %span{:title => score_percentages(item.scores)}= "(#{item.medium_key})"
10 |
11 | %p= item.content.truncate(300)
12 |
--------------------------------------------------------------------------------
/views/cluster.haml:
--------------------------------------------------------------------------------
1 | - if @clusters.empty?
2 | Sorry, we don't have any clusters yet. See news categories.
3 | - else
4 | %ul
5 | - @clusters.each do |cluster|
6 | - item = cluster[0]
7 | %li
8 | %h2
9 | = link_to "#{item.title}", item.url
10 | %span{:title => score_percentages(item.scores)}= "(#{item.medium_key})"
11 | - cluster[1..-1].each do |item|
12 | %h3
13 | = link_to item.title, item.url
14 | %span{:title => score_percentages(item.scores)}= "(#{item.medium_key})"
15 |
16 | %p= item.content.truncate(300)
17 |
--------------------------------------------------------------------------------
/views/layout.haml:
--------------------------------------------------------------------------------
1 | !!!
2 | %html(xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en")
3 | %head
4 | %meta{"http-equiv" => "content-type", :content => "text/html;charset=UTF-8"}
5 | %title= "NewsAgg"
6 |
7 |
8 |
9 |
10 |
11 | %body
12 | #container
13 | %header
14 | %h1= link_to 'NewsAgg', '/'
15 |
16 | .content
17 | .menu
18 | %ul
19 | - @categories.each do |category|
20 | %li{:class => (category == @category ? 'active' : nil)}= link_to category.name.capitalize, "/?c=#{category.name}"
21 | .news
22 | = yield
23 |
24 |
--------------------------------------------------------------------------------
/views/style.sass:
--------------------------------------------------------------------------------
1 | /* http://meyerweb.com/eric/tools/css/reset/
2 | v2.0 | 20110126
3 | License: none (public domain) */
4 |
5 | html, body, div, span, applet, object, iframe,
6 | h1, h2, h3, h4, h5, h6, p, blockquote, pre,
7 | a, abbr, acronym, address, big, cite, code,
8 | del, dfn, em, img, ins, kbd, q, s, samp,
9 | small, strike, strong, sub, sup, tt, var,
10 | b, u, i, center,
11 | dl, dt, dd, ol, ul, li,
12 | fieldset, form, label, legend,
13 | table, caption, tbody, tfoot, thead, tr, th, td,
14 | article, aside, canvas, details, embed,
15 | figure, figcaption, footer, header, hgroup,
16 | menu, nav, output, ruby, section, summary,
17 | time, mark, audio, video
18 | margin: 0
19 | padding: 0
20 | border: 0
21 | font-size: 100%
22 | font: inherit
23 | vertical-align: baseline
24 |
25 | /* HTML5 display-role reset for older browsers */
26 | article, aside, details, figcaption, figure,
27 | footer, header, hgroup, menu, nav, section
28 | display: block
29 |
30 | body
31 | line-height: 1
32 |
33 | ol, ul
34 | list-style: none
35 |
36 | blockquote, q
37 | quotes: none
38 |
39 | blockquote:before, blockquote:after,
40 | q:before, q:after
41 | content: ''
42 | content: none
43 |
44 | table
45 | border-collapse: collapse
46 | border-spacing: 0
47 |
48 | body
49 | color: #333
50 | font-family: "Helvetica Neue",Helvetica,Arial,sans-serif
51 |
52 | a, a:active, a:visited
53 | color: #607890
54 | text-decoration: none
55 |
56 | a:hover
57 | color: #003366
58 | text-decoration: underline
59 |
60 | p
61 | line-height: 18px
62 | margin-bottom: 10px
63 |
64 |
65 |
66 | .content
67 | width: 960px
68 | margin: 0 auto
69 |
70 | header
71 | margin-bottom: 30px
72 | h1
73 | font-size: 30px
74 | margin-top: 20px
75 |
76 |
77 | ul
78 | li
79 | margin: 5px 0
80 |
81 | #container
82 | width: 960px
83 | margin: 0 auto
84 | overflow: hidden
85 |
86 | .menu
87 | width: 200px
88 | float: left
89 |
90 | ul
91 | li
92 | border-left: 5px solid #CCC
93 | margin: 12px 0
94 | padding: 0 6px
95 |
96 | &.active
97 | border-left: 5px solid #000 !important
98 |
99 | li:nth-child(1)
100 | border-left: 5px solid #F3BD18
101 | li:nth-child(2)
102 | border-left: 5px solid #DE6E26
103 | li:nth-child(3)
104 | border-left: 5px solid #A71D3F
105 | li:nth-child(4)
106 | border-left: 5px solid #70629A
107 | li:nth-child(5)
108 | border-left: 5px solid #0988C5
109 | li:nth-child(6)
110 | border-left: 5px solid #3A9F47
111 | li:nth-child(7)
112 | border-left: 5px solid #FF0099
113 | li:nth-child(8)
114 | border-left: 5px solid #412A1A
115 | li:nth-child(9)
116 | border-left: 5px solid #C5B8BF
117 |
118 |
119 | .news
120 | width: 760px
121 | float: left
122 |
123 | ul
124 | li
125 | margin-bottom: 20px
126 | h2
127 | margin: 4px 0
128 |
129 | h3
130 | font-size: 13px
131 | margin: 4px 0
132 |
133 | p
134 | font-size: 13px
135 |
136 | span
137 | color: #666
138 | font-size: 80%
139 |
140 | ul.cluster
141 | overflow: hidden
142 | li
143 | width: 250px
144 | float: left
145 |
--------------------------------------------------------------------------------