├── .gitignore
├── .rvmrc
├── Gemfile
├── Gemfile.lock
├── README.md
├── Rakefile
├── app.rb
├── config.rb
├── config.ru
├── config.yml
├── helpers.rb
├── lib
    ├── category.rb
    ├── classifier.rb
    ├── classifier
    │   └── stats_classifier.rb
    ├── clusterer.rb
    ├── clusterer
    │   └── stats_cluster.rb
    ├── clusters.rb
    ├── core_ext
    │   └── string.rb
    ├── crawler.rb
    ├── data
    │   └── stop_words.txt
    ├── item.rb
    ├── medium.rb
    ├── newsagg.rb
    ├── parser.rb
    ├── parser
    │   ├── cleaner.rb
    │   ├── html.rb
    │   └── rss.rb
    ├── tasks
    │   └── scheduler.rake
    ├── trainer.rb
    └── training_set.rb
├── script
    ├── crawl.rb
    └── train.rb
└── views
    ├── category.haml
    ├── cluster.haml
    ├── layout.haml
    └── style.sass


/.gitignore:
--------------------------------------------------------------------------------
1 | .sass-cache
2 | 


--------------------------------------------------------------------------------
/.rvmrc:
--------------------------------------------------------------------------------
1 | rvm ruby-1.9.2-p290@newsagg
2 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'http://rubygems.org'
2 | 
3 | gem 'sinatra'
4 | gem 'haml'
5 | gem 'sass'
6 | gem 'redis'
7 | gem 'nokogiri'
8 | 


--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
 1 | GEM
 2 |   remote: http://rubygems.org/
 3 |   specs:
 4 |     haml (3.1.3)
 5 |     nokogiri (1.5.0)
 6 |     rack (1.3.5)
 7 |     rack-protection (1.1.4)
 8 |       rack
 9 |     redis (2.2.2)
10 |     sass (3.1.10)
11 |     sinatra (1.3.1)
12 |       rack (~> 1.3, >= 1.3.4)
13 |       rack-protection (~> 1.1, >= 1.1.2)
14 |       tilt (~> 1.3, >= 1.3.3)
15 |     tilt (1.3.3)
16 | 
17 | PLATFORMS
18 |   ruby
19 | 
20 | DEPENDENCIES
21 |   haml
22 |   nokogiri
23 |   redis
24 |   sass
25 |   sinatra
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # NewsAgg
 2 | 
 3 | ## DESCRIPTION:
 4 | 
 5 |   News aggregator that classifies and clusters news from different sources.
 6 | 
 7 | ## INSTALLATION:
 8 | 
 9 |   bundle install
10 | 
11 | ## SETUP:
12 | 
13 | ### Configure categories, training sets and RSS feeds
14 | 
15 |     (edit: "config.yml" file)
16 | 
17 | ## USAGE:
18 | 
19 | ### Create training sets
20 | 
21 |     ruby script/train.rb
22 | 
23 | ### Collect & classify articles from newspapers
24 | 
25 |     ruby script/crawl.rb
26 | 
27 | ### Read news (localhost:9292)
28 | 
29 |     rackup config.ru
30 | 
31 | ## LICENSE:
32 | 
33 | (The MIT License)
34 | 
35 | Copyright (c) 2011 Dalibor Nasevic
36 | 
37 | Permission is hereby granted, free of charge, to any person obtaining
38 | a copy of this software and associated documentation files (the
39 | 'Software'), to deal in the Software without restriction, including
40 | without limitation the rights to use, copy, modify, merge, publish,
41 | distribute, sublicense, and/or sell copies of the Software, and to
42 | permit persons to whom the Software is furnished to do so, subject to
43 | the following conditions:
44 | 
45 | The above copyright notice and this permission notice shall be
46 | included in all copies or substantial portions of the Software.
47 | 
48 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
49 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
50 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
51 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
52 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
53 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
54 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
55 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require 'rake'
2 | require 'rake/testtask'
3 | require 'rake/rdoctask'
4 | 
5 | Dir["#{File.dirname(__FILE__)}/lib/tasks/**/*.rake"].sort.each { |t| load t }
6 | 


--------------------------------------------------------------------------------
/app.rb:
--------------------------------------------------------------------------------
 1 | require './config'
 2 | 
 3 | include NewsAgg
 4 | 
 5 | get '/' do
 6 |   @categories = Category.all
 7 |   @category   = Category.find(params[:c])
 8 | 
 9 |   if @category
10 |     @items = @category.recent_items
11 |     haml :category, :layout => :layout
12 |   else
13 |     @clusters = Clusters.load
14 |     haml :cluster, :layout => :layout
15 |   end
16 | end
17 | 
18 | get '/style.css' do
19 |   sass :style
20 | end
21 | 


--------------------------------------------------------------------------------
/config.rb:
--------------------------------------------------------------------------------
 1 | require "rubygems"
 2 | require "bundler/setup"
 3 | Bundler.require
 4 | 
 5 | require_relative 'helpers'
 6 | require_relative 'lib/newsagg'
 7 | require 'json'
 8 | 
 9 | 
10 | Dir["#{File.dirname(__FILE__)}/lib/core_ext/*.rb"].sort.each do |path|
11 |   require_relative "lib/core_ext/#{File.basename(path, '.rb')}"
12 | end
13 | 
14 | 
15 | # development environment
16 | ENV['RACK_ENV'] ||= 'development'
17 | configure :development do
18 |   ENV["REDISTOGO_URL"] = 'redis://localhost:6379'
19 | end
20 | 
21 | 
22 | # production environment
23 | # Heroku sets: ENV['RACK_ENV'] and ENV["REDISTOGO_URL"]
24 | 
25 | 
26 | # all environments
27 | configure do
28 |   uri = URI.parse(ENV["REDISTOGO_URL"])
29 |   R = Redis.new(:host => uri.host, :port => uri.port, :password => uri.password)
30 | end
31 | 
32 | 
33 | CONFIG = YAML::load_file(File.join(File.dirname(__FILE__), 'config.yml'))
34 | 


--------------------------------------------------------------------------------
/config.ru:
--------------------------------------------------------------------------------
1 | # Rack entry point (used by `rackup config.ru`): loads app.rb and runs
2 | # the Sinatra application defined there.
3 | require './app'
4 | 
5 | run Sinatra::Application
6 | 


--------------------------------------------------------------------------------
/config.yml:
--------------------------------------------------------------------------------
  1 | :categories:
  2 |   - :name: 'world'
  3 |     :seeds:
  4 |       - :url: 'http://en.wikipedia.org/wiki/World'
  5 |         :selector: '.mw-content-ltr'
  6 |       - :url: 'http://en.wikipedia.org/wiki/List_of_sovereign_states'
  7 |         :selector: '.mw-content-ltr'
  8 |       - :url: 'http://en.wikipedia.org/wiki/Universe'
  9 |         :selector: '.mw-content-ltr'
 10 |       - :url: 'http://en.wikipedia.org/wiki/Globe'
 11 |         :selector: '.mw-content-ltr'
 12 |   - :name: 'africa'
 13 |     :seeds:
 14 |       - :url: 'http://en.wikipedia.org/wiki/Africa'
 15 |         :selector: '.mw-content-ltr'
 16 |       - :url: 'http://en.wikipedia.org/wiki/List_of_African_countries_and_territories'
 17 |         :selector: '.mw-content-ltr'
 18 |   - :name: 'economy'
 19 |     :seeds:
 20 |       - :url: 'http://en.wikipedia.org/wiki/Economy'
 21 |         :selector: '.mw-content-ltr'
 22 |       - :url: 'http://en.wikipedia.org/wiki/Finance'
 23 |         :selector: '.mw-content-ltr'
 24 |       - :url: 'http://en.wikipedia.org/wiki/Business'
 25 |         :selector: '.mw-content-ltr'
 26 |   - :name: 'science'
 27 |     :seeds:
 28 |       - :url: 'http://en.wikipedia.org/wiki/Science'
 29 |         :selector: '.mw-content-ltr'
 30 |       - :url: 'http://en.wikipedia.org/wiki/Technology'
 31 |         :selector: '.mw-content-ltr'
 32 |   - :name: 'sport'
 33 |     :seeds:
 34 |       - :url: 'http://en.wikipedia.org/wiki/Sport'
 35 |         :selector: '.mw-content-ltr'
 36 |       - :url: 'http://en.wikipedia.org/wiki/Outline_of_sports'
 37 |         :selector: '.mw-content-ltr'
 38 |   - :name: 'health'
 39 |     :seeds:
 40 |       - :url: 'http://en.wikipedia.org/wiki/Health'
 41 |         :selector: '.mw-content-ltr'
 42 |       - :url: 'http://en.wikipedia.org/wiki/Health_care'
 43 |         :selector: '.mw-content-ltr'
 44 |   - :name: 'entertainment'
 45 |     :seeds:
 46 |       - :url: 'http://en.wikipedia.org/wiki/Entertainment'
 47 |         :selector: '.mw-content-ltr'
 48 |       - :url: 'http://en.wikipedia.org/wiki/Film'
 49 |         :selector: '.mw-content-ltr'
 50 |       - :url: 'http://en.wikipedia.org/wiki/Radio_programming'
 51 |         :selector: '.mw-content-ltr'
 52 |       - :url: 'http://en.wikipedia.org/wiki/Concert'
 53 |         :selector: '.mw-content-ltr'
 54 | 
 55 | :media:
 56 |   - :key: 'timeslive.co.za'
 57 |     :url: 'http://www.timeslive.co.za'
 58 |     :feeds:
 59 |       - 'http://avusa.feedsportal.com/c/33051/f/534658/index.rss'
 60 |     :selector: '#article .area > h3, #article .area > p, #article > h3'
 61 | 
 62 |   - :key: 'thenewage.co.za'
 63 |     :url: 'http://thenewage.co.za'
 64 |     :feeds:
 65 |       - 'http://thenewage.co.za/rss.aspx?cat_id=9'    # business
 66 |       - 'http://thenewage.co.za/rss.aspx?cat_id=1021' # science and technology
 67 |       - 'http://thenewage.co.za/rss.aspx?cat_id=1020' # world
 68 |       - 'http://thenewage.co.za/rss.aspx?cat_id=1022' # entertainment
 69 |       - 'http://thenewage.co.za/rss.aspx?cat_id=1019' # africa
 70 |       - 'http://thenewage.co.za/rss.aspx?cat_id=11'   # sport
 71 |     :selector: '#dv_story_dtls'
 72 | 
 73 |   - :key: 'sowetanlive.co.za'
 74 |     :url: 'http://www.sowetanlive.co.za'
 75 |     :feeds:
 76 |       - 'http://www.sowetanlive.co.za/?service=rss'
 77 |     :selector: '#content .articleheader h3, #content p'
 78 | 
 79 |   - :key: 'news24.com'
 80 |     :url: 'http://www.news24.com'
 81 |     :feeds:
 82 |       - 'http://feeds.news24.com/articles/Kenya/TopStories/rss'      # top stories
 83 |       - 'http://feeds.news24.com/articles/Kenya/Africa/rss'          # afrika
 84 |       - 'http://feeds.news24.com/articles/Kenya/World/rss'           # world
 85 |       - 'http://feeds.news24.com/articles/Kenya/SciTech/rss'         # sci-tech
 86 |       - 'http://feeds.24.com/articles/sport/featured/topstories/rss' # sport
 87 |       - 'http://feeds.news24.com/articles/Kenya/Entertainment/rss'   # entertainment
 88 |     :selector: '.article_body p:first'
 89 | 
 90 |   - :key: 'iol.co.za'
 91 |     :url: 'http://www.iol.co.za'
 92 |     :feeds:
 93 |       - 'http://iol.co.za/cmlink/1.640'                    # news
 94 |       - 'http://www.iol.co.za/cmlink/1.730910'             # business
 95 |       - 'http://iol.co.za/cmlink/sport-category-rss-1.704' # sport
 96 |     :selector: '.aticle_column > p'
 97 | 
 98 |   - :key: 'ewn.co.za'
 99 |     :url: 'http://www.ewn.co.za'
100 |     :feeds:
101 |       - 'http://www.ewn.co.za/Feeds/Local.aspx'         # local
102 |       - 'http://www.ewn.co.za/Feeds/World.aspx'         # world
103 |       - 'http://www.ewn.co.za/Feeds/Sport.aspx'         # sport
104 |       - 'http://www.ewn.co.za/Feeds/Entertainment.aspx' # entertainment
105 |       - 'http://www.ewn.co.za/Feeds/Breaking.aspx'      # breaking news
106 |       - 'http://www.ewn.co.za/Feeds/LatestNews.aspx'    # latest news (all categories)
107 |     :selector: '.storybodytext > p'
108 | 
109 |   - :key: 'mg.co.za'
110 |     :url: 'http://mg.co.za'
111 |     :feeds:
112 |       - 'http://mg.co.za/rss'                         # top stories
113 |       - 'http://mg.co.za/rss/national'                # national
114 |       - 'http://mg.co.za/rss/sport'                   # sport
115 |       - 'http://mg.co.za/rss/business'                # business
116 |       - 'http://mg.co.za/rss/world'                   # world
117 |       - 'http://mg.co.za/rss/africa'                  # africa
118 |       - 'http://mg.co.za/rss/and-in-other-news'       # other
119 |     :selector: '#storycontainer .article_lead, #storycontainer .article_body'
120 | 


--------------------------------------------------------------------------------
/helpers.rb:
--------------------------------------------------------------------------------
 1 | helpers do
 2 |   def link_to(text, url, opts = {})
 3 |     attributes = ""
 4 |     opts.each { |key,value| attributes << key.to_s << "=\"" << value << "\" "}
 5 |     "<a href=\"#{url}\" #{attributes}>#{text}</a>"
 6 |   end
 7 | 
 8 |   def score_percentages(scores)
 9 |     scores.sort{ |a, b| b[1] <=> a[1] }.map do |s|
10 |       "#{s[0].capitalize}: #{'%.2f' % (s[1] * 100)}%"
11 |     end.join(', ')
12 |   end
13 | end
14 | 


--------------------------------------------------------------------------------
/lib/category.rb:
--------------------------------------------------------------------------------
 1 | module NewsAgg
 2 |   class Category
 3 |     attr_accessor :name, :seeds
 4 |     CATEGORY_LIMIT = 15 # max number of items per category
 5 | 
 6 |     def initialize(params)
 7 |       @name  = params[:name]
 8 |       @seeds = params[:seeds]
 9 |     end
10 | 
11 |     # last items ordered by timestamp (redis: ordered set)
12 |     def recent_items(limit = CATEGORY_LIMIT)
13 |       keys = R.zrevrange("category:#{name}", 0, limit - 1)
14 |       @items = Item.find(keys)
15 |     end
16 | 
17 |     # find older category items (0,1,2,3,[4,5,6,7,8])
18 |     def old_items(limit = CATEGORY_LIMIT)
19 |       keys = R.zrevrange("category:#{name}", limit, -1)
20 |       @items = Item.find(keys)
21 |     end
22 | 
23 |     def add_item(item, score)
24 |       # redis sorted set: category:name
25 |       R.zadd("category:#{name}", item.timestamp, item.key)
26 |     end
27 | 
28 |     def remove_item(item)
29 |       R.zrem("category:#{name}", item.key)
30 |     end
31 | 
32 |     def self.all
33 |       @all ||= CONFIG[:categories].map { |params| new(params) }
34 |     end
35 | 
36 |     def self.find(name)
37 |       all.detect{ |c| c.name == name }
38 |     end
39 | 
40 |     # Redis on Heroku is free up to 5 mb
41 |     # keep only fews records per category
42 |     def self.clean_old_items!
43 |       # DEBUG
44 |       p "cleaning old items..."
45 | 
46 |       all.each do |category|
47 |         category.old_items.each { |item| item.destroy }
48 |       end
49 |     end
50 |   end
51 | end
52 | 


--------------------------------------------------------------------------------
/lib/classifier.rb:
--------------------------------------------------------------------------------
 1 | path = File.expand_path('../../lib/classifier', __FILE__)
 2 | $:.unshift(path) if File.directory?(path) && !$:.include?(path)
 3 | 
 4 | require 'json'
 5 | 
 6 | module NewsAgg
 7 |   module Classifier
 8 |     autoload :StatsClassifier, 'stats_classifier'
 9 | 
10 |     # TODO: classify item
11 |     def self.classify(item)
12 |       classifier = StatsClassifier.new(training_data)
13 | 
14 |       scores = classifier.scores(item.content)
15 |       category_name, score = scores.max_by{ |k,v| v }
16 | 
17 |       category = Category.find(category_name)
18 |       category.add_item(item, score)
19 |       item.add_scores(scores)
20 | 
21 |       # DEBUG: classified object
22 |       p "classifying... medium => #{item.medium_key}, key => #{item.key}, title => #{item.title}, :category => #{category.name}"
23 |     end
24 | 
25 |     private
26 |       def self.training_data
27 |         return @training_data if @training_data
28 | 
29 |         @training_data = {}
30 |         Category.all.each do |category|
31 |           @training_data[category.name] = TrainingSet.find(category.name).content
32 |         end
33 |         @training_data
34 |       end
35 |   end
36 | end
37 | 


--------------------------------------------------------------------------------
/lib/classifier/stats_classifier.rb:
--------------------------------------------------------------------------------
 1 | module NewsAgg
 2 |   module Classifier
 3 |     class StatsClassifier
 4 |       attr_accessor :training_sets
 5 | 
 6 |       def initialize(data)
 7 |         @training_sets = {}
 8 |         filename = File.join(File.dirname(__FILE__), '..', 'data', 'stop_words.txt')
 9 |         @noise_words = File.new(filename).readlines.map(&:chomp)
10 | 
11 |         train(data)
12 |       end
13 | 
14 |       def scores(text)
15 |         words = text.downcase.scan(/[a-z]+/)
16 | 
17 |         scores = {}
18 |         training_sets.each_pair do |category, word_weights|
19 |           scores[category] = score(word_weights, words)
20 |         end
21 | 
22 |         scores
23 |       end
24 | 
25 |       def train(data)
26 |         data.each_pair do |category, text|
27 |           words = text.downcase.scan(/[a-z]+/)
28 |           word_weights = Hash.new(0)
29 | 
30 |           words.each {|word| word_weights[word] += 1 unless @noise_words.index(word)}
31 | 
32 |           ratio = 1.0 / words.length
33 |           word_weights.keys.each {|key| word_weights[key] *= ratio}
34 | 
35 |           training_sets[category] = word_weights
36 |         end
37 |       end
38 | 
39 |       private
40 |         def score(word_weights, words)
41 |           score = words.inject(0) {|acc, word| acc + word_weights[word]}
42 |           1000.0 * score / words.size
43 |         end
44 |     end
45 |   end
46 | end
47 | 


--------------------------------------------------------------------------------
/lib/clusterer.rb:
--------------------------------------------------------------------------------
 1 | path = File.expand_path('../../lib/clusterer', __FILE__)
 2 | $:.unshift(path) if File.directory?(path) && !$:.include?(path)
 3 | 
 4 | require 'json'
 5 | 
 6 | module NewsAgg
 7 |   module Clusterer
 8 |     autoload :StatsCluster, 'stats_cluster'
 9 |   end
10 | end
11 | 


--------------------------------------------------------------------------------
/lib/clusterer/stats_cluster.rb:
--------------------------------------------------------------------------------
 1 | module NewsAgg
 2 |   module Clusterer
 3 |     # Word-Use Intersections
 4 |     class StatsCluster
 5 |       THRESHOLD = 0.3
 6 |       attr_accessor :noise_words
 7 | 
 8 |       def initialize
 9 |         filename = File.join(File.dirname(__FILE__), '..', 'data', 'stop_words.txt')
10 |         @noise_words = File.new(filename).readlines.map(&:chomp)
11 |       end
12 | 
13 |       def clusterize(texts)
14 |         words_matrix = texts.map { |text| text.downcase.scan(/[a-z]+/) - noise_words }
15 |         similarity_matrix = calculate_similarity_matrix(words_matrix)
16 |         calculate_clusters(similarity_matrix)
17 |       end
18 | 
19 |       # similarity score between two texts
20 |       # used only for debugging
21 |       def texts_score(text1, text2)
22 |         words1 = text1.downcase.scan(/[a-zA-Z]+/) - noise_words
23 |         words2 = text2.downcase.scan(/[a-zA-Z]+/) - noise_words
24 |         common_words = words1 & words2
25 |         p common_words
26 |         common_words.length.to_f / (words1.length + words2.length)
27 |       end
28 | 
29 |       private
30 | 
31 |         def similarity_score(words1, words2)
32 |           common_words = words1 & words2
33 |           2.0 * common_words.length.to_f / (words1.length + words2.length)
34 |         end
35 | 
36 |         def calculate_similarity_matrix(words_matrix)
37 |           # initialize similarity matrix with 0
38 |           size = words_matrix.length
39 |           similarity_matrix = size.times.map { Array.new(size, 0) }
40 | 
41 |           # calculate similarity matrix between all texts
42 |           size.times do |i|
43 |             size.times do |j|
44 |               similarity_matrix[i][j] = similarity_score(words_matrix[i], words_matrix[j])
45 |             end
46 |           end
47 | 
48 |           similarity_matrix
49 |         end
50 | 
51 |         def calculate_clusters(similarity_matrix)
52 |           clusters = []
53 |           size     = similarity_matrix.length
54 | 
55 |           size.times do |i|
56 |             similar = []
57 | 
58 |             # find similar texts to i
59 |             size.times do |j|
60 |               similar << j if j > i && similarity_matrix[i][j] > THRESHOLD
61 |             end
62 | 
63 |             # add i to array of similar texts
64 |             if similar.length > 0
65 |               similar << i
66 |               clusters << similar.sort # sort the array
67 |             end
68 |           end
69 | 
70 |           # remove redundent clusters:
71 |           clusters.size.times do |i|
72 |             clusters.size.times do |j|
73 |               if clusters[j].length < clusters[i].length
74 |                 clusters[j] = [] if (clusters[j] & clusters[i]) == clusters[j]
75 |               end
76 |             end
77 |           end
78 | 
79 |           clusters.select{ |c| c.length > 1 }
80 |         end
81 | 
82 |     end
83 |   end
84 | end
85 | 


--------------------------------------------------------------------------------
/lib/clusters.rb:
--------------------------------------------------------------------------------
 1 | require 'json'
 2 | module NewsAgg
 3 |   module Clusters
 4 |     class << self
 5 | 
 6 |       def create!
 7 |         # DEBUG
 8 |         p "clustering items..."
 9 | 
10 |         items = NewsAgg::Item.find(R.keys('item*'))
11 |         texts = items.map{|item| item.content}
12 |         clusterer = NewsAgg::Clusterer::StatsCluster.new
13 |         clusters_ids = clusterer.clusterize(texts)
14 | 
15 |         clusters_items = []
16 |         clusters_ids.each do |cluster_ids|
17 |           clusters_items << cluster_ids.map{|id| items[id].key}
18 |         end
19 | 
20 |         R['clusters'] = clusters_items.to_json
21 |       end
22 | 
23 |       def load
24 |         clusters_data = R['clusters']
25 |         if clusters_data
26 |           clusters_ids = JSON.parse(clusters_data)
27 |           clusters_ids.map{|cluster_ids| Item.find(cluster_ids)}
28 |         else
29 |           []
30 |         end
31 |       end
32 |     end
33 |   end
34 | end
35 | 
36 | 


--------------------------------------------------------------------------------
/lib/core_ext/string.rb:
--------------------------------------------------------------------------------
 1 | class String
 2 | 
 3 |   if defined?(Encoding) && "".respond_to?(:encode)
 4 |     def encoding_aware?
 5 |       true
 6 |     end
 7 |   else
 8 |     def encoding_aware?
 9 |       false
10 |     end
11 |   end
12 | 
13 |   # 0x3000: fullwidth whitespace
14 |   NON_WHITESPACE_REGEXP = %r![^\s#{[0x3000].pack("U")}]!
15 | 
16 |   # A string is blank if it's empty or contains whitespaces only:
17 |   #
18 |   #   "".blank?                 # => true
19 |   #   "   ".blank?              # => true
20 |   #   "　".blank?               # => true
21 |   #   " something here ".blank? # => false
22 |   #
23 |   def blank?
24 |     # 1.8 does not takes [:space:] properly
25 |     if encoding_aware?
26 |       self !~ /[^[:space:]]/
27 |     else
28 |       self !~ NON_WHITESPACE_REGEXP
29 |     end
30 |   end
31 | 
32 |   def truncate(length=300)
33 |     text = self.dup
34 |     text[0...length] + '...'
35 |   end
36 | end
37 | 


--------------------------------------------------------------------------------
/lib/crawler.rb:
--------------------------------------------------------------------------------
 1 | module NewsAgg
 2 |   class Crawler
 3 |     attr_accessor :medium
 4 | 
 5 |     def initialize(medium)
 6 |       @medium = medium
 7 |     end
 8 | 
 9 |     def process
10 |       feed_items.each do |feed_item|
11 |         item = Item.new(feed_item)
12 | 
13 |         unless item.exists?
14 |           parser  = NewsAgg::Parser::Html.new(item.url, medium.selector)
15 |           item.content = parser.content
16 |           item.save
17 |         end
18 |       end
19 | 
20 |       Category.clean_old_items!
21 |     end
22 | 
23 |     def self.start
24 |       Medium.all.each do |medium|
25 |         crawler = Crawler.new(medium)
26 |         crawler.process
27 |       end
28 | 
29 |       Clusters.create!
30 |     end
31 | 
32 |     private
33 |       def feed_items
34 |         parser = NewsAgg::Parser::Rss.new(medium.key, medium.feeds)
35 |         parser.items
36 |       end
37 |   end
38 | end
39 | 


--------------------------------------------------------------------------------
/lib/data/stop_words.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | about
  3 | above
  4 | across
  5 | after
  6 | afterwards
  7 | again
  8 | against
  9 | all
 10 | almost
 11 | alone
 12 | along
 13 | already
 14 | also
 15 | although
 16 | always
 17 | am
 18 | among
 19 | amongst
 20 | amoungst
 21 | amount
 22 | an
 23 | and
 24 | another
 25 | any
 26 | anyhow
 27 | anyone
 28 | anything
 29 | anyway
 30 | anywhere
 31 | are
 32 | around
 33 | as
 34 | at
 35 | back
 36 | be
 37 | became
 38 | because
 39 | become
 40 | becomes
 41 | becoming
 42 | been
 43 | before
 44 | beforehand
 45 | behind
 46 | being
 47 | below
 48 | beside
 49 | besides
 50 | between
 51 | beyond
 52 | bill
 53 | both
 54 | bottom
 55 | but
 56 | by
 57 | call
 58 | can
 59 | cannot
 60 | cant
 61 | co
 62 | computer
 63 | con
 64 | could
 65 | couldnt
 66 | cry
 67 | de
 68 | describe
 69 | detail
 70 | do
 71 | done
 72 | down
 73 | due
 74 | during
 75 | each
 76 | eg
 77 | eight
 78 | either
 79 | eleven
 80 | else
 81 | elsewhere
 82 | empty
 83 | enough
 84 | etc
 85 | even
 86 | ever
 87 | every
 88 | everyone
 89 | everything
 90 | everywhere
 91 | except
 92 | few
 93 | fifteen
 94 | fify
 95 | fill
 96 | find
 97 | fire
 98 | first
 99 | five
100 | for
101 | former
102 | formerly
103 | forty
104 | found
105 | four
106 | from
107 | front
108 | full
109 | further
110 | get
111 | give
112 | go
113 | had
114 | has
115 | hasnt
116 | have
117 | he
118 | hence
119 | her
120 | here
121 | hereafter
122 | hereby
123 | herein
124 | hereupon
125 | hers
126 | herself
127 | him
128 | himself
129 | his
130 | how
131 | however
132 | hundred
133 | i
134 | ie
135 | if
136 | in
137 | inc
138 | indeed
139 | interest
140 | into
141 | is
142 | it
143 | its
144 | itself
145 | keep
146 | last
147 | latter
148 | latterly
149 | least
150 | less
151 | ltd
152 | made
153 | many
154 | may
155 | me
156 | meanwhile
157 | might
158 | mill
159 | mine
160 | more
161 | moreover
162 | most
163 | mostly
164 | move
165 | much
166 | must
167 | my
168 | myself
169 | name
170 | namely
171 | neither
172 | never
173 | nevertheless
174 | next
175 | nine
176 | no
177 | nobody
178 | none
179 | noone
180 | nor
181 | not
182 | nothing
183 | now
184 | nowhere
185 | of
186 | off
187 | often
188 | on
189 | once
190 | one
191 | only
192 | onto
193 | or
194 | other
195 | others
196 | otherwise
197 | our
198 | ours
199 | ourselves
200 | out
201 | over
202 | own
203 | part
204 | per
205 | perhaps
206 | please
207 | put
208 | rather
209 | re
210 | same
211 | see
212 | seem
213 | seemed
214 | seeming
215 | seems
216 | serious
217 | several
218 | she
219 | should
220 | show
221 | side
222 | since
223 | sincere
224 | six
225 | sixty
226 | so
227 | some
228 | somehow
229 | someone
230 | something
231 | sometime
232 | sometimes
233 | somewhere
234 | still
235 | such
236 | system
237 | take
238 | ten
239 | than
240 | that
241 | the
242 | their
243 | them
244 | themselves
245 | then
246 | thence
247 | there
248 | thereafter
249 | thereby
250 | therefore
251 | therein
252 | thereupon
253 | these
254 | they
255 | thick
256 | thin
257 | third
258 | this
259 | those
260 | though
261 | three
262 | through
263 | throughout
264 | thru
265 | thus
266 | to
267 | together
268 | too
269 | top
270 | toward
271 | towards
272 | twelve
273 | twenty
274 | two
275 | un
276 | under
277 | until
278 | up
279 | upon
280 | us
281 | very
282 | via
283 | was
284 | we
285 | well
286 | were
287 | what
288 | whatever
289 | when
290 | whence
291 | whenever
292 | where
293 | whereafter
294 | whereas
295 | whereby
296 | wherein
297 | whereupon
298 | wherever
299 | whether
300 | which
301 | while
302 | whither
303 | who
304 | whoever
305 | whole
306 | whom
307 | whose
308 | why
309 | will
310 | with
311 | within
312 | without
313 | would
314 | yet
315 | you
316 | your
317 | yours
318 | yourself
319 | yourselves
320 | 


--------------------------------------------------------------------------------
/lib/item.rb:
--------------------------------------------------------------------------------
  1 | module NewsAgg
  2 |   class Item
  3 |     attr_accessor :medium_key, :title, :timestamp,
  4 |       :url, :content, :scores, :category
  5 | 
  6 |     def initialize(params)
  7 |       @medium_key  = params['medium_key']
  8 |       @title       = params['title']
  9 |       @timestamp   = params['timestamp']
 10 |       @url         = params['url']
 11 |       # @description = params['description']
 12 |       @content     = params['content']
 13 | 
 14 |       load_associations
 15 |     end
 16 | 
 17 |     def save
 18 |       # persist the item only if it has content. some media items are
 19 |       # displayed in their feeds but they cannot be accessed on the site (!?)
 20 |       return if content.blank?
 21 | 
 22 |       unless exists?
 23 |         save_item
 24 |         Classifier.classify(self)
 25 |       else
 26 |         # use different timestamp if other item exists with the same timestamp
 27 |         unless title_same?
 28 |           timestamp += 1
 29 |           # save_item
 30 |           save
 31 |         end
 32 |       end
 33 |     end
 34 | 
 35 |     def exists?
 36 |       R.exists(key)
 37 |     end
 38 | 
 39 |     def title_same?
 40 |       R.hget(key, 'title') == title
 41 |     end
 42 | 
 43 |     def key
 44 |       # don't cache this string,
 45 |       # timestamp change is used to produce original key
 46 |       "item:#{medium_key}:#{timestamp}"
 47 |     end
 48 | 
 49 |     def add_scores(scores)
 50 |       # redis: string
 51 |       R.set("scores:#{key}", scores.to_json)
 52 |     end
 53 | 
 54 |     def destroy
 55 |       R.multi do
 56 |         category.remove_item(self)
 57 |         remove_scores
 58 |         delete_item
 59 |         # TODO: remove item from clusters
 60 |       end
 61 |     end
 62 | 
 63 |     def self.find(key)
 64 |       if key.is_a?(Array)
 65 |         key.map { |k| Item.find(k) }
 66 |       else
 67 |         new(R.hgetall(key))
 68 |       end
 69 |     end
 70 | 
 71 |     private
 72 | 
 73 |       def save_item
 74 |         # DEBUG: saved object
 75 |         p "saving... medium => #{medium_key}, key => #{key}, title => #{title}"
 76 | 
 77 |         k = key
 78 |         R.hmset(k, 'medium_key', medium_key)
 79 |         R.hmset(k, 'title', title)
 80 |         R.hmset(k, 'timestamp', timestamp)
 81 |         R.hmset(k, 'url', url)
 82 |         # R.hmset(k, 'description', description)
 83 |         R.hmset(k, 'content', content)
 84 |       end
 85 | 
 86 |       def delete_item
 87 |         R.del(key)
 88 |       end
 89 | 
 90 |       def remove_scores
 91 |         R.del("scores:#{key}")
 92 |       end
 93 | 
 94 |       def load_associations
 95 |         @scores   = load_scores
 96 |         @category = load_category
 97 |       end
 98 | 
 99 |       def load_scores
100 |         scores = R.get("scores:#{key}")
101 |         if scores
102 |           JSON.parse(scores)
103 |         else
104 |           []
105 |         end
106 |       end
107 | 
108 |       def load_category
109 |         category_name, score = scores.max_by{ |k,v| v }
110 |         Category.find(category_name)
111 |       end
112 |   end
113 | end
114 | 


--------------------------------------------------------------------------------
/lib/medium.rb:
--------------------------------------------------------------------------------
 1 | module NewsAgg
 2 |   class Medium
 3 |     attr_accessor :key, :url, :feeds, :selector
 4 | 
 5 |     def initialize(params)
 6 |       @key      = params[:key]
 7 |       @url      = params[:url]
 8 |       @feeds    = params[:feeds]
 9 |       @selector = params[:selector]
10 |     end
11 | 
12 |     def self.all
13 |       @all ||= CONFIG[:media].map { |params| new(params) }
14 |     end
15 |   end
16 | end
17 | 


--------------------------------------------------------------------------------
/lib/newsagg.rb:
--------------------------------------------------------------------------------
 1 | path = File.expand_path('../../lib', __FILE__)
 2 | $:.unshift(path) if File.directory?(path) && !$:.include?(path)
 3 | 
 4 | require_relative '../config'
 5 | 
 6 | module NewsAgg
 7 |   autoload :Category,    'category'
 8 |   autoload :Medium,      'medium'
 9 |   autoload :Item,        'item'
10 |   autoload :TrainingSet, 'training_set'
11 |   autoload :Clusters,    'clusters'
12 |   autoload :Trainer,     'trainer'
13 |   autoload :Crawler,     'crawler'
14 |   autoload :Parser,      'parser'
15 |   autoload :Classifier,  'classifier'
16 |   autoload :Clusterer,   'clusterer'
17 | end
18 | 


--------------------------------------------------------------------------------
/lib/parser.rb:
--------------------------------------------------------------------------------
 1 | path = File.expand_path('../../lib/parser', __FILE__)
 2 | $:.unshift(path) if File.directory?(path) && !$:.include?(path)
 3 | 
 4 | module NewsAgg
 5 |   module Parser
 6 |     autoload :Rss,     'rss'
 7 |     autoload :Html,    'html'
 8 |     autoload :Cleaner, 'cleaner'
 9 |   end
10 | end
11 | 


--------------------------------------------------------------------------------
/lib/parser/cleaner.rb:
--------------------------------------------------------------------------------
 1 | module NewsAgg
 2 |   module Parser
 3 |     module Cleaner
 4 | 
 5 |       private
 6 |         def clean_whitespace(text)
 7 |           text.gsub(/\s{2,}|\t|\n/, ' ').strip
 8 |         end
 9 |     end
10 |   end
11 | end
12 | 


--------------------------------------------------------------------------------
/lib/parser/html.rb:
--------------------------------------------------------------------------------
 1 | require 'open-uri'
 2 | require 'nokogiri'
 3 | 
 4 | module NewsAgg
 5 |   module Parser
 6 |     class Html
 7 |       include Cleaner
 8 |       attr_accessor :url, :selector
 9 | 
10 |       def initialize(url, selector)
11 |         @url      = url
12 |         @selector = selector
13 |       end
14 | 
15 |       def content
16 |         content = []
17 |         html_elements = fetch_html_elements(url)
18 |         html_elements.each { |element| content << clean_whitespace(element.text) }
19 |         content.join(' ')
20 |       end
21 | 
22 |       private
23 | 
24 |         def fetch_html_elements(url)
25 |           if url =~ URI::regexp
26 |             # TODO: handle exceptions properly
27 |             begin
28 |               # DEBUG: URL
29 |               # p url
30 |               doc = Nokogiri::HTML(open(url))
31 |               doc.search(selector)
32 |             rescue OpenURI::HTTPError
33 |               []
34 |             end
35 |           else
36 |             []
37 |           end
38 |         end
39 |     end
40 |   end
41 | end
42 | 


--------------------------------------------------------------------------------
/lib/parser/rss.rb:
--------------------------------------------------------------------------------
 1 | require 'net/http'
 2 | require 'rss/2.0'
 3 | 
 4 | module NewsAgg
 5 |   module Parser
 6 |     class Rss
 7 |       FEED_LIMIT = 15 # max number of items per medium
 8 | 
 9 |       include Cleaner
10 |       attr_accessor :medium_key, :feed_urls
11 | 
12 |       def initialize(medium_key, feed_urls)
13 |         @medium_key = medium_key
14 |         @feed_urls = feed_urls
15 |       end
16 | 
17 |       def items
18 |         items = []
19 |         feed_urls.each do |feed_url|
20 |           rss_items = fetch_rss_items(feed_url)
21 |           rss_items.each { |item| items << parse(item) }
22 |         end
23 |         items
24 | 
25 |         # limit the items per medium to the number
26 |         # of max items that can be displayed
27 |         items = items.sort{ |a, b| b['timestamp'] <=> a['timestamp'] }.first(FEED_LIMIT)
28 |       end
29 | 
30 |       private
31 |         def parse(item)
32 |           object = {}
33 |           object['medium_key']  = medium_key
34 |           object['title']       = clean_whitespace(item.title)
35 |           object['timestamp']   = item.date.to_i || item.pubDate.to_i
36 |           # TODO: clean description from HTML tags
37 |           # object['description'] = clean_whitespace(item.description)
38 |           object['url']         = clean_whitespace(item.link)
39 | 
40 |           # DEBUG: feed_url object
41 |           # p object['url']
42 | 
43 |           object
44 |         end
45 | 
46 |         def fetch_rss_items(feed_url)
47 |           if feed_url =~ URI::regexp
48 |             begin
49 |               # TODO: handle exceptions properly
50 |               uri = URI.parse(feed_url)
51 |               response = Net::HTTP.get_response(uri)
52 |               RSS::Parser.parse(response.body, false).items
53 |             rescue OpenURI::HTTPError
54 |               []
55 |             end
56 |           else
57 |             []
58 |           end
59 |         end
60 |     end
61 |   end
62 | end
63 | 


--------------------------------------------------------------------------------
/lib/tasks/scheduler.rake:
--------------------------------------------------------------------------------
 1 | require_relative '../../lib/newsagg'
 2 | 
# Runs NewsAgg::Crawler.start, printing a marker line before and after.
desc "Crawl web pages"
task :crawl do
  puts "!! Crawling start..."
  NewsAgg::Crawler.start
  puts "!! Crawling end."
end
 9 | 
10 | require_relative '../../lib/newsagg'
11 | 
# Runs NewsAgg::Trainer.train, printing a marker line before and after.
desc "Training sets for classifier"
task :train do
  puts "!! Trainer start..."
  NewsAgg::Trainer.train
  puts "!! Trainer end."
end
18 | 


--------------------------------------------------------------------------------
/lib/trainer.rb:
--------------------------------------------------------------------------------
 1 | module NewsAgg
 2 |   module Trainer
 3 |     def self.train
 4 |       Category.all.each do |category|
 5 |         contents = []
 6 | 
 7 |         category.seeds.each do |seed|
 8 |           parser = NewsAgg::Parser::Html.new(seed[:url], seed[:selector])
 9 |           contents << parser.content
10 |         end
11 | 
12 |         training_set = TrainingSet.new(category.name, contents.join(' '))
13 |         training_set.save
14 |       end
15 |     end
16 |   end
17 | end
18 | 


--------------------------------------------------------------------------------
/lib/training_set.rb:
--------------------------------------------------------------------------------
 1 | module NewsAgg
 2 |   class TrainingSet
 3 |     attr_accessor :category, :content
 4 | 
 5 |     def initialize(category, content)
 6 |       @category = category
 7 |       @content  = content
 8 |     end
 9 | 
10 |     def save
11 |       # DEBUG: training set
12 |       p "saving training set... category => #{category}"
13 |       k = self.class.key(category)
14 |       R.hmset(k, 'category', category)
15 |       R.hmset(k, 'content', content)
16 |     end
17 | 
18 |     def self.find(category)
19 |       params = R.hgetall(key(category))
20 |       TrainingSet.new(params['category'], params['content'])
21 |     end
22 | 
23 |     private
24 |       def self.key(category)
25 |         "training_set:#{category}"
26 |       end
27 |   end
28 | end
29 | 


--------------------------------------------------------------------------------
/script/crawl.rb:
--------------------------------------------------------------------------------
# Stand-alone entry point: loads the library and starts the crawler.
require_relative '../lib/newsagg'
NewsAgg::Crawler.start
3 | 


--------------------------------------------------------------------------------
/script/train.rb:
--------------------------------------------------------------------------------
# Stand-alone entry point: loads the library and runs the trainer.
require_relative '../lib/newsagg'
NewsAgg::Trainer.train
3 | 


--------------------------------------------------------------------------------
/views/category.haml:
--------------------------------------------------------------------------------
 1 | - if @items.empty?
 2 |   No news is good news.
 3 | - else
 4 |   %ul
 5 |     - @items.each do |item|
 6 |       %li
 7 |         %h2
 8 |           = link_to item.title, item.url
 9 |           %span{:title => score_percentages(item.scores)}= "(#{item.medium_key})"
10 | 
11 |         %p= item.content.truncate(300)
12 | 


--------------------------------------------------------------------------------
/views/cluster.haml:
--------------------------------------------------------------------------------
 1 | - if @clusters.empty?
 2 |   Sorry, we don't have any clusters yet. See news categories.
 3 | - else
 4 |   %ul
 5 |     - @clusters.each do |cluster|
 6 |       - item = cluster[0]
 7 |       %li
 8 |         %h2
 9 |           = link_to "#{item.title}", item.url
10 |           %span{:title => score_percentages(item.scores)}= "(#{item.medium_key})"
11 |         - cluster[1..-1].each do |item|
12 |           %h3
13 |             = link_to item.title, item.url
14 |             %span{:title => score_percentages(item.scores)}= "(#{item.medium_key})"
15 | 
16 |         %p= item.content.truncate(300)
17 | 


--------------------------------------------------------------------------------
/views/layout.haml:
--------------------------------------------------------------------------------
-# Page layout: document head, a category menu built from @categories
-# and the rendered view inside .news.
!!!
%html(xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en")
  %head
    %meta{"http-equiv" => "content-type", :content => "text/html;charset=UTF-8"}
    %title= "NewsAgg"
    <meta name="keywords" content="news" />
    <meta name="description" content="News aggregator" />
    <meta name="author" content="Dalibor Nasevic" />
    <link href='/style.css' media='screen' rel='stylesheet' type='text/css' />
    <script src='//ajax.googleapis.com/ajax/libs/jquery/1.6.3/jquery.min.js' type='text/javascript'></script>
  %body
    #container
      %header
        %h1= link_to 'NewsAgg', '/'

      .content
        .menu
          %ul
            -# The entry equal to @category gets the 'active' class; each entry links to /?c=<category name>.
            - @categories.each do |category|
              %li{:class => (category == @category ? 'active' : nil)}= link_to category.name.capitalize, "/?c=#{category.name}"
        .news
          -# The rendered view is inserted here.
          = yield
23 | 
24 | 


--------------------------------------------------------------------------------
/views/style.sass:
--------------------------------------------------------------------------------
  1 | /* http://meyerweb.com/eric/tools/css/reset/
  2 |    v2.0 | 20110126
  3 |    License: none (public domain) */
  4 | 
// Zero margin, padding and border and inherit the font on the listed elements.
html, body, div, span, applet, object, iframe,
h1, h2, h3, h4, h5, h6, p, blockquote, pre,
a, abbr, acronym, address, big, cite, code,
del, dfn, em, img, ins, kbd, q, s, samp,
small, strike, strong, sub, sup, tt, var,
b, u, i, center,
dl, dt, dd, ol, ul, li,
fieldset, form, label, legend,
table, caption, tbody, tfoot, thead, tr, th, td,
article, aside, canvas, details, embed,
figure, figcaption, footer, header, hgroup,
menu, nav, output, ruby, section, summary,
time, mark, audio, video
  margin: 0
  padding: 0
  border: 0
  font-size: 100%
  font: inherit
  vertical-align: baseline

/* HTML5 display-role reset for older browsers */
article, aside, details, figcaption, figure,
footer, header, hgroup, menu, nav, section
  display: block

body
  line-height: 1

// Lists carry no bullets or numbers.
ol, ul
  list-style: none

// No automatic quotation marks around quotes.
blockquote, q
  quotes: none

blockquote:before, blockquote:after,
q:before, q:after
  content: ''
  content: none

table
  border-collapse: collapse
  border-spacing: 0
 47 | 
// Base text colour and font.
body
  color: #333
  font-family: "Helvetica Neue",Helvetica,Arial,sans-serif

// Links: same colour in every state except hover, underlined only on hover.
a, a:active, a:visited
  color: #607890
  text-decoration: none

a:hover
  color: #003366
  text-decoration: underline

p
  line-height: 18px
  margin-bottom: 10px



// Fixed-width, centred content column.
.content
  width: 960px
  margin: 0 auto

header
  margin-bottom: 30px
  h1
    font-size: 30px
    margin-top: 20px


ul
  li
    margin: 5px 0

// Page container: a 200px menu and a 760px news column, both floated left.
#container
  width: 960px
  margin: 0 auto
  overflow: hidden

  .menu
    width: 200px
    float: left

    ul
      li
        border-left: 5px solid #CCC
        margin: 12px 0
        padding: 0 6px

        // !important lets the active marker win over the per-position colours below.
        &.active
          border-left: 5px solid #000 !important

      // One border colour per menu position (first nine entries).
      li:nth-child(1)
        border-left: 5px solid #F3BD18
      li:nth-child(2)
        border-left: 5px solid #DE6E26
      li:nth-child(3)
        border-left: 5px solid #A71D3F
      li:nth-child(4)
        border-left: 5px solid #70629A
      li:nth-child(5)
        border-left: 5px solid #0988C5
      li:nth-child(6)
        border-left: 5px solid #3A9F47
      li:nth-child(7)
        border-left: 5px solid #FF0099
      li:nth-child(8)
        border-left: 5px solid #412A1A
      li:nth-child(9)
        border-left: 5px solid #C5B8BF


  .news
    width: 760px
    float: left

    ul
      li
        margin-bottom: 20px
        h2
          margin: 4px 0

        h3
          font-size: 13px
          margin: 4px 0

        p
          font-size: 13px

      span
        color: #666
        font-size: 80%

    // Lists with the "cluster" class lay their entries out in 250px floated columns.
    ul.cluster
      overflow: hidden
      li
        width: 250px
        float: left
145 | 


--------------------------------------------------------------------------------