├── install
├── app
│   ├── __init__.py
│   ├── twitter
│   │   ├── __init__.py
│   │   ├── logging.conf
│   │   ├── platform.ini
│   │   ├── tweetstream.py
│   │   ├── preprocess.py
│   │   ├── tweetprocessing.py
│   │   └── mongoBatchInsert.py
│   ├── static
│   │   ├── glyphicons-halflings-regular.eot
│   │   ├── glyphicons-halflings-regular.ttf
│   │   ├── glyphicons-halflings-regular.woff
│   │   ├── glyphicons-halflings-regular.woff2
│   │   ├── style.css
│   │   ├── npm.js
│   │   └── bootstrap-theme.css
│   ├── templates
│   │   ├── login.html
│   │   ├── admin_login.html
│   │   ├── index.html
│   │   ├── create.html
│   │   ├── setup.html
│   │   ├── admin_home.html
│   │   ├── _macros.html
│   │   ├── home.html
│   │   ├── network_home.html
│   │   ├── update_collector.html
│   │   ├── new_collector.html
│   │   ├── collector.html
│   │   └── base.html
│   ├── decorators.py
│   ├── tasks.py
│   ├── forms.py
│   ├── controller.py
│   └── processes.py
├── run.py
├── scripts
│   └── data_pull
│       ├── tweetIDs.txt
│       ├── ConfigNoCond.py
│       ├── ConfigCond.py
│       ├── Tweet-QueryScript-NoCond.py
│       └── Tweet-QueryScript-Cond.py
├── run.wsgi
├── requirements.txt
├── .gitignore
├── LICENSE
├── license.txt
├── config.py
├── setup.py
├── README.md
├── INSTALL.md
└── __main__.py
/install:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | pip install -r requirements.txt
4 | crontab scripts/crontab.txt
5 |
--------------------------------------------------------------------------------
/app/twitter/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | module_dir = os.path.dirname(os.path.abspath(__file__))  # absolute path to this package directory
3 |
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import os
3 | from app import app
4 |
5 | if __name__ == '__main__':
6 | app.run(debug=True)
--------------------------------------------------------------------------------
/app/static/glyphicons-halflings-regular.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bitslabsyr/stack/HEAD/app/static/glyphicons-halflings-regular.eot
--------------------------------------------------------------------------------
/app/static/glyphicons-halflings-regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bitslabsyr/stack/HEAD/app/static/glyphicons-halflings-regular.ttf
--------------------------------------------------------------------------------
/app/static/glyphicons-halflings-regular.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bitslabsyr/stack/HEAD/app/static/glyphicons-halflings-regular.woff
--------------------------------------------------------------------------------
/app/static/glyphicons-halflings-regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bitslabsyr/stack/HEAD/app/static/glyphicons-halflings-regular.woff2
--------------------------------------------------------------------------------
/scripts/data_pull/tweetIDs.txt:
--------------------------------------------------------------------------------
1 | tweet_id
2 | ID_757604886141820928
3 | ID_757605167818608641
4 | ID_757606596839079936
5 | ID_757606903753084928
6 | ID_801585712516710400
7 |
--------------------------------------------------------------------------------
/run.wsgi:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import sys
3 | import logging
4 | import os
5 |
6 | logging.basicConfig(stream=sys.stderr)
7 | sys.path.insert(0, "/var/www/stack/")
8 |
9 | from app import app as application
10 |
11 | application.secret_key = 'This is a static key for production and will change.'
--------------------------------------------------------------------------------
/app/static/style.css:
--------------------------------------------------------------------------------
1 | #messages-wrap {
2 | padding: 0px 20px;
3 | }
4 |
5 | main {
6 | padding: 0px 20px;
7 | }
8 |
9 | /* Project Account Home Panel */
10 | .project-home .network-buttons {
11 | padding-top: 10px;
12 | }
13 |
14 | .project-home .network-buttons > div {
15 | text-align: center;
16 | }
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | from flask import Flask
3 | from celery import Celery
4 |
5 | # Init and config app
6 | app = Flask(__name__)
7 | app.config.from_object('config')
8 |
9 | # Init and config celery
10 | celery = Celery(app.name, broker=app.config['CELERY_BROKER_URL'])
11 | celery.conf.update(app.config)
12 |
13 | from app import views
14 |
--------------------------------------------------------------------------------
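A small note on the config line above: Flask's from_object('config') imports the top-level config.py module and copies every UPPERCASE attribute into app.config. A sketch of reading values back, assuming the defaults shown in config.py later in this dump:

    from app import app

    print app.config['DEBUG']              # False, from config.py
    print app.config['CELERY_BROKER_URL']  # 'amqp://'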
/requirements.txt:
--------------------------------------------------------------------------------
1 | amqp==1.4.6
2 | anyjson==0.3.3
3 | billiard==3.3.0.19
4 | celery==3.1.17
5 | configparser==3.3.0r2
6 | DateTime==4.0.1
7 | Flask==0.10.1
8 | Flask-WTF==0.11
9 | itsdangerous==0.24
10 | kombu==3.0.30
11 | logging==0.4.9.6
12 | MarkupSafe==0.23
13 | pymongo==2.7.2
14 | pytz==2014.7
15 | sendgrid==1.4.2
16 | simplejson==3.6.3
17 | smtpapi==0.2.0
18 | tweepy==2.3.0
19 | Werkzeug==0.9.6
20 | wheel==0.24.0
21 | WTForms==2.0.2
22 | zope.interface==4.1.1
23 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # STACKS Files & Directories
2 | *_raw_tweets
3 | raw_tweets_*
4 | raw_tweets
5 | *_insert_queue
6 | insert_queue_*
7 | insert_queue
8 | *_tweet_archive
9 | tweet_archive_*
10 | tweet_archive
11 | error_tweets
12 | error_tweets.txt
13 | error_inserted_tweets
14 | error_inserted_tweets.txt
15 | controller_old.py
16 | test.terms
17 | test.ini
18 | py_modules_list.txt
19 | old_collection.terms
20 | test.py
21 | out
22 | data
23 | logs
24 | stack
25 |
26 |
27 | # Python Files & Dirs
28 | .idea
29 | *.pyc
30 | env
31 |
--------------------------------------------------------------------------------
/app/static/npm.js:
--------------------------------------------------------------------------------
1 | // This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment.
2 | require('../../js/transition.js')
3 | require('../../js/alert.js')
4 | require('../../js/button.js')
5 | require('../../js/carousel.js')
6 | require('../../js/collapse.js')
7 | require('../../js/dropdown.js')
8 | require('../../js/modal.js')
9 | require('../../js/tooltip.js')
10 | require('../../js/popover.js')
11 | require('../../js/scrollspy.js')
12 | require('../../js/tab.js')
13 | require('../../js/affix.js')
--------------------------------------------------------------------------------
/app/templates/login.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 |
4 |
5 |
6 |
Login
7 |
8 |
9 | {% from "_macros.html" import form_field %}
10 |
17 |
18 |
19 |
20 | {% endblock %}
21 |
--------------------------------------------------------------------------------
/app/templates/admin_login.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 |
4 |
5 |
6 |
Admin Login
7 |
8 |
9 | {% from "_macros.html" import form_field %}
10 |
17 |
18 |
19 |
20 | {% endblock %}
21 |
--------------------------------------------------------------------------------
/app/templates/index.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 |
4 |
5 |
6 |
STACKS
7 |
Social Media Tracker, Analyzer, and Collector Toolkit at Syracuse University
8 |
9 |
STACKS is a social data collection and analysis toolkit geared towards researchers. With STACKS you can collect data
10 | from public social networks, all for free. STACKS is built and maintained at the Syracuse University
11 | iSchool . Follow STACKS on Github
12 | for regular development updates.
13 |
14 |
15 | {% endblock %}
16 |
--------------------------------------------------------------------------------
/app/templates/create.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 |
4 |
5 |
6 |
Create A New Account
7 |
8 |
9 | {% from "_macros.html" import form_field %}
10 |
20 |
21 |
22 | {% endblock %}
23 |
--------------------------------------------------------------------------------
/app/templates/setup.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 |
4 |
5 |
6 |
Create An Admin Account
7 |
8 |
Welcome to STACKS. Before you begin, please make an administrator account. With an admin account you'll be able
9 | to create new projects and manage collections.
10 |
11 |
12 |
13 | {% from "_macros.html" import form_field %}
14 |
22 |
23 |
24 |
25 | {% endblock %}
26 |
--------------------------------------------------------------------------------
/app/twitter/logging.conf:
--------------------------------------------------------------------------------
1 | [loggers]
2 | keys=root
3 |
4 | [logger_root]
5 | level=INFO
6 | handlers=timedRotatingFileHandler
7 |
8 | [formatters]
9 | keys=timedRotatingFormatter
10 |
11 | [formatter_timedRotatingFormatter]
12 | format=%(asctime)s %(name)-12s %(levelname)-8s %(message)s
13 | datefmt=%m-%d %H:%M
14 |
15 | [handlers]
16 | keys=timedRotatingFileHandler
17 |
18 | [handler_timedRotatingFileHandler]
19 | class=handlers.TimedRotatingFileHandler
20 | level=INFO
21 | formatter=timedRotatingFormatter
22 | args=('./logs/log.out', 'M', 1, 30, None, False, False)
23 |
24 | # (filename, when='h', interval=1, backupCount=0, encoding=None, delay=False, utc=False)
25 | #
26 | # If backupCount is nonzero, at most backupCount files will be kept, and if more would be
27 | # created when rollover occurs, the oldest one is deleted. The deletion logic uses the interval
28 | # to determine which files to delete, so changing the interval may leave old files lying around.
29 | #
30 | # If delay is true, then file opening is deferred until the first call to emit().
31 |
--------------------------------------------------------------------------------
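A note on wiring this file up: the collectors load it with the standard library's logging.config module (tweetstream.py imports logging.config below). A minimal sketch, assuming the process starts from a directory containing logging.conf and a ./logs/ folder to match the handler's output path:

    import logging
    import logging.config

    # Parse the [loggers]/[handlers]/[formatters] sections and configure the root logger
    logging.config.fileConfig('logging.conf')
    logger = logging.getLogger()  # root logger, INFO level per the config
    logger.info('collector started')  # written to ./logs/log.out, rotated per the args line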
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2014 SoMe Lab @ UW
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/scripts/data_pull/ConfigNoCond.py:
--------------------------------------------------------------------------------
1 | ## Database ##
2 | DB_AUTH = {
3 | 'DB_IS_AUTH': False, # True if DB is password protected otherwise False
4 | 'DB_USERNAME': 'DB_USERNAME',
5 | 'DB_PASSWORD': 'DB_PASSWORD'
6 | }
7 | DB = {
8 | 'DB_NAME': 'DB_NAME',
9 | 'COL_NAME': 'tweets'
10 | }
11 |
12 | # Output
13 | # Headers: Field names must be exactly like in Mongo documents
14 | # For nested objects: names are separated by .
15 | OUTPUT = {
16 | 'OUT_FILENAME': './out/out.csv',
17 | 'HEADER': ["id", "text", "created_ts", "hashtags", "mentions",
18 | "in_reply_to_status_id", "in_reply_to_screen_name",
19 | "retweeted_status.id", "retweeted_status.user.id", "retweeted_status.user.screen_name",
20 | "retweeted_status.user.followers_count", "retweeted_status.user.friends_count",
21 | "user.id", "user.screen_name",
22 | "user.followers_count", "user.friends_count",
23 | "user.statuses_count", "user.created_ts",
24 | "user.time_zone", "user.location"]
25 | }
26 |
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2013 Josef Eckert, Jeff Hemsley, Robert Mason, Karine Nahon, Shawn Walker (as "the SoMe Lab at UW")
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
--------------------------------------------------------------------------------
/app/templates/admin_home.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 |
4 |
5 |
6 |
9 |
10 |
As an admin you can create new project accounts or view existing ones.
11 |
Create New Project Account
12 |
13 |
14 |
15 |
Existing Projects
16 |
17 |
18 |
19 | Project Name
20 | # Collectors
21 | # Active Collectors
22 |
23 |
24 |
25 |
26 | {% for project in project_list %}
27 |
28 | {{ project['project_name'] }}
29 | {{ project['num_collectors'] }}
30 | {{ project['active_collectors'] }}
31 |
32 | {% endfor %}
33 |
34 |
35 |
36 |
37 |
38 | {% endblock %}
39 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | # MONGODB CONFIG
4 | AUTH = True
5 | USERNAME = 'LOCAL_DB_USERNAME'
6 | PASSWORD = 'LOCAL_DB_PASSWORD'
7 |
8 | # CENTRAL MONGODB SERVER
9 | CT_SERVER = 'CENTRAL_DB_ADDRESS'
10 | CT_DB_NAME = 'CENTRAL_DB_NAME'
11 | CT_AUTH = True
12 | CT_USERNAME = 'CENTRAL_DB_USERNAME'
13 | CT_PASSWORD = 'CENTRAL_DB_PASSWORD'
14 |
15 | # Directory structure config vars
16 | BASEDIR = os.path.abspath(os.path.dirname(__file__))
17 | LOGDIR = BASEDIR + '/out'
18 | DATADIR = BASEDIR + '/data'
19 |
20 | # Flask config vars
21 | DEBUG = False
22 | SECRET_KEY = 'This key will be replaced with a secure key in production.'
23 | CSRF_ENABLED = True
24 | CSRF_SECRET_KEY = 'willbereplacedinproduction'
25 |
26 | # STACKS config info
27 | VERSION = '2.0'
28 | DESC = 'STACKS - Social Media Tracker, Aggregator, and Collector Kit'
29 | DEFAULT_ROLE = 0 # by default, users aren't admins
30 | NETWORKS = ['twitter', 'facebook'] # networks that STACKS is set to work for
31 |
32 | # Celery config info - queues & routes are added dynamically
33 | CELERY_BROKER_URL = 'amqp://'
34 | CELERY_RESULT_BACKEND = 'amqp'
35 | CELERY_QUEUES = ()
36 | CELERY_ROUTES = {}
37 | # CELERY_REDIRECT_STDOUTS = False # We handle stdout/err/in logging ourselves, so don't want Celery taking over
38 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!env/bin/python
2 |
3 | import sys
4 | import os
5 | import json
6 |
7 | from werkzeug import generate_password_hash
8 |
9 | from app.controller import Controller
10 | from app.models import DB
11 |
12 | basedir = os.getcwd()
13 | db = DB()
14 |
15 | def main():
16 | print '\n'
17 | print 'STACK'
18 | print '----------'
19 | print '\n'
20 |
21 | print 'Welcome to the STACK setup tool. Follow the instructions below to\nsetup your first project account and initialize the configuration\nfiles for your STACK toolkit.'
22 | print '\n'
23 |
24 | project_name = raw_input('Enter a project account name: ')
25 | password = raw_input('Enter a project account password: ')
26 | description = raw_input('Enter a project account description: ')
27 |
28 | hashed_password = generate_password_hash(password)
29 |
30 | resp = db.create(project_name=project_name, password=password, hashed_password=hashed_password,
31 | description=description)
32 | if resp['status']:
33 | print '\n'
34 | print 'SUCCESS! You can now login to your account %s from the\n STACK front-end. Happy researching.' % project_name
35 | else:
36 | print '\n'
37 | print 'Oops. Something went wrong. Please try again and make sure\n the account name you entered does not already exist.'
38 |
39 | if __name__ == "__main__":
40 | main()
41 |
--------------------------------------------------------------------------------
/app/decorators.py:
--------------------------------------------------------------------------------
1 | from functools import wraps
2 | from flask import g, flash, redirect, url_for, request, session
3 | from models import DB
4 |
5 |
6 | # Used to divert users from account-only STACK pages
7 | # Admins are able to access protected pages
8 | def login_required(f):
9 | @wraps(f)
10 | def decorated_function(*args, **kwargs):
11 | if g.project is None:
12 | if g.admin is None:
13 | flash(u'You need to login to view this page!')
14 | return redirect(url_for('index', next=request.path))
15 | return f(*args, **kwargs)
16 |
17 | return decorated_function
18 |
19 |
20 | def admin_required(f):
21 | @wraps(f)
22 | def decorated_function(*args, **kwargs):
23 | if g.admin is None:
24 | flash(u'You need to be an admin to view this page!')
25 | return redirect(url_for('index', next=request.path))
26 | return f(*args, **kwargs)
27 |
28 | return decorated_function
29 |
30 |
31 | # Used to load project info into the session if not there
32 | def load_project(f):
33 | @wraps(f)
34 | def decorated_function(*args, **kwargs):
35 | g.project = None
36 | if 'project_id' in session:
37 | db = DB()
38 | resp = db.get_project_detail(session['project_id'])
39 | if resp['status']:
40 | g.project = resp
41 | return f(*args, **kwargs)
42 |
43 | return decorated_function
44 |
45 |
46 | # Used to load admin info into the session
47 | def load_admin(f):
48 | @wraps(f)
49 | def decorated_function(*args, **kwargs):
50 | g.admin = None
51 | if 'admin_project_id' in session:
52 | db = DB()
53 | resp = db.get_project_detail(session['admin_project_id'])
54 | if resp['status']:
55 | g.admin = resp
56 | return f(*args, **kwargs)
57 |
58 | return decorated_function
59 |
60 |
--------------------------------------------------------------------------------
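A usage sketch of these decorators on a hypothetical Flask view (the real routes live in app/views.py, which app/__init__.py imports but which is not reproduced here). The load_* decorators must wrap the view outside login_required so g.project and g.admin are populated before the check runs:

    from flask import g
    from app import app
    from app.decorators import load_project, load_admin, login_required

    @app.route('/home')
    @load_project            # runs first: sets g.project from the session
    @load_admin              # then sets g.admin from the session
    @login_required          # finally checks g.project / g.admin
    def home():
        return 'Welcome, %s' % g.project['project_name']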
/scripts/data_pull/ConfigCond.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | ## Database ##
3 | DB_AUTH = {
4 | 'DB_IS_AUTH': False, # True if DB is password protected otherwise False
5 | 'DB_USERNAME': 'DB_USERNAME',
6 | 'DB_PASSWORD': 'DB_PASSWORD'
7 | }
8 | DB = {
9 | 'DB_NAME': 'DB_NAME',
10 | 'COL_NAME': 'tweets'
11 | }
12 |
13 | # Output
14 | # Headers: Field names must be exactly like in Mongo documents
15 | # For nested objects: names are separated by .
16 | OUTPUT = {
17 | 'OUT_FILENAME': './out/out.csv',
18 | 'HEADER': ["id", "text", "created_ts", "hashtags", "mentions",
19 | "in_reply_to_status_id", "in_reply_to_screen_name",
20 | "retweeted_status.id", "retweeted_status.user.id", "retweeted_status.user.screen_name",
21 | "retweeted_status.user.followers_count", "retweeted_status.user.friends_count",
22 | "user.id", "user.screen_name",
23 | "user.followers_count", "user.friends_count",
24 | "user.statuses_count", "user.created_ts",
25 | "user.time_zone", "user.location"]
26 | }
27 |
28 | # Conditions
29 | # tweet_id = None | filename | [id_1, id_2]
30 | # user_id = None | filename | [user_id_1, user_id_2]
31 | # screen_name = None | filename | [name_1, name_2]
32 | # created_at_from = None | datetime(YYYY, M, D, H)
33 | # created_at_to = None | datetime(YYYY, M, D, H)
34 | # retweet_included = None | True | False
35 | CONDITIONS = {
36 | 'tweet_id': './tweetIDs.txt',#['757604886141820928', '757605167818608641', '757606596839079936', '757606903753084928'],
37 | 'user_id': [],
38 | 'screen_name': [],
39 | 'created_at_from': None,#datetime(2016, 1, 1, 0),
40 | 'created_at_to': None,#datetime(2018, 2, 1, 0),
41 | 'retweet_included': True
42 | }
43 |
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/app/templates/_macros.html:
--------------------------------------------------------------------------------
1 | {% macro terms_field(field) %}
2 |
3 | {% endmacro %}
4 |
5 | {% macro form_field(field) %}
6 | {% set inner_text = kwargs.pop('inner_text', '') %}
7 | {% set group_class = kwargs.pop('group_class', '') %}
8 | {% set label_class = kwargs.pop('label_class', '') %}
9 | {% set input_class = kwargs.pop('input_class', '') %}
10 |
11 | {% if field.flags.required %}
12 | {% set input_class = input_class + ' required' %}
13 | {% endif %}
14 |
15 |
52 | {% endmacro %}
53 |
--------------------------------------------------------------------------------
/app/templates/home.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 |
4 |
5 |
6 |
7 |
{{ project_detail['project_name'] }}
8 |
{{ project_detail['project_description'] }}
9 |
10 |
11 | Collectors: {{ project_detail['num_collectors'] }}
12 |
13 |
14 |
15 |
16 |
Available Networks
17 |
Twitter
18 |
Facebook
19 |
20 |
21 |
22 |
New Collector
23 |
24 |
25 |
26 |
27 |
28 |
Existing Collectors
29 |
30 | {% if project_detail['collectors'] %}
31 |
32 |
33 |
34 |
35 | Collector Name
36 | Network
37 | Active
38 | # Terms
39 |
40 |
41 |
42 |
43 | {% for collector in project_detail['collectors'] %}
44 |
45 |
47 | {{ collector['collector_name'] }}
48 |
49 | {{ collector['network'] }}
50 |
51 | {% if collector['active'] == 0 %}
52 | No
53 | {% else %}
54 | Yes
55 | {% endif %}
56 |
57 | {{ collector['num_terms'] }}
58 |
59 | {% endfor %}
60 |
61 |
62 |
63 |
64 | {% else %}
65 |
No collectors exist for this account yet!
66 | {% endif %}
67 |
68 |
69 |
70 | {% endblock %}
71 |
--------------------------------------------------------------------------------
/app/twitter/platform.ini:
--------------------------------------------------------------------------------
1 | ;
2 | ; SoMe Tools platform ini/config file
3 | ;
4 | ; Step 1: In the same directory as "tweet_collector.py", create a plain text file named
5 | ; "collection.terms" and put one keyterm per line.
6 | ;
7 | ; Step 2: File >> Save as >> save this file as "platform.ini" - KEEP THE ORIGINAL AS BACKUP
8 | ;
9 | ; Step 3: Confirm "platform.ini" is in the same directory as "tweet_collector.py"
10 | ;
11 | ; Step 4: Visit: https://dev.twitter.com/apps/new
12 | ;
13 | ; Step 5: Login with your Twitter account
14 | ;
15 | ; Step 6: You'll be asked to "create an application" - you need to do this to obtain
16 | ; "oauth" credentials. The name, description, and website are filler - feel free to use
17 | ; whatever you like. Leave "callback URL" blank. Submit.
18 | ;
19 | ; Step 7: Change the values below: [collection] needs a name value, [files] requires directory
20 | ; path values and file names (change these!), and [oauth] requires the values you obtained
21 | ; from the Twitter dev site
22 |
23 | [collection]
24 | ; DEPRECATED in v1.0 - [collection] info now grabbed from STACK DB
25 | ; name is a human readable word(s) for logging and reporting about the data being collected
26 | name:XXXX
27 | ; this is the mongo database name
28 | db_name:XXXX
29 | ; this is the mongo collection name
30 | collection_name:XXXX
31 |
32 | ; note: is there a way to take the file name from the collection name? so
33 | ; %(name)s_tweets_out.txt
34 |
35 | [files]
36 | ; this is where the raw tweets get stored from the collector
37 | ; DEPRECATED in v1.0
38 | raw_tweets_file_path:XXXX
39 |
40 | ; this is where the raw tweets and the processed tweets get stored
41 | ; DEPRECATED in v1.0
42 | tweet_archive_dir:XXXXX
43 |
44 | ; this is where the processed tweets go while they wait to be inserted in mongo
45 | ; DEPRECATED in v1.0
46 | tweet_insert_queue:XXXX
47 |
48 | ; the date format is used to build the tweets out file name as well as indicate
49 | ; when the file should roll over. The fastest possible roll over (not recommended)
50 | ; is seconds and is specified with %Y%m%d-%H%M%S. For testing use minutes (%Y%m%d-%H%M)
51 | ; or hours (%Y%m%d-%H), but for collection use hours or days (%Y%m%d)
52 | tweets_file_date_frmt:%Y%m%d-%H
53 | tweets_file:tweets_out.json
54 | ; the final constructed file name will be something like: ./tweets/20130822-1030-track-tweets_out.json
55 |
56 | ; DEPRECATED in v1.0
57 | terms_file:XXXX
58 |
59 | log_file:/tweet_collection.log
60 | log_dir:/logs/
61 | log_config_file:/logging.conf
62 |
63 | [oauth-track]
64 | ; DEPRECATED in v1.0
65 |
66 | [oauth-follow]
67 | ; DEPRECATED in v1.0
68 |
69 |
--------------------------------------------------------------------------------
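For reference, the collection.terms file described in Step 1 of the comments above is just a plain text list, one keyterm per line. A hypothetical example:

    syracuse
    #bigdata
    social media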
/scripts/data_pull/Tweet-QueryScript-NoCond.py:
--------------------------------------------------------------------------------
1 | # This script queries tweet objects from MongoDB *without conditions* and writes to a CSV file
2 | # The script reads the settings from config.py
3 | # Output: All ID fields are prepended with ID_
4 | # Array fields are | separated
5 | # If keys do not exist, write NULL
6 | # By Sikana Tanupabrungsun. March 16, 2018
7 |
8 | # encoding=utf8
9 | import os
10 | import csv
11 | import sys
12 | import pymongo
13 | import ConfigNoCond as cfg
14 | from datetime import datetime
15 |
16 | reload(sys)
17 | sys.setdefaultencoding('utf8')
18 |
19 | connection = pymongo.MongoClient()
20 |
21 | # Check if DB is password protected
22 | if cfg.DB_AUTH['DB_IS_AUTH']:
23 | connection.admin.authenticate(cfg.DB_AUTH['DB_USERNAME'], cfg.DB_AUTH['DB_PASSWORD'])
24 |
25 | db = connection[cfg.DB['DB_NAME']][cfg.DB['COL_NAME']]
26 |
27 | # Get keys
28 | keys = cfg.OUTPUT['HEADER']
29 |
30 | # Outfile Handling #
31 | outfilename = cfg.OUTPUT['OUT_FILENAME']
32 | if os.path.exists(outfilename):
33 | print('%s already exists' % (outfilename))
34 | overwrite = raw_input("Replace the file [y/n]?").lower()
35 | if overwrite <> 'y':
36 | sys.exit(0)
37 |
38 | dirname = os.path.dirname(outfilename)
39 | if not os.path.exists(dirname):
40 | os.makedirs(dirname)
41 |
42 | outcsvfile = open(outfilename, 'w')
43 | outfile = csv.writer(outcsvfile)
44 | outfile.writerow(keys)
45 |
46 | print('Query data from DB: %s Collection: %s' % (cfg.DB['DB_NAME'], cfg.DB['COL_NAME']))
47 | print('Output file: %s' % (outfilename))
48 | print('Output headers: %s' % (','.join(keys)))
49 |
50 | tweets = db.find()
51 |
52 | print('Total tweets: %d' % (tweets.count()))
53 |
54 | for tweet in tweets:
55 | out = []
56 | for key in keys:
57 | key_arr = key.split('.')
58 | last_key = None
59 | try:
60 | if len(key_arr) == 3:
61 | my_val = tweet[key_arr[0]][key_arr[1]][key_arr[2]]
62 | last_key = key_arr[2]
63 | elif len(key_arr) == 2:
64 | my_val = tweet[key_arr[0]][key_arr[1]]
65 | last_key = key_arr[1]
66 | else:
67 | my_val = tweet[key]
68 | last_key = key
69 |
70 | if isinstance(my_val, list):
71 | my_val = '|'.join(my_val)
72 | elif last_key == 'created_at':
73 | my_val = datetime.fromtimestamp(int(my_val)/1000).strftime('%a %b %d %X %z %Y')
74 | elif last_key.endswith('id') and my_val is not None:
75 | my_val = 'ID_' + str(my_val)
76 |
77 | out.append(my_val)
78 | except:
79 | out.append(None)
80 |
81 | outfile.writerow(out)
--------------------------------------------------------------------------------
/app/templates/network_home.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 |
4 |
5 |
6 |
{{ network }}
7 |
8 |
9 | Collectors: {{ g.project['num_collectors'] }}
10 | Data Stored: {{ count }}
11 |
12 |
13 |
New Collector
14 |
15 |
16 |
17 |
Network Processor
18 |
19 | {% if processor_active_status == 'active' %}
20 | Active
21 | {% else %}
22 | Inactive
23 | {% endif %}
24 |
25 |
26 |
32 |
33 |
Network Inserter
34 |
35 | {% if inserter_active_status == 'active' %}
36 | Active
37 | {% else %}
38 | Inactive
39 | {% endif %}
40 |
41 |
42 |
48 |
49 |
50 |
51 |
Existing Collectors
52 |
53 | {% if collectors %}
54 |
55 |
56 |
57 |
58 | Collector Name
59 | Network
60 | Active
61 | # Terms
62 |
63 |
64 |
65 |
66 | {% for collector in collectors %}
67 |
68 |
70 | {{ collector['collector_name'] }}
71 |
72 | {{ collector['network'] }}
73 |
74 | {% if collector['active'] == 0 %}
75 | No
76 | {% else %}
77 | Yes
78 | {% endif %}
79 |
80 | {{ collector['num_terms'] }}
81 |
82 | {% endfor %}
83 |
84 |
85 |
86 |
87 | {% else %}
88 |
No collectors exist for this account yet!
89 | {% endif %}
90 |
91 |
92 | {% endblock %}
93 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | STACKS - Social Media Tracker, Analyzer, & Collector Toolkit at Syracuse
2 | =========
3 |
4 | STACKS is an extensible social media research toolkit designed to collect, process, and store data from online social networks. The toolkit is an ongoing project via the [Syracuse University iSchool](http://ischool.syr.edu), and currently supports the [Twitter Streaming API](https://dev.twitter.com/streaming/overview). Collecting from the Twitter search API is under development. The toolkit architecture is modular and designed for extension.
5 |
6 | You can cite this repository:
7 |
8 | Jeff Hemsley, Sam Jackson, Sikana Tanupabrungsun, & Billy Ceskavich. (2019). bitslabsyr/stack: STACKS 3.1 (Version 3.1). http://doi.org/10.5281/zenodo.2638848
9 |
10 | **_This documentation assumes the following:_**
11 |
12 | * You know how to use ssh.
13 | * Your server has MongoDB already installed.
14 | * You understand how to edit files using vim (“vi”) or nano.
15 | * You have rights and know how to install Python libraries.
16 |
17 | ## Installation
18 |
19 | Please read through [Install](https://github.com/bitslabsyr/stack/wiki/Installation) for the full STACK installation process.
20 |
21 | Prior to installing STACK, make sure you have MongoDB installed and running on your server. [Learn how to install MongoDB here](http://docs.mongodb.org/manual/installation/).
22 |
23 | ## Wiki
24 |
25 | To learn more about STACK semantics, logging, and processing parameters, [refer to our wiki](https://github.com/bitslabsyr/stack/wiki).
26 |
27 | ## Ongoing Work + Next Action Items
28 |
29 | This list will be updated soon with more detailed action items. Please note again that we are actively working on this toolkit!
30 |
31 | 1. Full move away from .ini file use
32 | 2. Extensible module format for future social network implementations
33 | 3. Extensible back-end API
34 |
35 | ## Credits
36 |
37 | Lovingly maintained at Syracuse University by:
38 |
39 | * [Jeff Hemsley](https://github.com/jhemsley)
40 | * [Sam Jackson](https://github.com/sjacks26)
41 | * [Sikana Tanupabrungsun](https://github.com/Sikana)
42 | * [Billy Ceskavich](https://github.com/bceskavich/)
43 |
44 | Distributed under the MIT License:
45 |
46 | The MIT License (MIT)
47 |
48 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
49 |
50 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
51 |
52 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
53 |
--------------------------------------------------------------------------------
/app/templates/update_collector.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 |
4 |
5 |
6 |
Update Collector
7 | {{ collector['collector_name'] }}
8 |
9 |
10 | {% from "_macros.html" import form_field %}
11 |
12 |
31 |
32 |
33 |
34 |
35 |
Current Terms
36 |
37 | {% if terms_forms %}
38 | {% for tform in terms_forms %}
39 |
46 | {% endfor %}
47 | {% endif %}
48 |
49 |
Update Collector
50 |
51 |
52 |
53 | {% endblock %}
54 |
55 | {% block script %}
56 |
102 | {% endblock %}
--------------------------------------------------------------------------------
/app/tasks.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import subprocess
4 | import threading
5 |
6 | from app import celery, app
7 | from controller import Controller
8 |
9 |
10 | @celery.task()
11 | def start_daemon(process, project, collector_id=None, network=None):
12 | """
13 | Calls a Controller to daemonize and start a STACK process
14 | """
15 | if process == 'collect':
16 | c = Controller(
17 | process=process,
18 | project=project,
19 | collector_id=collector_id
20 | )
21 | else:
22 | c = Controller(
23 | process=process,
24 | project=project,
25 | network=network
26 | )
27 |
28 | t = threading.Thread(name='test-thread', target=c.process_command, args=('start',))
29 | t.start()
30 |
31 | # c.process_command('start')
32 |
33 |
34 | @celery.task()
35 | def stop_daemon(process, project, collector_id=None, network=None):
36 | """
37 | Calls a Controller to stop a daemonized STACK process
38 | """
39 | if process == 'collect':
40 | c = Controller(
41 | process=process,
42 | project=project,
43 | collector_id=collector_id
44 | )
45 | else:
46 | c = Controller(
47 | process=process,
48 | project=project,
49 | network=network
50 | )
51 |
52 | t = threading.Thread(name='test-thread', target=c.process_command, args=('stop',))
53 | t.start()
54 | t.join()
55 |
56 | # c.process_command('stop')
57 |
58 |
59 | @celery.task()
60 | def restart_daemon(process, project, collector_id=None, network=None):
61 | """
62 | Calls a Controller to restart a daemonized STACK process
63 | """
64 | if process == 'collect':
65 | c = Controller(
66 | process=process,
67 | project=project,
68 | collector_id=collector_id
69 | )
70 | else:
71 | c = Controller(
72 | process=process,
73 | project=project,
74 | network=network
75 | )
76 |
77 | t = threading.Thread(name='test-thread', target=c.process_command, args=('restart',))
78 | t.start()
79 |
80 | # c.process_command('restart')
81 |
82 |
83 | def start_workers():
84 | """
85 | Starts two Celery workers on app spin up
86 |
87 | -- 1) Handles starting of all STACK processes
88 | -- 2) Handles stopping of all STACK processes
89 | """
90 | base_command = 'celery multi start '
91 | os.chdir(app.config['BASEDIR'])
92 |
93 | # Worker names
94 | start_worker = 'stack-start'
95 | stop_worker = 'stack-stop'
96 |
97 | # Directories for log and pid information
98 | outdir = app.config['LOGDIR'] + '/app'
99 | piddir = outdir + '/pid'
100 | logdir = outdir + '/log'
101 |
102 | # Filenames
103 | start_logfile = logdir + '/%s.log' % start_worker
104 | stop_logfile = logdir + '/%s.log' % stop_worker
105 | start_pidfile = piddir + '/%s.pid' % start_worker
106 | stop_pidfile = piddir + '/%s.pid' % stop_worker
107 |
108 | # Creates directories if they don't exist
109 | if not os.path.exists(piddir):
110 | os.makedirs(piddir)
111 | if not os.path.exists(logdir):
112 | os.makedirs(logdir)
113 |
114 | start_pid = get_pid(start_pidfile)
115 | stop_pid = get_pid(stop_pidfile)
116 |
117 | # Completes the command syntax to spin up the workers
118 | if not start_pid:
119 | start_worker_cmd = base_command + '%s-worker -A app.celery -l info -Q %s --logfile=%s --pidfile=%s' % \
120 | (start_worker, start_worker, start_logfile, start_pidfile)
121 | subprocess.call(start_worker_cmd.split(' '))
122 |
123 | if not stop_pid:
124 | stop_worker_cmd = base_command + '%s-worker -A app.celery -l info -Q %s --logfile=%s --pidfile=%s' % \
125 | (stop_worker, stop_worker, stop_logfile, stop_pidfile)
126 | subprocess.call(stop_worker_cmd.split(' '))
127 |
128 |
129 | def get_pid(pidfile):
130 | try:
131 | pf = file(pidfile, 'r')
132 | pid = int(pf.read().strip())
133 | pf.close()
134 | except IOError:
135 | pid = None
136 | except SystemExit:
137 | pid = None
138 | return pid
--------------------------------------------------------------------------------
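A sketch of how these tasks would be dispatched from the web layer (a hypothetical call site; .delay() is the standard Celery shorthand for sending a task to the broker, with queue routing added dynamically per config.py):

    from app.tasks import start_daemon, stop_daemon

    # Queue a collector start (the 'collect' branch takes a collector_id)
    start_daemon.delay('collect', project, collector_id=collector_id)

    # Queue a stop for a network-level process (the process name here is illustrative)
    stop_daemon.delay('process', project, network='twitter')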
/app/twitter/tweetstream.py:
--------------------------------------------------------------------------------
1 | #
2 | # swiped from https://github.com/salathegroup/mkondo/tree/master/mkondo
3 | #
4 | import tweepy
5 | import httplib
6 | from socket import timeout
7 | from socket import error as socket_error
8 | from time import sleep
9 | import logging
10 | import logging.config
11 |
12 | class CompliantStream(tweepy.Stream):
13 | """This class extends Tweepy's Stream class by adding HTTP and TCP/IP
14 | back-off (according to Twitter's guidelines)."""
15 |
16 | def __init__(self, auth, listener, retry_count, logger, min_http_delay=5,
17 | max_http_delay=320, min_http_420_delay=60, min_tcp_ip_delay=0.5,
18 | max_tcp_ip_delay=16, **options):
19 |
20 | self.logger = logger
21 | self.logger.info('COMPLIANT STREAM: Initializing compliant stream...')
22 | self.min_http_delay = min_http_delay
23 | self.max_http_delay = max_http_delay
24 | self.min_tcp_ip_delay = min_tcp_ip_delay
25 | self.max_tcp_ip_delay = max_tcp_ip_delay
26 | self.running = False
27 | self.retry_count = retry_count
28 | self.auth = auth
29 |
30 | #Twitter sends a keep-alive every twitter_keepalive seconds
31 | self.twitter_keepalive = 30
32 |
33 | #Add a couple seconds more wait time.
34 | self.twitter_keepalive += 2.0
35 |
36 | self.sleep_time = 0
37 |
38 | #logging.info('COMPLIANT STREAM: Initializing compliant stream...')
39 |
40 | tweepy.Stream.__init__(self, auth, listener, secure=True, **options)
41 |
42 | def _run(self):
43 | url = "%s://%s%s" % (self.scheme, self.host, self.url)
44 |
45 | # Connect and process the stream
46 | error_counter = 0
47 | conn = None
48 | exception = None
49 | while self.running:
50 | if self.retry_count and error_counter > self.retry_count:
51 | # quit if error count greater than retry count
52 | break
53 | try:
54 | if self.scheme == "http":
55 | conn = httplib.HTTPConnection(self.host)
56 | else:
57 | conn = httplib.HTTPSConnection(self.host)
58 | self.auth.apply_auth(url, 'POST', self.headers, self.parameters)
59 | conn.connect()
60 | conn.sock.settimeout(self.twitter_keepalive)
61 | conn.request('POST', self.url, self.body, headers=self.headers)
62 | resp = conn.getresponse()
63 | if resp.status != 200:
64 | self.logger.exception('COMPLIANT STREAM: API Error %s.' % resp.status)
65 | if self.listener.on_error(resp.status) is False:
66 | break
67 | error_counter += 1
68 | #HTTP delay is based on error count, since we have exponential back-off
69 | if resp.status == 420:
70 | http_delay = self.get_http_420_delay(error_counter)
71 | else:
72 | http_delay = self.get_http_delay(error_counter)
73 | self.sleep_time = http_delay
74 | sleep(http_delay)
75 | else:
76 | error_counter = 0
77 | http_delay = 0
78 | tcp_ip_delay = 0
79 | self._read_loop(resp)
80 | except (timeout, socket_error):
81 | if self.listener.on_timeout() == False:
82 | break
83 | if self.running is False:
84 | break
85 | conn.close()
86 | error_counter += 1
87 | self.logger.exception('COMPLIANT STREAM: TCP/IP error caught.')
88 | tcp_ip_delay = self.get_tcp_ip_delay(error_counter)
89 | self.sleep_time = tcp_ip_delay
90 | sleep(tcp_ip_delay)
91 | except httplib.IncompleteRead:
92 | self.logger.exception('COMPLIANT STREAM: Incomplete Read.')
93 |
94 | #We assume there are connection issues at the other end, so we'll
95 | #try again in a little bit.
96 | error_counter += 1
97 | #HTTP delay is based on error count, since we have exponential back-off
98 | http_delay = self.get_http_delay(error_counter)
99 | self.logger.info('COMPLIANT STREAM: HTTP Delay. Sleeping for: %s' % http_delay)
100 | self.sleep_time = http_delay
101 | sleep(http_delay)
102 |
103 | except Exception as e:
104 | self.logger.exception('Unexpected exception: %s' % e)
105 | self.logger.exception(e)
106 | print e.args
107 | break
108 | # any other exception is fatal, so kill loop
109 |
110 | # cleanup
111 | self.running = False
112 | if conn:
113 | conn.close()
114 |
115 | if exception:
116 | raise
117 |
118 | def get_http_delay(self, error_count):
119 | ''' Exponential back-off, based on the number of times we've failed (error_count) '''
120 | delay = self.min_http_delay * (2.0 ** error_count)
121 | print "Error Count: %d -- Delay: %d" % (error_count, delay)
122 | if delay > self.max_http_delay:
123 | return self.max_http_delay
124 | return delay
125 |
126 | def get_http_420_delay(self, error_count):
127 | ''' Exponential back-off, based on the number of times we've failed (error_count) '''
128 | delay = self.min_http_420_delay * (2.0 ** error_count)
129 | return delay
130 |
131 | def get_tcp_ip_delay(self, error_count):
132 | ''' Linear back-off, based on the number of times we've failed (error_count) '''
133 | delay = float(self.min_tcp_ip_delay * error_count)
134 | if delay > self.max_tcp_ip_delay:
135 | return self.max_tcp_ip_delay
136 | return delay
137 |
--------------------------------------------------------------------------------
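Illustrative arithmetic for the back-off schedules implemented above (a standalone sketch, not part of the module): with the default minimums, HTTP errors sleep 10, 20, 40, 80, 160 seconds and then cap at max_http_delay = 320, while TCP/IP errors grow linearly by 0.5s per error up to 16s:

    # Mirrors get_http_delay() / get_tcp_ip_delay() with the default arguments
    for n in range(1, 8):
        print 'HTTP: %gs  TCP/IP: %gs' % (min(5 * 2.0 ** n, 320), min(0.5 * n, 16))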
/app/templates/new_collector.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 |
4 |
5 |
6 |
Create a New Collector
7 |
8 |
9 | {% from "_macros.html" import render_field, form_field %}
10 |
62 |
63 |
64 | {% endblock %}
65 |
66 | {% block script %}
67 |
102 | {% endblock %}
103 |
--------------------------------------------------------------------------------
/app/templates/collector.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 |
4 |
5 |
6 |
18 |
19 |
20 |
21 |
Network
22 |
{{ collector['network'] }}
23 |
24 |
25 |
26 |
27 |
Collection Type
28 |
{{ collector['collection_type'] }}
29 |
30 |
31 |
32 |
33 |
34 |
35 |
Start Date
36 |
37 | {% if collector['start_date'] %}
38 | {{ collector['start_date'] }}
39 | {% else %}
40 | None
41 | {% endif %}
42 |
43 |
44 |
45 |
End Date
46 |
47 | {% if collector['end_date'] %}
48 | {{ collector['end_date'] }}
49 | {% else %}
50 | None
51 | {% endif %}
52 |
53 |
54 |
55 |
56 | {% if collector['network'] == 'twitter' %}
57 |
58 |
59 |
60 |
61 |
API Filter
62 |
63 | {% if collector['api'] %}
64 | {{ collector['api'] }}
65 | {% else %}
66 | None
67 | {% endif %}
68 |
69 |
70 |
71 |
72 |
73 |
Languages
74 |
75 | {% if collector['languages'] %}
76 | {% for lang in collector['languages'] %}
77 | {{ lang }},
78 | {% endfor %}
79 | {% else %}
80 | None
81 | {% endif %}
82 |
83 |
84 |
85 |
86 |
87 |
Locations
88 |
89 | {% if collector['locations'] %}
90 | {{ collector['locations'] }}
91 | {% else %}
92 | None
93 | {% endif %}
94 |
95 |
96 | {% endif %}
97 |
98 |
99 |
100 | {% if task_status %}
101 |
102 |
103 | ×
104 |
105 | {{ task_status }}
106 |
107 | {% endif %}
108 |
109 |
115 |
116 |
117 |
118 |
119 |
Collector Terms
120 |
121 | {% if collector['terms_list'] %}
122 |
123 |
124 |
125 |
126 | Term
127 | Type
128 | Collecting
129 | Start / Stop Dates
130 | Term ID
131 |
132 |
133 |
134 | {% for term in collector['terms_list'] %}
135 |
136 | {{ term['term'] }}
137 | {{ term['type'] }}
138 |
139 | {% if term['collect'] == 1 %}
140 | Yes
141 | {% else %}
142 | No
143 | {% endif %}
144 |
145 |
146 | {% for range in term['history'] %}
147 | {{ range['start_date'] }} -- {{ range['end_date'] }}
148 | {% endfor %}
149 |
150 |
151 | {% if term['id'] %}
152 | {{ term['id'] }}
153 | {% else %}
154 | None
155 | {% endif %}
156 |
157 |
158 | {% endfor %}
159 |
160 |
161 | {% else %}
162 | No Terms
163 | {% endif %}
164 |
165 |
166 | {% endblock %}
167 |
--------------------------------------------------------------------------------
/scripts/data_pull/Tweet-QueryScript-Cond.py:
--------------------------------------------------------------------------------
1 | # This script queries tweet objects from MongoDB *with conditions* and writes to a CSV file
2 | # The script reads the settings from ConfigCond.py
3 | # Output: All ID fields are prepended with ID_
4 | # Array fields are | separated
5 | # If keys do not exist, write NULL
6 | # By Sikana Tanupabrungsun. March 16, 2018
7 |
8 | # Query conditions:
9 | # tweet_id=[id_1, id_2]
10 | # user_id=[user_id_1, user_id_2]
11 | # screen_name=[name_1, name_2]
12 | # created_at_from=datetime(YYYY, M, D, H)
13 | # created_at_to=datetime(YYYY, M, D, H)
14 | # retweet_included=True|False
15 |
16 | # encoding=utf8
17 | import os
18 | import csv
19 | import sys
20 | import pymongo
21 | import ConfigCond as cfg
22 | from datetime import datetime
23 |
24 | reload(sys)
25 | sys.setdefaultencoding('utf8')
26 |
27 |
28 | def query_conditions(tweet_id=None,
29 | user_id=None, screen_name=None,
30 | created_at_from=None, created_at_to=None,
31 | retweet_included=True):
32 |
33 | query = {}
34 | if tweet_id is not None and len(tweet_id) > 0:
35 | tweet_id = [long(id) for id in tweet_id]
36 | query['id'] = {'$in': tweet_id}
37 | if user_id is not None and len(user_id) > 0:
38 | user_id = [long(id) for id in user_id]
39 | query['user.id'] = {'$in': user_id}
40 | if screen_name is not None and len(screen_name) > 0:
41 | query['user.screen_name'] = {'$in': screen_name}
42 | if created_at_from or created_at_to:
43 | query['created_ts'] = {'$gte': created_at_from,
44 | '$lte': created_at_to}
45 | if retweet_included is not None:
46 | if retweet_included == False:
47 | query['retweeted_status'] = None  # match only non-retweets; {'$ne': None} would select retweets instead
48 |
49 | return query
50 |
51 | if __name__ == "__main__":
52 | connection = pymongo.MongoClient()
53 |
54 | # Check if DB is password protected
55 | if cfg.DB_AUTH['DB_IS_AUTH']:
56 | connection.admin.authenticate(cfg.DB_AUTH['DB_USERNAME'], cfg.DB_AUTH['DB_PASSWORD'])
57 |
58 | db = connection[cfg.DB['DB_NAME']][cfg.DB['COL_NAME']]
59 |
60 | # Get keys
61 | keys = cfg.OUTPUT['HEADER']
62 |
63 | # Outfile Handling #
64 | outfilename = cfg.OUTPUT['OUT_FILENAME']
65 | if os.path.exists(outfilename):
66 | print('%s already exists' % (outfilename))
67 | overwrite = raw_input("Replace the file [y/n]?").lower()
68 | if overwrite <> 'y':
69 | sys.exit(0)
70 |
71 | dirname = os.path.dirname(outfilename)
72 | if not os.path.exists(dirname):
73 | os.makedirs(dirname)
74 |
75 | outcsvfile = open(outfilename, 'w')
76 | outfile = csv.writer(outcsvfile)
77 | outfile.writerow(keys)
78 |
79 | print('Query data from DB: %s Collection: %s' % (cfg.DB['DB_NAME'], cfg.DB['COL_NAME']))
80 | print('Output file: %s' % (outfilename))
81 | print('Output headers: %s' % (','.join(keys)))
82 |
83 | # Conditions
84 | tweet_id=cfg.CONDITIONS['tweet_id']
85 |
86 | if tweet_id is not None and os.path.isfile(tweet_id):
87 | incsvfile = open(tweet_id, 'rb')
88 | infile = csv.DictReader(incsvfile)
89 | tweet_id = []
90 | for row in infile:
91 | tweet_id.append(row['tweet_id'].replace('ID_', ''))
92 |
93 | user_id=cfg.CONDITIONS['user_id']
94 | if user_id is not None and os.path.isfile(user_id):
95 | incsvfile = open(user_id, 'rb')
96 | infile = csv.DictReader(incsvfile)
97 | user_id = []
98 | for row in infile:
99 | user_id.append(row['user_id'].replace('ID_', ''))
100 |
101 | screen_name=cfg.CONDITIONS['screen_name']
102 | if screen_name is not None and os.path.isfile(screen_name):
103 | incsvfile = open(screen_name, 'rb')
104 | infile = csv.DictReader(incsvfile)
105 | screen_name = []
106 | for row in infile:
107 | screen_name.append(row['screen_name'].strip())
108 |
109 | query = query_conditions(tweet_id=tweet_id,
110 | user_id=user_id,
111 | screen_name=screen_name,
112 | created_at_from=cfg.CONDITIONS['created_at_from'],
113 | created_at_to=cfg.CONDITIONS['created_at_to'],
114 | retweet_included=cfg.CONDITIONS['retweet_included'])
115 |
116 | print('Query conditions: %s' % (query))
117 | tweets = db.find(query)
118 |
119 | print('Total tweets: %d' % (tweets.count()))
120 |
121 | for tweet in tweets:
122 | out = []
123 | for key in keys:
124 | key_arr = key.split('.')
125 | last_key = None
126 | try:
127 | if len(key_arr) == 3:
128 | my_val = tweet[key_arr[0]][key_arr[1]][key_arr[2]]
129 | last_key = key_arr[2]
130 | elif len(key_arr) == 2:
131 | my_val = tweet[key_arr[0]][key_arr[1]]
132 | last_key = key_arr[1]
133 | else:
134 | my_val = tweet[key]
135 | last_key = key
136 |
137 | if isinstance(my_val, list):
138 | my_val = '|'.join(my_val)
139 | elif last_key == 'created_at':
140 | my_val = datetime.fromtimestamp(int(my_val)/1000).strftime('%a %b %d %X %z %Y')
141 | elif last_key.endswith('id') and my_val is not None:
142 | my_val = 'ID_' + str(my_val)
143 |
144 | out.append(my_val)
145 | except:
146 | out.append(None)
147 |
148 | outfile.writerow(out)
149 |
150 |
--------------------------------------------------------------------------------
/app/templates/base.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | {% block meta %}
13 | {% endblock %}
14 |
15 |
16 | {% block title %}
17 | STACKS - The Researcher's Social Data Collection Tool.
18 | {% endblock %}
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
31 |
32 | {% block css %}
33 | {% endblock %}
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
44 |
45 | {% if g.admin or g.project %}
46 | {% if g.admin %}
47 |
48 |
51 |
52 |
53 |
62 |
63 | {% elif g.project %}
64 |
65 |
100 |
101 |
102 |
113 | {% endif %}
114 |
115 | {% else %}
116 |
117 |
121 | {% endif %}
122 |
123 |
124 |
125 |
126 |
127 |
128 | {% for msg in get_flashed_messages() %}
129 |
130 |
131 | ×
132 |
133 | {{ msg }}
134 |
135 | {% endfor %}
136 |
137 |
138 |
139 |
140 | {% block content %}{% endblock %}
141 |
142 |
143 |
144 | {% block footer %}{% endblock %}
145 |
146 |
147 |
148 |
149 |
150 |
151 | {% block script %}
152 | {% endblock %}
153 |
154 |
155 |
--------------------------------------------------------------------------------
/app/forms.py:
--------------------------------------------------------------------------------
1 | from flask.ext.wtf import Form
2 | from wtforms import StringField, PasswordField, TextAreaField, RadioField, SelectField
3 | from wtforms.fields.html5 import DateField
4 | from wtforms.validators import DataRequired, EqualTo, Optional
5 | from wtforms import ValidationError
6 |
7 |
8 | class RequiredIfNetwork(object):
9 | """
10 | Custom validator to set required fields only for a given network
11 | """
12 | def __init__(self, network_valid):
13 | self.network_valid = network_valid
14 | self.message = 'Field is required for network: %s' % self.network_valid
15 |
16 | def __call__(self, form, field):
17 | network = form['network'].data
18 | if self.network_valid == network:
19 | if field.data is None or field.data == '':
20 | raise ValidationError(self.message)
21 |
22 |
23 | class TwitterTermsVal(object):
24 | """
25 | Custom validator for Twitter terms. They are required if an API is selected
26 | """
27 | def __init__(self):
28 | self.message = 'Terms are required if a Twitter API filter is selected.'
29 |
30 | def __call__(self, form, field):
31 | api_filter = form['api'].data
32 | network = form['network'].data
33 | if network == 'twitter' and api_filter in ('track', 'follow'):
34 | if field.data is None or field.data == '':
35 | raise ValidationError(self.message)
36 |
37 |
38 | class LoginForm(Form):
39 | """
40 | Login form for project accounts. Rendered on /login page
41 | """
42 | project_name = StringField('Project Name', [DataRequired()])
43 | password = PasswordField('Password', [DataRequired()])
44 |
45 |
46 | class CreateForm(Form):
47 | """
48 | Project account creation form
49 | """
50 | project_name = StringField('Project Name', [DataRequired()])
51 | email = StringField('Email', [DataRequired()])
52 | password = PasswordField('Password', [DataRequired()])
53 | confirm = PasswordField('Confirm Password', [
54 | DataRequired(),
55 | EqualTo('password', message='Passwords must match.')
56 | ])
57 | description = StringField('Account Description', [DataRequired()])
58 |
59 |
60 | class SetupForm(Form):
61 | """
62 | Admin account setup form
63 | """
64 | project_name = StringField('Project Name', [DataRequired()])
65 | password = PasswordField('Password', [DataRequired()])
66 | confirm = PasswordField('Confirm Password', [
67 | DataRequired(),
68 | EqualTo('password', message='Passwords must match.')
69 | ])
70 |
71 |
72 | class NewCollectorForm(Form):
73 | """
74 | Collector creation form
75 | """
76 | # Universal Collector Information
77 | collector_name = StringField('Collector Name', [DataRequired()])
78 | network = RadioField(
79 | 'Network',
80 | [DataRequired()],
81 | choices=[('twitter', 'Twitter'), ('facebook', 'Facebook')]
82 | )
83 |
84 | """ Facebook Info """
85 | # Collection type will become valid for all networks eventually
86 | collection_type = SelectField(
87 | 'Collection Type',
88 | [RequiredIfNetwork('facebook')],
89 | choices=[('realtime', 'Real Time'), ('historical', 'Historical')]
90 | )
91 |
92 | # Since & Until
93 | start_date = DateField('Start Date (optional)', [Optional()])
94 | end_date = DateField('End Date (optional)', [Optional()])
95 |
96 | # Facebook OAuth Info
97 | client_id = StringField('Client ID', [RequiredIfNetwork('facebook')])
98 | client_secret = StringField('Client Secret', [RequiredIfNetwork('facebook')])
99 |
100 | # Terms
101 | facebook_terms = TextAreaField('Facebook Terms List', [RequiredIfNetwork('facebook')])
102 |
103 | """ Twitter Info """
104 | # Twitter API filter info
105 | api = SelectField(
106 | 'Twitter API Filter',
107 | [RequiredIfNetwork('twitter')],
108 | choices=[('track', 'Track'), ('follow', 'Follow'), ('none', 'None')]
109 | )
110 |
111 | # OAuth Info
112 | consumer_key = StringField('Consumer Key', [RequiredIfNetwork('twitter')])
113 | consumer_secret = StringField('Consumer Secret', [RequiredIfNetwork('twitter')])
114 | access_token = StringField('Access Token', [RequiredIfNetwork('twitter')])
115 | access_token_secret = StringField('Access Token Secret', [RequiredIfNetwork('twitter')])
116 |
117 | # Languages & Location
118 | languages = TextAreaField('Languages (optional)', [Optional()])
119 | locations = TextAreaField('Locations (optional)', [Optional()])
120 |
121 | # Terms
122 | twitter_terms = TextAreaField('Twitter Terms List', [TwitterTermsVal()])
123 |
124 |
125 | class UpdateCollectorForm(Form):
126 | """
127 | Form for updating a collector's details. Terms are handled separately via form prefixes.
128 | """
129 | collector_name = StringField('Collector Name', [Optional()])
130 | new_terms = TextAreaField('New Terms', [Optional()])
131 |
132 | """ Facebook Fields """
133 | collection_type = SelectField(
134 | 'Collection Type',
135 | [Optional()],
136 | choices=[('realtime', 'Real Time'), ('historical', 'Historical')]
137 | )
138 | start_date = DateField('Start Date', [Optional()])
139 | end_date = DateField('End Date', [Optional()])
140 | client_id = StringField('Client ID', [Optional()])
141 | client_secret = StringField('Client Secret', [Optional()])
142 |
143 | """ Twitter Fields """
144 | api = SelectField(
145 | 'Twitter API Filter',
146 | [Optional()],
147 | choices=[('track', 'Track'), ('follow', 'Follow'), ('none', 'None')]
148 | )
149 | consumer_key = StringField('Consumer Key', [Optional()])
150 | consumer_secret = StringField('Consumer Secret', [Optional()])
151 | access_token = StringField('Access Token', [Optional()])
152 | access_token_secret = StringField('Access Token Secret', [Optional()])
153 | languages = TextAreaField('Languages', [Optional()])
154 | locations = TextAreaField('Locations', [Optional()])
155 |
156 |
157 | class UpdateCollectorTermsForm(Form):
158 | """
159 | Form for updating a collector's term details. Rendered multiple times w/ prefixes, along with the UpdateCollectorForm()
160 | """
161 | term = StringField('Term', [Optional()])
162 | collect = SelectField(
163 | 'Collect',
164 | [Optional()],
165 | choices=[(0, 'No'), (1, 'Yes')], coerce=int  # int coercion so the 0/1 values validate
166 | )
167 |
168 |
169 | class ProcessControlForm(Form):
170 | """
171 | A base class for collector start/stop/restart buttons
172 | """
173 | pass
174 |
--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
1 | Installing STACK
2 | =========
3 |
4 | Welcome to STACK! This doc will take you through the installation of STACK and will guide you through the basic setup of a data collection process.
5 |
6 | **_This documentation assumes the following:_**
7 |
8 | * You know how to use ssh.
9 | * Your server has [MongoDB already installed](http://docs.mongodb.org/manual/installation/).
10 | * You understand how to edit files using vim (“vi”).
11 | * You have rights and know how to install Python libraries.
12 |
13 | In addition, this doc is geared towards working on a Linux system (for testing we use Ubuntu). We've tried to link to external documentation where installation diverges if you are using other systems.
14 |
15 | Finally, the instructions below detail how to install STACK. To learn more about STACK semantics, or how to interact with the app in more detail, [refer to our wiki](https://github.com/bitslabsyr/stack/wiki).
16 |
17 | ## Step 1) Download STACK
18 |
19 | First, clone this repo to your local machine:
20 |
21 | sudo git clone https://github.com/bitslabsyr/stack.git
22 |
23 | Next, make sure to install the required Python libraries outlined in the _requirements.txt_ file. We use pip to install and manage dependencies:
24 |
25 | sudo pip install -r requirements.txt
26 |
27 | **Note** - We use Python 2.7.6 for STACK.
28 |
29 | ## Step 2) Configuration & Setup
30 |
31 | STACK is built to work with MongoDB. The app stores most configuration information in Mongo; however, we also use a configuration (.ini) file to manage some parts of the collection process from the Streaming API. Before getting started with STACK, you'll need to do the following:
32 |
33 | * Setup a project account
34 | * Edit the master configuration file
35 | * Create & start a collector
36 |
37 | These steps are detailed below.
38 |
39 | **Project Account Setup**
40 |
41 | TODO - wiki link
42 |
43 | STACK uses "project accounts" to maintain ownership over collection processes. A project account can own multiple collection processes that run concurrently. _To learn more about project accounts and STACK configuration, [see the wiki](#)_.
44 |
45 | After cloning the STACK repo to your local machine, move into the main directory and run the _setup.py_ script:
46 |
47 | cd stack
48 | python setup.py
49 |
50 | The setup script initializes the Mongo database with important configuration information, as well as creates your user account. The script will prompt you for the following information:
51 |
52 | * _Project Name_: A unique account name for your project. STACK calls all login accounts "projects" and allows for multiple projects at once.
53 | * _Password_: A password for your project account.
54 | * _Description_: A short description for your project account.
55 |
56 | If the script returns a successful execution notice, you will be able to start creating and running collection processes for that account. You can rerun the setup.py script to create new accounts.
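57 | 
58 | For illustration only, a hypothetical run might be answered along these lines (all values below are made up, and the exact prompt wording may differ slightly):
59 | 
60 | $ python setup.py
61 | Project Name: my_project
62 | Password: ********
63 | Description: Tweets about my research topic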
57 |
58 | **Creating a Collector**
59 |
60 | Each project account can instantiate multiple **collectors** that will scrape data. A collector is defined as a singular instance that collects data for a specific set of user-provided terms. A project can have multiple collectors running for a given network.
61 |
62 | To create a collector, first run the following command from the main STACK directory:
63 |
64 | python __main__.py db set_collector_detail
65 |
66 | You will then be prompted to provide the following configuration information for the collector:
67 |
68 | * _Project Account Name_ (required): The name of your project account.
69 | * _Collector Name_ (required): Non-unique name to identify your collector instance.
70 | * _Language(s)_ (optional): A list of [BCP-47](http://tools.ietf.org/html/bcp47) language codes. If this is used, the collector will only grab tweets in these languages. [Learn more here](https://dev.twitter.com/streaming/overview/request-parameters#language) about Twitter language parameters.
71 | * _Location(s)_ (optional): A list of location coordinates. If used, we will collect all geocoded tweets within the location bounding box. Bounding boxes must consist of four lat/long pairs. [Learn more here](https://dev.twitter.com/streaming/overview/request-parameters#locations) about location formatting for the Twitter API.
72 | * _Terms_ (optional): A line item list of terms for the collector to stream.
73 | * _API_ (required): Three options: track, follow, or none. Each collector can stream from one part of Twitter's Streaming API:
74 | * **Track**: Collects all mentions (hashtags included) for a given list of terms.
75 | * **Follow**: Collects all tweets, retweets, and replies for a given user handle. Each term must be a valid Twitter screen name.
76 | * **None**: Only choose this option if you have not provided a terms list and are collecting for a given set of language(s) and/or location(s). If you do not track a terms list, make sure you are tracking at least one language or location.
77 | * _OAuth Information_: Four keys used to authenticate with the Twitter API. To get consumer & access tokens, first register your app on [https://dev.twitter.com/apps/new](https://dev.twitter.com/apps/new). Navigate to Keys and Access Tokens and click "Create my access token." **NOTE** - Each collector needs to have a unique set of access keys, or else the Streaming API will limit your connection. The four keys include:
78 | * Consumer Key
79 | * Consumer Secret
80 | * Access Token
81 | * Access Token Secret
82 |
83 | _A note on location tracking_: Location tracking with Twitter is an OR filter. We will collect all tweets that match other filters (such as a terms list or a language identifier) OR tweets in the given location. Please plan accordingly.
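84 | 
85 | For illustration only, a collector that tracks two hashtags in English might be configured with values like these (every value below is hypothetical):
86 | 
87 | Project Account Name: my_project
88 | Collector Name: election-tracker
89 | Language(s): en
90 | Location(s): (left blank)
91 | Terms: #election
92 | #vote
93 | API: track
94 | OAuth keys: your four unique app credentials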
84 |
85 | **Config File**
86 |
87 | As of v1.0, most configuration information has been moved away from .ini files and into Mongo. However, we still use the config file to maintain rollover rates for data collection. First, open the config file:
88 |
89 | sudo vi ./app/twitter/platform.ini
90 |
91 | Edit the following key line items:
92 |
93 | * _tweets_file_date_frmt_: The rollover rate for the collection file (minutes, hours, or days). By default it is set to hours, our suggested rate for production use (see the sketch below).
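94 | 
95 | For reference, the relevant section of _platform.ini_ might look like the sketch below. The section and key names come from the processing scripts; the values shown are only an example, since `tweets_file_date_frmt` is a Python strftime pattern (an hour-granularity pattern gives the hourly rollover suggested above):
96 | 
97 | [files]
98 | # rollover granularity for raw collection files (a strftime pattern)
99 | tweets_file_date_frmt = %Y%m%d-%H
100 | # common suffix shared by all raw tweet files
101 | tweets_file = tweets.json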
94 |
95 | ## Step 3) Starting STACK
96 |
97 | There are three processes to start to have STACK running in full: collector, processor, and inserter. Multiple collectors can run at the same time; however, STACK itself can operate without an instance of every process running (see the flow sketch below).
98 |
99 | * _Collectors_: A specific collector used to scrape data for a given set of filters. Multiple can be created/run for each project account.
100 | * _Processors_: A process that parses and processes the raw tweet files written by a collector. Only one processor can be run for a given project account.
101 | * _Inserters_: A process that takes processed tweets and inserts them into MongoDB. Only one inserter can be run for a given project account.
102 |
103 | TODO - wiki
104 |
105 | To learn more about STACK processes and architecture, [please consult our wiki](#).
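106 | 
107 | At a high level, data flows through the three processes in sequence: a collector writes raw tweet files to disk, a processor turns each raw file into a processed file in an insert queue, and an inserter reads that queue and writes to MongoDB:
108 | 
109 | collector -> raw files -> processor -> insert queue -> inserter -> MongoDB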
106 |
107 | **Starting a Collector**
108 |
109 | To start a collector, you'll need to pass both a project_id and collector_id to STACK via the console. First, get your project account's ID:
110 |
111 | $ python __main__.py db auth [project_name] [password]
112 | {"status": 1, "message": "Success", "project_id": "your_id_value"}
113 |
114 | Then, using the project_id returned above, find a list of your collectors and their ID values:
115 |
116 | $ python __main__.py db get_collector_ids [project_id]
117 | {"status": 1, "collectors": [{"collector_name": [your_collector_name], "collector_id": [your_collector_id]}]}
118 |
119 | Finally, using the project_id and collector_id values returned above, start the given collector for the project account of your choice:
120 |
121 | sudo python __main__.py controller collect start [project_id] [collector_id]
122 |
123 | Your collector is now running!
124 |
125 | **Starting a Processor**
126 |
127 | To start a processor, the syntax is very similar to the collector start command above. Here, though, you pass a project account ID and a network name instead of a collector ID:
128 |
129 | sudo python __main__.py controller process start [project_id] twitter
130 |
131 | Your processor is now running!
132 |
133 | **Starting an Inserter**
134 |
135 | To start an inserter, follow the syntax for starting a processor, but call the "insert" command instead:
136 |
137 | sudo python __main__.py controller insert start [project_id] twitter
138 |
139 | Your inserter is now running!
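140 | 
141 | **Stopping a Process**
142 | 
143 | Each process type also accepts "stop" and "restart" commands with the same syntax (the controller's usage string is `controller collect|process|insert start|stop|restart project_id collector_id`). For example, to stop the collector started above:
144 | 
145 | sudo python __main__.py controller collect stop [project_id] [collector_id]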
140 |
141 |
--------------------------------------------------------------------------------
/app/twitter/preprocess.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------------
2 | # Name: module1
3 | # Purpose:
4 | #
5 | # Author: jhemsley
6 | #
7 | # Created: 09/10/2013
8 | # Copyright: (c) jhemsley 2013
9 | # Licence:
10 | #-------------------------------------------------------------------------------
11 | import os
12 | import os.path
13 | import ConfigParser
14 | from pymongo import Connection
15 | import datetime
16 | import logging
17 | import logging.config
18 | import sys
19 | import time
20 | from email.utils import parsedate_tz
21 | import glob
22 | import simplejson
23 | import hashlib
24 | import string
25 | from collections import defaultdict
26 | import re
27 | import traceback
28 | import shutil
29 | import tweetprocessing
30 |
31 | from . import module_dir
32 | from app.models import DB
33 |
34 | PLATFORM_CONFIG_FILE = module_dir + '/platform.ini'
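35 | # When True, process_tweet() also attempts legacy URL-code lookups via
36 | # ck_coded_url() in tweetprocessing.py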
35 | EXPAND_URLS = False
36 |
37 | #connect to mongo
38 | db = DB()
39 |
40 | # function goes out and gets a list of raw tweet data files
41 | # TODO - by project
42 | def get_tweet_file_queue(Config, rawdir):
43 |
44 | tweetsOutFilePath = rawdir + '/'
45 | if not os.path.exists(tweetsOutFilePath):
46 | os.makedirs(tweetsOutFilePath)
47 | tweetsOutFileDateFrmt = Config.get('files', 'tweets_file_date_frmt', 0)
48 | tweetsOutFile = Config.get('files', 'tweets_file', 0)
49 |
50 | # make a pattern of the tweet files we hope to find
51 | tweetFileNamePattern = tweetsOutFilePath + '*' + tweetsOutFile
52 | #print tweetFileNamePattern
53 |
54 | # now get a list of the files in tweet dir that match the pattern
55 | tweetsFileList = glob.glob(tweetFileNamePattern)
56 | # note that the dir list may have '\\' as the dir separator; normalize to '/'
57 | tweetsFileList = [s.replace('\\', '/') for s in tweetsFileList]
58 |
59 | # now we don't want a file in the list if it is the one tweets are being added to
60 | # this is a timestamp using the format in the platform config file. if a tweet file is
61 | # in use, we will remove it from the list
62 |
63 | # Remove by time now since we can run two collector threads
64 | timestr = time.strftime(tweetsOutFileDateFrmt)
65 | currentTweetFileNamePattern = tweetsOutFilePath + timestr + '*'
66 | currentTweetFileList = glob.glob(currentTweetFileNamePattern)
67 | currentTweetFileList = [s.replace('\\', '/') for s in currentTweetFileList]
68 |
69 | # this line removes the current live file from the list
70 | for item in currentTweetFileList:
71 | if item in tweetsFileList: tweetsFileList.remove(item)
72 |
73 | return tweetsFileList
74 |
75 | def get_processed_tweets_file_name(Config, rawTweetsFile, rawdir, archdir):
76 |
77 | tweetsOutFilePath = rawdir + '/'
78 | tweet_archive_dir = archdir + '/'
79 | if not os.path.exists(tweet_archive_dir):
80 | os.makedirs(tweet_archive_dir)
81 |
82 | processed_tweets_file = rawTweetsFile.replace(tweetsOutFilePath, tweet_archive_dir)
83 | file_extension = os.path.splitext(rawTweetsFile)[1]
84 | processed_tweets_file = processed_tweets_file.replace(file_extension, '_processed' + file_extension)
85 |
86 | return processed_tweets_file
87 |
88 | def queue_up_processed_tweets(Config, processed_tweets_file, logger, archdir, insertdir):
89 |
90 | tweet_archive_dir = archdir + '/'
91 | tweet_insert_queue_path = insertdir + '/'
92 | if not os.path.exists(tweet_insert_queue_path):
93 | os.makedirs(tweet_insert_queue_path)
94 |
95 | queued_up_tweets_file = processed_tweets_file.replace(tweet_archive_dir, tweet_insert_queue_path)
96 |
97 | shutil.copyfile(processed_tweets_file, queued_up_tweets_file)
98 |
99 | #os.symlink(processed_tweets_file, queued_up_tweets_file)
100 |
101 | logger.info('Queued up %s to %s' % (processed_tweets_file, queued_up_tweets_file))
102 |
103 | def archive_processed_file(Config, rawTweetsFile, logger, rawdir, archdir):
104 |
105 | tweetsOutFilePath = rawdir + '/'
106 | tweet_archive_dir = archdir + '/'
107 | if not os.path.exists(tweet_archive_dir):
108 | os.makedirs(tweet_archive_dir)
109 |
110 | archive_raw_tweets_file = rawTweetsFile.replace(tweetsOutFilePath, tweet_archive_dir)
111 |
112 | shutil.move(rawTweetsFile, archive_raw_tweets_file)
113 |
114 | logger.info('Moved %s to %s' % (rawTweetsFile, archive_raw_tweets_file))
115 |
116 |
117 | def go(project_id, rawdir, archdir, insertdir, logdir):
118 | # Connects to project account DB
119 | project = db.get_project_detail(project_id)
120 | project_name = project['project_name']
121 |
122 | configdb = project['project_config_db']
123 | conn = db.connection[configdb]
124 | project_config_db = conn.config
125 |
126 | # Reference for controller if script is active or not.
127 | project_config_db.update({'module': 'twitter'}, {'$set': {'processor_active': 1}})
128 |
129 | Config = ConfigParser.ConfigParser()
130 | Config.read(PLATFORM_CONFIG_FILE)
131 |
132 | # Creates logger w/ level INFO
133 | logger = logging.getLogger('preprocess')
134 | logger.setLevel(logging.INFO)
135 | # Creates rotating file handler w/ level INFO
136 | fh = logging.handlers.TimedRotatingFileHandler(logdir + '/' + project_name + '-processor-log-' + project_id + '.out', 'D', 1, 30, None, False, False)
137 | fh.setLevel(logging.INFO)
138 | # Creates formatter and applies to rotating handler
139 | format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
140 | datefmt = '%m-%d %H:%M'
141 | formatter = logging.Formatter(format, datefmt)
142 | fh.setFormatter(formatter)
143 | # Finishes by adding the rotating, formatted handler
144 | logger.addHandler(fh)
145 |
146 | logger = logging.getLogger('preprocess')
147 | logger.info('Starting preprocess system')
148 |
149 | if not os.path.exists(rawdir + '/error_tweets/'):
150 | os.makedirs(rawdir + '/error_tweets/')
151 |
152 | error_tweet = open(rawdir + '/error_tweets/error_tweet-' + project_name + '-' + project_id + '.txt', 'a')
153 |
154 | module_config = project_config_db.find_one({'module': 'twitter'})
155 | runPreProcessor = module_config['processor']['run']
156 |
157 | if runPreProcessor:
158 | print 'Starting runPreProcessor'
159 | logger.info('Preprocess start signal')
160 | runLoopSleep = 0
161 |
162 | while runPreProcessor:
163 |
164 | # Get all terms for all collectors
165 | track_list = []
166 | for collector in project['collectors']:
167 | if collector['terms_list']:
168 | tmp_terms = [term['term'] for term in collector['terms_list']]
169 | track_list += tmp_terms
170 |
171 | if track_list:
172 | track_list = list(set(track_list))
173 |
174 | tweetsFileList = get_tweet_file_queue(Config, rawdir)
175 | files_in_queue = len(tweetsFileList)
176 |
177 | if files_in_queue < 1:
178 | time.sleep( 180 )
179 | else:
180 | logger.info('Queue length is %d' % files_in_queue)
181 | rawTweetsFile = tweetsFileList[0]
182 | logger.info('Preprocess raw file: %s' % rawTweetsFile)
183 |
184 | processed_tweets_file = get_processed_tweets_file_name(Config, rawTweetsFile, rawdir, archdir)
185 |
186 | # TODO - Dynamic copy time
187 | # lame workaround, but for now we assume it will take less than a minute to
188 | # copy a file so this next sleep is here to wait for a copy to finish on the
189 | # off chance that we happen to see it just as it is being copied to the directory
190 | time.sleep( 60 )
191 |
192 | f_out = open(processed_tweets_file,'w')
193 |
194 | tweets_list = []
195 | tweet_total = 0
196 | lost_tweets = 0
197 | line_number = 0
198 |
199 | with open(rawTweetsFile) as f:
200 | if '-delete-' not in rawTweetsFile and '-streamlimits-' not in rawTweetsFile:
201 | for line in f:
202 | try:
203 | line_number += 1
204 | line = line.strip()
205 |
206 | tweet_out_string = tweetprocessing.process_tweet(line, track_list, expand_url=EXPAND_URLS)
207 | f_out.write(tweet_out_string)
208 | tweet_total += 1
209 | # print tweet_out_string
210 |
211 | except ValueError, e:
212 | lost_tweets += 1
213 | print "ValueError. tweet not processed: %d (%s)" % (line_number, rawTweetsFile)
214 | logger.warning("tweet not processed: %d (%s)" % (line_number, rawTweetsFile))
215 | logging.exception(e)
216 | error_tweet.write(line+"\n")
217 | print traceback.format_exc()
218 | pass
219 | except TypeError, e:
220 | lost_tweets += 1
221 | print "TypeError. tweet not processed: %d (%s)" % (line_number, rawTweetsFile)
222 | logger.warning("tweet not processed: %d (%s)" % (line_number, rawTweetsFile))
223 | logging.exception(e)
224 | error_tweet.write(line+"\n")
225 | print traceback.format_exc()
226 | pass
227 | except KeyError, e:
228 | lost_tweets += 1
229 | print "KeyError. tweet not processed: %d (%s)" % (line_number, rawTweetsFile)
230 | logger.warning("tweet not processed: %d (%s)" % (line_number, rawTweetsFile))
231 | logging.exception(e)
232 | error_tweet.write(line+"\n")
233 | print traceback.format_exc()
234 | pass
235 | elif '-streamlimits-' in rawTweetsFile:
236 | server_name = os.uname()[1]
237 | try:
238 | collector_id = rawTweetsFile.split('-')[6]
239 | collector = db.get_collector_detail(project_id=project_id, collector_id=collector_id)
240 | col_type = collector['collector']['api']
241 | except Exception:
242 | col_type = 'UNDEFINED'
243 | for line in f:
244 | line = line.strip()
245 | limit_out_string = tweetprocessing.process_limit(line, col_type, server_name, project_name, project_id, collector_id)
246 | f_out.write(limit_out_string)
247 |
248 |
249 | f_out.close()
250 | f.close()
251 |
252 | logger.info('Tweets processed: %d, lost: %d' % (tweet_total, lost_tweets))
253 |
254 | archive_processed_file(Config, rawTweetsFile, logger, rawdir, archdir)
255 | queue_up_processed_tweets(Config, processed_tweets_file, logger, archdir, insertdir)
256 |
257 | # Incrementally delays reconnect if Mongo is offline
258 | exception = None
259 | max_sleep_time = 1800
260 | try:
261 | module_config = project_config_db.find_one({'module': 'twitter'})
262 | runPreProcessor = module_config['processor']['run']
263 | # If Mongo is unavailable, back off the processing loop by an extra 2 sec;
264 | # the delay grows until the connection is re-established.
265 | except Exception, exception:
266 | print 'Mongo connection for preprocessor refused with exception: %s' % exception
267 | logger.error('Mongo connection for preprocessor refused with exception: %s' % exception)
268 | runLoopSleep += 2
269 | time.sleep(runLoopSleep)
270 |
271 | error_tweet.close()
272 | logger.info('Exiting preprocessor Program...')
273 | print 'Exiting preprocessor Program...'
274 |
275 | # Reference for controller if script is active or not.
276 | project_config_db.update({'module': 'twitter'}, {'$set': {'processor_active': 0}})
277 |
278 |
279 |
280 |
281 |
--------------------------------------------------------------------------------
/app/twitter/tweetprocessing.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions to process tweet metadata:
3 | expand URLs, add counts of urls/hashtags/mentions, and lists of hashtags/mentions.
4 | """
5 |
6 | import sys
7 | from datetime import datetime, timedelta
8 | import time
9 | from email.utils import parsedate_tz
10 | import simplejson
11 | import re
12 | import hashlib
13 | import string
14 | from collections import defaultdict
15 | import traceback
16 |
17 |
18 | # Parse a Twitter created_at datestring and turn it into a Python datetime object
19 | def to_datetime(datestring):
20 | time_tuple = parsedate_tz(datestring.strip())
21 | dt = datetime(*time_tuple[:6])
22 | return dt
23 |
24 | def ck_coded_url(urlstring):
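25 | # NOTE: legacy helper. It relies on a module-level DB cursor `cur` that is not
26 | # defined in this module, and it is only reached when process_tweet() is
27 | # called with expand_url=True.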
25 | cur.execute("""select code, hashtag from tweets_sample_test where url = %s and hashtag in ('ows','occupyoakland','occupyseattle') and date(created_at) between '2011-10-19' and '2012-04-30' and spike is null""", urlstring.encode("utf-8"))
26 | result = cur.fetchone()
27 | if result:
28 | return result
29 | else:
30 | return None
31 |
32 | def process_limit(line, col_type, server_name, project_name, project_id, collector_id):
33 |
34 | line = simplejson.loads(line)
35 | limit = {'collection_type': col_type,
36 | 'server_name': server_name,
37 | 'project_name': project_name,
38 | 'project_id': project_id,
39 | 'collector_id': collector_id,
40 | 'lost_count': line['limit']['track'],
41 | 'time': line['limit']['time'],
42 | 'timestamp_ms': line['limit']['timestamp_ms'],
43 | 'notified': False}
44 |
45 | limit_out_string = simplejson.dumps(limit) + '\n'
46 | return limit_out_string
47 |
48 | def process_tweet(line, track_list, expand_url=False):
49 |
50 | #regular expression to delete emojis
51 | emoji_pattern = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
52 | # List of punct to remove from string for track keyword matching
53 | punct = re.escape('!"$%&\'()*+,-./:;<=>?@[\\]^`{|}~')
54 |
55 |
56 | tweet = simplejson.loads(line)
57 |
58 | # Initialize stack_vars
59 | tweet['stack_vars'] = { 'text_hash': None,
60 | 'created_ts': None,
61 | 'hashtags': [],
62 | 'mentions': [],
63 | 'codes': [],
64 | 'track_kw': {},
65 | 'entities_counts': {},
66 | 'user': {}}
67 |
68 | # Initialize track_kw
69 | tweet['stack_vars']["track_kw"] = { "org_tweet" : {},
70 | "rt_tweet" : {},
71 | "qt_tweet" : {}}
72 |
73 |
74 | if track_list:
75 | track_set = set(track_list)
76 | else:
77 | track_set = None
78 |
79 | if tweet.get('retweeted_status'):
80 | if tweet.get('retweeted_status').get('extended_tweet'):
81 | tweet_type = 'long_retweet'
82 | full_tweet = tweet['retweeted_status']['extended_tweet']
83 | full_tweet_text = full_tweet['full_text']
84 | elif not tweet.get('retweeted_status').get('extended_tweet'):
85 | tweet_type = 'short_retweet'
86 | full_tweet = tweet['retweeted_status']
87 | full_tweet_text = full_tweet['text']
88 | elif tweet.get('extended_tweet'):
89 | tweet_type = 'long_tweet'
90 | full_tweet = tweet['extended_tweet']
91 | full_tweet_text = full_tweet['full_text']
92 | else:
93 | tweet_type = 'short_tweet'
94 | full_tweet = tweet
95 | full_tweet_text = full_tweet['text']
96 |
97 | tweet['stack_vars']['tweet_type'] = tweet_type
98 |
99 | hashtag_num = 0
100 | tweet['stack_vars']['hashtags'] = []
101 | tweet['stack_vars']['mentions'] = []
102 | tweet['stack_vars']['codes'] = []
103 |
104 | if 'hashtags' in full_tweet['entities']:
105 | hashtag_num = len(full_tweet['entities']['hashtags'])
106 | for index in range(len(full_tweet['entities']['hashtags'])):
107 | tweet['stack_vars']['hashtags'].append(full_tweet['entities']['hashtags'][index]['text'].lower())
108 |
109 | urls_num = 0
110 | coded_url_num = 0
111 | urls = []
112 |
113 | if 'urls' in full_tweet['entities']:
114 | urls_num = len(full_tweet['entities']['urls'])
115 |
116 | if expand_url:
117 | for urls in full_tweet['entities']['urls']:
118 | url_code = None
119 | if 'long-url' in urls and urls['long-url'] is not None:
120 | url_code = ck_coded_url(urls['long-url'])
121 | elif "expanded_url" in urls and urls['expanded_url'] is not None:
122 | url_code = ck_coded_url(urls['expanded_url'])
123 | elif "url" in urls:
124 | url_code = ck_coded_url(urls['url'])
125 |
126 | if url_code:
127 | urls['code'] = url_code[0]
128 | urls['hashtag'] = url_code[1]
129 | tweet['stack_vars']['codes'].append(url_code[0])
130 |
131 | coded_url_num = len(tweet['stack_vars']['codes'])
132 |
133 | mentions_num = 0
134 | if "user_mentions" in full_tweet['entities']:
135 | mentions_num = len(full_tweet['entities']['user_mentions'])
136 | for index in range(len(full_tweet['entities']['user_mentions'])):
137 | if "screen_name" in full_tweet['entities']['user_mentions'][index]:
138 | tweet['stack_vars']['mentions'].append(full_tweet['entities']['user_mentions'][index]['screen_name'].lower())
139 |
140 | tweet['stack_vars']['entities_counts'] = { 'urls': urls_num,
141 | 'hashtags': hashtag_num,
142 | 'user_mentions': mentions_num,
143 | 'coded_urls': coded_url_num }
144 |
145 | tweet['stack_vars']['hashtags'].sort()
146 | tweet['stack_vars']['mentions'].sort()
147 |
148 | tweet['stack_vars']['full_tweet_text'] = full_tweet_text
149 | tweet['stack_vars']['text_hash'] = hashlib.md5(full_tweet_text.encode("utf-8")).hexdigest()
150 |
151 |
152 | if track_set:
153 |
154 | myURLs = []
155 | for index in range(len(full_tweet['entities']['urls'])):
156 | myURLs.append(full_tweet['entities']['urls'][index]['expanded_url'].lower())
157 |
158 | hashTags_set = set([x.lower() for x in tweet['stack_vars']['hashtags']])
159 | mentions_set = set([x.lower() for x in tweet['stack_vars']['mentions']])
160 |
161 | track_set = set([x.lower() for x in track_set])
162 | tweet['stack_vars']["track_kw"]["org_tweet"]["hashtags"] = list(set(hashTags_set).intersection(track_set))
163 | tweet['stack_vars']["track_kw"]["org_tweet"]["mentions"] = list(set(mentions_set).intersection(track_set))
164 |
165 | tweet_text = re.sub('[%s]' % punct, ' ', tweet['text'])
166 | tweet_text = emoji_pattern.sub(r'', tweet_text)
167 | tweet_text = tweet_text.lower().split()
168 |
169 | tweet['stack_vars']["track_kw"]["org_tweet"]["text"] = list(set(tweet_text).intersection(track_set))
170 |
171 | tmpURLs = []
172 | for url in myURLs:
173 | for x in track_set:
174 | if x in url:
175 | tmpURLs.append(url)
176 | tweet['stack_vars']["track_kw"]["org_tweet"]["urls"] = list(tmpURLs)
177 |
178 |
179 | # Convert dates 2012-09-22 00:10:46
180 | # Note that we convert these to a datetime object and then convert back to string
181 | # and update the tweet with the new string. We do this because we want to find
182 | # and log any process issues here, not when we do an insert.
183 | #
184 | #tweet['created_ts'] = to_datetime(tweet['created_at'])
185 | #tweet['user']['created_ts'] = to_datetime(tweet['user']['created_at'])
186 | t = to_datetime(tweet['created_at'])
187 | tweet['stack_vars']['created_ts'] = t.strftime('%Y-%m-%d %H:%M:%S')
188 |
189 | t = to_datetime(tweet['user']['created_at'])
190 | tweet['stack_vars']['user']['created_ts'] = t.strftime('%Y-%m-%d %H:%M:%S')
191 |
192 | '''
193 | #check if we have a quoted tweet and if it is truncated
194 | if 'quoted_status' in tweet:
195 | if tweet['quoted_status']['truncated']== True :
196 | qt_hashtags = []
197 | qt_mentions = []
198 | qt_urls = []
199 |
200 | for index in range(len(tweet['quoted_status']['extended_tweet']['entities']['hashtags'])):
201 | qt_hashtags.append(tweet['quoted_status']['extended_tweet']['entities']['hashtags'][index]['text'].lower())
202 |
203 | for index in range(len(tweet['quoted_status']['extended_tweet']['entities']['user_mentions'])):
204 | qt_mentions.append(tweet['quoted_status']['extended_tweet']['entities']['user_mentions'][index]['screen_name'].lower())
205 |
206 | for index in range(len(tweet['quoted_status']['extended_tweet']['entities']['urls'])):
207 | qt_urls.append(tweet['quoted_status']['extended_tweet']['entities']['urls'][index]['expanded_url'].lower())
208 |
209 |
210 | if track_set:
211 | qt_hashtags = set([x.lower() for x in qt_hashtags])
212 | qt_mentions = set([x.lower() for x in qt_mentions])
213 | track_set = set([x.lower() for x in track_set])
214 |
215 | tweet['stack_vars']["track_kw"]["qt_tweet"]["hashtags"] = list(set(qt_hashtags).intersection(track_set))
216 | tweet['stack_vars']["track_kw"]["qt_tweet"]["mentions"] = list(set(qt_mentions).intersection(track_set))
217 |
218 | qt_text = re.sub('[%s]' % punct, ' ', tweet['quoted_status']['extended_tweet']['full_text'])
219 | qt_text = emoji_pattern.sub(r'', qt_text)
220 | qt_text = qt_text.lower().split()
221 |
222 | tweet['stack_vars']["track_kw"]["qt_tweet"]["text"] = list(set(qt_text).intersection(track_set))
223 |
224 | tmpURLs = []
225 | for url in qt_urls:
226 | for x in track_set:
227 | if x in url:
228 | tmpURLs.append(url)
229 |
230 | tweet['stack_vars']["track_kw"]["qt_tweet"]["urls"] = list(tmpURLs)
231 |
232 | #Check if we have a quoted tweet and it is not truncated
233 | elif tweet['quoted_status']['truncated'] == False :
234 |
235 | qt_hashtags = []
236 | qt_mentions = []
237 | qt_urls = []
238 |
239 | for index in range(len(tweet['quoted_status']['entities']['hashtags'])):
240 | qt_hashtags.append(tweet['quoted_status']['entities']['hashtags'][index]['text'].lower())
241 |
242 | for index in range(len(tweet['quoted_status']['entities']['user_mentions'])):
243 | qt_mentions.append(tweet['quoted_status']['entities']['user_mentions'][index]['screen_name'].lower())
244 |
245 | for index in range(len(tweet['quoted_status']['entities']['urls'])):
246 | qt_urls.append(tweet['quoted_status']['entities']['urls'][index]['expanded_url'].lower())
247 |
248 |
249 | if track_set:
250 | qt_hashtags = set([x.lower() for x in qt_hashtags])
251 | qt_mentions = set([x.lower() for x in qt_mentions])
252 | track_set = set([x.lower() for x in track_set])
253 |
254 | tweet['stack_vars']["track_kw"]["qt_tweet"]["hashtags"] = list(set(qt_hashtags).intersection(track_set))
255 | tweet['stack_vars']["track_kw"]["qt_tweet"]["mentions"] = list(set(qt_mentions).intersection(track_set))
256 |
257 | qt_text = re.sub('[%s]' % punct, ' ', tweet['quoted_status']['text'])
258 | qt_text = emoji_pattern.sub(r'', qt_text)
259 | qt_text = qt_text.lower().split()
260 |
261 | tweet['stack_vars']["track_kw"]["qt_tweet"]["text"] = list(set(qt_text).intersection(track_set))
262 |
263 | tmpURLs = []
264 | for url in qt_urls:
265 | for x in track_set:
266 | if x in url:
267 | tmpURLs.append(url)
268 | tweet['stack_vars']["track_kw"]["qt_tweet"]["urls"] = list(tmpURLs)
269 | '''
270 |
271 |
272 | tweet_out_string = simplejson.dumps(tweet).encode('utf-8') + '\n'
273 |
274 | return tweet_out_string
--------------------------------------------------------------------------------
/app/twitter/mongoBatchInsert.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------------------
2 | # Name: module1
3 | # Purpose:
4 | #
5 | # Author: jhemsley
6 | #
7 | # Created: 09/10/2013
8 | # Copyright: (c) jhemsley 2013
9 | # Licence:
10 | #-----------------------------------------------------------------------------
11 |
12 |
13 | import os.path
14 | #import json
15 | import ConfigParser
16 | #import datetime
17 | from datetime import datetime, timedelta
18 | import time
19 | import logging
20 | import logging.config
21 | import time
22 | import glob
23 | import simplejson
24 | from email.utils import parsedate_tz
25 | from collections import defaultdict
26 | import sys
27 | import traceback
28 | import string
29 | import config
30 | from pymongo import errors as PymongoErrors
31 |
32 | from . import module_dir
33 | from app.models import DB
34 |
35 | PLATFORM_CONFIG_FILE = module_dir + '/platform.ini'
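36 | # number of processed tweets accumulated before each bulk insert call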
36 | BATCH_INSERT_SIZE = 1000
37 |
38 | db = DB()
39 | db_central = DB(local=False)
40 |
41 | # function goes out and gets a list of processed tweet data files waiting to be inserted
42 | def get_processed_tweet_file_queue(Config, insertdir):
43 |
44 | insert_queue_path = insertdir + '/'
45 | if not os.path.exists(insert_queue_path):
46 | os.makedirs(insert_queue_path)
47 |
48 | tweetsOutFile = Config.get('files', 'tweets_file', 0)
49 | file_extension = os.path.splitext(tweetsOutFile)[1]
50 | tweetFileNamePattern = tweetsOutFile.replace(file_extension, '_processed' + file_extension)
51 | tweetFileNamePattern = insert_queue_path + '*' + tweetFileNamePattern
52 |
53 | # now get a list of the files in tweet dir that match the pattern
54 | insert_queue_file_list = glob.glob(tweetFileNamePattern)
55 | # note that the dir list may have '\\' as the dir separator; normalize to '/'
56 | final_insert_queue_file_list = [s.replace('\\', '/') for s in insert_queue_file_list]
57 |
58 | return final_insert_queue_file_list
59 |
60 |
61 | # inserts a list of tweets into the given mongo collection; returns the list of inserted ids
62 | def insert_tweet_list(mongoCollection, tweets_list, line_number, processedTweetsFile, data_db, logger):
63 |
64 | inserted_ids_list = []
65 | # mongo_error_code = -1
66 | try:
67 | # this call returns a list of ids
68 | inserted_ids_list = mongoCollection.insert(tweets_list, continue_on_error=True)
69 | #mongo_error_code = mongoCollection.error()
70 | mongo_error_code = data_db.error()
71 |
72 | if mongo_error_code is not None:
73 | logger.warning("Error %d on mongo insert for (%s)" % (mongo_error_code, processedTweetsFile))
74 |
75 | except ValueError, e:
76 | print "Exception during mongo insert"
77 | logger.warning("Exception during mongo insert at or before file line number %d (%s)" % (line_number, processedTweetsFile))
78 | logging.exception(e)
79 | print traceback.format_exc()
80 | pass
81 |
82 | except PymongoErrors.DuplicateKeyError, e:
83 | print "Exception during mongo insert"
84 | logger.warning("Duplicate error during mongo insert at or before file line number %d (%s)" % (line_number, processedTweetsFile))
85 | logging.exception(e)
86 | print traceback.format_exc()
87 | pass
88 |
89 | return inserted_ids_list
90 |
91 | # Parse a Twitter created_at datestring and turn it into a Python datetime object
92 | def to_datetime(datestring):
93 | time_tuple = parsedate_tz(datestring.strip())
94 | dt = datetime(*time_tuple[:6])
95 | return dt
96 |
97 | def go(project_id, rawdir, insertdir, logdir):
98 | # Connects to project account DB
99 | project = db.get_project_detail(project_id)
100 | project_name = project['project_name']
101 |
102 | configdb = project['project_config_db']
103 | conn = db.connection[configdb]
104 | project_config_db = conn.config
105 |
106 | # Reference for controller if script is active or not.
107 | project_config_db.update({'module': 'twitter'}, {'$set': {'inserter_active': 1}})
108 |
109 | Config = ConfigParser.ConfigParser()
110 | Config.read(PLATFORM_CONFIG_FILE)
111 |
112 | # Creates logger w/ level INFO
113 | logger = logging.getLogger('mongo_insert')
114 | logger.setLevel(logging.INFO)
115 | # Creates rotating file handler w/ level INFO
116 | fh = logging.handlers.TimedRotatingFileHandler(logdir + '/' + project_name + '-inserter-log-' + project_id + '.out', 'D', 1, 30, None, False, False)
117 | fh.setLevel(logging.INFO)
118 | # Creates formatter and applies to rotating handler
119 | format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
120 | datefmt = '%m-%d %H:%M'
121 | formatter = logging.Formatter(format, datefmt)
122 | fh.setFormatter(formatter)
123 | # Finishes by adding the rotating, formatted handler
124 | logger.addHandler(fh)
125 |
126 | logger.info('Starting process to insert processed tweets in mongo')
127 |
128 | if not os.path.exists(rawdir + '/error_inserted_tweets/'):
129 | os.makedirs(rawdir + '/error_inserted_tweets/')
130 |
131 | error_tweet = open(rawdir + '/error_inserted_tweets/error_inserted_tweet-' + project_name + '-' + project_id + '.txt', 'a')
132 |
133 | db_name = project_name + '_' + project_id
134 | data_db = db.connection[db_name]
135 | insert_db = data_db.tweets
136 |
137 | data_db_central = db_central.connection[config.CT_DB_NAME]
138 |
139 | delete_db = db.connection[db_name + '_delete']
140 | deleteCollection = delete_db['tweets']
141 |
142 | module_config = project_config_db.find_one({'module': 'twitter'})
143 | runMongoInsert = module_config['inserter']['run']
144 |
145 | while runMongoInsert:
146 | queued_tweets_file_list = get_processed_tweet_file_queue(Config, insertdir)
147 | num_files_in_queue = len(queued_tweets_file_list)
148 | #logger.info('Queue length %d' % num_files_in_queue)
149 |
150 | # TODO - end on zero?
151 | if (num_files_in_queue == 0):
152 | time.sleep( 180 )
153 | else:
154 |
155 | processedTweetsFile = queued_tweets_file_list[0]
156 | logger.info('Mongo insert file found: %s' % processedTweetsFile)
157 |
158 | tweets_list = []
159 | tweet_total = 0
160 | lost_tweets = 0
161 | line_number = 0
162 | deleted_tweets = 0
163 | deleted_tweets_list = []
164 | stream_limit_notices = 0
165 | stream_limits_list = []
166 |
167 | # lame workaround, but for now we assume it will take less than a minute to
168 | # copy a file so this next sleep is here to wait for a copy to finish on the
169 | # off chance that we happen to see it just as it is being copied to the directory
170 | time.sleep( 60 )
171 |
172 | with open(processedTweetsFile) as f:
173 | logger.info(processedTweetsFile)
174 | for line in f:
175 | if '-delete-' not in processedTweetsFile and '-streamlimits-' not in processedTweetsFile:
176 | try:
177 | line_number += 1
178 | line = line.strip()
179 |
180 | # print line_number
181 |
182 | tweet = simplejson.loads(line)
183 |
184 | # use tweet id as mongo id
185 | #tweet['_id'] = tweet['id']
186 |
187 | # now, when we did the process tweet step we already worked with
188 | # these dates. If they failed before, they shouldn't fail now, but
189 | # if they do we are going to skip this tweet and go on to the next one
190 | t = to_datetime(tweet['created_at'])
191 | tweet['created_ts'] = t
192 |
193 | t = to_datetime(tweet['user']['created_at'])
194 | tweet['user']['created_ts'] = t
195 |
196 | tweets_list.append(tweet)
197 |
198 | except ValueError, e:
199 | lost_tweets += 1
200 | print "ValueError while converting date. tweet not processed: %d (%s)" % (line_number, processedTweetsFile)
201 | logger.warning("ValueError while converting date. tweet not processed: %d (%s)" % (line_number, processedTweetsFile))
202 | logging.exception(e)
203 | error_tweet.write(line+"\n")
204 | print traceback.format_exc()
205 | pass
206 | except TypeError, e:
207 | lost_tweets += 1
208 | print "TypeError while converting date. tweet not processed: %d (%s)" % (line_number, processedTweetsFile)
209 | logger.warning("TypeError while converting date. tweet not processed: %d (%s)" % (line_number, processedTweetsFile))
210 | logging.exception(e)
211 | error_tweet.write(line+"\n")
212 | print traceback.format_exc()
213 | pass
214 | except KeyError, e:
215 | lost_tweets += 1
216 | print "KeyError while converting date. tweet not processed: %d (%s)" % (line_number, processedTweetsFile)
217 | logger.warning("KeyError while converting date. tweet not processed: %d (%s)" % (line_number, processedTweetsFile))
218 | logging.exception(e)
219 | error_tweet.write(line+"\n")
220 | print traceback.format_exc()
221 | pass
222 |
223 | if len(tweets_list) == BATCH_INSERT_SIZE:
224 |
225 | print 'Inserting batch at file line %d' % line_number
226 | inserted_ids_list = insert_tweet_list(insert_db, tweets_list, line_number, processedTweetsFile, data_db, logger)
227 |
228 | failed_insert_count = BATCH_INSERT_SIZE - len(inserted_ids_list)
229 | logger.info('Batch of size %d had %d failed tweet inserts' % (BATCH_INSERT_SIZE, failed_insert_count))
230 | tweets_list = []
231 |
232 | lost_tweets = lost_tweets + failed_insert_count
233 | tweet_total += len(inserted_ids_list)
234 | #print "inserting 5k tweets - %i total" % tweet_total
235 | elif '-delete-' in processedTweetsFile:
236 | deleted_tweets += 1
237 |
238 | line = line.strip()
239 | tweet = simplejson.loads(line)
240 | deleted_tweets_list.append(tweet)
241 |
242 | inserted_ids_list = insert_tweet_list(deleteCollection, deleted_tweets_list, line_number, processedTweetsFile, delete_db, logger)
243 | deleted_tweets_list = []
244 | elif '-streamlimits-' in processedTweetsFile:
245 | stream_limit_notices += 1
246 |
247 | line = line.strip()
248 | notice = simplejson.loads(line)
249 | stream_limits_list.append(notice)
250 |
251 | stream_limit_collection = data_db.limits
252 | inserted_ids_list = insert_tweet_list(stream_limit_collection, stream_limits_list, line_number, processedTweetsFile, data_db, logger)
253 |
254 | # Also inserts to a central limits collection
255 | stream_limit_collection_central = data_db_central.limits
256 | inserted_ids_list_central = insert_tweet_list(stream_limit_collection_central, stream_limits_list, line_number, processedTweetsFile, data_db_central, logger)
257 |
258 | stream_limits_list = []
259 |
260 | if '-delete-' in processedTweetsFile:
261 | print 'Inserted %d delete statuses for file %s.' % (deleted_tweets, processedTweetsFile)
262 | logger.info('Inserted %d delete statuses for file %s.' % (deleted_tweets, processedTweetsFile))
263 |
264 | if '-streamlimits-' in processedTweetsFile:
265 | print 'Inserted %d stream limit statuses for file %s.' % (stream_limit_notices, processedTweetsFile)
266 | logger.info('Inserted %d stream limit statuses for file %s.' % (stream_limit_notices, processedTweetsFile))
267 |
268 |
269 | # make sure we clean up after ourselves
270 | f.close()
271 | os.remove(processedTweetsFile)
272 |
273 | if len(tweets_list) > 0:
274 |
275 | print 'Inserting last set of %d tweets at file line %d' % (len(tweets_list), line_number)
276 | inserted_ids_list = insert_tweet_list(insert_db, tweets_list, line_number, processedTweetsFile, data_db, logger)
277 |
278 | failed_insert_count = len(tweets_list) - len(inserted_ids_list)
279 | logger.info('Insert set of size %d had %d failed tweet inserts' % (len(tweets_list), failed_insert_count) )
280 | tweets_list = []
281 |
282 | lost_tweets = lost_tweets + failed_insert_count
283 | tweet_total += len(inserted_ids_list)
284 |
285 | logger.info('Read %d lines, inserted %d tweets, lost %d tweets for file %s' % (line_number, tweet_total, lost_tweets, processedTweetsFile))
286 |
287 |
288 | module_config = project_config_db.find_one({'module': 'twitter'})
289 | runMongoInsert = module_config['inserter']['run']
290 | # end run loop
291 |
292 | error_tweet.close()
293 | logger.info('Exiting MongoBatchInsert Program...')
294 | print 'Exiting MongoBatchInsert Program...'
295 |
296 | # Reference for controller if script is active or not.
297 | project_config_db.update({'module': 'twitter'}, {'$set': {'inserter_active': 0}})
298 |
--------------------------------------------------------------------------------
/app/controller.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import os
4 | import signal
5 | import atexit
6 |
7 | from bson.objectid import ObjectId
8 |
9 | from models import DB
10 | from app import app
11 |
12 | # TODO - dynamic import
13 | from twitter import ThreadedCollector, preprocess, mongoBatchInsert
14 |
15 | # wd is the directory used to generate filenames for the Controller / Worker
16 | wd = app.config['BASEDIR'] + '/app'
17 |
18 |
19 | class Controller(object):
20 | """
21 | Controller - A class for controlling STACK processes.
22 | Calls the Process() class to start and stop STACK processes.
23 | """
24 |
25 | def __init__(self, process, cmdline=False, home_dir='.', umask=022, verbose=1, **kwargs):
26 | self.db = DB()
27 | self.process = process
28 | self.cmdline = cmdline
29 | self.usage_message = 'controller collect|process|insert start|stop|restart project_id collector_id'
30 |
31 | self.home_dir = home_dir
32 | self.umask = umask
33 | self.verbose = verbose
34 |
35 | if self.cmdline is False:
36 | # Grab information from Flask user object
37 | self.project = kwargs['project']
38 | self.project_id = self.project['project_id']
39 | self.project_name = self.project['project_name']
40 | else:
41 | # Command is coming from the command line, look up info
42 | self.project_id = kwargs['project_id']
43 |
44 | resp = self.db.get_project_detail(self.project_id)
45 | if resp['status']:
46 | self.project_name = resp['project_name']
47 | else:
48 | print 'Project w/ ID %s not found!' % self.project_id
49 | print ''
50 | print 'USAGE: python %s %s' % (sys.argv[0], self.usage_message)
51 | sys.exit(1)
52 |
53 | # Project account DB connection
54 | project_info = self.db.get_project_detail(self.project_id)
55 | configdb = project_info['project_config_db']
56 | project_config_db = self.db.connection[configdb]
57 | self.projectdb = project_config_db.config
58 |
59 | # Loads info for process based on type: collector, processor, inserter
60 | if self.process in ['process', 'insert']:
61 | # Only module type needed for processor / inserter
62 | self.module = kwargs['network']
63 | self.collector_id = None
64 | # Set name for worker based on gathered info
65 | self.process_name = self.project_name + '-' + self.process + '-' + self.module + '-' + self.project_id
66 | elif process == 'collect':
67 | # For collectors, also grabs: collector_id, api, collector_name
68 | self.collector_id = kwargs['collector_id']
69 |
70 | resp = self.db.get_collector_detail(self.project_id, self.collector_id)
71 | if resp['status']:
72 | collector = resp['collector']
73 | self.module = collector['network']
74 | self.api = collector['api']
75 | self.collector_name = collector['collector_name']
76 | else:
77 | print 'Collector (ID: %s) not found!' % self.collector_id
78 | print ''
79 | print 'USAGE: python %s %s' % (sys.argv[0], self.usage_message)
80 | sys.exit(1)
81 |
82 | # Set name for worker based on gathered info
83 | self.process_name = self.project_name + '-' + self.collector_name + '-' + self.process + '-' + self.module + \
84 | '-' + self.collector_id
85 |
86 | # Sets out directories
87 | self.piddir = app.config['LOGDIR'] + '/' + self.project_name + '-' + self.project_id + '/pid'
88 | self.logdir = app.config['LOGDIR'] + '/' + self.project_name + '-' + self.project_id + '/logs'
89 | self.stddir = app.config['LOGDIR'] + '/' + self.project_name + '-' + self.project_id + '/std'
90 |
91 | # Sets data dirs
92 | # TODO - deprecate w/ Facebook
93 | self.rawdir = app.config[
94 | 'DATADIR'] + '/' + self.project_name + '-' + self.project_id + '/' + self.module + '/raw'
95 | self.archdir = app.config['DATADIR'] + '/' + self.project_name + '-' + self.project_id + '/' + self.module + \
96 | '/archive'
97 | self.insertdir = app.config['DATADIR'] + '/' + self.project_name + '-' + self.project_id + '/' + self.module + \
98 | '/insert_queue'
99 |
100 | # Creates dirs if they don't already exist
101 | if not os.path.exists(self.piddir): os.makedirs(self.piddir)
102 | if not os.path.exists(self.stddir): os.makedirs(self.stddir)
103 |
104 | # These directories only need be created for Twitter
105 | # TODO - deprecate w/ Facebook
106 | if self.module == 'twitter':
107 | if not os.path.exists(self.logdir): os.makedirs(self.logdir)
108 | if not os.path.exists(self.rawdir): os.makedirs(self.rawdir)
109 | if not os.path.exists(self.archdir): os.makedirs(self.archdir)
110 | if not os.path.exists(self.insertdir): os.makedirs(self.insertdir)
111 |
112 | # Sets outfiles
113 | self.pidfile = self.piddir + '/%s.pid' % self.process_name
114 | self.stdout = self.stddir + '/%s-stdout.txt' % self.process_name
115 | self.stderr = self.stddir + '/%s-stderr.txt' % self.process_name
116 | self.stdin = self.stddir + '/%s-stdin.txt' % self.process_name
117 |
118 | # Creates the std files for the daemon
119 | if not os.path.isfile(self.stdout):
120 | create_file = open(self.stdout, 'w')
121 | create_file.close()
122 | if not os.path.isfile(self.stdin):
123 | create_file = open(self.stdin, 'w')
124 | create_file.close()
125 | if not os.path.isfile(self.stderr):
126 | create_file = open(self.stderr, 'w')
127 | create_file.close()
128 |
129 | def process_command(self, cmd):
130 | """
131 | Parses the passed command (start / stop / restart) and initiates daemonization
132 | """
133 | # Makes sure the command is relevant
134 | if self.cmdline and cmd not in ['start', 'stop', 'restart']:
135 | print 'Invalid command: %s' % cmd
136 | print ''
137 | print 'USAGE: python %s %s' % (sys.argv[0], self.usage_message)
138 | sys.exit(1)
139 | elif cmd == 'start':
140 | self.start()
141 | elif cmd == 'stop':
142 | self.stop()
143 | elif cmd == 'restart':
144 | self.restart()
145 | else:
146 | print 'USAGE: python %s %s' % (sys.argv[0], self.usage_message)
147 | if self.cmdline:
148 | sys.exit(1)
149 |
150 | def start(self):
151 | """
152 | Method that starts the daemon process
153 | """
154 | print 'Initializing the STACK daemon: %s' % self.process_name
155 |
156 | # Sets flags for given process
157 | resp = ''
158 | if self.process == 'collect':
159 | resp = self.db.set_collector_status(self.project_id, self.collector_id, collector_status=1)
160 | elif self.process == 'process':
161 | resp = self.db.set_network_status(self.project_id, self.module, run=1, process=True)
162 | elif self.process == 'insert':
163 | resp = self.db.set_network_status(self.project_id, self.module, run=1, insert=True)
164 |
165 | if 'status' in resp and resp['status']:
166 | print 'Flags set.'
167 |
168 | # Check to see if running based on pidfile
169 | pid = self.get_pid()
170 | if pid:
171 | message = "pidfile %s already exists. Is it already running?\n"
172 | sys.stderr.write(message % self.pidfile)
173 | sys.exit(1)
174 |
175 | # Start the daemon
176 | self.daemonize()
177 | self.run()
178 | else:
179 | print 'Failed to successfully set flags, try again.'
180 |
181 | def stop(self):
182 | """
183 | Method that sets flags and stops the daemon process
184 | """
185 | print 'Stop command received.'
186 | print 'Step 1) Setting flags on the STACK process to stop.'
187 |
188 | if self.process == 'collect':
189 | # Set flags for the STACK process to stop
190 | resp = self.db.set_collector_status(self.project_id, self.collector_id, collector_status=0)
191 |
192 | # Grab active flag from collector's Mongo document
193 | collector_conf = self.projectdb.find_one({'_id': ObjectId(self.collector_id)})
194 | active = collector_conf['active']
195 | else:
196 | module_conf = self.projectdb.find_one({'module': self.module})
197 | if self.process == 'process':
198 | resp = self.db.set_network_status(self.project_id, self.module, run=0, process=True)
199 | active = module_conf['processor_active']
200 | else:
201 | resp = self.db.set_network_status(self.project_id, self.module, run=0, insert=True)
202 | active = module_conf['inserter_active']
203 |
204 | # TODO - mongo error handling
205 | if resp['status']:
206 | print 'Step 1 complete.'
207 |
208 | # If the daemon has already stopped, then set flags and break
209 | pid = self.get_pid()
210 | if not pid:
211 | print "STACK daemon already terminated."
212 |
213 | # Extra clean up, just in case
214 | if os.path.exists(self.pidfile):
215 | os.remove(self.pidfile)
216 |
217 | if self.process in ['process', 'insert']:
218 | if self.process == 'process':
219 | self.projectdb.update({'module': self.module}, {'$set': {'processor_active': 0}})
220 | else:
221 | self.projectdb.update({'module': self.module}, {'$set': {'inserter_active': 0}})
222 | else:
223 | self.projectdb.update({'_id': ObjectId(self.collector_id)}, {'$set': {'active': 0}})
224 |
225 | return
226 |
227 | # Step 2) Check for task / STACK process completion; loops through 15 times to check
228 |
229 | print 'Step 2) Check for STACK process completion and shutdown the daemon.'
230 |
231 | wait_count = 0
232 | while active == 1:
233 | wait_count += 1
234 |
235 | if self.process in ['process', 'insert']:
236 | module_conf = self.projectdb.find_one({'module': self.module})
237 | if self.process == 'process':
238 | active = module_conf['processor_active']
239 | else:
240 | active = module_conf['inserter_active']
241 | else:
242 | collector_conf = self.projectdb.find_one({'_id': ObjectId(self.collector_id)})
243 | active = collector_conf['active']
244 |
245 | print 'Try %d / 15' % wait_count
246 | print 'Active Status: %d' % active
247 | print 'Trying again in 5 seconds.'
248 | print ''
249 |
250 | if wait_count > 15:
251 | break
252 |
253 | time.sleep(5)
254 |
255 | # Get the pid from the pidfile
256 | pid = self.get_pid()
257 | if not pid:
258 | print "Daemon successfully stopped via thread termination."
259 |
260 | # Just to be sure. A ValueError might occur if the PID file is
261 | # empty but does actually exist
262 | if os.path.exists(self.pidfile):
263 | os.remove(self.pidfile)
264 |
265 | return # Not an error in a restart
266 |
267 | # Try killing the daemon process
268 | print 'Daemon still running w/ loose thread. Stopping now...'
269 |
270 | try:
271 | i = 0
272 | while 1:
273 | os.kill(pid, signal.SIGTERM)
274 | time.sleep(0.1)
275 | i = i + 1
276 | if i % 10 == 0:
277 | os.kill(pid, signal.SIGHUP)
278 | except OSError, err:
279 | err = str(err)
280 | if err.find("No such process") > 0:
281 | if os.path.exists(self.pidfile):
282 | os.remove(self.pidfile)
283 | else:
284 | print str(err)
285 | sys.exit(1)
286 |
287 | # Had to kill the daemon, so set the active status flag accordingly.
288 | if self.process in ['process', 'insert']:
289 | if self.process == 'process':
290 | self.projectdb.update({'module': self.module}, {'$set': {'processor_active': 0}})
291 | else:
292 | self.projectdb.update({'module': self.module}, {'$set': {'inserter_active': 0}})
293 | else:
294 | self.projectdb.update({'_id': ObjectId(self.collector_id)}, {'$set': {'active': 0}})
295 |
296 | print 'Stopped.'
297 |
298 | def restart(self):
299 | """
300 | Simple restart of the daemon
301 | """
302 | # TODO - restart w/out shutting down daemon as part of extensible processor modules
303 | self.stop()
304 | self.start()
305 |
306 | def run(self):
307 | """
308 | Calls the process logic scripts and runs
309 | """
310 | # Backwards compatibility for older Twitter scripts
311 | if self.module == 'twitter':
312 | if self.process == 'collect':
313 | ThreadedCollector.go(self.api, self.project_id, self.collector_id, self.rawdir, self.logdir)
314 | elif self.process == 'process':
315 | preprocess.go(self.project_id, self.rawdir, self.archdir, self.insertdir, self.logdir)
316 | elif self.process == 'insert':
317 | mongoBatchInsert.go(self.project_id, self.rawdir, self.insertdir, self.logdir)
318 | # New approach via extensible collectors
319 | else:
320 | # Dynamically import the collect/process/insert module from the network's app package
321 | os.chdir(app.config['BASEDIR'])
322 |
323 | if self.process == 'collect':
324 | _temp = __import__('app.%s.collect' % self.module, globals(), locals(), ['Collector'], -1)
325 | Collector = _temp.Collector
326 |
327 | c = Collector(self.project_id, self.collector_id, self.process_name)
328 | c.go()
329 | elif self.process == 'process':
330 | _temp = __import__('app.%s.process' % self.module, globals(), locals(), ['Processor'], -1)
331 | Processor = _temp.Processor
332 |
333 | c = Processor(self.project_id, self.process_name, self.module)
334 | c.go()
335 | elif self.process == 'insert':
336 | _temp = __import__('app.%s.insert' % self.module, globals(), locals(), ['Inserter'], -1)
337 | Inserter = _temp.Inserter
338 |
339 | c = Inserter(self.project_id, self.process_name, self.module)
340 | c.go()
341 |
342 | def daemonize(self):
343 | """
344 | Do the UNIX double-fork magic, see Stevens' "Advanced
345 | Programming in the UNIX Environment" for details (ISBN 0201563177)
346 | http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16
347 | """
348 | try:
349 | pid = os.fork()
350 | if pid > 0:
351 | # Exit first parent
352 | sys.exit(0)
353 | except OSError, e:
354 | sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
355 | sys.exit(1)
356 |
357 | # Decouple from parent environment
358 | os.chdir(self.home_dir)
359 | os.setsid()
360 | os.umask(self.umask)
361 |
362 | # Do second fork
363 | try:
364 | pid = os.fork()
365 | if pid > 0:
366 | # Exit from second parent
367 | sys.exit(0)
368 | except OSError, e:
369 | sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
370 | sys.exit(1)
371 |
372 | sys.stdout.flush()
373 | sys.stderr.flush()
374 | si = file(self.stdin, 'r+')
375 | so = file(self.stdout, 'a+')
376 | if self.stderr:
377 | se = file(self.stderr, 'a+', 0)
378 | else:
379 | se = so
380 |
381 | if self.cmdline:
382 | os.dup2(si.fileno(), sys.stdin.fileno())
383 | os.dup2(so.fileno(), sys.stdout.fileno())
384 | os.dup2(se.fileno(), sys.stderr.fileno())
385 |
386 | sys.stderr.flush()
387 | sys.stdout.flush()
388 |
389 | def sigtermhandler(signum, frame):
390 | self.daemon_alive = False
391 | signal.signal(signal.SIGTERM, sigtermhandler)
392 | signal.signal(signal.SIGINT, sigtermhandler)
393 |
394 | if self.verbose >= 1:
395 | print "Started"
396 |
397 | # Write pidfile
398 | atexit.register(
399 | self.delpid) # Make sure pid file is removed if we quit
400 | pid = str(os.getpid())
401 | file(self.pidfile, 'w+').write("%s\n" % pid)
402 |
403 | def delpid(self):
404 | os.remove(self.pidfile)
405 |
406 | def get_pid(self):
407 | try:
408 | pf = file(self.pidfile, 'r')
409 | pid = int(pf.read().strip())
410 | pf.close()
411 | except IOError:
412 | pid = None
413 | except SystemExit:
414 | pid = None
415 | return pid
--------------------------------------------------------------------------------
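For reference, below is a condensed, self-contained sketch of the double-fork pattern that daemonize() above implements (POSIX only; the pidfile path is a hypothetical example):

    import atexit
    import os
    import sys

    def daemonize(pidfile='/tmp/example-daemon.pid'):
        if os.fork() > 0:
            sys.exit(0)   # first parent exits; the child is reparented to init

        os.setsid()       # start a new session, detaching from the controlling terminal
        os.umask(0)

        if os.fork() > 0:
            sys.exit(0)   # second parent exits; the grandchild can never reacquire a terminal

        # Record the pid so a later stop() can find and signal the daemon
        atexit.register(lambda: os.remove(pidfile))
        with open(pidfile, 'w') as f:
            f.write('%d\n' % os.getpid())
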
/app/processes.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging, logging.handlers
3 | import time
4 | import json
5 | import threading
6 |
7 | from bson.objectid import ObjectId
8 |
9 | from models import DB
10 | from app import app
11 |
12 |
13 | class BaseCollector(object):
14 | """
15 | Extensible base class for all STACK collectors
16 | """
17 |
18 | def __init__(self, project_id, collector_id, process_name):
19 | self.project_id = project_id
20 | self.collector_id = collector_id
21 | self.process_name = process_name
22 | self.collecting_data = False
23 |
24 | # Sets up connection w/ project config DB & loads in collector info
25 | self.db = DB()
26 |
27 | project = self.db.get_project_detail(self.project_id)
28 | if project['status']:
29 | self.project_name = project['project_name']
30 |
31 | configdb = project['project_config_db']
32 | project_db = self.db.connection[configdb]
33 | self.project_db = project_db.config
34 |
35 | resp = self.db.get_collector_detail(self.project_id, self.collector_id)
36 | if resp['status']:
37 | collector_info = resp['collector']
38 |
39 | # Load in collector info
40 | self.collector_name = collector_info['collector_name']
41 | self.network = collector_info['network']
42 | self.api = collector_info['api']
43 | self.collection_type = collector_info['collection_type']
44 | self.params = collector_info['params']
45 | self.terms_list = collector_info['terms_list']
46 | self.languages = collector_info['languages']
47 | self.locations = collector_info['location']
48 | self.auth = collector_info['api_auth']
49 | # TODO - file format to Mongo
50 | # TODO - less then hour = warning
51 | self.file_format = '%Y%m%d-%H'
52 |
53 | # If this is a streaming collector
54 | if self.collection_type == 'realtime':
55 | self.project_db.update({'_id': ObjectId(self.collector_id)}, {'$set': {'stream_limits': []}})
56 |
57 | # Sets up logdir and logging
58 | logdir = app.config['LOGDIR'] + '/' + self.project_name + '-' + self.project_id + '/logs'
59 | if not os.path.exists(logdir):
60 | os.makedirs(logdir)
61 |
62 | # Sets logger w/ name collector_name and level INFO
63 | self.logger = logging.getLogger(self.collector_name)
64 | self.logger.setLevel(logging.INFO)
65 |
66 | # Sets up logging file handler
67 | logfile = logdir + '/%s.log' % self.process_name
68 | # TODO - logging params
69 | # TODO - port logging rotation params to Mongo for user control later / these default values good
70 | handler = logging.handlers.TimedRotatingFileHandler(logfile, when='D', backupCount=30)
71 | handler.setLevel(logging.INFO)
72 | # Formats
73 |             log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
74 |             dateformat = '%m-%d %H:%M'
75 |             formatter = logging.Formatter(log_format, dateformat)
76 | handler.setFormatter(formatter)
77 | # Adds handler to logger to finish
78 | self.logger.addHandler(handler)
79 |
80 | self.log('STACK collector %s initiated.' % self.collector_name)
81 |
82 | # Sets up rawdir
83 | self.rawdir = app.config['DATADIR'] + '/' + self.project_name + '-' + self.project_id + '/' + self.network + '/raw'
84 | if not os.path.exists(self.rawdir):
85 | os.makedirs(self.rawdir)
86 |
87 | self.log('All raw files and directories set. Now starting collector...')
88 |
89 | def go(self):
90 | """
91 | Starts and maintains the loop that monitors the collection thread.
92 |         Threads themselves are maintained by the extending subclasses
93 | """
94 | # Checks if we're supposed to be running
95 | self.run_flag = self.check_flags()['run']
96 | self.collect_flag = 0
97 | self.update_flag = 0
98 |
99 | if self.run_flag:
100 |             self.log('Starting collector %s with signal %d' % (self.process_name, self.run_flag))
101 | self.set_active(1)
102 |
103 | # If run_flag is set - begin the loop
104 | while self.run_flag:
105 | try:
106 | flags = self.check_flags()
107 | self.run_flag = flags['run']
108 | self.collect_flag = flags['collect']
109 | self.update_flag = flags['update']
110 | except Exception as e:
111 | self.log('Mongo connection refused with exception: %s' % e, level='warn')
112 |
113 | # If we've been flagged to stop or update and we're collecting - shut it down
114 | if self.collecting_data and (self.update_flag or not self.collect_flag or not self.run_flag):
115 | self.stop_thread()
116 |
117 | # If we've been flagged to start and we're not collecting - start it up
118 | if self.collect_flag and threading.activeCount() == 1:
119 | self.start_thread()
120 |
121 | time.sleep(2)
122 |
123 |         self.log('Exiting collection.')
124 | self.set_active(0)
125 |
126 | def write(self, data):
127 | """
128 | Called to write raw data to raw file - handles rotation
129 | """
130 | timestr = time.strftime(self.file_format)
131 | filename = self.rawdir + '/' + timestr + '-' + self.collector_name + '-' + self.collector_id + '-out.json'
132 | if not os.path.isfile(filename):
133 | self.log('Creating new raw file: %s' % filename)
134 |
135 | with open(filename, 'a') as rawfile:
136 | rawfile.write(json.dumps(data).encode('utf-8'))
137 | rawfile.write('\n')
138 |
139 | def log(self, message, level='info', thread='MAIN:'):
140 | """
141 | Logs messages to process logfile
142 | """
143 | message = str(message)
144 | if level == 'warn':
145 | self.logger.warning(thread + ' ' + message)
146 | elif level == 'error':
147 | self.logger.error(thread + ' ' + message)
148 | else:
149 | self.logger.info(thread + ' ' + message)
150 |
151 | def check_flags(self):
152 | """
153 |         Quick method to grab and return all Mongo flags for the given Collector instance
154 | """
155 |
156 | resp = self.db.get_collector_detail(self.project_id, self.collector_id)
157 | collector = resp['collector']
158 |
159 | return {
160 | 'run': collector['collector']['run'],
161 | 'collect': collector['collector']['collect'],
162 | 'update': collector['collector']['update'],
163 | 'active': collector['active']
164 | }
165 |
166 | def set_active(self, active):
167 | """
168 | Quick method to set the active flag to 1 or 0
169 | """
170 | self.project_db.update({'_id': ObjectId(self.collector_id)}, {'$set': {'active': active}})
171 |
172 | def start_thread(self):
173 | """
174 | Modify this method when extending the class to manage the actual collection thread
175 | """
176 |
177 | def stop_thread(self):
178 | """
179 | Modify this method when extending the class to stop the collection thread
180 | """
181 |
182 |
183 | class BaseProcessor(object):
184 | """
185 | Extensible base class for all STACK processors
186 |
187 |     NOTE - when extending, you must initiate connections to network-specific data directories!
188 | """
189 |
190 | def __init__(self, project_id, process_name, network):
191 | self.project_id = project_id
192 | self.process_name = process_name
193 | self.network = network
194 |
195 |         # Sets up connection w/ project config DB & loads in project info
196 | self.db = DB()
197 |
198 | project = self.db.get_project_detail(self.project_id)
199 | self.project_name = project['project_name']
200 |
201 | configdb = project['project_config_db']
202 | project_db = self.db.connection[configdb]
203 | self.project_db = project_db.config
204 |
205 | # Sets up logdir and logging
206 | logdir = app.config['LOGDIR'] + '/' + self.project_name + '-' + self.project_id + '/logs'
207 | if not os.path.exists(logdir):
208 | os.makedirs(logdir)
209 |
210 |         # Sets logger w/ name 'Processor' and level INFO
211 | self.logger = logging.getLogger('Processor')
212 | self.logger.setLevel(logging.INFO)
213 |
214 | # Sets up logging file handler
215 | logfile = logdir + '/%s.log' % self.process_name
216 | # TODO - port logging rotation params to Mongo for user control later / these default values good
217 | handler = logging.handlers.TimedRotatingFileHandler(logfile, when='D', backupCount=30)
218 | handler.setLevel(logging.INFO)
219 | # Formats
220 |         log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
221 |         dateformat = '%m-%d %H:%M'
222 |         formatter = logging.Formatter(log_format, dateformat)
223 | handler.setFormatter(formatter)
224 | # Adds handler to logger to finish
225 | self.logger.addHandler(handler)
226 |
227 | self.log('STACK processor for project %s initiated.' % self.project_name)
228 |
229 | # Sets up data directory
230 | self.datadir = app.config['DATADIR'] + '/' + self.project_name + '-' + self.project_id
231 |
232 | # Establish connections to data directories
233 | self.raw = self.datadir + '/' + self.network + '/raw'
234 | self.archive = self.datadir + '/' + self.network + '/archive'
235 | self.queue = self.datadir + '/' + self.network + '/queue'
236 | self.error = self.datadir + '/' + self.network + '/error'
237 |
238 | if not os.path.exists(self.raw):
239 | os.makedirs(self.raw)
240 | if not os.path.exists(self.archive):
241 | os.makedirs(self.archive)
242 | if not os.path.exists(self.queue):
243 | os.makedirs(self.queue)
244 | if not os.path.exists(self.error):
245 | os.makedirs(self.error)
246 |
247 | self.log('STACK processor setup completed. Now starting...')
248 |
249 | def go(self):
250 | """
251 | Runs the processor
252 | """
253 | self.run_flag = self.check_flags()['run']
254 | self.restart_flag = 0
255 |
256 | if self.run_flag:
257 | self.log('Starting processor %s with signal %d' % (self.process_name, self.run_flag))
258 | self.set_active(1)
259 |
260 | while self.run_flag:
261 | # Call function to process files
262 | self.process()
263 |
264 | # Lastly, see if the run status has changed
265 | try:
266 | flags = self.check_flags()
267 | self.run_flag = flags['run']
268 | self.restart_flag = flags['restart']
269 | except Exception as e:
270 | self.log('Mongo connection refused with exception when attempting to check flags: %s' % e, level='warn')
271 |                 self.log('Will keep the processor running until the connection is re-established.', level='warn')
272 |
273 |         # Clean up once the run loop concludes
274 | self.log('Exiting processor.')
275 | self.set_active(0)
276 |
277 | def log(self, message, level='info', thread='MAIN:'):
278 | """
279 | Logs messages to process logfile
280 | """
281 | message = str(message)
282 | if level == 'warn':
283 | self.logger.warning(thread + ' ' + message)
284 | elif level == 'error':
285 | self.logger.error(thread + ' ' + message)
286 | else:
287 | self.logger.info(thread + ' ' + message)
288 |
289 | def check_flags(self):
290 | """
291 |         Quick method to grab and return all Mongo flags for the given Processor instance
292 | """
293 | resp = self.project_db.find_one({'module': self.network})
294 |
295 | return {
296 | 'run': resp['processor']['run'],
297 | 'restart': resp['processor']['restart']
298 | }
299 |
300 | def set_active(self, active):
301 | """
302 | Quick method to set the active flag to 1 or 0
303 | """
304 | self.project_db.update({'module': self.network}, {'$set': {'processor_active': active}})
305 |
306 | def process(self):
307 | """
308 | Extend this function to implement your custom processing schemes
309 | """
310 |
311 | class BaseInserter(object):
312 | """
313 |     Extensible base class for all STACK inserters
314 |
315 |     NOTE - when extending, you must initiate connections to network-specific data directories!
316 | """
317 |
318 | def __init__(self, project_id, process_name, network):
319 | self.project_id = project_id
320 | self.process_name = process_name
321 | self.network = network
322 |
323 |         # Sets up connection w/ project config DB & loads in project info
324 | self.db = DB()
325 |
326 | project = self.db.get_project_detail(self.project_id)
327 | self.project_name = project['project_name']
328 |
329 | # Grabs connection to project config DB
330 | configdb = project['project_config_db']
331 | project_db = self.db.connection[configdb]
332 | self.project_db = project_db.config
333 |
334 | # Grabs connection to insertion DB
335 | # NOTE - on init, need to connect to appropriate network collection
336 | db_name = self.project_name + '_' + self.project_id
337 | self.insert_db = self.db.connection[db_name]
338 |
339 | # Sets up logdir and logging
340 | logdir = app.config['LOGDIR'] + '/' + self.project_name + '-' + self.project_id + '/logs'
341 | if not os.path.exists(logdir):
342 | os.makedirs(logdir)
343 |
344 |         # Sets logger w/ name 'Inserter' and level INFO
345 | self.logger = logging.getLogger('Inserter')
346 | self.logger.setLevel(logging.INFO)
347 |
348 | # Sets up logging file handler
349 | logfile = logdir + '/%s.log' % self.process_name
350 | # TODO - port logging rotation params to Mongo for user control later / these default values good
351 | handler = logging.handlers.TimedRotatingFileHandler(logfile, when='D', backupCount=30)
352 | handler.setLevel(logging.INFO)
353 | # Formats
354 |         log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
355 |         dateformat = '%m-%d %H:%M'
356 |         formatter = logging.Formatter(log_format, dateformat)
357 | handler.setFormatter(formatter)
358 | # Adds handler to logger to finish
359 | self.logger.addHandler(handler)
360 |
361 | self.log('STACK inserter for project %s initiated.' % self.project_name)
362 |
363 | # Sets up data directory
364 | self.datadir = app.config['DATADIR'] + '/' + self.project_name + '-' + self.project_id
365 |
366 | # Establish connections to data directories
367 | self.raw = self.datadir + '/' + self.network + '/raw'
368 | self.archive = self.datadir + '/' + self.network + '/archive'
369 | self.queue = self.datadir + '/' + self.network + '/queue'
370 | self.error = self.datadir + '/' + self.network + '/error'
371 |
372 | if not os.path.exists(self.raw):
373 | os.makedirs(self.raw)
374 | if not os.path.exists(self.archive):
375 | os.makedirs(self.archive)
376 | if not os.path.exists(self.queue):
377 | os.makedirs(self.queue)
378 | if not os.path.exists(self.error):
379 | os.makedirs(self.error)
380 |
381 |         self.log('STACK inserter setup completed. Now starting...')
382 |
383 | def go(self):
384 | """
385 |         Runs the inserter
386 | """
387 | self.run_flag = self.check_flags()['run']
388 | self.restart_flag = 0
389 |
390 | if self.run_flag:
391 | self.log('Starting inserter %s with signal %d' % (self.process_name, self.run_flag))
392 | self.set_active(1)
393 |
394 | while self.run_flag:
395 | # Call function to process files
396 | self.insert()
397 |
398 | # Lastly, see if the run status has changed
399 | try:
400 | flags = self.check_flags()
401 | self.run_flag = flags['run']
402 | self.restart_flag = flags['restart']
403 | except Exception as e:
404 | self.log('Mongo connection refused with exception when attempting to check flags: %s' % e, level='warn')
405 |                 self.log('Will keep the inserter running until the connection is re-established.', level='warn')
406 |
407 |         # Clean up once the run loop concludes
408 | self.log('Exiting inserter.')
409 | self.set_active(0)
410 |
411 | def log(self, message, level='info', thread='MAIN:'):
412 | """
413 | Logs messages to process logfile
414 | """
415 | message = str(message)
416 | if level == 'warn':
417 | self.logger.warning(thread + ' ' + message)
418 | elif level == 'error':
419 | self.logger.error(thread + ' ' + message)
420 | else:
421 | self.logger.info(thread + ' ' + message)
422 |
423 | def check_flags(self):
424 | """
425 |         Quick method to grab and return all Mongo flags for the given Inserter instance
426 | """
427 | resp = self.project_db.find_one({'module': self.network})
428 |
429 | return {
430 | 'run': resp['inserter']['run'],
431 | 'restart': resp['inserter']['restart']
432 | }
433 |
434 | def set_active(self, active):
435 | """
436 | Quick method to set the active flag to 1 or 0
437 | """
438 | self.project_db.update({'module': self.network}, {'$set': {'inserter_active': active}})
439 |
440 | def insert(self):
441 | """
442 |         Extend this function to implement your custom insertion schemes
443 | """
444 |
--------------------------------------------------------------------------------
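A hypothetical sketch of how a network module would extend BaseCollector above; per the controller's dynamic import, a class named Collector is expected in app/<network>/collect.py. The 5-second polling loop and the payload written below are illustrative only, not part of STACK:

    import threading
    import time

    from app.processes import BaseCollector

    class Collector(BaseCollector):

        def start_thread(self):
            self.collecting_data = True
            self.stop_event = threading.Event()
            self.thread = threading.Thread(target=self._collect)
            self.thread.start()
            self.log('Collection thread started.')

        def stop_thread(self):
            self.stop_event.set()         # ask the worker loop to exit
            self.thread.join()
            self.collecting_data = False
            self.log('Collection thread stopped.')

        def _collect(self):
            while not self.stop_event.is_set():
                # A real collector would poll its network API here; write()
                # handles raw-file naming and hourly rotation.
                self.write({'ts': time.time(), 'terms': self.terms_list})
                self.stop_event.wait(5)

BaseProcessor and BaseInserter are extended the same way, by overriding process() and insert() respectively.
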
/__main__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import json
4 |
5 | from werkzeug.security import generate_password_hash
6 |
7 | from app.controller import Controller
8 | from app.models import DB
9 |
10 | basedir = os.getcwd()
11 |
12 | if __name__ == "__main__":
13 |
14 | USAGE = 'USAGE: python __main__.py db|controller {db_method}|{controller_method} {params}'
15 |
16 | db_methods = [
17 | 'create_project',
18 | 'auth',
19 | 'get_project_list',
20 | 'get_project_detail',
21 | 'get_collector_detail',
22 | 'get_network_detail',
23 | 'set_collector_detail',
24 | 'set_network_status',
25 | 'set_collector_status',
26 | 'get_collector_ids',
27 | 'update_collector_detail'
28 | ]
29 |
30 | controller_processes = ['collect', 'process', 'insert']
31 | controller_commands = ['start', 'stop', 'restart']
32 |
33 | try:
34 | wrapper = sys.argv[1]
35 |     except IndexError:
36 | print USAGE
37 | sys.exit()
38 | try:
39 | method = sys.argv[2]
40 |     except IndexError:
41 | print USAGE
42 | sys.exit()
43 |
44 | if wrapper not in ['db', 'controller']:
45 | print USAGE
46 | sys.exit()
47 |
48 | if wrapper == 'db' and method in db_methods:
49 | db = DB()
50 |
51 | if method == 'create_project':
52 | """
53 | python __main__.py db create_project
54 | """
55 |
56 | print
57 |         print 'Welcome to STACK! Please fill out the following information \
58 | to get started:'
59 | print
60 | print 'Project Name - one word, NO hyphens (-), underscores (_), or \
61 | spaces'
62 | print
63 | print 'Email - one or more email(s) used for status reports and \
64 | issue notices.'
65 | print
66 | print 'Password - used for validation down the road'
67 | print
68 | print 'Description - a quick description about your project'
69 |
70 | project_name = raw_input('Project Name: ')
71 | password = raw_input('Password: ')
72 | hashed_password = generate_password_hash(password)
73 |
74 | cont = True
75 | email = []
76 | while cont:
77 |                 input_email = raw_input('Email: ')
78 |                 email.append(input_email)
79 |
80 | add_more = raw_input('Add Another Email [y/n]: ')
81 |                 if add_more != 'y':
82 | cont = False
83 |
84 | description = raw_input('Description: ')
85 |
86 | resp = db.create(project_name, password, hashed_password, description=description, email=email)
87 | print json.dumps(resp, indent=1)
88 |
89 | elif method == 'auth':
90 | """
91 | python __main__.py db auth project_name password
92 | """
93 | project_name = sys.argv[3]
94 | password = sys.argv[4]
95 | resp = db.auth(project_name, password)
96 | print json.dumps(resp, indent=1)
97 |
98 | elif method == 'get_project_list':
99 | """
100 | python __main__.py db get_project_list
101 | """
102 | resp = db.get_project_list()
103 | print json.dumps(resp, indent=1)
104 |
105 | elif method == 'get_collector_ids':
106 | """
107 | python __main__.py db get_collector_ids project_id
108 | """
109 | project_id = sys.argv[3]
110 | resp = db.get_collector_ids(project_id)
111 | print json.dumps(resp, indent=1)
112 | elif method == 'get_project_detail':
113 | """
114 | python __main__.py db get_project_detail project_id
115 | """
116 | project_id = sys.argv[3]
117 | resp = db.get_project_detail(project_id)
118 | print json.dumps(resp, indent=1)
119 | elif method == 'get_collector_detail':
120 | """
121 | python __main__.py db get_collector_detail project_id collector_id
122 | """
123 | project_id = sys.argv[3]
124 | collector_id = sys.argv[4]
125 | resp = db.get_collector_detail(project_id, collector_id)
126 | print json.dumps(resp, indent=1)
127 | elif method == 'get_network_detail':
128 | """
129 | python __main__.py db get_network_detail project_id network
130 | """
131 | project_id = sys.argv[3]
132 | network = sys.argv[4]
133 | resp = db.get_network_detail(project_id, network)
134 | print json.dumps(resp, indent=1)
135 | elif method == 'set_collector_detail':
136 | """
137 | python __main__.py db set_collector_detail
138 |
139 | INPUT FORMATTING
140 |
141 | terms_list = '["your", "array", "of", "terms"]' | none
142 |         languages = '["array", "of", "BCP-47 language codes"]' | none
143 | location = '["array", "of", "location", "points"]' | none
144 |
145 | Can be used to both create and update a collector's details
146 | """
147 |
148 | print ''
149 | print 'To create a collector, please fill in the fields when asked.'
150 | print ''
151 |             print 'For the fields "languages", "locations", and "terms", please fill in either a comma-separated list or "none":'
152 | print '------'
153 | print 'languages = list, of, codes | none'
154 |             print 'Ex. = pt, en'
155 | print ''
156 | print 'locations = list, of, location, points | none'
157 | print 'Ex. = -74, 40, -73, 41'
158 | print ''
159 | print 'terms = list, of, terms | none'
160 | print 'Ex. = social, media'
161 | print ''
162 |             print 'If you are creating a Facebook collector, please specify the "collection_type", "start_date" and "end_date" fields:'
163 | print '------'
164 | print 'collection_type = realtime | historical'
165 | print ''
166 | print 'start_date = 2015-04-01 | none'
167 |             print 'end_date = 2015-05-01 | none'
168 | print ''
169 |
170 | project_name = raw_input('Project Name: ')
171 | password = raw_input('Password: ')
172 |
173 | resp = db.auth(project_name, password)
174 | if resp['status']:
175 | project_id = resp['project_id']
176 | else:
177 | print 'Invalid Project! Please try again.'
178 | sys.exit(0)
179 |
180 | collector_name = raw_input('Collector Name: ')
181 | network = raw_input('Network: ').lower()
182 |
183 | terms_list = raw_input('Terms: ')
184 | if terms_list == 'none':
185 | terms_list = None
186 | else:
187 | terms_list = terms_list.split(',')
188 |
189 | languages = None
190 | locations = None
191 | api = None
192 | start_date = None
193 | end_date = None
194 |
195 | if network == 'twitter':
196 | languages = raw_input('Languages: ')
197 | if languages == 'none':
198 | languages = None
199 | else:
200 | languages = languages.replace(' ', '')
201 | languages = languages.split(',')
202 |
203 | locations = raw_input('Locations: ')
204 | if locations == 'none':
205 | locations = None
206 | else:
207 | locations = locations.replace(' ', '')
208 | locations = locations.split(',')
209 |
210 |                     if len(locations) % 4 != 0:
211 |                         print 'The number of location coordinates must be a multiple of four (one bounding box per four values). Please consult the Twitter docs and try again.'
212 | sys.exit(0)
213 |
214 | collection_type = None
215 |
216 | api = raw_input('API: ')
217 |
218 | consumer_key = raw_input('Consumer Key: ')
219 | consumer_secret = raw_input('Consumer Secret: ')
220 | access_token = raw_input('Access Token: ')
221 | access_token_secret = raw_input('Access Token Secret: ')
222 |
223 | api_credentials_dict = {
224 | 'consumer_key' : consumer_key,
225 | 'consumer_secret' : consumer_secret,
226 | 'access_token' : access_token,
227 | 'access_token_secret' : access_token_secret
228 | }
229 |
230 | elif network == 'facebook':
231 | collection_type = raw_input('Collection Type: ')
232 | start_date = raw_input('Start Date: ')
233 | end_date = raw_input('End Date: ')
234 |
235 | # TODO - start and end date reqs for historical
236 | if start_date == 'none':
237 | start_date = None
238 | if end_date == 'none':
239 | end_date = None
240 |
241 | client_id = raw_input('Client ID: ')
242 | client_secret = raw_input('Client Secret: ')
243 |
244 | api_credentials_dict = {'client_id': client_id, 'client_secret': client_secret}
245 |
246 | resp = db.set_collector_detail(project_id, collector_name, network, collection_type, api_credentials_dict,
247 | terms_list, api=api, languages=languages, location=locations,
248 | start_date=start_date, end_date=end_date)
249 |
250 | print json.dumps(resp, indent=1)
251 |
252 | elif method == 'update_collector_detail':
253 | """
254 | Calls db.update_collector_detail
255 | Can only update a single collector param at a time
256 |
257 | FOR TERMS - must provide term and collection status (1 or 0)
258 | FOR API AUTH CREDS - must provide full list, even if updating one
259 | """
260 | update_params_list = [
261 | 'collector_name',
262 | 'api',
263 | 'auth',
264 | 'terms',
265 | 'languages',
266 | 'locations',
267 | 'collection_type',
268 | 'start_date',
269 | 'end_date'
270 | ]
271 |
272 | update_param = sys.argv[3]
273 | if update_param not in update_params_list:
274 |                 print 'Invalid update parameter. Please try again.'
275 | print 'Valid update params: collector_name, api, auth, terms, \
276 | languages, locations, collection_type, start_date, \
277 | end_date.'
278 | sys.exit(1)
279 |
280 | print 'Collector update function called.'
281 | print ''
282 | print 'FOR TERMS - must provide term value and collection status.'
283 | print ' 1 = collect | 0 = do not collect'
284 | print ''
285 | print 'FOR OAUTH CREDS - must provide full list'
286 | print ''
287 | print 'FOR languages and locations - must provide full new list of codes. Update will overwrite.'
288 | print ''
289 | print 'languages = list, of, codes | none'
290 |             print 'Ex. = pt, en'
291 | print ''
292 | print 'locations = list, of, location, points | none'
293 | print 'Ex. = -74, 40, -73, 41'
294 | print ''
295 | print 'FOR start & end dates for Facebook, please use the following format:'
296 | print 'YYYY-MM-DD | none'
297 | print ''
298 | print 'FOR collection_type for Facebook: historical | realtime'
299 | print ''
300 | print 'Updating for param: %s' % update_param
301 | print ''
302 |
303 | project_name = raw_input('Project Name: ')
304 | password = raw_input('Password: ')
305 |
306 | resp = db.auth(project_name, password)
307 | if resp['status']:
308 | project_id = resp['project_id']
309 | else:
310 | print 'Invalid Project! Please try again.'
311 | sys.exit(0)
312 |
313 | collector_id = raw_input('Collector ID: ')
314 | resp = db.get_collector_detail(project_id, collector_id)
315 | resp = resp['collector']
316 |
317 | params = {}
318 |
319 | # First, do network-wide updates
320 | if update_param == 'collector_name':
321 | params['collector_name'] = raw_input('New Collector Name: ')
322 | elif update_param == 'terms':
323 | # Sets term type value based on collector API
324 | if resp['network'] == 'facebook':
325 | term_type = 'page'
326 | elif resp['api'] == 'follow':
327 | term_type = 'handle'
328 | else:
329 | term_type = 'term'
330 |
331 | # Adds term dict to the params dict based on info provided, will be parsed by update method
332 | cont = True
333 | params['terms_list'] = []
334 | while cont:
335 | new_term = raw_input('Term: ')
336 | collect_status = int(raw_input('Collect: '))
337 |
338 | if collect_status not in [1, 0]:
339 | print 'Invalid collect status. Must be 1 or 0.'
340 | sys.exit(0)
341 |
342 | params['terms_list'].append({
343 | 'term': new_term,
344 | 'collect': collect_status,
345 | 'type': term_type,
346 | 'id': None
347 | })
348 |
349 | cont_ask = raw_input('Continue? [y/n]: ')
350 | cont_ask = cont_ask.lower()
351 | if cont_ask == 'y':
352 | cont = True
353 | else:
354 | cont = False
355 |
356 | # Next, network specific updates
357 | if resp['network'] == 'twitter':
358 | if update_param == 'api':
359 | params['api'] = raw_input('New API Filter: ')
360 |
361 | elif update_param == 'languages':
362 | languages = raw_input('New Language Codes List: ')
363 |
364 | if languages == 'none':
365 | languages = None
366 | else:
367 | languages = languages.replace(' ', '')
368 | languages = languages.split(',')
369 |
370 | params['languages'] = languages
371 |
372 | elif update_param == 'locations':
373 | locations = raw_input('New Location Codes List: ')
374 |
375 | if locations == 'none':
376 | locations = None
377 | else:
378 | locations = locations.replace(' ', '')
379 | locations = locations.split(',')
380 |
381 | params['location'] = locations
382 |
383 | elif update_param == 'auth':
384 | consumer_key = raw_input('Consumer Key: ')
385 | consumer_secret = raw_input('Consumer Secret: ')
386 | access_token = raw_input('Access Token: ')
387 | access_token_secret = raw_input('Access Token Secret: ')
388 |
389 | api_credentials_dict = {
390 | 'consumer_key' : consumer_key,
391 | 'consumer_secret' : consumer_secret,
392 | 'access_token' : access_token,
393 | 'access_token_secret' : access_token_secret
394 | }
395 | params['api_auth'] = api_credentials_dict
396 |
397 | # Now, Facebook params
398 | elif resp['network'] == 'facebook':
399 | if update_param == 'collection_type':
400 | params['collection_type'] = raw_input('Collection Type: ')
401 | elif update_param == 'start_date':
402 | start_date = raw_input('Start Date: ')
403 | if start_date == 'none':
404 | params['start_date'] = None
405 | else:
406 | params['start_date'] = start_date
407 |
408 | elif update_param == 'end_date':
409 | end_date = raw_input('End Date: ')
410 | if end_date == 'none':
411 | params['end_date'] = None
412 | else:
413 | params['end_date'] = end_date
414 |
415 | elif update_param == 'auth':
416 | client_id = raw_input('Client ID: ')
417 | client_secret = raw_input('Client Secret: ')
418 |
419 | api_credentials_dict = {
420 | 'client_id' : client_id,
421 | 'client_secret': client_secret
422 | }
423 | params['api_auth'] = api_credentials_dict
424 |
425 | resp = db.update_collector_detail(project_id, collector_id, **params)
426 | print json.dumps(resp, indent=1)
427 |
428 | elif wrapper == 'controller' and method in controller_processes:
429 | """
430 | python __main__.py controller collect|process|insert start|stop|restart project_id {collector_id|network}
431 |
432 | WHERE
433 |
434 | collector_id - optional, only needed for a collection controller
435 | network - optional, needed for processor or inserter controllers
436 | """
437 | project_id = sys.argv[4]
438 |
439 | if method == 'collect':
440 | collector_id = sys.argv[5]
441 | c = Controller(cmdline=True, project_id=project_id, process=method, collector_id=collector_id)
442 | else:
443 | network = sys.argv[5]
444 | c = Controller(cmdline=True, project_id=project_id, process=method, network=network)
445 |
446 | command = sys.argv[3]
447 | if command in controller_commands:
448 | c.process_command(command)
449 | else:
450 | print 'USAGE: python __main__.py controller collect|process|insert start|stop|restart project_id {collector_id|network}'
451 |
452 | else:
453 | print 'Please try again!'
454 | sys.exit()
455 |
--------------------------------------------------------------------------------
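For reference, the controller branch above maps onto the Controller class roughly like so (a sketch; the IDs are placeholders to fill in):

    # Shell equivalent:
    #   python __main__.py controller collect start <project_id> <collector_id>

    from app.controller import Controller

    c = Controller(cmdline=True, project_id='<project_id>',
                   process='collect', collector_id='<collector_id>')
    c.process_command('start')
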
/app/static/bootstrap-theme.css:
--------------------------------------------------------------------------------
1 | /*!
2 | * Bootstrap v3.3.4 (http://getbootstrap.com)
3 | * Copyright 2011-2015 Twitter, Inc.
4 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE)
5 | */
6 |
7 | .btn-default,
8 | .btn-primary,
9 | .btn-success,
10 | .btn-info,
11 | .btn-warning,
12 | .btn-danger {
13 | text-shadow: 0 -1px 0 rgba(0, 0, 0, .2);
14 | -webkit-box-shadow: inset 0 1px 0 rgba(255, 255, 255, .15), 0 1px 1px rgba(0, 0, 0, .075);
15 | box-shadow: inset 0 1px 0 rgba(255, 255, 255, .15), 0 1px 1px rgba(0, 0, 0, .075);
16 | }
17 | .btn-default:active,
18 | .btn-primary:active,
19 | .btn-success:active,
20 | .btn-info:active,
21 | .btn-warning:active,
22 | .btn-danger:active,
23 | .btn-default.active,
24 | .btn-primary.active,
25 | .btn-success.active,
26 | .btn-info.active,
27 | .btn-warning.active,
28 | .btn-danger.active {
29 | -webkit-box-shadow: inset 0 3px 5px rgba(0, 0, 0, .125);
30 | box-shadow: inset 0 3px 5px rgba(0, 0, 0, .125);
31 | }
32 | .btn-default .badge,
33 | .btn-primary .badge,
34 | .btn-success .badge,
35 | .btn-info .badge,
36 | .btn-warning .badge,
37 | .btn-danger .badge {
38 | text-shadow: none;
39 | }
40 | .btn:active,
41 | .btn.active {
42 | background-image: none;
43 | }
44 | .btn-default {
45 | text-shadow: 0 1px 0 #fff;
46 | background-image: -webkit-linear-gradient(top, #fff 0%, #e0e0e0 100%);
47 | background-image: -o-linear-gradient(top, #fff 0%, #e0e0e0 100%);
48 | background-image: -webkit-gradient(linear, left top, left bottom, from(#fff), to(#e0e0e0));
49 | background-image: linear-gradient(to bottom, #fff 0%, #e0e0e0 100%);
50 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#ffe0e0e0', GradientType=0);
51 | filter: progid:DXImageTransform.Microsoft.gradient(enabled = false);
52 | background-repeat: repeat-x;
53 | border-color: #dbdbdb;
54 | border-color: #ccc;
55 | }
56 | .btn-default:hover,
57 | .btn-default:focus {
58 | background-color: #e0e0e0;
59 | background-position: 0 -15px;
60 | }
61 | .btn-default:active,
62 | .btn-default.active {
63 | background-color: #e0e0e0;
64 | border-color: #dbdbdb;
65 | }
66 | .btn-default.disabled,
67 | .btn-default:disabled,
68 | .btn-default[disabled] {
69 | background-color: #e0e0e0;
70 | background-image: none;
71 | }
72 | .btn-primary {
73 | background-image: -webkit-linear-gradient(top, #337ab7 0%, #265a88 100%);
74 | background-image: -o-linear-gradient(top, #337ab7 0%, #265a88 100%);
75 | background-image: -webkit-gradient(linear, left top, left bottom, from(#337ab7), to(#265a88));
76 | background-image: linear-gradient(to bottom, #337ab7 0%, #265a88 100%);
77 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff265a88', GradientType=0);
78 | filter: progid:DXImageTransform.Microsoft.gradient(enabled = false);
79 | background-repeat: repeat-x;
80 | border-color: #245580;
81 | }
82 | .btn-primary:hover,
83 | .btn-primary:focus {
84 | background-color: #265a88;
85 | background-position: 0 -15px;
86 | }
87 | .btn-primary:active,
88 | .btn-primary.active {
89 | background-color: #265a88;
90 | border-color: #245580;
91 | }
92 | .btn-primary.disabled,
93 | .btn-primary:disabled,
94 | .btn-primary[disabled] {
95 | background-color: #265a88;
96 | background-image: none;
97 | }
98 | .btn-success {
99 | background-image: -webkit-linear-gradient(top, #5cb85c 0%, #419641 100%);
100 | background-image: -o-linear-gradient(top, #5cb85c 0%, #419641 100%);
101 | background-image: -webkit-gradient(linear, left top, left bottom, from(#5cb85c), to(#419641));
102 | background-image: linear-gradient(to bottom, #5cb85c 0%, #419641 100%);
103 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5cb85c', endColorstr='#ff419641', GradientType=0);
104 | filter: progid:DXImageTransform.Microsoft.gradient(enabled = false);
105 | background-repeat: repeat-x;
106 | border-color: #3e8f3e;
107 | }
108 | .btn-success:hover,
109 | .btn-success:focus {
110 | background-color: #419641;
111 | background-position: 0 -15px;
112 | }
113 | .btn-success:active,
114 | .btn-success.active {
115 | background-color: #419641;
116 | border-color: #3e8f3e;
117 | }
118 | .btn-success.disabled,
119 | .btn-success:disabled,
120 | .btn-success[disabled] {
121 | background-color: #419641;
122 | background-image: none;
123 | }
124 | .btn-info {
125 | background-image: -webkit-linear-gradient(top, #5bc0de 0%, #2aabd2 100%);
126 | background-image: -o-linear-gradient(top, #5bc0de 0%, #2aabd2 100%);
127 | background-image: -webkit-gradient(linear, left top, left bottom, from(#5bc0de), to(#2aabd2));
128 | background-image: linear-gradient(to bottom, #5bc0de 0%, #2aabd2 100%);
129 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff2aabd2', GradientType=0);
130 | filter: progid:DXImageTransform.Microsoft.gradient(enabled = false);
131 | background-repeat: repeat-x;
132 | border-color: #28a4c9;
133 | }
134 | .btn-info:hover,
135 | .btn-info:focus {
136 | background-color: #2aabd2;
137 | background-position: 0 -15px;
138 | }
139 | .btn-info:active,
140 | .btn-info.active {
141 | background-color: #2aabd2;
142 | border-color: #28a4c9;
143 | }
144 | .btn-info.disabled,
145 | .btn-info:disabled,
146 | .btn-info[disabled] {
147 | background-color: #2aabd2;
148 | background-image: none;
149 | }
150 | .btn-warning {
151 | background-image: -webkit-linear-gradient(top, #f0ad4e 0%, #eb9316 100%);
152 | background-image: -o-linear-gradient(top, #f0ad4e 0%, #eb9316 100%);
153 | background-image: -webkit-gradient(linear, left top, left bottom, from(#f0ad4e), to(#eb9316));
154 | background-image: linear-gradient(to bottom, #f0ad4e 0%, #eb9316 100%);
155 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff0ad4e', endColorstr='#ffeb9316', GradientType=0);
156 | filter: progid:DXImageTransform.Microsoft.gradient(enabled = false);
157 | background-repeat: repeat-x;
158 | border-color: #e38d13;
159 | }
160 | .btn-warning:hover,
161 | .btn-warning:focus {
162 | background-color: #eb9316;
163 | background-position: 0 -15px;
164 | }
165 | .btn-warning:active,
166 | .btn-warning.active {
167 | background-color: #eb9316;
168 | border-color: #e38d13;
169 | }
170 | .btn-warning.disabled,
171 | .btn-warning:disabled,
172 | .btn-warning[disabled] {
173 | background-color: #eb9316;
174 | background-image: none;
175 | }
176 | .btn-danger {
177 | background-image: -webkit-linear-gradient(top, #d9534f 0%, #c12e2a 100%);
178 | background-image: -o-linear-gradient(top, #d9534f 0%, #c12e2a 100%);
179 | background-image: -webkit-gradient(linear, left top, left bottom, from(#d9534f), to(#c12e2a));
180 | background-image: linear-gradient(to bottom, #d9534f 0%, #c12e2a 100%);
181 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9534f', endColorstr='#ffc12e2a', GradientType=0);
182 | filter: progid:DXImageTransform.Microsoft.gradient(enabled = false);
183 | background-repeat: repeat-x;
184 | border-color: #b92c28;
185 | }
186 | .btn-danger:hover,
187 | .btn-danger:focus {
188 | background-color: #c12e2a;
189 | background-position: 0 -15px;
190 | }
191 | .btn-danger:active,
192 | .btn-danger.active {
193 | background-color: #c12e2a;
194 | border-color: #b92c28;
195 | }
196 | .btn-danger.disabled,
197 | .btn-danger:disabled,
198 | .btn-danger[disabled] {
199 | background-color: #c12e2a;
200 | background-image: none;
201 | }
202 | .thumbnail,
203 | .img-thumbnail {
204 | -webkit-box-shadow: 0 1px 2px rgba(0, 0, 0, .075);
205 | box-shadow: 0 1px 2px rgba(0, 0, 0, .075);
206 | }
207 | .dropdown-menu > li > a:hover,
208 | .dropdown-menu > li > a:focus {
209 | background-color: #e8e8e8;
210 | background-image: -webkit-linear-gradient(top, #f5f5f5 0%, #e8e8e8 100%);
211 | background-image: -o-linear-gradient(top, #f5f5f5 0%, #e8e8e8 100%);
212 | background-image: -webkit-gradient(linear, left top, left bottom, from(#f5f5f5), to(#e8e8e8));
213 | background-image: linear-gradient(to bottom, #f5f5f5 0%, #e8e8e8 100%);
214 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5', endColorstr='#ffe8e8e8', GradientType=0);
215 | background-repeat: repeat-x;
216 | }
217 | .dropdown-menu > .active > a,
218 | .dropdown-menu > .active > a:hover,
219 | .dropdown-menu > .active > a:focus {
220 | background-color: #2e6da4;
221 | background-image: -webkit-linear-gradient(top, #337ab7 0%, #2e6da4 100%);
222 | background-image: -o-linear-gradient(top, #337ab7 0%, #2e6da4 100%);
223 | background-image: -webkit-gradient(linear, left top, left bottom, from(#337ab7), to(#2e6da4));
224 | background-image: linear-gradient(to bottom, #337ab7 0%, #2e6da4 100%);
225 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);
226 | background-repeat: repeat-x;
227 | }
228 | .navbar-default {
229 | background-image: -webkit-linear-gradient(top, #fff 0%, #f8f8f8 100%);
230 | background-image: -o-linear-gradient(top, #fff 0%, #f8f8f8 100%);
231 | background-image: -webkit-gradient(linear, left top, left bottom, from(#fff), to(#f8f8f8));
232 | background-image: linear-gradient(to bottom, #fff 0%, #f8f8f8 100%);
233 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#fff8f8f8', GradientType=0);
234 | filter: progid:DXImageTransform.Microsoft.gradient(enabled = false);
235 | background-repeat: repeat-x;
236 | border-radius: 4px;
237 | -webkit-box-shadow: inset 0 1px 0 rgba(255, 255, 255, .15), 0 1px 5px rgba(0, 0, 0, .075);
238 | box-shadow: inset 0 1px 0 rgba(255, 255, 255, .15), 0 1px 5px rgba(0, 0, 0, .075);
239 | }
240 | .navbar-default .navbar-nav > .open > a,
241 | .navbar-default .navbar-nav > .active > a {
242 | background-image: -webkit-linear-gradient(top, #dbdbdb 0%, #e2e2e2 100%);
243 | background-image: -o-linear-gradient(top, #dbdbdb 0%, #e2e2e2 100%);
244 | background-image: -webkit-gradient(linear, left top, left bottom, from(#dbdbdb), to(#e2e2e2));
245 | background-image: linear-gradient(to bottom, #dbdbdb 0%, #e2e2e2 100%);
246 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdbdbdb', endColorstr='#ffe2e2e2', GradientType=0);
247 | background-repeat: repeat-x;
248 | -webkit-box-shadow: inset 0 3px 9px rgba(0, 0, 0, .075);
249 | box-shadow: inset 0 3px 9px rgba(0, 0, 0, .075);
250 | }
251 | .navbar-brand,
252 | .navbar-nav > li > a {
253 | text-shadow: 0 1px 0 rgba(255, 255, 255, .25);
254 | }
255 | .navbar-inverse {
256 | background-image: -webkit-linear-gradient(top, #3c3c3c 0%, #222 100%);
257 | background-image: -o-linear-gradient(top, #3c3c3c 0%, #222 100%);
258 | background-image: -webkit-gradient(linear, left top, left bottom, from(#3c3c3c), to(#222));
259 | background-image: linear-gradient(to bottom, #3c3c3c 0%, #222 100%);
260 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff3c3c3c', endColorstr='#ff222222', GradientType=0);
261 | filter: progid:DXImageTransform.Microsoft.gradient(enabled = false);
262 | background-repeat: repeat-x;
263 | }
264 | .navbar-inverse .navbar-nav > .open > a,
265 | .navbar-inverse .navbar-nav > .active > a {
266 | background-image: -webkit-linear-gradient(top, #080808 0%, #0f0f0f 100%);
267 | background-image: -o-linear-gradient(top, #080808 0%, #0f0f0f 100%);
268 | background-image: -webkit-gradient(linear, left top, left bottom, from(#080808), to(#0f0f0f));
269 | background-image: linear-gradient(to bottom, #080808 0%, #0f0f0f 100%);
270 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff080808', endColorstr='#ff0f0f0f', GradientType=0);
271 | background-repeat: repeat-x;
272 | -webkit-box-shadow: inset 0 3px 9px rgba(0, 0, 0, .25);
273 | box-shadow: inset 0 3px 9px rgba(0, 0, 0, .25);
274 | }
275 | .navbar-inverse .navbar-brand,
276 | .navbar-inverse .navbar-nav > li > a {
277 | text-shadow: 0 -1px 0 rgba(0, 0, 0, .25);
278 | }
279 | .navbar-static-top,
280 | .navbar-fixed-top,
281 | .navbar-fixed-bottom {
282 | border-radius: 0;
283 | }
284 | @media (max-width: 767px) {
285 | .navbar .navbar-nav .open .dropdown-menu > .active > a,
286 | .navbar .navbar-nav .open .dropdown-menu > .active > a:hover,
287 | .navbar .navbar-nav .open .dropdown-menu > .active > a:focus {
288 | color: #fff;
289 | background-image: -webkit-linear-gradient(top, #337ab7 0%, #2e6da4 100%);
290 | background-image: -o-linear-gradient(top, #337ab7 0%, #2e6da4 100%);
291 | background-image: -webkit-gradient(linear, left top, left bottom, from(#337ab7), to(#2e6da4));
292 | background-image: linear-gradient(to bottom, #337ab7 0%, #2e6da4 100%);
293 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);
294 | background-repeat: repeat-x;
295 | }
296 | }
297 | .alert {
298 | text-shadow: 0 1px 0 rgba(255, 255, 255, .2);
299 | -webkit-box-shadow: inset 0 1px 0 rgba(255, 255, 255, .25), 0 1px 2px rgba(0, 0, 0, .05);
300 | box-shadow: inset 0 1px 0 rgba(255, 255, 255, .25), 0 1px 2px rgba(0, 0, 0, .05);
301 | }
302 | .alert-success {
303 | background-image: -webkit-linear-gradient(top, #dff0d8 0%, #c8e5bc 100%);
304 | background-image: -o-linear-gradient(top, #dff0d8 0%, #c8e5bc 100%);
305 | background-image: -webkit-gradient(linear, left top, left bottom, from(#dff0d8), to(#c8e5bc));
306 | background-image: linear-gradient(to bottom, #dff0d8 0%, #c8e5bc 100%);
307 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdff0d8', endColorstr='#ffc8e5bc', GradientType=0);
308 | background-repeat: repeat-x;
309 | border-color: #b2dba1;
310 | }
311 | .alert-info {
312 | background-image: -webkit-linear-gradient(top, #d9edf7 0%, #b9def0 100%);
313 | background-image: -o-linear-gradient(top, #d9edf7 0%, #b9def0 100%);
314 | background-image: -webkit-gradient(linear, left top, left bottom, from(#d9edf7), to(#b9def0));
315 | background-image: linear-gradient(to bottom, #d9edf7 0%, #b9def0 100%);
316 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9edf7', endColorstr='#ffb9def0', GradientType=0);
317 | background-repeat: repeat-x;
318 | border-color: #9acfea;
319 | }
320 | .alert-warning {
321 | background-image: -webkit-linear-gradient(top, #fcf8e3 0%, #f8efc0 100%);
322 | background-image: -o-linear-gradient(top, #fcf8e3 0%, #f8efc0 100%);
323 | background-image: -webkit-gradient(linear, left top, left bottom, from(#fcf8e3), to(#f8efc0));
324 | background-image: linear-gradient(to bottom, #fcf8e3 0%, #f8efc0 100%);
325 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffcf8e3', endColorstr='#fff8efc0', GradientType=0);
326 | background-repeat: repeat-x;
327 | border-color: #f5e79e;
328 | }
329 | .alert-danger {
330 | background-image: -webkit-linear-gradient(top, #f2dede 0%, #e7c3c3 100%);
331 | background-image: -o-linear-gradient(top, #f2dede 0%, #e7c3c3 100%);
332 | background-image: -webkit-gradient(linear, left top, left bottom, from(#f2dede), to(#e7c3c3));
333 | background-image: linear-gradient(to bottom, #f2dede 0%, #e7c3c3 100%);
334 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2dede', endColorstr='#ffe7c3c3', GradientType=0);
335 | background-repeat: repeat-x;
336 | border-color: #dca7a7;
337 | }
338 | .progress {
339 | background-image: -webkit-linear-gradient(top, #ebebeb 0%, #f5f5f5 100%);
340 | background-image: -o-linear-gradient(top, #ebebeb 0%, #f5f5f5 100%);
341 | background-image: -webkit-gradient(linear, left top, left bottom, from(#ebebeb), to(#f5f5f5));
342 | background-image: linear-gradient(to bottom, #ebebeb 0%, #f5f5f5 100%);
343 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffebebeb', endColorstr='#fff5f5f5', GradientType=0);
344 | background-repeat: repeat-x;
345 | }
346 | .progress-bar {
347 | background-image: -webkit-linear-gradient(top, #337ab7 0%, #286090 100%);
348 | background-image: -o-linear-gradient(top, #337ab7 0%, #286090 100%);
349 | background-image: -webkit-gradient(linear, left top, left bottom, from(#337ab7), to(#286090));
350 | background-image: linear-gradient(to bottom, #337ab7 0%, #286090 100%);
351 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff286090', GradientType=0);
352 | background-repeat: repeat-x;
353 | }
354 | .progress-bar-success {
355 | background-image: -webkit-linear-gradient(top, #5cb85c 0%, #449d44 100%);
356 | background-image: -o-linear-gradient(top, #5cb85c 0%, #449d44 100%);
357 | background-image: -webkit-gradient(linear, left top, left bottom, from(#5cb85c), to(#449d44));
358 | background-image: linear-gradient(to bottom, #5cb85c 0%, #449d44 100%);
359 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5cb85c', endColorstr='#ff449d44', GradientType=0);
360 | background-repeat: repeat-x;
361 | }
362 | .progress-bar-info {
363 | background-image: -webkit-linear-gradient(top, #5bc0de 0%, #31b0d5 100%);
364 | background-image: -o-linear-gradient(top, #5bc0de 0%, #31b0d5 100%);
365 | background-image: -webkit-gradient(linear, left top, left bottom, from(#5bc0de), to(#31b0d5));
366 | background-image: linear-gradient(to bottom, #5bc0de 0%, #31b0d5 100%);
367 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff31b0d5', GradientType=0);
368 | background-repeat: repeat-x;
369 | }
370 | .progress-bar-warning {
371 | background-image: -webkit-linear-gradient(top, #f0ad4e 0%, #ec971f 100%);
372 | background-image: -o-linear-gradient(top, #f0ad4e 0%, #ec971f 100%);
373 | background-image: -webkit-gradient(linear, left top, left bottom, from(#f0ad4e), to(#ec971f));
374 | background-image: linear-gradient(to bottom, #f0ad4e 0%, #ec971f 100%);
375 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff0ad4e', endColorstr='#ffec971f', GradientType=0);
376 | background-repeat: repeat-x;
377 | }
378 | .progress-bar-danger {
379 | background-image: -webkit-linear-gradient(top, #d9534f 0%, #c9302c 100%);
380 | background-image: -o-linear-gradient(top, #d9534f 0%, #c9302c 100%);
381 | background-image: -webkit-gradient(linear, left top, left bottom, from(#d9534f), to(#c9302c));
382 | background-image: linear-gradient(to bottom, #d9534f 0%, #c9302c 100%);
383 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9534f', endColorstr='#ffc9302c', GradientType=0);
384 | background-repeat: repeat-x;
385 | }
386 | .progress-bar-striped {
387 | background-image: -webkit-linear-gradient(45deg, rgba(255, 255, 255, .15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, .15) 50%, rgba(255, 255, 255, .15) 75%, transparent 75%, transparent);
388 | background-image: -o-linear-gradient(45deg, rgba(255, 255, 255, .15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, .15) 50%, rgba(255, 255, 255, .15) 75%, transparent 75%, transparent);
389 | background-image: linear-gradient(45deg, rgba(255, 255, 255, .15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, .15) 50%, rgba(255, 255, 255, .15) 75%, transparent 75%, transparent);
390 | }
391 | .list-group {
392 | border-radius: 4px;
393 | -webkit-box-shadow: 0 1px 2px rgba(0, 0, 0, .075);
394 | box-shadow: 0 1px 2px rgba(0, 0, 0, .075);
395 | }
396 | .list-group-item.active,
397 | .list-group-item.active:hover,
398 | .list-group-item.active:focus {
399 | text-shadow: 0 -1px 0 #286090;
400 | background-image: -webkit-linear-gradient(top, #337ab7 0%, #2b669a 100%);
401 | background-image: -o-linear-gradient(top, #337ab7 0%, #2b669a 100%);
402 | background-image: -webkit-gradient(linear, left top, left bottom, from(#337ab7), to(#2b669a));
403 | background-image: linear-gradient(to bottom, #337ab7 0%, #2b669a 100%);
404 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2b669a', GradientType=0);
405 | background-repeat: repeat-x;
406 | border-color: #2b669a;
407 | }
408 | .list-group-item.active .badge,
409 | .list-group-item.active:hover .badge,
410 | .list-group-item.active:focus .badge {
411 | text-shadow: none;
412 | }
413 | .panel {
414 | -webkit-box-shadow: 0 1px 2px rgba(0, 0, 0, .05);
415 | box-shadow: 0 1px 2px rgba(0, 0, 0, .05);
416 | }
417 | .panel-default > .panel-heading {
418 | background-image: -webkit-linear-gradient(top, #f5f5f5 0%, #e8e8e8 100%);
419 | background-image: -o-linear-gradient(top, #f5f5f5 0%, #e8e8e8 100%);
420 | background-image: -webkit-gradient(linear, left top, left bottom, from(#f5f5f5), to(#e8e8e8));
421 | background-image: linear-gradient(to bottom, #f5f5f5 0%, #e8e8e8 100%);
422 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5', endColorstr='#ffe8e8e8', GradientType=0);
423 | background-repeat: repeat-x;
424 | }
425 | .panel-primary > .panel-heading {
426 | background-image: -webkit-linear-gradient(top, #337ab7 0%, #2e6da4 100%);
427 | background-image: -o-linear-gradient(top, #337ab7 0%, #2e6da4 100%);
428 | background-image: -webkit-gradient(linear, left top, left bottom, from(#337ab7), to(#2e6da4));
429 | background-image: linear-gradient(to bottom, #337ab7 0%, #2e6da4 100%);
430 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);
431 | background-repeat: repeat-x;
432 | }
433 | .panel-success > .panel-heading {
434 | background-image: -webkit-linear-gradient(top, #dff0d8 0%, #d0e9c6 100%);
435 | background-image: -o-linear-gradient(top, #dff0d8 0%, #d0e9c6 100%);
436 | background-image: -webkit-gradient(linear, left top, left bottom, from(#dff0d8), to(#d0e9c6));
437 | background-image: linear-gradient(to bottom, #dff0d8 0%, #d0e9c6 100%);
438 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdff0d8', endColorstr='#ffd0e9c6', GradientType=0);
439 | background-repeat: repeat-x;
440 | }
441 | .panel-info > .panel-heading {
442 | background-image: -webkit-linear-gradient(top, #d9edf7 0%, #c4e3f3 100%);
443 | background-image: -o-linear-gradient(top, #d9edf7 0%, #c4e3f3 100%);
444 | background-image: -webkit-gradient(linear, left top, left bottom, from(#d9edf7), to(#c4e3f3));
445 | background-image: linear-gradient(to bottom, #d9edf7 0%, #c4e3f3 100%);
446 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9edf7', endColorstr='#ffc4e3f3', GradientType=0);
447 | background-repeat: repeat-x;
448 | }
449 | .panel-warning > .panel-heading {
450 | background-image: -webkit-linear-gradient(top, #fcf8e3 0%, #faf2cc 100%);
451 | background-image: -o-linear-gradient(top, #fcf8e3 0%, #faf2cc 100%);
452 | background-image: -webkit-gradient(linear, left top, left bottom, from(#fcf8e3), to(#faf2cc));
453 | background-image: linear-gradient(to bottom, #fcf8e3 0%, #faf2cc 100%);
454 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffcf8e3', endColorstr='#fffaf2cc', GradientType=0);
455 | background-repeat: repeat-x;
456 | }
457 | .panel-danger > .panel-heading {
458 | background-image: -webkit-linear-gradient(top, #f2dede 0%, #ebcccc 100%);
459 | background-image: -o-linear-gradient(top, #f2dede 0%, #ebcccc 100%);
460 | background-image: -webkit-gradient(linear, left top, left bottom, from(#f2dede), to(#ebcccc));
461 | background-image: linear-gradient(to bottom, #f2dede 0%, #ebcccc 100%);
462 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2dede', endColorstr='#ffebcccc', GradientType=0);
463 | background-repeat: repeat-x;
464 | }
465 | .well {
466 | background-image: -webkit-linear-gradient(top, #e8e8e8 0%, #f5f5f5 100%);
467 | background-image: -o-linear-gradient(top, #e8e8e8 0%, #f5f5f5 100%);
468 | background-image: -webkit-gradient(linear, left top, left bottom, from(#e8e8e8), to(#f5f5f5));
469 | background-image: linear-gradient(to bottom, #e8e8e8 0%, #f5f5f5 100%);
470 | filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffe8e8e8', endColorstr='#fff5f5f5', GradientType=0);
471 | background-repeat: repeat-x;
472 | border-color: #dcdcdc;
473 | -webkit-box-shadow: inset 0 1px 3px rgba(0, 0, 0, .05), 0 1px 0 rgba(255, 255, 255, .1);
474 | box-shadow: inset 0 1px 3px rgba(0, 0, 0, .05), 0 1px 0 rgba(255, 255, 255, .1);
475 | }
476 | /*# sourceMappingURL=bootstrap-theme.css.map */
477 |
--------------------------------------------------------------------------------